-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathjoin_UtilityName.Rmd
92 lines (62 loc) · 3.04 KB
/
join_UtilityName.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
---
title: "join_UtilityName"
author: "Victor Sette Gripp"
date: "November 9, 2017"
output: html_notebook
---
```{r, echo=FALSE}
library(dplyr)
library(ggplot2)
library(scales)
library(RateParser)
library(yaml)
library(purrr)
library(fuzzywuzzyR)
library(stringdist)
```
```{r, echo=FALSE}
# load useful functions and plot config
source("R/utils.R")
source("R/plots.R")
#Retrieve the directories and files in the directores from the Open-Water-Specification-File directory
owrs_path <- "../Open-Water-Rate-Specification/full_utility_rates/California";
#load OWRS
df_OWRS <- tbl_df(as.data.frame(list("filepath"=getFileNames(owrs_path)), stringsAsFactors=FALSE)) %>%
mutate(owrs_directory = map(filepath, strsplit, split="/") %>% map(c(1,1))) %>%
mutate(filename = map(filepath, strsplit, split="/") %>% map(c(1,2))) %>%
mutate(utility_id = map(owrs_directory, strsplit, split=" ") %>%
map(1) %>%map(tail, n=1) %>%
map(gsub, pattern="\\D", replacement="") %>%
map(as.numeric)) %>%
mutate(effective_date = sapply(filename, extract_date) ) %>%
mutate(utility_name = sapply(as.character(owrs_directory), extract_utility_name) )
```
```{r, echo=FALSE}
#load supplier reports, geoinformation and pwsid_record
supplier_reports <- read.csv('data/supplier_report.csv', stringsAsFactors=FALSE)
#supplier_geo <- read.csv('data/suppliers.csv', stringsAsFactors=FALSE)
supplier_pwsid <- read.csv('data/utilities_for_OWRS.csv', stringsAsFactors = FALSE)
# append to df_OWRS the best fuzzy match for utility_name to get pwsid
# cutoff chosen arbitraily, other values can be tested
# using fuzzywuzzy instead
df_OWRS$fuzzy_match <- as.character(sapply(df_OWRS$utility_name, GetCloseMatches,
sequence_strings = supplier_pwsid$Agency_Name, n=1L, cutoff = 0.85))
## We have to manually check here if the fuzzy matching is not getting "no matches" (list()) or false positives.
correct_matches <- read.csv('data/correct_fuzzy_matches.csv')
fuzzy_check <- df_OWRS[which(df_OWRS$utility_name != df_OWRS$fuzzy_match),]
fuzzy_check <- fuzzy_check[which(!(fuzzy_check$utility_name %in% correct_matches$utility_name)), ]
no_matches <- fuzzy_check[which(fuzzy_check$fuzzy_match=="list()"),]
paste("There were", nrow(no_matches), "NO-MATCHED utilities.")
paste("Please, check also the fuzzy_check data frame for false positives.")
paste("Matches to check for false positives", nrow(fuzzy_check))
## Updating the list of correct_matches after checking that they are not false positives
# correct_matches <- rbind(correct_matches,fuzzy_check[,c("utility_name", "fuzzy_match")] )
## Saving updated list of correct matches
# write.csv(fuzzy_check[,c("utility_name", "fuzzy_match")], "data/correct_fuzzy_matches.csv")
```
```{r}
# get pswid
merged_OWRS <- merge(df_OWRS, supplier_pwsid, by.x = "fuzzy_match", by.y = "Agency_Name", all.x=TRUE, all.y=FALSE)
# merge with suplier report
merged_OWRS <- merge(merged_OWRS, supplier_reports, by.x = "PWSID", by.y = "report_pwsid", all.x=TRUE, all.y=FALSE)
```