-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathImputeGenderFromName.R
58 lines (49 loc) · 1.83 KB
/
ImputeGenderFromName.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
library(tidyverse)
library(naniar)
# Load the voter-level records.
# NOTE(review): this script reads first_name, birth_year and gender_code here,
# plus county_id / precinct_desc / precinct_abbrv further down -- confirm the
# RDS file actually carries those columns.
NameData <- readRDS("VoterNames.rds")

# Attach the share of records coded "F" at three levels of detail (birth year,
# first name, first name x birth year), together with the number of records
# with a non-missing gender code behind the name-based shares.
# Each share is NaN for a group in which every gender_code is missing.
NameDataSetup <- NameData %>%
  ungroup() %>%
  # ID is assigned before sorting, so it records the original row position.
  mutate(ID = row_number()) %>%
  arrange(first_name, birth_year) %>%
  # Recode "U" (presumably "unknown" -- confirm) to NA so it is excluded from
  # the shares and counts below.
  replace_with_na(replace = list(gender_code = "U")) %>%
  group_by(birth_year) %>%
  mutate(PctFemaleYear = mean(gender_code == "F", na.rm = TRUE)) %>%
  # group_by() replaces the previous grouping, so no ungroup() is needed
  # between the three passes.
  group_by(first_name) %>%
  mutate(
    PctFemaleName = mean(gender_code == "F", na.rm = TRUE),
    NName = sum(!is.na(gender_code))
  ) %>%
  group_by(first_name, birth_year) %>%
  mutate(
    PctFemaleNameYear = mean(gender_code == "F", na.rm = TRUE),
    NNameYear = sum(!is.na(gender_code))
  ) %>%
  # Return an ungrouped table so later steps do not inherit this grouping.
  ungroup()
# Impute a gender for every record.
#   PctImpute       probability of "F" used for the draw
#   CoinFlip        one Bernoulli draw per row with that probability
#   gender_code_imp "F" when the draw is 1, otherwise "M"
# Records already coded "F" / "M" get probability 1 / 0, so the draw simply
# reproduces their recorded code.
# NOTE(review): rbinom() is not seeded in this script, so imputed values differ
# between runs unless a seed is set before this point.
NameDataImp <- NameDataSetup %>%
  ungroup() %>%
  mutate(
    PctImpute = case_when(
      !is.na(gender_code) & gender_code == "F" ~ 1,
      !is.na(gender_code) & gender_code == "M" ~ 0,
      # Prefer the most specific share that rests on at least 5 known records.
      NNameYear >= 5 ~ PctFemaleNameYear,
      NName >= 5 ~ PctFemaleName,
      !is.na(PctFemaleYear) ~ PctFemaleYear,
      # PctFemaleYear is NaN when no record in that birth-year group has a
      # known gender; fall back to the overall share so the draw below does
      # not yield NA.
      TRUE ~ mean(gender_code == "F", na.rm = TRUE)
    ),
    CoinFlip = rbinom(n(), 1, PctImpute),
    gender_code_imp = if_else(CoinFlip == 1, "F", "M")
  )
# Per-precinct comparison of the female share before and after imputation.
#   PctFemaleRaw  share coded "F" among records with a non-missing gender_code
#   PctFemaleImp  share "F" among all records once imputed
#   PctDiff       absolute gap between the two shares
GenderByGeo <- NameDataImp %>%
  group_by(county_id, precinct_desc, precinct_abbrv) %>%
  summarize(
    PctFemaleRaw = mean(gender_code == "F", na.rm = TRUE),
    PctFemaleImp = mean(gender_code_imp == "F"),
    PctDiff = abs(PctFemaleRaw - PctFemaleImp)
  ) %>%
  # summarize() drops only the last grouping variable; remove the rest so the
  # result is a plain, ungrouped table.
  ungroup()
# Diagnostic scatter: raw vs. imputed female share, one point per row of
# GenderByGeo. Points on the red y = x line are unchanged by imputation.
ggplot(data = GenderByGeo, aes(x = PctFemaleRaw, y = PctFemaleImp)) +
  geom_point() +
  # Pass intercept/slope as parameters so the reference line is drawn once;
  # mapping constants through aes() makes the layer draw one identical line
  # per data row.
  geom_abline(intercept = 0, slope = 1, colour = "red")
# Long-format precinct summary of imputed gender: two rows per precinct
# (level = "Female" / "Male") holding the count and the within-precinct
# percentage. Records with a missing precinct_abbrv are excluded.
GenderImpSummary <- NameDataImp %>%
  filter(!is.na(precinct_abbrv)) %>%
  group_by(county_id, precinct_desc, precinct_abbrv) %>%
  summarise(
    Female = sum(gender_code_imp == "F"),
    Male = sum(gender_code_imp == "M")
  ) %>%
  ungroup() %>%
  pivot_longer(
    c(Female, Male),
    names_to = "level",
    values_to = "Number"
  ) %>%
  # Group by the full precinct key explicitly so Percent is the share within
  # each precinct, rather than relying on whatever grouping summarise() left.
  group_by(county_id, precinct_desc, precinct_abbrv) %>%
  mutate(
    Percent = Number / sum(Number) * 100,
    variable = "Gender (Imputed)"
  ) %>%
  ungroup() %>%
  arrange(county_id, precinct_desc, precinct_abbrv)

# Persist the summary for later use.
saveRDS(GenderImpSummary, "GenderImpSummary.rds")