-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload.R
99 lines (86 loc) · 3.2 KB
/
load.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
library(tidyverse)
library(tidytext)
library(readtext)
library(stringdist)
library(helpers)
## Load every Hansard volume and explode each one into one row per page.
## pdftotext inserts a form-feed character ("\f") at every page break, so
## splitting on "\f" recovers the physical pages; the filename encodes
## "parliament-session-volume".
hansard_texts <- readtext(
  "data/source/parl.canadiana.ca/*.txt",
  docvarsfrom = "filenames",
  docvarnames = c("parliament", "session", "volume"),
  dvsep = "-"
)
hansards <- hansard_texts %>%
  as_tibble() %>%
  separate_rows(text, sep = "\f") %>%
  group_by(doc_id) %>%
  mutate(page = row_number()) %>%  ## page number within each volume
  ungroup() %>%
  select(doc_id, parliament:page, text) %>%
  mutate(uid = paste(doc_id, page))  ## unique identifier for one page
## Tokenize every page into one row per word (tidytext lowercases and
## strips punctuation by default), keeping a page-level identifier so
## word hits can be traced back to a specific page of a specific volume.
hansard_words <- hansards %>%
  select(doc_id, page, text) %>%
  unnest_tokens(word, text) %>%
  mutate(page_uid = paste(doc_id, page, sep = " "))
## Total number of pages in each volume (one row per doc_id).
page_count_by_hansard <- hansards %>%
  count(doc_id, name = "page_count")
## Per-volume metadata: sitting dates, bilingual flag, and the page ranges
## of the three sections of each volume (frontmatter, debates, index),
## plus the page count of each section.
hansard_volume_details <- read_csv(
  "data/indices/volume-details.csv",
  col_types = cols(
    doc_id = col_character(),
    start_date = col_date(format = ""),
    end_date = col_date(format = ""),
    debates_start = col_double(),
    index_start = col_double(),
    is_bilingual = col_logical()
  )
) %>%
  ## doc_id encodes "parliament-session-volume"; keep the original column too
  separate(doc_id, c("parliament", "session", "volume"), remove = FALSE, convert = TRUE, extra = "drop") %>%
  ## each section ends one page before the next section starts
  mutate(
    frontmatter_start = 1,
    frontmatter_end = debates_start - 1,
    debates_end = index_start - 1
  ) %>%
  ## the index runs to the physical end of the volume, so its last page is
  ## the volume's total page count; join explicitly on doc_id so no
  ## "Joining, by = ..." message is emitted and schema drift in either
  ## table cannot silently change the join key
  left_join(
    page_count_by_hansard %>% rename(index_end = page_count),
    by = "doc_id"
  ) %>%
  select(
    doc_id:end_date,
    is_bilingual,
    frontmatter_start, frontmatter_end,
    debates_start, debates_end,
    index_start, index_end
  ) %>%
  mutate(
    frontmatter_page_count = frontmatter_end - frontmatter_start + 1,
    debates_page_count = debates_end - debates_start + 1,
    index_page_count = index_end - index_start + 1
  )
#' Classify one page of one volume as "frontmatter", "debates", or "index".
#'
#' @param doc_id_to_check Volume identifier to look up.
#' @param page_number_to_lookup Page number (scalar) within that volume.
#' @param volume_details Table of per-volume section page ranges; defaults
#'   to the global `hansard_volume_details` for backward compatibility.
#' @return A length-1 character: the section name, or `NA_character_` when
#'   the page falls outside every section range OR the doc_id is unknown
#'   (previously an unknown doc_id produced length-0 `pull()` results and
#'   a malformed `case_when()` instead of a clean NA).
identify_section_for_page <- function(doc_id_to_check, page_number_to_lookup,
                                      volume_details = hansard_volume_details) {
  hansard_volume <- volume_details %>%
    filter(doc_id == doc_id_to_check) %>%
    slice(1)
  ## guard: unknown doc_id -> zero-row tibble -> return NA explicitly
  if (nrow(hansard_volume) == 0) {
    return(NA_character_)
  }
  case_when(
    between(page_number_to_lookup, hansard_volume$frontmatter_start, hansard_volume$frontmatter_end) ~ "frontmatter",
    between(page_number_to_lookup, hansard_volume$debates_start, hansard_volume$debates_end) ~ "debates",
    between(page_number_to_lookup, hansard_volume$index_start, hansard_volume$index_end) ~ "index",
    TRUE ~ NA_character_
  )
}
## NB: uncomment to update page sections; saved file has parliaments 1 through 7
#hansards <- hansards %>%
#  mutate(
#    page_section = map2_chr(doc_id, page, identify_section_for_page)
#  )
#hansards %>% select(uid, page_section) %>% write_csv("data/out/uids_tagged_with_section.csv.gz")
## Attach the precomputed page sections. Join explicitly on the page uid
## (and pin the column types) so no "Joining, by = ..." / column-spec
## messages are emitted and a schema change cannot silently alter the key.
hansards <- hansards %>%
  left_join(
    read_csv(
      "data/out/uids_tagged_with_section.csv.gz",
      col_types = cols(
        uid = col_character(),
        page_section = col_character()
      )
    ),
    by = "uid"
  )
#' Words from `words_to_search$word` whose string similarity to
#' `search_word` (per `stringdist::stringsim`, default metric) is at
#' least `threshold`. Returns distinct matches in first-appearance order.
find_similar_words <- function(words_to_search, search_word, threshold) {
  candidate_words <- pull(words_to_search, word)
  matching_rows <- which(stringsim(candidate_words, search_word) >= threshold)
  unique(candidate_words[matching_rows])
}
#' Distinct page uids whose pages contain at least one of the given words.
find_page_uids_mentioning_words <- function(words_by_page, words_to_search) {
  words_by_page %>%
    filter(word %in% words_to_search) %>%
    distinct(page_uid) %>%
    pull(page_uid)
}