## publicru_xml_to_csv.r
library(xml2)
library(tidyverse)
library(lubridate)
library(magrittr)
library(readr)
## locating the XML files
filenames <- list.files(path = "~/media_nlp/xmlki/vestifm_may_sep_xmlki", pattern="*.xml", full.names=TRUE)
## reading in the data in a single object
df <- do.call(rbind, lapply(filenames, function(x) {
doc <- read_xml(x)
# deleting <author>, <url>, <page>, <issue> tags as irrelevant
xml_remove(xml_find_all(doc, xpath = ".//item/author | .//item/url | .//item/page | .//item/issue"))
# keeping only items whose <description> has text (items with an empty description are dropped)
special_nodes <- xml_find_all(doc, ".//item[child::description[text()]]")
list_of_lists <- lapply(special_nodes, function(x) {
as_list(x, ns = c("title", "description",
"pubDate", "source", "place", "genre", "type", "period"))
})
# flatten the per-item lists into a data frame with one row per news item
df <- as.data.frame(t(matrix(unlist(list_of_lists), nrow = length(list_of_lists[[1]]))))
}))
names(df) <- c("title", "description", "pubDate", "source", "place", "genre", "type", "period")
## diagnostics
head(df)
## visual diagnostics for unexpectedly missing data
df %>%
# take characters 6-16 of pubDate (the date part) and parse them as day-month-year
mutate(pubDate = dmy(substr(pubDate, 6, 16))) %>%
# bin the dates by week as a diagnostic feature
mutate(posted_week = as.Date(cut(pubDate, breaks = "week"))) %>%
count(posted_week) %>%
ggplot(aes(x = posted_week, y = n)) + geom_col()
## reorganizing and cleaning the data a bit
df %<>%
# take characters 6-16 of pubDate (the date part) and parse them as day-month-year
mutate(pubDate = dmy(substr(pubDate, 6, 16))) %>%
# bin the dates by month as a diagnostic feature
mutate(posted_mon = as.Date(cut(pubDate, breaks = "month"))) %>%
# dropping items with duplicated titles
filter(!duplicated(title)) %>% ## for the future: implement smarter near-duplicate filtering (see the sketch below)
# cleaning the text a bit: strip line breaks and the slideshow caption "Следующий слайд" ("Next slide")
mutate(description = gsub("\\n|\\r|Следующий слайд", " ", description))
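## A hedged sketch of the "smarter near-duplicate filtering" flagged above (not part of the
## original pipeline): compare titles after normalizing case, punctuation, and whitespace,
## so that trivially reworded duplicates are also dropped; df_dedup is a hypothetical name
df_dedup <- df %>%
  mutate(title_norm = trimws(gsub("\\s+", " ", gsub("[[:punct:]]", "", tolower(title))))) %>%
  filter(!duplicated(title_norm)) %>%
  select(-title_norm)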
## writing the cleaned data into csv
write.csv(df, "~/media_nlp/vestifm_df_may_sep.csv", row.names = FALSE)
# building the combined May-September dataset across the 9 media outlets
filenames <- list.files(path = "~/media_nlp/may_sep", pattern="*.csv", full.names=TRUE)
test <- do.call(rbind, lapply(filenames, function(x) {
filler <- read_csv(x, locale = readr::locale(encoding = "UTF-8"))
}))
filenames <- list.files(path = "~/media_nlp/may_sep_lem", pattern="*.csv", full.names=TRUE)
test_2 <- unlist(do.call(c, lapply(filenames, function(x) {
filler <- read_csv(x, col_names = FALSE, locale = readr::locale(encoding = "UTF-8"))
})))
# The combined read above did not behave as expected, so the lemma files are read one by one below
test_1 <- read_csv(filenames[1], col_names = FALSE, locale = readr::locale(encoding = "UTF-8"))
test_2 <- read_csv(filenames[2], col_names = FALSE, locale = readr::locale(encoding = "UTF-8"))
test_3 <- read_csv(filenames[3], col_names = FALSE, locale = readr::locale(encoding = "UTF-8"))
test_4 <- read_csv(filenames[4], col_names = FALSE, locale = readr::locale(encoding = "UTF-8"))
test_5 <- read_csv(filenames[5], col_names = FALSE, locale = readr::locale(encoding = "UTF-8"))
test_6 <- read_csv(filenames[6], col_names = FALSE, locale = readr::locale(encoding = "UTF-8"))
test_7 <- read_csv(filenames[7], col_names = FALSE, locale = readr::locale(encoding = "UTF-8"))
test_9 <- read_csv(filenames[9], col_names = FALSE, locale = readr::locale(encoding = "UTF-8"))
gl_test <- rbind(test_1, test_2, test_3, test_4, test_5, test_6, test_7, test_9)
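## Hedged sanity check (not in the original script): the cbind() below assumes the lemma table
## and the news table have the same rows in the same order, so assert that the counts match
stopifnot(nrow(gl_test) == nrow(test))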
# Binding news info and their lemmas
may_sep <- cbind(gl_test, test)
# filtering out news items that are too short
may_sep <- may_sep %>%
mutate(X1 = gsub("[[:punct:]]|S", replacement = "", X1)) %>%
filter(nchar(X1) >= 125)
#######
#
# Playing with data
#
#######
write.csv(may_sep, file = "~/media_nlp/may_sep_full.csv", fileEncoding = "UTF-8")
library(stm)
may_sep_proc <- textProcessor(may_sep$X1, lowercase = TRUE,
removestopwords = TRUE, removenumbers = TRUE, language = "russian",
stem = FALSE, removepunctuation = FALSE,
metadata = may_sep)
plotRemoved(may_sep_proc$documents, lower.thresh=seq(0,5000, by=5))
ggplot(may_sep, aes(nchar(X1))) +
geom_histogram(bins = 500) +
scale_x_continuous(limits = c(0, 1000))
# lower.thresh = 15, upper.thresh = ??? ~230000 words removed
may_sep_out <- prepDocuments(may_sep_proc$documents, may_sep_proc$vocab, may_sep_proc$meta, lower.thresh = 10)
# Removing 219661 of 249966 terms (519006 of 10027530 tokens) due to frequency
# Your corpus now has 35063 documents, 30305 terms and 9508524 tokens.
may_sep_100 <- stm(documents = may_sep_out$documents, vocab = may_sep_out$vocab,
K = 100, max.em.its = 50, data = may_sep_out$meta,
init.type = "Spectral", seed = 42)
save(may_sep_100, file = "~/media_nlp/may_sep_100.RData")
may_sep_120 <- stm(documents = may_sep_out$documents, vocab = may_sep_out$vocab,
K = 120, max.em.its = 50, data = may_sep_out$meta,
init.type = "Spectral", seed = 42)
save(may_sep_120, file = "~/media_nlp/may_sep_120.RData")
may_sep_60 <- stm(documents = may_sep_out$documents, vocab = may_sep_out$vocab,
K = 60, max.em.its = 50, data = may_sep_out$meta,
init.type = "Spectral", seed = 42)
save(may_sep_60, file = "~/media_nlp/may_sep_60.RData")
may_sep_80 <- stm(documents = may_sep_out$documents, vocab = may_sep_out$vocab,
K = 80, max.em.its = 50, data = may_sep_out$meta,
init.type = "Spectral", seed = 42)
save(may_sep_80, file = "~/media_nlp/may_sep_80.RData")
may_sep_92 <- stm(documents = may_sep_out$documents, vocab = may_sep_out$vocab,
K = 92, max.em.its = 50, data = may_sep_out$meta,
init.type = "Spectral", seed = 42)
save(may_sep_92, file = "~/media_nlp/may_sep_92.RData")
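## A hedged alternative to fitting several K values by hand (not in the original script):
## stm's searchK() fits the candidate models and reports held-out likelihood, residuals,
## semantic coherence, and exclusivity to guide the choice of K. Left commented out here
## because it refits every candidate and can take a long time on a corpus of this size.
# may_sep_searchK <- searchK(documents = may_sep_out$documents, vocab = may_sep_out$vocab,
#                            K = c(60, 80, 92, 100, 120), init.type = "Spectral",
#                            data = may_sep_out$meta)
# plot(may_sep_searchK)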
plot(may_sep_100, type = "summary", labeltype = "frex", topics = 1:30, n = 5)
findTopic(may_sep_100, type = "frex", n = 20, c("covid", "sars", "ковид", "коронавирус"))
toLDAvis(may_sep_100, may_sep_out$documents, reorder.topics = F)
plot(topicCorr(may_sep_100, cutoff = 0.15))
heatmap(topicCorr(may_sep_100)$cor, scale = "none")
## fviz_nbclust() comes from the factoextra package; frb_ctm_80 is not created in this script
## and is assumed to be loaded from a separate analysis
library(factoextra)
fviz_nbclust(topicCorr(frb_ctm_80)$cor, kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")
fviz_nbclust(topicCorr(frb_ctm_80)$cor, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")
fviz_nbclust(topicCorr(frb_ctm_80)$cor, kmeans, nstart = 25, method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic method")
#stmCorrViz
# stmCorrViz(may_sep_100, "corrviz.html", documents_raw = forbes$description, documents_matrix = frb_proc$documents, verbose = T)
# toLDAvis (frb_ctm_80 and frb_out are likewise assumed to be loaded from the separate analysis)
toLDAvis(frb_ctm_80, frb_out$documents, reorder.topics = FALSE)