-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkeyword_cooccurrence_matrix
96 lines (75 loc) · 3.79 KB
/
keyword_cooccurrence_matrix
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# 20200502
# cristianmejia00@gmail.com
# Create a keyword cooccurrence matrix.
# Assuming we have a selection of keywords or "master list"
# Inputs:
# - A bibliometric dataset from the Web of Science. (Or a clean text vector per each document)
# - A list of keywords we want to analyse (The master keyword list)
# Output:
# - A keyword cooccurrence matrix whose rows and columns both follow the keyword master list;
# each cell holds the number of documents in which that pair of keywords cooccurs.
#######################################################################
# Call libraries
#######################################################################
# Note: the first time you run this on your PC it might take a while, because missing packages are installed.
if(!"plyr" %in% rownames(installed.packages())) {install.packages("plyr")}
if(!"tm" %in% rownames(installed.packages())) {install.packages("tm")}
if(!"data.table" %in% rownames(installed.packages())) {install.packages("data.table")}
library(plyr)
library(tm)
library(data.table)
#######################################################################
# Read the data
#######################################################################
# Read a Web of Science export (.txt) from your local machine: a file-picker
# dialog opens, then select your file.
# `fread` belongs to the data.table package; it is namespace-qualified here so
# the call resolves even when data.table has not been attached with library().
dataset <- data.table::fread(file.choose(), stringsAsFactors = FALSE)
# Or use my sample (Remove the "#" symbol in the next line)
#dataset <- data.table::fread('https://raw.githubusercontent.com/cristianmejia00/kajikawa_lab/master/test_data_WOS.csv', stringsAsFactors = FALSE, fill = TRUE)
#######################################################################
# Prepare the text
#######################################################################
# In this sequence:
# -- Unify Title and Abstract
# ---- Convert text to "tm" object
# ------ To lowercase
# ------ Remove stopwords
# ------ Remove numbers and symbols
# ------ Remove extra whitespaces
# ---- Convert "tm" object to text
# Join the title (TI) and abstract (AB) columns into one string per document.
documents <- paste(dataset$TI, dataset$AB, sep = ". ")

# Clean the text with tm, one transformation per step. Plain reassignment is
# used instead of `%>%` because none of the packages loaded by this script
# (plyr, tm) provides that operator.
corpus <- Corpus(VectorSource(documents))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)

# Convert the "tm" object back to a plain character vector, exactly one string
# per document. seq_along() keeps this safe for an empty corpus (1:length(x)
# would iterate over c(1, 0)), and vapply() guarantees a character result with
# one element per document, which the keyword x document matrix relies on.
text <- vapply(
  seq_along(corpus),
  function(i) paste(corpus[[i]]$content, collapse = " "),
  character(1)
)
#######################################################################
# Select the master list of keywords
#######################################################################
# At this point it is assumed that you have already computed a master keyword list. If so, use that.
# Or create the list manually:
# Hand-written master list: the keywords whose pairwise cooccurrence is counted.
master_list <- c(
  "business", "innovation", "digital",
  "ecosystem", "technology", "dataset"
)
#######################################################################
# Compute the keyword cooccurrence
#######################################################################
# Document presence per keyword.
# The result is a list with one element per entry of master_list; each element
# is a numeric vector with one value per document:
# -- 1 when the keyword is found in the document (repeated hits still count 1)
# -- 0 when it is not found
# NOTE(review): grepl() treats the keyword as a regular expression and matches
# substrings, so "dataset" also hits "datasets" -- confirm this is intended.
keyword_document_freq <- lapply(master_list, function(keyword) {
  found <- grepl(keyword, text)
  as.numeric(found)
})
# Transform to a keyword x document matrix: one row per keyword, one column
# per document. unlist() lays the values out keyword after keyword, hence
# byrow = TRUE.
# The column count is the number of documents, i.e. length(text); the earlier
# reference to `myText` pointed at a variable this script never defines.
keyword_matrix <- matrix(unlist(keyword_document_freq),
                         nrow = length(keyword_document_freq),
                         ncol = length(text),
                         byrow = TRUE)
# Cooccurrence matrix
# keyword_matrix is keyword x document, so multiplying it by its own transpose
# yields a keyword x keyword matrix: cell (i, j) counts the documents containing
# both keyword i and keyword j, and the diagonal holds each keyword's own
# document count. tcrossprod(x) computes x %*% t(x).
keyword_coocurrance <- tcrossprod(keyword_matrix)
# Sanity check: the cooccurrence matrix must be square, so this prints TRUE.
dim(keyword_coocurrance)[1] == dim(keyword_coocurrance)[2]
# Label both axes with the keywords, in master_list order.
colnames(keyword_coocurrance) <- master_list
rownames(keyword_coocurrance) <- master_list
# Save the matrix as CSV in the current working directory.
# Optional step: the file can be very large for a long master list.
write.csv(
  x = keyword_coocurrance,
  file = "keyword_cooccurrence.csv"
)