---
title: "Positive/Negative/Neutral Naive Bayes Classification"
output: rmarkdown::github_document
---
# Load Libraries
```{r, results='hide', warning=FALSE, message=FALSE}
library(readr)
library(tm)
library(SnowballC)
library(wordcloud)
library(e1071)
library(gmodels)
library(ggplot2)
library(caret)
library(ROCR)
library(dplyr)
```
# Step 1: Load the data
```{r, message=FALSE, warning=FALSE}
tweets <- read_csv("Tweets.csv")
tweets %>% head(n = 10)
```
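readr guesses column types silently (messages were suppressed above), so it is worth confirming what it inferred before dropping columns; `spec()` and `problems()` are standard readr helpers for exactly this:
```{r}
# Show the column specification readr guessed, and any rows it could not parse
spec(tweets)
problems(tweets)
```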
# Step 2: Explore the data
```{r}
str(tweets)
# Drop ID, confidence-score, gold-label, and metadata columns, keeping
# sentiment, negative reason, airline, name, and the tweet text
tweets <- tweets[, -c(1, 3, 5, 7, 9, 10, 12, 13, 14, 15)]
tweets %>% head(n = 10)
```
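Dropping columns by numeric position is fragile if the CSV layout ever changes. Assuming the standard Kaggle "Twitter US Airline Sentiment" column names, a name-based selection would be equivalent (shown as a sketch, not run):
```{r, eval=FALSE}
# Name-based equivalent of the positional drop above; assumes the standard
# Kaggle schema for Tweets.csv
tweets %>%
  select(airline_sentiment, negativereason, airline, name, text) -> tweets
```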
#### Check the proportions of negative, neutral, and positive tweets
```{r}
#convert to factor before using table
tweets$airline_sentiment %>% as.factor() -> tweets$airline_sentiment
tweets$airline %>% as.factor() -> tweets$airline
table(tweets$airline_sentiment)
table(tweets$airline)
# Plot the share of each sentiment within each airline as a filled bar chart
tweets %>% ggplot(aes(x = airline, fill = airline_sentiment)) +
geom_bar(position = "fill")
```
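The same breakdown can be read off numerically; `prop.table()` with `margin = 1` gives each airline's sentiment shares as rows:
```{r}
# Row-wise proportions: share of each sentiment within each airline
table(tweets$airline, tweets$airline_sentiment) %>%
  prop.table(margin = 1) %>%
  round(3)
```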
#### Begin preparing the text data
```{r}
#Remove special characters
tweets$text <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", tweets$text)
# Look at the first few tweets
tweets$text[1:5]
# Remove airline names since they carry no sentiment; use a name other than
# "stopwords" to avoid confusion with tm::stopwords() used below
airline_names <- c("American", "Delta", "Southwest", "United", "US Airways", "VirginAmerica", "SouthwestAirlines", "AmericanAirlines")
tweets$text %>% removeWords(airline_names) -> tweets$text
#check to make sure words were removed
tweets$text[1:5]
#create corpus and examine it
tweets$text %>% VectorSource() %>% VCorpus() -> tweet_corpus
tweet_corpus[1:5] %>% lapply(as.character)
```
#### Clean up the corpus
```{r}
#Convert text to lowercase
tweet_corpus %>% tm_map(content_transformer(tolower)) -> tweet_corpus_clean
#remove numbers, stopwords, and punctuation
tweet_corpus_clean %>% tm_map(removeNumbers) %>%
tm_map(removeWords, stopwords('english')) %>%
tm_map(removePunctuation) -> tweet_corpus_clean
#Check to see if corpus is clean
tweet_corpus_clean[1:5] %>% lapply(as.character)
```
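One caveat: `removePunctuation()` deletes characters outright, so slash- or hyphen-joined words fuse into a single token ("delayed/cancelled" becomes "delayedcancelled"). If that matters for this corpus, a space-substituting transformer is a minimal alternative (sketch, not run):
```{r, eval=FALSE}
# Substitute a space for punctuation instead of deleting it, then squeeze
# out the extra whitespace; a drop-in alternative to removePunctuation
replace_punct <- content_transformer(function(x) gsub("[[:punct:]]+", " ", x))
tweet_corpus_clean %>%
  tm_map(replace_punct) %>%
  tm_map(stripWhitespace) -> tweet_corpus_clean
```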
#### Word-stem and check the final clean corpus
```{r}
# After inspecting the word clouds below, these additional low-signal words were added for removal
stopwords2 <- c("southwestair", "americanair", "jetblu", "usairway", "will", "newark", "houston", "airport", "airlin", "just", "lax", "can", "ive", "flightl", "jfk", "what", "let", "want", "flightr", "your", "that", "follow", "one", "flt", "fli", "even", "use", "week", "two", "anoth", "see", "make", "got", "said", "tonight", "tomorrow", "put", "year", "dfw", "today", "get", "yet", "number", "told", "day", "also", "morn", "min", "someon", "flight", "fleek", "tweet")
# Words that were taken out here but could still add sentiment to tweets:
# "gate", "plane", "travel", "pilot", "due", "mile", "made", "unit", "website", "night", "agent", "point", "onlin", "email", "amp", "keep", "miss", "system", "guy", "show", "think", "websit", "fleet"
# Remove other common words that don't help with sentiment, stem, strip whitespace, and build a document-term matrix
tweet_corpus_clean %>% tm_map(removeWords, stopwords2) %>%
  tm_map(stemDocument) %>%
  tm_map(stripWhitespace) -> tweet_corpus_clean
tweet_corpus_clean %>% DocumentTermMatrix() -> tweet_dtm
# Check the final text (now reflects the stop-word removal and stemming above)
tweet_corpus_clean[1:5] %>% lapply(as.character)
tweet_dtm
```
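The printout above reports the DTM's dimensions and sparsity; `inspect()` shows a small corner of the matrix itself, a quick sanity check that tokenization produced sensible terms:
```{r}
# Peek at the first few documents and terms of the document-term matrix
inspect(tweet_dtm[1:5, 1:8])
```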
#### Create training, validation, and test sets with labels
```{r}
# Create a random three-way split; note that sample() normalizes prob, so
# c(.8, .2, .2) yields roughly a 4:1:1 split (about 67/17/17), not 80/20/20
set.seed(123)
idx <- sample(seq(1, 3), size = nrow(tweet_dtm), replace = TRUE, prob = c(.8, .2, .2))
#training, test, and validation
tweet_dtm_train <- tweet_dtm[idx == 1, ]
tweet_dtm_test <- tweet_dtm[idx == 2,]
tweet_dtm_validation <- tweet_dtm[idx == 3,]
#labels
tweet_train_labels <- tweets[idx == 1, ]$airline_sentiment
tweet_test_labels <- tweets[idx == 2, ]$airline_sentiment
tweet_validation_labels <- tweets[idx == 3, ]$airline_sentiment
#check that proportions are similar
tweet_train_labels %>% table() %>% prop.table()
tweet_test_labels %>% table() %>% prop.table()
tweet_validation_labels %>% table() %>% prop.table()
```
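Since caret is already loaded, a stratified split is a possible refinement: `createDataPartition()` samples within each sentiment class so all three sets keep identical class proportions. A sketch, not run here so the results below stay tied to the original split:
```{r, eval=FALSE}
# Stratified 4:1:1 alternative: partition within each sentiment class
set.seed(123)
train_idx <- createDataPartition(tweets$airline_sentiment, p = 2/3, list = FALSE)
rest <- setdiff(seq_len(nrow(tweets)), train_idx)
test_in_rest <- createDataPartition(tweets$airline_sentiment[rest], p = 0.5, list = FALSE)
test_idx <- rest[test_in_rest]
val_idx <- setdiff(rest, test_idx)
```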
#### Word cloud visualization
```{r}
tweet_corpus_clean %>% wordcloud(max.words = 150, min.freq = 5, random.order = F)
```
#### Subset the data to visualize common words for each sentiment
```{r, message=FALSE, warning=FALSE}
tweets %>% subset(airline_sentiment == "positive") -> positive
tweets %>% subset(airline_sentiment == "negative") -> negative
tweets %>% subset(airline_sentiment == "neutral") -> neutral
positive$text %>% wordcloud(max.words = 100, scale = c(3, .5))
negative$text %>% wordcloud(max.words = 100, scale = c(3, .5))
neutral$text %>% wordcloud(max.words = 100, scale = c(3, .5))
```
# Step 3: Training a model on the data
```{r}
# Two ways to shrink the vocabulary: removeSparseTerms() drops terms missing
# from more than 99.9% of documents (shown for comparison only), while
# findFreqTerms() keeps terms appearing at least 5 times; the frequent-term
# list is what the model below actually uses
tweet_dtm_train %>% removeSparseTerms(0.999) -> tweet_dtm_freq_train
tweet_dtm_freq_train
tweet_dtm_train %>% findFreqTerms(5) -> tweet_freq_words
str(tweet_freq_words)
```
#### Create DTMs with only the frequent terms
```{r}
tweet_dtm_freq_train <- tweet_dtm_train[ , tweet_freq_words]
tweet_dtm_freq_validation <- tweet_dtm_validation[ , tweet_freq_words]
tweet_dtm_freq_test <- tweet_dtm_test[ , tweet_freq_words]
```
#### Convert counts to "Yes"/"No" indicators, apply to the train/validation/test data, and train the model
```{r}
# Convert a term count into a "Yes"/"No" presence indicator
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
tweet_dtm_freq_train %>% apply(MARGIN = 2, convert_counts) -> tweet_train
tweet_dtm_freq_validation %>% apply(MARGIN = 2, convert_counts) -> tweet_validation
tweet_dtm_freq_test %>% apply(MARGIN = 2, convert_counts) -> tweet_test
tweet_train %>% naiveBayes(tweet_train_labels) -> tweet_classifier
```
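The fitted naiveBayes object stores the prior class distribution and, for every term, a table of conditional probabilities; peeking at one term shows what the model learned. The term "cancel" is only an illustrative guess at a frequent stem; substitute any name from `colnames(tweet_train)`:
```{r}
# Prior class distribution learned from the training labels
tweet_classifier$apriori
# Conditional distribution of one term given each class ("cancel" is an
# illustrative pick; returns NULL if that stem is not among the frequent terms)
tweet_classifier$tables[["cancel"]]
```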
# Step 4: Evaluate the model's performance on the validation dataset
```{r}
tweet_classifier %>% predict(tweet_validation) -> tweet_validation_pred
tweet_validation_pred %>% head(n = 15)
tweet_validation_pred %>% confusionMatrix(tweet_validation_labels) -> conf
conf
confusion_matrix <- as.data.frame(table(tweet_validation_pred, tweet_validation_labels))
confusion_matrix %>% ggplot(aes(x = tweet_validation_pred, y = tweet_validation_labels)) +
geom_tile(aes(fill = Freq)) +
geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
scale_fill_gradient(low = "#ff7f50",
high = "#003767",
trans = "log")
```
#### 75.97% classified correctly
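The headline accuracy (with its 95% confidence interval) can be pulled straight from the confusionMatrix object rather than read off the printout:
```{r}
conf$overall[c("Accuracy", "AccuracyLower", "AccuracyUpper")]
```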
# Step 5: Improve the model on the validation dataset
```{r}
# Retrain with Laplace smoothing so terms unseen for a class do not zero out its posterior
tweet_train %>% naiveBayes(tweet_train_labels, laplace = 1) -> tweet_classifier2
tweet_classifier2 %>% predict(tweet_validation) -> tweet_validation_pred2
tweet_validation_pred2 %>% confusionMatrix(tweet_validation_labels) -> conf2
conf2
confusion_matrix2 <- as.data.frame(table(tweet_validation_pred2, tweet_validation_labels))
confusion_matrix2 %>% ggplot(aes(x = tweet_validation_pred2, y = tweet_validation_labels)) +
geom_tile(aes(fill = Freq)) +
geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
scale_fill_gradient(low = "#ff7f50",
high = "#003767",
trans = "log")
```
#### 76.55% classified correctly
# Final test on the test dataset
```{r}
tweet_classifier2 %>% predict(tweet_test) -> tweet_test_pred
tweet_test_pred %>% confusionMatrix(tweet_test_labels) -> conf3
conf3
confusion_matrix3 <- as.data.frame(table(tweet_test_pred, tweet_test_labels))
confusion_matrix3 %>% ggplot(aes(x = tweet_test_pred, y = tweet_test_labels)) +
geom_tile(aes(fill = Freq)) +
geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
scale_fill_gradient(low = "#ff7f50",
high = "#003767",
trans = "log")
```
#### 77.42% classified correctly
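Accuracy alone hides the heavy class imbalance (negative tweets dominate); the per-class rows of the final confusion matrix give a fuller picture of where the model succeeds:
```{r}
# Per-class performance on the held-out test set
conf3$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")]
```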