---
title: "Positive/Negative/Neutral Naive Bayes Classification"
output: rmarkdown::github_document
---
# Load Libraries
```{r, results='hide', warning=FALSE, message=FALSE}
library(readr)
library(tm)
library(SnowballC)
library(wordcloud)
library(e1071)
library(gmodels)
library(ggplot2)
library(caret)
library(ROCR)
library(dplyr)
```
# Step 1: Load the data
```{r, message=FALSE, warning=FALSE}
tweets <- read_csv("Tweets.csv")
tweets %>% head(n = 10)
```
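readr guesses column types silently (messages were suppressed above), so it is worth confirming what it inferred before dropping columns; `spec()` and `problems()` are standard readr helpers for exactly this:
```{r}
# Show the column specification readr guessed, and any rows it could not parse
spec(tweets)
problems(tweets)
```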
# Step 2: Explore the data
```{r}
str(tweets)
# Drop ID, confidence-score, gold-label, and metadata columns, keeping
# sentiment, negative reason, airline, name, and the tweet text
tweets <- tweets[, -c(1, 3, 5, 7, 9, 10, 12, 13, 14, 15)]
tweets %>% head(n = 10)
```
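Dropping columns by numeric position is fragile if the CSV layout ever changes. Assuming the standard Kaggle "Twitter US Airline Sentiment" column names, a name-based selection would be equivalent (shown as a sketch, not run):
```{r, eval=FALSE}
# Name-based equivalent of the positional drop above; assumes the standard
# Kaggle schema for Tweets.csv
tweets %>%
  select(airline_sentiment, negativereason, airline, name, text) -> tweets
```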
#### Check the proportions of negative, neutral, and positive tweets
```{r}
#convert to factor before using table
tweets$airline_sentiment %>% as.factor() -> tweets$airline_sentiment
tweets$airline %>% as.factor() -> tweets$airline
table(tweets$airline_sentiment)
table(tweets$airline)
# Plot the share of each sentiment within each airline as a filled bar chart
tweets %>% ggplot(aes(x = airline, fill = airline_sentiment)) +
geom_bar(position = "fill")
```
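The same breakdown can be read off numerically; `prop.table()` with `margin = 1` gives each airline's sentiment shares as rows:
```{r}
# Row-wise proportions: share of each sentiment within each airline
table(tweets$airline, tweets$airline_sentiment) %>%
  prop.table(margin = 1) %>%
  round(3)
```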
#### Begin preparing the text data
```{r}
#Remove special characters
tweets$text <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", tweets$text)
# Look at the first few tweets
tweets$text[1:5]
# Remove airline names since they carry no sentiment; use a name other than
# "stopwords" to avoid confusion with tm::stopwords() used below
airline_names <- c("American", "Delta", "Southwest", "United", "US Airways", "VirginAmerica", "SouthwestAirlines", "AmericanAirlines")
tweets$text %>% removeWords(airline_names) -> tweets$text
#check to make sure words were removed
tweets$text[1:5]
#create corpus and examine it
tweets$text %>% VectorSource() %>% VCorpus() -> tweet_corpus
tweet_corpus[1:5] %>% lapply(as.character)
```
#### Clean up the corpus
```{r}
#Convert text to lowercase
tweet_corpus %>% tm_map(content_transformer(tolower)) -> tweet_corpus_clean
#remove numbers, stopwords, and punctuation
tweet_corpus_clean %>% tm_map(removeNumbers) %>%
tm_map(removeWords, stopwords('english')) %>%
tm_map(removePunctuation) -> tweet_corpus_clean
#Check to see if corpus is clean
tweet_corpus_clean[1:5] %>% lapply(as.character)
```
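One caveat: `removePunctuation()` deletes characters outright, so slash- or hyphen-joined words fuse into a single token ("delayed/cancelled" becomes "delayedcancelled"). If that matters for this corpus, a space-substituting transformer is a minimal alternative (sketch, not run):
```{r, eval=FALSE}
# Substitute a space for punctuation instead of deleting it, then squeeze
# out the extra whitespace; a drop-in alternative to removePunctuation
replace_punct <- content_transformer(function(x) gsub("[[:punct:]]+", " ", x))
tweet_corpus_clean %>%
  tm_map(replace_punct) %>%
  tm_map(stripWhitespace) -> tweet_corpus_clean
```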
#### Word-stem and check the final clean corpus
```{r}
# After inspecting the word clouds below, these additional low-signal words were added for removal
stopwords2 <- c("southwestair", "americanair", "jetblu", "usairway", "will", "newark", "houston", "airport", "airlin", "just", "lax", "can", "ive", "flightl", "jfk", "what", "let", "want", "flightr", "your", "that", "follow", "one", "flt", "fli", "even", "use", "week", "two", "anoth", "see", "make", "got", "said", "tonight", "tomorrow", "put", "year", "dfw", "today", "get", "yet", "number", "told", "day", "also", "morn", "min", "someon", "flight", "fleek", "tweet")
# Words that were taken out here but could still add sentiment to tweets:
# "gate", "plane", "travel", "pilot", "due", "mile", "made", "unit", "website", "night", "agent", "point", "onlin", "email", "amp", "keep", "miss", "system", "guy", "show", "think", "websit", "fleet"
# Remove other common words that don't help with sentiment, stem, strip whitespace, and build a document-term matrix
tweet_corpus_clean %>% tm_map(removeWords, stopwords2) %>%
  tm_map(stemDocument) %>%
  tm_map(stripWhitespace) -> tweet_corpus_clean
tweet_corpus_clean %>% DocumentTermMatrix() -> tweet_dtm
# Check the final text (now reflects the stop-word removal and stemming above)
tweet_corpus_clean[1:5] %>% lapply(as.character)
tweet_dtm
```
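The printout above reports the DTM's dimensions and sparsity; `inspect()` shows a small corner of the matrix itself, a quick sanity check that tokenization produced sensible terms:
```{r}
# Peek at the first few documents and terms of the document-term matrix
inspect(tweet_dtm[1:5, 1:8])
```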
#### Create training, validation, and test sets with labels
```{r}
# Create a random three-way split; note that sample() normalizes prob, so
# c(.8, .2, .2) yields roughly a 4:1:1 split (about 67/17/17), not 80/20/20
set.seed(123)
idx <- sample(seq(1, 3), size = nrow(tweet_dtm), replace = TRUE, prob = c(.8, .2, .2))
#training, test, and validation
tweet_dtm_train <- tweet_dtm[idx == 1, ]
tweet_dtm_test <- tweet_dtm[idx == 2,]
tweet_dtm_validation <- tweet_dtm[idx == 3,]
#labels
tweet_train_labels <- tweets[idx == 1, ]$airline_sentiment
tweet_test_labels <- tweets[idx == 2, ]$airline_sentiment
tweet_validation_labels <- tweets[idx == 3, ]$airline_sentiment
#check that proportions are similar
tweet_train_labels %>% table() %>% prop.table()
tweet_test_labels %>% table() %>% prop.table()
tweet_validation_labels %>% table() %>% prop.table()
```
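Since caret is already loaded, a stratified split is a possible refinement: `createDataPartition()` samples within each sentiment class so all three sets keep identical class proportions. A sketch, not run here so the results below stay tied to the original split:
```{r, eval=FALSE}
# Stratified 4:1:1 alternative: partition within each sentiment class
set.seed(123)
train_idx <- createDataPartition(tweets$airline_sentiment, p = 2/3, list = FALSE)
rest <- setdiff(seq_len(nrow(tweets)), train_idx)
test_in_rest <- createDataPartition(tweets$airline_sentiment[rest], p = 0.5, list = FALSE)
test_idx <- rest[test_in_rest]
val_idx <- setdiff(rest, test_idx)
```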
#### Word cloud visualization
```{r}
tweet_corpus_clean %>% wordcloud(max.words = 150, min.freq = 5, random.order = F)
```
#### Subset the data to visualize common words for each sentiment
```{r, message=FALSE, warning=FALSE}
tweets %>% subset(airline_sentiment == "positive") -> positive
tweets %>% subset(airline_sentiment == "negative") -> negative
tweets %>% subset(airline_sentiment == "neutral") -> neutral
positive$text %>% wordcloud(max.words = 100, scale = c(3, .5))
negative$text %>% wordcloud(max.words = 100, scale = c(3, .5))
neutral$text %>% wordcloud(max.words = 100, scale = c(3, .5))
```
# Step 3: Training a model on the data
```{r}
# Two ways to shrink the vocabulary: removeSparseTerms() drops terms missing
# from more than 99.9% of documents (shown for comparison only), while
# findFreqTerms() keeps terms appearing at least 5 times; the frequent-term
# list is what the model below actually uses
tweet_dtm_train %>% removeSparseTerms(0.999) -> tweet_dtm_freq_train
tweet_dtm_freq_train
tweet_dtm_train %>% findFreqTerms(5) -> tweet_freq_words
str(tweet_freq_words)
```
#### Create DTMs with only the frequent terms
```{r}
tweet_dtm_freq_train <- tweet_dtm_train[ , tweet_freq_words]
tweet_dtm_freq_validation <- tweet_dtm_validation[ , tweet_freq_words]
tweet_dtm_freq_test <- tweet_dtm_test[ , tweet_freq_words]
```
#### Convert counts to "Yes"/"No" indicators, apply to the train/validation/test data, and train the model
```{r}
# Convert a term count into a "Yes"/"No" presence indicator
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
tweet_dtm_freq_train %>% apply(MARGIN = 2, convert_counts) -> tweet_train
tweet_dtm_freq_validation %>% apply(MARGIN = 2, convert_counts) -> tweet_validation
tweet_dtm_freq_test %>% apply(MARGIN = 2, convert_counts) -> tweet_test
tweet_train %>% naiveBayes(tweet_train_labels) -> tweet_classifier
```
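The fitted naiveBayes object stores the prior class distribution and, for every term, a table of conditional probabilities; peeking at one term shows what the model learned. The term "cancel" is only an illustrative guess at a frequent stem; substitute any name from `colnames(tweet_train)`:
```{r}
# Prior class distribution learned from the training labels
tweet_classifier$apriori
# Conditional distribution of one term given each class ("cancel" is an
# illustrative pick; returns NULL if that stem is not among the frequent terms)
tweet_classifier$tables[["cancel"]]
```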
# Step 4: Evaluate the model's performance on the validation dataset
```{r}
tweet_classifier %>% predict(tweet_validation) -> tweet_validation_pred
tweet_validation_pred %>% head(n = 15)
tweet_validation_pred %>% confusionMatrix(tweet_validation_labels) -> conf
conf
confusion_matrix <- as.data.frame(table(tweet_validation_pred, tweet_validation_labels))
confusion_matrix %>% ggplot(aes(x = tweet_validation_pred, y = tweet_validation_labels)) +
geom_tile(aes(fill = Freq)) +
geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
scale_fill_gradient(low = "#ff7f50",
high = "#003767",
trans = "log")
```
#### 75.97% classified correctly
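The headline accuracy (with its 95% confidence interval) can be pulled straight from the confusionMatrix object rather than read off the printout:
```{r}
conf$overall[c("Accuracy", "AccuracyLower", "AccuracyUpper")]
```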
# Step 5: Improve the model on the validation dataset
```{r}
# Retrain with Laplace smoothing so terms unseen for a class do not zero out its posterior
tweet_train %>% naiveBayes(tweet_train_labels, laplace = 1) -> tweet_classifier2
tweet_classifier2 %>% predict(tweet_validation) -> tweet_validation_pred2
tweet_validation_pred2 %>% confusionMatrix(tweet_validation_labels) -> conf2
conf2
confusion_matrix2 <- as.data.frame(table(tweet_validation_pred2, tweet_validation_labels))
confusion_matrix2 %>% ggplot(aes(x = tweet_validation_pred2, y = tweet_validation_labels)) +
geom_tile(aes(fill = Freq)) +
geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
scale_fill_gradient(low = "#ff7f50",
high = "#003767",
trans = "log")
```
#### 76.55% classified correctly
# Final test on the test dataset
```{r}
tweet_classifier2 %>% predict(tweet_test) -> tweet_test_pred
tweet_test_pred %>% confusionMatrix(tweet_test_labels) -> conf3
conf3
confusion_matrix3 <- as.data.frame(table(tweet_test_pred, tweet_test_labels))
confusion_matrix3 %>% ggplot(aes(x = tweet_test_pred, y = tweet_test_labels)) +
geom_tile(aes(fill = Freq)) +
geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
scale_fill_gradient(low = "#ff7f50",
high = "#003767",
trans = "log")
```
#### 77.42% classified correctly
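Accuracy alone hides the heavy class imbalance (negative tweets dominate); the per-class rows of the final confusion matrix give a fuller picture of where the model succeeds:
```{r}
# Per-class performance on the held-out test set
conf3$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")]
```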