# .Rhistory

#setwd("......") - setting the working directory
# install.packages('tidyverse')
# install.packages('tidytext')
# install.packages('wordcloud')
# install.packages('stringr')
# install.packages('igraph')
# install.packages('ggraph')
# install.packages('widyr')
# install.packages('broom')
# install.packages('DT')
# install.packages('irlba')
# install.packages('topicmodels')
# install.packages('tm')
# install.packages('caret')
# install.packages('glmnet')
# install.packages('textdata')
# install.packages('hrbrthemes')
# install.packages('xgboost')
# install.packages('ggplot2')
# install.packages('dplyr')
# install.packages('gridExtra')
# install.packages('ggthemes')
# install.packages('RColorBrewer')
# install.packages('grid')
# install.packages('viridis')
# install.packages('lubridate')
# install.packages('readr')
# install.packages('plotly')
# install.packages('tidyr')
library(tidyr) #create tidydata
library(tidyverse)
library(tidytext) # text manipulation
library(wordcloud) # word cloud
library(stringr) #string manipulation
library(igraph)
library(ggraph)
library(widyr)
library(broom)
library(DT)
library(irlba)
library(topicmodels) # for LDA topic modelling
library(tm) # general text mining functions, making document term matrixes
library(caret)
library(glmnet)
library(textdata) #To borrow an amazing package from notebook
library(xgboost) #To borrow an amazing package from notebook
library(ggplot2) #Plotting graph
library(dplyr)
library(gridExtra)
library(ggthemes)
library(RColorBrewer)#COLOUR
library(grid)
library(viridis)
library(lubridate)
library(readr)
library(plotly) #Plotting graph
library(hrbrthemes)
library(gganimate)
# NOTE: the former `rm(list = ls())` call was dropped; wiping the caller's
# workspace is a side effect a script should not have.

# Fill colours shared by the bar charts below.
fillColor <- "#FFA07A"
fillColor2 <- "#F1C40F"

# Read the episode metadata and the dialogue scripts from CSV.
episode <- read_csv("episode_info.csv")
scripts <- read_csv("scripts.csv")

# The first column of scripts.csv has no header. readr names it `X1` or `...1`
# depending on its version, so it is renamed by position rather than by name.
names(scripts)[1] <- "postID"
scripts <- scripts %>%
  rename(text = Dialogue)

# Show the column types of both data sets.
glimpse(episode)
glimpse(scripts)

# The spreadsheet viewer is only opened in an interactive session.
if (interactive()) {
  View(episode)
  View(scripts)
}
# Find the ten characters with the most lines of dialogue and chart them.
Top10Characters <- scripts %>%
  count(Character) %>%
  rename(Count = n) %>%
  arrange(desc(Count)) %>%
  # Order the factor levels by line count so the bars come out sorted.
  mutate(Character = reorder(Character, Count)) %>%
  head(10)

ggplot(Top10Characters, aes(x = Character, y = Count)) +
  geom_col(colour = "white", fill = fillColor2) +
  # Print the count at the base of each bar.
  geom_text(
    aes(x = Character, y = 1, label = paste0("(", Count, ")")),
    hjust = 0, vjust = .5, size = 4, colour = "black", fontface = "bold"
  ) +
  labs(
    x = "Character",
    y = "Count",
    title = "Character and Count: Which Character spoke more?"
  ) +
  coord_flip() +
  theme_bw()
# Compare the median line length (in characters) of the ten most talkative
# characters.
scripts$len <- str_count(scripts$text)

scriptsTopTenCharacters <- filter(
  scripts,
  Character %in% Top10Characters$Character
)

scriptsTopTenCharacters %>%
  group_by(Character) %>%
  summarise(CountMedian = median(len, na.rm = TRUE)) %>%
  ungroup() %>%
  # Sort the bars by median length.
  mutate(Character = reorder(Character, CountMedian)) %>%
  ggplot(aes(x = Character, y = CountMedian)) +
  geom_col(colour = "white", fill = fillColor2) +
  geom_text(
    aes(x = Character, y = 1, label = paste0("(", CountMedian, ")")),
    hjust = 0, vjust = .5, size = 4, colour = "black", fontface = "bold"
  ) +
  labs(
    x = "Character",
    y = "Count",
    title = "Character and Count: Which Character Spoke Long Sentences?"
  ) +
  coord_flip() +
  theme_bw()
# Tokenisation: split each line of dialogue into one word per row and show
# the first ten rows.
head(unnest_tokens(scripts, word, text), 10)

# Same preview after removing the common English stop words listed in
# tidytext's `stop_words` table.
head(
  filter(
    unnest_tokens(scripts, word, text),
    !(word %in% stop_words$word)
  ),
  10
)
#Removing the Stop words
#Top Ten  most Common Words Overall Characters
# Bar chart of the ten most frequent non-stop-words in a script table.
#
# train: data frame with a `text` column holding the dialogue.
# title: plot title.
# Returns the ggplot object.
createBarPlotCommonWords <- function(train, title) {
  # Tokenise, drop stop words and count each remaining word, most common first.
  word_counts <- train %>%
    unnest_tokens(word, text) %>%
    filter(!word %in% stop_words$word) %>%
    count(word, sort = TRUE) %>%
    ungroup()

  # Reverse the level order so the most frequent word is drawn at the top
  # once the axes are flipped.
  word_counts$word <- factor(
    word_counts$word,
    levels = rev(unique(word_counts$word))
  )

  ggplot(head(word_counts, 10), aes(x = word, y = n)) +
    geom_col(colour = "white", fill = fillColor) +
    geom_text(
      aes(x = word, y = 1, label = paste0("(", n, ")")),
      hjust = 0, vjust = .5, size = 4, colour = "black", fontface = "bold"
    ) +
    labs(x = "Word", y = "Word Count", title = title) +
    coord_flip() +
    theme_bw()
}
# Ten most common words across the whole script.
createBarPlotCommonWords(scripts, "Top 10 most Common Words Overall")

# Ten most common words for Jerry (any speaker label containing "JERRY").
createBarPlotCommonWords(
  filter(scripts, str_detect(Character, "JERRY")),
  "Top 10 most Common Words Of Jerry"
)

# Ten most common words for George (any speaker label containing "GEORGE").
createBarPlotCommonWords(
  filter(scripts, str_detect(Character, "GEORGE")),
  "Top 10 most Common Words Of George"
)

# Ten most common words for Elaine (any speaker label containing "ELAINE").
createBarPlotCommonWords(
  filter(scripts, str_detect(Character, "ELAINE")),
  "Top 10 most Common Words Of Elaine"
)
#Count bigrams (pairs of consecutive words), ignoring pairs that contain a stop word
# Count consecutive word pairs (bigrams) in the `text` column of `dataset`.
#
# Pairs in which either word is a stop word are discarded. Lines too short to
# form a bigram are tokenised as NA by recent tidytext versions; those rows
# are dropped so that an NA/NA pair never appears in the counts.
#
# Returns a data frame with columns word1, word2 and n, sorted by descending n.
count_bigrams <- function(dataset) {
  dataset %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    filter(!is.na(bigram)) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(
      !word1 %in% stop_words$word,
      !word2 %in% stop_words$word
    ) %>%
    count(word1, word2, sort = TRUE)
}
# Draw a directed word network from a bigram count table.
#
# bigrams: data frame whose first two columns are the word pair and which has
#          a count column `n` (as produced by count_bigrams()).
# Returns the ggraph plot object.
visualize_bigrams <- function(bigrams) {
  # Fixed seed so repeated calls produce the same layout.
  set.seed(2016)
  edge_arrow <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  word_graph <- graph_from_data_frame(bigrams)

  ggraph(word_graph, layout = "fr") +
    # Edge opacity is mapped to the bigram count.
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = edge_arrow) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}
# Variant of visualize_bigrams() whose arrows stop short of the target node
# (end cap of 0.07 inches) so the arrow heads are not hidden by the points.
#
# bigrams: data frame whose first two columns are the word pair and which has
#          a count column `n`.
# Returns the ggraph plot object.
visualize_bigrams_individual <- function(bigrams) {
  # Fixed seed so repeated calls produce the same layout.
  set.seed(2016)
  edge_arrow <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  word_graph <- graph_from_data_frame(bigrams)

  ggraph(word_graph, layout = "fr") +
    geom_edge_link(
      aes(edge_alpha = n),
      show.legend = FALSE,
      arrow = edge_arrow,
      end_cap = circle(.07, 'inches')
    ) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}
# Bar chart of the ten most frequent bigrams that contain no stop word.
scripts %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Lines with fewer than two words yield an NA bigram; without this filter
  # they would be counted together as a bogus "NA NA" entry.
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  ) %>%
  unite(bigramWord, word1, word2, sep = " ") %>%
  group_by(bigramWord) %>%
  tally() %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  # Order the factor levels by count so the bars come out sorted.
  mutate(bigramWord = reorder(bigramWord, n)) %>%
  head(10) %>%
  ggplot(aes(x = bigramWord, y = n)) +
  geom_bar(stat = "identity", colour = "white", fill = fillColor2) +
  geom_text(
    aes(x = bigramWord, y = 1, label = paste0("(", n, ")")),
    hjust = 0, vjust = .5, size = 4, colour = "black", fontface = "bold"
  ) +
  labs(x = "Bigram", y = "Count", title = "Bigram and Count") +
  coord_flip() +
  theme_bw()
# Bar chart of the ten most frequent trigrams (three consecutive words) that
# contain no stop word.
scripts %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  # Lines with fewer than three words yield an NA trigram; without this filter
  # they would be counted together as a bogus "NA NA NA" entry.
  filter(!is.na(trigram)) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word
  ) %>%
  unite(trigramWord, word1, word2, word3, sep = " ") %>%
  group_by(trigramWord) %>%
  tally() %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  # Order the factor levels by count so the bars come out sorted.
  mutate(trigramWord = reorder(trigramWord, n)) %>%
  head(10) %>%
  ggplot(aes(x = trigramWord, y = n)) +
  geom_bar(stat = "identity", colour = "white", fill = fillColor) +
  geom_text(
    aes(x = trigramWord, y = 1, label = paste0("(", n, ")")),
    hjust = 0, vjust = .5, size = 4, colour = "black", fontface = "bold"
  ) +
  labs(x = "Trigram", y = "Count", title = "Trigram and Count") +
  coord_flip() +
  theme_bw()
# Build the bigram count table once and draw the network of pairs that occur
# more than ten times.
trainWords <- count_bigrams(scripts)

visualize_bigrams(filter(trainWords, n > 10))

# Network restricted to bigrams that mention jerry, george, elaine or kramer
# and occur more than five times.
visualize_bigrams(
  filter(
    trainWords,
    word1 %in% c("jerry", "george", "elaine", "kramer") |
      word2 %in% c("jerry", "george", "elaine", "kramer"),
    n > 5
  )
)
#Sentiment Analysis using **NRC Sentiment lexicon**
#The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive).
#Checking how feared is he
# Count the words a character uses that the NRC lexicon tags with an emotion.
#
# emotion:   NRC sentiment label to keep, e.g. 'fear', 'joy', 'surprise'.
# Character: speaker label to match exactly against scripts$Character.
#
# Reads the global `scripts` table (columns `Character` and `text`).
# Returns a data frame with columns `word` and `Count`, sorted by descending
# Count.
getEmotionalWords <- function(emotion, Character) {
  # Inside filter() the bare name `Character` refers to the column, so
  # `Character == Character` compared the column with itself and never
  # filtered anything. Copy the argument to a name that is not a column.
  target_character <- Character

  nrcEmotions <- get_sentiments("nrc") %>%
    filter(sentiment == emotion)

  scripts %>%
    filter(Character == target_character) %>%
    unnest_tokens(word, text) %>%
    filter(!word %in% stop_words$word) %>%
    inner_join(nrcEmotions, by = "word") %>%
    group_by(word) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))
}
# Fear: words tagged 'fear' by the NRC lexicon, shown as a table of the top
# ten and as a word cloud of up to thirty words.
FearWordsJerry <- getEmotionalWords('fear', 'JERRY')
FearWordsJerry %>%
  head(10) %>%
  datatable(
    style = "bootstrap", class = "table-condensed",
    options = list(dom = 'tp', scrollX = TRUE)
  )
with(
  FearWordsJerry,
  wordcloud(word, Count, max.words = 30, colors = brewer.pal(8, "Dark2"))
)

# Joy: same table and word cloud for words tagged 'joy'.
JoyWordsJerry <- getEmotionalWords('joy', 'JERRY')
JoyWordsJerry %>%
  head(10) %>%
  datatable(
    style = "bootstrap", class = "table-condensed",
    options = list(dom = 'tp', scrollX = TRUE)
  )
with(
  JoyWordsJerry,
  wordcloud(word, Count, max.words = 30, colors = brewer.pal(8, "Dark2"))
)

# Surprise: same table and word cloud for words tagged 'surprise'.
SurpriseWordsJerry <- getEmotionalWords('surprise', 'JERRY')
SurpriseWordsJerry %>%
  head(10) %>%
  datatable(
    style = "bootstrap", class = "table-condensed",
    options = list(dom = 'tp', scrollX = TRUE)
  )
with(
  SurpriseWordsJerry,
  wordcloud(word, Count, max.words = 30, colors = brewer.pal(8, "Dark2"))
)
# In terms of percentages, how often did each character speak?
#
# Lines per speaker, most talkative first. Speaker labels containing 'setting'
# are excluded from the whole chart (previously they were excluded from the
# named list only).
character_line_counts <- scripts %>%
  group_by(Character) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  filter(!grepl('setting', tolower(Character))) %>%
  arrange(desc(n))

# Keep the 49 most talkative speakers by name and fold everyone else into a
# single 'Others' row. The split is derived from the data instead of a
# hard-coded row count, so it stays correct whatever the number of speakers.
n_named <- min(49, nrow(character_line_counts))
named_characters <- head(character_line_counts, n_named)
remaining_characters <- tail(
  character_line_counts,
  nrow(character_line_counts) - n_named
)

top_50 <- named_characters
if (nrow(remaining_characters) > 0) {
  top_50 <- bind_rows(
    named_characters,
    tibble(Character = 'Others', n = sum(remaining_characters$n))
  )
}

p <- top_50 %>%
  mutate(tot = sum(n)) %>%
  mutate(percentage = n / tot) %>%
  ggplot(aes(x = reorder(Character, percentage), y = percentage)) +
  geom_bar(stat = 'identity') +
  theme_ipsum() +
  coord_flip() +
  labs(
    x = 'Characters',
    y = 'Percentage',
    title = 'Percentage of Dialogues by Character'
  ) +
  scale_y_continuous(labels = scales::percent) +
  # Print each share, rounded to two decimals, next to its bar.
  geom_text(
    aes(label = stringr::str_c(as.character(round(percentage * 100, 2)), "%")),
    position = position_dodge(width = 0.9),
    hjust = -0.5
  )
p
## Data: reload both files so this section starts from the raw columns.
episodes <- read_csv('episode_info.csv')
scripts <- read.csv('scripts.csv', stringsAsFactors = FALSE)
scripts$Dialogue <- str_to_lower(scripts$Dialogue)

# Lexical diversity: the number of distinct words divided by the total number
# of words, computed per main character and per SEID prefix.
# Both counts come from a single tokenisation pass; the earlier version
# tokenised the script twice and joined the two results back together.
scripts %>%
  select(SEID, Character, Dialogue) %>%
  # Keep the first three characters of SEID (presumably the season part,
  # e.g. "S01" -- confirm against the data).
  mutate(SEID = stringr::str_sub(SEID, 1, 3)) %>%
  filter(Character %in% c("JERRY", "ELAINE", "GEORGE", "KRAMER")) %>%
  unnest_tokens(word, Dialogue) %>%
  group_by(SEID, Character) %>%
  summarise(
    total_number_of_words = n(),
    n_unique = n_distinct(word)
  ) %>%
  ungroup() %>%
  mutate(percentage_diversity = n_unique / total_number_of_words) %>%
  ggplot(aes(x = Character, y = percentage_diversity)) +
  geom_bar(stat = 'identity') +
  theme_ipsum() +
  facet_wrap(~SEID) +
  coord_flip() +
  scale_y_continuous(labels = scales::percent) +
  labs(
    y = 'Percentage Diversity',
    title = 'Lexical Diversity Amongst Characters across Seasons'
  )
# One way to visualise who talks to whom is to look at consecutive speakers.
# The speaker labels (with inner spaces removed) are joined into one long
# string and split into bigrams with tidytext's unnest_tokens(); the number of
# times a speaker pair occurs is used as the edge strength. This is only
# approximate because pairs are also formed across scene boundaries.
data.frame(
  Character = paste(
    gsub(" ", "", scripts$Character, fixed = TRUE),
    collapse = ' '
  ),
  stringsAsFactors = FALSE
) %>%
  unnest_tokens(ngram, Character, token = "ngrams", n = 2) %>%
  tidyr::separate(ngram, into = c('ch1', 'ch2'), sep = ' ') %>%
  group_by(ch1, ch2) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  # Pairs seen more than 1000 times are labelled 'Strong'.
  mutate(strength = ifelse(n > 1000, 'Strong', 'Weak')) %>%
  # Only pairs seen more than ten times are drawn.
  filter(n > 10) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n, edge_colour = strength), alpha = 0.2) +
  geom_node_point() +
  geom_node_text(
    aes(label = name),
    vjust = 1, hjust = 1, check_overlap = TRUE,
    size = 5, color = '#109876', fontface = 'bold'
  ) +
  theme_ipsum() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    legend.position = 'none'
  )
## Data: reload both files so this section starts from the raw columns.
episodes <- read_csv('episode_info.csv')
scripts <- read.csv('scripts.csv', stringsAsFactors = FALSE)

# Flag and strip the "(1)" / "(2)" part markers at the end of a title.
# The markers are matched literally: as plain regular expressions "(1)" and
# "(2)" are capture groups, so any title containing a 1 or 2 was flagged and
# lost its last three characters. The whitespace before the marker is removed
# as well, so no trailing space is left behind.
episodes$strfix <- str_detect(episodes$Title, fixed("(1)"))
episodes$strfix2 <- str_detect(episodes$Title, fixed("(2)"))
episodes$Title <- str_trim(
  str_replace(episodes$Title, "\\s*\\((1|2)\\)\\s*$", "")
)
episodes$Title[episodes$Title == 'The Reverse Peephole (a.k.a. The Man Fur)'] <- 'The Reverse Peephole'

# Compare titles and dialogue in lower case.
episodes$Title <- str_to_lower(episodes$Title)
scripts$Dialogue <- str_to_lower(scripts$Dialogue)

## Merge each script line with its episode's metadata.
library(dplyr)
seinfeld <- scripts %>%
  left_join(episodes, by = c('EpisodeNo', 'SEID'))

## Count how often the title occurs in each line of dialogue.
# A leading "the" is removed from the title; the space that followed it is
# kept, as before.
seinfeld$New_Title <- ifelse(
  str_sub(seinfeld$Title, 1, 3) == 'the',
  str_sub(seinfeld$Title, 4, -1),
  seinfeld$Title
)
# fixed() makes the title a literal string, so characters such as "." or "("
# in a title are not interpreted as regular-expression syntax.
seinfeld$count <- str_count(seinfeld$Dialogue, fixed(seinfeld$New_Title))

# Total mentions per episode.
title_count <- seinfeld %>%
  group_by(Title, New_Title, Season.x, Director, Writers) %>%
  summarise(count = sum(count)) %>%
  ungroup()
title_count$Title <- str_to_title(title_count$Title)
# The original author recorded a mean of 8.24 here; the value may differ now
# that titles are cleaned and matched literally.
mean(title_count$count)

#### Plot the episodes whose title is said at least 20 times.
top_episodes <- subset(title_count, count >= 20)
top_episodes$Season.x <- as.factor(top_episodes$Season.x)
top_episodes <- top_episodes %>% mutate(Season = Season.x)
g <- ggplot(top_episodes, aes(x = Title, y = count)) +
  geom_bar(stat = 'identity', aes(fill = Season)) +
  ggtitle("The Most Times The Seinfeld Episode Title is Mentioned in The Episode") +
  theme_bw() +
  ylab("Number of Times The Title is Said in The Episode") +
  # 90 degrees is the same rotation as the previous value of 450.
  theme(
    axis.text.x = element_text(face = 'bold', angle = 90),
    plot.title = element_text(hjust = 0.5)
  )
g
print(ggplotly(g)) # interactive version of the same chart
#scripts <- read.csv(file.path("input", "scripts.csv"), stringsAsFactors = FALSE)
scripts <- read.csv("scripts.csv", stringsAsFactors = FALSE)
main_characters <- c("ELAINE", "GEORGE", "JERRY", "KRAMER")

# Initial cleaning.
# Speaker names are not always in capitals; normalise them.
scripts$Character <- toupper(scripts$Character)

# Normalise stage-direction brackets in both text columns:
#   1. turn square brackets into round ones,
#   2. remove whitespace that follows an opening bracket.
# The conversion has to happen first; in the opposite order "[ text" became
# "( text" and kept its space.
scripts[c("Character", "Dialogue")] <- lapply(
  scripts[c("Character", "Dialogue")],
  function(column) {
    column <- gsub("\\[", "(", column)
    column <- gsub("\\]", ")", column)
    gsub("\\(\\s+", "(", column)
  }
)

# Lines whose dialogue contains at least one opening bracket.
w_stage_direction <- scripts %>%
  filter(grepl("\\(", Dialogue))
# Extract the bracketed stage directions from one script line.
#
# row_i:      row index into `directions`.
# directions: data frame with columns Character, Dialogue, EpisodeNo, SEID and
#             Season; defaults to the global `w_stage_direction`.
#
# Returns a data frame with one row per direction found (columns Character,
# direction, EpisodeNo, SEID, Season). If no closed bracket is found, a single
# all-NA row with the same columns is returned.
extract_directions <- function(row_i, directions = w_stage_direction) {
  # The speaker label is included because it can itself contain a bracket.
  text <- paste(directions$Character[row_i], directions$Dialogue[row_i])
  text_splits <- unlist(strsplit(text, split = "\\("))

  # The first piece is the text before the first "(", so it can never be a
  # direction, even if it contains a stray ")".
  candidates <- text_splits[-1]
  direction_closes <- candidates[grepl("\\)", candidates)]

  # Some brackets are opened but never closed. These are skipped because it is
  # hard to tell where the direction should end, or whether the whole line is
  # a direction.
  if (length(direction_closes) < 1) {
    return(data.frame(direction = NA, Character = NA, EpisodeNo = NA,
                      SEID = NA, Season = NA))
  }

  data.frame(
    Character = directions$Character[row_i],
    # Keep only the text up to the closing bracket.
    direction = gsub("\\).*$", "", direction_closes),
    EpisodeNo = directions$EpisodeNo[row_i],
    SEID = directions$SEID[row_i],
    Season = directions$Season[row_i],
    stringsAsFactors = FALSE
  )
}
# Extract the directions of every bracketed line and stack them in one table.
# seq_len() keeps the loop empty when there are no rows (1:0 would not).
stage_direction_data_list <- lapply(
  seq_len(nrow(w_stage_direction)),
  extract_directions
)
stage_direction_DF <- do.call("rbind", stage_direction_data_list)

# extract_directions() returns an all-NA row for lines with an unclosed
# bracket. Drop those rows; otherwise they are later counted as directions
# given to "other" characters.
stage_direction_DF <- stage_direction_DF[!is.na(stage_direction_DF$direction), ]

# Extract the first word of each direction.
stage_direction_DF$first_word <- tolower(
  gsub("\\s.*$", "", stage_direction_DF$direction)
)

# Exclude directions that start with a stop word.
stage_direction_DF <- stage_direction_DF %>%
  filter(!first_word %in% stopwords())

# If the first word is a main character's name, the direction is aimed at
# that character: reassign the row to them and use the second word instead.
names_first <- stage_direction_DF$first_word %in% tolower(main_characters)
not_me <- stage_direction_DF[names_first, ]
not_me$Character <- toupper(not_me$first_word)
# "\\S" is used for "non-whitespace"; inside a bracket expression such as
# "[^\\s]" the default regex engine reads "\\s" as the two literal characters
# backslash and "s", which removed more than the first word.
not_me$first_word <- gsub(
  "\\s.*$", "",
  sub("^\\S+\\s+", "", tolower(not_me$direction))
)

stage_direction_DF <- rbind(stage_direction_DF[!names_first, ], not_me)
# Work out which first words are used to direct which main character:
# one row per word, one count column per character.
main_character_directions <- stage_direction_DF %>%
  filter(tolower(Character) %in% tolower(main_characters)) %>%
  group_by(Character, first_word) %>%
  summarise(ocs = n()) %>%
  ungroup() %>%
  spread(Character, ocs)

# Total occurrences of each word. The character columns are selected by name
# rather than by position 2:5, so this also works when one of the four main
# characters has no directions at all.
character_columns <- setdiff(names(main_character_directions), "first_word")
main_character_directions$all_count <- rowSums(
  main_character_directions[, character_columns, drop = FALSE],
  na.rm = TRUE
)

# Back to long format, with each character's share of every word's total,
# for plotting.
main_character_directions_stats <- main_character_directions %>%
  gather(Character, character_count, -all_count, -first_word) %>%
  filter(!is.na(character_count)) %>%
  mutate(character_proportion = character_count / all_count)
# How many stage directions does each main character receive? Every speaker
# outside the main four is pooled under "other".
most_directions <- stage_direction_DF %>%
  mutate(
    Character = ifelse(Character %in% main_characters, Character, "other")
  ) %>%
  group_by(Character) %>%
  summarise(direction_count = n())

p <- ggplot(
  most_directions,
  aes(x = Character, y = direction_count, fill = Character)
) +
  geom_col() +
  scale_fill_manual(values = brewer.pal(5, "Set2")) +
  ggtitle("Which character is given direction/priority in their respective given scripts?")
ggplotly(p, tooltip = c("direction_count"))
# How many lines does each main character have? Every speaker outside the
# main four is pooled under "other".
most_lines <- scripts %>%
  mutate(
    Character = ifelse(Character %in% main_characters, Character, "other")
  ) %>%
  group_by(Character) %>%
  summarise(num_lines = n())

# Bar chart of the line counts, shown as an interactive plotly widget.
p <- ggplot(most_lines, aes(x = Character, y = num_lines, fill = Character)) +
  geom_col() +
  scale_fill_manual(values = brewer.pal(5, "Set2")) +
  ggtitle("How many number of lines are given to each character in their scripts?")
ggplotly(p, tooltip = c("num_lines"))
# Stage directions per line of dialogue for each character group.
directions_per_line <- most_lines %>%
  left_join(most_directions, by = "Character") %>%
  mutate(directions_per_line = direction_count / num_lines)

# num_lines and direction_count are passed as extra aesthetics only so that
# they can be shown in the plotly tooltip.
p <- ggplot(
  directions_per_line,
  aes(
    x = Character, y = directions_per_line, fill = Character,
    num_lines = num_lines, direction_count = direction_count
  )
) +
  geom_col() +
  scale_fill_manual(values = brewer.pal(5, "Set2")) +
  ggtitle("Comparing the directions of in a script, which lines are more prioritized by which character?")
ggplotly(p, tooltip = c("num_lines", "direction_count", "directions_per_line"))
# For each main character, take the seven most frequent first words of their
# stage directions and show how each word is shared between the characters.
top_words <- main_character_directions_stats %>%
  group_by(Character) %>%
  top_n(7, wt = character_count)

# All per-character statistics for any word that made someone's top seven.
top_words_data <- filter(
  main_character_directions_stats,
  first_word %in% top_words$first_word
)
top_words_data$Character_number <- as.numeric(as.factor(top_words_data$Character))

# Order the words on the axis by the third main character's (JERRY's) share.
first_word_order <- top_words_data %>%
  filter(Character == main_characters[3]) %>%
  arrange(character_proportion) %>%
  .$first_word
top_words_data$first_word_f <- factor(
  top_words_data$first_word,
  levels = first_word_order
)

# text1..text3 are extra aesthetics used only for the plotly tooltip.
p <- top_words_data %>%
  arrange(Character, character_proportion) %>%
  ggplot(aes(
    x = first_word_f, y = character_proportion, fill = Character,
    text3 = character_count, text2 = first_word, text1 = Character
  )) +
  scale_fill_manual(values = brewer.pal(4, "Set2")) +
  ggtitle("Information details of each characters top/first words that are used in their scripts") +
  geom_col() +
  coord_flip()
ggplotly(p, tooltip = c("text1", "text2", "text3"))
#p
# Number of lines per character in each season: keep the 20 busiest speakers
# of every season, skipping speaker labels that start with a bracket.
top <- scripts %>%
  filter(!grepl("^\\[|^\\(", Character)) %>%
  group_by(Season, Character) %>%
  summarise(count = n()) %>%
  arrange(-count) %>%
  # Still grouped by Season here, so this is the top 20 within each season.
  top_n(20, count) %>%
  ungroup() %>%
  as.data.frame()

# Named recurring characters keep their own colour; everyone else is "OTHER".
top$Char <- ifelse(
  top$Character %in% c('JERRY', 'GEORGE', 'ELAINE', 'KRAMER', 'NEWMAN', 'PUDDY',
                       'PETERMAN', 'ESTELLE', 'SUSAN', 'HELEN', 'MORTY', 'FRANK'),
  top$Character,
  'OTHER'
)

# The counts are already computed, so a plain identity bar is used;
# geom_histogram(stat = 'identity') draws the same bars but is a misuse of
# the histogram geom.
top %>%
  ggplot(aes(x = factor(Season), y = count, fill = Char)) +
  geom_bar(stat = 'identity', color = 'white', size = .2) +
  theme_fivethirtyeight() +
  scale_fill_manual(name = '', values = colorRampPalette(brewer.pal(11, "Paired"))(13)) +
  guides(fill = guide_legend(ncol = 8)) +
  labs(title = 'Number of lines per character and season')
# Reshape the data into the format comparison.cloud() expects: one "document"
# per main character, made of the first words of all their stage directions.
comp_clould_data <- stage_direction_DF %>%
  filter(tolower(Character) %in% tolower(main_characters)) %>%
  group_by(Character) %>%
  summarise(char_directions = paste(first_word, collapse = " "))

# Term-document matrix with one column per character.
all <- comp_clould_data$char_directions
corpus <- Corpus(VectorSource(all))
tdm <- as.matrix(TermDocumentMatrix(corpus))
colnames(tdm) <- comp_clould_data$Character

# Word cloud contrasting the words of the four characters.
comparison.cloud(
  tdm,
  random.order = FALSE,
  colors = brewer.pal(4, "Set1"),
  title.size = 1.5,
  rot.per = 0
)