-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWine Assigment.R
226 lines (182 loc) · 7.91 KB
/
Wine Assigment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#install.packages('readr')
#install.packages('dplyr')
#install.packages('ggplot2')
#install.packages('tm')
#install.packages('tmap')
#install.packages('wordcloud')
#install.packages('RColorBrewer')
#install.packages('gridExtra')
#install.packages('kableExtra')
library(readr)
library(dplyr)
library(ggplot2)
library(tm)
library(tmap)
library(wordcloud)
library(RColorBrewer)
library(gridExtra)
library(kableExtra)
# Working directory the author used (intentionally left disabled):
# setwd("C:\\Users\\user\\Desktop\\WebA Assigment")
# Load the wine review data; the path is resolved against the working directory.
winere <- read_csv(file = "winemag-data-130k-v2.csv")
# Show the number of rows and columns that were read.
dim(winere)
# Output recorded by the author: [1] 129971 14
# Create word clouds for the designation and description columns
# Draw a word cloud of the most frequent terms in a set of texts.
#
# The texts are lower-cased, stripped of punctuation and English stop words,
# and turned into a document-term matrix; the column sums of that matrix are
# the term frequencies handed to wordcloud().
#
# Args:
#   documents: character vector, one text per element. Missing values are
#     dropped before the corpus is built.
#   max_terms: rank of the term whose frequency becomes the `min.freq`
#     cut-off. Defaults to 50, the value that used to be hard-coded. When
#     fewer distinct terms exist, the least frequent term sets the cut-off
#     instead of raising "subscript out of bounds".
#
# Returns:
#   Whatever wordcloud() returns; called for the plot it draws. When there is
#   nothing to plot, a warning is raised and NULL is returned invisibly.
makeWordCloud <- function(documents, max_terms = 50) {
  documents <- documents[!is.na(documents)]
  if (length(documents) == 0) {
    warning("makeWordCloud: no non-missing documents to plot", call. = FALSE)
    return(invisible(NULL))
  }

  corpus <- Corpus(VectorSource(tolower(documents)))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))

  frequencies <- DocumentTermMatrix(corpus)
  freq <- colSums(as.matrix(frequencies))
  if (length(freq) == 0) {
    warning("makeWordCloud: no terms left after cleaning", call. = FALSE)
    return(invisible(NULL))
  }

  # Clamp the rank so short inputs still yield a valid cut-off.
  cutoff_rank <- max(1, min(max_terms, length(freq)))
  wordcloud(
    names(freq), freq,
    min.freq = sort(freq, decreasing = TRUE)[[cutoff_rank]],
    colors = brewer.pal(8, "Dark2"),
    random.color = TRUE, random.order = FALSE
  )
}
# Word clouds built from the first 50 designations and first 50 descriptions.
makeWordCloud(winere$designation[1:50])
makeWordCloud(winere$description[1:50])
# Review counts per country, most-reviewed first, plus two derived columns:
#   totpcnt - the country's share of all reviews (rounded to 7 digits)
#   accum   - the running total of totpcnt
wineCountry <- winere %>%
  count(country) %>%
  rename(total = n) %>%
  arrange(desc(total)) %>%
  mutate(
    totpcnt = round(total / sum(total), digits = 7),
    accum = cumsum(totpcnt)
  )
wineCountry
# Bar chart of the ten most-reviewed countries, each bar labelled with its share.
wineCountry %>%
  head(10) %>%
  ggplot(
    aes(
      x = factor(
        country,
        levels = wineCountry$country[order(desc(wineCountry$totpcnt))]
      ),
      y = total
    )
  ) +
  geom_col() +
  geom_text(aes(label = sprintf("%.1f %%", 100 * totpcnt), y = total + 1500)) +
  labs(
    x = "Country",
    y = "Total Reviews",
    title = "Distribution of Wine Reviews by Top 10 Countries"
  )
# Mean review score per country, keeping the ten highest-rated countries.
# Plain summarise() replaces the deprecated summarise_at() + funs() pairing;
# the result column is still called `points`, which the plot below relies on.
wineRating <- winere %>%
  group_by(country) %>%
  summarise(points = mean(points, na.rm = TRUE)) %>%
  arrange(desc(points)) %>%
  head(10)
# Bar chart of the average rating for the countries held in wineRating,
# ordered from highest to lowest; the y axis is zoomed to the 85-92 range.
ggplot(wineRating, aes(x = reorder(country, -points), y = points)) +
  geom_col(fill = "navy") +
  coord_cartesian(ylim = c(85, 92)) +
  labs(
    x = "Countries",
    y = "Rating",
    title = "Top 10 Countries by Average Rating"
  )
# Two normality checks for the review points, shown side by side:
#   q1 - density histogram with a normal curve using the sample mean and sd
#   q2 - QQ plot of the standardised points against the line y = x
q1 <- ggplot(winere, aes(x = points, col = I("gray"))) +
  geom_histogram(aes(y = ..density..), binwidth = 1) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(winere$points), sd = sd(winere$points)),
    col = "blue"
  )
q2 <- ggplot(winere, aes(sample = c(scale(points)))) +
  stat_qq() +
  geom_abline(intercept = 0, slope = 1)
grid.arrange(q1, q2, nrow = 1)
# Collect the descriptions of Spanish wines and wrap them in a tm corpus.
# `%in%` is used instead of `==` because `==` returns NA for rows whose
# country is missing, and indexing with NA would add NA documents to the
# corpus; `%in%` never returns NA.
sw <- winere$description[winere$country %in% "Spain"]
sw <- VectorSource(sw)
sw <- VCorpus(sw)
# Normalise a tm corpus for term counting.
#
# Steps, in order: remove punctuation, remove numbers, lower-case, remove
# English stop words, stem, collapse whitespace.
#
# Args:
#   corpus: a tm corpus.
#
# Returns:
#   The transformed corpus.
clean_corpus <- function(corpus) {
  cleaned <- tm_map(corpus, removePunctuation)
  cleaned <- tm_map(cleaned, removeNumbers)
  cleaned <- tm_map(cleaned, content_transformer(tolower))
  cleaned <- tm_map(cleaned, removeWords, stopwords("en"))
  cleaned <- tm_map(cleaned, stemDocument)
  # Disabled step kept from the original: removal of a custom stop-word list.
  # cleaned <- tm_map(cleaned, removeWords, stop_wine)
  tm_map(cleaned, stripWhitespace)
}
# Clean the Spanish-review corpus and build its term-document matrix.
swc <- clean_corpus(sw)
swctdm <- TermDocumentMatrix(swc)
# Dense matrix of the TDM: one row per term, one column per document.
swcm <- as.matrix(swctdm)
# Total occurrences of each term, most frequent first.
sf <- sort(rowSums(swcm), decreasing = TRUE)
# Same frequencies as a data frame with columns `term` and `num`.
swf <- data.frame(term = names(sf), num = sf)
# Bar chart of the ten most frequent word stems, ordered by frequency.
# The chain ends at scale_y_continuous(): the previous trailing `+` made the
# wordcloud() call below part of the ggplot expression.
ggplot(
  data = head(swf, 10),
  aes(x = factor(term, levels = swf$term[order(-swf$num)]), y = num)
) +
  geom_col(fill = "red") +
  labs(
    x = "Word Stems",
    y = "Count",
    title = "Top 10 Word Stems in Spanish Wine Reviews"
  ) +
  scale_y_continuous(expand = c(0, 0), breaks = seq(0, 3200, 400))
# Word cloud of up to 50 stems, drawn as a separate plot.
# `colors` is spelled out rather than relying on partial matching of `color`.
wordcloud(swf$term, swf$num, max.words = 50, colors = "blue")
# Histogram of review points (one bin per point), zoomed to 75-100 with a
# tick at every integer.
ggplot(winere, aes(x = points)) +
  geom_histogram(binwidth = 1, color = "white") +
  coord_cartesian(xlim = c(75, 100)) +
  labs(
    title = "Points Histogram",
    x = "Points Given by Reviewer",
    y = "Number of Reviews"
  ) +
  scale_x_continuous(breaks = seq(75, 100, by = 1))
# Number of reviews per country.
countries <- winere %>%
  group_by(country) %>%
  count()
# Countries with more than 500 reviews.
top_countries <- filter(countries, n > 500)
# Country and points for reviews from those countries only.
Best_Wines <- winere %>%
  filter(country %in% top_countries$country) %>%
  select(country, points)
# Table of mean score per country, highest first.
Best_Wines %>%
  group_by(country) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score)) %>%
  kable()
# Compare the points distributions of Austria and Chile (the two countries
# the plot title refers to as top and bottom) with overlaid density curves.
Best_Worst <- filter(Best_Wines, country %in% c("Austria", "Chile"))
ggplot(Best_Worst, aes(points, colour = country, fill = country)) +
  geom_density(alpha = .4) +
  labs(
    title = "Point Densities of Top and Bottom Countries with at Least 500 Reviews",
    x = "Points Given",
    y = "Density"
  )
# Score against price: keep only rows where both price and points are present,
# then draw a jittered scatter limited to price 0-4000 and score 75-100.
I_Wines <- winere %>%
  select(price, points) %>%
  filter(!is.na(price), !is.na(points))
ggplot(I_Wines, aes(x = price, y = points)) +
  geom_jitter(shape = 1) +
  coord_cartesian(xlim = c(0, 4000), ylim = c(75, 100)) +
  labs(title = "Score vs Price", x = "Price", y = "Score")
# Bucket the review points into three grades:
#   points > 91 -> "Good", points > 86 -> "Average", otherwise "Bad".
Wine_Eng <- winere %>%
  mutate(
    grade = ifelse(points > 91, "Good", ifelse(points > 86, "Average", "Bad"))
  )
# The level order is given explicitly (Good, Average, Bad). The earlier
# positional reordering of the alphabetical levels, `[c(3, 1, 2)]`, produced
# the same order only when all three grades were present in the data.
Wine_Eng$grade <- factor(Wine_Eng$grade, levels = c("Good", "Average", "Bad"))
# Price histogram in 5-dollar bins, stacked by grade and zoomed to wines
# priced under 100 dollars.
ggplot(Wine_Eng, aes(x = price, fill = grade)) +
  geom_histogram(binwidth = 5, color = "white") +
  coord_cartesian(xlim = c(0, 100)) +
  labs(
    title = "Price Histogram for Wines Under $100",
    x = "Price in Dollars",
    y = "Number of Reviews"
  ) +
  scale_x_continuous(breaks = seq(0, 100, by = 10))
# Price against points, coloured by country: faint individual points overlaid
# with count-sized markers, drawn with flipped axes.
ggplot(winere, aes(price, points, color = country)) +
  geom_point(alpha = 0.01) +
  geom_count() +
  theme_minimal() +
  labs(x = "Price", y = "Points", title = "Price Vs Ratings") +
  coord_flip() +
  scale_size_area(max_size = 10)
# Which wine type scores better?
# Varieties listed here are labelled "White Wine"; every other known variety
# is labelled "Red Wine". The list replaces a chain of `==` comparisons that
# contained "Gewürztraminer" twice.
white_varieties <- c(
  "Chardonnay",
  "Riesling",
  "Sauvignon Blanc",
  "White Blend",
  "Sparkling Blend",
  "Pinot Gris",
  "Champagne Blend",
  "Grüner Veltliner",
  "Pinot Grigio",
  "Portuguese White",
  "Viognier",
  "Gewürztraminer"
)
winere$wine_type <- ifelse(
  winere$variety %in% white_varieties, "White Wine", "Red Wine"
)
# `%in%` maps a missing variety to FALSE; keep such rows as NA, as the `==`
# chain did, rather than labelling them "Red Wine".
winere$wine_type[is.na(winere$variety)] <- NA
# Average score against average price per variety; point size is the number
# of reviews and colour is the wine type.
# `na.rm = TRUE` keeps one missing price from turning a variety's average
# into NA, which would leave that variety without a plottable point.
winere %>%
  group_by(variety, wine_type) %>%
  summarise(
    n = n(),
    avg_score = mean(points, na.rm = TRUE),
    avg_price = mean(price, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = avg_price, y = avg_score, size = n, colour = wine_type)) +
  geom_point() +
  # Named values tie each colour to its wine type instead of to level order.
  scale_color_manual(
    values = c("Red Wine" = "#CC3300", "White Wine" = "#FFCC00")
  )
# Price by continent.
# Country lists used to assign a continent code; any country not listed
# falls into "Other".
NAmerica <- c("US", "Canada", "Mexico")
# "Venezuela" was previously misspelled "Venezuala" and could never match.
SAmerica <- c(
  "Chile", "Argentina", "Brazil", "Colombia", "Uruguay",
  "Paraguay", "Bolivia", "Peru", "Venezuela"
)
Europe <- c(
  "Germany", "France", "England", "Spain", "Portugal", "Italy", "Austria"
)
# Note: the code "NA" is the literal string for North America, not a
# missing value.
winere$continent <- case_when(
  winere$country %in% NAmerica ~ "NA",
  winere$country %in% SAmerica ~ "SA",
  winere$country %in% Europe ~ "EU",
  TRUE ~ "Other"
)
# Blank out prices above 250. This overwrites winere$price, so everything
# that runs afterwards sees the capped prices.
winere$price <- ifelse(winere$price > 250, NA, winere$price)
ggplot(winere, aes(x = points, y = price, color = continent)) +
  geom_point()
# Regression model: log(price) explained by review points and wine type.
# An earlier duplicate of this fit used `data = new_data`, an object that is
# not defined anywhere in the visible script; it has been removed so the
# script no longer stops with "object 'new_data' not found".
Price_points_reg <- lm(log(price) ~ points + wine_type, data = winere)
summary(Price_points_reg)
# Draw the lm diagnostic plots on a 2 x 2 grid, then restore the previous
# graphics layout so later plots are not squeezed into quarter panels.
old_par <- par(mfrow = c(2, 2))
plot(Price_points_reg)
par(old_par)