prepare.py
import pandas as pd
import numpy as np
import os
import unicodedata
import re
import json
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
seed = 42
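# The functions below rely on several NLTK resources (tokenizer, stop word list,
# WordNet, POS tagger, and the VADER lexicon). If they are not already installed,
# a one-time download is assumed, e.g.:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('vader_lexicon')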
#---------------------------------------------
def prep_data(filename):
    '''
    Full preparation pipeline: pulls the raw data, cleans the title and summary text,
    builds the best-seller target, filters out rare genres, and adds lemmatized-summary
    and VADER sentiment features.
    '''
    df = get_data(filename)
    clean_article(df, 'title')
    clean_article(df, 'summary')
    df1 = pd.read_csv('fiction-and-non-fiction-top-best-sellers.csv', index_col=0)
    clean_article(df1, 'Book')
    ser = df1['cleaned_Book']
    creat_tar(df, ser)
    # remove rows whose genre appears fewer than 8 times (filtering)
    genre_counts = df['genre'].value_counts()
    genres_to_remove = genre_counts[genre_counts < 8].index
    df = df[~df['genre'].isin(genres_to_remove)]
    # dropping picture books
    df = df[df['genre'] != 'Picture Books']
    # cleaning, lemmatizing, and sentiment-scoring the book summaries
    df['lemmatized_summary'] = df['cleaned_summary'].apply(lemmatize_text)
    # VADER returns the keys 'neg', 'neu', 'pos', 'compound'; they are assigned to
    # these columns by position, so 'neutral' holds the 'neu' score
    df[['neg', 'neutral', 'pos', 'compound']] = df['summary'].apply(feat_sent)
    df['sentiment'] = df['compound'].apply(get_sentiment)
    return df
#-----pulling_the_data----------------------
def get_data(file):
    '''
    Pulls the current data from the 'almost_there' csv file and preps it for deeper cleaning.
    '''
    df = pd.read_csv(file, index_col=0)
    df = df.drop_duplicates(subset='title')
    # these four titles are missing a 'length' value but should be kept anyway
    save = ['Eleven on Top', 'Winter of the World', 'Nothing to Lose', 'Reflected in You']
    sub = df[df['length'].isna()]
    sub1 = sub[sub['title'].isin(save)]
    df = df.dropna(subset='length')
    df = pd.concat([df, sub1], axis=0)
    df = df.dropna(subset='summary')
    df = df.dropna(subset='year_published')
    df = df.reset_index()
    df = df.drop(columns=['index', 'book_tag'])
    # set explicit dtypes for the text and numeric columns
    df['summary'] = df['summary'].astype('string')
    df['title'] = df['title'].astype('string')
    df['author'] = df['author'].astype('string')
    df['genre'] = df['genre'].astype('string')
    df['length'] = df['length'].astype('float')
    return df
#-----create_target-------------------------
def creat_tar(df, ser):
    # flag a book as successful (True) when its cleaned title appears in the
    # cleaned best-seller titles; this is the modeling target
    best_sellers = set(ser.tolist())
    target_list = []
    for index, row in df.iterrows():
        if row['cleaned_title'] in best_sellers:
            target_list.append(1)
        else:
            target_list.append(0)
    # Add the 'successful' target column to the dataframe
    df['successful'] = target_list
    df['successful'] = df['successful'].astype(bool)
    return df
# -----clean_text---------------
def clean_article(df, col_name):
    '''
    Normalizes the text in col_name (ASCII-folds, lowercases, strips stray characters)
    and stores it in a new cleaned_<col_name> column.
    '''
    cleaned_summaries = []
    for summary in df[col_name]:
        # Normalize the text to ASCII and convert to lowercase
        cleaned_summary = unicodedata.normalize('NFKD', summary)\
            .encode('ascii', 'ignore')\
            .decode('utf-8', 'ignore')\
            .lower()
        # Keep only lowercase letters, digits, apostrophes, commas, periods, and whitespace
        cleaned_summary = re.sub(r"[^a-z0-9',\s.]", '', cleaned_summary)
        cleaned_summaries.append(cleaned_summary)
    df[f'cleaned_{col_name}'] = cleaned_summaries
    # astype returns a new Series, so assign it back to persist the dtype change
    df[f'cleaned_{col_name}'] = df[f'cleaned_{col_name}'].astype('string')
# -----lemmatize_and_Stopwords------------------------------
def lemmatize_text(text):
    """
    Lemmatizes input text using NLTK's WordNetLemmatizer.
    This function tokenizes the text, removes any non-alphabetic tokens and stop words,
    determines the part of speech of each remaining token, and lemmatizes it accordingly.
    Args:
        text (str): The text to lemmatize.
    Returns:
        str: The lemmatized text.
    """
    # Domain-specific stop words added to NLTK's English stop word list
    extra_stop_words = ['book', 'novel', 'work', 'title', 'character',
                        'fuck', 'asshole', 'bitch', 'cunt', 'dick', 'fucking',
                        'fucker', 'pussy', 'fag', 'edition', 'story', 'tale', 'genre',
                        'new york times', 'ny times', 'nyt', 'new', 'york',
                        'times', 'bestseller', 'author', 'bestselling', 'one', 'two']
    stop_words = set(stopwords.words('english')) | set(extra_stop_words)
    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Tokenize the text and convert to lowercase
    tokens = word_tokenize(text.lower())
    # Remove any non-alphabetic tokens and stop words
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    # Determine the part of speech of each token and lemmatize accordingly;
    # Penn Treebank tags start with J/R/V for adjectives/adverbs/verbs, so map
    # them to the WordNet POS codes ('a', 'r', 'v') and default to noun otherwise
    pos_tags = nltk.pos_tag(tokens)
    tag_map = {'j': 'a', 'r': 'r', 'v': 'v'}
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, pos=tag_map[tag[0].lower()])
        if tag[0].lower() in tag_map
        else lemmatizer.lemmatize(token)
        for token, tag in pos_tags
    ]
    # Join the lemmatized tokens back into a string
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text
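# Hedged illustration of the expected behavior: on a short string such as
# lemmatize_text("The cats were running quickly."), the stop words and punctuation
# are dropped and the remaining tokens are lemmatized by POS, so the result would
# likely be something like 'cat run quickly'.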
#------------sentiment_mapping------
def get_sentiment(compound):
    if compound <= -0.5:
        return 'very negative'
    elif compound < 0:
        return 'negative'
    elif compound >= 0.5:
        return 'very positive'
    elif compound > 0:
        return 'positive'
    else:
        return 'neutral'
#------------feature_sentiment_score------
def feat_sent(text):
    # Initialize the VADER sentiment analyzer
    analyzer = SentimentIntensityAnalyzer()
    book_synopsis = str(text)
    # get the sentiment scores for the synopsis
    sentiment_scores = analyzer.polarity_scores(book_synopsis)
    return pd.Series(sentiment_scores)
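#------------example_usage------
# Minimal sketch of how this module is meant to be run. The CSV filename below is
# an assumption based on get_data's docstring ('almost_there' csv); adjust as needed.
if __name__ == '__main__':
    prepared = prep_data('almost_there.csv')
    print(prepared[['title', 'genre', 'successful', 'sentiment']].head())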