-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexamprep.py
154 lines (86 loc) · 2.81 KB
/
examprep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# coding: utf-8
# ## Import
# In[1]:
import PyPDF2 #read the pdf
import matplotlib.pyplot as plt
import pandas as pdd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
# ## Getting the data
# In[2]:
# Open the source PDF and build a PyPDF2 reader over it; `total` is the
# page count used to drive the extraction loop and the topic count below.
# NOTE(review): PdfFileReader / numPages are the legacy PyPDF2 1.x API,
# removed in PyPDF2 3.x (PdfReader / len(reader.pages)) — confirm the
# pinned PyPDF2 version before upgrading.
# NOTE(review): the file handle is never closed; it must stay open while
# pages are extracted below, but should be closed once extraction ends.
file = open('JavaBasics-notes.pdf', 'rb')
fileReader = PyPDF2.PdfFileReader(file)
total = fileReader.numPages
# #### Getting the function for feature name
# In[3]:
def get_topics(model, feature_names, no_top_words):
    """Return one keyword string per topic in *model*.

    For each row of ``model.components_`` the ``no_top_words`` features
    with the largest weights are looked up in *feature_names* and joined
    with single spaces, highest-weighted feature first.
    """
    topics = []
    for component in model.components_:
        # argsort is ascending; taking the last no_top_words indices in
        # reverse yields the top-weighted features in descending order.
        top_indices = component.argsort()[:-no_top_words - 1:-1]
        topics.append(" ".join(feature_names[i] for i in top_indices))
    return topics
# In[4]:
# Extract the raw text of every page, one string per page.
# (Legacy PyPDF2 1.x getPage/extractText API, matching the reader above.)
tra = [fileReader.getPage(page_no).extractText() for page_no in range(total)]
# ### Algorithms:
# NMF : Non-negative Matrix Factorization
# LDA : Latent Dirichlet Allocation
# In[5]:
documents = tra

# NMF is able to use tf-idf features.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2
# (use get_feature_names_out()) — confirm the pinned sklearn version.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
idf = tfidf_vectorizer.idf_
# word -> idf weight; reuse the feature names computed above instead of
# calling get_feature_names() a second time.
x = dict(zip(tfidf_feature_names, idf))

# LDA can only use raw term counts because it is a probabilistic
# graphical model over word occurrences.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# One topic per page of the PDF.
no_topics = len(tra)

# Run NMF.
# NOTE(review): the `alpha` argument was removed in scikit-learn 1.2
# (split into alpha_W / alpha_H) — confirm the pinned sklearn version.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# Run LDA.
# Fix: the keyword is `n_components`; `n_topics` was deprecated in
# scikit-learn 0.19 and removed in 0.21, so the original call raises
# TypeError on any modern sklearn.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50., random_state=0).fit(tf)

# In[6]:
no_top_words = 10  # keywords reported per topic/page

# In[7]:
all_ = get_topics(nmf, tfidf_feature_names, no_top_words)  # nmf keywords
# In[8]:
all_2 = get_topics(lda, tf_feature_names, no_top_words)  # lda keywords
# ### Getting weights
# In[9]:
# Map every extracted keyword that appears in the tf-idf vocabulary `x`
# to its idf weight. Note both the NMF and the LDA keywords are looked
# up in the same tf-idf idf table.
weights = {word: x[word]
           for page_keywords in all_
           for word in page_keywords.split(' ')
           if word in x}
weights_2 = {word: x[word]
             for page_keywords in all_2
             for word in page_keywords.split(' ')
             if word in x}
# ### Making dataframe
# In[10]:
# Tabulate keyword -> idf weight for each model and print a per-page
# keyword summary.
df1 = pdd.DataFrame(list(weights.items()), columns=['topic', 'weights'])
# In[11]:
df2 = pdd.DataFrame(list(weights_2.items()), columns=['topic', 'weights'])
# In[12]:
print(df1)
# In[13]:
print(df2)
# In[14]:
print('NMF')
for page_no, keywords in enumerate(all_):
    print('page = ', page_no, 'keywords : ', keywords)
# In[15]:
print('LDA')
for page_no, keywords in enumerate(all_2):
    print('page = ', page_no, 'keywords : ', keywords)
# In[ ]: