-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathimdbuser.py
129 lines (118 loc) · 5.3 KB
/
imdbuser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding: utf-8 -*-
import pandas as pd
import urllib2
import time
import lxml.html
from pandas import DataFrame
doubanMovie_info=pd.read_csv('IMDBfinal3.csv')
IMDBurl=doubanMovie_info['IMDBRate']
IMDBRateList=[]
nameList=[]
def getDoc(url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
request = urllib2.Request('http://www.imdb.com'+url+'reviews?start=0', headers=headers) # send request
response = urllib2.urlopen(request) # get response
#time.sleep(1)
content = response.read() # get web content
doc = lxml.html.fromstring(content)
return doc
def getDoc2(url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
request = urllib2.Request('http://www.imdb.com'+url+'reviews?start=10', headers=headers)
response = urllib2.urlopen(request)
#time.sleep(1)
content = response.read()
doc = lxml.html.fromstring(content)
return doc
def getDoc3(url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
request = urllib2.Request('http://www.imdb.com'+url+'reviews?start=20', headers=headers)
response = urllib2.urlopen(request)
#time.sleep(1)
content = response.read()
doc = lxml.html.fromstring(content)
return doc
#函数:获得IMDB评分
def getIMDBRate(doc,oneIMDBurl):
#匹配IMDB评分的xpath路径
temp1=[]
temp2=[]
if doc.xpath('//*[@id="tn15content"]/div[1]/a/@href'):
temp1.append(doc.xpath('//*[@id="tn15content"]/div[1]/a/@href')[0])
temp2.append(doc.xpath('//*[@id="tn15content"]/div[1]/a/text()')[0])
if doc.xpath('//*[@id="tn15content"]/div[3]/a/@href'):
temp1.append(doc.xpath('//*[@id="tn15content"]/div[3]/a/@href')[0])
temp2.append(doc.xpath('//*[@id="tn15content"]/div[3]/a/text()')[0])
if doc.xpath('//*[@id="tn15content"]/div[5]/a/@href'):
temp1.append(doc.xpath('//*[@id="tn15content"]/div[5]/a/@href')[0])
temp2.append(doc.xpath('//*[@id="tn15content"]/div[5]/a/text()')[0])
if doc.xpath('//*[@id="tn15content"]/div[7]/a/@href'):
temp1.append(doc.xpath('//*[@id="tn15content"]/div[7]/a/@href')[0])
temp2.append(doc.xpath('//*[@id="tn15content"]/div[7]/a/text()')[0])
if doc.xpath('//*[@id="tn15content"]/div[9]/a/@href'):
temp1.append(doc.xpath('//*[@id="tn15content"]/div[9]/a/@href')[0])
temp2.append(doc.xpath('//*[@id="tn15content"]/div[9]/a/text()')[0])
if doc.xpath('//*[@id="tn15content"]/div[11]/a/@href'):
temp1.append(doc.xpath('//*[@id="tn15content"]/div[11]/a/@href')[0])
temp2.append(doc.xpath('//*[@id="tn15content"]/div[11]/a/text()')[0])
if doc.xpath('//*[@id="tn15content"]/div[13]/a/@href'):
temp1.append(doc.xpath('//*[@id="tn15content"]/div[13]/a/@href')[0])
temp2.append(doc.xpath('//*[@id="tn15content"]/div[13]/a/text()')[0])
if doc.xpath('//*[@id="tn15content"]/div[15]/a/@href'):
temp1.append(doc.xpath('//*[@id="tn15content"]/div[15]/a/@href')[0])
temp2.append(doc.xpath('//*[@id="tn15content"]/div[15]/a/text()')[0])
if doc.xpath('//*[@id="tn15content"]/div[17]/a/@href'):
temp1.append(doc.xpath('//*[@id="tn15content"]/div[17]/a/@href')[0])
temp2.append(doc.xpath('//*[@id="tn15content"]/div[17]/a/text()')[0])
#print temp1
#print temp2
return temp1, temp2 # return string
def toInt(numWithDot):
temp1=numWithDot.split(',')
temp2=''
for i in range(len(temp1)):
temp2+=temp1[i]
temp2=int(temp2)
return temp2
# get rate user name
def getNumOfPeopleWhoRate(doc,oneIMDBurl):
tempList=doc.xpath('//*[@id="titleUserReviewsTeaser"]/div[1]/span/div[1]/a/span/text()')
print tempList[0]
return tempList[0]
num=1
startPoint=1
for i in range(startPoint-1,len(IMDBurl)):
print i+1
try:
if IMDBurl[i]!='-': # if this movie exists
doc=getDoc(IMDBurl[i])
doc2=getDoc2(IMDBurl[i])
doc3=getDoc3(IMDBurl[i])
#得到IMDB评分
IMDBRate,name=getIMDBRate(doc,IMDBurl[i])
for i in range(len(IMDBRate)):
IMDBRateList.append(IMDBRate[i])
nameList.append(name[i])
IMDBRate,name=getIMDBRate(doc2,IMDBurl[i])
for i in range(len(IMDBRate)):
IMDBRateList.append(IMDBRate[i])
nameList.append(name[i])
IMDBRate,name=getIMDBRate(doc3,IMDBurl[i])
for i in range(len(IMDBRate)):
IMDBRateList.append(IMDBRate[i])
nameList.append(name[i])
#IMDBRateList.append(IMDBRate)
#name=getNumOfPeopleWhoRate(doc,IMDBurl[i])
#nameList.append(name)
else:
print 'Movie:',i+1,'Without IMDBurl,No.',num
num=num+1
IMDBRateList.append('-')
nameList.append('-')
except:
print 'unknownError happened!'
#IMDBRateList.append('unknownError')
#nameList.append('unknownError')
finally:
IMDBRate=DataFrame({'IMDBRate':IMDBRateList,'name':nameList})
IMDBRate.to_csv('IMDBallusers3.csv',index=False,encoding='utf-8')