
Commit 611afd0

updates
1 parent ace910b commit 611afd0

3 files changed: 497 additions & 23 deletions

trip_spider.py

Lines changed: 189 additions & 23 deletions
@@ -1,8 +1,8 @@
 " Enter Url here"
 
-
 url =['http://www.tripadvisor.com/Restaurant_Review-g42251-d4164754-Reviews-Taboon-Grand_Blanc_Michigan.html','http://www.tripadvisor.com/Restaurant_Review-g28970-d2137408-Reviews-Lincoln_DC-Washington_DC_District_of_Columbia.html','http://www.tripadvisor.com/Restaurant_Review-g29556-d416774-Reviews-Zingerman_s_Delicatessen-Ann_Arbor_Michigan.html','http://www.tripadvisor.com/Restaurant_Review-g58277-d3529482-Reviews-FireBrew-Virginia_Beach_Virginia.html']
 
+
 from scrapy.spider import BaseSpider
 from scrapy.http import Request
 from urlparse import urljoin
@@ -22,7 +22,7 @@
 class YelpSpider(BaseSpider):
 
 
-    name = "trip"
+    name = "tripss"
     start_urls=[url[i] for i in range(len(url))]
 
     def parse(self, response):
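
For orientation: the spider is written against the pre-1.0 Scrapy API, a BaseSpider subclass with a start_urls list and a parse() callback, plus HtmlXPathSelector for the XPath work. A minimal, self-contained sketch of that shape; the spider name and start URL here are illustrative only, not part of the commit:

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class MinimalTripSpider(BaseSpider):
    # Hypothetical name and start page; only the structure mirrors trip_spider.py.
    name = "trip_minimal"
    start_urls = ["http://www.tripadvisor.com/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Print the page title as a smoke test for the old selector API.
        print hxs.select('//title/text()').extract()
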
@@ -39,6 +39,10 @@ def parse(self, response):
         cur.execute(qu)
         rows = cur.fetchall()
         li = [r[0] for r in rows]
+
+
+
+
         # print li
         for i in range(len(li)):
             # print li[i], item['url']
@@ -54,10 +58,18 @@ def parse(self, response):
                 # print li1
                 if len(li1) > 0:
                     item['bid'] = li1[0]
+                    qu2 = ("select max(d) from social_data.tripadvisor1 where id =%d")%item['bid']
+                    cur.execute(qu2)
+                    rows2=cur.fetchall()
+                    max_date = [i[0] for i in rows2][0]
+                    print qu2,rows2,max_date
+
         if item['bid'] > 0:
             print " ID assigned"
         else:
             item['bid'] = input("Enter the Business Id here for URL: %s : "%item['url'])
+            max_date = datetime.strptime('','')
+            print max_date
 
         print item['bid']
 
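
A note on the added fallback: datetime.strptime('','') is not an error in Python, it simply returns the strptime defaults, midnight on 1900-01-01. The next-link logic later in this commit checks for exactly that string ('1900-01-01 00:00:00' in str(max_date)) to mean "no stored max date, crawl everything via parse_sub". A quick illustration:

from datetime import datetime

# Empty input and empty format fall back to the strptime defaults,
# which this spider uses as a "no previous data" sentinel.
max_date = datetime.strptime('', '')
print max_date                                # 1900-01-01 00:00:00
print '1900-01-01 00:00:00' in str(max_date)  # True
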
@@ -69,28 +81,25 @@ def parse(self, response):
 
         if item['rv_count'] > 0:
 
-            no = len(hxs.select('//div[@class="reviewSelector "]').extract())
+            no = hxs.select('//div[@class="review basic_review inlineReviewUpdate provider0"]').extract()
+
+            for i in range(len(no)):
+
 
-            for i in range(no):
-                # cur.execute("select max(d) from social_data.tripadvisor1 where id='%s'" %item['bid'])
-                # r = cur.fetchall()
-                # m_dt = r[0][0]
-
                 try:
                     xd=hxs.select('//span[@class="ratingDate"]').extract()[i].encode('utf-8').split('Reviewed')[1].split('\n')[0].replace(',','').replace('\n','').strip()
-                    date=datetime.strptime(xd,'%B %d %Y')
-                    item['rv_date'] =str(date)
+
+                    item['rv_date'] = str(datetime.strptime(xd,'%B %d %Y'))
 
 
                 except:
-                    date=datetime.strptime('9999-01-01','%Y-%d-%m')
                     item['rv_date'] = '0001-01-01 00:00:00'
+                current_date = datetime.strptime(item['rv_date'],'%Y-%m-%d %X')
 
                 try:
                     item['rv_profile'] = hxs.select('//div[@class="username mo"]/span/text()').extract()[i].encode('ascii', 'ignore').replace('.','').replace("'"," ").strip()
                 except:
-                    item['rv_profile'] = 'NULL'
-
+                    item['rv_profile'] = 'A TripAdvisor reviewer on Facebook'
                 try:
                     item['rv_heading'] = hxs.select('//span[@class="noQuotes"]/text()').extract()[i].encode('ascii', 'ignore').replace('.','').replace("'"," ").strip()
                 except:
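
The ratingDate handling above reduces the span text to something like 'June 12 2014' (split on 'Reviewed', strip commas and newlines) before strptime, then re-parses its own string form so current_date is a comparable datetime. A small sketch with a made-up date string:

from datetime import datetime

xd = 'June 12 2014'   # hypothetical text after the split/replace chain
rv_date = str(datetime.strptime(xd, '%B %d %Y'))
print rv_date         # 2014-06-12 00:00:00
current_date = datetime.strptime(rv_date, '%Y-%m-%d %X')
print current_date > datetime.strptime('', '')   # True: newer than the 1900 sentinel
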
@@ -101,18 +110,25 @@ def parse(self, response):
                 except:
                     item['rv_rating'] = '0.0'
                 try:
-                    item['rv_dc'] = hxs.select('//p[@class="partial_entry"]').extract()[i].encode('ascii', 'ignore').split('\n')[1].replace("'"," ").replace('.','').strip()
+                    item['rv_dc'] = hxs.select('//div[@class="entry"]/p[@class="partial_entry"]').extract()[i].encode('ascii', 'ignore').split('\n')[1].replace("'"," ").replace('.','').strip()
                 except:
                     item['rv_dc'] = 'NULL'
-                # if date >= m_dt:
-                sql = ("insert into social_data.tripadvisor1 select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.tripadvisor1 where dc='%s' and p='%s')"%(item['bid'],item['url'],item['rating'],item['rv_count'],item['rv_date'],item['rv_heading'],item['rv_rating'],item['rv_dc'],item['rv_profile'],item['rv_dc'],item['rv_profile']))
-                # else:
-                # break
-
-                cur.execute(sql)
-                con.commit()
+                last_date = current_date
+
 
-                print item['url'],item['rv_profile']
+                """ It Only insert the new Feeds """
+                print current_date, max_date
+                if current_date >= max_date:
+
+                    sql = ("insert into social_data.tripadvisor1 select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.tripadvisor1 where dc='%s' and p='%s')"%(item['bid'],item['url'],item['rating'],item['rv_count'],item['rv_date'],item['rv_heading'],item['rv_rating'],item['rv_dc'],item['rv_profile'],item['rv_dc'],item['rv_profile']))
+
+                    cur.execute(sql)
+                    con.commit()
+                else:
+                    break
+                    print "No updated review are here"
+
+                # print item['url'],item['rv_profile']
 
 
             """ Parse Next link"""
@@ -129,9 +145,18 @@ def parse(self, response):
 
             print nxt_link
 
+
             if nxt_link:
                 """ Next link Processed """
-                yield Request( nxt_link, callback=self.parse)
+
+                if last_date > max_date and '1900-01-01 00:00:00' not in str(max_date):
+                    print "enter 1st"
+                    yield Request( nxt_link, callback=self.parse)
+                elif '1900-01-01 00:00:00' in str(max_date):
+                    print "enter 2nd"
+                    yield Request( nxt_link, callback=self.parse_sub)
+                else:
+                    print " Do nothing No other pages available"
             else:
                 print " Progress Completed "
 
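
The branching above picks between the incremental path (parse, which stops once reviews are older than max_date) and the full-crawl path (parse_sub, taken when max_date is the 1900 sentinel). Both paths rely on urljoin to turn the relative next-page href into an absolute URL for the follow-up Request. A small sketch, with a hypothetical href in the '-orNN-' pagination form that the URL-matching code in this file splits on:

from urlparse import urljoin

base = 'http://www.tripadvisor.com/Restaurant_Review-g42251-d4164754-Reviews-Taboon-Grand_Blanc_Michigan.html'
link = '/Restaurant_Review-g42251-d4164754-Reviews-or10-Taboon-Grand_Blanc_Michigan.html'  # hypothetical next-page href
print urljoin(base, link)
# http://www.tripadvisor.com/Restaurant_Review-g42251-d4164754-Reviews-or10-Taboon-Grand_Blanc_Michigan.html
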
@@ -144,6 +169,147 @@ def parse(self, response):
             item['rv_dc'] = 'NULL'
             item['rv_heading'] = 'NULL'
 
+    def parse_sub(self, response):
+
+        print "Sub parse Called"
+
+        itm=[]
+        hxs1 = HtmlXPathSelector(response)
+        item = TripItem()
+        item['bid'] = 0
+        item['url'] = response.url
+
+        """Getting the Business Id from the DB if exists """
+
+        qu = ("select distinct u from social_data.tripadvisor1")
+        cur.execute(qu)
+        rows = cur.fetchall()
+        li = [r[0] for r in rows]
+
+
+
+        # print li
+        for i in range(len(li)):
+            # print li[i], item['url']
+            try:
+                lis=li[i].split('-or')[1].split('-')[1]
+            except:
+                lis=li[i].split('Reviews-')[1].split('-')[0]
+            if item['url'].find(lis)>0:
+                qu1 = ("select distinct id from social_data.tripadvisor1 where u='%s'")%li[i]
+                cur.execute(qu1)
+                rows1= cur.fetchall()
+                li1=[i[0] for i in rows1]
+                # print li1
+                if len(li1) > 0:
+                    item['bid'] = li1[0]
+        if item['bid'] > 0:
+            print " ID assigned"
+        else:
+            item['bid'] = input("Enter the Business Id here for URL: %s : "%item['url'])
+
+
+        print item['bid']
+
+
+
+
+        item['rating'] = float(hxs1.select('//div[@class="rs rating"]/span/img/@content').extract()[0].encode('ascii', 'ignore').strip())
+        item['rv_count'] = int(hxs1.select('//div[@class="rs rating"]/a/span/text()').extract()[0].encode('ascii', 'ignore').strip())
+
+        if item['rv_count'] > 0:
+
+            no = hxs1.select('//div[@class="review basic_review inlineReviewUpdate provider0"]').extract()
+
+            for i in range(len(no)):
+
+
+                try:
+                    xd=hxs1.select('//span[@class="ratingDate"]').extract()[i].encode('utf-8').split('Reviewed')[1].split('\n')[0].replace(',','').replace('\n','').strip()
+
+                    item['rv_date'] = str(datetime.strptime(xd,'%B %d %Y'))
+
+
+                except:
+                    item['rv_date'] = '0001-01-01 00:00:00'
+                current_date = datetime.strptime(item['rv_date'],'%Y-%m-%d %X')
+
+                try:
+                    item['rv_profile'] = hxs1.select('//div[@class="username mo"]/span/text()').extract()[i].encode('ascii', 'ignore').replace('.','').replace("'"," ").strip()
+                except:
+                    item['rv_profile'] = 'A TripAdvisor reviewer on Facebook'
+                try:
+                    item['rv_heading'] = hxs1.select('//span[@class="noQuotes"]/text()').extract()[i].encode('ascii', 'ignore').replace('.','').replace("'"," ").strip()
+                except:
+                    item['rv_heading'] = 'NULL'
+
+                try:
+                    item['rv_rating'] = float(hxs1.select('//div[@class="rating reviewItemInline"]/span/img/@alt').extract()[i].encode('ascii', 'ignore').split(' ')[0].strip())
+                except:
+                    item['rv_rating'] = '0.0'
+                try:
+                    item['rv_dc'] = hxs1.select('//div[@class="entry"]/p[@class="partial_entry"]').extract()[i].encode('ascii', 'ignore').split('\n')[1].replace("'"," ").replace('.','').strip()
+                except:
+                    item['rv_dc'] = 'NULL'
+                last_date = current_date
+
+
+                """ It Only insert the new Feeds """
+
+
+
+                sql = ("insert into social_data.tripadvisor1 select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.tripadvisor1 where dc='%s' and p='%s')"%(item['bid'],item['url'],item['rating'],item['rv_count'],item['rv_date'],item['rv_heading'],item['rv_rating'],item['rv_dc'],item['rv_profile'],item['rv_dc'],item['rv_profile']))
+
+                cur.execute(sql)
+                con.commit()
+
+                # print item['url'],item['rv_profile']
+
+
+            """ Parse Next link"""
+
+
+
+            try:
+                link = hxs1.select('//div[@class="pgLinks"]/a[@class="guiArw sprite-pageNext "]/@href').extract()[0].encode('ascii', 'ignore').strip()
+                nxt_link = urljoin(response.url,link)
+            except:
+                nxt_link = []
+
+
+
+            print nxt_link
+
+
+            if nxt_link:
+                """ Next link Processed """
+                yield Request( nxt_link, callback=self.parse_sub)
+
+            else:
+                print " Progress Completed "
+
+
+
+        else:
+            item['rv_date'] = 'NULL'
+            item['rv_profile'] = 'NULL'
+            item['rv_rating'] = 'NULL'
+            item['rv_dc'] = 'NULL'
+            item['rv_heading'] = 'NULL'
+
+
 
 
 
+CREATE TABLE social_data.tripadvisor
+(
+  bid integer,
+  url text,
+  rating double precision,
+  rv_count integer,
+  rv_date timestamp without time zone,
+  rv_heading text,
+  rv_rating double precision,
+  rv_desc text,
+  rv_user text
+)
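
The DDL above names social_data.tripadvisor with long column names, while every query in the spider targets social_data.tripadvisor1 with short names (id, u, d, dc, p), so the live table presumably follows this column layout under one of those names. For reference, a sketch of creating the table and doing the same insert-unless-already-stored step with psycopg2 placeholders instead of % string formatting (which avoids the quoting problems the replace("'", " ") calls work around); connection settings and the sample item values are made up, and the dedup condition is expressed against this DDL's rv_desc/rv_user columns rather than the spider's dc/p:

import psycopg2

# Hypothetical connection details; the spider's own con/cur are created elsewhere in the project.
con = psycopg2.connect(host='localhost', dbname='scraping', user='scraper', password='secret')
cur = con.cursor()

cur.execute("CREATE SCHEMA IF NOT EXISTS social_data")
cur.execute("""
    CREATE TABLE IF NOT EXISTS social_data.tripadvisor (
        bid        integer,
        url        text,
        rating     double precision,
        rv_count   integer,
        rv_date    timestamp without time zone,
        rv_heading text,
        rv_rating  double precision,
        rv_desc    text,
        rv_user    text
    )
""")

# Same WHERE NOT EXISTS dedup idea as the spider's SQL, but parameterized so
# apostrophes in the review text cannot break the statement.
item = {'bid': 1,
        'url': 'http://www.tripadvisor.com/Restaurant_Review-g42251-d4164754-Reviews-Taboon-Grand_Blanc_Michigan.html',
        'rating': 4.5, 'rv_count': 120, 'rv_date': '2014-06-12 00:00:00',
        'rv_heading': 'Great food', 'rv_rating': 5.0,
        'rv_dc': "Loved the chef's specials", 'rv_profile': 'SomeUser'}
sql = ("insert into social_data.tripadvisor "
       "(bid, url, rating, rv_count, rv_date, rv_heading, rv_rating, rv_desc, rv_user) "
       "select %s, %s, %s, %s, %s, %s, %s, %s, %s "
       "where not exists (select 1 from social_data.tripadvisor where rv_desc = %s and rv_user = %s)")
cur.execute(sql, (item['bid'], item['url'], item['rating'], item['rv_count'],
                  item['rv_date'], item['rv_heading'], item['rv_rating'],
                  item['rv_dc'], item['rv_profile'],
                  item['rv_dc'], item['rv_profile']))
con.commit()
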

tripdata.pgd

253 KB
Binary file not shown.
