1
1
" Enter Url here"
2
2
3
-
4
3
# Seed listing pages for the crawl: one TripAdvisor restaurant review URL per
# entry, in crawl order.
url = [
    'http://www.tripadvisor.com/Restaurant_Review-g42251-d4164754-Reviews-Taboon-Grand_Blanc_Michigan.html',
    'http://www.tripadvisor.com/Restaurant_Review-g28970-d2137408-Reviews-Lincoln_DC-Washington_DC_District_of_Columbia.html',
    'http://www.tripadvisor.com/Restaurant_Review-g29556-d416774-Reviews-Zingerman_s_Delicatessen-Ann_Arbor_Michigan.html',
    'http://www.tripadvisor.com/Restaurant_Review-g58277-d3529482-Reviews-FireBrew-Virginia_Beach_Virginia.html',
]
5
4
5
+
6
6
from scrapy .spider import BaseSpider
7
7
from scrapy .http import Request
8
8
from urlparse import urljoin
22
22
class YelpSpider (BaseSpider ):
23
23
24
24
25
- name = "trip "
25
+ name = "tripss "
26
26
start_urls = [url [i ] for i in range (len (url ))]
27
27
28
28
def parse (self , response ):
@@ -39,6 +39,10 @@ def parse(self, response):
39
39
cur .execute (qu )
40
40
rows = cur .fetchall ()
41
41
li = [r [0 ] for r in rows ]
42
+
43
+
44
+
45
+
42
46
# print li
43
47
for i in range (len (li )):
44
48
# print li[i], item['url']
@@ -54,10 +58,18 @@ def parse(self, response):
54
58
# print li1
55
59
if len (li1 ) > 0 :
56
60
item ['bid' ] = li1 [0 ]
61
+ qu2 = ("select max(d) from social_data.tripadvisor1 where id =%d" )% item ['bid' ]
62
+ cur .execute (qu2 )
63
+ rows2 = cur .fetchall ()
64
+ max_date = [i [0 ] for i in rows2 ][0 ]
65
+ print qu2 ,rows2 ,max_date
66
+
57
67
if item ['bid' ] > 0 :
58
68
print " ID assigned"
59
69
else :
60
70
item ['bid' ] = input ("Enter the Business Id here for URL: %s : " % item ['url' ])
71
+ max_date = datetime .strptime ('' ,'' )
72
+ print max_date
61
73
62
74
print item ['bid' ]
63
75
@@ -69,28 +81,25 @@ def parse(self, response):
69
81
70
82
if item ['rv_count' ] > 0 :
71
83
72
- no = len (hxs .select ('//div[@class="reviewSelector "]' ).extract ())
84
+ no = hxs .select ('//div[@class="review basic_review inlineReviewUpdate provider0"]' ).extract ()
85
+
86
+ for i in range (len (no )):
87
+
73
88
74
- for i in range (no ):
75
- # cur.execute("select max(d) from social_data.tripadvisor1 where id='%s'" %item['bid'])
76
- # r = cur.fetchall()
77
- # m_dt = r[0][0]
78
-
79
89
try :
80
90
xd = hxs .select ('//span[@class="ratingDate"]' ).extract ()[i ].encode ('utf-8' ).split ('Reviewed' )[1 ].split ('\n ' )[0 ].replace (',' ,'' ).replace ('\n ' ,'' ).strip ()
81
- date = datetime . strptime ( xd , '%B %d %Y' )
82
- item ['rv_date' ] = str (date )
91
+
92
+ item ['rv_date' ] = str (datetime . strptime ( xd , '%B %d %Y' ) )
83
93
84
94
85
95
except :
86
- date = datetime .strptime ('9999-01-01' ,'%Y-%d-%m' )
87
96
item ['rv_date' ] = '0001-01-01 00:00:00'
97
+ current_date = datetime .strptime (item ['rv_date' ],'%Y-%m-%d %X' )
88
98
89
99
try :
90
100
item ['rv_profile' ] = hxs .select ('//div[@class="username mo"]/span/text()' ).extract ()[i ].encode ('ascii' , 'ignore' ).replace ('.' ,'' ).replace ("'" ," " ).strip ()
91
101
except :
92
- item ['rv_profile' ] = 'NULL'
93
-
102
+ item ['rv_profile' ] = 'A TripAdvisor reviewer on Facebook'
94
103
try :
95
104
item ['rv_heading' ] = hxs .select ('//span[@class="noQuotes"]/text()' ).extract ()[i ].encode ('ascii' , 'ignore' ).replace ('.' ,'' ).replace ("'" ," " ).strip ()
96
105
except :
@@ -101,18 +110,25 @@ def parse(self, response):
101
110
except :
102
111
item ['rv_rating' ] = '0.0'
103
112
try :
104
- item ['rv_dc' ] = hxs .select ('//p[@class="partial_entry"]' ).extract ()[i ].encode ('ascii' , 'ignore' ).split ('\n ' )[1 ].replace ("'" ," " ).replace ('.' ,'' ).strip ()
113
+ item ['rv_dc' ] = hxs .select ('//div[@class="entry"]/ p[@class="partial_entry"]' ).extract ()[i ].encode ('ascii' , 'ignore' ).split ('\n ' )[1 ].replace ("'" ," " ).replace ('.' ,'' ).strip ()
105
114
except :
106
115
item ['rv_dc' ] = 'NULL'
107
- # if date >= m_dt:
108
- sql = ("insert into social_data.tripadvisor1 select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.tripadvisor1 where dc='%s' and p='%s')" % (item ['bid' ],item ['url' ],item ['rating' ],item ['rv_count' ],item ['rv_date' ],item ['rv_heading' ],item ['rv_rating' ],item ['rv_dc' ],item ['rv_profile' ],item ['rv_dc' ],item ['rv_profile' ]))
109
- # else:
110
- # break
111
-
112
- cur .execute (sql )
113
- con .commit ()
116
+ last_date = current_date
117
+
114
118
115
- print item ['url' ],item ['rv_profile' ]
119
+ """ It Only insert the new Feeds """
120
+ print current_date , max_date
121
+ if current_date >= max_date :
122
+
123
+ sql = ("insert into social_data.tripadvisor1 select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.tripadvisor1 where dc='%s' and p='%s')" % (item ['bid' ],item ['url' ],item ['rating' ],item ['rv_count' ],item ['rv_date' ],item ['rv_heading' ],item ['rv_rating' ],item ['rv_dc' ],item ['rv_profile' ],item ['rv_dc' ],item ['rv_profile' ]))
124
+
125
+ cur .execute (sql )
126
+ con .commit ()
127
+ else :
128
+ break
129
+ print "No updated review are here"
130
+
131
+ # print item['url'],item['rv_profile']
116
132
117
133
118
134
""" Parse Next link"""
@@ -129,9 +145,18 @@ def parse(self, response):
129
145
130
146
print nxt_link
131
147
148
+
132
149
if nxt_link :
133
150
""" Next link Processed """
134
- yield Request ( nxt_link , callback = self .parse )
151
+
152
+ if last_date > max_date and '1900-01-01 00:00:00' not in str (max_date ):
153
+ print "enter 1st"
154
+ yield Request ( nxt_link , callback = self .parse )
155
+ elif '1900-01-01 00:00:00' in str (max_date ):
156
+ print "enter 2nd"
157
+ yield Request ( nxt_link , callback = self .parse_sub )
158
+ else :
159
+ print " Do nothing No other pages available"
135
160
else :
136
161
print " Progress Completed "
137
162
@@ -144,6 +169,147 @@ def parse(self, response):
144
169
item ['rv_dc' ] = 'NULL'
145
170
item ['rv_heading' ] = 'NULL'
146
171
172
def parse_sub(self, response):
    """Store every review found on a listing page, then follow pagination.

    The callback resolves a business id for ``response.url`` from rows
    already present in ``social_data.tripadvisor1`` (prompting on stdin when
    none matches), inserts each review on the page unless a row with the
    same review text and profile already exists, and finally requests the
    "next" page with itself as the callback.

    Relies on names defined outside this method: ``cur`` / ``con`` (database
    cursor and connection), ``HtmlXPathSelector``, ``TripItem``,
    ``datetime``, ``urljoin`` and ``Request``.

    :param response: the downloaded listing page.
    :returns: a generator yielding at most one ``Request`` (the next page).
    :raises IndexError: if the overall rating or review count is missing
        from the page.
    :raises ValueError: if the overall rating or review count is not numeric.
    """
    print("Sub parse Called")

    hxs1 = HtmlXPathSelector(response)
    item = TripItem()
    item['bid'] = 0
    item['url'] = response.url

    # Getting the Business Id from the DB if exists.
    cur.execute("select distinct u from social_data.tripadvisor1")
    known_urls = [row[0] for row in cur.fetchall()]

    for known_url in known_urls:
        # Paginated URLs carry an "-or<offset>-" segment before the business
        # slug; first-page URLs have the slug right after "Reviews-".
        try:
            slug = known_url.split('-or')[1].split('-')[1]
        except IndexError:
            slug = known_url.split('Reviews-')[1].split('-')[0]
        if item['url'].find(slug) > 0:
            # Bound parameter instead of string interpolation.
            # NOTE(review): assumes the driver behind `cur` uses the "%s"
            # (format) paramstyle, e.g. psycopg2 or MySQLdb -- confirm.
            cur.execute(
                "select distinct id from social_data.tripadvisor1 where u=%s",
                (known_url,))
            matching_ids = [row[0] for row in cur.fetchall()]
            if matching_ids:
                item['bid'] = matching_ids[0]

    if item['bid'] > 0:
        print(" ID assigned")
    else:
        # SECURITY NOTE: under Python 2, input() evaluates whatever is typed
        # as a Python expression. Left unchanged because the operator is
        # expected to type a bare integer id; consider int(raw_input(...)).
        item['bid'] = input("Enter the Business Id here for URL: %s : " % item['url'])

    print(item['bid'])

    item['rating'] = float(
        hxs1.select('//div[@class="rs rating"]/span/img/@content')
        .extract()[0].encode('ascii', 'ignore').strip())
    # Drop thousands separators so a count rendered like "1,234" still parses.
    item['rv_count'] = int(
        hxs1.select('//div[@class="rs rating"]/a/span/text()')
        .extract()[0].encode('ascii', 'ignore').strip().replace(',', ''))

    if item['rv_count'] > 0:
        review_total = len(hxs1.select(
            '//div[@class="review basic_review inlineReviewUpdate provider0"]'
        ).extract())

        # Run each page-wide query once instead of once per review. The
        # lists are indexed in parallel by review position, so a review that
        # lacks one of these nodes shifts later entries (same as before).
        dates = hxs1.select('//span[@class="ratingDate"]').extract()
        profiles = hxs1.select('//div[@class="username mo"]/span/text()').extract()
        headings = hxs1.select('//span[@class="noQuotes"]/text()').extract()
        ratings = hxs1.select('//div[@class="rating reviewItemInline"]/span/img/@alt').extract()
        bodies = hxs1.select('//div[@class="entry"]/p[@class="partial_entry"]').extract()

        for idx in range(review_total):
            try:
                raw_date = dates[idx].encode('utf-8').split('Reviewed')[1].split('\n ')[0].replace(',', '').replace('\n ', '').strip()
                item['rv_date'] = str(datetime.strptime(raw_date, '%B %d %Y'))
            except (IndexError, ValueError):
                # Missing or unparseable date: store a sentinel timestamp.
                item['rv_date'] = '0001-01-01 00:00:00'

            try:
                item['rv_profile'] = profiles[idx].encode('ascii', 'ignore').replace('.', '').replace("'", " ").strip()
            except IndexError:
                item['rv_profile'] = 'A TripAdvisor reviewer on Facebook'

            try:
                item['rv_heading'] = headings[idx].encode('ascii', 'ignore').replace('.', '').replace("'", " ").strip()
            except IndexError:
                item['rv_heading'] = 'NULL'

            try:
                item['rv_rating'] = float(ratings[idx].encode('ascii', 'ignore').split(' ')[0].strip())
            except (IndexError, ValueError):
                item['rv_rating'] = '0.0'

            try:
                item['rv_dc'] = bodies[idx].encode('ascii', 'ignore').split('\n ')[1].replace("'", " ").replace('.', '').strip()
            except IndexError:
                item['rv_dc'] = 'NULL'

            # Insert the review unless one with the same text and profile is
            # already stored. Values are bound as parameters; they are
            # stringified first so the server receives the same quoted
            # literals the interpolated statement used to send.
            params = tuple(str(value) for value in (
                item['bid'], item['url'], item['rating'], item['rv_count'],
                item['rv_date'], item['rv_heading'], item['rv_rating'],
                item['rv_dc'], item['rv_profile'],
                item['rv_dc'], item['rv_profile'],
            ))
            cur.execute(
                "insert into social_data.tripadvisor1 "
                "select %s,%s,%s,%s,%s,%s,%s,%s,%s "
                "where not exists ( select * from social_data.tripadvisor1 "
                "where dc=%s and p=%s)",
                params)
            con.commit()

        # Parse the "next page" link; absence of the anchor ends the crawl.
        try:
            link = hxs1.select('//div[@class="pgLinks"]/a[@class="guiArw sprite-pageNext "]/@href').extract()[0].encode('ascii', 'ignore').strip()
            nxt_link = urljoin(response.url, link)
        except IndexError:
            nxt_link = []

        print(nxt_link)

        if nxt_link:
            # Keep paginating with this same callback.
            yield Request(nxt_link, callback=self.parse_sub)
        else:
            print(" Progress Completed ")
    else:
        # No reviews on the page: fill the review fields with placeholders.
        item['rv_date'] = 'NULL'
        item['rv_profile'] = 'NULL'
        item['rv_rating'] = 'NULL'
        item['rv_dc'] = 'NULL'
        item['rv_heading'] = 'NULL'
300
+
147
301
148
302
149
303
304
+ CREATE TABLE social_data .tripadvisor
305
+ (
306
+ bid integer ,
307
+ url text ,
308
+ rating double precision ,
309
+ rv_count integer ,
310
+ rv_date timestamp without time zone ,
311
+ rv_heading text ,
312
+ rv_rating double precision ,
313
+ rv_desc text ,
314
+ rv_user text
315
+ )
0 commit comments