
Commit 4643fa1

trip
1 parent 09ed181 commit 4643fa1

18 files changed: 853 additions, 211 deletions

c.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
__author__ = 'anandhakumar'

d/urbanspoon/scrapy.cfg

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = urbanspoon.settings

[deploy]
#url = http://localhost:6800/
project = urbanspoon

d/urbanspoon/urbanspoon/__init__.py

Whitespace-only changes.

d/urbanspoon/urbanspoon/items.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class UrbanspoonItem(Item):
    # define the fields for your item here like:
    # name = Field()
    b_id = Field()
    url = Field()
    n_votes = Field()
    p_like = Field()
    n_reviews = Field()
    date = Field()
    title = Field()
    description = Field()
    user = Field()
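
For reference, a Scrapy Item behaves like a dict restricted to its declared fields; a quick sketch of how this item is used (the values here are made up):

item = UrbanspoonItem()
item['b_id'] = 42             # declared field: OK
item['title'] = 'Great food'  # declared field: OK
# item['rating'] = 5          # undeclared field: raises KeyError
print item['title']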

d/urbanspoon/urbanspoon/pipelines.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

class UrbanspoonPipeline(object):
    def process_item(self, item, spider):
        return item
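
Note that settings.py in this commit never registers this pipeline, so Scrapy will not call process_item. A minimal sketch of the missing setting (in the old Scrapy releases this code targets, ITEM_PIPELINES is a list of class paths; newer releases use a dict mapping class paths to priorities):

# Hypothetical addition to urbanspoon/settings.py -- not part of this commit.
ITEM_PIPELINES = ['urbanspoon.pipelines.UrbanspoonPipeline']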

d/urbanspoon/urbanspoon/settings.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
# Scrapy settings for urbanspoon project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'urbanspoon'

SPIDER_MODULES = ['urbanspoon.spiders']
NEWSPIDER_MODULE = 'urbanspoon.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'urbanspoon (+http://www.yourdomain.com)'
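
A minimal sketch of how the commented-out USER_AGENT line might be filled in, together with a request delay to keep the crawl polite; the contact URL is a placeholder, and neither setting is part of this commit:

# Hypothetical settings -- not in this commit.
USER_AGENT = 'urbanspoon (+http://www.example.com)'  # placeholder contact URL
DOWNLOAD_DELAY = 2  # seconds between consecutive requests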

d/urbanspoon/urbanspoon/spiders/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
""" Enter URLs here """
#url = ['http://www.urbanspoon.com/r/35/1571097/restaurant/Hampton-Roads/Dam-Neck-Corner-Pungo/Firebrew-Virginia-Beach']
url = ['http://www.urbanspoon.com/r/1/4524/restaurant/Capitol-Hill/Honey-Hole-Sandwiches-Seattle',
       'http://www.urbanspoon.com/r/35/1571097/restaurant/Hampton-Roads/Dam-Neck-Corner-Pungo/Firebrew-Virginia-Beach',
       'http://www.urbanspoon.com/r/13/169913/restaurant/North-Richland-Hills-Richland-Hills/Texs-Star-Grill-Watauga']

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from urbanspoon.items import UrbanspoonItem
import psycopg2


""" Database connection """
try:
    con = psycopg2.connect(database="mm", user="postgres",
                           password="mercuryminds", host="localhost")
    cur = con.cursor()
except psycopg2.Error:
    print "Database not connected"
    raise


class UrbanspoonSpider(BaseSpider):

    name = "urban"

    # allowed_domains = ["urbanspoon.com"]
    start_urls = list(url)

    def parse(self, response):
        item = UrbanspoonItem()
        hxs = HtmlXPathSelector(response)
        item['url'] = response.url
        item['b_id'] = 0

        """ Getting the Business Id from the DB if it exists """
        cur.execute("select distinct url from social_data.urbanspoon")
        li = [r[0] for r in cur.fetchall()]

        for known_url in li:
            if known_url in item['url']:
                cur.execute("select distinct business_id from social_data.urbanspoon "
                            "where url='%s'" % known_url)
                li1 = [r[0] for r in cur.fetchall()]
                if len(li1) > 0:
                    item['b_id'] = li1[0]

        if item['b_id'] > 0:
            print "ID assigned"
        else:
            # Python 2 input() evaluates what is typed; a numeric id is expected.
            item['b_id'] = input("Enter the Business Id here for URL: %s : " % item['url'])

        print item['b_id']

        item['n_votes'] = hxs.select('//div[@class="stats"]/div/text()').extract()[0].encode('utf-8').replace('\n', '').strip()
        item['p_like'] = hxs.select('//div[@class="rating"]/text()').extract()[0].encode('utf-8').strip()
        item['n_reviews'] = int(hxs.select('//div[@class="stats"]/div/a[@data-ga-action="reviews"]/text()').extract()[0].split(' ')[0].encode('utf-8').strip())
        nxt_link = 'http://www.urbanspoon.com' + hxs.select('//div[@data-ga-action="diner-reviews"]/@data-url').extract()[0].encode('utf-8').strip()
        print nxt_link
        if nxt_link:
            yield Request(nxt_link, callback=self.parse_sub, meta=dict(item=item))

    def parse_sub(self, response):
        print "Sub Parse Called"
        item = response.meta.get('item')
        hxs = HtmlXPathSelector(response)

        x = hxs.select('//ul/li[@class="comment review"]')
        # Note: the '//' paths below search the whole page, not just x[0].
        length = len(x[0].select('//div[@class="details"]/div[@class="byline"]/a[@itemprop="reviewer"]/text()').extract())

        for i in range(length):
            try:
                item['date'] = x[0].select('//div[@class="details"]/div/time[@class="posted-on"]/text()').extract()[i].encode('utf-8').split(' ')[2].replace('\n', '')
            except Exception:
                item['date'] = ''
            try:
                item['title'] = x[0].select('//div[@class="details"]/div[@class="title"]/text()').extract()[i].encode('utf-8').replace("'", "").strip()
            except Exception:
                item['title'] = ''
            try:
                item['description'] = x[0].select('//div[@class="details"]/div[@itemprop="description"]').extract()[i].encode('utf-8').split('\n')[1].replace("'", "")
            except Exception:
                item['description'] = ''
            try:
                item['user'] = x[0].select('//div[@class="details"]/div[@class="byline"]/a[@itemprop="reviewer"]/text()').extract()[i].encode('utf-8').replace("'", "").strip()
            except Exception:
                item['user'] = ''
            print item['user']
            # Single quotes are stripped above because scraped values are
            # interpolated straight into this SQL string.
            sql = ("insert into social_data.urbanspoon "
                   "select '%s','%s','%s','%s','%s','%s','%s','%s','%s' "
                   "where not exists (select * from social_data.urbanspoon "
                   "where review_description='%s' and review_user='%s')"
                   % (item['b_id'], item['url'], item['n_votes'], item['p_like'],
                      item['n_reviews'], item['date'], item['title'],
                      item['description'], item['user'],
                      item['description'], item['user']))
            cur.execute(sql)
            con.commit()
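
The insert above splices scraped text straight into the SQL string, which is why single quotes have to be stripped from titles, descriptions and user names. A minimal sketch of the same conditional insert using psycopg2's parameter binding (same table and columns as in the commit; only the query construction changes):

sql = ("insert into social_data.urbanspoon "
       "select %s, %s, %s, %s, %s, %s, %s, %s, %s "
       "where not exists (select * from social_data.urbanspoon "
       "where review_description = %s and review_user = %s)")
params = (item['b_id'], item['url'], item['n_votes'], item['p_like'],
          item['n_reviews'], item['date'], item['title'],
          item['description'], item['user'],
          item['description'], item['user'])
cur.execute(sql, params)  # psycopg2 quotes and escapes each value itself
con.commit()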

d/yelp/scrapy.cfg

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = yelp.settings

[deploy]
#url = http://localhost:6800/
project = yelp

d/yelp/yelp/__init__.py

Whitespace-only changes.
