""" Enter URLs here """
url = ['http://www.urbanspoon.com/r/1/4524/restaurant/Capitol-Hill/Honey-Hole-Sandwiches-Seattle',
       'http://www.urbanspoon.com/r/35/1571097/restaurant/Hampton-Roads/Dam-Neck-Corner-Pungo/Firebrew-Virginia-Beach',
       'http://www.urbanspoon.com/r/13/169913/restaurant/North-Richland-Hills-Richland-Hills/Texs-Star-Grill-Watauga']

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from urbanspoon.items import UrbanspoonItem
import psycopg2
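
# UrbanspoonItem lives in the project's items.py, which is not shown here.
# A minimal sketch of what it presumably defines, inferred from the fields
# assigned below (an assumption, not the actual file):
#
#   from scrapy.item import Item, Field
#
#   class UrbanspoonItem(Item):
#       url = Field()
#       b_id = Field()
#       n_votes = Field()
#       p_like = Field()
#       n_reviews = Field()
#       date = Field()
#       title = Field()
#       description = Field()
#       user = Field()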


""" Database creation """
try:
    con = psycopg2.connect(database="mm", user="postgres",
                           password="mercuryminds", host="localhost")
    cur = con.cursor()
except psycopg2.Error as e:
    # A bare docstring here would silently swallow the failure; report it.
    print "Database not connected:", e

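# The queries below assume a social_data.urbanspoon table roughly like the
# following. Only business_id, url, review_description and review_user are
# named in the queries themselves; the remaining column names are guesses
# based on the order of values in the insert:
#
#   create table social_data.urbanspoon (
#       business_id        integer,
#       url                text,
#       n_votes            text,
#       p_like             text,
#       n_reviews          integer,
#       review_date        text,
#       review_title       text,
#       review_description text,
#       review_user        text
#   );
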
class UrbanspoonSpider(BaseSpider):

    name = "urban"

#   allowed_domains = ["urbanspoon.com"]
    start_urls = url

    def parse(self, response):

        item = UrbanspoonItem()
        hxs = HtmlXPathSelector(response)
        item['url'] = response.url
        item['b_id'] = 0  # 0 means "no business id known yet"

        """ Get the business id from the DB if it already exists """

        cur.execute("select distinct url from social_data.urbanspoon")
        known_urls = [r[0] for r in cur.fetchall()]

        for known_url in known_urls:
            if known_url in item['url']:
                # Parameterised query; the old string interpolation was an
                # SQL-injection hole and broke on URLs containing quotes.
                cur.execute("select distinct business_id from social_data.urbanspoon "
                            "where url = %s", (known_url,))
                rows = cur.fetchall()
                if rows:
                    item['b_id'] = rows[0][0]
        if item['b_id'] > 0:
            print "ID assigned"
        else:
            # raw_input instead of input: input() evaluates the text as Python.
            item['b_id'] = int(raw_input("Enter the business id for URL %s: " % item['url']))

        print item['b_id']

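        # These XPaths match the Urbanspoon markup at the time of writing;
        # they will need updating if the page layout changes.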
        item['n_votes'] = hxs.select('//div[@class="stats"]/div/text()').extract()[0].encode('utf-8').replace('\n', '').strip()
        item['p_like'] = hxs.select('//div[@class="rating"]/text()').extract()[0].encode('utf-8').strip()
        item['n_reviews'] = int(hxs.select('//div[@class="stats"]/div/a[@data-ga-action="reviews"]/text()').extract()[0].split(' ')[0].encode('utf-8').strip())
        nxt_link = 'http://www.urbanspoon.com' + hxs.select('//div[@data-ga-action="diner-reviews"]/@data-url').extract()[0].encode('utf-8').strip()
        print nxt_link
        if nxt_link:
            # Hand the half-filled item to the reviews page via request meta.
            yield Request(nxt_link, callback=self.parse_sub, meta=dict(item=item))

    def parse_sub(self, response):

        print "Sub parse called"
        item = response.meta.get('item')
        hxs = HtmlXPathSelector(response)

        # Number of reviews on the page, counted from the reviewer links.
        reviews = hxs.select('//ul/li[@class="comment review"]')
        length = len(reviews[0].select('//div[@class="details"]/div[@class="byline"]/a[@itemprop="reviewer"]/text()').extract())

        for i in range(length):

            try:
                item['date'] = reviews[0].select('//div[@class="details"]/div/time[@class="posted-on"]/text()').extract()[i].encode('utf-8').split(' ')[2].replace('\n', '')
            except IndexError:
                item['date'] = ''
            try:
                item['title'] = reviews[0].select('//div[@class="details"]/div[@class="title"]/text()').extract()[i].encode('utf-8').replace("'", "").strip()
            except IndexError:
                item['title'] = ''
            try:
                item['description'] = reviews[0].select('//div[@class="details"]/div[@itemprop="description"]').extract()[i].encode('utf-8').split('\n')[1].replace("'", "")
            except IndexError:
                item['description'] = ''
            try:
                item['user'] = reviews[0].select('//div[@class="details"]/div[@class="byline"]/a[@itemprop="reviewer"]/text()').extract()[i].encode('utf-8').replace("'", "").strip()
            except IndexError:
                item['user'] = ''
            print item['user']
            # Insert-unless-exists keeps re-crawled reviews from duplicating
            # rows; parameterised values replace the old string interpolation.
            sql = ("insert into social_data.urbanspoon "
                   "select %s, %s, %s, %s, %s, %s, %s, %s, %s "
                   "where not exists (select * from social_data.urbanspoon "
                   "where review_description = %s and review_user = %s)")
            cur.execute(sql, (item['b_id'], item['url'], item['n_votes'],
                              item['p_like'], item['n_reviews'], item['date'],
                              item['title'], item['description'], item['user'],
                              item['description'], item['user']))
            con.commit()
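

# Usage, from the root of the "urbanspoon" Scrapy project:
#   scrapy crawl urban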