diff --git a/NewsSpider/NewsSpider.exe b/NewsSpider/NewsSpider.exe
deleted file mode 100644
index 3bc11566..00000000
Binary files a/NewsSpider/NewsSpider.exe and /dev/null differ
diff --git a/WechatSearchProjects/Wechatproject/Wechatproject/middlewares.py b/WechatSearchProjects/Wechatproject/Wechatproject/middlewares.py
new file mode 100755
index 00000000..c794b77b
--- /dev/null
+++ b/WechatSearchProjects/Wechatproject/Wechatproject/middlewares.py
@@ -0,0 +1,17 @@
+# base64 is needed only when the proxy requires authentication
+import base64
+
+
+# Downloader middleware that routes every request through an HTTP proxy
+class ProxyMiddleware(object):
+    # Override process_request so the proxy is applied to each outgoing request
+    def process_request(self, request, spider):
+        # Set the location of the proxy
+        request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT"
+
+        # Use the following lines if your proxy requires authentication
+        proxy_user_pass = "USERNAME:PASSWORD"
+        # Set up basic authentication for the proxy; b64encode is used because
+        # base64.encodestring appends a newline (and is removed in Python 3.9+)
+        encoded_user_pass = base64.b64encode(proxy_user_pass.encode()).decode()
+        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
\ No newline at end of file
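
A note on the middleware above: the original commit used base64.encodestring, which appends a trailing newline to its output and no longer exists in Python 3.9+, so the header value it produced was corrupt; the hunk above uses b64encode instead. A minimal standalone check of the resulting header value, reusing the same placeholder credentials (this snippet is illustrative and not part of the commit):

import base64

proxy_user_pass = "USERNAME:PASSWORD"
encoded = base64.b64encode(proxy_user_pass.encode()).decode()
# No trailing newline, so the value is safe to embed in a header
assert encoded == "VVNFUk5BTUU6UEFTU1dPUkQ="
print('Proxy-Authorization: Basic ' + encoded)

The middleware only takes effect once it is listed in DOWNLOADER_MIDDLEWARES; the settings.py hunk below adds that block but keeps it commented out.
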
diff --git a/WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py b/WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py
old mode 100644
new mode 100755
index 7bf76f23..197f2cd1
--- a/WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py
+++ b/WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py
@@ -35,8 +35,9 @@ class WechatprojectPipeline(object):
     # connect to the database
     def __init__(self):
-        connection = pymongo.Connection(host = "localhost", port = 27017)
-        db = connection["testwechat"] # no need to create the "testwechat" database beforehand
+        # pymongo.Connection was removed in pymongo 3.x; MongoClient replaces it
+        client = pymongo.MongoClient("localhost", 27017)
+        db = client["testwechat"] # no need to create the "testwechat" database beforehand
         # db.authenticate(name = "root", password = "fireling") # no user name or password for localhost
         self.posts = db["result"] # no need to create the "result" collection beforehand
     # pipeline default function
@@ -45,6 +46,7 @@ def process_item(self, item, spider):
         return item
 
+
 # # Json File
 # import json
 # import codecs
diff --git a/WechatSearchProjects/Wechatproject/Wechatproject/settings.py b/WechatSearchProjects/Wechatproject/Wechatproject/settings.py
old mode 100644
new mode 100755
index fdc670e2..4cbd78e0
--- a/WechatSearchProjects/Wechatproject/Wechatproject/settings.py
+++ b/WechatSearchProjects/Wechatproject/Wechatproject/settings.py
@@ -6,12 +6,46 @@
 #     http://doc.scrapy.org/en/latest/topics/settings.html
 #
+import random
 
 BOT_NAME = 'Wechatproject'
 
 SPIDER_MODULES = ['Wechatproject.spiders']
 NEWSPIDER_MODULE = 'Wechatproject.spiders'
 
-ITEM_PIPELINES = ['Wechatproject.pipelines.WechatprojectPipeline'] # add settings
+user_agent_list = [
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
+    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
+    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
+    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
+    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
+    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
+]
+# Chosen once at import time, so the whole crawl shares a single user agent
+UA = random.choice(user_agent_list)
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = UA
+
+# # Disable cookies (enabled by default)
+# COOKIES_ENABLED = False
+
+ITEM_PIPELINES = {'Wechatproject.pipelines.WechatprojectPipeline': 1} # add settings
+
+# DOWNLOADER_MIDDLEWARES = {
+#     'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
+#     'Wechatproject.middlewares.ProxyMiddleware': 100,
+# }
 #############################################################################################
 # '''if you want to download images'''
 # ITEM_PIPELINES = {'Wechatproject.pipelines.WechatprojectPipeline':1, 'Wechatproject.pipelines.MyImagesPipeline':2} # add settings
@@ -19,3 +53,8 @@
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'Wechatproject (+http://www.yourdomain.com)'
+
+LOG_ENABLED = True
+LOG_ENCODING = 'utf-8'
+LOG_STDOUT = True
+LOG_FILE = 'error.log'
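
One caveat on the settings hunk: UA = random.choice(user_agent_list) runs once, when Scrapy imports the settings module, so a single user agent is reused for the entire run. Rotating per request would need a downloader middleware along these lines. This is a minimal sketch; the RandomUserAgentMiddleware class and the USER_AGENT_LIST setting are assumptions for illustration, not part of this commit:

# middlewares.py -- rotate the user agent on every request
import random


class RandomUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Read the list from settings so it can live next to USER_AGENT
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        # Pick a fresh user agent for each outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)

Enabling it would mean exposing the list as USER_AGENT_LIST = user_agent_list in settings.py and registering the class in DOWNLOADER_MIDDLEWARES, e.g. {'Wechatproject.middlewares.RandomUserAgentMiddleware': 400}.
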
diff --git a/WechatSearchProjects/Wechatproject/Wechatproject/spiders/spider.py b/WechatSearchProjects/Wechatproject/Wechatproject/spiders/spider.py
old mode 100644
new mode 100755
index 9171eefd..3a5326c1
--- a/WechatSearchProjects/Wechatproject/Wechatproject/spiders/spider.py
+++ b/WechatSearchProjects/Wechatproject/Wechatproject/spiders/spider.py
@@ -1,63 +1,89 @@
 #coding: utf-8
-from scrapy.spider import BaseSpider
+import scrapy
 from scrapy.selector import Selector
 from Wechatproject.items import WechatprojectItem
 from bs4 import BeautifulSoup
 from scrapy.http import Request
 
-class WechatSpider(BaseSpider):
+class WechatSpider(scrapy.Spider):
     #############################################################################################
     '''WeChat search spider'''
-    name = "wechat"
+    name = 'spider'
     start_urls = []
-    querystring = u"清华"
+    querystring = 'NBA'
     type = 2 # 2 = articles, 1 = official accounts
-    for i in range(1, 50, 1):
-        start_urls.append("http://weixin.sogou.com/weixin?type=%d&query=%s&page=%d" % (type, querystring, i))
-        # print start_urls
+    for i in range(1, 3, 1):
+        start_urls.append('http://weixin.sogou.com/weixin?type=%d&query=%s&page=%d' % (type, querystring, i))
+    print('start urls: ' + str(start_urls))
+
+    # def start_requests(self):
+    #     hostname = 'http://weixin.sogou.com'
+    #     yield Request(url=hostname, meta={'cookiejar': 1}, callback=self.getcookie, dont_filter=True)
+    #
+    # def getcookie(self, response):
+    #     yield Request(url=self.start_urls[0], meta={'cookiejar': response.meta['cookiejar']}, callback=self.parse, dont_filter=True)
 
     #############################################################################################
     ## Recursive crawl
     ## xpath() version; note that item values should be strings, while extract() returns a list
     def parse(self, response):
+        print('parsing: ' + response.url)
         # print response.body
         sel = Selector(response)
-        sites = sel.xpath('//div[@class="txt-box"]/h4/a')
+        sites = sel.xpath('//div[@class="txt-box"]/h3/a')
         for site in sites:
             item = WechatprojectItem()
-            item["title"] = site.xpath("text()").extract() # title = Field() is defined in items.py
+            item['title'] = site.xpath("descendant::text()").extract() # title = Field() is defined in items.py
             item["link"] = site.xpath("@href").extract() # link = Field() is defined in items.py
-            #############################################################################################
             # yield item ## scrape the current page only
-            next_url = item["link"][0]
+            item["link"] = item["link"][0]
+            item['title'] = "".join(item['title'])
+            print('article title: ' + item['title'])
+            print('article link: ' + item['link'])
             # yield Request(url=next_url, callback=self.parse2) ## scrape second-level pages only
-            yield Request(url=next_url, meta={"item":item}, callback=self.parse2) ## scrape the current page and second-level pages
+            yield Request(url=item["link"], meta={"item": item}, callback=self.parse2,
+                          dont_filter=True) ## scrape the current page and second-level pages
+
+        # Debug variant: scrape only the eighth search result instead of looping
+        # item = WechatprojectItem()
+        # item['title'] = sites[7].xpath("descendant::text()").extract()
+        # item["link"] = sites[7].xpath("@href").extract()
+        # item["link"] = item["link"][0]
+        # item['title'] = "".join(item['title'])
+        # yield Request(url=item["link"], meta={"item": item}, callback=self.parse2, dont_filter=True)
 
-    ## BeautifulSoup version; note that item values should be strings
-    def parse(self, response):
-        # print response.body
-        soup = BeautifulSoup(response.body)
-        tags = soup.findAll("h4")
-        for tag in tags:
-            item = WechatprojectItem()
-            item["title"] = tag.text # title = Field() is defined in items.py
-            item["link"] = tag.find("a").get("href") # link = Field() is defined in items.py
-            #############################################################################################
-            # yield item ## scrape the current page only
-            next_url = item["link"]
-            # yield Request(url=next_url, callback=self.parse2) ## scrape second-level pages only
-            yield Request(url=next_url, meta={"item":item}, callback=self.parse2) ## scrape the current page and second-level pages
+    # ## BeautifulSoup version; note that item values should be strings
+    # def parse(self, response):
+    #     soup = BeautifulSoup(response.body, 'lxml')
+    #     tags = soup.findAll("h4")
+    #     for tag in tags:
+    #         item = WechatprojectItem()
+    #         item['title'] = tag.text # title = Field() is defined in items.py
+    #         item['link'] = tag.find("a").get("href") # link = Field() is defined in items.py
+    #         # yield item ## scrape the current page only
+    #         next_url = item["link"]
+    #         # yield Request(url=next_url, callback=self.parse2) ## scrape second-level pages only
+    #         yield Request(url=next_url, meta={"item":item}, callback=self.parse2, dont_filter=True) ## scrape the current page and second-level pages
 
     def parse2(self, response):
-        soup = BeautifulSoup(response.body)
+        soup = BeautifulSoup(response.body, 'lxml')
         tag = soup.find("div", attrs={"class":"rich_media_content", "id":"js_content"}) # take the first matching tag
         content_list = [tag_i.text for tag_i in tag.findAll("p")]
         content = "".join(content_list)
-        # print content
+        print('wechat content is: ' + content)
         # item = WechatprojectItem() ## scrape second-level pages only
         item = response.meta['item'] ## scrape the current page and second-level pages
-        item["content"] = content
+        item['content'] = content
        return item
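
Two reading notes on the spider diff. First, the old file defined parse twice, so the BeautifulSoup version silently shadowed the xpath version; the commit resolves that by keeping the xpath parse and commenting the other out. Second, parse and parse2 together assume that WechatprojectItem declares title, link, and content fields; items.py is not touched by this diff, but a minimal definition consistent with that usage would be the sketch below. Treat it as an assumption, not the project's actual file:

# items.py -- fields referenced by WechatSpider and WechatprojectPipeline
import scrapy

class WechatprojectItem(scrapy.Item):
    title = scrapy.Field()    # article title, filled in parse()
    link = scrapy.Field()     # article URL, filled in parse()
    content = scrapy.Field()  # article body text, filled in parse2()

With the spider renamed to name = 'spider', the crawl is started with scrapy crawl spider; items flow through WechatprojectPipeline into the testwechat.result MongoDB collection, and the log is written to error.log as configured in settings.py.
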