
Commit eb1e1e6

committed with message: 提交代码 ("Commit code")
1 parent c104012 commit eb1e1e6

18 files changed: +382 −1 lines changed

fans/README.md

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ Python技术 official-account article code repository (公众号文章代码库)
 [PyAutoGUI, easily handle image uploads!](https://github.com/JustDoPython/python-examples/tree/master/fans/imgupload): PyAutoGUI, easily handle image uploads!
-
+[To buy a car, I scraped Dongchedi!](https://github.com/JustDoPython/python-examples/tree/master/fans/scrapydcd): To buy a car, I scraped Dongchedi!

fans/scrapydcd/dcd/dcd/__init__.py

Whitespace-only changes.
(4 binary files not shown)

fans/scrapydcd/dcd/dcd/chrom_middlewares.py
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from scrapy.http.response.html import HtmlResponse

class DcdDownloaderMiddleware(object):

    def __init__(self):
        # Configure the Chrome browser used for rendering
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')

        self.driver = webdriver.Chrome(executable_path=r"C:\drf2\drf2\chromedriver.exe", options=options)
        self.driver.maximize_window()

    # Override process_request so every request is fetched through Selenium
    def process_request(self, request, spider):
        print('request.url', request.url)
        self.driver.get(request.url)
        js = 'return document.body.scrollHeight;'
        height = 0
        if request.url != 'https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x':
            while True:
                new_height = self.driver.execute_script(js)
                if new_height > height:
                    # Keep scrolling so lazily loaded listings are rendered
                    self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                    height = new_height
                    time.sleep(1)
                else:
                    print("The scrollbar is already at the bottom of the page!")
                    break
        source = self.driver.page_source
        # Create a response object and wrap the rendered page content in it
        response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding="utf-8")
        return response
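
One thing this middleware never does is shut the browser down: the Chrome instance created in __init__ stays alive after the crawl ends. A minimal sketch (not part of this commit) of how cleanup could be hooked up with Scrapy's standard spider_closed signal; the mixin name and method are illustrative additions:

from scrapy import signals

class DriverCleanupMixin:
    # Sketch: quit the Selenium driver once the spider has finished.
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_closed(self, spider):
        # self.driver is the Chrome instance created in the middleware's __init__
        self.driver.quit()

Mixing this into DcdDownloaderMiddleware (or adding the two methods directly) would close Chrome when the car spider stops.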

fans/scrapydcd/dcd/dcd/items.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DcdItem(scrapy.Item):
    # brand (品牌)
    brand = scrapy.Field()
    # model name (车型)
    name = scrapy.Field()
    # score (评分)
    score = scrapy.Field()
    # highlight / selling point (特点)
    title = scrapy.Field()

fans/scrapydcd/dcd/dcd/middlewares.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import time
from selenium import webdriver
from scrapy import signals
from scrapy.http.response.html import HtmlResponse

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class DcdSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class DcdDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        # Configure the Chrome browser used for rendering
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')

        self.driver = webdriver.Chrome(executable_path=r"C:\drf2\drf2\chromedriver.exe", options=options)
        self.driver.maximize_window()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        print('request.url', request.url)
        self.driver.get(request.url)
        js = 'return document.body.scrollHeight;'
        height = 0
        if request.url != 'https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x':
            while True:
                new_height = self.driver.execute_script(js)
                if new_height > height:
                    self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                    height = new_height
                    time.sleep(1)
                else:
                    print("The scrollbar is already at the bottom of the page!")
                    break
        source = self.driver.page_source
        # Create a response object and wrap the rendered page content in it
        response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding="utf-8")
        return response

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
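
Note that this templated middlewares.py carries a second copy of DcdDownloaderMiddleware with the same Selenium logic, but only the copy in dcd.chrom_middlewares is registered under DOWNLOADER_MIDDLEWARES in settings.py below, so this one stays inactive unless it is enabled there as well.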

fans/scrapydcd/dcd/dcd/pipelines.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class DcdPipeline:
    def process_item(self, item, spider):
        return item
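
DcdPipeline is left as the generated stub; the CSV output instead happens at import time in car.py below. As a hedged alternative (not part of this commit), the same output could live in a pipeline so the file is opened and closed together with the spider. The class name DcdCsvPipeline is illustrative, the output path and header row are taken from car.py, and it would still need to be registered under ITEM_PIPELINES in settings.py:

import csv

class DcdCsvPipeline:
    def open_spider(self, spider):
        # open the same CSV target that car.py writes to
        self.file = open('D:/桌面/car.csv', 'w', newline='', encoding='gb18030')
        self.writer = csv.writer(self.file)
        self.writer.writerow(['品牌', '车型', '评分', '特点'])  # brand, model, score, highlight

    def process_item(self, item, spider):
        self.writer.writerow([item.get('brand'), item.get('name'), item.get('score'), item.get('title')])
        return item

    def close_spider(self, spider):
        self.file.close()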

fans/scrapydcd/dcd/dcd/settings.py

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
# Scrapy settings for dcd project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dcd'

SPIDER_MODULES = ['dcd.spiders']
NEWSPIDER_MODULE = 'dcd.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dcd (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dcd.middlewares.DcdSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'dcd.chrom_middlewares.DcdDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'dcd.pipelines.DcdPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
fans/scrapydcd/dcd/dcd/spiders/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
(2 binary files not shown)

fans/scrapydcd/dcd/dcd/spiders/car.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
import scrapy
from lxml import etree
from dcd.items import DcdItem
import os, csv

# Recreate the output CSV before the crawl starts
if os.path.exists('D:/桌面/car.csv'):
    print('delete?')
    os.remove('D:/桌面/car.csv')
f = open('D:/桌面/car.csv', 'a+', newline='', encoding='gb18030')
f_csv = csv.writer(f)
f_csv.writerow(['品牌', '车型', '评分', '特点'])  # header: brand, model, score, highlight


class RainSpider(scrapy.Spider):
    name = 'car'
    allowed_domains = ['dongchedi.com']  # domain only, not the full URL
    # In the URL, brand id 73 stands for Geely (吉利), 18 for Chery (奇瑞) and 35 for Changan (长安); request these three brands
    start_urls = [
        'https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-73-x-x',
        'https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-18-x-x',
        'https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-35-x-x',
    ]

    def parse(self, response):
        print('html111')
        html = etree.HTML(response.text)

        item = DcdItem()
        brand = html.xpath('//*[@id="__next"]/div[1]/div[2]/div/div[4]/span[2]/div/div/a/text()')[0]
        lis = html.xpath('//*[@id="__next"]/div[1]/div[2]/div/ul/li[position()>= 1]')
        print('111 lis', lis)
        for li in lis:
            name = li.xpath('./div/a[1]/text()')[0]
            try:
                # the model has a score
                score = li.xpath('./div/a[2]/text()')[0].split('分')[0]
            except Exception:
                # no score yet
                score = 0
            try:
                # the model has a highlight title
                title = li.xpath('./div/span/text()')[0]
            except Exception:
                # no title: fall back to '无' ("none")
                title = '无'
            print(name, score, title)
            f_csv.writerow([brand, name, score, title])

            item['brand'] = brand  # brand was written to the CSV but never stored on the item; store it too
            item['name'] = name
            item['score'] = score
            item['title'] = title
            yield item

fans/scrapydcd/dcd/dcd/start.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
from scrapy.cmdline import execute

execute('scrapy crawl car'.split(' '))
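
Running start.py from the project directory is equivalent to invoking scrapy crawl car on the command line, which makes it easy to launch the crawl directly from an IDE.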
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool

url_list = []
num_list = []
brand_dict = {}


def get_brand_id(num):
    x = 'https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-%s-x-x' % num
    rep = requests.get(x, timeout=10)
    soup = BeautifulSoup(rep.text, 'html.parser')
    condition = soup.find_all('span', class_='layout_label__1qfS8')
    # When num is greater than 500 the brand may not exist, and condition[5] will raise an error
    try:
        s = condition[5].next_sibling.a.text  # brand name shown in the "selected filters" area
        print('s111', s)
        url_list.append(x)
    except Exception:
        return  # no brand behind this id, skip it

    for span in condition:
        if span.string == '已选条件':  # '已选条件' means "selected filters"
            print('ok')
            brand_dict[s] = num
            num_list.append(num)


pool = ThreadPool(10)
pool.map(get_brand_id, [i for i in range(1, 1000)])
print(num_list)
print(brand_dict)
print(url_list)
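
This last helper script is how the numeric brand ids are discovered: it probes ids 1-999 with ten worker threads, keeps the ids that resolve to a real brand page, and prints the resulting id list, brand-name-to-id mapping, and URL list. The ids hard-coded in car.py (73 for Geely, 18 for Chery, 35 for Changan) come from this mapping.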
