Description
Try this URL: http://www.fengzhiling.com.cn
```python
import scrapy
from playwright.async_api import Page
from scrapy.crawler import CrawlerProcess


class ExampleSpider(scrapy.Spider):
    name = "example"
    custom_settings = {
        "PLAYWRIGHT_ABORT_REQUEST": lambda req: req.resource_type in ["image"],
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "CONCURRENT_REQUESTS": 1,
        "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
        "RETRY_ENABLED": False,
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 3000,
        "LOG_LEVEL": "ERROR",
    }

    def start_requests(self):
        # One dedicated Playwright context per request ("1" .. "5")
        i = 1
        urls = [
            "http://www.fengzhiling.com.cn",
            "http://www.fengzhiling.com.cn",
            "http://www.fengzhiling.com.cn",
            "http://www.fengzhiling.com.cn",
            "http://www.fengzhiling.com.cn",
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                meta={
                    "playwright": True,
                    "playwright_context": str(i),
                    "playwright_include_page": True,
                },
                callback=self.parse,
                errback=self.close,
                dont_filter=True,
            )
            i += 1

    async def parse(self, response):
        page = response.meta["playwright_page"]
        try:
            title = await page.title()
            html = await page.content()
        finally:
            # Close the page and its context so resources are released
            await page.close()
            await page.context.close()

    async def close(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.close()
        await page.context.close()


process = CrawlerProcess()
process.crawl(ExampleSpider)
process.start()
```
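A side note on the ERROR at the end of the log below: `scrapy.Spider` already defines a `close` method, and Scrapy connects `spider.close` to the `spider_closed` signal, so naming the errback `close` means the same coroutine is also invoked at shutdown with the signal's arguments; that is what the "Error caught on signal handler" line reports. A minimal sketch of the same cleanup under a non-conflicting name (the name `errback_close_page` is just an illustration, not part of the original code):

```python
# Sketch only: identical cleanup, renamed so it no longer shadows
# scrapy.Spider.close (which Scrapy calls on the spider_closed signal).
async def errback_close_page(self, failure):
    page = failure.request.meta.get("playwright_page")
    if page is not None:
        await page.close()
        await page.context.close()
```

The request would then be built with `errback=self.errback_close_page` instead of `errback=self.close`.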
Logs:

```
2025-08-08 16:49:42 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2025-08-08 16:49:42 [scrapy-playwright] INFO: Starting download handler
2025-08-08 16:49:42 [scrapy-playwright] INFO: Starting download handler
<GET http://www.fengzhiling.com.cn>
2025-08-08 16:49:43 [scrapy-playwright] INFO: Launching browser chromium
2025-08-08 16:49:43 [scrapy-playwright] INFO: Browser chromium launched
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: Browser context started: '1' (persistent=False, remote=False)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=1] New page created, page count is 1 (1 for all contexts)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=1] Request: <GET http://www.fengzhiling.com.cn/> (resource type: document)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=1] Response: <200 http://www.fengzhiling.com.cn/>
<GET http://www.fengzhiling.com.cn>
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: Browser context started: '2' (persistent=False, remote=False)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=2] New page created, page count is 1 (2 for all contexts)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=2] Request: <GET http://www.fengzhiling.com.cn/> (resource type: document)
1print
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: Browser context closed: '1' (persistent=False, remote=False)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=2] Response: <200 http://www.fengzhiling.com.cn/>
<GET http://www.fengzhiling.com.cn>
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: Browser context started: '3' (persistent=False, remote=False)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=3] New page created, page count is 1 (2 for all contexts)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=3] Request: <GET http://www.fengzhiling.com.cn/> (resource type: document)
2print
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: Browser context closed: '2' (persistent=False, remote=False)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=3] Response: <200 http://www.fengzhiling.com.cn/>
<GET http://www.fengzhiling.com.cn>
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: Browser context started: '4' (persistent=False, remote=False)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=4] New page created, page count is 1 (2 for all contexts)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=4] Request: <GET http://www.fengzhiling.com.cn/> (resource type: document)
3print
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: Browser context closed: '3' (persistent=False, remote=False)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=4] Response: <200 http://www.fengzhiling.com.cn/>
<GET http://www.fengzhiling.com.cn>
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: Browser context started: '5' (persistent=False, remote=False)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=5] New page created, page count is 1 (2 for all contexts)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=5] Request: <GET http://www.fengzhiling.com.cn/> (resource type: document)
4print
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: Browser context closed: '4' (persistent=False, remote=False)
2025-08-08 16:49:43 [scrapy-playwright] DEBUG: [Context=5] Response: <200 http://www.fengzhiling.com.cn/>
5print
2025-08-08 16:49:44 [scrapy-playwright] DEBUG: Browser context closed: '5' (persistent=False, remote=False)
2025-08-08 16:49:44 [scrapy.core.engine] INFO: Closing spider (finished)
2025-08-08 16:49:44 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method ExampleSpider.close of <ExampleSpider 'example' at 0x7f4e8bbffb60>>
```
It seems every page and context is released:

```
Browser context started: '1'
Browser context started: '2'
Browser context started: '3'
Browser context started: '4'
Browser context started: '5'
Browser context closed: '1'
Browser context closed: '2'
Browser context closed: '3'
Browser context closed: '4'
Browser context closed: '5'
```
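To double-check that every page and context really is released, a counter can be logged right before the teardown in `parse`. This is only a sketch against the spider above, relying on Playwright's `BrowserContext.browser` and `Browser.contexts` properties; it also assumes `LOG_LEVEL` is lowered from `ERROR` so the message actually shows up:

```python
# Sketch: report how many browser contexts are still open just before closing.
# BrowserContext.browser is None for persistent contexts; this spider uses
# regular (non-persistent) contexts, so the property is available here.
async def parse(self, response):
    page = response.meta["playwright_page"]
    try:
        title = await page.title()
        html = await page.content()
        browser = page.context.browser
        if browser is not None:
            # Needs LOG_LEVEL at INFO or below to appear in the output.
            self.logger.info("Contexts open before close: %d", len(browser.contexts))
    finally:
        await page.close()
        await page.context.close()
```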