From bbd8d2dbc1b1150446788a38705d2957c218c9aa Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Tue, 7 Jun 2016 18:31:10 +0200 Subject: [PATCH 1/4] Use six for Python3 compatibility --- dirbot/pipelines.py | 4 +++- setup.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dirbot/pipelines.py b/dirbot/pipelines.py index d301b59..b7c9f3c 100644 --- a/dirbot/pipelines.py +++ b/dirbot/pipelines.py @@ -1,3 +1,5 @@ +import six + from scrapy.exceptions import DropItem @@ -10,7 +12,7 @@ class FilterWordsPipeline(object): def process_item(self, item, spider): for word in self.words_to_filter: - if word in unicode(item['description']).lower(): + if word in six.text_type(item['description']).lower(): raise DropItem("Contains forbidden word: %s" % word) else: return item diff --git a/setup.py b/setup.py index fac72ad..af95442 100644 --- a/setup.py +++ b/setup.py @@ -5,4 +5,8 @@ version='1.0', packages=find_packages(), entry_points={'scrapy': ['settings = dirbot.settings']}, + install_requires=[ + 'scrapy', + 'six' + ] ) From 6d0318e0d86b41d13bdd411c1be8704fadf5298f Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Wed, 8 Jun 2016 11:54:20 +0200 Subject: [PATCH 2/4] Use single values when building items Also reverts six dependency --- dirbot/pipelines.py | 4 +--- dirbot/spiders/dmoz.py | 12 +++++++----- setup.py | 4 ---- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/dirbot/pipelines.py b/dirbot/pipelines.py index b7c9f3c..be30566 100644 --- a/dirbot/pipelines.py +++ b/dirbot/pipelines.py @@ -1,5 +1,3 @@ -import six - from scrapy.exceptions import DropItem @@ -12,7 +10,7 @@ class FilterWordsPipeline(object): def process_item(self, item, spider): for word in self.words_to_filter: - if word in six.text_type(item['description']).lower(): + if word in item['description'].lower(): raise DropItem("Contains forbidden word: %s" % word) else: return item diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py index 919fe97..250cf6c 100644 --- a/dirbot/spiders/dmoz.py +++ b/dirbot/spiders/dmoz.py @@ -20,15 +20,17 @@ def parse(self, response): @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/ @scrapes name """ - sel = Selector(response) - sites = sel.xpath('//ul[@class="directory-url"]/li') + sites = response.css('#site-list-content > div.site-item > div.title-and-desc') items = [] for site in sites: item = Website() - item['name'] = site.xpath('a/text()').extract() - item['url'] = site.xpath('a/@href').extract() - item['description'] = site.xpath('text()').re('-\s[^\n]*\\r') + item['name'] = site.css( + 'a > div.site-title::text').extract_first().strip() + item['url'] = site.xpath( + 'a/@href').extract_first().strip() + item['description'] = site.css( + 'div.site-descr').xpath('text()[1]').extract_first().strip() items.append(item) return items diff --git a/setup.py b/setup.py index af95442..fac72ad 100644 --- a/setup.py +++ b/setup.py @@ -5,8 +5,4 @@ version='1.0', packages=find_packages(), entry_points={'scrapy': ['settings = dirbot.settings']}, - install_requires=[ - 'scrapy', - 'six' - ] ) From 810eae1563f78c0402d054b4edfb12729da64887 Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Wed, 8 Jun 2016 12:02:57 +0200 Subject: [PATCH 3/4] Simplify description extraction --- dirbot/spiders/dmoz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py index 250cf6c..6ecb2ff 100644 --- a/dirbot/spiders/dmoz.py +++ b/dirbot/spiders/dmoz.py @@ -30,7 +30,7 @@ def parse(self, response): item['url'] = site.xpath( 'a/@href').extract_first().strip() item['description'] = site.css( - 'div.site-descr').xpath('text()[1]').extract_first().strip() + 'div.site-descr::text').extract_first().strip() items.append(item) return items From 8e995447361ad83f6693d2e07d17d90912268ace Mon Sep 17 00:00:00 2001 From: Paul Tremberth Date: Thu, 30 Mar 2017 16:10:19 +0200 Subject: [PATCH 4/4] Update README.rst --- README.rst | 49 +++++++------------------------------------------ 1 file changed, 7 insertions(+), 42 deletions(-) diff --git a/README.rst b/README.rst index dc4e978..9955c53 100644 --- a/README.rst +++ b/README.rst @@ -2,48 +2,13 @@ dirbot ====== -This is a Scrapy project to scrape websites from public web directories. +Deprecation notice (March 2017) +=============================== -This project is only meant for educational purposes. +**This project is now deprecated.** -Items -===== +http://dmoz.org is no more and Scrapy's tutorial has been re-written +against http://quotes.toscrape.com/. -The items scraped by this project are websites, and the item is defined in the -class:: - - dirbot.items.Website - -See the source code for more details. - -Spiders -======= - -This project contains one spider called ``dmoz`` that you can see by running:: - - scrapy list - -Spider: dmoz ------------- - -The ``dmoz`` spider scrapes the Open Directory Project (dmoz.org), and it's -based on the dmoz spider described in the `Scrapy tutorial`_ - -This spider doesn't crawl the entire dmoz.org site but only a few pages by -default (defined in the ``start_urls`` attribute). These pages are: - -* http://www.dmoz.org/Computers/Programming/Languages/Python/Books/ -* http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/ - -So, if you run the spider regularly (with ``scrapy crawl dmoz``) it will scrape -only those two pages. - -.. _Scrapy tutorial: http://doc.scrapy.org/en/latest/intro/tutorial.html - -Pipelines -========= - -This project uses a pipeline to filter out websites containing certain -forbidden words in their description. This pipeline is defined in the class:: - - dirbot.pipelines.FilterWordsPipeline +Please refer to https://github.com/scrapy/quotesbot for a more relevant +and up-to-date educational project on how to get started with Scrapy.