From 73e566a31007d8876a69ddfda03f4ccdda5b7dac Mon Sep 17 00:00:00 2001
From: Steven Wilson
Date: Fri, 22 Mar 2013 17:50:06 +0000
Subject: [PATCH 01/14] capture description without trailing newline
 characters using a regular expression

---
 dirbot/spiders/dmoz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index e0807ec..05a0108 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -28,7 +28,7 @@ def parse(self, response):
             item = Website()
             item['name'] = site.select('a/text()').extract()
             item['url'] = site.select('a/@href').extract()
-            item['description'] = site.select('text()').extract()
+            item['description'] = site.select('text()').re('-\s([^\n]*?)\\n')
             items.append(item)
 
         return items
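The pattern added here keeps the text between the leading ``- `` and the
newline that follows it. A minimal standalone check with Python's ``re``
module (the sample string is a hypothetical stand-in for a dmoz text node,
and the raw-string form is equivalent to the escaped literal in the patch)::

    import re

    # Hypothetical text node as dmoz pages rendered it: a dash, the
    # description, then a trailing newline.
    text = "\r\n\t- Free Python books and tutorials.\n"

    # Same pattern as the patch, written as a raw string: capture
    # everything after "- " up to (but not including) the newline.
    print(re.findall(r'-\s([^\n]*?)\n', text))
    # ['Free Python books and tutorials.']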
From 0edb745020e6f83ea6a21e6dbea9b0446623db49 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Fri, 22 Mar 2013 15:02:25 -0300
Subject: [PATCH 02/14] remove custom __str__ from Website item

---
 dirbot/items.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/dirbot/items.py b/dirbot/items.py
index a3f0408..b1e341f 100644
--- a/dirbot/items.py
+++ b/dirbot/items.py
@@ -6,6 +6,3 @@ class Website(Item):
     name = Field()
     description = Field()
     url = Field()
-
-    def __str__(self):
-        return "Website: name=%s url=%s" % (self.get('name'), self.get('url'))

From 4bbc16b352b1ca22cd944f2a8a0d36a79b8ccf01 Mon Sep 17 00:00:00 2001
From: Rolando Espinoza La fuente
Date: Sat, 19 Oct 2013 23:21:41 -0400
Subject: [PATCH 03/14] Removed reference to ``crawl URL`` command as it's no
 longer supported.

---
 README.rst | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/README.rst b/README.rst
index f66b73d..9210eb1 100644
--- a/README.rst
+++ b/README.rst
@@ -36,15 +36,7 @@ default (defined in the ``start_pages`` attribute). These pages are:
 * http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
 
 So, if you run the spider regularly (with ``scrapy crawl dmoz``) it will scrape
-only those two pages. However, you can scrape any dmoz.org page by passing the
-url instead of the spider name. Scrapy internally resolves the spider to use by
-looking at the allowed domains of each spider.
-
-For example, to scrape a different URL use::
-
-    scrapy crawl http://www.dmoz.org/Computers/Programming/Languages/Erlang/
-
-You can scrape any URL from dmoz.org using this spider
+only those two pages.
 
 .. _Scrapy tutorial: http://doc.scrapy.org/intro/tutorial.html

From 8f1abfaf6ab3738cb880885e227a56c82bb4178c Mon Sep 17 00:00:00 2001
From: Mat Gadd
Date: Mon, 20 Jan 2014 18:40:06 +0000
Subject: [PATCH 04/14] Ignore Python bytecode files.

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc

From eb6a8b44f810c281ccd4217b8bb8ba1dbf9cec38 Mon Sep 17 00:00:00 2001
From: Mat Gadd
Date: Mon, 20 Jan 2014 18:40:12 +0000
Subject: [PATCH 05/14] Fix deprecation warning.

---
 dirbot/spiders/dmoz.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 05a0108..8a69fea 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -1,10 +1,10 @@
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.selector import HtmlXPathSelector
 
 from dirbot.items import Website
 
 
-class DmozSpider(BaseSpider):
+class DmozSpider(Spider):
     name = "dmoz"
     allowed_domains = ["dmoz.org"]
     start_urls = [

From 736423bbd3fa7a9cde6d346e6e7d5f716d900d25 Mon Sep 17 00:00:00 2001
From: Nikhil Gupta
Date: Sat, 12 Apr 2014 12:35:16 +0530
Subject: [PATCH 06/14] Updated deprecated code

---
 dirbot/settings.py     |  2 +-
 dirbot/spiders/dmoz.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/dirbot/settings.py b/dirbot/settings.py
index 2c8dd3d..4742811 100644
--- a/dirbot/settings.py
+++ b/dirbot/settings.py
@@ -4,4 +4,4 @@
 NEWSPIDER_MODULE = 'dirbot.spiders'
 DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
 
-ITEM_PIPELINES = ['dirbot.pipelines.FilterWordsPipeline']
+ITEM_PIPELINES = {'dirbot.pipelines.FilterWordsPipeline': 1}

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 8a69fea..00eaa25 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -1,5 +1,5 @@
 from scrapy.spider import Spider
-from scrapy.selector import HtmlXPathSelector
+from scrapy.selector import Selector
 
 from dirbot.items import Website
 
@@ -20,15 +20,15 @@ def parse(self, response):
         @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
         @scrapes name
         """
-        hxs = HtmlXPathSelector(response)
-        sites = hxs.select('//ul[@class="directory-url"]/li')
+        sel = Selector(response)
+        sites = sel.xpath('//ul[@class="directory-url"]/li')
         items = []
 
         for site in sites:
             item = Website()
-            item['name'] = site.select('a/text()').extract()
-            item['url'] = site.select('a/@href').extract()
-            item['description'] = site.select('text()').re('-\s([^\n]*?)\\n')
+            item['name'] = site.xpath('a/text()').extract()
+            item['url'] = site.xpath('a/@href').extract()
+            item['description'] = site.xpath('text()').re('-\s([^\n]*?)\\n')
             items.append(item)
 
         return items
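In the updated ``ITEM_PIPELINES`` setting, the integer assigned to each
pipeline is its order: pipelines with lower values run first. The renamed
selector API can be exercised against an inline HTML snippet; a minimal
sketch, assuming hypothetical markup shaped like the old dmoz listing::

    from scrapy.selector import Selector

    # Hypothetical markup mirroring dmoz's directory-url list.
    sel = Selector(text='<ul class="directory-url"><li>'
                        '<a href="http://example.com">Example</a>'
                        ' - A sample description.\n</li></ul>')

    # xpath() replaces the deprecated select() with identical semantics.
    for site in sel.xpath('//ul[@class="directory-url"]/li'):
        print(site.xpath('a/text()').extract())   # ['Example']
        print(site.xpath('a/@href').extract())    # ['http://example.com']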
From 7fc9120f5b7d9c9f381099ebecef7e9e01446ade Mon Sep 17 00:00:00 2001
From: Carol Willing
Date: Fri, 2 May 2014 19:33:15 -0700
Subject: [PATCH 07/14] Corrected link to tutorial

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 9210eb1..86c7442 100644
--- a/README.rst
+++ b/README.rst
@@ -38,7 +38,7 @@ default (defined in the ``start_pages`` attribute). These pages are:
 So, if you run the spider regularly (with ``scrapy crawl dmoz``) it will scrape
 only those two pages.
 
-.. _Scrapy tutorial: http://doc.scrapy.org/intro/tutorial.html
+.. _Scrapy tutorial: http://doc.scrapy.org/en/latest/intro/tutorial.html
 
 Pipelines
 =========

From b303147a56f1090c67cf6ed4f0526b9f3960a186 Mon Sep 17 00:00:00 2001
From: iwahoo30
Date: Tue, 30 Sep 2014 11:48:13 -0700
Subject: [PATCH 08/14] Change description regex to ('-\s[^\n]*\\r')

The previous pattern was not capturing the description, so made a small
change to the regex.
---
 dirbot/spiders/dmoz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 00eaa25..7f88f05 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -28,7 +28,7 @@ def parse(self, response):
             item = Website()
             item['name'] = site.xpath('a/text()').extract()
             item['url'] = site.xpath('a/@href').extract()
-            item['description'] = site.xpath('text()').re('-\s([^\n]*?)\\n')
+            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
             items.append(item)
 
         return items

From 64058941055398e1f6e29932a16dfaa47de6effe Mon Sep 17 00:00:00 2001
From: Geoffrey van Wyk
Date: Thu, 2 Jul 2015 10:17:09 +0200
Subject: [PATCH 09/14] Silences deprecation warning about scrapy.spider
 module.

---
 dirbot/spiders/dmoz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 7f88f05..919fe97 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -1,4 +1,4 @@
-from scrapy.spider import Spider
+from scrapy.spiders import Spider
 from scrapy.selector import Selector
 
 from dirbot.items import Website

From 36b646d7efee77405216426e961616bed7c9735b Mon Sep 17 00:00:00 2001
From: Geoffrey van Wyk
Date: Thu, 2 Jul 2015 11:14:42 +0200
Subject: [PATCH 10/14] Corrects start_pages to start_urls in README.

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 86c7442..dc4e978 100644
--- a/README.rst
+++ b/README.rst
@@ -30,7 +30,7 @@ The ``dmoz`` spider scrapes the Open Directory Project (dmoz.org), and it's
 based on the dmoz spider described in the `Scrapy tutorial`_
 
 This spider doesn't crawl the entire dmoz.org site but only a few pages by
-default (defined in the ``start_pages`` attribute). These pages are:
+default (defined in the ``start_urls`` attribute). These pages are:
 
 * http://www.dmoz.org/Computers/Programming/Languages/Python/Books/
 * http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/

From bbd8d2dbc1b1150446788a38705d2957c218c9aa Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Tue, 7 Jun 2016 18:31:10 +0200
Subject: [PATCH 11/14] Use six for Python 3 compatibility

---
 dirbot/pipelines.py | 4 +++-
 setup.py            | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/dirbot/pipelines.py b/dirbot/pipelines.py
index d301b59..b7c9f3c 100644
--- a/dirbot/pipelines.py
+++ b/dirbot/pipelines.py
@@ -1,3 +1,5 @@
+import six
+
 from scrapy.exceptions import DropItem
 
 
@@ -10,7 +12,7 @@ class FilterWordsPipeline(object):
 
     def process_item(self, item, spider):
         for word in self.words_to_filter:
-            if word in unicode(item['description']).lower():
+            if word in six.text_type(item['description']).lower():
                 raise DropItem("Contains forbidden word: %s" % word)
         else:
             return item

diff --git a/setup.py b/setup.py
index fac72ad..af95442 100644
--- a/setup.py
+++ b/setup.py
@@ -5,4 +5,8 @@
     version='1.0',
     packages=find_packages(),
     entry_points={'scrapy': ['settings = dirbot.settings']},
+    install_requires=[
+        'scrapy',
+        'six'
+    ]
 )
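``six.text_type`` is ``unicode`` on Python 2 and ``str`` on Python 3, so
the lowercased comparison behaves the same under both interpreters. A
minimal sketch, standalone and with a hypothetical description value::

    import six

    # extract() returns a list, so the pipeline stringifies the whole
    # list before lowercasing; shown here with a hypothetical value.
    description = ['A Sample Description.']
    print(six.text_type(description).lower())
    # ['a sample description.']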
From 6d0318e0d86b41d13bdd411c1be8704fadf5298f Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Wed, 8 Jun 2016 11:54:20 +0200
Subject: [PATCH 12/14] Use single values when building items

Also reverts six dependency
---
 dirbot/pipelines.py    |  4 +---
 dirbot/spiders/dmoz.py | 12 +++++++-----
 setup.py               |  4 ----
 3 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/dirbot/pipelines.py b/dirbot/pipelines.py
index b7c9f3c..be30566 100644
--- a/dirbot/pipelines.py
+++ b/dirbot/pipelines.py
@@ -1,5 +1,3 @@
-import six
-
 from scrapy.exceptions import DropItem
 
 
@@ -12,7 +10,7 @@ class FilterWordsPipeline(object):
 
     def process_item(self, item, spider):
         for word in self.words_to_filter:
-            if word in six.text_type(item['description']).lower():
+            if word in item['description'].lower():
                 raise DropItem("Contains forbidden word: %s" % word)
         else:
             return item

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 919fe97..250cf6c 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -20,15 +20,17 @@ def parse(self, response):
         @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
         @scrapes name
         """
-        sel = Selector(response)
-        sites = sel.xpath('//ul[@class="directory-url"]/li')
+        sites = response.css('#site-list-content > div.site-item > div.title-and-desc')
         items = []
 
         for site in sites:
             item = Website()
-            item['name'] = site.xpath('a/text()').extract()
-            item['url'] = site.xpath('a/@href').extract()
-            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
+            item['name'] = site.css(
+                'a > div.site-title::text').extract_first().strip()
+            item['url'] = site.xpath(
+                'a/@href').extract_first().strip()
+            item['description'] = site.css(
+                'div.site-descr').xpath('text()[1]').extract_first().strip()
             items.append(item)
 
         return items

diff --git a/setup.py b/setup.py
index af95442..fac72ad 100644
--- a/setup.py
+++ b/setup.py
@@ -5,8 +5,4 @@
     version='1.0',
     packages=find_packages(),
     entry_points={'scrapy': ['settings = dirbot.settings']},
-    install_requires=[
-        'scrapy',
-        'six'
-    ]
 )

From 810eae1563f78c0402d054b4edfb12729da64887 Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Wed, 8 Jun 2016 12:02:57 +0200
Subject: [PATCH 13/14] Simplify description extraction

---
 dirbot/spiders/dmoz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 250cf6c..6ecb2ff 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -30,7 +30,7 @@ def parse(self, response):
             item['url'] = site.xpath(
                 'a/@href').extract_first().strip()
             item['description'] = site.css(
-                'div.site-descr').xpath('text()[1]').extract_first().strip()
+                'div.site-descr::text').extract_first().strip()
             items.append(item)
 
         return items
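``::text`` selects an element's text nodes directly, and ``extract_first()``
returns a single string (or ``None``) rather than a one-element list, which
is why the item fields can be ``.strip()``-ed. A minimal sketch against
hypothetical markup shaped like the redesigned dmoz listing::

    from scrapy.selector import Selector

    # Hypothetical snippet of the post-redesign dmoz listing markup.
    sel = Selector(text='''
    <div id="site-list-content"><div class="site-item">
      <div class="title-and-desc">
        <a href="http://example.com"><div class="site-title">Example</div></a>
        <div class="site-descr"> A sample description. </div>
      </div>
    </div></div>''')

    site = sel.css('#site-list-content > div.site-item > div.title-and-desc')[0]
    # extract_first() yields one string, so strip() can be chained directly.
    print(site.css('a > div.site-title::text').extract_first().strip())
    # Example
    print(site.css('div.site-descr::text').extract_first().strip())
    # A sample description.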
From 8e995447361ad83f6693d2e07d17d90912268ace Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Thu, 30 Mar 2017 16:10:19 +0200
Subject: [PATCH 14/14] Update README.rst

---
 README.rst | 49 +++++++------------------------------------------
 1 file changed, 7 insertions(+), 42 deletions(-)

diff --git a/README.rst b/README.rst
index dc4e978..9955c53 100644
--- a/README.rst
+++ b/README.rst
@@ -2,48 +2,13 @@ dirbot
 ======
 
-This is a Scrapy project to scrape websites from public web directories.
+Deprecation notice (March 2017)
+===============================
 
-This project is only meant for educational purposes.
+**This project is now deprecated.**
 
-Items
-=====
+http://dmoz.org is no more and Scrapy's tutorial has been re-written
+against http://quotes.toscrape.com/.
 
-The items scraped by this project are websites, and the item is defined in the
-class::
-
-    dirbot.items.Website
-
-See the source code for more details.
-
-Spiders
-=======
-
-This project contains one spider called ``dmoz`` that you can see by running::
-
-    scrapy list
-
-Spider: dmoz
-------------
-
-The ``dmoz`` spider scrapes the Open Directory Project (dmoz.org), and it's
-based on the dmoz spider described in the `Scrapy tutorial`_
-
-This spider doesn't crawl the entire dmoz.org site but only a few pages by
-default (defined in the ``start_urls`` attribute). These pages are:
-
-* http://www.dmoz.org/Computers/Programming/Languages/Python/Books/
-* http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
-
-So, if you run the spider regularly (with ``scrapy crawl dmoz``) it will scrape
-only those two pages.
-
-.. _Scrapy tutorial: http://doc.scrapy.org/en/latest/intro/tutorial.html
-
-Pipelines
-=========
-
-This project uses a pipeline to filter out websites containing certain
-forbidden words in their description. This pipeline is defined in the class::
-
-    dirbot.pipelines.FilterWordsPipeline
+Please refer to https://github.com/scrapy/quotesbot for a more relevant
+and up-to-date educational project on how to get started with Scrapy.
 