diff --git a/README.rst b/README.rst index dc4e978..9955c53 100644 --- a/README.rst +++ b/README.rst @@ -2,48 +2,13 @@ dirbot ====== -This is a Scrapy project to scrape websites from public web directories. +Deprecation notice (March 2017) +=============================== -This project is only meant for educational purposes. +**This project is now deprecated.** -Items -===== +http://dmoz.org is no more and Scrapy's tutorial has been re-written +against http://quotes.toscrape.com/. -The items scraped by this project are websites, and the item is defined in the -class:: - - dirbot.items.Website - -See the source code for more details. - -Spiders -======= - -This project contains one spider called ``dmoz`` that you can see by running:: - - scrapy list - -Spider: dmoz ------------- - -The ``dmoz`` spider scrapes the Open Directory Project (dmoz.org), and it's -based on the dmoz spider described in the `Scrapy tutorial`_ - -This spider doesn't crawl the entire dmoz.org site but only a few pages by -default (defined in the ``start_urls`` attribute). These pages are: - -* http://www.dmoz.org/Computers/Programming/Languages/Python/Books/ -* http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/ - -So, if you run the spider regularly (with ``scrapy crawl dmoz``) it will scrape -only those two pages. - -.. _Scrapy tutorial: http://doc.scrapy.org/en/latest/intro/tutorial.html - -Pipelines -========= - -This project uses a pipeline to filter out websites containing certain -forbidden words in their description. This pipeline is defined in the class:: - - dirbot.pipelines.FilterWordsPipeline +Please refer to https://github.com/scrapy/quotesbot for a more relevant +and up-to-date educational project on how to get started with Scrapy. diff --git a/dirbot/pipelines.py b/dirbot/pipelines.py index d301b59..be30566 100644 --- a/dirbot/pipelines.py +++ b/dirbot/pipelines.py @@ -10,7 +10,7 @@ class FilterWordsPipeline(object): def process_item(self, item, spider): for word in self.words_to_filter: - if word in unicode(item['description']).lower(): + if word in item['description'].lower(): raise DropItem("Contains forbidden word: %s" % word) else: return item diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py index 919fe97..6ecb2ff 100644 --- a/dirbot/spiders/dmoz.py +++ b/dirbot/spiders/dmoz.py @@ -20,15 +20,17 @@ def parse(self, response): @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/ @scrapes name """ - sel = Selector(response) - sites = sel.xpath('//ul[@class="directory-url"]/li') + sites = response.css('#site-list-content > div.site-item > div.title-and-desc') items = [] for site in sites: item = Website() - item['name'] = site.xpath('a/text()').extract() - item['url'] = site.xpath('a/@href').extract() - item['description'] = site.xpath('text()').re('-\s[^\n]*\\r') + item['name'] = site.css( + 'a > div.site-title::text').extract_first().strip() + item['url'] = site.xpath( + 'a/@href').extract_first().strip() + item['description'] = site.css( + 'div.site-descr::text').extract_first().strip() items.append(item) return items