diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/README.rst b/README.rst index f66b73d..9955c53 100644 --- a/README.rst +++ b/README.rst @@ -2,56 +2,13 @@ dirbot ====== -This is a Scrapy project to scrape websites from public web directories. +Deprecation notice (March 2017) +=============================== -This project is only meant for educational purposes. +**This project is now deprecated.** -Items -===== +http://dmoz.org is no more and Scrapy's tutorial has been re-written +against http://quotes.toscrape.com/. -The items scraped by this project are websites, and the item is defined in the -class:: - - dirbot.items.Website - -See the source code for more details. - -Spiders -======= - -This project contains one spider called ``dmoz`` that you can see by running:: - - scrapy list - -Spider: dmoz ------------- - -The ``dmoz`` spider scrapes the Open Directory Project (dmoz.org), and it's -based on the dmoz spider described in the `Scrapy tutorial`_ - -This spider doesn't crawl the entire dmoz.org site but only a few pages by -default (defined in the ``start_pages`` attribute). These pages are: - -* http://www.dmoz.org/Computers/Programming/Languages/Python/Books/ -* http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/ - -So, if you run the spider regularly (with ``scrapy crawl dmoz``) it will scrape -only those two pages. However, you can scrape any dmoz.org page by passing the -url instead of the spider name. Scrapy internally resolves the spider to use by -looking at the allowed domains of each spider. - -For example, to scrape a different URL use:: - - scrapy crawl http://www.dmoz.org/Computers/Programming/Languages/Erlang/ - -You can scrape any URL from dmoz.org using this spider - -.. _Scrapy tutorial: http://doc.scrapy.org/intro/tutorial.html - -Pipelines -========= - -This project uses a pipeline to filter out websites containing certain -forbidden words in their description. This pipeline is defined in the class:: - - dirbot.pipelines.FilterWordsPipeline +Please refer to https://github.com/scrapy/quotesbot for a more relevant +and up-to-date educational project on how to get started with Scrapy. diff --git a/dirbot/items.py b/dirbot/items.py index a3f0408..b1e341f 100644 --- a/dirbot/items.py +++ b/dirbot/items.py @@ -6,6 +6,3 @@ class Website(Item): name = Field() description = Field() url = Field() - - def __str__(self): - return "Website: name=%s url=%s" % (self.get('name'), self.get('url')) diff --git a/dirbot/pipelines.py b/dirbot/pipelines.py index d301b59..be30566 100644 --- a/dirbot/pipelines.py +++ b/dirbot/pipelines.py @@ -10,7 +10,7 @@ class FilterWordsPipeline(object): def process_item(self, item, spider): for word in self.words_to_filter: - if word in unicode(item['description']).lower(): + if word in item['description'].lower(): raise DropItem("Contains forbidden word: %s" % word) else: return item diff --git a/dirbot/settings.py b/dirbot/settings.py index 2c8dd3d..4742811 100644 --- a/dirbot/settings.py +++ b/dirbot/settings.py @@ -4,4 +4,4 @@ NEWSPIDER_MODULE = 'dirbot.spiders' DEFAULT_ITEM_CLASS = 'dirbot.items.Website' -ITEM_PIPELINES = ['dirbot.pipelines.FilterWordsPipeline'] +ITEM_PIPELINES = {'dirbot.pipelines.FilterWordsPipeline': 1} diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py index e0807ec..6ecb2ff 100644 --- a/dirbot/spiders/dmoz.py +++ b/dirbot/spiders/dmoz.py @@ -1,10 +1,10 @@ -from scrapy.spider import BaseSpider -from scrapy.selector import HtmlXPathSelector +from scrapy.spiders import Spider +from scrapy.selector import Selector from dirbot.items import Website -class DmozSpider(BaseSpider): +class DmozSpider(Spider): name = "dmoz" allowed_domains = ["dmoz.org"] start_urls = [ @@ -20,15 +20,17 @@ def parse(self, response): @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/ @scrapes name """ - hxs = HtmlXPathSelector(response) - sites = hxs.select('//ul[@class="directory-url"]/li') + sites = response.css('#site-list-content > div.site-item > div.title-and-desc') items = [] for site in sites: item = Website() - item['name'] = site.select('a/text()').extract() - item['url'] = site.select('a/@href').extract() - item['description'] = site.select('text()').extract() + item['name'] = site.css( + 'a > div.site-title::text').extract_first().strip() + item['url'] = site.xpath( + 'a/@href').extract_first().strip() + item['description'] = site.css( + 'div.site-descr::text').extract_first().strip() items.append(item) return items