From 73e566a31007d8876a69ddfda03f4ccdda5b7dac Mon Sep 17 00:00:00 2001
From: Steven Wilson
Date: Fri, 22 Mar 2013 17:50:06 +0000
Subject: [PATCH 01/14] capture description without trailing newline
 characters using a regular expression

---
 dirbot/spiders/dmoz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index e0807ec..05a0108 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -28,7 +28,7 @@ def parse(self, response):
             item = Website()
             item['name'] = site.select('a/text()').extract()
             item['url'] = site.select('a/@href').extract()
-            item['description'] = site.select('text()').extract()
+            item['description'] = site.select('text()').re('-\s([^\n]*?)\\n')
             items.append(item)
 
         return items
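The pattern added here keeps the text between the leading ``- `` and the
newline that follows it. A minimal standalone check with Python's ``re``
module (the sample string is a hypothetical stand-in for a dmoz text node,
and the raw-string form is equivalent to the escaped literal in the patch)::

    import re

    # Hypothetical text node as dmoz pages rendered it: a dash, the
    # description, then a trailing newline.
    text = "\r\n\t- Free Python books and tutorials.\n"

    # Same pattern as the patch, written as a raw string: capture
    # everything after "- " up to (but not including) the newline.
    print(re.findall(r'-\s([^\n]*?)\n', text))
    # ['Free Python books and tutorials.']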
From 0edb745020e6f83ea6a21e6dbea9b0446623db49 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Fri, 22 Mar 2013 15:02:25 -0300
Subject: [PATCH 02/14] remove custom __str__ from Website item

---
 dirbot/items.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/dirbot/items.py b/dirbot/items.py
index a3f0408..b1e341f 100644
--- a/dirbot/items.py
+++ b/dirbot/items.py
@@ -6,6 +6,3 @@ class Website(Item):
     name = Field()
     description = Field()
     url = Field()
-
-    def __str__(self):
-        return "Website: name=%s url=%s" % (self.get('name'), self.get('url'))

From 4bbc16b352b1ca22cd944f2a8a0d36a79b8ccf01 Mon Sep 17 00:00:00 2001
From: Rolando Espinoza La fuente
Date: Sat, 19 Oct 2013 23:21:41 -0400
Subject: [PATCH 03/14] Removed reference to ``crawl URL`` command as it's no
 longer supported.

---
 README.rst | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/README.rst b/README.rst
index f66b73d..9210eb1 100644
--- a/README.rst
+++ b/README.rst
@@ -36,15 +36,7 @@ default (defined in the ``start_pages`` attribute). These pages are:
 * http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
 
 So, if you run the spider regularly (with ``scrapy crawl dmoz``) it will scrape
-only those two pages. However, you can scrape any dmoz.org page by passing the
-url instead of the spider name. Scrapy internally resolves the spider to use by
-looking at the allowed domains of each spider.
-
-For example, to scrape a different URL use::
-
-    scrapy crawl http://www.dmoz.org/Computers/Programming/Languages/Erlang/
-
-You can scrape any URL from dmoz.org using this spider
+only those two pages.
 
 .. _Scrapy tutorial: http://doc.scrapy.org/intro/tutorial.html

From 8f1abfaf6ab3738cb880885e227a56c82bb4178c Mon Sep 17 00:00:00 2001
From: Mat Gadd
Date: Mon, 20 Jan 2014 18:40:06 +0000
Subject: [PATCH 04/14] Ignore Python bytecode files.

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc

From eb6a8b44f810c281ccd4217b8bb8ba1dbf9cec38 Mon Sep 17 00:00:00 2001
From: Mat Gadd
Date: Mon, 20 Jan 2014 18:40:12 +0000
Subject: [PATCH 05/14] Fix deprecation warning.

---
 dirbot/spiders/dmoz.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 05a0108..8a69fea 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -1,10 +1,10 @@
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.selector import HtmlXPathSelector
 
 from dirbot.items import Website
 
 
-class DmozSpider(BaseSpider):
+class DmozSpider(Spider):
     name = "dmoz"
     allowed_domains = ["dmoz.org"]
     start_urls = [

From 736423bbd3fa7a9cde6d346e6e7d5f716d900d25 Mon Sep 17 00:00:00 2001
From: Nikhil Gupta
Date: Sat, 12 Apr 2014 12:35:16 +0530
Subject: [PATCH 06/14] Updated deprecated code

---
 dirbot/settings.py     |  2 +-
 dirbot/spiders/dmoz.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/dirbot/settings.py b/dirbot/settings.py
index 2c8dd3d..4742811 100644
--- a/dirbot/settings.py
+++ b/dirbot/settings.py
@@ -4,4 +4,4 @@
 NEWSPIDER_MODULE = 'dirbot.spiders'
 DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
 
-ITEM_PIPELINES = ['dirbot.pipelines.FilterWordsPipeline']
+ITEM_PIPELINES = {'dirbot.pipelines.FilterWordsPipeline': 1}

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 8a69fea..00eaa25 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -1,5 +1,5 @@
 from scrapy.spider import Spider
-from scrapy.selector import HtmlXPathSelector
+from scrapy.selector import Selector
 
 from dirbot.items import Website
 
@@ -20,15 +20,15 @@ def parse(self, response):
         @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
         @scrapes name
         """
-        hxs = HtmlXPathSelector(response)
-        sites = hxs.select('//ul[@class="directory-url"]/li')
+        sel = Selector(response)
+        sites = sel.xpath('//ul[@class="directory-url"]/li')
         items = []
 
         for site in sites:
             item = Website()
-            item['name'] = site.select('a/text()').extract()
-            item['url'] = site.select('a/@href').extract()
-            item['description'] = site.select('text()').re('-\s([^\n]*?)\\n')
+            item['name'] = site.xpath('a/text()').extract()
+            item['url'] = site.xpath('a/@href').extract()
+            item['description'] = site.xpath('text()').re('-\s([^\n]*?)\\n')
             items.append(item)
 
         return items
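In the updated ``ITEM_PIPELINES`` setting, the integer assigned to each
pipeline is its order: pipelines with lower values run first. The renamed
selector API can be exercised against an inline HTML snippet; a minimal
sketch, assuming hypothetical markup shaped like the old dmoz listing::

    from scrapy.selector import Selector

    # Hypothetical markup mirroring dmoz's directory-url list.
    sel = Selector(text='<ul class="directory-url"><li>'
                        '<a href="http://example.com">Example</a>'
                        ' - A sample description.\n</li></ul>')

    # xpath() replaces the deprecated select() with identical semantics.
    for site in sel.xpath('//ul[@class="directory-url"]/li'):
        print(site.xpath('a/text()').extract())   # ['Example']
        print(site.xpath('a/@href').extract())    # ['http://example.com']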
From 7fc9120f5b7d9c9f381099ebecef7e9e01446ade Mon Sep 17 00:00:00 2001
From: Carol Willing
Date: Fri, 2 May 2014 19:33:15 -0700
Subject: [PATCH 07/14] Corrected link to tutorial

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 9210eb1..86c7442 100644
--- a/README.rst
+++ b/README.rst
@@ -38,7 +38,7 @@ default (defined in the ``start_pages`` attribute). These pages are:
 So, if you run the spider regularly (with ``scrapy crawl dmoz``) it will scrape
 only those two pages.
 
-.. _Scrapy tutorial: http://doc.scrapy.org/intro/tutorial.html
+.. _Scrapy tutorial: http://doc.scrapy.org/en/latest/intro/tutorial.html
 
 Pipelines
 =========

From b303147a56f1090c67cf6ed4f0526b9f3960a186 Mon Sep 17 00:00:00 2001
From: iwahoo30
Date: Tue, 30 Sep 2014 11:48:13 -0700
Subject: [PATCH 08/14] Change description regex to ('-\s[^\n]*\\r')

The previous pattern was not capturing the description, so made a small
change to the regex.
---
 dirbot/spiders/dmoz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 00eaa25..7f88f05 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -28,7 +28,7 @@ def parse(self, response):
             item = Website()
             item['name'] = site.xpath('a/text()').extract()
             item['url'] = site.xpath('a/@href').extract()
-            item['description'] = site.xpath('text()').re('-\s([^\n]*?)\\n')
+            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
             items.append(item)
 
         return items

From 64058941055398e1f6e29932a16dfaa47de6effe Mon Sep 17 00:00:00 2001
From: Geoffrey van Wyk
Date: Thu, 2 Jul 2015 10:17:09 +0200
Subject: [PATCH 09/14] Silences deprecation warning about scrapy.spider
 module.

---
 dirbot/spiders/dmoz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 7f88f05..919fe97 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -1,4 +1,4 @@
-from scrapy.spider import Spider
+from scrapy.spiders import Spider
 from scrapy.selector import Selector
 
 from dirbot.items import Website

From 36b646d7efee77405216426e961616bed7c9735b Mon Sep 17 00:00:00 2001
From: Geoffrey van Wyk
Date: Thu, 2 Jul 2015 11:14:42 +0200
Subject: [PATCH 10/14] Corrects start_pages to start_urls in README.

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 86c7442..dc4e978 100644
--- a/README.rst
+++ b/README.rst
@@ -30,7 +30,7 @@ The ``dmoz`` spider scrapes the Open Directory Project (dmoz.org), and it's
 based on the dmoz spider described in the `Scrapy tutorial`_
 
 This spider doesn't crawl the entire dmoz.org site but only a few pages by
-default (defined in the ``start_pages`` attribute). These pages are:
+default (defined in the ``start_urls`` attribute). These pages are:
 
 * http://www.dmoz.org/Computers/Programming/Languages/Python/Books/
 * http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/

From bbd8d2dbc1b1150446788a38705d2957c218c9aa Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Tue, 7 Jun 2016 18:31:10 +0200
Subject: [PATCH 11/14] Use six for Python 3 compatibility

---
 dirbot/pipelines.py | 4 +++-
 setup.py            | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/dirbot/pipelines.py b/dirbot/pipelines.py
index d301b59..b7c9f3c 100644
--- a/dirbot/pipelines.py
+++ b/dirbot/pipelines.py
@@ -1,3 +1,5 @@
+import six
+
 from scrapy.exceptions import DropItem
 
 
@@ -10,7 +12,7 @@ class FilterWordsPipeline(object):
 
     def process_item(self, item, spider):
         for word in self.words_to_filter:
-            if word in unicode(item['description']).lower():
+            if word in six.text_type(item['description']).lower():
                 raise DropItem("Contains forbidden word: %s" % word)
         else:
             return item

diff --git a/setup.py b/setup.py
index fac72ad..af95442 100644
--- a/setup.py
+++ b/setup.py
@@ -5,4 +5,8 @@
     version='1.0',
     packages=find_packages(),
     entry_points={'scrapy': ['settings = dirbot.settings']},
+    install_requires=[
+        'scrapy',
+        'six'
+    ]
 )
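``six.text_type`` is ``unicode`` on Python 2 and ``str`` on Python 3, so
the lowercased comparison behaves the same under both interpreters. A
minimal sketch, standalone and with a hypothetical description value::

    import six

    # extract() returns a list, so the pipeline stringifies the whole
    # list before lowercasing; shown here with a hypothetical value.
    description = ['A Sample Description.']
    print(six.text_type(description).lower())
    # ['a sample description.']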
From 6d0318e0d86b41d13bdd411c1be8704fadf5298f Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Wed, 8 Jun 2016 11:54:20 +0200
Subject: [PATCH 12/14] Use single values when building items

Also reverts six dependency
---
 dirbot/pipelines.py    |  4 +---
 dirbot/spiders/dmoz.py | 12 +++++++-----
 setup.py               |  4 ----
 3 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/dirbot/pipelines.py b/dirbot/pipelines.py
index b7c9f3c..be30566 100644
--- a/dirbot/pipelines.py
+++ b/dirbot/pipelines.py
@@ -1,5 +1,3 @@
-import six
-
 from scrapy.exceptions import DropItem
 
 
@@ -12,7 +10,7 @@ class FilterWordsPipeline(object):
 
     def process_item(self, item, spider):
         for word in self.words_to_filter:
-            if word in six.text_type(item['description']).lower():
+            if word in item['description'].lower():
                 raise DropItem("Contains forbidden word: %s" % word)
         else:
             return item

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 919fe97..250cf6c 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -20,15 +20,17 @@ def parse(self, response):
         @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
         @scrapes name
         """
-        sel = Selector(response)
-        sites = sel.xpath('//ul[@class="directory-url"]/li')
+        sites = response.css('#site-list-content > div.site-item > div.title-and-desc')
         items = []
 
         for site in sites:
             item = Website()
-            item['name'] = site.xpath('a/text()').extract()
-            item['url'] = site.xpath('a/@href').extract()
-            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
+            item['name'] = site.css(
+                'a > div.site-title::text').extract_first().strip()
+            item['url'] = site.xpath(
+                'a/@href').extract_first().strip()
+            item['description'] = site.css(
+                'div.site-descr').xpath('text()[1]').extract_first().strip()
             items.append(item)
 
         return items

diff --git a/setup.py b/setup.py
index af95442..fac72ad 100644
--- a/setup.py
+++ b/setup.py
@@ -5,8 +5,4 @@
     version='1.0',
     packages=find_packages(),
     entry_points={'scrapy': ['settings = dirbot.settings']},
-    install_requires=[
-        'scrapy',
-        'six'
-    ]
 )

From 810eae1563f78c0402d054b4edfb12729da64887 Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Wed, 8 Jun 2016 12:02:57 +0200
Subject: [PATCH 13/14] Simplify description extraction

---
 dirbot/spiders/dmoz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dirbot/spiders/dmoz.py b/dirbot/spiders/dmoz.py
index 250cf6c..6ecb2ff 100644
--- a/dirbot/spiders/dmoz.py
+++ b/dirbot/spiders/dmoz.py
@@ -30,7 +30,7 @@ def parse(self, response):
             item['url'] = site.xpath(
                 'a/@href').extract_first().strip()
             item['description'] = site.css(
-                'div.site-descr').xpath('text()[1]').extract_first().strip()
+                'div.site-descr::text').extract_first().strip()
             items.append(item)
 
         return items
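``::text`` selects an element's text nodes directly, and ``extract_first()``
returns a single string (or ``None``) rather than a one-element list, which
is why the item fields can be ``.strip()``-ed. A minimal sketch against
hypothetical markup shaped like the redesigned dmoz listing::

    from scrapy.selector import Selector

    # Hypothetical snippet of the post-redesign dmoz listing markup.
    sel = Selector(text='''
    <div id="site-list-content"><div class="site-item">
      <div class="title-and-desc">
        <a href="http://example.com"><div class="site-title">Example</div></a>
        <div class="site-descr"> A sample description. </div>
      </div>
    </div></div>''')

    site = sel.css('#site-list-content > div.site-item > div.title-and-desc')[0]
    # extract_first() yields one string, so strip() can be chained directly.
    print(site.css('a > div.site-title::text').extract_first().strip())
    # Example
    print(site.css('div.site-descr::text').extract_first().strip())
    # A sample description.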
From 8e995447361ad83f6693d2e07d17d90912268ace Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Thu, 30 Mar 2017 16:10:19 +0200
Subject: [PATCH 14/14] Update README.rst

---
 README.rst | 49 +++++++------------------------------------------
 1 file changed, 7 insertions(+), 42 deletions(-)

diff --git a/README.rst b/README.rst
index dc4e978..9955c53 100644
--- a/README.rst
+++ b/README.rst
@@ -2,48 +2,13 @@ dirbot
 ======
 
-This is a Scrapy project to scrape websites from public web directories.
+Deprecation notice (March 2017)
+===============================
 
-This project is only meant for educational purposes.
+**This project is now deprecated.**
 
-Items
-=====
+http://dmoz.org is no more and Scrapy's tutorial has been re-written
+against http://quotes.toscrape.com/.
 
-The items scraped by this project are websites, and the item is defined in the
-class::
-
-    dirbot.items.Website
-
-See the source code for more details.
-
-Spiders
-=======
-
-This project contains one spider called ``dmoz`` that you can see by running::
-
-    scrapy list
-
-Spider: dmoz
-------------
-
-The ``dmoz`` spider scrapes the Open Directory Project (dmoz.org), and it's
-based on the dmoz spider described in the `Scrapy tutorial`_
-
-This spider doesn't crawl the entire dmoz.org site but only a few pages by
-default (defined in the ``start_urls`` attribute). These pages are:
-
-* http://www.dmoz.org/Computers/Programming/Languages/Python/Books/
-* http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
-
-So, if you run the spider regularly (with ``scrapy crawl dmoz``) it will scrape
-only those two pages.
-
-.. _Scrapy tutorial: http://doc.scrapy.org/en/latest/intro/tutorial.html
-
-Pipelines
-=========
-
-This project uses a pipeline to filter out websites containing certain
-forbidden words in their description. This pipeline is defined in the class::
-
-    dirbot.pipelines.FilterWordsPipeline
+Please refer to https://github.com/scrapy/quotesbot for a more relevant
+and up-to-date educational project on how to get started with Scrapy.
 