diff --git a/Dockerfile b/Dockerfile
index 6f2fcb2..086f94d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,7 @@
-FROM scrapinghub/base:12.04
+FROM scrapinghub/scrapinghub-stack-scrapy:1.4
 WORKDIR /app
-ADD requirements.txt /app/requirements.txt
+COPY requirements.txt /app/requirements.txt
 RUN pip install -r /app/requirements.txt
-ADD . /app
+ENV SCRAPY_SETTINGS_MODULE testspiders.settings
+COPY . /app
+RUN python setup.py install
diff --git a/requirements.txt b/requirements.txt
index 7903385..9b3b13f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1 @@
 Scrapy>=1.0.2
-service_identity==1.0.0
-https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy/tarball/7e9190e#egg=sh_scrapy
diff --git a/testspiders/spiders/broken_link.py b/testspiders/spiders/broken_link.py
new file mode 100644
index 0000000..29bddf3
--- /dev/null
+++ b/testspiders/spiders/broken_link.py
@@ -0,0 +1,79 @@
+# coding: utf8
+
+import scrapy
+
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
+
+
+class BrokenLink(scrapy.Spider):
+    """
+    Spider arguments:
+    - input_url: Where to start the crawl with.
+    - allowed_domains (optional): Comma-separated list of domains to restrict the crawl with. If not specified, it would be inferred from the input URL, e.g. http://doc.scrapy.org/en/latest/intro/overview.html -> doc.scrapy.org
+
+    Settings:
+    - DEPTH_LIMIT: Controls the maximum depth (defaults to 50).
+    - MAX_REQUESTS: Controls the maximum requests (defaults to 100000). The actual number of requests may be slightly different, e.g. MAX_REQUESTS=1000 and the spider stops when having sent 1008 requests.
+    """
+    name = 'broken_link'
+    custom_settings = {
+        'HTTPERROR_ALLOW_ALL': True,
+        'DEPTH_LIMIT': 50,
+        'MAX_REQUESTS': 100000,
+        'RETRY_HTTP_CODES': [],
+    }
+
+    def __init__(self, input_url, allowed_domains=None, *args, **kwargs):
+        """Initializes the instance"""
+        super(BrokenLink, self).__init__(*args, **kwargs)
+        self.start_urls = [input_url]
+        if allowed_domains:
+            self.allowed_domains = allowed_domains.split(',')
+        else:
+            netloc = urlparse(input_url).netloc
+            domain = netloc.split('@')[-1].split(':')[0]
+            self.allowed_domains = [domain]
+
+    def start_requests(self):
+        """Generates initial requests"""
+        for url in self.start_urls:
+            # Explicitly set the errback handler
+            yield scrapy.Request(
+                url,
+                dont_filter=True,
+                callback=self.parse,
+                errback=self.errback
+            )
+
+    def parse(self, response):
+        """Parses a default response"""
+        if not isinstance(response, scrapy.http.TextResponse):
+            self.crawler.stats.inc_value('non_text_response')
+            return
+        if response.status >= 400 and response.status <= 599:
+            yield {
+                'url': response.url,
+                'status': 'invalid_http_status',
+                'http_status': response.status,
+            }
+        max_reqs = self.settings.getint('MAX_REQUESTS', 0)
+        stats = self.crawler.stats
+        for href in response.css('a::attr(href)').extract():
+            if max_reqs and max_reqs < stats.get_value('scheduler/enqueued'):
+                break
+            yield scrapy.Request(
+                response.urljoin(href),
+                callback=self.parse,
+                errback=self.errback
+            )
+
+    def errback(self, err):
+        """Handles an error"""
+        return {
+            'url': err.request.url,
+            'status': 'error_downloading_http_response',
+            'message': str(err.value),
+        }
diff --git a/testspiders/spiders/followall.py b/testspiders/spiders/followall.py
index 76e5bfe..da5e4c9 100644
--- a/testspiders/spiders/followall.py
+++ b/testspiders/spiders/followall.py
@@ -63,7 +63,7 @@ def _set_title(self, page, response):
 
     def _set_new_cookies(self, page, response):
        cookies = []
-        for cookie in [x.split(';', 1)[0] for x in
+        for cookie in [x.split(b';', 1)[0] for x in
                       response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
diff --git a/testspiders/spiders/timed.py b/testspiders/spiders/timed.py
index 4396352..d18914b 100644
--- a/testspiders/spiders/timed.py
+++ b/testspiders/spiders/timed.py
@@ -2,7 +2,6 @@
 Crawll-all spider without domain restriction
 """
 from testspiders.spiders.followall import FollowAllSpider
-from twisted.internet import reactor
 
 
 class TimedSpider(FollowAllSpider):
@@ -15,6 +14,7 @@ def __init__(self, **kw):
         super(TimedSpider, self).__init__(**kw)
 
     def start_requests(self):
+        from twisted.internet import reactor
         reactor.callLater(self.timeout, self.stop)
         return super(TimedSpider, self).start_requests()
 
diff --git a/testspiders/spiders/timewaste.py b/testspiders/spiders/timewaste.py
new file mode 100644
index 0000000..6c14b0b
--- /dev/null
+++ b/testspiders/spiders/timewaste.py
@@ -0,0 +1,20 @@
+import scrapy
+
+
+class Spider(scrapy.Spider):
+    name = 'timewaste'
+    start_urls = ('https://example.com',)
+
+    def __init__(self, **kw):
+        self.timeout = int(kw.pop('timeout', '600'))
+        super(Spider, self).__init__(**kw)
+
+    def parse(self, response):
+        from twisted.internet import reactor, defer
+        self.log('I will waste your time for {} seconds'.format(self.timeout))
+        dfd = defer.Deferred()
+        reactor.callLater(self.timeout, dfd.callback, None)
+        return dfd
+
+    def stop(self):
+        self.crawler.engine.close_spider(self, 'timeout')
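
For reference, a minimal sketch of running the new broken_link spider locally, outside the Docker image. It assumes the testspiders package is importable (for example after the python setup.py install step above); the script name, the example URL and the FEED_* output settings are illustrative and not part of this change:

    # run_broken_link.py -- hypothetical helper, not included in this diff
    from scrapy.crawler import CrawlerProcess

    from testspiders.spiders.broken_link import BrokenLink

    process = CrawlerProcess(settings={
        'FEED_URI': 'broken_links.json',   # assumed output location
        'FEED_FORMAT': 'json',
    })
    # input_url is required; allowed_domains is otherwise inferred from it
    process.crawl(BrokenLink, input_url='https://example.com')
    process.start()

Inside a checked-out project, "scrapy crawl broken_link -a input_url=https://example.com" should behave the same way, since -a arguments are passed through to the spider's __init__.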