From 0641fc3e1b80699481dc4ed149089625677d473b Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 10 Jan 2018 13:40:08 +0300 Subject: [PATCH 1/3] Speed-up stats() logic for ItemsResourceType --- scrapinghub/hubstorage/resourcetype.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/scrapinghub/hubstorage/resourcetype.py b/scrapinghub/hubstorage/resourcetype.py index 0fd029fd..06c8032e 100644 --- a/scrapinghub/hubstorage/resourcetype.py +++ b/scrapinghub/hubstorage/resourcetype.py @@ -1,8 +1,13 @@ +import time +import json +import socket +import logging +from collections import MutableMapping + import six from six.moves import range -import logging, time, json, socket -from collections import MutableMapping import requests.exceptions as rexc + from .utils import urlpathjoin, xauth from .serialization import jlencode, jldecode, mpdecode @@ -57,9 +62,7 @@ def _iter_lines(self, _path, **kwargs): # XXX explicitly encode data to overcome shazow/urllib3#717 # when dealing with large POST requests with enabled TLS kwargs['data'] = jlencode(kwargs.pop('jl')).encode('utf-8') - r = self.client.request(**kwargs) - lines = r.iter_lines() if six.PY3: return (l.decode(r.encoding or 'utf8') for l in lines) @@ -78,6 +81,14 @@ def apiget(self, _path=None, **kwargs): kwargs.setdefault('is_idempotent', True) return self.apirequest(_path, method='GET', **kwargs) + def apiget_json(self, _path, **kwargs): + kwargs.update(method='GET', url=urlpathjoin(self.url, _path)) + kwargs.setdefault('auth', self.auth) + kwargs.setdefault('is_idempotent', True) + r = self.client.request(**kwargs) + r.raise_for_status() + return r.json() + def apidelete(self, _path=None, **kwargs): kwargs.setdefault('is_idempotent', True) return self.apirequest(_path, method='DELETE', **kwargs) @@ -217,7 +228,7 @@ def get(self, _key, **params): return o def stats(self): - return next(self.apiget('stats')) + return self.apiget_json('stats') class 
MappingResourceType(ResourceType, MutableMapping): From 7a8040e55da2248e729b682610683931b4a98a02 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 10 Jan 2018 13:52:36 +0300 Subject: [PATCH 2/3] Add docstring for the new method --- scrapinghub/hubstorage/resourcetype.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scrapinghub/hubstorage/resourcetype.py b/scrapinghub/hubstorage/resourcetype.py index 06c8032e..dc56fa58 100644 --- a/scrapinghub/hubstorage/resourcetype.py +++ b/scrapinghub/hubstorage/resourcetype.py @@ -82,6 +82,14 @@ def apiget(self, _path=None, **kwargs): return self.apirequest(_path, method='GET', **kwargs) def apiget_json(self, _path, **kwargs): + """Optimized GET logic for endpoints returning a single JSON line. + + Some endpoints, like /items/stats, can return a large JSON line, and + due to chunking logic in _iter_lines() it can take twice as long to get + the chunks one by one, join and convert it to a single JSON line. This + method should be called for endpoints that always return a single JSON + line in the response. 
+ """ kwargs.update(method='GET', url=urlpathjoin(self.url, _path)) kwargs.setdefault('auth', self.auth) kwargs.setdefault('is_idempotent', True) From 9f87d3ba489a80ce5119471891bf6a8d2ecad040 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 10 Jan 2018 13:54:59 +0300 Subject: [PATCH 3/3] Rollback redundant changes --- scrapinghub/hubstorage/resourcetype.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scrapinghub/hubstorage/resourcetype.py b/scrapinghub/hubstorage/resourcetype.py index dc56fa58..53320f6d 100644 --- a/scrapinghub/hubstorage/resourcetype.py +++ b/scrapinghub/hubstorage/resourcetype.py @@ -62,7 +62,9 @@ def _iter_lines(self, _path, **kwargs): # XXX explicitly encode data to overcome shazow/urllib3#717 # when dealing with large POST requests with enabled TLS kwargs['data'] = jlencode(kwargs.pop('jl')).encode('utf-8') + r = self.client.request(**kwargs) + lines = r.iter_lines() if six.PY3: return (l.decode(r.encoding or 'utf8') for l in lines)