docs/client/overview.rst (4 changes: 2 additions & 2 deletions)
@@ -144,7 +144,7 @@ For example, to run a new job for a given spider with custom parameters::



-Geting job information
+Getting job information
^^^^^^^^^^^^^^^^^^^^^^

To select a specific job for a project, use ``.jobs.get(<jobKey>)``::
@@ -387,7 +387,7 @@ acts like a Python dictionary::
'5123a86-master'

To check what keys are available (they ultimately depend on the job),
-you can use its ``.iter()`` method (here, it's wrapped inside a dict for readibility)::
+you can use its ``.iter()`` method (here, it's wrapped inside a dict for readability)::

>>> dict(job.metadata.iter())
{...
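For context, the overview section these two doc fixes touch walks through fetching a job and reading its metadata. A minimal sketch of that flow, assuming SH_APIKEY is set in the environment and using a hypothetical job key:

    from scrapinghub import ScrapinghubClient

    client = ScrapinghubClient()  # picks up SH_APIKEY from the environment
    job = client.get_job('123456/1/56')  # hypothetical job key

    # job.metadata acts like a dictionary; .iter() yields its key/value pairs.
    metadata = dict(job.metadata.iter())
    print(sorted(metadata))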
docs/conf.py (1 change: 1 addition & 0 deletions)
@@ -172,6 +172,7 @@
html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

+
# disable cross-reference for ivar
# patch taken from http://stackoverflow.com/a/41184353/1932023
def patched_make_field(self, types, domain, items, env=None):
scrapinghub/client/jobs.py (8 changes: 4 additions & 4 deletions)
@@ -54,9 +54,9 @@ def count(self, spider=None, state=None, has_tag=None, lacks_tag=None,
:param lacks_tag: (optional) filter results by missing tag(s), a string
or a list of strings.
:param startts: (optional) UNIX timestamp at which to begin results,
-in millisecons.
+in milliseconds.
:param endts: (optional) UNIX timestamp at which to end results,
-in millisecons.
+in milliseconds.
:param \*\*params: (optional) other filter params.

:return: jobs count.
@@ -222,9 +222,9 @@ def list(self, count=None, start=None, spider=None, state=None,
:param lacks_tag: (optional) filter results by missing tag(s), a string
or a list of strings.
:param startts: (optional) UNIX timestamp at which to begin results,
-in millisecons.
+in milliseconds.
:param endts: (optional) UNIX timestamp at which to end results,
-in millisecons.
+in milliseconds.
:param meta: (optional) request for additional fields, a single
field name or a list of field names to return.
:param \*\*params: (optional) other filter params.
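The startts/endts parameters documented above take UNIX timestamps in milliseconds, not seconds. A short sketch of how a caller might use them, with a hypothetical project id and spider name and assuming SH_APIKEY is set:

    import time

    from scrapinghub import ScrapinghubClient

    client = ScrapinghubClient()  # reads SH_APIKEY from the environment
    spider = client.get_project(123456).spiders.get('myspider')  # hypothetical ids

    # Count jobs finished in the last 24 hours; startts/endts are in milliseconds.
    now_ms = int(time.time() * 1000)
    day_ms = 24 * 60 * 60 * 1000
    print(spider.jobs.count(state='finished', startts=now_ms - day_ms, endts=now_ms))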
scrapinghub/client/utils.py (8 changes: 4 additions & 4 deletions)
@@ -89,7 +89,7 @@ def update_kwargs(kwargs, **params):


def parse_auth(auth):
-"""Parse authentification token.
+"""Parse authentication token.

>>> os.environ['SH_APIKEY'] = 'apikey'
>>> parse_auth(None)
Expand All @@ -106,7 +106,7 @@ def parse_auth(auth):
if auth is None:
apikey = os.environ.get('SH_APIKEY')
if apikey:
-return (apikey, '')
+return apikey, ''

jobauth = os.environ.get('SHUB_JOBAUTH')
if jobauth:
@@ -131,7 +131,7 @@ def parse_auth(auth):
return jwt_auth

login, _, password = auth.partition(':')
-return (login, password)
+return login, password


def _search_for_jwt_credentials(auth):
Expand All @@ -144,6 +144,6 @@ def _search_for_jwt_credentials(auth):
decoded_auth = decoded_auth.decode('ascii')
login, _, password = decoded_auth.partition(':')
if password and parse_job_key(login):
-return (login, password)
+return login, password
except (UnicodeDecodeError, ValueError):
pass
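The return (x, y) -> return x, y changes in this file are purely cosmetic: in Python it is the comma, not the parentheses, that builds the tuple, so both spellings produce the same value (and the same bytecode). A quick illustration:

    def with_parens():
        return ('user', 'pass')

    def without_parens():
        return 'user', 'pass'

    # Both functions return the identical tuple.
    assert with_parens() == without_parens() == ('user', 'pass')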
scrapinghub/hubstorage/batchuploader.py (1 change: 1 addition & 0 deletions)
@@ -196,6 +196,7 @@ def _upload(self, batch):
headers=headers,
)

+
class ValueTooLarge(ValueError):
"""Raised when a serialized item is greater than 1MB"""

scrapinghub/hubstorage/collectionsrt.py (29 changes: 15 additions & 14 deletions)
@@ -66,21 +66,22 @@ def truncate(self, _name):
return self.apipost('delete', params={'name': _name}, is_idempotent=True)

def iter_json(self, _type, _name, requests_params=None, **apiparams):
-return DownloadableResource.iter_json(self, (_type, _name),
-requests_params=requests_params, **apiparams)
+return DownloadableResource.iter_json(
+self, (_type, _name), requests_params=requests_params, **apiparams
+)

def iter_msgpack(self, _type, _name, requests_params=None, **apiparams):
-return DownloadableResource.iter_msgpack(self, (_type, _name),
-requests_params=requests_params, **apiparams)
+return DownloadableResource.iter_msgpack(
+self, (_type, _name), requests_params=requests_params, **apiparams
+)

def create_writer(self, coltype, colname, **writer_kwargs):
self._validate_collection(coltype, colname)
kwargs = dict(writer_kwargs)
kwargs.setdefault('content_encoding', 'gzip')
kwargs.setdefault('auth', self.auth)
url = urlpathjoin(self.url, coltype, colname)
-return self.client.batchuploader.create_writer(url,
-**kwargs)
+return self.client.batchuploader.create_writer(url, **kwargs)

def new_collection(self, coltype, colname):
self._validate_collection(coltype, colname)
@@ -109,15 +110,14 @@ def _validate_collection(self, coltype, colname):
raise ValueError('Invalid collection name {!r}, only alphanumeric '
'characters'.format(colname))

-
def _batch(self, method, path, total_param, progress=None, **params):
total = 0
getparams = dict(params)
try:
while True:
r = next(self.apirequest(
path, method=method, params=getparams,
-is_idempotent=method=='GET',
+is_idempotent=method == 'GET',
))
total += r[total_param]
next_start = r.get('nextstart')
@@ -147,8 +147,7 @@ def create_writer(self, **kwargs):
kwargs are passed to batchuploader.create_writer, but auth and gzip
content encoding are specified if not provided
"""
-return self._collections.create_writer(self.coltype, self.colname,
-**kwargs)
+return self._collections.create_writer(self.coltype, self.colname, **kwargs)

def get(self, *args, **kwargs):
return self._collections.get(self.coltype, self.colname, *args, **kwargs)
@@ -166,9 +165,11 @@ def count(self, *args, **kwargs):
return self._collections.count(self.coltype, self.colname, *args, **kwargs)

def iter_json(self, requests_params=None, **apiparams):
-return self._collections.iter_json(self.coltype, self.colname,
-requests_params=requests_params, **apiparams)
+return self._collections.iter_json(
+self.coltype, self.colname, requests_params=requests_params, **apiparams
+)

def iter_values(self, requests_params=None, **apiparams):
-return self._collections.iter_values(self.coltype, self.colname,
-requests_params=requests_params, **apiparams)
+return self._collections.iter_values(
+self.coltype, self.colname, requests_params=requests_params, **apiparams
+)
scrapinghub/hubstorage/job.py (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
import logging
from .resourcetype import (ItemsResourceType, DownloadableResource,
-MappingResourceType)
+MappingResourceType)
from .utils import millitime, urlpathjoin
from .jobq import JobQ

scrapinghub/hubstorage/project.py (3 changes: 2 additions & 1 deletion)
@@ -15,7 +15,7 @@ def __init__(self, client, projectid, auth=None):
self.client = client
self.projectid = urlpathjoin(projectid)
assert len(self.projectid.split('/')) == 1, \
-'projectkey must be just one id: %s' % projectid
+'projectkey must be just one id: %s' % projectid
self.auth = xauth(auth) or client.auth
self.jobs = Jobs(client, self.projectid, auth=auth)
self.items = Items(client, self.projectid, auth=auth)
@@ -68,6 +68,7 @@ class Jobs(ResourceType):
def list(self, _key=None, **params):
return self.apiget(_key, params=params)

+
class Items(ResourceType):

resource_type = 'items'
scrapinghub/hubstorage/utils.py (2 changes: 1 addition & 1 deletion)
@@ -45,7 +45,7 @@ def urlpathjoin(*parts):


def xauth(auth):
-"""Expand authentification token
+"""Expand authentication token

>>> xauth(None)
>>> xauth(('user', 'pass'))
scrapinghub/legacy.py (8 changes: 4 additions & 4 deletions)
@@ -61,7 +61,7 @@ def __init__(self, apikey=None, password='', _old_passwd='',
raise RuntimeError("No API key provided and SH_APIKEY environment variable not set")

assert not apikey.startswith('http://'), \
-"Instantiating scrapinghub.Connection with url as first argument is not supported"
+"Instantiating scrapinghub.Connection with url as first argument is not supported"
if password:
warnings.warn("A lot of endpoints support authentication only via apikey.")
self.apikey = apikey
@@ -77,7 +77,7 @@ def __repr__(self):
def auth(self):
warnings.warn("'auth' connection attribute is deprecated, "
"use 'apikey' attribute instead", stacklevel=2)
-return (self.apikey, self.password)
+return self.apikey, self.password

def _create_session(self):
from requests import session
@@ -169,8 +169,8 @@ def _decode_response(self, response, format, raw):
raise APIError("JSON response does not contain status")
else: # jl
return (json.loads(line.decode('utf-8')
-if isinstance(line, _BINARY_TYPE) else line)
-for line in response.iter_lines())
+if isinstance(line, _BINARY_TYPE) else line)
+for line in response.iter_lines())

##
## public methods
setup.py (6 changes: 3 additions & 3 deletions)
@@ -21,12 +21,12 @@
author='Scrapinghub',
author_email='info@scrapinghub.com',
url='http://github.com/scrapinghub/python-scrapinghub',
-platforms = ['Any'],
+platforms=['Any'],
packages=['scrapinghub', 'scrapinghub.client', 'scrapinghub.hubstorage'],
package_data={'scrapinghub': ['VERSION']},
install_requires=['requests>=1.0', 'retrying>=1.3.3', 'six>=1.10.0'],
-extras_require = {'msgpack': [mpack_required]},
-classifiers = [
+extras_require={'msgpack': [mpack_required]},
+classifiers=[
'Development Status :: 5 - Production/Stable',
'License :: OSI Approved :: BSD License',
'Operating System :: OS Independent',
tests/client/test_job.py (2 changes: 1 addition & 1 deletion)
@@ -62,7 +62,7 @@ def test_cancel_jobs_validation(spider):
assert 'keys should be a list' in str(err)

with pytest.raises(ValueError) as err:
-spider.jobs.cancel(count=[1,2])
+spider.jobs.cancel(count=[1, 2])

assert 'count should be an int' in str(err)

tests/client/utils.py (2 changes: 1 addition & 1 deletion)
@@ -39,4 +39,4 @@ def normalize_job_for_tests(job):
existing snapshots.
"""
normalized_key = '{}/{}'.format(TEST_PROJECT_ID, job.key.split('/', 1)[1])
-return job._client.get_job(normalized_key)
\ No newline at end of file
+return job._client.get_job(normalized_key)
tests/conftest.py (2 changes: 1 addition & 1 deletion)
@@ -113,4 +113,4 @@ def _get_accept_header(request):
@pytest.fixture
def frontier_name(request):
"""Provide a name for test-unique HS frontier."""
-return re.sub('\W+', '-', request.node.nodeid)
+return re.sub(r'\W+', '-', request.node.nodeid)
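The raw-string fix above matters because \W is not a recognized escape sequence in an ordinary string literal; since Python 3.6 such escapes emit a DeprecationWarning, while a raw string passes the backslash through unchanged and the regex behaves exactly the same. A quick check:

    import re

    # r'\W+' matches runs of non-word characters, just as '\W+' did,
    # without relying on the deprecated unknown-escape fallback.
    nodeid = 'tests/test_foo.py::test_bar[param-1]'  # hypothetical pytest node id
    print(re.sub(r'\W+', '-', nodeid))  # tests-test_foo-py-test_bar-param-1-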
tests/hubstorage/test_activity.py (2 changes: 1 addition & 1 deletion)
@@ -1,5 +1,5 @@
"""
-Test Activty
+Test Activity
"""
from six.moves import range

tests/hubstorage/test_collections.py (2 changes: 1 addition & 1 deletion)
@@ -72,7 +72,7 @@ def post_scan_test(hsproject, hscollection):

# combining with normal filters
result = list(hscollection.get(filter='["counter", ">", [5]]',
-prefix='post_scan_test1'))
+prefix='post_scan_test1'))
# 10-19
assert len(result) == 10

tests/hubstorage/test_retry.py (10 changes: 5 additions & 5 deletions)
@@ -71,12 +71,12 @@ def request_callback(request):
attempts[0] += 1

if attempts[0] <= timeout_count:
-return (http_error_status, {}, "Timeout")
+return http_error_status, {}, "Timeout"
else:
resp_body = dict(body_on_success)
-return (200, {}, json.dumps(resp_body))
+return 200, {}, json.dumps(resp_body)

-return (request_callback, attempts)
+return request_callback, attempts


def test_delete_on_hubstorage_api_does_not_404():
@@ -154,10 +154,10 @@ def request_callback(request):
if attempts_count[0] <= 2:
raise ConnectionError("Connection aborted.", BadStatusLine("''"))
if attempts_count[0] == 3:
-return (err_code, {}, u'')
+return err_code, {}, u''
else:
resp_body = dict(job_metadata)
-return (200, {}, json.dumps(resp_body))
+return 200, {}, json.dumps(resp_body)

mock_api(callback=request_callback)

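The now-unparenthesized tuples returned by these callbacks follow the (status, headers, body) contract of callback-based HTTP mocking. Assuming the tests register the callbacks through the responses library (the registration is outside this hunk), the pattern looks roughly like this:

    import json

    import responses

    def request_callback(request):
        # responses expects a (status, headers_dict, body_str) tuple.
        return 200, {}, json.dumps({'ok': True})

    responses.add_callback(
        responses.GET,
        'https://storage.scrapinghub.com/jobs/1/2/3',  # hypothetical endpoint
        callback=request_callback,
    )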