From 534a7b7280493119b189cb659b20f4a0a8ad9477 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Fri, 4 Oct 2019 15:40:11 +0800
Subject: [PATCH 1/7] implement new iter_by_chunks() in items

---
 scrapinghub/client/items.py | 27 +++++++++++++++++++++++++++
 tests/client/test_items.py  | 17 +++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py
index 669ea1af..243b9df2 100644
--- a/scrapinghub/client/items.py
+++ b/scrapinghub/client/items.py
@@ -59,3 +59,30 @@ def _modify_iter_params(self, params):
         if offset:
             params['start'] = '{}/{}'.format(self.key, offset)
         return params
+
+    def iter_by_chunks(self, chunksize=10000, *args, **kwargs):
+        """An alternative for reading and processing items by returning a
+        generator of item chunks.
+
+        This is a convenient method for cases when processing a large number of
+        items from a job in one go isn't ideal due to the large memory needed.
+        Instead, it allows you to process the items chunk by chunk.
+
+        You can reduce I/O overhead by increasing the chunk size, but that
+        would also increase memory consumption.
+
+        :return: an iterator over a list of elements.
+        :rtype: :class:`collections.Iterable`
+        """
+
+        processed = 0
+        while True:
+            next_key = self.key + '/' + str(processed)
+            items = [
+                item for item in self.iter(
+                    count=chunksize, start=next_key, *args, **kwargs)
+            ]
+            yield items
+            processed += len(items)
+            if len(items) < chunksize:
+                break
diff --git a/tests/client/test_items.py b/tests/client/test_items.py
index ea19d1bc..44775bf3 100644
--- a/tests/client/test_items.py
+++ b/tests/client/test_items.py
@@ -36,3 +36,20 @@ def test_items_list(spider, json_and_msgpack):
     assert o[0] == {'id': 0, 'data': 'data0'}
     assert o[1] == {'id': 1, 'data': 'data1'}
     assert o[2] == {'id': 2, 'data': 'data2'}
+
+
+def test_items_iter_by_chunks(spider, json_and_msgpack):
+    job = spider.jobs.run(meta={'state': 'running'})
+    _add_test_items(job)
+
+    o = job.items.iter_by_chunks(2)
+    assert next(o) == [
+        {'id': 0, 'data': 'data0'},
+        {'id': 1, 'data': 'data1'},
+    ]
+    assert next(o) == [
+        {'id': 2, 'data': 'data2'},
+    ]
+    next(o)
+    with pytest.raises(StopIteration):
+        next(o)

From 25cd4dc8b70c987b12d96ab6029a5a5cc7e8db64 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Sat, 5 Oct 2019 13:41:08 +0800
Subject: [PATCH 2/7] rename 'iter_by_chunks' to 'list_iter'

---
 scrapinghub/client/items.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py
index 243b9df2..f3b102bd 100644
--- a/scrapinghub/client/items.py
+++ b/scrapinghub/client/items.py
@@ -60,7 +60,7 @@ def _modify_iter_params(self, params):
             params['start'] = '{}/{}'.format(self.key, offset)
         return params
 
-    def iter_by_chunks(self, chunksize=10000, *args, **kwargs):
+    def list_iter(self, chunksize=10000, *args, **kwargs):
         """An alternative for reading and processing items by returning a
         generator of item chunks.
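For context, the method introduced by the two patches above is meant to be consumed
roughly like this from client code. This is a minimal sketch rather than part of the
series; the API key, the job key, and the process() callback are placeholders, while
ScrapinghubClient, get_job() and the new items.list_iter() are the actual entry points::

    from scrapinghub import ScrapinghubClient

    client = ScrapinghubClient('APIKEY')   # placeholder API key
    job = client.get_job('123/1/4')        # placeholder job key

    # Each chunk is a plain list of at most `chunksize` item dicts, so memory
    # use stays bounded even for jobs with a very large number of items.
    for chunk in job.items.list_iter(chunksize=1000):
        process(chunk)  # hypothetical user-defined callback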
From 62cab36dd591f1c2fc3cbec85183ac2f31cc45ba Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Sat, 5 Oct 2019 13:59:09 +0800
Subject: [PATCH 3/7] update docstring of 'list_iter()' to be more clear

---
 scrapinghub/client/items.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py
index f3b102bd..39909b6e 100644
--- a/scrapinghub/client/items.py
+++ b/scrapinghub/client/items.py
@@ -61,8 +61,8 @@ def _modify_iter_params(self, params):
         return params
 
     def list_iter(self, chunksize=10000, *args, **kwargs):
-        """An alternative for reading and processing items by returning a
-        generator of item chunks.
+        """An alternative interface for reading items by returning them
+        as a generator which yields lists of items sized as `chunksize`.
 
         This is a convenient method for cases when processing a large number of
         items from a job in one go isn't ideal due to the large memory needed.
@@ -71,7 +71,7 @@ def list_iter(self, chunksize=10000, *args, **kwargs):
         You can reduce I/O overhead by increasing the chunk size, but that
         would also increase memory consumption.
 
-        :return: an iterator over a list of elements.
+        :return: an iterator over items, yielding lists of items.
         :rtype: :class:`collections.Iterable`
         """

From 15d647d44c5ed8259a4c9507ddb24897f523361c Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Sat, 5 Oct 2019 14:01:45 +0800
Subject: [PATCH 4/7] add new docstring example to 'jobs.items' class for list_iter()

---
 scrapinghub/client/items.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py
index 39909b6e..d946193d 100644
--- a/scrapinghub/client/items.py
+++ b/scrapinghub/client/items.py
@@ -37,6 +37,20 @@ class Items(_DownloadableProxyMixin, _ItemsResourceProxy):
             'size': 100000,
         }]
 
+    - retrieve items via a generator of lists. This is most useful in cases
+      where the job has a huge number of items and it needs to be broken down
+      into chunks when consumed. This example shows a job with 3 items::
+
+        >>> gen = job.items.list_iter(chunksize=2)
+        >>> next(gen)
+        [{'name': 'Item #1'}, {'name': 'Item #2'}]
+        >>> next(gen)
+        [{'name': 'Item #3'}]
+        >>> next(gen)
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+        StopIteration
+
     - retrieve 1 item with multiple filters::
 
         >>> filters = [("size", ">", [30000]), ("size", "<", [40000])]

From 90315bcb1f7048cdc575b5ef2a4098e7032b4a26 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Sat, 5 Oct 2019 14:38:22 +0800
Subject: [PATCH 5/7] fix tests

---
 tests/client/test_items.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/client/test_items.py b/tests/client/test_items.py
index 44775bf3..cc28461d 100644
--- a/tests/client/test_items.py
+++ b/tests/client/test_items.py
@@ -38,11 +38,11 @@ def test_items_list(spider, json_and_msgpack):
     assert o[2] == {'id': 2, 'data': 'data2'}
 
 
-def test_items_iter_by_chunks(spider, json_and_msgpack):
+def test_items_list_iter(spider, json_and_msgpack):
     job = spider.jobs.run(meta={'state': 'running'})
     _add_test_items(job)
 
-    o = job.items.iter_by_chunks(2)
+    o = job.items.list_iter(2)
     assert next(o) == [
         {'id': 0, 'data': 'data0'},
         {'id': 1, 'data': 'data1'},
@@ -50,6 +50,5 @@ def test_items_list(spider, json_and_msgpack):
     assert next(o) == [
         {'id': 2, 'data': 'data2'},
     ]
-    next(o)
     with pytest.raises(StopIteration):
         next(o)

From 6ad4ee91aecf19f3d61e62a457e2d9c29b307189 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Thu, 10 Oct 2019 16:28:50 +0800
Subject: [PATCH 6/7] add new test VCR cassettes for items_list_iter()

---
 tests/client/cassetes/test_items/test_items_list_iter-json.gz | 1 +
 tests/client/cassetes/test_items/test_items_list_iter.gz      | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 tests/client/cassetes/test_items/test_items_list_iter-json.gz
 create mode 100644 tests/client/cassetes/test_items/test_items_list_iter.gz

diff --git a/tests/client/cassetes/test_items/test_items_list_iter-json.gz b/tests/client/cassetes/test_items/test_items_list_iter-json.gz
new file mode 100644
index 00000000..c45d9c2d
--- /dev/null
+++ b/tests/client/cassetes/test_items/test_items_list_iter-json.gz
@@ -0,0 +1 @@
+eJy1lo1/HEUZx9O0lNI0UHwrFJBrJOEI2bu9vV7ukhggbZo0701yCUvbsU72JplN7vbu2ZekKUaIIGhohFaKgoqiokIFbW1VRInu+oLv7+/6v/js7OWlyYVA+vGS2715Znbn5fn+fjMPl89AWVi9tqysbJKZlp43YEvXFnUXlnXDZibVbIxZUE5ga3gGtgVNTQYOs2y4Zga2h9XtGMkxm+czcK26DQtH+gbTsEPdij8dU4fr1DvwF7ftQmM0Go9Hgv9ErDElp+QoLehR0zEi4xb2vVM8P5LPTEOF2o0/C2Z+nGl2czwmKzGlxiroGWY2c0uysX8pKNZg57S5OnmgWlEsm9oM79Xxlrvxhi82dGPMDyRbYZcYPGcUH7KgcgauD6s7MTJkMVNqGWOGDTcQ2K3e5Hc8jRMyJEszcYDGGHdGokpEicTgRqregPUtmsYKtnTI0PIZrId3EXi3WokVY6f0Ql0ow0azOBJ4DxXLE7SG9xJ4n1iW2mgt7KGi84N5w2BileEmAjeL2ARjBYlm9UkGe4MXHMznJ3QGtxC4Ve3CsmaZo3Z+ghlK86GM0q+N9BQchdu9csFgDay1Wzus6bzDae+Rx5LT3bQv0zucmigY46OMx5OG0toWO5A4PNSuJ0Yn4DaqXh8MxMYlkLqZMWZzeD+B29VyjCfrIUQFEYst0tMFBvsIVKn7MEoLhayuUX8G0ZPS1NSUNJo3c5JjZpm/OCwDH6BiZVocXFJTPyWawh2qgrED1NK10NHc2/17oB6qHUfdISC0Cogmg5oZuDOg0M++YwGCeldYkOT3D7Vdb4rE55hl0TEGd4tp9XVBncMRAilo2uqnK0Igqt6GpTR36kIxOdSn2SFFjjWE5FSjkmyUE6H2njTIVL3Gn3k2m5+CGAFFzM/Hvi7UdyTd0dc7CHGq7l6xZkug7CeQEB36oEB9AIEqDRUs22Q0B0kCKfVGjGWoxaUctVCFJ5DByRg0BCwMMhO1Co0EmkRajDHdOBmNRWI+nh+k4uWT1JyGZgL3rKTn3jXE3UegZTVxB0ok+yCBVjGdlckWij1EHV4xA23F9Tf9KbZrJ/yF16ydAQE74bCaxurb3Sf8xOFnq/vycG2ko+4uadi9OODNeU8Oe2f78T7X5811bzHd05WV7uld7rx3qdKbl7zLle68e258Cz7qLXgLW8vc53Y0lt+JRegQPaPYdCMGnY9B1wB0W44zAz1hjv7UG+YrTKmPX6fes2xGlp03kYjIZC6yQugRLZ+L6jbLWVEl+ERjyYZoLH4v8mXazTIc4RX8sNq5OKWj3nn3eVLuLbjnh3H4/Tgl2Xuq/0hnR13HNsWbdS+5swPS/YlWb253D373MffRGrbNfbq8bLc73+g93u9PpJ/j6AcGYNCHMr3GmYYIDL+1M91PObqQWsKFHqAcreco30M5eswxvjdgUyummC2yeZwAWWbzQ5SjCZwQakkk4cOUb1qz1HE4ynQkzFGbWpijJPmSDDNChiy8FrtRAmPqzRi12Uk7WshS3WgKaZyaFrObh9JtUgo4FQ0GkTwNnzKpga5g2tIg03DvsadBJzCuSr786UkJk41bSSJeL8tyU0g3tKyTYYPOSGs+h++2mkIFk2XzNNMEE1S9pWjcliX5ozLzWanFV7zUZ+ooOcgSyKk+krVglLTQPIGCaCADBDIxebvq8w+WQNQWiDpXIDqJiCbeFqLj+RFYItQpZPxMT/EKMdkHq8ROWNUYqhrVDd3iLFNVF6qaYNN+KNhNi1RXzcBJPwHTYY6MneIBRg/yAJqPFKGZQWh8HD4qsrY/AQ9dBQ4PBzjMChw+tgqHRwQOj4aFCaqS73a6xqTh4vHk4wQeU8NYpbAReb9GtbgSVxoUmkyNKkoqFW9gsjw6qsQokxMpRZE1eJwuG/wnCHzyLQ0+JQx+bo0tP0HgtNj4/VVftOUpNgLzV5jypwg8udaUn9oYpjMEzvLSGH2awNPBqqfgXAlzfobAZ9Q9a3Zi356zusEs+OxGGnmWwHOb0MjnAqg/j1DLgrmSeCF2+MTkEo/FMxli9wUhgueFCL7oi8A/GrUfSsOXUAPsam26GKxW2jDsX+N4kWu0vGNgZXBoPIEjhhd4Ra+P3JeFAL5SFMBXiwJ4sSiAr/kC2DTxXw+I/4Yg/qVVxL8siD8fLn1g+CaBV5ZN+dWNUfoWgW+vg9IFAhcXD3XfKYHSJQKX1Zr1UVrtvt/diKzvEfj+Jsh6LSDrB7wd99nsFUeHsqVNVsE91t9i0+7F8sHtnY2d3gXvWfcl99x97qvuBfcFd9Y9g9cz7ht4fc19Udwv4FVsxqr7yD53bvvB43t3ubN7y455r7ivewsjvjm/7u/EPxyAHwk+3xB8Lqzk88f/Nz6VEnz+pMinK/j0inz+tMjnz4p8/vyq+PxFwOebgs9fruLzV4LPX6/D528I/HaZz99tzOfvCfxhHT7/SOBPotf6/fDnEnz+hcBf3wmff9uIz78T+Mcm+PxnwOe/BJ/H1+Xz7DKfbe+AzzMD0p6yW495z3iXYz6P//Z5/M8A/Bd5ZE7kf03pJtc= \ No newline at end of file diff --git a/tests/client/cassetes/test_items/test_items_list_iter.gz b/tests/client/cassetes/test_items/test_items_list_iter.gz new file mode 100644 index 00000000..8c3e325d --- /dev/null +++ b/tests/client/cassetes/test_items/test_items_list_iter.gz @@ -0,0 +1 @@ 
+eJy1lo1/FMUZx0NAwYRosLWIVjlSEo+Yvdvby+UuSaMNBJKQ91dX2hEne5PMJnd79+xLXrBpwVJNIxRqaatVQftqxbZqoa21Le1uX+z7+/vL39JnZw8SksPwCZ9ecrc7z8zuzDzz/f1mjpbOQ0lY3VxSUjLNTEvPGbCha4O6Fcu6YTOTajbGLCglsDE8D5uCpiYDh1k23DQPN4fVmzGSZTbPpWGzugkL/X1Dw7BF3Yi3jqnDLepuvOO2nW+KRuPxSPCfiDWl5JQcpXk9ajpGZNLCvsvE82O59ByUq914mzdzk0yzW+IxWYkpNVZeTzOzhVuSjf1LQbEGO6ct1cm91Ypi2dRmeK2Ot96PF3yxoRsTfiDZBlvF4Dmj+JAFFfNwa1gtw8iIxUypdYIZNtxGoFK90+94DidkSJZm4gCNCe6MRZWIEonBNqrehvWtmsbytrTf0HJprIfbCbxLrcCKiSN6vi6UZuMZHAm8m4r0BK3hDgLvEWmpjdbCdio635czDCayDHcS2CFiU4zlJZrRpxncFbxgXy43pTO4m8B71S4sa5Y5buemmKG07E8rA9pYT95RuN0r5w3WyNq6tQ5N551Oe488kZzrpn3p3tHUVN6YHGc8njSUtgOxvYmOkXY9MT4F91D11mAgNqZA6mbGhM3hXgI71VKMJxsgRAURl1sMz+UZ7CJQpe7CKM3nM7pG/RlEZ6WZmRlpPGdmJcfMMD85LA3voyIzrQ6m1NSPiKawW1UwtpdauhY6lL3ev4cboNpx1C0CQiuPaDKomYf7Agr91XcsQFD3hAVJfv9Q2/W2WPgssyw6weB+Ma2+LqhzOEIgBU3b/OWKEIiq92BpmDt1oZgc6tPskCLHGkNyqklJNimxUHvPMMhUvcmfeSaTm4EYAUXMz8e+LtTXP9zZ1zsEcapWLsvZFVDqCSREhz4o0BBAoEojecs2Gc1CkkBK3YaxNLW4lKUWqvAwMjgdg8aAhSFmolahiUCzWBZjQjdmo7FIzMfz/VS8fJqac9BC4IHl9Dy4irgPEGhdSdzeIou9j0CbmM7yxRaK3U8dXj4PBwr5N/0ptmuH/cRrVllAQBl0qINYvdN9yl84/Gx0XxmtjXTW7ZFG3TcGvUXv1Kj39ABeF/u8xe4NpnuiosI9sdU96V2o8E5K3sWKHvfM5AZ80rvkXdpYsvNeZ/N9WIJO0S9KTTdicPAJ6BqEbstx5qEnzNGdesN8mSX18VvUB5asyLJzJvIQmc5Glsk8ouWyUd1mWSuqBJ9oLNkYjdU/iHSZdosM/bycd6gHL09o0jvvniWl3iX3/CgOfgAnJHunB/oPdtZ1blK8Y+4F99ig9FCizVus7MHvLuYer2Gb3DOlJZXuySbvyQF/IgMcRz84CEM+ksOrfGmEwOg7+9JDlKMHqUU86GHK0XgO8e2Uo8N8kN8VkKkVFphdJvNDBMgSmY9QjhZwWGglkYRHKV+3YqnjcBTpWJijMrUwR0HyKyJMCxGy8GroxglMqDswarNZO5rPUN1oDmmcmhazW0aGD0gp4FQ0GELuNHzKpAZ6gmlLQ0zDnceeA53ApCr54qezEi42biSJeIMsy80h3dAyTpoNOWNtuSy+22oO5U2WydF0M0xR9e6CbVuW5I/KzGWkVl/vUp+po+AgQyCr+kTWglHUQHME8qKBDBCIxOTtqk8/WAJRWyDqXIXoNCKauC5EJ3NjcIVQJ5/2V3qGl4vJPlYl9sGqplDVuG7oFmfpqrpQ1RSb80PBXlqgumoeZv0FmAtzZOwIDzB6jAfQfLgAzTxC4+PwEbFq9Qn46A3gcDTA4ZjA4fEVOHxM4HA8LCxQlXyv0zUmjRYOJx8n8IQaxiqFjcn1GtXiSlxpVGgyNa4oqVS8kcny+LgSo0xOpBRF1uBJumTvCwQ+8Y72Xi/sfXGVKT9F4ITY9v2sXzblGTYGJ6+y5E8SOLXakk+vDdOnCDzNi2P0aQJngqyn4DNFrPmzBD6nbl+1D/vmnNENZsEza2nkWQKfX4dGngugfh6hlgVzRfFC7PCJ6Ss8Fk5kiN0LQgRnhQjO+SLwD0bt+4fhRdQAu1GbLgSrlQMY9n/r8Ueu0XKOgZXBkfEwjhhe4uW9PnJfEAL4YkEAXyoI4MvqHatSm7Um8lSbgq8IaXzVl8a6tfByoIWvCS28skIL54UWXg0XP0h8ncA3luz6m2tD9hqB168B2RsEviV6bUjAhSKQXSTwbXGKL5qJla78nbWI+y6BN9dB3PcC4t7i7bj/PnrVgaJk6hH3jPus97j72u5q9/XdsMddkNXb3XPuC4fcN72z3jPeae+ot+Cd7vDO4e3zg/6lzTvnvrjDfdld2Oa+5C1Uus9V7SvxLpZ7x91T/v4O3/d35R8Mwg8Fq5cEqz9azuqP/2+sKkVYdQuseoLVnxRY/WmB1Z/xgMif3xCRbwdE/kIQ+csVRP5KEPnraxD5GwK/XSLyd2sT+XsCf7gGkX8k8Kfg7JGAPxch8i8E/nr9RP5tLSL/TuAf6yDynwGR/xJE9l+DyAtLRB66DiK3lHhveQtl3qvVPoH/9gn8zyD8FwlkTuR/slMgwg== \ No newline at end of file From 0b7747e82a4feb4583c0a2bd77f3f649605873e0 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 18 Oct 2019 13:08:51 +0300 Subject: [PATCH 7/7] Normalize Job instances in tests --- tests/client/test_items.py | 4 ++++ tests/client/utils.py | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/tests/client/test_items.py b/tests/client/test_items.py index cc28461d..0abdd6cb 100644 --- a/tests/client/test_items.py +++ b/tests/client/test_items.py @@ -1,6 +1,8 @@ import pytest from six.moves import range +from .utils import normalize_job_for_tests + def _add_test_items(job): for i in range(3): @@ -28,6 +30,7 @@ def test_items_iter(spider, json_and_msgpack): def test_items_list(spider, json_and_msgpack): job = spider.jobs.run(meta={'state': 'running'}) + job = normalize_job_for_tests(job) _add_test_items(job) o = job.items.list() @@ -40,6 +43,7 @@ def test_items_list(spider, json_and_msgpack): def 
test_items_list_iter(spider, json_and_msgpack):
     job = spider.jobs.run(meta={'state': 'running'})
+    job = normalize_job_for_tests(job)
     _add_test_items(job)
 
     o = job.items.list_iter(2)
diff --git a/tests/client/utils.py b/tests/client/utils.py
index 8e60cb39..4e490de0 100644
--- a/tests/client/utils.py
+++ b/tests/client/utils.py
@@ -13,3 +13,30 @@ def validate_default_meta(meta, state='pending', units=1,
     assert meta.get('units') == units
     assert meta.get('api_url') == TEST_DASH_ENDPOINT
     assert meta.get('portia_url')
+
+
+def normalize_job_for_tests(job):
+    """A temporary workaround to deal with VCR.py cassettes (snapshots).
+
+    The existing tests rely heavily on VCR.py, which records snapshots of real
+    HTTP requests and responses and, during the test run, tries to match
+    requests against those snapshots. Sometimes it's hard to set up an
+    appropriate test environment locally, so we allow using our servers to
+    create snapshots for new tests, "normalizing" the snapshots by patching
+    hosts/credentials on-the-fly before saving them (see #112).
+
+    The problem here is that we patch only request data, not response data,
+    which is pretty difficult to unify across the whole client. It means that
+    if a test gets data from the API (say, a new job ID) and uses it to form
+    another request (get the job data), it will form the HTTP request
+    differently, so it won't match the snapshots during the test run and the
+    tests will fail.
+
+    As a temporary workaround, this helper takes a Job instance, extracts its
+    key, replaces the project ID part with TEST_PROJECT_ID, and returns a new
+    Job. The subsequent requests made via the new job instance (updating job
+    items, accessing job logs, etc.) will then use URLs that match the
+    existing snapshots.
+    """
+    normalized_key = '{}/{}'.format(TEST_PROJECT_ID, job.key.split('/', 1)[1])
+    return job._client.get_job(normalized_key)
\ No newline at end of file
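To make the key normalization above concrete: the helper only swaps the first
segment of the job key (<project id>/<spider id>/<job id>) and leaves the rest
untouched. A standalone sketch of that expression, using made-up values for
TEST_PROJECT_ID and the job key (illustrative only, not the test suite's actual
constants)::

    TEST_PROJECT_ID = '2222222'   # illustrative test project ID
    job_key = '123456/2/7'        # <project id>/<spider id>/<job id>

    # Same expression as in normalize_job_for_tests(): keep everything after
    # the first '/' and put the test project ID in front of it.
    normalized_key = '{}/{}'.format(TEST_PROJECT_ID, job_key.split('/', 1)[1])
    assert normalized_key == '2222222/2/7'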