From 534a7b7280493119b189cb659b20f4a0a8ad9477 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Fri, 4 Oct 2019 15:40:11 +0800
Subject: [PATCH 1/7] implement new iter_by_chunks() in items

---
 scrapinghub/client/items.py | 27 +++++++++++++++++++++++++++
 tests/client/test_items.py  | 17 +++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py
index 669ea1af..243b9df2 100644
--- a/scrapinghub/client/items.py
+++ b/scrapinghub/client/items.py
@@ -59,3 +59,30 @@ def _modify_iter_params(self, params):
         if offset:
             params['start'] = '{}/{}'.format(self.key, offset)
         return params
+
+    def iter_by_chunks(self, chunksize=10000, *args, **kwargs):
+        """An alternative for reading and processing items by returning a
+        generator of item chunks.
+
+        This is a convenient method for cases when processing a large number of
+        items from a job in one go isn't ideal due to the large memory needed.
+        Instead, it allows you to process the items chunk by chunk.
+
+        You can reduce I/O overhead by increasing the chunk size, but that
+        would also increase memory consumption.
+
+        :return: an iterator over a list of elements.
+        :rtype: :class:`collections.Iterable`
+        """
+
+        processed = 0
+        while True:
+            next_key = self.key + '/' + str(processed)
+            items = [
+                item for item in self.iter(
+                    count=chunksize, start=next_key, *args, **kwargs)
+            ]
+            yield items
+            processed += len(items)
+            if len(items) < chunksize:
+                break
diff --git a/tests/client/test_items.py b/tests/client/test_items.py
index ea19d1bc..44775bf3 100644
--- a/tests/client/test_items.py
+++ b/tests/client/test_items.py
@@ -36,3 +36,20 @@ def test_items_list(spider, json_and_msgpack):
     assert o[0] == {'id': 0, 'data': 'data0'}
     assert o[1] == {'id': 1, 'data': 'data1'}
     assert o[2] == {'id': 2, 'data': 'data2'}
+
+
+def test_items_iter_by_chunks(spider, json_and_msgpack):
+    job = spider.jobs.run(meta={'state': 'running'})
+    _add_test_items(job)
+
+    o = job.items.iter_by_chunks(2)
+    assert next(o) == [
+        {'id': 0, 'data': 'data0'},
+        {'id': 1, 'data': 'data1'},
+    ]
+    assert next(o) == [
+        {'id': 2, 'data': 'data2'},
+    ]
+    next(o)
+    with pytest.raises(StopIteration):
+        next(o)

From 25cd4dc8b70c987b12d96ab6029a5a5cc7e8db64 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Sat, 5 Oct 2019 13:41:08 +0800
Subject: [PATCH 2/7] rename 'iter_by_chunks' to 'list_iter'

---
 scrapinghub/client/items.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py
index 243b9df2..f3b102bd 100644
--- a/scrapinghub/client/items.py
+++ b/scrapinghub/client/items.py
@@ -60,7 +60,7 @@ def _modify_iter_params(self, params):
             params['start'] = '{}/{}'.format(self.key, offset)
         return params
 
-    def iter_by_chunks(self, chunksize=10000, *args, **kwargs):
+    def list_iter(self, chunksize=10000, *args, **kwargs):
         """An alternative for reading and processing items by returning a
         generator of item chunks.
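For context, the method introduced by the two patches above is meant to be consumed
roughly like this from client code. This is a minimal sketch rather than part of the
series; the API key, the job key, and the process() callback are placeholders, while
ScrapinghubClient, get_job() and the new items.list_iter() are the actual entry points::

    from scrapinghub import ScrapinghubClient

    client = ScrapinghubClient('APIKEY')   # placeholder API key
    job = client.get_job('123/1/4')        # placeholder job key

    # Each chunk is a plain list of at most `chunksize` item dicts, so memory
    # use stays bounded even for jobs with a very large number of items.
    for chunk in job.items.list_iter(chunksize=1000):
        process(chunk)  # hypothetical user-defined callback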
From 62cab36dd591f1c2fc3cbec85183ac2f31cc45ba Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Sat, 5 Oct 2019 13:59:09 +0800
Subject: [PATCH 3/7] update docstring of 'list_iter()' to be more clear

---
 scrapinghub/client/items.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py
index f3b102bd..39909b6e 100644
--- a/scrapinghub/client/items.py
+++ b/scrapinghub/client/items.py
@@ -61,8 +61,8 @@ def _modify_iter_params(self, params):
         return params
 
     def list_iter(self, chunksize=10000, *args, **kwargs):
-        """An alternative for reading and processing items by returning a
-        generator of item chunks.
+        """An alternative interface for reading items by returning them
+        as a generator which yields lists of items sized as `chunksize`.
 
         This is a convenient method for cases when processing a large number of
         items from a job in one go isn't ideal due to the large memory needed.
@@ -71,7 +71,7 @@ def list_iter(self, chunksize=10000, *args, **kwargs):
         You can reduce I/O overhead by increasing the chunk size, but that
         would also increase memory consumption.
 
-        :return: an iterator over a list of elements.
+        :return: an iterator over items, yielding lists of items.
         :rtype: :class:`collections.Iterable`
         """

From 15d647d44c5ed8259a4c9507ddb24897f523361c Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Sat, 5 Oct 2019 14:01:45 +0800
Subject: [PATCH 4/7] add new docstring example to 'jobs.items' class for list_iter()

---
 scrapinghub/client/items.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py
index 39909b6e..d946193d 100644
--- a/scrapinghub/client/items.py
+++ b/scrapinghub/client/items.py
@@ -37,6 +37,20 @@ class Items(_DownloadableProxyMixin, _ItemsResourceProxy):
             'size': 100000,
         }]
 
+    - retrieve items via a generator of lists. This is most useful in cases
+      where the job has a huge number of items and it needs to be broken down
+      into chunks when consumed. This example shows a job with 3 items::
+
+        >>> gen = job.items.list_iter(chunksize=2)
+        >>> next(gen)
+        [{'name': 'Item #1'}, {'name': 'Item #2'}]
+        >>> next(gen)
+        [{'name': 'Item #3'}]
+        >>> next(gen)
+        Traceback (most recent call last):
+          File "<stdin>", line 1, in <module>
+        StopIteration
+
     - retrieve 1 item with multiple filters::
 
         >>> filters = [("size", ">", [30000]), ("size", "<", [40000])]

From 90315bcb1f7048cdc575b5ef2a4098e7032b4a26 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Sat, 5 Oct 2019 14:38:22 +0800
Subject: [PATCH 5/7] fix tests

---
 tests/client/test_items.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/client/test_items.py b/tests/client/test_items.py
index 44775bf3..cc28461d 100644
--- a/tests/client/test_items.py
+++ b/tests/client/test_items.py
@@ -38,11 +38,11 @@ def test_items_list(spider, json_and_msgpack):
     assert o[2] == {'id': 2, 'data': 'data2'}
 
 
-def test_items_iter_by_chunks(spider, json_and_msgpack):
+def test_items_list_iter(spider, json_and_msgpack):
     job = spider.jobs.run(meta={'state': 'running'})
     _add_test_items(job)
 
-    o = job.items.iter_by_chunks(2)
+    o = job.items.list_iter(2)
     assert next(o) == [
         {'id': 0, 'data': 'data0'},
         {'id': 1, 'data': 'data1'},
@@ -50,6 +50,5 @@ def test_items_list(spider, json_and_msgpack):
     assert next(o) == [
         {'id': 2, 'data': 'data2'},
     ]
-    next(o)
     with pytest.raises(StopIteration):
         next(o)

From 6ad4ee91aecf19f3d61e62a457e2d9c29b307189 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Thu, 10 Oct 2019 16:28:50 +0800
Subject: [PATCH 6/7] add new test VCR cassettes for items_list_iter()

---
 tests/client/cassetes/test_items/test_items_list_iter-json.gz | 1 +
 tests/client/cassetes/test_items/test_items_list_iter.gz      | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 tests/client/cassetes/test_items/test_items_list_iter-json.gz
 create mode 100644 tests/client/cassetes/test_items/test_items_list_iter.gz

diff --git a/tests/client/cassetes/test_items/test_items_list_iter-json.gz b/tests/client/cassetes/test_items/test_items_list_iter-json.gz
new file mode 100644
index 00000000..c45d9c2d
--- /dev/null
+++ b/tests/client/cassetes/test_items/test_items_list_iter-json.gz
@@ -0,0 +1 @@
+eJy1lo1/HEUZx9O0lNI0UHwrFJBrJOEI2bu9vV7ukhggbZo0701yCUvbsU72JplN7vbu2ZekKUaIIGhohFaKgoqiokIFbW1VRInu+oLv7+/6v/js7OWlyYVA+vGS2715Znbn5fn+fjMPl89AWVi9tqysbJKZlp43YEvXFnUXlnXDZibVbIxZUE5ga3gGtgVNTQYOs2y4Zga2h9XtGMkxm+czcK26DQtH+gbTsEPdij8dU4fr1DvwF7ftQmM0Go9Hgv9ErDElp+QoLehR0zEi4xb2vVM8P5LPTEOF2o0/C2Z+nGl2czwmKzGlxiroGWY2c0uysX8pKNZg57S5OnmgWlEsm9oM79Xxlrvxhi82dGPMDyRbYZcYPGcUH7KgcgauD6s7MTJkMVNqGWOGDTcQ2K3e5Hc8jRMyJEszcYDGGHdGokpEicTgRqregPUtmsYKtnTI0PIZrId3EXi3WokVY6f0Ql0ow0azOBJ4DxXLE7SG9xJ4n1iW2mgt7KGi84N5w2BileEmAjeL2ARjBYlm9UkGe4MXHMznJ3QGtxC4Ve3CsmaZo3Z+ghlK86GM0q+N9BQchdu9csFgDay1Wzus6bzDae+Rx5LT3bQv0zucmigY46OMx5OG0toWO5A4PNSuJ0Yn4DaqXh8MxMYlkLqZMWZzeD+B29VyjCfrIUQFEYst0tMFBvsIVKn7MEoLhayuUX8G0ZPS1NSUNJo3c5JjZpm/OCwDH6BiZVocXFJTPyWawh2qgrED1NK10NHc2/17oB6qHUfdISC0Cogmg5oZuDOg0M++YwGCeldYkOT3D7Vdb4rE55hl0TEGd4tp9XVBncMRAilo2uqnK0Igqt6GpTR36kIxOdSn2SFFjjWE5FSjkmyUE6H2njTIVL3Gn3k2m5+CGAFFzM/Hvi7UdyTd0dc7CHGq7l6xZkug7CeQEB36oEB9AIEqDRUs22Q0B0kCKfVGjGWoxaUctVCFJ5DByRg0BCwMMhO1Co0EmkRajDHdOBmNRWI+nh+k4uWT1JyGZgL3rKTn3jXE3UegZTVxB0ok+yCBVjGdlckWij1EHV4xA23F9Tf9KbZrJ/yF16ydAQE74bCaxurb3Sf8xOFnq/vycG2ko+4uadi9OODNeU8Oe2f78T7X5811bzHd05WV7uld7rx3qdKbl7zLle68e258Cz7qLXgLW8vc53Y0lt+JRegQPaPYdCMGnY9B1wB0W44zAz1hjv7UG+YrTKmPX6fes2xGlp03kYjIZC6yQugRLZ+L6jbLWVEl+ERjyYZoLH4v8mXazTIc4RX8sNq5OKWj3nn3eVLuLbjnh3H4/Tgl2Xuq/0hnR13HNsWbdS+5swPS/YlWb253D373MffRGrbNfbq8bLc73+g93u9PpJ/j6AcGYNCHMr3GmYYIDL+1M91PObqQWsKFHqAcreco30M5eswxvjdgUyummC2yeZwAWWbzQ5SjCZwQakkk4cOUb1qz1HE4ynQkzFGbWpijJPmSDDNChiy8FrtRAmPqzRi12Uk7WshS3WgKaZyaFrObh9JtUgo4FQ0GkTwNnzKpga5g2tIg03DvsadBJzCuSr786UkJk41bSSJeL8tyU0g3tKyTYYPOSGs+h++2mkIFk2XzNNMEE1S9pWjcliX5ozLzWanFV7zUZ+ooOcgSyKk+krVglLTQPIGCaCADBDIxebvq8w+WQNQWiDpXIDqJiCbeFqLj+RFYItQpZPxMT/EKMdkHq8ROWNUYqhrVDd3iLFNVF6qaYNN+KNhNi1RXzcBJPwHTYY6MneIBRg/yAJqPFKGZQWh8HD4qsrY/AQ9dBQ4PBzjMChw+tgqHRwQOj4aFCaqS73a6xqTh4vHk4wQeU8NYpbAReb9GtbgSVxoUmkyNKkoqFW9gsjw6qsQokxMpRZE1eJwuG/wnCHzyLQ0+JQx+bo0tP0HgtNj4/VVftOUpNgLzV5jypwg8udaUn9oYpjMEzvLSGH2awNPBqqfgXAlzfobAZ9Q9a3Zi356zusEs+OxGGnmWwHOb0MjnAqg/j1DLgrmSeCF2+MTkEo/FMxli9wUhgueFCL7oi8A/GrUfSsOXUAPsam26GKxW2jDsX+N4kWu0vGNgZXBoPIEjhhd4Ra+P3JeFAL5SFMBXiwJ4sSiAr/kC2DTxXw+I/4Yg/qVVxL8siD8fLn1g+CaBV5ZN+dWNUfoWgW+vg9IFAhcXD3XfKYHSJQKX1Zr1UVrtvt/diKzvEfj+Jsh6LSDrB7wd99nsFUeHsqVNVsE91t9i0+7F8sHtnY2d3gXvWfcl99x97qvuBfcFd9Y9g9cz7ht4fc19Udwv4FVsxqr7yD53bvvB43t3ubN7y455r7ivewsjvjm/7u/EPxyAHwk+3xB8Lqzk88f/Nz6VEnz+pMinK/j0inz+tMjnz4p8/vyq+PxFwOebgs9fruLzV4LPX6/D528I/HaZz99tzOfvCfxhHT7/SOBPotf6/fDnEnz+hcBf3wmff9uIz78T+Mcm+PxnwOe/BJ/H1+Xz7DKfbe+AzzMD0p6yW495z3iXYz6P//Z5/M8A/Bd5ZE7kf03pJtc= \ No newline at end of file diff --git a/tests/client/cassetes/test_items/test_items_list_iter.gz b/tests/client/cassetes/test_items/test_items_list_iter.gz new file mode 100644 index 00000000..8c3e325d --- /dev/null +++ b/tests/client/cassetes/test_items/test_items_list_iter.gz @@ -0,0 +1 @@ 
+eJy1lo1/FMUZx0NAwYRosLWIVjlSEo+Yvdvby+UuSaMNBJKQ91dX2hEne5PMJnd79+xLXrBpwVJNIxRqaatVQftqxbZqoa21Le1uX+z7+/vL39JnZw8SksPwCZ9ecrc7z8zuzDzz/f1mjpbOQ0lY3VxSUjLNTEvPGbCha4O6Fcu6YTOTajbGLCglsDE8D5uCpiYDh1k23DQPN4fVmzGSZTbPpWGzugkL/X1Dw7BF3Yi3jqnDLepuvOO2nW+KRuPxSPCfiDWl5JQcpXk9ajpGZNLCvsvE82O59ByUq914mzdzk0yzW+IxWYkpNVZeTzOzhVuSjf1LQbEGO6ct1cm91Ypi2dRmeK2Ot96PF3yxoRsTfiDZBlvF4Dmj+JAFFfNwa1gtw8iIxUypdYIZNtxGoFK90+94DidkSJZm4gCNCe6MRZWIEonBNqrehvWtmsbytrTf0HJprIfbCbxLrcCKiSN6vi6UZuMZHAm8m4r0BK3hDgLvEWmpjdbCdio635czDCayDHcS2CFiU4zlJZrRpxncFbxgXy43pTO4m8B71S4sa5Y5buemmKG07E8rA9pYT95RuN0r5w3WyNq6tQ5N551Oe488kZzrpn3p3tHUVN6YHGc8njSUtgOxvYmOkXY9MT4F91D11mAgNqZA6mbGhM3hXgI71VKMJxsgRAURl1sMz+UZ7CJQpe7CKM3nM7pG/RlEZ6WZmRlpPGdmJcfMMD85LA3voyIzrQ6m1NSPiKawW1UwtpdauhY6lL3ev4cboNpx1C0CQiuPaDKomYf7Agr91XcsQFD3hAVJfv9Q2/W2WPgssyw6weB+Ma2+LqhzOEIgBU3b/OWKEIiq92BpmDt1oZgc6tPskCLHGkNyqklJNimxUHvPMMhUvcmfeSaTm4EYAUXMz8e+LtTXP9zZ1zsEcapWLsvZFVDqCSREhz4o0BBAoEojecs2Gc1CkkBK3YaxNLW4lKUWqvAwMjgdg8aAhSFmolahiUCzWBZjQjdmo7FIzMfz/VS8fJqac9BC4IHl9Dy4irgPEGhdSdzeIou9j0CbmM7yxRaK3U8dXj4PBwr5N/0ptmuH/cRrVllAQBl0qINYvdN9yl84/Gx0XxmtjXTW7ZFG3TcGvUXv1Kj39ABeF/u8xe4NpnuiosI9sdU96V2o8E5K3sWKHvfM5AZ80rvkXdpYsvNeZ/N9WIJO0S9KTTdicPAJ6BqEbstx5qEnzNGdesN8mSX18VvUB5asyLJzJvIQmc5Glsk8ouWyUd1mWSuqBJ9oLNkYjdU/iHSZdosM/bycd6gHL09o0jvvniWl3iX3/CgOfgAnJHunB/oPdtZ1blK8Y+4F99ig9FCizVus7MHvLuYer2Gb3DOlJZXuySbvyQF/IgMcRz84CEM+ksOrfGmEwOg7+9JDlKMHqUU86GHK0XgO8e2Uo8N8kN8VkKkVFphdJvNDBMgSmY9QjhZwWGglkYRHKV+3YqnjcBTpWJijMrUwR0HyKyJMCxGy8GroxglMqDswarNZO5rPUN1oDmmcmhazW0aGD0gp4FQ0GELuNHzKpAZ6gmlLQ0zDnceeA53ApCr54qezEi42biSJeIMsy80h3dAyTpoNOWNtuSy+22oO5U2WydF0M0xR9e6CbVuW5I/KzGWkVl/vUp+po+AgQyCr+kTWglHUQHME8qKBDBCIxOTtqk8/WAJRWyDqXIXoNCKauC5EJ3NjcIVQJ5/2V3qGl4vJPlYl9sGqplDVuG7oFmfpqrpQ1RSb80PBXlqgumoeZv0FmAtzZOwIDzB6jAfQfLgAzTxC4+PwEbFq9Qn46A3gcDTA4ZjA4fEVOHxM4HA8LCxQlXyv0zUmjRYOJx8n8IQaxiqFjcn1GtXiSlxpVGgyNa4oqVS8kcny+LgSo0xOpBRF1uBJumTvCwQ+8Y72Xi/sfXGVKT9F4ITY9v2sXzblGTYGJ6+y5E8SOLXakk+vDdOnCDzNi2P0aQJngqyn4DNFrPmzBD6nbl+1D/vmnNENZsEza2nkWQKfX4dGngugfh6hlgVzRfFC7PCJ6Ss8Fk5kiN0LQgRnhQjO+SLwD0bt+4fhRdQAu1GbLgSrlQMY9n/r8Ueu0XKOgZXBkfEwjhhe4uW9PnJfEAL4YkEAXyoI4MvqHatSm7Um8lSbgq8IaXzVl8a6tfByoIWvCS28skIL54UWXg0XP0h8ncA3luz6m2tD9hqB168B2RsEviV6bUjAhSKQXSTwbXGKL5qJla78nbWI+y6BN9dB3PcC4t7i7bj/PnrVgaJk6hH3jPus97j72u5q9/XdsMddkNXb3XPuC4fcN72z3jPeae+ot+Cd7vDO4e3zg/6lzTvnvrjDfdld2Oa+5C1Uus9V7SvxLpZ7x91T/v4O3/d35R8Mwg8Fq5cEqz9azuqP/2+sKkVYdQuseoLVnxRY/WmB1Z/xgMif3xCRbwdE/kIQ+csVRP5KEPnraxD5GwK/XSLyd2sT+XsCf7gGkX8k8Kfg7JGAPxch8i8E/nr9RP5tLSL/TuAf6yDynwGR/xJE9l+DyAtLRB66DiK3lHhveQtl3qvVPoH/9gn8zyD8FwlkTuR/slMgwg== \ No newline at end of file From 0b7747e82a4feb4583c0a2bd77f3f649605873e0 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 18 Oct 2019 13:08:51 +0300 Subject: [PATCH 7/7] Normalize Job instances in tests --- tests/client/test_items.py | 4 ++++ tests/client/utils.py | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/tests/client/test_items.py b/tests/client/test_items.py index cc28461d..0abdd6cb 100644 --- a/tests/client/test_items.py +++ b/tests/client/test_items.py @@ -1,6 +1,8 @@ import pytest from six.moves import range +from .utils import normalize_job_for_tests + def _add_test_items(job): for i in range(3): @@ -28,6 +30,7 @@ def test_items_iter(spider, json_and_msgpack): def test_items_list(spider, json_and_msgpack): job = spider.jobs.run(meta={'state': 'running'}) + job = normalize_job_for_tests(job) _add_test_items(job) o = job.items.list() @@ -40,6 +43,7 @@ def test_items_list(spider, json_and_msgpack): def 
test_items_list_iter(spider, json_and_msgpack):
     job = spider.jobs.run(meta={'state': 'running'})
+    job = normalize_job_for_tests(job)
     _add_test_items(job)
 
     o = job.items.list_iter(2)
diff --git a/tests/client/utils.py b/tests/client/utils.py
index 8e60cb39..4e490de0 100644
--- a/tests/client/utils.py
+++ b/tests/client/utils.py
@@ -13,3 +13,30 @@ def validate_default_meta(meta, state='pending', units=1,
     assert meta.get('units') == units
     assert meta.get('api_url') == TEST_DASH_ENDPOINT
     assert meta.get('portia_url')
+
+
+def normalize_job_for_tests(job):
+    """A temporary workaround to deal with VCR.py cassettes (snapshots).
+
+    The existing tests rely heavily on VCR.py, which records snapshots of real
+    HTTP requests and responses and, during the test run, tries to match
+    requests against those snapshots. Sometimes it's hard to set up an
+    appropriate test environment locally, so we allow using our servers to
+    create snapshots for new tests, "normalizing" the snapshots by patching
+    hosts/credentials on-the-fly before saving them (see #112).
+
+    The problem here is that we patch only request data, not response data,
+    which is pretty difficult to unify across the whole client. It means that
+    if a test gets data from the API (say, a new job ID) and uses it to form
+    another request (get the job data), it will form the HTTP request
+    differently, so it won't match the snapshots during the test run and the
+    tests will fail.
+
+    As a temporary workaround, this helper takes a Job instance, extracts its
+    key, replaces the project ID part with TEST_PROJECT_ID, and returns a new
+    Job. The subsequent requests made via the new job instance (updating job
+    items, accessing job logs, etc.) will then use URLs that match the
+    existing snapshots.
+    """
+    normalized_key = '{}/{}'.format(TEST_PROJECT_ID, job.key.split('/', 1)[1])
+    return job._client.get_job(normalized_key)
\ No newline at end of file
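To make the key normalization above concrete: the helper only swaps the first
segment of the job key (<project id>/<spider id>/<job id>) and leaves the rest
untouched. A standalone sketch of that expression, using made-up values for
TEST_PROJECT_ID and the job key (illustrative only, not the test suite's actual
constants)::

    TEST_PROJECT_ID = '2222222'   # illustrative test project ID
    job_key = '123456/2/7'        # <project id>/<spider id>/<job id>

    # Same expression as in normalize_job_for_tests(): keep everything after
    # the first '/' and put the test project ID in front of it.
    normalized_key = '{}/{}'.format(TEST_PROJECT_ID, job_key.split('/', 1)[1])
    assert normalized_key == '2222222/2/7'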