11from __future__ import absolute_import
22
3+ import sys
4+
35from .proxy import _ItemsResourceProxy , _DownloadableProxyMixin
46
57
@@ -37,6 +39,34 @@ class Items(_DownloadableProxyMixin, _ItemsResourceProxy):
3739 'size': 100000,
3840 }]
3941
42+     - retrieve items via a generator of lists. This is most useful in cases
43+       where the job has a huge number of items and they need to be broken down
44+       into chunks when consumed. This example shows a job with 3 items::
45+
46+ >>> gen = job.items.list_iter(chunksize=2)
47+ >>> next(gen)
48+ [{'name': 'Item #1'}, {'name': 'Item #2'}]
49+ >>> next(gen)
50+ [{'name': 'Item #3'}]
51+ >>> next(gen)
52+ Traceback (most recent call last):
53+ File "<stdin>", line 1, in <module>
54+ StopIteration
55+
56+     - retrieving via :meth:`list_iter` also supports the `start` and `count`
57+       params. This is useful when you want to only retrieve a subset of items in
58+       a job. The example below belongs to a job with 10 items::
59+
60+ >>> gen = job.items.list_iter(chunksize=2, start=5, count=3)
61+ >>> next(gen)
62+ [{'name': 'Item #5'}, {'name': 'Item #6'}]
63+ >>> next(gen)
64+ [{'name': 'Item #7'}]
65+ >>> next(gen)
66+ Traceback (most recent call last):
67+ File "<stdin>", line 1, in <module>
68+ StopIteration
69+
4070 - retrieve 1 item with multiple filters::
4171
4272 >>> filters = [("size", ">", [30000]), ("size", "<", [40000])]
@@ -59,3 +89,43 @@ def _modify_iter_params(self, params):
5989 if offset :
6090 params ['start' ] = '{}/{}' .format (self .key , offset )
6191 return params
92+
def list_iter(self, chunksize=1000, *args, **kwargs):
    """Read items as a generator that yields lists of ``chunksize`` items.

    This is a convenient method for cases when processing a large amount of
    items from a job isn't ideal in one go due to the large memory needed.
    Instead, this allows you to process it chunk by chunk.

    You can improve I/O overheads by increasing the chunk value but that
    would also increase the memory consumption.

    :param chunksize: size of the list to be yielded per iteration.
    :param start: (via ``kwargs``) offset to specify the start of the item
        iteration.
    :param count: (via ``kwargs``) overall number of items to be returned,
        which is broken down by ``chunksize``; defaults to all remaining
        items.

    :return: an iterator over items, yielding lists of items.
    :rtype: :class:`collections.Iterable`
    """
    start = kwargs.pop("start", 0)
    count = kwargs.pop("count", sys.maxsize)
    processed = 0

    while True:
        # Shrink the final chunk so we never request past `count` items.
        if processed + chunksize > count:
            chunksize = count - processed
        # Nothing left to request (count reached exactly, or count <= 0).
        if chunksize <= 0:
            break
        next_key = self.key + "/" + str(start)
        items = list(
            self.iter(count=chunksize, start=next_key, *args, **kwargs))
        # Bug fix: the previous implementation yielded a spurious empty
        # list when the total item count was an exact multiple of
        # `chunksize` (or when the job had no items at all).
        if not items:
            break
        yield items
        processed += len(items)
        start += len(items)
        # Stop once the requested `count` is reached or the backend
        # returned a short chunk, meaning the job is exhausted.
        if processed >= count or len(items) < chunksize:
            break
0 commit comments