Skip to content

Commit 9d6613d

Browse files
feat: add info and memory_usage methods to dataframe (#219)
1 parent ae03756 commit 9d6613d

File tree

10 files changed

+235
-2
lines changed

10 files changed

+235
-2
lines changed

.pre-commit-config.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ repos:
3838
rev: v1.1.1
3939
hooks:
4040
- id: mypy
41-
additional_dependencies: [types-requests]
41+
additional_dependencies: [types-requests, types-tabulate]

bigframes/_config/display_options.py

+4
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ class DisplayOptions:
3232
progress_bar: Optional[str] = "auto"
3333
repr_mode: Literal["head", "deferred"] = "head"
3434

35+
max_info_columns: int = 100
36+
max_info_rows: Optional[int] = 200000
37+
memory_usage: bool = True
38+
3539

3640
@contextlib.contextmanager
3741
def pandas_repr(display_options: DisplayOptions):

bigframes/core/indexes/index.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,14 @@ def _block(self) -> blocks.Block:
155155
def T(self) -> Index:
156156
return self.transpose()
157157

158+
def _memory_usage(self) -> int:
159+
(n_rows,) = self.shape
160+
return sum(
161+
self.dtypes.map(
162+
lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows
163+
)
164+
)
165+
158166
def transpose(self) -> Index:
159167
return self
160168

@@ -326,7 +334,10 @@ def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any:
326334

327335
def __getitem__(self, key: int) -> typing.Any:
328336
if isinstance(key, int):
329-
result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas()
337+
if key != -1:
338+
result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas()
339+
else: # special case, want [-1:] instead of [-1:0]
340+
result_pd_df, _ = self._block.slice(key).to_pandas()
330341
if result_pd_df.empty:
331342
raise IndexError("single positional indexer is out-of-bounds")
332343
return result_pd_df.index[0]

bigframes/dataframe.py

+84
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import datetime
2020
import re
21+
import sys
2122
import textwrap
2223
import typing
2324
from typing import (
@@ -36,6 +37,7 @@
3637
import google.cloud.bigquery as bigquery
3738
import numpy
3839
import pandas
40+
import tabulate
3941

4042
import bigframes
4143
import bigframes._config.display_options as display_options
@@ -350,6 +352,88 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
350352
self._set_internal_query_job(self._compute_dry_run())
351353
return self._query_job
352354

355+
def memory_usage(self, index: bool = True):
356+
n_rows, _ = self.shape
357+
# like pandas, treat all variable-size objects as just 8-byte pointers, ignoring actual object
358+
column_sizes = self.dtypes.map(
359+
lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows
360+
)
361+
if index:
362+
index_size = pandas.Series([self.index._memory_usage()], index=["Index"])
363+
column_sizes = pandas.concat([index_size, column_sizes])
364+
return column_sizes
365+
366+
def info(
367+
self,
368+
verbose: Optional[bool] = None,
369+
buf=None,
370+
max_cols: Optional[int] = None,
371+
memory_usage: Optional[bool] = None,
372+
show_counts: Optional[bool] = None,
373+
):
374+
obuf = buf or sys.stdout
375+
376+
n_rows, n_columns = self.shape
377+
378+
max_cols = (
379+
max_cols
380+
if max_cols is not None
381+
else bigframes.options.display.max_info_columns
382+
)
383+
384+
show_all_columns = verbose if verbose is not None else (n_columns < max_cols)
385+
386+
obuf.write(f"{type(self)}\n")
387+
388+
index_type = "MultiIndex" if self.index.nlevels > 1 else "Index"
389+
390+
# These accesses are kind of expensive, maybe should try to skip?
391+
first_indice = self.index[0]
392+
last_indice = self.index[-1]
393+
obuf.write(f"{index_type}: {n_rows} entries, {first_indice} to {last_indice}\n")
394+
395+
dtype_strings = self.dtypes.astype("string")
396+
if show_all_columns:
397+
obuf.write(f"Data columns (total {n_columns} columns):\n")
398+
column_info = self.columns.to_frame(name="Column")
399+
400+
max_rows = bigframes.options.display.max_info_rows
401+
too_many_rows = n_rows > max_rows if max_rows is not None else False
402+
403+
if show_counts if show_counts is not None else (not too_many_rows):
404+
non_null_counts = self.count().to_pandas()
405+
column_info["Non-Null Count"] = non_null_counts.map(
406+
lambda x: f"{int(x)} non-null"
407+
)
408+
409+
column_info["Dtype"] = dtype_strings
410+
411+
column_info = column_info.reset_index(drop=True)
412+
column_info.index.name = "#"
413+
414+
column_info_formatted = tabulate.tabulate(column_info, headers="keys") # type: ignore
415+
obuf.write(column_info_formatted)
416+
obuf.write("\n")
417+
418+
else: # Just number of columns and first, last
419+
obuf.write(
420+
f"Columns: {n_columns} entries, {self.columns[0]} to {self.columns[-1]}\n"
421+
)
422+
dtype_counts = dtype_strings.value_counts().sort_index(ascending=True).items()
423+
dtype_counts_formatted = ", ".join(
424+
f"{dtype}({count})" for dtype, count in dtype_counts
425+
)
426+
obuf.write(f"dtypes: {dtype_counts_formatted}\n")
427+
428+
show_memory = (
429+
memory_usage
430+
if memory_usage is not None
431+
else bigframes.options.display.memory_usage
432+
)
433+
if show_memory:
434+
# TODO: Convert to different units (kb, mb, etc.)
435+
obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n")
436+
353437
def _set_internal_query_job(self, query_job: bigquery.QueryJob):
354438
self._query_job = query_job
355439

bigframes/dtypes.py

+13
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,19 @@
143143
# "string" and "string[pyarrow] are accepted"
144144
BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow")
145145

146+
# For the purposes of dataframe.memory_usage
147+
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes
148+
DTYPE_BYTE_SIZES = {
149+
pd.BooleanDtype(): 1,
150+
pd.Int64Dtype(): 8,
151+
pd.Float32Dtype(): 8,
152+
pd.StringDtype(): 8,
153+
pd.ArrowDtype(pa.time64("us")): 8,
154+
pd.ArrowDtype(pa.timestamp("us")): 8,
155+
pd.ArrowDtype(pa.timestamp("us", tz="UTC")): 8,
156+
pd.ArrowDtype(pa.date32()): 8,
157+
}
158+
146159

147160
def ibis_dtype_to_bigframes_dtype(
148161
ibis_dtype: ibis_dtypes.DataType,

noxfile.py

+1
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ def mypy(session):
228228
"types-python-dateutil",
229229
"types-requests",
230230
"types-setuptools",
231+
"types-tabulate",
231232
]
232233
)
233234
| set(SYSTEM_TEST_STANDARD_DEPENDENCIES)

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
"requests >=2.27.1",
5151
"scikit-learn >=1.2.2",
5252
"sqlalchemy >=1.4,<3.0dev",
53+
"tabulate >= 0.9",
5354
"ipywidgets >=7.7.1",
5455
"humanize >= 4.6.0",
5556
]

tests/system/small/test_dataframe.py

+42
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import io
1516
import operator
1617
import tempfile
1718
import typing
@@ -255,6 +256,47 @@ def test_drop_with_custom_column_labels(scalars_dfs):
255256
assert_pandas_df_equal(bf_result, pd_result)
256257

257258

259+
def test_df_memory_usage(scalars_dfs):
260+
scalars_df, scalars_pandas_df = scalars_dfs
261+
262+
pd_result = scalars_pandas_df.memory_usage()
263+
bf_result = scalars_df.memory_usage()
264+
265+
pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5)
266+
267+
268+
def test_df_info(scalars_dfs):
269+
expected = (
270+
"<class 'bigframes.dataframe.DataFrame'>\n"
271+
"Index: 9 entries, 0 to 8\n"
272+
"Data columns (total 13 columns):\n"
273+
" # Column Non-Null Count Dtype\n"
274+
"--- ------------- ---------------- ------------------------------\n"
275+
" 0 bool_col 8 non-null boolean\n"
276+
" 1 bytes_col 6 non-null object\n"
277+
" 2 date_col 7 non-null date32[day][pyarrow]\n"
278+
" 3 datetime_col 6 non-null timestamp[us][pyarrow]\n"
279+
" 4 geography_col 4 non-null geometry\n"
280+
" 5 int64_col 8 non-null Int64\n"
281+
" 6 int64_too 9 non-null Int64\n"
282+
" 7 numeric_col 6 non-null object\n"
283+
" 8 float64_col 7 non-null Float64\n"
284+
" 9 rowindex_2 9 non-null Int64\n"
285+
" 10 string_col 8 non-null string\n"
286+
" 11 time_col 6 non-null time64[us][pyarrow]\n"
287+
" 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n"
288+
"dtypes: Float64(1), Int64(3), boolean(1), date32[day][pyarrow](1), geometry(1), object(2), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n"
289+
"memory usage: 945 bytes\n"
290+
)
291+
292+
scalars_df, _ = scalars_dfs
293+
bf_result = io.StringIO()
294+
295+
scalars_df.info(buf=bf_result)
296+
297+
assert expected == bf_result.getvalue()
298+
299+
258300
def test_drop_index(scalars_dfs):
259301
scalars_df, scalars_pandas_df = scalars_dfs
260302

third_party/bigframes_vendored/pandas/core/config_init.py

+11
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,17 @@
3333
Instead estimated bytes processed will be shown. Dataframe and Series
3434
objects can still be computed with methods that explicitly execute and
3535
download results.
36+
max_info_columns (int):
37+
max_info_columns is used in DataFrame.info method to decide if
38+
per column information will be printed.
39+
max_info_rows (int or None):
40+
df.info() will usually show null-counts for each column.
41+
For large frames this can be quite slow. max_info_rows and max_info_cols
42+
limit this null check only to frames with smaller dimensions than
43+
specified.
44+
memory_usage (bool):
45+
This specifies if the memory usage of a DataFrame should be displayed when
46+
df.info() is called. Valid values are True and False.
3647
"""
3748

3849
sampling_options_doc = """

third_party/bigframes_vendored/pandas/core/frame.py

+66
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,72 @@ def values(self) -> np.ndarray:
9292
"""
9393
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
9494

95+
def info(
96+
self,
97+
verbose: bool | None = None,
98+
buf=None,
99+
max_cols: int | None = None,
100+
memory_usage: bool | None = None,
101+
show_counts: bool | None = None,
102+
) -> None:
103+
"""
104+
Print a concise summary of a DataFrame.
105+
106+
This method prints information about a DataFrame including
107+
the index dtype and columns, non-null values and memory usage.
108+
109+
Args:
110+
verbose (bool, optional):
111+
Whether to print the full summary. By default, the setting in
112+
``pandas.options.display.max_info_columns`` is followed.
113+
buf (writable buffer, defaults to sys.stdout):
114+
Where to send the output. By default, the output is printed to
115+
sys.stdout. Pass a writable buffer if you need to further process
116+
the output.
117+
max_cols (int, optional):
118+
When to switch from the verbose to the truncated output. If the
119+
DataFrame has more than `max_cols` columns, the truncated output
120+
is used. By default, the setting in
121+
``pandas.options.display.max_info_columns`` is used.
122+
memory_usage (bool, optional):
123+
Specifies whether total memory usage of the DataFrame
124+
elements (including the index) should be displayed. By default,
125+
this follows the ``pandas.options.display.memory_usage`` setting.
126+
True always show memory usage. False never shows memory usage.
127+
Memory estimation is made based in column dtype and number of rows
128+
assuming values consume the same memory amount for corresponding dtypes.
129+
show_counts (bool, optional):
130+
Whether to show the non-null counts. By default, this is shown
131+
only if the DataFrame is smaller than
132+
``pandas.options.display.max_info_rows`` and
133+
``pandas.options.display.max_info_columns``. A value of True always
134+
shows the counts, and False never shows the counts.
135+
136+
Returns:
137+
None: This method prints a summary of a DataFrame and returns None."""
138+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
139+
140+
def memory_usage(self, index: bool = True):
141+
"""
142+
Return the memory usage of each column in bytes.
143+
144+
The memory usage can optionally include the contribution of
145+
the index and elements of `object` dtype.
146+
147+
This value is displayed in `DataFrame.info` by default. This can be
148+
suppressed by setting ``pandas.options.display.memory_usage`` to False.
149+
150+
Args:
151+
index (bool, default True):
152+
Specifies whether to include the memory usage of the DataFrame's
153+
index in returned Series. If ``index=True``, the memory usage of
154+
the index is the first item in the output.
155+
156+
Returns:
157+
Series: A Series whose index is the original column names and whose values is the memory usage of each column in bytes.
158+
"""
159+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
160+
95161
# ----------------------------------------------------------------------
96162
# IO methods (to / from other formats)
97163
def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray:

0 commit comments

Comments
 (0)