Skip to content

Commit f5cfdbb

Browse files
bashtage authored and
TomAugspurger committed
BUG: Set index when reading Stata file (#17328)
Ensures the index is set when requested while reading a Stata dta file. Deprecates and renames index to index_col for API consistency. Closes #16342.
1 parent 9ec157b commit f5cfdbb

File tree

3 files changed

+46
-28
lines changed

3 files changed

+46
-28
lines changed

doc/source/whatsnew/v0.21.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ Other API Changes
431431
- :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`)
432432
- :class:`Period` is now immutable, and will now raise an ``AttributeError`` when a user tries to assign a new value to the ``ordinal`` or ``freq`` attributes (:issue:`17116`).
433433
- :func:`to_datetime` when passed a tz-aware ``origin=`` kwarg will now raise a more informative ``ValueError`` rather than a ``TypeError`` (:issue:`16842`)
434+
- Renamed the non-functional ``index`` keyword argument of :func:`read_stata` to ``index_col`` to improve API consistency; the old name is deprecated (:issue:`16342`)
434435

435436

436437
.. _whatsnew_0210.deprecations:
@@ -515,6 +516,7 @@ I/O
515516
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
516517
- Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`)
517518
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
519+
- Bug in :func:`read_stata` where the index was not set when an index column was requested (:issue:`16342`)
518520
- Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`)
519521
- Bug in :func:`read_csv` where automatic delimiter detection caused a ``TypeError`` to be thrown when a bad line was encountered rather than the correct error message (:issue:`13374`)
520522

pandas/io/stata.py

+34-27
Original file line numberDiff line numberDiff line change
@@ -9,31 +9,30 @@
99
You can find more information on http://presbrey.mit.edu/PyDTA and
1010
http://www.statsmodels.org/devel/
1111
"""
12-
import numpy as np
1312

14-
import sys
13+
import datetime
1514
import struct
16-
from dateutil.relativedelta import relativedelta
15+
import sys
1716

18-
from pandas.core.dtypes.common import (
19-
is_categorical_dtype, is_datetime64_dtype,
20-
_ensure_object)
17+
import numpy as np
18+
from dateutil.relativedelta import relativedelta
19+
from pandas._libs.lib import max_len_string_array, infer_dtype
20+
from pandas._libs.tslib import NaT, Timestamp
2121

22+
import pandas as pd
23+
from pandas import compat, to_timedelta, to_datetime, isna, DatetimeIndex
24+
from pandas.compat import (lrange, lmap, lzip, text_type, string_types, range,
25+
zip, BytesIO)
2226
from pandas.core.base import StringMixin
2327
from pandas.core.categorical import Categorical
28+
from pandas.core.dtypes.common import (is_categorical_dtype, _ensure_object,
29+
is_datetime64_dtype)
2430
from pandas.core.frame import DataFrame
2531
from pandas.core.series import Series
26-
import datetime
27-
from pandas import compat, to_timedelta, to_datetime, isna, DatetimeIndex
28-
from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \
29-
zip, BytesIO
30-
from pandas.util._decorators import Appender
31-
import pandas as pd
32-
3332
from pandas.io.common import (get_filepath_or_buffer, BaseIterator,
3433
_stringify_path)
35-
from pandas._libs.lib import max_len_string_array, infer_dtype
36-
from pandas._libs.tslib import NaT, Timestamp
34+
from pandas.util._decorators import Appender
35+
from pandas.util._decorators import deprecate_kwarg
3736

3837
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
3938
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
@@ -53,8 +52,8 @@
5352
Encoding used to parse the files. None defaults to latin-1."""
5453

5554
_statafile_processing_params2 = """\
56-
index : identifier of index column
57-
identifier of column that should be used as index of the DataFrame
55+
index_col : string, optional, default: None
56+
Column to set as index
5857
convert_missing : boolean, defaults to False
5958
Flag indicating whether to convert missing values to their Stata
6059
representations. If False, missing values are replaced with nan.
@@ -159,15 +158,16 @@
159158

160159

161160
@Appender(_read_stata_doc)
161+
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
162162
def read_stata(filepath_or_buffer, convert_dates=True,
163-
convert_categoricals=True, encoding=None, index=None,
163+
convert_categoricals=True, encoding=None, index_col=None,
164164
convert_missing=False, preserve_dtypes=True, columns=None,
165165
order_categoricals=True, chunksize=None, iterator=False):
166166

167167
reader = StataReader(filepath_or_buffer,
168168
convert_dates=convert_dates,
169169
convert_categoricals=convert_categoricals,
170-
index=index, convert_missing=convert_missing,
170+
index_col=index_col, convert_missing=convert_missing,
171171
preserve_dtypes=preserve_dtypes,
172172
columns=columns,
173173
order_categoricals=order_categoricals,
@@ -945,8 +945,9 @@ def __init__(self, encoding):
945945
class StataReader(StataParser, BaseIterator):
946946
__doc__ = _stata_reader_doc
947947

948+
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
948949
def __init__(self, path_or_buf, convert_dates=True,
949-
convert_categoricals=True, index=None,
950+
convert_categoricals=True, index_col=None,
950951
convert_missing=False, preserve_dtypes=True,
951952
columns=None, order_categoricals=True,
952953
encoding='latin-1', chunksize=None):
@@ -957,7 +958,7 @@ def __init__(self, path_or_buf, convert_dates=True,
957958
# calls to read).
958959
self._convert_dates = convert_dates
959960
self._convert_categoricals = convert_categoricals
960-
self._index = index
961+
self._index_col = index_col
961962
self._convert_missing = convert_missing
962963
self._preserve_dtypes = preserve_dtypes
963964
self._columns = columns
@@ -1461,8 +1462,9 @@ def get_chunk(self, size=None):
14611462
return self.read(nrows=size)
14621463

14631464
@Appender(_read_method_doc)
1465+
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
14641466
def read(self, nrows=None, convert_dates=None,
1465-
convert_categoricals=None, index=None,
1467+
convert_categoricals=None, index_col=None,
14661468
convert_missing=None, preserve_dtypes=None,
14671469
columns=None, order_categoricals=None):
14681470
# Handle empty file or chunk. If reading incrementally raise
@@ -1487,6 +1489,8 @@ def read(self, nrows=None, convert_dates=None,
14871489
columns = self._columns
14881490
if order_categoricals is None:
14891491
order_categoricals = self._order_categoricals
1492+
if index_col is None:
1493+
index_col = self._index_col
14901494

14911495
if nrows is None:
14921496
nrows = self.nobs
@@ -1525,14 +1529,14 @@ def read(self, nrows=None, convert_dates=None,
15251529
self._read_value_labels()
15261530

15271531
if len(data) == 0:
1528-
data = DataFrame(columns=self.varlist, index=index)
1532+
data = DataFrame(columns=self.varlist)
15291533
else:
1530-
data = DataFrame.from_records(data, index=index)
1534+
data = DataFrame.from_records(data)
15311535
data.columns = self.varlist
15321536

15331537
# If index is not specified, use actual row number rather than
15341538
# restarting at 0 for each chunk.
1535-
if index is None:
1539+
if index_col is None:
15361540
ix = np.arange(self._lines_read - read_lines, self._lines_read)
15371541
data = data.set_index(ix)
15381542

@@ -1554,7 +1558,7 @@ def read(self, nrows=None, convert_dates=None,
15541558
cols_ = np.where(self.dtyplist)[0]
15551559

15561560
# Convert columns (if needed) to match input type
1557-
index = data.index
1561+
ix = data.index
15581562
requires_type_conversion = False
15591563
data_formatted = []
15601564
for i in cols_:
@@ -1564,7 +1568,7 @@ def read(self, nrows=None, convert_dates=None,
15641568
if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
15651569
requires_type_conversion = True
15661570
data_formatted.append(
1567-
(col, Series(data[col], index, self.dtyplist[i])))
1571+
(col, Series(data[col], ix, self.dtyplist[i])))
15681572
else:
15691573
data_formatted.append((col, data[col]))
15701574
if requires_type_conversion:
@@ -1607,6 +1611,9 @@ def read(self, nrows=None, convert_dates=None,
16071611
if convert:
16081612
data = DataFrame.from_items(retyped_data)
16091613

1614+
if index_col is not None:
1615+
data = data.set_index(data.pop(index_col))
1616+
16101617
return data
16111618

16121619
def _do_convert_missing(self, data, convert_missing):

pandas/tests/io/test_stata.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,7 @@ def test_read_write_reread_dta15(self):
476476
tm.assert_frame_equal(parsed_114, parsed_117)
477477

478478
def test_timestamp_and_label(self):
479-
original = DataFrame([(1,)], columns=['var'])
479+
original = DataFrame([(1,)], columns=['variable'])
480480
time_stamp = datetime(2000, 2, 29, 14, 21)
481481
data_label = 'This is a data file.'
482482
with tm.ensure_clean() as path:
@@ -1309,3 +1309,12 @@ def test_value_labels_iterator(self, write_index):
13091309
dta_iter = pd.read_stata(path, iterator=True)
13101310
value_labels = dta_iter.value_labels()
13111311
assert value_labels == {'A': {0: 'A', 1: 'B', 2: 'C', 3: 'E'}}
1312+
1313+
def test_set_index(self):
1314+
# GH 17328
1315+
df = tm.makeDataFrame()
1316+
df.index.name = 'index'
1317+
with tm.ensure_clean() as path:
1318+
df.to_stata(path)
1319+
reread = pd.read_stata(path, index_col='index')
1320+
tm.assert_frame_equal(df, reread)

0 commit comments

Comments
 (0)