Skip to content

Commit 4e30df8

Browse files
committed
BUG: Set index when reading Stata file
Ensures the index is set when requested while reading a Stata dta file. Deprecates and renames index to index_col for API consistency. Closes #16342.
1 parent 0d676a3 commit 4e30df8

File tree

3 files changed

+46
-28
lines changed

3 files changed

+46
-28
lines changed

doc/source/whatsnew/v0.21.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,7 @@ Other API Changes
293293
- :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`)
294294
- :class:`Period` is now immutable, and will now raise an ``AttributeError`` when a user tries to assign a new value to the ``ordinal`` or ``freq`` attributes (:issue:`17116`).
295295
- :func:`to_datetime` when passed a tz-aware ``origin=`` kwarg will now raise a more informative ``ValueError`` rather than a ``TypeError`` (:issue:`16842`)
296+
- Renamed non-functional ``index`` to ``index_col`` in :func:`read_stata` to improve API consistency (:issue:`16342`)
296297

297298

298299
.. _whatsnew_0210.deprecations:
@@ -370,6 +371,7 @@ I/O
370371
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
371372
- Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`)
372373
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
374+
- Bug in :func:`read_stata` where the index was not set (:issue:`16342`)
373375
- Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`)
374376

375377
Plotting

pandas/io/stata.py

+34-27
Original file line numberDiff line numberDiff line change
@@ -9,31 +9,30 @@
99
You can find more information on http://presbrey.mit.edu/PyDTA and
1010
http://www.statsmodels.org/devel/
1111
"""
12-
import numpy as np
1312

14-
import sys
13+
import datetime
1514
import struct
16-
from dateutil.relativedelta import relativedelta
15+
import sys
1716

18-
from pandas.core.dtypes.common import (
19-
is_categorical_dtype, is_datetime64_dtype,
20-
_ensure_object)
17+
import numpy as np
18+
from dateutil.relativedelta import relativedelta
19+
from pandas._libs.lib import max_len_string_array, infer_dtype
20+
from pandas._libs.tslib import NaT, Timestamp
2121

22+
import pandas as pd
23+
from pandas import compat, to_timedelta, to_datetime, isna, DatetimeIndex
24+
from pandas.compat import (lrange, lmap, lzip, text_type, string_types, range,
25+
zip, BytesIO)
2226
from pandas.core.base import StringMixin
2327
from pandas.core.categorical import Categorical
28+
from pandas.core.dtypes.common import (is_categorical_dtype, _ensure_object,
29+
is_datetime64_dtype)
2430
from pandas.core.frame import DataFrame
2531
from pandas.core.series import Series
26-
import datetime
27-
from pandas import compat, to_timedelta, to_datetime, isna, DatetimeIndex
28-
from pandas.compat import lrange, lmap, lzip, text_type, string_types, range, \
29-
zip, BytesIO
30-
from pandas.util._decorators import Appender
31-
import pandas as pd
32-
3332
from pandas.io.common import (get_filepath_or_buffer, BaseIterator,
3433
_stringify_path)
35-
from pandas._libs.lib import max_len_string_array, infer_dtype
36-
from pandas._libs.tslib import NaT, Timestamp
34+
from pandas.util._decorators import Appender
35+
from pandas.util._decorators import deprecate_kwarg
3736

3837
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
3938
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
@@ -53,8 +52,8 @@
5352
Encoding used to parse the files. None defaults to latin-1."""
5453

5554
_statafile_processing_params2 = """\
56-
index : identifier of index column
57-
identifier of column that should be used as index of the DataFrame
55+
index_col : string, optional, default: None
56+
Column to set as index
5857
convert_missing : boolean, defaults to False
5958
Flag indicating whether to convert missing values to their Stata
6059
representations. If False, missing values are replaced with nans.
@@ -159,15 +158,16 @@
159158

160159

161160
@Appender(_read_stata_doc)
161+
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
162162
def read_stata(filepath_or_buffer, convert_dates=True,
163-
convert_categoricals=True, encoding=None, index=None,
163+
convert_categoricals=True, encoding=None, index_col=None,
164164
convert_missing=False, preserve_dtypes=True, columns=None,
165165
order_categoricals=True, chunksize=None, iterator=False):
166166

167167
reader = StataReader(filepath_or_buffer,
168168
convert_dates=convert_dates,
169169
convert_categoricals=convert_categoricals,
170-
index=index, convert_missing=convert_missing,
170+
index_col=index_col, convert_missing=convert_missing,
171171
preserve_dtypes=preserve_dtypes,
172172
columns=columns,
173173
order_categoricals=order_categoricals,
@@ -944,8 +944,9 @@ def __init__(self, encoding):
944944
class StataReader(StataParser, BaseIterator):
945945
__doc__ = _stata_reader_doc
946946

947+
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
947948
def __init__(self, path_or_buf, convert_dates=True,
948-
convert_categoricals=True, index=None,
949+
convert_categoricals=True, index_col=None,
949950
convert_missing=False, preserve_dtypes=True,
950951
columns=None, order_categoricals=True,
951952
encoding='latin-1', chunksize=None):
@@ -956,7 +957,7 @@ def __init__(self, path_or_buf, convert_dates=True,
956957
# calls to read).
957958
self._convert_dates = convert_dates
958959
self._convert_categoricals = convert_categoricals
959-
self._index = index
960+
self._index_col = index_col
960961
self._convert_missing = convert_missing
961962
self._preserve_dtypes = preserve_dtypes
962963
self._columns = columns
@@ -1460,8 +1461,9 @@ def get_chunk(self, size=None):
14601461
return self.read(nrows=size)
14611462

14621463
@Appender(_read_method_doc)
1464+
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
14631465
def read(self, nrows=None, convert_dates=None,
1464-
convert_categoricals=None, index=None,
1466+
convert_categoricals=None, index_col=None,
14651467
convert_missing=None, preserve_dtypes=None,
14661468
columns=None, order_categoricals=None):
14671469
# Handle empty file or chunk. If reading incrementally raise
@@ -1486,6 +1488,8 @@ def read(self, nrows=None, convert_dates=None,
14861488
columns = self._columns
14871489
if order_categoricals is None:
14881490
order_categoricals = self._order_categoricals
1491+
if index_col is None:
1492+
index_col = self._index_col
14891493

14901494
if nrows is None:
14911495
nrows = self.nobs
@@ -1524,14 +1528,14 @@ def read(self, nrows=None, convert_dates=None,
15241528
self._read_value_labels()
15251529

15261530
if len(data) == 0:
1527-
data = DataFrame(columns=self.varlist, index=index)
1531+
data = DataFrame(columns=self.varlist)
15281532
else:
1529-
data = DataFrame.from_records(data, index=index)
1533+
data = DataFrame.from_records(data)
15301534
data.columns = self.varlist
15311535

15321536
# If index is not specified, use actual row number rather than
15331537
# restarting at 0 for each chunk.
1534-
if index is None:
1538+
if index_col is None:
15351539
ix = np.arange(self._lines_read - read_lines, self._lines_read)
15361540
data = data.set_index(ix)
15371541

@@ -1553,7 +1557,7 @@ def read(self, nrows=None, convert_dates=None,
15531557
cols_ = np.where(self.dtyplist)[0]
15541558

15551559
# Convert columns (if needed) to match input type
1556-
index = data.index
1560+
ix = data.index
15571561
requires_type_conversion = False
15581562
data_formatted = []
15591563
for i in cols_:
@@ -1563,7 +1567,7 @@ def read(self, nrows=None, convert_dates=None,
15631567
if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
15641568
requires_type_conversion = True
15651569
data_formatted.append(
1566-
(col, Series(data[col], index, self.dtyplist[i])))
1570+
(col, Series(data[col], ix, self.dtyplist[i])))
15671571
else:
15681572
data_formatted.append((col, data[col]))
15691573
if requires_type_conversion:
@@ -1606,6 +1610,9 @@ def read(self, nrows=None, convert_dates=None,
16061610
if convert:
16071611
data = DataFrame.from_items(retyped_data)
16081612

1613+
if index_col is not None:
1614+
data = data.set_index(data.pop(index_col))
1615+
16091616
return data
16101617

16111618
def _do_convert_missing(self, data, convert_missing):

pandas/tests/io/test_stata.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,7 @@ def test_read_write_reread_dta15(self):
476476
tm.assert_frame_equal(parsed_114, parsed_117)
477477

478478
def test_timestamp_and_label(self):
479-
original = DataFrame([(1,)], columns=['var'])
479+
original = DataFrame([(1,)], columns=['variable'])
480480
time_stamp = datetime(2000, 2, 29, 14, 21)
481481
data_label = 'This is a data file.'
482482
with tm.ensure_clean() as path:
@@ -1309,3 +1309,12 @@ def test_value_labels_iterator(self, write_index):
13091309
dta_iter = pd.read_stata(path, iterator=True)
13101310
value_labels = dta_iter.value_labels()
13111311
assert value_labels == {'A': {0: 'A', 1: 'B', 2: 'C', 3: 'E'}}
1312+
1313+
def test_set_index(self):
1314+
# GH 17328
1315+
df = tm.makeDataFrame()
1316+
df.index.name = 'index'
1317+
with tm.ensure_clean() as path:
1318+
df.to_stata(path)
1319+
reread = pd.read_stata(path, index_col='index')
1320+
tm.assert_frame_equal(df, reread)

0 commit comments

Comments
 (0)