Skip to content

Commit 7cad3f1

Browse files
committed
PERF: improved perf in .to_json when lines=True
closes pandas-dev#14408
1 parent fd3be00 commit 7cad3f1

File tree

3 files changed

+43
-21
lines changed

3 files changed

+43
-21
lines changed

doc/source/whatsnew/v0.19.1.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ Highlights include:
2020
Performance Improvements
2121
~~~~~~~~~~~~~~~~~~~~~~~~
2222

23-
- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
24-
23+
- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
24+
- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)
2525

2626

2727

pandas/io/json.py

+3-19
Original file line numberDiff line numberDiff line change
@@ -605,25 +605,9 @@ def _convert_to_line_delimits(s):
605605
if not s[0] == '[' and s[-1] == ']':
606606
return s
607607
s = s[1:-1]
608-
num_open_brackets_seen = 0
609-
commas_to_replace = []
610-
in_quotes = False
611-
for idx, char in enumerate(s): # iter through to find all
612-
if char == '"' and idx > 0 and s[idx - 1] != '\\':
613-
in_quotes = ~in_quotes
614-
elif char == ',': # commas that should be \n
615-
if num_open_brackets_seen == 0 and not in_quotes:
616-
commas_to_replace.append(idx)
617-
elif char == '{':
618-
if not in_quotes:
619-
num_open_brackets_seen += 1
620-
elif char == '}':
621-
if not in_quotes:
622-
num_open_brackets_seen -= 1
623-
s_arr = np.array(list(s)) # Turn to an array to set
624-
s_arr[commas_to_replace] = '\n' # all commas at once.
625-
s = ''.join(s_arr)
626-
return s
608+
609+
from pandas.lib import convert_json_to_lines
610+
return convert_json_to_lines(s)
627611

628612

629613
def nested_to_record(ds, prefix="", level=0):

pandas/lib.pyx

+38
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,44 @@ def string_array_replace_from_nan_rep(
10871087
return arr
10881088

10891089

1090+
@cython.boundscheck(False)
@cython.wraparound(False)
def convert_json_to_lines(object arr):
    """
    Convert a comma separated run of JSON objects to line-delimited JSON.

    Every comma that separates two top-level objects is replaced by a line
    feed. Commas that appear inside a string literal or inside ``{...}``
    are left untouched.

    Parameters
    ----------
    arr : str
        JSON text. It is encoded to UTF-8 and scanned byte by byte.

    Returns
    -------
    str
        The same text with each top-level separating comma replaced
        by ``\n``.
    """
    cdef:
        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
        bint in_quotes = False, is_escaping = False
        ndarray[uint8_t] narr
        unsigned char v, comma, left_bracket, right_bracket, newline
        unsigned char quote, backslash

    newline = ord('\n')
    comma = ord(',')
    left_bracket = ord('{')
    right_bracket = ord('}')
    quote = ord('"')
    backslash = ord('\\')

    # Work on a writable copy of the UTF-8 bytes. Every byte of a multi-byte
    # UTF-8 sequence is >= 0x80, so it can never be mistaken for one of the
    # ASCII delimiters compared against below.
    narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
    length = narr.shape[0]
    for i in range(length):
        v = narr[i]
        if is_escaping:
            # The previous byte was an unescaped backslash, so this byte is
            # a literal (e.g. the quote in \" or the second slash in \\).
            is_escaping = False
            continue
        if v == backslash:
            # Track escape state explicitly instead of peeking at the
            # previous byte, so that "foo\\" (escaped backslash followed by
            # the closing quote) correctly terminates the string.
            is_escaping = True
            continue
        if v == quote:
            in_quotes = not in_quotes
        elif not in_quotes:
            if v == comma:  # commas that should be \n
                if num_open_brackets_seen == 0:
                    narr[i] = newline
            elif v == left_bracket:
                num_open_brackets_seen += 1
            elif v == right_bracket:
                num_open_brackets_seen -= 1

    # NOTE(review): ndarray.tostring() is deprecated in newer NumPy releases
    # in favour of tobytes(); kept as-is here -- confirm the minimum NumPy
    # version this module must support before switching.
    return narr.tostring().decode('utf-8')
1126+
1127+
10901128
@cython.boundscheck(False)
10911129
@cython.wraparound(False)
10921130
def write_csv_rows(list data, ndarray data_index,

0 commit comments

Comments
 (0)