PERF: improved perf in .to_json when lines=True

jreback · jreback · commit 7cad3f16bccd · 2016-10-15T16:02:30.000-04:00
closes pandas-dev#14408
diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
@@ -20,8 +20,8 @@ Highlights include:
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
- - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
-
+- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
+- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)
 
 
 
diff --git a/pandas/io/json.py b/pandas/io/json.py
@@ -605,25 +605,9 @@ def _convert_to_line_delimits(s):
     if not s[0] == '[' and s[-1] == ']':
         return s
     s = s[1:-1]
-    num_open_brackets_seen = 0
-    commas_to_replace = []
-    in_quotes = False
-    for idx, char in enumerate(s):              # iter through to find all
-        if char == '"' and idx > 0 and s[idx - 1] != '\\':
-            in_quotes = ~in_quotes
-        elif char == ',':                         # commas that should be \n
-            if num_open_brackets_seen == 0 and not in_quotes:
-                commas_to_replace.append(idx)
-        elif char == '{':
-            if not in_quotes:
-                num_open_brackets_seen += 1
-        elif char == '}':
-            if not in_quotes:
-                num_open_brackets_seen -= 1
-    s_arr = np.array(list(s))                  # Turn to an array to set
-    s_arr[commas_to_replace] = '\n'            # all commas at once.
-    s = ''.join(s_arr)
-    return s
+
+    from pandas.lib import convert_json_to_lines
+    return convert_json_to_lines(s)
 
 
 def nested_to_record(ds, prefix="", level=0):
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -1087,6 +1087,44 @@ def string_array_replace_from_nan_rep(
     return arr
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def convert_json_to_lines(object arr):
+    """
+    Replace the commas that separate top-level JSON records with newlines.
+
+    Commas nested inside ``{...}`` or inside a string literal are left
+    untouched.  Backslash escapes are tracked while scanning a string, so
+    a string that ends in an escaped backslash still closes on its final
+    quote and an escaped quote does not close it early.
+
+    Parameters
+    ----------
+    arr : str
+        Comma separated JSON records; the caller is expected to have
+        stripped the enclosing ``[`` and ``]`` already.
+
+    Returns
+    -------
+    str
+        ``arr`` with every record-separating comma replaced by a line feed.
+    """
+    cdef:
+        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
+        bint in_quotes = False, is_escaped = False
+        ndarray[uint8_t] narr
+        unsigned char v, comma, left_bracket, right_bracket, newline
+        unsigned char quote, backslash
+
+    newline = ord('\n')
+    comma = ord(',')
+    left_bracket = ord('{')
+    right_bracket = ord('}')
+    quote = ord('"')
+    backslash = ord('\\')
+
+    # Work on a writable copy of the UTF-8 bytes.  Every marker compared
+    # below is ASCII, and all bytes of a multi-byte UTF-8 sequence are
+    # >= 0x80, so non-ASCII text can never be mistaken for a marker.
+    narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
+    length = narr.shape[0]
+    for i in range(length):
+        v = narr[i]
+        if in_quotes:
+            if is_escaped:
+                # the byte after a backslash is literal, whatever it is
+                is_escaped = False
+            elif v == backslash:
+                is_escaped = True
+            elif v == quote:
+                in_quotes = False
+        elif v == quote:
+            in_quotes = True
+        elif v == comma:  # commas that should be \n
+            if num_open_brackets_seen == 0:
+                narr[i] = newline
+        elif v == left_bracket:
+            num_open_brackets_seen += 1
+        elif v == right_bracket:
+            num_open_brackets_seen -= 1
+
+    # NOTE(review): ``tostring`` is the older spelling of ``tobytes``; kept
+    # here because the minimum supported numpy is not visible from this diff.
+    return narr.tostring().decode('utf-8')
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def write_csv_rows(list data, ndarray data_index,