Skip to content

Commit 52e9a78

Browse files
committed
Fix JSON error reporting for many cases of erroneous string values.
The majority of error exit cases in json_lex_string() failed to set lex->token_terminator, causing problems for the error context reporting code: it would see token_terminator less than token_start and do something more or less nuts. In v14 and up the end result could be as bad as a crash in report_json_context(). Older versions accidentally avoided that fate; but all versions produce error context lines that are far less useful than intended, because they'd stop at the end of the prior token instead of continuing to where the actually-bad input is.

To fix, invent some macros that make it less notationally painful to do the right thing. Also add documentation about what the function is actually required to do; and in >= v14, add an assertion in report_json_context about token_terminator being sufficiently far advanced.

Per report from Nikolay Shaplov. Back-patch to all supported versions.

Discussion: https://postgr.es/m/7332649.x5DLKWyVIX@thinkpad-pgpro
1 parent bc0bcce commit 52e9a78

File tree

3 files changed

+71
-54
lines changed

3 files changed

+71
-54
lines changed

src/common/jsonapi.c

Lines changed: 47 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,14 @@ json_lex(JsonLexContext *lex)
673673

674674
/*
675675
* The next token in the input stream is known to be a string; lex it.
676+
*
677+
* If lex->strval isn't NULL, fill it with the decoded string.
678+
* Set lex->token_terminator to the end of the decoded input, and in
679+
* success cases, transfer its previous value to lex->prev_token_terminator.
680+
* Return JSON_SUCCESS or an error code.
681+
*
682+
* Note: be careful that all error exits advance lex->token_terminator
683+
* to the point after the character we detected the error on.
676684
*/
677685
static inline JsonParseErrorType
678686
json_lex_string(JsonLexContext *lex)
@@ -681,6 +689,19 @@ json_lex_string(JsonLexContext *lex)
681689
int len;
682690
int hi_surrogate = -1;
683691

692+
/* Convenience macros for error exits */
693+
#define FAIL_AT_CHAR_START(code) \
694+
do { \
695+
lex->token_terminator = s; \
696+
return code; \
697+
} while (0)
698+
#define FAIL_AT_CHAR_END(code) \
699+
do { \
700+
lex->token_terminator = \
701+
s + pg_encoding_mblen_bounded(lex->input_encoding, s); \
702+
return code; \
703+
} while (0)
704+
684705
if (lex->strval != NULL)
685706
resetStringInfo(lex->strval);
686707

@@ -693,29 +714,22 @@ json_lex_string(JsonLexContext *lex)
693714
len++;
694715
/* Premature end of the string. */
695716
if (len >= lex->input_length)
696-
{
697-
lex->token_terminator = s;
698-
return JSON_INVALID_TOKEN;
699-
}
717+
FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
700718
else if (*s == '"')
701719
break;
702720
else if ((unsigned char) *s < 32)
703721
{
704722
/* Per RFC4627, these characters MUST be escaped. */
705723
/* Since *s isn't printable, exclude it from the context string */
706-
lex->token_terminator = s;
707-
return JSON_ESCAPING_REQUIRED;
724+
FAIL_AT_CHAR_START(JSON_ESCAPING_REQUIRED);
708725
}
709726
else if (*s == '\\')
710727
{
711728
/* OK, we have an escape character. */
712729
s++;
713730
len++;
714731
if (len >= lex->input_length)
715-
{
716-
lex->token_terminator = s;
717-
return JSON_INVALID_TOKEN;
718-
}
732+
FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
719733
else if (*s == 'u')
720734
{
721735
int i;
@@ -726,21 +740,15 @@ json_lex_string(JsonLexContext *lex)
726740
s++;
727741
len++;
728742
if (len >= lex->input_length)
729-
{
730-
lex->token_terminator = s;
731-
return JSON_INVALID_TOKEN;
732-
}
743+
FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
733744
else if (*s >= '0' && *s <= '9')
734745
ch = (ch * 16) + (*s - '0');
735746
else if (*s >= 'a' && *s <= 'f')
736747
ch = (ch * 16) + (*s - 'a') + 10;
737748
else if (*s >= 'A' && *s <= 'F')
738749
ch = (ch * 16) + (*s - 'A') + 10;
739750
else
740-
{
741-
lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
742-
return JSON_UNICODE_ESCAPE_FORMAT;
743-
}
751+
FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT);
744752
}
745753
if (lex->strval != NULL)
746754
{
@@ -750,20 +758,20 @@ json_lex_string(JsonLexContext *lex)
750758
if (is_utf16_surrogate_first(ch))
751759
{
752760
if (hi_surrogate != -1)
753-
return JSON_UNICODE_HIGH_SURROGATE;
761+
FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE);
754762
hi_surrogate = ch;
755763
continue;
756764
}
757765
else if (is_utf16_surrogate_second(ch))
758766
{
759767
if (hi_surrogate == -1)
760-
return JSON_UNICODE_LOW_SURROGATE;
768+
FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
761769
ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
762770
hi_surrogate = -1;
763771
}
764772

765773
if (hi_surrogate != -1)
766-
return JSON_UNICODE_LOW_SURROGATE;
774+
FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
767775

768776
/*
769777
* Reject invalid cases. We can't have a value above
@@ -773,7 +781,7 @@ json_lex_string(JsonLexContext *lex)
773781
if (ch == 0)
774782
{
775783
/* We can't allow this, since our TEXT type doesn't */
776-
return JSON_UNICODE_CODE_POINT_ZERO;
784+
FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO);
777785
}
778786

779787
/*
@@ -810,14 +818,14 @@ json_lex_string(JsonLexContext *lex)
810818
appendStringInfoChar(lex->strval, (char) ch);
811819
}
812820
else
813-
return JSON_UNICODE_HIGH_ESCAPE;
821+
FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE);
814822
#endif /* FRONTEND */
815823
}
816824
}
817825
else if (lex->strval != NULL)
818826
{
819827
if (hi_surrogate != -1)
820-
return JSON_UNICODE_LOW_SURROGATE;
828+
FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
821829

822830
switch (*s)
823831
{
@@ -842,10 +850,14 @@ json_lex_string(JsonLexContext *lex)
842850
appendStringInfoChar(lex->strval, '\t');
843851
break;
844852
default:
845-
/* Not a valid string escape, so signal error. */
853+
854+
/*
855+
* Not a valid string escape, so signal error. We
856+
* adjust token_start so that just the escape sequence
857+
* is reported, not the whole string.
858+
*/
846859
lex->token_start = s;
847-
lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
848-
return JSON_ESCAPING_INVALID;
860+
FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
849861
}
850862
}
851863
else if (strchr("\"\\/bfnrt", *s) == NULL)
@@ -858,28 +870,33 @@ json_lex_string(JsonLexContext *lex)
858870
* shown it's not a performance win.
859871
*/
860872
lex->token_start = s;
861-
lex->token_terminator = s + pg_encoding_mblen_bounded(lex->input_encoding, s);
862-
return JSON_ESCAPING_INVALID;
873+
FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
863874
}
864875

865876
}
866877
else if (lex->strval != NULL)
867878
{
868879
if (hi_surrogate != -1)
869-
return JSON_UNICODE_LOW_SURROGATE;
880+
FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
870881

871882
appendStringInfoChar(lex->strval, *s);
872883
}
873884

874885
}
875886

876887
if (hi_surrogate != -1)
888+
{
889+
lex->token_terminator = s + 1;
877890
return JSON_UNICODE_LOW_SURROGATE;
891+
}
878892

879893
/* Hooray, we found the end of the string! */
880894
lex->prev_token_terminator = lex->token_terminator;
881895
lex->token_terminator = s + 1;
882896
return JSON_SUCCESS;
897+
898+
#undef FAIL_AT_CHAR_START
899+
#undef FAIL_AT_CHAR_END
883900
}
884901

885902
/*

src/test/regress/expected/json_encoding.out

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -56,19 +56,19 @@ select json '{ "a": "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
5656
select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
5757
ERROR: invalid input syntax for type json
5858
DETAIL: Unicode high surrogate must not follow a high surrogate.
59-
CONTEXT: JSON data, line 1: { "a":...
59+
CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
6060
select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
6161
ERROR: invalid input syntax for type json
6262
DETAIL: Unicode low surrogate must follow a high surrogate.
63-
CONTEXT: JSON data, line 1: { "a":...
63+
CONTEXT: JSON data, line 1: { "a": "\ude04...
6464
select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
6565
ERROR: invalid input syntax for type json
6666
DETAIL: Unicode low surrogate must follow a high surrogate.
67-
CONTEXT: JSON data, line 1: { "a":...
67+
CONTEXT: JSON data, line 1: { "a": "\ud83dX...
6868
select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
6969
ERROR: invalid input syntax for type json
7070
DETAIL: Unicode low surrogate must follow a high surrogate.
71-
CONTEXT: JSON data, line 1: { "a":...
71+
CONTEXT: JSON data, line 1: { "a": "\ude04...
7272
--handling of simple unicode escapes
7373
select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
7474
correct_in_utf8
@@ -121,7 +121,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
121121
select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
122122
ERROR: unsupported Unicode escape sequence
123123
DETAIL: \u0000 cannot be converted to text.
124-
CONTEXT: JSON data, line 1: { "a":...
124+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
125125
select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
126126
not_an_escape
127127
--------------------
@@ -159,7 +159,7 @@ ERROR: unsupported Unicode escape sequence
159159
LINE 1: SELECT '"\u0000"'::jsonb;
160160
^
161161
DETAIL: \u0000 cannot be converted to text.
162-
CONTEXT: JSON data, line 1: ...
162+
CONTEXT: JSON data, line 1: "\u0000...
163163
-- use octet_length here so we don't get an odd unicode char in the
164164
-- output
165165
SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -180,25 +180,25 @@ ERROR: invalid input syntax for type json
180180
LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a';
181181
^
182182
DETAIL: Unicode high surrogate must not follow a high surrogate.
183-
CONTEXT: JSON data, line 1: { "a":...
183+
CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
184184
SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
185185
ERROR: invalid input syntax for type json
186186
LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a';
187187
^
188188
DETAIL: Unicode low surrogate must follow a high surrogate.
189-
CONTEXT: JSON data, line 1: { "a":...
189+
CONTEXT: JSON data, line 1: { "a": "\ude04...
190190
SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
191191
ERROR: invalid input syntax for type json
192192
LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a';
193193
^
194194
DETAIL: Unicode low surrogate must follow a high surrogate.
195-
CONTEXT: JSON data, line 1: { "a":...
195+
CONTEXT: JSON data, line 1: { "a": "\ud83dX...
196196
SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
197197
ERROR: invalid input syntax for type json
198198
LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a';
199199
^
200200
DETAIL: Unicode low surrogate must follow a high surrogate.
201-
CONTEXT: JSON data, line 1: { "a":...
201+
CONTEXT: JSON data, line 1: { "a": "\ude04...
202202
-- handling of simple unicode escapes
203203
SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
204204
correct_in_utf8
@@ -223,7 +223,7 @@ ERROR: unsupported Unicode escape sequence
223223
LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails;
224224
^
225225
DETAIL: \u0000 cannot be converted to text.
226-
CONTEXT: JSON data, line 1: { "a":...
226+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
227227
SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape;
228228
not_an_escape
229229
------------------------------
@@ -253,7 +253,7 @@ ERROR: unsupported Unicode escape sequence
253253
LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai...
254254
^
255255
DETAIL: \u0000 cannot be converted to text.
256-
CONTEXT: JSON data, line 1: { "a":...
256+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
257257
SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
258258
not_an_escape
259259
--------------------

src/test/regress/expected/json_encoding_1.out

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,19 +52,19 @@ ERROR: conversion between UTF8 and SQL_ASCII is not supported
5252
select json '{ "a": "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
5353
ERROR: invalid input syntax for type json
5454
DETAIL: Unicode high surrogate must not follow a high surrogate.
55-
CONTEXT: JSON data, line 1: { "a":...
55+
CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
5656
select json '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
5757
ERROR: invalid input syntax for type json
5858
DETAIL: Unicode low surrogate must follow a high surrogate.
59-
CONTEXT: JSON data, line 1: { "a":...
59+
CONTEXT: JSON data, line 1: { "a": "\ude04...
6060
select json '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
6161
ERROR: invalid input syntax for type json
6262
DETAIL: Unicode low surrogate must follow a high surrogate.
63-
CONTEXT: JSON data, line 1: { "a":...
63+
CONTEXT: JSON data, line 1: { "a": "\ud83dX...
6464
select json '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
6565
ERROR: invalid input syntax for type json
6666
DETAIL: Unicode low surrogate must follow a high surrogate.
67-
CONTEXT: JSON data, line 1: { "a":...
67+
CONTEXT: JSON data, line 1: { "a": "\ude04...
6868
--handling of simple unicode escapes
6969
select json '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
7070
correct_in_utf8
@@ -113,7 +113,7 @@ select json '{ "a": "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
113113
select json '{ "a": "null \u0000 escape" }' ->> 'a' as fails;
114114
ERROR: unsupported Unicode escape sequence
115115
DETAIL: \u0000 cannot be converted to text.
116-
CONTEXT: JSON data, line 1: { "a":...
116+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
117117
select json '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
118118
not_an_escape
119119
--------------------
@@ -151,7 +151,7 @@ ERROR: unsupported Unicode escape sequence
151151
LINE 1: SELECT '"\u0000"'::jsonb;
152152
^
153153
DETAIL: \u0000 cannot be converted to text.
154-
CONTEXT: JSON data, line 1: ...
154+
CONTEXT: JSON data, line 1: "\u0000...
155155
-- use octet_length here so we don't get an odd unicode char in the
156156
-- output
157157
SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -168,25 +168,25 @@ ERROR: invalid input syntax for type json
168168
LINE 1: SELECT jsonb '{ "a": "\ud83d\ud83d" }' -> 'a';
169169
^
170170
DETAIL: Unicode high surrogate must not follow a high surrogate.
171-
CONTEXT: JSON data, line 1: { "a":...
171+
CONTEXT: JSON data, line 1: { "a": "\ud83d\ud83d...
172172
SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
173173
ERROR: invalid input syntax for type json
174174
LINE 1: SELECT jsonb '{ "a": "\ude04\ud83d" }' -> 'a';
175175
^
176176
DETAIL: Unicode low surrogate must follow a high surrogate.
177-
CONTEXT: JSON data, line 1: { "a":...
177+
CONTEXT: JSON data, line 1: { "a": "\ude04...
178178
SELECT jsonb '{ "a": "\ud83dX" }' -> 'a'; -- orphan high surrogate
179179
ERROR: invalid input syntax for type json
180180
LINE 1: SELECT jsonb '{ "a": "\ud83dX" }' -> 'a';
181181
^
182182
DETAIL: Unicode low surrogate must follow a high surrogate.
183-
CONTEXT: JSON data, line 1: { "a":...
183+
CONTEXT: JSON data, line 1: { "a": "\ud83dX...
184184
SELECT jsonb '{ "a": "\ude04X" }' -> 'a'; -- orphan low surrogate
185185
ERROR: invalid input syntax for type json
186186
LINE 1: SELECT jsonb '{ "a": "\ude04X" }' -> 'a';
187187
^
188188
DETAIL: Unicode low surrogate must follow a high surrogate.
189-
CONTEXT: JSON data, line 1: { "a":...
189+
CONTEXT: JSON data, line 1: { "a": "\ude04...
190190
-- handling of simple unicode escapes
191191
SELECT jsonb '{ "a": "the Copyright \u00a9 sign" }' as correct_in_utf8;
192192
ERROR: conversion between UTF8 and SQL_ASCII is not supported
@@ -209,7 +209,7 @@ ERROR: unsupported Unicode escape sequence
209209
LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' as fails;
210210
^
211211
DETAIL: \u0000 cannot be converted to text.
212-
CONTEXT: JSON data, line 1: { "a":...
212+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
213213
SELECT jsonb '{ "a": "null \\u0000 escape" }' as not_an_escape;
214214
not_an_escape
215215
------------------------------
@@ -237,7 +237,7 @@ ERROR: unsupported Unicode escape sequence
237237
LINE 1: SELECT jsonb '{ "a": "null \u0000 escape" }' ->> 'a' as fai...
238238
^
239239
DETAIL: \u0000 cannot be converted to text.
240-
CONTEXT: JSON data, line 1: { "a":...
240+
CONTEXT: JSON data, line 1: { "a": "null \u0000...
241241
SELECT jsonb '{ "a": "null \\u0000 escape" }' ->> 'a' as not_an_escape;
242242
not_an_escape
243243
--------------------

0 commit comments

Comments
 (0)