Skip to content

Commit c47adb0

Browse files
committed
Change PyUnicode_KIND to 1,2,4. Drop _KIND_SIZE and _CHARACTER_SIZE.
1 parent dd07732 commit c47adb0

File tree

8 files changed

+84
-123
lines changed

8 files changed

+84
-123
lines changed

Doc/c-api/unicode.rst

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ access internal read-only data of Unicode objects:
9999
100100
.. deprecated-removed:: 3.3 4.0
101101
Part of the old-style Unicode API, please migrate to using
102-
:c:func:`PyUnicode_GET_LENGTH` or :c:func:`PyUnicode_KIND_SIZE`.
102+
:c:func:`PyUnicode_GET_LENGTH`.
103103
104104
105105
.. c:function:: Py_UNICODE* PyUnicode_AS_UNICODE(PyObject *o)
@@ -149,9 +149,8 @@ access internal read-only data of Unicode objects:
149149
Return a pointer to the canonical representation cast to UCS1, UCS2 or UCS4
150150
integer types for direct character access. No checks are performed if the
151151
canonical representation has the correct character size; use
152-
:c:func:`PyUnicode_CHARACTER_SIZE` or :c:func:`PyUnicode_KIND` to select the
153-
right macro. Make sure :c:func:`PyUnicode_READY` has been called before
154-
accessing this.
152+
:c:func:`PyUnicode_KIND` to select the right macro. Make sure
153+
:c:func:`PyUnicode_READY` has been called before accessing this.
155154
156155
.. versionadded:: 3.3
157156
@@ -176,15 +175,6 @@ access internal read-only data of Unicode objects:
176175
.. versionadded:: 3.3
177176
178177
179-
.. c:function:: int PyUnicode_CHARACTER_SIZE(PyObject *o)
180-
181-
Return the number of bytes the string uses to represent single characters;
182-
this can be 1, 2 or 4. *o* has to be a Unicode object in the "canonical"
183-
representation (not checked).
184-
185-
.. versionadded:: 3.3
186-
187-
188178
.. c:function:: void* PyUnicode_DATA(PyObject *o)
189179
190180
Return a void pointer to the raw unicode buffer. *o* has to be a Unicode
@@ -193,14 +183,6 @@ access internal read-only data of Unicode objects:
193183
.. versionadded:: 3.3
194184
195185
196-
.. c:function:: int PyUnicode_KIND_SIZE(int kind, Py_ssize_t index)
197-
198-
Compute ``index * char_size`` where ``char_size`` is ``2**(kind - 1)``. The
199-
index is a character index, the result is a size in bytes.
200-
201-
.. versionadded:: 3.3
202-
203-
204186
.. c:function:: void PyUnicode_WRITE(int kind, void *data, Py_ssize_t index, \
205187
Py_UCS4 value)
206188

Include/unicodeobject.h

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -305,12 +305,12 @@ typedef struct {
305305
* character type = Py_UCS2 (16 bits, unsigned)
306306
* at least one character must be in range U+0100-U+FFFF
307307
308-
- PyUnicode_4BYTE_KIND (3):
308+
- PyUnicode_4BYTE_KIND (4):
309309
310310
* character type = Py_UCS4 (32 bits, unsigned)
311311
* at least one character must be in range U+10000-U+10FFFF
312312
*/
313-
unsigned int kind:2;
313+
unsigned int kind:3;
314314
/* Compact is with respect to the allocation scheme. Compact unicode
315315
objects only require one memory block while non-compact objects use
316316
one block for the PyUnicodeObject struct and another for its data
@@ -424,29 +424,21 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
424424
#define PyUnicode_IS_COMPACT_ASCII(op) \
425425
(PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
426426

427+
enum PyUnicode_Kind {
427428
/* String contains only wstr byte characters. This is only possible
428429
when the string was created with a legacy API and _PyUnicode_Ready()
429430
has not been called yet. */
430-
#define PyUnicode_WCHAR_KIND 0
431-
431+
PyUnicode_WCHAR_KIND = 0,
432432
/* Return values of the PyUnicode_KIND() macro: */
433-
434-
#define PyUnicode_1BYTE_KIND 1
435-
#define PyUnicode_2BYTE_KIND 2
436-
#define PyUnicode_4BYTE_KIND 3
437-
438-
439-
/* Return the number of bytes the string uses to represent single characters,
440-
this can be 1, 2 or 4.
441-
442-
See also PyUnicode_KIND_SIZE(). */
443-
#define PyUnicode_CHARACTER_SIZE(op) \
444-
(((Py_ssize_t)1 << (PyUnicode_KIND(op) - 1)))
433+
PyUnicode_1BYTE_KIND = 1,
434+
PyUnicode_2BYTE_KIND = 2,
435+
PyUnicode_4BYTE_KIND = 4
436+
};
445437

446438
/* Return pointers to the canonical representation cast to unsigned char,
447439
Py_UCS2, or Py_UCS4 for direct character access.
448-
No checks are performed, use PyUnicode_CHARACTER_SIZE or
449-
PyUnicode_KIND() before to ensure these will work correctly. */
440+
No checks are performed, use PyUnicode_KIND() before to ensure
441+
these will work correctly. */
450442

451443
#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
452444
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
@@ -473,13 +465,6 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
473465
PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
474466
_PyUnicode_NONCOMPACT_DATA(op))
475467

476-
/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
477-
The index is a character index, the result is a size in bytes.
478-
479-
See also PyUnicode_CHARACTER_SIZE(). */
480-
#define PyUnicode_KIND_SIZE(kind, index) \
481-
(((Py_ssize_t)(index)) << ((kind) - 1))
482-
483468
/* In the access macros below, "kind" may be evaluated more than once.
484469
All other macro parameters are evaluated exactly once, so it is safe
485470
to put side effects into them (such as increasing the index). */

Modules/_io/textio.c

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -291,9 +291,7 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
291291
kind = PyUnicode_KIND(modified);
292292
out = PyUnicode_DATA(modified);
293293
PyUnicode_WRITE(kind, PyUnicode_DATA(modified), 0, '\r');
294-
memcpy(out + PyUnicode_KIND_SIZE(kind, 1),
295-
PyUnicode_DATA(output),
296-
PyUnicode_KIND_SIZE(kind, output_len));
294+
memcpy(out + kind, PyUnicode_DATA(output), kind * output_len);
297295
Py_DECREF(output);
298296
output = modified; /* output remains ready */
299297
self->pendingcr = 0;
@@ -336,15 +334,15 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
336334
for the \r *byte* with the libc's optimized memchr.
337335
*/
338336
if (seennl == SEEN_LF || seennl == 0) {
339-
only_lf = (memchr(in_str, '\r', PyUnicode_KIND_SIZE(kind, len)) == NULL);
337+
only_lf = (memchr(in_str, '\r', kind * len) == NULL);
340338
}
341339

342340
if (only_lf) {
343341
/* If not already seen, quick scan for a possible "\n" character.
344342
(there's nothing else to be done, even when in translation mode)
345343
*/
346344
if (seennl == 0 &&
347-
memchr(in_str, '\n', PyUnicode_KIND_SIZE(kind, len)) != NULL) {
345+
memchr(in_str, '\n', kind * len) != NULL) {
348346
Py_ssize_t i = 0;
349347
for (;;) {
350348
Py_UCS4 c;
@@ -403,7 +401,7 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
403401
when there is something to translate. On the other hand,
404402
we already know there is a \r byte, so chances are high
405403
that something needs to be done. */
406-
translated = PyMem_Malloc(PyUnicode_KIND_SIZE(kind, len));
404+
translated = PyMem_Malloc(kind * len);
407405
if (translated == NULL) {
408406
PyErr_NoMemory();
409407
goto error;
@@ -1576,15 +1574,14 @@ textiowrapper_read(textio *self, PyObject *args)
15761574
static char *
15771575
find_control_char(int kind, char *s, char *end, Py_UCS4 ch)
15781576
{
1579-
int size = PyUnicode_KIND_SIZE(kind, 1);
15801577
for (;;) {
15811578
while (PyUnicode_READ(kind, s, 0) > ch)
1582-
s += size;
1579+
s += kind;
15831580
if (PyUnicode_READ(kind, s, 0) == ch)
15841581
return s;
15851582
if (s == end)
15861583
return NULL;
1587-
s += size;
1584+
s += kind;
15881585
}
15891586
}
15901587

@@ -1593,14 +1590,13 @@ _PyIO_find_line_ending(
15931590
int translated, int universal, PyObject *readnl,
15941591
int kind, char *start, char *end, Py_ssize_t *consumed)
15951592
{
1596-
int size = PyUnicode_KIND_SIZE(kind, 1);
1597-
Py_ssize_t len = ((char*)end - (char*)start)/size;
1593+
Py_ssize_t len = ((char*)end - (char*)start)/kind;
15981594

15991595
if (translated) {
16001596
/* Newlines are already translated, only search for \n */
16011597
char *pos = find_control_char(kind, start, end, '\n');
16021598
if (pos != NULL)
1603-
return (pos - start)/size + 1;
1599+
return (pos - start)/kind + 1;
16041600
else {
16051601
*consumed = len;
16061602
return -1;
@@ -1616,20 +1612,20 @@ _PyIO_find_line_ending(
16161612
/* Fast path for non-control chars. The loop always ends
16171613
since the Unicode string is NUL-terminated. */
16181614
while (PyUnicode_READ(kind, s, 0) > '\r')
1619-
s += size;
1615+
s += kind;
16201616
if (s >= end) {
16211617
*consumed = len;
16221618
return -1;
16231619
}
16241620
ch = PyUnicode_READ(kind, s, 0);
1625-
s += size;
1621+
s += kind;
16261622
if (ch == '\n')
1627-
return (s - start)/size;
1623+
return (s - start)/kind;
16281624
if (ch == '\r') {
16291625
if (PyUnicode_READ(kind, s, 0) == '\n')
1630-
return (s - start)/size + 1;
1626+
return (s - start)/kind + 1;
16311627
else
1632-
return (s - start)/size;
1628+
return (s - start)/kind;
16331629
}
16341630
}
16351631
}
@@ -1642,13 +1638,13 @@ _PyIO_find_line_ending(
16421638
if (readnl_len == 1) {
16431639
char *pos = find_control_char(kind, start, end, nl[0]);
16441640
if (pos != NULL)
1645-
return (pos - start)/size + 1;
1641+
return (pos - start)/kind + 1;
16461642
*consumed = len;
16471643
return -1;
16481644
}
16491645
else {
16501646
char *s = start;
1651-
char *e = end - (readnl_len - 1)*size;
1647+
char *e = end - (readnl_len - 1)*kind;
16521648
char *pos;
16531649
if (e < s)
16541650
e = s;
@@ -1662,14 +1658,14 @@ _PyIO_find_line_ending(
16621658
break;
16631659
}
16641660
if (i == readnl_len)
1665-
return (pos - start)/size + readnl_len;
1666-
s = pos + size;
1661+
return (pos - start)/kind + readnl_len;
1662+
s = pos + kind;
16671663
}
16681664
pos = find_control_char(kind, e, end, nl[0]);
16691665
if (pos == NULL)
16701666
*consumed = len;
16711667
else
1672-
*consumed = (pos - start)/size;
1668+
*consumed = (pos - start)/kind;
16731669
return -1;
16741670
}
16751671
}
@@ -1738,8 +1734,8 @@ _textiowrapper_readline(textio *self, Py_ssize_t limit)
17381734
endpos = _PyIO_find_line_ending(
17391735
self->readtranslate, self->readuniversal, self->readnl,
17401736
kind,
1741-
ptr + PyUnicode_KIND_SIZE(kind, start),
1742-
ptr + PyUnicode_KIND_SIZE(kind, line_len),
1737+
ptr + kind * start,
1738+
ptr + kind * line_len,
17431739
&consumed);
17441740
if (endpos >= 0) {
17451741
endpos += start;

Modules/_json.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
365365
APPEND_OLD_CHUNK
366366
chunk = PyUnicode_FromKindAndData(
367367
kind,
368-
(char*)buf + PyUnicode_KIND_SIZE(kind, end),
368+
(char*)buf + kind * end,
369369
next - end);
370370
if (chunk == NULL) {
371371
goto bail;
@@ -931,7 +931,7 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_
931931
if (custom_func) {
932932
/* copy the section we determined to be a number */
933933
numstr = PyUnicode_FromKindAndData(kind,
934-
(char*)str + PyUnicode_KIND_SIZE(kind, start),
934+
(char*)str + kind * start,
935935
idx - start);
936936
if (numstr == NULL)
937937
return NULL;

Modules/_sre.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1669,7 +1669,7 @@ getstring(PyObject* string, Py_ssize_t* p_length,
16691669
return NULL;
16701670
ptr = PyUnicode_DATA(string);
16711671
*p_length = PyUnicode_GET_LENGTH(string);
1672-
*p_charsize = PyUnicode_CHARACTER_SIZE(string);
1672+
*p_charsize = PyUnicode_KIND(string);
16731673
*p_logical_charsize = 4;
16741674
return ptr;
16751675
}

Objects/stringlib/eq.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,5 @@ unicode_eq(PyObject *aa, PyObject *bb)
3030
PyUnicode_GET_LENGTH(a) == 1)
3131
return 1;
3232
return memcmp(PyUnicode_1BYTE_DATA(a), PyUnicode_1BYTE_DATA(b),
33-
PyUnicode_GET_LENGTH(a) * PyUnicode_CHARACTER_SIZE(a)) == 0;
33+
PyUnicode_GET_LENGTH(a) * PyUnicode_KIND(a)) == 0;
3434
}

0 commit comments

Comments
 (0)