Skip to content

Commit 313cfca

Browse files
committedJul 2, 2014
Unicode trie generator: add tests for the generator itself that use non-default
trie parameters and fix a few bugs. The bugs did not affect correctness of the particular instance of trie created for grapheme cluster property, because trie parameters that were confused with each other happened to be equal. Also, fix a trie size bug: we were creating a trie large enough to store information for 0x200000 code points, but there are only 0x110000 (the maximum code point is 0x10ffff). It saved only 15 bytes in the grapheme cluster trie, because that extra information was compressed with some supplementary planes that also had default values. This also improved trie generation time by almost 2x. Swift SVN r19457
1 parent d8ed7b6 commit 313cfca

File tree

4 files changed

+196
-25
lines changed

4 files changed

+196
-25
lines changed
 

‎stdlib/core/UnicodeTrie.swift.gyb

+3-3
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,13 @@ SuppLookup1BytesPerEntry = 1
3232
SuppLookup2BytesPerEntry = 1
3333
SuppDataBytesPerEntry = 1
3434

35-
TrieSize = 15904
35+
TrieSize = 15889
3636

3737
BMPLookupBytesOffset = 0
3838
BMPDataBytesOffset = 256
3939
SuppLookup1BytesOffset = 12032
40-
SuppLookup2BytesOffset = 12064
41-
SuppDataBytesOffset = 12832
40+
SuppLookup2BytesOffset = 12049
41+
SuppDataBytesOffset = 12817
4242

4343
}%
4444

‎stdlib/runtime/UnicodeExtendedGraphemeClusters.cpp.gyb

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ grapheme_cluster_break_property_table = \
2121
GraphemeClusterBreakPropertyTable(unicodeGraphemeBreakPropertyFile)
2222
2323
trie_generator = UnicodeTrieGenerator()
24+
trie_generator.create_tables()
2425
trie_generator.fill_from_unicode_property(grapheme_cluster_break_property_table)
2526
trie_generator.verify(grapheme_cluster_break_property_table)
2627

‎test/stdlib/UnicodeTrieGenerator.gyb

+139
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
%{
2+
3+
# RUN: rm -rf %t && mkdir -p %t && %S/../../utils/gyb %s | FileCheck %s
4+
#
5+
# REQUIRES: long_tests
6+
7+
from GYBUnicodeDataUtils import *
8+
9+
def test_trie_generation(property_table, configure_generator=None):
    """Build a trie for `property_table` and print a summary tuple.

    If `configure_generator` is given, it is called with the freshly
    constructed generator before `create_tables()`, so that it can override
    trie parameters.  The trie is verified against `property_table` both
    before and after `freeze()`, and is then serialized.

    The printed tuple contains, in order:
      * the five index / data-offset bit widths,
      * the five bytes-per-entry values,
      * the total length of `trie_bytes`,
      * the sizes of the five serialized tables (BMP lookup, BMP data,
        supp lookup1, supp lookup2, supp data), computed as differences
        between consecutive table offsets.
    """
    gen = UnicodeTrieGenerator()
    if configure_generator is not None:
        configure_generator(gen)

    gen.create_tables()
    gen.fill_from_unicode_property(property_table)
    gen.verify(property_table)
    gen.freeze()
    gen.verify(property_table)
    gen.serialize(property_table)

    bit_widths = (
        gen.BMP_first_level_index_bits,
        gen.BMP_data_offset_bits,
        gen.supp_first_level_index_bits,
        gen.supp_second_level_index_bits,
        gen.supp_data_offset_bits,
    )
    entry_widths = (
        gen.BMP_lookup_bytes_per_entry,
        gen.BMP_data_bytes_per_entry,
        gen.supp_lookup1_bytes_per_entry,
        gen.supp_lookup2_bytes_per_entry,
        gen.supp_data_bytes_per_entry,
    )
    total_size = len(gen.trie_bytes)

    # Each table ends where the next one starts; the last table ends at the
    # end of the serialized trie.
    boundaries = (
        gen.BMP_lookup_bytes_offset,
        gen.BMP_data_bytes_offset,
        gen.supp_lookup1_bytes_offset,
        gen.supp_lookup2_bytes_offset,
        gen.supp_data_bytes_offset,
        total_size,
    )
    table_sizes = tuple(
        end - start for start, end in zip(boundaries, boundaries[1:]))

    # Printing a single tuple object yields the same text as the tuple
    # literal form `print (a, b, ...)`.
    print(bit_widths + entry_widths + (total_size,) + table_sizes)
39+
40+
class PerfectlyCompressableProperty(UnicodeProperty):
    """A property whose value is 'Default' (numerically 42) everywhere.

    Every code point maps to the same value, so all data blocks of the
    resulting trie are identical.
    """

    def __init__(self):
        # Intentionally empty: the base-class initializer is not invoked.
        pass

    def get_default_value(self):
        """Return the symbolic default value."""
        return 'Default'

    def get_value(self, cp):
        """Return the symbolic value for code point `cp` (always 'Default')."""
        return 'Default'

    def to_numeric_value(self, value):
        """Map the symbolic value 'Default' to 42; assert on anything else."""
        if value != 'Default':
            # No other value exists for this property.
            assert False
        else:
            return 42

    def get_numeric_value(self, cp):
        """Return the numeric value of the property for code point `cp`."""
        symbolic = self.get_value(cp)
        return self.to_numeric_value(symbolic)
57+
58+
print 'PerfectlyCompressableProperty'
59+
test_trie_generation(PerfectlyCompressableProperty())
60+
# CHECK-LABEL: PerfectlyCompressableProperty
61+
# CHECK: (8, 8, 5, 8, 8, 1, 1, 1, 1, 1, 1041, 256, 256, 17, 256, 256)
62+
#
63+
# Explanation for table sizes above:
64+
#
65+
# BMP_lookup: 1-byte words x 256 = 256
66+
# BMP_data: 1 x 1*256 = 256
67+
# supp_lookup1: 1 x 17 = 17
68+
# supp_lookup2: 1 x 1*256 = 256
69+
# supp_data: 1 x 1*256 = 256
70+
71+
72+
class UncompressableProperty(UnicodeProperty):
    """A property whose 128-entry pages each start with a unique sequence.

    The values are already numeric, so `to_numeric_value` is the identity.
    """

    def __init__(self):
        # Intentionally empty: the base-class initializer is not invoked.
        pass

    def get_default_value(self):
        """Return the default value, 42."""
        return 42

    def get_value(self, cp):
        """Return the property value for code point `cp`.

        The Unicode codespace is split into 128-entry "pages".  Offsets 1, 2
        and 3 within a page hold bytes 0, 1 and 2 of the page number, so each
        page begins with a unique sequence of values and the result cannot
        be compressed.  Every other offset holds 42.
        """
        page_number, offset = divmod(cp, 0x80)
        if 1 <= offset <= 3:
            # Offset k stores byte (k - 1) of the page number.
            return (page_number >> (8 * (offset - 1))) & 0xff
        return 42

    def to_numeric_value(self, value):
        """Return `value` unchanged."""
        return value

    def get_numeric_value(self, cp):
        """Return the numeric value of the property for code point `cp`."""
        raw = self.get_value(cp)
        return self.to_numeric_value(raw)
97+
98+
print 'UncompressableProperty, default trie parameters'
99+
test_trie_generation(UncompressableProperty())
100+
# CHECK-LABEL: UncompressableProperty, default trie parameters
101+
# CHECK: (8, 8, 5, 8, 8, 2, 1, 1, 2, 1, 1123601, 512, 65536, 17, 8704, 1048832)
102+
#
103+
# Explanation for table sizes above:
104+
#
105+
# BMP_lookup: 2-byte words x 256 = 512
106+
# BMP_data: 1 x 256*256 = 65536
107+
# supp_lookup1: 1 x 17 = 17
108+
# supp_lookup2: 2 x 17*256 = 8704
109+
# supp_data: 1 x (16*256+1)*256 = 1048832
110+
111+
def configure_generator_for_16_bit_indexes(trie_generator):
    """Override the index bit widths on `trie_generator`.

    Sets the BMP first-level index to 9 bits, the supplementary first-level
    index to 10 bits and the supplementary second-level index to 2 bits.
    Intended to be passed as `configure_generator` so that it runs before
    the generator's tables are created.
    """
    overrides = (
        ('BMP_first_level_index_bits', 9),
        ('supp_first_level_index_bits', 10),
        ('supp_second_level_index_bits', 2),
    )
    for attribute, bits in overrides:
        setattr(trie_generator, attribute, bits)
116+
117+
print 'UncompressableProperty, 16-bit indexes'
118+
test_trie_generation(UncompressableProperty(),
119+
configure_generator_for_16_bit_indexes)
120+
# CHECK-LABEL: UncompressableProperty, 16-bit indexes
121+
# CHECK: (9, 7, 10, 2, 9, 2, 1, 2, 2, 1, 1120840, 1024, 65536, 1088, 4104, 1049088)
122+
#
123+
# Explanation for table sizes above:
124+
#
125+
# BMP_lookup: 2-byte words x 512 = 1024
126+
# BMP_data: 1 x 512*128 = 65536
127+
# supp_lookup1: 2 x 544 = 1088
128+
# supp_lookup2: 2 x 513*4 = 4104
129+
# supp_data: 1 x (2048+1)*512 = 1049088
130+
131+
132+
# gyb will print line markers after our output, so make sure that those
133+
# don't accidentally match any other CHECK lines.
134+
135+
print 'THE END'
136+
# CHECK-LABEL: THE END
137+
138+
}%
139+

‎utils/GYBUnicodeDataUtils.py

+53-22
Original file line numberDiff line numberDiff line change
@@ -207,14 +207,12 @@ class UnicodeTrieGenerator(object):
207207
# Note: if you change any of these parameters, don't forget to update the
208208
# ASCII art above.
209209
BMP_first_level_index_bits = 8
210-
BMP_data_offset_bits = 16 - BMP_first_level_index_bits
211210

212211
supp_first_level_index_bits = 5
213212
supp_second_level_index_bits = 8
214-
supp_data_offset_bits = 21 - supp_first_level_index_bits - supp_second_level_index_bits
215213

216214
def get_BMP_first_level_index(self, cp):
217-
return cp >> self.BMP_first_level_index_bits
215+
return cp >> self.BMP_data_offset_bits
218216

219217
def get_BMP_data_offset(self, cp):
220218
return cp & ((1 << self.BMP_data_offset_bits) - 1)
@@ -229,6 +227,30 @@ def get_supp_data_offset(self, cp):
229227
return cp & ((1 << self.supp_data_offset_bits) - 1)
230228

231229
def __init__(self):
230+
"""Create a trie generator with default parameters."""
231+
pass
232+
233+
def create_tables(self):
234+
"""Compute derived parameter values and create internal data
235+
structures.
236+
237+
Don't change parameter values after calling this method.
238+
"""
239+
240+
self.BMP_data_offset_bits = 16 - self.BMP_first_level_index_bits
241+
242+
self.supp_data_offset_bits = \
243+
21 - self.supp_first_level_index_bits - \
244+
self.supp_second_level_index_bits
245+
246+
# The maximum value of the first level index for supp tables. It is
247+
# not equal to ((1 << supp_first_level_index_bits) - 1), because
248+
# maximum Unicode code point value is not 2^21-1 (0x1fffff), it is
249+
# 0x10ffff.
250+
self.supp_first_level_index_max = \
251+
0x10ffff >> (self.supp_second_level_index_bits + \
252+
self.supp_data_offset_bits)
253+
232254
# A mapping from BMP first-level index to BMP data block index.
233255
self.BMP_lookup = [ i for i in range(0, 1 << self.BMP_first_level_index_bits) ]
234256

@@ -239,19 +261,19 @@ def __init__(self):
239261

240262
# A mapping from supp first-level index to an index of the second-level
241263
# lookup table.
242-
self.supp_lookup1 = [ i for i in range(0, 1 << self.supp_first_level_index_bits) ]
264+
self.supp_lookup1 = [ i for i in range(0, self.supp_first_level_index_max + 1) ]
243265

244266
# An array of second-level lookup tables. Each second-level lookup
245267
# table is a mapping from a supp second-level index to supp data block
246268
# index.
247269
self.supp_lookup2 = [
248270
[ j for j in range(i << self.supp_second_level_index_bits, (i + 1) << self.supp_second_level_index_bits) ]
249-
for i in range(0, (1 << self.supp_first_level_index_bits)) ]
271+
for i in range(0, self.supp_first_level_index_max + 1) ]
250272

251273
# An array of supp data blocks.
252274
self.supp_data = [
253275
[ -1 for i in range(0, 1 << self.supp_data_offset_bits) ]
254-
for i in range(0, 1 << (self.supp_first_level_index_bits + self.supp_second_level_index_bits)) ]
276+
for i in range(0, (self.supp_first_level_index_max + 1) * (1 << self.supp_second_level_index_bits)) ]
255277

256278
def splat(self, value):
257279
for i in range(0, len(self.BMP_data)):
@@ -292,6 +314,23 @@ def verify(self, unicode_property):
292314
assert(expectedValue == actualValue)
293315

294316
def freeze(self):
317+
"""Compress internal trie representation.
318+
319+
Don't mutate the trie after calling this method.
320+
321+
"""
322+
323+
def remap_indexes(indexes, old_idx, new_idx):
324+
def map_index(idx):
325+
if idx == old_idx:
326+
return new_idx
327+
elif idx > old_idx:
328+
return idx - 1
329+
else:
330+
return idx
331+
332+
return map(map_index, indexes)
333+
295334
# If self.BMP_data contains identical data blocks, keep the first one,
296335
# remove duplicates and change the indexes in self.BMP_lookup to point to
297336
# the first one.
@@ -301,11 +340,8 @@ def freeze(self):
301340
while j < len(self.BMP_data):
302341
if self.BMP_data[i] == self.BMP_data[j]:
303342
self.BMP_data.pop(j)
304-
for k in range(0, len(self.BMP_lookup)):
305-
if self.BMP_lookup[k] == j:
306-
self.BMP_lookup[k] = i
307-
elif self.BMP_lookup[k] > j:
308-
self.BMP_lookup[k] -= 1
343+
self.BMP_lookup = \
344+
remap_indexes(self.BMP_lookup, old_idx=j, new_idx=i)
309345
else:
310346
j += 1
311347
i += 1
@@ -320,11 +356,9 @@ def freeze(self):
320356
if self.supp_data[i] == self.supp_data[j]:
321357
self.supp_data.pop(j)
322358
for k in range(0, len(self.supp_lookup2)):
323-
for l in range(0, len(self.supp_lookup2[k])):
324-
if self.supp_lookup2[k][l] == j:
325-
self.supp_lookup2[k][l] = i
326-
elif self.supp_lookup2[k][l] > j:
327-
self.supp_lookup2[k][l] -= 1
359+
self.supp_lookup2[k] = \
360+
remap_indexes(self.supp_lookup2[k], old_idx=j,
361+
new_idx=i)
328362
else:
329363
j += 1
330364
i += 1
@@ -337,11 +371,8 @@ def freeze(self):
337371
while j < len(self.supp_lookup2):
338372
if self.supp_lookup2[i] == self.supp_lookup2[j]:
339373
self.supp_lookup2.pop(j)
340-
for k in range(0, len(self.supp_lookup1)):
341-
if self.supp_lookup1[k] == j:
342-
self.supp_lookup1[k] = i
343-
elif self.supp_lookup1[k] > j:
344-
self.supp_lookup1[k] -= 1
374+
self.supp_lookup1 = \
375+
remap_indexes(self.supp_lookup1, old_idx=j, new_idx=i)
345376
else:
346377
j += 1
347378
i += 1
@@ -351,7 +382,7 @@ def _int_to_LE_bytes(self, data, width):
351382
assert(data & ~0xff == 0)
352383
return [ data ]
353384
if width == 2:
354-
assert(data & 0xffff == 0)
385+
assert(data & ~0xffff == 0)
355386
return [ data & 0xff, data & 0xff00 ]
356387
assert(False)
357388

0 commit comments

Comments
 (0)