Skip to content

Commit 7704e19

Browse files
committed
libBasic: implement extended grapheme cluster segmentation algorithm
This is only for the frontend, not for the stdlib. The implementation is very slow; optimizing it is the next step. rdar://16755123 rdar://16013860 Swift SVN r18928
1 parent e310fd7 commit 7704e19

File tree

11 files changed

+2071
-9
lines changed

11 files changed

+2071
-9
lines changed

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,8 @@ set(SWIFTLIB_SUBDIR ".")
545545
set(SWIFT_GYB_FLAGS
546546
"-DCMAKE_SIZEOF_VOID_P=${CMAKE_SIZEOF_VOID_P}"
547547
"-DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}"
548+
"-DunicodeGraphemeBreakPropertyFile=${SWIFT_SOURCE_DIR}/utils/UnicodeData/GraphemeBreakProperty.txt"
549+
"-DunicodeGraphemeBreakTestFile=${SWIFT_SOURCE_DIR}/utils/UnicodeData/GraphemeBreakTest.txt"
548550
"--test" # Run gyb's self-tests whenever we use it. They're cheap
549551
# enough and it keeps us honest.
550552
)

include/swift/Basic/Unicode.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,53 @@ static inline bool isSingleExtendedGraphemeCluster(StringRef S) {
2828
return First == S;
2929
}
3030

31+
enum class GraphemeClusterBreakProperty : uint8_t {
32+
Other,
33+
CR,
34+
LF,
35+
Control,
36+
Extend,
37+
Regional_Indicator,
38+
Prepend,
39+
SpacingMark,
40+
L,
41+
V,
42+
T,
43+
LV,
44+
LVT,
45+
};
46+
47+
/// Extended grapheme cluster boundary rules, represented as a matrix. Indexed
48+
/// by the Grapheme_Cluster_Break property value of the first code point; bit N
49+
/// of a row (least-significant bit first) is set when a boundary is prohibited
50+
/// before a second code point whose property value is N.
51+
extern uint16_t ExtendedGraphemeClusterNoBoundaryRulesMatrix[];
52+
53+
/// Returns the value of the Grapheme_Cluster_Break property for a given code
54+
/// point.
55+
GraphemeClusterBreakProperty getGraphemeClusterBreakProperty(uint32_t C);
56+
57+
/// Returns true if there is always an extended grapheme cluster boundary
58+
/// after a code point with a given property value. Use only for optimization,
59+
/// to skip calculating Grapheme_Cluster_Break property for the second code
60+
/// point.
61+
static inline bool
62+
isExtendedGraphemeClusterBoundaryAfter(GraphemeClusterBreakProperty GCB1) {
63+
auto RuleRow =
64+
ExtendedGraphemeClusterNoBoundaryRulesMatrix[static_cast<unsigned>(GCB1)];
65+
return RuleRow == 0;
66+
}
67+
68+
/// Determine if there is an extended grapheme cluster boundary between code
69+
/// points with given Grapheme_Cluster_Break property values.
70+
static inline bool
71+
isExtendedGraphemeClusterBoundary(GraphemeClusterBreakProperty GCB1,
72+
GraphemeClusterBreakProperty GCB2) {
73+
auto RuleRow =
74+
ExtendedGraphemeClusterNoBoundaryRulesMatrix[static_cast<unsigned>(GCB1)];
75+
return !(RuleRow & (1 << static_cast<unsigned>(GCB2)));
76+
}
77+
3178
} // namespace unicode
3279
} // namespace swift
3380

lib/Basic/CMakeLists.txt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
set(UNICODE_TABLES UnicodeExtendedGraphemeClusters.cpp.gyb)
2+
3+
handle_gyb_sources(UNICODE_TABLES)
4+
15
add_swift_library(swiftBasic
26
Cache.cpp
37
Demangle.cpp
@@ -21,7 +25,10 @@ add_swift_library(swiftBasic
2125
Unix/TaskQueue.inc
2226

2327
# Platform-agnostic fallback TaskQueue implementation
24-
Default/TaskQueue.inc)
28+
Default/TaskQueue.inc
29+
30+
${UNICODE_TABLES}
31+
)
2532

2633
set(SWIFT_VERSION "1.0")
2734
message(STATUS "Swift version: ${SWIFT_VERSION}")

lib/Basic/Unicode.cpp

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,47 @@
1616
using namespace swift;
1717

1818
StringRef swift::unicode::extractFirstExtendedGraphemeCluster(StringRef S) {
19-
// FIXME: implement as described in Unicode Standard Annex #29.
19+
// Extended grapheme cluster segmentation algorithm as described in Unicode
20+
// Standard Annex #29.
2021
if (S.empty())
2122
return StringRef();
2223

23-
// FIXME: deal with broken code unit sequences.
24-
// For now, just extract the first code point.
25-
unsigned CodeUnitSeqLen = getNumBytesForUTF8(S[0]);
26-
return S.slice(0, CodeUnitSeqLen);
24+
const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
25+
26+
const UTF8 *SourceNext = SourceStart;
27+
UTF32 C[2];
28+
UTF32 *TargetStart = C;
29+
30+
ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, C + 1,
31+
lenientConversion);
32+
if (TargetStart == C) {
33+
// The source string contains an ill-formed subsequence at the end.
34+
return S;
35+
}
36+
37+
GraphemeClusterBreakProperty GCBForC0 = getGraphemeClusterBreakProperty(C[0]);
38+
while (true) {
39+
if (isExtendedGraphemeClusterBoundaryAfter(GCBForC0))
40+
return S.slice(0, SourceNext - SourceStart);
41+
42+
size_t C1Offset = SourceNext - SourceStart;
43+
ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart, C + 2,
44+
lenientConversion);
45+
46+
if (TargetStart == C + 1) {
47+
// End of source string or the source string contains an ill-formed
48+
// subsequence at the end.
49+
return S.slice(0, C1Offset);
50+
}
51+
52+
GraphemeClusterBreakProperty GCBForC1 =
53+
getGraphemeClusterBreakProperty(C[1]);
54+
if (isExtendedGraphemeClusterBoundary(GCBForC0, GCBForC1))
55+
return S.slice(0, C1Offset);
56+
57+
C[0] = C[1];
58+
TargetStart = C + 1;
59+
GCBForC0 = GCBForC1;
60+
}
2761
}
2862

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
%# -*- mode: C++ -*-
2+
3+
%# Ignore the following admonition; it applies to the resulting .cpp file only
4+
//// Automatically Generated From UnicodeExtendedGraphemeClusters.cpp.gyb.
5+
//// Do Not Edit Directly!
6+
//===----------------------------------------------------------------------===//
7+
//
8+
// This source file is part of the Swift.org open source project
9+
//
10+
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
11+
// Licensed under Apache License v2.0 with Runtime Library Exception
12+
//
13+
// See http://swift.org/LICENSE.txt for license information
14+
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
15+
//
16+
//===----------------------------------------------------------------------===//
17+
18+
%{
19+
20+
import re
21+
22+
# Grapheme_Cluster_Break property. An array of tuples (startCodePoint,
# endCodePoint, value), in the order the entries appear in the data file.
graphemeBreakProperty = []

with open(unicodeGraphemeBreakPropertyFile, 'rb') as f:
  for line in f:
    # Strip comments.
    line = re.sub('#.*', '', line)

    # Single code point?  The value pattern accepts '_' so that a name such as
    # 'Regional_Indicator' is recognized here instead of falling through to
    # the range pattern below.
    m = re.match(r'([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
    if m:
      codePoint = int(m.group(1), 16)
      value = m.group(2)
      graphemeBreakProperty.append((codePoint, codePoint, value))
      continue

    # Range of code points?  The '..' separator is escaped so that it matches
    # two literal dots rather than any two characters.
    m = re.match(r'([0-9A-F]+)\.\.([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
    if m:
      startCodePoint = int(m.group(1), 16)
      endCodePoint = int(m.group(2), 16)
      value = m.group(3)
      graphemeBreakProperty.append((startCodePoint, endCodePoint, value))
46+
47+
}%
48+
49+
#include "swift/Basic/Unicode.h"
50+
51+
swift::unicode::GraphemeClusterBreakProperty
52+
swift::unicode::getGraphemeClusterBreakProperty(uint32_t C) {
53+
// FIXME: replace linear search with a trie lookup.
54+
55+
% for startCodePoint,endCodePoint,value in graphemeBreakProperty:
56+
% if startCodePoint == 0:
57+
if (C <= ${endCodePoint})
58+
% else:
59+
if (C >= ${startCodePoint} && C <= ${endCodePoint})
60+
% end
61+
return GraphemeClusterBreakProperty::${value};
62+
% end
63+
64+
return GraphemeClusterBreakProperty::Other;
65+
}
66+
67+
%{
68+
69+
# Grapheme_Cluster_Break property values.  The order should be consistent with
# the 'GraphemeClusterBreakProperty' enum.
anyGraphemePropertyValue = [
  'Other', 'CR', 'LF', 'Control', 'Extend', 'Regional_Indicator', 'Prepend',
  'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT',
]

# Rules to determine extended grapheme cluster boundaries, as defined in
# 'Grapheme Break Chart', ucd/auxiliary/GraphemeBreakTest.html, Unicode 6.3.0.
# Each entry is (first property values, action, second property values).  When
# several entries cover the same pair, the entry listed first takes effect.
extendedGraphemeClusterRules = [
  ( [ 'CR' ], 'no_boundary', [ 'LF' ] ),
  ( [ 'Control', 'CR', 'LF' ], 'boundary', anyGraphemePropertyValue ),
  ( anyGraphemePropertyValue, 'boundary', [ 'Control', 'CR', 'LF' ] ),
  ( [ 'L' ], 'no_boundary', [ 'L', 'V', 'LV', 'LVT' ] ),
  ( [ 'LV', 'V' ], 'no_boundary', [ 'V', 'T' ] ),
  ( [ 'LVT', 'T' ], 'no_boundary', [ 'T' ] ),
  ( [ 'Regional_Indicator' ], 'no_boundary', [ 'Regional_Indicator' ] ),
  ( anyGraphemePropertyValue, 'no_boundary', [ 'Extend' ] ),
  ( anyGraphemePropertyValue, 'no_boundary', [ 'SpacingMark' ] ),
  ( [ 'Prepend' ], 'no_boundary', anyGraphemePropertyValue ),
  ( anyGraphemePropertyValue, 'boundary', anyGraphemePropertyValue ),
]

# Expand the rules into a matrix.  Start with one row per first value, each
# holding one unset cell per second value.
extendedGraphemeClusterRulesMatrix = dict(
    (rowKey, dict.fromkeys(anyGraphemePropertyValue, None))
    for rowKey in anyGraphemePropertyValue)

# Walk the rules from last to first, so that an earlier rule overwrites any
# later rule that covers the same pair.
for rule in reversed(extendedGraphemeClusterRules):
  (firstValues, action, secondValues) = rule
  for firstValue in firstValues:
    extendedGraphemeClusterRulesMatrix[firstValue].update(
        dict.fromkeys(secondValues, action))

# Make sure we can pack one row of the matrix into a 'uint16_t'.
assert(len(anyGraphemePropertyValue) <= 16)
104+
105+
}%
106+
107+
uint16_t swift::unicode::ExtendedGraphemeClusterNoBoundaryRulesMatrix[] = {
108+
% for first in anyGraphemePropertyValue:
109+
% # Retrieve the row for this property value of the first code point.
110+
% row = extendedGraphemeClusterRulesMatrix[first]
111+
112+
% # Change strings into bits.
113+
% bits = [ row[second] == 'no_boundary' for second in anyGraphemePropertyValue ]
114+
115+
% # Pack bits into an integer.
116+
% packed = sum([ bits[i] * pow(2, i) for i in range(0, len(bits)) ])
117+
118+
${packed},
119+
% end
120+
};
121+

test/stdlib/CharacterTypes.swift

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@ func testTypeInference() {
2121
isCharacter(&ch2)
2222
var ch4: Character = ""
2323
isCharacter(&ch3)
24-
// FIXME: this should pass when we implement grapheme cluster extraction
25-
// correctly.
26-
var ch5: Character = "\u304b\u3099" // expected-error {{cannot convert the expression's type 'String' to type 'Character'}}
24+
var ch5: Character = "\u304b\u3099"
2725
isCharacter(&ch4)
2826

2927
var s1 = ""

unittests/Basic/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
1+
set(GENERATED_TESTS UnicodeGraphemeBreakTest.cpp.gyb)
2+
3+
handle_gyb_sources(GENERATED_TESTS)
4+
15
add_swift_unittest(SwiftBasicTests
26
SourceManager.cpp
37
TreeScopedHashTableTests.cpp
48
StringExtrasTest.cpp
59
SuccessorMapTest.cpp
610
Unicode.cpp
11+
12+
${GENERATED_TESTS}
713
)
814

915
target_link_libraries(SwiftBasicTests

0 commit comments

Comments
 (0)