Skip to content

Commit 1de8372

Browse files
authored
Implement full Unicode 16.0.0 extended grapheme breaking. (#719)
Implement full Unicode 16.0.0 extended grapheme breaking. Includes rule GB9c (Indic Conjunct Break based breaking). This change has a significant cost in size since the information needed per character no longer fits in 4 bits. The base table is therefore twice as big (one byte per entry rather than half of that). The number of states in the state automatons has also increased slightly, but in comparison that's a negligible change. Tests have been made more thorough, testing not only the Unicode Consortium provided tests, but also variants of those with representative characters for each category of character that are either in or not in the BMP, to test that surrogate pair decoding works correctly. Tests also check that the created automatons are minimal, in that no state is unreachable and no two states are indistinguishable.
1 parent 6af0821 commit 1de8372

33 files changed

+23533
-18145
lines changed

pkgs/characters/README.md

-4
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,6 @@ using a [`CharacterRange`][CharacterRange].
1313

1414
Based on Unicode <!-- unicode-version -->version 16.0.0<!-- /unicode-version -->.
1515

16-
This package is not script-aware, and does not currently support the rule for
17-
Indic Conjunct Breaks introduced in Unicode 15.1.0
18-
([GB9c](https://www.unicode.org/reports/tr29/tr29-43.html#GB9c)).
19-
2016
## Unicode characters and representations
2117

2218
There is no such thing as plain text.

pkgs/characters/analysis_options.yaml

-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1 @@
11
include: package:dart_flutter_team_lints/analysis_options.yaml
2-
3-
analyzer:
4-
errors:
5-
prefer_single_quotes: ignore

pkgs/characters/benchmark/benchmark.dart

+16-16
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
// Benchmark of efficiency of grapheme cluster operations.
66

7-
import "package:characters/characters.dart";
7+
import 'package:characters/characters.dart';
88

9-
import "../test/src/text_samples.dart";
9+
import '../test/src/text_samples.dart';
1010

1111
double bench(int Function() action, int ms) {
1212
var elapsed = 0;
@@ -49,12 +49,12 @@ int reverseStrings() {
4949
var revHangul = reverse(hangul);
5050
var rev2Hangul = reverse(revHangul);
5151
if (hangul != rev2Hangul || hangul == revHangul) {
52-
throw AssertionError("Bad reverse");
52+
throw AssertionError('Bad reverse');
5353
}
5454
var revGenesis = reverse(genesis);
5555
var rev2Genesis = reverse(revGenesis);
5656
if (genesis != rev2Genesis || genesis == revGenesis) {
57-
throw AssertionError("Bad reverse");
57+
throw AssertionError('Bad reverse');
5858
}
5959

6060
return (hangul.length + genesis.length) * 2;
@@ -63,16 +63,16 @@ int reverseStrings() {
6363
int replaceStrings() {
6464
var count = 0;
6565
{
66-
const language = "한글";
66+
const language = '한글';
6767
assert(language.length == 6);
6868
var chars = Characters(hangul);
6969
var replaced =
70-
chars.replaceAll(Characters(language), Characters("Hangul!"));
70+
chars.replaceAll(Characters(language), Characters('Hangul!'));
7171
count += replaced.string.length - hangul.length;
7272
}
7373
{
7474
var chars = Characters(genesis);
75-
var replaced = chars.replaceAll(Characters("And"), Characters("Also"));
75+
var replaced = chars.replaceAll(Characters('And'), Characters('Also'));
7676
count += replaced.string.length - genesis.length;
7777
}
7878
return count;
@@ -111,27 +111,27 @@ void main(List<String> args) {
111111

112112
for (var i = 0; i < count; i++) {
113113
var performance = bench(iterateIndicesOnly, 2000);
114-
print("Index Iteration: ${toDigits(performance)} gc/ms");
114+
print('Index Iteration: ${toDigits(performance)} gc/ms');
115115
if (performance > bestIterateIndices) bestIterateIndices = performance;
116116

117117
performance = bench(iterateStrings, 2000);
118-
print("String Iteration: ${toDigits(performance)} cu/ms");
118+
print('String Iteration: ${toDigits(performance)} cu/ms');
119119
if (performance > bestIterateStrings) bestIterateStrings = performance;
120120

121121
performance = bench(reverseStrings, 2000);
122-
print("String Reversing: ${toDigits(performance)} cu/ms");
122+
print('String Reversing: ${toDigits(performance)} cu/ms');
123123
if (performance > bestReverseStrings) bestReverseStrings = performance;
124124

125125
performance = bench(replaceStrings, 2000);
126-
print("String Replacing: ${toDigits(performance)} changes/ms");
126+
print('String Replacing: ${toDigits(performance)} changes/ms');
127127
if (performance > bestReplaceStrings) bestReplaceStrings = performance;
128128
}
129129

130130
if (count > 1) {
131-
print("Best: ");
132-
print("Index Iteration: ${toDigits(bestIterateIndices)} gc/ms");
133-
print("String Iteration: ${toDigits(bestIterateStrings)} cu/ms");
134-
print("String Reversing: ${toDigits(bestReverseStrings)} cu/ms");
135-
print("String Replacing: ${toDigits(bestReplaceStrings)} changes/ms");
131+
print('Best: ');
132+
print('Index Iteration: ${toDigits(bestIterateIndices)} gc/ms');
133+
print('String Iteration: ${toDigits(bestIterateStrings)} cu/ms');
134+
print('String Reversing: ${toDigits(bestReverseStrings)} cu/ms');
135+
print('String Replacing: ${toDigits(bestReplaceStrings)} changes/ms');
136136
}
137137
}

pkgs/characters/lib/characters.dart

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@
55
/// String operations based on characters (Unicode grapheme clusters).
66
library;
77

8-
export "src/characters.dart";
9-
export "src/extensions.dart";
8+
export 'src/characters.dart';
9+
export 'src/extensions.dart';

pkgs/characters/lib/src/characters.dart

+8-8
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// for details. All rights reserved. Use of this source code is governed by a
33
// BSD-style license that can be found in the LICENSE file.
44

5-
import "characters_impl.dart";
5+
import 'characters_impl.dart';
66

77
/// The characters of a string.
88
///
@@ -21,7 +21,7 @@ import "characters_impl.dart";
2121
/// in different ways.
2222
abstract class Characters implements Iterable<String> {
2323
/// An empty [Characters] containing no characters.
24-
static const Characters empty = StringCharacters("");
24+
static const Characters empty = StringCharacters('');
2525

2626
/// Creates a [Characters] allowing iteration of
2727
/// the characters of [string].
@@ -260,9 +260,9 @@ abstract class Characters implements Iterable<String> {
260260
/// Any further occurrences will be included in the last part.
261261
/// Example:
262262
/// ```dart
263-
/// var c = "abracadabra".characters;
264-
/// var parts = c.split("a".characters, 4).toList();
265-
/// print(parts); // Prints is ["", "br", "c", "dabra"]
263+
/// var c = 'abracadabra'.characters;
264+
/// var parts = c.split('a'.characters, 4).toList();
265+
/// print(parts); // Prints is ['', 'br', 'c', 'dabra']
266266
/// ```
267267
/// If there are fewer than `maxParts - 1` occurrences of [pattern],
268268
/// then the characters are split at all occurrences.
@@ -790,9 +790,9 @@ abstract class CharacterRange implements Iterator<String> {
790790
///
791791
/// Example:
792792
/// ```dart
793-
/// var c = "abracadabra".characters.dropFirst().dropLast();
794-
/// // c is "bracadabr".
795-
/// var parts = c.split("a".characters, 3).toList();
793+
/// var c = 'abracadabra'.characters.dropFirst().dropLast();
794+
/// // c is 'bracadabr'.
795+
/// var parts = c.split('a'.characters, 3).toList();
796796
/// print(parts); // [br, c, dabr]
797797
/// ```
798798
/// If there are fewer than `maxParts - 1` occurrences of [pattern],

pkgs/characters/lib/src/characters_impl.dart

+27-27
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
// for details. All rights reserved. Use of this source code is governed by a
33
// BSD-style license that can be found in the LICENSE file.
44

5-
import "characters.dart";
6-
import "grapheme_clusters/breaks.dart";
7-
import "grapheme_clusters/constants.dart";
5+
import 'characters.dart';
6+
import 'grapheme_clusters/breaks.dart';
7+
import 'grapheme_clusters/constants.dart';
88
import 'grapheme_clusters/table.dart';
99

1010
/// The grapheme clusters of a string.
@@ -28,23 +28,23 @@ final class StringCharacters extends Iterable<String> implements Characters {
2828

2929
@override
3030
String get first => string.isEmpty
31-
? throw StateError("No element")
31+
? throw StateError('No element')
3232
: string.substring(
3333
0, Breaks(string, 0, string.length, stateSoTNoBreak).nextBreak());
3434

3535
@override
3636
String get last => string.isEmpty
37-
? throw StateError("No element")
37+
? throw StateError('No element')
3838
: string.substring(
3939
BackBreaks(string, string.length, 0, stateEoTNoBreak).nextBreak());
4040

4141
@override
4242
String get single {
43-
if (string.isEmpty) throw StateError("No element");
43+
if (string.isEmpty) throw StateError('No element');
4444
var firstEnd =
4545
Breaks(string, 0, string.length, stateSoTNoBreak).nextBreak();
4646
if (firstEnd == string.length) return string;
47-
throw StateError("Too many elements");
47+
throw StateError('Too many elements');
4848
}
4949

5050
@override
@@ -74,9 +74,9 @@ final class StringCharacters extends Iterable<String> implements Characters {
7474
}
7575

7676
@override
77-
String join([String separator = ""]) {
78-
if (separator == "") return string;
79-
return _explodeReplace(string, 0, string.length, separator, "");
77+
String join([String separator = '']) {
78+
if (separator == '') return string;
79+
return _explodeReplace(string, 0, string.length, separator, '');
8080
}
8181

8282
@override
@@ -91,12 +91,12 @@ final class StringCharacters extends Iterable<String> implements Characters {
9191
cursor = next;
9292
}
9393
if (orElse != null) return orElse();
94-
throw StateError("No element");
94+
throw StateError('No element');
9595
}
9696

9797
@override
9898
String elementAt(int index) {
99-
RangeError.checkNotNegative(index, "index");
99+
RangeError.checkNotNegative(index, 'index');
100100
var count = 0;
101101
if (string.isNotEmpty) {
102102
var breaks = Breaks(string, 0, string.length, stateSoTNoBreak);
@@ -108,7 +108,7 @@ final class StringCharacters extends Iterable<String> implements Characters {
108108
start = end;
109109
}
110110
}
111-
throw RangeError.index(index, this, "index", null, count);
111+
throw RangeError.index(index, this, 'index', null, count);
112112
}
113113

114114
@override
@@ -209,7 +209,7 @@ final class StringCharacters extends Iterable<String> implements Characters {
209209

210210
@override
211211
Characters skip(int count) {
212-
RangeError.checkNotNegative(count, "count");
212+
RangeError.checkNotNegative(count, 'count');
213213
return _skip(count);
214214
}
215215

@@ -221,7 +221,7 @@ final class StringCharacters extends Iterable<String> implements Characters {
221221

222222
@override
223223
Characters take(int count) {
224-
RangeError.checkNotNegative(count, "count");
224+
RangeError.checkNotNegative(count, 'count');
225225
return _take(count);
226226
}
227227

@@ -233,9 +233,9 @@ final class StringCharacters extends Iterable<String> implements Characters {
233233

234234
@override
235235
Characters getRange(int start, [int? end]) {
236-
RangeError.checkNotNegative(start, "start");
236+
RangeError.checkNotNegative(start, 'start');
237237
if (end == null) return _skip(start);
238-
if (end < start) throw RangeError.range(end, start, null, "end");
238+
if (end < start) throw RangeError.range(end, start, null, 'end');
239239
if (end == start) return Characters.empty;
240240
if (start == 0) return _take(end);
241241
if (string.isEmpty) return this;
@@ -254,10 +254,10 @@ final class StringCharacters extends Iterable<String> implements Characters {
254254
while (position > 0) {
255255
position--;
256256
start = breaks.nextBreak();
257-
if (start < 0) throw StateError("No element");
257+
if (start < 0) throw StateError('No element');
258258
}
259259
var end = breaks.nextBreak();
260-
if (end < 0) throw StateError("No element");
260+
if (end < 0) throw StateError('No element');
261261
if (start == 0 && end == string.length) return this;
262262
return StringCharacters(string.substring(start, end));
263263
}
@@ -311,7 +311,7 @@ final class StringCharacters extends Iterable<String> implements Characters {
311311

312312
@override
313313
Characters skipLast(int count) {
314-
RangeError.checkNotNegative(count, "count");
314+
RangeError.checkNotNegative(count, 'count');
315315
if (count == 0) return this;
316316
if (string.isNotEmpty) {
317317
var breaks = BackBreaks(string, string.length, 0, stateEoTNoBreak);
@@ -351,7 +351,7 @@ final class StringCharacters extends Iterable<String> implements Characters {
351351

352352
@override
353353
Characters takeLast(int count) {
354-
RangeError.checkNotNegative(count, "count");
354+
RangeError.checkNotNegative(count, 'count');
355355
if (count == 0) return Characters.empty;
356356
if (string.isNotEmpty) {
357357
var breaks = BackBreaks(string, string.length, 0, stateEoTNoBreak);
@@ -446,7 +446,7 @@ class StringCharacterRange implements CharacterRange {
446446
factory StringCharacterRange.at(String string, int startIndex,
447447
[int? endIndex]) {
448448
RangeError.checkValidRange(
449-
startIndex, endIndex, string.length, "startIndex", "endIndex");
449+
startIndex, endIndex, string.length, 'startIndex', 'endIndex');
450450
return _expandRange(string, startIndex, endIndex ?? startIndex);
451451
}
452452

@@ -501,7 +501,7 @@ class StringCharacterRange implements CharacterRange {
501501
}
502502
}
503503
state = move(state, category);
504-
if (state & stateNoBreak == 0 && --count == 0) {
504+
if (state & maskBreak != flagNoBreak && --count == 0) {
505505
_move(newStart, index);
506506
return true;
507507
}
@@ -513,7 +513,7 @@ class StringCharacterRange implements CharacterRange {
513513
_move(newStart, _end);
514514
return true;
515515
} else {
516-
throw RangeError.range(count, 0, null, "count");
516+
throw RangeError.range(count, 0, null, 'count');
517517
}
518518
}
519519

@@ -530,7 +530,7 @@ class StringCharacterRange implements CharacterRange {
530530
bool moveBack([int count = 1]) => _retractStart(count, _start);
531531

532532
bool _retractStart(int count, int newEnd) {
533-
RangeError.checkNotNegative(count, "count");
533+
RangeError.checkNotNegative(count, 'count');
534534
var breaks = _backBreaksFromStart();
535535
var start = _start;
536536
while (count > 0) {
@@ -578,7 +578,7 @@ class StringCharacterRange implements CharacterRange {
578578

579579
@override
580580
bool dropFirst([int count = 1]) {
581-
RangeError.checkNotNegative(count, "count");
581+
RangeError.checkNotNegative(count, 'count');
582582
if (_start == _end) return count == 0;
583583
var breaks = Breaks(_string, _start, _end, stateSoTNoBreak);
584584
while (count > 0) {
@@ -636,7 +636,7 @@ class StringCharacterRange implements CharacterRange {
636636

637637
@override
638638
bool dropLast([int count = 1]) {
639-
RangeError.checkNotNegative(count, "count");
639+
RangeError.checkNotNegative(count, 'count');
640640
var breaks = BackBreaks(_string, _end, _start, stateEoTNoBreak);
641641
while (count > 0) {
642642
var nextBreak = breaks.nextBreak();

0 commit comments

Comments
 (0)