Skip to content

Commit 5d4d136

Browse files
authored
Merge pull request #245 from hamishknight/to-scale
2 parents ebfcdb3 + ed9f72c commit 5d4d136

File tree

4 files changed

+53
-155
lines changed

4 files changed

+53
-155
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

+36-28
Original file line number | Diff line number | Diff line change
@@ -631,6 +631,41 @@ extension AST.Atom {
631631
}
632632
}
633633

634+
extension AST.Atom.EscapedBuiltin {
635+
/// If the escape sequence represents a unicode scalar value, returns the
636+
/// value, otherwise `nil`.
637+
public var scalarValue: UnicodeScalar? {
638+
switch self {
639+
// TODO: Should we separate these into a separate enum? Or move the
640+
// specifics of the scalar to the DSL tree?
641+
case .alarm:
642+
return "\u{7}"
643+
case .backspace:
644+
return "\u{8}"
645+
case .escape:
646+
return "\u{1B}"
647+
case .formfeed:
648+
return "\u{C}"
649+
case .newline:
650+
return "\n"
651+
case .carriageReturn:
652+
return "\r"
653+
case .tab:
654+
return "\t"
655+
656+
case .singleDataUnit, .decimalDigit, .notDecimalDigit,
657+
.horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
658+
.newlineSequence, .whitespace, .notWhitespace, .verticalTab,
659+
.notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
660+
.wordBoundary, .notWordBoundary, .startOfSubject,
661+
.endOfSubjectBeforeNewline, .endOfSubject,
662+
.firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
663+
.textSegment, .notTextSegment:
664+
return nil
665+
}
666+
}
667+
}
668+
634669
extension AST.Atom {
635670
/// Retrieve the character value of the atom if it represents a literal
636671
/// character or unicode scalar, nil otherwise.
@@ -642,34 +677,7 @@ extension AST.Atom {
642677
return Character(s)
643678

644679
case .escaped(let c):
645-
switch c {
646-
// TODO: Should we separate these into a separate enum? Or move the
647-
// specifics of the scalar to the DSL tree?
648-
case .alarm:
649-
return "\u{7}"
650-
case .backspace:
651-
return "\u{8}"
652-
case .escape:
653-
return "\u{1B}"
654-
case .formfeed:
655-
return "\u{C}"
656-
case .newline:
657-
return "\n"
658-
case .carriageReturn:
659-
return "\r"
660-
case .tab:
661-
return "\t"
662-
663-
case .singleDataUnit, .decimalDigit, .notDecimalDigit,
664-
.horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
665-
.newlineSequence, .whitespace, .notWhitespace, .verticalTab,
666-
.notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
667-
.wordBoundary, .notWordBoundary, .startOfSubject,
668-
.endOfSubjectBeforeNewline, .endOfSubject,
669-
.firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
670-
.textSegment, .notTextSegment:
671-
return nil
672-
}
680+
return c.scalarValue.map(Character.init)
673681

674682
case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
675683
// TODO: These should have unicode scalar values.

Sources/_StringProcessing/ConsumerInterface.swift

-118
Original file line number | Diff line number | Diff line change
@@ -295,101 +295,6 @@ extension DSLTree.CustomCharacterClass.Member {
295295
}
296296
}
297297

298-
extension AST.CustomCharacterClass.Member {
299-
func generateConsumer(
300-
_ opts: MatchingOptions
301-
) throws -> MEProgram<String>.ConsumeFunction {
302-
switch self {
303-
case .custom(let ccc):
304-
return try ccc.generateConsumer(opts)
305-
306-
case .range(let r):
307-
guard let lhs = r.lhs.literalCharacterValue else {
308-
throw Unsupported("\(r.lhs) in range")
309-
}
310-
guard let rhs = r.rhs.literalCharacterValue else {
311-
throw Unsupported("\(r.rhs) in range")
312-
}
313-
314-
return { input, bounds in
315-
// TODO: check for out of bounds?
316-
let curIdx = bounds.lowerBound
317-
if (lhs...rhs).contains(input[curIdx]) {
318-
// TODO: semantic level
319-
return input.index(after: curIdx)
320-
}
321-
return nil
322-
}
323-
324-
case .atom(let atom):
325-
guard let gen = try atom.generateConsumer(opts) else {
326-
throw Unsupported("TODO")
327-
}
328-
return gen
329-
330-
case .quote(let q):
331-
// TODO: Not optimal.
332-
let consumers = try q.literal.map {
333-
try AST.Atom(.char($0), .fake).generateConsumer(opts)!
334-
}
335-
return { input, bounds in
336-
for consumer in consumers {
337-
if let idx = consumer(input, bounds) {
338-
return idx
339-
}
340-
}
341-
return nil
342-
}
343-
344-
case .trivia:
345-
throw Unreachable(
346-
"Should have been stripped by caller")
347-
348-
case .setOperation(let lhs, let op, let rhs):
349-
// TODO: We should probably have a component type
350-
// instead of a members array... for now we reconstruct
351-
// an AST node...
352-
let start = AST.Located(
353-
faking: AST.CustomCharacterClass.Start.normal)
354-
355-
let lhs = try AST.CustomCharacterClass(
356-
start, lhs, .fake
357-
).generateConsumer(opts)
358-
let rhs = try AST.CustomCharacterClass(
359-
start, rhs, .fake
360-
).generateConsumer(opts)
361-
362-
return { input, bounds in
363-
// NOTE: Easy way to implement, not performant
364-
let lhsIdxOpt = lhs(input, bounds)
365-
let rhsIdxOpt = rhs(input, bounds)
366-
367-
// TODO: What if lengths don't line up?
368-
assert(lhsIdxOpt == rhsIdxOpt || lhsIdxOpt == nil
369-
|| rhsIdxOpt == nil)
370-
371-
switch op.value {
372-
case .subtraction:
373-
guard rhsIdxOpt == nil else { return nil }
374-
return lhsIdxOpt
375-
376-
case .intersection:
377-
if let idx = lhsIdxOpt {
378-
return rhsIdxOpt == nil ? nil : idx
379-
}
380-
return nil
381-
382-
case .symmetricDifference:
383-
if let idx = lhsIdxOpt {
384-
return rhsIdxOpt == nil ? idx : nil
385-
}
386-
return rhsIdxOpt
387-
}
388-
}
389-
}
390-
}
391-
}
392-
393298
extension DSLTree.CustomCharacterClass {
394299
func generateConsumer(
395300
_ opts: MatchingOptions
@@ -413,29 +318,6 @@ extension DSLTree.CustomCharacterClass {
413318
}
414319
}
415320

416-
extension AST.CustomCharacterClass {
417-
func generateConsumer(
418-
_ opts: MatchingOptions
419-
) throws -> MEProgram<String>.ConsumeFunction {
420-
// NOTE: Easy way to implement, obviously not performant
421-
let consumers = try strippingTriviaShallow.members.map {
422-
try $0.generateConsumer(opts)
423-
}
424-
return { input, bounds in
425-
for consumer in consumers {
426-
if let idx = consumer(input, bounds) {
427-
return isInverted ? nil : idx
428-
}
429-
}
430-
if isInverted {
431-
// FIXME: semantic level
432-
return input.index(after: bounds.lowerBound)
433-
}
434-
return nil
435-
}
436-
}
437-
}
438-
439321
// NOTE: Conveniences, though not most performant
440322
private func consumeScalarScript(
441323
_ s: Unicode.Script

Sources/_StringProcessing/Regex/ASTConversion.swift

+3
Original file line number | Diff line number | Diff line change
@@ -211,6 +211,9 @@ extension AST.Atom {
211211
case .any: return .any
212212
case let .backreference(r): return .backreference(r)
213213

214+
case .escaped(let c) where c.scalarValue != nil:
215+
return .scalar(c.scalarValue!)
216+
214217
default: return .unconverted(self)
215218
}
216219
}

Tests/RegexTests/MatchTests.swift

+14-9
Original file line number | Diff line number | Diff line change
@@ -281,6 +281,15 @@ extension RegexTests {
281281
// code point sequence
282282
firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
283283

284+
// Escape sequences that represent scalar values.
285+
firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
286+
input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
287+
match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
288+
firstMatchTest(#"[\a][\b][\e][\f][\n][\r][\t]"#,
289+
input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
290+
match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
291+
292+
firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n")
284293

285294
// MARK: Quotes
286295

@@ -596,24 +605,20 @@ extension RegexTests {
596605

597606
func scalar(_ u: UnicodeScalar) -> UInt32 { u.value }
598607

599-
// Currently not supported in the matching engine.
600608
for s in scalar("\u{C}") ... scalar("\u{1B}") {
601609
let u = UnicodeScalar(s)!
602-
firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)",
603-
xfail: true)
610+
firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)")
604611
}
605612
for u: UnicodeScalar in ["\u{7}", "\u{8}"] {
606-
firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)",
607-
xfail: true)
613+
firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)")
608614
}
609615
for s in scalar("\u{A}") ... scalar("\u{D}") {
610616
let u = UnicodeScalar(s)!
611-
firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)",
612-
xfail: true)
617+
firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)")
613618
}
614-
firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}",
615-
xfail: true)
619+
firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
616620

621+
// Currently not supported in the matching engine.
617622
for c: UnicodeScalar in ["a", "b", "c"] {
618623
firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
619624
xfail: true)

0 commit comments

Comments
 (0)