From 2014cae7fd48149e0365c503922c9a5df3547a76 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 23 Apr 2025 08:13:04 -0600 Subject: [PATCH 1/2] Have the parser reject quant bounds over UInt16.max (#812) (#813) * Have the parser reject quant bounds over UInt16.max --- .../Regex/Parse/LexicalAnalysis.swift | 34 ++++++++++++++++--- Tests/RegexTests/LexTests.swift | 19 +++++++++++ Tests/RegexTests/MatchTests.swift | 6 ++++ 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index b38a07e12..b9693d97a 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -331,7 +331,9 @@ extension Parser { /// /// Diagnoses on overflow /// - mutating func lexNumber(_ kind: RadixKind = .decimal) -> AST.Atom.Number? { + mutating func lexNumber( + _ kind: RadixKind = .decimal + ) -> AST.Atom.Number? { guard let str = tryEatPrefix(kind.characterFilter) else { return nil } @@ -342,6 +344,26 @@ extension Parser { return .init(i, at: str.location) } + /// Try to eat a quantification bound, such as appears in `/x{3,12}` + /// + /// Returns: `nil` if there's no number, otherwise the number + /// + /// Diagnoses on overflow. Currently, we will diagnose for any values over `UInt16.max` + /// + mutating func lexQuantBound() -> AST.Atom.Number? { + let kind = RadixKind.decimal + guard let str = tryEatPrefix(kind.characterFilter) else { + return nil + } + guard let i = UInt16(str.value, radix: kind.radix) else { + error(.numberOverflow(str.value), at: str.location) + return .init(nil, at: str.location) + } + + return .init(Int(i), at: str.location) + } + + /// Expect a number of a given `kind`, diagnosing if a number cannot be /// parsed. mutating func expectNumber(_ kind: RadixKind = .decimal) -> AST.Atom.Number { @@ -492,7 +514,7 @@ extension Parser { return p.tryEating { p in guard p.tryEat("{"), - let range = p.lexRange(trivia: &trivia), + let range = p.lexQuantRange(trivia: &trivia), p.tryEat("}") else { return nil } return range.value @@ -519,12 +541,14 @@ extension Parser { /// | ExpRange /// ExpRange -> '..<' | '...' /// | '..<' | '...' ? - mutating func lexRange(trivia: inout [AST.Trivia]) -> Located? { + mutating func lexQuantRange( + trivia: inout [AST.Trivia] + ) -> Located? { recordLoc { p in p.tryEating { p in if let t = p.lexWhitespace() { trivia.append(t) } - let lowerOpt = p.lexNumber() + let lowerOpt = p.lexQuantBound() if let t = p.lexWhitespace() { trivia.append(t) } @@ -546,7 +570,7 @@ extension Parser { if let t = p.lexWhitespace() { trivia.append(t) } - var upperOpt = p.lexNumber() + var upperOpt = p.lexQuantBound() if closedRange == false { // If we have an open range, the upper bound should be adjusted down. upperOpt?.value? -= 1 diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index 53775e66e..ccfd18eb8 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -63,6 +63,25 @@ extension RegexTests { _ = p.lexNumber() } + let invalidQuantBounds: Array = [ + "65536", // UInt16.max + 1 + "2147483646", // Int32.max - 1 + "9223372036854775806", // Int64.max - 1 + ] + + for invalidNum in invalidQuantBounds { + let regexes: Array = [ + "x{\(invalidNum)}", + "x{1,\(invalidNum)}", + "x{\(invalidNum),1}", + ] + for regex in regexes { + diagnose(regex, expecting: .numberOverflow(invalidNum)) { p in + _ = p.parse() + } + } + } + // TODO: want to dummy print out source ranges, etc, test that. } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c52560d66..017005e5b 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -751,6 +751,12 @@ extension RegexTests { firstMatchTest("(?U)a??", input: "a", match: "a") firstMatchTest("(?U)a??a", input: "aaa", match: "aa") + // Quantification syntax is somewhat dependent on the contents. + // In JS, PCRE2, Python, and some others, /x{-1}/ will be literally "x{-1}" + // Note that Java8 and Rust throw an (unhelpful) error + firstMatchTest("x{-1}", input: "x{-1}", match: "x{-1}") + firstMatchTest("x{-1}", input: "xax{-2}bx{-1}c", match: "x{-1}") + // TODO: After captures, easier to test these } From 463d5b013cd017a9ec54d37f8d3d0923f3a9fe44 Mon Sep 17 00:00:00 2001 From: Stephen Canon Date: Thu, 3 Jul 2025 13:08:35 -0400 Subject: [PATCH 2/2] Merge pull request #819 from swiftlang/word-breaking-818 Work around word breaking issue --- .../_StringProcessing/Unicode/WordBreaking.swift | 5 ++++- Tests/RegexBuilderTests/RegexDSLTests.swift | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift index 8db40efac..8e4e5226f 100644 --- a/Sources/_StringProcessing/Unicode/WordBreaking.swift +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -87,7 +87,10 @@ extension String { var j = maxIndex ?? range.lowerBound while j < range.upperBound, j <= i { - cache!.insert(j) + // Workaround for underlying issue in https://github.com/swiftlang/swift-experimental-string-processing/issues/818 + let (inserted, _) = cache!.insert(j) + guard inserted else { return true } + j = _wordIndex(after: j) } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 3a0c63a98..c6f5fdaf5 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -1946,6 +1946,22 @@ extension RegexDSLTests { XCTAssertEqual(anyOutput[15].value as? Int, 123) XCTAssertEqual(anyOutput[16].substring, "456") } + + func testIssue818() throws { + // Original report from https://github.com/swiftlang/swift-experimental-string-processing/issues/818 + let clip = "⁠‘⁠⁠example.com⁠⁠’" + let clip2 = "\u{2060}\u{2018}\u{2060}\u{2060}example.com\u{2060}\u{2060}\u{2019}" + assert(clip.unicodeScalars.elementsEqual(clip2.unicodeScalars)) + + let pattern = Regex { + Anchor.wordBoundary // line A + "example" + Anchor.wordBoundary // line B + } + + XCTAssertNotNil(clip.contains(pattern)) + XCTAssertNotNil(clip2.contains(pattern)) + } } extension Unicode.Scalar {