Skip to content

Commit 6645f0d

Browse files
authored
Merge pull request #40630 from rxwei/capture-inference
2 parents 324cccd + 1b3c0b7 commit 6645f0d

13 files changed

+252
-39
lines changed

include/swift/AST/DiagnosticsSema.def

+3
Original file line numberDiff line numberDiff line change
@@ -4769,6 +4769,9 @@ ERROR(async_unavailable_decl,none,
47694769
ERROR(string_processing_lib_missing,none,
47704770
"missing '%0' declaration, probably because the '_StringProcessing' "
47714771
"module was not imported properly", (StringRef))
4772+
ERROR(regex_capture_types_failed_to_decode,none,
4773+
"failed to decode capture types for regular expression literal; this may "
4774+
"be a compiler bug", ())
47724775

47734776
//------------------------------------------------------------------------------
47744777
// MARK: Type Check Types

include/swift/AST/Expr.h

+18-3
Original file line numberDiff line numberDiff line change
@@ -971,22 +971,37 @@ class RegexLiteralExpr : public LiteralExpr {
971971
SourceLoc Loc;
972972
StringRef RegexText;
973973
unsigned Version;
974+
ArrayRef<uint8_t> SerializedCaptureStructure;
974975

975976
RegexLiteralExpr(SourceLoc loc, StringRef regexText, unsigned version,
977+
ArrayRef<uint8_t> serializedCaps,
976978
bool isImplicit)
977979
: LiteralExpr(ExprKind::RegexLiteral, isImplicit), Loc(loc),
978-
RegexText(regexText), Version(version) {}
980+
RegexText(regexText), Version(version),
981+
SerializedCaptureStructure(serializedCaps) {}
979982

980983
public:
981-
static RegexLiteralExpr *createParsed(ASTContext &ctx, SourceLoc loc,
982-
StringRef regexText, unsigned version);
984+
static RegexLiteralExpr *createParsed(
985+
ASTContext &ctx, SourceLoc loc, StringRef regexText, unsigned version,
986+
ArrayRef<uint8_t> serializedCaptureStructure);
987+
988+
typedef uint16_t CaptureStructureSerializationVersion;
989+
990+
static unsigned getCaptureStructureSerializationAllocationSize(
991+
unsigned regexLength) {
992+
return sizeof(CaptureStructureSerializationVersion) + regexLength + 1;
993+
}
983994

984995
/// Retrieve the raw regex text.
985996
StringRef getRegexText() const { return RegexText; }
986997

987998
/// Retrieve the version of the regex string.
988999
unsigned getVersion() const { return Version; }
9891000

1001+
ArrayRef<uint8_t> getSerializedCaptureStructure() {
1002+
return SerializedCaptureStructure;
1003+
}
1004+
9901005
SourceRange getSourceRange() const { return Loc; }
9911006

9921007
static bool classof(const Expr *E) {

include/swift/Parse/ExperimentalRegexBridging.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@ void Parser_registerRegexLiteralLexingFn(RegexLiteralLexingFn fn);
3434
/// - CaptureStructureOut: A buffer accepting a byte sequence representing the
3535
/// capture structure of the literal.
3636
/// - CaptureStructureSize: The size of the capture structure buffer. Must be
37-
/// greater than or equal to `strlen(InputPtr)`.
37+
/// greater than or equal to `strlen(InputPtr) + 3`.
3838
typedef void(* RegexLiteralParsingFn)(/*InputPtr*/ const char *,
3939
/*ErrorOut*/ const char **,
4040
/*VersionOut*/ unsigned *,
41-
/*CaptureStructureOut*/ char *,
41+
/*CaptureStructureOut*/ void *,
4242
/*CaptureStructureSize*/ unsigned);
4343
void Parser_registerRegexLiteralParsingFn(RegexLiteralParsingFn fn);
4444

lib/AST/Expr.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -2262,8 +2262,9 @@ SourceLoc TapExpr::getEndLoc() const {
22622262

22632263
RegexLiteralExpr *
22642264
RegexLiteralExpr::createParsed(ASTContext &ctx, SourceLoc loc,
2265-
StringRef regexText, unsigned version) {
2266-
return new (ctx) RegexLiteralExpr(loc, regexText, version,
2265+
StringRef regexText, unsigned version,
2266+
ArrayRef<uint8_t> serializedCaps) {
2267+
return new (ctx) RegexLiteralExpr(loc, regexText, version, serializedCaps,
22672268
/*implicit*/ false);
22682269
}
22692270

lib/Parse/ParseRegex.cpp

+10-5
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,18 @@ ParserResult<Expr> Parser::parseExprRegexLiteral() {
4545
// at.
4646
const char *errorStr = nullptr;
4747
unsigned version;
48+
auto capturesBuf = Context.AllocateUninitialized<uint8_t>(
49+
RegexLiteralExpr::getCaptureStructureSerializationAllocationSize(
50+
regexText.size()));
4851
regexLiteralParsingFn(regexText.str().c_str(), &errorStr, &version,
49-
/*captureStructureOut*/ nullptr,
50-
/*captureStructureSize*/ 0);
51-
if (errorStr)
52+
/*captureStructureOut*/ capturesBuf.data(),
53+
/*captureStructureSize*/ capturesBuf.size());
54+
if (errorStr) {
5255
diagnose(Tok, diag::regex_literal_parsing_error, errorStr);
56+
return makeParserError();
57+
}
5358

5459
auto loc = consumeToken();
55-
return makeParserResult(
56-
RegexLiteralExpr::createParsed(Context, loc, regexText, version));
60+
return makeParserResult(RegexLiteralExpr::createParsed(
61+
Context, loc, regexText, version, capturesBuf));
5762
}

lib/Sema/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ add_swift_host_library(swiftSema STATIC
6262
TypeCheckPropertyWrapper.cpp
6363
TypeCheckProtocol.cpp
6464
TypeCheckProtocolInference.cpp
65+
TypeCheckRegex.cpp
6566
TypeCheckRequestFunctions.cpp
6667
TypeCheckStmt.cpp
6768
TypeCheckStorage.cpp

lib/Sema/CSGen.cpp

+11-6
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
//===----------------------------------------------------------------------===//
1616
#include "TypeCheckConcurrency.h"
1717
#include "TypeCheckType.h"
18+
#include "TypeCheckRegex.h"
1819
#include "TypeChecker.h"
1920
#include "swift/AST/ASTVisitor.h"
2021
#include "swift/AST/ASTWalker.h"
@@ -1266,15 +1267,19 @@ namespace {
12661267
ctx.Id_Regex.str());
12671268
return Type();
12681269
}
1269-
auto dynCapturesType = ctx.getDynamicCapturesType();
1270-
if (!dynCapturesType) {
1270+
SmallVector<TupleTypeElt, 4> captureTypes;
1271+
if (decodeRegexCaptureTypes(ctx,
1272+
E->getSerializedCaptureStructure(),
1273+
/*atomType*/ ctx.getSubstringType(),
1274+
captureTypes)) {
12711275
ctx.Diags.diagnose(E->getLoc(),
1272-
diag::string_processing_lib_missing,
1273-
"DynamicCaptures");
1276+
diag::regex_capture_types_failed_to_decode);
12741277
return Type();
12751278
}
1276-
// TODO: Replace `DynamicCaptures` with type inferred from the regex.
1277-
return BoundGenericStructType::get(regexDecl, Type(), {dynCapturesType});
1279+
auto genericArg = captureTypes.size() == 1
1280+
? captureTypes[0].getRawType()
1281+
: TupleType::get(captureTypes, ctx);
1282+
return BoundGenericStructType::get(regexDecl, Type(), {genericArg});
12781283
}
12791284

12801285
Type visitDeclRefExpr(DeclRefExpr *E) {

lib/Sema/TypeCheckRegex.cpp

+106
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
//===--- TypeCheckRegex.cpp - Regex type checking utilities ---------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "TypeCheckRegex.h"
14+
15+
#include "swift/AST/ASTContext.h"
16+
#include "swift/AST/Decl.h"
17+
#include "swift/AST/Expr.h"
18+
#include "swift/AST/Type.h"
19+
#include "swift/AST/Types.h"
20+
21+
using namespace swift;
22+
23+
// Encoding rules:
24+
// encode(〚`T`〛) ==> <version>, 〚`T`〛, .end
25+
// 〚`T` (atom)〛 ==> .atom
26+
// 〚`name: T` (atom)〛 ==> .atom, `name`, '\0'
27+
// 〚`[T]`〛 ==> 〚`T`〛, .formArray
28+
// 〚`T?`〛 ==> 〚`T`〛, .formOptional
29+
// 〚`(T0, T1, ...)`〛 ==> .beginTuple, 〚`T0`〛, 〚`T1`〛, ..., .endTuple
30+
//
31+
// For details, see apple/swift-experimental-string-processing.
32+
bool swift::decodeRegexCaptureTypes(ASTContext &ctx,
33+
ArrayRef<uint8_t> serialization,
34+
Type atomType,
35+
SmallVectorImpl<TupleTypeElt> &result) {
36+
using Version = RegexLiteralExpr::CaptureStructureSerializationVersion;
37+
static const Version implVersion = 1;
38+
unsigned size = serialization.size();
39+
// A serialization should store a version and `.end` at the very least.
40+
unsigned minSize = sizeof(Version) + sizeof(RegexCaptureStructureCode);
41+
if (size < minSize)
42+
return false;
43+
// Read version.
44+
Version version = *reinterpret_cast<const Version *>(serialization.data());
45+
if (version != implVersion)
46+
return true;
47+
// Read contents.
48+
SmallVector<SmallVector<TupleTypeElt, 4>, 4> scopes(1);
49+
unsigned offset = sizeof(Version);
50+
auto consumeCode = [&]() -> Optional<RegexCaptureStructureCode> {
51+
auto rawValue = serialization[offset];
52+
if (rawValue >= (uint8_t)RegexCaptureStructureCode::CaseCount)
53+
return None;
54+
offset += sizeof(RegexCaptureStructureCode);
55+
return (RegexCaptureStructureCode)rawValue;
56+
};
57+
do {
58+
auto code = consumeCode();
59+
if (!code)
60+
return false;
61+
switch (*code) {
62+
case RegexCaptureStructureCode::End:
63+
offset = size;
64+
break;
65+
case RegexCaptureStructureCode::Atom:
66+
scopes.back().push_back(atomType);
67+
break;
68+
case RegexCaptureStructureCode::NamedAtom: {
69+
auto *namePtr = reinterpret_cast<const char *>(
70+
serialization.slice(offset).data());
71+
auto length = strnlen(namePtr, size - offset);
72+
if (length >= size - offset)
73+
return true; // Unterminated string.
74+
StringRef name(namePtr, length);
75+
scopes.back().push_back(TupleTypeElt(atomType, ctx.getIdentifier(name)));
76+
offset += length + /*NUL*/ 1;
77+
break;
78+
}
79+
case RegexCaptureStructureCode::FormArray: {
80+
auto &type = scopes.back().back();
81+
type = TupleTypeElt(ArraySliceType::get(type.getRawType()));
82+
break;
83+
}
84+
case RegexCaptureStructureCode::FormOptional: {
85+
auto &type = scopes.back().back();
86+
type = TupleTypeElt(OptionalType::get(type.getRawType()));
87+
break;
88+
}
89+
case RegexCaptureStructureCode::BeginTuple:
90+
scopes.push_back({});
91+
break;
92+
case RegexCaptureStructureCode::EndTuple: {
93+
auto children = scopes.pop_back_val();
94+
scopes.back().push_back(TupleType::get(children, ctx));
95+
break;
96+
}
97+
case RegexCaptureStructureCode::CaseCount:
98+
llvm_unreachable("Handled earlier");
99+
}
100+
} while (offset < size);
101+
if (scopes.size() != 1)
102+
return true; // Unterminated tuple.
103+
auto &elements = scopes.back();
104+
result.append(elements.begin(), elements.end());
105+
return false;
106+
}

lib/Sema/TypeCheckRegex.h

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
//===--- TypeCheckRegex.h - Regex type checking utilities -----------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#ifndef SWIFT_TYPE_CHECK_REGEX_H
14+
#define SWIFT_TYPE_CHECK_REGEX_H
15+
16+
#include <cstdint>
17+
#include <llvm/ADT/ArrayRef.h>
18+
#include <llvm/ADT/SmallVector.h>
19+
20+
namespace swift {
21+
22+
class ASTContext;
23+
class TupleTypeElt;
24+
class Type;
25+
26+
/// One-byte codes that make up a serialized regex capture structure, as
/// consumed by `decodeRegexCaptureTypes`. The numeric values are part of the
/// serialized format and are compared against raw bytes by the decoder.
enum class RegexCaptureStructureCode: uint8_t {
27+
/// Terminates the serialization; the decoder stops reading at this code.
End = 0,
28+
/// An unnamed atom capture.
Atom = 1,
29+
/// A named atom capture; the decoder reads a NUL-terminated name right after
/// this code.
NamedAtom = 2,
30+
/// Wraps the most recently decoded type of the current scope in an array.
FormArray = 3,
31+
/// Wraps the most recently decoded type of the current scope in an optional.
FormOptional = 4,
32+
/// Opens a new tuple scope.
BeginTuple = 5,
33+
/// Closes the innermost tuple scope and forms a tuple from its elements.
EndTuple = 6,
34+
/// Not a real code: the number of codes above. The decoder rejects any raw
/// byte greater than or equal to this value.
CaseCount
35+
};
36+
37+
/// Decodes regex capture types from the given serialization and appends the
38+
/// decoded capture types to @p result. Returns true if the serialization is
39+
/// malformed.
///
/// @param ctx The AST context used to intern capture names and to build tuple
///   types.
/// @param serialization The serialized capture structure: a version value
///   followed by a sequence of `RegexCaptureStructureCode` bytes, where a
///   named atom's code is followed by its NUL-terminated name.
/// @param atomType The type used for every atom capture in the decoded
///   structure.
/// @param result Receives the decoded top-level capture types. Elements are
///   appended only after decoding has finished; existing elements are kept.
40+
bool decodeRegexCaptureTypes(ASTContext &ctx,
41+
llvm::ArrayRef<uint8_t> serialization,
42+
Type atomType,
43+
llvm::SmallVectorImpl<TupleTypeElt> &result);
44+
45+
} // end namespace swift
46+
47+
#endif // SWIFT_TYPE_CHECK_REGEX_H

test/StringProcessing/Runtime/regex_basic.swift

+8-10
Original file line numberDiff line numberDiff line change
@@ -25,19 +25,19 @@ RegexBasicTests.test("Basic") {
2525

2626
let match1 = input.expectMatch('/aabcc./')
2727
expectEqual("aabccd", input[match1.range])
28-
expectEqual(.empty, match1.captures)
28+
expectTrue(() == match1.captures)
2929

3030
let match2 = input.expectMatch('/a*b.+./')
3131
expectEqual("aabccd", input[match2.range])
32-
expectEqual(.empty, match2.captures)
32+
expectTrue(() == match2.captures)
3333
}
3434

3535
RegexBasicTests.test("Modern") {
3636
let input = "aabccd"
3737

3838
let match1 = input.expectMatch('|a a bc c /*hello*/ .|')
3939
expectEqual("aabccd", input[match1.range])
40-
expectEqual(.empty, match1.captures)
40+
expectTrue(() == match1.captures)
4141
}
4242

4343
RegexBasicTests.test("Captures") {
@@ -46,15 +46,13 @@ RegexBasicTests.test("Captures") {
4646
COMBINING MARK TUKWENTIS
4747
"""
4848
let regex = '/([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*/'
49+
// Test inferred type.
50+
let _: Regex<(Substring, Substring?, Substring)>.Type = type(of: regex)
4951
let match1 = input.expectMatch(regex)
5052
expectEqual(input[...], input[match1.range])
51-
expectEqual(
52-
.tuple([
53-
.substring("A6F0"),
54-
.optional(.substring("A6F1")),
55-
.substring("Extend")
56-
]),
57-
match1.captures)
53+
expectTrue("A6F0" == match1.captures.0)
54+
expectTrue("A6F1" == match1.captures.1)
55+
expectTrue("Extend" == match1.captures.2)
5856
}
5957

6058
runAllTests()

test/StringProcessing/SILGen/regex_literal_silgen.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ var s = '/abc/'
1111
// CHECK: [[VERSION_INT:%[0-9]+]] = apply [[INT_INIT]]([[VERSION_LITERAL]]
1212

1313
// CHECK: [[REGEX_INIT:%[0-9]+]] = function_ref @$s17_StringProcessing5RegexV06_regexA07versionACyxGSS_SitcfC : $@convention(method) <τ_0_0> (@owned String, Int, @thin Regex<τ_0_0>.Type) -> @out Regex<τ_0_0>
14-
// CHECK: apply [[REGEX_INIT]]<DynamicCaptures>({{%.+}}, [[REGEX_STR]], [[VERSION_INT]], {{%.+}}) : $@convention(method) <τ_0_0> (@owned String, Int, @thin Regex<τ_0_0>.Type) -> @out Regex<τ_0_0>
14+
// CHECK: apply [[REGEX_INIT]]<{{.+}}>({{%.+}}, [[REGEX_STR]], [[VERSION_INT]], {{%.+}}) : $@convention(method) <τ_0_0> (@owned String, Int, @thin Regex<τ_0_0>.Type) -> @out Regex<τ_0_0>

0 commit comments

Comments
 (0)