From 6cdb9040f55aba067e2b3eeb5ea3045051681780 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 14 May 2021 08:31:50 -0400 Subject: [PATCH 01/21] fuzz: bump libfuzzer-sys dependency This is a half-hearted attempt to fix a build failure that I don't understand in OSS-fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=34294 cc @DavidKorczynski --- fuzz/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 98e20b70d..88fa00bbc 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -11,7 +11,7 @@ edition = "2018" cargo-fuzz = true [dependencies] -libfuzzer-sys = "0.4.0" +libfuzzer-sys = "0.4.1" [dependencies.regex] path = ".." From fce37e49329157f411e45a3350c42e6927996a40 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 26 Jun 2021 09:16:29 -0400 Subject: [PATCH 02/21] dfa: remove some redundant branches I discovered these while reviewing the code to prep for the rewrite in regex-automata. --- src/dfa.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/dfa.rs b/src/dfa.rs index 4b60f4d19..4aee8039c 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -1353,7 +1353,6 @@ impl<'a> Fsm<'a> { match self.cache.trans.next(si, self.byte_class(b)) { STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b), STATE_QUIT => None, - STATE_DEAD => Some(STATE_DEAD), nsi => Some(nsi), } } @@ -1387,7 +1386,6 @@ impl<'a> Fsm<'a> { }; match self.cache.start_states[flagi] { STATE_UNKNOWN => {} - STATE_DEAD => return Some(STATE_DEAD), si => return Some(si), } q.clear(); From bd0a14231b8848669e0d257ba55526f62756c749 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 23 Jul 2021 08:24:15 -0400 Subject: [PATCH 03/21] readme: fix badges Fixes #797, Fixes #798 --- README.md | 2 +- regex-syntax/README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 86d69968c..9acd5bb4a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Much of the syntax and implementation is inspired by [RE2](https://github.com/google/re2). [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) -[![](https://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex) +[![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex) [![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) ### Documentation diff --git a/regex-syntax/README.md b/regex-syntax/README.md index e90460114..1a4560e31 100644 --- a/regex-syntax/README.md +++ b/regex-syntax/README.md @@ -2,9 +2,9 @@ regex-syntax ============ This crate provides a robust regular expression parser. -[![Build status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex) -[![Build status](https://ci.appveyor.com/api/projects/status/github/rust-lang/regex?svg=true)](https://ci.appveyor.com/project/rust-lang-libs/regex) +[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) [![](https://meritbadge.herokuapp.com/regex-syntax)](https://crates.io/crates/regex-syntax) +[![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax) [![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) From bd7466034f8cccc3b0918201d1eb099cc8be3c56 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 23 Jul 2021 08:39:44 -0400 Subject: [PATCH 04/21] fuzz: try to fix build issue Ref: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=36474 See: https://oss-fuzz-build-logs.storage.googleapis.com/log-fe51f615-a13f-4685-b8d8-de4583da1ebd.txt --- fuzz/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 88fa00bbc..fa4538167 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -28,7 +28,7 @@ path = "fuzz_targets/fuzz_regex_match.rs" opt-level = 3 debug = true -[profile.debug] +[profile.dev] inherits = "release" [profile.test] From d6bc7a4c3b58e1d618024aaededa722df32fa6e8 Mon Sep 17 00:00:00 2001 From: Alex Touchet Date: Fri, 23 Jul 2021 09:49:36 -0700 Subject: [PATCH 05/21] readme: remove broken badge This was missed in bd0a142. Fixes #797 (again) --- regex-syntax/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/regex-syntax/README.md b/regex-syntax/README.md index 1a4560e31..9e1ab6174 100644 --- a/regex-syntax/README.md +++ b/regex-syntax/README.md @@ -3,7 +3,6 @@ regex-syntax This crate provides a robust regular expression parser. [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) -[![](https://meritbadge.herokuapp.com/regex-syntax)](https://crates.io/crates/regex-syntax) [![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax) [![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) From 63ee6699a27b294774af0154862e5cc35b495ee6 Mon Sep 17 00:00:00 2001 From: Ian Kerins Date: Tue, 2 Nov 2021 18:25:39 -0400 Subject: [PATCH 06/21] syntax/doc: fix 'their' typo --- regex-syntax/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/README.md b/regex-syntax/README.md index 9e1ab6174..592f84268 100644 --- a/regex-syntax/README.md +++ b/regex-syntax/README.md @@ -52,7 +52,7 @@ for extreme optimization, and therefore, use of `unsafe`. The standard for using `unsafe` in this crate is extremely high because this crate is intended to be reasonably safe to use with user supplied regular -expressions. Therefore, while their may be bugs in the regex parser itself, +expressions. Therefore, while there may be bugs in the regex parser itself, they should _never_ result in memory unsafety unless there is either a bug in the compiler or the standard library. (Since `regex-syntax` has zero dependencies.) From 3662851482327e3642940981298150c93718de3c Mon Sep 17 00:00:00 2001 From: Dave Rolsky Date: Mon, 15 Nov 2021 08:52:37 -0600 Subject: [PATCH 07/21] doc: fix typo PR #814 --- src/re_unicode.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 142c78fb1..e4871a621 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -538,7 +538,7 @@ impl Regex { mut rep: R, ) -> Cow<'t, str> { // If we know that the replacement doesn't have any capture expansions, - // then we can fast path. The fast path can make a tremendous + // then we can use the fast path. The fast path can make a tremendous // difference: // // 1) We use `find_iter` instead of `captures_iter`. Not asking for From 5197f21287344d2994f9cf06758a3ea30f5a26c3 Mon Sep 17 00:00:00 2001 From: Catena cyber <35799796+catenacyber@users.noreply.github.com> Date: Wed, 17 Nov 2021 22:49:44 +0100 Subject: [PATCH 08/21] fuzz: do not use inherits in Cargo.toml This fixes the oss-fuzz build. Specifically, the build log[1] showed this error: Step #3 - "compile-libfuzzer-address-x86_64": error: inherits must not be specified in root profile dev So we just remove it and inline the settings. PR #817 [1] - https://oss-fuzz-build-logs.storage.googleapis.com/log-c9b61873-8950-4a50-a729-820d5617ff7a.txt --- fuzz/Cargo.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index fa4538167..874e19a0f 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -29,7 +29,9 @@ opt-level = 3 debug = true [profile.dev] -inherits = "release" +opt-level = 3 +debug = true [profile.test] -inherits = "release" +opt-level = 3 +debug = true From f6e52dafdee305d16d6778e7bfe935bd9a6ae38b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 25 Feb 2022 12:48:26 -0500 Subject: [PATCH 09/21] syntax: fix 'unused' warnings It looks like the dead code detector got smarter. We never ended up using the 'printer' field in these visitors, so just get rid of it. --- regex-syntax/src/ast/print.rs | 9 ++++----- regex-syntax/src/hir/print.rs | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 283ce4c57..045de2eaf 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -57,17 +57,16 @@ impl Printer { /// here are a `fmt::Formatter` (which is available in `fmt::Display` /// implementations) or a `&mut String`. pub fn print(&mut self, ast: &Ast, wtr: W) -> fmt::Result { - visitor::visit(ast, Writer { printer: self, wtr: wtr }) + visitor::visit(ast, Writer { wtr }) } } #[derive(Debug)] -struct Writer<'p, W> { - printer: &'p mut Printer, +struct Writer { wtr: W, } -impl<'p, W: fmt::Write> Visitor for Writer<'p, W> { +impl Visitor for Writer { type Output = (); type Err = fmt::Error; @@ -153,7 +152,7 @@ impl<'p, W: fmt::Write> Visitor for Writer<'p, W> { } } -impl<'p, W: fmt::Write> Writer<'p, W> { +impl Writer { fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result { use crate::ast::GroupKind::*; match ast.kind { diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index ff18c6e92..b71f3897c 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -65,17 +65,16 @@ impl Printer { /// here are a `fmt::Formatter` (which is available in `fmt::Display` /// implementations) or a `&mut String`. pub fn print(&mut self, hir: &Hir, wtr: W) -> fmt::Result { - visitor::visit(hir, Writer { printer: self, wtr: wtr }) + visitor::visit(hir, Writer { wtr }) } } #[derive(Debug)] -struct Writer<'p, W> { - printer: &'p mut Printer, +struct Writer { wtr: W, } -impl<'p, W: fmt::Write> Visitor for Writer<'p, W> { +impl Visitor for Writer { type Output = (); type Err = fmt::Error; @@ -209,7 +208,7 @@ impl<'p, W: fmt::Write> Visitor for Writer<'p, W> { } } -impl<'p, W: fmt::Write> Writer<'p, W> { +impl Writer { fn write_literal_char(&mut self, c: char) -> fmt::Result { if is_meta_character(c) { self.wtr.write_str("\\")?; From b92ffd5471018419ec48dbdef32757424439f065 Mon Sep 17 00:00:00 2001 From: Alex Touchet Date: Thu, 3 Mar 2022 04:31:45 -0800 Subject: [PATCH 10/21] cargo: use SPDX license format We were previously using '/' to indicate the dual licensing scheme, but I guess we're now supposed to use 'OR'. PR #843 --- bench/Cargo.toml | 2 +- regex-capi/Cargo.toml | 2 +- regex-debug/Cargo.toml | 2 +- regex-syntax/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 9e61fd046..be8783573 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -3,7 +3,7 @@ publish = false name = "regex-benchmark" version = "0.1.0" authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" +license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex" documentation = "https://docs.rs/regex" homepage = "https://github.com/rust-lang/regex" diff --git a/regex-capi/Cargo.toml b/regex-capi/Cargo.toml index 47938092d..bb8a1517e 100644 --- a/regex-capi/Cargo.toml +++ b/regex-capi/Cargo.toml @@ -2,7 +2,7 @@ name = "rure" version = "0.2.1" #:version authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" +license = "MIT OR Apache-2.0" readme = "README.md" repository = "https://github.com/rust-lang/regex" documentation = "https://github.com/rust-lang/regex/tree/master/regex-capi" diff --git a/regex-debug/Cargo.toml b/regex-debug/Cargo.toml index 34a62f87b..1db4036b9 100644 --- a/regex-debug/Cargo.toml +++ b/regex-debug/Cargo.toml @@ -3,7 +3,7 @@ publish = false name = "regex-debug" version = "0.1.0" authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" +license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex" documentation = "https://docs.rs/regex" homepage = "https://github.com/rust-lang/regex" diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 1359aa15d..82700b03a 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -2,7 +2,7 @@ name = "regex-syntax" version = "0.6.25" #:version authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" +license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex" documentation = "https://docs.rs/regex-syntax" homepage = "https://github.com/rust-lang/regex" From ae70b41d4f46641dbc45c7a4f87954aea356283e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 25 Feb 2022 12:48:15 -0500 Subject: [PATCH 11/21] security: fix denial-of-service bug in compiler The regex compiler will happily attempt to compile '(?:){294967295}' by compiling the empty sub-expression 294,967,295 times. Empty sub-expressions don't use any memory in the current implementation, so this doesn't trigger the pre-existing machinery for stopping compilation early if the regex object gets too big. The end result is that while compilation will eventually succeed, it takes a very long time to do so. In this commit, we fix this problem by adding a fake amount of memory every time we compile an empty sub-expression. It turns out we were already tracking an additional amount of indirect heap usage via 'extra_inst_bytes' in the compiler, so we just make it look like compiling an empty sub-expression actually adds an additional 'Inst' to the compiled regex object. This has the effect of causing the regex compiler to reject this sort of regex in a reasonable amount of time by default. Many thanks to @VTCAKAVSMoACE for reporting this, providing the valuable test cases and continuing to test this patch as it was developed. Fixes https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 --- src/compile.rs | 27 +++++++++++++++-- tests/test_default.rs | 70 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/src/compile.rs b/src/compile.rs index 9a2ed5e92..069f445c8 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -38,6 +38,16 @@ pub struct Compiler { suffix_cache: SuffixCache, utf8_seqs: Option, byte_classes: ByteClassSet, + // This keeps track of extra bytes allocated while compiling the regex + // program. Currently, this corresponds to two things. First is the heap + // memory allocated by Unicode character classes ('InstRanges'). Second is + // a "fake" amount of memory used by empty sub-expressions, so that enough + // empty sub-expressions will ultimately trigger the compiler to bail + // because of a size limit restriction. (That empty sub-expressions don't + // add to heap memory usage is more-or-less an implementation detail.) In + // the second case, if we don't bail, then an excessively large repetition + // on an empty sub-expression can result in the compiler using a very large + // amount of CPU time. extra_inst_bytes: usize, } @@ -260,7 +270,7 @@ impl Compiler { self.check_size()?; match *expr.kind() { - Empty => Ok(None), + Empty => self.c_empty(), Literal(hir::Literal::Unicode(c)) => self.c_char(c), Literal(hir::Literal::Byte(b)) => { assert!(self.compiled.uses_bytes()); @@ -378,6 +388,19 @@ impl Compiler { } } + fn c_empty(&mut self) -> ResultOrEmpty { + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // Since 'empty' sub-expressions don't increase the size of + // the actual compiled object, we "fake" an increase in its + // size so that our 'check_size_limit' routine will eventually + // stop compilation if there are too many empty sub-expressions + // (e.g., via a large repetition). + self.extra_inst_bytes += std::mem::size_of::(); + Ok(None) + } + fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { if self.num_exprs > 1 || self.compiled.is_dfa { // Don't ever compile Save instructions for regex sets because @@ -496,7 +519,7 @@ impl Compiler { let mut exprs = exprs.into_iter(); let Patch { mut hole, entry } = loop { match exprs.next() { - None => return Ok(None), + None => return self.c_empty(), Some(e) => { if let Some(p) = self.c(e)? { break p; diff --git a/tests/test_default.rs b/tests/test_default.rs index d4365fbb3..be627f7a6 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -150,3 +150,73 @@ fn regex_is_reasonably_small() { assert_eq!(16, size_of::()); assert_eq!(16, size_of::()); } + +// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 +// See: CVE-2022-24713 +// +// We test that our regex compiler will correctly return a "too big" error when +// we try to use a very large repetition on an *empty* sub-expression. +// +// At the time this test was written, the regex compiler does not represent +// empty sub-expressions with any bytecode instructions. In effect, it's an +// "optimization" to leave them out, since they would otherwise correspond +// to an unconditional JUMP in the regex bytecode (i.e., an unconditional +// epsilon transition in the NFA graph). Therefore, an empty sub-expression +// represents an interesting case for the compiler's size limits. Since it +// doesn't actually contribute any additional memory to the compiled regex +// instructions, the size limit machinery never detects it. Instead, it just +// dumbly tries to compile the empty sub-expression N times, where N is the +// repetition size. +// +// When N is very large, this will cause the compiler to essentially spin and +// do nothing for a decently large amount of time. It causes the regex to take +// quite a bit of time to compile, despite the concrete syntax of the regex +// being quite small. +// +// The degree to which this is actually a problem is somewhat of a judgment +// call. Some regexes simply take a long time to compile. But in general, you +// should be able to reasonably control this by setting lower or higher size +// limits on the compiled object size. But this mitigation doesn't work at all +// for this case. +// +// This particular test is somewhat narrow. It merely checks that regex +// compilation will, at some point, return a "too big" error. Before the +// fix landed, this test would eventually fail because the regex would be +// successfully compiled (after enough time elapsed). So while this test +// doesn't check that we exit in a reasonable amount of time, it does at least +// check that we are properly returning an error at some point. +#[test] +fn big_empty_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){4294967295}"); + assert!(result.is_err()); +} + +// Below is a "billion laughs" variant of the previous test case. +#[test] +fn big_empty_reps_chain_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}"); + assert!(result.is_err()); +} + +// Below is another situation where a zero-length sub-expression can be +// introduced. +#[test] +fn big_zero_reps_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"x{0}{4294967295}"); + assert!(result.is_err()); +} + +// Testing another case for completeness. +#[test] +fn empty_alt_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"(?:|){4294967295}"); + assert!(result.is_err()); +} From d130381b150756ba7e5940efdc6ebdf47f4febc0 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 8 Mar 2022 08:58:47 -0500 Subject: [PATCH 12/21] 1.5.5 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 468230b76..6f6ebd4d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.5.4" #:version +version = "1.5.5" #:version authors = ["The Rust Project Developers"] license = "MIT OR Apache-2.0" readme = "README.md" From 258bdf798a14f50529c1665e84cc8a3a9e2c90fc Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 8 Mar 2022 09:45:49 -0500 Subject: [PATCH 13/21] changelog: 1.5.5 This adds the notes after the release, which were overlooked. --- CHANGELOG.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71d19633d..1cd27e7d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,17 @@ +1.5.5 (2022-03-08) +================== +This releases fixes a security bug in the regex compiler. This bug permits a +vector for a denial-of-service attack in cases where the regex being compiled +is untrusted. There are no known problems where the regex is itself trusted, +including in cases of untrusted haystacks. + +* [SECURITY #GHSA-m5pq-gvj9-9vr8](https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8): + Fixes a bug in the regex compiler where empty sub-expressions subverted the + existing mitigations in place to enforce a size limit on compiled regexes. + The Rust Security Response WG published an advisory about this: + https://groups.google.com/g/rustlang-security-announcements/c/NcNNL1Jq7Yw + + 1.5.4 (2021-05-06) ================== This release fixes another compilation failure when building regex. This time, From b5372864e2df6a2f5e543a556a62197f50ca3650 Mon Sep 17 00:00:00 2001 From: cui fliter Date: Mon, 25 Apr 2022 01:24:49 +0800 Subject: [PATCH 14/21] doc: fix some typos PR #856 --- CHANGELOG.md | 2 +- bench/src/sherlock.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cd27e7d1..8bf571caf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -683,7 +683,7 @@ New features: * Empty sub-expressions are now permitted in most places. That is, `()+` is now a valid regex. * Almost everything in regex-syntax now uses constant stack space, even when - performing anaylsis that requires structural induction. This reduces the risk + performing analysis that requires structural induction. This reduces the risk of a user provided regular expression causing a stack overflow. * [FEATURE #174](https://github.com/rust-lang/regex/issues/174): The `Ast` type in `regex-syntax` now contains span information. diff --git a/bench/src/sherlock.rs b/bench/src/sherlock.rs index 8feb5862e..d1afbdd1b 100644 --- a/bench/src/sherlock.rs +++ b/bench/src/sherlock.rs @@ -149,12 +149,12 @@ sherlock!(before_holmes, r"\w+\s+Holmes", 319); // and suffix optimizations. sherlock!(before_after_holmes, r"\w+\s+Holmes\s+\w+", 137); -// Find Holmes co-occuring with Watson in a particular window of characters. +// Find Holmes co-occurring with Watson in a particular window of characters. // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for // the rest. sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7); -// Find Holmes co-occuring with Watson in a particular window of words. +// Find Holmes co-occurring with Watson in a particular window of words. // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for // the rest. #[cfg(not(feature = "re-onig"))] From 72f09f1aeb0ff3f703b1afdbdd21f5ff63162fb4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 17 May 2022 09:39:16 -0400 Subject: [PATCH 15/21] syntax: fix ascii class union bug This fixes a bug in how ASCII class unioning was implemented. Namely, it previously and erroneously unioned together two classes and then applied negation/case-folding based on the most recently added class, even if the class added previously wasn't negated. So for example, given the regex '[[:alnum:][:^ascii:]]', this would initialize the class with '[:alnum:]', then add all '[:^ascii:]' codepoints and then negate the entire thing because of the negation in '[:^ascii:]'. Negating the entire thing is clearly wrong and not the intended semantics. We fix this by applying negation/case-folding only to the class we're dealing with, and then we union it with whatever existing class we're building. Fixes #680 --- CHANGELOG.md | 7 ++++ regex-syntax/src/hir/translate.rs | 61 +++++++++++++++++++++++++------ 2 files changed, 57 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bf571caf..961b5abb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +TBD +=== + +* [BUG #680](https://github.com/rust-lang/regex/issues/680): + Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class. + + 1.5.5 (2022-03-08) ================== This releases fixes a security bug in the regex compiler. This bug permits a diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 99c949302..a7ae9bc38 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -434,20 +434,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } ast::ClassSetItem::Ascii(ref x) => { if self.flags().unicode() { + let xcls = self.hir_ascii_unicode_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_unicode(); - for &(s, e) in ascii_class(&x.kind) { - cls.push(hir::ClassUnicodeRange::new(s, e)); - } - self.unicode_fold_and_negate( - &x.span, x.negated, &mut cls, - )?; + cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { + let xcls = self.hir_ascii_byte_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_bytes(); - for &(s, e) in ascii_class(&x.kind) { - cls.push(hir::ClassBytesRange::new(s as u8, e as u8)); - } - self.bytes_fold_and_negate(&x.span, x.negated, &mut cls)?; + cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); } } @@ -853,6 +847,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> { result } + fn hir_ascii_unicode_class( + &self, + ast: &ast::ClassAscii, + ) -> Result { + let mut cls = hir::ClassUnicode::new( + ascii_class(&ast.kind) + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)), + ); + self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + + fn hir_ascii_byte_class( + &self, + ast: &ast::ClassAscii, + ) -> Result { + let mut cls = hir::ClassBytes::new( + ascii_class(&ast.kind) + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)), + ); + self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + fn hir_perl_unicode_class( &self, ast_class: &ast::ClassPerl, @@ -948,7 +968,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { class: &mut hir::ClassBytes, ) -> Result<()> { // Note that we must apply case folding before negation! - // Consider `(?i)[^x]`. If we applied negation field, then + // Consider `(?i)[^x]`. If we applied negation first, then // the result would be the character class that matched any // Unicode scalar value. if self.flags().case_insensitive() { @@ -1943,6 +1963,25 @@ mod tests { ); } + #[test] + fn class_ascii_multiple() { + // See: https://github.com/rust-lang/regex/issues/680 + assert_eq!( + t("[[:alnum:][:^ascii:]]"), + hir_union( + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_uclass(&[('\u{80}', '\u{10FFFF}')]), + ), + ); + assert_eq!( + t_bytes("(?-u)[[:alnum:][:^ascii:]]"), + hir_union( + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_bclass(&[(0x80, 0xFF)]), + ), + ); + } + #[test] #[cfg(feature = "unicode-perl")] fn class_perl() { From 88a2a62d861d189faae539990f63cb9cf195bd8c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 17 May 2022 14:27:33 -0400 Subject: [PATCH 16/21] syntax: fix 'is_match_empty' predicate This was incorrectly defined for \b. Previously, I had erroneously made it return true only for \B since \B matches '' and \b does not match ''. However, \b does match the empty string. Like \B, it only matches a subset of empty strings, depending on what the surrounding context is. The important bit is that it can match *an* empty string, not that it matches *the* empty string. We were not yet using this predicate anywhere in the regex crate, so we just fix the implementation and update the tests. This does present a compatibility hazard for anyone who was using this function, but as of this time, I'm considering this a bug fix since \b clearly matches an empty string. Fixes #859 --- CHANGELOG.md | 3 +++ regex-syntax/src/hir/mod.rs | 14 +++++++++----- regex-syntax/src/hir/translate.rs | 7 +++++-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 961b5abb8..bd612e72a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,11 @@ TBD === +The below are changes for the next release, which is to be determined. * [BUG #680](https://github.com/rust-lang/regex/issues/680): Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class. +* [BUG #859](https://github.com/rust-lang/regex/issues/859): + Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`. 1.5.5 (2022-03-08) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 4969f1265..f5cf992e5 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -334,9 +334,13 @@ impl Hir { info.set_any_anchored_end(false); info.set_literal(false); info.set_alternation_literal(false); - // A negated word boundary matches the empty string, but a normal - // word boundary does not! - info.set_match_empty(word_boundary.is_negated()); + // A negated word boundary matches '', so that's fine. But \b does not + // match \b, so why do we say it can match the empty string? Well, + // because, if you search for \b against 'a', it will report [0, 0) and + // [1, 1) as matches, and both of those matches correspond to the empty + // string. Thus, only *certain* empty strings match \b, which similarly + // applies to \B. + info.set_match_empty(true); // Negated ASCII word boundaries can match invalid UTF-8. if let WordBoundary::AsciiNegate = word_boundary { info.set_always_utf8(false); @@ -661,8 +665,8 @@ impl Hir { /// Return true if and only if the empty string is part of the language /// matched by this regular expression. /// - /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`, - /// but not `a`, `a+` or `\b`. + /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b` + /// and `\B`, but not `a` or `a+`. pub fn is_match_empty(&self) -> bool { self.info.is_match_empty() } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index a7ae9bc38..56afbbed8 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3139,6 +3139,9 @@ mod tests { assert!(t(r"\pL*").is_match_empty()); assert!(t(r"a*|b").is_match_empty()); assert!(t(r"b|a*").is_match_empty()); + assert!(t(r"a|").is_match_empty()); + assert!(t(r"|a").is_match_empty()); + assert!(t(r"a||b").is_match_empty()); assert!(t(r"a*a?(abcd)*").is_match_empty()); assert!(t(r"^").is_match_empty()); assert!(t(r"$").is_match_empty()); @@ -3148,6 +3151,8 @@ mod tests { assert!(t(r"\z").is_match_empty()); assert!(t(r"\B").is_match_empty()); assert!(t_bytes(r"(?-u)\B").is_match_empty()); + assert!(t(r"\b").is_match_empty()); + assert!(t(r"(?-u)\b").is_match_empty()); // Negative examples. assert!(!t(r"a+").is_match_empty()); @@ -3157,8 +3162,6 @@ mod tests { assert!(!t(r"a{1,10}").is_match_empty()); assert!(!t(r"b|a").is_match_empty()); assert!(!t(r"a*a+(abcd)*").is_match_empty()); - assert!(!t(r"\b").is_match_empty()); - assert!(!t(r"(?-u)\b").is_match_empty()); } #[test] From 1c19619672c2ef16dc9f64fec38af5719c4ec06c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 20 May 2022 13:30:04 -0400 Subject: [PATCH 17/21] syntax: fix literal extraction for 'ab??' Previously, 'ab??' returned [Complete(ab), Complete(a)], but the order matters here because of greediness. The correct result is [Complete(a), Complete(ab)]. Instead of trying to actually fix literal extraction (which is a mess), we just rewrite 'ab?' (and 'ab??') as 'ab*'. 'ab*' still produces literals in the incorrect order, i.e., [Cut(ab), Complete(a)], but since one is cut we are guaranteed that the regex engine will be called to confirm the match. In so doing, it will correctly report 'a' as a match for 'ab??' in 'ab'. Fixes #862 --- CHANGELOG.md | 2 ++ regex-syntax/src/hir/literal/mod.rs | 33 +++++++++++++++++------------ tests/regression.rs | 3 +++ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd612e72a..fee39b5e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The below are changes for the next release, which is to be determined. Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class. * [BUG #859](https://github.com/rust-lang/regex/issues/859): Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`. +* [BUG #862](https://github.com/rust-lang/regex/issues/862): + Fixes a bug where 'ab??' matches 'ab' instead of 'a' in 'ab'. 1.5.5 (2022-03-08) diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index 25ee88b06..1e66d2cc3 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -735,18 +735,18 @@ fn repeat_zero_or_one_literals( lits: &mut Literals, mut f: F, ) { - let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); - lits3.set_limit_size(lits.limit_size() / 2); - f(e, &mut lits3); - - if lits3.is_empty() || !lits2.cross_product(&lits3) { - lits.cut(); - return; - } - lits2.add(Literal::empty()); - if !lits.union(lits2) { - lits.cut(); - } + f( + &Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + // FIXME: Our literal extraction doesn't care about greediness. + // Which is partially why we're treating 'e?' as 'e*'. Namely, + // 'ab??' yields [Complete(ab), Complete(a)], but it should yield + // [Complete(a), Complete(ab)] because of the non-greediness. + greedy: true, + hir: Box::new(e.clone()), + }), + lits, + ); } fn repeat_zero_or_more_literals( @@ -1141,6 +1141,11 @@ mod tests { test_lit!(pfx_group1, prefixes, "(a)", M("a")); test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); + test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a")); + // FIXME: This should return [M("a"), M("ab")] because of the non-greedy + // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get + // a cut literal. + test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a")); test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); @@ -1249,8 +1254,8 @@ mod tests { pfx_crazy1, prefixes, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", - C("Mo\\'am"), - C("Mu\\'am"), + C("Mo\\'"), + C("Mu\\'"), C("Moam"), C("Muam") ); diff --git a/tests/regression.rs b/tests/regression.rs index 44b90832b..e8b252538 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -217,3 +217,6 @@ matiter!( // https://en.wikipedia.org/wiki/Je_(Cyrillic) ismatch!(empty_group_match, r"()Ј01", "zЈ01", true); matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5)); + +// See: https://github.com/rust-lang/regex/issues/862 +mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1))); From d98da65bb3df16836f1181c6f7e4f03c3af1d5a5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 20 May 2022 14:03:31 -0400 Subject: [PATCH 18/21] changelog: 1.5.6 --- CHANGELOG.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fee39b5e9..26f9955b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ -TBD -=== -The below are changes for the next release, which is to be determined. +1.5.6 (2022-05-20) +================== +This release includes a few bug fixes, including a bug that produced incorrect +matches when a non-greedy `?` operator was used. * [BUG #680](https://github.com/rust-lang/regex/issues/680): Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class. From b41bde0b854e3cd1018f55e5dcd80c09b418d6c4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 20 May 2022 14:05:16 -0400 Subject: [PATCH 19/21] regex-syntax-0.6.26 --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 82700b03a..64acce292 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.6.25" #:version +version = "0.6.26" #:version authors = ["The Rust Project Developers"] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex" From 2931b070fd9b525dec95c2b4c91f8b9ee500239e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 20 May 2022 14:05:37 -0400 Subject: [PATCH 20/21] syntax: bump minimum regex-syntax version to 0.6.26 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6f6ebd4d5..71094f029 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -117,7 +117,7 @@ optional = true # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.6.25" +version = "0.6.26" default-features = false [dev-dependencies] From 9aef5b1edc2a436244b936db53a03ed6d720e87e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 20 May 2022 14:06:04 -0400 Subject: [PATCH 21/21] 1.5.6 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 71094f029..4a21d1974 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.5.5" #:version +version = "1.5.6" #:version authors = ["The Rust Project Developers"] license = "MIT OR Apache-2.0" readme = "README.md"