diff --git a/CHANGELOG.md b/CHANGELOG.md index 71d19633d..26f9955b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,30 @@ +1.5.6 (2022-05-20) +================== +This release includes a few bug fixes, including a bug that produced incorrect +matches when a non-greedy `?` operator was used. + +* [BUG #680](https://github.com/rust-lang/regex/issues/680): + Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class. +* [BUG #859](https://github.com/rust-lang/regex/issues/859): + Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`. +* [BUG #862](https://github.com/rust-lang/regex/issues/862): + Fixes a bug where 'ab??' matches 'ab' instead of 'a' in 'ab'. + + +1.5.5 (2022-03-08) +================== +This releases fixes a security bug in the regex compiler. This bug permits a +vector for a denial-of-service attack in cases where the regex being compiled +is untrusted. There are no known problems where the regex is itself trusted, +including in cases of untrusted haystacks. + +* [SECURITY #GHSA-m5pq-gvj9-9vr8](https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8): + Fixes a bug in the regex compiler where empty sub-expressions subverted the + existing mitigations in place to enforce a size limit on compiled regexes. + The Rust Security Response WG published an advisory about this: + https://groups.google.com/g/rustlang-security-announcements/c/NcNNL1Jq7Yw + + 1.5.4 (2021-05-06) ================== This release fixes another compilation failure when building regex. This time, @@ -669,7 +696,7 @@ New features: * Empty sub-expressions are now permitted in most places. That is, `()+` is now a valid regex. * Almost everything in regex-syntax now uses constant stack space, even when - performing anaylsis that requires structural induction. This reduces the risk + performing analysis that requires structural induction. This reduces the risk of a user provided regular expression causing a stack overflow. * [FEATURE #174](https://github.com/rust-lang/regex/issues/174): The `Ast` type in `regex-syntax` now contains span information. diff --git a/Cargo.toml b/Cargo.toml index 468230b76..4a21d1974 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.5.4" #:version +version = "1.5.6" #:version authors = ["The Rust Project Developers"] license = "MIT OR Apache-2.0" readme = "README.md" @@ -117,7 +117,7 @@ optional = true # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.6.25" +version = "0.6.26" default-features = false [dev-dependencies] diff --git a/README.md b/README.md index 86d69968c..9acd5bb4a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Much of the syntax and implementation is inspired by [RE2](https://github.com/google/re2). [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) -[![](https://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex) +[![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex) [![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) ### Documentation diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 9e61fd046..be8783573 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -3,7 +3,7 @@ publish = false name = "regex-benchmark" version = "0.1.0" authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" +license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex" documentation = "https://docs.rs/regex" homepage = "https://github.com/rust-lang/regex" diff --git a/bench/src/sherlock.rs b/bench/src/sherlock.rs index 8feb5862e..d1afbdd1b 100644 --- a/bench/src/sherlock.rs +++ b/bench/src/sherlock.rs @@ -149,12 +149,12 @@ sherlock!(before_holmes, r"\w+\s+Holmes", 319); // and suffix optimizations. sherlock!(before_after_holmes, r"\w+\s+Holmes\s+\w+", 137); -// Find Holmes co-occuring with Watson in a particular window of characters. +// Find Holmes co-occurring with Watson in a particular window of characters. // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for // the rest. sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7); -// Find Holmes co-occuring with Watson in a particular window of words. +// Find Holmes co-occurring with Watson in a particular window of words. // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for // the rest. #[cfg(not(feature = "re-onig"))] diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 98e20b70d..874e19a0f 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -11,7 +11,7 @@ edition = "2018" cargo-fuzz = true [dependencies] -libfuzzer-sys = "0.4.0" +libfuzzer-sys = "0.4.1" [dependencies.regex] path = ".." @@ -28,8 +28,10 @@ path = "fuzz_targets/fuzz_regex_match.rs" opt-level = 3 debug = true -[profile.debug] -inherits = "release" +[profile.dev] +opt-level = 3 +debug = true [profile.test] -inherits = "release" +opt-level = 3 +debug = true diff --git a/regex-capi/Cargo.toml b/regex-capi/Cargo.toml index 47938092d..bb8a1517e 100644 --- a/regex-capi/Cargo.toml +++ b/regex-capi/Cargo.toml @@ -2,7 +2,7 @@ name = "rure" version = "0.2.1" #:version authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" +license = "MIT OR Apache-2.0" readme = "README.md" repository = "https://github.com/rust-lang/regex" documentation = "https://github.com/rust-lang/regex/tree/master/regex-capi" diff --git a/regex-debug/Cargo.toml b/regex-debug/Cargo.toml index 34a62f87b..1db4036b9 100644 --- a/regex-debug/Cargo.toml +++ b/regex-debug/Cargo.toml @@ -3,7 +3,7 @@ publish = false name = "regex-debug" version = "0.1.0" authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" +license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex" documentation = "https://docs.rs/regex" homepage = "https://github.com/rust-lang/regex" diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 1359aa15d..64acce292 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "regex-syntax" -version = "0.6.25" #:version +version = "0.6.26" #:version authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" +license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex" documentation = "https://docs.rs/regex-syntax" homepage = "https://github.com/rust-lang/regex" diff --git a/regex-syntax/README.md b/regex-syntax/README.md index e90460114..592f84268 100644 --- a/regex-syntax/README.md +++ b/regex-syntax/README.md @@ -2,9 +2,8 @@ regex-syntax ============ This crate provides a robust regular expression parser. -[![Build status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex) -[![Build status](https://ci.appveyor.com/api/projects/status/github/rust-lang/regex?svg=true)](https://ci.appveyor.com/project/rust-lang-libs/regex) -[![](https://meritbadge.herokuapp.com/regex-syntax)](https://crates.io/crates/regex-syntax) +[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) +[![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax) [![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) @@ -53,7 +52,7 @@ for extreme optimization, and therefore, use of `unsafe`. The standard for using `unsafe` in this crate is extremely high because this crate is intended to be reasonably safe to use with user supplied regular -expressions. Therefore, while their may be bugs in the regex parser itself, +expressions. Therefore, while there may be bugs in the regex parser itself, they should _never_ result in memory unsafety unless there is either a bug in the compiler or the standard library. (Since `regex-syntax` has zero dependencies.) diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 283ce4c57..045de2eaf 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -57,17 +57,16 @@ impl Printer { /// here are a `fmt::Formatter` (which is available in `fmt::Display` /// implementations) or a `&mut String`. pub fn print(&mut self, ast: &Ast, wtr: W) -> fmt::Result { - visitor::visit(ast, Writer { printer: self, wtr: wtr }) + visitor::visit(ast, Writer { wtr }) } } #[derive(Debug)] -struct Writer<'p, W> { - printer: &'p mut Printer, +struct Writer { wtr: W, } -impl<'p, W: fmt::Write> Visitor for Writer<'p, W> { +impl Visitor for Writer { type Output = (); type Err = fmt::Error; @@ -153,7 +152,7 @@ impl<'p, W: fmt::Write> Visitor for Writer<'p, W> { } } -impl<'p, W: fmt::Write> Writer<'p, W> { +impl Writer { fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result { use crate::ast::GroupKind::*; match ast.kind { diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index 25ee88b06..1e66d2cc3 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -735,18 +735,18 @@ fn repeat_zero_or_one_literals( lits: &mut Literals, mut f: F, ) { - let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); - lits3.set_limit_size(lits.limit_size() / 2); - f(e, &mut lits3); - - if lits3.is_empty() || !lits2.cross_product(&lits3) { - lits.cut(); - return; - } - lits2.add(Literal::empty()); - if !lits.union(lits2) { - lits.cut(); - } + f( + &Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + // FIXME: Our literal extraction doesn't care about greediness. + // Which is partially why we're treating 'e?' as 'e*'. Namely, + // 'ab??' yields [Complete(ab), Complete(a)], but it should yield + // [Complete(a), Complete(ab)] because of the non-greediness. + greedy: true, + hir: Box::new(e.clone()), + }), + lits, + ); } fn repeat_zero_or_more_literals( @@ -1141,6 +1141,11 @@ mod tests { test_lit!(pfx_group1, prefixes, "(a)", M("a")); test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); + test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a")); + // FIXME: This should return [M("a"), M("ab")] because of the non-greedy + // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get + // a cut literal. + test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a")); test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); @@ -1249,8 +1254,8 @@ mod tests { pfx_crazy1, prefixes, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", - C("Mo\\'am"), - C("Mu\\'am"), + C("Mo\\'"), + C("Mu\\'"), C("Moam"), C("Muam") ); diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 4969f1265..f5cf992e5 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -334,9 +334,13 @@ impl Hir { info.set_any_anchored_end(false); info.set_literal(false); info.set_alternation_literal(false); - // A negated word boundary matches the empty string, but a normal - // word boundary does not! - info.set_match_empty(word_boundary.is_negated()); + // A negated word boundary matches '', so that's fine. But \b does not + // match \b, so why do we say it can match the empty string? Well, + // because, if you search for \b against 'a', it will report [0, 0) and + // [1, 1) as matches, and both of those matches correspond to the empty + // string. Thus, only *certain* empty strings match \b, which similarly + // applies to \B. + info.set_match_empty(true); // Negated ASCII word boundaries can match invalid UTF-8. if let WordBoundary::AsciiNegate = word_boundary { info.set_always_utf8(false); @@ -661,8 +665,8 @@ impl Hir { /// Return true if and only if the empty string is part of the language /// matched by this regular expression. /// - /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`, - /// but not `a`, `a+` or `\b`. + /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b` + /// and `\B`, but not `a` or `a+`. pub fn is_match_empty(&self) -> bool { self.info.is_match_empty() } diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index ff18c6e92..b71f3897c 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -65,17 +65,16 @@ impl Printer { /// here are a `fmt::Formatter` (which is available in `fmt::Display` /// implementations) or a `&mut String`. pub fn print(&mut self, hir: &Hir, wtr: W) -> fmt::Result { - visitor::visit(hir, Writer { printer: self, wtr: wtr }) + visitor::visit(hir, Writer { wtr }) } } #[derive(Debug)] -struct Writer<'p, W> { - printer: &'p mut Printer, +struct Writer { wtr: W, } -impl<'p, W: fmt::Write> Visitor for Writer<'p, W> { +impl Visitor for Writer { type Output = (); type Err = fmt::Error; @@ -209,7 +208,7 @@ impl<'p, W: fmt::Write> Visitor for Writer<'p, W> { } } -impl<'p, W: fmt::Write> Writer<'p, W> { +impl Writer { fn write_literal_char(&mut self, c: char) -> fmt::Result { if is_meta_character(c) { self.wtr.write_str("\\")?; diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 99c949302..56afbbed8 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -434,20 +434,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } ast::ClassSetItem::Ascii(ref x) => { if self.flags().unicode() { + let xcls = self.hir_ascii_unicode_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_unicode(); - for &(s, e) in ascii_class(&x.kind) { - cls.push(hir::ClassUnicodeRange::new(s, e)); - } - self.unicode_fold_and_negate( - &x.span, x.negated, &mut cls, - )?; + cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { + let xcls = self.hir_ascii_byte_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_bytes(); - for &(s, e) in ascii_class(&x.kind) { - cls.push(hir::ClassBytesRange::new(s as u8, e as u8)); - } - self.bytes_fold_and_negate(&x.span, x.negated, &mut cls)?; + cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); } } @@ -853,6 +847,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> { result } + fn hir_ascii_unicode_class( + &self, + ast: &ast::ClassAscii, + ) -> Result { + let mut cls = hir::ClassUnicode::new( + ascii_class(&ast.kind) + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)), + ); + self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + + fn hir_ascii_byte_class( + &self, + ast: &ast::ClassAscii, + ) -> Result { + let mut cls = hir::ClassBytes::new( + ascii_class(&ast.kind) + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)), + ); + self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + fn hir_perl_unicode_class( &self, ast_class: &ast::ClassPerl, @@ -948,7 +968,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { class: &mut hir::ClassBytes, ) -> Result<()> { // Note that we must apply case folding before negation! - // Consider `(?i)[^x]`. If we applied negation field, then + // Consider `(?i)[^x]`. If we applied negation first, then // the result would be the character class that matched any // Unicode scalar value. if self.flags().case_insensitive() { @@ -1943,6 +1963,25 @@ mod tests { ); } + #[test] + fn class_ascii_multiple() { + // See: https://github.com/rust-lang/regex/issues/680 + assert_eq!( + t("[[:alnum:][:^ascii:]]"), + hir_union( + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_uclass(&[('\u{80}', '\u{10FFFF}')]), + ), + ); + assert_eq!( + t_bytes("(?-u)[[:alnum:][:^ascii:]]"), + hir_union( + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_bclass(&[(0x80, 0xFF)]), + ), + ); + } + #[test] #[cfg(feature = "unicode-perl")] fn class_perl() { @@ -3100,6 +3139,9 @@ mod tests { assert!(t(r"\pL*").is_match_empty()); assert!(t(r"a*|b").is_match_empty()); assert!(t(r"b|a*").is_match_empty()); + assert!(t(r"a|").is_match_empty()); + assert!(t(r"|a").is_match_empty()); + assert!(t(r"a||b").is_match_empty()); assert!(t(r"a*a?(abcd)*").is_match_empty()); assert!(t(r"^").is_match_empty()); assert!(t(r"$").is_match_empty()); @@ -3109,6 +3151,8 @@ mod tests { assert!(t(r"\z").is_match_empty()); assert!(t(r"\B").is_match_empty()); assert!(t_bytes(r"(?-u)\B").is_match_empty()); + assert!(t(r"\b").is_match_empty()); + assert!(t(r"(?-u)\b").is_match_empty()); // Negative examples. assert!(!t(r"a+").is_match_empty()); @@ -3118,8 +3162,6 @@ mod tests { assert!(!t(r"a{1,10}").is_match_empty()); assert!(!t(r"b|a").is_match_empty()); assert!(!t(r"a*a+(abcd)*").is_match_empty()); - assert!(!t(r"\b").is_match_empty()); - assert!(!t(r"(?-u)\b").is_match_empty()); } #[test] diff --git a/src/compile.rs b/src/compile.rs index 9a2ed5e92..069f445c8 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -38,6 +38,16 @@ pub struct Compiler { suffix_cache: SuffixCache, utf8_seqs: Option, byte_classes: ByteClassSet, + // This keeps track of extra bytes allocated while compiling the regex + // program. Currently, this corresponds to two things. First is the heap + // memory allocated by Unicode character classes ('InstRanges'). Second is + // a "fake" amount of memory used by empty sub-expressions, so that enough + // empty sub-expressions will ultimately trigger the compiler to bail + // because of a size limit restriction. (That empty sub-expressions don't + // add to heap memory usage is more-or-less an implementation detail.) In + // the second case, if we don't bail, then an excessively large repetition + // on an empty sub-expression can result in the compiler using a very large + // amount of CPU time. extra_inst_bytes: usize, } @@ -260,7 +270,7 @@ impl Compiler { self.check_size()?; match *expr.kind() { - Empty => Ok(None), + Empty => self.c_empty(), Literal(hir::Literal::Unicode(c)) => self.c_char(c), Literal(hir::Literal::Byte(b)) => { assert!(self.compiled.uses_bytes()); @@ -378,6 +388,19 @@ impl Compiler { } } + fn c_empty(&mut self) -> ResultOrEmpty { + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // Since 'empty' sub-expressions don't increase the size of + // the actual compiled object, we "fake" an increase in its + // size so that our 'check_size_limit' routine will eventually + // stop compilation if there are too many empty sub-expressions + // (e.g., via a large repetition). + self.extra_inst_bytes += std::mem::size_of::(); + Ok(None) + } + fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { if self.num_exprs > 1 || self.compiled.is_dfa { // Don't ever compile Save instructions for regex sets because @@ -496,7 +519,7 @@ impl Compiler { let mut exprs = exprs.into_iter(); let Patch { mut hole, entry } = loop { match exprs.next() { - None => return Ok(None), + None => return self.c_empty(), Some(e) => { if let Some(p) = self.c(e)? { break p; diff --git a/src/dfa.rs b/src/dfa.rs index 4b60f4d19..4aee8039c 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -1353,7 +1353,6 @@ impl<'a> Fsm<'a> { match self.cache.trans.next(si, self.byte_class(b)) { STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b), STATE_QUIT => None, - STATE_DEAD => Some(STATE_DEAD), nsi => Some(nsi), } } @@ -1387,7 +1386,6 @@ impl<'a> Fsm<'a> { }; match self.cache.start_states[flagi] { STATE_UNKNOWN => {} - STATE_DEAD => return Some(STATE_DEAD), si => return Some(si), } q.clear(); diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 142c78fb1..e4871a621 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -538,7 +538,7 @@ impl Regex { mut rep: R, ) -> Cow<'t, str> { // If we know that the replacement doesn't have any capture expansions, - // then we can fast path. The fast path can make a tremendous + // then we can use the fast path. The fast path can make a tremendous // difference: // // 1) We use `find_iter` instead of `captures_iter`. Not asking for diff --git a/tests/regression.rs b/tests/regression.rs index 44b90832b..e8b252538 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -217,3 +217,6 @@ matiter!( // https://en.wikipedia.org/wiki/Je_(Cyrillic) ismatch!(empty_group_match, r"()Ј01", "zЈ01", true); matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5)); + +// See: https://github.com/rust-lang/regex/issues/862 +mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1))); diff --git a/tests/test_default.rs b/tests/test_default.rs index d4365fbb3..be627f7a6 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -150,3 +150,73 @@ fn regex_is_reasonably_small() { assert_eq!(16, size_of::()); assert_eq!(16, size_of::()); } + +// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 +// See: CVE-2022-24713 +// +// We test that our regex compiler will correctly return a "too big" error when +// we try to use a very large repetition on an *empty* sub-expression. +// +// At the time this test was written, the regex compiler does not represent +// empty sub-expressions with any bytecode instructions. In effect, it's an +// "optimization" to leave them out, since they would otherwise correspond +// to an unconditional JUMP in the regex bytecode (i.e., an unconditional +// epsilon transition in the NFA graph). Therefore, an empty sub-expression +// represents an interesting case for the compiler's size limits. Since it +// doesn't actually contribute any additional memory to the compiled regex +// instructions, the size limit machinery never detects it. Instead, it just +// dumbly tries to compile the empty sub-expression N times, where N is the +// repetition size. +// +// When N is very large, this will cause the compiler to essentially spin and +// do nothing for a decently large amount of time. It causes the regex to take +// quite a bit of time to compile, despite the concrete syntax of the regex +// being quite small. +// +// The degree to which this is actually a problem is somewhat of a judgment +// call. Some regexes simply take a long time to compile. But in general, you +// should be able to reasonably control this by setting lower or higher size +// limits on the compiled object size. But this mitigation doesn't work at all +// for this case. +// +// This particular test is somewhat narrow. It merely checks that regex +// compilation will, at some point, return a "too big" error. Before the +// fix landed, this test would eventually fail because the regex would be +// successfully compiled (after enough time elapsed). So while this test +// doesn't check that we exit in a reasonable amount of time, it does at least +// check that we are properly returning an error at some point. +#[test] +fn big_empty_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){4294967295}"); + assert!(result.is_err()); +} + +// Below is a "billion laughs" variant of the previous test case. +#[test] +fn big_empty_reps_chain_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}"); + assert!(result.is_err()); +} + +// Below is another situation where a zero-length sub-expression can be +// introduced. +#[test] +fn big_zero_reps_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"x{0}{4294967295}"); + assert!(result.is_err()); +} + +// Testing another case for completeness. +#[test] +fn empty_alt_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"(?:|){4294967295}"); + assert!(result.is_err()); +}