From 08c0d6ad65f7c161add82ae906efb90dbd7f653d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 20 Feb 2021 18:11:56 -0500 Subject: [PATCH 1/5] Invent "rainbow" arcs within the regex engine. Some regular expression constructs, most notably the "." match-anything metacharacter, produce a sheaf of parallel NFA arcs covering all possible colors (that is, character equivalence classes). We can make a noticeable improvement in the space and time needed to process large regexes by replacing such cases with a single arc bearing the special color code "RAINBOW". This requires only minor additional complication in places such as pull() and push(). Callers of pg_reg_getoutarcs() must now be prepared for the possibility of seeing a RAINBOW arc. For the one known user, contrib/pg_trgm, that's a net benefit since it cuts the number of arcs to be dealt with, and the handling isn't any different than for other colors that contain too many characters to be dealt with individually. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us --- contrib/pg_trgm/trgm_regexp.c | 27 +++++++---- src/backend/regex/README | 36 +++++++++++---- src/backend/regex/regc_color.c | 22 ++++++++- src/backend/regex/regc_nfa.c | 82 ++++++++++++++++++++++++++++++---- src/backend/regex/regcomp.c | 12 +++-- src/backend/regex/rege_dfa.c | 5 ++- src/backend/regex/regexport.c | 7 +-- src/backend/regex/regprefix.c | 8 +++- src/include/regex/regexport.h | 4 ++ src/include/regex/regguts.h | 11 ++++- 10 files changed, 177 insertions(+), 37 deletions(-) diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 1e4f0121f3d97..fcf03de32dc83 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -282,8 +282,8 @@ typedef struct typedef int TrgmColor; /* We assume that colors returned by the regexp engine cannot be these: */ -#define COLOR_UNKNOWN (-1) -#define COLOR_BLANK (-2) +#define COLOR_UNKNOWN (-3) +#define COLOR_BLANK (-4) typedef struct { @@ -780,7 +780,8 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) palloc0(colorsCount * sizeof(TrgmColorInfo)); /* - * Loop over colors, filling TrgmColorInfo about each. + * Loop over colors, filling TrgmColorInfo about each. Note we include + * WHITE (0) even though we know it'll be reported as non-expandable. */ for (i = 0; i < colorsCount; i++) { @@ -1098,9 +1099,9 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key) /* Add enter key to this state */ addKeyToQueue(trgmNFA, &destKey); } - else + else if (arc->co >= 0) { - /* Regular color */ + /* Regular color (including WHITE) */ TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co]; if (colorInfo->expandable) @@ -1156,6 +1157,14 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key) addKeyToQueue(trgmNFA, &destKey); } } + else + { + /* RAINBOW: treat as unexpandable color */ + destKey.prefix.colors[0] = COLOR_UNKNOWN; + destKey.prefix.colors[1] = COLOR_UNKNOWN; + destKey.nstate = arc->to; + addKeyToQueue(trgmNFA, &destKey); + } } pfree(arcs); @@ -1216,10 +1225,10 @@ addArcs(TrgmNFA *trgmNFA, TrgmState *state) /* * Ignore non-expandable colors; addKey already handled the case. * - * We need no special check for begin/end pseudocolors here. We - * don't need to do any processing for them, and they will be - * marked non-expandable since the regex engine will have reported - * them that way. + * We need no special check for WHITE or begin/end pseudocolors + * here. We don't need to do any processing for them, and they + * will be marked non-expandable since the regex engine will have + * reported them that way. */ if (!colorInfo->expandable) continue; diff --git a/src/backend/regex/README b/src/backend/regex/README index f08aab69e376f..a83ab5074dfc3 100644 --- a/src/backend/regex/README +++ b/src/backend/regex/README @@ -261,6 +261,18 @@ and the NFA has these arcs: states 4 -> 5 on color 2 ("x" only) which can be seen to be a correct representation of the regex. +There is one more complexity, which is how to handle ".", that is a +match-anything atom. We used to do that by generating a "rainbow" +of arcs of all live colors between the two NFA states before and after +the dot. That's expensive in itself when there are lots of colors, +and it also typically adds lots of follow-on arc-splitting work for the +color splitting logic. Now we handle this case by generating a single arc +labeled with the special color RAINBOW, meaning all colors. Such arcs +never need to be split, so they help keep NFAs small in this common case. +(Note: this optimization doesn't help in REG_NLSTOP mode, where "." is +not supposed to match newline. In that case we still handle "." by +generating an almost-rainbow of all colors except newline's color.) + Given this summary, we can see we need the following operations for colors: @@ -349,6 +361,8 @@ The possible arc types are: PLAIN arcs, which specify matching of any character of a given "color" (see above). These are dumped as "[color_number]->to_state". + In addition there can be "rainbow" PLAIN arcs, which are dumped as + "[*]->to_state". EMPTY arcs, which specify a no-op transition to another state. These are dumped as "->to_state". @@ -356,11 +370,11 @@ The possible arc types are: AHEAD constraints, which represent a "next character must be of this color" constraint. AHEAD differs from a PLAIN arc in that the input character is not consumed when crossing the arc. These are dumped as - ">color_number>->to_state". + ">color_number>->to_state", or possibly ">*>->to_state". BEHIND constraints, which represent a "previous character must be of this color" constraint, which likewise consumes no input. These are - dumped as "to_state". + dumped as "to_state", or possibly "<*<->to_state". '^' arcs, which specify a beginning-of-input constraint. These are dumped as "^0->to_state" or "^1->to_state" for beginning-of-string and @@ -396,14 +410,20 @@ substring, or an imaginary following EOS character if the substring is at the end of the input. 3. If the NFA is (or can be) in the goal state at this point, it matches. +This definition is necessary to support regexes that begin or end with +constraints such as \m and \M, which imply requirements on the adjacent +character if any. The executor implements that by checking if the +adjacent character (or BOS/BOL/EOS/EOL pseudo-character) is of the +right color, and it does that in the same loop that checks characters +within the match. + So one can mentally execute an untransformed NFA by taking ^ and $ as ordinary constraints that match at start and end of input; but plain arcs out of the start state should be taken as matches for the character before the target substring, and similarly, plain arcs leading to the post state are matches for the character after the target substring. -This definition is necessary to support regexes that begin or end with -constraints such as \m and \M, which imply requirements on the adjacent -character if any. NFAs for simple unanchored patterns will usually have -pre-state outarcs for all possible character colors as well as BOS and -BOL, and post-state inarcs for all possible character colors as well as -EOS and EOL, so that the executor's behavior will work. +After the optimize() transformation, there are explicit arcs mentioning +BOS/BOL/EOS/EOL adjacent to the pre-state and post-state. So a finished +NFA for a pattern without anchors or adjacent-character constraints will +have pre-state outarcs for RAINBOW (all possible character colors) as well +as BOS and BOL, and likewise post-state inarcs for RAINBOW, EOS, and EOL. diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c index f5a4151757ddc..0864011cce16f 100644 --- a/src/backend/regex/regc_color.c +++ b/src/backend/regex/regc_color.c @@ -977,6 +977,7 @@ colorchain(struct colormap *cm, { struct colordesc *cd = &cm->cd[a->co]; + assert(a->co >= 0); if (cd->arcs != NULL) cd->arcs->colorchainRev = a; a->colorchain = cd->arcs; @@ -994,6 +995,7 @@ uncolorchain(struct colormap *cm, struct colordesc *cd = &cm->cd[a->co]; struct arc *aa = a->colorchainRev; + assert(a->co >= 0); if (aa == NULL) { assert(cd->arcs == a); @@ -1012,6 +1014,9 @@ uncolorchain(struct colormap *cm, /* * rainbow - add arcs of all full colors (but one) between specified states + * + * If there isn't an exception color, we now generate just a single arc + * labeled RAINBOW, saving lots of arc-munging later on. */ static void rainbow(struct nfa *nfa, @@ -1025,6 +1030,13 @@ rainbow(struct nfa *nfa, struct colordesc *end = CDEND(cm); color co; + if (but == COLORLESS) + { + newarc(nfa, type, RAINBOW, from, to); + return; + } + + /* Gotta do it the hard way. Skip subcolors, pseudocolors, and "but" */ for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++) if (!UNUSEDCOLOR(cd) && cd->sub != co && co != but && !(cd->flags & PSEUDO)) @@ -1034,13 +1046,16 @@ rainbow(struct nfa *nfa, /* * colorcomplement - add arcs of complementary colors * + * We add arcs of all colors that are not pseudocolors and do not match + * any of the "of" state's PLAIN outarcs. + * * The calling sequence ought to be reconciled with cloneouts(). */ static void colorcomplement(struct nfa *nfa, struct colormap *cm, int type, - struct state *of, /* complements of this guy's PLAIN outarcs */ + struct state *of, struct state *from, struct state *to) { @@ -1049,6 +1064,11 @@ colorcomplement(struct nfa *nfa, color co; assert(of != from); + + /* A RAINBOW arc matches all colors, making the complement empty */ + if (findarc(of, PLAIN, RAINBOW) != NULL) + return; + for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++) if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)) if (findarc(of, PLAIN, co) == NULL) diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 7ed675f88a4ab..ff98bfd694e7d 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -271,6 +271,11 @@ destroystate(struct nfa *nfa, * * This function checks to make sure that no duplicate arcs are created. * In general we never want duplicates. + * + * However: in principle, a RAINBOW arc is redundant with any plain arc + * (unless that arc is for a pseudocolor). But we don't try to recognize + * that redundancy, either here or in allied operations such as moveins(). + * The pseudocolor consideration makes that more costly than it seems worth. */ static void newarc(struct nfa *nfa, @@ -1170,6 +1175,9 @@ copyouts(struct nfa *nfa, /* * cloneouts - copy out arcs of a state to another state pair, modifying type + * + * This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share + * the same interpretation of "co". It wouldn't be sensible with LACONs. */ static void cloneouts(struct nfa *nfa, @@ -1181,9 +1189,13 @@ cloneouts(struct nfa *nfa, struct arc *a; assert(old != from); + assert(type == AHEAD || type == BEHIND); for (a = old->outs; a != NULL; a = a->outchain) + { + assert(a->type == PLAIN); newarc(nfa, type, a->co, from, to); + } } /* @@ -1597,7 +1609,7 @@ pull(struct nfa *nfa, for (a = from->ins; a != NULL && !NISERR(); a = nexta) { nexta = a->inchain; - switch (combine(con, a)) + switch (combine(nfa, con, a)) { case INCOMPATIBLE: /* destroy the arc */ freearc(nfa, a); @@ -1624,6 +1636,10 @@ pull(struct nfa *nfa, cparc(nfa, a, s, to); freearc(nfa, a); break; + case REPLACEARC: /* replace arc's color */ + newarc(nfa, a->type, con->co, a->from, to); + freearc(nfa, a); + break; default: assert(NOTREACHED); break; @@ -1764,7 +1780,7 @@ push(struct nfa *nfa, for (a = to->outs; a != NULL && !NISERR(); a = nexta) { nexta = a->outchain; - switch (combine(con, a)) + switch (combine(nfa, con, a)) { case INCOMPATIBLE: /* destroy the arc */ freearc(nfa, a); @@ -1791,6 +1807,10 @@ push(struct nfa *nfa, cparc(nfa, a, from, s); freearc(nfa, a); break; + case REPLACEARC: /* replace arc's color */ + newarc(nfa, a->type, con->co, from, a->to); + freearc(nfa, a); + break; default: assert(NOTREACHED); break; @@ -1810,9 +1830,11 @@ push(struct nfa *nfa, * #def INCOMPATIBLE 1 // destroys arc * #def SATISFIED 2 // constraint satisfied * #def COMPATIBLE 3 // compatible but not satisfied yet + * #def REPLACEARC 4 // replace arc's color with constraint color */ static int -combine(struct arc *con, +combine(struct nfa *nfa, + struct arc *con, struct arc *a) { #define CA(ct,at) (((ct)<co == a->co) return SATISFIED; + if (con->co == RAINBOW) + { + /* con is satisfied unless arc's color is a pseudocolor */ + if (!(nfa->cm->cd[a->co].flags & PSEUDO)) + return SATISFIED; + } + else if (a->co == RAINBOW) + { + /* con is incompatible if it's for a pseudocolor */ + if (nfa->cm->cd[con->co].flags & PSEUDO) + return INCOMPATIBLE; + /* otherwise, constraint constrains arc to be only its color */ + return REPLACEARC; + } return INCOMPATIBLE; break; case CA('^', '^'): /* collision, similar constraints */ case CA('$', '$'): - case CA(AHEAD, AHEAD): + if (con->co == a->co) /* true duplication */ + return SATISFIED; + return INCOMPATIBLE; + break; + case CA(AHEAD, AHEAD): /* collision, similar constraints */ case CA(BEHIND, BEHIND): if (con->co == a->co) /* true duplication */ return SATISFIED; + if (con->co == RAINBOW) + { + /* con is satisfied unless arc's color is a pseudocolor */ + if (!(nfa->cm->cd[a->co].flags & PSEUDO)) + return SATISFIED; + } + else if (a->co == RAINBOW) + { + /* con is incompatible if it's for a pseudocolor */ + if (nfa->cm->cd[con->co].flags & PSEUDO) + return INCOMPATIBLE; + /* otherwise, constraint constrains arc to be only its color */ + return REPLACEARC; + } return INCOMPATIBLE; break; case CA('^', BEHIND): /* collision, dissimilar constraints */ @@ -2895,6 +2949,7 @@ compact(struct nfa *nfa, break; case LACON: assert(s->no != cnfa->pre); + assert(a->co >= 0); ca->co = (color) (cnfa->ncolors + a->co); ca->to = a->to->no; ca++; @@ -3068,13 +3123,22 @@ dumparc(struct arc *a, switch (a->type) { case PLAIN: - fprintf(f, "[%ld]", (long) a->co); + if (a->co == RAINBOW) + fprintf(f, "[*]"); + else + fprintf(f, "[%ld]", (long) a->co); break; case AHEAD: - fprintf(f, ">%ld>", (long) a->co); + if (a->co == RAINBOW) + fprintf(f, ">*>"); + else + fprintf(f, ">%ld>", (long) a->co); break; case BEHIND: - fprintf(f, "<%ld<", (long) a->co); + if (a->co == RAINBOW) + fprintf(f, "<*<"); + else + fprintf(f, "<%ld<", (long) a->co); break; case LACON: fprintf(f, ":%ld:", (long) a->co); @@ -3161,7 +3225,9 @@ dumpcstate(int st, pos = 1; for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) { - if (ca->co < cnfa->ncolors) + if (ca->co == RAINBOW) + fprintf(f, "\t[*]->%d", ca->to); + else if (ca->co < cnfa->ncolors) fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to); else fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to); diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index cd0caaa2d03d1..ae8dbe58191cb 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -158,7 +158,8 @@ static int push(struct nfa *, struct arc *, struct state **); #define INCOMPATIBLE 1 /* destroys arc */ #define SATISFIED 2 /* constraint satisfied */ #define COMPATIBLE 3 /* compatible but not satisfied yet */ -static int combine(struct arc *, struct arc *); +#define REPLACEARC 4 /* replace arc's color with constraint color */ +static int combine(struct nfa *nfa, struct arc *con, struct arc *a); static void fixempties(struct nfa *, FILE *); static struct state *emptyreachable(struct nfa *, struct state *, struct state *, struct arc **); @@ -289,9 +290,11 @@ struct vars #define SBEGIN 'A' /* beginning of string (even if not BOL) */ #define SEND 'Z' /* end of string (even if not EOL) */ -/* is an arc colored, and hence on a color chain? */ +/* is an arc colored, and hence should belong to a color chain? */ +/* the test on "co" eliminates RAINBOW arcs, which we don't bother to chain */ #define COLORED(a) \ - ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND) + ((a)->co >= 0 && \ + ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)) /* static function list */ @@ -1393,7 +1396,8 @@ bracket(struct vars *v, * cbracket - handle complemented bracket expression * We do it by calling bracket() with dummy endpoints, and then complementing * the result. The alternative would be to invoke rainbow(), and then delete - * arcs as the b.e. is seen... but that gets messy. + * arcs as the b.e. is seen... but that gets messy, and is really quite + * infeasible now that rainbow() just puts out one RAINBOW arc. */ static void cbracket(struct vars *v, diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index 5695e158a50b8..32be2592c5653 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -612,6 +612,7 @@ miss(struct vars *v, unsigned h; struct carc *ca; struct sset *p; + int ispseudocolor; int ispost; int noprogress; int gotstate; @@ -643,13 +644,15 @@ miss(struct vars *v, */ for (i = 0; i < d->wordsper; i++) d->work[i] = 0; /* build new stateset bitmap in d->work */ + ispseudocolor = d->cm->cd[co].flags & PSEUDO; ispost = 0; noprogress = 1; gotstate = 0; for (i = 0; i < d->nstates; i++) if (ISBSET(css->states, i)) for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) - if (ca->co == co) + if (ca->co == co || + (ca->co == RAINBOW && !ispseudocolor)) { BSET(d->work, ca->to); gotstate = 1; diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c index d4f940b8c3493..a493dbe88c1a7 100644 --- a/src/backend/regex/regexport.c +++ b/src/backend/regex/regexport.c @@ -222,7 +222,8 @@ pg_reg_colorisend(const regex_t *regex, int co) * Get number of member chrs of color number "co". * * Note: we return -1 if the color number is invalid, or if it is a special - * color (WHITE or a pseudocolor), or if the number of members is uncertain. + * color (WHITE, RAINBOW, or a pseudocolor), or if the number of members is + * uncertain. * Callers should not try to extract the members if -1 is returned. */ int @@ -233,7 +234,7 @@ pg_reg_getnumcharacters(const regex_t *regex, int co) assert(regex != NULL && regex->re_magic == REMAGIC); cm = &((struct guts *) regex->re_guts)->cmap; - if (co <= 0 || co > cm->max) /* we reject 0 which is WHITE */ + if (co <= 0 || co > cm->max) /* <= 0 rejects WHITE and RAINBOW */ return -1; if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */ return -1; @@ -257,7 +258,7 @@ pg_reg_getnumcharacters(const regex_t *regex, int co) * whose length chars_len must be at least as long as indicated by * pg_reg_getnumcharacters(), else not all chars will be returned. * - * Fetching the members of WHITE or a pseudocolor is not supported. + * Fetching the members of WHITE, RAINBOW, or a pseudocolor is not supported. * * Caution: this is a relatively expensive operation. */ diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c index 1d4593ac945c9..e2fbad7a8a9b6 100644 --- a/src/backend/regex/regprefix.c +++ b/src/backend/regex/regprefix.c @@ -165,9 +165,13 @@ findprefix(struct cnfa *cnfa, /* We can ignore BOS/BOL arcs */ if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) continue; - /* ... but EOS/EOL arcs terminate the search, as do LACONs */ + + /* + * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs + * and LACONs + */ if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] || - ca->co >= cnfa->ncolors) + ca->co == RAINBOW || ca->co >= cnfa->ncolors) { thiscolor = COLORLESS; break; diff --git a/src/include/regex/regexport.h b/src/include/regex/regexport.h index e6209463f7f83..99c4fb854ec97 100644 --- a/src/include/regex/regexport.h +++ b/src/include/regex/regexport.h @@ -30,6 +30,10 @@ #include "regex/regex.h" +/* These macros must match corresponding ones in regguts.h: */ +#define COLOR_WHITE 0 /* color for chars not appearing in regex */ +#define COLOR_RAINBOW (-2) /* represents all colors except pseudocolors */ + /* information about one arc of a regex's NFA */ typedef struct { diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 0a616562d03b6..6d391083194c4 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -130,11 +130,16 @@ /* * As soon as possible, we map chrs into equivalence classes -- "colors" -- * which are of much more manageable number. + * + * To further reduce the number of arcs in NFAs and DFAs, we also have a + * special RAINBOW "color" that can be assigned to an arc. This is not a + * real color, in that it has no entry in color maps. */ typedef short color; /* colors of characters */ #define MAX_COLOR 32767 /* max color (must fit in 'color' datatype) */ #define COLORLESS (-1) /* impossible color */ +#define RAINBOW (-2) /* represents all colors except pseudocolors */ #define WHITE 0 /* default color, parent of all others */ /* Note: various places in the code know that WHITE is zero */ @@ -276,7 +281,7 @@ struct state; struct arc { int type; /* 0 if free, else an NFA arc type code */ - color co; + color co; /* color the arc matches (possibly RAINBOW) */ struct state *from; /* where it's from (and contained within) */ struct state *to; /* where it's to */ struct arc *outchain; /* link in *from's outs chain or free chain */ @@ -284,6 +289,7 @@ struct arc #define freechain outchain /* we do not maintain "freechainRev" */ struct arc *inchain; /* link in *to's ins chain */ struct arc *inchainRev; /* back-link in *to's ins chain */ + /* these fields are not used when co == RAINBOW: */ struct arc *colorchain; /* link in color's arc chain */ struct arc *colorchainRev; /* back-link in color's arc chain */ }; @@ -344,6 +350,9 @@ struct nfa * Plain arcs just store the transition color number as "co". LACON arcs * store the lookaround constraint number plus cnfa.ncolors as "co". LACON * arcs can be distinguished from plain by testing for co >= cnfa.ncolors. + * + * Note that in a plain arc, "co" can be RAINBOW; since that's negative, + * it doesn't break the rule about how to recognize LACON arcs. */ struct carc { From 824bf71902db4a2067b8d64583c9d88bb264c44b Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 20 Feb 2021 18:31:19 -0500 Subject: [PATCH 2/5] Recognize "match-all" NFAs within the regex engine. This builds on the previous "rainbow" patch to detect NFAs that will match any string, though possibly with constraints on the string length. This definition is chosen to match constructs such as ".*", ".+", and ".{1,100}". Recognizing such an NFA after the optimization pass is fairly cheap, since we basically just have to verify that all arcs are RAINBOW arcs and count the number of steps to the end state. (Well, there's a bit of complication with pseudo-color arcs for string boundary conditions, but not much.) Once we have these markings, the regex executor functions longest(), shortest(), and matchuntil() don't have to expend per-character work to determine whether a given substring satisfies such an NFA; they just need to check its length against the bounds. Since some matching problems require O(N) invocations of these functions, we've reduced the runtime for an N-character string from O(N^2) to O(N). Of course, this is no help for non-matchall sub-patterns, but those usually have constraints that allow us to avoid needing O(N) substring checks in the first place. It's precisely the unconstrained "match-all" cases that cause the most headaches. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us --- src/backend/regex/regc_nfa.c | 296 +++++++++++++++++++++++++++++++++- src/backend/regex/regcomp.c | 5 + src/backend/regex/rege_dfa.c | 57 +++++++ src/backend/regex/regprefix.c | 4 + src/include/regex/regguts.h | 15 ++ 5 files changed, 376 insertions(+), 1 deletion(-) diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index ff98bfd694e7d..1a6c9ce183816 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -65,6 +65,8 @@ newnfa(struct vars *v, nfa->v = v; nfa->bos[0] = nfa->bos[1] = COLORLESS; nfa->eos[0] = nfa->eos[1] = COLORLESS; + nfa->flags = 0; + nfa->minmatchall = nfa->maxmatchall = -1; nfa->parent = parent; /* Precedes newfstate so parent is valid. */ nfa->post = newfstate(nfa, '@'); /* number 0 */ nfa->pre = newfstate(nfa, '>'); /* number 1 */ @@ -2875,8 +2877,14 @@ analyze(struct nfa *nfa) if (NISERR()) return 0; + /* Detect whether NFA can't match anything */ if (nfa->pre->outs == NULL) return REG_UIMPOSSIBLE; + + /* Detect whether NFA matches all strings (possibly with length bounds) */ + checkmatchall(nfa); + + /* Detect whether NFA can possibly match a zero-length string */ for (a = nfa->pre->outs; a != NULL; a = a->outchain) for (aa = a->to->outs; aa != NULL; aa = aa->outchain) if (aa->to == nfa->post) @@ -2884,6 +2892,282 @@ analyze(struct nfa *nfa) return 0; } +/* + * checkmatchall - does the NFA represent no more than a string length test? + * + * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1 + * to begin with) and set the MATCHALL bit in nfa->flags. + * + * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those + * for pseudocolors (i.e., BOS/BOL/EOS/EOL). We must be able to reach the + * post state via RAINBOW arcs, and if there are any loops in the graph, they + * must be loop-to-self arcs, ensuring that each loop iteration consumes + * exactly one character. (Longer loops are problematic because they create + * non-consecutive possible match lengths; we have no good way to represent + * that situation for lengths beyond the DUPINF limit.) + * + * Pseudocolor arcs complicate things a little. We know that they can only + * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for + * EOS/EOL). There, they must exactly replicate the parallel RAINBOW arcs, + * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS + * and BOL outarcs to state 2, and no others. Missing or extra pseudocolor + * arcs can occur, meaning that the NFA involves some constraint on the + * adjacent characters, which makes it not a matchall NFA. + */ +static void +checkmatchall(struct nfa *nfa) +{ + bool hasmatch[DUPINF + 1]; + int minmatch, + maxmatch, + morematch; + + /* + * hasmatch[i] will be set true if a match of length i is feasible, for i + * from 0 to DUPINF-1. hasmatch[DUPINF] will be set true if every match + * length of DUPINF or more is feasible. + */ + memset(hasmatch, 0, sizeof(hasmatch)); + + /* + * Recursively search the graph for all-RAINBOW paths to the "post" state, + * starting at the "pre" state. The -1 initial depth accounts for the + * fact that transitions out of the "pre" state are not part of the + * matched string. We likewise don't count the final transition to the + * "post" state as part of the match length. (But we still insist that + * those transitions have RAINBOW arcs, otherwise there are lookbehind or + * lookahead constraints at the start/end of the pattern.) + */ + if (!checkmatchall_recurse(nfa, nfa->pre, false, -1, hasmatch)) + return; + + /* + * We found some all-RAINBOW paths, and not anything that we couldn't + * handle. Now verify that pseudocolor arcs adjacent to the pre and post + * states match the RAINBOW arcs there. (We could do this while + * recursing, but it's expensive and unlikely to fail, so do it last.) + */ + if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) || + !check_out_colors_match(nfa->pre, nfa->bos[0], RAINBOW) || + !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) || + !check_out_colors_match(nfa->pre, nfa->bos[1], RAINBOW)) + return; + if (!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) || + !check_in_colors_match(nfa->post, nfa->eos[0], RAINBOW) || + !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]) || + !check_in_colors_match(nfa->post, nfa->eos[1], RAINBOW)) + return; + + /* + * hasmatch[] now represents the set of possible match lengths; but we + * want to reduce that to a min and max value, because it doesn't seem + * worth complicating regexec.c to deal with nonconsecutive possible match + * lengths. Find min and max of first run of lengths, then verify there + * are no nonconsecutive lengths. + */ + for (minmatch = 0; minmatch <= DUPINF; minmatch++) + { + if (hasmatch[minmatch]) + break; + } + assert(minmatch <= DUPINF); /* else checkmatchall_recurse lied */ + for (maxmatch = minmatch; maxmatch < DUPINF; maxmatch++) + { + if (!hasmatch[maxmatch + 1]) + break; + } + for (morematch = maxmatch + 1; morematch <= DUPINF; morematch++) + { + if (hasmatch[morematch]) + return; /* fail, there are nonconsecutive lengths */ + } + + /* Success, so record the info */ + nfa->minmatchall = minmatch; + nfa->maxmatchall = maxmatch; + nfa->flags |= MATCHALL; +} + +/* + * checkmatchall_recurse - recursive search for checkmatchall + * + * s is the current state + * foundloop is true if any predecessor state has a loop-to-self + * depth is the current recursion depth (starting at -1) + * hasmatch[] is the output area for recording feasible match lengths + * + * We return true if there is at least one all-RAINBOW path to the "post" + * state and no non-matchall paths; otherwise false. Note we assume that + * any dead-end paths have already been removed, else we might return + * false unnecessarily. + */ +static bool +checkmatchall_recurse(struct nfa *nfa, struct state *s, + bool foundloop, int depth, + bool *hasmatch) +{ + bool result = false; + struct arc *a; + + /* + * Since this is recursive, it could be driven to stack overflow. But we + * need not treat that as a hard failure; just deem the NFA non-matchall. + */ + if (STACK_TOO_DEEP(nfa->v->re)) + return false; + + /* + * Likewise, if we get to a depth too large to represent correctly in + * maxmatchall, fail quietly. + */ + if (depth >= DUPINF) + return false; + + /* + * Scan the outarcs to detect cases we can't handle, and to see if there + * is a loop-to-self here. We need to know about any such loop before we + * recurse, so it's hard to avoid making two passes over the outarcs. In + * any case, checking for showstoppers before we recurse is probably best. + */ + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->type != PLAIN) + return false; /* any LACONs make it non-matchall */ + if (a->co != RAINBOW) + { + if (nfa->cm->cd[a->co].flags & PSEUDO) + { + /* + * Pseudocolor arc: verify it's in a valid place (this seems + * quite unlikely to fail, but let's be sure). + */ + if (s == nfa->pre && + (a->co == nfa->bos[0] || a->co == nfa->bos[1])) + /* okay BOS/BOL arc */ ; + else if (a->to == nfa->post && + (a->co == nfa->eos[0] || a->co == nfa->eos[1])) + /* okay EOS/EOL arc */ ; + else + return false; /* unexpected pseudocolor arc */ + /* We'll finish checking these arcs after the recursion */ + continue; + } + return false; /* any other color makes it non-matchall */ + } + if (a->to == s) + { + /* + * We found a cycle of length 1, so remember that to pass down to + * successor states. (It doesn't matter if there was also such a + * loop at a predecessor state.) + */ + foundloop = true; + } + else if (a->to->tmp) + { + /* We found a cycle of length > 1, so fail. */ + return false; + } + } + + /* We need to recurse, so mark state as under consideration */ + assert(s->tmp == NULL); + s->tmp = s; + + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->co != RAINBOW) + continue; /* ignore pseudocolor arcs */ + if (a->to == nfa->post) + { + /* We found an all-RAINBOW path to the post state */ + result = true; + /* Record potential match lengths */ + assert(depth >= 0); + hasmatch[depth] = true; + if (foundloop) + { + /* A predecessor loop makes all larger lengths match, too */ + int i; + + for (i = depth + 1; i <= DUPINF; i++) + hasmatch[i] = true; + } + } + else if (a->to != s) + { + /* This is a new path forward; recurse to investigate */ + result = checkmatchall_recurse(nfa, a->to, + foundloop, depth + 1, + hasmatch); + /* Fail if any recursive path fails */ + if (!result) + break; + } + } + + s->tmp = NULL; + return result; +} + +/* + * check_out_colors_match - subroutine for checkmatchall + * + * Check if every s outarc of color co1 has a matching outarc of color co2. + * (checkmatchall_recurse already verified that all of the outarcs are PLAIN, + * so we need not examine arc types here. Also, since it verified that there + * are only RAINBOW and pseudocolor arcs, there shouldn't be enough arcs for + * this brute-force O(N^2) implementation to cause problems.) + */ +static bool +check_out_colors_match(struct state *s, color co1, color co2) +{ + struct arc *a1; + struct arc *a2; + + for (a1 = s->outs; a1 != NULL; a1 = a1->outchain) + { + if (a1->co != co1) + continue; + for (a2 = s->outs; a2 != NULL; a2 = a2->outchain) + { + if (a2->co == co2 && a2->to == a1->to) + break; + } + if (a2 == NULL) + return false; + } + return true; +} + +/* + * check_in_colors_match - subroutine for checkmatchall + * + * Check if every s inarc of color co1 has a matching inarc of color co2. + * (For paranoia's sake, ignore any non-PLAIN arcs here. But we're still + * not expecting very many arcs.) + */ +static bool +check_in_colors_match(struct state *s, color co1, color co2) +{ + struct arc *a1; + struct arc *a2; + + for (a1 = s->ins; a1 != NULL; a1 = a1->inchain) + { + if (a1->type != PLAIN || a1->co != co1) + continue; + for (a2 = s->ins; a2 != NULL; a2 = a2->inchain) + { + if (a2->type == PLAIN && a2->co == co2 && a2->from == a1->from) + break; + } + if (a2 == NULL) + return false; + } + return true; +} + /* * compact - construct the compact representation of an NFA */ @@ -2930,7 +3214,9 @@ compact(struct nfa *nfa, cnfa->eos[0] = nfa->eos[0]; cnfa->eos[1] = nfa->eos[1]; cnfa->ncolors = maxcolor(nfa->cm) + 1; - cnfa->flags = 0; + cnfa->flags = nfa->flags; + cnfa->minmatchall = nfa->minmatchall; + cnfa->maxmatchall = nfa->maxmatchall; ca = cnfa->arcs; for (s = nfa->states; s != NULL; s = s->next) @@ -3034,6 +3320,11 @@ dumpnfa(struct nfa *nfa, fprintf(f, ", eos [%ld]", (long) nfa->eos[0]); if (nfa->eos[1] != COLORLESS) fprintf(f, ", eol [%ld]", (long) nfa->eos[1]); + if (nfa->flags & HASLACONS) + fprintf(f, ", haslacons"); + if (nfa->flags & MATCHALL) + fprintf(f, ", minmatchall %d, maxmatchall %d", + nfa->minmatchall, nfa->maxmatchall); fprintf(f, "\n"); for (s = nfa->states; s != NULL; s = s->next) { @@ -3201,6 +3492,9 @@ dumpcnfa(struct cnfa *cnfa, fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]); if (cnfa->flags & HASLACONS) fprintf(f, ", haslacons"); + if (cnfa->flags & MATCHALL) + fprintf(f, ", minmatchall %d, maxmatchall %d", + cnfa->minmatchall, cnfa->maxmatchall); fprintf(f, "\n"); for (st = 0; st < cnfa->nstates; st++) dumpcstate(st, cnfa, f); diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index ae8dbe58191cb..39e837adc2af1 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -175,6 +175,11 @@ static void cleanup(struct nfa *); static void markreachable(struct nfa *, struct state *, struct state *, struct state *); static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); static long analyze(struct nfa *); +static void checkmatchall(struct nfa *); +static bool checkmatchall_recurse(struct nfa *, struct state *, + bool, int, bool *); +static bool check_out_colors_match(struct state *, color, color); +static bool check_in_colors_match(struct state *, color, color); static void compact(struct nfa *, struct cnfa *); static void carcsort(struct carc *, size_t); static int carc_cmp(const void *, const void *); diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index 32be2592c5653..89d162ed6afd6 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -58,6 +58,29 @@ longest(struct vars *v, if (hitstopp != NULL) *hitstopp = 0; + /* fast path for matchall NFAs */ + if (d->cnfa->flags & MATCHALL) + { + size_t nchr = stop - start; + size_t maxmatchall = d->cnfa->maxmatchall; + + if (nchr < d->cnfa->minmatchall) + return NULL; + if (maxmatchall == DUPINF) + { + if (stop == v->stop && hitstopp != NULL) + *hitstopp = 1; + } + else + { + if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL) + *hitstopp = 1; + if (nchr > maxmatchall) + return start + maxmatchall; + } + return stop; + } + /* initialize */ css = initialize(v, d, start); if (css == NULL) @@ -187,6 +210,24 @@ shortest(struct vars *v, if (hitstopp != NULL) *hitstopp = 0; + /* fast path for matchall NFAs */ + if (d->cnfa->flags & MATCHALL) + { + size_t nchr = min - start; + + if (d->cnfa->maxmatchall != DUPINF && + nchr > d->cnfa->maxmatchall) + return NULL; + if ((max - start) < d->cnfa->minmatchall) + return NULL; + if (nchr < d->cnfa->minmatchall) + min = start + d->cnfa->minmatchall; + if (coldp != NULL) + *coldp = start; + /* there is no case where we should set *hitstopp */ + return min; + } + /* initialize */ css = initialize(v, d, start); if (css == NULL) @@ -312,6 +353,22 @@ matchuntil(struct vars *v, struct sset *ss; struct colormap *cm = d->cm; + /* fast path for matchall NFAs */ + if (d->cnfa->flags & MATCHALL) + { + size_t nchr = probe - v->start; + + /* + * It might seem that we should check maxmatchall too, but the .* at + * the front of the pattern absorbs any extra characters (and it was + * tacked on *after* computing minmatchall/maxmatchall). Thus, we + * should match if there are at least minmatchall characters. + */ + if (nchr < d->cnfa->minmatchall) + return 0; + return 1; + } + /* initialize and startup, or restart, if necessary */ if (cp == NULL || cp > probe) { diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c index e2fbad7a8a9b6..ec435b6f5f64f 100644 --- a/src/backend/regex/regprefix.c +++ b/src/backend/regex/regprefix.c @@ -77,6 +77,10 @@ pg_regprefix(regex_t *re, assert(g->tree != NULL); cnfa = &g->tree->cnfa; + /* matchall NFAs never have a fixed prefix */ + if (cnfa->flags & MATCHALL) + return REG_NOMATCH; + /* * Since a correct NFA should never contain any exit-free loops, it should * not be possible for our traversal to return to a previously visited NFA diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 6d391083194c4..82e761bfe5704 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -331,6 +331,9 @@ struct nfa struct colormap *cm; /* the color map */ color bos[2]; /* colors, if any, assigned to BOS and BOL */ color eos[2]; /* colors, if any, assigned to EOS and EOL */ + int flags; /* flags to pass forward to cNFA */ + int minmatchall; /* min number of chrs to match, if matchall */ + int maxmatchall; /* max number of chrs to match, or DUPINF */ struct vars *v; /* simplifies compile error reporting */ struct nfa *parent; /* parent NFA, if any */ }; @@ -353,6 +356,14 @@ struct nfa * * Note that in a plain arc, "co" can be RAINBOW; since that's negative, * it doesn't break the rule about how to recognize LACON arcs. + * + * We have special markings for "trivial" NFAs that can match any string + * (possibly with limits on the number of characters therein). In such a + * case, flags & MATCHALL is set (and HASLACONS can't be set). Then the + * fields minmatchall and maxmatchall give the minimum and maximum numbers + * of characters to match. For example, ".*" produces minmatchall = 0 + * and maxmatchall = DUPINF, while ".+" produces minmatchall = 1 and + * maxmatchall = DUPINF. */ struct carc { @@ -366,6 +377,7 @@ struct cnfa int ncolors; /* number of colors (max color in use + 1) */ int flags; #define HASLACONS 01 /* uses lookaround constraints */ +#define MATCHALL 02 /* matches all strings of a range of lengths */ int pre; /* setup state number */ int post; /* teardown state number */ color bos[2]; /* colors, if any, assigned to BOS and BOL */ @@ -375,6 +387,9 @@ struct cnfa struct carc **states; /* vector of pointers to outarc lists */ /* states[n] are pointers into a single malloc'd array of arcs */ struct carc *arcs; /* the area for the lists */ + /* these fields are used only in a MATCHALL NFA (else they're -1): */ + int minmatchall; /* min number of chrs to match */ + int maxmatchall; /* max number of chrs to match, or DUPINF */ }; /* From cebc1d34e5207c37871708f91be65dd839760b5f Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 20 Feb 2021 18:45:29 -0500 Subject: [PATCH 3/5] Fix regex engine to suppress useless concatenation sub-REs. The comment for parsebranch() claims that it avoids generating unnecessary concatenation nodes in the "subre" tree, but it missed some significant cases. Once we've decided that a given atom is "messy" and can't be bundled with the preceding atom(s) of the current regex branch, parseqatom() always generated two new concat nodes, one to concat the messy atom to what follows it in the branch, and an upper node to concatenate the preceding part of the branch to that one. But one or both of these could be unnecessary, if the messy atom is the first, last, or only one in the branch. Improve the code to suppress such useless concat nodes, along with the no-op child nodes representing empty chunks of a branch. Reducing the number of subre tree nodes offers significant savings not only at execution but during compilation, because each subre node has its own NFA that has to be separately optimized. (Maybe someday we'll figure out how to share the optimization work across multiple tree nodes, but it doesn't look easy.) Eliminating upper tree nodes is especially useful because they tend to have larger NFAs. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us --- src/backend/regex/regcomp.c | 83 +++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 39e837adc2af1..0182e02db1b1a 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -1123,14 +1123,24 @@ parseqatom(struct vars *v, t->left = atom; atomp = &t->left; - /* here we should recurse... but we must postpone that to the end */ + /* + * Here we should recurse to fill t->right ... but we must postpone that + * to the end. + */ - /* split top into prefix and remaining */ + /* + * Convert top node to a concatenation of the prefix (top->left, covering + * whatever we parsed previously) and remaining (t). Note that the prefix + * could be empty, in which case this concatenation node is unnecessary. + * To keep things simple, we operate in a general way for now, and get rid + * of unnecessary subres below. + */ assert(top->op == '=' && top->left == NULL && top->right == NULL); top->left = subre(v, '=', top->flags, top->begin, lp); NOERR(); top->op = '.'; top->right = t; + /* top->flags will get updated later */ /* if it's a backref, now is the time to replicate the subNFA */ if (atomtype == BACKREF) @@ -1220,16 +1230,75 @@ parseqatom(struct vars *v, /* and finally, look after that postponed recursion */ t = top->right; if (!(SEE('|') || SEE(stopper) || SEE(EOS))) + { + /* parse all the rest of the branch, and insert in t->right */ t->right = parsebranch(v, stopper, type, s2, rp, 1); + NOERR(); + assert(SEE('|') || SEE(stopper) || SEE(EOS)); + + /* here's the promised update of the flags */ + t->flags |= COMBINE(t->flags, t->right->flags); + top->flags |= COMBINE(top->flags, t->flags); + + /* + * At this point both top and t are concatenation (op == '.') subres, + * and we have top->left = prefix of branch, top->right = t, t->left = + * messy atom (with quantification superstructure if needed), t->right + * = rest of branch. + * + * If the messy atom was the first thing in the branch, then top->left + * is vacuous and we can get rid of one level of concatenation. Since + * the caller is holding a pointer to the top node, we can't remove + * that node; but we're allowed to change its properties. + */ + assert(top->left->op == '='); + if (top->left->begin == top->left->end) + { + assert(!MESSY(top->left->flags)); + freesubre(v, top->left); + top->left = t->left; + top->right = t->right; + t->left = t->right = NULL; + freesubre(v, t); + } + } else { + /* + * There's nothing left in the branch, so we don't need the second + * concatenation node 't'. Just link s2 straight to rp. + */ EMPTYARC(s2, rp); - t->right = subre(v, '=', 0, s2, rp); + top->right = t->left; + top->flags |= COMBINE(top->flags, top->right->flags); + t->left = t->right = NULL; + freesubre(v, t); + + /* + * Again, it could be that top->left is vacuous (if the messy atom was + * in fact the only thing in the branch). In that case we need no + * concatenation at all; just replace top with top->right. + */ + assert(top->left->op == '='); + if (top->left->begin == top->left->end) + { + assert(!MESSY(top->left->flags)); + freesubre(v, top->left); + t = top->right; + top->op = t->op; + top->flags = t->flags; + top->id = t->id; + top->subno = t->subno; + top->min = t->min; + top->max = t->max; + top->left = t->left; + top->right = t->right; + top->begin = t->begin; + top->end = t->end; + t->left = t->right = NULL; + freesubre(v, t); + } } - NOERR(); - assert(SEE('|') || SEE(stopper) || SEE(EOS)); - t->flags |= COMBINE(t->flags, t->right->flags); - top->flags |= COMBINE(top->flags, t->flags); } /* From 581043089472816061a7fd381f40572191dfa48f Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 20 Feb 2021 19:07:45 -0500 Subject: [PATCH 4/5] Convert regex engine's subre tree from binary to N-ary style. Instead of having left and right child links in subre structs, have a single child link plus a sibling link. Multiple children of a tree node are now reached by chasing the sibling chain. The beneficiary of this is alternation tree nodes. A regular expression with N (>1) branches is now represented by one alternation node with N children, rather than a tree that includes N alternation nodes as well as N children. While the old representation didn't really cost anything extra at execution time, it was pretty horrid for compilation purposes, because each of the alternation nodes had its own NFA, which we were too stupid not to separately optimize. (To make matters worse, all of those NFAs described the entire alternation pattern, not just the portion of it that one might expect from the tree structure.) We continue to require concatenation nodes to have exactly two children. This data structure is now prepared to support more, but the executor's logic would need some careful redesign, and it's not clear that a lot of benefit could be had. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us --- src/backend/regex/README | 4 +- src/backend/regex/regcomp.c | 222 +++++++++++++++++++----------------- src/backend/regex/regexec.c | 104 +++++++++-------- src/include/regex/regguts.h | 20 ++-- 4 files changed, 185 insertions(+), 165 deletions(-) diff --git a/src/backend/regex/README b/src/backend/regex/README index a83ab5074dfc3..cafeb3dffb096 100644 --- a/src/backend/regex/README +++ b/src/backend/regex/README @@ -129,9 +129,9 @@ If not, we can reject the match immediately without iterating through many possibilities. As an example, consider the regex "(a[bc]+)\1". The compiled -representation will have a top-level concatenation subre node. Its left +representation will have a top-level concatenation subre node. Its first child is a capture node, and the child of that is a plain DFA node for -"a[bc]+". The concatenation's right child is a backref node for \1. +"a[bc]+". The concatenation's second child is a backref node for \1. The DFA associated with the concatenation node will be "a[bc]+a[bc]+", where the backref has been replaced by a copy of the DFA for its referent expression. When executed, the concatenation node will have to search for diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 0182e02db1b1a..c0680b280bc01 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -58,6 +58,7 @@ static void processlacon(struct vars *, struct state *, struct state *, int, struct state *, struct state *); static struct subre *subre(struct vars *, int, int, struct state *, struct state *); static void freesubre(struct vars *, struct subre *); +static void freesubreandsiblings(struct vars *, struct subre *); static void freesrnode(struct vars *, struct subre *); static void optst(struct vars *, struct subre *); static int numst(struct subre *, int); @@ -652,8 +653,8 @@ makesearch(struct vars *v, * parse - parse an RE * * This is actually just the top level, which parses a bunch of branches - * tied together with '|'. They appear in the tree as the left children - * of a chain of '|' subres. + * tied together with '|'. If there's more than one, they appear in the + * tree as the children of a '|' subre. */ static struct subre * parse(struct vars *v, @@ -662,41 +663,34 @@ parse(struct vars *v, struct state *init, /* initial state */ struct state *final) /* final state */ { - struct state *left; /* scaffolding for branch */ - struct state *right; struct subre *branches; /* top level */ - struct subre *branch; /* current branch */ - struct subre *t; /* temporary */ - int firstbranch; /* is this the first branch? */ + struct subre *lastbranch; /* latest branch */ assert(stopper == ')' || stopper == EOS); branches = subre(v, '|', LONGER, init, final); NOERRN(); - branch = branches; - firstbranch = 1; + lastbranch = NULL; do { /* a branch */ - if (!firstbranch) - { - /* need a place to hang it */ - branch->right = subre(v, '|', LONGER, init, final); - NOERRN(); - branch = branch->right; - } - firstbranch = 0; + struct subre *branch; + struct state *left; /* scaffolding for branch */ + struct state *right; + left = newstate(v->nfa); right = newstate(v->nfa); NOERRN(); EMPTYARC(init, left); EMPTYARC(right, final); NOERRN(); - branch->left = parsebranch(v, stopper, type, left, right, 0); + branch = parsebranch(v, stopper, type, left, right, 0); NOERRN(); - branch->flags |= UP(branch->flags | branch->left->flags); - if ((branch->flags & ~branches->flags) != 0) /* new flags */ - for (t = branches; t != branch; t = t->right) - t->flags |= branch->flags; + if (lastbranch) + lastbranch->sibling = branch; + else + branches->child = branch; + branches->flags |= UP(branches->flags | branch->flags); + lastbranch = branch; } while (EAT('|')); assert(SEE(stopper) || SEE(EOS)); @@ -707,20 +701,16 @@ parse(struct vars *v, } /* optimize out simple cases */ - if (branch == branches) + if (lastbranch == branches->child) { /* only one branch */ - assert(branch->right == NULL); - t = branch->left; - branch->left = NULL; - freesubre(v, branches); - branches = t; + assert(lastbranch->sibling == NULL); + freesrnode(v, branches); + branches = lastbranch; } else if (!MESSY(branches->flags)) { /* no interesting innards */ - freesubre(v, branches->left); - branches->left = NULL; - freesubre(v, branches->right); - branches->right = NULL; + freesubreandsiblings(v, branches->child); + branches->child = NULL; branches->op = '='; } @@ -972,7 +962,7 @@ parseqatom(struct vars *v, t = subre(v, '(', atom->flags | CAP, lp, rp); NOERR(); t->subno = subno; - t->left = atom; + t->child = atom; atom = t; } /* postpone everything else pending possible {0} */ @@ -1120,26 +1110,27 @@ parseqatom(struct vars *v, /* break remaining subRE into x{...} and what follows */ t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp); NOERR(); - t->left = atom; - atomp = &t->left; + t->child = atom; + atomp = &t->child; /* - * Here we should recurse to fill t->right ... but we must postpone that - * to the end. + * Here we should recurse to fill t->child->sibling ... but we must + * postpone that to the end. One reason is that t->child may be replaced + * below, and we don't want to worry about its sibling link. */ /* - * Convert top node to a concatenation of the prefix (top->left, covering + * Convert top node to a concatenation of the prefix (top->child, covering * whatever we parsed previously) and remaining (t). Note that the prefix * could be empty, in which case this concatenation node is unnecessary. * To keep things simple, we operate in a general way for now, and get rid * of unnecessary subres below. */ - assert(top->op == '=' && top->left == NULL && top->right == NULL); - top->left = subre(v, '=', top->flags, top->begin, lp); + assert(top->op == '=' && top->child == NULL); + top->child = subre(v, '=', top->flags, top->begin, lp); NOERR(); top->op = '.'; - top->right = t; + top->child->sibling = t; /* top->flags will get updated later */ /* if it's a backref, now is the time to replicate the subNFA */ @@ -1201,9 +1192,9 @@ parseqatom(struct vars *v, f = COMBINE(qprefer, atom->flags); t = subre(v, '.', f, s, atom->end); /* prefix and atom */ NOERR(); - t->left = subre(v, '=', PREF(f), s, atom->begin); + t->child = subre(v, '=', PREF(f), s, atom->begin); NOERR(); - t->right = atom; + t->child->sibling = atom; *atomp = t; /* rest of branch can be strung starting from atom->end */ s2 = atom->end; @@ -1222,44 +1213,43 @@ parseqatom(struct vars *v, NOERR(); t->min = (short) m; t->max = (short) n; - t->left = atom; + t->child = atom; *atomp = t; /* rest of branch is to be strung from iteration's end state */ } /* and finally, look after that postponed recursion */ - t = top->right; + t = top->child->sibling; if (!(SEE('|') || SEE(stopper) || SEE(EOS))) { - /* parse all the rest of the branch, and insert in t->right */ - t->right = parsebranch(v, stopper, type, s2, rp, 1); + /* parse all the rest of the branch, and insert in t->child->sibling */ + t->child->sibling = parsebranch(v, stopper, type, s2, rp, 1); NOERR(); assert(SEE('|') || SEE(stopper) || SEE(EOS)); /* here's the promised update of the flags */ - t->flags |= COMBINE(t->flags, t->right->flags); + t->flags |= COMBINE(t->flags, t->child->sibling->flags); top->flags |= COMBINE(top->flags, t->flags); /* * At this point both top and t are concatenation (op == '.') subres, - * and we have top->left = prefix of branch, top->right = t, t->left = - * messy atom (with quantification superstructure if needed), t->right - * = rest of branch. + * and we have top->child = prefix of branch, top->child->sibling = t, + * t->child = messy atom (with quantification superstructure if + * needed), t->child->sibling = rest of branch. * - * If the messy atom was the first thing in the branch, then top->left - * is vacuous and we can get rid of one level of concatenation. Since - * the caller is holding a pointer to the top node, we can't remove - * that node; but we're allowed to change its properties. + * If the messy atom was the first thing in the branch, then + * top->child is vacuous and we can get rid of one level of + * concatenation. Since the caller is holding a pointer to the top + * node, we can't remove that node; but we're allowed to change its + * properties. */ - assert(top->left->op == '='); - if (top->left->begin == top->left->end) + assert(top->child->op == '='); + if (top->child->begin == top->child->end) { - assert(!MESSY(top->left->flags)); - freesubre(v, top->left); - top->left = t->left; - top->right = t->right; - t->left = t->right = NULL; - freesubre(v, t); + assert(!MESSY(top->child->flags)); + freesubre(v, top->child); + top->child = t->child; + freesrnode(v, t); } } else @@ -1269,34 +1259,31 @@ parseqatom(struct vars *v, * concatenation node 't'. Just link s2 straight to rp. */ EMPTYARC(s2, rp); - top->right = t->left; - top->flags |= COMBINE(top->flags, top->right->flags); - t->left = t->right = NULL; - freesubre(v, t); + top->child->sibling = t->child; + top->flags |= COMBINE(top->flags, top->child->sibling->flags); + freesrnode(v, t); /* - * Again, it could be that top->left is vacuous (if the messy atom was - * in fact the only thing in the branch). In that case we need no - * concatenation at all; just replace top with top->right. + * Again, it could be that top->child is vacuous (if the messy atom + * was in fact the only thing in the branch). In that case we need no + * concatenation at all; just replace top with top->child->sibling. */ - assert(top->left->op == '='); - if (top->left->begin == top->left->end) + assert(top->child->op == '='); + if (top->child->begin == top->child->end) { - assert(!MESSY(top->left->flags)); - freesubre(v, top->left); - t = top->right; + assert(!MESSY(top->child->flags)); + t = top->child->sibling; + freesubre(v, top->child); top->op = t->op; top->flags = t->flags; top->id = t->id; top->subno = t->subno; top->min = t->min; top->max = t->max; - top->left = t->left; - top->right = t->right; + top->child = t->child; top->begin = t->begin; top->end = t->end; - t->left = t->right = NULL; - freesubre(v, t); + freesrnode(v, t); } } } @@ -1786,7 +1773,7 @@ subre(struct vars *v, } if (ret != NULL) - v->treefree = ret->left; + v->treefree = ret->child; else { ret = (struct subre *) MALLOC(sizeof(struct subre)); @@ -1806,8 +1793,8 @@ subre(struct vars *v, ret->id = 0; /* will be assigned later */ ret->subno = 0; ret->min = ret->max = 1; - ret->left = NULL; - ret->right = NULL; + ret->child = NULL; + ret->sibling = NULL; ret->begin = begin; ret->end = end; ZAPCNFA(ret->cnfa); @@ -1817,6 +1804,9 @@ subre(struct vars *v, /* * freesubre - free a subRE subtree + * + * This frees child node(s) of the given subRE too, + * but not its siblings. */ static void freesubre(struct vars *v, /* might be NULL */ @@ -1825,14 +1815,31 @@ freesubre(struct vars *v, /* might be NULL */ if (sr == NULL) return; - if (sr->left != NULL) - freesubre(v, sr->left); - if (sr->right != NULL) - freesubre(v, sr->right); + if (sr->child != NULL) + freesubreandsiblings(v, sr->child); freesrnode(v, sr); } +/* + * freesubreandsiblings - free a subRE subtree + * + * This frees child node(s) of the given subRE too, + * as well as any following siblings. + */ +static void +freesubreandsiblings(struct vars *v, /* might be NULL */ + struct subre *sr) +{ + while (sr != NULL) + { + struct subre *next = sr->sibling; + + freesubre(v, sr); + sr = next; + } +} + /* * freesrnode - free one node in a subRE subtree */ @@ -1850,7 +1857,7 @@ freesrnode(struct vars *v, /* might be NULL */ if (v != NULL && v->treechain != NULL) { /* we're still parsing, maybe we can reuse the subre */ - sr->left = v->treefree; + sr->child = v->treefree; v->treefree = sr; } else @@ -1881,15 +1888,14 @@ numst(struct subre *t, int start) /* starting point for subtree numbers */ { int i; + struct subre *t2; assert(t != NULL); i = start; t->id = (short) i++; - if (t->left != NULL) - i = numst(t->left, i); - if (t->right != NULL) - i = numst(t->right, i); + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + i = numst(t2, i); return i; } @@ -1913,13 +1919,13 @@ numst(struct subre *t, static void markst(struct subre *t) { + struct subre *t2; + assert(t != NULL); t->flags |= INUSE; - if (t->left != NULL) - markst(t->left); - if (t->right != NULL) - markst(t->right); + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + markst(t2); } /* @@ -1949,12 +1955,12 @@ nfatree(struct vars *v, struct subre *t, FILE *f) /* for debug output */ { + struct subre *t2; + assert(t != NULL && t->begin != NULL); - if (t->left != NULL) - (DISCARD) nfatree(v, t->left, f); - if (t->right != NULL) - (DISCARD) nfatree(v, t->right, f); + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + (DISCARD) nfatree(v, t2, f); return nfanode(v, t, 0, f); } @@ -2206,6 +2212,7 @@ stdump(struct subre *t, int nfapresent) /* is the original NFA still around? */ { char idbuf[50]; + struct subre *t2; fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op); if (t->flags & LONGER) @@ -2231,20 +2238,21 @@ stdump(struct subre *t, } if (nfapresent) fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no); - if (t->left != NULL) - fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf))); - if (t->right != NULL) - fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf))); + if (t->child != NULL) + fprintf(f, " C:%s", stid(t->child, idbuf, sizeof(idbuf))); + /* printing second child isn't necessary, but it is often helpful */ + if (t->child != NULL && t->child->sibling != NULL) + fprintf(f, " C2:%s", stid(t->child->sibling, idbuf, sizeof(idbuf))); + if (t->sibling != NULL) + fprintf(f, " S:%s", stid(t->sibling, idbuf, sizeof(idbuf))); if (!NULLCNFA(t->cnfa)) { fprintf(f, "\n"); dumpcnfa(&t->cnfa, f); } fprintf(f, "\n"); - if (t->left != NULL) - stdump(t->left, f, nfapresent); - if (t->right != NULL) - stdump(t->right, f, nfapresent); + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + stdump(t2, f, nfapresent); } /* diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index adcbcc0a8ea25..4541cf9a7e085 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -640,6 +640,8 @@ static void zaptreesubs(struct vars *v, struct subre *t) { + struct subre *t2; + if (t->op == '(') { int n = t->subno; @@ -652,10 +654,8 @@ zaptreesubs(struct vars *v, } } - if (t->left != NULL) - zaptreesubs(v, t->left); - if (t->right != NULL) - zaptreesubs(v, t->right); + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + zaptreesubs(v, t2); } /* @@ -714,35 +714,35 @@ cdissect(struct vars *v, switch (t->op) { case '=': /* terminal node */ - assert(t->left == NULL && t->right == NULL); + assert(t->child == NULL); er = REG_OKAY; /* no action, parent did the work */ break; case 'b': /* back reference */ - assert(t->left == NULL && t->right == NULL); + assert(t->child == NULL); er = cbrdissect(v, t, begin, end); break; case '.': /* concatenation */ - assert(t->left != NULL && t->right != NULL); - if (t->left->flags & SHORTER) /* reverse scan */ + assert(t->child != NULL); + if (t->child->flags & SHORTER) /* reverse scan */ er = crevcondissect(v, t, begin, end); else er = ccondissect(v, t, begin, end); break; case '|': /* alternation */ - assert(t->left != NULL); + assert(t->child != NULL); er = caltdissect(v, t, begin, end); break; case '*': /* iteration */ - assert(t->left != NULL); - if (t->left->flags & SHORTER) /* reverse scan */ + assert(t->child != NULL); + if (t->child->flags & SHORTER) /* reverse scan */ er = creviterdissect(v, t, begin, end); else er = citerdissect(v, t, begin, end); break; case '(': /* capturing */ - assert(t->left != NULL && t->right == NULL); + assert(t->child != NULL); assert(t->subno > 0); - er = cdissect(v, t->left, begin, end); + er = cdissect(v, t->child, begin, end); if (er == REG_OKAY) subset(v, t, begin, end); break; @@ -770,19 +770,22 @@ ccondissect(struct vars *v, chr *begin, /* beginning of relevant substring */ chr *end) /* end of same */ { + struct subre *left = t->child; + struct subre *right = left->sibling; struct dfa *d; struct dfa *d2; chr *mid; int er; assert(t->op == '.'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(t->right != NULL && t->right->cnfa.nstates > 0); - assert(!(t->left->flags & SHORTER)); + assert(left != NULL && left->cnfa.nstates > 0); + assert(right != NULL && right->cnfa.nstates > 0); + assert(right->sibling == NULL); + assert(!(left->flags & SHORTER)); - d = getsubdfa(v, t->left); + d = getsubdfa(v, left); NOERR(); - d2 = getsubdfa(v, t->right); + d2 = getsubdfa(v, right); NOERR(); MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); @@ -799,10 +802,10 @@ ccondissect(struct vars *v, /* try this midpoint on for size */ if (longest(v, d2, mid, end, (int *) NULL) == end) { - er = cdissect(v, t->left, begin, mid); + er = cdissect(v, left, begin, mid); if (er == REG_OKAY) { - er = cdissect(v, t->right, mid, end); + er = cdissect(v, right, mid, end); if (er == REG_OKAY) { /* satisfaction */ @@ -831,8 +834,8 @@ ccondissect(struct vars *v, return REG_NOMATCH; } MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); - zaptreesubs(v, t->left); - zaptreesubs(v, t->right); + zaptreesubs(v, left); + zaptreesubs(v, right); } /* can't get here */ @@ -848,19 +851,22 @@ crevcondissect(struct vars *v, chr *begin, /* beginning of relevant substring */ chr *end) /* end of same */ { + struct subre *left = t->child; + struct subre *right = left->sibling; struct dfa *d; struct dfa *d2; chr *mid; int er; assert(t->op == '.'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(t->right != NULL && t->right->cnfa.nstates > 0); - assert(t->left->flags & SHORTER); + assert(left != NULL && left->cnfa.nstates > 0); + assert(right != NULL && right->cnfa.nstates > 0); + assert(right->sibling == NULL); + assert(left->flags & SHORTER); - d = getsubdfa(v, t->left); + d = getsubdfa(v, left); NOERR(); - d2 = getsubdfa(v, t->right); + d2 = getsubdfa(v, right); NOERR(); MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); @@ -877,10 +883,10 @@ crevcondissect(struct vars *v, /* try this midpoint on for size */ if (longest(v, d2, mid, end, (int *) NULL) == end) { - er = cdissect(v, t->left, begin, mid); + er = cdissect(v, left, begin, mid); if (er == REG_OKAY) { - er = cdissect(v, t->right, mid, end); + er = cdissect(v, right, mid, end); if (er == REG_OKAY) { /* satisfaction */ @@ -909,8 +915,8 @@ crevcondissect(struct vars *v, return REG_NOMATCH; } MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); - zaptreesubs(v, t->left); - zaptreesubs(v, t->right); + zaptreesubs(v, left); + zaptreesubs(v, right); } /* can't get here */ @@ -1011,26 +1017,30 @@ caltdissect(struct vars *v, struct dfa *d; int er; - /* We loop, rather than tail-recurse, to handle a chain of alternatives */ + assert(t->op == '|'); + + t = t->child; + /* there should be at least 2 alternatives */ + assert(t != NULL && t->sibling != NULL); + while (t != NULL) { - assert(t->op == '|'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->cnfa.nstates > 0); MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); - d = getsubdfa(v, t->left); + d = getsubdfa(v, t); NOERR(); if (longest(v, d, begin, end, (int *) NULL) == end) { MDEBUG(("%d: caltdissect matched\n", t->id)); - er = cdissect(v, t->left, begin, end); + er = cdissect(v, t, begin, end); if (er != REG_NOMATCH) return er; } NOERR(); - t = t->right; + t = t->sibling; } return REG_NOMATCH; @@ -1056,8 +1066,8 @@ citerdissect(struct vars *v, int er; assert(t->op == '*'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(!(t->left->flags & SHORTER)); + assert(t->child != NULL && t->child->cnfa.nstates > 0); + assert(!(t->child->flags & SHORTER)); assert(begin <= end); MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); @@ -1094,7 +1104,7 @@ citerdissect(struct vars *v, return REG_ESPACE; endpts[0] = begin; - d = getsubdfa(v, t->left); + d = getsubdfa(v, t->child); if (ISERR()) { FREE(endpts); @@ -1172,8 +1182,8 @@ citerdissect(struct vars *v, for (i = nverified + 1; i <= k; i++) { - zaptreesubs(v, t->left); - er = cdissect(v, t->left, endpts[i - 1], endpts[i]); + zaptreesubs(v, t->child); + er = cdissect(v, t->child, endpts[i - 1], endpts[i]); if (er == REG_OKAY) { nverified = i; @@ -1258,8 +1268,8 @@ creviterdissect(struct vars *v, int er; assert(t->op == '*'); - assert(t->left != NULL && t->left->cnfa.nstates > 0); - assert(t->left->flags & SHORTER); + assert(t->child != NULL && t->child->cnfa.nstates > 0); + assert(t->child->flags & SHORTER); assert(begin <= end); MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); @@ -1299,7 +1309,7 @@ creviterdissect(struct vars *v, return REG_ESPACE; endpts[0] = begin; - d = getsubdfa(v, t->left); + d = getsubdfa(v, t->child); if (ISERR()) { FREE(endpts); @@ -1383,8 +1393,8 @@ creviterdissect(struct vars *v, for (i = nverified + 1; i <= k; i++) { - zaptreesubs(v, t->left); - er = cdissect(v, t->left, endpts[i - 1], endpts[i]); + zaptreesubs(v, t->child); + er = cdissect(v, t->child, endpts[i - 1], endpts[i]); if (er == REG_OKAY) { nverified = i; diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 82e761bfe5704..90ee16957ac7a 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -423,15 +423,17 @@ struct cnfa * '=' plain regex without interesting substructure (implemented as DFA) * 'b' back-reference (has no substructure either) * '(' capture node: captures the match of its single child - * '.' concatenation: matches a match for left, then a match for right - * '|' alternation: matches a match for left or a match for right + * '.' concatenation: matches a match for first child, then second child + * '|' alternation: matches a match for any of its children * '*' iteration: matches some number of matches of its single child * - * Note: the right child of an alternation must be another alternation or - * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you - * might expect. This could stand to be changed. Actually I'd rather see - * a single alternation node with N children, but that will take revising - * the representation of struct subre. + * An alternation node can have any number of children (but at least two), + * linked through their sibling fields. + * + * A concatenation node must have exactly two children. It might be useful + * to support more, but that would complicate the executor. Note that it is + * the first child's greediness that determines the node's preference for + * where to split a match. * * Note: when a backref is directly quantified, we stick the min/max counts * into the backref rather than plastering an iteration node on top. This is @@ -460,8 +462,8 @@ struct subre * LATYPE code for lookaround constraint */ short min; /* min repetitions for iteration or backref */ short max; /* max repetitions for iteration or backref */ - struct subre *left; /* left child, if any (also freelist chain) */ - struct subre *right; /* right child, if any */ + struct subre *child; /* first child, if any (also freelist chain) */ + struct subre *sibling; /* next child of same parent, if any */ struct state *begin; /* outarcs from here... */ struct state *end; /* ...ending in inarcs here */ struct cnfa cnfa; /* compacted NFA, if any */ From ea1268f6301cc7adce571cc9c5ebe8d9342a2ef4 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 20 Feb 2021 19:26:41 -0500 Subject: [PATCH 5/5] Avoid generating extra subre tree nodes for capturing parentheses. Previously, each pair of capturing parentheses gave rise to a separate subre tree node, whose only function was to identify that we ought to capture the match details for this particular sub-expression. In most cases we don't really need that, since we can perfectly well put a "capture this" annotation on the child node that does the real matching work. As with the two preceding commits, the main value of this is to avoid generating and optimizing an NFA for a tree node that's not really pulling its weight. The chosen data representation only allows one capture annotation per subre node. In the legal-per-spec, but seemingly not very useful, case where there are multiple capturing parens around the exact same bit of the regex (i.e. "((xyz))"), wrap the child node in N-1 capture nodes that act the same as before. We could work harder at that but I'll refrain, pending some evidence that such cases are worth troubling over. In passing, improve the comments in regex.h to say what all the different re_info bits mean. Some of them were pretty obvious but others not so much, so reverse-engineer some documentation. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us --- src/backend/regex/README | 15 ++++++++-- src/backend/regex/regcomp.c | 54 ++++++++++++++++++++++++++---------- src/backend/regex/rege_dfa.c | 6 ++-- src/backend/regex/regexec.c | 22 ++++++++------- src/include/regex/regex.h | 32 +++++++++++---------- src/include/regex/regguts.h | 13 +++++---- 6 files changed, 91 insertions(+), 51 deletions(-) diff --git a/src/backend/regex/README b/src/backend/regex/README index cafeb3dffb096..e4b083664f21a 100644 --- a/src/backend/regex/README +++ b/src/backend/regex/README @@ -130,8 +130,8 @@ possibilities. As an example, consider the regex "(a[bc]+)\1". The compiled representation will have a top-level concatenation subre node. Its first -child is a capture node, and the child of that is a plain DFA node for -"a[bc]+". The concatenation's second child is a backref node for \1. +child is a plain DFA node for "a[bc]+" (which is marked as being a capture +node). The concatenation's second child is a backref node for \1. The DFA associated with the concatenation node will be "a[bc]+a[bc]+", where the backref has been replaced by a copy of the DFA for its referent expression. When executed, the concatenation node will have to search for @@ -147,6 +147,17 @@ run much faster than a pure NFA engine could do. It is this behavior that justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's library. +It's perhaps worth noting that separate capture subre nodes are a rarity: +normally, we just mark a subre as capturing and that's it. However, it's +legal to write a regex like "((x))" in which the same substring has to be +captured by multiple sets of parentheses. Since a subre has room for only +one "capno" field, a single subre can't handle that. We handle such cases +by wrapping the base subre (which captures the innermost parens) in a +no-op capture node, or even more than one for "(((x)))" etc. This is a +little bit inefficient because we end up with multiple identical NFAs, +but since the case is pointless and infrequent, it's not worth working +harder. + Colors and colormapping ----------------------- diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index c0680b280bc01..0cd4b4c4c2977 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -452,7 +452,7 @@ pg_regcomp(regex_t *re, #endif /* Prepend .* to pattern if it's a lookbehind LACON */ - nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug); + nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->latype), debug); } CNOERR(); if (v->tree->flags & SHORTER) @@ -944,7 +944,13 @@ parseqatom(struct vars *v, else atomtype = PLAIN; /* something that's not '(' */ NEXT(); - /* need new endpoints because tree will contain pointers */ + + /* + * Make separate endpoints to ensure we keep this sub-NFA cleanly + * separate from what surrounds it. We need to be sure that when + * we duplicate the sub-NFA for a backref, we get the right states + * and no others. + */ s = newstate(v->nfa); s2 = newstate(v->nfa); NOERR(); @@ -959,11 +965,21 @@ parseqatom(struct vars *v, { assert(v->subs[subno] == NULL); v->subs[subno] = atom; - t = subre(v, '(', atom->flags | CAP, lp, rp); - NOERR(); - t->subno = subno; - t->child = atom; - atom = t; + if (atom->capno == 0) + { + /* normal case: just mark the atom as capturing */ + atom->flags |= CAP; + atom->capno = subno; + } + else + { + /* generate no-op wrapper node to handle "((x))" */ + t = subre(v, '(', atom->flags | CAP, lp, rp); + NOERR(); + t->capno = subno; + t->child = atom; + atom = t; + } } /* postpone everything else pending possible {0} */ break; @@ -976,7 +992,7 @@ parseqatom(struct vars *v, atom = subre(v, 'b', BACKR, lp, rp); NOERR(); subno = v->nextvalue; - atom->subno = subno; + atom->backno = subno; EMPTYARC(lp, rp); /* temporarily, so there's something */ NEXT(); break; @@ -1276,8 +1292,10 @@ parseqatom(struct vars *v, freesubre(v, top->child); top->op = t->op; top->flags = t->flags; + top->latype = t->latype; top->id = t->id; - top->subno = t->subno; + top->capno = t->capno; + top->backno = t->backno; top->min = t->min; top->max = t->max; top->child = t->child; @@ -1790,8 +1808,10 @@ subre(struct vars *v, ret->op = op; ret->flags = flags; + ret->latype = (char) -1; ret->id = 0; /* will be assigned later */ - ret->subno = 0; + ret->capno = 0; + ret->backno = 0; ret->min = ret->max = 1; ret->child = NULL; ret->sibling = NULL; @@ -1893,7 +1913,7 @@ numst(struct subre *t, assert(t != NULL); i = start; - t->id = (short) i++; + t->id = i++; for (t2 = t->child; t2 != NULL; t2 = t2->sibling) i = numst(t2, i); return i; @@ -2040,7 +2060,7 @@ newlacon(struct vars *v, sub = &v->lacons[n]; sub->begin = begin; sub->end = end; - sub->subno = latype; + sub->latype = latype; ZAPCNFA(sub->cnfa); return n; } @@ -2163,7 +2183,7 @@ dump(regex_t *re, struct subre *lasub = &g->lacons[i]; const char *latype; - switch (lasub->subno) + switch (lasub->latype) { case LATYPE_AHEAD_POS: latype = "positive lookahead"; @@ -2227,8 +2247,12 @@ stdump(struct subre *t, fprintf(f, " hasbackref"); if (!(t->flags & INUSE)) fprintf(f, " UNUSED"); - if (t->subno != 0) - fprintf(f, " (#%d)", t->subno); + if (t->latype != (char) -1) + fprintf(f, " latype(%d)", t->latype); + if (t->capno != 0) + fprintf(f, " capture(%d)", t->capno); + if (t->backno != 0) + fprintf(f, " backref(%d)", t->backno); if (t->min != 1 || t->max != 1) { fprintf(f, " {%d,", t->min); diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index 89d162ed6afd6..957ceb8137de6 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -825,12 +825,12 @@ lacon(struct vars *v, d = getladfa(v, n); if (d == NULL) return 0; - if (LATYPE_IS_AHEAD(sub->subno)) + if (LATYPE_IS_AHEAD(sub->latype)) { /* used to use longest() here, but shortest() could be much cheaper */ end = shortest(v, d, cp, cp, v->stop, (chr **) NULL, (int *) NULL); - satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL); + satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL); } else { @@ -843,7 +843,7 @@ lacon(struct vars *v, * nominal match. */ satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]); - if (!LATYPE_IS_POS(sub->subno)) + if (!LATYPE_IS_POS(sub->latype)) satisfied = !satisfied; } FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied)); diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 4541cf9a7e085..2a1ef0176a500 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -640,13 +640,11 @@ static void zaptreesubs(struct vars *v, struct subre *t) { + int n = t->capno; struct subre *t2; - if (t->op == '(') + if (n > 0) { - int n = t->subno; - - assert(n > 0); if ((size_t) n < v->nmatch) { v->pmatch[n].rm_so = -1; @@ -667,7 +665,7 @@ subset(struct vars *v, chr *begin, chr *end) { - int n = sub->subno; + int n = sub->capno; assert(n > 0); if ((size_t) n >= v->nmatch) @@ -739,12 +737,10 @@ cdissect(struct vars *v, else er = citerdissect(v, t, begin, end); break; - case '(': /* capturing */ + case '(': /* no-op capture node */ assert(t->child != NULL); - assert(t->subno > 0); + assert(t->capno > 0); er = cdissect(v, t->child, begin, end); - if (er == REG_OKAY) - subset(v, t, begin, end); break; default: er = REG_ASSERT; @@ -758,6 +754,12 @@ cdissect(struct vars *v, */ assert(er != REG_NOMATCH || (t->flags & BACKR)); + /* + * If this node is marked as capturing, save successful match's location. + */ + if (t->capno > 0 && er == REG_OKAY) + subset(v, t, begin, end); + return er; } @@ -932,7 +934,7 @@ cbrdissect(struct vars *v, chr *begin, /* beginning of relevant substring */ chr *end) /* end of same */ { - int n = t->subno; + int n = t->backno; size_t numreps; size_t tlen; size_t brlen; diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index dc31899aa4d4c..2b48a19fb7d05 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -56,21 +56,23 @@ typedef struct { int re_magic; /* magic number */ size_t re_nsub; /* number of subexpressions */ - long re_info; /* information about RE */ -#define REG_UBACKREF 000001 -#define REG_ULOOKAROUND 000002 -#define REG_UBOUNDS 000004 -#define REG_UBRACES 000010 -#define REG_UBSALNUM 000020 -#define REG_UPBOTCH 000040 -#define REG_UBBS 000100 -#define REG_UNONPOSIX 000200 -#define REG_UUNSPEC 000400 -#define REG_UUNPORT 001000 -#define REG_ULOCALE 002000 -#define REG_UEMPTYMATCH 004000 -#define REG_UIMPOSSIBLE 010000 -#define REG_USHORTEST 020000 + long re_info; /* bitmask of the following flags: */ +#define REG_UBACKREF 000001 /* has back-reference (\n) */ +#define REG_ULOOKAROUND 000002 /* has lookahead/lookbehind constraint */ +#define REG_UBOUNDS 000004 /* has bounded quantifier ({m,n}) */ +#define REG_UBRACES 000010 /* has { that doesn't begin a quantifier */ +#define REG_UBSALNUM 000020 /* has backslash-alphanumeric in non-ARE */ +#define REG_UPBOTCH 000040 /* has unmatched right paren in ERE (legal + * per spec, but that was a mistake) */ +#define REG_UBBS 000100 /* has backslash within bracket expr */ +#define REG_UNONPOSIX 000200 /* has any construct that extends POSIX */ +#define REG_UUNSPEC 000400 /* has any case disallowed by POSIX, e.g. + * an empty branch */ +#define REG_UUNPORT 001000 /* has numeric character code dependency */ +#define REG_ULOCALE 002000 /* has locale dependency */ +#define REG_UEMPTYMATCH 004000 /* can match a zero-length string */ +#define REG_UIMPOSSIBLE 010000 /* provably cannot match anything */ +#define REG_USHORTEST 020000 /* has non-greedy quantifier */ int re_csize; /* sizeof(character) */ char *re_endp; /* backward compatibility kludge */ Oid re_collation; /* Collation that defines LC_CTYPE behavior */ diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 90ee16957ac7a..306525eb5fa89 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -422,7 +422,7 @@ struct cnfa * "op" is one of: * '=' plain regex without interesting substructure (implemented as DFA) * 'b' back-reference (has no substructure either) - * '(' capture node: captures the match of its single child + * '(' no-op capture node: captures the match of its single child * '.' concatenation: matches a match for first child, then second child * '|' alternation: matches a match for any of its children * '*' iteration: matches some number of matches of its single child @@ -446,8 +446,8 @@ struct subre #define LONGER 01 /* prefers longer match */ #define SHORTER 02 /* prefers shorter match */ #define MIXED 04 /* mixed preference below */ -#define CAP 010 /* capturing parens below */ -#define BACKR 020 /* back reference below */ +#define CAP 010 /* capturing parens here or below */ +#define BACKR 020 /* back reference here or below */ #define INUSE 0100 /* in use in final tree */ #define NOPROP 03 /* bits which may not propagate up */ #define LMIX(f) ((f)<<2) /* LONGER -> MIXED */ @@ -457,9 +457,10 @@ struct subre #define PREF(f) ((f)&NOPROP) #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) - short id; /* ID of subre (1..ntree-1) */ - int subno; /* subexpression number for 'b' and '(', or - * LATYPE code for lookaround constraint */ + char latype; /* LATYPE code, if lookaround constraint */ + int id; /* ID of subre (1..ntree-1) */ + int capno; /* if capture node, subno to capture into */ + int backno; /* if backref node, subno it refers to */ short min; /* min repetitions for iteration or backref */ short max; /* max repetitions for iteration or backref */ struct subre *child; /* first child, if any (also freelist chain) */