Skip to content

Commit 62516f1

Browse files
authored
Merge pull request #149 from postgres/master
Sync Fork from Upstream Repo
2 parents e25885a + ea1268f commit 62516f1

File tree

12 files changed

+866
-222
lines changed

12 files changed

+866
-222
lines changed

contrib/pg_trgm/trgm_regexp.c

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -282,8 +282,8 @@ typedef struct
282282
typedef int TrgmColor;
283283

284284
/* We assume that colors returned by the regexp engine cannot be these: */
285-
#define COLOR_UNKNOWN (-1)
286-
#define COLOR_BLANK (-2)
285+
#define COLOR_UNKNOWN (-3)
286+
#define COLOR_BLANK (-4)
287287

288288
typedef struct
289289
{
@@ -780,7 +780,8 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
780780
palloc0(colorsCount * sizeof(TrgmColorInfo));
781781

782782
/*
783-
* Loop over colors, filling TrgmColorInfo about each.
783+
* Loop over colors, filling TrgmColorInfo about each. Note we include
784+
* WHITE (0) even though we know it'll be reported as non-expandable.
784785
*/
785786
for (i = 0; i < colorsCount; i++)
786787
{
@@ -1098,9 +1099,9 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
10981099
/* Add enter key to this state */
10991100
addKeyToQueue(trgmNFA, &destKey);
11001101
}
1101-
else
1102+
else if (arc->co >= 0)
11021103
{
1103-
/* Regular color */
1104+
/* Regular color (including WHITE) */
11041105
TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
11051106

11061107
if (colorInfo->expandable)
@@ -1156,6 +1157,14 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
11561157
addKeyToQueue(trgmNFA, &destKey);
11571158
}
11581159
}
1160+
else
1161+
{
1162+
/* RAINBOW: treat as unexpandable color */
1163+
destKey.prefix.colors[0] = COLOR_UNKNOWN;
1164+
destKey.prefix.colors[1] = COLOR_UNKNOWN;
1165+
destKey.nstate = arc->to;
1166+
addKeyToQueue(trgmNFA, &destKey);
1167+
}
11591168
}
11601169

11611170
pfree(arcs);
@@ -1216,10 +1225,10 @@ addArcs(TrgmNFA *trgmNFA, TrgmState *state)
12161225
/*
12171226
* Ignore non-expandable colors; addKey already handled the case.
12181227
*
1219-
* We need no special check for begin/end pseudocolors here. We
1220-
* don't need to do any processing for them, and they will be
1221-
* marked non-expandable since the regex engine will have reported
1222-
* them that way.
1228+
* We need no special check for WHITE or begin/end pseudocolors
1229+
* here. We don't need to do any processing for them, and they
1230+
* will be marked non-expandable since the regex engine will have
1231+
* reported them that way.
12231232
*/
12241233
if (!colorInfo->expandable)
12251234
continue;

src/backend/regex/README

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,9 @@ If not, we can reject the match immediately without iterating through many
129129
possibilities.
130130

131131
As an example, consider the regex "(a[bc]+)\1". The compiled
132-
representation will have a top-level concatenation subre node. Its left
133-
child is a capture node, and the child of that is a plain DFA node for
134-
"a[bc]+". The concatenation's right child is a backref node for \1.
132+
representation will have a top-level concatenation subre node. Its first
133+
child is a plain DFA node for "a[bc]+" (which is marked as being a capture
134+
node). The concatenation's second child is a backref node for \1.
135135
The DFA associated with the concatenation node will be "a[bc]+a[bc]+",
136136
where the backref has been replaced by a copy of the DFA for its referent
137137
expression. When executed, the concatenation node will have to search for
@@ -147,6 +147,17 @@ run much faster than a pure NFA engine could do. It is this behavior that
147147
justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's
148148
library.
149149

150+
It's perhaps worth noting that separate capture subre nodes are a rarity:
151+
normally, we just mark a subre as capturing and that's it. However, it's
152+
legal to write a regex like "((x))" in which the same substring has to be
153+
captured by multiple sets of parentheses. Since a subre has room for only
154+
one "capno" field, a single subre can't handle that. We handle such cases
155+
by wrapping the base subre (which captures the innermost parens) in a
156+
no-op capture node, or even more than one for "(((x)))" etc. This is a
157+
little bit inefficient because we end up with multiple identical NFAs,
158+
but since the case is pointless and infrequent, it's not worth working
159+
harder.
160+
150161

151162
Colors and colormapping
152163
-----------------------
@@ -261,6 +272,18 @@ and the NFA has these arcs:
261272
states 4 -> 5 on color 2 ("x" only)
262273
which can be seen to be a correct representation of the regex.
263274

275+
There is one more complexity, which is how to handle ".", that is, a
276+
match-anything atom. We used to do that by generating a "rainbow"
277+
of arcs of all live colors between the two NFA states before and after
278+
the dot. That's expensive in itself when there are lots of colors,
279+
and it also typically adds lots of follow-on arc-splitting work for the
280+
color splitting logic. Now we handle this case by generating a single arc
281+
labeled with the special color RAINBOW, meaning all colors. Such arcs
282+
never need to be split, so they help keep NFAs small in this common case.
283+
(Note: this optimization doesn't help in REG_NLSTOP mode, where "." is
284+
not supposed to match newline. In that case we still handle "." by
285+
generating an almost-rainbow of all colors except newline's color.)
286+
264287
Given this summary, we can see we need the following operations for
265288
colors:
266289

@@ -349,18 +372,20 @@ The possible arc types are:
349372

350373
PLAIN arcs, which specify matching of any character of a given "color"
351374
(see above). These are dumped as "[color_number]->to_state".
375+
In addition there can be "rainbow" PLAIN arcs, which are dumped as
376+
"[*]->to_state".
352377

353378
EMPTY arcs, which specify a no-op transition to another state. These
354379
are dumped as "->to_state".
355380

356381
AHEAD constraints, which represent a "next character must be of this
357382
color" constraint. AHEAD differs from a PLAIN arc in that the input
358383
character is not consumed when crossing the arc. These are dumped as
359-
">color_number>->to_state".
384+
">color_number>->to_state", or possibly ">*>->to_state".
360385

361386
BEHIND constraints, which represent a "previous character must be of
362387
this color" constraint, which likewise consumes no input. These are
363-
dumped as "<color_number<->to_state".
388+
dumped as "<color_number<->to_state", or possibly "<*<->to_state".
364389

365390
'^' arcs, which specify a beginning-of-input constraint. These are
366391
dumped as "^0->to_state" or "^1->to_state" for beginning-of-string and
@@ -396,14 +421,20 @@ substring, or an imaginary following EOS character if the substring is at
396421
the end of the input.
397422
3. If the NFA is (or can be) in the goal state at this point, it matches.
398423

424+
This definition is necessary to support regexes that begin or end with
425+
constraints such as \m and \M, which imply requirements on the adjacent
426+
character if any. The executor implements that by checking if the
427+
adjacent character (or BOS/BOL/EOS/EOL pseudo-character) is of the
428+
right color, and it does that in the same loop that checks characters
429+
within the match.
430+
399431
So one can mentally execute an untransformed NFA by taking ^ and $ as
400432
ordinary constraints that match at start and end of input; but plain
401433
arcs out of the start state should be taken as matches for the character
402434
before the target substring, and similarly, plain arcs leading to the
403435
post state are matches for the character after the target substring.
404-
This definition is necessary to support regexes that begin or end with
405-
constraints such as \m and \M, which imply requirements on the adjacent
406-
character if any. NFAs for simple unanchored patterns will usually have
407-
pre-state outarcs for all possible character colors as well as BOS and
408-
BOL, and post-state inarcs for all possible character colors as well as
409-
EOS and EOL, so that the executor's behavior will work.
436+
After the optimize() transformation, there are explicit arcs mentioning
437+
BOS/BOL/EOS/EOL adjacent to the pre-state and post-state. So a finished
438+
NFA for a pattern without anchors or adjacent-character constraints will
439+
have pre-state outarcs for RAINBOW (all possible character colors) as well
440+
as BOS and BOL, and likewise post-state inarcs for RAINBOW, EOS, and EOL.

src/backend/regex/regc_color.c

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -977,6 +977,7 @@ colorchain(struct colormap *cm,
977977
{
978978
struct colordesc *cd = &cm->cd[a->co];
979979

980+
assert(a->co >= 0);
980981
if (cd->arcs != NULL)
981982
cd->arcs->colorchainRev = a;
982983
a->colorchain = cd->arcs;
@@ -994,6 +995,7 @@ uncolorchain(struct colormap *cm,
994995
struct colordesc *cd = &cm->cd[a->co];
995996
struct arc *aa = a->colorchainRev;
996997

998+
assert(a->co >= 0);
997999
if (aa == NULL)
9981000
{
9991001
assert(cd->arcs == a);
@@ -1012,6 +1014,9 @@ uncolorchain(struct colormap *cm,
10121014

10131015
/*
10141016
* rainbow - add arcs of all full colors (but one) between specified states
1017+
*
1018+
* If there isn't an exception color, we now generate just a single arc
1019+
* labeled RAINBOW, saving lots of arc-munging later on.
10151020
*/
10161021
static void
10171022
rainbow(struct nfa *nfa,
@@ -1025,6 +1030,13 @@ rainbow(struct nfa *nfa,
10251030
struct colordesc *end = CDEND(cm);
10261031
color co;
10271032

1033+
if (but == COLORLESS)
1034+
{
1035+
newarc(nfa, type, RAINBOW, from, to);
1036+
return;
1037+
}
1038+
1039+
/* Gotta do it the hard way. Skip subcolors, pseudocolors, and "but" */
10281040
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
10291041
if (!UNUSEDCOLOR(cd) && cd->sub != co && co != but &&
10301042
!(cd->flags & PSEUDO))
@@ -1034,13 +1046,16 @@ rainbow(struct nfa *nfa,
10341046
/*
10351047
* colorcomplement - add arcs of complementary colors
10361048
*
1049+
* We add arcs of all colors that are not pseudocolors and do not match
1050+
* any of the "of" state's PLAIN outarcs.
1051+
*
10371052
* The calling sequence ought to be reconciled with cloneouts().
10381053
*/
10391054
static void
10401055
colorcomplement(struct nfa *nfa,
10411056
struct colormap *cm,
10421057
int type,
1043-
struct state *of, /* complements of this guy's PLAIN outarcs */
1058+
struct state *of,
10441059
struct state *from,
10451060
struct state *to)
10461061
{
@@ -1049,6 +1064,11 @@ colorcomplement(struct nfa *nfa,
10491064
color co;
10501065

10511066
assert(of != from);
1067+
1068+
/* A RAINBOW arc matches all colors, making the complement empty */
1069+
if (findarc(of, PLAIN, RAINBOW) != NULL)
1070+
return;
1071+
10521072
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
10531073
if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
10541074
if (findarc(of, PLAIN, co) == NULL)

0 commit comments

Comments
 (0)