sthagen
diff --git a/‎contrib/pg_trgm/trgm_regexp.c‎
Lines changed: 18 additions & 9 deletions b/‎contrib/pg_trgm/trgm_regexp.c‎
Lines changed: 18 additions & 9 deletions
diff --git a/‎src/backend/regex/README‎
Lines changed: 42 additions & 11 deletions b/‎src/backend/regex/README‎
Lines changed: 42 additions & 11 deletions
diff --git a/‎src/backend/regex/regc_color.c‎
Lines changed: 21 additions & 1 deletion b/‎src/backend/regex/regc_color.c‎
Lines changed: 21 additions & 1 deletion
@@ -282,8 +282,8 @@ typedef struct
 typedef int TrgmColor;
 
 /* We assume that colors returned by the regexp engine cannot be these: */
-#define COLOR_UNKNOWN	(-1)
-#define COLOR_BLANK		(-2)
+#define COLOR_UNKNOWN	(-3)
+#define COLOR_BLANK		(-4)
 
 typedef struct
 {
@@ -780,7 +780,8 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
 		palloc0(colorsCount * sizeof(TrgmColorInfo));
 
 	/*
-	 * Loop over colors, filling TrgmColorInfo about each.
+	 * Loop over colors, filling TrgmColorInfo about each.  Note we include
+	 * WHITE (0) even though we know it'll be reported as non-expandable.
 	 */
 	for (i = 0; i < colorsCount; i++)
 	{
@@ -1098,9 +1099,9 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
 			/* Add enter key to this state */
 			addKeyToQueue(trgmNFA, &destKey);
 		}
-		else
+		else if (arc->co >= 0)
 		{
-			/* Regular color */
+			/* Regular color (including WHITE) */
 			TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
 
 			if (colorInfo->expandable)
@@ -1156,6 +1157,14 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
 				addKeyToQueue(trgmNFA, &destKey);
 			}
 		}
+		else
+		{
+			/* RAINBOW: treat as unexpandable color */
+			destKey.prefix.colors[0] = COLOR_UNKNOWN;
+			destKey.prefix.colors[1] = COLOR_UNKNOWN;
+			destKey.nstate = arc->to;
+			addKeyToQueue(trgmNFA, &destKey);
+		}
 	}
 
 	pfree(arcs);
@@ -1216,10 +1225,10 @@ addArcs(TrgmNFA *trgmNFA, TrgmState *state)
 			/*
 			 * Ignore non-expandable colors; addKey already handled the case.
 			 *
-			 * We need no special check for begin/end pseudocolors here.  We
-			 * don't need to do any processing for them, and they will be
-			 * marked non-expandable since the regex engine will have reported
-			 * them that way.
+			 * We need no special check for WHITE or begin/end pseudocolors
+			 * here.  We don't need to do any processing for them, and they
+			 * will be marked non-expandable since the regex engine will have
+			 * reported them that way.
 			 */
 			if (!colorInfo->expandable)
 				continue;
 
@@ -129,9 +129,9 @@ If not, we can reject the match immediately without iterating through many
 possibilities.
 
 As an example, consider the regex "(a[bc]+)\1".  The compiled
-representation will have a top-level concatenation subre node.  Its left
-child is a capture node, and the child of that is a plain DFA node for
-"a[bc]+".  The concatenation's right child is a backref node for \1.
+representation will have a top-level concatenation subre node.  Its first
+child is a plain DFA node for "a[bc]+" (which is marked as being a capture
+node).  The concatenation's second child is a backref node for \1.
 The DFA associated with the concatenation node will be "a[bc]+a[bc]+",
 where the backref has been replaced by a copy of the DFA for its referent
 expression.  When executed, the concatenation node will have to search for
@@ -147,6 +147,17 @@ run much faster than a pure NFA engine could do.  It is this behavior that
 justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's
 library.
 
+It's perhaps worth noting that separate capture subre nodes are a rarity:
+normally, we just mark a subre as capturing and that's it.  However, it's
+legal to write a regex like "((x))" in which the same substring has to be
+captured by multiple sets of parentheses.  Since a subre has room for only
+one "capno" field, a single subre can't handle that.  We handle such cases
+by wrapping the base subre (which captures the innermost parens) in a
+no-op capture node, or even more than one for "(((x)))" etc.  This is a
+little bit inefficient because we end up with multiple identical NFAs,
+but since the case is pointless and infrequent, it's not worth working
+harder.
+
 
 Colors and colormapping
 -----------------------
@@ -261,6 +272,18 @@ and the NFA has these arcs:
 	states 4 -> 5 on color 2 ("x" only)
 which can be seen to be a correct representation of the regex.
 
+There is one more complexity, which is how to handle ".", that is, a
+match-anything atom.  We used to do that by generating a "rainbow"
+of arcs of all live colors between the two NFA states before and after
+the dot.  That's expensive in itself when there are lots of colors,
+and it also typically adds lots of follow-on arc-splitting work for the
+color splitting logic.  Now we handle this case by generating a single arc
+labeled with the special color RAINBOW, meaning all colors.  Such arcs
+never need to be split, so they help keep NFAs small in this common case.
+(Note: this optimization doesn't help in REG_NLSTOP mode, where "." is
+not supposed to match newline.  In that case we still handle "." by
+generating an almost-rainbow of all colors except newline's color.)
+
 Given this summary, we can see we need the following operations for
 colors:
 
@@ -349,18 +372,20 @@ The possible arc types are:
 
     PLAIN arcs, which specify matching of any character of a given "color"
     (see above).  These are dumped as "[color_number]->to_state".
+    In addition there can be "rainbow" PLAIN arcs, which are dumped as
+    "[*]->to_state".
 
     EMPTY arcs, which specify a no-op transition to another state.  These
     are dumped as "->to_state".
 
     AHEAD constraints, which represent a "next character must be of this
     color" constraint.  AHEAD differs from a PLAIN arc in that the input
     character is not consumed when crossing the arc.  These are dumped as
-    ">color_number>->to_state".
+    ">color_number>->to_state", or possibly ">*>->to_state".
 
     BEHIND constraints, which represent a "previous character must be of
     this color" constraint, which likewise consumes no input.  These are
-    dumped as "<color_number<->to_state".
+    dumped as "<color_number<->to_state", or possibly "<*<->to_state".
 
     '^' arcs, which specify a beginning-of-input constraint.  These are
     dumped as "^0->to_state" or "^1->to_state" for beginning-of-string and
@@ -396,14 +421,20 @@ substring, or an imaginary following EOS character if the substring is at
 the end of the input.
 3. If the NFA is (or can be) in the goal state at this point, it matches.
 
+This definition is necessary to support regexes that begin or end with
+constraints such as \m and \M, which imply requirements on the adjacent
+character if any.  The executor implements that by checking if the
+adjacent character (or BOS/BOL/EOS/EOL pseudo-character) is of the
+right color, and it does that in the same loop that checks characters
+within the match.
+
 So one can mentally execute an untransformed NFA by taking ^ and $ as
 ordinary constraints that match at start and end of input; but plain
 arcs out of the start state should be taken as matches for the character
 before the target substring, and similarly, plain arcs leading to the
 post state are matches for the character after the target substring.
-This definition is necessary to support regexes that begin or end with
-constraints such as \m and \M, which imply requirements on the adjacent
-character if any.  NFAs for simple unanchored patterns will usually have
-pre-state outarcs for all possible character colors as well as BOS and
-BOL, and post-state inarcs for all possible character colors as well as
-EOS and EOL, so that the executor's behavior will work.
+After the optimize() transformation, there are explicit arcs mentioning
+BOS/BOL/EOS/EOL adjacent to the pre-state and post-state.  So a finished
+NFA for a pattern without anchors or adjacent-character constraints will
+have pre-state outarcs for RAINBOW (all possible character colors) as well
+as BOS and BOL, and likewise post-state inarcs for RAINBOW, EOS, and EOL.
@@ -977,6 +977,7 @@ colorchain(struct colormap *cm,
 {
 	struct colordesc *cd = &cm->cd[a->co];
 
+	assert(a->co >= 0);
 	if (cd->arcs != NULL)
 		cd->arcs->colorchainRev = a;
 	a->colorchain = cd->arcs;
@@ -994,6 +995,7 @@ uncolorchain(struct colormap *cm,
 	struct colordesc *cd = &cm->cd[a->co];
 	struct arc *aa = a->colorchainRev;
 
+	assert(a->co >= 0);
 	if (aa == NULL)
 	{
 		assert(cd->arcs == a);
@@ -1012,6 +1014,9 @@ uncolorchain(struct colormap *cm,
 
 /*
  * rainbow - add arcs of all full colors (but one) between specified states
+ *
+ * If there isn't an exception color, we now generate just a single arc
+ * labeled RAINBOW, saving lots of arc-munging later on.
  */
 static void
 rainbow(struct nfa *nfa,
@@ -1025,6 +1030,13 @@ rainbow(struct nfa *nfa,
 	struct colordesc *end = CDEND(cm);
 	color		co;
 
+	if (but == COLORLESS)
+	{
+		newarc(nfa, type, RAINBOW, from, to);
+		return;
+	}
+
+	/* Gotta do it the hard way.  Skip subcolors, pseudocolors, and "but" */
 	for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
 		if (!UNUSEDCOLOR(cd) && cd->sub != co && co != but &&
 			!(cd->flags & PSEUDO))
@@ -1034,13 +1046,16 @@ rainbow(struct nfa *nfa,
 /*
  * colorcomplement - add arcs of complementary colors
  *
+ * We add arcs of all colors that are not pseudocolors and do not match
+ * any of the "of" state's PLAIN outarcs.
+ *
  * The calling sequence ought to be reconciled with cloneouts().
  */
 static void
 colorcomplement(struct nfa *nfa,
 				struct colormap *cm,
 				int type,
-				struct state *of,	/* complements of this guy's PLAIN outarcs */
+				struct state *of,
 				struct state *from,
 				struct state *to)
 {
@@ -1049,6 +1064,11 @@ colorcomplement(struct nfa *nfa,
 	color		co;
 
 	assert(of != from);
+
+	/* A RAINBOW arc matches all colors, making the complement empty */
+	if (findarc(of, PLAIN, RAINBOW) != NULL)
+		return;
+
 	for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
 		if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
 			if (findarc(of, PLAIN, co) == NULL)