Skip to content

Commit 9875568

Browse files
committed
Fix contrib/pg_trgm's extraction of trigrams from regular expressions.
The logic for removing excess trigrams from the result was faulty. It intends to avoid merging the initial and final states of the NFA, which is necessary, but in testing whether removal of a specific trigram would cause that, it failed to consider the combined effects of all the state merges that that trigram's removal would cause. This could result in a broken final graph that would never match anything, leading to GIN or GiST indexscans not finding anything. To fix, add a "tentParent" field that is used only within this loop, and set it to show state merges that we are tentatively going to do. While examining a particular arc, we must chase up through tentParent links as well as regular parent links (the former can only appear atop the latter), and we must account for state init/fin flag merges that haven't actually been done yet. To simplify the latter, combine the separate init and fin bool fields into a bitmap flags field. I also chose to get rid of the "children" state list, which seems entirely inessential. Per bug #14563 from Alexey Isayko, which the added test cases are based on. Back-patch to 9.3 where this code was added. Report: https://postgr.es/m/20170222111446.1256.67547@wrigleys.postgresql.org Discussion: https://postgr.es/m/8816.1487787594@sss.pgh.pa.us
1 parent a3eb715 commit 9875568

File tree

3 files changed

+91
-37
lines changed

3 files changed

+91
-37
lines changed

contrib/pg_trgm/expected/pg_trgm.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,12 @@ select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988
11941194
0.5 | qwertyu0987
11951195
(2 rows)
11961196

1197+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
1198+
count
1199+
-------
1200+
1000
1201+
(1 row)
1202+
11971203
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
11981204
set enable_seqscan=off;
11991205
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
@@ -2338,6 +2344,12 @@ select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988
23382344
0.5 | qwertyu0987
23392345
(2 rows)
23402346

2347+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
2348+
count
2349+
-------
2350+
1000
2351+
(1 row)
2352+
23412353
drop index trgm_idx;
23422354
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
23432355
set enable_seqscan=off;
@@ -3467,6 +3479,12 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
34673479
qwertyu0988 | 0.333333
34683480
(1 row)
34693481

3482+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
3483+
count
3484+
-------
3485+
1000
3486+
(1 row)
3487+
34703488
create table test2(t text);
34713489
insert into test2 values ('abcdef');
34723490
insert into test2 values ('quark');

contrib/pg_trgm/sql/pg_trgm.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu098
2121
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
2222
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
2323
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
24+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
2425

2526
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
2627
set enable_seqscan=off;
@@ -31,6 +32,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
3132
explain (costs off)
3233
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
3334
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
35+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
3436

3537
drop index trgm_idx;
3638
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
@@ -39,6 +41,7 @@ set enable_seqscan=off;
3941
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
4042
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
4143
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
44+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
4245

4346
create table test2(t text);
4447
insert into test2 values ('abcdef');

contrib/pg_trgm/trgm_regexp.c

Lines changed: 70 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -318,21 +318,22 @@ typedef struct
318318
* arcs - outgoing arcs of this state (List of TrgmArc)
319319
* enterKeys - enter keys reachable from this state without reading any
320320
* predictable trigram (List of TrgmStateKey)
321-
* fin - flag indicating this state is final
322-
* init - flag indicating this state is initial
321+
* flags - flag bits
323322
* parent - parent state, if this state has been merged into another
324-
* children - child states (states that have been merged into this one)
323+
* tentParent - planned parent state, if considering a merge
325324
* number - number of this state (used at the packaging stage)
326325
*/
326+
#define TSTATE_INIT 0x01 /* flag indicating this state is initial */
327+
#define TSTATE_FIN 0x02 /* flag indicating this state is final */
328+
327329
typedef struct TrgmState
328330
{
329331
TrgmStateKey stateKey; /* hashtable key: must be first field */
330332
List *arcs;
331333
List *enterKeys;
332-
bool fin;
333-
bool init;
334+
int flags;
334335
struct TrgmState *parent;
335-
List *children;
336+
struct TrgmState *tentParent;
336337
int number;
337338
} TrgmState;
338339

@@ -601,7 +602,7 @@ createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
601602
* get from the initial state to the final state without reading any
602603
* predictable trigram.
603604
*/
604-
if (trgmNFA.initState->fin)
605+
if (trgmNFA.initState->flags & TSTATE_FIN)
605606
return NULL;
606607

607608
/*
@@ -928,7 +929,7 @@ transformGraph(TrgmNFA *trgmNFA)
928929
initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
929930

930931
initstate = getState(trgmNFA, &initkey);
931-
initstate->init = true;
932+
initstate->flags |= TSTATE_INIT;
932933
trgmNFA->initState = initstate;
933934

934935
/*
@@ -946,7 +947,7 @@ transformGraph(TrgmNFA *trgmNFA)
946947
* actual processing.
947948
*/
948949
if (trgmNFA->overflowed)
949-
state->fin = true;
950+
state->flags |= TSTATE_FIN;
950951
else
951952
processState(trgmNFA, state);
952953

@@ -971,7 +972,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
971972
* queue is empty. But we can quit if the state gets marked final.
972973
*/
973974
addKey(trgmNFA, state, &state->stateKey);
974-
while (trgmNFA->keysQueue != NIL && !state->fin)
975+
while (trgmNFA->keysQueue != NIL && !(state->flags & TSTATE_FIN))
975976
{
976977
TrgmStateKey *key = (TrgmStateKey *) linitial(trgmNFA->keysQueue);
977978

@@ -983,7 +984,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
983984
* Add outgoing arcs only if state isn't final (we have no interest in
984985
* outgoing arcs if we already match)
985986
*/
986-
if (!state->fin)
987+
if (!(state->flags & TSTATE_FIN))
987988
addArcs(trgmNFA, state);
988989
}
989990

@@ -992,7 +993,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
992993
* whether this should result in any further enter keys being added.
993994
* If so, add those keys to keysQueue so that processState will handle them.
994995
*
995-
* If the enter key is for the NFA's final state, set state->fin = TRUE.
996+
* If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
996997
* This situation means that we can reach the final state from this expanded
997998
* state without reading any predictable trigram, so we must consider this
998999
* state as an accepting one.
@@ -1062,7 +1063,7 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
10621063
/* If state is now known final, mark it and we're done */
10631064
if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
10641065
{
1065-
state->fin = true;
1066+
state->flags |= TSTATE_FIN;
10661067
return;
10671068
}
10681069

@@ -1388,10 +1389,9 @@ getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
13881389
/* New state: initialize and queue it */
13891390
state->arcs = NIL;
13901391
state->enterKeys = NIL;
1391-
state->init = false;
1392-
state->fin = false;
1392+
state->flags = 0;
13931393
state->parent = NULL;
1394-
state->children = NIL;
1394+
state->tentParent = NULL;
13951395
state->number = -1;
13961396

13971397
trgmNFA->queue = lappend(trgmNFA->queue, state);
@@ -1585,20 +1585,60 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
15851585
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
15861586
TrgmState *source = arcInfo->source,
15871587
*target = arcInfo->target;
1588+
int source_flags,
1589+
target_flags;
15881590

15891591
/* examine parent states, if any merging has already happened */
15901592
while (source->parent)
15911593
source = source->parent;
15921594
while (target->parent)
15931595
target = target->parent;
15941596

1595-
if ((source->init || target->init) &&
1596-
(source->fin || target->fin))
1597+
/* we must also consider merges we are planning right now */
1598+
source_flags = source->flags;
1599+
while (source->tentParent)
1600+
{
1601+
source = source->tentParent;
1602+
source_flags |= source->flags;
1603+
}
1604+
target_flags = target->flags;
1605+
while (target->tentParent)
1606+
{
1607+
target = target->tentParent;
1608+
target_flags |= target->flags;
1609+
}
1610+
1611+
/* would fully-merged state have both INIT and FIN set? */
1612+
if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
1613+
(TSTATE_INIT | TSTATE_FIN))
15971614
{
15981615
canRemove = false;
15991616
break;
16001617
}
1618+
1619+
/* ok so far, so remember planned merge */
1620+
if (source != target)
1621+
target->tentParent = source;
16011622
}
1623+
1624+
/* We must clear all the tentParent fields before continuing */
1625+
foreach(cell, trgmInfo->arcs)
1626+
{
1627+
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1628+
TrgmState *target = arcInfo->target;
1629+
TrgmState *ttarget;
1630+
1631+
while (target->parent)
1632+
target = target->parent;
1633+
1634+
while ((ttarget = target->tentParent) != NULL)
1635+
{
1636+
target->tentParent = NULL;
1637+
target = ttarget;
1638+
}
1639+
}
1640+
1641+
/* Now, move on if we can't drop this trigram */
16021642
if (!canRemove)
16031643
continue;
16041644

@@ -1614,7 +1654,12 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
16141654
while (target->parent)
16151655
target = target->parent;
16161656
if (source != target)
1657+
{
16171658
mergeStates(source, target);
1659+
/* Assert we didn't merge initial and final states */
1660+
Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1661+
(TSTATE_INIT | TSTATE_FIN));
1662+
}
16181663
}
16191664

16201665
/* Mark trigram unexpanded, and update totals */
@@ -1757,27 +1802,15 @@ fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
17571802
static void
17581803
mergeStates(TrgmState *state1, TrgmState *state2)
17591804
{
1760-
ListCell *cell;
1761-
17621805
Assert(state1 != state2);
17631806
Assert(!state1->parent);
17641807
Assert(!state2->parent);
17651808

1766-
/* state1 absorbs state2's init/fin flags */
1767-
state1->init |= state2->init;
1768-
state1->fin |= state2->fin;
1809+
/* state1 absorbs state2's flags */
1810+
state1->flags |= state2->flags;
17691811

1770-
/* state2, and all its children, become children of state1 */
1771-
foreach(cell, state2->children)
1772-
{
1773-
TrgmState *state = (TrgmState *) lfirst(cell);
1774-
1775-
state->parent = state1;
1776-
}
1812+
/* state2, and indirectly all its children, become children of state1 */
17771813
state2->parent = state1;
1778-
state1->children = list_concat(state1->children, state2->children);
1779-
state1->children = lappend(state1->children, state2);
1780-
state2->children = NIL;
17811814
}
17821815

17831816
/*
@@ -1846,9 +1879,9 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
18461879

18471880
if (state->number < 0)
18481881
{
1849-
if (state->init)
1882+
if (state->flags & TSTATE_INIT)
18501883
state->number = 0;
1851-
else if (state->fin)
1884+
else if (state->flags & TSTATE_FIN)
18521885
state->number = 1;
18531886
else
18541887
{
@@ -2112,9 +2145,9 @@ printTrgmNFA(TrgmNFA *trgmNFA)
21122145
ListCell *cell;
21132146

21142147
appendStringInfo(&buf, "s%p", (void *) state);
2115-
if (state->fin)
2148+
if (state->flags & TSTATE_FIN)
21162149
appendStringInfoString(&buf, " [shape = doublecircle]");
2117-
if (state->init)
2150+
if (state->flags & TSTATE_INIT)
21182151
initstate = state;
21192152
appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
21202153
appendStringInfoString(&buf, ";\n");

0 commit comments

Comments
 (0)