Skip to content

Commit 53b5a8c

Browse files
committed
Fix contrib/pg_trgm's extraction of trigrams from regular expressions.
The logic for removing excess trigrams from the result was faulty. It intends to avoid merging the initial and final states of the NFA, which is necessary, but in testing whether removal of a specific trigram would cause that, it failed to consider the combined effects of all the state merges that that trigram's removal would cause. This could result in a broken final graph that would never match anything, leading to GIN or GiST indexscans not finding anything. To fix, add a "tentParent" field that is used only within this loop, and set it to show state merges that we are tentatively going to do. While examining a particular arc, we must chase up through tentParent links as well as regular parent links (the former can only appear atop the latter), and we must account for state init/fin flag merges that haven't actually been done yet. To simplify the latter, combine the separate init and fin bool fields into a bitmap flags field. I also chose to get rid of the "children" state list, which seems entirely inessential. Per bug #14563 from Alexey Isayko, which the added test cases are based on. Back-patch to 9.3 where this code was added. Report: https://postgr.es/m/20170222111446.1256.67547@wrigleys.postgresql.org Discussion: https://postgr.es/m/8816.1487787594@sss.pgh.pa.us
1 parent 3f613c6 commit 53b5a8c

File tree

3 files changed

+91
-37
lines changed

3 files changed

+91
-37
lines changed

contrib/pg_trgm/expected/pg_trgm.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,12 @@ select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988
11941194
0.5 | qwertyu0987
11951195
(2 rows)
11961196

1197+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
1198+
count
1199+
-------
1200+
1000
1201+
(1 row)
1202+
11971203
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
11981204
set enable_seqscan=off;
11991205
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
@@ -2338,6 +2344,12 @@ select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988
23382344
0.5 | qwertyu0987
23392345
(2 rows)
23402346

2347+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
2348+
count
2349+
-------
2350+
1000
2351+
(1 row)
2352+
23412353
drop index trgm_idx;
23422354
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
23432355
set enable_seqscan=off;
@@ -3467,6 +3479,12 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
34673479
qwertyu0988 | 0.333333
34683480
(1 row)
34693481

3482+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
3483+
count
3484+
-------
3485+
1000
3486+
(1 row)
3487+
34703488
create table test2(t text);
34713489
insert into test2 values ('abcdef');
34723490
insert into test2 values ('quark');

contrib/pg_trgm/sql/pg_trgm.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu098
2121
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
2222
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
2323
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
24+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
2425

2526
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
2627
set enable_seqscan=off;
@@ -31,6 +32,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
3132
explain (costs off)
3233
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
3334
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
35+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
3436

3537
drop index trgm_idx;
3638
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
@@ -39,6 +41,7 @@ set enable_seqscan=off;
3941
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
4042
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
4143
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
44+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
4245

4346
create table test2(t text);
4447
insert into test2 values ('abcdef');

contrib/pg_trgm/trgm_regexp.c

Lines changed: 70 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -287,21 +287,22 @@ typedef struct
287287
* arcs - outgoing arcs of this state (List of TrgmArc)
288288
* enterKeys - enter keys reachable from this state without reading any
289289
* predictable trigram (List of TrgmStateKey)
290-
* fin - flag indicating this state is final
291-
* init - flag indicating this state is initial
290+
* flags - flag bits
292291
* parent - parent state, if this state has been merged into another
293-
* children - child states (states that have been merged into this one)
292+
* tentParent - planned parent state, if considering a merge
294293
* number - number of this state (used at the packaging stage)
295294
*/
295+
#define TSTATE_INIT 0x01 /* flag indicating this state is initial */
296+
#define TSTATE_FIN 0x02 /* flag indicating this state is final */
297+
296298
typedef struct TrgmState
297299
{
298300
TrgmStateKey stateKey; /* hashtable key: must be first field */
299301
List *arcs;
300302
List *enterKeys;
301-
bool fin;
302-
bool init;
303+
int flags;
303304
struct TrgmState *parent;
304-
List *children;
305+
struct TrgmState *tentParent;
305306
int number;
306307
} TrgmState;
307308

@@ -569,7 +570,7 @@ createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
569570
* get from the initial state to the final state without reading any
570571
* predictable trigram.
571572
*/
572-
if (trgmNFA.initState->fin)
573+
if (trgmNFA.initState->flags & TSTATE_FIN)
573574
return NULL;
574575

575576
/*
@@ -896,7 +897,7 @@ transformGraph(TrgmNFA *trgmNFA)
896897
initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
897898

898899
initstate = getState(trgmNFA, &initkey);
899-
initstate->init = true;
900+
initstate->flags |= TSTATE_INIT;
900901
trgmNFA->initState = initstate;
901902

902903
/*
@@ -914,7 +915,7 @@ transformGraph(TrgmNFA *trgmNFA)
914915
* actual processing.
915916
*/
916917
if (trgmNFA->overflowed)
917-
state->fin = true;
918+
state->flags |= TSTATE_FIN;
918919
else
919920
processState(trgmNFA, state);
920921

@@ -939,7 +940,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
939940
* queue is empty. But we can quit if the state gets marked final.
940941
*/
941942
addKey(trgmNFA, state, &state->stateKey);
942-
while (trgmNFA->keysQueue != NIL && !state->fin)
943+
while (trgmNFA->keysQueue != NIL && !(state->flags & TSTATE_FIN))
943944
{
944945
TrgmStateKey *key = (TrgmStateKey *) linitial(trgmNFA->keysQueue);
945946

@@ -951,7 +952,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
951952
* Add outgoing arcs only if state isn't final (we have no interest in
952953
* outgoing arcs if we already match)
953954
*/
954-
if (!state->fin)
955+
if (!(state->flags & TSTATE_FIN))
955956
addArcs(trgmNFA, state);
956957
}
957958

@@ -960,7 +961,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
960961
* whether this should result in any further enter keys being added.
961962
* If so, add those keys to keysQueue so that processState will handle them.
962963
*
963-
* If the enter key is for the NFA's final state, set state->fin = TRUE.
964+
* If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
964965
* This situation means that we can reach the final state from this expanded
965966
* state without reading any predictable trigram, so we must consider this
966967
* state as an accepting one.
@@ -1030,7 +1031,7 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
10301031
/* If state is now known final, mark it and we're done */
10311032
if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
10321033
{
1033-
state->fin = true;
1034+
state->flags |= TSTATE_FIN;
10341035
return;
10351036
}
10361037

@@ -1356,10 +1357,9 @@ getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
13561357
/* New state: initialize and queue it */
13571358
state->arcs = NIL;
13581359
state->enterKeys = NIL;
1359-
state->init = false;
1360-
state->fin = false;
1360+
state->flags = 0;
13611361
state->parent = NULL;
1362-
state->children = NIL;
1362+
state->tentParent = NULL;
13631363
state->number = -1;
13641364

13651365
trgmNFA->queue = lappend(trgmNFA->queue, state);
@@ -1538,20 +1538,60 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
15381538
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
15391539
TrgmState *source = arcInfo->source,
15401540
*target = arcInfo->target;
1541+
int source_flags,
1542+
target_flags;
15411543

15421544
/* examine parent states, if any merging has already happened */
15431545
while (source->parent)
15441546
source = source->parent;
15451547
while (target->parent)
15461548
target = target->parent;
15471549

1548-
if ((source->init || target->init) &&
1549-
(source->fin || target->fin))
1550+
/* we must also consider merges we are planning right now */
1551+
source_flags = source->flags;
1552+
while (source->tentParent)
1553+
{
1554+
source = source->tentParent;
1555+
source_flags |= source->flags;
1556+
}
1557+
target_flags = target->flags;
1558+
while (target->tentParent)
1559+
{
1560+
target = target->tentParent;
1561+
target_flags |= target->flags;
1562+
}
1563+
1564+
/* would fully-merged state have both INIT and FIN set? */
1565+
if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
1566+
(TSTATE_INIT | TSTATE_FIN))
15501567
{
15511568
canRemove = false;
15521569
break;
15531570
}
1571+
1572+
/* ok so far, so remember planned merge */
1573+
if (source != target)
1574+
target->tentParent = source;
15541575
}
1576+
1577+
/* We must clear all the tentParent fields before continuing */
1578+
foreach(cell, trgmInfo->arcs)
1579+
{
1580+
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1581+
TrgmState *target = arcInfo->target;
1582+
TrgmState *ttarget;
1583+
1584+
while (target->parent)
1585+
target = target->parent;
1586+
1587+
while ((ttarget = target->tentParent) != NULL)
1588+
{
1589+
target->tentParent = NULL;
1590+
target = ttarget;
1591+
}
1592+
}
1593+
1594+
/* Now, move on if we can't drop this trigram */
15551595
if (!canRemove)
15561596
continue;
15571597

@@ -1567,7 +1607,12 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
15671607
while (target->parent)
15681608
target = target->parent;
15691609
if (source != target)
1610+
{
15701611
mergeStates(source, target);
1612+
/* Assert we didn't merge initial and final states */
1613+
Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1614+
(TSTATE_INIT | TSTATE_FIN));
1615+
}
15711616
}
15721617

15731618
/* Mark trigram unexpanded, and update totalTrgmCount */
@@ -1709,27 +1754,15 @@ fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
17091754
static void
17101755
mergeStates(TrgmState *state1, TrgmState *state2)
17111756
{
1712-
ListCell *cell;
1713-
17141757
Assert(state1 != state2);
17151758
Assert(!state1->parent);
17161759
Assert(!state2->parent);
17171760

1718-
/* state1 absorbs state2's init/fin flags */
1719-
state1->init |= state2->init;
1720-
state1->fin |= state2->fin;
1761+
/* state1 absorbs state2's flags */
1762+
state1->flags |= state2->flags;
17211763

1722-
/* state2, and all its children, become children of state1 */
1723-
foreach(cell, state2->children)
1724-
{
1725-
TrgmState *state = (TrgmState *) lfirst(cell);
1726-
1727-
state->parent = state1;
1728-
}
1764+
/* state2, and indirectly all its children, become children of state1 */
17291765
state2->parent = state1;
1730-
state1->children = list_concat(state1->children, state2->children);
1731-
state1->children = lappend(state1->children, state2);
1732-
state2->children = NIL;
17331766
}
17341767

17351768
/*
@@ -1798,9 +1831,9 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
17981831

17991832
if (state->number < 0)
18001833
{
1801-
if (state->init)
1834+
if (state->flags & TSTATE_INIT)
18021835
state->number = 0;
1803-
else if (state->fin)
1836+
else if (state->flags & TSTATE_FIN)
18041837
state->number = 1;
18051838
else
18061839
{
@@ -2064,9 +2097,9 @@ printTrgmNFA(TrgmNFA *trgmNFA)
20642097
ListCell *cell;
20652098

20662099
appendStringInfo(&buf, "s%p", (void *) state);
2067-
if (state->fin)
2100+
if (state->flags & TSTATE_FIN)
20682101
appendStringInfo(&buf, " [shape = doublecircle]");
2069-
if (state->init)
2102+
if (state->flags & TSTATE_INIT)
20702103
initstate = state;
20712104
appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
20722105
appendStringInfo(&buf, ";\n");

0 commit comments

Comments
 (0)