The code that translates SIMILAR TO pattern matching expressions to
POSIX-style regular expressions did not consider that square brackets
can be nested. For example, in an expression like [[:alpha:]%_], the
logic replaced the placeholders '_' and '%' but it should not.
This commit fixes the conversion logic by tracking the nesting level of
square brackets marking character class areas, while considering that
in expressions like []] or [^]] the first closing square bracket is a
regular character. Multiple tests are added to show how the conversions
should or should not apply applied while in a character class area, with
specific cases added for all the characters converted outside character
classes like an opening parenthesis '(', dollar sign '$', etc.
Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/
16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at
Backpatch-through: 13
int plen,
elen;
bool afterescape = false;
- bool incharclass = false;
int nquotes = 0;
+ int charclass_depth = 0; /* Nesting level of character classes,
+ * encompassed by square brackets */
+ int charclass_start = 0; /* State of the character class start,
+ * for carets */
p = VARDATA_ANY(pat_text);
plen = VARSIZE_ANY_EXHDR(pat_text);
/* fast path */
if (afterescape)
{
- if (pchar == '"' && !incharclass) /* escape-double-quote? */
+ if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */
{
/* emit appropriate part separator, per notes above */
if (nquotes == 0)
/* SQL escape character; do not send to output */
afterescape = true;
}
- else if (incharclass)
+ else if (charclass_depth > 0)
{
if (pchar == '\\')
*r++ = '\\';
*r++ = pchar;
- if (pchar == ']')
- incharclass = false;
+
+ /*
+ * Ignore a closing bracket at the start of a character class.
+ * Such a bracket is taken literally rather than closing the
+ * class. "charclass_start" is 1 right at the beginning of a
+ * class and 2 after an initial caret.
+ */
+ if (pchar == ']' && charclass_start > 2)
+ charclass_depth--;
+ else if (pchar == '[')
+ charclass_depth++;
+
+ /*
+ * If there is a caret right after the opening bracket, it negates
+ * the character class, but a following closing bracket should
+ * still be treated as a normal character. That holds only for
+ * the first caret, so only the values 1 and 2 mean that closing
+ * brackets should be taken literally.
+ */
+ if (pchar == '^')
+ charclass_start++;
+ else
+ charclass_start = 3; /* definitely past the start */
}
else if (pchar == '[')
{
+ /* start of a character class */
*r++ = pchar;
- incharclass = true;
+ charclass_depth++;
+ charclass_start = 1;
}
else if (pchar == '%')
{
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
ERROR: invalid escape string
HINT: Escape string must be empty or one character.
+-- Characters that should be left alone in character classes when a
+-- SIMILAR TO regexp pattern is converted to POSIX style.
+-- Underscore "_"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
+ QUERY PLAN
+------------------------------------------------
+ Seq Scan on text_tbl
+ Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
+(2 rows)
+
+-- Percentage "%"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
+ QUERY PLAN
+--------------------------------------------------
+ Seq Scan on text_tbl
+ Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
+(2 rows)
+
+-- Dot "."
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
+ QUERY PLAN
+--------------------------------------------------
+ Seq Scan on text_tbl
+ Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
+(2 rows)
+
+-- Dollar "$"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
+ QUERY PLAN
+--------------------------------------------------
+ Seq Scan on text_tbl
+ Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
+(2 rows)
+
+-- Opening parenthesis "("
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
+ERROR: invalid regular expression: parentheses () not balanced
+-- Caret "^"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
+ QUERY PLAN
+------------------------------------------------------------------------
+ Seq Scan on text_tbl
+ Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
+(2 rows)
+
+-- Closing square bracket "]" at the beginning of character class
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
+ QUERY PLAN
+------------------------------------------------
+ Seq Scan on text_tbl
+ Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
+(2 rows)
+
+-- Closing square bracket effective after two carets at the beginning
+-- of character class.
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
+ QUERY PLAN
+---------------------------------------
+ Seq Scan on text_tbl
+ Filter: (f1 ~ '^(?:[^^]\^)$'::text)
+(2 rows)
+
-- Test backslash escapes in regexp_replace's replacement string
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
regexp_replace
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
+-- Characters that should be left alone in character classes when a
+-- SIMILAR TO regexp pattern is converted to POSIX style.
+-- Underscore "_"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
+-- Percentage "%"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
+-- Dot "."
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
+-- Dollar "$"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
+-- Opening parenthesis "("
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
+-- Caret "^"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
+-- Closing square bracket "]" at the beginning of character class
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
+-- Closing square bracket effective after two carets at the beginning
+-- of character class.
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
+
-- Test backslash escapes in regexp_replace's replacement string
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');