Skip to content

Commit a4b169a

Browse files
authored
Add Regex Extension Library (#1187)
Adds the Regex extension library for additional regular expression operations. Functions: `regex.replace(source : string, pattern_re : string, replace : string) -> string` -- replace all; `regex.replace(source : string, pattern_re : string, replace : string, limit : int) -> string` -- replace all (capped); `regex.extract(source : string, pattern_re : string) -> optional(string)` -- extract pattern (first); `regex.extractAll(source : string, pattern_re : string) -> list(string)` -- extract pattern (all)
1 parent 050981e commit a4b169a

File tree

4 files changed

+611
-0
lines changed

4 files changed

+611
-0
lines changed

‎ext/BUILD.bazel‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ go_library(
1818
"math.go",
1919
"native.go",
2020
"protos.go",
21+
"regex.go",
2122
"sets.go",
2223
"strings.go",
2324
],
@@ -59,6 +60,7 @@ go_test(
5960
"math_test.go",
6061
"native_test.go",
6162
"protos_test.go",
63+
"regex_test.go",
6264
"sets_test.go",
6365
"strings_test.go",
6466
],

‎ext/extension_option_factory.go‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,7 @@ var extFactories = map[string]extensionFactory{
6969
"two-var-comprehensions": func(version uint32) cel.EnvOption {
7070
return TwoVarComprehensions(TwoVarComprehensionsVersion(version))
7171
},
72+
"regex": func(version uint32) cel.EnvOption {
73+
return Regex(RegexVersion(version))
74+
},
7275
}

‎ext/regex.go‎

Lines changed: 332 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,332 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package ext
16+
17+
import (
18+
"errors"
19+
"fmt"
20+
"math"
21+
"regexp"
22+
"strconv"
23+
"strings"
24+
25+
"github.com/google/cel-go/cel"
26+
"github.com/google/cel-go/common/types"
27+
"github.com/google/cel-go/common/types/ref"
28+
)
29+
30+
// Namespaced CEL function names declared by the regex extension library.
const (
	// regexReplace names the replace overloads (with and without a count).
	regexReplace = "regex.replace"
	// regexExtract names the first-match extraction function.
	regexExtract = "regex.extract"
	// regexExtractAll names the all-matches extraction function.
	regexExtractAll = "regex.extractAll"
)
35+
36+
// Regex returns a cel.EnvOption to configure extended functions for regular
37+
// expression operations.
38+
//
39+
// Note: all functions use the 'regex' namespace. If you are
40+
// currently using a variable named 'regex', the functions will likely work as
41+
// intended, however there is some chance for collision.
42+
//
43+
// This library depends on the CEL optional type. Please ensure that the
44+
// cel.OptionalTypes() is enabled when using regex extensions.
45+
//
46+
// # Replace
47+
//
48+
// The `regex.replace` function replaces all non-overlapping matches of a regex
49+
// pattern in the target string with a replacement string. Optionally, you can
50+
// limit the number of replacements by providing a count argument. When the count
51+
// is a negative number, the function acts as replace all. Only numeric (\N)
52+
// capture group references are supported in the replacement string, with
53+
// validation for correctness. Backslash-escaped digits (\1 to \9) within the
54+
// replacement argument can be used to insert text matching the corresponding
55+
// parenthesized group in the regexp pattern. An error will be thrown for invalid
56+
// regex or replace string.
57+
//
58+
// regex.replace(target: string, pattern: string, replacement: string) -> string
59+
// regex.replace(target: string, pattern: string, replacement: string, count: int) -> string
60+
//
61+
// Examples:
62+
//
63+
// regex.replace('hello world hello', 'hello', 'hi') == 'hi world hi'
64+
// regex.replace('banana', 'a', 'x', 0) == 'banana'
65+
// regex.replace('banana', 'a', 'x', 1) == 'bxnana'
66+
// regex.replace('banana', 'a', 'x', 2) == 'bxnxna'
67+
// regex.replace('banana', 'a', 'x', -12) == 'bxnxnx'
68+
// regex.replace('foo bar', '(fo)o (ba)r', r'\2 \1') == 'ba fo'
69+
// regex.replace('test', '(.)', r'\2') // Runtime Error invalid replace string
70+
// regex.replace('foo bar', '(', '$2 $1') // Runtime Error invalid regex string
71+
// regex.replace('id=123', r'id=(?P<value>\d+)', r'value: \values') // Runtime Error invalid replace string
72+
//
73+
// # Extract
74+
//
75+
// The `regex.extract` function returns the first match of a regex pattern in a
76+
// string. If no match is found, it returns an optional none value. An error will
77+
// be thrown for invalid regex or for multiple capture groups.
78+
//
79+
// regex.extract(target: string, pattern: string) -> optional<string>
80+
//
81+
// Examples:
82+
//
83+
// regex.extract('hello world', 'hello(.*)') == optional.of(' world')
84+
// regex.extract('item-A, item-B', 'item-(\\w+)') == optional.of('A')
85+
// regex.extract('HELLO', 'hello') == optional.empty()
86+
// regex.extract('testuser@testdomain', '(.*)@([^.]*)') // Runtime Error multiple capture group
87+
//
88+
// # Extract All
89+
//
90+
// The `regex.extractAll` function returns a list of all matches of a regex
91+
// pattern in a target string. If no matches are found, it returns an empty list. An error will
92+
// be thrown for invalid regex or for multiple capture groups.
93+
//
94+
// regex.extractAll(target: string, pattern: string) -> list<string>
95+
//
96+
// Examples:
97+
//
98+
// regex.extractAll('id:123, id:456', 'id:\\d+') == ['id:123', 'id:456']
99+
// regex.extractAll('id:123, id:456', 'assa') == []
100+
// regex.extractAll('testuser@testdomain', '(.*)@([^.]*)') // Runtime Error multiple capture group
101+
func Regex(options ...RegexOptions) cel.EnvOption {
102+
s := &regexLib{
103+
version: math.MaxUint32,
104+
}
105+
for _, o := range options {
106+
s = o(s)
107+
}
108+
return cel.Lib(s)
109+
}
110+
111+
// RegexOptions declares a functional operator for configuring regex extension.
112+
type RegexOptions func(*regexLib) *regexLib
113+
114+
// RegexVersion configures the version of the Regex library definitions to use. See [Regex] for supported values.
115+
func RegexVersion(version uint32) RegexOptions {
116+
return func(lib *regexLib) *regexLib {
117+
lib.version = version
118+
return lib
119+
}
120+
}
121+
122+
// regexLib holds the configuration of the regex extension library.
type regexLib struct {
	// version is the library version selected through RegexVersion; Regex
	// initializes it to math.MaxUint32 when no option overrides it.
	version uint32
}
125+
126+
// LibraryName implements that SingletonLibrary interface method.
127+
func (r *regexLib) LibraryName() string {
128+
return "cel.lib.ext.regex"
129+
}
130+
131+
// CompileOptions implements the cel.Library interface method.
132+
func (r *regexLib) CompileOptions() []cel.EnvOption {
133+
optionalTypesEnabled := func(env *cel.Env) (*cel.Env, error) {
134+
if !env.HasLibrary("cel.lib.optional") {
135+
return nil, errors.New("regex library requires the optional library")
136+
}
137+
return env, nil
138+
}
139+
opts := []cel.EnvOption{
140+
cel.Function(regexExtract,
141+
cel.Overload("regex_extract_string_string", []*cel.Type{cel.StringType, cel.StringType}, cel.OptionalType(cel.StringType),
142+
cel.BinaryBinding(extract))),
143+
144+
cel.Function(regexExtractAll,
145+
cel.Overload("regex_extractAll_string_string", []*cel.Type{cel.StringType, cel.StringType}, cel.ListType(cel.StringType),
146+
cel.BinaryBinding(extractAll))),
147+
148+
cel.Function(regexReplace,
149+
cel.Overload("regex_replace_string_string_string", []*cel.Type{cel.StringType, cel.StringType, cel.StringType}, cel.StringType,
150+
cel.FunctionBinding(regReplace)),
151+
cel.Overload("regex_replace_string_string_string_int", []*cel.Type{cel.StringType, cel.StringType, cel.StringType, cel.IntType}, cel.StringType,
152+
cel.FunctionBinding((regReplaceN))),
153+
),
154+
cel.EnvOption(optionalTypesEnabled),
155+
}
156+
return opts
157+
}
158+
159+
// ProgramOptions implements the cel.Library interface method
160+
func (r *regexLib) ProgramOptions() []cel.ProgramOption {
161+
return []cel.ProgramOption{}
162+
}
163+
164+
// compileRegex compiles regexStr with the standard regexp package. On failure
// it returns a nil regexp and an error that wraps the compile error behind the
// prefix "given regex is invalid".
func compileRegex(regexStr string) (*regexp.Regexp, error) {
	compiled, compileErr := regexp.Compile(regexStr)
	if compileErr == nil {
		return compiled, nil
	}
	return nil, fmt.Errorf("given regex is invalid: %w", compileErr)
}
171+
172+
func regReplace(args ...ref.Val) ref.Val {
173+
target := args[0].(types.String)
174+
regexStr := args[1].(types.String)
175+
replaceStr := args[2].(types.String)
176+
177+
return regReplaceN(target, regexStr, replaceStr, types.Int(-1))
178+
}
179+
180+
func regReplaceN(args ...ref.Val) ref.Val {
181+
target := string(args[0].(types.String))
182+
regexStr := string(args[1].(types.String))
183+
replaceStr := string(args[2].(types.String))
184+
replaceCount := int64(args[3].(types.Int))
185+
186+
if replaceCount == 0 {
187+
return types.String(target)
188+
}
189+
190+
if replaceCount > math.MaxInt32 {
191+
return types.NewErr("integer overflow")
192+
}
193+
194+
// If replaceCount is negative, just do a replaceAll.
195+
if replaceCount < 0 {
196+
replaceCount = -1
197+
}
198+
199+
re, err := regexp.Compile(regexStr)
200+
if err != nil {
201+
return types.WrapErr(err)
202+
}
203+
204+
var resultBuilder strings.Builder
205+
var lastIndex int
206+
counter := int64(0)
207+
208+
matches := re.FindAllStringSubmatchIndex(target, -1)
209+
210+
for _, match := range matches {
211+
if replaceCount != -1 && counter >= replaceCount {
212+
break
213+
}
214+
215+
processedReplacement, err := replaceStrValidator(target, re, match, replaceStr)
216+
if err != nil {
217+
return types.WrapErr(err)
218+
}
219+
220+
resultBuilder.WriteString(target[lastIndex:match[0]])
221+
resultBuilder.WriteString(processedReplacement)
222+
lastIndex = match[1]
223+
counter++
224+
}
225+
226+
resultBuilder.WriteString(target[lastIndex:])
227+
return types.String(resultBuilder.String())
228+
}
229+
230+
// replaceStrValidator expands replacement for a single match and validates it
// along the way.
//
// match holds the submatch index pairs for one match of re in target (the
// layout produced by the regexp FindSubmatchIndex family). Within replacement:
//   - `\\` produces a literal backslash;
//   - `\N`, where N is a single digit, produces the text captured by group N
//     (nothing when that group did not participate in the match);
//   - every other character is copied as is.
//
// An error is returned when a backslash ends the string, is followed by
// anything other than a digit or a backslash, or names a group number larger
// than the number of groups in re.
func replaceStrValidator(target string, re *regexp.Regexp, match []int, replacement string) (string, error) {
	var out strings.Builder
	chars := []rune(replacement)
	maxGroup := re.NumSubexp()

	for pos := 0; pos < len(chars); pos++ {
		if chars[pos] != '\\' {
			out.WriteRune(chars[pos])
			continue
		}

		// Step onto the character being escaped.
		pos++
		if pos >= len(chars) {
			return "", fmt.Errorf("invalid replacement string: '%s' \\ not allowed at end", replacement)
		}

		escaped := chars[pos]
		if escaped == '\\' {
			out.WriteRune('\\')
			continue
		}

		// Only one character is inspected, so group references are single digits.
		group, err := strconv.Atoi(string(escaped))
		switch {
		case err != nil:
			return "", fmt.Errorf("invalid replacement string: '%s' \\ must be followed by a digit or \\", replacement)
		case group > maxGroup:
			return "", fmt.Errorf("replacement string references group %d but regex has only %d group(s)", group, maxGroup)
		}

		// A start index of -1 marks a group that did not participate in the match.
		if start, end := match[2*group], match[2*group+1]; start != -1 {
			out.WriteString(target[start:end])
		}
	}
	return out.String(), nil
}
270+
271+
func extract(target, regexStr ref.Val) ref.Val {
272+
t := string(target.(types.String))
273+
r := string(regexStr.(types.String))
274+
re, err := compileRegex(r)
275+
if err != nil {
276+
return types.WrapErr(err)
277+
}
278+
279+
if len(re.SubexpNames())-1 > 1 {
280+
return types.WrapErr(fmt.Errorf("regular expression has more than one capturing group: %q", r))
281+
}
282+
283+
matches := re.FindStringSubmatch(t)
284+
if len(matches) == 0 {
285+
return types.OptionalNone
286+
}
287+
288+
// If there is a capturing group, return the first match; otherwise, return the whole match.
289+
if len(matches) > 1 {
290+
capturedGroup := matches[1]
291+
// If optional group is empty, return OptionalNone.
292+
if capturedGroup == "" {
293+
return types.OptionalNone
294+
}
295+
return types.OptionalOf(types.String(capturedGroup))
296+
}
297+
return types.OptionalOf(types.String(matches[0]))
298+
}
299+
300+
func extractAll(target, regexStr ref.Val) ref.Val {
301+
t := string(target.(types.String))
302+
r := string(regexStr.(types.String))
303+
re, err := compileRegex(r)
304+
if err != nil {
305+
return types.WrapErr(err)
306+
}
307+
308+
groupCount := len(re.SubexpNames()) - 1
309+
if groupCount > 1 {
310+
return types.WrapErr(fmt.Errorf("regular expression has more than one capturing group: %q", r))
311+
}
312+
313+
matches := re.FindAllStringSubmatch(t, -1)
314+
result := make([]string, 0, len(matches))
315+
if len(matches) == 0 {
316+
return types.NewStringList(types.DefaultTypeAdapter, result)
317+
}
318+
319+
if groupCount != 1 {
320+
for _, match := range matches {
321+
result = append(result, match[0])
322+
}
323+
return types.NewStringList(types.DefaultTypeAdapter, result)
324+
}
325+
326+
for _, match := range matches {
327+
if match[1] != "" {
328+
result = append(result, match[1])
329+
}
330+
}
331+
return types.NewStringList(types.DefaultTypeAdapter, result)
332+
}

0 commit comments

Comments
 (0)