|
| 1 | +// Copyright 2025 Google LLC |
| 2 | +// |
| 3 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +// you may not use this file except in compliance with the License. |
| 5 | +// You may obtain a copy of the License at |
| 6 | +// |
| 7 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +// |
| 9 | +// Unless required by applicable law or agreed to in writing, software |
| 10 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +// See the License for the specific language governing permissions and |
| 13 | +// limitations under the License. |
| 14 | + |
| 15 | +package ext |
| 16 | + |
| 17 | +import ( |
| 18 | + "errors" |
| 19 | + "fmt" |
| 20 | + "math" |
| 21 | + "regexp" |
| 22 | + "strconv" |
| 23 | + "strings" |
| 24 | + |
| 25 | + "github.com/google/cel-go/cel" |
| 26 | + "github.com/google/cel-go/common/types" |
| 27 | + "github.com/google/cel-go/common/types/ref" |
| 28 | +) |
| 29 | + |
// Fully qualified names of the functions declared by the regex extension.
// Every function lives in the 'regex' namespace.
const (
	// regexExtract is the name of the first-match extraction function.
	regexExtract = "regex.extract"
	// regexExtractAll is the name of the all-matches extraction function.
	regexExtractAll = "regex.extractAll"
	// regexReplace is the name of the replacement function (both overloads).
	regexReplace = "regex.replace"
)
| 35 | + |
| 36 | +// Regex returns a cel.EnvOption to configure extended functions for regular |
| 37 | +// expression operations. |
| 38 | +// |
| 39 | +// Note: all functions use the 'regex' namespace. If you are |
| 40 | +// currently using a variable named 'regex', the functions will likely work as |
| 41 | +// intended, however there is some chance for collision. |
| 42 | +// |
| 43 | +// This library depends on the CEL optional type. Please ensure that the |
| 44 | +// cel.OptionalTypes() is enabled when using regex extensions. |
| 45 | +// |
| 46 | +// # Replace |
| 47 | +// |
| 48 | +// The `regex.replace` function replaces all non-overlapping substring of a regex |
| 49 | +// pattern in the target string with a replacement string. Optionally, you can |
| 50 | +// limit the number of replacements by providing a count argument. When the count |
| 51 | +// is a negative number, the function acts as replace all. Only numeric (\N) |
| 52 | +// capture group references are supported in the replacement string, with |
| 53 | +// validation for correctness. Backslashed-escaped digits (\1 to \9) within the |
| 54 | +// replacement argument can be used to insert text matching the corresponding |
| 55 | +// parenthesized group in the regexp pattern. An error will be thrown for invalid |
| 56 | +// regex or replace string. |
| 57 | +// |
| 58 | +// regex.replace(target: string, pattern: string, replacement: string) -> string |
| 59 | +// regex.replace(target: string, pattern: string, replacement: string, count: int) -> string |
| 60 | +// |
| 61 | +// Examples: |
| 62 | +// |
| 63 | +// regex.replace('hello world hello', 'hello', 'hi') == 'hi world hi' |
| 64 | +// regex.replace('banana', 'a', 'x', 0) == 'banana' |
| 65 | +// regex.replace('banana', 'a', 'x', 1) == 'bxnana' |
| 66 | +// regex.replace('banana', 'a', 'x', 2) == 'bxnxna' |
| 67 | +// regex.replace('banana', 'a', 'x', -12) == 'bxnxnx' |
| 68 | +// regex.replace('foo bar', '(fo)o (ba)r', r'\2 \1') == 'ba fo' |
| 69 | +// regex.replace('test', '(.)', r'\2') \\ Runtime Error invalid replace string |
| 70 | +// regex.replace('foo bar', '(', '$2 $1') \\ Runtime Error invalid regex string |
| 71 | +// regex.replace('id=123', r'id=(?P<value>\d+)', r'value: \values') \\ Runtime Error invalid replace string |
| 72 | +// |
| 73 | +// # Extract |
| 74 | +// |
| 75 | +// The `regex.extract` function returns the first match of a regex pattern in a |
| 76 | +// string. If no match is found, it returns an optional none value. An error will |
| 77 | +// be thrown for invalid regex or for multiple capture groups. |
| 78 | +// |
| 79 | +// regex.extract(target: string, pattern: string) -> optional<string> |
| 80 | +// |
| 81 | +// Examples: |
| 82 | +// |
| 83 | +// regex.extract('hello world', 'hello(.*)') == optional.of(' world') |
| 84 | +// regex.extract('item-A, item-B', 'item-(\\w+)') == optional.of('A') |
| 85 | +// regex.extract('HELLO', 'hello') == optional.empty() |
| 86 | +// regex.extract('testuser@testdomain', '(.*)@([^.]*)') // Runtime Error multiple capture group |
| 87 | +// |
| 88 | +// # Extract All |
| 89 | +// |
| 90 | +// The `regex.extractAll` function returns a list of all matches of a regex |
| 91 | +// pattern in a target string. If no matches are found, it returns an empty list. An error will |
| 92 | +// be thrown for invalid regex or for multiple capture groups. |
| 93 | +// |
| 94 | +// regex.extractAll(target: string, pattern: string) -> list<string> |
| 95 | +// |
| 96 | +// Examples: |
| 97 | +// |
| 98 | +// regex.extractAll('id:123, id:456', 'id:\\d+') == ['id:123', 'id:456'] |
| 99 | +// regex.extractAll('id:123, id:456', 'assa') == [] |
| 100 | +// regex.extractAll('testuser@testdomain', '(.*)@([^.]*)') // Runtime Error multiple capture group |
| 101 | +func Regex(options ...RegexOptions) cel.EnvOption { |
| 102 | + s := ®exLib{ |
| 103 | + version: math.MaxUint32, |
| 104 | + } |
| 105 | + for _, o := range options { |
| 106 | + s = o(s) |
| 107 | + } |
| 108 | + return cel.Lib(s) |
| 109 | +} |
| 110 | + |
| 111 | +// RegexOptions declares a functional operator for configuring regex extension. |
| 112 | +type RegexOptions func(*regexLib) *regexLib |
| 113 | + |
| 114 | +// RegexVersion configures the version of the Regex library definitions to use. See [Regex] for supported values. |
| 115 | +func RegexVersion(version uint32) RegexOptions { |
| 116 | + return func(lib *regexLib) *regexLib { |
| 117 | + lib.version = version |
| 118 | + return lib |
| 119 | + } |
| 120 | +} |
| 121 | + |
// regexLib is the library value registered with the CEL environment for the
// regex extension.
type regexLib struct {
	// version is the library version selected through RegexVersion; Regex
	// defaults it to math.MaxUint32.
	version uint32
}
| 125 | + |
| 126 | +// LibraryName implements that SingletonLibrary interface method. |
| 127 | +func (r *regexLib) LibraryName() string { |
| 128 | + return "cel.lib.ext.regex" |
| 129 | +} |
| 130 | + |
| 131 | +// CompileOptions implements the cel.Library interface method. |
| 132 | +func (r *regexLib) CompileOptions() []cel.EnvOption { |
| 133 | + optionalTypesEnabled := func(env *cel.Env) (*cel.Env, error) { |
| 134 | + if !env.HasLibrary("cel.lib.optional") { |
| 135 | + return nil, errors.New("regex library requires the optional library") |
| 136 | + } |
| 137 | + return env, nil |
| 138 | + } |
| 139 | + opts := []cel.EnvOption{ |
| 140 | + cel.Function(regexExtract, |
| 141 | + cel.Overload("regex_extract_string_string", []*cel.Type{cel.StringType, cel.StringType}, cel.OptionalType(cel.StringType), |
| 142 | + cel.BinaryBinding(extract))), |
| 143 | + |
| 144 | + cel.Function(regexExtractAll, |
| 145 | + cel.Overload("regex_extractAll_string_string", []*cel.Type{cel.StringType, cel.StringType}, cel.ListType(cel.StringType), |
| 146 | + cel.BinaryBinding(extractAll))), |
| 147 | + |
| 148 | + cel.Function(regexReplace, |
| 149 | + cel.Overload("regex_replace_string_string_string", []*cel.Type{cel.StringType, cel.StringType, cel.StringType}, cel.StringType, |
| 150 | + cel.FunctionBinding(regReplace)), |
| 151 | + cel.Overload("regex_replace_string_string_string_int", []*cel.Type{cel.StringType, cel.StringType, cel.StringType, cel.IntType}, cel.StringType, |
| 152 | + cel.FunctionBinding((regReplaceN))), |
| 153 | + ), |
| 154 | + cel.EnvOption(optionalTypesEnabled), |
| 155 | + } |
| 156 | + return opts |
| 157 | +} |
| 158 | + |
| 159 | +// ProgramOptions implements the cel.Library interface method |
| 160 | +func (r *regexLib) ProgramOptions() []cel.ProgramOption { |
| 161 | + return []cel.ProgramOption{} |
| 162 | +} |
| 163 | + |
// compileRegex compiles regexStr with regexp.Compile.
//
// On success it returns the compiled expression. On failure it returns nil
// and an error that wraps the compiler's error behind the prefix
// "given regex is invalid: ".
func compileRegex(regexStr string) (*regexp.Regexp, error) {
	compiled, err := regexp.Compile(regexStr)
	if err == nil {
		return compiled, nil
	}
	return nil, fmt.Errorf("given regex is invalid: %w", err)
}
| 171 | + |
| 172 | +func regReplace(args ...ref.Val) ref.Val { |
| 173 | + target := args[0].(types.String) |
| 174 | + regexStr := args[1].(types.String) |
| 175 | + replaceStr := args[2].(types.String) |
| 176 | + |
| 177 | + return regReplaceN(target, regexStr, replaceStr, types.Int(-1)) |
| 178 | +} |
| 179 | + |
| 180 | +func regReplaceN(args ...ref.Val) ref.Val { |
| 181 | + target := string(args[0].(types.String)) |
| 182 | + regexStr := string(args[1].(types.String)) |
| 183 | + replaceStr := string(args[2].(types.String)) |
| 184 | + replaceCount := int64(args[3].(types.Int)) |
| 185 | + |
| 186 | + if replaceCount == 0 { |
| 187 | + return types.String(target) |
| 188 | + } |
| 189 | + |
| 190 | + if replaceCount > math.MaxInt32 { |
| 191 | + return types.NewErr("integer overflow") |
| 192 | + } |
| 193 | + |
| 194 | + // If replaceCount is negative, just do a replaceAll. |
| 195 | + if replaceCount < 0 { |
| 196 | + replaceCount = -1 |
| 197 | + } |
| 198 | + |
| 199 | + re, err := regexp.Compile(regexStr) |
| 200 | + if err != nil { |
| 201 | + return types.WrapErr(err) |
| 202 | + } |
| 203 | + |
| 204 | + var resultBuilder strings.Builder |
| 205 | + var lastIndex int |
| 206 | + counter := int64(0) |
| 207 | + |
| 208 | + matches := re.FindAllStringSubmatchIndex(target, -1) |
| 209 | + |
| 210 | + for _, match := range matches { |
| 211 | + if replaceCount != -1 && counter >= replaceCount { |
| 212 | + break |
| 213 | + } |
| 214 | + |
| 215 | + processedReplacement, err := replaceStrValidator(target, re, match, replaceStr) |
| 216 | + if err != nil { |
| 217 | + return types.WrapErr(err) |
| 218 | + } |
| 219 | + |
| 220 | + resultBuilder.WriteString(target[lastIndex:match[0]]) |
| 221 | + resultBuilder.WriteString(processedReplacement) |
| 222 | + lastIndex = match[1] |
| 223 | + counter++ |
| 224 | + } |
| 225 | + |
| 226 | + resultBuilder.WriteString(target[lastIndex:]) |
| 227 | + return types.String(resultBuilder.String()) |
| 228 | +} |
| 229 | + |
// replaceStrValidator expands replacement for a single match and validates it
// along the way.
//
// target is the string that was searched, re the compiled pattern and match
// the submatch index pairs for one match (as produced by the regexp
// FindSubmatchIndex family). Within replacement:
//
//   - `\\` produces a literal backslash;
//   - `\N` (a single digit) produces the text captured by group N, or nothing
//     when that group did not participate in the match;
//   - any other character is copied as is.
//
// An error is returned when a backslash ends the string, is followed by
// anything other than a digit or a backslash, or references a group number
// greater than the number of groups in re.
func replaceStrValidator(target string, re *regexp.Regexp, match []int, replacement string) (string, error) {
	var out strings.Builder
	maxGroup := re.NumSubexp()
	chars := []rune(replacement)
	last := len(chars) - 1

	for pos := 0; pos <= last; pos++ {
		if chars[pos] != '\\' {
			out.WriteRune(chars[pos])
			continue
		}

		// A backslash must introduce an escape, so it cannot be the final rune.
		if pos == last {
			return "", fmt.Errorf("invalid replacement string: '%s' \\ not allowed at end", replacement)
		}

		pos++
		switch escaped := chars[pos]; escaped {
		case '\\':
			out.WriteRune('\\')
		default:
			group, err := strconv.Atoi(string(escaped))
			if err != nil {
				return "", fmt.Errorf("invalid replacement string: '%s' \\ must be followed by a digit or \\", replacement)
			}
			if group > maxGroup {
				return "", fmt.Errorf("replacement string references group %d but regex has only %d group(s)", group, maxGroup)
			}
			// A start index of -1 marks a group that did not participate.
			if start := match[2*group]; start != -1 {
				out.WriteString(target[start:match[2*group+1]])
			}
		}
	}
	return out.String(), nil
}
| 270 | + |
| 271 | +func extract(target, regexStr ref.Val) ref.Val { |
| 272 | + t := string(target.(types.String)) |
| 273 | + r := string(regexStr.(types.String)) |
| 274 | + re, err := compileRegex(r) |
| 275 | + if err != nil { |
| 276 | + return types.WrapErr(err) |
| 277 | + } |
| 278 | + |
| 279 | + if len(re.SubexpNames())-1 > 1 { |
| 280 | + return types.WrapErr(fmt.Errorf("regular expression has more than one capturing group: %q", r)) |
| 281 | + } |
| 282 | + |
| 283 | + matches := re.FindStringSubmatch(t) |
| 284 | + if len(matches) == 0 { |
| 285 | + return types.OptionalNone |
| 286 | + } |
| 287 | + |
| 288 | + // If there is a capturing group, return the first match; otherwise, return the whole match. |
| 289 | + if len(matches) > 1 { |
| 290 | + capturedGroup := matches[1] |
| 291 | + // If optional group is empty, return OptionalNone. |
| 292 | + if capturedGroup == "" { |
| 293 | + return types.OptionalNone |
| 294 | + } |
| 295 | + return types.OptionalOf(types.String(capturedGroup)) |
| 296 | + } |
| 297 | + return types.OptionalOf(types.String(matches[0])) |
| 298 | +} |
| 299 | + |
| 300 | +func extractAll(target, regexStr ref.Val) ref.Val { |
| 301 | + t := string(target.(types.String)) |
| 302 | + r := string(regexStr.(types.String)) |
| 303 | + re, err := compileRegex(r) |
| 304 | + if err != nil { |
| 305 | + return types.WrapErr(err) |
| 306 | + } |
| 307 | + |
| 308 | + groupCount := len(re.SubexpNames()) - 1 |
| 309 | + if groupCount > 1 { |
| 310 | + return types.WrapErr(fmt.Errorf("regular expression has more than one capturing group: %q", r)) |
| 311 | + } |
| 312 | + |
| 313 | + matches := re.FindAllStringSubmatch(t, -1) |
| 314 | + result := make([]string, 0, len(matches)) |
| 315 | + if len(matches) == 0 { |
| 316 | + return types.NewStringList(types.DefaultTypeAdapter, result) |
| 317 | + } |
| 318 | + |
| 319 | + if groupCount != 1 { |
| 320 | + for _, match := range matches { |
| 321 | + result = append(result, match[0]) |
| 322 | + } |
| 323 | + return types.NewStringList(types.DefaultTypeAdapter, result) |
| 324 | + } |
| 325 | + |
| 326 | + for _, match := range matches { |
| 327 | + if match[1] != "" { |
| 328 | + result = append(result, match[1]) |
| 329 | + } |
| 330 | + } |
| 331 | + return types.NewStringList(types.DefaultTypeAdapter, result) |
| 332 | +} |
0 commit comments