This is an automated cherry-pick of #64165

Signed-off-by: ti-chi-bot <ti-community-prow-bot@tidb.io>
pingcap · ti-chi-bot · Oct 31, 2025 · Oct 31, 2025 · b725930542a6b0a9d403ee253476fbeda13a0b36
commit b725930542a6b0a9d403ee253476fbeda13a0b36
diff --git a/pkg/util/collate/collate.go b/pkg/util/collate/collate.go
@@ -19,6 +19,7 @@ import (
 	"fmt"
 	"slices"
 	"sync/atomic"
+	"unicode/utf8"
 
 	"github.com/pingcap/errors"
 	"github.com/pingcap/tidb/pkg/parser/charset"
@@ -384,6 +385,51 @@ func CollationToProto(c string) int32 {
 	return v
 }
 
+// compareCommon compares a and b rune by rune using the weight that keyFunc
+// assigns to each rune. Both inputs are first passed through
+// truncateTailingSpace.
+//
+// The result is -1, 0 or +1. When one string is exhausted before a differing
+// weight is found, the string that still has bytes left compares greater. If
+// an invalid UTF-8 byte sequence is met on either side before a difference is
+// found, the comparison stops and 0 is returned.
+func compareCommon(a, b string, keyFunc func(rune) uint32) int {
+	a = truncateTailingSpace(a)
+	b = truncateTailingSpace(b)
+
+	ai, bi := 0, 0
+	for ai < len(a) && bi < len(b) {
+		r1, r1Len := utf8.DecodeRuneInString(a[ai:])
+		r2, r2Len := utf8.DecodeRuneInString(b[bi:])
+		// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
+		// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
+		// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
+		invalid1 := r1 == utf8.RuneError && r1Len == 1
+		invalid2 := r2 == utf8.RuneError && r2Len == 1
+		if invalid1 || invalid2 {
+			return 0
+		}
+
+		ai += r1Len
+		bi += r2Len
+
+		// Keep the result in a name other than "cmp" so the cmp package is
+		// not shadowed inside the loop body.
+		if res := cmp.Compare(keyFunc(r1), keyFunc(r2)); res != 0 {
+			return res
+		}
+	}
+	// At least one side is fully consumed here; order by the bytes remaining.
+	return cmp.Compare(len(a)-ai, len(b)-bi)
+}
+
+// CanUseRawMemAsKey returns true if the given collator can use the original
+// raw memory as the key. It only returns true for binCollator and
+// derivedBinCollator.
+func CanUseRawMemAsKey(c Collator) bool {
+	switch c.(type) {
+	case *binCollator, *derivedBinCollator:
+		return true
+	default:
+		return false
+	}
+}
+
 // ProtoToCollation converts collation from int32(used by protocol) to string.
 func ProtoToCollation(c int32) string {
 	coll, err := charset.GetCollationByID(int(RestoreCollationIDIfNeeded(c)))

diff --git a/pkg/util/collate/gb18030_chinese_ci.go b/pkg/util/collate/gb18030_chinese_ci.go
@@ -0,0 +1,115 @@
+// Copyright 2024 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package collate
+
+import (
+	_ "embed"
+	"encoding/binary"
+	"unicode/utf8"
+
+	"github.com/pingcap/tidb/pkg/util/stringutil"
+)
+
+// gb18030WeightData is the content of gb18030_weight.data embedded at build
+// time. gb18030ChineseCISortKey reads the weight of rune r from it as the
+// little-endian uint32 stored at byte offset 4*r.
+//
+//go:embed gb18030_weight.data
+var gb18030WeightData []byte
+
+const (
+	// Unicode code points up to U+10FFFF can be encoded as GB18030.
+	// Runes above this value are not looked up in gb18030WeightData.
+	gb18030MaxCodePoint = 0x10FFFF
+)
+
+// gb18030ChineseCICollator is a field-less Collator implementation whose rune
+// weights come from gb18030ChineseCISortKey.
+type gb18030ChineseCICollator struct {
+}
+
+// Clone implements Collator interface.
+// The collator carries no fields, so a freshly allocated value is returned.
+func (*gb18030ChineseCICollator) Clone() Collator {
+	return &gb18030ChineseCICollator{}
+}
+
+// Compare implements Collator interface.
+// It delegates to compareCommon with gb18030ChineseCISortKey as the per-rune
+// weight function.
+func (*gb18030ChineseCICollator) Compare(a, b string) int {
+	result := compareCommon(a, b, gb18030ChineseCISortKey)
+	return result
+}
+
+// Key implements Collator interface.
+// str is passed through truncateTailingSpace before the key is built.
+func (g *gb18030ChineseCICollator) Key(str string) []byte {
+	trimmed := truncateTailingSpace(str)
+	return g.KeyWithoutTrimRightSpace(trimmed)
+}
+
+// ImmutableKey implements Collator interface.
+// It produces the same bytes as Key: str is passed through
+// truncateTailingSpace and then handed to KeyWithoutTrimRightSpace.
+func (g *gb18030ChineseCICollator) ImmutableKey(str string) []byte {
+	trimmed := truncateTailingSpace(str)
+	return g.KeyWithoutTrimRightSpace(trimmed)
+}
+
+// KeyWithoutTrimRightSpace implements Collator interface.
+//
+// Each rune of str is mapped to its weight by gb18030ChineseCISortKey and the
+// weight is appended to the key in big-endian order with leading zero bytes
+// dropped, so one rune contributes between one and four bytes. Encoding stops
+// at the first invalid UTF-8 byte sequence and the key built so far is
+// returned.
+func (*gb18030ChineseCICollator) KeyWithoutTrimRightSpace(str string) []byte {
+	key := make([]byte, 0, len(str)*2)
+	for pos := 0; pos < len(str); {
+		// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
+		// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
+		// Checking the size together with the rune keeps a valid '�' from being treated as invalid.
+		r, size := utf8.DecodeRuneInString(str[pos:])
+		if r == utf8.RuneError && size == 1 {
+			return key
+		}
+		pos += size
+
+		weight := gb18030ChineseCISortKey(r)
+		switch {
+		case weight > 0xFFFFFF:
+			key = append(key, byte(weight>>24), byte(weight>>16), byte(weight>>8), byte(weight))
+		case weight > 0xFFFF:
+			key = append(key, byte(weight>>16), byte(weight>>8), byte(weight))
+		case weight > 0xFF:
+			key = append(key, byte(weight>>8), byte(weight))
+		default:
+			key = append(key, byte(weight))
+		}
+	}
+	return key
+}
+
+// Pattern implements Collator interface.
+// It returns a new, not yet compiled gb18030ChineseCIPattern.
+func (*gb18030ChineseCICollator) Pattern() WildcardPattern {
+	return new(gb18030ChineseCIPattern)
+}
+
+// gb18030ChineseCIPattern is the WildcardPattern returned by
+// gb18030ChineseCICollator.Pattern.
+type gb18030ChineseCIPattern struct {
+	// patChars and patTypes hold the two results of
+	// stringutil.CompilePatternInner; Compile stores them and DoMatch passes
+	// them to stringutil.DoMatchCustomized.
+	patChars []rune
+	patTypes []byte
+}
+
+// Compile implements WildcardPattern interface.
+// It stores the result of stringutil.CompilePatternInner on the receiver for
+// later use by DoMatch.
+func (p *gb18030ChineseCIPattern) Compile(patternStr string, escape byte) {
+	chars, types := stringutil.CompilePatternInner(patternStr, escape)
+	p.patChars = chars
+	p.patTypes = types
+}
+
+// DoMatch implements WildcardPattern interface.
+// Two runes are treated as equal when gb18030ChineseCISortKey gives them the
+// same weight.
+func (p *gb18030ChineseCIPattern) DoMatch(str string) bool {
+	sameWeight := func(x, y rune) bool {
+		return gb18030ChineseCISortKey(x) == gb18030ChineseCISortKey(y)
+	}
+	return stringutil.DoMatchCustomized(str, p.patChars, p.patTypes, sameWeight)
+}
+
+// gb18030ChineseCISortKey returns the weight of r. For runes up to
+// gb18030MaxCodePoint the weight is the little-endian uint32 stored at byte
+// offset 4*r in gb18030WeightData; any larger rune gets the fixed weight 0x3F.
+func gb18030ChineseCISortKey(r rune) uint32 {
+	if r <= gb18030MaxCodePoint {
+		offset := 4 * r
+		return binary.LittleEndian.Uint32(gb18030WeightData[offset : offset+4])
+	}
+	return 0x3F
+}
diff --git a/pkg/util/collate/gbk_chinese_ci.go b/pkg/util/collate/gbk_chinese_ci.go
@@ -14,13 +14,22 @@
 
 package collate
 
+<<<<<<< HEAD
 import "github.com/pingcap/tidb/pkg/util/stringutil"
+=======
+import (
+	"unicode/utf8"
+
+	"github.com/pingcap/tidb/pkg/util/stringutil"
+)
+>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
 
 type gbkChineseCICollator struct {
 }
 
 // Compare implements Collator interface.
 func (*gbkChineseCICollator) Compare(a, b string) int {
+<<<<<<< HEAD
 	a = truncateTailingSpace(a)
 	b = truncateTailingSpace(b)
 
@@ -36,6 +45,9 @@ func (*gbkChineseCICollator) Compare(a, b string) int {
 		}
 	}
 	return sign((len(a) - ai) - (len(b) - bi))
+=======
+	return compareCommon(a, b, gbkChineseCISortKey)
+>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
 }
 
 // Key implements Collator interface.
@@ -49,7 +61,20 @@ func (*gbkChineseCICollator) KeyWithoutTrimRightSpace(str string) []byte {
 	i := 0
 	r := rune(0)
 	for i < len(str) {
+<<<<<<< HEAD
 		r, i = decodeRune(str, i)
+=======
+		// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
+		// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
+		// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
+		r, rLen = utf8.DecodeRuneInString(str[i:])
+		invalid := r == utf8.RuneError && rLen == 1
+		if invalid {
+			return buf
+		}
+
+		i = i + rLen
+>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
 		u16 := gbkChineseCISortKey(r)
 		if u16 > 0xFF {
 			buf = append(buf, byte(u16>>8))
@@ -81,10 +106,10 @@ func (p *gbkChineseCIPattern) DoMatch(str string) bool {
 	})
 }
 
-func gbkChineseCISortKey(r rune) uint16 {
+func gbkChineseCISortKey(r rune) uint32 {
 	if r > 0xFFFF {
 		return 0x3F
 	}
 
-	return gbkChineseCISortKeyTable[r]
+	return uint32(gbkChineseCISortKeyTable[r])
 }
diff --git a/pkg/util/collate/general_ci.go b/pkg/util/collate/general_ci.go
@@ -15,6 +15,11 @@
 package collate
 
 import (
+<<<<<<< HEAD
+=======
+	"unicode/utf8"
+
+>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
 	"github.com/pingcap/tidb/pkg/util/stringutil"
 )
 
@@ -23,6 +28,7 @@ type generalCICollator struct {
 
 // Compare implements Collator interface.
 func (*generalCICollator) Compare(a, b string) int {
+<<<<<<< HEAD
 	a = truncateTailingSpace(a)
 	b = truncateTailingSpace(b)
 	r1, r2 := rune(0), rune(0)
@@ -37,6 +43,9 @@ func (*generalCICollator) Compare(a, b string) int {
 		}
 	}
 	return sign((len(a) - ai) - (len(b) - bi))
+=======
+	return compareCommon(a, b, convertRuneGeneralCI)
+>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
 }
 
 // Key implements Collator interface.
@@ -50,7 +59,20 @@ func (*generalCICollator) KeyWithoutTrimRightSpace(str string) []byte {
 	i := 0
 	r := rune(0)
 	for i < len(str) {
+<<<<<<< HEAD
 		r, i = decodeRune(str, i)
+=======
+		// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
+		// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
+		// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
+		r, rLen = utf8.DecodeRuneInString(str[i:])
+		invalid := r == utf8.RuneError && rLen == 1
+		if invalid {
+			return buf
+		}
+
+		i = i + rLen
+>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
 		u16 := convertRuneGeneralCI(r)
 		buf = append(buf, byte(u16>>8), byte(u16))
 	}
@@ -79,15 +101,15 @@ func (p *ciPattern) DoMatch(str string) bool {
 	})
 }
 
-func convertRuneGeneralCI(r rune) uint16 {
+func convertRuneGeneralCI(r rune) uint32 {
 	if r > 0xFFFF {
 		return 0xFFFD
 	}
 	plane := planeTable[r>>8]
 	if plane == nil {
-		return uint16(r)
+		return uint32(r)
 	}
-	return plane[r&0xFF]
+	return uint32(plane[r&0xFF])
 }
 
 var (

diff --git a/pkg/util/collate/ucaimpl/unicode_ci.go.tpl b/pkg/util/collate/ucaimpl/unicode_ci.go.tpl
@@ -18,6 +18,11 @@
 
 package collate
 
+<<<<<<< HEAD
+=======
+import "unicode/utf8"
+
+>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
 // {{.Name}} implements UCA. see http://unicode.org/reports/tr10/
 type {{.Name}} struct {
 	impl {{.ImplName}}
@@ -39,7 +44,19 @@ func (uc *{{.Name}}) Compare(a, b string) int {
 		if an == 0 {
 			if as == 0 {
 				for an == 0 && ai < len(a) {
+<<<<<<< HEAD
 					ar, ai = decodeRune(a, ai)
+=======
+					// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
+					// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
+					// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
+					ar, arLen = utf8.DecodeRuneInString(a[ai:])
+					invalid := ar == utf8.RuneError && arLen == 1
+					if invalid {
+						return 0
+					}
+					ai = ai + arLen
+>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
 					an, as = uc.impl.GetWeight(ar)
 				}
 			} else {
@@ -51,7 +68,19 @@ func (uc *{{.Name}}) Compare(a, b string) int {
 		if bn == 0 {
 			if bs == 0 {
 				for bn == 0 && bi < len(b) {
+<<<<<<< HEAD
 					br, bi = decodeRune(b, bi)
+=======
+					// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
+					// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
+					// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
+					br, brLen = utf8.DecodeRuneInString(b[bi:])
+					invalid := br == utf8.RuneError && brLen == 1
+					if invalid {
+						return 0
+					}
+					bi = bi + brLen
+>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
 					bn, bs = uc.impl.GetWeight(br)
 				}
 			} else {
@@ -92,8 +121,18 @@ func (uc *{{.Name}}) KeyWithoutTrimRightSpace(str string) []byte {
 	sn, ss := uint64(0), uint64(0) // weight of str. weight in unicode_ci may has 8 uint16s. sn indicate first 4 u16s, ss indicate last 4 u16s
 
 	for si < len(str) {
+<<<<<<< HEAD
 		r, si = decodeRune(str, si)
 
+=======
+		r, rLen = utf8.DecodeRuneInString(str[si:])
+		invalid := r == utf8.RuneError && rLen == 1
+		if invalid {
+			return buf
+		}
+
+		si = si + rLen
+>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
 		sn, ss = uc.impl.GetWeight(r)
 
 		for sn != 0 {