Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
This is an automated cherry-pick of #64165
Signed-off-by: ti-chi-bot <ti-community-prow-bot@tidb.io>
  • Loading branch information
bb7133 authored and ti-chi-bot committed Oct 31, 2025
commit b725930542a6b0a9d403ee253476fbeda13a0b36
46 changes: 46 additions & 0 deletions pkg/util/collate/collate.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"fmt"
"slices"
"sync/atomic"
"unicode/utf8"

"github.com/pingcap/errors"
"github.com/pingcap/tidb/pkg/parser/charset"
Expand Down Expand Up @@ -384,6 +385,51 @@ func CollationToProto(c string) int32 {
return v
}

<<<<<<< HEAD
=======
// compareCommon is the shared rune-by-rune comparison used by the weight-table
// based collators. Both inputs are first passed through truncateTailingSpace,
// then decoded one rune at a time; keyFunc maps each rune to its sort weight and
// the first pair of differing weights decides the result.
//
// If either side reaches a byte sequence that is not valid UTF-8 before a
// difference is found, the function returns 0. When one string is exhausted
// without a difference, the string with bytes still left over is the greater one.
func compareCommon(a, b string, keyFunc func(rune) uint32) int {
	lhs, rhs := truncateTailingSpace(a), truncateTailingSpace(b)

	li, ri := 0, 0
	for li < len(lhs) && ri < len(rhs) {
		lr, lSize := utf8.DecodeRuneInString(lhs[li:])
		rr, rSize := utf8.DecodeRuneInString(rhs[ri:])

		// For a malformed sequence the decoder reports (utf8.RuneError, 1), whereas a
		// genuine U+FFFD in the input decodes with its real, multi-byte width.
		// Checking the width as well keeps a legitimate U+FFFD comparable.
		// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
		lBad := lSize == 1 && lr == utf8.RuneError
		rBad := rSize == 1 && rr == utf8.RuneError
		if lBad || rBad {
			return 0
		}

		li += lSize
		ri += rSize

		if lk, rk := keyFunc(lr), keyFunc(rr); lk != rk {
			return cmp.Compare(lk, rk)
		}
	}

	// Every compared rune had equal weight: order by the number of bytes left.
	return cmp.Compare(len(lhs)-li, len(rhs)-ri)
}

// CanUseRawMemAsKey reports whether the given collator can use the original raw
// memory as the key. It is true only when c is a *binCollator or a
// *derivedBinCollator; every other collator yields false.
func CanUseRawMemAsKey(c Collator) bool {
	switch c.(type) {
	case *binCollator, *derivedBinCollator:
		return true
	default:
		return false
	}
}

>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
// ProtoToCollation converts collation from int32(used by protocol) to string.
func ProtoToCollation(c int32) string {
coll, err := charset.GetCollationByID(int(RestoreCollationIDIfNeeded(c)))
Expand Down
115 changes: 115 additions & 0 deletions pkg/util/collate/gb18030_chinese_ci.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// Copyright 2024 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collate

import (
_ "embed"
"encoding/binary"
"unicode/utf8"

"github.com/pingcap/tidb/pkg/util/stringutil"
)

//go:embed gb18030_weight.data
var gb18030WeightData []byte

// gb18030MaxCodePoint is the largest code point handled by the weight lookup:
// Unicode code points up to U+10FFFF can be encoded as GB18030.
const gb18030MaxCodePoint = 0x10FFFF

// gb18030ChineseCICollator is the collator type for gb18030_chinese_ci.
// It carries no fields; all of its behavior lives in its methods.
type gb18030ChineseCICollator struct{}

// Clone implements Collator interface.
// It returns a pointer to a freshly allocated gb18030ChineseCICollator.
func (*gb18030ChineseCICollator) Clone() Collator {
	return &gb18030ChineseCICollator{}
}

// Compare implements Collator interface.
// The work is delegated to compareCommon, with gb18030ChineseCISortKey supplying
// the weight of each rune.
func (*gb18030ChineseCICollator) Compare(a, b string) int {
	weightOf := gb18030ChineseCISortKey
	return compareCommon(a, b, weightOf)
}

// Key implements Collator interface.
// The input is passed through truncateTailingSpace before the sort key is built
// by KeyWithoutTrimRightSpace.
func (g *gb18030ChineseCICollator) Key(str string) []byte {
	trimmed := truncateTailingSpace(str)
	return g.KeyWithoutTrimRightSpace(trimmed)
}

// ImmutableKey implements Collator interface.
// For this collator it is computed exactly like Key: the input goes through
// truncateTailingSpace and then KeyWithoutTrimRightSpace.
func (g *gb18030ChineseCICollator) ImmutableKey(str string) []byte {
	key := g.KeyWithoutTrimRightSpace(truncateTailingSpace(str))
	return key
}

// KeyWithoutTrimRightSpace implements Collator interface.
// It walks str rune by rune, looks up each rune's weight with
// gb18030ChineseCISortKey and appends that weight to the key most-significant
// byte first, leaving out leading zero bytes (so a weight occupies 1 to 4 bytes).
//
// Encoding stops at the first byte sequence that is not valid UTF-8, and the key
// built up to that point is returned.
func (*gb18030ChineseCICollator) KeyWithoutTrimRightSpace(str string) []byte {
	key := make([]byte, 0, len(str)*2)
	for pos := 0; pos < len(str); {
		// A malformed sequence decodes as (utf8.RuneError, 1); a real U+FFFD in the
		// input decodes with its full width, so the width tells the two apart.
		// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
		r, size := utf8.DecodeRuneInString(str[pos:])
		if size == 1 && r == utf8.RuneError {
			return key
		}
		pos += size

		weight := gb18030ChineseCISortKey(r)
		// Emit the three upper bytes only from the first one that contributes to
		// the value; the lowest byte is always written.
		for shift := uint(24); shift > 0; shift -= 8 {
			if weight>>shift != 0 {
				key = append(key, byte(weight>>shift))
			}
		}
		key = append(key, byte(weight))
	}
	return key
}

// Pattern implements Collator interface.
// It hands back a new, not yet compiled gb18030ChineseCIPattern.
func (*gb18030ChineseCICollator) Pattern() WildcardPattern {
	return new(gb18030ChineseCIPattern)
}

// gb18030ChineseCIPattern is the WildcardPattern returned by
// gb18030ChineseCICollator.Pattern. Both fields are filled in by Compile and
// read by DoMatch.
type gb18030ChineseCIPattern struct {
// patChars is the first result of stringutil.CompilePatternInner
// (presumably the pattern's runes — confirm against stringutil).
patChars []rune
// patTypes is the second result of stringutil.CompilePatternInner
// (presumably one type tag per entry of patChars — confirm against stringutil).
patTypes []byte
}

// Compile implements WildcardPattern interface.
// It runs stringutil.CompilePatternInner on patternStr with the given escape
// byte and stores both results on the receiver for later use by DoMatch.
func (p *gb18030ChineseCIPattern) Compile(patternStr string, escape byte) {
	chars, types := stringutil.CompilePatternInner(patternStr, escape)
	p.patChars = chars
	p.patTypes = types
}

// DoMatch implements WildcardPattern interface.
// It matches str against the compiled pattern through
// stringutil.DoMatchCustomized, treating two runes as equal when
// gb18030ChineseCISortKey gives them the same weight.
func (p *gb18030ChineseCIPattern) DoMatch(str string) bool {
	sameWeight := func(a, b rune) bool {
		return gb18030ChineseCISortKey(a) == gb18030ChineseCISortKey(b)
	}
	return stringutil.DoMatchCustomized(str, p.patChars, p.patTypes, sameWeight)
}

// gb18030ChineseCISortKey returns the sort weight of r for gb18030_chinese_ci.
// Weights are read from gb18030WeightData, which is indexed by code point with
// one 4-byte little-endian entry per rune. Runes above gb18030MaxCodePoint all
// map to 0x3F (presumably the weight of '?' — confirm against the weight data).
func gb18030ChineseCISortKey(r rune) uint32 {
	if r <= gb18030MaxCodePoint {
		offset := 4 * r
		return binary.LittleEndian.Uint32(gb18030WeightData[offset : offset+4])
	}
	return 0x3F
}
29 changes: 27 additions & 2 deletions pkg/util/collate/gbk_chinese_ci.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,22 @@

package collate

<<<<<<< HEAD
import "github.com/pingcap/tidb/pkg/util/stringutil"
=======
import (
"unicode/utf8"

"github.com/pingcap/tidb/pkg/util/stringutil"
)
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))

type gbkChineseCICollator struct {
}

// Compare implements Collator interface.
func (*gbkChineseCICollator) Compare(a, b string) int {
<<<<<<< HEAD
a = truncateTailingSpace(a)
b = truncateTailingSpace(b)

Expand All @@ -36,6 +45,9 @@ func (*gbkChineseCICollator) Compare(a, b string) int {
}
}
return sign((len(a) - ai) - (len(b) - bi))
=======
return compareCommon(a, b, gbkChineseCISortKey)
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
}

// Key implements Collator interface.
Expand All @@ -49,7 +61,20 @@ func (*gbkChineseCICollator) KeyWithoutTrimRightSpace(str string) []byte {
i := 0
r := rune(0)
for i < len(str) {
<<<<<<< HEAD
r, i = decodeRune(str, i)
=======
// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
r, rLen = utf8.DecodeRuneInString(str[i:])
invalid := r == utf8.RuneError && rLen == 1
if invalid {
return buf
}

i = i + rLen
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
u16 := gbkChineseCISortKey(r)
if u16 > 0xFF {
buf = append(buf, byte(u16>>8))
Expand Down Expand Up @@ -81,10 +106,10 @@ func (p *gbkChineseCIPattern) DoMatch(str string) bool {
})
}

func gbkChineseCISortKey(r rune) uint16 {
func gbkChineseCISortKey(r rune) uint32 {
if r > 0xFFFF {
return 0x3F
}

return gbkChineseCISortKeyTable[r]
return uint32(gbkChineseCISortKeyTable[r])
}
28 changes: 25 additions & 3 deletions pkg/util/collate/general_ci.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
package collate

import (
<<<<<<< HEAD
=======
"unicode/utf8"

>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
"github.com/pingcap/tidb/pkg/util/stringutil"
)

Expand All @@ -23,6 +28,7 @@ type generalCICollator struct {

// Compare implements Collator interface.
func (*generalCICollator) Compare(a, b string) int {
<<<<<<< HEAD
a = truncateTailingSpace(a)
b = truncateTailingSpace(b)
r1, r2 := rune(0), rune(0)
Expand All @@ -37,6 +43,9 @@ func (*generalCICollator) Compare(a, b string) int {
}
}
return sign((len(a) - ai) - (len(b) - bi))
=======
return compareCommon(a, b, convertRuneGeneralCI)
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
}

// Key implements Collator interface.
Expand All @@ -50,7 +59,20 @@ func (*generalCICollator) KeyWithoutTrimRightSpace(str string) []byte {
i := 0
r := rune(0)
for i < len(str) {
<<<<<<< HEAD
r, i = decodeRune(str, i)
=======
// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
r, rLen = utf8.DecodeRuneInString(str[i:])
invalid := r == utf8.RuneError && rLen == 1
if invalid {
return buf
}

i = i + rLen
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
u16 := convertRuneGeneralCI(r)
buf = append(buf, byte(u16>>8), byte(u16))
}
Expand Down Expand Up @@ -79,15 +101,15 @@ func (p *ciPattern) DoMatch(str string) bool {
})
}

func convertRuneGeneralCI(r rune) uint16 {
func convertRuneGeneralCI(r rune) uint32 {
if r > 0xFFFF {
return 0xFFFD
}
plane := planeTable[r>>8]
if plane == nil {
return uint16(r)
return uint32(r)
}
return plane[r&0xFF]
return uint32(plane[r&0xFF])
}

var (
Expand Down
39 changes: 39 additions & 0 deletions pkg/util/collate/ucaimpl/unicode_ci.go.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@

package collate

<<<<<<< HEAD
=======
import "unicode/utf8"

>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
// {{.Name}} implements UCA. see http://unicode.org/reports/tr10/
type {{.Name}} struct {
impl {{.ImplName}}
Expand All @@ -39,7 +44,19 @@ func (uc *{{.Name}}) Compare(a, b string) int {
if an == 0 {
if as == 0 {
for an == 0 && ai < len(a) {
<<<<<<< HEAD
ar, ai = decodeRune(a, ai)
=======
// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
ar, arLen = utf8.DecodeRuneInString(a[ai:])
invalid := ar == utf8.RuneError && arLen == 1
if invalid {
return 0
}
ai = ai + arLen
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
an, as = uc.impl.GetWeight(ar)
}
} else {
Expand All @@ -51,7 +68,19 @@ func (uc *{{.Name}}) Compare(a, b string) int {
if bn == 0 {
if bs == 0 {
for bn == 0 && bi < len(b) {
<<<<<<< HEAD
br, bi = decodeRune(b, bi)
=======
// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
br, brLen = utf8.DecodeRuneInString(b[bi:])
invalid := br == utf8.RuneError && brLen == 1
if invalid {
return 0
}
bi = bi + brLen
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
bn, bs = uc.impl.GetWeight(br)
}
} else {
Expand Down Expand Up @@ -92,8 +121,18 @@ func (uc *{{.Name}}) KeyWithoutTrimRightSpace(str string) []byte {
sn, ss := uint64(0), uint64(0) // weight of str. weight in unicode_ci may has 8 uint16s. sn indicate first 4 u16s, ss indicate last 4 u16s

for si < len(str) {
<<<<<<< HEAD
r, si = decodeRune(str, si)

=======
r, rLen = utf8.DecodeRuneInString(str[si:])
invalid := r == utf8.RuneError && rLen == 1
if invalid {
return buf
}

si = si + rLen
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
sn, ss = uc.impl.GetWeight(r)

for sn != 0 {
Expand Down
Loading