Skip to content

Commit b725930

Browse files
bb7133ti-chi-bot
authored andcommitted
This is an automated cherry-pick of #64165
Signed-off-by: ti-chi-bot <ti-community-prow-bot@tidb.io>
1 parent e84f8af commit b725930

File tree

9 files changed

+459
-5
lines changed

9 files changed

+459
-5
lines changed

‎pkg/util/collate/collate.go‎

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"fmt"
2020
"slices"
2121
"sync/atomic"
22+
"unicode/utf8"
2223

2324
"github.com/pingcap/errors"
2425
"github.com/pingcap/tidb/pkg/parser/charset"
@@ -384,6 +385,51 @@ func CollationToProto(c string) int32 {
384385
return v
385386
}
386387

388+
<<<<<<< HEAD
389+
=======
390+
func compareCommon(a, b string, keyFunc func(rune) uint32) int {
391+
a = truncateTailingSpace(a)
392+
b = truncateTailingSpace(b)
393+
394+
r1, r2 := rune(0), rune(0)
395+
ai, bi := 0, 0
396+
r1Len, r2Len := 0, 0
397+
for ai < len(a) && bi < len(b) {
398+
r1, r1Len = utf8.DecodeRuneInString(a[ai:])
399+
r2, r2Len = utf8.DecodeRuneInString(b[bi:])
400+
// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
401+
// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
402+
// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
403+
invalid1 := r1 == utf8.RuneError && r1Len == 1
404+
invalid2 := r2 == utf8.RuneError && r2Len == 1
405+
if invalid1 || invalid2 {
406+
return 0
407+
}
408+
409+
ai += r1Len
410+
bi += r2Len
411+
412+
cmp := cmp.Compare(keyFunc(r1), keyFunc(r2))
413+
if cmp != 0 {
414+
return cmp
415+
}
416+
}
417+
return cmp.Compare(len(a)-ai, len(b)-bi)
418+
}
419+
420+
// CanUseRawMemAsKey returns true if current collator can use the original raw memory as the key
421+
// only return true for binCollator and derivedBinCollator
422+
func CanUseRawMemAsKey(c Collator) bool {
423+
if _, ok := c.(*binCollator); ok {
424+
return true
425+
}
426+
if _, ok := c.(*derivedBinCollator); ok {
427+
return true
428+
}
429+
return false
430+
}
431+
432+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
387433
// ProtoToCollation converts collation from int32(used by protocol) to string.
388434
func ProtoToCollation(c int32) string {
389435
coll, err := charset.GetCollationByID(int(RestoreCollationIDIfNeeded(c)))
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
// Copyright 2024 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package collate
16+
17+
import (
18+
_ "embed"
19+
"encoding/binary"
20+
"unicode/utf8"
21+
22+
"github.com/pingcap/tidb/pkg/util/stringutil"
23+
)
24+
25+
//go:embed gb18030_weight.data
26+
var gb18030WeightData []byte
27+
28+
const (
29+
// Unicode code points up to U+10FFFF can be encoded as GB18030.
30+
gb18030MaxCodePoint = 0x10FFFF
31+
)
32+
33+
// gb18030ChineseCICollator is a stateless collator type; its methods rely on
// gb18030ChineseCISortKey for per-rune weights.
type gb18030ChineseCICollator struct{}
35+
36+
// Clone implements Collator interface.
37+
func (*gb18030ChineseCICollator) Clone() Collator {
38+
return new(gb18030ChineseCICollator)
39+
}
40+
41+
// Compare implements Collator interface.
42+
func (*gb18030ChineseCICollator) Compare(a, b string) int {
43+
return compareCommon(a, b, gb18030ChineseCISortKey)
44+
}
45+
46+
// Key implements Collator interface.
47+
func (g *gb18030ChineseCICollator) Key(str string) []byte {
48+
return g.KeyWithoutTrimRightSpace(truncateTailingSpace(str))
49+
}
50+
51+
// ImmutableKey implement Collator interface.
52+
func (g *gb18030ChineseCICollator) ImmutableKey(str string) []byte {
53+
return g.KeyWithoutTrimRightSpace(truncateTailingSpace(str))
54+
}
55+
56+
// KeyWithoutTrimRightSpace implement Collator interface.
57+
func (*gb18030ChineseCICollator) KeyWithoutTrimRightSpace(str string) []byte {
58+
buf := make([]byte, 0, len(str)*2)
59+
i, rLen := 0, 0
60+
r := rune(0)
61+
for i < len(str) {
62+
// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
63+
// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
64+
// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
65+
r, rLen = utf8.DecodeRuneInString(str[i:])
66+
invalid := r == utf8.RuneError && rLen == 1
67+
if invalid {
68+
return buf
69+
}
70+
71+
i = i + rLen
72+
u32 := gb18030ChineseCISortKey(r)
73+
if u32 > 0xFFFFFF {
74+
buf = append(buf, byte(u32>>24))
75+
}
76+
if u32 > 0xFFFF {
77+
buf = append(buf, byte(u32>>16))
78+
}
79+
if u32 > 0xFF {
80+
buf = append(buf, byte(u32>>8))
81+
}
82+
buf = append(buf, byte(u32))
83+
}
84+
return buf
85+
}
86+
87+
// Pattern implements Collator interface.
88+
func (*gb18030ChineseCICollator) Pattern() WildcardPattern {
89+
return &gb18030ChineseCIPattern{}
90+
}
91+
92+
// gb18030ChineseCIPattern stores the pattern produced by Compile so that
// DoMatch can match strings against it.
type gb18030ChineseCIPattern struct {
	// patChars is the first result of stringutil.CompilePatternInner.
	patChars []rune
	// patTypes is the second result of stringutil.CompilePatternInner.
	patTypes []byte
}
96+
97+
// Compile implements WildcardPattern interface.
98+
func (p *gb18030ChineseCIPattern) Compile(patternStr string, escape byte) {
99+
p.patChars, p.patTypes = stringutil.CompilePatternInner(patternStr, escape)
100+
}
101+
102+
// DoMatch implements WildcardPattern interface.
103+
func (p *gb18030ChineseCIPattern) DoMatch(str string) bool {
104+
return stringutil.DoMatchCustomized(str, p.patChars, p.patTypes, func(a, b rune) bool {
105+
return gb18030ChineseCISortKey(a) == gb18030ChineseCISortKey(b)
106+
})
107+
}
108+
109+
func gb18030ChineseCISortKey(r rune) uint32 {
110+
if r > gb18030MaxCodePoint {
111+
return 0x3F
112+
}
113+
114+
return binary.LittleEndian.Uint32(gb18030WeightData[4*r : 4*r+4])
115+
}

‎pkg/util/collate/gbk_chinese_ci.go‎

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,22 @@
1414

1515
package collate
1616

17+
<<<<<<< HEAD
1718
import "github.com/pingcap/tidb/pkg/util/stringutil"
19+
=======
20+
import (
21+
"unicode/utf8"
22+
23+
"github.com/pingcap/tidb/pkg/util/stringutil"
24+
)
25+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
1826

1927
type gbkChineseCICollator struct {
2028
}
2129

2230
// Compare implements Collator interface.
2331
func (*gbkChineseCICollator) Compare(a, b string) int {
32+
<<<<<<< HEAD
2433
a = truncateTailingSpace(a)
2534
b = truncateTailingSpace(b)
2635

@@ -36,6 +45,9 @@ func (*gbkChineseCICollator) Compare(a, b string) int {
3645
}
3746
}
3847
return sign((len(a) - ai) - (len(b) - bi))
48+
=======
49+
return compareCommon(a, b, gbkChineseCISortKey)
50+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
3951
}
4052

4153
// Key implements Collator interface.
@@ -49,7 +61,20 @@ func (*gbkChineseCICollator) KeyWithoutTrimRightSpace(str string) []byte {
4961
i := 0
5062
r := rune(0)
5163
for i < len(str) {
64+
<<<<<<< HEAD
5265
r, i = decodeRune(str, i)
66+
=======
67+
// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
68+
// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
69+
// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
70+
r, rLen = utf8.DecodeRuneInString(str[i:])
71+
invalid := r == utf8.RuneError && rLen == 1
72+
if invalid {
73+
return buf
74+
}
75+
76+
i = i + rLen
77+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
5378
u16 := gbkChineseCISortKey(r)
5479
if u16 > 0xFF {
5580
buf = append(buf, byte(u16>>8))
@@ -81,10 +106,10 @@ func (p *gbkChineseCIPattern) DoMatch(str string) bool {
81106
})
82107
}
83108

84-
func gbkChineseCISortKey(r rune) uint16 {
109+
func gbkChineseCISortKey(r rune) uint32 {
85110
if r > 0xFFFF {
86111
return 0x3F
87112
}
88113

89-
return gbkChineseCISortKeyTable[r]
114+
return uint32(gbkChineseCISortKeyTable[r])
90115
}

‎pkg/util/collate/general_ci.go‎

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
package collate
1616

1717
import (
18+
<<<<<<< HEAD
19+
=======
20+
"unicode/utf8"
21+
22+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
1823
"github.com/pingcap/tidb/pkg/util/stringutil"
1924
)
2025

@@ -23,6 +28,7 @@ type generalCICollator struct {
2328

2429
// Compare implements Collator interface.
2530
func (*generalCICollator) Compare(a, b string) int {
31+
<<<<<<< HEAD
2632
a = truncateTailingSpace(a)
2733
b = truncateTailingSpace(b)
2834
r1, r2 := rune(0), rune(0)
@@ -37,6 +43,9 @@ func (*generalCICollator) Compare(a, b string) int {
3743
}
3844
}
3945
return sign((len(a) - ai) - (len(b) - bi))
46+
=======
47+
return compareCommon(a, b, convertRuneGeneralCI)
48+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
4049
}
4150

4251
// Key implements Collator interface.
@@ -50,7 +59,20 @@ func (*generalCICollator) KeyWithoutTrimRightSpace(str string) []byte {
5059
i := 0
5160
r := rune(0)
5261
for i < len(str) {
62+
<<<<<<< HEAD
5363
r, i = decodeRune(str, i)
64+
=======
65+
// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
66+
// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
67+
// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
68+
r, rLen = utf8.DecodeRuneInString(str[i:])
69+
invalid := r == utf8.RuneError && rLen == 1
70+
if invalid {
71+
return buf
72+
}
73+
74+
i = i + rLen
75+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
5476
u16 := convertRuneGeneralCI(r)
5577
buf = append(buf, byte(u16>>8), byte(u16))
5678
}
@@ -79,15 +101,15 @@ func (p *ciPattern) DoMatch(str string) bool {
79101
})
80102
}
81103

82-
func convertRuneGeneralCI(r rune) uint16 {
104+
func convertRuneGeneralCI(r rune) uint32 {
83105
if r > 0xFFFF {
84106
return 0xFFFD
85107
}
86108
plane := planeTable[r>>8]
87109
if plane == nil {
88-
return uint16(r)
110+
return uint32(r)
89111
}
90-
return plane[r&0xFF]
112+
return uint32(plane[r&0xFF])
91113
}
92114

93115
var (

‎pkg/util/collate/ucaimpl/unicode_ci.go.tpl‎

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@
1818

1919
package collate
2020

21+
<<<<<<< HEAD
22+
=======
23+
import "unicode/utf8"
24+
25+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
2126
// {{.Name}} implements UCA. see http://unicode.org/reports/tr10/
2227
type {{.Name}} struct {
2328
impl {{.ImplName}}
@@ -39,7 +44,19 @@ func (uc *{{.Name}}) Compare(a, b string) int {
3944
if an == 0 {
4045
if as == 0 {
4146
for an == 0 && ai < len(a) {
47+
<<<<<<< HEAD
4248
ar, ai = decodeRune(a, ai)
49+
=======
50+
// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
51+
// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
52+
// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
53+
ar, arLen = utf8.DecodeRuneInString(a[ai:])
54+
invalid := ar == utf8.RuneError && arLen == 1
55+
if invalid {
56+
return 0
57+
}
58+
ai = ai + arLen
59+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
4360
an, as = uc.impl.GetWeight(ar)
4461
}
4562
} else {
@@ -51,7 +68,19 @@ func (uc *{{.Name}}) Compare(a, b string) int {
5168
if bn == 0 {
5269
if bs == 0 {
5370
for bn == 0 && bi < len(b) {
71+
<<<<<<< HEAD
5472
br, bi = decodeRune(b, bi)
73+
=======
74+
// When the byte sequence is not a valid UTF-8 encoding of a rune, Golang returns RuneError('�') and size 1.
75+
// See https://pkg.go.dev/unicode/utf8#DecodeRune for more details.
76+
// Here we check both the size and rune to distinguish between invalid byte sequence and valid '�'.
77+
br, brLen = utf8.DecodeRuneInString(b[bi:])
78+
invalid := br == utf8.RuneError && brLen == 1
79+
if invalid {
80+
return 0
81+
}
82+
bi = bi + brLen
83+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
5584
bn, bs = uc.impl.GetWeight(br)
5685
}
5786
} else {
@@ -92,8 +121,18 @@ func (uc *{{.Name}}) KeyWithoutTrimRightSpace(str string) []byte {
92121
sn, ss := uint64(0), uint64(0) // weight of str. weight in unicode_ci may has 8 uint16s. sn indicate first 4 u16s, ss indicate last 4 u16s
93122
94123
for si < len(str) {
124+
<<<<<<< HEAD
95125
r, si = decodeRune(str, si)
96126
127+
=======
128+
r, rLen = utf8.DecodeRuneInString(str[si:])
129+
invalid := r == utf8.RuneError && rLen == 1
130+
if invalid {
131+
return buf
132+
}
133+
134+
si = si + rLen
135+
>>>>>>> fa0106d6c46 (util/collate: fix the issue that '�'(\uFFFD) was treated as invalid sequence (#64165))
97136
sn, ss = uc.impl.GetWeight(r)
98137

99138
for sn != 0 {

0 commit comments

Comments
 (0)