260
|
1 |
package toml |
|
2 |
|
|
3 |
import ( |
|
4 |
"unicode/utf8" |
|
5 |
) |
|
6 |
|
|
7 |
// utf8Err describes where an invalid or disallowed character was found in a
// byte slice. Its zero value means that nothing wrong was found.
type utf8Err struct {
	Index int // byte index of the offending character
	Size  int // size in bytes of the offending character; 0 means "no error"
}

// Zero reports whether u carries no error, i.e. whether its Size is 0.
// Index is deliberately not consulted: only Size marks an error.
func (u utf8Err) Zero() bool {
	return u.Size == 0
}
|
15 |
|
|
16 |
// Verified that a given string is only made of valid UTF-8 characters allowed |
|
17 |
// by the TOML spec: |
|
18 |
// |
|
19 |
// Any Unicode character may be used except those that must be escaped: |
|
20 |
// quotation mark, backslash, and the control characters other than tab (U+0000 |
|
21 |
// to U+0008, U+000A to U+001F, U+007F). |
|
22 |
// |
|
23 |
// It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early |
|
24 |
// when a character is not allowed. |
|
25 |
// |
|
26 |
// The returned utf8Err is Zero() if the string is valid, or contains the byte |
|
27 |
// index and size of the invalid character. |
|
28 |
// |
|
29 |
// quotation mark => already checked |
|
30 |
// backslash => already checked |
|
31 |
// 0-0x8 => invalid |
|
32 |
// 0x9 => tab, ok |
|
33 |
// 0xA - 0x1F => invalid |
|
34 |
// 0x7F => invalid |
|
35 |
func utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) { |
|
36 |
// Fast path. Check for and skip 8 bytes of ASCII characters per iteration. |
|
37 |
offset := 0 |
|
38 |
for len(p) >= 8 { |
|
39 |
// Combining two 32 bit loads allows the same code to be used |
|
40 |
// for 32 and 64 bit platforms. |
|
41 |
// The compiler can generate a 32bit load for first32 and second32 |
|
42 |
// on many platforms. See test/codegen/memcombine.go. |
|
43 |
first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 |
|
44 |
second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24 |
|
45 |
if (first32|second32)&0x80808080 != 0 { |
|
46 |
// Found a non ASCII byte (>= RuneSelf). |
|
47 |
break |
|
48 |
} |
|
49 |
|
|
50 |
for i, b := range p[:8] { |
|
51 |
if invalidAscii(b) { |
|
52 |
err.Index = offset + i |
|
53 |
err.Size = 1 |
|
54 |
return |
|
55 |
} |
|
56 |
} |
|
57 |
|
|
58 |
p = p[8:] |
|
59 |
offset += 8 |
|
60 |
} |
|
61 |
n := len(p) |
|
62 |
for i := 0; i < n; { |
|
63 |
pi := p[i] |
|
64 |
if pi < utf8.RuneSelf { |
|
65 |
if invalidAscii(pi) { |
|
66 |
err.Index = offset + i |
|
67 |
err.Size = 1 |
|
68 |
return |
|
69 |
} |
|
70 |
i++ |
|
71 |
continue |
|
72 |
} |
|
73 |
x := first[pi] |
|
74 |
if x == xx { |
|
75 |
// Illegal starter byte. |
|
76 |
err.Index = offset + i |
|
77 |
err.Size = 1 |
|
78 |
return |
|
79 |
} |
|
80 |
size := int(x & 7) |
|
81 |
if i+size > n { |
|
82 |
// Short or invalid. |
|
83 |
err.Index = offset + i |
|
84 |
err.Size = n - i |
|
85 |
return |
|
86 |
} |
|
87 |
accept := acceptRanges[x>>4] |
|
88 |
if c := p[i+1]; c < accept.lo || accept.hi < c { |
|
89 |
err.Index = offset + i |
|
90 |
err.Size = 2 |
|
91 |
return |
|
92 |
} else if size == 2 { |
|
93 |
} else if c := p[i+2]; c < locb || hicb < c { |
|
94 |
err.Index = offset + i |
|
95 |
err.Size = 3 |
|
96 |
return |
|
97 |
} else if size == 3 { |
|
98 |
} else if c := p[i+3]; c < locb || hicb < c { |
|
99 |
err.Index = offset + i |
|
100 |
err.Size = 4 |
|
101 |
return |
|
102 |
} |
|
103 |
i += size |
|
104 |
} |
|
105 |
return |
|
106 |
} |
|
107 |
|
|
108 |
// Return the size of the next rune if valid, 0 otherwise. |
|
109 |
func utf8ValidNext(p []byte) int { |
|
110 |
c := p[0] |
|
111 |
|
|
112 |
if c < utf8.RuneSelf { |
|
113 |
if invalidAscii(c) { |
|
114 |
return 0 |
|
115 |
} |
|
116 |
return 1 |
|
117 |
} |
|
118 |
|
|
119 |
x := first[c] |
|
120 |
if x == xx { |
|
121 |
// Illegal starter byte. |
|
122 |
return 0 |
|
123 |
} |
|
124 |
size := int(x & 7) |
|
125 |
if size > len(p) { |
|
126 |
// Short or invalid. |
|
127 |
return 0 |
|
128 |
} |
|
129 |
accept := acceptRanges[x>>4] |
|
130 |
if c := p[1]; c < accept.lo || accept.hi < c { |
|
131 |
return 0 |
|
132 |
} else if size == 2 { |
|
133 |
} else if c := p[2]; c < locb || hicb < c { |
|
134 |
return 0 |
|
135 |
} else if size == 3 { |
|
136 |
} else if c := p[3]; c < locb || hicb < c { |
|
137 |
return 0 |
|
138 |
} |
|
139 |
|
|
140 |
return size |
|
141 |
} |
|
142 |
|
|
143 |
// invalidAsciiTable is indexed by byte value. An entry is true when the byte
// is an ASCII control character that is rejected: 0x00-0x08, 0x0B, 0x0C,
// 0x0E-0x1F and 0x7F. Tab, LF, CR, printable ASCII and every byte >= 0x80 are
// left false. The array has 256 entries so that any byte value can index it.
var invalidAsciiTable = [256]bool{
	0x00: true,
	0x01: true,
	0x02: true,
	0x03: true,
	0x04: true,
	0x05: true,
	0x06: true,
	0x07: true,
	0x08: true,
	// 0x09 TAB
	// 0x0A LF
	0x0B: true,
	0x0C: true,
	// 0x0D CR
	0x0E: true,
	0x0F: true,
	0x10: true,
	0x11: true,
	0x12: true,
	0x13: true,
	0x14: true,
	0x15: true,
	0x16: true,
	0x17: true,
	0x18: true,
	0x19: true,
	0x1A: true,
	0x1B: true,
	0x1C: true,
	0x1D: true,
	0x1E: true,
	0x1F: true,
	// 0x20 - 0x7E Printable ASCII characters
	0x7F: true,
}
|
179 |
|
|
180 |
func invalidAscii(b byte) bool { |
|
181 |
return invalidAsciiTable[b] |
|
182 |
} |
|
183 |
|
|
184 |
// acceptRange gives the inclusive range of valid values for the second byte
// in a UTF-8 sequence.
type acceptRange struct {
	lo uint8 // lowest value for second byte.
	hi uint8 // highest value for second byte.
}
|
190 |
|
|
191 |
// acceptRanges has size 16 to avoid bounds checks in the code that uses it. |
|
192 |
var acceptRanges = [16]acceptRange{ |
|
193 |
0: {locb, hicb}, |
|
194 |
1: {0xA0, hicb}, |
|
195 |
2: {locb, 0x9F}, |
|
196 |
3: {0x90, hicb}, |
|
197 |
4: {locb, 0x8F}, |
|
198 |
} |
|
199 |
|
|
200 |
// first is information about the first byte in a UTF-8 sequence. |
|
201 |
var first = [256]uint8{ |
|
202 |
// 1 2 3 4 5 6 7 8 9 A B C D E F |
|
203 |
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F |
|
204 |
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F |
|
205 |
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F |
|
206 |
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F |
|
207 |
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F |
|
208 |
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F |
|
209 |
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F |
|
210 |
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F |
|
211 |
// 1 2 3 4 5 6 7 8 9 A B C D E F |
|
212 |
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F |
|
213 |
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F |
|
214 |
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF |
|
215 |
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF |
|
216 |
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF |
|
217 |
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF |
|
218 |
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF |
|
219 |
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF |
|
220 |
} |
|
221 |
|
|
222 |
const (
	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// first table. The first nibble is an index into acceptRanges or F for
	// special one-byte cases. The second nibble is the Rune length or the
	// Status for the special one-byte case.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)