|
1 // Copyright 2011 The Go Authors. All rights reserved. |
|
2 // Use of this source code is governed by a BSD-style |
|
3 // license that can be found in the LICENSE file. |
|
4 |
|
5 // Note: the file data_test.go that is generated should not be checked in. |
|
6 //go:generate go run maketables.go triegen.go |
|
7 //go:generate go test -tags test |
|
8 |
|
9 // Package norm contains types and functions for normalizing Unicode strings. |
|
10 package norm // import "golang.org/x/text/unicode/norm" |
|
11 |
|
12 import ( |
|
13 "unicode/utf8" |
|
14 |
|
15 "golang.org/x/text/transform" |
|
16 ) |
|
17 |
|
18 // A Form denotes a canonical representation of Unicode code points. |
|
19 // The Unicode-defined normalization and equivalence forms are: |
|
20 // |
|
21 // NFC Unicode Normalization Form C |
|
22 // NFD Unicode Normalization Form D |
|
23 // NFKC Unicode Normalization Form KC |
|
24 // NFKD Unicode Normalization Form KD |
|
25 // |
|
26 // For a Form f, this documentation uses the notation f(x) to mean |
|
27 // the bytes or string x converted to the given form. |
|
28 // A position n in x is called a boundary if conversion to the form can |
|
29 // proceed independently on both sides: |
|
30 // f(x) == append(f(x[0:n]), f(x[n:])...) |
|
31 // |
|
32 // References: http://unicode.org/reports/tr15/ and |
|
33 // http://unicode.org/notes/tn5/. |
|
34 type Form int |
|
35 |
|
36 const ( |
|
37 NFC Form = iota |
|
38 NFD |
|
39 NFKC |
|
40 NFKD |
|
41 ) |
|
42 |
|
43 // Bytes returns f(b). May return b if f(b) = b. |
|
44 func (f Form) Bytes(b []byte) []byte { |
|
45 src := inputBytes(b) |
|
46 ft := formTable[f] |
|
47 n, ok := ft.quickSpan(src, 0, len(b), true) |
|
48 if ok { |
|
49 return b |
|
50 } |
|
51 out := make([]byte, n, len(b)) |
|
52 copy(out, b[0:n]) |
|
53 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush} |
|
54 return doAppendInner(&rb, n) |
|
55 } |
|
56 |
|
57 // String returns f(s). |
|
58 func (f Form) String(s string) string { |
|
59 src := inputString(s) |
|
60 ft := formTable[f] |
|
61 n, ok := ft.quickSpan(src, 0, len(s), true) |
|
62 if ok { |
|
63 return s |
|
64 } |
|
65 out := make([]byte, n, len(s)) |
|
66 copy(out, s[0:n]) |
|
67 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush} |
|
68 return string(doAppendInner(&rb, n)) |
|
69 } |
|
70 |
|
71 // IsNormal returns true if b == f(b). |
|
72 func (f Form) IsNormal(b []byte) bool { |
|
73 src := inputBytes(b) |
|
74 ft := formTable[f] |
|
75 bp, ok := ft.quickSpan(src, 0, len(b), true) |
|
76 if ok { |
|
77 return true |
|
78 } |
|
79 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)} |
|
80 rb.setFlusher(nil, cmpNormalBytes) |
|
81 for bp < len(b) { |
|
82 rb.out = b[bp:] |
|
83 if bp = decomposeSegment(&rb, bp, true); bp < 0 { |
|
84 return false |
|
85 } |
|
86 bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true) |
|
87 } |
|
88 return true |
|
89 } |
|
90 |
|
91 func cmpNormalBytes(rb *reorderBuffer) bool { |
|
92 b := rb.out |
|
93 for i := 0; i < rb.nrune; i++ { |
|
94 info := rb.rune[i] |
|
95 if int(info.size) > len(b) { |
|
96 return false |
|
97 } |
|
98 p := info.pos |
|
99 pe := p + info.size |
|
100 for ; p < pe; p++ { |
|
101 if b[0] != rb.byte[p] { |
|
102 return false |
|
103 } |
|
104 b = b[1:] |
|
105 } |
|
106 } |
|
107 return true |
|
108 } |
|
109 |
|
110 // IsNormalString returns true if s == f(s). |
|
111 func (f Form) IsNormalString(s string) bool { |
|
112 src := inputString(s) |
|
113 ft := formTable[f] |
|
114 bp, ok := ft.quickSpan(src, 0, len(s), true) |
|
115 if ok { |
|
116 return true |
|
117 } |
|
118 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)} |
|
119 rb.setFlusher(nil, func(rb *reorderBuffer) bool { |
|
120 for i := 0; i < rb.nrune; i++ { |
|
121 info := rb.rune[i] |
|
122 if bp+int(info.size) > len(s) { |
|
123 return false |
|
124 } |
|
125 p := info.pos |
|
126 pe := p + info.size |
|
127 for ; p < pe; p++ { |
|
128 if s[bp] != rb.byte[p] { |
|
129 return false |
|
130 } |
|
131 bp++ |
|
132 } |
|
133 } |
|
134 return true |
|
135 }) |
|
136 for bp < len(s) { |
|
137 if bp = decomposeSegment(&rb, bp, true); bp < 0 { |
|
138 return false |
|
139 } |
|
140 bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true) |
|
141 } |
|
142 return true |
|
143 } |
|
144 |
|
145 // patchTail fixes a case where a rune may be incorrectly normalized |
|
146 // if it is followed by illegal continuation bytes. It returns the |
|
147 // patched buffer and whether the decomposition is still in progress. |
|
148 func patchTail(rb *reorderBuffer) bool { |
|
149 info, p := lastRuneStart(&rb.f, rb.out) |
|
150 if p == -1 || info.size == 0 { |
|
151 return true |
|
152 } |
|
153 end := p + int(info.size) |
|
154 extra := len(rb.out) - end |
|
155 if extra > 0 { |
|
156 // Potentially allocating memory. However, this only |
|
157 // happens with ill-formed UTF-8. |
|
158 x := make([]byte, 0) |
|
159 x = append(x, rb.out[len(rb.out)-extra:]...) |
|
160 rb.out = rb.out[:end] |
|
161 decomposeToLastBoundary(rb) |
|
162 rb.doFlush() |
|
163 rb.out = append(rb.out, x...) |
|
164 return false |
|
165 } |
|
166 buf := rb.out[p:] |
|
167 rb.out = rb.out[:p] |
|
168 decomposeToLastBoundary(rb) |
|
169 if s := rb.ss.next(info); s == ssStarter { |
|
170 rb.doFlush() |
|
171 rb.ss.first(info) |
|
172 } else if s == ssOverflow { |
|
173 rb.doFlush() |
|
174 rb.insertCGJ() |
|
175 rb.ss = 0 |
|
176 } |
|
177 rb.insertUnsafe(inputBytes(buf), 0, info) |
|
178 return true |
|
179 } |
|
180 |
|
181 func appendQuick(rb *reorderBuffer, i int) int { |
|
182 if rb.nsrc == i { |
|
183 return i |
|
184 } |
|
185 end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true) |
|
186 rb.out = rb.src.appendSlice(rb.out, i, end) |
|
187 return end |
|
188 } |
|
189 |
|
190 // Append returns f(append(out, b...)). |
|
191 // The buffer out must be nil, empty, or equal to f(out). |
|
192 func (f Form) Append(out []byte, src ...byte) []byte { |
|
193 return f.doAppend(out, inputBytes(src), len(src)) |
|
194 } |
|
195 |
|
196 func (f Form) doAppend(out []byte, src input, n int) []byte { |
|
197 if n == 0 { |
|
198 return out |
|
199 } |
|
200 ft := formTable[f] |
|
201 // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer. |
|
202 if len(out) == 0 { |
|
203 p, _ := ft.quickSpan(src, 0, n, true) |
|
204 out = src.appendSlice(out, 0, p) |
|
205 if p == n { |
|
206 return out |
|
207 } |
|
208 rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush} |
|
209 return doAppendInner(&rb, p) |
|
210 } |
|
211 rb := reorderBuffer{f: *ft, src: src, nsrc: n} |
|
212 return doAppend(&rb, out, 0) |
|
213 } |
|
214 |
|
215 func doAppend(rb *reorderBuffer, out []byte, p int) []byte { |
|
216 rb.setFlusher(out, appendFlush) |
|
217 src, n := rb.src, rb.nsrc |
|
218 doMerge := len(out) > 0 |
|
219 if q := src.skipContinuationBytes(p); q > p { |
|
220 // Move leading non-starters to destination. |
|
221 rb.out = src.appendSlice(rb.out, p, q) |
|
222 p = q |
|
223 doMerge = patchTail(rb) |
|
224 } |
|
225 fd := &rb.f |
|
226 if doMerge { |
|
227 var info Properties |
|
228 if p < n { |
|
229 info = fd.info(src, p) |
|
230 if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 { |
|
231 if p == 0 { |
|
232 decomposeToLastBoundary(rb) |
|
233 } |
|
234 p = decomposeSegment(rb, p, true) |
|
235 } |
|
236 } |
|
237 if info.size == 0 { |
|
238 rb.doFlush() |
|
239 // Append incomplete UTF-8 encoding. |
|
240 return src.appendSlice(rb.out, p, n) |
|
241 } |
|
242 if rb.nrune > 0 { |
|
243 return doAppendInner(rb, p) |
|
244 } |
|
245 } |
|
246 p = appendQuick(rb, p) |
|
247 return doAppendInner(rb, p) |
|
248 } |
|
249 |
|
250 func doAppendInner(rb *reorderBuffer, p int) []byte { |
|
251 for n := rb.nsrc; p < n; { |
|
252 p = decomposeSegment(rb, p, true) |
|
253 p = appendQuick(rb, p) |
|
254 } |
|
255 return rb.out |
|
256 } |
|
257 |
|
258 // AppendString returns f(append(out, []byte(s))). |
|
259 // The buffer out must be nil, empty, or equal to f(out). |
|
260 func (f Form) AppendString(out []byte, src string) []byte { |
|
261 return f.doAppend(out, inputString(src), len(src)) |
|
262 } |
|
263 |
|
264 // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]). |
|
265 // It is not guaranteed to return the largest such n. |
|
266 func (f Form) QuickSpan(b []byte) int { |
|
267 n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true) |
|
268 return n |
|
269 } |
|
270 |
|
271 // Span implements transform.SpanningTransformer. It returns a boundary n such |
|
272 // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n. |
|
273 func (f Form) Span(b []byte, atEOF bool) (n int, err error) { |
|
274 n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF) |
|
275 if n < len(b) { |
|
276 if !ok { |
|
277 err = transform.ErrEndOfSpan |
|
278 } else { |
|
279 err = transform.ErrShortSrc |
|
280 } |
|
281 } |
|
282 return n, err |
|
283 } |
|
284 |
|
285 // SpanString returns a boundary n such that s[0:n] == f(s[0:n]). |
|
286 // It is not guaranteed to return the largest such n. |
|
287 func (f Form) SpanString(s string, atEOF bool) (n int, err error) { |
|
288 n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF) |
|
289 if n < len(s) { |
|
290 if !ok { |
|
291 err = transform.ErrEndOfSpan |
|
292 } else { |
|
293 err = transform.ErrShortSrc |
|
294 } |
|
295 } |
|
296 return n, err |
|
297 } |
|
298 |
|
299 // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and |
|
300 // whether any non-normalized parts were found. If atEOF is false, n will |
|
301 // not point past the last segment if this segment might be become |
|
302 // non-normalized by appending other runes. |
|
303 func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) { |
|
304 var lastCC uint8 |
|
305 ss := streamSafe(0) |
|
306 lastSegStart := i |
|
307 for n = end; i < n; { |
|
308 if j := src.skipASCII(i, n); i != j { |
|
309 i = j |
|
310 lastSegStart = i - 1 |
|
311 lastCC = 0 |
|
312 ss = 0 |
|
313 continue |
|
314 } |
|
315 info := f.info(src, i) |
|
316 if info.size == 0 { |
|
317 if atEOF { |
|
318 // include incomplete runes |
|
319 return n, true |
|
320 } |
|
321 return lastSegStart, true |
|
322 } |
|
323 // This block needs to be before the next, because it is possible to |
|
324 // have an overflow for runes that are starters (e.g. with U+FF9E). |
|
325 switch ss.next(info) { |
|
326 case ssStarter: |
|
327 lastSegStart = i |
|
328 case ssOverflow: |
|
329 return lastSegStart, false |
|
330 case ssSuccess: |
|
331 if lastCC > info.ccc { |
|
332 return lastSegStart, false |
|
333 } |
|
334 } |
|
335 if f.composing { |
|
336 if !info.isYesC() { |
|
337 break |
|
338 } |
|
339 } else { |
|
340 if !info.isYesD() { |
|
341 break |
|
342 } |
|
343 } |
|
344 lastCC = info.ccc |
|
345 i += int(info.size) |
|
346 } |
|
347 if i == n { |
|
348 if !atEOF { |
|
349 n = lastSegStart |
|
350 } |
|
351 return n, true |
|
352 } |
|
353 return lastSegStart, false |
|
354 } |
|
355 |
|
356 // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]). |
|
357 // It is not guaranteed to return the largest such n. |
|
358 func (f Form) QuickSpanString(s string) int { |
|
359 n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true) |
|
360 return n |
|
361 } |
|
362 |
|
363 // FirstBoundary returns the position i of the first boundary in b |
|
364 // or -1 if b contains no boundary. |
|
365 func (f Form) FirstBoundary(b []byte) int { |
|
366 return f.firstBoundary(inputBytes(b), len(b)) |
|
367 } |
|
368 |
|
369 func (f Form) firstBoundary(src input, nsrc int) int { |
|
370 i := src.skipContinuationBytes(0) |
|
371 if i >= nsrc { |
|
372 return -1 |
|
373 } |
|
374 fd := formTable[f] |
|
375 ss := streamSafe(0) |
|
376 // We should call ss.first here, but we can't as the first rune is |
|
377 // skipped already. This means FirstBoundary can't really determine |
|
378 // CGJ insertion points correctly. Luckily it doesn't have to. |
|
379 for { |
|
380 info := fd.info(src, i) |
|
381 if info.size == 0 { |
|
382 return -1 |
|
383 } |
|
384 if s := ss.next(info); s != ssSuccess { |
|
385 return i |
|
386 } |
|
387 i += int(info.size) |
|
388 if i >= nsrc { |
|
389 if !info.BoundaryAfter() && !ss.isMax() { |
|
390 return -1 |
|
391 } |
|
392 return nsrc |
|
393 } |
|
394 } |
|
395 } |
|
396 |
|
397 // FirstBoundaryInString returns the position i of the first boundary in s |
|
398 // or -1 if s contains no boundary. |
|
399 func (f Form) FirstBoundaryInString(s string) int { |
|
400 return f.firstBoundary(inputString(s), len(s)) |
|
401 } |
|
402 |
|
403 // NextBoundary reports the index of the boundary between the first and next |
|
404 // segment in b or -1 if atEOF is false and there are not enough bytes to |
|
405 // determine this boundary. |
|
406 func (f Form) NextBoundary(b []byte, atEOF bool) int { |
|
407 return f.nextBoundary(inputBytes(b), len(b), atEOF) |
|
408 } |
|
409 |
|
410 // NextBoundaryInString reports the index of the boundary between the first and |
|
411 // next segment in b or -1 if atEOF is false and there are not enough bytes to |
|
412 // determine this boundary. |
|
413 func (f Form) NextBoundaryInString(s string, atEOF bool) int { |
|
414 return f.nextBoundary(inputString(s), len(s), atEOF) |
|
415 } |
|
416 |
|
417 func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int { |
|
418 if nsrc == 0 { |
|
419 if atEOF { |
|
420 return 0 |
|
421 } |
|
422 return -1 |
|
423 } |
|
424 fd := formTable[f] |
|
425 info := fd.info(src, 0) |
|
426 if info.size == 0 { |
|
427 if atEOF { |
|
428 return 1 |
|
429 } |
|
430 return -1 |
|
431 } |
|
432 ss := streamSafe(0) |
|
433 ss.first(info) |
|
434 |
|
435 for i := int(info.size); i < nsrc; i += int(info.size) { |
|
436 info = fd.info(src, i) |
|
437 if info.size == 0 { |
|
438 if atEOF { |
|
439 return i |
|
440 } |
|
441 return -1 |
|
442 } |
|
443 // TODO: Using streamSafe to determine the boundary isn't the same as |
|
444 // using BoundaryBefore. Determine which should be used. |
|
445 if s := ss.next(info); s != ssSuccess { |
|
446 return i |
|
447 } |
|
448 } |
|
449 if !atEOF && !info.BoundaryAfter() && !ss.isMax() { |
|
450 return -1 |
|
451 } |
|
452 return nsrc |
|
453 } |
|
454 |
|
455 // LastBoundary returns the position i of the last boundary in b |
|
456 // or -1 if b contains no boundary. |
|
457 func (f Form) LastBoundary(b []byte) int { |
|
458 return lastBoundary(formTable[f], b) |
|
459 } |
|
460 |
|
461 func lastBoundary(fd *formInfo, b []byte) int { |
|
462 i := len(b) |
|
463 info, p := lastRuneStart(fd, b) |
|
464 if p == -1 { |
|
465 return -1 |
|
466 } |
|
467 if info.size == 0 { // ends with incomplete rune |
|
468 if p == 0 { // starts with incomplete rune |
|
469 return -1 |
|
470 } |
|
471 i = p |
|
472 info, p = lastRuneStart(fd, b[:i]) |
|
473 if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter |
|
474 return i |
|
475 } |
|
476 } |
|
477 if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8 |
|
478 return i |
|
479 } |
|
480 if info.BoundaryAfter() { |
|
481 return i |
|
482 } |
|
483 ss := streamSafe(0) |
|
484 v := ss.backwards(info) |
|
485 for i = p; i >= 0 && v != ssStarter; i = p { |
|
486 info, p = lastRuneStart(fd, b[:i]) |
|
487 if v = ss.backwards(info); v == ssOverflow { |
|
488 break |
|
489 } |
|
490 if p+int(info.size) != i { |
|
491 if p == -1 { // no boundary found |
|
492 return -1 |
|
493 } |
|
494 return i // boundary after an illegal UTF-8 encoding |
|
495 } |
|
496 } |
|
497 return i |
|
498 } |
|
499 |
|
500 // decomposeSegment scans the first segment in src into rb. It inserts 0x034f |
|
501 // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters |
|
502 // and returns the number of bytes consumed from src or iShortDst or iShortSrc. |
|
503 func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int { |
|
504 // Force one character to be consumed. |
|
505 info := rb.f.info(rb.src, sp) |
|
506 if info.size == 0 { |
|
507 return 0 |
|
508 } |
|
509 if s := rb.ss.next(info); s == ssStarter { |
|
510 // TODO: this could be removed if we don't support merging. |
|
511 if rb.nrune > 0 { |
|
512 goto end |
|
513 } |
|
514 } else if s == ssOverflow { |
|
515 rb.insertCGJ() |
|
516 goto end |
|
517 } |
|
518 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { |
|
519 return int(err) |
|
520 } |
|
521 for { |
|
522 sp += int(info.size) |
|
523 if sp >= rb.nsrc { |
|
524 if !atEOF && !info.BoundaryAfter() { |
|
525 return int(iShortSrc) |
|
526 } |
|
527 break |
|
528 } |
|
529 info = rb.f.info(rb.src, sp) |
|
530 if info.size == 0 { |
|
531 if !atEOF { |
|
532 return int(iShortSrc) |
|
533 } |
|
534 break |
|
535 } |
|
536 if s := rb.ss.next(info); s == ssStarter { |
|
537 break |
|
538 } else if s == ssOverflow { |
|
539 rb.insertCGJ() |
|
540 break |
|
541 } |
|
542 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { |
|
543 return int(err) |
|
544 } |
|
545 } |
|
546 end: |
|
547 if !rb.doFlush() { |
|
548 return int(iShortDst) |
|
549 } |
|
550 return sp |
|
551 } |
|
552 |
|
553 // lastRuneStart returns the runeInfo and position of the last |
|
554 // rune in buf or the zero runeInfo and -1 if no rune was found. |
|
555 func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) { |
|
556 p := len(buf) - 1 |
|
557 for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { |
|
558 } |
|
559 if p < 0 { |
|
560 return Properties{}, -1 |
|
561 } |
|
562 return fd.info(inputBytes(buf), p), p |
|
563 } |
|
564 |
|
565 // decomposeToLastBoundary finds an open segment at the end of the buffer |
|
566 // and scans it into rb. Returns the buffer minus the last segment. |
|
567 func decomposeToLastBoundary(rb *reorderBuffer) { |
|
568 fd := &rb.f |
|
569 info, i := lastRuneStart(fd, rb.out) |
|
570 if int(info.size) != len(rb.out)-i { |
|
571 // illegal trailing continuation bytes |
|
572 return |
|
573 } |
|
574 if info.BoundaryAfter() { |
|
575 return |
|
576 } |
|
577 var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order |
|
578 padd := 0 |
|
579 ss := streamSafe(0) |
|
580 p := len(rb.out) |
|
581 for { |
|
582 add[padd] = info |
|
583 v := ss.backwards(info) |
|
584 if v == ssOverflow { |
|
585 // Note that if we have an overflow, it the string we are appending to |
|
586 // is not correctly normalized. In this case the behavior is undefined. |
|
587 break |
|
588 } |
|
589 padd++ |
|
590 p -= int(info.size) |
|
591 if v == ssStarter || p < 0 { |
|
592 break |
|
593 } |
|
594 info, i = lastRuneStart(fd, rb.out[:p]) |
|
595 if int(info.size) != p-i { |
|
596 break |
|
597 } |
|
598 } |
|
599 rb.ss = ss |
|
600 // Copy bytes for insertion as we may need to overwrite rb.out. |
|
601 var buf [maxBufferSize * utf8.UTFMax]byte |
|
602 cp := buf[:copy(buf[:], rb.out[p:])] |
|
603 rb.out = rb.out[:p] |
|
604 for padd--; padd >= 0; padd-- { |
|
605 info = add[padd] |
|
606 rb.insertUnsafe(inputBytes(cp), 0, info) |
|
607 cp = cp[info.size:] |
|
608 } |
|
609 } |