|
1 // Copyright 2014 The Go Authors. All rights reserved. |
|
2 // Use of this source code is governed by a BSD-style |
|
3 // license that can be found in the LICENSE file. |
|
4 |
|
// Package runes provides transforms for UTF-8 encoded text.
|
6 package runes // import "golang.org/x/text/runes" |
|
7 |
|
8 import ( |
|
9 "unicode" |
|
10 "unicode/utf8" |
|
11 |
|
12 "golang.org/x/text/transform" |
|
13 ) |
|
14 |
|
// Set describes a collection of runes through a single membership test.
type Set interface {
	// Contains reports whether the rune r belongs to the set.
	Contains(r rune) bool
}
|
20 |
|
// setFunc adapts a plain predicate function so that it can be used as a Set.
type setFunc func(rune) bool

// Contains reports the result of calling the underlying predicate with r.
func (s setFunc) Contains(r rune) bool {
	member := s(r)
	return member
}
|
26 |
|
// Note: using funcs here instead of wrapping types results in cleaner
// documentation and a smaller API.
|
29 |
|
30 // In creates a Set with a Contains method that returns true for all runes in |
|
31 // the given RangeTable. |
|
32 func In(rt *unicode.RangeTable) Set { |
|
33 return setFunc(func(r rune) bool { return unicode.Is(rt, r) }) |
|
34 } |
|
35 |
|
36 // In creates a Set with a Contains method that returns true for all runes not |
|
37 // in the given RangeTable. |
|
38 func NotIn(rt *unicode.RangeTable) Set { |
|
39 return setFunc(func(r rune) bool { return !unicode.Is(rt, r) }) |
|
40 } |
|
41 |
|
42 // Predicate creates a Set with a Contains method that returns f(r). |
|
43 func Predicate(f func(rune) bool) Set { |
|
44 return setFunc(f) |
|
45 } |
|
46 |
|
47 // Transformer implements the transform.Transformer interface. |
|
48 type Transformer struct { |
|
49 t transform.SpanningTransformer |
|
50 } |
|
51 |
|
52 func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
53 return t.t.Transform(dst, src, atEOF) |
|
54 } |
|
55 |
|
56 func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) { |
|
57 return t.t.Span(b, atEOF) |
|
58 } |
|
59 |
|
60 func (t Transformer) Reset() { t.t.Reset() } |
|
61 |
|
62 // Bytes returns a new byte slice with the result of converting b using t. It |
|
63 // calls Reset on t. It returns nil if any error was found. This can only happen |
|
64 // if an error-producing Transformer is passed to If. |
|
65 func (t Transformer) Bytes(b []byte) []byte { |
|
66 b, _, err := transform.Bytes(t, b) |
|
67 if err != nil { |
|
68 return nil |
|
69 } |
|
70 return b |
|
71 } |
|
72 |
|
73 // String returns a string with the result of converting s using t. It calls |
|
74 // Reset on t. It returns the empty string if any error was found. This can only |
|
75 // happen if an error-producing Transformer is passed to If. |
|
76 func (t Transformer) String(s string) string { |
|
77 s, _, err := transform.String(t, s) |
|
78 if err != nil { |
|
79 return "" |
|
80 } |
|
81 return s |
|
82 } |
|
83 |
|
84 // TODO: |
|
85 // - Copy: copying strings and bytes in whole-rune units. |
|
86 // - Validation (maybe) |
|
87 // - Well-formed-ness (maybe) |
|
88 |
|
// runeErrorString is the UTF-8 encoding of utf8.RuneError (U+FFFD). The
// transformers in this file write its three bytes in place of ill-formed
// input bytes.
const runeErrorString = string(utf8.RuneError)
|
90 |
|
91 // Remove returns a Transformer that removes runes r for which s.Contains(r). |
|
92 // Illegal input bytes are replaced by RuneError before being passed to f. |
|
93 func Remove(s Set) Transformer { |
|
94 if f, ok := s.(setFunc); ok { |
|
95 // This little trick cuts the running time of BenchmarkRemove for sets |
|
96 // created by Predicate roughly in half. |
|
97 // TODO: special-case RangeTables as well. |
|
98 return Transformer{remove(f)} |
|
99 } |
|
100 return Transformer{remove(s.Contains)} |
|
101 } |
|
102 |
|
103 // TODO: remove transform.RemoveFunc. |
|
104 |
|
// remove is a predicate selecting the runes to drop; its methods turn it
// into a transformer.
type remove func(r rune) bool

// Reset does nothing.
func (remove) Reset() {
}
|
108 |
|
109 // Span implements transform.Spanner. |
|
110 func (t remove) Span(src []byte, atEOF bool) (n int, err error) { |
|
111 for r, size := rune(0), 0; n < len(src); { |
|
112 if r = rune(src[n]); r < utf8.RuneSelf { |
|
113 size = 1 |
|
114 } else if r, size = utf8.DecodeRune(src[n:]); size == 1 { |
|
115 // Invalid rune. |
|
116 if !atEOF && !utf8.FullRune(src[n:]) { |
|
117 err = transform.ErrShortSrc |
|
118 } else { |
|
119 err = transform.ErrEndOfSpan |
|
120 } |
|
121 break |
|
122 } |
|
123 if t(r) { |
|
124 err = transform.ErrEndOfSpan |
|
125 break |
|
126 } |
|
127 n += size |
|
128 } |
|
129 return |
|
130 } |
|
131 |
|
132 // Transform implements transform.Transformer. |
|
133 func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
134 for r, size := rune(0), 0; nSrc < len(src); { |
|
135 if r = rune(src[nSrc]); r < utf8.RuneSelf { |
|
136 size = 1 |
|
137 } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { |
|
138 // Invalid rune. |
|
139 if !atEOF && !utf8.FullRune(src[nSrc:]) { |
|
140 err = transform.ErrShortSrc |
|
141 break |
|
142 } |
|
143 // We replace illegal bytes with RuneError. Not doing so might |
|
144 // otherwise turn a sequence of invalid UTF-8 into valid UTF-8. |
|
145 // The resulting byte sequence may subsequently contain runes |
|
146 // for which t(r) is true that were passed unnoticed. |
|
147 if !t(utf8.RuneError) { |
|
148 if nDst+3 > len(dst) { |
|
149 err = transform.ErrShortDst |
|
150 break |
|
151 } |
|
152 dst[nDst+0] = runeErrorString[0] |
|
153 dst[nDst+1] = runeErrorString[1] |
|
154 dst[nDst+2] = runeErrorString[2] |
|
155 nDst += 3 |
|
156 } |
|
157 nSrc++ |
|
158 continue |
|
159 } |
|
160 if t(r) { |
|
161 nSrc += size |
|
162 continue |
|
163 } |
|
164 if nDst+size > len(dst) { |
|
165 err = transform.ErrShortDst |
|
166 break |
|
167 } |
|
168 for i := 0; i < size; i++ { |
|
169 dst[nDst] = src[nSrc] |
|
170 nDst++ |
|
171 nSrc++ |
|
172 } |
|
173 } |
|
174 return |
|
175 } |
|
176 |
|
177 // Map returns a Transformer that maps the runes in the input using the given |
|
178 // mapping. Illegal bytes in the input are converted to utf8.RuneError before |
|
179 // being passed to the mapping func. |
|
180 func Map(mapping func(rune) rune) Transformer { |
|
181 return Transformer{mapper(mapping)} |
|
182 } |
|
183 |
|
// mapper is a rune-to-rune mapping function; its methods turn it into a
// transformer.
type mapper func(rune) rune

// Reset does nothing.
func (mapper) Reset() {
}
|
187 |
|
188 // Span implements transform.Spanner. |
|
189 func (t mapper) Span(src []byte, atEOF bool) (n int, err error) { |
|
190 for r, size := rune(0), 0; n < len(src); n += size { |
|
191 if r = rune(src[n]); r < utf8.RuneSelf { |
|
192 size = 1 |
|
193 } else if r, size = utf8.DecodeRune(src[n:]); size == 1 { |
|
194 // Invalid rune. |
|
195 if !atEOF && !utf8.FullRune(src[n:]) { |
|
196 err = transform.ErrShortSrc |
|
197 } else { |
|
198 err = transform.ErrEndOfSpan |
|
199 } |
|
200 break |
|
201 } |
|
202 if t(r) != r { |
|
203 err = transform.ErrEndOfSpan |
|
204 break |
|
205 } |
|
206 } |
|
207 return n, err |
|
208 } |
|
209 |
|
210 // Transform implements transform.Transformer. |
|
211 func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
212 var replacement rune |
|
213 var b [utf8.UTFMax]byte |
|
214 |
|
215 for r, size := rune(0), 0; nSrc < len(src); { |
|
216 if r = rune(src[nSrc]); r < utf8.RuneSelf { |
|
217 if replacement = t(r); replacement < utf8.RuneSelf { |
|
218 if nDst == len(dst) { |
|
219 err = transform.ErrShortDst |
|
220 break |
|
221 } |
|
222 dst[nDst] = byte(replacement) |
|
223 nDst++ |
|
224 nSrc++ |
|
225 continue |
|
226 } |
|
227 size = 1 |
|
228 } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { |
|
229 // Invalid rune. |
|
230 if !atEOF && !utf8.FullRune(src[nSrc:]) { |
|
231 err = transform.ErrShortSrc |
|
232 break |
|
233 } |
|
234 |
|
235 if replacement = t(utf8.RuneError); replacement == utf8.RuneError { |
|
236 if nDst+3 > len(dst) { |
|
237 err = transform.ErrShortDst |
|
238 break |
|
239 } |
|
240 dst[nDst+0] = runeErrorString[0] |
|
241 dst[nDst+1] = runeErrorString[1] |
|
242 dst[nDst+2] = runeErrorString[2] |
|
243 nDst += 3 |
|
244 nSrc++ |
|
245 continue |
|
246 } |
|
247 } else if replacement = t(r); replacement == r { |
|
248 if nDst+size > len(dst) { |
|
249 err = transform.ErrShortDst |
|
250 break |
|
251 } |
|
252 for i := 0; i < size; i++ { |
|
253 dst[nDst] = src[nSrc] |
|
254 nDst++ |
|
255 nSrc++ |
|
256 } |
|
257 continue |
|
258 } |
|
259 |
|
260 n := utf8.EncodeRune(b[:], replacement) |
|
261 |
|
262 if nDst+n > len(dst) { |
|
263 err = transform.ErrShortDst |
|
264 break |
|
265 } |
|
266 for i := 0; i < n; i++ { |
|
267 dst[nDst] = b[i] |
|
268 nDst++ |
|
269 } |
|
270 nSrc += size |
|
271 } |
|
272 return |
|
273 } |
|
274 |
|
275 // ReplaceIllFormed returns a transformer that replaces all input bytes that are |
|
276 // not part of a well-formed UTF-8 code sequence with utf8.RuneError. |
|
277 func ReplaceIllFormed() Transformer { |
|
278 return Transformer{&replaceIllFormed{}} |
|
279 } |
|
280 |
|
281 type replaceIllFormed struct{ transform.NopResetter } |
|
282 |
|
283 func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) { |
|
284 for n < len(src) { |
|
285 // ASCII fast path. |
|
286 if src[n] < utf8.RuneSelf { |
|
287 n++ |
|
288 continue |
|
289 } |
|
290 |
|
291 r, size := utf8.DecodeRune(src[n:]) |
|
292 |
|
293 // Look for a valid non-ASCII rune. |
|
294 if r != utf8.RuneError || size != 1 { |
|
295 n += size |
|
296 continue |
|
297 } |
|
298 |
|
299 // Look for short source data. |
|
300 if !atEOF && !utf8.FullRune(src[n:]) { |
|
301 err = transform.ErrShortSrc |
|
302 break |
|
303 } |
|
304 |
|
305 // We have an invalid rune. |
|
306 err = transform.ErrEndOfSpan |
|
307 break |
|
308 } |
|
309 return n, err |
|
310 } |
|
311 |
|
312 func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
313 for nSrc < len(src) { |
|
314 // ASCII fast path. |
|
315 if r := src[nSrc]; r < utf8.RuneSelf { |
|
316 if nDst == len(dst) { |
|
317 err = transform.ErrShortDst |
|
318 break |
|
319 } |
|
320 dst[nDst] = r |
|
321 nDst++ |
|
322 nSrc++ |
|
323 continue |
|
324 } |
|
325 |
|
326 // Look for a valid non-ASCII rune. |
|
327 if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 { |
|
328 if size != copy(dst[nDst:], src[nSrc:nSrc+size]) { |
|
329 err = transform.ErrShortDst |
|
330 break |
|
331 } |
|
332 nDst += size |
|
333 nSrc += size |
|
334 continue |
|
335 } |
|
336 |
|
337 // Look for short source data. |
|
338 if !atEOF && !utf8.FullRune(src[nSrc:]) { |
|
339 err = transform.ErrShortSrc |
|
340 break |
|
341 } |
|
342 |
|
343 // We have an invalid rune. |
|
344 if nDst+3 > len(dst) { |
|
345 err = transform.ErrShortDst |
|
346 break |
|
347 } |
|
348 dst[nDst+0] = runeErrorString[0] |
|
349 dst[nDst+1] = runeErrorString[1] |
|
350 dst[nDst+2] = runeErrorString[2] |
|
351 nDst += 3 |
|
352 nSrc++ |
|
353 } |
|
354 return nDst, nSrc, err |
|
355 } |