vendor/golang.org/x/text/runes/runes.go
changeset 260 445e01aede7e
child 265 05c40b36d3b2
equal deleted inserted replaced
259:db4911b0c721 260:445e01aede7e
       
     1 // Copyright 2014 The Go Authors. All rights reserved.
       
     2 // Use of this source code is governed by a BSD-style
       
     3 // license that can be found in the LICENSE file.
       
     4 
       
     5 // Package runes provides transforms for UTF-8 encoded text.
       
     6 package runes // import "golang.org/x/text/runes"
       
     7 
       
     8 import (
       
     9 	"unicode"
       
    10 	"unicode/utf8"
       
    11 
       
    12 	"golang.org/x/text/transform"
       
    13 )
       
    14 
       
    15 // A Set is a collection of runes whose membership is tested with Contains.
       
    16 type Set interface {
       
    17 	// Contains reports whether r is contained in the set.
       
    18 	Contains(r rune) bool
       
    19 }
       
    20 
       
    21 type setFunc func(rune) bool
       
    22 
       
    23 func (s setFunc) Contains(r rune) bool {
       
    24 	return s(r)
       
    25 }
       
    26 
       
    27 // Note: using funcs here instead of wrapping types results in cleaner
       
    28 // documentation and a smaller API.
       
    29 
       
    30 // In creates a Set with a Contains method that returns true for all runes in
       
    31 // the given RangeTable.
       
    32 func In(rt *unicode.RangeTable) Set {
       
    33 	return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
       
    34 }
       
    35 
       
    36 // In creates a Set with a Contains method that returns true for all runes not
       
    37 // in the given RangeTable.
       
    38 func NotIn(rt *unicode.RangeTable) Set {
       
    39 	return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
       
    40 }
       
    41 
       
    42 // Predicate creates a Set with a Contains method that returns f(r).
       
    43 func Predicate(f func(rune) bool) Set {
       
    44 	return setFunc(f)
       
    45 }
       
    46 
       
    47 // Transformer implements the transform.Transformer interface.
       
    48 type Transformer struct {
       
    49 	t transform.SpanningTransformer
       
    50 }
       
    51 
       
    52 func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
       
    53 	return t.t.Transform(dst, src, atEOF)
       
    54 }
       
    55 
       
    56 func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
       
    57 	return t.t.Span(b, atEOF)
       
    58 }
       
    59 
       
    60 func (t Transformer) Reset() { t.t.Reset() }
       
    61 
       
    62 // Bytes returns a new byte slice with the result of converting b using t.  It
       
    63 // calls Reset on t. It returns nil if any error was found. This can only happen
       
    64 // if an error-producing Transformer is passed to If.
       
    65 func (t Transformer) Bytes(b []byte) []byte {
       
    66 	b, _, err := transform.Bytes(t, b)
       
    67 	if err != nil {
       
    68 		return nil
       
    69 	}
       
    70 	return b
       
    71 }
       
    72 
       
    73 // String returns a string with the result of converting s using t. It calls
       
    74 // Reset on t. It returns the empty string if any error was found. This can only
       
    75 // happen if an error-producing Transformer is passed to If.
       
    76 func (t Transformer) String(s string) string {
       
    77 	s, _, err := transform.String(t, s)
       
    78 	if err != nil {
       
    79 		return ""
       
    80 	}
       
    81 	return s
       
    82 }
       
    83 
       
    84 // TODO:
       
    85 // - Copy: copying strings and bytes in whole-rune units.
       
    86 // - Validation (maybe)
       
    87 // - Well-formed-ness (maybe)
       
    88 
       
    89 const runeErrorString = string(utf8.RuneError) // UTF-8 encoding of utf8.RuneError; written byte by byte (indices 0..2) by the Transform methods
       
    90 
       
    91 // Remove returns a Transformer that removes runes r for which s.Contains(r).
       
    92 // Illegal input bytes are replaced by RuneError before being passed to f.
       
    93 func Remove(s Set) Transformer {
       
    94 	if f, ok := s.(setFunc); ok {
       
    95 		// This little trick cuts the running time of BenchmarkRemove for sets
       
    96 		// created by Predicate roughly in half.
       
    97 		// TODO: special-case RangeTables as well.
       
    98 		return Transformer{remove(f)}
       
    99 	}
       
   100 	return Transformer{remove(s.Contains)}
       
   101 }
       
   102 
       
   103 // TODO: remove transform.RemoveFunc.
       
   104 
       
   105 type remove func(r rune) bool
       
   106 
       
   107 func (remove) Reset() {}
       
   108 
       
   109 // Span implements transform.Spanner.
       
   110 func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
       
   111 	for r, size := rune(0), 0; n < len(src); {
       
   112 		if r = rune(src[n]); r < utf8.RuneSelf {
       
   113 			size = 1
       
   114 		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
       
   115 			// Invalid rune.
       
   116 			if !atEOF && !utf8.FullRune(src[n:]) {
       
   117 				err = transform.ErrShortSrc
       
   118 			} else {
       
   119 				err = transform.ErrEndOfSpan
       
   120 			}
       
   121 			break
       
   122 		}
       
   123 		if t(r) {
       
   124 			err = transform.ErrEndOfSpan
       
   125 			break
       
   126 		}
       
   127 		n += size
       
   128 	}
       
   129 	return
       
   130 }
       
   131 
       
   132 // Transform implements transform.Transformer.
       
   133 func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
       
   134 	for r, size := rune(0), 0; nSrc < len(src); {
       
   135 		if r = rune(src[nSrc]); r < utf8.RuneSelf {
       
   136 			size = 1
       
   137 		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
       
   138 			// Invalid rune.
       
   139 			if !atEOF && !utf8.FullRune(src[nSrc:]) {
       
   140 				err = transform.ErrShortSrc
       
   141 				break
       
   142 			}
       
   143 			// We replace illegal bytes with RuneError. Not doing so might
       
   144 			// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
       
   145 			// The resulting byte sequence may subsequently contain runes
       
   146 			// for which t(r) is true that were passed unnoticed.
       
   147 			if !t(utf8.RuneError) {
       
   148 				if nDst+3 > len(dst) {
       
   149 					err = transform.ErrShortDst
       
   150 					break
       
   151 				}
       
   152 				dst[nDst+0] = runeErrorString[0]
       
   153 				dst[nDst+1] = runeErrorString[1]
       
   154 				dst[nDst+2] = runeErrorString[2]
       
   155 				nDst += 3
       
   156 			}
       
   157 			nSrc++
       
   158 			continue
       
   159 		}
       
   160 		if t(r) {
       
   161 			nSrc += size
       
   162 			continue
       
   163 		}
       
   164 		if nDst+size > len(dst) {
       
   165 			err = transform.ErrShortDst
       
   166 			break
       
   167 		}
       
   168 		for i := 0; i < size; i++ {
       
   169 			dst[nDst] = src[nSrc]
       
   170 			nDst++
       
   171 			nSrc++
       
   172 		}
       
   173 	}
       
   174 	return
       
   175 }
       
   176 
       
   177 // Map returns a Transformer that maps the runes in the input using the given
       
   178 // mapping. Illegal bytes in the input are converted to utf8.RuneError before
       
   179 // being passed to the mapping func.
       
   180 func Map(mapping func(rune) rune) Transformer {
       
   181 	return Transformer{mapper(mapping)}
       
   182 }
       
   183 
       
   184 type mapper func(rune) rune
       
   185 
       
   186 func (mapper) Reset() {}
       
   187 
       
   188 // Span implements transform.Spanner.
       
   189 func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
       
   190 	for r, size := rune(0), 0; n < len(src); n += size {
       
   191 		if r = rune(src[n]); r < utf8.RuneSelf {
       
   192 			size = 1
       
   193 		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
       
   194 			// Invalid rune.
       
   195 			if !atEOF && !utf8.FullRune(src[n:]) {
       
   196 				err = transform.ErrShortSrc
       
   197 			} else {
       
   198 				err = transform.ErrEndOfSpan
       
   199 			}
       
   200 			break
       
   201 		}
       
   202 		if t(r) != r {
       
   203 			err = transform.ErrEndOfSpan
       
   204 			break
       
   205 		}
       
   206 	}
       
   207 	return n, err
       
   208 }
       
   209 
       
   210 // Transform implements transform.Transformer.
       
   211 func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
       
   212 	var replacement rune
       
   213 	var b [utf8.UTFMax]byte
       
   214 
       
   215 	for r, size := rune(0), 0; nSrc < len(src); {
       
   216 		if r = rune(src[nSrc]); r < utf8.RuneSelf {
       
   217 			if replacement = t(r); replacement < utf8.RuneSelf {
       
   218 				if nDst == len(dst) {
       
   219 					err = transform.ErrShortDst
       
   220 					break
       
   221 				}
       
   222 				dst[nDst] = byte(replacement)
       
   223 				nDst++
       
   224 				nSrc++
       
   225 				continue
       
   226 			}
       
   227 			size = 1
       
   228 		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
       
   229 			// Invalid rune.
       
   230 			if !atEOF && !utf8.FullRune(src[nSrc:]) {
       
   231 				err = transform.ErrShortSrc
       
   232 				break
       
   233 			}
       
   234 
       
   235 			if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
       
   236 				if nDst+3 > len(dst) {
       
   237 					err = transform.ErrShortDst
       
   238 					break
       
   239 				}
       
   240 				dst[nDst+0] = runeErrorString[0]
       
   241 				dst[nDst+1] = runeErrorString[1]
       
   242 				dst[nDst+2] = runeErrorString[2]
       
   243 				nDst += 3
       
   244 				nSrc++
       
   245 				continue
       
   246 			}
       
   247 		} else if replacement = t(r); replacement == r {
       
   248 			if nDst+size > len(dst) {
       
   249 				err = transform.ErrShortDst
       
   250 				break
       
   251 			}
       
   252 			for i := 0; i < size; i++ {
       
   253 				dst[nDst] = src[nSrc]
       
   254 				nDst++
       
   255 				nSrc++
       
   256 			}
       
   257 			continue
       
   258 		}
       
   259 
       
   260 		n := utf8.EncodeRune(b[:], replacement)
       
   261 
       
   262 		if nDst+n > len(dst) {
       
   263 			err = transform.ErrShortDst
       
   264 			break
       
   265 		}
       
   266 		for i := 0; i < n; i++ {
       
   267 			dst[nDst] = b[i]
       
   268 			nDst++
       
   269 		}
       
   270 		nSrc += size
       
   271 	}
       
   272 	return
       
   273 }
       
   274 
       
   275 // ReplaceIllFormed returns a transformer that replaces all input bytes that are
       
   276 // not part of a well-formed UTF-8 code sequence with utf8.RuneError.
       
   277 func ReplaceIllFormed() Transformer {
       
   278 	return Transformer{&replaceIllFormed{}}
       
   279 }
       
   280 
       
   281 type replaceIllFormed struct{ transform.NopResetter }
       
   282 
       
   283 func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
       
   284 	for n < len(src) {
       
   285 		// ASCII fast path.
       
   286 		if src[n] < utf8.RuneSelf {
       
   287 			n++
       
   288 			continue
       
   289 		}
       
   290 
       
   291 		r, size := utf8.DecodeRune(src[n:])
       
   292 
       
   293 		// Look for a valid non-ASCII rune.
       
   294 		if r != utf8.RuneError || size != 1 {
       
   295 			n += size
       
   296 			continue
       
   297 		}
       
   298 
       
   299 		// Look for short source data.
       
   300 		if !atEOF && !utf8.FullRune(src[n:]) {
       
   301 			err = transform.ErrShortSrc
       
   302 			break
       
   303 		}
       
   304 
       
   305 		// We have an invalid rune.
       
   306 		err = transform.ErrEndOfSpan
       
   307 		break
       
   308 	}
       
   309 	return n, err
       
   310 }
       
   311 
       
   312 func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
       
   313 	for nSrc < len(src) {
       
   314 		// ASCII fast path.
       
   315 		if r := src[nSrc]; r < utf8.RuneSelf {
       
   316 			if nDst == len(dst) {
       
   317 				err = transform.ErrShortDst
       
   318 				break
       
   319 			}
       
   320 			dst[nDst] = r
       
   321 			nDst++
       
   322 			nSrc++
       
   323 			continue
       
   324 		}
       
   325 
       
   326 		// Look for a valid non-ASCII rune.
       
   327 		if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
       
   328 			if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
       
   329 				err = transform.ErrShortDst
       
   330 				break
       
   331 			}
       
   332 			nDst += size
       
   333 			nSrc += size
       
   334 			continue
       
   335 		}
       
   336 
       
   337 		// Look for short source data.
       
   338 		if !atEOF && !utf8.FullRune(src[nSrc:]) {
       
   339 			err = transform.ErrShortSrc
       
   340 			break
       
   341 		}
       
   342 
       
   343 		// We have an invalid rune.
       
   344 		if nDst+3 > len(dst) {
       
   345 			err = transform.ErrShortDst
       
   346 			break
       
   347 		}
       
   348 		dst[nDst+0] = runeErrorString[0]
       
   349 		dst[nDst+1] = runeErrorString[1]
       
   350 		dst[nDst+2] = runeErrorString[2]
       
   351 		nDst += 3
       
   352 		nSrc++
       
   353 	}
       
   354 	return nDst, nSrc, err
       
   355 }