vendor/github.com/pelletier/go-toml/v2/utf8.go
changeset 260 445e01aede7e
equal deleted inserted replaced
259:db4911b0c721 260:445e01aede7e
       
     1 package toml
       
     2 
       
     3 import (
       
     4 	"unicode/utf8"
       
     5 )
       
     6 
       
     7 type utf8Err struct {
       
     8 	Index int
       
     9 	Size  int
       
    10 }
       
    11 
       
    12 func (u utf8Err) Zero() bool {
       
    13 	return u.Size == 0
       
    14 }
       
    15 
       
    16 // Verified that a given string is only made of valid UTF-8 characters allowed
       
    17 // by the TOML spec:
       
    18 //
       
    19 // Any Unicode character may be used except those that must be escaped:
       
    20 // quotation mark, backslash, and the control characters other than tab (U+0000
       
    21 // to U+0008, U+000A to U+001F, U+007F).
       
    22 //
       
    23 // It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
       
    24 // when a character is not allowed.
       
    25 //
       
    26 // The returned utf8Err is Zero() if the string is valid, or contains the byte
       
    27 // index and size of the invalid character.
       
    28 //
       
    29 // quotation mark => already checked
       
    30 // backslash => already checked
       
    31 // 0-0x8 => invalid
       
    32 // 0x9 => tab, ok
       
    33 // 0xA - 0x1F => invalid
       
    34 // 0x7F => invalid
       
    35 func utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
       
    36 	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
       
    37 	offset := 0
       
    38 	for len(p) >= 8 {
       
    39 		// Combining two 32 bit loads allows the same code to be used
       
    40 		// for 32 and 64 bit platforms.
       
    41 		// The compiler can generate a 32bit load for first32 and second32
       
    42 		// on many platforms. See test/codegen/memcombine.go.
       
    43 		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
       
    44 		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
       
    45 		if (first32|second32)&0x80808080 != 0 {
       
    46 			// Found a non ASCII byte (>= RuneSelf).
       
    47 			break
       
    48 		}
       
    49 
       
    50 		for i, b := range p[:8] {
       
    51 			if invalidAscii(b) {
       
    52 				err.Index = offset + i
       
    53 				err.Size = 1
       
    54 				return
       
    55 			}
       
    56 		}
       
    57 
       
    58 		p = p[8:]
       
    59 		offset += 8
       
    60 	}
       
    61 	n := len(p)
       
    62 	for i := 0; i < n; {
       
    63 		pi := p[i]
       
    64 		if pi < utf8.RuneSelf {
       
    65 			if invalidAscii(pi) {
       
    66 				err.Index = offset + i
       
    67 				err.Size = 1
       
    68 				return
       
    69 			}
       
    70 			i++
       
    71 			continue
       
    72 		}
       
    73 		x := first[pi]
       
    74 		if x == xx {
       
    75 			// Illegal starter byte.
       
    76 			err.Index = offset + i
       
    77 			err.Size = 1
       
    78 			return
       
    79 		}
       
    80 		size := int(x & 7)
       
    81 		if i+size > n {
       
    82 			// Short or invalid.
       
    83 			err.Index = offset + i
       
    84 			err.Size = n - i
       
    85 			return
       
    86 		}
       
    87 		accept := acceptRanges[x>>4]
       
    88 		if c := p[i+1]; c < accept.lo || accept.hi < c {
       
    89 			err.Index = offset + i
       
    90 			err.Size = 2
       
    91 			return
       
    92 		} else if size == 2 {
       
    93 		} else if c := p[i+2]; c < locb || hicb < c {
       
    94 			err.Index = offset + i
       
    95 			err.Size = 3
       
    96 			return
       
    97 		} else if size == 3 {
       
    98 		} else if c := p[i+3]; c < locb || hicb < c {
       
    99 			err.Index = offset + i
       
   100 			err.Size = 4
       
   101 			return
       
   102 		}
       
   103 		i += size
       
   104 	}
       
   105 	return
       
   106 }
       
   107 
       
   108 // Return the size of the next rune if valid, 0 otherwise.
       
   109 func utf8ValidNext(p []byte) int {
       
   110 	c := p[0]
       
   111 
       
   112 	if c < utf8.RuneSelf {
       
   113 		if invalidAscii(c) {
       
   114 			return 0
       
   115 		}
       
   116 		return 1
       
   117 	}
       
   118 
       
   119 	x := first[c]
       
   120 	if x == xx {
       
   121 		// Illegal starter byte.
       
   122 		return 0
       
   123 	}
       
   124 	size := int(x & 7)
       
   125 	if size > len(p) {
       
   126 		// Short or invalid.
       
   127 		return 0
       
   128 	}
       
   129 	accept := acceptRanges[x>>4]
       
   130 	if c := p[1]; c < accept.lo || accept.hi < c {
       
   131 		return 0
       
   132 	} else if size == 2 {
       
   133 	} else if c := p[2]; c < locb || hicb < c {
       
   134 		return 0
       
   135 	} else if size == 3 {
       
   136 	} else if c := p[3]; c < locb || hicb < c {
       
   137 		return 0
       
   138 	}
       
   139 
       
   140 	return size
       
   141 }
       
   142 
       
   143 var invalidAsciiTable = [256]bool{
       
   144 	0x00: true,
       
   145 	0x01: true,
       
   146 	0x02: true,
       
   147 	0x03: true,
       
   148 	0x04: true,
       
   149 	0x05: true,
       
   150 	0x06: true,
       
   151 	0x07: true,
       
   152 	0x08: true,
       
   153 	// 0x09 TAB
       
   154 	// 0x0A LF
       
   155 	0x0B: true,
       
   156 	0x0C: true,
       
   157 	// 0x0D CR
       
   158 	0x0E: true,
       
   159 	0x0F: true,
       
   160 	0x10: true,
       
   161 	0x11: true,
       
   162 	0x12: true,
       
   163 	0x13: true,
       
   164 	0x14: true,
       
   165 	0x15: true,
       
   166 	0x16: true,
       
   167 	0x17: true,
       
   168 	0x18: true,
       
   169 	0x19: true,
       
   170 	0x1A: true,
       
   171 	0x1B: true,
       
   172 	0x1C: true,
       
   173 	0x1D: true,
       
   174 	0x1E: true,
       
   175 	0x1F: true,
       
   176 	// 0x20 - 0x7E Printable ASCII characters
       
   177 	0x7F: true,
       
   178 }
       
   179 
       
   180 func invalidAscii(b byte) bool {
       
   181 	return invalidAsciiTable[b]
       
   182 }
       
   183 
       
   184 // acceptRange gives the range of valid values for the second byte in a UTF-8
       
   185 // sequence.
       
   186 type acceptRange struct {
       
   187 	lo uint8 // lowest value for second byte.
       
   188 	hi uint8 // highest value for second byte.
       
   189 }
       
   190 
       
   191 // acceptRanges has size 16 to avoid bounds checks in the code that uses it.
       
   192 var acceptRanges = [16]acceptRange{
       
   193 	0: {locb, hicb},
       
   194 	1: {0xA0, hicb},
       
   195 	2: {locb, 0x9F},
       
   196 	3: {0x90, hicb},
       
   197 	4: {locb, 0x8F},
       
   198 }
       
   199 
       
// first is information about the first byte in a UTF-8 sequence. It is indexed
// by the byte value; each entry is one of the as/xx/s1..s7 constants, whose
// high nibble selects an entry of acceptRanges (or is 0xF for the special
// one-byte cases) and whose low bits give the sequence length in bytes.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	// Bytes 0x80-0xC1 and 0xF5-0xFF can never start a sequence (xx).
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
       
   221 
       
   222 const (
       
   223 	// The default lowest and highest continuation byte.
       
   224 	locb = 0b10000000
       
   225 	hicb = 0b10111111
       
   226 
       
   227 	// These names of these constants are chosen to give nice alignment in the
       
   228 	// table below. The first nibble is an index into acceptRanges or F for
       
   229 	// special one-byte cases. The second nibble is the Rune length or the
       
   230 	// Status for the special one-byte case.
       
   231 	xx = 0xF1 // invalid: size 1
       
   232 	as = 0xF0 // ASCII: size 1
       
   233 	s1 = 0x02 // accept 0, size 2
       
   234 	s2 = 0x13 // accept 1, size 3
       
   235 	s3 = 0x03 // accept 0, size 3
       
   236 	s4 = 0x23 // accept 2, size 3
       
   237 	s5 = 0x34 // accept 3, size 4
       
   238 	s6 = 0x04 // accept 0, size 4
       
   239 	s7 = 0x44 // accept 4, size 4
       
   240 )