vendor/golang.org/x/text/unicode/norm/forminfo.go
changeset 242 2a9ec03fe5a1
child 251 1c52a0eeb952
equal deleted inserted replaced
241:e77dad242f4c 242:2a9ec03fe5a1
       
     1 // Copyright 2011 The Go Authors. All rights reserved.
       
     2 // Use of this source code is governed by a BSD-style
       
     3 // license that can be found in the LICENSE file.
       
     4 
       
     5 package norm
       
     6 
       
     7 // This file contains Form-specific logic and wrappers for data in tables.go.
       
     8 
       
     9 // Rune info is stored in a separate trie per composing form. A composing form
       
    10 // and its corresponding decomposing form share the same trie.  Each trie maps
       
    11 // a rune to a uint16. The values take two forms.  For v >= 0x8000:
       
    12 //   bits
       
    13 //   15:    1 (inverse of NFD_QC bit of qcInfo)
       
    14 //   13..7: qcInfo (see below). isYesD is always true (no decomposition).
       
    15 //    6..0: ccc (compressed CCC value).
       
    16 // For v < 0x8000, the respective rune has a decomposition and v is an index
       
    17 // into a byte array of UTF-8 decomposition sequences and additional info and
       
    18 // has the form:
       
    19 //    <header> <decomp_byte>* [<tccc> [<lccc>]]
       
    20 // The header contains the number of bytes in the decomposition (excluding this
       
    21 // length byte). The two most significant bits of this length byte correspond
       
    22 // to bit 5 and 4 of qcInfo (see below).  The byte sequence itself starts at v+1.
       
    23 // The byte sequence is followed by a trailing and leading CCC if the values
       
    24 // for these are not zero.  The value of v determines which ccc are appended
       
    25 // to the sequences.  For v < firstCCC, there are none, for v >= firstCCC,
       
    26 // the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
       
    27 // there is an additional leading ccc. The value of tccc itself is the
       
    28 // trailing CCC shifted left 2 bits. The two least-significant bits of tccc
       
    29 // are the number of trailing non-starters.
       
    30 
       
    31 const (
       
    32 	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
       
    33 	headerLenMask   = 0x3F // extract the length value from the header byte
       
    34 	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
       
    35 )
       
    36 
       
    37 // Properties provides access to normalization properties of a rune.
       
    38 type Properties struct {
       
    39 	pos   uint8  // start position in reorderBuffer; used in composition.go
       
    40 	size  uint8  // length of UTF-8 encoding of this rune
       
    41 	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
       
    42 	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
       
    43 	nLead uint8  // number of leading non-starters.
       
    44 	flags qcInfo // quick check flags
       
    45 	index uint16
       
    46 }
       
    47 
       
    48 // functions dispatchable per form
       
    49 type lookupFunc func(b input, i int) Properties
       
    50 
       
    51 // formInfo holds Form-specific functions and tables.
       
    52 type formInfo struct {
       
    53 	form                     Form
       
    54 	composing, compatibility bool // form type
       
    55 	info                     lookupFunc
       
    56 	nextMain                 iterFunc
       
    57 }
       
    58 
       
    59 var formTable = []*formInfo{{
       
    60 	form:          NFC,
       
    61 	composing:     true,
       
    62 	compatibility: false,
       
    63 	info:          lookupInfoNFC,
       
    64 	nextMain:      nextComposed,
       
    65 }, {
       
    66 	form:          NFD,
       
    67 	composing:     false,
       
    68 	compatibility: false,
       
    69 	info:          lookupInfoNFC,
       
    70 	nextMain:      nextDecomposed,
       
    71 }, {
       
    72 	form:          NFKC,
       
    73 	composing:     true,
       
    74 	compatibility: true,
       
    75 	info:          lookupInfoNFKC,
       
    76 	nextMain:      nextComposed,
       
    77 }, {
       
    78 	form:          NFKD,
       
    79 	composing:     false,
       
    80 	compatibility: true,
       
    81 	info:          lookupInfoNFKC,
       
    82 	nextMain:      nextDecomposed,
       
    83 }}
       
    84 
       
    85 // We do not distinguish between boundaries for NFC, NFD, etc. to avoid
       
    86 // unexpected behavior for the user.  For example, in NFD, there is a boundary
       
    87 // after 'a'.  However, 'a' might combine with modifiers, so from the application's
       
    88 // perspective it is not a good boundary. We will therefore always use the
       
    89 // boundaries for the combining variants.
       
    90 
       
    91 // BoundaryBefore returns true if this rune starts a new segment and
       
    92 // cannot combine with any rune on the left.
       
    93 func (p Properties) BoundaryBefore() bool {
       
    94 	if p.ccc == 0 && !p.combinesBackward() {
       
    95 		return true
       
    96 	}
       
    97 	// We assume that the CCC of the first character in a decomposition
       
    98 	// is always non-zero if different from info.ccc and that we can return
       
    99 	// false at this point. This is verified by maketables.
       
   100 	return false
       
   101 }
       
   102 
       
   103 // BoundaryAfter returns true if runes cannot combine with or otherwise
       
   104 // interact with this or previous runes.
       
   105 func (p Properties) BoundaryAfter() bool {
       
   106 	// TODO: loosen these conditions.
       
   107 	return p.isInert()
       
   108 }
       
   109 
       
   110 // We pack quick check data in 4 bits:
       
   111 //   5:    Combines forward  (0 == false, 1 == true)
       
   112 //   4..3: NFC_QC Yes(00), No (10), or Maybe (11)
       
   113 //   2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
       
   114 //   1..0: Number of trailing non-starters.
       
   115 //
       
   116 // When all 4 bits are zero, the character is inert, meaning it is never
       
   117 // influenced by normalization.
       
   118 type qcInfo uint8
       
   119 
       
   120 func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
       
   121 func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
       
   122 
       
   123 func (p Properties) combinesForward() bool  { return p.flags&0x20 != 0 }
       
   124 func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
       
   125 func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
       
   126 
       
   127 func (p Properties) isInert() bool {
       
   128 	return p.flags&qcInfoMask == 0 && p.ccc == 0
       
   129 }
       
   130 
       
   131 func (p Properties) multiSegment() bool {
       
   132 	return p.index >= firstMulti && p.index < endMulti
       
   133 }
       
   134 
       
   135 func (p Properties) nLeadingNonStarters() uint8 {
       
   136 	return p.nLead
       
   137 }
       
   138 
       
   139 func (p Properties) nTrailingNonStarters() uint8 {
       
   140 	return uint8(p.flags & 0x03)
       
   141 }
       
   142 
       
   143 // Decomposition returns the decomposition for the underlying rune
       
   144 // or nil if there is none.
       
   145 func (p Properties) Decomposition() []byte {
       
   146 	// TODO: create the decomposition for Hangul?
       
   147 	if p.index == 0 {
       
   148 		return nil
       
   149 	}
       
   150 	i := p.index
       
   151 	n := decomps[i] & headerLenMask
       
   152 	i++
       
   153 	return decomps[i : i+uint16(n)]
       
   154 }
       
   155 
       
   156 // Size returns the length of UTF-8 encoding of the rune.
       
   157 func (p Properties) Size() int {
       
   158 	return int(p.size)
       
   159 }
       
   160 
       
   161 // CCC returns the canonical combining class of the underlying rune.
       
   162 func (p Properties) CCC() uint8 {
       
   163 	if p.index >= firstCCCZeroExcept {
       
   164 		return 0
       
   165 	}
       
   166 	return ccc[p.ccc]
       
   167 }
       
   168 
       
   169 // LeadCCC returns the CCC of the first rune in the decomposition.
       
   170 // If there is no decomposition, LeadCCC equals CCC.
       
   171 func (p Properties) LeadCCC() uint8 {
       
   172 	return ccc[p.ccc]
       
   173 }
       
   174 
       
   175 // TrailCCC returns the CCC of the last rune in the decomposition.
       
   176 // If there is no decomposition, TrailCCC equals CCC.
       
   177 func (p Properties) TrailCCC() uint8 {
       
   178 	return ccc[p.tccc]
       
   179 }
       
   180 
       
   181 // Recomposition
       
   182 // We use 32-bit keys instead of 64-bit for the two codepoint keys.
       
   183 // This clips off the bits of three entries, but we know this will not
       
   184 // result in a collision. In the unlikely event that changes to
       
   185 // UnicodeData.txt introduce collisions, the compiler will catch it.
       
   186 // Note that the recomposition map for NFC and NFKC are identical.
       
   187 
       
   188 // combine returns the combined rune or 0 if it doesn't exist.
       
   189 func combine(a, b rune) rune {
       
   190 	key := uint32(uint16(a))<<16 + uint32(uint16(b))
       
   191 	return recompMap[key]
       
   192 }
       
   193 
       
   194 func lookupInfoNFC(b input, i int) Properties {
       
   195 	v, sz := b.charinfoNFC(i)
       
   196 	return compInfo(v, sz)
       
   197 }
       
   198 
       
   199 func lookupInfoNFKC(b input, i int) Properties {
       
   200 	v, sz := b.charinfoNFKC(i)
       
   201 	return compInfo(v, sz)
       
   202 }
       
   203 
       
   204 // Properties returns properties for the first rune in s.
       
   205 func (f Form) Properties(s []byte) Properties {
       
   206 	if f == NFC || f == NFD {
       
   207 		return compInfo(nfcData.lookup(s))
       
   208 	}
       
   209 	return compInfo(nfkcData.lookup(s))
       
   210 }
       
   211 
       
   212 // PropertiesString returns properties for the first rune in s.
       
   213 func (f Form) PropertiesString(s string) Properties {
       
   214 	if f == NFC || f == NFD {
       
   215 		return compInfo(nfcData.lookupString(s))
       
   216 	}
       
   217 	return compInfo(nfkcData.lookupString(s))
       
   218 }
       
   219 
       
   220 // compInfo converts the information contained in v and sz
       
   221 // to a Properties.  See the comment at the top of the file
       
   222 // for more information on the format.
       
   223 func compInfo(v uint16, sz int) Properties {
       
   224 	if v == 0 {
       
   225 		return Properties{size: uint8(sz)}
       
   226 	} else if v >= 0x8000 {
       
   227 		p := Properties{
       
   228 			size:  uint8(sz),
       
   229 			ccc:   uint8(v),
       
   230 			tccc:  uint8(v),
       
   231 			flags: qcInfo(v >> 8),
       
   232 		}
       
   233 		if p.ccc > 0 || p.combinesBackward() {
       
   234 			p.nLead = uint8(p.flags & 0x3)
       
   235 		}
       
   236 		return p
       
   237 	}
       
   238 	// has decomposition
       
   239 	h := decomps[v]
       
   240 	f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
       
   241 	p := Properties{size: uint8(sz), flags: f, index: v}
       
   242 	if v >= firstCCC {
       
   243 		v += uint16(h&headerLenMask) + 1
       
   244 		c := decomps[v]
       
   245 		p.tccc = c >> 2
       
   246 		p.flags |= qcInfo(c & 0x3)
       
   247 		if v >= firstLeadingCCC {
       
   248 			p.nLead = c & 0x3
       
   249 			if v >= firstStarterWithNLead {
       
   250 				// We were tricked. Remove the decomposition.
       
   251 				p.flags &= 0x03
       
   252 				p.index = 0
       
   253 				return p
       
   254 			}
       
   255 			p.ccc = decomps[v+1]
       
   256 		}
       
   257 	}
       
   258 	return p
       
   259 }