|
1 // Copyright 2011 The Go Authors. All rights reserved. |
|
2 // Use of this source code is governed by a BSD-style |
|
3 // license that can be found in the LICENSE file. |
|
4 |
|
5 package norm |
|
6 |
|
7 // This file contains Form-specific logic and wrappers for data in tables.go. |
|
8 |
|
// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
//
//	bits
//	15:    1 (inverse of NFD_QC bit of qcInfo)
//	13..8: qcInfo (see below). isYesD is always true (no decomposition).
//	7..0:  ccc (compressed CCC value).
//
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
//
//	<header> <decomp_byte>* [<tccc> [<lccc>]]
//
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero. The value of v determines which ccc are appended
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
// there is an additional leading ccc. The value of tccc itself is the
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
// are the number of trailing non-starters.
|
30 |
|
// Bit masks used to unpack qcInfo values and decomposition header bytes.
const (
	// qcInfoMask keeps the six low bits of a qcInfo and clears all bits
	// that are not relevant quick check data.
	qcInfoMask = 1<<6 - 1

	// headerLenMask extracts the length value (the six low bits) from a
	// decomposition header byte.
	headerLenMask = 1<<6 - 1

	// headerFlagsMask extracts the qcInfo bits (the two high bits) from a
	// decomposition header byte.
	headerFlagsMask = 0x3 << 6
)
|
36 |
|
// Properties provides access to normalization properties of a rune.
//
// The ccc and tccc fields do not hold raw combining classes: they are
// compressed values that the CCC, LeadCCC and TrailCCC methods translate
// through the ccc table.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // position of the decomposition header in decomps; 0 means no decomposition
}
|
47 |
|
// functions dispatchable per form

// lookupFunc is the signature of the per-form property lookups stored in
// formInfo.info (lookupInfoNFC and lookupInfoNFKC). It returns the
// Properties looked up at position i of input b.
type lookupFunc func(b input, i int) Properties
|
50 |
|
// formInfo holds Form-specific functions and tables.
type formInfo struct {
	form                     Form       // the normalization form this entry describes
	composing, compatibility bool       // form type
	info                     lookupFunc // property lookup used for this form
	nextMain                 iterFunc   // iteration function used for this form (nextComposed or nextDecomposed in formTable)
}
|
58 |
|
59 var formTable = []*formInfo{{ |
|
60 form: NFC, |
|
61 composing: true, |
|
62 compatibility: false, |
|
63 info: lookupInfoNFC, |
|
64 nextMain: nextComposed, |
|
65 }, { |
|
66 form: NFD, |
|
67 composing: false, |
|
68 compatibility: false, |
|
69 info: lookupInfoNFC, |
|
70 nextMain: nextDecomposed, |
|
71 }, { |
|
72 form: NFKC, |
|
73 composing: true, |
|
74 compatibility: true, |
|
75 info: lookupInfoNFKC, |
|
76 nextMain: nextComposed, |
|
77 }, { |
|
78 form: NFKD, |
|
79 composing: false, |
|
80 compatibility: true, |
|
81 info: lookupInfoNFKC, |
|
82 nextMain: nextDecomposed, |
|
83 }} |
|
84 |
|
85 // We do not distinguish between boundaries for NFC, NFD, etc. to avoid |
|
86 // unexpected behavior for the user. For example, in NFD, there is a boundary |
|
87 // after 'a'. However, 'a' might combine with modifiers, so from the application's |
|
88 // perspective it is not a good boundary. We will therefore always use the |
|
89 // boundaries for the combining variants. |
|
90 |
|
91 // BoundaryBefore returns true if this rune starts a new segment and |
|
92 // cannot combine with any rune on the left. |
|
93 func (p Properties) BoundaryBefore() bool { |
|
94 if p.ccc == 0 && !p.combinesBackward() { |
|
95 return true |
|
96 } |
|
97 // We assume that the CCC of the first character in a decomposition |
|
98 // is always non-zero if different from info.ccc and that we can return |
|
99 // false at this point. This is verified by maketables. |
|
100 return false |
|
101 } |
|
102 |
|
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
//
// The current implementation is conservative: it reports true only for
// runes that are inert (see isInert).
func (p Properties) BoundaryAfter() bool {
	// TODO: loosen these conditions.
	return p.isInert()
}
|
109 |
|
// We pack quick check data in 6 bits:
//
//	5:    Combines forward  (0 == false, 1 == true)
//	4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//	2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//	1..0: Number of trailing non-starters.
//
// When all 6 bits are zero, the character is inert, meaning it is never
// influenced by normalization. (isInert additionally requires the ccc to
// be zero.)
type qcInfo uint8
|
119 |
|
120 func (p Properties) isYesC() bool { return p.flags&0x10 == 0 } |
|
121 func (p Properties) isYesD() bool { return p.flags&0x4 == 0 } |
|
122 |
|
123 func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 } |
|
124 func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe |
|
125 func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD |
|
126 |
|
127 func (p Properties) isInert() bool { |
|
128 return p.flags&qcInfoMask == 0 && p.ccc == 0 |
|
129 } |
|
130 |
|
131 func (p Properties) multiSegment() bool { |
|
132 return p.index >= firstMulti && p.index < endMulti |
|
133 } |
|
134 |
|
// nLeadingNonStarters returns the number of leading non-starters recorded
// for this rune (the nLead field).
func (p Properties) nLeadingNonStarters() uint8 {
	return p.nLead
}
|
138 |
|
139 func (p Properties) nTrailingNonStarters() uint8 { |
|
140 return uint8(p.flags & 0x03) |
|
141 } |
|
142 |
|
143 // Decomposition returns the decomposition for the underlying rune |
|
144 // or nil if there is none. |
|
145 func (p Properties) Decomposition() []byte { |
|
146 // TODO: create the decomposition for Hangul? |
|
147 if p.index == 0 { |
|
148 return nil |
|
149 } |
|
150 i := p.index |
|
151 n := decomps[i] & headerLenMask |
|
152 i++ |
|
153 return decomps[i : i+uint16(n)] |
|
154 } |
|
155 |
|
// Size returns the length of UTF-8 encoding of the rune.
func (p Properties) Size() int {
	// size is stored as a uint8; widen it for callers.
	return int(p.size)
}
|
160 |
|
161 // CCC returns the canonical combining class of the underlying rune. |
|
162 func (p Properties) CCC() uint8 { |
|
163 if p.index >= firstCCCZeroExcept { |
|
164 return 0 |
|
165 } |
|
166 return ccc[p.ccc] |
|
167 } |
|
168 |
|
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
func (p Properties) LeadCCC() uint8 {
	// p.ccc is a compressed value; the ccc table maps it to the actual class.
	return ccc[p.ccc]
}
|
174 |
|
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
func (p Properties) TrailCCC() uint8 {
	// p.tccc is a compressed value; the ccc table maps it to the actual class.
	return ccc[p.tccc]
}
|
180 |
|
181 // Recomposition |
|
182 // We use 32-bit keys instead of 64-bit for the two codepoint keys. |
|
183 // This clips off the bits of three entries, but we know this will not |
|
184 // result in a collision. In the unlikely event that changes to |
|
185 // UnicodeData.txt introduce collisions, the compiler will catch it. |
|
186 // Note that the recomposition map for NFC and NFKC are identical. |
|
187 |
|
188 // combine returns the combined rune or 0 if it doesn't exist. |
|
189 func combine(a, b rune) rune { |
|
190 key := uint32(uint16(a))<<16 + uint32(uint16(b)) |
|
191 return recompMap[key] |
|
192 } |
|
193 |
|
// lookupInfoNFC obtains the raw info value and size for position i of b
// via b.charinfoNFC and decodes them into a Properties with compInfo.
// formTable uses it for both NFC and NFD.
func lookupInfoNFC(b input, i int) Properties {
	v, sz := b.charinfoNFC(i)
	return compInfo(v, sz)
}
|
198 |
|
// lookupInfoNFKC obtains the raw info value and size for position i of b
// via b.charinfoNFKC and decodes them into a Properties with compInfo.
// formTable uses it for both NFKC and NFKD.
func lookupInfoNFKC(b input, i int) Properties {
	v, sz := b.charinfoNFKC(i)
	return compInfo(v, sz)
}
|
203 |
|
204 // Properties returns properties for the first rune in s. |
|
205 func (f Form) Properties(s []byte) Properties { |
|
206 if f == NFC || f == NFD { |
|
207 return compInfo(nfcData.lookup(s)) |
|
208 } |
|
209 return compInfo(nfkcData.lookup(s)) |
|
210 } |
|
211 |
|
212 // PropertiesString returns properties for the first rune in s. |
|
213 func (f Form) PropertiesString(s string) Properties { |
|
214 if f == NFC || f == NFD { |
|
215 return compInfo(nfcData.lookupString(s)) |
|
216 } |
|
217 return compInfo(nfkcData.lookupString(s)) |
|
218 } |
|
219 |
|
220 // compInfo converts the information contained in v and sz |
|
221 // to a Properties. See the comment at the top of the file |
|
222 // for more information on the format. |
|
223 func compInfo(v uint16, sz int) Properties { |
|
224 if v == 0 { |
|
225 return Properties{size: uint8(sz)} |
|
226 } else if v >= 0x8000 { |
|
227 p := Properties{ |
|
228 size: uint8(sz), |
|
229 ccc: uint8(v), |
|
230 tccc: uint8(v), |
|
231 flags: qcInfo(v >> 8), |
|
232 } |
|
233 if p.ccc > 0 || p.combinesBackward() { |
|
234 p.nLead = uint8(p.flags & 0x3) |
|
235 } |
|
236 return p |
|
237 } |
|
238 // has decomposition |
|
239 h := decomps[v] |
|
240 f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4 |
|
241 p := Properties{size: uint8(sz), flags: f, index: v} |
|
242 if v >= firstCCC { |
|
243 v += uint16(h&headerLenMask) + 1 |
|
244 c := decomps[v] |
|
245 p.tccc = c >> 2 |
|
246 p.flags |= qcInfo(c & 0x3) |
|
247 if v >= firstLeadingCCC { |
|
248 p.nLead = c & 0x3 |
|
249 if v >= firstStarterWithNLead { |
|
250 // We were tricked. Remove the decomposition. |
|
251 p.flags &= 0x03 |
|
252 p.index = 0 |
|
253 return p |
|
254 } |
|
255 p.ccc = decomps[v+1] |
|
256 } |
|
257 } |
|
258 return p |
|
259 } |