1 // TOML lexer. |
|
2 // |
|
3 // Written using the principles developed by Rob Pike in |
|
4 // http://www.youtube.com/watch?v=HxaD_trXwRE |
|
5 |
|
6 package toml |
|
7 |
|
8 import ( |
|
9 "bytes" |
|
10 "errors" |
|
11 "fmt" |
|
12 "strconv" |
|
13 "strings" |
|
14 ) |
|
15 |
|
16 // Define state functions |
|
// tomlLexStateFn is one state of the lexer: it consumes some input and
// returns the next state to run, or nil when lexing is finished.
type tomlLexStateFn func() tomlLexStateFn
|
18 |
|
19 // Define lexer |
|
// tomlLexer holds the full lexer state: the rune input, the bounds of the
// token currently being accumulated, the tokens emitted so far, and
// line/column bookkeeping for both the token start and the read cursor.
type tomlLexer struct {
	inputIdx          int     // index (in runes) of the next rune to read
	input             []rune  // Textual source
	currentTokenStart int     // start offset (in runes) of the token being built
	currentTokenStop  int     // end offset (exclusive) of the token being built
	tokens            []token // tokens emitted so far
	brackets          []rune  // stack of currently open '[' / '{' characters
	line              int     // line of the current token's start
	col               int     // column of the current token's start
	endbufferLine     int     // line of the read cursor
	endbufferCol      int     // column of the read cursor
}
|
32 |
|
33 // Basic read operations on input |
|
34 |
|
35 func (l *tomlLexer) read() rune { |
|
36 r := l.peek() |
|
37 if r == '\n' { |
|
38 l.endbufferLine++ |
|
39 l.endbufferCol = 1 |
|
40 } else { |
|
41 l.endbufferCol++ |
|
42 } |
|
43 l.inputIdx++ |
|
44 return r |
|
45 } |
|
46 |
|
47 func (l *tomlLexer) next() rune { |
|
48 r := l.read() |
|
49 |
|
50 if r != eof { |
|
51 l.currentTokenStop++ |
|
52 } |
|
53 return r |
|
54 } |
|
55 |
|
56 func (l *tomlLexer) ignore() { |
|
57 l.currentTokenStart = l.currentTokenStop |
|
58 l.line = l.endbufferLine |
|
59 l.col = l.endbufferCol |
|
60 } |
|
61 |
|
62 func (l *tomlLexer) skip() { |
|
63 l.next() |
|
64 l.ignore() |
|
65 } |
|
66 |
|
67 func (l *tomlLexer) fastForward(n int) { |
|
68 for i := 0; i < n; i++ { |
|
69 l.next() |
|
70 } |
|
71 } |
|
72 |
|
73 func (l *tomlLexer) emitWithValue(t tokenType, value string) { |
|
74 l.tokens = append(l.tokens, token{ |
|
75 Position: Position{l.line, l.col}, |
|
76 typ: t, |
|
77 val: value, |
|
78 }) |
|
79 l.ignore() |
|
80 } |
|
81 |
|
82 func (l *tomlLexer) emit(t tokenType) { |
|
83 l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop])) |
|
84 } |
|
85 |
|
86 func (l *tomlLexer) peek() rune { |
|
87 if l.inputIdx >= len(l.input) { |
|
88 return eof |
|
89 } |
|
90 return l.input[l.inputIdx] |
|
91 } |
|
92 |
|
93 func (l *tomlLexer) peekString(size int) string { |
|
94 maxIdx := len(l.input) |
|
95 upperIdx := l.inputIdx + size // FIXME: potential overflow |
|
96 if upperIdx > maxIdx { |
|
97 upperIdx = maxIdx |
|
98 } |
|
99 return string(l.input[l.inputIdx:upperIdx]) |
|
100 } |
|
101 |
|
102 func (l *tomlLexer) follow(next string) bool { |
|
103 return next == l.peekString(len(next)) |
|
104 } |
|
105 |
|
106 // Error management |
|
107 |
|
108 func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { |
|
109 l.tokens = append(l.tokens, token{ |
|
110 Position: Position{l.line, l.col}, |
|
111 typ: tokenError, |
|
112 val: fmt.Sprintf(format, args...), |
|
113 }) |
|
114 return nil |
|
115 } |
|
116 |
|
117 // State functions |
|
118 |
|
// lexVoid lexes the "key side" of the input: table headers, bare/quoted
// keys, '=', and comments. It is the initial state, and also the state
// inside an inline table before each key.
func (l *tomlLexer) lexVoid() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '}': // after '{'
			return l.lexRightCurlyBrace
		case '[':
			return l.lexTableKey
		case '#':
			return l.lexComment(l.lexVoid)
		case '=':
			return l.lexEqual
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			continue
		}

		if isSpace(next) {
			l.skip()
		}

		if isKeyStartChar(next) {
			return l.lexKey
		}

		// NOTE: next is stale after a skip above, but a space never
		// satisfies isKeyStartChar or equals eof, so the extra loop
		// iteration is harmless.
		if next == eof {
			l.next()
			break
		}
	}

	l.emit(tokenEOF)
	return nil
}
|
155 |
|
// lexRvalue lexes the "value side" of the input (after '=' and inside
// arrays): strings, numbers, dates/times, booleans, inf/nan, and the
// punctuation of arrays and inline tables.
func (l *tomlLexer) lexRvalue() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '.':
			return l.errorf("cannot start float with a dot")
		case '=':
			return l.lexEqual
		case '[':
			return l.lexLeftBracket
		case ']':
			return l.lexRightBracket
		case '{':
			return l.lexLeftCurlyBrace
		case '}':
			return l.lexRightCurlyBrace
		case '#':
			return l.lexComment(l.lexRvalue)
		case '"':
			return l.lexString
		case '\'':
			return l.lexLiteralString
		case ',':
			return l.lexComma
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			// inside an array ('[' on top of the stack) values may continue
			// on the next line; otherwise a newline returns to key position
			if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '[' {
				return l.lexRvalue
			}
			return l.lexVoid
		}

		// keyword literals must be checked before the generic cases below
		if l.follow("true") {
			return l.lexTrue
		}

		if l.follow("false") {
			return l.lexFalse
		}

		if l.follow("inf") {
			return l.lexInf
		}

		if l.follow("nan") {
			return l.lexNan
		}

		if isSpace(next) {
			l.skip()
			continue
		}

		if next == eof {
			l.next()
			break
		}

		// a sign always introduces a number (dates never start with +/-)
		if next == '+' || next == '-' {
			return l.lexNumber
		}

		if isDigit(next) {
			return l.lexDateTimeOrNumber
		}

		return l.errorf("no value can start with %c", next)
	}

	l.emit(tokenEOF)
	return nil
}
|
230 |
|
// lexDateTimeOrNumber disambiguates a value starting with a digit by
// peeking ahead: "HH:" means a time, "YYYY-" means a date(-time), and
// anything else is lexed as a plain number.
func (l *tomlLexer) lexDateTimeOrNumber() tomlLexStateFn {
	// Could be either a date/time, or a digit.
	// The options for date/times are:
	//   YYYY-... => date or date-time
	//   HH:... => time
	// Anything else should be a number.

	lookAhead := l.peekString(5)
	if len(lookAhead) < 3 {
		// too short for even "HH:" — must be a number
		return l.lexNumber()
	}

	for idx, r := range lookAhead {
		if !isDigit(r) {
			if idx == 2 && r == ':' {
				return l.lexDateTimeOrTime()
			}
			if idx == 4 && r == '-' {
				return l.lexDateTimeOrTime()
			}
			// non-digit in a position that fits neither pattern
			return l.lexNumber()
		}
	}
	// five digits in a row cannot start a date or time
	return l.lexNumber()
}
|
256 |
|
257 func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn { |
|
258 l.next() |
|
259 l.emit(tokenLeftCurlyBrace) |
|
260 l.brackets = append(l.brackets, '{') |
|
261 return l.lexVoid |
|
262 } |
|
263 |
|
264 func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn { |
|
265 l.next() |
|
266 l.emit(tokenRightCurlyBrace) |
|
267 if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '{' { |
|
268 return l.errorf("cannot have '}' here") |
|
269 } |
|
270 l.brackets = l.brackets[:len(l.brackets)-1] |
|
271 return l.lexRvalue |
|
272 } |
|
273 |
|
274 func (l *tomlLexer) lexDateTimeOrTime() tomlLexStateFn { |
|
275 // Example matches: |
|
276 // 1979-05-27T07:32:00Z |
|
277 // 1979-05-27T00:32:00-07:00 |
|
278 // 1979-05-27T00:32:00.999999-07:00 |
|
279 // 1979-05-27 07:32:00Z |
|
280 // 1979-05-27 00:32:00-07:00 |
|
281 // 1979-05-27 00:32:00.999999-07:00 |
|
282 // 1979-05-27T07:32:00 |
|
283 // 1979-05-27T00:32:00.999999 |
|
284 // 1979-05-27 07:32:00 |
|
285 // 1979-05-27 00:32:00.999999 |
|
286 // 1979-05-27 |
|
287 // 07:32:00 |
|
288 // 00:32:00.999999 |
|
289 |
|
290 // we already know those two are digits |
|
291 l.next() |
|
292 l.next() |
|
293 |
|
294 // Got 2 digits. At that point it could be either a time or a date(-time). |
|
295 |
|
296 r := l.next() |
|
297 if r == ':' { |
|
298 return l.lexTime() |
|
299 } |
|
300 |
|
301 return l.lexDateTime() |
|
302 } |
|
303 |
|
// lexDateTime lexes a local date (emitting tokenLocalDate), optionally
// followed by a time (tokenLocalTime) and an offset (via lexTimeOffset).
// On entry the cursor sits on the third digit of the year.
func (l *tomlLexer) lexDateTime() tomlLexStateFn {
	// This state accepts an offset date-time, a local date-time, or a local date.
	//
	//   v--- cursor
	// 1979-05-27T07:32:00Z
	// 1979-05-27T00:32:00-07:00
	// 1979-05-27T00:32:00.999999-07:00
	// 1979-05-27 07:32:00Z
	// 1979-05-27 00:32:00-07:00
	// 1979-05-27 00:32:00.999999-07:00
	// 1979-05-27T07:32:00
	// 1979-05-27T00:32:00.999999
	// 1979-05-27 07:32:00
	// 1979-05-27 00:32:00.999999
	// 1979-05-27

	// date

	// already checked by lexRvalue
	l.next() // digit
	l.next() // -

	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid month digit in date: %c", r)
		}
	}

	r := l.next()
	if r != '-' {
		return l.errorf("expected - to separate month of a date, not %c", r)
	}

	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid day digit in date: %c", r)
		}
	}

	l.emit(tokenLocalDate)

	r = l.peek()

	if r == eof {

		return l.lexRvalue
	}

	if r != ' ' && r != 'T' {
		return l.errorf("incorrect date/time separation character: %c", r)
	}

	if r == ' ' {
		// a space only acts as a date/time separator when followed by two
		// digits; otherwise the date stands alone (e.g. "1979-05-27 # x")
		lookAhead := l.peekString(3)[1:]
		if len(lookAhead) < 2 {
			return l.lexRvalue
		}
		for _, r := range lookAhead {
			if !isDigit(r) {
				return l.lexRvalue
			}
		}
	}

	l.skip() // skip the T or ' '

	// time

	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid hour digit in time: %c", r)
		}
	}

	r = l.next()
	if r != ':' {
		return l.errorf("time hour/minute separator should be :, not %c", r)
	}

	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid minute digit in time: %c", r)
		}
	}

	r = l.next()
	if r != ':' {
		return l.errorf("time minute/second separator should be :, not %c", r)
	}

	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid second digit in time: %c", r)
		}
	}

	// optional fractional seconds: '.' followed by at least one digit
	r = l.peek()
	if r == '.' {
		l.next()
		r := l.next()
		if !isDigit(r) {
			return l.errorf("expected at least one digit in time's fraction, not %c", r)
		}

		for {
			r := l.peek()
			if !isDigit(r) {
				break
			}
			l.next()
		}
	}

	l.emit(tokenLocalTime)

	return l.lexTimeOffset

}
|
427 |
|
// lexTimeOffset lexes an optional timezone offset after a time: 'Z',
// "+HH:MM", "-HH:MM", or nothing at all. Emits tokenTimeOffset when an
// offset is present.
func (l *tomlLexer) lexTimeOffset() tomlLexStateFn {
	// potential offset

	// Z
	// -07:00
	// +07:00
	// nothing

	r := l.peek()

	if r == 'Z' {
		l.next()
		l.emit(tokenTimeOffset)
	} else if r == '+' || r == '-' {
		l.next()

		for i := 0; i < 2; i++ {
			r := l.next()
			if !isDigit(r) {
				return l.errorf("invalid hour digit in time offset: %c", r)
			}
		}

		r = l.next()
		if r != ':' {
			return l.errorf("time offset hour/minute separator should be :, not %c", r)
		}

		for i := 0; i < 2; i++ {
			r := l.next()
			if !isDigit(r) {
				return l.errorf("invalid minute digit in time offset: %c", r)
			}
		}

		l.emit(tokenTimeOffset)
	}

	// no offset present: fall through without emitting anything
	return l.lexRvalue
}
|
468 |
|
// lexTime lexes a bare local time. On entry the hour digits and the first
// ':' have already been consumed; the cursor sits on the first minute
// digit. Emits tokenLocalTime and continues with lexRvalue.
func (l *tomlLexer) lexTime() tomlLexStateFn {
	//   v--- cursor
	// 07:32:00
	// 00:32:00.999999

	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid minute digit in time: %c", r)
		}
	}

	r := l.next()
	if r != ':' {
		return l.errorf("time minute/second separator should be :, not %c", r)
	}

	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid second digit in time: %c", r)
		}
	}

	// optional fractional seconds: '.' followed by at least one digit
	r = l.peek()
	if r == '.' {
		l.next()
		r := l.next()
		if !isDigit(r) {
			return l.errorf("expected at least one digit in time's fraction, not %c", r)
		}

		for {
			r := l.peek()
			if !isDigit(r) {
				break
			}
			l.next()
		}
	}

	l.emit(tokenLocalTime)
	return l.lexRvalue

}
|
514 |
|
515 func (l *tomlLexer) lexTrue() tomlLexStateFn { |
|
516 l.fastForward(4) |
|
517 l.emit(tokenTrue) |
|
518 return l.lexRvalue |
|
519 } |
|
520 |
|
521 func (l *tomlLexer) lexFalse() tomlLexStateFn { |
|
522 l.fastForward(5) |
|
523 l.emit(tokenFalse) |
|
524 return l.lexRvalue |
|
525 } |
|
526 |
|
527 func (l *tomlLexer) lexInf() tomlLexStateFn { |
|
528 l.fastForward(3) |
|
529 l.emit(tokenInf) |
|
530 return l.lexRvalue |
|
531 } |
|
532 |
|
533 func (l *tomlLexer) lexNan() tomlLexStateFn { |
|
534 l.fastForward(3) |
|
535 l.emit(tokenNan) |
|
536 return l.lexRvalue |
|
537 } |
|
538 |
|
539 func (l *tomlLexer) lexEqual() tomlLexStateFn { |
|
540 l.next() |
|
541 l.emit(tokenEqual) |
|
542 return l.lexRvalue |
|
543 } |
|
544 |
|
545 func (l *tomlLexer) lexComma() tomlLexStateFn { |
|
546 l.next() |
|
547 l.emit(tokenComma) |
|
548 if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '{' { |
|
549 return l.lexVoid |
|
550 } |
|
551 return l.lexRvalue |
|
552 } |
|
553 |
|
554 // Parse the key and emits its value without escape sequences. |
|
555 // bare keys, basic string keys and literal string keys are supported. |
|
556 func (l *tomlLexer) lexKey() tomlLexStateFn { |
|
557 var sb strings.Builder |
|
558 |
|
559 for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() { |
|
560 if r == '"' { |
|
561 l.next() |
|
562 str, err := l.lexStringAsString(`"`, false, true) |
|
563 if err != nil { |
|
564 return l.errorf(err.Error()) |
|
565 } |
|
566 sb.WriteString("\"") |
|
567 sb.WriteString(str) |
|
568 sb.WriteString("\"") |
|
569 l.next() |
|
570 continue |
|
571 } else if r == '\'' { |
|
572 l.next() |
|
573 str, err := l.lexLiteralStringAsString(`'`, false) |
|
574 if err != nil { |
|
575 return l.errorf(err.Error()) |
|
576 } |
|
577 sb.WriteString("'") |
|
578 sb.WriteString(str) |
|
579 sb.WriteString("'") |
|
580 l.next() |
|
581 continue |
|
582 } else if r == '\n' { |
|
583 return l.errorf("keys cannot contain new lines") |
|
584 } else if isSpace(r) { |
|
585 var str strings.Builder |
|
586 str.WriteString(" ") |
|
587 |
|
588 // skip trailing whitespace |
|
589 l.next() |
|
590 for r = l.peek(); isSpace(r); r = l.peek() { |
|
591 str.WriteRune(r) |
|
592 l.next() |
|
593 } |
|
594 // break loop if not a dot |
|
595 if r != '.' { |
|
596 break |
|
597 } |
|
598 str.WriteString(".") |
|
599 // skip trailing whitespace after dot |
|
600 l.next() |
|
601 for r = l.peek(); isSpace(r); r = l.peek() { |
|
602 str.WriteRune(r) |
|
603 l.next() |
|
604 } |
|
605 sb.WriteString(str.String()) |
|
606 continue |
|
607 } else if r == '.' { |
|
608 // skip |
|
609 } else if !isValidBareChar(r) { |
|
610 return l.errorf("keys cannot contain %c character", r) |
|
611 } |
|
612 sb.WriteRune(r) |
|
613 l.next() |
|
614 } |
|
615 l.emitWithValue(tokenKey, sb.String()) |
|
616 return l.lexVoid |
|
617 } |
|
618 |
|
619 func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn { |
|
620 return func() tomlLexStateFn { |
|
621 for next := l.peek(); next != '\n' && next != eof; next = l.peek() { |
|
622 if next == '\r' && l.follow("\r\n") { |
|
623 break |
|
624 } |
|
625 l.next() |
|
626 } |
|
627 l.ignore() |
|
628 return previousState |
|
629 } |
|
630 } |
|
631 |
|
632 func (l *tomlLexer) lexLeftBracket() tomlLexStateFn { |
|
633 l.next() |
|
634 l.emit(tokenLeftBracket) |
|
635 l.brackets = append(l.brackets, '[') |
|
636 return l.lexRvalue |
|
637 } |
|
638 |
|
639 func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) { |
|
640 var sb strings.Builder |
|
641 |
|
642 if discardLeadingNewLine { |
|
643 if l.follow("\r\n") { |
|
644 l.skip() |
|
645 l.skip() |
|
646 } else if l.peek() == '\n' { |
|
647 l.skip() |
|
648 } |
|
649 } |
|
650 |
|
651 // find end of string |
|
652 for { |
|
653 if l.follow(terminator) { |
|
654 return sb.String(), nil |
|
655 } |
|
656 |
|
657 next := l.peek() |
|
658 if next == eof { |
|
659 break |
|
660 } |
|
661 sb.WriteRune(l.next()) |
|
662 } |
|
663 |
|
664 return "", errors.New("unclosed string") |
|
665 } |
|
666 |
|
667 func (l *tomlLexer) lexLiteralString() tomlLexStateFn { |
|
668 l.skip() |
|
669 |
|
670 // handle special case for triple-quote |
|
671 terminator := "'" |
|
672 discardLeadingNewLine := false |
|
673 if l.follow("''") { |
|
674 l.skip() |
|
675 l.skip() |
|
676 terminator = "'''" |
|
677 discardLeadingNewLine = true |
|
678 } |
|
679 |
|
680 str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine) |
|
681 if err != nil { |
|
682 return l.errorf(err.Error()) |
|
683 } |
|
684 |
|
685 l.emitWithValue(tokenString, str) |
|
686 l.fastForward(len(terminator)) |
|
687 l.ignore() |
|
688 return l.lexRvalue |
|
689 } |
|
690 |
|
// Lex a string and return the results as a string.
// Terminator is the substring indicating the end of the token.
// The resulting string does not include the terminator.
// Escape sequences are decoded; discardLeadingNewLine drops one leading
// LF/CRLF (multi-line strings); acceptNewLines permits raw line breaks in
// the body (otherwise they are rejected as control characters).
func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
	var sb strings.Builder

	if discardLeadingNewLine {
		if l.follow("\r\n") {
			l.skip()
			l.skip()
		} else if l.peek() == '\n' {
			l.skip()
		}
	}

	for {
		if l.follow(terminator) {
			return sb.String(), nil
		}

		if l.follow("\\") {
			l.next()
			switch l.peek() {
			case '\r':
				fallthrough
			case '\n':
				fallthrough
			case '\t':
				fallthrough
			case ' ':
				// skip all whitespace chars following backslash
				// (TOML "line ending backslash" in multi-line strings)
				for strings.ContainsRune("\r\n\t ", l.peek()) {
					l.next()
				}
			case '"':
				sb.WriteString("\"")
				l.next()
			case 'n':
				sb.WriteString("\n")
				l.next()
			case 'b':
				sb.WriteString("\b")
				l.next()
			case 'f':
				sb.WriteString("\f")
				l.next()
			case '/':
				sb.WriteString("/")
				l.next()
			case 't':
				sb.WriteString("\t")
				l.next()
			case 'r':
				sb.WriteString("\r")
				l.next()
			case '\\':
				sb.WriteString("\\")
				l.next()
			case 'u':
				// \uXXXX: exactly four hex digits
				l.next()
				var code strings.Builder
				for i := 0; i < 4; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code.WriteRune(c)
				}
				intcode, err := strconv.ParseInt(code.String(), 16, 32)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\u" + code.String())
				}
				sb.WriteRune(rune(intcode))
			case 'U':
				// \UXXXXXXXX: exactly eight hex digits
				l.next()
				var code strings.Builder
				for i := 0; i < 8; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code.WriteRune(c)
				}
				intcode, err := strconv.ParseInt(code.String(), 16, 64)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\U" + code.String())
				}
				sb.WriteRune(rune(intcode))
			default:
				return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
			}
		} else {
			r := l.peek()

			// reject raw control characters except tab (and line breaks
			// when the caller allows them)
			if 0x00 <= r && r <= 0x1F && r != '\t' && !(acceptNewLines && (r == '\n' || r == '\r')) {
				return "", fmt.Errorf("unescaped control character %U", r)
			}
			l.next()
			sb.WriteRune(r)
		}

		if l.peek() == eof {
			break
		}
	}

	return "", errors.New("unclosed string")
}
|
801 |
|
802 func (l *tomlLexer) lexString() tomlLexStateFn { |
|
803 l.skip() |
|
804 |
|
805 // handle special case for triple-quote |
|
806 terminator := `"` |
|
807 discardLeadingNewLine := false |
|
808 acceptNewLines := false |
|
809 if l.follow(`""`) { |
|
810 l.skip() |
|
811 l.skip() |
|
812 terminator = `"""` |
|
813 discardLeadingNewLine = true |
|
814 acceptNewLines = true |
|
815 } |
|
816 |
|
817 str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines) |
|
818 if err != nil { |
|
819 return l.errorf(err.Error()) |
|
820 } |
|
821 |
|
822 l.emitWithValue(tokenString, str) |
|
823 l.fastForward(len(terminator)) |
|
824 l.ignore() |
|
825 return l.lexRvalue |
|
826 } |
|
827 |
|
828 func (l *tomlLexer) lexTableKey() tomlLexStateFn { |
|
829 l.next() |
|
830 |
|
831 if l.peek() == '[' { |
|
832 // token '[[' signifies an array of tables |
|
833 l.next() |
|
834 l.emit(tokenDoubleLeftBracket) |
|
835 return l.lexInsideTableArrayKey |
|
836 } |
|
837 // vanilla table key |
|
838 l.emit(tokenLeftBracket) |
|
839 return l.lexInsideTableKey |
|
840 } |
|
841 |
|
842 // Parse the key till "]]", but only bare keys are supported |
|
843 func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn { |
|
844 for r := l.peek(); r != eof; r = l.peek() { |
|
845 switch r { |
|
846 case ']': |
|
847 if l.currentTokenStop > l.currentTokenStart { |
|
848 l.emit(tokenKeyGroupArray) |
|
849 } |
|
850 l.next() |
|
851 if l.peek() != ']' { |
|
852 break |
|
853 } |
|
854 l.next() |
|
855 l.emit(tokenDoubleRightBracket) |
|
856 return l.lexVoid |
|
857 case '[': |
|
858 return l.errorf("table array key cannot contain ']'") |
|
859 default: |
|
860 l.next() |
|
861 } |
|
862 } |
|
863 return l.errorf("unclosed table array key") |
|
864 } |
|
865 |
|
866 // Parse the key till "]" but only bare keys are supported |
|
867 func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn { |
|
868 for r := l.peek(); r != eof; r = l.peek() { |
|
869 switch r { |
|
870 case ']': |
|
871 if l.currentTokenStop > l.currentTokenStart { |
|
872 l.emit(tokenKeyGroup) |
|
873 } |
|
874 l.next() |
|
875 l.emit(tokenRightBracket) |
|
876 return l.lexVoid |
|
877 case '[': |
|
878 return l.errorf("table key cannot contain ']'") |
|
879 default: |
|
880 l.next() |
|
881 } |
|
882 } |
|
883 return l.errorf("unclosed table key") |
|
884 } |
|
885 |
|
886 func (l *tomlLexer) lexRightBracket() tomlLexStateFn { |
|
887 l.next() |
|
888 l.emit(tokenRightBracket) |
|
889 if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '[' { |
|
890 return l.errorf("cannot have ']' here") |
|
891 } |
|
892 l.brackets = l.brackets[:len(l.brackets)-1] |
|
893 return l.lexRvalue |
|
894 } |
|
895 |
|
// validRuneFn reports whether a rune may appear in an integer literal of a
// particular base (used by lexNumber for hex/octal/binary scanning).
type validRuneFn func(r rune) bool
|
897 |
|
// isValidHexRune reports whether r may appear in a hexadecimal integer
// literal; '_' is the TOML digit separator.
func isValidHexRune(r rune) bool {
	switch {
	case r >= '0' && r <= '9':
		return true
	case r >= 'a' && r <= 'f':
		return true
	case r >= 'A' && r <= 'F':
		return true
	default:
		return r == '_'
	}
}
|
904 |
|
// isValidOctalRune reports whether r may appear in an octal integer
// literal; '_' is the TOML digit separator.
func isValidOctalRune(r rune) bool {
	if r == '_' {
		return true
	}
	return '0' <= r && r <= '7'
}
|
908 |
|
// isValidBinaryRune reports whether r may appear in a binary integer
// literal; '_' is the TOML digit separator.
func isValidBinaryRune(r rune) bool {
	switch r {
	case '0', '1', '_':
		return true
	}
	return false
}
|
912 |
|
// lexNumber lexes an integer (decimal, 0x/0o/0b prefixed) or a float,
// emitting tokenInteger or tokenFloat. It also handles signed inf/nan
// ("+inf", "-nan", ...). The cursor sits on the first character of the
// number (a digit or a sign).
func (l *tomlLexer) lexNumber() tomlLexStateFn {
	r := l.peek()

	if r == '0' {
		// possible base prefix: 0x, 0o, 0b
		follow := l.peekString(2)
		if len(follow) == 2 {
			var isValidRune validRuneFn
			switch follow[1] {
			case 'x':
				isValidRune = isValidHexRune
			case 'o':
				isValidRune = isValidOctalRune
			case 'b':
				isValidRune = isValidBinaryRune
			default:
				// any other letter after 0 is an unknown base; non-letters
				// (e.g. '.', digits) fall through to the decimal/float scan
				if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' {
					return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1]))
				}
			}

			if isValidRune != nil {
				// consume the "0x"/"0o"/"0b" prefix, then the digits
				l.next()
				l.next()
				digitSeen := false
				for {
					next := l.peek()
					if !isValidRune(next) {
						break
					}
					digitSeen = true
					l.next()
				}

				if !digitSeen {
					return l.errorf("number needs at least one digit")
				}

				l.emit(tokenInteger)

				return l.lexRvalue
			}
		}
	}

	if r == '+' || r == '-' {
		l.next()
		// signed special floats
		if l.follow("inf") {
			return l.lexInf
		}
		if l.follow("nan") {
			return l.lexNan
		}
	}

	// decimal integer or float: digits with optional '_' separators, at
	// most one '.', and an optional exponent
	pointSeen := false
	expSeen := false
	digitSeen := false
	for {
		next := l.peek()
		if next == '.' {
			if pointSeen {
				return l.errorf("cannot have two dots in one float")
			}
			l.next()
			if !isDigit(l.peek()) {
				return l.errorf("float cannot end with a dot")
			}
			pointSeen = true
		} else if next == 'e' || next == 'E' {
			expSeen = true
			l.next()
			r := l.peek()
			if r == '+' || r == '-' {
				l.next()
			}
		} else if isDigit(next) {
			digitSeen = true
			l.next()
		} else if next == '_' {
			l.next()
		} else {
			break
		}
		// a dot before any digit means the number started with '.'
		if pointSeen && !digitSeen {
			return l.errorf("cannot start float with a dot")
		}
	}

	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	if pointSeen || expSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return l.lexRvalue
}
|
1011 |
|
1012 func (l *tomlLexer) run() { |
|
1013 for state := l.lexVoid; state != nil; { |
|
1014 state = state() |
|
1015 } |
|
1016 } |
|
1017 |
|
1018 // Entry point |
|
1019 func lexToml(inputBytes []byte) []token { |
|
1020 runes := bytes.Runes(inputBytes) |
|
1021 l := &tomlLexer{ |
|
1022 input: runes, |
|
1023 tokens: make([]token, 0, 256), |
|
1024 line: 1, |
|
1025 col: 1, |
|
1026 endbufferLine: 1, |
|
1027 endbufferCol: 1, |
|
1028 } |
|
1029 l.run() |
|
1030 return l.tokens |
|
1031 } |
|