lexer.go

// TOML lexer.
//
// Written using the principles developed by Rob Pike in
// http://www.youtube.com/watch?v=HxaD_trXwRE
package toml

import (
	"bytes"
	"errors"
	"fmt"
	"regexp"
	"strconv"
	"strings"
)

var dateRegexp *regexp.Regexp

// Define state functions
type tomlLexStateFn func() tomlLexStateFn
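// Each state function lexes one token (or skips input that produces none)
// and returns the state function to run next; run() below drives the
// machine until a state returns nil.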
// Define lexer
type tomlLexer struct {
	inputIdx          int
	input             []rune // Textual source
	currentTokenStart int
	currentTokenStop  int
	tokens            []token
	depth             int
	line              int
	col               int
	endbufferLine     int
	endbufferCol      int
}
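// currentTokenStart/currentTokenStop delimit the token being built as rune
// offsets into input; line/col record where that token starts, while
// endbufferLine/endbufferCol track the read cursor. ignore() resynchronizes
// the two pairs.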
// Basic read operations on input

func (l *tomlLexer) read() rune {
	r := l.peek()
	if r == '\n' {
		l.endbufferLine++
		l.endbufferCol = 1
	} else {
		l.endbufferCol++
	}
	l.inputIdx++
	return r
}

func (l *tomlLexer) next() rune {
	r := l.read()
	if r != eof {
		l.currentTokenStop++
	}
	return r
}

func (l *tomlLexer) ignore() {
	l.currentTokenStart = l.currentTokenStop
	l.line = l.endbufferLine
	l.col = l.endbufferCol
}

func (l *tomlLexer) skip() {
	l.next()
	l.ignore()
}

func (l *tomlLexer) fastForward(n int) {
	for i := 0; i < n; i++ {
		l.next()
	}
}

func (l *tomlLexer) emitWithValue(t tokenType, value string) {
	l.tokens = append(l.tokens, token{
		Position: Position{l.line, l.col},
		typ:      t,
		val:      value,
	})
	l.ignore()
}

func (l *tomlLexer) emit(t tokenType) {
	l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop]))
}

func (l *tomlLexer) peek() rune {
	if l.inputIdx >= len(l.input) {
		return eof
	}
	return l.input[l.inputIdx]
}

func (l *tomlLexer) peekString(size int) string {
	maxIdx := len(l.input)
	upperIdx := l.inputIdx + size // FIXME: potential overflow
	if upperIdx > maxIdx {
		upperIdx = maxIdx
	}
	return string(l.input[l.inputIdx:upperIdx])
}

func (l *tomlLexer) follow(next string) bool {
	return next == l.peekString(len(next))
}
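// follow peeks without consuming: for example, l.follow("\r\n") reports
// whether the next two runes of the input are exactly "\r\n".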
// Error management
func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
	l.tokens = append(l.tokens, token{
		Position: Position{l.line, l.col},
		typ:      tokenError,
		val:      fmt.Sprintf(format, args...),
	})
	return nil
}

// State functions

func (l *tomlLexer) lexVoid() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '[':
			return l.lexTableKey
		case '#':
			return l.lexComment(l.lexVoid)
		case '=':
			return l.lexEqual
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			continue
		}

		if isSpace(next) {
			l.skip()
		}

		if l.depth > 0 {
			return l.lexRvalue
		}

		if isKeyStartChar(next) {
			return l.lexKey
		}

		if next == eof {
			l.next()
			break
		}
	}

	l.emit(tokenEOF)
	return nil
}

func (l *tomlLexer) lexRvalue() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '.':
			return l.errorf("cannot start float with a dot")
		case '=':
			return l.lexEqual
		case '[':
			l.depth++
			return l.lexLeftBracket
		case ']':
			l.depth--
			return l.lexRightBracket
		case '{':
			return l.lexLeftCurlyBrace
		case '}':
			return l.lexRightCurlyBrace
		case '#':
			return l.lexComment(l.lexRvalue)
		case '"':
			return l.lexString
		case '\'':
			return l.lexLiteralString
		case ',':
			return l.lexComma
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			if l.depth == 0 {
				return l.lexVoid
			}
			return l.lexRvalue
		case '_':
			return l.errorf("cannot start number with underscore")
		}

		if l.follow("true") {
			return l.lexTrue
		}

		if l.follow("false") {
			return l.lexFalse
		}

		if l.follow("inf") {
			return l.lexInf
		}

		if l.follow("nan") {
			return l.lexNan
		}

		if isSpace(next) {
			l.skip()
			continue
		}

		if next == eof {
			l.next()
			break
		}

		possibleDate := l.peekString(35)
		dateMatch := dateRegexp.FindString(possibleDate)
		if dateMatch != "" {
			l.fastForward(len(dateMatch))
			return l.lexDate
		}

		if next == '+' || next == '-' || isDigit(next) {
			return l.lexNumber
		}

		if isAlphanumeric(next) {
			return l.lexKey
		}

		return l.errorf("no value can start with %c", next)
	}

	l.emit(tokenEOF)
	return nil
}
func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftCurlyBrace)
	return l.lexRvalue
}

func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
	l.next()
	l.emit(tokenRightCurlyBrace)
	return l.lexRvalue
}

func (l *tomlLexer) lexDate() tomlLexStateFn {
	l.emit(tokenDate)
	return l.lexRvalue
}

func (l *tomlLexer) lexTrue() tomlLexStateFn {
	l.fastForward(4)
	l.emit(tokenTrue)
	return l.lexRvalue
}

func (l *tomlLexer) lexFalse() tomlLexStateFn {
	l.fastForward(5)
	l.emit(tokenFalse)
	return l.lexRvalue
}

func (l *tomlLexer) lexInf() tomlLexStateFn {
	l.fastForward(3)
	l.emit(tokenInf)
	return l.lexRvalue
}

func (l *tomlLexer) lexNan() tomlLexStateFn {
	l.fastForward(3)
	l.emit(tokenNan)
	return l.lexRvalue
}

func (l *tomlLexer) lexEqual() tomlLexStateFn {
	l.next()
	l.emit(tokenEqual)
	return l.lexRvalue
}

func (l *tomlLexer) lexComma() tomlLexStateFn {
	l.next()
	l.emit(tokenComma)
	return l.lexRvalue
}
// Parse the key and emit its value without escape sequences.
// Bare keys, basic string keys and literal string keys are supported.
func (l *tomlLexer) lexKey() tomlLexStateFn {
	growingString := ""

	for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() {
		if r == '"' {
			l.next()
			str, err := l.lexStringAsString(`"`, false, true)
			if err != nil {
				return l.errorf(err.Error())
			}
			growingString += str
			l.next()
			continue
		} else if r == '\'' {
			l.next()
			str, err := l.lexLiteralStringAsString(`'`, false)
			if err != nil {
				return l.errorf(err.Error())
			}
			growingString += str
			l.next()
			continue
		} else if r == '\n' {
			return l.errorf("keys cannot contain new lines")
		} else if isSpace(r) {
			break
		} else if !isValidBareChar(r) {
			return l.errorf("keys cannot contain %c character", r)
		}
		growingString += string(r)
		l.next()
	}
	l.emitWithValue(tokenKey, growingString)
	return l.lexVoid
}
// lexComment returns the state fn as a closure so that, once the comment is
// consumed, control goes back to whichever state (lexVoid or lexRvalue) it
// interrupted.
func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn {
	return func() tomlLexStateFn {
		for next := l.peek(); next != '\n' && next != eof; next = l.peek() {
			if next == '\r' && l.follow("\r\n") {
				break
			}
			l.next()
		}
		l.ignore()
		return previousState
	}
}

func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenLeftBracket)
	return l.lexRvalue
}

func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) {
	growingString := ""

	if discardLeadingNewLine {
		if l.follow("\r\n") {
			l.skip()
			l.skip()
		} else if l.peek() == '\n' {
			l.skip()
		}
	}

	// find end of string
	for {
		if l.follow(terminator) {
			return growingString, nil
		}

		next := l.peek()
		if next == eof {
			break
		}
		growingString += string(l.next())
	}

	return "", errors.New("unclosed string")
}

func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
	l.skip()

	// handle special case for triple-quote
	terminator := "'"
	discardLeadingNewLine := false
	if l.follow("''") {
		l.skip()
		l.skip()
		terminator = "'''"
		discardLeadingNewLine = true
	}

	str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine)
	if err != nil {
		return l.errorf(err.Error())
	}

	l.emitWithValue(tokenString, str)
	l.fastForward(len(terminator))
	l.ignore()
	return l.lexRvalue
}
// Lex a string and return the result as a string.
// Terminator is the substring indicating the end of the token.
// The resulting string does not include the terminator.
func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
	growingString := ""

	if discardLeadingNewLine {
		if l.follow("\r\n") {
			l.skip()
			l.skip()
		} else if l.peek() == '\n' {
			l.skip()
		}
	}

	for {
		if l.follow(terminator) {
			return growingString, nil
		}

		if l.follow("\\") {
			l.next()
			switch l.peek() {
			case '\r':
				fallthrough
			case '\n':
				fallthrough
			case '\t':
				fallthrough
			case ' ':
				// skip all whitespace chars following backslash
				for strings.ContainsRune("\r\n\t ", l.peek()) {
					l.next()
				}
			case '"':
				growingString += "\""
				l.next()
			case 'n':
				growingString += "\n"
				l.next()
			case 'b':
				growingString += "\b"
				l.next()
			case 'f':
				growingString += "\f"
				l.next()
			case '/':
				growingString += "/"
				l.next()
			case 't':
				growingString += "\t"
				l.next()
			case 'r':
				growingString += "\r"
				l.next()
			case '\\':
				growingString += "\\"
				l.next()
			case 'u':
				l.next()
				code := ""
				for i := 0; i < 4; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code = code + string(c)
				}
				intcode, err := strconv.ParseInt(code, 16, 32)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\u" + code)
				}
				growingString += string(rune(intcode))
			case 'U':
				l.next()
				code := ""
				for i := 0; i < 8; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code = code + string(c)
				}
				intcode, err := strconv.ParseInt(code, 16, 64)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\U" + code)
				}
				growingString += string(rune(intcode))
			default:
				return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
			}
		} else {
			r := l.peek()
			if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) {
				return "", fmt.Errorf("unescaped control character %U", r)
			}
			l.next()
			growingString += string(r)
		}

		if l.peek() == eof {
			break
		}
	}

	return "", errors.New("unclosed string")
}

func (l *tomlLexer) lexString() tomlLexStateFn {
	l.skip()

	// handle special case for triple-quote
	terminator := `"`
	discardLeadingNewLine := false
	acceptNewLines := false
	if l.follow(`""`) {
		l.skip()
		l.skip()
		terminator = `"""`
		discardLeadingNewLine = true
		acceptNewLines = true
	}

	str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines)
	if err != nil {
		return l.errorf(err.Error())
	}

	l.emitWithValue(tokenString, str)
	l.fastForward(len(terminator))
	l.ignore()
	return l.lexRvalue
}
func (l *tomlLexer) lexTableKey() tomlLexStateFn {
	l.next()

	if l.peek() == '[' {
		// token '[[' signifies an array of tables
		l.next()
		l.emit(tokenDoubleLeftBracket)
		return l.lexInsideTableArrayKey
	}
	// vanilla table key
	l.emit(tokenLeftBracket)
	return l.lexInsideTableKey
}

// Parse the key until "]]", but only bare keys are supported
func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn {
	for r := l.peek(); r != eof; r = l.peek() {
		switch r {
		case ']':
			if l.currentTokenStop > l.currentTokenStart {
				l.emit(tokenKeyGroupArray)
			}
			l.next()
			if l.peek() != ']' {
				break // exits the switch only; the loop keeps scanning
			}
			l.next()
			l.emit(tokenDoubleRightBracket)
			return l.lexVoid
		case '[':
			return l.errorf("table array key cannot contain '['")
		default:
			l.next()
		}
	}
	return l.errorf("unclosed table array key")
}

// Parse the key until "]", but only bare keys are supported
func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn {
	for r := l.peek(); r != eof; r = l.peek() {
		switch r {
		case ']':
			if l.currentTokenStop > l.currentTokenStart {
				l.emit(tokenKeyGroup)
			}
			l.next()
			l.emit(tokenRightBracket)
			return l.lexVoid
		case '[':
			return l.errorf("table key cannot contain '['")
		default:
			l.next()
		}
	}
	return l.errorf("unclosed table key")
}
func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
	l.next()
	l.emit(tokenRightBracket)
	return l.lexRvalue
}

type validRuneFn func(r rune) bool

func isValidHexRune(r rune) bool {
	return r >= 'a' && r <= 'f' ||
		r >= 'A' && r <= 'F' ||
		r >= '0' && r <= '9' ||
		r == '_'
}

func isValidOctalRune(r rune) bool {
	return r >= '0' && r <= '7' || r == '_'
}

func isValidBinaryRune(r rune) bool {
	return r == '0' || r == '1' || r == '_'
}
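// A few examples of inputs lexNumber below is meant to accept, per the TOML
// spec (underscores are allowed as digit separators):
//
//	0xDEADBEEF   0o755   0b11010110   // prefixed bases
//	+99   42   0   -17   1_000        // decimal integers
//	3.1415   5e+22   6.626e-34        // floats
//	+inf   -nan                       // routed to lexInf / lexNan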
func (l *tomlLexer) lexNumber() tomlLexStateFn {
	r := l.peek()

	if r == '0' {
		follow := l.peekString(2)
		if len(follow) == 2 {
			var isValidRune validRuneFn
			switch follow[1] {
			case 'x':
				isValidRune = isValidHexRune
			case 'o':
				isValidRune = isValidOctalRune
			case 'b':
				isValidRune = isValidBinaryRune
			default:
				if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' {
					return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1]))
				}
			}

			if isValidRune != nil {
				l.next()
				l.next()
				digitSeen := false
				for {
					next := l.peek()
					if !isValidRune(next) {
						break
					}
					digitSeen = true
					l.next()
				}

				if !digitSeen {
					return l.errorf("number needs at least one digit")
				}

				l.emit(tokenInteger)
				return l.lexRvalue
			}
		}
	}

	if r == '+' || r == '-' {
		l.next()
		if l.follow("inf") {
			return l.lexInf
		}
		if l.follow("nan") {
			return l.lexNan
		}
	}

	pointSeen := false
	expSeen := false
	digitSeen := false
	for {
		next := l.peek()
		if next == '.' {
			if pointSeen {
				return l.errorf("cannot have two dots in one float")
			}
			l.next()
			if !isDigit(l.peek()) {
				return l.errorf("float cannot end with a dot")
			}
			pointSeen = true
		} else if next == 'e' || next == 'E' {
			expSeen = true
			l.next()
			r := l.peek()
			if r == '+' || r == '-' {
				l.next()
			}
		} else if isDigit(next) {
			digitSeen = true
			l.next()
		} else if next == '_' {
			l.next()
		} else {
			break
		}
		if pointSeen && !digitSeen {
			return l.errorf("cannot start float with a dot")
		}
	}

	if !digitSeen {
		return l.errorf("no digit in that number")
	}

	if pointSeen || expSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}

	return l.lexRvalue
}
func (l *tomlLexer) run() {
	for state := l.lexVoid; state != nil; {
		state = state()
	}
}

func init() {
	// Matches RFC 3339 style datetimes with an offset, e.g.
	// 1979-05-27T07:32:00Z or 1979-05-27T00:32:00.999999-07:00.
	dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`)
}
// Entry point
func lexToml(inputBytes []byte) []token {
	runes := bytes.Runes(inputBytes)
	l := &tomlLexer{
		input:         runes,
		tokens:        make([]token, 0, 256),
		line:          1,
		col:           1,
		endbufferLine: 1,
		endbufferCol:  1,
	}
	l.run()
	return l.tokens
}
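// A minimal usage sketch (illustrative; token, tokenType and the token*
// constants are assumed to be defined elsewhere in this package, e.g. in
// token.go):
//
//	for _, tok := range lexToml([]byte("answer = 42")) {
//		if tok.typ == tokenError {
//			// tok.val carries the error message,
//			// tok.Position the line and column it was reported at
//		}
//	}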