lexer.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. package xregex
  2. /*
  3. golang version regex parser
  4. refer to: https://github.com/aristotle9/as3cc/tree/master/java-template/src/org/lala/lex/utils/parser
  5. */
  6. import (
  7. "errors"
  8. "fmt"
  9. "strconv"
  10. )
  11. const (
  12. _initial = "INITIAL"
  13. _deadState = 0xFFFFFFFF
  14. _maxValue = 0x7fffffffffffffff
  15. )
  16. var (
  17. errEOF = errors.New("已经到达末尾")
  18. )
  19. // Lexer golang lexter
  20. type lexer struct {
  21. transTable []*stateTransItem
  22. finalTable map[int64]int64
  23. initialTable map[string]int64
  24. inputTable []*rangeItem
  25. start int64
  26. oldStart int64
  27. tokenName string
  28. yyText interface{}
  29. yy interface{}
  30. ended bool
  31. initialInput int64
  32. initialState string
  33. line int64
  34. column int64
  35. advanced bool
  36. source string
  37. }
  38. func newLexer() (lx *lexer) {
  39. lx = &lexer{}
  40. lx.transTable = []*stateTransItem{
  41. {false, []int64{0xFFFFFFFF, 0x3, 0x2, 0x1},
  42. []*rangeItem{{0, 32, 0}, {33, 33, 1},
  43. {34, 34, 2}, {35, 35, 3}}},
  44. {false,
  45. []int64{0xFFFFFFFF, 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5,
  46. 0x4},
  47. []*rangeItem{{0, 0, 0}, {1, 1, 1},
  48. {2, 2, 2}, {3, 3, 3}, {4, 4, 4},
  49. {5, 5, 5}, {6, 6, 6}, {7, 7, 7},
  50. {8, 28, 8}, {29, 29, 9}, {30, 30, 10},
  51. {31, 31, 11}, {32, 32, 12},
  52. {33, 35, 0}}},
  53. {false, []int64{0xFFFFFFFF, 0xF, 0xE, 0xD, 0x8, 0x12, 0x11, 0x10},
  54. []*rangeItem{{0, 0, 0}, {1, 1, 1},
  55. {2, 2, 2}, {3, 3, 3}, {4, 7, 4},
  56. {8, 8, 5}, {9, 9, 6}, {10, 27, 4},
  57. {28, 28, 7}, {29, 32, 4},
  58. {33, 35, 0}}},
  59. {false, []int64{0xFFFFFFFF, 0x16, 0x15, 0x14, 0x13},
  60. []*rangeItem{{0, 21, 0}, {22, 24, 1},
  61. {25, 25, 2}, {26, 26, 3}, {27, 27, 4},
  62. {28, 35, 0}}},
  63. {true, nil, nil}, {true, nil, nil},
  64. {true, nil, nil}, {true, nil, nil},
  65. {true, nil, nil}, {true, nil, nil},
  66. {true, nil, nil}, {true, nil, nil},
  67. {true, nil, nil},
  68. {false,
  69. []int64{0xFFFFFFFF, 0x1F, 0x17, 0xE, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x1E, 0x21,
  70. 0x20, 0x18},
  71. []*rangeItem{{0, 0, 0}, {1, 1, 1},
  72. {2, 9, 2}, {10, 11, 3}, {12, 12, 4},
  73. {13, 13, 5}, {14, 14, 6}, {15, 15, 7},
  74. {16, 16, 8}, {17, 18, 2}, {19, 19, 9},
  75. {20, 20, 10}, {21, 21, 11}, {22, 23, 2},
  76. {24, 24, 12}, {25, 32, 2},
  77. {33, 35, 0}}},
  78. {true, nil, nil}, {true, nil, nil},
  79. {true, nil, nil}, {true, nil, nil},
  80. {true, nil, nil}, {true, nil, nil},
  81. {false, []int64{0xFFFFFFFF, 0x14},
  82. []*rangeItem{{0, 25, 0}, {26, 26, 1},
  83. {27, 35, 0}}},
  84. {true, nil, nil},
  85. {false, []int64{0xFFFFFFFF, 0x16},
  86. []*rangeItem{{0, 21, 0}, {22, 24, 1},
  87. {25, 35, 0}}},
  88. {true, nil, nil},
  89. {false, []int64{0xFFFFFFFF, 0x22},
  90. []*rangeItem{{0, 22, 0}, {23, 24, 1},
  91. {25, 35, 0}}},
  92. {false, []int64{0xFFFFFFFF, 0x23},
  93. []*rangeItem{{0, 10, 0}, {11, 11, 1},
  94. {12, 12, 0}, {13, 14, 1}, {15, 17, 0},
  95. {18, 18, 1}, {19, 19, 0}, {20, 20, 1},
  96. {21, 21, 0}, {22, 24, 1},
  97. {25, 35, 0}}},
  98. {false, []int64{0xFFFFFFFF, 0x24},
  99. []*rangeItem{{0, 10, 0}, {11, 11, 1},
  100. {12, 12, 0}, {13, 14, 1}, {15, 17, 0},
  101. {18, 18, 1}, {19, 19, 0}, {20, 20, 1},
  102. {21, 21, 0}, {22, 24, 1},
  103. {25, 35, 0}}},
  104. {true, nil, nil}, {true, nil, nil},
  105. {true, nil, nil}, {true, nil, nil},
  106. {true, nil, nil}, {true, nil, nil},
  107. {true, nil, nil},
  108. {false, []int64{0xFFFFFFFF, 0x25},
  109. []*rangeItem{{0, 22, 0}, {23, 24, 1},
  110. {25, 35, 0}}},
  111. {false, []int64{0xFFFFFFFF, 0x26},
  112. []*rangeItem{{0, 10, 0}, {11, 11, 1},
  113. {12, 12, 0}, {13, 14, 1}, {15, 17, 0},
  114. {18, 18, 1}, {19, 19, 0}, {20, 20, 1},
  115. {21, 21, 0}, {22, 24, 1},
  116. {25, 35, 0}}},
  117. {false, []int64{0xFFFFFFFF, 0x27},
  118. []*rangeItem{{0, 10, 0}, {11, 11, 1},
  119. {12, 12, 0}, {13, 14, 1}, {15, 17, 0},
  120. {18, 18, 1}, {19, 19, 0}, {20, 20, 1},
  121. {21, 21, 0}, {22, 24, 1},
  122. {25, 35, 0}}},
  123. {true, nil, nil}, {true, nil, nil},
  124. {false, []int64{0xFFFFFFFF, 0x28},
  125. []*rangeItem{{0, 10, 0}, {11, 11, 1},
  126. {12, 12, 0}, {13, 14, 1}, {15, 17, 0},
  127. {18, 18, 1}, {19, 19, 0}, {20, 20, 1},
  128. {21, 21, 0}, {22, 24, 1},
  129. {25, 35, 0}}},
  130. {false, []int64{0xFFFFFFFF, 0x29},
  131. []*rangeItem{{0, 10, 0}, {11, 11, 1},
  132. {12, 12, 0}, {13, 14, 1}, {15, 17, 0},
  133. {18, 18, 1}, {19, 19, 0}, {20, 20, 1},
  134. {21, 21, 0}, {22, 24, 1},
  135. {25, 35, 0}}},
  136. {true, nil, nil}}
  137. lx.finalTable = make(map[int64]int64)
  138. lx.finalTable[0x4] = 0x0
  139. lx.finalTable[0x5] = 0x4
  140. lx.finalTable[0x6] = 0x1
  141. lx.finalTable[0x7] = 0x2
  142. lx.finalTable[0x8] = 0x1C
  143. lx.finalTable[0x9] = 0x3
  144. lx.finalTable[0xA] = 0x6
  145. lx.finalTable[0xB] = 0x5
  146. lx.finalTable[0xC] = 0xA
  147. lx.finalTable[0xD] = 0x1C
  148. lx.finalTable[0xE] = 0x12
  149. lx.finalTable[0xF] = 0x1B
  150. lx.finalTable[0x10] = 0x8
  151. lx.finalTable[0x11] = 0x7
  152. lx.finalTable[0x12] = 0x9
  153. lx.finalTable[0x13] = 0xE
  154. lx.finalTable[0x14] = 0xD
  155. lx.finalTable[0x15] = 0xB
  156. lx.finalTable[0x16] = 0xC
  157. lx.finalTable[0x17] = 0x1A
  158. lx.finalTable[0x18] = 0x1A
  159. lx.finalTable[0x19] = 0x1A
  160. lx.finalTable[0x1A] = 0x1A
  161. lx.finalTable[0x1B] = 0x16
  162. lx.finalTable[0x1C] = 0x17
  163. lx.finalTable[0x1D] = 0x13
  164. lx.finalTable[0x1E] = 0x15
  165. lx.finalTable[0x1F] = 0x18
  166. lx.finalTable[0x20] = 0x14
  167. lx.finalTable[0x21] = 0x19
  168. lx.finalTable[0x25] = 0xF
  169. lx.finalTable[0x26] = 0x10
  170. lx.finalTable[0x29] = 0x11
  171. lx.inputTable = []*rangeItem{{0, 8, 17}, {9, 9, 26},
  172. {10, 10, 0}, {11, 12, 17}, {13, 13, 0},
  173. {14, 31, 17}, {32, 32, 26}, {33, 39, 17},
  174. {40, 40, 31}, {41, 41, 5}, {42, 42, 32},
  175. {43, 43, 30}, {44, 44, 25}, {45, 45, 28},
  176. {46, 46, 2}, {47, 47, 1}, {48, 48, 24},
  177. {49, 55, 23}, {56, 57, 22}, {58, 62, 17},
  178. {63, 63, 29}, {64, 64, 17}, {65, 70, 18},
  179. {71, 90, 17}, {91, 91, 6}, {92, 92, 3},
  180. {93, 93, 8}, {94, 94, 9}, {95, 96, 17},
  181. {97, 97, 18}, {98, 98, 14}, {99, 99, 20},
  182. {100, 100, 11}, {101, 101, 18}, {102, 102, 13},
  183. {103, 109, 17}, {110, 110, 21}, {111, 113, 17},
  184. {114, 114, 12}, {115, 115, 10}, {116, 116, 19},
  185. {117, 117, 15}, {118, 118, 17}, {119, 119, 10},
  186. {120, 120, 16}, {121, 122, 17}, {123, 123, 4},
  187. {124, 124, 7}, {125, 125, 27}, {126, 65535, 17}}
  188. lx.initialTable = make(map[string]int64)
  189. lx.initialTable["REPEAT"] = 0x1
  190. lx.initialTable["BRACKET"] = 0x2
  191. lx.initialTable["INITIAL"] = 0x3
  192. return
  193. }
  194. func (lx *lexer) setSource(src string) {
  195. if src != "" {
  196. lx.source = src
  197. }
  198. lx.ended = false
  199. lx.start = 0
  200. lx.oldStart = 0
  201. lx.line = 1
  202. lx.column = 0
  203. lx.advanced = true
  204. lx.tokenName = ""
  205. lx.yy = nil
  206. lx.initialState = _initial
  207. lx.initialInput = lx.initialTable[lx.initialState]
  208. }
  209. func (lx *lexer) getToken() (string, error) {
  210. var err error
  211. if lx.advanced {
  212. lx.tokenName, err = lx.next()
  213. lx.advanced = false
  214. }
  215. return lx.tokenName, err
  216. }
  217. func (lx *lexer) getPositionInfo() string {
  218. return fmt.Sprintf("row(%d) column(%d)", lx.line, lx.column)
  219. }
  220. func (lx *lexer) next() (ret string, err error) {
  221. for {
  222. var (
  223. nextState int64
  224. ch int64
  225. och = _maxValue
  226. next = lx.start
  227. curState = lx.transTable[0].toStates[lx.initialInput]
  228. lastFinalState = int64(_deadState)
  229. lastFinalPosition = lx.start
  230. )
  231. for {
  232. if next < int64(len(lx.source)) {
  233. ch = int64(lx.source[next])
  234. // 计算行、列的位置
  235. if och != _maxValue {
  236. if ch == 0x0d { // \r符号
  237. lx.column = 0
  238. lx.line++
  239. } else if ch == 0x0a { // \n
  240. if och != 0x0d { // != \r
  241. lx.column = 0
  242. lx.line++
  243. }
  244. } else {
  245. lx.column++
  246. }
  247. }
  248. och = int(ch)
  249. if nextState, err = lx.trans(curState, ch); err != nil {
  250. return
  251. }
  252. } else {
  253. nextState = _deadState
  254. }
  255. //OK
  256. if nextState == _deadState {
  257. if lx.start == lastFinalPosition {
  258. if lx.start == int64(len(lx.source)) {
  259. if !lx.ended {
  260. lx.ended = true
  261. return "<$>", nil
  262. }
  263. return "", errEOF
  264. }
  265. return "", fmt.Errorf("意外的字符(line:%d,col:%d) of %s", lx.line, lx.column, lx.source)
  266. }
  267. lx.yyText = lx.source[lx.start:lastFinalPosition]
  268. lx.oldStart = lx.start
  269. lx.start = lastFinalPosition
  270. fIndex := lx.finalTable[lastFinalState]
  271. switch fIndex {
  272. case 0x0:
  273. return "*", nil
  274. case 0x1:
  275. return "+", nil
  276. case 0x2:
  277. return "?", nil
  278. case 0x3:
  279. return "|", nil
  280. case 0x4:
  281. return "(", nil
  282. case 0x5:
  283. return ")", nil
  284. case 0x6:
  285. if err = lx.begin("BRACKET"); err != nil {
  286. return
  287. }
  288. return "[", nil
  289. case 0x7:
  290. return "^", nil
  291. case 0x8:
  292. return "-", nil
  293. case 0x9:
  294. if err = lx.begin("INITIAL"); err != nil {
  295. return
  296. }
  297. return "]", nil
  298. case 0xA:
  299. if err = lx.begin("REPEAT"); err != nil {
  300. return
  301. }
  302. return "{", nil
  303. case 0xB:
  304. return ",", nil
  305. case 0xC:
  306. if lx.yyText, err = strconv.ParseInt(lx.yyText.(string), 10, 64); err != nil {
  307. return
  308. }
  309. return "d", nil
  310. case 0xE:
  311. if err = lx.begin("INITIAL"); err != nil {
  312. return
  313. }
  314. return "}", nil
  315. case 0xF:
  316. var tmp int64
  317. if tmp, err = strconv.ParseInt(lx.yyText.(string)[2:4], 8, 64); err != nil {
  318. return
  319. }
  320. lx.yyText = string(tmp)
  321. return "c", nil
  322. case 0x10:
  323. var tmp int64
  324. if tmp, err = strconv.ParseInt(lx.yyText.(string)[2:4], 16, 64); err != nil {
  325. return
  326. }
  327. lx.yyText = string(tmp)
  328. return "c", nil
  329. case 0x11:
  330. var tmp int64
  331. if tmp, err = strconv.ParseInt(lx.yyText.(string)[2:6], 16, 64); err != nil {
  332. return
  333. }
  334. lx.yyText = string(tmp)
  335. return "c", nil
  336. case 0x12:
  337. return "escc", nil
  338. case 0x13:
  339. lx.yyText = "\r"
  340. return "c", nil
  341. case 0x14:
  342. lx.yyText = "\n"
  343. return "c", nil
  344. case 0x15:
  345. lx.yyText = "\t"
  346. return "c", nil
  347. case 0x16:
  348. lx.yyText = "\b"
  349. return "c", nil
  350. case 0x17:
  351. lx.yyText = "\f"
  352. return "c", nil
  353. case 0x18:
  354. lx.yyText = "/"
  355. return "c", nil
  356. case 0x19:
  357. return "escc", nil
  358. case 0x1A:
  359. lx.yyText = lx.yyText.(string)[1:2]
  360. return "c", nil
  361. case 0x1B:
  362. return "/", nil
  363. case 0x1C:
  364. return "c", nil
  365. }
  366. break
  367. } else {
  368. next++
  369. if _, ok := lx.finalTable[nextState]; ok {
  370. lastFinalState = nextState
  371. lastFinalPosition = next
  372. }
  373. curState = nextState
  374. }
  375. }
  376. }
  377. }
  378. func (lx *lexer) begin(state string) error {
  379. return lx.setInitialState(state)
  380. }
  381. func (lx *lexer) setInitialState(state string) (err error) {
  382. if _, ok := lx.initialTable[state]; !ok {
  383. err = fmt.Errorf("未定义的初始状态:%s", state)
  384. return
  385. }
  386. lx.initialState = state
  387. lx.initialInput = lx.initialTable[state]
  388. return
  389. }
  390. func (lx *lexer) trans(curState, ch int64) (int64, error) {
  391. if ch < lx.inputTable[0].from || ch > lx.inputTable[len(lx.inputTable)-1].to {
  392. return 0, fmt.Errorf("line:%d,column:%d 输入字符超出范围", lx.line, lx.column)
  393. }
  394. if lx.transTable[curState].isDead {
  395. return _deadState, nil
  396. }
  397. pubInput := find(ch, lx.inputTable)
  398. innerInput := find(pubInput, lx.transTable[curState].transEdge)
  399. return lx.transTable[curState].toStates[innerInput], nil
  400. }
  401. func find(code int64, table []*rangeItem) int64 {
  402. var (
  403. max = len(table) - 1
  404. min int
  405. mid uint64
  406. )
  407. for {
  408. mid = uint64(min+max) >> 1
  409. if table[mid].from <= code {
  410. if table[mid].to >= code {
  411. return table[mid].value
  412. }
  413. min = int(mid) + 1
  414. } else {
  415. max = int(mid) - 1
  416. }
  417. }
  418. }