triegen.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494
  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package triegen implements a code generator for a trie for associating
  5. // unsigned integer values with UTF-8 encoded runes.
  6. //
  7. // Many of the go.text packages use tries for storing per-rune information. A
  8. // trie is especially useful if many of the runes have the same value. If this
  9. // is the case, many blocks can be expected to be shared allowing for
  10. // information on many runes to be stored in little space.
  11. //
  12. // As most of the lookups are done directly on []byte slices, the tries use the
  13. // UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to
  14. // runes and contributes a little bit to better performance. It also naturally
  15. // provides a fast path for ASCII.
  16. //
  17. // Space is also an issue. There are many code points defined in Unicode and as
  18. // a result tables can get quite large. So every byte counts. The triegen
  19. // package automatically chooses the smallest integer values to represent the
  20. // tables. Compacters allow further compression of the trie by allowing for
  21. // alternative representations of individual trie blocks.
  22. //
  23. // triegen allows generating multiple tries as a single structure. This is
  24. // useful when, for example, one wants to generate tries for several languages
  25. // that have a lot of values in common. Some existing libraries for
  26. // internationalization store all per-language data as a dynamically loadable
  27. // chunk. The go.text packages are designed with the assumption that the user
  28. // typically wants to compile in support for all supported languages, in line
  29. // with the approach common to Go to create a single standalone binary. The
  30. // multi-root trie approach can give significant storage savings in this
  31. // scenario.
  32. //
  33. // triegen generates both tables and code. The code is optimized to use the
  34. // automatically chosen data types. The following code is generated for a Trie
  35. // or multiple Tries named "foo":
  36. // - type fooTrie
  37. // The trie type.
  38. //
  39. // - func newFooTrie(x int) *fooTrie
  40. // Trie constructor, where x is the index of the trie passed to Gen.
  41. //
  42. // - func (t *fooTrie) lookup(s []byte) (v uintX, sz int)
  43. // The lookup method, where uintX is automatically chosen.
  44. //
  45. // - func lookupString, lookupUnsafe and lookupStringUnsafe
  46. // Variants of the above.
  47. //
  48. // - var fooValues and fooIndex and any tables generated by Compacters.
  49. // The core trie data.
  50. //
  51. // - var fooTrieHandles
  52. // Indexes of starter blocks in case of multiple trie roots.
  53. //
  54. // It is recommended that users test the generated trie by checking the returned
  55. // value for every rune. Such exhaustive tests are possible as the the number of
  56. // runes in Unicode is limited.
  57. package triegen // import "golang.org/x/text/internal/triegen"
  58. // TODO: Arguably, the internally optimized data types would not have to be
  59. // exposed in the generated API. We could also investigate not generating the
  60. // code, but using it through a package. We would have to investigate the impact
  61. // on performance of making such change, though. For packages like unicode/norm,
  62. // small changes like this could tank performance.
  63. import (
  64. "encoding/binary"
  65. "fmt"
  66. "hash/crc64"
  67. "io"
  68. "log"
  69. "unicode/utf8"
  70. )
  71. // builder builds a set of tries for associating values with runes. The set of
  72. // tries can share common index and value blocks.
  73. type builder struct {
  74. Name string
  75. // ValueType is the type of the trie values looked up.
  76. ValueType string
  77. // ValueSize is the byte size of the ValueType.
  78. ValueSize int
  79. // IndexType is the type of trie index values used for all UTF-8 bytes of
  80. // a rune except the last one.
  81. IndexType string
  82. // IndexSize is the byte size of the IndexType.
  83. IndexSize int
  84. // SourceType is used when generating the lookup functions. If the user
  85. // requests StringSupport, all lookup functions will be generated for
  86. // string input as well.
  87. SourceType string
  88. Trie []*Trie
  89. IndexBlocks []*node
  90. ValueBlocks [][]uint64
  91. Compactions []compaction
  92. Checksum uint64
  93. ASCIIBlock string
  94. StarterBlock string
  95. indexBlockIdx map[uint64]int
  96. valueBlockIdx map[uint64]nodeIndex
  97. asciiBlockIdx map[uint64]int
  98. // Stats are used to fill out the template.
  99. Stats struct {
  100. NValueEntries int
  101. NValueBytes int
  102. NIndexEntries int
  103. NIndexBytes int
  104. NHandleBytes int
  105. }
  106. err error
  107. }
  108. // A nodeIndex encodes the index of a node, which is defined by the compaction
  109. // which stores it and an index within the compaction. For internal nodes, the
  110. // compaction is always 0.
  111. type nodeIndex struct {
  112. compaction int
  113. index int
  114. }
  115. // compaction keeps track of stats used for the compaction.
  116. type compaction struct {
  117. c Compacter
  118. blocks []*node
  119. maxHandle uint32
  120. totalSize int
  121. // Used by template-based generator and thus exported.
  122. Cutoff uint32
  123. Offset uint32
  124. Handler string
  125. }
  126. func (b *builder) setError(err error) {
  127. if b.err == nil {
  128. b.err = err
  129. }
  130. }
  131. // An Option can be passed to Gen.
  132. type Option func(b *builder) error
  133. // Compact configures the trie generator to use the given Compacter.
  134. func Compact(c Compacter) Option {
  135. return func(b *builder) error {
  136. b.Compactions = append(b.Compactions, compaction{
  137. c: c,
  138. Handler: c.Handler() + "(n, b)"})
  139. return nil
  140. }
  141. }
  142. // Gen writes Go code for a shared trie lookup structure to w for the given
  143. // Tries. The generated trie type will be called nameTrie. newNameTrie(x) will
  144. // return the *nameTrie for tries[x]. A value can be looked up by using one of
  145. // the various lookup methods defined on nameTrie. It returns the table size of
  146. // the generated trie.
  147. func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) {
  148. // The index contains two dummy blocks, followed by the zero block. The zero
  149. // block is at offset 0x80, so that the offset for the zero block for
  150. // continuation bytes is 0.
  151. b := &builder{
  152. Name: name,
  153. Trie: tries,
  154. IndexBlocks: []*node{{}, {}, {}},
  155. Compactions: []compaction{{
  156. Handler: name + "Values[n<<6+uint32(b)]",
  157. }},
  158. // The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero
  159. // block.
  160. indexBlockIdx: map[uint64]int{0: 0},
  161. valueBlockIdx: map[uint64]nodeIndex{0: {}},
  162. asciiBlockIdx: map[uint64]int{},
  163. }
  164. b.Compactions[0].c = (*simpleCompacter)(b)
  165. for _, f := range opts {
  166. if err := f(b); err != nil {
  167. return 0, err
  168. }
  169. }
  170. b.build()
  171. if b.err != nil {
  172. return 0, b.err
  173. }
  174. if err = b.print(w); err != nil {
  175. return 0, err
  176. }
  177. return b.Size(), nil
  178. }
  179. // A Trie represents a single root node of a trie. A builder may build several
  180. // overlapping tries at once.
  181. type Trie struct {
  182. root *node
  183. hiddenTrie
  184. }
  185. // hiddenTrie contains values we want to be visible to the template generator,
  186. // but hidden from the API documentation.
  187. type hiddenTrie struct {
  188. Name string
  189. Checksum uint64
  190. ASCIIIndex int
  191. StarterIndex int
  192. }
  193. // NewTrie returns a new trie root.
  194. func NewTrie(name string) *Trie {
  195. return &Trie{
  196. &node{
  197. children: make([]*node, blockSize),
  198. values: make([]uint64, utf8.RuneSelf),
  199. },
  200. hiddenTrie{Name: name},
  201. }
  202. }
  203. // Gen is a convenience wrapper around the Gen func passing t as the only trie
  204. // and uses the name passed to NewTrie. It returns the size of the generated
  205. // tables.
  206. func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) {
  207. return Gen(w, t.Name, []*Trie{t}, opts...)
  208. }
  209. // node is a node of the intermediate trie structure.
  210. type node struct {
  211. // children holds this node's children. It is always of length 64.
  212. // A child node may be nil.
  213. children []*node
  214. // values contains the values of this node. If it is non-nil, this node is
  215. // either a root or leaf node:
  216. // For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F].
  217. // For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF].
  218. values []uint64
  219. index nodeIndex
  220. }
  221. // Insert associates value with the given rune. Insert will panic if a non-zero
  222. // value is passed for an invalid rune.
  223. func (t *Trie) Insert(r rune, value uint64) {
  224. if value == 0 {
  225. return
  226. }
  227. s := string(r)
  228. if []rune(s)[0] != r && value != 0 {
  229. // Note: The UCD tables will always assign what amounts to a zero value
  230. // to a surrogate. Allowing a zero value for an illegal rune allows
  231. // users to iterate over [0..MaxRune] without having to explicitly
  232. // exclude surrogates, which would be tedious.
  233. panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r))
  234. }
  235. if len(s) == 1 {
  236. // It is a root node value (ASCII).
  237. t.root.values[s[0]] = value
  238. return
  239. }
  240. n := t.root
  241. for ; len(s) > 1; s = s[1:] {
  242. if n.children == nil {
  243. n.children = make([]*node, blockSize)
  244. }
  245. p := s[0] % blockSize
  246. c := n.children[p]
  247. if c == nil {
  248. c = &node{}
  249. n.children[p] = c
  250. }
  251. if len(s) > 2 && c.values != nil {
  252. log.Fatalf("triegen: insert(%U): found internal node with values", r)
  253. }
  254. n = c
  255. }
  256. if n.values == nil {
  257. n.values = make([]uint64, blockSize)
  258. }
  259. if n.children != nil {
  260. log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r)
  261. }
  262. n.values[s[0]-0x80] = value
  263. }
  264. // Size returns the number of bytes the generated trie will take to store. It
  265. // needs to be exported as it is used in the templates.
  266. func (b *builder) Size() int {
  267. // Index blocks.
  268. sz := len(b.IndexBlocks) * blockSize * b.IndexSize
  269. // Skip the first compaction, which represents the normal value blocks, as
  270. // its totalSize does not account for the ASCII blocks, which are managed
  271. // separately.
  272. sz += len(b.ValueBlocks) * blockSize * b.ValueSize
  273. for _, c := range b.Compactions[1:] {
  274. sz += c.totalSize
  275. }
  276. // TODO: this computation does not account for the fixed overhead of a using
  277. // a compaction, either code or data. As for data, though, the typical
  278. // overhead of data is in the order of bytes (2 bytes for cases). Further,
  279. // the savings of using a compaction should anyway be substantial for it to
  280. // be worth it.
  281. // For multi-root tries, we also need to account for the handles.
  282. if len(b.Trie) > 1 {
  283. sz += 2 * b.IndexSize * len(b.Trie)
  284. }
  285. return sz
  286. }
  287. func (b *builder) build() {
  288. // Compute the sizes of the values.
  289. var vmax uint64
  290. for _, t := range b.Trie {
  291. vmax = maxValue(t.root, vmax)
  292. }
  293. b.ValueType, b.ValueSize = getIntType(vmax)
  294. // Compute all block allocations.
  295. // TODO: first compute the ASCII blocks for all tries and then the other
  296. // nodes. ASCII blocks are more restricted in placement, as they require two
  297. // blocks to be placed consecutively. Processing them first may improve
  298. // sharing (at least one zero block can be expected to be saved.)
  299. for _, t := range b.Trie {
  300. b.Checksum += b.buildTrie(t)
  301. }
  302. // Compute the offsets for all the Compacters.
  303. offset := uint32(0)
  304. for i := range b.Compactions {
  305. c := &b.Compactions[i]
  306. c.Offset = offset
  307. offset += c.maxHandle + 1
  308. c.Cutoff = offset
  309. }
  310. // Compute the sizes of indexes.
  311. // TODO: different byte positions could have different sizes. So far we have
  312. // not found a case where this is beneficial.
  313. imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff)
  314. for _, ib := range b.IndexBlocks {
  315. if x := uint64(ib.index.index); x > imax {
  316. imax = x
  317. }
  318. }
  319. b.IndexType, b.IndexSize = getIntType(imax)
  320. }
  321. func maxValue(n *node, max uint64) uint64 {
  322. if n == nil {
  323. return max
  324. }
  325. for _, c := range n.children {
  326. max = maxValue(c, max)
  327. }
  328. for _, v := range n.values {
  329. if max < v {
  330. max = v
  331. }
  332. }
  333. return max
  334. }
  335. func getIntType(v uint64) (string, int) {
  336. switch {
  337. case v < 1<<8:
  338. return "uint8", 1
  339. case v < 1<<16:
  340. return "uint16", 2
  341. case v < 1<<32:
  342. return "uint32", 4
  343. }
  344. return "uint64", 8
  345. }
  346. const (
  347. blockSize = 64
  348. // Subtract two blocks to offset 0x80, the first continuation byte.
  349. blockOffset = 2
  350. // Subtract three blocks to offset 0xC0, the first non-ASCII starter.
  351. rootBlockOffset = 3
  352. )
  353. var crcTable = crc64.MakeTable(crc64.ISO)
  354. func (b *builder) buildTrie(t *Trie) uint64 {
  355. n := t.root
  356. // Get the ASCII offset. For the first trie, the ASCII block will be at
  357. // position 0.
  358. hasher := crc64.New(crcTable)
  359. binary.Write(hasher, binary.BigEndian, n.values)
  360. hash := hasher.Sum64()
  361. v, ok := b.asciiBlockIdx[hash]
  362. if !ok {
  363. v = len(b.ValueBlocks)
  364. b.asciiBlockIdx[hash] = v
  365. b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:])
  366. if v == 0 {
  367. // Add the zero block at position 2 so that it will be assigned a
  368. // zero reference in the lookup blocks.
  369. // TODO: always do this? This would allow us to remove a check from
  370. // the trie lookup, but at the expense of extra space. Analyze
  371. // performance for unicode/norm.
  372. b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize))
  373. }
  374. }
  375. t.ASCIIIndex = v
  376. // Compute remaining offsets.
  377. t.Checksum = b.computeOffsets(n, true)
  378. // We already subtracted the normal blockOffset from the index. Subtract the
  379. // difference for starter bytes.
  380. t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset)
  381. return t.Checksum
  382. }
  383. func (b *builder) computeOffsets(n *node, root bool) uint64 {
  384. // For the first trie, the root lookup block will be at position 3, which is
  385. // the offset for UTF-8 non-ASCII starter bytes.
  386. first := len(b.IndexBlocks) == rootBlockOffset
  387. if first {
  388. b.IndexBlocks = append(b.IndexBlocks, n)
  389. }
  390. // We special-case the cases where all values recursively are 0. This allows
  391. // for the use of a zero block to which all such values can be directed.
  392. hash := uint64(0)
  393. if n.children != nil || n.values != nil {
  394. hasher := crc64.New(crcTable)
  395. for _, c := range n.children {
  396. var v uint64
  397. if c != nil {
  398. v = b.computeOffsets(c, false)
  399. }
  400. binary.Write(hasher, binary.BigEndian, v)
  401. }
  402. binary.Write(hasher, binary.BigEndian, n.values)
  403. hash = hasher.Sum64()
  404. }
  405. if first {
  406. b.indexBlockIdx[hash] = rootBlockOffset - blockOffset
  407. }
  408. // Compacters don't apply to internal nodes.
  409. if n.children != nil {
  410. v, ok := b.indexBlockIdx[hash]
  411. if !ok {
  412. v = len(b.IndexBlocks) - blockOffset
  413. b.IndexBlocks = append(b.IndexBlocks, n)
  414. b.indexBlockIdx[hash] = v
  415. }
  416. n.index = nodeIndex{0, v}
  417. } else {
  418. h, ok := b.valueBlockIdx[hash]
  419. if !ok {
  420. bestI, bestSize := 0, blockSize*b.ValueSize
  421. for i, c := range b.Compactions[1:] {
  422. if sz, ok := c.c.Size(n.values); ok && bestSize > sz {
  423. bestI, bestSize = i+1, sz
  424. }
  425. }
  426. c := &b.Compactions[bestI]
  427. c.totalSize += bestSize
  428. v := c.c.Store(n.values)
  429. if c.maxHandle < v {
  430. c.maxHandle = v
  431. }
  432. h = nodeIndex{bestI, int(v)}
  433. b.valueBlockIdx[hash] = h
  434. }
  435. n.index = h
  436. }
  437. return hash
  438. }