ucd.go 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376
  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package ucd provides a parser for Unicode Character Database files, the
  5. // format of which is defined in http://www.unicode.org/reports/tr44/. See
  6. // http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
  7. //
  8. // It currently does not support substitutions of missing fields.
  9. package ucd // import "golang.org/x/text/internal/ucd"
  10. import (
  11. "bufio"
  12. "bytes"
  13. "errors"
  14. "io"
  15. "log"
  16. "regexp"
  17. "strconv"
  18. "strings"
  19. )
  20. // UnicodeData.txt fields.
  21. const (
  22. CodePoint = iota
  23. Name
  24. GeneralCategory
  25. CanonicalCombiningClass
  26. BidiClass
  27. DecompMapping
  28. DecimalValue
  29. DigitValue
  30. NumericValue
  31. BidiMirrored
  32. Unicode1Name
  33. ISOComment
  34. SimpleUppercaseMapping
  35. SimpleLowercaseMapping
  36. SimpleTitlecaseMapping
  37. )
  38. // Parse calls f for each entry in the given reader of a UCD file. It will close
  39. // the reader upon return. It will call log.Fatal if any error occurred.
  40. //
  41. // This implements the most common usage pattern of using Parser.
  42. func Parse(r io.ReadCloser, f func(p *Parser)) {
  43. defer r.Close()
  44. p := New(r)
  45. for p.Next() {
  46. f(p)
  47. }
  48. if err := p.Err(); err != nil {
  49. r.Close() // os.Exit will cause defers not to be called.
  50. log.Fatal(err)
  51. }
  52. }
  53. // An Option is used to configure a Parser.
  54. type Option func(p *Parser)
  55. func keepRanges(p *Parser) {
  56. p.keepRanges = true
  57. }
  58. var (
  59. // KeepRanges prevents the expansion of ranges. The raw ranges can be
  60. // obtained by calling Range(0) on the parser.
  61. KeepRanges Option = keepRanges
  62. )
  63. // The Part option register a handler for lines starting with a '@'. The text
  64. // after a '@' is available as the first field. Comments are handled as usual.
  65. func Part(f func(p *Parser)) Option {
  66. return func(p *Parser) {
  67. p.partHandler = f
  68. }
  69. }
  70. // The CommentHandler option passes comments that are on a line by itself to
  71. // a given handler.
  72. func CommentHandler(f func(s string)) Option {
  73. return func(p *Parser) {
  74. p.commentHandler = f
  75. }
  76. }
  77. // A Parser parses Unicode Character Database (UCD) files.
  78. type Parser struct {
  79. scanner *bufio.Scanner
  80. keepRanges bool // Don't expand rune ranges in field 0.
  81. err error
  82. comment []byte
  83. field [][]byte
  84. // parsedRange is needed in case Range(0) is called more than once for one
  85. // field. In some cases this requires scanning ahead.
  86. parsedRange bool
  87. rangeStart, rangeEnd rune
  88. partHandler func(p *Parser)
  89. commentHandler func(s string)
  90. }
  91. func (p *Parser) setError(err error) {
  92. if p.err == nil {
  93. p.err = err
  94. }
  95. }
  96. func (p *Parser) getField(i int) []byte {
  97. if i >= len(p.field) {
  98. return nil
  99. }
  100. return p.field[i]
  101. }
  102. // Err returns a non-nil error if any error occurred during parsing.
  103. func (p *Parser) Err() error {
  104. return p.err
  105. }
  106. // New returns a Parser for the given Reader.
  107. func New(r io.Reader, o ...Option) *Parser {
  108. p := &Parser{
  109. scanner: bufio.NewScanner(r),
  110. }
  111. for _, f := range o {
  112. f(p)
  113. }
  114. return p
  115. }
  116. // Next parses the next line in the file. It returns true if a line was parsed
  117. // and false if it reached the end of the file.
  118. func (p *Parser) Next() bool {
  119. if !p.keepRanges && p.rangeStart < p.rangeEnd {
  120. p.rangeStart++
  121. return true
  122. }
  123. p.comment = nil
  124. p.field = p.field[:0]
  125. p.parsedRange = false
  126. for p.scanner.Scan() {
  127. b := p.scanner.Bytes()
  128. if len(b) == 0 {
  129. continue
  130. }
  131. if b[0] == '#' {
  132. if p.commentHandler != nil {
  133. p.commentHandler(strings.TrimSpace(string(b[1:])))
  134. }
  135. continue
  136. }
  137. // Parse line
  138. if i := bytes.IndexByte(b, '#'); i != -1 {
  139. p.comment = bytes.TrimSpace(b[i+1:])
  140. b = b[:i]
  141. }
  142. if b[0] == '@' {
  143. if p.partHandler != nil {
  144. p.field = append(p.field, bytes.TrimSpace(b[1:]))
  145. p.partHandler(p)
  146. p.field = p.field[:0]
  147. }
  148. p.comment = nil
  149. continue
  150. }
  151. for {
  152. i := bytes.IndexByte(b, ';')
  153. if i == -1 {
  154. p.field = append(p.field, bytes.TrimSpace(b))
  155. break
  156. }
  157. p.field = append(p.field, bytes.TrimSpace(b[:i]))
  158. b = b[i+1:]
  159. }
  160. if !p.keepRanges {
  161. p.rangeStart, p.rangeEnd = p.getRange(0)
  162. }
  163. return true
  164. }
  165. p.setError(p.scanner.Err())
  166. return false
  167. }
  168. func parseRune(b []byte) (rune, error) {
  169. if len(b) > 2 && b[0] == 'U' && b[1] == '+' {
  170. b = b[2:]
  171. }
  172. x, err := strconv.ParseUint(string(b), 16, 32)
  173. return rune(x), err
  174. }
  175. func (p *Parser) parseRune(b []byte) rune {
  176. x, err := parseRune(b)
  177. p.setError(err)
  178. return x
  179. }
  180. // Rune parses and returns field i as a rune.
  181. func (p *Parser) Rune(i int) rune {
  182. if i > 0 || p.keepRanges {
  183. return p.parseRune(p.getField(i))
  184. }
  185. return p.rangeStart
  186. }
  187. // Runes interprets and returns field i as a sequence of runes.
  188. func (p *Parser) Runes(i int) (runes []rune) {
  189. add := func(b []byte) {
  190. if b = bytes.TrimSpace(b); len(b) > 0 {
  191. runes = append(runes, p.parseRune(b))
  192. }
  193. }
  194. for b := p.getField(i); ; {
  195. i := bytes.IndexByte(b, ' ')
  196. if i == -1 {
  197. add(b)
  198. break
  199. }
  200. add(b[:i])
  201. b = b[i+1:]
  202. }
  203. return
  204. }
  205. var (
  206. errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")
  207. // reRange matches one line of a legacy rune range.
  208. reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$")
  209. )
  210. // Range parses and returns field i as a rune range. A range is inclusive at
  211. // both ends. If the field only has one rune, first and last will be identical.
  212. // It supports the legacy format for ranges used in UnicodeData.txt.
  213. func (p *Parser) Range(i int) (first, last rune) {
  214. if !p.keepRanges {
  215. return p.rangeStart, p.rangeStart
  216. }
  217. return p.getRange(i)
  218. }
  219. func (p *Parser) getRange(i int) (first, last rune) {
  220. b := p.getField(i)
  221. if k := bytes.Index(b, []byte("..")); k != -1 {
  222. return p.parseRune(b[:k]), p.parseRune(b[k+2:])
  223. }
  224. // The first field may not be a rune, in which case we may ignore any error
  225. // and set the range as 0..0.
  226. x, err := parseRune(b)
  227. if err != nil {
  228. // Disable range parsing henceforth. This ensures that an error will be
  229. // returned if the user subsequently will try to parse this field as
  230. // a Rune.
  231. p.keepRanges = true
  232. }
  233. // Special case for UnicodeData that was retained for backwards compatibility.
  234. if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) {
  235. if p.parsedRange {
  236. return p.rangeStart, p.rangeEnd
  237. }
  238. mf := reRange.FindStringSubmatch(p.scanner.Text())
  239. if mf == nil || !p.scanner.Scan() {
  240. p.setError(errIncorrectLegacyRange)
  241. return x, x
  242. }
  243. // Using Bytes would be more efficient here, but Text is a lot easier
  244. // and this is not a frequent case.
  245. ml := reRange.FindStringSubmatch(p.scanner.Text())
  246. if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
  247. p.setError(errIncorrectLegacyRange)
  248. return x, x
  249. }
  250. p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])])
  251. p.parsedRange = true
  252. return p.rangeStart, p.rangeEnd
  253. }
  254. return x, x
  255. }
  256. // bools recognizes all valid UCD boolean values.
  257. var bools = map[string]bool{
  258. "": false,
  259. "N": false,
  260. "No": false,
  261. "F": false,
  262. "False": false,
  263. "Y": true,
  264. "Yes": true,
  265. "T": true,
  266. "True": true,
  267. }
  268. // Bool parses and returns field i as a boolean value.
  269. func (p *Parser) Bool(i int) bool {
  270. b := p.getField(i)
  271. for s, v := range bools {
  272. if bstrEq(b, s) {
  273. return v
  274. }
  275. }
  276. p.setError(strconv.ErrSyntax)
  277. return false
  278. }
  279. // Int parses and returns field i as an integer value.
  280. func (p *Parser) Int(i int) int {
  281. x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
  282. p.setError(err)
  283. return int(x)
  284. }
  285. // Uint parses and returns field i as an unsigned integer value.
  286. func (p *Parser) Uint(i int) uint {
  287. x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
  288. p.setError(err)
  289. return uint(x)
  290. }
  291. // Float parses and returns field i as a decimal value.
  292. func (p *Parser) Float(i int) float64 {
  293. x, err := strconv.ParseFloat(string(p.getField(i)), 64)
  294. p.setError(err)
  295. return x
  296. }
  297. // String parses and returns field i as a string value.
  298. func (p *Parser) String(i int) string {
  299. return string(p.getField(i))
  300. }
  301. // Strings parses and returns field i as a space-separated list of strings.
  302. func (p *Parser) Strings(i int) []string {
  303. ss := strings.Split(string(p.getField(i)), " ")
  304. for i, s := range ss {
  305. ss[i] = strings.TrimSpace(s)
  306. }
  307. return ss
  308. }
  309. // Comment returns the comments for the current line.
  310. func (p *Parser) Comment() string {
  311. return string(p.comment)
  312. }
  313. var errUndefinedEnum = errors.New("ucd: undefined enum value")
  314. // Enum interprets and returns field i as a value that must be one of the values
  315. // in enum.
  316. func (p *Parser) Enum(i int, enum ...string) string {
  317. b := p.getField(i)
  318. for _, s := range enum {
  319. if bstrEq(b, s) {
  320. return s
  321. }
  322. }
  323. p.setError(errUndefinedEnum)
  324. return ""
  325. }
  326. func bstrEq(b []byte, s string) bool {
  327. if len(b) != len(s) {
  328. return false
  329. }
  330. for i, c := range b {
  331. if c != s[i] {
  332. return false
  333. }
  334. }
  335. return true
  336. }