123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376 |
- // Copyright 2014 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // Package ucd provides a parser for Unicode Character Database files, the
- // format of which is defined in http://www.unicode.org/reports/tr44/. See
- // http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
- //
- // It currently does not support substitutions of missing fields.
- package ucd // import "golang.org/x/text/internal/ucd"
- import (
- "bufio"
- "bytes"
- "errors"
- "io"
- "log"
- "regexp"
- "strconv"
- "strings"
- )
// Field indices for the semicolon-separated columns of UnicodeData.txt.
// They are intended to be passed as the field argument of the Parser
// accessor methods.
const (
	CodePoint               = iota // field 0
	Name                           // field 1
	GeneralCategory                // field 2
	CanonicalCombiningClass        // field 3
	BidiClass                      // field 4
	DecompMapping                  // field 5
	DecimalValue                   // field 6
	DigitValue                     // field 7
	NumericValue                   // field 8
	BidiMirrored                   // field 9
	Unicode1Name                   // field 10
	ISOComment                     // field 11
	SimpleUppercaseMapping         // field 12
	SimpleLowercaseMapping         // field 13
	SimpleTitlecaseMapping         // field 14
)
- // Parse calls f for each entry in the given reader of a UCD file. It will close
- // the reader upon return. It will call log.Fatal if any error occurred.
- //
- // This implements the most common usage pattern of using Parser.
- func Parse(r io.ReadCloser, f func(p *Parser)) {
- defer r.Close()
- p := New(r)
- for p.Next() {
- f(p)
- }
- if err := p.Err(); err != nil {
- r.Close() // os.Exit will cause defers not to be called.
- log.Fatal(err)
- }
- }
- // An Option is used to configure a Parser.
- type Option func(p *Parser)
- func keepRanges(p *Parser) {
- p.keepRanges = true
- }
- var (
- // KeepRanges prevents the expansion of ranges. The raw ranges can be
- // obtained by calling Range(0) on the parser.
- KeepRanges Option = keepRanges
- )
- // The Part option register a handler for lines starting with a '@'. The text
- // after a '@' is available as the first field. Comments are handled as usual.
- func Part(f func(p *Parser)) Option {
- return func(p *Parser) {
- p.partHandler = f
- }
- }
- // The CommentHandler option passes comments that are on a line by itself to
- // a given handler.
- func CommentHandler(f func(s string)) Option {
- return func(p *Parser) {
- p.commentHandler = f
- }
- }
// Parser reads Unicode Character Database (UCD) files one entry at a time.
// Create one with New, advance it with Next, and read the fields of the
// current entry through the accessor methods.
type Parser struct {
	// scanner supplies the input line by line.
	scanner *bufio.Scanner

	// keepRanges, when set, leaves rune ranges in field 0 unexpanded.
	keepRanges bool

	// err holds the first error that was recorded, if any.
	err error

	// comment is the trailing comment of the current line.
	comment []byte
	// field holds the trimmed, semicolon-separated fields of the current line.
	field [][]byte

	// parsedRange records that a legacy First/Last range has already been
	// resolved for the current entry. Resolving it requires scanning ahead,
	// which must not be repeated when Range(0) is called more than once.
	parsedRange bool
	// rangeStart and rangeEnd delimit the (inclusive) range of the current
	// entry; rangeStart doubles as the current rune while a range is expanded.
	rangeStart rune
	rangeEnd   rune

	// Optional callbacks installed through the Part and CommentHandler options.
	partHandler    func(p *Parser)
	commentHandler func(s string)
}
- func (p *Parser) setError(err error) {
- if p.err == nil {
- p.err = err
- }
- }
- func (p *Parser) getField(i int) []byte {
- if i >= len(p.field) {
- return nil
- }
- return p.field[i]
- }
- // Err returns a non-nil error if any error occurred during parsing.
- func (p *Parser) Err() error {
- return p.err
- }
- // New returns a Parser for the given Reader.
- func New(r io.Reader, o ...Option) *Parser {
- p := &Parser{
- scanner: bufio.NewScanner(r),
- }
- for _, f := range o {
- f(p)
- }
- return p
- }
- // Next parses the next line in the file. It returns true if a line was parsed
- // and false if it reached the end of the file.
- func (p *Parser) Next() bool {
- if !p.keepRanges && p.rangeStart < p.rangeEnd {
- p.rangeStart++
- return true
- }
- p.comment = nil
- p.field = p.field[:0]
- p.parsedRange = false
- for p.scanner.Scan() {
- b := p.scanner.Bytes()
- if len(b) == 0 {
- continue
- }
- if b[0] == '#' {
- if p.commentHandler != nil {
- p.commentHandler(strings.TrimSpace(string(b[1:])))
- }
- continue
- }
- // Parse line
- if i := bytes.IndexByte(b, '#'); i != -1 {
- p.comment = bytes.TrimSpace(b[i+1:])
- b = b[:i]
- }
- if b[0] == '@' {
- if p.partHandler != nil {
- p.field = append(p.field, bytes.TrimSpace(b[1:]))
- p.partHandler(p)
- p.field = p.field[:0]
- }
- p.comment = nil
- continue
- }
- for {
- i := bytes.IndexByte(b, ';')
- if i == -1 {
- p.field = append(p.field, bytes.TrimSpace(b))
- break
- }
- p.field = append(p.field, bytes.TrimSpace(b[:i]))
- b = b[i+1:]
- }
- if !p.keepRanges {
- p.rangeStart, p.rangeEnd = p.getRange(0)
- }
- return true
- }
- p.setError(p.scanner.Err())
- return false
- }
// parseRune interprets b as a hexadecimal code point, optionally preceded by
// "U+". It returns the error from strconv.ParseUint (32-bit, base 16) if b
// is not a valid number.
func parseRune(b []byte) (rune, error) {
	digits := b
	// The prefix is only stripped when at least one byte follows it.
	if len(b) > 2 && bytes.HasPrefix(b, []byte("U+")) {
		digits = b[2:]
	}
	v, err := strconv.ParseUint(string(digits), 16, 32)
	return rune(v), err
}
- func (p *Parser) parseRune(b []byte) rune {
- x, err := parseRune(b)
- p.setError(err)
- return x
- }
- // Rune parses and returns field i as a rune.
- func (p *Parser) Rune(i int) rune {
- if i > 0 || p.keepRanges {
- return p.parseRune(p.getField(i))
- }
- return p.rangeStart
- }
- // Runes interprets and returns field i as a sequence of runes.
- func (p *Parser) Runes(i int) (runes []rune) {
- add := func(b []byte) {
- if b = bytes.TrimSpace(b); len(b) > 0 {
- runes = append(runes, p.parseRune(b))
- }
- }
- for b := p.getField(i); ; {
- i := bytes.IndexByte(b, ' ')
- if i == -1 {
- add(b)
- break
- }
- add(b[:i])
- b = b[i+1:]
- }
- return
- }
// errIncorrectLegacyRange is recorded when a "<..., First>" line is not
// followed by a matching "<..., Last>" line.
var errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")

// reRange matches a single line of a legacy rune range. Its submatches are
// the code point, the text before ", " inside the angle brackets, the text
// after it (e.g. "First" or "Last"), and the remainder of the line.
var reRange = regexp.MustCompile(`^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$`)
- // Range parses and returns field i as a rune range. A range is inclusive at
- // both ends. If the field only has one rune, first and last will be identical.
- // It supports the legacy format for ranges used in UnicodeData.txt.
- func (p *Parser) Range(i int) (first, last rune) {
- if !p.keepRanges {
- return p.rangeStart, p.rangeStart
- }
- return p.getRange(i)
- }
- func (p *Parser) getRange(i int) (first, last rune) {
- b := p.getField(i)
- if k := bytes.Index(b, []byte("..")); k != -1 {
- return p.parseRune(b[:k]), p.parseRune(b[k+2:])
- }
- // The first field may not be a rune, in which case we may ignore any error
- // and set the range as 0..0.
- x, err := parseRune(b)
- if err != nil {
- // Disable range parsing henceforth. This ensures that an error will be
- // returned if the user subsequently will try to parse this field as
- // a Rune.
- p.keepRanges = true
- }
- // Special case for UnicodeData that was retained for backwards compatibility.
- if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) {
- if p.parsedRange {
- return p.rangeStart, p.rangeEnd
- }
- mf := reRange.FindStringSubmatch(p.scanner.Text())
- if mf == nil || !p.scanner.Scan() {
- p.setError(errIncorrectLegacyRange)
- return x, x
- }
- // Using Bytes would be more efficient here, but Text is a lot easier
- // and this is not a frequent case.
- ml := reRange.FindStringSubmatch(p.scanner.Text())
- if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
- p.setError(errIncorrectLegacyRange)
- return x, x
- }
- p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])])
- p.parsedRange = true
- return p.rangeStart, p.rangeEnd
- }
- return x, x
- }
// bools maps every spelling that is accepted as a UCD boolean value to the
// boolean it stands for. The empty string counts as false.
var bools = map[string]bool{
	// Spellings of true.
	"Y":    true,
	"Yes":  true,
	"T":    true,
	"True": true,

	// Spellings of false.
	"":      false,
	"N":     false,
	"No":    false,
	"F":     false,
	"False": false,
}
- // Bool parses and returns field i as a boolean value.
- func (p *Parser) Bool(i int) bool {
- b := p.getField(i)
- for s, v := range bools {
- if bstrEq(b, s) {
- return v
- }
- }
- p.setError(strconv.ErrSyntax)
- return false
- }
- // Int parses and returns field i as an integer value.
- func (p *Parser) Int(i int) int {
- x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
- p.setError(err)
- return int(x)
- }
- // Uint parses and returns field i as an unsigned integer value.
- func (p *Parser) Uint(i int) uint {
- x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
- p.setError(err)
- return uint(x)
- }
- // Float parses and returns field i as a decimal value.
- func (p *Parser) Float(i int) float64 {
- x, err := strconv.ParseFloat(string(p.getField(i)), 64)
- p.setError(err)
- return x
- }
- // String parses and returns field i as a string value.
- func (p *Parser) String(i int) string {
- return string(p.getField(i))
- }
- // Strings parses and returns field i as a space-separated list of strings.
- func (p *Parser) Strings(i int) []string {
- ss := strings.Split(string(p.getField(i)), " ")
- for i, s := range ss {
- ss[i] = strings.TrimSpace(s)
- }
- return ss
- }
- // Comment returns the comments for the current line.
- func (p *Parser) Comment() string {
- return string(p.comment)
- }
// errUndefinedEnum is recorded by Enum when a field holds none of the
// permitted values.
var (
	errUndefinedEnum = errors.New("ucd: undefined enum value")
)
- // Enum interprets and returns field i as a value that must be one of the values
- // in enum.
- func (p *Parser) Enum(i int, enum ...string) string {
- b := p.getField(i)
- for _, s := range enum {
- if bstrEq(b, s) {
- return s
- }
- }
- p.setError(errUndefinedEnum)
- return ""
- }
// bstrEq reports whether the byte slice b and the string s hold exactly the
// same bytes. A nil or empty b is equal to the empty string.
func bstrEq(b []byte, s string) bool {
	// The compiler recognizes a string(b) conversion that is used only for a
	// comparison and performs it without allocating, so the built-in operator
	// replaces a hand-written byte loop at no cost.
	return string(b) == s
}
|