bidirule.go 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package bidirule implements the Bidi Rule defined by RFC 5893.
  5. //
  6. // This package is under development. The API may change without notice and
  7. // without preserving backward compatibility.
  8. package bidirule
  9. import (
  10. "errors"
  11. "unicode/utf8"
  12. "golang.org/x/text/transform"
  13. "golang.org/x/text/unicode/bidi"
  14. )
  15. // This file contains an implementation of RFC 5893: Right-to-Left Scripts for
  16. // Internationalized Domain Names for Applications (IDNA)
  17. //
  18. // A label is an individual component of a domain name. Labels are usually
  19. // shown separated by dots; for example, the domain name "www.example.com" is
  20. // composed of three labels: "www", "example", and "com".
  21. //
  22. // An RTL label is a label that contains at least one character of class R, AL,
  23. // or AN. An LTR label is any label that is not an RTL label.
  24. //
  25. // A "Bidi domain name" is a domain name that contains at least one RTL label.
  26. //
  27. // The following guarantees can be made based on the above:
  28. //
  29. // o In a domain name consisting of only labels that satisfy the rule,
  30. // the requirements of Section 3 are satisfied. Note that even LTR
  31. // labels and pure ASCII labels have to be tested.
  32. //
  33. // o In a domain name consisting of only LDH labels (as defined in the
  34. // Definitions document [RFC5890]) and labels that satisfy the rule,
  35. // the requirements of Section 3 are satisfied as long as a label
  36. // that starts with an ASCII digit does not come after a
  37. // right-to-left label.
  38. //
  39. // No guarantee is given for other combinations.
  40. // ErrInvalid indicates a label is invalid according to the Bidi Rule.
  41. var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
  42. type ruleState uint8
  43. const (
  44. ruleInitial ruleState = iota
  45. ruleLTR
  46. ruleLTRFinal
  47. ruleRTL
  48. ruleRTLFinal
  49. ruleInvalid
  50. )
  51. type ruleTransition struct {
  52. next ruleState
  53. mask uint16
  54. }
  55. var transitions = [...][2]ruleTransition{
  56. // [2.1] The first character must be a character with Bidi property L, R, or
  57. // AL. If it has the R or AL property, it is an RTL label; if it has the L
  58. // property, it is an LTR label.
  59. ruleInitial: {
  60. {ruleLTRFinal, 1 << bidi.L},
  61. {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
  62. },
  63. ruleRTL: {
  64. // [2.3] In an RTL label, the end of the label must be a character with
  65. // Bidi property R, AL, EN, or AN, followed by zero or more characters
  66. // with Bidi property NSM.
  67. {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
  68. // [2.2] In an RTL label, only characters with the Bidi properties R,
  69. // AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
  70. // We exclude the entries from [2.3]
  71. {ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
  72. },
  73. ruleRTLFinal: {
  74. // [2.3] In an RTL label, the end of the label must be a character with
  75. // Bidi property R, AL, EN, or AN, followed by zero or more characters
  76. // with Bidi property NSM.
  77. {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
  78. // [2.2] In an RTL label, only characters with the Bidi properties R,
  79. // AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
  80. // We exclude the entries from [2.3] and NSM.
  81. {ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
  82. },
  83. ruleLTR: {
  84. // [2.6] In an LTR label, the end of the label must be a character with
  85. // Bidi property L or EN, followed by zero or more characters with Bidi
  86. // property NSM.
  87. {ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
  88. // [2.5] In an LTR label, only characters with the Bidi properties L,
  89. // EN, ES, CS, ET, ON, BN, or NSM are allowed.
  90. // We exclude the entries from [2.6].
  91. {ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
  92. },
  93. ruleLTRFinal: {
  94. // [2.6] In an LTR label, the end of the label must be a character with
  95. // Bidi property L or EN, followed by zero or more characters with Bidi
  96. // property NSM.
  97. {ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
  98. // [2.5] In an LTR label, only characters with the Bidi properties L,
  99. // EN, ES, CS, ET, ON, BN, or NSM are allowed.
  100. // We exclude the entries from [2.6].
  101. {ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
  102. },
  103. ruleInvalid: {
  104. {ruleInvalid, 0},
  105. {ruleInvalid, 0},
  106. },
  107. }
  108. // [2.4] In an RTL label, if an EN is present, no AN may be present, and
  109. // vice versa.
  110. const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
  111. // From RFC 5893
  112. // An RTL label is a label that contains at least one character of type
  113. // R, AL, or AN.
  114. //
  115. // An LTR label is any label that is not an RTL label.
  116. // Direction reports the direction of the given label as defined by RFC 5893.
  117. // The Bidi Rule does not have to be applied to labels of the category
  118. // LeftToRight.
  119. func Direction(b []byte) bidi.Direction {
  120. for i := 0; i < len(b); {
  121. e, sz := bidi.Lookup(b[i:])
  122. if sz == 0 {
  123. i++
  124. }
  125. c := e.Class()
  126. if c == bidi.R || c == bidi.AL || c == bidi.AN {
  127. return bidi.RightToLeft
  128. }
  129. i += sz
  130. }
  131. return bidi.LeftToRight
  132. }
  133. // DirectionString reports the direction of the given label as defined by RFC
  134. // 5893. The Bidi Rule does not have to be applied to labels of the category
  135. // LeftToRight.
  136. func DirectionString(s string) bidi.Direction {
  137. for i := 0; i < len(s); {
  138. e, sz := bidi.LookupString(s[i:])
  139. if sz == 0 {
  140. i++
  141. }
  142. c := e.Class()
  143. if c == bidi.R || c == bidi.AL || c == bidi.AN {
  144. return bidi.RightToLeft
  145. }
  146. i += sz
  147. }
  148. return bidi.LeftToRight
  149. }
  150. // Valid reports whether b conforms to the BiDi rule.
  151. func Valid(b []byte) bool {
  152. var t Transformer
  153. if n, ok := t.advance(b); !ok || n < len(b) {
  154. return false
  155. }
  156. return t.isFinal()
  157. }
  158. // ValidString reports whether s conforms to the BiDi rule.
  159. func ValidString(s string) bool {
  160. var t Transformer
  161. if n, ok := t.advanceString(s); !ok || n < len(s) {
  162. return false
  163. }
  164. return t.isFinal()
  165. }
  166. // New returns a Transformer that verifies that input adheres to the Bidi Rule.
  167. func New() *Transformer {
  168. return &Transformer{}
  169. }
  170. // Transformer implements transform.Transform.
  171. type Transformer struct {
  172. state ruleState
  173. hasRTL bool
  174. seen uint16
  175. }
  176. // A rule can only be violated for "Bidi Domain names", meaning if one of the
  177. // following categories has been observed.
  178. func (t *Transformer) isRTL() bool {
  179. const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
  180. return t.seen&isRTL != 0
  181. }
  182. func (t *Transformer) isFinal() bool {
  183. if !t.isRTL() {
  184. return true
  185. }
  186. return t.state == ruleLTRFinal || t.state == ruleRTLFinal || t.state == ruleInitial
  187. }
  188. // Reset implements transform.Transformer.
  189. func (t *Transformer) Reset() { *t = Transformer{} }
  190. // Transform implements transform.Transformer. This Transformer has state and
  191. // needs to be reset between uses.
  192. func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  193. if len(dst) < len(src) {
  194. src = src[:len(dst)]
  195. atEOF = false
  196. err = transform.ErrShortDst
  197. }
  198. n, err1 := t.Span(src, atEOF)
  199. copy(dst, src[:n])
  200. if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
  201. err = err1
  202. }
  203. return n, n, err
  204. }
  205. // Span returns the first n bytes of src that conform to the Bidi rule.
  206. func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
  207. if t.state == ruleInvalid && t.isRTL() {
  208. return 0, ErrInvalid
  209. }
  210. n, ok := t.advance(src)
  211. switch {
  212. case !ok:
  213. err = ErrInvalid
  214. case n < len(src):
  215. if !atEOF {
  216. err = transform.ErrShortSrc
  217. break
  218. }
  219. err = ErrInvalid
  220. case !t.isFinal():
  221. err = ErrInvalid
  222. }
  223. return n, err
  224. }
  225. // Precomputing the ASCII values decreases running time for the ASCII fast path
  226. // by about 30%.
  227. var asciiTable [128]bidi.Properties
  228. func init() {
  229. for i := range asciiTable {
  230. p, _ := bidi.LookupRune(rune(i))
  231. asciiTable[i] = p
  232. }
  233. }
  234. func (t *Transformer) advance(s []byte) (n int, ok bool) {
  235. var e bidi.Properties
  236. var sz int
  237. for n < len(s) {
  238. if s[n] < utf8.RuneSelf {
  239. e, sz = asciiTable[s[n]], 1
  240. } else {
  241. e, sz = bidi.Lookup(s[n:])
  242. if sz <= 1 {
  243. if sz == 1 {
  244. // We always consider invalid UTF-8 to be invalid, even if
  245. // the string has not yet been determined to be RTL.
  246. // TODO: is this correct?
  247. return n, false
  248. }
  249. return n, true // incomplete UTF-8 encoding
  250. }
  251. }
  252. // TODO: using CompactClass would result in noticeable speedup.
  253. // See unicode/bidi/prop.go:Properties.CompactClass.
  254. c := uint16(1 << e.Class())
  255. t.seen |= c
  256. if t.seen&exclusiveRTL == exclusiveRTL {
  257. t.state = ruleInvalid
  258. return n, false
  259. }
  260. switch tr := transitions[t.state]; {
  261. case tr[0].mask&c != 0:
  262. t.state = tr[0].next
  263. case tr[1].mask&c != 0:
  264. t.state = tr[1].next
  265. default:
  266. t.state = ruleInvalid
  267. if t.isRTL() {
  268. return n, false
  269. }
  270. }
  271. n += sz
  272. }
  273. return n, true
  274. }
  275. func (t *Transformer) advanceString(s string) (n int, ok bool) {
  276. var e bidi.Properties
  277. var sz int
  278. for n < len(s) {
  279. if s[n] < utf8.RuneSelf {
  280. e, sz = asciiTable[s[n]], 1
  281. } else {
  282. e, sz = bidi.LookupString(s[n:])
  283. if sz <= 1 {
  284. if sz == 1 {
  285. return n, false // invalid UTF-8
  286. }
  287. return n, true // incomplete UTF-8 encoding
  288. }
  289. }
  290. // TODO: using CompactClass results in noticeable speedup.
  291. // See unicode/bidi/prop.go:Properties.CompactClass.
  292. c := uint16(1 << e.Class())
  293. t.seen |= c
  294. if t.seen&exclusiveRTL == exclusiveRTL {
  295. t.state = ruleInvalid
  296. return n, false
  297. }
  298. switch tr := transitions[t.state]; {
  299. case tr[0].mask&c != 0:
  300. t.state = tr[0].next
  301. case tr[1].mask&c != 0:
  302. t.state = tr[1].next
  303. default:
  304. t.state = ruleInvalid
  305. if t.isRTL() {
  306. return n, false
  307. }
  308. }
  309. n += sz
  310. }
  311. return n, true
  312. }