pcre.go 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783
  1. // Copyright (c) 2011 Florian Weimer. All rights reserved.
  2. //
  3. // Redistribution and use in source and binary forms, with or without
  4. // modification, are permitted provided that the following conditions are
  5. // met:
  6. //
  7. // * Redistributions of source code must retain the above copyright
  8. // notice, this list of conditions and the following disclaimer.
  9. //
  10. // * Redistributions in binary form must reproduce the above copyright
  11. // notice, this list of conditions and the following disclaimer in the
  12. // documentation and/or other materials provided with the distribution.
  13. //
  14. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  15. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  16. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  17. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  18. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  19. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  20. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  21. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  22. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Package pcre provides access to the Perl Compatible Regular
// Expression library, PCRE.
  27. //
// It implements two main types, Regexp and Matcher. Regexp objects
// store a compiled regular expression. They consist of two immutable
// parts: pcre and pcre_extra. You can add pcre_extra to a compiled Regexp
// by studying it with the Study() function.
// Compilation of regular expressions using Compile or MustCompile is
// slightly expensive, so these objects should be kept and reused,
// instead of compiling them from scratch for each matching attempt.
// CompileJIT and MustCompileJIT are considerably more expensive than the
// ordinary functions, because they run Study() after the Regexp has been
// compiled, but they give much better matching performance:
// http://sljit.sourceforge.net/regex_perf.html
  39. //
// Matcher objects keep the results of a match against a []byte or
// string subject. The Group and GroupString functions provide access
// to capture groups; both versions work no matter if the subject was a
// []byte or string.
  44. //
// Matcher objects contain some temporary space and refer to the original
// subject. They are mutable and can be reused (using Match,
// MatchString, Reset or ResetString).
  48. //
// Most of the Matcher.*String methods are thin wrappers around the
// corresponding []byte methods, so keep this in mind.
  51. //
  52. // For details on the regular expression language implemented by this
  53. // package and the flags defined below, see the PCRE documentation.
  54. // http://www.pcre.org/pcre.txt
  55. package pcre
  56. /*
  57. #cgo pkg-config: libpcre
  58. #include <pcre.h>
  59. #include <string.h>
  60. */
  61. import "C"
  62. import (
  63. "fmt"
  64. "strconv"
  65. "strings"
  66. "unsafe"
  67. )
  68. // Flags for Compile and Match functions.
  69. const (
  70. ANCHORED = C.PCRE_ANCHORED
  71. BSR_ANYCRLF = C.PCRE_BSR_ANYCRLF
  72. BSR_UNICODE = C.PCRE_BSR_UNICODE
  73. NEWLINE_ANY = C.PCRE_NEWLINE_ANY
  74. NEWLINE_ANYCRLF = C.PCRE_NEWLINE_ANYCRLF
  75. NEWLINE_CR = C.PCRE_NEWLINE_CR
  76. NEWLINE_CRLF = C.PCRE_NEWLINE_CRLF
  77. NEWLINE_LF = C.PCRE_NEWLINE_LF
  78. NO_UTF8_CHECK = C.PCRE_NO_UTF8_CHECK
  79. )
  80. // Flags for Compile functions
  81. const (
  82. CASELESS = C.PCRE_CASELESS
  83. DOLLAR_ENDONLY = C.PCRE_DOLLAR_ENDONLY
  84. DOTALL = C.PCRE_DOTALL
  85. DUPNAMES = C.PCRE_DUPNAMES
  86. EXTENDED = C.PCRE_EXTENDED
  87. EXTRA = C.PCRE_EXTRA
  88. FIRSTLINE = C.PCRE_FIRSTLINE
  89. JAVASCRIPT_COMPAT = C.PCRE_JAVASCRIPT_COMPAT
  90. MULTILINE = C.PCRE_MULTILINE
  91. NO_AUTO_CAPTURE = C.PCRE_NO_AUTO_CAPTURE
  92. UNGREEDY = C.PCRE_UNGREEDY
  93. UTF8 = C.PCRE_UTF8
  94. UCP = C.PCRE_UCP
  95. )
  96. // Flags for Match functions
  97. const (
  98. NOTBOL = C.PCRE_NOTBOL
  99. NOTEOL = C.PCRE_NOTEOL
  100. NOTEMPTY = C.PCRE_NOTEMPTY
  101. NOTEMPTY_ATSTART = C.PCRE_NOTEMPTY_ATSTART
  102. NO_START_OPTIMIZE = C.PCRE_NO_START_OPTIMIZE
  103. PARTIAL_HARD = C.PCRE_PARTIAL_HARD
  104. PARTIAL_SOFT = C.PCRE_PARTIAL_SOFT
  105. )
  106. // Flags for Study function
  107. const (
  108. STUDY_JIT_COMPILE = C.PCRE_STUDY_JIT_COMPILE
  109. STUDY_JIT_PARTIAL_SOFT_COMPILE = C.PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE
  110. STUDY_JIT_PARTIAL_HARD_COMPILE = C.PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE
  111. )
  112. // Flags for Config() fuction
  113. const (
  114. CONFIG_JIT = C.PCRE_CONFIG_JIT
  115. CONFIG_JITTARGET = C.PCRE_CONFIG_JITTARGET
  116. CONFIG_LINK_SIZE = C.PCRE_CONFIG_LINK_SIZE
  117. CONFIG_MATCH_LIMIT = C.PCRE_CONFIG_MATCH_LIMIT
  118. CONFIG_MATCH_LIMIT_RECURSION = C.PCRE_CONFIG_MATCH_LIMIT_RECURSION
  119. CONFIG_NEWLINE = C.PCRE_CONFIG_NEWLINE
  120. CONFIG_BSR = C.PCRE_CONFIG_BSR
  121. CONFIG_POSIX_MALLOC_THRESHOLD = C.PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
  122. CONFIG_STACKRECURSE = C.PCRE_CONFIG_STACKRECURSE
  123. CONFIG_UTF16 = C.PCRE_CONFIG_UTF16
  124. CONFIG_UTF32 = C.PCRE_CONFIG_UTF32
  125. CONFIG_UTF8 = C.PCRE_CONFIG_UTF8
  126. CONFIG_UNICODE_PROPERTIES = C.PCRE_CONFIG_UNICODE_PROPERTIES
  127. )
  128. // Exec-time and get/set-time error codes
  129. const (
  130. ERROR_NOMATCH = C.PCRE_ERROR_NOMATCH
  131. ERROR_NULL = C.PCRE_ERROR_NULL
  132. ERROR_BADOPTION = C.PCRE_ERROR_BADOPTION
  133. ERROR_BADMAGIC = C.PCRE_ERROR_BADMAGIC
  134. ERROR_UNKNOWN_OPCODE = C.PCRE_ERROR_UNKNOWN_OPCODE
  135. ERROR_UNKNOWN_NODE = C.PCRE_ERROR_UNKNOWN_NODE
  136. ERROR_NOMEMORY = C.PCRE_ERROR_NOMEMORY
  137. ERROR_NOSUBSTRING = C.PCRE_ERROR_NOSUBSTRING
  138. ERROR_MATCHLIMIT = C.PCRE_ERROR_MATCHLIMIT
  139. ERROR_CALLOUT = C.PCRE_ERROR_CALLOUT
  140. ERROR_BADUTF8 = C.PCRE_ERROR_BADUTF8
  141. ERROR_BADUTF8_OFFSET = C.PCRE_ERROR_BADUTF8_OFFSET
  142. ERROR_PARTIAL = C.PCRE_ERROR_PARTIAL
  143. ERROR_BADPARTIAL = C.PCRE_ERROR_BADPARTIAL
  144. ERROR_RECURSIONLIMIT = C.PCRE_ERROR_RECURSIONLIMIT
  145. ERROR_INTERNAL = C.PCRE_ERROR_INTERNAL
  146. ERROR_BADCOUNT = C.PCRE_ERROR_BADCOUNT
  147. ERROR_JIT_STACKLIMIT = C.PCRE_ERROR_JIT_STACKLIMIT
  148. )
  149. // This function returns information about libpcre configuration.
  150. // Function passed flag f to C.pcre_config() func, and convert returned
  151. // vaule to string type.
  152. // http://www.pcre.org/original/doc/html/pcre_config.html
  153. func Config(f int) (r string) {
  154. if f == C.PCRE_CONFIG_JITTARGET {
  155. var jittarget *C.char
  156. C.pcre_config(C.PCRE_CONFIG_JITTARGET, unsafe.Pointer(&jittarget))
  157. r = C.GoString(jittarget)
  158. } else {
  159. var i C.int
  160. C.pcre_config(C.int(f), unsafe.Pointer(&i))
  161. r = fmt.Sprint(int32(i))
  162. }
  163. return
  164. }
  165. // This function returns string, which contains all information
  166. // you can access by pcre_config() function
  167. func ConfigAll() (ret string) {
  168. var i C.int
  169. C.pcre_config(C.PCRE_CONFIG_JIT, unsafe.Pointer(&i))
  170. ret += fmt.Sprintf("jit: %d\n", int32(i))
  171. var jittarget *C.char
  172. C.pcre_config(C.PCRE_CONFIG_JITTARGET, unsafe.Pointer(&jittarget))
  173. ret += fmt.Sprintf("jittarget: %s\n", C.GoString(jittarget))
  174. C.pcre_config(C.PCRE_CONFIG_LINK_SIZE, unsafe.Pointer(&i))
  175. ret += fmt.Sprintf("link_size: %d\n", int32(i))
  176. C.pcre_config(C.PCRE_CONFIG_MATCH_LIMIT, unsafe.Pointer(&i))
  177. ret += fmt.Sprintf("match_limit: %d\n", int32(i))
  178. C.pcre_config(C.PCRE_CONFIG_MATCH_LIMIT_RECURSION, unsafe.Pointer(&i))
  179. ret += fmt.Sprintf("match_limit_recursion: %d\n", int32(i))
  180. C.pcre_config(C.PCRE_CONFIG_NEWLINE, unsafe.Pointer(&i))
  181. ret += fmt.Sprintf("newline: %d\n", int32(i))
  182. C.pcre_config(C.PCRE_CONFIG_BSR, unsafe.Pointer(&i))
  183. ret += fmt.Sprintf("bsr: %d\n", int32(i))
  184. C.pcre_config(C.PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, unsafe.Pointer(&i))
  185. ret += fmt.Sprintf("posix_malloc_threshold: %d\n", int32(i))
  186. C.pcre_config(C.PCRE_CONFIG_STACKRECURSE, unsafe.Pointer(&i))
  187. ret += fmt.Sprintf("stackrecurse: %d\n", int32(i))
  188. C.pcre_config(C.PCRE_CONFIG_UTF16, unsafe.Pointer(&i))
  189. ret += fmt.Sprintf("utf16: %d\n", int32(i))
  190. C.pcre_config(C.PCRE_CONFIG_UTF32, unsafe.Pointer(&i))
  191. ret += fmt.Sprintf("utf32: %d\n", int32(i))
  192. C.pcre_config(C.PCRE_CONFIG_UTF8, unsafe.Pointer(&i))
  193. ret += fmt.Sprintf("utf8: %d", int32(i))
  194. C.pcre_config(C.PCRE_CONFIG_UNICODE_PROPERTIES, unsafe.Pointer(&i))
  195. ret += fmt.Sprintf("unicode_properties: %d\n", int32(i))
  196. return
  197. }
  198. // A reference to a compiled regular expression.
  199. // Use Compile or MustCompile to create such objects.
  200. type Regexp struct {
  201. ptr []byte
  202. extra []byte
  203. }
  204. // Number of bytes in the compiled pattern
  205. func pcresize(ptr *C.pcre) (size C.size_t) {
  206. C.pcre_fullinfo(ptr, nil, C.PCRE_INFO_SIZE, unsafe.Pointer(&size))
  207. return
  208. }
  209. func pcreJITsize(ptr *C.pcre, ext *C.pcre_extra) (size C.size_t) {
  210. C.pcre_fullinfo(ptr, ext, C.PCRE_INFO_JITSIZE, unsafe.Pointer(&size))
  211. return
  212. }
  213. // Number of capture groups
  214. func pcregroups(ptr *C.pcre) (count C.int) {
  215. C.pcre_fullinfo(ptr, nil,
  216. C.PCRE_INFO_CAPTURECOUNT, unsafe.Pointer(&count))
  217. return
  218. }
  219. // Returns string with regex pattern and int with fpcre flags.
  220. // Flags are specified before the regex in form like this "(?flags)regex"
  221. // Supported symbols i=CASELESS; m=MULTILINE; s=DOTALL; U=UNGREEDY; J=DUPNAMES;
  222. // x=EXTENDED; X=EXTRA; D=DOLLAR_ENDONLY; u=UTF8|UCP;
  223. func ParseFlags(ptr string) (string, int) {
  224. fReg := MustCompile("^\\(\\?[a-zA-Z]+?\\)", 0)
  225. flags := 0
  226. for fStr := fReg.FindString(ptr, 0); fStr != ""; ptr = ptr[len(fStr):] {
  227. fStr = fReg.FindString(ptr, 0)
  228. if strings.Contains(fStr, "i") {
  229. flags = flags | CASELESS
  230. }
  231. if strings.Contains(fStr, "D") {
  232. flags = flags | DOLLAR_ENDONLY
  233. }
  234. if strings.Contains(fStr, "s") {
  235. flags = flags | DOTALL
  236. }
  237. if strings.Contains(fStr, "J") {
  238. flags = flags | DUPNAMES
  239. }
  240. if strings.Contains(fStr, "x") {
  241. flags = flags | EXTENDED
  242. }
  243. if strings.Contains(fStr, "X") {
  244. flags = flags | EXTRA
  245. }
  246. if strings.Contains(fStr, "m") {
  247. flags = flags | MULTILINE
  248. }
  249. if strings.Contains(fStr, "U") {
  250. flags = flags | UNGREEDY
  251. }
  252. if strings.Contains(fStr, "u") {
  253. flags = flags | UTF8 | UCP
  254. }
  255. }
  256. return ptr, flags
  257. }
  258. // Try to compile the pattern. If an error occurs, the second return
  259. // value is non-nil.
  260. func Compile(pattern string, flags int) (Regexp, error) {
  261. patternC := C.CString(pattern)
  262. defer C.free(unsafe.Pointer(patternC))
  263. if clen := int(C.strlen(patternC)); clen != len(pattern) {
  264. return Regexp{}, fmt.Errorf("%s (%d): %s",
  265. pattern,
  266. clen,
  267. "NUL byte in pattern",
  268. )
  269. }
  270. var errptr *C.char
  271. var erroffset C.int
  272. ptr := C.pcre_compile(patternC, C.int(flags), &errptr, &erroffset, nil)
  273. if ptr == nil {
  274. return Regexp{}, fmt.Errorf("%s (%d): %s",
  275. pattern,
  276. int(erroffset),
  277. C.GoString(errptr),
  278. )
  279. }
  280. defer C.free(unsafe.Pointer(ptr))
  281. psize := pcresize(ptr)
  282. var re Regexp
  283. re.ptr = make([]byte, psize)
  284. C.memcpy(unsafe.Pointer(&re.ptr[0]), unsafe.Pointer(ptr), psize)
  285. return re, nil
  286. }
  287. // Try to parse flags of regex and compile it. If an error occurs,
  288. // the second return value is non-nil.
  289. func CompileParse(ptr string) (Regexp, error) {
  290. ptr, f := ParseFlags(ptr)
  291. retRegex, err := Compile(ptr, f)
  292. if err != nil {
  293. return Regexp{}, fmt.Errorf("can't compile/study pcre regexp: %s\nFlags:%b", ptr, f)
  294. }
  295. return retRegex, nil
  296. }
  297. // Compile pattern with jit compilation. flagC is Compile flags,
  298. // flagS is study flag.
  299. func CompileJIT(pattern string, flagsC, flagsS int) (Regexp, error) {
  300. patternC := C.CString(pattern)
  301. defer C.free(unsafe.Pointer(patternC))
  302. if clen := int(C.strlen(patternC)); clen != len(pattern) {
  303. return Regexp{}, fmt.Errorf("%s (%d): %s",
  304. pattern,
  305. clen,
  306. "NUL byte in pattern",
  307. )
  308. }
  309. var errptr *C.char
  310. var erroffset C.int
  311. ptr := C.pcre_compile(patternC, C.int(flagsC), &errptr, &erroffset, nil)
  312. if ptr == nil {
  313. return Regexp{}, fmt.Errorf("%s (%d): %s",
  314. pattern,
  315. int(erroffset),
  316. C.GoString(errptr),
  317. )
  318. }
  319. psize := pcresize(ptr)
  320. var re Regexp
  321. re.ptr = make([]byte, psize)
  322. C.memcpy(unsafe.Pointer(&re.ptr[0]), unsafe.Pointer(ptr), psize)
  323. errS := re.study(flagsS)
  324. if errS != nil {
  325. return re, fmt.Errorf("study error: %s", errS)
  326. }
  327. return re, nil
  328. }
  329. // Try to parse flags of regex and compile it with JIT optimization.
  330. // If an error occurs, the second return value is non-nil.
  331. func CompileParseJIT(ptr string, flags int) (Regexp, error) {
  332. ptr, f := ParseFlags(ptr)
  333. retRegex, err := CompileJIT(ptr, f, flags)
  334. if err != nil {
  335. return Regexp{}, fmt.Errorf("can't compile/study pcre regexp: %s\nFlags:%b\nFlagsJIT%b", ptr, f, flags)
  336. }
  337. return retRegex, nil
  338. }
  339. // Compile the pattern. If compilation fails, panic.
  340. func MustCompile(pattern string, flag int) (re Regexp) {
  341. re, err := Compile(pattern, flag)
  342. if err != nil {
  343. panic(err)
  344. }
  345. return
  346. }
  347. // CompileParse the pattern. If compilation fails, panic.
  348. func MustCompileParse(pattern string) (re Regexp) {
  349. re, err := CompileParse(pattern)
  350. if err != nil {
  351. panic(err)
  352. }
  353. return
  354. }
  355. // CompileJIT the pattern. If compilation fails, panic.
  356. func MustCompileJIT(pattern string, flagsC, flagsS int) (re Regexp) {
  357. re, err := CompileJIT(pattern, flagsC, flagsS)
  358. if err != nil {
  359. panic(err)
  360. }
  361. return
  362. }
  363. // CompileParseJIT the pattern. If compilation fails, panic.
  364. func MustCompileParseJIT(pattern string, flags int) (re Regexp) {
  365. re, err := CompileParseJIT(pattern, flags)
  366. if err != nil {
  367. panic(err)
  368. }
  369. return
  370. }
  371. // Return the start and end of the first match.
  372. func (re *Regexp) FindAllIndex(bytes []byte, flags int) (r [][]int) {
  373. m := re.Matcher(bytes, flags)
  374. offset := 0
  375. for m.Match(bytes[offset:], flags) {
  376. r = append(r, []int{offset + int(m.ovector[0]), offset + int(m.ovector[1])})
  377. offset += int(m.ovector[1])
  378. }
  379. return
  380. }
  381. // Return the start and end of the first match, or nil if no match.
  382. // loc[0] is the start and loc[1] is the end.
  383. func (re *Regexp) FindIndex(bytes []byte, flags int) []int {
  384. m := re.Matcher(bytes, flags)
  385. if m.Matches {
  386. return []int{int(m.ovector[0]), int(m.ovector[1])}
  387. }
  388. return nil
  389. }
  390. // Return the start and end of the first match, or nil if no match.
  391. // loc[0] is the start and loc[1] is the end.
  392. func (re *Regexp) FindString(s string, flags int) string {
  393. m := re.Matcher([]byte(s), flags)
  394. if m.Matches {
  395. return s[int(m.ovector[0]):int(m.ovector[1])]
  396. }
  397. return ""
  398. }
  399. // Returns the number of capture groups in the compiled regexp pattern.
  400. func (re Regexp) Groups() int {
  401. if re.ptr == nil {
  402. panic("Regexp.Groups: uninitialized")
  403. }
  404. return int(pcregroups((*C.pcre)(unsafe.Pointer(&re.ptr[0]))))
  405. }
  406. // Tries to match the speficied byte array slice to the current pattern.
  407. // Returns true if the match succeeds.
  408. func (r *Regexp) Match(subject []byte, flags int) bool {
  409. m := r.Matcher(subject, flags)
  410. return m.Matches
  411. }
  412. // Same as Match, but accept string as argument
  413. func (r *Regexp) MatchString(subject string, flags int) bool {
  414. m := r.Matcher([]byte(subject), flags)
  415. return m.Matches
  416. }
  417. // Returns a new matcher object, with the byte array slice as a
  418. // subject.
  419. func (re Regexp) Matcher(subject []byte, flags int) (m *Matcher) {
  420. m = new(Matcher)
  421. m.Reset(re, subject, flags)
  422. return
  423. }
  424. // Returns a new matcher object, with the specified subject string.
  425. func (re Regexp) MatcherString(subject string, flags int) (m *Matcher) {
  426. m = new(Matcher)
  427. m.ResetString(re, subject, flags)
  428. return
  429. }
  430. // Return a copy of a byte slice with pattern matches replaced by repl.
  431. func (re Regexp) ReplaceAll(bytes, repl []byte, flags int) []byte {
  432. m := re.Matcher(bytes, 0)
  433. r := []byte{}
  434. for m.Match(bytes, flags) {
  435. r = append(append(r, bytes[:m.ovector[0]]...), repl...)
  436. bytes = bytes[m.ovector[1]:]
  437. }
  438. return append(r, bytes...)
  439. }
  440. // Same as ReplaceAll, but accept strings as arguments
  441. func (re Regexp) ReplaceAllString(src, repl string, flags int) string {
  442. return string(re.ReplaceAll([]byte(src), []byte(repl), flags))
  443. }
  444. // Study regexp and add pcre_extra information to it, which gives huge
  445. // speed boost when matching. If an error occurs, return value is
  446. // non-nil. If flags = 0, don't study at all and return error.
  447. // Studying can be quite a heavy optimization, but it's worth it.
  448. func (re *Regexp) study(flags int) error {
  449. if re.extra != nil {
  450. return fmt.Errorf("regexp already optimized")
  451. }
  452. if flags <= 0 {
  453. return fmt.Errorf("flag must be > 0")
  454. }
  455. var err *C.char
  456. extra := C.pcre_study((*C.pcre)(unsafe.Pointer(&re.ptr[0])), C.int(flags), &err)
  457. if err != nil {
  458. return fmt.Errorf(C.GoString(err))
  459. }
  460. defer C.free(unsafe.Pointer(extra))
  461. size := pcreJITsize((*C.pcre)(unsafe.Pointer(&re.ptr[0])), extra)
  462. if size > 0 {
  463. re.extra = make([]byte, size)
  464. C.memcpy(unsafe.Pointer(&re.extra[0]), unsafe.Pointer(extra), size)
  465. return nil
  466. } else {
  467. return fmt.Errorf(C.GoString(err))
  468. }
  469. }
  470. // Matcher objects provide a place for storing match results.
  471. // They can be created by the Matcher and MatcherString functions,
  472. // or they can be initialized with Reset or ResetString.
  473. type Matcher struct {
  474. re Regexp
  475. Groups int
  476. ovector []int32 // space for capture offsets, int32 is analogfor C.int type
  477. Matches bool // last match was successful
  478. Error error // pcre_exec error from last match
  479. Partial bool // was the last match a partial match?
  480. SubjectS string // contain finded subject as string
  481. SubjectB []byte // contain finded subject as []byte
  482. }
  483. // Tries to match the speficied byte array slice to the current
  484. // pattern. Returns exec result.
  485. // C docs http://www.pcre.org/original/doc/html/pcre_exec.html
  486. func (m *Matcher) Exec(subject []byte, flags int) int {
  487. if m.re.ptr == nil {
  488. panic("Matcher.Match: uninitialized")
  489. }
  490. length := len(subject)
  491. m.SubjectS = string(subject)
  492. m.SubjectB = subject
  493. if length == 0 {
  494. subject = nullbyte // make first character adressable
  495. }
  496. subjectptr := (*C.char)(unsafe.Pointer(&subject[0]))
  497. return m.exec(subjectptr, length, flags)
  498. }
  499. // Same as Exec, but accept string as argument
  500. func (m *Matcher) ExecString(subject string, flags int) int {
  501. return m.Exec([]byte(subject), flags)
  502. }
  503. func (m *Matcher) exec(subjectptr *C.char, length, flags int) int {
  504. var extra *C.pcre_extra
  505. if m.re.extra != nil {
  506. extra = (*C.pcre_extra)(unsafe.Pointer(&m.re.extra[0]))
  507. } else {
  508. extra = nil
  509. }
  510. rc := C.pcre_exec((*C.pcre)(unsafe.Pointer(&m.re.ptr[0])), extra,
  511. subjectptr, C.int(length), 0, C.int(flags),
  512. (*C.int)(unsafe.Pointer(&m.ovector[0])), C.int(len(m.ovector)))
  513. return int(rc)
  514. }
  515. // Returns the captured string with submatches of the last match
  516. // (performed by Matcher, MatcherString, Reset, ResetString, Match,
  517. // or MatchString). Group 0 is the part of the subject which matches
  518. // the whole pattern; the first actual capture group is numbered 1.
  519. // Capture groups which are not present return a nil slice.
  520. func (m *Matcher) Extract() [][]byte {
  521. if m.Matches {
  522. captured_texts := make([][]byte, m.Groups+1)
  523. captured_texts[0] = m.SubjectB
  524. for i := 1; i < m.Groups+1; i++ {
  525. start := m.ovector[2*i]
  526. end := m.ovector[2*i+1]
  527. captured_text := m.SubjectB[start:end]
  528. captured_texts[i] = captured_text
  529. }
  530. return captured_texts
  531. } else {
  532. return nil
  533. }
  534. }
  535. // Same as Extract, but returns []string
  536. func (m *Matcher) ExtractString() []string {
  537. if m.Matches {
  538. captured_texts := make([]string, m.Groups+1)
  539. captured_texts[0] = m.SubjectS
  540. for i := 1; i < m.Groups+1; i++ {
  541. start := m.ovector[2*i]
  542. end := m.ovector[2*i+1]
  543. captured_text := m.SubjectS[start:end]
  544. captured_texts[i] = captured_text
  545. }
  546. return captured_texts
  547. } else {
  548. return nil
  549. }
  550. }
  551. func (m *Matcher) init(re Regexp) {
  552. m.Matches = false
  553. if m.re.ptr != nil && &m.re.ptr[0] == &re.ptr[0] {
  554. // Skip group count extraction if the matcher has
  555. // already been initialized with the same regular
  556. // expression.
  557. return
  558. }
  559. m.re = re
  560. m.Groups = re.Groups()
  561. if ovectorlen := 3 * (1 + m.Groups); len(m.ovector) < ovectorlen {
  562. m.ovector = make([]int32, int32(ovectorlen))
  563. }
  564. }
  565. var nullbyte = []byte{0}
  566. // Returns the numbered capture group of the last match (performed by
  567. // Matcher, MatcherString, Reset, ResetString, Match, or MatchString).
  568. // Group 0 is the part of the subject which matches the whole pattern;
  569. // the first actual capture group is numbered 1. Capture groups which
  570. // are not present return a nil slice.
  571. func (m *Matcher) Group(group int) []byte {
  572. start := m.ovector[2*group]
  573. end := m.ovector[2*group+1]
  574. if start >= 0 {
  575. return m.SubjectB[start:end]
  576. }
  577. return nil
  578. }
  579. // Returns the numbered capture group positions of the last match
  580. // (performed by Matcher, MatcherString, Reset, ResetString, Match,
  581. // or MatchString). Group 0 is the part of the subject which matches
  582. // the whole pattern; the first actual capture group is numbered 1.
  583. // Capture groups which are not present return a nil slice.
  584. func (m *Matcher) GroupIndices(group int) []int {
  585. start := m.ovector[2*group]
  586. end := m.ovector[2*group+1]
  587. if start >= 0 {
  588. return []int{int(start), int(end)}
  589. }
  590. return nil
  591. }
  592. // Same as Group, but returns string
  593. func (m *Matcher) GroupString(group int) string {
  594. start := m.ovector[2*group]
  595. end := m.ovector[2*group+1]
  596. if start >= 0 {
  597. return m.SubjectS[start:end]
  598. }
  599. return ""
  600. }
  601. // Index returns the start and end of the first match, if a previous
  602. // call to Matcher, MatcherString, Reset, ResetString, Match or
  603. // MatchString succeeded. loc[0] is the start and loc[1] is the end.
  604. func (m *Matcher) Index() []int {
  605. if !m.Matches {
  606. return nil
  607. }
  608. return []int{int(m.ovector[0]), int(m.ovector[1])}
  609. }
  610. // Tries to match the speficied byte array slice to the current
  611. // pattern. Returns true if the match succeeds.
  612. func (m *Matcher) Match(subject []byte, flags int) bool {
  613. rc := m.Exec(subject, flags)
  614. m.Matches, m.Error = checkMatch(rc)
  615. m.Partial = (rc == C.PCRE_ERROR_PARTIAL)
  616. return m.Matches
  617. }
  618. // Tries to match the speficied subject string to the current pattern.
  619. // Returns true if the match succeeds.
  620. func (m *Matcher) MatchString(subject string, flags int) bool {
  621. rc := m.ExecString(subject, flags)
  622. m.Matches, m.Error = checkMatch(rc)
  623. m.Partial = (rc == ERROR_PARTIAL)
  624. return m.Matches
  625. }
  626. func checkMatch(rc int) (bool, error) {
  627. switch {
  628. case rc >= 0 || rc == ERROR_PARTIAL:
  629. return true, nil
  630. case rc == ERROR_NOMATCH:
  631. return false, nil
  632. case rc == ERROR_NULL:
  633. return false, fmt.Errorf("%d, pcre_exec: one or more variables passed to pcre_exec == NULL", ERROR_NULL)
  634. case rc == ERROR_BADOPTION:
  635. return false, fmt.Errorf("%d, pcre_exec: An unrecognized bit was set in the options argument", ERROR_BADOPTION)
  636. case rc == ERROR_BADMAGIC:
  637. return false, fmt.Errorf("%d, pcre_exec: invalid option flag", ERROR_BADMAGIC)
  638. case rc == ERROR_UNKNOWN_OPCODE:
  639. return false, fmt.Errorf("%d, pcre_exec: an unknown item was encountered in the compiled pattern", ERROR_UNKNOWN_OPCODE)
  640. case rc == ERROR_NOMEMORY:
  641. return false, fmt.Errorf("%d, pcre_exec: match limit", ERROR_NOMEMORY)
  642. case rc == ERROR_MATCHLIMIT:
  643. return false, fmt.Errorf("%d, pcre_exec: backtracking (match) limit was reached", ERROR_MATCHLIMIT)
  644. case rc == ERROR_BADUTF8:
  645. return false, fmt.Errorf("%d, pcre_exec: string that contains an invalid UTF-8 byte sequence was passed as a subject", ERROR_BADUTF8)
  646. case rc == ERROR_RECURSIONLIMIT:
  647. return false, fmt.Errorf("%d, pcre_exec: recursion limit", ERROR_RECURSIONLIMIT)
  648. case rc == ERROR_JIT_STACKLIMIT:
  649. return false, fmt.Errorf("%d, pcre_exec: error JIT stack limit", ERROR_JIT_STACKLIMIT)
  650. case rc == ERROR_INTERNAL:
  651. panic("pcre_exec: INTERNAL ERROR")
  652. case rc == ERROR_BADCOUNT:
  653. panic("pcre_exec: INTERNAL ERROR")
  654. }
  655. panic("unexepected return code from pcre_exec: " +
  656. strconv.Itoa(int(rc)))
  657. }
  658. func (m *Matcher) name2index(name string) (group int, err error) {
  659. if m.re.ptr == nil {
  660. err = fmt.Errorf("Matcher.Named: uninitialized")
  661. return
  662. }
  663. name1 := C.CString(name)
  664. defer C.free(unsafe.Pointer(name1))
  665. group = int(C.pcre_get_stringnumber(
  666. (*C.pcre)(unsafe.Pointer(&m.re.ptr[0])), name1))
  667. if group < 0 {
  668. err = fmt.Errorf("Matcher.Named: unknown name: " + name)
  669. return
  670. }
  671. return
  672. }
  673. // Returns the value of the named capture group. This is a nil slice
  674. // if the capture group is not present. Panics if the name does not
  675. // refer to a group.
  676. func (m *Matcher) Named(group string) (g []byte, err error) {
  677. group_num, err := m.name2index(group)
  678. if err != nil {
  679. return
  680. }
  681. return m.Group(group_num), nil
  682. }
  683. // Returns true if the named capture group is present. Panics if the
  684. // name does not refer to a group.
  685. func (m *Matcher) NamedPresent(group string) (pres bool) {
  686. group_num, err := m.name2index(group)
  687. if err != nil {
  688. return false
  689. }
  690. return m.Present(group_num)
  691. }
  692. // Returns the value of the named capture group, or an empty string if
  693. // the capture group is not present. Panics if the name does not
  694. // refer to a group.
  695. func (m *Matcher) NamedString(group string) (g string, err error) {
  696. group_num, err := m.name2index(group)
  697. if err != nil {
  698. return
  699. }
  700. return m.GroupString(group_num), nil
  701. }
  702. // Returns true if the numbered capture group is present in the last
  703. // match (performed by Matcher, MatcherString, Reset, ResetString,
  704. // Match, or MatchString). Group numbers start at 1. A capture group
  705. // can be present and match the empty string.
  706. func (m *Matcher) Present(group int) bool {
  707. return m.ovector[2*group] >= 0
  708. }
  709. // Switches the matcher object to the specified pattern and subject.
  710. func (m *Matcher) Reset(re Regexp, subject []byte, flags int) {
  711. if re.ptr == nil {
  712. panic("Regexp.Matcher: uninitialized")
  713. }
  714. m.init(re)
  715. m.Match(subject, flags)
  716. }
  717. // Switches the matcher object to the specified pattern and subject
  718. // string.
  719. func (m *Matcher) ResetString(re Regexp, subject string, flags int) {
  720. if re.ptr == nil {
  721. panic("Regexp.Matcher: uninitialized")
  722. }
  723. m.init(re)
  724. m.MatchString(subject, flags)
  725. }