xurls.go 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  // Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
  // See LICENSE for licensing information

  // Package xurls extracts URLs from plain text using regular expressions.
  4. package xurls
  5. import "regexp"
  6. //go:generate go run generate/tldsgen/main.go
  7. //go:generate go run generate/regexgen/main.go
  8. const (
  9. letter = `\p{L}`
  10. mark = `\p{M}`
  11. number = `\p{N}`
  12. iriChar = letter + mark + number
  13. currency = `\p{Sc}`
  14. otherSymb = `\p{So}`
  15. endChar = iriChar + `/\-+_&~*%=#` + currency + otherSymb
  16. midChar = endChar + `@.,:;'?!|`
  17. wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)`
  18. wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]`
  19. wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}`
  20. wellAll = wellParen + `|` + wellBrack + `|` + wellBrace
  21. pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+`
  22. comScheme = `[a-zA-Z][a-zA-Z.\-+]*://`
  23. scheme = `(` + comScheme + `|` + otherScheme + `)`
  24. iri = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?`
  25. domain = `(` + iri + `\.)+`
  26. octet = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
  27. ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b`
  28. ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:`
  29. ipAddr = `(` + ipv4Addr + `|` + ipv6Addr + `)`
  30. site = domain + gtld
  31. hostName = `(` + site + `|` + ipAddr + `)`
  32. port = `(:[0-9]*)?`
  33. path = `(/|/` + pathCont + `?|\b|$)`
  34. webURL = hostName + port + path
  35. strict = `(\b` + scheme + pathCont + `)`
  36. relaxed = `(` + strict + `|` + webURL + `)`
  37. )
  38. var (
  39. // Relaxed matches all the urls it can find.
  40. Relaxed = regexp.MustCompile(relaxed)
  41. // Strict only matches urls with a scheme to avoid false positives.
  42. Strict = regexp.MustCompile(strict)
  43. )
  44. func init() {
  45. Relaxed.Longest()
  46. Strict.Longest()
  47. }
  48. // StrictMatchingScheme produces a regexp that matches urls like Strict but
  49. // whose scheme matches the given regular expression.
  50. func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
  51. strictMatching := `(\b(?i)(` + exp + `)(?-i)` + pathCont + `)`
  52. re, err := regexp.Compile(strictMatching)
  53. if err != nil {
  54. return nil, err
  55. }
  56. re.Longest()
  57. return re, nil
  58. }