sanitize.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547
  1. // Copyright (c) 2014, David Kitchen <david@buro9.com>
  2. //
  3. // All rights reserved.
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are met:
  7. //
  8. // * Redistributions of source code must retain the above copyright notice, this
  9. // list of conditions and the following disclaimer.
  10. //
  11. // * Redistributions in binary form must reproduce the above copyright notice,
  12. // this list of conditions and the following disclaimer in the documentation
  13. // and/or other materials provided with the distribution.
  14. //
  15. // * Neither the name of the organisation (Microcosm) nor the names of its
  16. // contributors may be used to endorse or promote products derived from
  17. // this software without specific prior written permission.
  18. //
  19. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  20. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22. // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  23. // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24. // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  25. // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  26. // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  27. // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. package bluemonday
  30. import (
  31. "bytes"
  32. "io"
  33. "net/url"
  34. "strings"
  35. "golang.org/x/net/html"
  36. )
  37. // Sanitize takes a string that contains a HTML fragment or document and applies
  38. // the given policy whitelist.
  39. //
  40. // It returns a HTML string that has been sanitized by the policy or an empty
  41. // string if an error has occurred (most likely as a consequence of extremely
  42. // malformed input)
  43. func (p *Policy) Sanitize(s string) string {
  44. if strings.TrimSpace(s) == "" {
  45. return s
  46. }
  47. return p.sanitize(strings.NewReader(s)).String()
  48. }
  49. // SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
  50. // the given policy whitelist.
  51. //
  52. // It returns a []byte containing the HTML that has been sanitized by the policy
  53. // or an empty []byte if an error has occurred (most likely as a consequence of
  54. // extremely malformed input)
  55. func (p *Policy) SanitizeBytes(b []byte) []byte {
  56. if len(bytes.TrimSpace(b)) == 0 {
  57. return b
  58. }
  59. return p.sanitize(bytes.NewReader(b)).Bytes()
  60. }
  61. // SanitizeReader takes an io.Reader that contains a HTML fragment or document
  62. // and applies the given policy whitelist.
  63. //
  64. // It returns a bytes.Buffer containing the HTML that has been sanitized by the
  65. // policy. Errors during sanitization will merely return an empty result.
  66. func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
  67. return p.sanitize(r)
  68. }
// sanitize performs the actual sanitization process: it tokenizes the HTML
// read from r and writes only policy-approved tokens into the returned
// buffer. Any tokenizer error other than io.EOF yields an empty buffer.
func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	var (
		// buff accumulates the sanitized output.
		buff bytes.Buffer
		// skipElementContent is true while we are inside an element whose
		// entire content must be dropped (e.g. a disallowed <script>).
		skipElementContent bool
		// skippingElementsCount tracks the nesting depth of content-skipped
		// elements so skipElementContent is only cleared at the outermost one.
		skippingElementsCount int64
		// skipClosingTag is true when a start tag was dropped (allowed
		// element but no surviving attributes) and its matching end tag
		// must be dropped too.
		skipClosingTag bool
		// closingTagToSkipStack holds the element names whose closing tags
		// are pending removal, innermost last.
		closingTagToSkipStack []string
		// mostRecentlyStartedToken remembers the last opened element so
		// that text inside allowed <script>/<style> is not HTML-escaped.
		mostRecentlyStartedToken string
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return &buff
			}

			// Raw tokenizer error: return an empty result rather than
			// partially sanitized output.
			return &bytes.Buffer{}
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:
			// DocType is not handled as there is no safe parsing mechanism
			// provided by golang.org/x/net/html for the content, and this can
			// be misused to insert HTML tags that are not then sanitized
			//
			// One might wish to recursively sanitize here using the same policy
			// but I will need to do some further testing before considering
			// this.

		case html.CommentToken:
			// Comments are ignored by default

		case html.StartTagToken:
			mostRecentlyStartedToken = token.Data

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				// Element is not whitelisted at all; if it is one whose
				// content must also be suppressed, start (or deepen) the
				// content-skipping state.
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skipElementContent = true
					skippingElementsCount++
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				// All attributes were stripped and the element is not
				// allowed to appear bare: drop this tag and remember to
				// drop its closing tag as well.
				if !p.allowNoAttrs(token.Data) {
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.EndTagToken:
			if mostRecentlyStartedToken == token.Data {
				mostRecentlyStartedToken = ""
			}

			// Matching close for a start tag we previously dropped?
			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				// Closing a non-whitelisted element; if it was one whose
				// content we were skipping, unwind one level of skipping.
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.SelfClosingTagToken:
			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			// Self-closing tags have no closing tag to track, so a bare
			// disallowed element is simply dropped here.
			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.TextToken:
			if !skipElementContent {
				switch mostRecentlyStartedToken {
				case "script":
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				case "style":
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				default:
					// HTML escape the text
					buff.WriteString(token.String())
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return &bytes.Buffer{}
		}
	}
}
// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute returning a set
// of html.Attributes that match the policies.
//
// Beyond whitelisting, it also (when the policy asks for it) validates and
// normalizes URL-bearing attributes, and injects rel="nofollow",
// target="_blank" and rel="noopener" on links.
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	// Builds a new attribute slice based on the whether the attribute has been
	// whitelisted explicitly or globally.
	cleanAttrs := []html.Attribute{}
	for _, htmlAttr := range attrs {
		// Is there an element specific attribute policy that applies?
		if ap, ok := aps[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				// A regexp-constrained policy keeps the attribute only
				// when its value matches.
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue
				}
			} else {
				// No regexp means any value is acceptable.
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there a global attribute policy that applies?
		// NOTE(review): if an element-specific regexp rejected the value
		// above, a matching global policy can still admit it here.
		if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "link":
					if htmlAttr.Key == "href" {
						// Keep the (possibly normalized) URL; an invalid
						// URL drops the attribute entirely.
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					// Non-URL attributes pass through untouched.
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "img", "script":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "link":
				var hrefFound bool
				var externalLink bool
				// First pass: detect whether an href exists and whether it
				// points at another host (a "fully qualified" link).
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					// Second pass: rewrite existing rel/target attributes
					// in place where possible.
					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && addNoFollow {

							if strings.Contains(htmlAttr.Val, "nofollow") {
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							} else {
								htmlAttr.Val += " nofollow"
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							// Overwrite any other target value when the
							// policy demands _blank.
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					// No rel attribute existed to extend: add a fresh one.
					if addNoFollow && !noFollowFound {
						rel := html.Attribute{}
						rel.Key = "rel"
						rel.Val = "nofollow"
						cleanAttrs = append(cleanAttrs, rel)
					}

					// No target attribute existed: add target="_blank".
					// (The local is named `rel` only for symmetry with the
					// block above; it holds the target attribute.)
					if elementName == "a" && addTargetBlank && !targetBlankFound {
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allow the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present.
						// rel="noopener"
						//
						// Unfortunately this is processing the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully satisfied that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if strings.Contains(htmlAttr.Val, "noopener") {
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								} else {
									htmlAttr.Val += " noopener"
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								}

								appended = true
							}

							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}
					}
				}
			default:
				// Only link elements receive rel/target injection.
			}
		}
	}

	return cleanAttrs
}
  429. func (p *Policy) allowNoAttrs(elementName string) bool {
  430. _, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
  431. return ok
  432. }
  433. func (p *Policy) validURL(rawurl string) (string, bool) {
  434. if p.requireParseableURLs {
  435. // URLs are valid if when space is trimmed the URL is valid
  436. rawurl = strings.TrimSpace(rawurl)
  437. // URLs cannot contain whitespace, unless it is a data-uri
  438. if (strings.Contains(rawurl, " ") ||
  439. strings.Contains(rawurl, "\t") ||
  440. strings.Contains(rawurl, "\n")) &&
  441. !strings.HasPrefix(rawurl, `data:`) {
  442. return "", false
  443. }
  444. // URLs are valid if they parse
  445. u, err := url.Parse(rawurl)
  446. if err != nil {
  447. return "", false
  448. }
  449. if u.Scheme != "" {
  450. urlPolicy, ok := p.allowURLSchemes[u.Scheme]
  451. if !ok {
  452. return "", false
  453. }
  454. if urlPolicy == nil || urlPolicy(u) == true {
  455. return u.String(), true
  456. }
  457. return "", false
  458. }
  459. if p.allowRelativeURLs {
  460. if u.String() != "" {
  461. return u.String(), true
  462. }
  463. }
  464. return "", false
  465. }
  466. return rawurl, true
  467. }
  468. func linkable(elementName string) bool {
  469. switch elementName {
  470. case "a", "area", "blockquote", "img", "link", "script":
  471. return true
  472. default:
  473. return false
  474. }
  475. }