123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547 |
- // Copyright (c) 2014, David Kitchen <david@buro9.com>
- //
- // All rights reserved.
- //
- // Redistribution and use in source and binary forms, with or without
- // modification, are permitted provided that the following conditions are met:
- //
- // * Redistributions of source code must retain the above copyright notice, this
- // list of conditions and the following disclaimer.
- //
- // * Redistributions in binary form must reproduce the above copyright notice,
- // this list of conditions and the following disclaimer in the documentation
- // and/or other materials provided with the distribution.
- //
- // * Neither the name of the organisation (Microcosm) nor the names of its
- // contributors may be used to endorse or promote products derived from
- // this software without specific prior written permission.
- //
- // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- package bluemonday
- import (
- "bytes"
- "io"
- "net/url"
- "strings"
- "golang.org/x/net/html"
- )
- // Sanitize takes a string that contains a HTML fragment or document and applies
- // the given policy whitelist.
- //
- // It returns a HTML string that has been sanitized by the policy or an empty
- // string if an error has occurred (most likely as a consequence of extremely
- // malformed input)
- func (p *Policy) Sanitize(s string) string {
- if strings.TrimSpace(s) == "" {
- return s
- }
- return p.sanitize(strings.NewReader(s)).String()
- }
- // SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
- // the given policy whitelist.
- //
- // It returns a []byte containing the HTML that has been sanitized by the policy
- // or an empty []byte if an error has occurred (most likely as a consequence of
- // extremely malformed input)
- func (p *Policy) SanitizeBytes(b []byte) []byte {
- if len(bytes.TrimSpace(b)) == 0 {
- return b
- }
- return p.sanitize(bytes.NewReader(b)).Bytes()
- }
// SanitizeReader takes an io.Reader that contains a HTML fragment or document
// and applies the given policy whitelist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization will merely return an empty result.
//
// Unlike Sanitize and SanitizeBytes there is no fast path for empty input;
// the reader is always handed to the tokenizer.
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	return p.sanitize(r)
}
// Performs the actual sanitization process.
//
// The input is tokenized by golang.org/x/net/html and each token is either
// copied to the output buffer, rewritten (attributes sanitized), or dropped
// according to the policy. Any tokenizer error other than io.EOF yields an
// empty buffer.
func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	var (
		buff bytes.Buffer
		// skipElementContent is true while we are inside an element whose
		// entire content must be dropped (e.g. a disallowed <script>).
		skipElementContent bool
		// skippingElementsCount tracks the nesting depth of such elements so
		// that content skipping ends only when the outermost one closes.
		skippingElementsCount int64
		// skipClosingTag is true when a start tag was suppressed (stripped of
		// all attributes and not allowed bare) and its matching end tag must
		// be suppressed too; the stack records which tag names are pending.
		skipClosingTag        bool
		closingTagToSkipStack []string
		// mostRecentlyStartedToken is the name of the last start tag seen;
		// used below to decide whether text is script/style content.
		mostRecentlyStartedToken string
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return &buff
			}

			// Raw tokenizer error
			return &bytes.Buffer{}
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:
			// DocType is not handled as there is no safe parsing mechanism
			// provided by golang.org/x/net/html for the content, and this can
			// be misused to insert HTML tags that are not then sanitized
			//
			// One might wish to recursively sanitize here using the same policy
			// but I will need to do some further testing before considering
			// this.

		case html.CommentToken:
			// Comments are ignored by default

		case html.StartTagToken:
			mostRecentlyStartedToken = token.Data

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				// Element is not whitelisted. If the policy says its content
				// should be skipped too, start (or deepen) content skipping.
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skipElementContent = true
					skippingElementsCount++
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					// Every attribute was stripped and the element is not
					// allowed without attributes: drop the tag and remember
					// to drop its matching close tag as well.
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.EndTagToken:
			if mostRecentlyStartedToken == token.Data {
				mostRecentlyStartedToken = ""
			}

			// Does this close tag match a start tag suppressed above?
			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					// Leaving a content-skipping element; only resume output
					// once the outermost one has been closed.
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.SelfClosingTagToken:
			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.TextToken:
			if !skipElementContent {
				switch mostRecentlyStartedToken {
				case "script":
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				case "style":
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				default:
					// HTML escape the text
					buff.WriteString(token.String())
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return &bytes.Buffer{}
		}
	}
}
// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute returning a set
// of html.Attributes that match the policies.
//
// Processing happens in passes over the attribute slice:
//  1. keep only attributes allowed by the element-specific or global policy
//     (regexp-constrained policies must match the value);
//  2. for linkable elements, optionally validate/normalise URL attributes;
//  3. for anchor-like elements, optionally add rel="nofollow",
//     target="_blank" and the accompanying rel="noopener".
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	// Builds a new attribute slice based on the whether the attribute has been
	// whitelisted explicitly or globally.
	cleanAttrs := []html.Attribute{}
	for _, htmlAttr := range attrs {
		// Is there an element specific attribute policy that applies?
		if ap, ok := aps[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				// Value must match the policy's pattern to be kept; note
				// that a non-matching value falls through to the global
				// policy check below rather than being rejected outright.
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there a global attribute policy that applies?
		if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "link":
					if htmlAttr.Key == "href" {
						// An invalid URL drops the attribute entirely;
						// a valid one may be rewritten to its parsed form.
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "img", "script":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						// A non-empty host means the link is fully
						// qualified, i.e. points off-site.
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					// Rewrite any existing rel/target attributes in place;
					// missing ones are appended after this loop.
					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && addNoFollow {

							if strings.Contains(htmlAttr.Val, "nofollow") {
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							} else {
								htmlAttr.Val += " nofollow"
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					if addNoFollow && !noFollowFound {
						rel := html.Attribute{}
						rel.Key = "rel"
						rel.Val = "nofollow"
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetBlankFound {
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allow the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present.
						// rel="noopener"
						//
						// Unfortunately this is processing the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully satisfied that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if strings.Contains(htmlAttr.Val, "noopener") {
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								} else {
									htmlAttr.Val += " noopener"
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								}

								appended = true
							}

							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}
					}
				}
			default:
			}
		}
	}

	return cleanAttrs
}
- func (p *Policy) allowNoAttrs(elementName string) bool {
- _, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
- return ok
- }
- func (p *Policy) validURL(rawurl string) (string, bool) {
- if p.requireParseableURLs {
- // URLs are valid if when space is trimmed the URL is valid
- rawurl = strings.TrimSpace(rawurl)
- // URLs cannot contain whitespace, unless it is a data-uri
- if (strings.Contains(rawurl, " ") ||
- strings.Contains(rawurl, "\t") ||
- strings.Contains(rawurl, "\n")) &&
- !strings.HasPrefix(rawurl, `data:`) {
- return "", false
- }
- // URLs are valid if they parse
- u, err := url.Parse(rawurl)
- if err != nil {
- return "", false
- }
- if u.Scheme != "" {
- urlPolicy, ok := p.allowURLSchemes[u.Scheme]
- if !ok {
- return "", false
- }
- if urlPolicy == nil || urlPolicy(u) == true {
- return u.String(), true
- }
- return "", false
- }
- if p.allowRelativeURLs {
- if u.String() != "" {
- return u.String(), true
- }
- }
- return "", false
- }
- return rawurl, true
- }
// linkable reports whether the named element can carry a URL-bearing
// attribute (a/area/link href, blockquote/q cite, img/script src) that
// sanitizeAttrs may need to validate or decorate.
//
// Fix: "q" was missing even though sanitizeAttrs explicitly handles q.cite
// (and documents that it should be ensured parseable), so q elements never
// reached URL validation.
func linkable(elementName string) bool {
	switch elementName {
	case "a", "area", "blockquote", "img", "link", "q", "script":
		return true
	default:
		return false
	}
}
|