sniff.go 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package http
  5. import (
  6. "bytes"
  7. "encoding/binary"
  8. )
  9. // The algorithm uses at most sniffLen bytes to make its decision.
  10. const sniffLen = 512
  11. // DetectContentType implements the algorithm described
  12. // at http://mimesniff.spec.whatwg.org/ to determine the
  13. // Content-Type of the given data. It considers at most the
  14. // first 512 bytes of data. DetectContentType always returns
  15. // a valid MIME type: if it cannot determine a more specific one, it
  16. // returns "application/octet-stream".
  17. func DetectContentType(data []byte) string {
  18. if len(data) > sniffLen {
  19. data = data[:sniffLen]
  20. }
  21. // Index of the first non-whitespace byte in data.
  22. firstNonWS := 0
  23. for ; firstNonWS < len(data) && isWS(data[firstNonWS]); firstNonWS++ {
  24. }
  25. for _, sig := range sniffSignatures {
  26. if ct := sig.match(data, firstNonWS); ct != "" {
  27. return ct
  28. }
  29. }
  30. return "application/octet-stream" // fallback
  31. }
  32. func isWS(b byte) bool {
  33. switch b {
  34. case '\t', '\n', '\x0c', '\r', ' ':
  35. return true
  36. }
  37. return false
  38. }
  39. type sniffSig interface {
  40. // match returns the MIME type of the data, or "" if unknown.
  41. match(data []byte, firstNonWS int) string
  42. }
  43. // Data matching the table in section 6.
  44. var sniffSignatures = []sniffSig{
  45. htmlSig("<!DOCTYPE HTML"),
  46. htmlSig("<HTML"),
  47. htmlSig("<HEAD"),
  48. htmlSig("<SCRIPT"),
  49. htmlSig("<IFRAME"),
  50. htmlSig("<H1"),
  51. htmlSig("<DIV"),
  52. htmlSig("<FONT"),
  53. htmlSig("<TABLE"),
  54. htmlSig("<A"),
  55. htmlSig("<STYLE"),
  56. htmlSig("<TITLE"),
  57. htmlSig("<B"),
  58. htmlSig("<BODY"),
  59. htmlSig("<BR"),
  60. htmlSig("<P"),
  61. htmlSig("<!--"),
  62. &maskedSig{mask: []byte("\xFF\xFF\xFF\xFF\xFF"), pat: []byte("<?xml"), skipWS: true, ct: "text/xml; charset=utf-8"},
  63. &exactSig{[]byte("%PDF-"), "application/pdf"},
  64. &exactSig{[]byte("%!PS-Adobe-"), "application/postscript"},
  65. // UTF BOMs.
  66. &maskedSig{mask: []byte("\xFF\xFF\x00\x00"), pat: []byte("\xFE\xFF\x00\x00"), ct: "text/plain; charset=utf-16be"},
  67. &maskedSig{mask: []byte("\xFF\xFF\x00\x00"), pat: []byte("\xFF\xFE\x00\x00"), ct: "text/plain; charset=utf-16le"},
  68. &maskedSig{mask: []byte("\xFF\xFF\xFF\x00"), pat: []byte("\xEF\xBB\xBF\x00"), ct: "text/plain; charset=utf-8"},
  69. &exactSig{[]byte("GIF87a"), "image/gif"},
  70. &exactSig{[]byte("GIF89a"), "image/gif"},
  71. &exactSig{[]byte("\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"), "image/png"},
  72. &exactSig{[]byte("\xFF\xD8\xFF"), "image/jpeg"},
  73. &exactSig{[]byte("BM"), "image/bmp"},
  74. &maskedSig{
  75. mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF"),
  76. pat: []byte("RIFF\x00\x00\x00\x00WEBPVP"),
  77. ct: "image/webp",
  78. },
  79. &exactSig{[]byte("\x00\x00\x01\x00"), "image/vnd.microsoft.icon"},
  80. &maskedSig{
  81. mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
  82. pat: []byte("RIFF\x00\x00\x00\x00WAVE"),
  83. ct: "audio/wave",
  84. },
  85. &maskedSig{
  86. mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
  87. pat: []byte("FORM\x00\x00\x00\x00AIFF"),
  88. ct: "audio/aiff",
  89. },
  90. &maskedSig{
  91. mask: []byte("\xFF\xFF\xFF\xFF"),
  92. pat: []byte(".snd"),
  93. ct: "audio/basic",
  94. },
  95. &maskedSig{
  96. mask: []byte("\xFF\xFF\xFF\xFF\xFF"),
  97. pat: []byte("OggS\x00"),
  98. ct: "application/ogg",
  99. },
  100. &maskedSig{
  101. mask: []byte("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"),
  102. pat: []byte("MThd\x00\x00\x00\x06"),
  103. ct: "audio/midi",
  104. },
  105. &maskedSig{
  106. mask: []byte("\xFF\xFF\xFF"),
  107. pat: []byte("ID3"),
  108. ct: "audio/mpeg",
  109. },
  110. &maskedSig{
  111. mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
  112. pat: []byte("RIFF\x00\x00\x00\x00AVI "),
  113. ct: "video/avi",
  114. },
  115. // Fonts
  116. &maskedSig{
  117. // 34 NULL bytes followed by the string "LP"
  118. pat: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x4C\x50"),
  119. // 34 NULL bytes followed by \xF\xF
  120. mask: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF"),
  121. ct: "application/vnd.ms-fontobject",
  122. },
  123. &exactSig{[]byte("\x00\x01\x00\x00"), "application/font-ttf"},
  124. &exactSig{[]byte("OTTO"), "application/font-off"},
  125. &exactSig{[]byte("ttcf"), "application/font-cff"},
  126. &exactSig{[]byte("wOFF"), "application/font-woff"},
  127. &exactSig{[]byte("\x1A\x45\xDF\xA3"), "video/webm"},
  128. &exactSig{[]byte("\x52\x61\x72\x20\x1A\x07\x00"), "application/x-rar-compressed"},
  129. &exactSig{[]byte("\x50\x4B\x03\x04"), "application/zip"},
  130. &exactSig{[]byte("\x1F\x8B\x08"), "application/x-gzip"},
  131. mp4Sig{},
  132. textSig{}, // should be last
  133. }
  134. type exactSig struct {
  135. sig []byte
  136. ct string
  137. }
  138. func (e *exactSig) match(data []byte, firstNonWS int) string {
  139. if bytes.HasPrefix(data, e.sig) {
  140. return e.ct
  141. }
  142. return ""
  143. }
  144. type maskedSig struct {
  145. mask, pat []byte
  146. skipWS bool
  147. ct string
  148. }
  149. func (m *maskedSig) match(data []byte, firstNonWS int) string {
  150. // pattern matching algorithm section 6
  151. // https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
  152. if m.skipWS {
  153. data = data[firstNonWS:]
  154. }
  155. if len(m.pat) != len(m.mask) {
  156. return ""
  157. }
  158. if len(data) < len(m.mask) {
  159. return ""
  160. }
  161. for i, mask := range m.mask {
  162. db := data[i] & mask
  163. if db != m.pat[i] {
  164. return ""
  165. }
  166. }
  167. return m.ct
  168. }
  169. type htmlSig []byte
  170. func (h htmlSig) match(data []byte, firstNonWS int) string {
  171. data = data[firstNonWS:]
  172. if len(data) < len(h)+1 {
  173. return ""
  174. }
  175. for i, b := range h {
  176. db := data[i]
  177. if 'A' <= b && b <= 'Z' {
  178. db &= 0xDF
  179. }
  180. if b != db {
  181. return ""
  182. }
  183. }
  184. // Next byte must be space or right angle bracket.
  185. if db := data[len(h)]; db != ' ' && db != '>' {
  186. return ""
  187. }
  188. return "text/html; charset=utf-8"
  189. }
  190. var mp4ftype = []byte("ftyp")
  191. var mp4 = []byte("mp4")
  192. type mp4Sig struct{}
  193. func (mp4Sig) match(data []byte, firstNonWS int) string {
  194. // https://mimesniff.spec.whatwg.org/#signature-for-mp4
  195. // c.f. section 6.2.1
  196. if len(data) < 12 {
  197. return ""
  198. }
  199. boxSize := int(binary.BigEndian.Uint32(data[:4]))
  200. if boxSize%4 != 0 || len(data) < boxSize {
  201. return ""
  202. }
  203. if !bytes.Equal(data[4:8], mp4ftype) {
  204. return ""
  205. }
  206. for st := 8; st < boxSize; st += 4 {
  207. if st == 12 {
  208. // minor version number
  209. continue
  210. }
  211. if bytes.Equal(data[st:st+3], mp4) {
  212. return "video/mp4"
  213. }
  214. }
  215. return ""
  216. }
  217. type textSig struct{}
  218. func (textSig) match(data []byte, firstNonWS int) string {
  219. // c.f. section 5, step 4.
  220. for _, b := range data[firstNonWS:] {
  221. switch {
  222. case b <= 0x08,
  223. b == 0x0B,
  224. 0x0E <= b && b <= 0x1A,
  225. 0x1C <= b && b <= 0x1F:
  226. return ""
  227. }
  228. }
  229. return "text/plain; charset=utf-8"
  230. }