bbloom.go 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. // The MIT License (MIT)
  2. // Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt
  3. // Permission is hereby granted, free of charge, to any person obtaining a copy of
  4. // this software and associated documentation files (the "Software"), to deal in
  5. // the Software without restriction, including without limitation the rights to
  6. // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
  7. // the Software, and to permit persons to whom the Software is furnished to do so,
  8. // subject to the following conditions:
  9. // The above copyright notice and this permission notice shall be included in all
  10. // copies or substantial portions of the Software.
  11. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  12. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
  13. // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
  14. // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
  15. // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  16. // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  17. package bbloom
  18. import (
  19. "bytes"
  20. "encoding/json"
  21. "log"
  22. "math"
  23. "sync"
  24. "unsafe"
  25. )
  26. // helper
  27. var mask = []uint8{1, 2, 4, 8, 16, 32, 64, 128}
  28. func getSize(ui64 uint64) (size uint64, exponent uint64) {
  29. if ui64 < uint64(512) {
  30. ui64 = uint64(512)
  31. }
  32. size = uint64(1)
  33. for size < ui64 {
  34. size <<= 1
  35. exponent++
  36. }
  37. return size, exponent
  38. }
  39. func calcSizeByWrongPositives(numEntries, wrongs float64) (uint64, uint64) {
  40. size := -1 * numEntries * math.Log(wrongs) / math.Pow(float64(0.69314718056), 2)
  41. locs := math.Ceil(float64(0.69314718056) * size / numEntries)
  42. return uint64(size), uint64(locs)
  43. }
  44. // New
  45. // returns a new bloomfilter
  46. func New(params ...float64) (bloomfilter Bloom) {
  47. var entries, locs uint64
  48. if len(params) == 2 {
  49. if params[1] < 1 {
  50. entries, locs = calcSizeByWrongPositives(params[0], params[1])
  51. } else {
  52. entries, locs = uint64(params[0]), uint64(params[1])
  53. }
  54. } else {
  55. log.Fatal("usage: New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(3)) or New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(0.03))")
  56. }
  57. size, exponent := getSize(uint64(entries))
  58. bloomfilter = Bloom{
  59. sizeExp: exponent,
  60. size: size - 1,
  61. setLocs: locs,
  62. shift: 64 - exponent,
  63. }
  64. bloomfilter.Size(size)
  65. return bloomfilter
  66. }
  67. // NewWithBoolset
  68. // takes a []byte slice and number of locs per entry
  69. // returns the bloomfilter with a bitset populated according to the input []byte
  70. func NewWithBoolset(bs *[]byte, locs uint64) (bloomfilter Bloom) {
  71. bloomfilter = New(float64(len(*bs)<<3), float64(locs))
  72. ptr := uintptr(unsafe.Pointer(&bloomfilter.bitset[0]))
  73. for _, b := range *bs {
  74. *(*uint8)(unsafe.Pointer(ptr)) = b
  75. ptr++
  76. }
  77. return bloomfilter
  78. }
  79. // bloomJSONImExport
  80. // Im/Export structure used by JSONMarshal / JSONUnmarshal
  81. type bloomJSONImExport struct {
  82. FilterSet []byte
  83. SetLocs uint64
  84. }
  85. // JSONUnmarshal
  86. // takes JSON-Object (type bloomJSONImExport) as []bytes
  87. // returns bloom32 / bloom64 object
  88. func JSONUnmarshal(dbData []byte) Bloom {
  89. bloomImEx := bloomJSONImExport{}
  90. json.Unmarshal(dbData, &bloomImEx)
  91. buf := bytes.NewBuffer(bloomImEx.FilterSet)
  92. bs := buf.Bytes()
  93. bf := NewWithBoolset(&bs, bloomImEx.SetLocs)
  94. return bf
  95. }
  96. //
  97. // Bloom filter
  98. type Bloom struct {
  99. Mtx sync.Mutex
  100. ElemNum uint64
  101. bitset []uint64
  102. sizeExp uint64
  103. size uint64
  104. setLocs uint64
  105. shift uint64
  106. }
  107. // <--- http://www.cse.yorku.ca/~oz/hash.html
  108. // modified Berkeley DB Hash (32bit)
  109. // hash is casted to l, h = 16bit fragments
  110. // func (bl Bloom) absdbm(b *[]byte) (l, h uint64) {
  111. // hash := uint64(len(*b))
  112. // for _, c := range *b {
  113. // hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash
  114. // }
  115. // h = hash >> bl.shift
  116. // l = hash << bl.shift >> bl.shift
  117. // return l, h
  118. // }
  119. // Update: found sipHash of Jean-Philippe Aumasson & Daniel J. Bernstein to be even faster than absdbm()
  120. // https://131002.net/siphash/
  121. // siphash was implemented for Go by Dmitry Chestnykh https://github.com/dchest/siphash
  122. // Add
  123. // set the bit(s) for entry; Adds an entry to the Bloom filter
  124. func (bl *Bloom) Add(entry []byte) {
  125. l, h := bl.sipHash(entry)
  126. for i := uint64(0); i < (*bl).setLocs; i++ {
  127. (*bl).Set((h + i*l) & (*bl).size)
  128. (*bl).ElemNum++
  129. }
  130. }
  131. // AddTS
  132. // Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry
  133. func (bl *Bloom) AddTS(entry []byte) {
  134. bl.Mtx.Lock()
  135. defer bl.Mtx.Unlock()
  136. bl.Add(entry[:])
  137. }
  138. // Has
  139. // check if bit(s) for entry is/are set
  140. // returns true if the entry was added to the Bloom Filter
  141. func (bl Bloom) Has(entry []byte) bool {
  142. l, h := bl.sipHash(entry)
  143. for i := uint64(0); i < bl.setLocs; i++ {
  144. switch bl.IsSet((h + i*l) & bl.size) {
  145. case false:
  146. return false
  147. }
  148. }
  149. return true
  150. }
  151. // HasTS
  152. // Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry
  153. func (bl *Bloom) HasTS(entry []byte) bool {
  154. bl.Mtx.Lock()
  155. defer bl.Mtx.Unlock()
  156. return bl.Has(entry[:])
  157. }
  158. // AddIfNotHas
  159. // Only Add entry if it's not present in the bloomfilter
  160. // returns true if entry was added
  161. // returns false if entry was allready registered in the bloomfilter
  162. func (bl Bloom) AddIfNotHas(entry []byte) (added bool) {
  163. if bl.Has(entry[:]) {
  164. return added
  165. }
  166. bl.Add(entry[:])
  167. return true
  168. }
  169. // AddIfNotHasTS
  170. // Tread safe: Only Add entry if it's not present in the bloomfilter
  171. // returns true if entry was added
  172. // returns false if entry was allready registered in the bloomfilter
  173. func (bl *Bloom) AddIfNotHasTS(entry []byte) (added bool) {
  174. bl.Mtx.Lock()
  175. defer bl.Mtx.Unlock()
  176. return bl.AddIfNotHas(entry[:])
  177. }
  178. // Size
  179. // make Bloom filter with as bitset of size sz
  180. func (bl *Bloom) Size(sz uint64) {
  181. (*bl).bitset = make([]uint64, sz>>6)
  182. }
  183. // Clear
  184. // resets the Bloom filter
  185. func (bl *Bloom) Clear() {
  186. for i, _ := range (*bl).bitset {
  187. (*bl).bitset[i] = 0
  188. }
  189. }
  190. // Set
  191. // set the bit[idx] of bitsit
  192. func (bl *Bloom) Set(idx uint64) {
  193. ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))
  194. *(*uint8)(ptr) |= mask[idx%8]
  195. }
  196. // IsSet
  197. // check if bit[idx] of bitset is set
  198. // returns true/false
  199. func (bl *Bloom) IsSet(idx uint64) bool {
  200. ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))
  201. r := ((*(*uint8)(ptr)) >> (idx % 8)) & 1
  202. return r == 1
  203. }
  204. // JSONMarshal
  205. // returns JSON-object (type bloomJSONImExport) as []byte
  206. func (bl Bloom) JSONMarshal() []byte {
  207. bloomImEx := bloomJSONImExport{}
  208. bloomImEx.SetLocs = uint64(bl.setLocs)
  209. bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3)
  210. ptr := uintptr(unsafe.Pointer(&bl.bitset[0]))
  211. for i := range bloomImEx.FilterSet {
  212. bloomImEx.FilterSet[i] = *(*byte)(unsafe.Pointer(ptr))
  213. ptr++
  214. }
  215. data, err := json.Marshal(bloomImEx)
  216. if err != nil {
  217. log.Fatal("json.Marshal failed: ", err)
  218. }
  219. return data
  220. }
  221. // // alternative hashFn
  222. // func (bl Bloom) fnv64a(b *[]byte) (l, h uint64) {
  223. // h64 := fnv.New64a()
  224. // h64.Write(*b)
  225. // hash := h64.Sum64()
  226. // h = hash >> 32
  227. // l = hash << 32 >> 32
  228. // return l, h
  229. // }
  230. //
  231. // // <-- http://partow.net/programming/hashfunctions/index.html
  232. // // citation: An algorithm proposed by Donald E. Knuth in The Art Of Computer Programming Volume 3,
  233. // // under the topic of sorting and search chapter 6.4.
  234. // // modified to fit with boolset-length
  235. // func (bl Bloom) DEKHash(b *[]byte) (l, h uint64) {
  236. // hash := uint64(len(*b))
  237. // for _, c := range *b {
  238. // hash = ((hash << 5) ^ (hash >> bl.shift)) ^ uint64(c)
  239. // }
  240. // h = hash >> bl.shift
  241. // l = hash << bl.sizeExp >> bl.sizeExp
  242. // return l, h
  243. // }