123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604 |
- package bolt
- import (
- "bytes"
- "fmt"
- "sort"
- "unsafe"
- )
- // node represents an in-memory, deserialized page.
- type node struct {
- bucket *Bucket
- isLeaf bool
- unbalanced bool
- spilled bool
- key []byte
- pgid pgid
- parent *node
- children nodes
- inodes inodes
- }
- // root returns the top-level node this node is attached to.
- func (n *node) root() *node {
- if n.parent == nil {
- return n
- }
- return n.parent.root()
- }
- // minKeys returns the minimum number of inodes this node should have.
- func (n *node) minKeys() int {
- if n.isLeaf {
- return 1
- }
- return 2
- }
- // size returns the size of the node after serialization.
- func (n *node) size() int {
- sz, elsz := pageHeaderSize, n.pageElementSize()
- for i := 0; i < len(n.inodes); i++ {
- item := &n.inodes[i]
- sz += elsz + len(item.key) + len(item.value)
- }
- return sz
- }
- // sizeLessThan returns true if the node is less than a given size.
- // This is an optimization to avoid calculating a large node when we only need
- // to know if it fits inside a certain page size.
- func (n *node) sizeLessThan(v int) bool {
- sz, elsz := pageHeaderSize, n.pageElementSize()
- for i := 0; i < len(n.inodes); i++ {
- item := &n.inodes[i]
- sz += elsz + len(item.key) + len(item.value)
- if sz >= v {
- return false
- }
- }
- return true
- }
- // pageElementSize returns the size of each page element based on the type of node.
- func (n *node) pageElementSize() int {
- if n.isLeaf {
- return leafPageElementSize
- }
- return branchPageElementSize
- }
- // childAt returns the child node at a given index.
- func (n *node) childAt(index int) *node {
- if n.isLeaf {
- panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
- }
- return n.bucket.node(n.inodes[index].pgid, n)
- }
- // childIndex returns the index of a given child node.
- func (n *node) childIndex(child *node) int {
- index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
- return index
- }
- // numChildren returns the number of children.
- func (n *node) numChildren() int {
- return len(n.inodes)
- }
- // nextSibling returns the next node with the same parent.
- func (n *node) nextSibling() *node {
- if n.parent == nil {
- return nil
- }
- index := n.parent.childIndex(n)
- if index >= n.parent.numChildren()-1 {
- return nil
- }
- return n.parent.childAt(index + 1)
- }
- // prevSibling returns the previous node with the same parent.
- func (n *node) prevSibling() *node {
- if n.parent == nil {
- return nil
- }
- index := n.parent.childIndex(n)
- if index == 0 {
- return nil
- }
- return n.parent.childAt(index - 1)
- }
- // put inserts a key/value.
- func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
- if pgid >= n.bucket.tx.meta.pgid {
- panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
- } else if len(oldKey) <= 0 {
- panic("put: zero-length old key")
- } else if len(newKey) <= 0 {
- panic("put: zero-length new key")
- }
- // Find insertion index.
- index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
- // Add capacity and shift nodes if we don't have an exact match and need to insert.
- exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
- if !exact {
- n.inodes = append(n.inodes, inode{})
- copy(n.inodes[index+1:], n.inodes[index:])
- }
- inode := &n.inodes[index]
- inode.flags = flags
- inode.key = newKey
- inode.value = value
- inode.pgid = pgid
- _assert(len(inode.key) > 0, "put: zero-length inode key")
- }
- // del removes a key from the node.
- func (n *node) del(key []byte) {
- // Find index of key.
- index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
- // Exit if the key isn't found.
- if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
- return
- }
- // Delete inode from the node.
- n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
- // Mark the node as needing rebalancing.
- n.unbalanced = true
- }
- // read initializes the node from a page.
- func (n *node) read(p *page) {
- n.pgid = p.id
- n.isLeaf = ((p.flags & leafPageFlag) != 0)
- n.inodes = make(inodes, int(p.count))
- for i := 0; i < int(p.count); i++ {
- inode := &n.inodes[i]
- if n.isLeaf {
- elem := p.leafPageElement(uint16(i))
- inode.flags = elem.flags
- inode.key = elem.key()
- inode.value = elem.value()
- } else {
- elem := p.branchPageElement(uint16(i))
- inode.pgid = elem.pgid
- inode.key = elem.key()
- }
- _assert(len(inode.key) > 0, "read: zero-length inode key")
- }
- // Save first key so we can find the node in the parent when we spill.
- if len(n.inodes) > 0 {
- n.key = n.inodes[0].key
- _assert(len(n.key) > 0, "read: zero-length node key")
- } else {
- n.key = nil
- }
- }
- // write writes the items onto one or more pages.
- func (n *node) write(p *page) {
- // Initialize page.
- if n.isLeaf {
- p.flags |= leafPageFlag
- } else {
- p.flags |= branchPageFlag
- }
- if len(n.inodes) >= 0xFFFF {
- panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
- }
- p.count = uint16(len(n.inodes))
- // Stop here if there are no items to write.
- if p.count == 0 {
- return
- }
- // Loop over each item and write it to the page.
- b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
- for i, item := range n.inodes {
- _assert(len(item.key) > 0, "write: zero-length inode key")
- // Write the page element.
- if n.isLeaf {
- elem := p.leafPageElement(uint16(i))
- elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
- elem.flags = item.flags
- elem.ksize = uint32(len(item.key))
- elem.vsize = uint32(len(item.value))
- } else {
- elem := p.branchPageElement(uint16(i))
- elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
- elem.ksize = uint32(len(item.key))
- elem.pgid = item.pgid
- _assert(elem.pgid != p.id, "write: circular dependency occurred")
- }
- // If the length of key+value is larger than the max allocation size
- // then we need to reallocate the byte array pointer.
- //
- // See: https://github.com/boltdb/bolt/pull/335
- klen, vlen := len(item.key), len(item.value)
- if len(b) < klen+vlen {
- b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
- }
- // Write data for the element to the end of the page.
- copy(b[0:], item.key)
- b = b[klen:]
- copy(b[0:], item.value)
- b = b[vlen:]
- }
- // DEBUG ONLY: n.dump()
- }
- // split breaks up a node into multiple smaller nodes, if appropriate.
- // This should only be called from the spill() function.
- func (n *node) split(pageSize int) []*node {
- var nodes []*node
- node := n
- for {
- // Split node into two.
- a, b := node.splitTwo(pageSize)
- nodes = append(nodes, a)
- // If we can't split then exit the loop.
- if b == nil {
- break
- }
- // Set node to b so it gets split on the next iteration.
- node = b
- }
- return nodes
- }
- // splitTwo breaks up a node into two smaller nodes, if appropriate.
- // This should only be called from the split() function.
- func (n *node) splitTwo(pageSize int) (*node, *node) {
- // Ignore the split if the page doesn't have at least enough nodes for
- // two pages or if the nodes can fit in a single page.
- if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
- return n, nil
- }
- // Determine the threshold before starting a new node.
- var fillPercent = n.bucket.FillPercent
- if fillPercent < minFillPercent {
- fillPercent = minFillPercent
- } else if fillPercent > maxFillPercent {
- fillPercent = maxFillPercent
- }
- threshold := int(float64(pageSize) * fillPercent)
- // Determine split position and sizes of the two pages.
- splitIndex, _ := n.splitIndex(threshold)
- // Split node into two separate nodes.
- // If there's no parent then we'll need to create one.
- if n.parent == nil {
- n.parent = &node{bucket: n.bucket, children: []*node{n}}
- }
- // Create a new node and add it to the parent.
- next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
- n.parent.children = append(n.parent.children, next)
- // Split inodes across two nodes.
- next.inodes = n.inodes[splitIndex:]
- n.inodes = n.inodes[:splitIndex]
- // Update the statistics.
- n.bucket.tx.stats.Split++
- return n, next
- }
- // splitIndex finds the position where a page will fill a given threshold.
- // It returns the index as well as the size of the first page.
- // This is only be called from split().
- func (n *node) splitIndex(threshold int) (index, sz int) {
- sz = pageHeaderSize
- // Loop until we only have the minimum number of keys required for the second page.
- for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
- index = i
- inode := n.inodes[i]
- elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
- // If we have at least the minimum number of keys and adding another
- // node would put us over the threshold then exit and return.
- if i >= minKeysPerPage && sz+elsize > threshold {
- break
- }
- // Add the element size to the total size.
- sz += elsize
- }
- return
- }
- // spill writes the nodes to dirty pages and splits nodes as it goes.
- // Returns an error if dirty pages cannot be allocated.
- func (n *node) spill() error {
- var tx = n.bucket.tx
- if n.spilled {
- return nil
- }
- // Spill child nodes first. Child nodes can materialize sibling nodes in
- // the case of split-merge so we cannot use a range loop. We have to check
- // the children size on every loop iteration.
- sort.Sort(n.children)
- for i := 0; i < len(n.children); i++ {
- if err := n.children[i].spill(); err != nil {
- return err
- }
- }
- // We no longer need the child list because it's only used for spill tracking.
- n.children = nil
- // Split nodes into appropriate sizes. The first node will always be n.
- var nodes = n.split(tx.db.pageSize)
- for _, node := range nodes {
- // Add node's page to the freelist if it's not new.
- if node.pgid > 0 {
- tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
- node.pgid = 0
- }
- // Allocate contiguous space for the node.
- p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
- if err != nil {
- return err
- }
- // Write the node.
- if p.id >= tx.meta.pgid {
- panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
- }
- node.pgid = p.id
- node.write(p)
- node.spilled = true
- // Insert into parent inodes.
- if node.parent != nil {
- var key = node.key
- if key == nil {
- key = node.inodes[0].key
- }
- node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
- node.key = node.inodes[0].key
- _assert(len(node.key) > 0, "spill: zero-length node key")
- }
- // Update the statistics.
- tx.stats.Spill++
- }
- // If the root node split and created a new root then we need to spill that
- // as well. We'll clear out the children to make sure it doesn't try to respill.
- if n.parent != nil && n.parent.pgid == 0 {
- n.children = nil
- return n.parent.spill()
- }
- return nil
- }
- // rebalance attempts to combine the node with sibling nodes if the node fill
- // size is below a threshold or if there are not enough keys.
- func (n *node) rebalance() {
- if !n.unbalanced {
- return
- }
- n.unbalanced = false
- // Update statistics.
- n.bucket.tx.stats.Rebalance++
- // Ignore if node is above threshold (25%) and has enough keys.
- var threshold = n.bucket.tx.db.pageSize / 4
- if n.size() > threshold && len(n.inodes) > n.minKeys() {
- return
- }
- // Root node has special handling.
- if n.parent == nil {
- // If root node is a branch and only has one node then collapse it.
- if !n.isLeaf && len(n.inodes) == 1 {
- // Move root's child up.
- child := n.bucket.node(n.inodes[0].pgid, n)
- n.isLeaf = child.isLeaf
- n.inodes = child.inodes[:]
- n.children = child.children
- // Reparent all child nodes being moved.
- for _, inode := range n.inodes {
- if child, ok := n.bucket.nodes[inode.pgid]; ok {
- child.parent = n
- }
- }
- // Remove old child.
- child.parent = nil
- delete(n.bucket.nodes, child.pgid)
- child.free()
- }
- return
- }
- // If node has no keys then just remove it.
- if n.numChildren() == 0 {
- n.parent.del(n.key)
- n.parent.removeChild(n)
- delete(n.bucket.nodes, n.pgid)
- n.free()
- n.parent.rebalance()
- return
- }
- _assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
- // Destination node is right sibling if idx == 0, otherwise left sibling.
- var target *node
- var useNextSibling = (n.parent.childIndex(n) == 0)
- if useNextSibling {
- target = n.nextSibling()
- } else {
- target = n.prevSibling()
- }
- // If both this node and the target node are too small then merge them.
- if useNextSibling {
- // Reparent all child nodes being moved.
- for _, inode := range target.inodes {
- if child, ok := n.bucket.nodes[inode.pgid]; ok {
- child.parent.removeChild(child)
- child.parent = n
- child.parent.children = append(child.parent.children, child)
- }
- }
- // Copy over inodes from target and remove target.
- n.inodes = append(n.inodes, target.inodes...)
- n.parent.del(target.key)
- n.parent.removeChild(target)
- delete(n.bucket.nodes, target.pgid)
- target.free()
- } else {
- // Reparent all child nodes being moved.
- for _, inode := range n.inodes {
- if child, ok := n.bucket.nodes[inode.pgid]; ok {
- child.parent.removeChild(child)
- child.parent = target
- child.parent.children = append(child.parent.children, child)
- }
- }
- // Copy over inodes to target and remove node.
- target.inodes = append(target.inodes, n.inodes...)
- n.parent.del(n.key)
- n.parent.removeChild(n)
- delete(n.bucket.nodes, n.pgid)
- n.free()
- }
- // Either this node or the target node was deleted from the parent so rebalance it.
- n.parent.rebalance()
- }
- // removes a node from the list of in-memory children.
- // This does not affect the inodes.
- func (n *node) removeChild(target *node) {
- for i, child := range n.children {
- if child == target {
- n.children = append(n.children[:i], n.children[i+1:]...)
- return
- }
- }
- }
- // dereference causes the node to copy all its inode key/value references to heap memory.
- // This is required when the mmap is reallocated so inodes are not pointing to stale data.
- func (n *node) dereference() {
- if n.key != nil {
- key := make([]byte, len(n.key))
- copy(key, n.key)
- n.key = key
- _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
- }
- for i := range n.inodes {
- inode := &n.inodes[i]
- key := make([]byte, len(inode.key))
- copy(key, inode.key)
- inode.key = key
- _assert(len(inode.key) > 0, "dereference: zero-length inode key")
- value := make([]byte, len(inode.value))
- copy(value, inode.value)
- inode.value = value
- }
- // Recursively dereference children.
- for _, child := range n.children {
- child.dereference()
- }
- // Update statistics.
- n.bucket.tx.stats.NodeDeref++
- }
- // free adds the node's underlying page to the freelist.
- func (n *node) free() {
- if n.pgid != 0 {
- n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
- n.pgid = 0
- }
- }
- // dump writes the contents of the node to STDERR for debugging purposes.
- /*
- func (n *node) dump() {
- // Write node header.
- var typ = "branch"
- if n.isLeaf {
- typ = "leaf"
- }
- warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
- // Write out abbreviated version of each item.
- for _, item := range n.inodes {
- if n.isLeaf {
- if item.flags&bucketLeafFlag != 0 {
- bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
- warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
- } else {
- warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
- }
- } else {
- warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
- }
- }
- warn("")
- }
- */
- type nodes []*node
- func (s nodes) Len() int { return len(s) }
- func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
- func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
- // inode represents an internal node inside of a node.
- // It can be used to point to elements in a page or point
- // to an element which hasn't been added to a page yet.
- type inode struct {
- flags uint32
- pgid pgid
- key []byte
- value []byte
- }
- type inodes []inode
|