-
Notifications
You must be signed in to change notification settings - Fork 87
/
btree.go
435 lines (375 loc) · 11.3 KB
/
btree.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
// B+-tree
//
// The tree we implement here is a B+-tree based on a paper by Ceylan and
// Mihalcea [1].
//
// B+-trees store all values in the leaves. In our case we store trigrams with
// the goal to quickly retrieve a pointer to the posting list for a given
// trigram. We choose the number of trigrams to store at each leaf based on the
// page size, IE we make sure we are able to load a bucket of ngrams with a
// single disk access.
//
// Here is an example of how our B+-tree looks like for a simple input:
//
// input: "hello world", bucketSize=2, v=2
//
// legend: ()=inner node, []=leaf
//
// (level 1) (hel, lo_)
//
// (level 2) (ell) (llo) (o_w, irl, red)
//
// (level 3) [_wo] [ell] [hel] [llo] [lo_] [o_w] [orl] [rld, wor]
//
// The leafs are stored as part of the index on disk (mmaped) while all inner
// nodes are loaded into memory when we load the shard.
//
// [1] H. Ceylan and R. Mihalcea. 2011. An Efficient Indexer for Large N-Gram
// Corpora, Proceedings of the ACL-HLT 2011 System Demonstrations, pages
// 103-108
package zoekt
import (
"encoding/binary"
"fmt"
"sort"
)
// btreeBucketSize should be chosen such that in the final tree the buckets are
// as close to the page size as possible, but not above. We insert ngrams in
// order(!), which means after a split of a leaf, the left leaf is not affected
// by further inserts and its size is fixed to bucketSize/2. The rightmost leaf
// might store up to btreeBucketSize ngrams, but the expected size is
// btreeBucketSize/2, too.
//
// On linux "getconf PAGESIZE" returns the number of bytes in a memory page.
const btreeBucketSize = (4096 * 2) / ngramEncoding
type btree struct {
root node
opts btreeOpts
lastBucketIndex int
}
type btreeOpts struct {
// How many ngrams can be stored at a leaf node.
bucketSize int
// all inner nodes, except root, have [v, 2v] children. In the literature,
// b-trees are inconsistently categorized either by the number of children
// or by the number of keys. We choose the former.
v int
}
func newBtree(opts btreeOpts) *btree {
return &btree{
root: &leaf{},
opts: opts,
}
}
// insert inserts ng into bt.
//
// Note: when all inserts are done, freeze must be called.
func (bt *btree) insert(ng ngram) {
if leftNode, rightNode, newKey, ok := bt.root.maybeSplit(bt.opts); ok {
bt.root = &innerNode{keys: []ngram{newKey}, children: []node{leftNode, rightNode}}
}
bt.root.insert(ng, bt.opts)
}
// find returns the tuple (bucketIndex, postingIndexOffset), both of which are
// stored at the leaf level. They are effectively pointers to the bucket and
// the posting lists for ngrams stored in the bucket. Since ngrams and their
// posting lists are stored in order, knowing the index of the posting list of
// the first item in the bucket is sufficient.
func (bt *btree) find(ng ngram) (int, int) {
if bt.root == nil {
return -1, -1
}
return bt.root.find(ng)
}
func (bt *btree) visit(f func(n node)) {
bt.root.visit(f)
}
// freeze must be called once we are done inserting. It backfills "pointers" to
// the buckets and posting lists.
func (bt *btree) freeze() {
// Note: Instead of backfilling we could maintain state during insertion,
// however the visitor pattern seems more natural and shouldn't be a
// performance issue, because, based on the typical number of trigrams
// (500k) per shard, the b-trees we construct here only have around 1000
// leaf nodes.
offset, bucketIndex := 0, 0
bt.visit(func(no node) {
switch n := no.(type) {
case *leaf:
n.bucketIndex = bucketIndex
bucketIndex++
n.postingIndexOffset = offset
offset += n.bucketSize
case *innerNode:
return
}
})
bt.lastBucketIndex = bucketIndex - 1
}
func (bt *btree) sizeBytes() int {
sz := 2 * 8 // opts
sz += int(interfaceBytes)
bt.visit(func(n node) {
sz += n.sizeBytes()
})
return sz
}
type node interface {
insert(ng ngram, opts btreeOpts)
maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool)
find(ng ngram) (int, int)
visit(func(n node))
sizeBytes() int
}
type innerNode struct {
keys []ngram
children []node
}
type leaf struct {
bucketIndex int
// postingIndexOffset is the index of the posting list of the first ngram
// in the bucket. This is enough to determine the index of the posting list
// for every other key in the bucket.
postingIndexOffset int
// Optimization: Because we insert ngrams in order, we don't actually have
// to fill the buckets. We just have to keep track of the size of the
// bucket, so we know when to split, and the key that we have to propagate
// up to the parent node when we split.
//
// If in the future we decide to mutate buckets, we have to replace
// bucketSize and splitKey by []ngram.
bucketSize int
splitKey ngram
}
func (n *innerNode) sizeBytes() int {
return len(n.keys)*ngramEncoding + len(n.children)*int(interfaceBytes)
}
func (n *leaf) sizeBytes() int {
return 4 * 8
}
func (n *leaf) insert(ng ngram, opts btreeOpts) {
n.bucketSize++
if n.bucketSize == (opts.bucketSize/2)+1 {
n.splitKey = ng
}
}
func (n *innerNode) insert(ng ngram, opts btreeOpts) {
insertAt := func(i int) {
// Invariant: Nodes always have a free slot.
//
// We split full nodes on the the way down to the leaf. This has the
// advantage that inserts are handled in a single pass.
if leftNode, rightNode, newKey, ok := n.children[i].maybeSplit(opts); ok {
n.keys = append(n.keys[0:i], append([]ngram{newKey}, n.keys[i:]...)...)
n.children = append(n.children[0:i], append([]node{leftNode, rightNode}, n.children[i+1:]...)...)
// A split might shift the target index by 1.
if ng >= n.keys[i] {
i++
}
}
n.children[i].insert(ng, opts)
}
for i, k := range n.keys {
if ng < k {
insertAt(i)
return
}
}
insertAt(len(n.children) - 1)
}
// See btree.find
func (n *innerNode) find(ng ngram) (int, int) {
for i, k := range n.keys {
if ng < k {
return n.children[i].find(ng)
}
}
return n.children[len(n.children)-1].find(ng)
}
// See btree.find
func (n *leaf) find(ng ngram) (int, int) {
return int(n.bucketIndex), int(n.postingIndexOffset)
}
func (n *leaf) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
if n.bucketSize < opts.bucketSize {
return
}
return &leaf{bucketSize: opts.bucketSize / 2},
&leaf{bucketSize: opts.bucketSize / 2},
n.splitKey,
true
}
func (n *innerNode) maybeSplit(opts btreeOpts) (left node, right node, newKey ngram, ok bool) {
if len(n.children) < 2*opts.v {
return
}
return &innerNode{
keys: append(make([]ngram, 0, opts.v-1), n.keys[0:opts.v-1]...),
children: append(make([]node, 0, opts.v), n.children[:opts.v]...),
},
&innerNode{
keys: append(make([]ngram, 0, (2*opts.v)-1), n.keys[opts.v:]...),
children: append(make([]node, 0, 2*opts.v), n.children[opts.v:]...),
},
n.keys[opts.v-1],
true
}
func (n *leaf) visit(f func(n node)) {
f(n)
return
}
func (n *innerNode) visit(f func(n node)) {
f(n)
for _, child := range n.children {
child.visit(f)
}
}
func (bt *btree) String() string {
s := ""
s += fmt.Sprintf("%+v", bt.opts)
bt.root.visit(func(n node) {
switch nd := n.(type) {
case *leaf:
return
case *innerNode:
s += fmt.Sprintf("[")
for _, key := range nd.keys {
s += fmt.Sprintf("%d,", key)
}
s = s[:len(s)-1] // remove trailing comma
s += fmt.Sprintf("]")
}
})
return s
}
type btreeIndex struct {
bt *btree
// We need the index to read buckets into memory.
file IndexFile
// buckets
ngramSec simpleSection
postingIndex simpleSection
}
// SizeBytes returns how much memory this structure uses in the heap.
func (b btreeIndex) SizeBytes() (sz int) {
// btree
if b.bt != nil {
sz += int(pointerSize) + b.bt.sizeBytes()
}
// ngramSec
sz += 8
// postingIndex
sz += 8
// postingDataSentinelOffset
sz += 4
return
}
// Get returns the simple section of the posting list associated with the
// ngram. The logic is as follows:
// 1. Search the inner nodes to find the bucket that may contain ng (in MEM)
// 2. Read the bucket from disk (1 disk access)
// 3. Binary search the bucket (in MEM)
// 4. Return the simple section pointing to the posting list (in MEM)
func (b btreeIndex) Get(ng ngram) (ss simpleSection) {
if b.bt == nil {
return simpleSection{}
}
// find bucket
bucketIndex, postingIndexOffset := b.bt.find(ng)
// read bucket into memory
off, sz := b.getBucket(bucketIndex)
bucket, err := b.file.Read(off, sz)
if err != nil {
return simpleSection{}
}
// find ngram in bucket
getNGram := func(i int) ngram {
i *= ngramEncoding
return ngram(binary.BigEndian.Uint64(bucket[i : i+ngramEncoding]))
}
bucketSize := len(bucket) / ngramEncoding
x := sort.Search(bucketSize, func(i int) bool {
return ng <= getNGram(i)
})
// return associated posting list
if x >= bucketSize || getNGram(x) != ng {
return simpleSection{}
}
return b.getPostingList(postingIndexOffset + x)
}
// getPostingList returns the simple section pointing to the posting list of
// the ngram at ngramIndex.
//
// Assumming we don't hit a page boundary, which should be rare given that we
// only read 8 bytes, we need 1 disk access to read the posting offset.
func (b btreeIndex) getPostingList(ngramIndex int) simpleSection {
relativeOffsetBytes := uint32(ngramIndex) * 4
if relativeOffsetBytes+8 <= b.postingIndex.sz {
// read 2 offsets
o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 8)
if err != nil {
return simpleSection{}
}
start := binary.BigEndian.Uint32(o[0:4])
end := binary.BigEndian.Uint32(o[4:8])
return simpleSection{
off: start,
sz: end - start,
}
} else {
// last ngram => read 1 offset and calculate the size of the posting
// list from the offset of index section.
o, err := b.file.Read(b.postingIndex.off+relativeOffsetBytes, 4)
if err != nil {
return simpleSection{}
}
start := binary.BigEndian.Uint32(o[0:4])
return simpleSection{
off: start,
// The layout of the posting list compound section on disk is
//
// start b.postingIndex.off
// v v
// [[posting lists (simple section)][index (simple section)]]
// <---------->
// last posting list
//
sz: b.postingIndex.off - start,
}
}
}
func (b btreeIndex) getBucket(bucketIndex int) (off uint32, sz uint32) {
// All but the rightmost bucket have exactly bucketSize/2 ngrams
sz = uint32(b.bt.opts.bucketSize / 2 * ngramEncoding)
off = b.ngramSec.off + uint32(bucketIndex)*sz
// Rightmost bucket has size upto the end of the ngramSec.
if bucketIndex == b.bt.lastBucketIndex {
sz = b.ngramSec.off + b.ngramSec.sz - off
}
return
}
// DumpMap is a debug method which returns the btree as an in-memory
// representation. This is how zoekt represents the ngram index in
// google/zoekt.
func (b btreeIndex) DumpMap() map[ngram]simpleSection {
if b.bt == nil {
return nil
}
m := make(map[ngram]simpleSection, b.ngramSec.sz/ngramEncoding)
b.bt.visit(func(no node) {
switch n := no.(type) {
case *leaf:
// read bucket into memory
off, sz := b.getBucket(n.bucketIndex)
bucket, _ := b.file.Read(off, sz)
// decode all ngrams in the bucket and fill map
for i := 0; i < len(bucket)/ngramEncoding; i++ {
gram := ngram(binary.BigEndian.Uint64(bucket[i*8:]))
m[gram] = b.getPostingList(int(n.postingIndexOffset) + i)
}
case *innerNode:
return
}
})
return m
}