summavy/space_usage/
mod.rs

1//! Representations for the space usage of various parts of a Tantivy index.
2//!
3//! This can be used programmatically, and will also be exposed in a human readable fashion in
4//! tantivy-cli.
5//!
6//! One important caveat for all of this functionality is that none of it currently takes
7//! storage-level details into consideration. For example, if your file system block size is 4096
8//! bytes, we can under-count actual resultant space usage by up to 4095 bytes per file.
9
10use std::collections::HashMap;
11
12use serde::{Deserialize, Serialize};
13
14use crate::schema::Field;
15use crate::SegmentComponent;
16
/// A size expressed in bytes.
///
/// Plain alias for `usize`, used throughout this module so that signatures say what the
/// number means.
pub type ByteCount = usize;
19
20/// Enum containing any of the possible space usage results for segment components.
21pub enum ComponentSpaceUsage {
22    /// Data is stored per field in a uniform way
23    PerField(PerFieldSpaceUsage),
24    /// Data is stored in separate pieces in the store
25    Store(StoreSpaceUsage),
26    /// Some sort of raw byte count
27    Basic(ByteCount),
28}
29
30/// Represents combined space usage of an entire searcher and its component segments.
31#[derive(Clone, Debug, Serialize, Deserialize)]
32pub struct SearcherSpaceUsage {
33    segments: Vec<SegmentSpaceUsage>,
34    total: ByteCount,
35}
36
37impl SearcherSpaceUsage {
38    pub(crate) fn new() -> SearcherSpaceUsage {
39        SearcherSpaceUsage {
40            segments: Vec::new(),
41            total: 0,
42        }
43    }
44
45    /// Add a segment, to `self`.
46    /// Performs no deduplication or other intelligence.
47    pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
48        self.total += segment.total();
49        self.segments.push(segment);
50    }
51
52    /// Per segment space usage
53    pub fn segments(&self) -> &[SegmentSpaceUsage] {
54        &self.segments[..]
55    }
56
57    /// Returns total byte usage of this searcher, including all large subcomponents.
58    /// Does not account for smaller things like `meta.json`.
59    pub fn total(&self) -> ByteCount {
60        self.total
61    }
62}
63
64/// Represents combined space usage for all of the large components comprising a segment.
65#[derive(Clone, Debug, Serialize, Deserialize)]
66pub struct SegmentSpaceUsage {
67    num_docs: u32,
68
69    termdict: PerFieldSpaceUsage,
70    postings: PerFieldSpaceUsage,
71    positions: PerFieldSpaceUsage,
72    fast_fields: PerFieldSpaceUsage,
73    fieldnorms: PerFieldSpaceUsage,
74
75    store: StoreSpaceUsage,
76
77    deletes: ByteCount,
78
79    total: ByteCount,
80}
81
82impl SegmentSpaceUsage {
83    #[allow(clippy::too_many_arguments)]
84    pub(crate) fn new(
85        num_docs: u32,
86        termdict: PerFieldSpaceUsage,
87        postings: PerFieldSpaceUsage,
88        positions: PerFieldSpaceUsage,
89        fast_fields: PerFieldSpaceUsage,
90        fieldnorms: PerFieldSpaceUsage,
91        store: StoreSpaceUsage,
92        deletes: ByteCount,
93    ) -> SegmentSpaceUsage {
94        let total = termdict.total()
95            + postings.total()
96            + positions.total()
97            + fast_fields.total()
98            + fieldnorms.total()
99            + store.total()
100            + deletes;
101        SegmentSpaceUsage {
102            num_docs,
103            termdict,
104            postings,
105            positions,
106            fast_fields,
107            fieldnorms,
108            store,
109            deletes,
110            total,
111        }
112    }
113
114    /// Space usage for the given component
115    ///
116    /// Clones the underlying data.
117    /// Use the components directly if this is somehow in performance critical code.
118    pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
119        use self::ComponentSpaceUsage::*;
120        use crate::SegmentComponent::*;
121        match component {
122            Postings => PerField(self.postings().clone()),
123            Positions => PerField(self.positions().clone()),
124            FastFields => PerField(self.fast_fields().clone()),
125            FieldNorms => PerField(self.fieldnorms().clone()),
126            Terms => PerField(self.termdict().clone()),
127            SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
128            SegmentComponent::TempStore => ComponentSpaceUsage::Store(self.store().clone()),
129            Delete => Basic(self.deletes()),
130        }
131    }
132
133    /// Num docs in segment
134    pub fn num_docs(&self) -> u32 {
135        self.num_docs
136    }
137
138    /// Space usage for term dictionary
139    pub fn termdict(&self) -> &PerFieldSpaceUsage {
140        &self.termdict
141    }
142
143    /// Space usage for postings list
144    pub fn postings(&self) -> &PerFieldSpaceUsage {
145        &self.postings
146    }
147
148    /// Space usage for positions
149    pub fn positions(&self) -> &PerFieldSpaceUsage {
150        &self.positions
151    }
152
153    /// Space usage for fast fields
154    pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
155        &self.fast_fields
156    }
157
158    /// Space usage for field norms
159    pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
160        &self.fieldnorms
161    }
162
163    /// Space usage for stored documents
164    pub fn store(&self) -> &StoreSpaceUsage {
165        &self.store
166    }
167
168    /// Space usage for document deletions
169    pub fn deletes(&self) -> ByteCount {
170        self.deletes
171    }
172
173    /// Total space usage in bytes for this segment.
174    pub fn total(&self) -> ByteCount {
175        self.total
176    }
177}
178
179/// Represents space usage for the Store for this segment.
180///
181/// This is composed of two parts.
182/// `data` represents the compressed data itself.
183/// `offsets` represents a lookup to find the start of a block
184#[derive(Clone, Debug, Serialize, Deserialize)]
185pub struct StoreSpaceUsage {
186    data: ByteCount,
187    offsets: ByteCount,
188}
189
190impl StoreSpaceUsage {
191    pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
192        StoreSpaceUsage { data, offsets }
193    }
194
195    /// Space usage for the data part of the store
196    pub fn data_usage(&self) -> ByteCount {
197        self.data
198    }
199
200    /// Space usage for the offsets part of the store (doc ID -> offset)
201    pub fn offsets_usage(&self) -> ByteCount {
202        self.offsets
203    }
204
205    /// Total space usage in bytes for this Store
206    pub fn total(&self) -> ByteCount {
207        self.data + self.offsets
208    }
209}
210
211/// Represents space usage for all of the (field, index) pairs that appear in a `CompositeFile`.
212///
213/// A field can appear with a single index (typically 0) or with multiple indexes.
214/// Multiple indexes are used to handle variable length things, where
215#[derive(Clone, Debug, Serialize, Deserialize)]
216pub struct PerFieldSpaceUsage {
217    fields: HashMap<Field, FieldUsage>,
218    total: ByteCount,
219}
220
221impl PerFieldSpaceUsage {
222    pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
223        let total = fields.values().map(FieldUsage::total).sum();
224        PerFieldSpaceUsage { fields, total }
225    }
226
227    /// Per field space usage
228    pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
229        self.fields.iter()
230    }
231
232    /// Bytes used by the represented file
233    pub fn total(&self) -> ByteCount {
234        self.total
235    }
236}
237
238/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
239/// comprise it.
240///
241/// See documentation for [`PerFieldSpaceUsage`] for slightly more information.
242#[derive(Clone, Debug, Serialize, Deserialize)]
243pub struct FieldUsage {
244    field: Field,
245    num_bytes: ByteCount,
246    /// A field can be composed of more than one piece.
247    /// These pieces are indexed by arbitrary numbers starting at zero.
248    /// `self.num_bytes` includes all of `self.sub_num_bytes`.
249    sub_num_bytes: Vec<Option<ByteCount>>,
250}
251
252impl FieldUsage {
253    pub(crate) fn empty(field: Field) -> FieldUsage {
254        FieldUsage {
255            field,
256            num_bytes: 0,
257            sub_num_bytes: Vec::new(),
258        }
259    }
260
261    pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
262        if self.sub_num_bytes.len() < idx + 1 {
263            self.sub_num_bytes.resize(idx + 1, None);
264        }
265        assert!(self.sub_num_bytes[idx].is_none());
266        self.sub_num_bytes[idx] = Some(size);
267        self.num_bytes += size
268    }
269
270    /// Field
271    pub fn field(&self) -> Field {
272        self.field
273    }
274
275    /// Space usage for each index
276    pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
277        &self.sub_num_bytes[..]
278    }
279
280    /// Total bytes used for this field in this context
281    pub fn total(&self) -> ByteCount {
282        self.num_bytes
283    }
284}
285
#[cfg(test)]
mod test {
    use crate::core::Index;
    use crate::schema::{Field, Schema, FAST, INDEXED, STORED, TEXT};
    use crate::space_usage::{ByteCount, PerFieldSpaceUsage, SegmentSpaceUsage};
    use crate::Term;

    /// Opens a searcher on `index`, checks that it reports a non-zero total and exactly one
    /// segment with a non-zero total, and returns a clone of that segment's space usage.
    fn single_segment_usage(index: &Index) -> crate::Result<SegmentSpaceUsage> {
        let reader = index.reader()?;
        let searcher = reader.searcher();
        let searcher_space_usage = searcher.space_usage()?;
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = searcher_space_usage.segments()[0].clone();
        assert!(segment.total() > 0);
        Ok(segment)
    }

    /// Asserts that `field_space` holds exactly one entry, for `field`, and that its total
    /// lies in `min_size..=max_size`.
    fn expect_single_field(
        field_space: &PerFieldSpaceUsage,
        field: &Field,
        min_size: ByteCount,
        max_size: ByteCount,
    ) {
        assert!(field_space.total() >= min_size);
        assert!(field_space.total() <= max_size);
        assert_eq!(
            vec![(field, field_space.total())],
            field_space
                .fields()
                .map(|(x, y)| (x, y.total()))
                .collect::<Vec<_>>()
        );
    }

    /// An index with an empty schema and no documents reports zero bytes.
    #[test]
    fn test_empty() -> crate::Result<()> {
        let schema = Schema::builder().build();
        let index = Index::create_in_ram(schema);
        let reader = index.reader()?;
        let searcher = reader.searcher();
        let searcher_space_usage = searcher.space_usage()?;
        assert_eq!(0, searcher_space_usage.total());
        Ok(())
    }

    /// A `FAST | INDEXED` u64 field shows up everywhere except positions.
    #[test]
    fn test_fast_indexed() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_u64_field("name", FAST | INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer = index.writer_for_tests()?;
            index_writer.add_document(doc!(name => 1u64))?;
            index_writer.add_document(doc!(name => 2u64))?;
            index_writer.add_document(doc!(name => 10u64))?;
            index_writer.add_document(doc!(name => 20u64))?;
            index_writer.commit()?;
        }

        let segment = single_segment_usage(&index)?;

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        assert_eq!(0, segment.positions().total());
        expect_single_field(segment.fast_fields(), &name, 1, 512);
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        //        assert_eq!(0, segment.store().total());
        assert_eq!(0, segment.deletes());
        Ok(())
    }

    /// A `TEXT` field shows up everywhere except fast fields.
    #[test]
    fn test_text() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_text_field("name", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer = index.writer_for_tests()?;
            index_writer.add_document(doc!(name => "hi"))?;
            index_writer.add_document(doc!(name => "this is a test"))?;
            index_writer.add_document(
                doc!(name => "some more documents with some word overlap with the other test"),
            )?;
            index_writer.add_document(doc!(name => "hello hi goodbye"))?;
            index_writer.commit()?;
        }

        let segment = single_segment_usage(&index)?;

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &name, 1, 512);
        expect_single_field(segment.postings(), &name, 1, 512);
        expect_single_field(segment.positions(), &name, 1, 512);
        assert_eq!(0, segment.fast_fields().total());
        expect_single_field(segment.fieldnorms(), &name, 1, 512);
        // TODO: understand why the following fails
        //        assert_eq!(0, segment.store().total());
        assert_eq!(0, segment.deletes());
        Ok(())
    }

    /// A `STORED`-only text field uses space in the store and nowhere else.
    #[test]
    fn test_store() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_text_field("name", STORED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer = index.writer_for_tests()?;
            index_writer.add_document(doc!(name => "hi"))?;
            index_writer.add_document(doc!(name => "this is a test"))?;
            index_writer.add_document(
                doc!(name => "some more documents with some word overlap with the other test"),
            )?;
            index_writer.add_document(doc!(name => "hello hi goodbye"))?;
            index_writer.commit()?;
        }

        let segment = single_segment_usage(&index)?;

        assert_eq!(4, segment.num_docs());

        assert_eq!(0, segment.termdict().total());
        assert_eq!(0, segment.postings().total());
        assert_eq!(0, segment.positions().total());
        assert_eq!(0, segment.fast_fields().total());
        assert_eq!(0, segment.fieldnorms().total());
        assert!(segment.store().total() > 0);
        assert!(segment.store().total() < 512);
        assert_eq!(0, segment.deletes());
        Ok(())
    }

    /// Deleting documents from a committed segment makes `deletes` non-zero and lowers
    /// the reported document count.
    #[test]
    fn test_deletes() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_u64_field("name", INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer = index.writer_for_tests()?;
            index_writer.add_document(doc!(name => 1u64))?;
            index_writer.add_document(doc!(name => 2u64))?;
            index_writer.add_document(doc!(name => 3u64))?;
            index_writer.add_document(doc!(name => 4u64))?;
            index_writer.commit()?;
        }

        {
            let mut index_writer2 = index.writer(50_000_000)?;
            index_writer2.delete_term(Term::from_field_u64(name, 2u64));
            index_writer2.delete_term(Term::from_field_u64(name, 3u64));
            // ok, now we should have a deleted doc
            index_writer2.commit()?;
        }

        let segment_space_usage = single_segment_usage(&index)?;

        assert_eq!(2, segment_space_usage.num_docs());

        expect_single_field(segment_space_usage.termdict(), &name, 1, 512);
        expect_single_field(segment_space_usage.postings(), &name, 1, 512);
        assert_eq!(0, segment_space_usage.positions().total());
        assert_eq!(0, segment_space_usage.fast_fields().total());
        expect_single_field(segment_space_usage.fieldnorms(), &name, 1, 512);
        assert!(segment_space_usage.deletes() > 0);
        Ok(())
    }
}