Skip to main content

tantivy/space_usage/
mod.rs

//! Representations for the space usage of various parts of a Tantivy index.
//!
//! This can be used programmatically, and will also be exposed in a human-readable fashion in
//! tantivy-cli.
//!
//! One important caveat for all of this functionality is that none of it currently takes
//! storage-level details into consideration. For example, if your file system block size is 4096
//! bytes, we can under-count the actual resulting space usage by up to 4095 bytes per file.
9
10use std::collections::btree_map::Entry;
11use std::collections::BTreeMap;
12
13use columnar::ColumnSpaceUsage;
14use common::ByteCount;
15use serde::{Deserialize, Serialize};
16
17use crate::index::SegmentComponent;
18
19/// Enum containing any of the possible space usage results for segment components.
20pub enum ComponentSpaceUsage {
21    /// Data is stored per field in a uniform way
22    PerField(PerFieldSpaceUsage),
23    /// Data is stored in separate pieces in the store
24    Store(StoreSpaceUsage),
25    /// Some sort of raw byte count
26    Basic(ByteCount),
27}
28
/// Represents combined space usage of an entire searcher and its component segments.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SearcherSpaceUsage {
    /// Space usage of each segment, in the order the segments were added with `add_segment`.
    segments: Vec<SegmentSpaceUsage>,
    /// Running sum of the `total()` of every segment stored in `segments`.
    total: ByteCount,
}
35
36impl SearcherSpaceUsage {
37    pub(crate) fn new() -> SearcherSpaceUsage {
38        SearcherSpaceUsage {
39            segments: Vec::new(),
40            total: Default::default(),
41        }
42    }
43
44    /// Add a segment, to `self`.
45    /// Performs no deduplication or other intelligence.
46    pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
47        self.total += segment.total();
48        self.segments.push(segment);
49    }
50
51    /// Per segment space usage
52    pub fn segments(&self) -> &[SegmentSpaceUsage] {
53        &self.segments[..]
54    }
55
56    /// Returns total byte usage of this searcher, including all large subcomponents.
57    /// Does not account for smaller things like `meta.json`.
58    pub fn total(&self) -> ByteCount {
59        self.total
60    }
61}
62
/// Represents combined space usage for all of the large components comprising a segment.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SegmentSpaceUsage {
    /// Number of documents in the segment, as passed to `new`.
    num_docs: u32,

    /// Per-field space usage of the term dictionary.
    termdict: PerFieldSpaceUsage,
    /// Per-field space usage of the postings lists.
    postings: PerFieldSpaceUsage,
    /// Per-field space usage of the positions.
    positions: PerFieldSpaceUsage,
    /// Per-field space usage of the fast fields.
    fast_fields: PerFieldSpaceUsage,
    /// Per-field space usage of the field norms.
    fieldnorms: PerFieldSpaceUsage,

    /// Space usage of the stored documents.
    store: StoreSpaceUsage,

    /// Space usage of the document deletions.
    deletes: ByteCount,

    /// Sum of the totals of all components above plus `deletes`, computed once in `new`.
    total: ByteCount,
}
80
81impl SegmentSpaceUsage {
82    #[expect(clippy::too_many_arguments)]
83    pub(crate) fn new(
84        num_docs: u32,
85        termdict: PerFieldSpaceUsage,
86        postings: PerFieldSpaceUsage,
87        positions: PerFieldSpaceUsage,
88        fast_fields: PerFieldSpaceUsage,
89        fieldnorms: PerFieldSpaceUsage,
90        store: StoreSpaceUsage,
91        deletes: ByteCount,
92    ) -> SegmentSpaceUsage {
93        let total = termdict.total()
94            + postings.total()
95            + positions.total()
96            + fast_fields.total()
97            + fieldnorms.total()
98            + store.total()
99            + deletes;
100        SegmentSpaceUsage {
101            num_docs,
102            termdict,
103            postings,
104            positions,
105            fast_fields,
106            fieldnorms,
107            store,
108            deletes,
109            total,
110        }
111    }
112
113    /// Space usage for the given component
114    ///
115    /// Clones the underlying data.
116    /// Use the components directly if this is somehow in performance critical code.
117    pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
118        use self::ComponentSpaceUsage::*;
119        use crate::index::SegmentComponent::*;
120        match component {
121            Postings => PerField(self.postings().clone()),
122            Positions => PerField(self.positions().clone()),
123            FastFields => PerField(self.fast_fields().clone()),
124            FieldNorms => PerField(self.fieldnorms().clone()),
125            Terms => PerField(self.termdict().clone()),
126            SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
127            Delete => Basic(self.deletes()),
128        }
129    }
130
131    /// Num docs in segment
132    pub fn num_docs(&self) -> u32 {
133        self.num_docs
134    }
135
136    /// Space usage for term dictionary
137    pub fn termdict(&self) -> &PerFieldSpaceUsage {
138        &self.termdict
139    }
140
141    /// Space usage for postings list
142    pub fn postings(&self) -> &PerFieldSpaceUsage {
143        &self.postings
144    }
145
146    /// Space usage for positions
147    pub fn positions(&self) -> &PerFieldSpaceUsage {
148        &self.positions
149    }
150
151    /// Space usage for fast fields
152    pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
153        &self.fast_fields
154    }
155
156    /// Space usage for field norms
157    pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
158        &self.fieldnorms
159    }
160
161    /// Space usage for stored documents
162    pub fn store(&self) -> &StoreSpaceUsage {
163        &self.store
164    }
165
166    /// Space usage for document deletions
167    pub fn deletes(&self) -> ByteCount {
168        self.deletes
169    }
170
171    /// Total space usage in bytes for this segment.
172    pub fn total(&self) -> ByteCount {
173        self.total
174    }
175}
176
/// Represents space usage for the Store for this segment.
///
/// This is composed of two parts.
/// `data` represents the compressed data itself.
/// `offsets` represents a lookup to find the start of a block
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct StoreSpaceUsage {
    /// Bytes used by the data part of the store.
    data: ByteCount,
    /// Bytes used by the offsets part of the store.
    offsets: ByteCount,
}
187
188impl StoreSpaceUsage {
189    pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
190        StoreSpaceUsage { data, offsets }
191    }
192
193    /// Space usage for the data part of the store
194    pub fn data_usage(&self) -> ByteCount {
195        self.data
196    }
197
198    /// Space usage for the offsets part of the store (doc ID -> offset)
199    pub fn offsets_usage(&self) -> ByteCount {
200        self.offsets
201    }
202
203    /// Total space usage in bytes for this Store
204    pub fn total(&self) -> ByteCount {
205        self.data + self.offsets
206    }
207}
208
/// Represents space usage for all of the (field, index) pairs that appear in a `CompositeFile`.
///
/// A field can appear with a single index (typically 0) or with multiple indexes.
/// Multiple indexes are used to handle variable length things, where a single field is composed
/// of more than one piece; see [`FieldUsage`] for the per-piece breakdown.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PerFieldSpaceUsage {
    /// Usage per field, keyed by field name. Usages sharing a field name are merged into a
    /// single entry when this struct is built.
    fields: BTreeMap<String, FieldUsage>,
    /// Sum of the totals of every `FieldUsage` this struct was built from.
    total: ByteCount,
}
218
219impl PerFieldSpaceUsage {
220    pub(crate) fn new(fields: Vec<FieldUsage>) -> PerFieldSpaceUsage {
221        let mut total = ByteCount::default();
222        let mut field_usage_map: BTreeMap<String, FieldUsage> = BTreeMap::new();
223        for field_usage in fields {
224            total += field_usage.total();
225            let field_name = field_usage.field_name().to_string();
226            match field_usage_map.entry(field_name) {
227                Entry::Vacant(entry) => {
228                    entry.insert(field_usage);
229                }
230                Entry::Occupied(mut entry) => {
231                    entry.get_mut().merge(field_usage);
232                }
233            }
234        }
235        PerFieldSpaceUsage {
236            fields: field_usage_map,
237            total,
238        }
239    }
240
241    /// Per field space usage
242    pub fn fields(&self) -> impl Iterator<Item = &FieldUsage> {
243        self.fields.values()
244    }
245
246    /// Bytes used by the represented file
247    pub fn total(&self) -> ByteCount {
248        self.total
249    }
250}
251
/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
/// comprise it.
///
/// See documentation for [`PerFieldSpaceUsage`] for slightly more information.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FieldUsage {
    /// Name of the field this usage belongs to.
    field_name: String,
    /// Total bytes recorded for this field: every piece in `sub_num_bytes` plus the column
    /// usage, when one has been set.
    num_bytes: ByteCount,
    /// A field can be composed of more than one piece.
    /// These pieces are indexed by arbitrary numbers starting at zero.
    /// `self.num_bytes` includes all of `self.sub_num_bytes`.
    /// An entry is `None` when no size has been recorded for that index.
    sub_num_bytes: Vec<Option<ByteCount>>,
    /// Space usage of the column for fast fields, if relevant.
    column_space_usage: Option<ColumnSpaceUsage>,
}
267
268impl FieldUsage {
269    pub(crate) fn empty(field_name: impl Into<String>) -> FieldUsage {
270        FieldUsage {
271            field_name: field_name.into(),
272            num_bytes: Default::default(),
273            sub_num_bytes: Vec::new(),
274            column_space_usage: None,
275        }
276    }
277
278    pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
279        if self.sub_num_bytes.len() < idx + 1 {
280            self.sub_num_bytes.resize(idx + 1, None);
281        }
282        assert!(self.sub_num_bytes[idx].is_none());
283        self.sub_num_bytes[idx] = Some(size);
284        self.num_bytes += size
285    }
286
287    pub(crate) fn set_column_usage(&mut self, column_space_usage: ColumnSpaceUsage) {
288        self.num_bytes += column_space_usage.total_num_bytes();
289        self.column_space_usage = Some(column_space_usage);
290    }
291
292    /// Field
293    pub fn field_name(&self) -> &str {
294        &self.field_name
295    }
296
297    /// Space usage for each index
298    pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
299        &self.sub_num_bytes[..]
300    }
301
302    /// Returns the number of bytes used by the column payload, if the field is columnar.
303    pub fn column_num_bytes(&self) -> Option<ByteCount> {
304        self.column_space_usage
305            .as_ref()
306            .map(ColumnSpaceUsage::column_num_bytes)
307    }
308
309    /// Returns the number of bytes used by the dictionary for dictionary-encoded columns.
310    pub fn dictionary_num_bytes(&self) -> Option<ByteCount> {
311        self.column_space_usage
312            .as_ref()
313            .and_then(ColumnSpaceUsage::dictionary_num_bytes)
314    }
315
316    /// Returns the space usage of the column, if any.
317    pub fn column_space_usage(&self) -> Option<&ColumnSpaceUsage> {
318        self.column_space_usage.as_ref()
319    }
320
321    /// Total bytes used for this field in this context
322    pub fn total(&self) -> ByteCount {
323        self.num_bytes
324    }
325
326    fn merge(&mut self, other: FieldUsage) {
327        assert_eq!(self.field_name, other.field_name);
328        self.num_bytes += other.num_bytes;
329        if other.sub_num_bytes.len() > self.sub_num_bytes.len() {
330            self.sub_num_bytes.resize(other.sub_num_bytes.len(), None);
331        }
332        for (idx, num_bytes_opt) in other.sub_num_bytes.into_iter().enumerate() {
333            if let Some(num_bytes) = num_bytes_opt {
334                match self.sub_num_bytes[idx] {
335                    Some(existing) => self.sub_num_bytes[idx] = Some(existing + num_bytes),
336                    None => self.sub_num_bytes[idx] = Some(num_bytes),
337                }
338            }
339        }
340        self.column_space_usage =
341            merge_column_space_usage(self.column_space_usage.take(), other.column_space_usage);
342    }
343}
344
345fn merge_column_space_usage(
346    left: Option<ColumnSpaceUsage>,
347    right: Option<ColumnSpaceUsage>,
348) -> Option<ColumnSpaceUsage> {
349    match (left, right) {
350        (Some(lhs), Some(rhs)) => Some(lhs.merge(&rhs)),
351        (Some(space), None) | (None, Some(space)) => Some(space),
352        (None, None) => None,
353    }
354}
355
356#[cfg(test)]
357mod test {
358    use crate::index::Index;
359    use crate::schema::{Schema, FAST, INDEXED, STORED, TEXT};
360    use crate::space_usage::PerFieldSpaceUsage;
361    use crate::{IndexWriter, Term};
362
363    #[test]
364    fn test_empty() {
365        let schema = Schema::builder().build();
366        let index = Index::create_in_ram(schema);
367        let reader = index.reader().unwrap();
368        let searcher = reader.searcher();
369        let searcher_space_usage = searcher.space_usage().unwrap();
370        assert_eq!(searcher_space_usage.total(), 0u64);
371    }
372
373    fn expect_single_field(
374        field_space: &PerFieldSpaceUsage,
375        field: &str,
376        min_size: u64,
377        max_size: u64,
378    ) {
379        assert!(field_space.total() >= min_size);
380        assert!(field_space.total() <= max_size);
381        assert_eq!(
382            vec![(field.to_string(), field_space.total())],
383            field_space
384                .fields()
385                .map(|usage| (usage.field_name().to_string(), usage.total()))
386                .collect::<Vec<_>>()
387        );
388    }
389
390    #[test]
391    fn test_fast_indexed() -> crate::Result<()> {
392        let mut schema_builder = Schema::builder();
393        let name = schema_builder.add_u64_field("name", FAST | INDEXED);
394        let schema = schema_builder.build();
395        let field_name = schema.get_field_name(name).to_string();
396        let index = Index::create_in_ram(schema);
397
398        {
399            let mut index_writer = index.writer_for_tests()?;
400            index_writer.add_document(doc!(name => 1u64))?;
401            index_writer.add_document(doc!(name => 2u64))?;
402            index_writer.add_document(doc!(name => 10u64))?;
403            index_writer.add_document(doc!(name => 20u64))?;
404            index_writer.commit()?;
405        }
406
407        let reader = index.reader()?;
408        let searcher = reader.searcher();
409        let searcher_space_usage = searcher.space_usage()?;
410        assert!(searcher_space_usage.total() > 0);
411        assert_eq!(1, searcher_space_usage.segments().len());
412
413        let segment = &searcher_space_usage.segments()[0];
414        assert!(segment.total() > 0);
415
416        assert_eq!(4, segment.num_docs());
417
418        expect_single_field(segment.termdict(), &field_name, 1, 512);
419        expect_single_field(segment.postings(), &field_name, 1, 512);
420        assert_eq!(segment.positions().total(), 0);
421        expect_single_field(segment.fast_fields(), &field_name, 1, 512);
422        expect_single_field(segment.fieldnorms(), &field_name, 1, 512);
423        // TODO: understand why the following fails
424        //        assert_eq!(0, segment.store().total());
425        assert_eq!(segment.deletes(), 0);
426        Ok(())
427    }
428
429    #[test]
430    fn test_text() -> crate::Result<()> {
431        let mut schema_builder = Schema::builder();
432        let name = schema_builder.add_text_field("name", TEXT);
433        let schema = schema_builder.build();
434        let field_name = schema.get_field_name(name).to_string();
435        let index = Index::create_in_ram(schema);
436
437        {
438            let mut index_writer = index.writer_for_tests()?;
439            index_writer.add_document(doc!(name => "hi"))?;
440            index_writer.add_document(doc!(name => "this is a test"))?;
441            index_writer.add_document(
442                doc!(name => "some more documents with some word overlap with the other test"),
443            )?;
444            index_writer.add_document(doc!(name => "hello hi goodbye"))?;
445            index_writer.commit()?;
446        }
447
448        let reader = index.reader()?;
449        let searcher = reader.searcher();
450        let searcher_space_usage = searcher.space_usage()?;
451        assert!(searcher_space_usage.total() > 0);
452        assert_eq!(1, searcher_space_usage.segments().len());
453
454        let segment = &searcher_space_usage.segments()[0];
455        assert!(segment.total() > 0);
456
457        assert_eq!(4, segment.num_docs());
458
459        expect_single_field(segment.termdict(), &field_name, 1, 512);
460        expect_single_field(segment.postings(), &field_name, 1, 512);
461        expect_single_field(segment.positions(), &field_name, 1, 512);
462        assert_eq!(segment.fast_fields().total(), 0);
463        expect_single_field(segment.fieldnorms(), &field_name, 1, 512);
464        // TODO: understand why the following fails
465        //        assert_eq!(0, segment.store().total());
466        assert_eq!(segment.deletes(), 0);
467        Ok(())
468    }
469
470    #[test]
471    fn test_store() -> crate::Result<()> {
472        let mut schema_builder = Schema::builder();
473        let name = schema_builder.add_text_field("name", STORED);
474        let schema = schema_builder.build();
475        let index = Index::create_in_ram(schema);
476
477        {
478            let mut index_writer = index.writer_for_tests()?;
479            index_writer.add_document(doc!(name => "hi"))?;
480            index_writer.add_document(doc!(name => "this is a test"))?;
481            index_writer.add_document(
482                doc!(name => "some more documents with some word overlap with the other test"),
483            )?;
484            index_writer.add_document(doc!(name => "hello hi goodbye"))?;
485            index_writer.commit()?;
486        }
487        let reader = index.reader()?;
488        let searcher = reader.searcher();
489        let searcher_space_usage = searcher.space_usage()?;
490        assert!(searcher_space_usage.total() > 0);
491        assert_eq!(1, searcher_space_usage.segments().len());
492
493        let segment = &searcher_space_usage.segments()[0];
494        assert!(segment.total() > 0);
495
496        assert_eq!(4, segment.num_docs());
497
498        assert_eq!(segment.termdict().total(), 0);
499        assert!(segment.termdict().fields().next().is_none());
500        assert_eq!(segment.postings().total(), 0);
501        assert!(segment.postings().fields().next().is_none());
502        assert_eq!(segment.positions().total(), 0);
503        assert!(segment.positions().fields().next().is_none());
504        assert_eq!(segment.fast_fields().total(), 0);
505        assert!(segment.fast_fields().fields().next().is_none());
506        assert_eq!(segment.fieldnorms().total(), 0);
507        assert!(segment.fieldnorms().fields().next().is_none());
508        assert!(segment.store().total() > 0);
509        assert!(segment.store().total() < 512);
510        assert_eq!(segment.deletes(), 0);
511        Ok(())
512    }
513
514    #[test]
515    fn test_deletes() -> crate::Result<()> {
516        let mut schema_builder = Schema::builder();
517        let name = schema_builder.add_u64_field("name", INDEXED);
518        let schema = schema_builder.build();
519        let field_name = schema.get_field_name(name).to_string();
520        let index = Index::create_in_ram(schema);
521
522        {
523            let mut index_writer: IndexWriter = index.writer_for_tests()?;
524            index_writer.add_document(doc!(name => 1u64))?;
525            index_writer.add_document(doc!(name => 2u64))?;
526            index_writer.add_document(doc!(name => 3u64))?;
527            index_writer.add_document(doc!(name => 4u64))?;
528            index_writer.commit()?;
529        }
530
531        {
532            let mut index_writer2: IndexWriter = index.writer(50_000_000)?;
533            index_writer2.delete_term(Term::from_field_u64(name, 2u64));
534            index_writer2.delete_term(Term::from_field_u64(name, 3u64));
535            // ok, now we should have a deleted doc
536            index_writer2.commit()?;
537        }
538
539        let reader = index.reader()?;
540        let searcher = reader.searcher();
541        let searcher_space_usage = searcher.space_usage()?;
542        assert!(searcher_space_usage.total() > 0);
543        assert_eq!(1, searcher_space_usage.segments().len());
544
545        let segment_space_usage = &searcher_space_usage.segments()[0];
546        assert!(segment_space_usage.total() > 0);
547
548        assert_eq!(2, segment_space_usage.num_docs());
549
550        expect_single_field(segment_space_usage.termdict(), &field_name, 1, 512);
551        expect_single_field(segment_space_usage.postings(), &field_name, 1, 512);
552        assert_eq!(segment_space_usage.positions().total(), 0u64);
553        assert_eq!(segment_space_usage.fast_fields().total(), 0u64);
554        expect_single_field(segment_space_usage.fieldnorms(), &field_name, 1, 512);
555        assert!(segment_space_usage.deletes() > 0);
556        Ok(())
557    }
558}