1use std::collections::btree_map::Entry;
11use std::collections::BTreeMap;
12
13use columnar::ColumnSpaceUsage;
14use common::ByteCount;
15use serde::{Deserialize, Serialize};
16
17use crate::index::SegmentComponent;
18
19pub enum ComponentSpaceUsage {
21 PerField(PerFieldSpaceUsage),
23 Store(StoreSpaceUsage),
25 Basic(ByteCount),
27}
28
/// Space usage aggregated over a collection of segments.
///
/// Holds one [`SegmentSpaceUsage`] per added segment together with a running total.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SearcherSpaceUsage {
    // Per-segment reports, in the order they were added.
    segments: Vec<SegmentSpaceUsage>,
    // Running sum of the totals of all added segments.
    total: ByteCount,
}
35
36impl SearcherSpaceUsage {
37 pub(crate) fn new() -> SearcherSpaceUsage {
38 SearcherSpaceUsage {
39 segments: Vec::new(),
40 total: Default::default(),
41 }
42 }
43
44 pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
47 self.total += segment.total();
48 self.segments.push(segment);
49 }
50
51 pub fn segments(&self) -> &[SegmentSpaceUsage] {
53 &self.segments[..]
54 }
55
56 pub fn total(&self) -> ByteCount {
59 self.total
60 }
61}
62
/// Space usage of a single segment, broken down by component.
///
/// Each component is also reachable through [`SegmentSpaceUsage::component`].
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SegmentSpaceUsage {
    // Number of documents reported for this segment.
    num_docs: u32,

    // Per-field usages; `component` maps these to the `Terms`, `Postings`, `Positions`,
    // `FastFields` and `FieldNorms` segment components respectively.
    termdict: PerFieldSpaceUsage,
    postings: PerFieldSpaceUsage,
    positions: PerFieldSpaceUsage,
    fast_fields: PerFieldSpaceUsage,
    fieldnorms: PerFieldSpaceUsage,

    // Usage of the store (data + offsets).
    store: StoreSpaceUsage,

    // Byte count attributed to deletes.
    deletes: ByteCount,

    // Sum of all of the above, computed once in `new`.
    total: ByteCount,
}
80
81impl SegmentSpaceUsage {
82 #[expect(clippy::too_many_arguments)]
83 pub(crate) fn new(
84 num_docs: u32,
85 termdict: PerFieldSpaceUsage,
86 postings: PerFieldSpaceUsage,
87 positions: PerFieldSpaceUsage,
88 fast_fields: PerFieldSpaceUsage,
89 fieldnorms: PerFieldSpaceUsage,
90 store: StoreSpaceUsage,
91 deletes: ByteCount,
92 ) -> SegmentSpaceUsage {
93 let total = termdict.total()
94 + postings.total()
95 + positions.total()
96 + fast_fields.total()
97 + fieldnorms.total()
98 + store.total()
99 + deletes;
100 SegmentSpaceUsage {
101 num_docs,
102 termdict,
103 postings,
104 positions,
105 fast_fields,
106 fieldnorms,
107 store,
108 deletes,
109 total,
110 }
111 }
112
113 pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
118 use self::ComponentSpaceUsage::*;
119 use crate::index::SegmentComponent::*;
120 match component {
121 Postings => PerField(self.postings().clone()),
122 Positions => PerField(self.positions().clone()),
123 FastFields => PerField(self.fast_fields().clone()),
124 FieldNorms => PerField(self.fieldnorms().clone()),
125 Terms => PerField(self.termdict().clone()),
126 SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
127 Delete => Basic(self.deletes()),
128 }
129 }
130
131 pub fn num_docs(&self) -> u32 {
133 self.num_docs
134 }
135
136 pub fn termdict(&self) -> &PerFieldSpaceUsage {
138 &self.termdict
139 }
140
141 pub fn postings(&self) -> &PerFieldSpaceUsage {
143 &self.postings
144 }
145
146 pub fn positions(&self) -> &PerFieldSpaceUsage {
148 &self.positions
149 }
150
151 pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
153 &self.fast_fields
154 }
155
156 pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
158 &self.fieldnorms
159 }
160
161 pub fn store(&self) -> &StoreSpaceUsage {
163 &self.store
164 }
165
166 pub fn deletes(&self) -> ByteCount {
168 self.deletes
169 }
170
171 pub fn total(&self) -> ByteCount {
173 self.total
174 }
175}
176
/// Space usage of the store, split into its `data` and `offsets` parts.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct StoreSpaceUsage {
    // Byte count of the data part.
    data: ByteCount,
    // Byte count of the offsets part.
    offsets: ByteCount,
}
187
188impl StoreSpaceUsage {
189 pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
190 StoreSpaceUsage { data, offsets }
191 }
192
193 pub fn data_usage(&self) -> ByteCount {
195 self.data
196 }
197
198 pub fn offsets_usage(&self) -> ByteCount {
200 self.offsets
201 }
202
203 pub fn total(&self) -> ByteCount {
205 self.data + self.offsets
206 }
207}
208
/// Space usage of one component, broken down per field.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PerFieldSpaceUsage {
    // Usage keyed by field name; entries sharing a name are merged at construction.
    fields: BTreeMap<String, FieldUsage>,
    // Sum of the totals of every `FieldUsage` passed to `new`.
    total: ByteCount,
}
218
219impl PerFieldSpaceUsage {
220 pub(crate) fn new(fields: Vec<FieldUsage>) -> PerFieldSpaceUsage {
221 let mut total = ByteCount::default();
222 let mut field_usage_map: BTreeMap<String, FieldUsage> = BTreeMap::new();
223 for field_usage in fields {
224 total += field_usage.total();
225 let field_name = field_usage.field_name().to_string();
226 match field_usage_map.entry(field_name) {
227 Entry::Vacant(entry) => {
228 entry.insert(field_usage);
229 }
230 Entry::Occupied(mut entry) => {
231 entry.get_mut().merge(field_usage);
232 }
233 }
234 }
235 PerFieldSpaceUsage {
236 fields: field_usage_map,
237 total,
238 }
239 }
240
241 pub fn fields(&self) -> impl Iterator<Item = &FieldUsage> {
243 self.fields.values()
244 }
245
246 pub fn total(&self) -> ByteCount {
248 self.total
249 }
250}
251
/// Space usage attributed to a single field.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FieldUsage {
    // Name of the field this usage belongs to.
    field_name: String,
    // Running total: every size added through `add_field_idx`, `set_column_usage` or `merge`.
    num_bytes: ByteCount,
    // Sizes recorded per index via `add_field_idx`; `None` where no size was recorded.
    sub_num_bytes: Vec<Option<ByteCount>>,
    // Column-level usage, present only once `set_column_usage` (or a merge) supplied one.
    column_space_usage: Option<ColumnSpaceUsage>,
}
267
268impl FieldUsage {
269 pub(crate) fn empty(field_name: impl Into<String>) -> FieldUsage {
270 FieldUsage {
271 field_name: field_name.into(),
272 num_bytes: Default::default(),
273 sub_num_bytes: Vec::new(),
274 column_space_usage: None,
275 }
276 }
277
278 pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
279 if self.sub_num_bytes.len() < idx + 1 {
280 self.sub_num_bytes.resize(idx + 1, None);
281 }
282 assert!(self.sub_num_bytes[idx].is_none());
283 self.sub_num_bytes[idx] = Some(size);
284 self.num_bytes += size
285 }
286
287 pub(crate) fn set_column_usage(&mut self, column_space_usage: ColumnSpaceUsage) {
288 self.num_bytes += column_space_usage.total_num_bytes();
289 self.column_space_usage = Some(column_space_usage);
290 }
291
292 pub fn field_name(&self) -> &str {
294 &self.field_name
295 }
296
297 pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
299 &self.sub_num_bytes[..]
300 }
301
302 pub fn column_num_bytes(&self) -> Option<ByteCount> {
304 self.column_space_usage
305 .as_ref()
306 .map(ColumnSpaceUsage::column_num_bytes)
307 }
308
309 pub fn dictionary_num_bytes(&self) -> Option<ByteCount> {
311 self.column_space_usage
312 .as_ref()
313 .and_then(ColumnSpaceUsage::dictionary_num_bytes)
314 }
315
316 pub fn column_space_usage(&self) -> Option<&ColumnSpaceUsage> {
318 self.column_space_usage.as_ref()
319 }
320
321 pub fn total(&self) -> ByteCount {
323 self.num_bytes
324 }
325
326 fn merge(&mut self, other: FieldUsage) {
327 assert_eq!(self.field_name, other.field_name);
328 self.num_bytes += other.num_bytes;
329 if other.sub_num_bytes.len() > self.sub_num_bytes.len() {
330 self.sub_num_bytes.resize(other.sub_num_bytes.len(), None);
331 }
332 for (idx, num_bytes_opt) in other.sub_num_bytes.into_iter().enumerate() {
333 if let Some(num_bytes) = num_bytes_opt {
334 match self.sub_num_bytes[idx] {
335 Some(existing) => self.sub_num_bytes[idx] = Some(existing + num_bytes),
336 None => self.sub_num_bytes[idx] = Some(num_bytes),
337 }
338 }
339 }
340 self.column_space_usage =
341 merge_column_space_usage(self.column_space_usage.take(), other.column_space_usage);
342 }
343}
344
345fn merge_column_space_usage(
346 left: Option<ColumnSpaceUsage>,
347 right: Option<ColumnSpaceUsage>,
348) -> Option<ColumnSpaceUsage> {
349 match (left, right) {
350 (Some(lhs), Some(rhs)) => Some(lhs.merge(&rhs)),
351 (Some(space), None) | (None, Some(space)) => Some(space),
352 (None, None) => None,
353 }
354}
355
// Tests that build small in-RAM indexes and check the space usage reported by the searcher.
#[cfg(test)]
mod test {
    use crate::index::Index;
    use crate::schema::{Schema, FAST, INDEXED, STORED, TEXT};
    use crate::space_usage::PerFieldSpaceUsage;
    use crate::{IndexWriter, Term};

    // An index with an empty schema and no documents must report a total of zero.
    #[test]
    fn test_empty() {
        let schema = Schema::builder().build();
        let index = Index::create_in_ram(schema);
        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        let searcher_space_usage = searcher.space_usage().unwrap();
        assert_eq!(searcher_space_usage.total(), 0u64);
    }

    // Asserts that `field_space` reports exactly one field, named `field`, that this field
    // accounts for the whole total, and that the total lies within [min_size, max_size].
    fn expect_single_field(
        field_space: &PerFieldSpaceUsage,
        field: &str,
        min_size: u64,
        max_size: u64,
    ) {
        assert!(field_space.total() >= min_size);
        assert!(field_space.total() <= max_size);
        assert_eq!(
            vec![(field.to_string(), field_space.total())],
            field_space
                .fields()
                .map(|usage| (usage.field_name().to_string(), usage.total()))
                .collect::<Vec<_>>()
        );
    }

    // A FAST | INDEXED u64 field: expects usage in termdict, postings, fast fields and
    // field norms, and none in positions or deletes.
    #[test]
    fn test_fast_indexed() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_u64_field("name", FAST | INDEXED);
        let schema = schema_builder.build();
        let field_name = schema.get_field_name(name).to_string();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer = index.writer_for_tests()?;
            index_writer.add_document(doc!(name => 1u64))?;
            index_writer.add_document(doc!(name => 2u64))?;
            index_writer.add_document(doc!(name => 10u64))?;
            index_writer.add_document(doc!(name => 20u64))?;
            index_writer.commit()?;
        }

        let reader = index.reader()?;
        let searcher = reader.searcher();
        let searcher_space_usage = searcher.space_usage()?;
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &field_name, 1, 512);
        expect_single_field(segment.postings(), &field_name, 1, 512);
        assert_eq!(segment.positions().total(), 0);
        expect_single_field(segment.fast_fields(), &field_name, 1, 512);
        expect_single_field(segment.fieldnorms(), &field_name, 1, 512);
        assert_eq!(segment.deletes(), 0);
        Ok(())
    }

    // A TEXT field: expects usage in termdict, postings, positions and field norms, and
    // none in fast fields or deletes.
    #[test]
    fn test_text() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_text_field("name", TEXT);
        let schema = schema_builder.build();
        let field_name = schema.get_field_name(name).to_string();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer = index.writer_for_tests()?;
            index_writer.add_document(doc!(name => "hi"))?;
            index_writer.add_document(doc!(name => "this is a test"))?;
            index_writer.add_document(
                doc!(name => "some more documents with some word overlap with the other test"),
            )?;
            index_writer.add_document(doc!(name => "hello hi goodbye"))?;
            index_writer.commit()?;
        }

        let reader = index.reader()?;
        let searcher = reader.searcher();
        let searcher_space_usage = searcher.space_usage()?;
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        expect_single_field(segment.termdict(), &field_name, 1, 512);
        expect_single_field(segment.postings(), &field_name, 1, 512);
        expect_single_field(segment.positions(), &field_name, 1, 512);
        assert_eq!(segment.fast_fields().total(), 0);
        expect_single_field(segment.fieldnorms(), &field_name, 1, 512);
        assert_eq!(segment.deletes(), 0);
        Ok(())
    }

    // A STORED-only text field: every per-field component must be empty and only the store
    // reports usage.
    #[test]
    fn test_store() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_text_field("name", STORED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer = index.writer_for_tests()?;
            index_writer.add_document(doc!(name => "hi"))?;
            index_writer.add_document(doc!(name => "this is a test"))?;
            index_writer.add_document(
                doc!(name => "some more documents with some word overlap with the other test"),
            )?;
            index_writer.add_document(doc!(name => "hello hi goodbye"))?;
            index_writer.commit()?;
        }
        let reader = index.reader()?;
        let searcher = reader.searcher();
        let searcher_space_usage = searcher.space_usage()?;
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment = &searcher_space_usage.segments()[0];
        assert!(segment.total() > 0);

        assert_eq!(4, segment.num_docs());

        // Each per-field component must have a zero total and list no fields at all.
        assert_eq!(segment.termdict().total(), 0);
        assert!(segment.termdict().fields().next().is_none());
        assert_eq!(segment.postings().total(), 0);
        assert!(segment.postings().fields().next().is_none());
        assert_eq!(segment.positions().total(), 0);
        assert!(segment.positions().fields().next().is_none());
        assert_eq!(segment.fast_fields().total(), 0);
        assert!(segment.fast_fields().fields().next().is_none());
        assert_eq!(segment.fieldnorms().total(), 0);
        assert!(segment.fieldnorms().fields().next().is_none());
        assert!(segment.store().total() > 0);
        assert!(segment.store().total() < 512);
        assert_eq!(segment.deletes(), 0);
        Ok(())
    }

    // Deleting documents in a second commit must make the deletes component non-empty.
    #[test]
    fn test_deletes() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let name = schema_builder.add_u64_field("name", INDEXED);
        let schema = schema_builder.build();
        let field_name = schema.get_field_name(name).to_string();
        let index = Index::create_in_ram(schema);

        {
            let mut index_writer: IndexWriter = index.writer_for_tests()?;
            index_writer.add_document(doc!(name => 1u64))?;
            index_writer.add_document(doc!(name => 2u64))?;
            index_writer.add_document(doc!(name => 3u64))?;
            index_writer.add_document(doc!(name => 4u64))?;
            index_writer.commit()?;
        }

        // A second writer deletes two of the four documents by term and commits.
        {
            let mut index_writer2: IndexWriter = index.writer(50_000_000)?;
            index_writer2.delete_term(Term::from_field_u64(name, 2u64));
            index_writer2.delete_term(Term::from_field_u64(name, 3u64));
            index_writer2.commit()?;
        }

        let reader = index.reader()?;
        let searcher = reader.searcher();
        let searcher_space_usage = searcher.space_usage()?;
        assert!(searcher_space_usage.total() > 0);
        assert_eq!(1, searcher_space_usage.segments().len());

        let segment_space_usage = &searcher_space_usage.segments()[0];
        assert!(segment_space_usage.total() > 0);

        // Two of the four documents were deleted, and the reported count is expected to be 2.
        assert_eq!(2, segment_space_usage.num_docs());

        expect_single_field(segment_space_usage.termdict(), &field_name, 1, 512);
        expect_single_field(segment_space_usage.postings(), &field_name, 1, 512);
        assert_eq!(segment_space_usage.positions().total(), 0u64);
        assert_eq!(segment_space_usage.fast_fields().total(), 0u64);
        expect_single_field(segment_space_usage.fieldnorms(), &field_name, 1, 512);
        assert!(segment_space_usage.deletes() > 0);
        Ok(())
    }
}