use std::collections::HashMap;
use common::ByteCount;
use serde::{Deserialize, Serialize};
use crate::schema::Field;
use crate::SegmentComponent;
/// Space usage reported for a single segment component.
///
/// This is the return type of `SegmentSpaceUsage::component`; which variant is
/// produced depends on the component that was asked for.
///
/// `Clone` and `Debug` are derived so that the value can be duplicated and
/// printed like the other usage types in this module.
#[derive(Clone, Debug)]
pub enum ComponentSpaceUsage {
    /// Usage broken down field by field.
    PerField(PerFieldSpaceUsage),
    /// Usage of the store, split into its data and offsets parts.
    Store(StoreSpaceUsage),
    /// A single byte count with no further breakdown.
    Basic(ByteCount),
}
/// Space usage of a searcher, made of one entry per segment plus a running total.
///
/// Entries are appended through `add_segment`, which also keeps `total` in sync.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SearcherSpaceUsage {
/// Usage of every segment added so far, in insertion order.
segments: Vec<SegmentSpaceUsage>,
/// Sum of `SegmentSpaceUsage::total()` over all entries of `segments`.
total: ByteCount,
}
impl SearcherSpaceUsage {
pub(crate) fn new() -> SearcherSpaceUsage {
SearcherSpaceUsage {
segments: Vec::new(),
total: Default::default(),
}
}
pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
self.total += segment.total();
self.segments.push(segment);
}
pub fn segments(&self) -> &[SegmentSpaceUsage] {
&self.segments[..]
}
pub fn total(&self) -> ByteCount {
self.total
}
}
/// Space usage of a single segment, broken down by component.
///
/// `total` is computed once in `SegmentSpaceUsage::new` from the other fields.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SegmentSpaceUsage {
/// Document count handed to `SegmentSpaceUsage::new`.
num_docs: u32,
/// Per-field usage returned by `component` for `SegmentComponent::Terms`.
termdict: PerFieldSpaceUsage,
/// Per-field usage returned by `component` for `SegmentComponent::Postings`.
postings: PerFieldSpaceUsage,
/// Per-field usage returned by `component` for `SegmentComponent::Positions`.
positions: PerFieldSpaceUsage,
/// Per-field usage returned by `component` for `SegmentComponent::FastFields`.
fast_fields: PerFieldSpaceUsage,
/// Per-field usage returned by `component` for `SegmentComponent::FieldNorms`.
fieldnorms: PerFieldSpaceUsage,
/// Store usage; returned by `component` for both `SegmentComponent::Store`
/// and `SegmentComponent::TempStore`.
store: StoreSpaceUsage,
/// Byte count returned by `component` for `SegmentComponent::Delete`.
deletes: ByteCount,
/// Sum of the totals of all components above.
total: ByteCount,
}
impl SegmentSpaceUsage {
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
num_docs: u32,
termdict: PerFieldSpaceUsage,
postings: PerFieldSpaceUsage,
positions: PerFieldSpaceUsage,
fast_fields: PerFieldSpaceUsage,
fieldnorms: PerFieldSpaceUsage,
store: StoreSpaceUsage,
deletes: ByteCount,
) -> SegmentSpaceUsage {
let total = termdict.total()
+ postings.total()
+ positions.total()
+ fast_fields.total()
+ fieldnorms.total()
+ store.total()
+ deletes;
SegmentSpaceUsage {
num_docs,
termdict,
postings,
positions,
fast_fields,
fieldnorms,
store,
deletes,
total,
}
}
pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
use self::ComponentSpaceUsage::*;
use crate::SegmentComponent::*;
match component {
Postings => PerField(self.postings().clone()),
Positions => PerField(self.positions().clone()),
FastFields => PerField(self.fast_fields().clone()),
FieldNorms => PerField(self.fieldnorms().clone()),
Terms => PerField(self.termdict().clone()),
SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
SegmentComponent::TempStore => ComponentSpaceUsage::Store(self.store().clone()),
Delete => Basic(self.deletes()),
}
}
pub fn num_docs(&self) -> u32 {
self.num_docs
}
pub fn termdict(&self) -> &PerFieldSpaceUsage {
&self.termdict
}
pub fn postings(&self) -> &PerFieldSpaceUsage {
&self.postings
}
pub fn positions(&self) -> &PerFieldSpaceUsage {
&self.positions
}
pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
&self.fast_fields
}
pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
&self.fieldnorms
}
pub fn store(&self) -> &StoreSpaceUsage {
&self.store
}
pub fn deletes(&self) -> ByteCount {
self.deletes
}
pub fn total(&self) -> ByteCount {
self.total
}
}
/// Space usage of the store, split into two parts whose sum is the store total.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct StoreSpaceUsage {
/// Byte count exposed by `data_usage()`.
data: ByteCount,
/// Byte count exposed by `offsets_usage()`.
offsets: ByteCount,
}
impl StoreSpaceUsage {
pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
StoreSpaceUsage { data, offsets }
}
pub fn data_usage(&self) -> ByteCount {
self.data
}
pub fn offsets_usage(&self) -> ByteCount {
self.offsets
}
pub fn total(&self) -> ByteCount {
self.data + self.offsets
}
}
/// Space usage of one component, broken down per field.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PerFieldSpaceUsage {
/// Usage entries keyed by the field they report on (`FieldUsage::field()`).
fields: HashMap<Field, FieldUsage>,
/// Sum of `FieldUsage::total()` over the entries handed to `new`.
total: ByteCount,
}
impl PerFieldSpaceUsage {
pub(crate) fn new(fields: Vec<FieldUsage>) -> PerFieldSpaceUsage {
let total = fields.iter().map(FieldUsage::total).sum();
let field_usage_map: HashMap<Field, FieldUsage> = fields
.into_iter()
.map(|field_usage| (field_usage.field(), field_usage))
.collect();
PerFieldSpaceUsage {
fields: field_usage_map,
total,
}
}
pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
self.fields.iter()
}
pub fn total(&self) -> ByteCount {
self.total
}
}
/// Space usage of a single field, with an optional per-index breakdown.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FieldUsage {
/// The field this usage belongs to.
field: Field,
/// Running sum of every size recorded through `add_field_idx`.
num_bytes: ByteCount,
/// Size recorded for each index via `add_field_idx`; `None` marks an index
/// below the highest recorded one that has not been recorded itself.
sub_num_bytes: Vec<Option<ByteCount>>,
}
impl FieldUsage {
pub(crate) fn empty(field: Field) -> FieldUsage {
FieldUsage {
field,
num_bytes: Default::default(),
sub_num_bytes: Vec::new(),
}
}
pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
if self.sub_num_bytes.len() < idx + 1 {
self.sub_num_bytes.resize(idx + 1, None);
}
assert!(self.sub_num_bytes[idx].is_none());
self.sub_num_bytes[idx] = Some(size);
self.num_bytes += size
}
pub fn field(&self) -> Field {
self.field
}
pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
&self.sub_num_bytes[..]
}
pub fn total(&self) -> ByteCount {
self.num_bytes
}
}
#[cfg(test)]
mod test {
use crate::index::Index;
use crate::schema::{Field, Schema, FAST, INDEXED, STORED, TEXT};
use crate::space_usage::PerFieldSpaceUsage;
use crate::{IndexWriter, Term};
// An index with an empty schema and no documents must report zero bytes.
#[test]
fn test_empty() {
    let index = Index::create_in_ram(Schema::builder().build());
    let reader = index.reader().unwrap();
    let usage = reader.searcher().space_usage().unwrap();
    assert_eq!(usage.total(), 0u64);
}
// Checks that `field_space` holds exactly one entry, for `field`, and that its
// total lies within `min_size..=max_size`.
fn expect_single_field(
    field_space: &PerFieldSpaceUsage,
    field: &Field,
    min_size: u64,
    max_size: u64,
) {
    assert!(field_space.total() >= min_size);
    assert!(field_space.total() <= max_size);

    // The only entry must be `field`, and it must account for the whole total.
    let expected = vec![(field, field_space.total())];
    let observed: Vec<_> = field_space
        .fields()
        .map(|(usage_field, usage)| (usage_field, usage.total()))
        .collect();
    assert_eq!(expected, observed);
}
// A u64 field that is both FAST and INDEXED: term dictionary, postings, fast
// fields and field norms are expected to report usage for that field, while
// positions and deletes stay at zero.
#[test]
fn test_fast_indexed() -> crate::Result<()> {
    let mut builder = Schema::builder();
    let name = builder.add_u64_field("name", FAST | INDEXED);
    let index = Index::create_in_ram(builder.build());

    {
        let mut writer = index.writer_for_tests()?;
        for value in [1u64, 2u64, 10u64, 20u64] {
            writer.add_document(doc!(name => value))?;
        }
        writer.commit()?;
    }

    let reader = index.reader()?;
    let usage = reader.searcher().space_usage()?;
    assert!(usage.total() > 0);
    assert_eq!(1, usage.segments().len());

    let segment = &usage.segments()[0];
    assert!(segment.total() > 0);
    assert_eq!(4, segment.num_docs());

    for per_field in [segment.termdict(), segment.postings()] {
        expect_single_field(per_field, &name, 1, 512);
    }
    assert_eq!(segment.positions().total(), 0);
    for per_field in [segment.fast_fields(), segment.fieldnorms()] {
        expect_single_field(per_field, &name, 1, 512);
    }
    assert_eq!(segment.deletes(), 0);
    Ok(())
}
// A TEXT field: term dictionary, postings, positions and field norms are
// expected to report usage for that field, while fast fields and deletes stay
// at zero.
#[test]
fn test_text() -> crate::Result<()> {
    let mut builder = Schema::builder();
    let name = builder.add_text_field("name", TEXT);
    let index = Index::create_in_ram(builder.build());

    {
        let mut writer = index.writer_for_tests()?;
        let texts = [
            "hi",
            "this is a test",
            "some more documents with some word overlap with the other test",
            "hello hi goodbye",
        ];
        for text in texts {
            writer.add_document(doc!(name => text))?;
        }
        writer.commit()?;
    }

    let reader = index.reader()?;
    let usage = reader.searcher().space_usage()?;
    assert!(usage.total() > 0);
    assert_eq!(1, usage.segments().len());

    let segment = &usage.segments()[0];
    assert!(segment.total() > 0);
    assert_eq!(4, segment.num_docs());

    for per_field in [segment.termdict(), segment.postings(), segment.positions()] {
        expect_single_field(per_field, &name, 1, 512);
    }
    assert_eq!(segment.fast_fields().total(), 0);
    expect_single_field(segment.fieldnorms(), &name, 1, 512);
    assert_eq!(segment.deletes(), 0);
    Ok(())
}
// A STORED-only text field: every per-field component and deletes must stay at
// zero, and only the store reports usage (more than 0, less than 512).
#[test]
fn test_store() -> crate::Result<()> {
    let mut builder = Schema::builder();
    let name = builder.add_text_field("name", STORED);
    let index = Index::create_in_ram(builder.build());

    {
        let mut writer = index.writer_for_tests()?;
        let texts = [
            "hi",
            "this is a test",
            "some more documents with some word overlap with the other test",
            "hello hi goodbye",
        ];
        for text in texts {
            writer.add_document(doc!(name => text))?;
        }
        writer.commit()?;
    }

    let reader = index.reader()?;
    let usage = reader.searcher().space_usage()?;
    assert!(usage.total() > 0);
    assert_eq!(1, usage.segments().len());

    let segment = &usage.segments()[0];
    assert!(segment.total() > 0);
    assert_eq!(4, segment.num_docs());

    let per_field_components = [
        segment.termdict(),
        segment.postings(),
        segment.positions(),
        segment.fast_fields(),
        segment.fieldnorms(),
    ];
    for per_field in per_field_components {
        assert_eq!(per_field.total(), 0);
    }

    assert!(segment.store().total() > 0);
    assert!(segment.store().total() < 512);
    assert_eq!(segment.deletes(), 0);
    Ok(())
}
// Commits four documents, then deletes two of them with a second writer: the
// segment must report two documents and a non-zero deletes byte count.
#[test]
fn test_deletes() -> crate::Result<()> {
    let mut builder = Schema::builder();
    let name = builder.add_u64_field("name", INDEXED);
    let index = Index::create_in_ram(builder.build());

    {
        let mut writer: IndexWriter = index.writer_for_tests()?;
        for value in [1u64, 2u64, 3u64, 4u64] {
            writer.add_document(doc!(name => value))?;
        }
        writer.commit()?;
    }

    {
        let mut deleting_writer: IndexWriter = index.writer(50_000_000)?;
        for value in [2u64, 3u64] {
            deleting_writer.delete_term(Term::from_field_u64(name, value));
        }
        deleting_writer.commit()?;
    }

    let reader = index.reader()?;
    let usage = reader.searcher().space_usage()?;
    assert!(usage.total() > 0);
    assert_eq!(1, usage.segments().len());

    let segment = &usage.segments()[0];
    assert!(segment.total() > 0);
    assert_eq!(2, segment.num_docs());

    for per_field in [segment.termdict(), segment.postings()] {
        expect_single_field(per_field, &name, 1, 512);
    }
    assert_eq!(segment.positions().total(), 0u64);
    assert_eq!(segment.fast_fields().total(), 0u64);
    expect_single_field(segment.fieldnorms(), &name, 1, 512);
    assert!(segment.deletes() > 0);
    Ok(())
}
}