use super::segment::create_segment;
use super::segment::Segment;
use crate::core::Executor;
use crate::core::IndexMeta;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::core::SegmentMetaInventory;
use crate::core::META_FILEPATH;
use crate::directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use crate::directory::MmapDirectory;
use crate::directory::INDEX_WRITER_LOCK;
use crate::directory::{Directory, RAMDirectory};
use crate::error::DataCorruption;
use crate::error::TantivyError;
use crate::indexer::index_writer::HEAP_SIZE_MIN;
use crate::indexer::segment_updater::save_new_metas;
use crate::reader::IndexReader;
use crate::reader::IndexReaderBuilder;
use crate::schema::Field;
use crate::schema::FieldType;
use crate::schema::Schema;
use crate::tokenizer::BoxedTokenizer;
use crate::tokenizer::TokenizerManager;
use crate::IndexWriter;
use crate::Result;
use num_cpus;
use std::borrow::BorrowMut;
use std::fmt;
#[cfg(feature = "mmap")]
use std::path::Path;
use std::sync::Arc;
fn load_metas(directory: &dyn Directory, inventory: &SegmentMetaInventory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
IndexMeta::deserialize(&meta_string, &inventory)
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.to_path_buf(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
}
#[derive(Clone)]
pub struct Index {
directory: ManagedDirectory,
schema: Schema,
executor: Arc<Executor>,
tokenizers: TokenizerManager,
inventory: SegmentMetaInventory,
}
impl Index {
pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
dir.exists(&META_FILEPATH)
}
pub fn search_executor(&self) -> &Executor {
self.executor.as_ref()
}
pub fn set_multithread_executor(&mut self, num_threads: usize) {
self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-"));
}
pub fn set_default_multithread_executor(&mut self) {
let default_num_threads = num_cpus::get();
self.set_multithread_executor(default_num_threads);
}
pub fn create_in_ram(schema: Schema) -> Index {
let ram_directory = RAMDirectory::create();
Index::create(ram_directory, schema).expect("Creating a RAMDirectory should never fail")
}
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
if Index::exists(&mmap_directory) {
return Err(TantivyError::IndexAlreadyExists);
}
Index::create(mmap_directory, schema)
}
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
if Index::exists(&dir) {
let index = Index::open(dir)?;
if index.schema() == schema {
Ok(index)
} else {
Err(TantivyError::SchemaError(
"An index exists but the schema does not match.".to_string(),
))
}
} else {
Index::create(dir, schema)
}
}
#[cfg(feature = "mmap")]
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::create_from_tempdir()?;
Index::create(mmap_directory, schema)
}
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
let directory = ManagedDirectory::wrap(dir)?;
Index::from_directory(directory, schema)
}
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas, SegmentMetaInventory::default())
}
fn create_from_metas(
directory: ManagedDirectory,
metas: &IndexMeta,
inventory: SegmentMetaInventory,
) -> Result<Index> {
let schema = metas.schema.clone();
Ok(Index {
directory,
schema,
tokenizers: TokenizerManager::default(),
executor: Arc::new(Executor::single_thread()),
inventory,
})
}
pub fn tokenizers(&self) -> &TokenizerManager {
&self.tokenizers
}
pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<dyn BoxedTokenizer>> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers();
let tokenizer_name_opt: Option<Box<dyn BoxedTokenizer>> = match field_type {
FieldType::Str(text_options) => text_options
.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
_ => None,
};
match tokenizer_name_opt {
Some(tokenizer) => Ok(tokenizer),
None => Err(TantivyError::SchemaError(format!(
"{:?} is not a text field.",
field_entry.name()
))),
}
}
pub fn reader(&self) -> Result<IndexReader> {
self.reader_builder().try_into()
}
pub fn reader_builder(&self) -> IndexReaderBuilder {
IndexReaderBuilder::new(self.clone())
}
#[cfg(feature = "mmap")]
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
Index::open(mmap_directory)
}
pub(crate) fn inventory(&self) -> &SegmentMetaInventory {
&self.inventory
}
pub fn open<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::wrap(directory)?;
let inventory = SegmentMetaInventory::default();
let metas = load_metas(&directory, &inventory)?;
Index::create_from_metas(directory, &metas, inventory)
}
pub fn load_metas(&self) -> Result<IndexMeta> {
load_metas(self.directory(), &self.inventory)
}
pub fn writer_with_num_threads(
&self,
num_threads: usize,
overall_heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
let directory_lock = self
.directory
.acquire_lock(&INDEX_WRITER_LOCK)
.map_err(|err| {
TantivyError::LockFailure(
err,
Some(
"Failed to acquire index lock. If you are using\
a regular directory, this means there is already an \
`IndexWriter` working on this `Directory`, in this process \
or in a different process."
.to_string(),
),
)
})?;
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
IndexWriter::new(
self,
num_threads,
heap_size_in_bytes_per_thread,
directory_lock,
)
}
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> Result<IndexWriter> {
let mut num_threads = num_cpus::get();
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
num_threads = (overall_heap_size_in_bytes / HEAP_SIZE_MIN).max(1);
}
self.writer_with_num_threads(num_threads, overall_heap_size_in_bytes)
}
pub fn schema(&self) -> Schema {
self.schema.clone()
}
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
Ok(self
.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect())
}
#[doc(hidden)]
pub fn segment(&self, segment_meta: SegmentMeta) -> Segment {
create_segment(self.clone(), segment_meta)
}
pub fn new_segment(&self) -> Segment {
let segment_meta = self
.inventory
.new_segment_meta(SegmentId::generate_random(), 0);
self.segment(segment_meta)
}
pub fn directory(&self) -> &ManagedDirectory {
&self.directory
}
pub fn directory_mut(&mut self) -> &mut ManagedDirectory {
&mut self.directory
}
pub fn searchable_segment_metas(&self) -> Result<Vec<SegmentMeta>> {
Ok(self.load_metas()?.segments)
}
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
Ok(self
.searchable_segment_metas()?
.iter()
.map(SegmentMeta::id)
.collect())
}
}
impl fmt::Debug for Index {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Index({:?})", self.directory)
}
}
#[cfg(test)]
mod tests {
use crate::directory::RAMDirectory;
use crate::schema::Field;
use crate::schema::{Schema, INDEXED, TEXT};
use crate::Index;
use crate::IndexReader;
use crate::IndexWriter;
use crate::ReloadPolicy;
use std::thread;
use std::time::Duration;
#[test]
fn test_indexer_for_field() {
let mut schema_builder = Schema::builder();
let num_likes_field = schema_builder.add_u64_field("num_likes", INDEXED);
let body_field = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
assert!(index.tokenizer_for_field(body_field).is_ok());
assert_eq!(
format!("{:?}", index.tokenizer_for_field(num_likes_field).err()),
"Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))"
);
}
#[test]
fn test_index_exists() {
let directory = RAMDirectory::create();
assert!(!Index::exists(&directory));
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
}
#[test]
fn open_or_create_should_create() {
let directory = RAMDirectory::create();
assert!(!Index::exists(&directory));
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
}
#[test]
fn open_or_create_should_open() {
let directory = RAMDirectory::create();
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
}
#[test]
fn create_should_wipeoff_existing() {
let directory = RAMDirectory::create();
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
assert!(Index::create(directory.clone(), Schema::builder().build()).is_ok());
}
#[test]
fn open_or_create_exists_but_schema_does_not_match() {
let directory = RAMDirectory::create();
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
let err = Index::open_or_create(directory, Schema::builder().build());
assert_eq!(
format!("{:?}", err.unwrap_err()),
"SchemaError(\"An index exists but the schema does not match.\")"
);
}
fn throw_away_schema() -> Schema {
let mut schema_builder = Schema::builder();
let _ = schema_builder.add_u64_field("num_likes", INDEXED);
schema_builder.build()
}
#[test]
fn test_index_on_commit_reload_policy() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
#[cfg(feature = "mmap")]
mod mmap_specific {
use super::*;
use std::path::PathBuf;
use tempdir::TempDir;
#[test]
fn test_index_on_commit_reload_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
#[test]
fn test_index_manual_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_from_tempdir(schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
thread::sleep(Duration::from_millis(500));
assert_eq!(reader.searcher().num_docs(), 0);
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 1);
}
#[test]
fn test_index_on_commit_reload_policy_different_directories() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
let reader = read_index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
}
fn test_index_on_commit_reload_policy_aux(
field: Field,
writer: &mut IndexWriter,
reader: &IndexReader,
) {
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
let mut count = 0;
for _ in 0..100 {
count = reader.searcher().num_docs();
if count > 0 {
break;
}
thread::sleep(Duration::from_millis(100));
}
assert_eq!(count, 1);
writer.add_document(doc!(field=>2u64));
writer.commit().unwrap();
let mut count = 0;
for _ in 0..10 {
count = reader.searcher().num_docs();
if count > 1 {
break;
}
thread::sleep(Duration::from_millis(100));
}
assert_eq!(count, 2);
}
#[cfg(not(target_os = "windows"))]
#[test]
fn garbage_collect_works_as_intended() {
let directory = RAMDirectory::create();
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema).unwrap();
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
for i in 0u64..8_000u64 {
writer.add_document(doc!(field => i));
}
writer.commit().unwrap();
let mem_right_after_commit = directory.total_mem_usage();
thread::sleep(Duration::from_millis(1_000));
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 8_000);
writer.wait_merging_threads().unwrap();
let mem_right_after_merge_finished = directory.total_mem_usage();
reader.reload().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 8_000);
assert!(mem_right_after_merge_finished < mem_right_after_commit);
}
}