pub mod alphabet;
pub mod text_with_rank_support;
mod batch_computed_cursors;
mod config;
mod construction;
mod cursor;
mod lookup_table;
mod sampled_suffix_array;
mod text_id_search_tree;
use num_traits::NumCast;
#[doc(inline)]
pub use alphabet::Alphabet;
#[doc(inline)]
pub use config::FmIndexConfig;
#[doc(inline)]
pub use config::PerformancePriority;
#[doc(inline)]
pub use construction::IndexStorage;
#[doc(inline)]
pub use cursor::Cursor;
use batch_computed_cursors::BatchComputedCursors;
use construction::DataStructures;
use lookup_table::LookupTables;
use sampled_suffix_array::SampledSuffixArray;
use text_id_search_tree::TexdIdSearchTree;
use text_with_rank_support::{
Block64, Block512, CondensedTextWithRankSupport, FlatTextWithRankSupport, TextWithRankSupport,
};
#[cfg_attr(feature = "savefile", derive(savefile::savefile_derive::Savefile))]
#[savefile_doc_hidden]
#[derive(Clone)]
pub struct FmIndex<I, R = CondensedTextWithRankSupport<I, Block64>> {
alphabet: Alphabet,
count: Vec<usize>,
text_with_rank_support: R,
suffix_array: SampledSuffixArray<I>,
text_ids: TexdIdSearchTree,
lookup_tables: LookupTables<I>,
}
pub type FmIndexCondensed64<I> = FmIndex<I, CondensedTextWithRankSupport<I, Block64>>;
pub type FmIndexCondensed512<I> = FmIndex<I, CondensedTextWithRankSupport<I, Block512>>;
pub type FmIndexFlat64<I> = FmIndex<I, FlatTextWithRankSupport<I, Block64>>;
pub type FmIndexFlat512<I> = FmIndex<I, FlatTextWithRankSupport<I, Block512>>;
const BATCH_SIZE: usize = 64;
impl<I: IndexStorage, R: TextWithRankSupport<I>> FmIndex<I, R> {
fn new<T: AsRef<[u8]>>(
texts: impl IntoIterator<Item = T>,
alphabet: Alphabet,
config: FmIndexConfig<I, R>,
) -> Self {
let DataStructures {
count,
sampled_suffix_array,
text_ids,
text_with_rank_support,
} = construction::create_data_structures::<I, R, T>(texts, &config, &alphabet);
let num_searchable_dense_symbols = alphabet.num_searchable_dense_symbols();
let mut index = FmIndex {
alphabet,
count,
text_with_rank_support,
suffix_array: sampled_suffix_array,
text_ids,
lookup_tables: LookupTables::new_empty(),
};
lookup_table::fill_lookup_tables(
&mut index,
config.lookup_table_depth,
num_searchable_dense_symbols,
);
index
}
pub fn count(&self, query: &[u8]) -> usize {
self.cursor_for_query(query).count()
}
pub fn count_many<'a>(
&'a self,
queries: impl IntoIterator<Item = &'a [u8]>,
) -> impl Iterator<Item = usize> {
self.cursors_for_many_queries(queries)
.map(|cursor| cursor.count())
}
pub fn locate(&self, query: &[u8]) -> impl Iterator<Item = Hit> {
let cursor = self.cursor_for_query(query);
self.locate_interval(cursor.interval())
}
pub fn locate_many<'a>(
&'a self,
queries: impl IntoIterator<Item = &'a [u8]>,
) -> impl Iterator<Item: Iterator<Item = Hit>> {
self.cursors_for_many_queries(queries)
.map(|cursor| self.locate_interval(cursor.interval()))
}
fn locate_interval(&self, interval: HalfOpenInterval) -> impl Iterator<Item = Hit> {
self.suffix_array
.recover_range(interval.start..interval.end, self)
.map(|idx| {
let (text_id, position) = self
.text_ids
.backtransfrom_concatenated_text_index(<usize as NumCast>::from(idx).unwrap());
Hit { text_id, position }
})
}
pub fn cursor_empty<'a>(&'a self) -> Cursor<'a, I, R> {
Cursor {
index: self,
interval: HalfOpenInterval {
start: 0,
end: self.total_text_len(),
},
}
}
pub fn cursor_for_query<'a>(&'a self, query: &[u8]) -> Cursor<'a, I, R> {
let (remaining_query, query_suffix) = self.split_query_for_lookup(query);
let interval = self.lookup_tables.lookup(query_suffix, &self.alphabet);
let mut cursor = Cursor {
index: self,
interval,
};
for &symbol in remaining_query.iter().rev() {
cursor.extend_query_front(symbol);
if cursor.count() == 0 {
break;
}
}
cursor
}
pub fn cursors_for_many_queries<'a>(
&'a self,
queries: impl IntoIterator<Item = &'a [u8]>,
) -> impl Iterator<Item = Cursor<'a, I, R>> {
BatchComputedCursors::<I, R, _, BATCH_SIZE>::new(self, queries.into_iter())
}
fn cursor_for_query_without_alphabet_translation<'a>(
&'a self,
query: &[u8],
) -> Cursor<'a, I, R> {
let (remaining_query, query_suffix) = self.split_query_for_lookup(query);
let interval = self
.lookup_tables
.lookup_without_alphabet_translation(query_suffix);
let mut cursor = Cursor {
index: self,
interval,
};
for &symbol in remaining_query.iter().rev() {
cursor.extend_front_without_alphabet_translation(symbol);
if cursor.count() == 0 {
break;
}
}
cursor
}
fn lf_mapping_step(&self, symbol: u8, idx: usize) -> usize {
self.count[symbol as usize] + self.text_with_rank_support.rank(symbol, idx)
}
fn split_query_for_lookup<'a>(&self, query: &'a [u8]) -> (&'a [u8], &'a [u8]) {
let lookup_depth = std::cmp::min(query.len(), self.lookup_tables.max_depth());
let suffix_idx = query.len() - lookup_depth;
query.split_at(suffix_idx)
}
pub fn alphabet(&self) -> &Alphabet {
&self.alphabet
}
pub fn num_texts(&self) -> usize {
self.text_ids.sentinel_indices.len()
}
pub fn total_text_len(&self) -> usize {
self.text_with_rank_support.text_len()
}
#[cfg(feature = "savefile")]
const VERSION_FOR_SAVEFILE: u32 = 0;
#[cfg(feature = "savefile")]
pub fn load_from_reader(
reader: &mut impl std::io::Read,
) -> Result<Self, savefile::SavefileError> {
savefile::load(reader, Self::VERSION_FOR_SAVEFILE)
}
#[cfg(feature = "savefile")]
pub fn load_from_file(
filepath: impl AsRef<std::path::Path>,
) -> Result<Self, savefile::SavefileError> {
savefile::load_file(filepath, Self::VERSION_FOR_SAVEFILE)
}
#[cfg(feature = "savefile")]
pub fn save_to_writer(
&self,
writer: &mut impl std::io::Write,
) -> Result<(), savefile::SavefileError> {
savefile::save(writer, Self::VERSION_FOR_SAVEFILE, self)
}
#[cfg(feature = "savefile")]
pub fn save_to_file(
&self,
filepath: impl AsRef<std::path::Path>,
) -> Result<(), savefile::SavefileError> {
savefile::save_file(filepath, Self::VERSION_FOR_SAVEFILE, self)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Hit {
pub text_id: usize,
pub position: usize,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) struct HalfOpenInterval {
pub start: usize,
pub end: usize,
}
mod maybe_savefile {
#[cfg(feature = "savefile")]
pub trait MaybeSavefile: savefile::Savefile {}
#[cfg(not(feature = "savefile"))]
pub trait MaybeSavefile {}
impl MaybeSavefile for i32 {}
impl MaybeSavefile for u32 {}
impl MaybeSavefile for i64 {}
}
mod sealed {
pub trait Sealed {}
}