fm-index 0.1.2 - Docs.rs

//! This crate provides implementations of *FM-Index* and its variants.
//!
//! FM-Index, originally proposed by Paolo Ferragina and Giovanni Manzini [1],
//! is a compressed full-text index which supports the following queries:
//!
//! - `count`: Given a pattern string, counts the number of its occurrences.
//! - `locate`: Given a pattern string, lists the all position of its occurrences.
//! - `extract`: Given an integer, gets the character of the text at that position.
//!
//! `fm-index` crate does not support the third query (extracting a character from arbitrary position) now.
//! Instead, it provides backward/forward iterators that return the text characters starting from a search result.
//!
//! # Usage
//!
//! Add this to your `Cargo.toml`.
//!
//! ```toml
//! [dependencies]
//! fm-index = "0.1"
//! ```
//!
//! # Example
//! ```
//! use fm_index::converter::RangeConverter;
//! use fm_index::suffix_array::SuffixOrderSampler;
//! use fm_index::{BackwardSearchIndex, FMIndex};
//!
//! // Prepare a text string to search for patterns.
//! let text = concat!(
//!     "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
//!     "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",
//!     "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.",
//!     "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.",
//! ).as_bytes().to_vec();
//!
//! // Converter converts each character into packed representation.
//! // `' '` ~ `'~'` represents a range of ASCII printable characters.
//! let converter = RangeConverter::new(b' ', b'~');
//!
//! // To perform locate queries, we need to retain suffix array generated in the construction phase.
//! // However, we don't need the whole array since we can interpolate missing elements in a suffix array from others.
//! // A sampler will _sieve_ a suffix array for this purpose.
//! // You can also use `NullSampler` if you don't perform location queries (disabled in type-level).
//! let sampler = SuffixOrderSampler::new().level(2);
//! let index = FMIndex::new(text, converter, sampler);
//!
//! // Search for a pattern string.
//! let pattern = "dolor";
//! let search = index.search_backward(pattern);
//!
//! // Count the number of occurrences.
//! let n = search.count();
//! assert_eq!(n, 4);
//!
//! // List the position of all occurrences.
//! let positions = search.locate();
//! assert_eq!(positions, vec![246, 12, 300, 103]);
//!
//! // Extract preceding characters from a search position.
//! let i = 0;
//! let mut prefix = search.iter_backward(i).take(16).collect::<Vec<u8>>();
//! prefix.reverse();
//! assert_eq!(prefix, b"Duis aute irure ".to_owned());
//!
//! // Extract succeeding characters from a search position.
//! let i = 3;
//! let postfix = search.iter_forward(i).take(20).collect::<Vec<u8>>();
//! assert_eq!(postfix, b"dolore magna aliqua.".to_owned());
//!
//! // Search can be chained backward.
//! let search_chained = search.search_backward("et ");
//! assert_eq!(search_chained.count(), 1);
//! ```
//!
//! # Implementations
//!
//! ## FM-Index
//!
//! Implementation is based on [1]. The index is constructed with a suffix array generated by SA-IS [3] in _O(n)_ time, where _n_ is the size of a text string.
//!
//! Basically it consists of
//!
//! - a Burrows-Wheeler transform (BWT) of the text string represented with _wavelet matrix_ [4]
//! - an array of size _O(σ)_ (_σ_: number of characters) which stores the number of characters smaller than a given character
//! - a (sampled) suffix array
//!
//! ## Run-Length FM-Index
//!
//! Based on [2]. The index is constructed with a suffix array generated by SA-IS [3].
//!
//! It consists of
//!
//! - a wavelet matrix that stores the run heads of BWT of the text string
//! - a succinct bit vector which stores the run lengths of BWT of the text string
//! - a succinct bit vector which stores the run lengths of BWT of the text string
//! sorted in alphabetical order of corresponding run heads
//! - an array of size _O(σ)_ (_σ_: number of characters)
//! which stores the number of characters smaller than a given character in run heads
//!
//! # Reference
//!
//! [1] Ferragina, P., & Manzini, G. (2000). Opportunistic data structures with applications. Annual Symposium on Foundations of Computer Science - Proceedings, 390–398. https://doi.org/10.1109/sfcs.2000.892127
//!
//! [2] Mäkinen, V., & Navarro, G. (2005). Succinct suffix arrays based on run-length encoding. In Lecture Notes in Computer Science (Vol. 3537). https://doi.org/10.1007/11496656_5
//!
//! [3] Ge Nong, Sen Zhang, & Wai Hong Chan. (2010). Two Efficient Algorithms for Linear Time Suffix Array Construction. IEEE Transactions on Computers, 60(10), 1471–1484. https://doi.org/10.1109/tc.2010.188
//!
//! [4] Claude F., Navarro G. (2012). The Wavelet Matrix. In: Calderón-Benavides L., González-Caro C., Chávez E., Ziviani N. (eds) String Processing and Information Retrieval. SPIRE 2012. https://doi.org/10.1007/978-3-642-34109-0_18
//!
//! # License
//!
//! MIT
#![allow(clippy::len_without_is_empty)]

pub mod converter;
pub mod suffix_array;

mod character;
mod fm_index;
mod iter;
mod rlfmi;
mod sais;
mod search;
mod util;
mod wavelet_matrix;

pub use crate::fm_index::FMIndex;
pub use crate::rlfmi::RLFMIndex;

pub use iter::{BackwardIterableIndex, ForwardIterableIndex};
pub use search::BackwardSearchIndex;