use super::builder::StorageBuilder;
use super::indexer::{IndexerInit, Indexes, IndexesForQuery, QueryOp};
use super::text::{char_filter_prefix_and_suffix, TokenFilter, Tokenizer};
use super::QueryBuilder;
use super::{errors::EncryptionError, plaintext::Plaintext, IndexTerm};
use crate::zerokms::IndexKey;
use cipherstash_config::column;
use cipherstash_core::bloom_filter::{BloomFilter, BloomFilterOps};
/// Allows a [`MatchIndexer`] to be initialised from anything that can be
/// converted into [`MatchIndexerOptions`].
impl IndexerInit for MatchIndexer {
    type Args = MatchIndexerOptions;
    type Error = EncryptionError;

    /// Converts `opts` into [`MatchIndexerOptions`] and builds the indexer.
    ///
    /// # Errors
    ///
    /// Returns the conversion error if `opts` cannot be turned into
    /// [`MatchIndexerOptions`].
    fn try_init<A>(opts: A) -> Result<Self, Self::Error>
    where
        Self::Args: TryFrom<A, Error = Self::Error>,
    {
        <Self::Args as TryFrom<A>>::try_from(opts).map(Self::new)
    }
}
/// Storage-side indexing: adds the match index term to the builder.
impl<'k> Indexes<'k, Plaintext> for MatchIndexer {
    /// Encrypts the builder's plaintext with the builder's index key and
    /// records the resulting term on the builder, which is then returned.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`MatchIndexer::encrypt`]; in that
    /// case the builder is dropped without a term being added.
    fn index(
        &self,
        mut builder: StorageBuilder<'k, Plaintext>,
    ) -> Result<StorageBuilder<'k, Plaintext>, EncryptionError> {
        let encrypted = self.encrypt(builder.plaintext(), builder.index_key());
        encrypted.map(move |term| {
            builder.add_index_term(term);
            builder
        })
    }
}
/// Query-side indexing: produces the term used to search a match index.
impl<C> IndexesForQuery<Plaintext, C> for MatchIndexer {
    /// Builds the query term by encrypting the builder's plaintext with the
    /// builder's index key.
    ///
    /// The query operator is ignored: the same term is produced regardless
    /// of `_op`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`MatchIndexer::encrypt`].
    fn query_index(
        &self,
        builder: QueryBuilder<Plaintext, C>,
        _op: QueryOp,
    ) -> Result<IndexTerm, EncryptionError> {
        self.encrypt(builder.plaintext(), builder.index_key())
    }
}
/// Options used to construct a [`MatchIndexer`].
pub struct MatchIndexerOptions {
    /// Tokenizer that turns the input text into tokens.
    pub tokenizer: Tokenizer,
    /// Token filters applied, in order, to the tokenizer output.
    pub token_filters: Vec<TokenFilter>,
    /// Bloom filter parameters used when building the index term.
    pub filter_opts: BloomFilterOps,
}
impl Default for MatchIndexerOptions {
fn default() -> Self {
Self {
tokenizer: Tokenizer::Ngram { token_length: 3 },
token_filters: vec![TokenFilter::Downcase],
filter_opts: Default::default(),
}
}
}
/// Builds [`MatchIndexerOptions`] from a column's `Match` index configuration.
impl TryFrom<&column::IndexType> for MatchIndexerOptions {
    type Error = EncryptionError;

    /// Converts a `Match` index configuration into indexer options.
    ///
    /// The tokenizer and token filters are converted to their local
    /// counterparts, while `m` and `k` become the bloom filter size and hash
    /// function count respectively.
    ///
    /// # Errors
    ///
    /// Returns [`EncryptionError::IndexingError`] when `value` is not a
    /// `Match` index, or when the configured filter size `m` cannot be
    /// represented as a `u32`.
    fn try_from(value: &column::IndexType) -> Result<Self, Self::Error> {
        match value {
            column::IndexType::Match {
                tokenizer,
                token_filters,
                k,
                m,
                ..
            } => {
                // A plain `as u32` cast would silently wrap an oversized
                // value into one that might look valid, so convert with a
                // range check and reject anything that does not fit.
                let filter_size = u32::try_from(*m).map_err(|_| {
                    EncryptionError::IndexingError(format!(
                        "Bloom filter size {m} does not fit in 32 bits"
                    ))
                })?;

                Ok(Self {
                    tokenizer: Tokenizer::from(*tokenizer),
                    token_filters: token_filters.iter().copied().map(From::from).collect(),
                    filter_opts: BloomFilterOps::default()
                        .with_filter_size(filter_size)
                        .with_hash_function_count(*k),
                })
            }
            _ => Err(EncryptionError::IndexingError(
                "MatchIndexerOptions can only be created from a Match index configuration"
                    .to_string(),
            )),
        }
    }
}
impl Default for MatchIndexer {
fn default() -> Self {
Self::new(Default::default())
}
}
/// Indexer that turns UTF-8 text into a bloom-filter based index term.
pub struct MatchIndexer {
    /// Tokenizer that turns the input text into tokens.
    tokenizer: Tokenizer,
    /// Token filters applied, in order, to the tokenizer output.
    token_filters: Vec<TokenFilter>,
    /// Parameters passed to `BloomFilter::new` for every encrypted value.
    filter_opts: BloomFilterOps,
}
impl MatchIndexer {
pub fn new(
MatchIndexerOptions {
tokenizer,
token_filters,
filter_opts,
}: MatchIndexerOptions,
) -> Self {
Self {
tokenizer,
token_filters,
filter_opts,
}
}
pub fn encrypt(
&self,
plaintext: &Plaintext,
index_key: &IndexKey,
) -> Result<IndexTerm, EncryptionError> {
match plaintext {
Plaintext::Utf8Str(Some(value)) => {
let filtered_output = char_filter_prefix_and_suffix(value.as_str(), &['%', '_']);
let tokens = self.tokenizer.process(filtered_output);
let terms = self
.token_filters
.iter()
.fold(tokens, |tokens, filter| filter.process(tokens));
let mut filter =
BloomFilter::new(*index_key.key(), self.filter_opts).map_err(|e| {
EncryptionError::IndexingError(format!(
"Bloom Filter init failed with error {e}"
))
})?;
filter.add_terms(terms);
Ok(IndexTerm::BitMap(filter.into_vec()))
}
Plaintext::Utf8Str(None) => Ok(IndexTerm::Null),
_ => Err(EncryptionError::IndexingError(format!(
"{plaintext:?} is not supported by match indexes"
))),
}
}
}
/// Maps the configuration-level tokenizer onto the indexer's tokenizer.
impl From<column::Tokenizer> for Tokenizer {
    /// Each configuration variant maps to the variant of the same name, with
    /// its parameters carried over unchanged.
    fn from(value: column::Tokenizer) -> Self {
        match value {
            column::Tokenizer::Standard => Self::Standard,
            column::Tokenizer::Ngram { token_length: len } => Self::Ngram { token_length: len },
            column::Tokenizer::EdgeNgram {
                min_gram: lo,
                max_gram: hi,
            } => Self::EdgeNgram {
                min_gram: lo,
                max_gram: hi,
            },
        }
    }
}
/// Maps the configuration-level token filter onto the indexer's token filter.
impl From<column::TokenFilter> for TokenFilter {
    /// Each configuration variant maps to the variant of the same name.
    fn from(value: column::TokenFilter) -> Self {
        match value {
            column::TokenFilter::Downcase => Self::Downcase,
            column::TokenFilter::Upcase => Self::Upcase,
            column::TokenFilter::Stemmer => Self::Stemmer,
            column::TokenFilter::Stop => Self::Stop,
        }
    }
}
#[cfg(test)]
mod tests {
    use column::{ColumnConfig, Index};
    use zerokms_protocol::cipherstash_config::operator;

    use super::*;

    /// Encrypting a plain string with options derived from a match index
    /// configuration should yield a bitmap index term.
    #[test]
    fn test_encrypt_term() -> Result<(), Box<dyn std::error::Error>> {
        let column_config = ColumnConfig::build("name").add_index(Index::new_match());
        let match_index = column_config
            .index_for_operator(&operator::Operator::Like)
            .unwrap();

        let opts = MatchIndexerOptions::try_from(&match_index.index_type)?;
        let key = IndexKey::from([0u8; 32]);
        let plaintext = Plaintext::from("Dan Draper");

        let term = MatchIndexer::new(opts).encrypt(&plaintext, &key)?;

        assert!(matches!(term, IndexTerm::BitMap(_)));
        Ok(())
    }
}