//! cipherstash-client 0.34.1-alpha.1
//!
//! The official CipherStash SDK.
//! Documentation: see the crate documentation on docs.rs.
use super::builder::StorageBuilder;
use super::indexer::{IndexerInit, Indexes, IndexesForQuery, QueryOp};
use super::text::{char_filter_prefix_and_suffix, TokenFilter, Tokenizer};
use super::QueryBuilder;
use super::{errors::EncryptionError, plaintext::Plaintext, IndexTerm};
use crate::zerokms::IndexKey;
use cipherstash_config::column;
use cipherstash_core::bloom_filter::{BloomFilter, BloomFilterOps};

// Constructs a `MatchIndexer` from anything convertible into
// `MatchIndexerOptions`; conversion failures surface as `EncryptionError`.
impl IndexerInit for MatchIndexer {
    type Args = MatchIndexerOptions;
    type Error = EncryptionError;

    fn try_init<A>(opts: A) -> Result<Self, Self::Error>
    where
        Self::Args: TryFrom<A, Error = Self::Error>,
    {
        // Convert the caller-supplied args first; any failure propagates via `?`.
        Ok(Self::new(MatchIndexerOptions::try_from(opts)?))
    }
}

// Indexing a stored value: encrypt the plaintext into a match index term
// and attach it to the storage builder.
impl<'k> Indexes<'k, Plaintext> for MatchIndexer {
    fn index(
        &self,
        mut builder: StorageBuilder<'k, Plaintext>,
    ) -> Result<StorageBuilder<'k, Plaintext>, EncryptionError> {
        // Encrypt first (immutable borrows of the builder), then mutate it.
        let term = self.encrypt(builder.plaintext(), builder.index_key())?;
        builder.add_index_term(term);
        Ok(builder)
    }
}

// Querying: a match index encrypts the query plaintext exactly as it does
// at index time, so the same `encrypt` path is used and the query operator
// is ignored.
impl<C> IndexesForQuery<Plaintext, C> for MatchIndexer {
    fn query_index(
        &self,
        builder: QueryBuilder<Plaintext, C>,
        _op: QueryOp,
    ) -> Result<IndexTerm, EncryptionError> {
        // Return the Result directly rather than binding and re-wrapping it
        // (clippy::let_and_return).
        self.encrypt(builder.plaintext(), builder.index_key())
    }
}

/// Configuration for a [`MatchIndexer`].
pub struct MatchIndexerOptions {
    /// How plaintext is split into tokens before being added to the filter.
    pub tokenizer: Tokenizer,
    /// Filters applied to the token stream after tokenization (e.g. downcase).
    pub token_filters: Vec<TokenFilter>,
    /// Bloom filter parameters (filter size and hash function count).
    pub filter_opts: BloomFilterOps,
}

impl Default for MatchIndexerOptions {
    fn default() -> Self {
        Self {
            tokenizer: Tokenizer::Ngram { token_length: 3 },
            token_filters: vec![TokenFilter::Downcase],
            filter_opts: Default::default(),
        }
    }
}

impl TryFrom<&column::IndexType> for MatchIndexerOptions {
    type Error = EncryptionError;

    /// Builds options from a schema-level index configuration.
    ///
    /// Only `IndexType::Match` is accepted; every other variant produces an
    /// `IndexingError`.
    fn try_from(value: &column::IndexType) -> Result<Self, Self::Error> {
        match value {
            column::IndexType::Match {
                tokenizer,
                token_filters,
                k,
                m,
                ..
            } => {
                // `m` configures the filter size and `k` the number of hash
                // functions used by the bloom filter.
                let filter_opts = BloomFilterOps::default()
                    .with_filter_size(*m as u32)
                    .with_hash_function_count(*k);

                Ok(Self {
                    tokenizer: (*tokenizer).into(),
                    token_filters: token_filters.iter().copied().map(From::from).collect(),
                    filter_opts,
                })
            }
            _ => Err(EncryptionError::IndexingError(
                "MatchIndexerOptions can only be created from a Match index configuration"
                    .to_string(),
            )),
        }
    }
}

impl Default for MatchIndexer {
    fn default() -> Self {
        Self::new(Default::default())
    }
}

/// Indexer producing encrypted "match" index terms: plaintext is tokenized,
/// token-filtered and inserted into a keyed bloom filter whose bitmap
/// becomes the index term.
pub struct MatchIndexer {
    // How plaintext is split into tokens.
    tokenizer: Tokenizer,
    // Applied to the token stream, in order, after tokenization.
    token_filters: Vec<TokenFilter>,
    // Bloom filter sizing/hashing parameters.
    filter_opts: BloomFilterOps,
}

impl MatchIndexer {
    /// Creates an indexer from the given options.
    pub fn new(opts: MatchIndexerOptions) -> Self {
        let MatchIndexerOptions {
            tokenizer,
            token_filters,
            filter_opts,
        } = opts;

        Self {
            tokenizer,
            token_filters,
            filter_opts,
        }
    }

    /// Encrypts `plaintext` into a match index term using `index_key`.
    ///
    /// Only `Plaintext::Utf8Str` is supported: `Some(_)` yields a
    /// bloom-filter bitmap, `None` yields [`IndexTerm::Null`] and any other
    /// variant is an `IndexingError`.
    pub fn encrypt(
        &self,
        plaintext: &Plaintext,
        index_key: &IndexKey,
    ) -> Result<IndexTerm, EncryptionError> {
        match plaintext {
            Plaintext::Utf8Str(Some(value)) => {
                // Strip the '%' and '_' LIKE/ILIKE operators from the ends of
                // the string so the common query shapes (%value%, %value,
                // value%) work without caller changes. Patterns with interior
                // wildcards (a%e, a_e, value_, _a%) are NOT handled, and
                // plaintexts that genuinely start/end with these characters
                // will be silently stripped, giving incomplete results.
                //
                // A proper LIKE/ILIKE implementation is tracked here:
                // https://www.notion.so/cipherstash/WIP-Driver-more-robust-LIKE-op-handling-7ccf85c873374fb68ad651816f6bd9f6?pvs=4
                let stripped = char_filter_prefix_and_suffix(value.as_str(), &['%', '_']);

                // Tokenize, then run each configured token filter in order.
                let mut terms = self.tokenizer.process(stripped);
                for token_filter in &self.token_filters {
                    terms = token_filter.process(terms);
                }

                // FIXME: Bloomfilter should be moved out of cipherstash-core and into client so that we can use the IndexKey and avoid this clone
                // Bloomfilter keys won't be zeroized at present
                // See https://linear.app/cipherstash/issue/CIP-844/wip-move-bloomfilter-out-of-cipherstash-core
                let mut bloom =
                    BloomFilter::new(*index_key.key(), self.filter_opts).map_err(|e| {
                        EncryptionError::IndexingError(format!(
                            "Bloom Filter init failed with error {e}"
                        ))
                    })?;
                bloom.add_terms(terms);

                Ok(IndexTerm::BitMap(bloom.into_vec()))
            }
            Plaintext::Utf8Str(None) => Ok(IndexTerm::Null),
            _ => Err(EncryptionError::IndexingError(format!(
                "{plaintext:?} is not supported by match indexes"
            ))),
        }
    }
}

/// Converts a schema (cipherstash-config) Tokenizer (which has no impl)
/// into the type used in the `text` module which *is* implemented 😅
/// Bridges the schema-level (cipherstash-config) tokenizer enum — which has
/// no implementation — to the implemented tokenizer in the `text` module.
impl From<column::Tokenizer> for Tokenizer {
    fn from(t: column::Tokenizer) -> Self {
        // One-to-one variant mapping; field names match on both sides.
        match t {
            column::Tokenizer::Standard => Self::Standard,
            column::Tokenizer::Ngram { token_length } => Self::Ngram { token_length },
            column::Tokenizer::EdgeNgram { min_gram, max_gram } => {
                Self::EdgeNgram { min_gram, max_gram }
            }
        }
    }
}

/// Converts a schema (cipherstash-config) TokenFilter (which has no impl)
/// into the type used in the `text` module which *is* implemented 😅
/// Converts a schema (cipherstash-config) TokenFilter (which has no impl)
/// into the type used in the `text` module which *is* implemented.
impl From<column::TokenFilter> for TokenFilter {
    fn from(value: column::TokenFilter) -> Self {
        // `Self::` on the right-hand side for consistency with the
        // `From<column::Tokenizer> for Tokenizer` impl in this file.
        match value {
            column::TokenFilter::Downcase => Self::Downcase,
            column::TokenFilter::Upcase => Self::Upcase,
            column::TokenFilter::Stemmer => Self::Stemmer,
            column::TokenFilter::Stop => Self::Stop,
        }
    }
}

#[cfg(test)]
mod tests {
    use column::{ColumnConfig, Index};
    use zerokms_protocol::cipherstash_config::operator;

    use super::*;

    /// End-to-end: a default Match index config on a column should encrypt a
    /// string plaintext into a bitmap index term.
    #[test]
    fn test_encrypt_term() -> Result<(), Box<dyn std::error::Error>> {
        let config = ColumnConfig::build("name").add_index(Index::new_match());
        let index = config
            .index_for_operator(&operator::Operator::Like)
            .unwrap();

        // A fixed all-zero key keeps the test deterministic.
        let key = IndexKey::from([0u8; 32]);
        let opts = MatchIndexerOptions::try_from(&index.index_type)?;
        let term = MatchIndexer::new(opts).encrypt(&"Dan Draper".into(), &key)?;

        assert!(matches!(term, IndexTerm::BitMap(_)));
        Ok(())
    }
}