// valentinus 1.1.2
//
// A thread-safe vector database for model inference inside LMDB.
// Documentation
use log::{debug, info};
use serde_json::Value;

/// Errors that can occur while building or evaluating a metadata filter.
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so that it can
/// be printed and propagated with `?` into `Box<dyn std::error::Error>`.
#[derive(Debug)]
pub enum Md2fsError {
    /// The input string could not be deserialized as JSON.
    SerdeJsonError,
    /// The input was valid JSON but could not be interpreted as a filter,
    /// e.g. it is not a JSON object or carries an unsupported value type.
    ParseError,
}

impl std::fmt::Display for Md2fsError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let msg = match self {
            Md2fsError::SerdeJsonError => "input is not valid JSON",
            Md2fsError::ParseError => "JSON could not be interpreted as a metadata filter",
        };
        f.write_str(msg)
    }
}

impl std::error::Error for Md2fsError {}

/// Where clause keys
///
/// Each variant names one comparison that a filter can request. The operator
/// strings listed below are the ones recognised by `FilterOperations::get_enum`
/// (with or without a leading `$`).
#[derive(Debug, PartialEq, Eq)]
enum FilterOperations {
    /// `eq` - equality.
    EqualTo,
    /// `gte` - greater than or equal to.
    GreaterThanEqualTo,
    /// `gt` - strictly greater than.
    GreaterThan,
    /// `lt` - strictly less than.
    LessThan,
    /// `lte` - less than or equal to.
    LessThanEqualTo,
    /// `in` - membership in a list of values.
    In,
    /// No operator was supplied; used for bare `{"key": value}` filters and
    /// as the default operation of a `MetadataFilter`.
    Noop,
}

impl FilterOperations {
    /// Seek and return enum for pattern matching
    fn get_enum(s: &str) -> FilterOperations {
        let op = s.strip_prefix('$').unwrap_or(s);
        match op {
            "eq" => FilterOperations::EqualTo,
            "gt" => FilterOperations::GreaterThan,
            "gte" => FilterOperations::GreaterThanEqualTo,
            "lt" => FilterOperations::LessThan,
            "lte" => FilterOperations::LessThanEqualTo,
            "in" => FilterOperations::In,
            _ => FilterOperations::EqualTo,
        }
    }
}

/// A parsed filter, tagged by the type of value it compares against.
#[derive(Debug)]
enum MetadataFilterResult {
    /// Filter whose value is an unsigned 64-bit integer.
    U64Filter(MetadataFilter<u64>),
    /// Filter whose value is a single string.
    StringFilter(MetadataFilter<String>),
    /// Filter whose value is a list of strings (built for the `in` operator).
    StringVecFilter(MetadataFilter<Vec<String>>),
}

/// Metadata filter
///
/// Pairs a metadata key with the value to compare against and the
/// comparison to apply. `T` is the type of the comparison value.
#[derive(Debug)]
pub struct MetadataFilter<T> {
    /// Key to filter on
    key: String,
    /// Valid json type to filter on
    value: T,
    /// Filter operation to apply: eq, gt, gte, in, lt, lte, or no operation
    filter: FilterOperations,
}

impl<T: Default> Default for MetadataFilter<T> {
    /// Build a filter with an empty key, `T`'s default value and no operation.
    fn default() -> Self {
        let key = String::new();
        let value = T::default();
        Self {
            key,
            value,
            filter: FilterOperations::Noop,
        }
    }
}


/// Construction and comparison operations for a metadata filter over values
/// of type `T`.
///
/// Every comparison takes both operands by value, so a call consumes both
/// `self` and `m`.
///
/// NOTE(review): the implementation in this file requires the keys of `self`
/// and `m` to be equal and, for the ordering methods, uses `m.value` as the
/// left-hand operand (e.g. `gt` is `m.value > self.value`).
pub trait Filter<T> {
    /// Build a typed filter from the raw string `raw`.
    fn create_filter(raw: &str) -> Result<MetadataFilterResult, Md2fsError>;
    /// Equality comparison between `self` and `m`.
    fn eq(self, m: MetadataFilter<T>) -> bool;
    /// Greater-than comparison between `self` and `m`.
    fn gt(self, m: MetadataFilter<T>) -> bool;
    /// Greater-than-or-equal comparison between `self` and `m`.
    fn gte(self, m: MetadataFilter<T>) -> bool;
    /// Less-than comparison between `self` and `m`.
    fn lt(self, m: MetadataFilter<T>) -> bool;
    /// Less-than-or-equal comparison between `self` and `m`.
    fn lte(self, m: MetadataFilter<T>) -> bool;
}

impl<T> Filter<T> for MetadataFilter<T>
where
    T: PartialEq + PartialOrd + Default,
{
    /// Create a filter on a valid string value
    fn create_filter(raw: &str) -> Result<MetadataFilterResult, Md2fsError> {
        let v: Result<Value, serde_json::Error> = serde_json::from_str(raw);
        if v.is_err() {
            debug!("invalid json string");
            return Err(Md2fsError::SerdeJsonError);
        }
        let u_v: Value = v.map_err(|_| Md2fsError::ParseError)?;
        let vo = u_v.as_object();
        if vo.is_none() {
            debug!("could not parse string");
            return Err(Md2fsError::ParseError);
        }
        let key = match vo {
            Some(v) => v.keys().next().unwrap_or(&String::new()).to_string(),
            _ => String::new(),
        };
        let vo2 = match vo {
            Some(v) => v[&key].as_object(),
            _ => None,
        };
        if vo2.is_none() {
            info!("no op key found, processing as metadata");
            let p_value = &u_v[&key];
            if p_value.is_string() {
                let value: String = p_value.as_str().unwrap_or_default().to_string();
                return Ok(MetadataFilterResult::StringFilter(MetadataFilter {
                    key,
                    filter: FilterOperations::Noop,
                    value,
                }));
            } else {
                let value: u64 = p_value.as_u64().unwrap_or_default();
                return Ok(MetadataFilterResult::U64Filter(MetadataFilter {
                    key,
                    filter: FilterOperations::Noop,
                    value,
                }));
            }
        }
        let op = match vo2 {
            Some(v) => v.keys().next().unwrap_or(&String::new()).to_string(),
            _ => String::new(),
        };
        let value = match vo2 {
            Some(v) => &v[&op],
            _ => &Value::Null,
        };
        let filter: FilterOperations = FilterOperations::get_enum(&op);

        if filter == FilterOperations::In {
            if let Some(arr) = value.as_array() {
                let str_vec: Vec<String> = arr
                    .iter()
                    .filter_map(|v| v.as_str().map(String::from))
                    .collect();
                return Ok(MetadataFilterResult::StringVecFilter(MetadataFilter {
                    key,
                    filter,
                    value: str_vec,
                }));
            }
        }

        if value.is_string() {
            let value = value.as_str().unwrap_or_default().to_string();
            return Ok(MetadataFilterResult::StringFilter(MetadataFilter {
                key,
                filter,
                value,
            }));
        }
        if value.is_number() {
            let value = value.as_u64().unwrap_or_default();
            return Ok(MetadataFilterResult::U64Filter(MetadataFilter {
                key,
                filter,
                value,
            }));
        }
        Err(Md2fsError::ParseError)
    }
    fn eq(self, m: MetadataFilter<T>) -> bool {
        self.key == m.key && self.value == m.value
    }
    fn gt(self, m: MetadataFilter<T>) -> bool {
        self.key == m.key && m.value > self.value
    }
    fn gte(self, m: MetadataFilter<T>) -> bool {
        self.key == m.key && m.value >= self.value
    }
    fn lt(self, m: MetadataFilter<T>) -> bool {
        self.key == m.key && m.value < self.value
    }
    fn lte(self, m: MetadataFilter<T>) -> bool {
        self.key == m.key && m.value <= self.value
    }
}

fn process_filter(raw_f: &str, raw_m: &str) -> Result<bool, Md2fsError> {
    // 1. Parse the filter JSON to get key, op, and filter value.
    let filter_result = MetadataFilter::<String>::create_filter(raw_f)?;

    // 2. Parse the metadata JSON into a generic Value object.
    let meta_json: Value = serde_json::from_str(raw_m).map_err(|_| Md2fsError::SerdeJsonError)?;
    let meta_obj = match meta_json.as_object() {
        Some(obj) => obj,
        None => return Ok(false), // Metadata is not a valid JSON object.
    };

    // 3. Match on the filter type and perform the check.
    match filter_result {
        MetadataFilterResult::StringVecFilter(f_vec) => {
            if let Some(meta_val) = meta_obj.get(&f_vec.key) {
                if let Some(m_str) = meta_val.as_str() {
                    if f_vec.filter == FilterOperations::In {
                        return Ok(f_vec.value.contains(&m_str.to_string()));
                    }
                }
            }
            Ok(false)
        }
        MetadataFilterResult::StringFilter(f_str) => {
            if let Some(meta_val) = meta_obj.get(&f_str.key) {
                if let Some(m_str) = meta_val.as_str() {
                    if f_str.filter == FilterOperations::EqualTo || f_str.filter == FilterOperations::Noop {
                        return Ok(f_str.value == m_str);
                    }
                }
            }
            Ok(false)
        }
        MetadataFilterResult::U64Filter(f_u64) => {
            if let Some(meta_val) = meta_obj.get(&f_u64.key) {
                if let Some(m_u64) = meta_val.as_u64() {
                    return Ok(match f_u64.filter {
                        FilterOperations::EqualTo | FilterOperations::Noop => m_u64 == f_u64.value,
                        FilterOperations::GreaterThan => m_u64 > f_u64.value,
                        FilterOperations::GreaterThanEqualTo => m_u64 >= f_u64.value,
                        FilterOperations::LessThan => m_u64 < f_u64.value,
                        FilterOperations::LessThanEqualTo => m_u64 <= f_u64.value,
                        _ => false,
                    });
                }
            }
            Ok(false)
        }
    }
}

/// Check a set of raw JSON filters against a set of raw JSON metadata parts.
///
/// Each entry of `raw_f` is a metadata filter and each entry of `raw_m` is a
/// metadata document that is not a nested object. Returns `Ok(true)` when
/// every filter is satisfied by at least one metadata part, or when `raw_f`
/// is empty. The equivalent of an SQL `where` clause.
///
/// # Errors
///
/// Returns the first error raised while evaluating a filter against a
/// metadata part.
pub fn filter_where(raw_f: &[String], raw_m: &[String]) -> Result<bool, Md2fsError> {
    'filters: for wanted in raw_f {
        for candidate in raw_m {
            if process_filter(wanted, candidate)? {
                // This filter is satisfied; go on to the next one.
                continue 'filters;
            }
        }
        // No metadata part satisfied this filter, so the document fails.
        return Ok(false);
    }

    // Every filter found a match (or there were no filters at all).
    Ok(true)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Evaluate one filter against one metadata document, panicking on error.
    fn evaluate(filter: &str, meta: &str) -> bool {
        process_filter(filter, meta).unwrap()
    }

    /// Turn string literals into the owned strings `filter_where` expects.
    fn owned(parts: &[&str]) -> Vec<String> {
        parts.iter().map(|part| part.to_string()).collect()
    }

    #[test]
    fn test_gte_pass() {
        assert!(evaluate(r#"{"Rating": {"$gte": 4}}"#, r#"{"Rating": 5}"#));
    }

    #[test]
    fn test_gte_fail() {
        assert!(!evaluate(r#"{"Rating": {"$gte": 4}}"#, r#"{"Rating": 3}"#));
    }

    #[test]
    fn test_gte_equal_pass() {
        assert!(evaluate(r#"{"Rating": {"$gte": 4}}"#, r#"{"Rating": 4}"#));
    }

    #[test]
    fn test_lte_pass() {
        assert!(evaluate(r#"{"Rating": {"$lte": 4}}"#, r#"{"Rating": 3}"#));
    }

    #[test]
    fn test_in_pass() {
        assert!(evaluate(
            r#"{"genre": {"$in": ["music", "history"]}}"#,
            r#"{"genre": "history"}"#
        ));
    }

    #[test]
    fn test_in_fail() {
        assert!(!evaluate(
            r#"{"genre": {"$in": ["music", "history"]}}"#,
            r#"{"genre": "sci-fi"}"#
        ));
    }

    #[test]
    fn test_key_mismatch() {
        assert!(!evaluate(r#"{"Rating": {"$gte": 4}}"#, r#"{"Score": 5}"#));
    }

    #[test]
    fn test_type_mismatch() {
        assert!(!evaluate(
            r#"{"Rating": {"$gte": 4}}"#,
            r#"{"Rating": "good"}"#
        ));
    }

    #[test]
    fn test_filter_where_pass() {
        let filters = owned(&[
            r#"{"Rating": {"$gte": 4}}"#,
            r#"{"year": {"$eq": 2020}}"#,
        ]);
        let metadata = owned(&[r#"{"Rating": 5, "year": 2020}"#]);
        assert!(filter_where(&filters, &metadata).unwrap());
    }

    #[test]
    fn test_filter_where_fail() {
        let filters = owned(&[
            r#"{"Rating": {"$gte": 4}}"#,
            r#"{"year": {"$eq": 2020}}"#,
        ]);
        let metadata = owned(&[r#"{"Rating": 3, "year": 2020}"#]);
        assert!(!filter_where(&filters, &metadata).unwrap());
    }

    #[test]
    fn test_filter_where_no_filters() {
        let filters: Vec<String> = Vec::new();
        let metadata = owned(&[r#"{"Rating": 5}"#]);
        assert!(filter_where(&filters, &metadata).unwrap());
    }

    #[test]
    fn test_filter_where_no_matching_meta() {
        let filters = owned(&[r#"{"genre": {"$in": ["sci-fi"]}}"#]);
        let metadata = owned(&[r#"{"genre": "history"}"#, r#"{"genre": "music"}"#]);
        assert!(!filter_where(&filters, &metadata).unwrap());
    }
}