// lucisearch 0.8.0
//
// Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
// Documentation
use std::fmt;

use crate::core::LuciError;

use crate::mapping::quantization::QuantizationType;

/// The data type of a field in a Luci index.
///
/// The core (M0) types cover text, keyword, numeric, boolean, and date.
/// The enum also carries dense-vector, geospatial, nested, token-count,
/// and IP address types.
///
/// Each variant maps to exactly one ES-compatible type name; see
/// [`FieldType::es_name`] and [`FieldType::from_es_name`].
///
/// See [[architecture-api-surface#Schema Definition]].
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum FieldType {
    /// Full-text field, analyzed into tokens for search. Supports `match`,
    /// `match_phrase`, and `query_string` queries.
    Text,
    /// Exact-value string field. Not analyzed. Supports `term`, `terms`,
    /// `prefix`, `wildcard`, `regexp`, and `exists` queries. Stored as
    /// dictionary-encoded columnar data for fast aggregations and sorting.
    Keyword,
    /// 32-bit signed integer.
    Integer,
    /// 64-bit signed integer.
    Long,
    /// 32-bit IEEE 754 float.
    Float,
    /// 64-bit IEEE 754 double.
    Double,
    /// Boolean value (`true` / `false`).
    Boolean,
    /// Date/time value. Stored internally as epoch milliseconds (i64).
    /// Accepts ISO 8601 strings and epoch millis on input.
    Date,
    /// Dense vector for kNN search. Fixed dimensionality and quantization
    /// scheme. Both must be set explicitly when constructing the variant
    /// directly; [`Self::dense_vector`] builds one with [`QuantizationType::DEFAULT`].
    DenseVector {
        /// Fixed dimensionality of the vectors stored in this field.
        dims: usize,
        /// Quantization scheme configured for this field.
        quantization: QuantizationType,
    },
    /// Geographic point (latitude, longitude).
    GeoPoint,
    /// Nested object array — each object indexed as a hidden document.
    Nested,
    /// Geographic shape (Polygon, LineString, etc.) for spatial relation queries.
    /// Indexed in a packed R-tree for efficient candidate selection.
    ///
    /// See [[feature-geo-shape]] and [[geospatial]].
    GeoShape,
    /// Token count: accepts a string, analyzes it, stores the number of tokens
    /// as an integer. Supports range queries and numeric aggregations.
    /// In ES this is typically a multi-field; in Luci it's a standalone field.
    TokenCount,
    /// IP address (IPv4 and IPv6). Stored as keyword for term queries and
    /// as numeric for range queries. Supports CIDR notation in term queries.
    Ip,
}

impl FieldType {
    /// Resolve an ES-compatible mapping type name (for example `"keyword"`)
    /// into the matching [`FieldType`].
    ///
    /// Matching is exact: the name must be spelled precisely as
    /// [`Self::es_name`] would produce it.
    ///
    /// # Errors
    ///
    /// Returns `LuciError::InvalidQuery` when `name` is not one of the
    /// recognized type names.
    pub fn from_es_name(name: &str) -> crate::core::Result<Self> {
        let field_type = match name {
            // Scalar and text types.
            "text" => Self::Text,
            "keyword" => Self::Keyword,
            "integer" => Self::Integer,
            "long" => Self::Long,
            "float" => Self::Float,
            "double" => Self::Double,
            "boolean" => Self::Boolean,
            "date" => Self::Date,
            // Placeholder only: zero dims and the default quantization.
            // The JSON parser fills in the real values from sibling
            // fields after this returns, before the value reaches user
            // code.
            "dense_vector" => Self::dense_vector(0),
            // Structured and specialised types.
            "geo_point" => Self::GeoPoint,
            "nested" => Self::Nested,
            "geo_shape" => Self::GeoShape,
            "token_count" => Self::TokenCount,
            "ip" => Self::Ip,
            _ => {
                return Err(LuciError::InvalidQuery(format!(
                    "unsupported field type: {name}"
                )));
            }
        };
        Ok(field_type)
    }

    /// The type name this variant carries in ES-compatible JSON mappings.
    ///
    /// A dense vector always reports `"dense_vector"`, whatever its
    /// dimensionality or quantization.
    pub fn es_name(&self) -> &'static str {
        match *self {
            Self::Text => "text",
            Self::Keyword => "keyword",
            Self::Integer => "integer",
            Self::Long => "long",
            Self::Float => "float",
            Self::Double => "double",
            Self::Boolean => "boolean",
            Self::Date => "date",
            Self::DenseVector { .. } => "dense_vector",
            Self::GeoPoint => "geo_point",
            Self::Nested => "nested",
            Self::GeoShape => "geo_shape",
            Self::TokenCount => "token_count",
            Self::Ip => "ip",
        }
    }

    /// Whether this type is numeric (supports range queries, numeric sort,
    /// and metric aggregations).
    ///
    /// `TokenCount` counts as numeric; `Date` and `Ip` do not.
    pub fn is_numeric(&self) -> bool {
        // Spelled out variant by variant so that adding a new variant forces
        // a decision here instead of silently falling into a catch-all.
        match self {
            Self::Integer | Self::Long | Self::Float | Self::Double | Self::TokenCount => true,
            Self::Text
            | Self::Keyword
            | Self::Boolean
            | Self::Date
            | Self::DenseVector { .. }
            | Self::GeoPoint
            | Self::Nested
            | Self::GeoShape
            | Self::Ip => false,
        }
    }

    /// Whether this type is a dense vector.
    pub fn is_dense_vector(&self) -> bool {
        // Only the `DenseVector` variant reports dimensions.
        self.vector_dims().is_some()
    }

    /// Dimensionality of a dense vector field, or `None` for every other type.
    pub fn vector_dims(&self) -> Option<usize> {
        if let Self::DenseVector { dims, .. } = self {
            Some(*dims)
        } else {
            None
        }
    }

    /// Quantization scheme of a dense vector field, or `None` for every
    /// other type.
    pub fn vector_quantization(&self) -> Option<QuantizationType> {
        if let Self::DenseVector { quantization, .. } = self {
            Some(*quantization)
        } else {
            None
        }
    }

    /// Build a `DenseVector` field type of the given dimensionality using
    /// [`QuantizationType::DEFAULT`]. Handy for tests and for callers that
    /// have no reason to pick a different quantization.
    pub fn dense_vector(dims: usize) -> Self {
        let quantization = QuantizationType::DEFAULT;
        Self::DenseVector { dims, quantization }
    }
}

/// Displays a field type as its ES-compatible mapping name.
impl fmt::Display for FieldType {
    /// Writes the same string that `es_name` returns.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let type_name = self.es_name();
        write!(f, "{}", type_name)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// One value per variant. The dense vector uses zero dimensions and the
    /// default quantization, which is exactly the placeholder that
    /// `from_es_name("dense_vector")` produces, so it round-trips too.
    fn all_types() -> [FieldType; 14] {
        [
            FieldType::Text,
            FieldType::Keyword,
            FieldType::Integer,
            FieldType::Long,
            FieldType::Float,
            FieldType::Double,
            FieldType::Boolean,
            FieldType::Date,
            FieldType::dense_vector(0),
            FieldType::GeoPoint,
            FieldType::Nested,
            FieldType::GeoShape,
            FieldType::TokenCount,
            FieldType::Ip,
        ]
    }

    #[test]
    fn round_trip_es_names() {
        for ft in &all_types() {
            let name = ft.es_name();
            let parsed = FieldType::from_es_name(name).unwrap();
            assert_eq!(&parsed, ft, "round trip failed for {name}");
        }
    }

    #[test]
    fn unknown_type_is_error() {
        assert!(FieldType::from_es_name("percolator").is_err());
        assert!(FieldType::from_es_name("").is_err());
        // Name matching is exact, so a differently-cased name is rejected.
        assert!(FieldType::from_es_name("Text").is_err());
    }

    #[test]
    fn is_numeric() {
        assert!(FieldType::Integer.is_numeric());
        assert!(FieldType::Long.is_numeric());
        assert!(FieldType::Float.is_numeric());
        assert!(FieldType::Double.is_numeric());
        // Token counts are stored as integers and are treated as numeric.
        assert!(FieldType::TokenCount.is_numeric());

        assert!(!FieldType::Text.is_numeric());
        assert!(!FieldType::Keyword.is_numeric());
        assert!(!FieldType::Boolean.is_numeric());
        assert!(!FieldType::Date.is_numeric());
        assert!(!FieldType::dense_vector(3).is_numeric());
        assert!(!FieldType::GeoPoint.is_numeric());
        assert!(!FieldType::Nested.is_numeric());
        assert!(!FieldType::GeoShape.is_numeric());
        assert!(!FieldType::Ip.is_numeric());
    }

    #[test]
    fn dense_vector_accessors() {
        let ft = FieldType::dense_vector(384);
        assert!(ft.is_dense_vector());
        assert_eq!(ft.vector_dims(), Some(384));
        assert_eq!(ft.vector_quantization(), Some(QuantizationType::DEFAULT));
        assert_eq!(ft.es_name(), "dense_vector");
    }

    #[test]
    fn dense_vector_from_es_name_is_placeholder() {
        // Parsing the bare type name yields zero dims and the default
        // quantization.
        let parsed = FieldType::from_es_name("dense_vector").unwrap();
        assert_eq!(parsed.vector_dims(), Some(0));
        assert_eq!(
            parsed.vector_quantization(),
            Some(QuantizationType::DEFAULT)
        );
    }

    #[test]
    fn non_vector_types_have_no_vector_settings() {
        for ft in all_types().iter().filter(|ft| !ft.is_dense_vector()) {
            assert_eq!(ft.vector_dims(), None, "{ft} reported dims");
            assert_eq!(ft.vector_quantization(), None, "{ft} reported quantization");
        }
    }

    #[test]
    fn display() {
        assert_eq!(format!("{}", FieldType::Text), "text");
        assert_eq!(format!("{}", FieldType::Long), "long");
        // Display must agree with the ES name for every variant.
        for ft in &all_types() {
            assert_eq!(ft.to_string(), ft.es_name());
        }
    }
}