// lucisearch 0.8.0
//
// Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
// Documentation
use serde_json::Value;

use crate::mapping::field_type::FieldType;

/// Map a JSON scalar to the [`FieldType`] that Elasticsearch-style dynamic
/// mapping would assign to it.
///
/// | JSON Value | Inferred Type |
/// |-----------|---------------|
/// | Boolean | `boolean` |
/// | Integer (fits i64) | `long` |
/// | Any other number | `double` |
/// | ISO 8601 string | `date` |
/// | Other string | `text` |
///
/// # Returns
///
/// `Some(field_type)` for booleans, numbers, and strings. `None` for null,
/// arrays, and objects: those cannot be typed from a single value and are
/// left for the caller to resolve (objects may become nested, arrays of
/// floats may become dense_vector, etc.).
///
/// See [[architecture-api-surface#Dynamic Mapping]].
pub fn detect_field_type(value: &Value) -> Option<FieldType> {
    let inferred = match value {
        // Not a scalar (or no value at all): nothing to infer here.
        Value::Null | Value::Array(_) | Value::Object(_) => return None,
        Value::Bool(_) => FieldType::Boolean,
        // Numbers that are not representable as i64 fall through to `Double`.
        Value::Number(num) if num.is_i64() => FieldType::Long,
        Value::Number(_) => FieldType::Double,
        // Date-shaped strings are promoted to `Date`; everything else is text.
        Value::String(text) if is_iso8601(text) => FieldType::Date,
        Value::String(_) => FieldType::Text,
    };
    Some(inferred)
}

/// Best-effort ISO 8601 date detection.
///
/// Checks common patterns without pulling in a full datetime parser. The
/// whole string must match one of:
/// - `YYYY-MM-DD`
/// - `YYYY-MM-DD` + `T` or a space + `HH:MM:SS`, optionally followed by
///   fractional seconds (`.` or `,` and at least one digit) and then an
///   optional timezone (`Z`, `±HH`, `±HHMM`, or `±HH:MM`)
///
/// Components are range-checked (month 01-12, day 01-31, hour 00-23,
/// minute 00-59, second 00-60 to admit a leap second), but the day is not
/// validated against the month's length — `2024-02-31` still matches.
///
/// Returns `false` for anything else, including a partial time such as
/// `HH:MM` and any trailing text after an otherwise valid timestamp.
fn is_iso8601(s: &str) -> bool {
    /// Parses exactly two ASCII digits into their numeric value.
    fn two_digits(pair: &[u8]) -> Option<u8> {
        match pair {
            [hi, lo] if hi.is_ascii_digit() && lo.is_ascii_digit() => {
                Some((*hi - b'0') * 10 + (*lo - b'0'))
            }
            _ => None,
        }
    }

    /// True when `pair` is two ASCII digits whose value lies in `range`.
    fn in_range(pair: &[u8], range: std::ops::RangeInclusive<u8>) -> bool {
        matches!(two_digits(pair), Some(v) if range.contains(&v))
    }

    /// True for `Z` or a numeric UTC offset: `±HH`, `±HHMM`, or `±HH:MM`.
    fn is_utc_offset(tz: &[u8]) -> bool {
        match tz {
            [b'Z'] => true,
            [b'+' | b'-', offset @ ..] => match offset.len() {
                2 => in_range(offset, 0..=23),
                4 => in_range(&offset[..2], 0..=23) && in_range(&offset[2..], 0..=59),
                5 => {
                    in_range(&offset[..2], 0..=23)
                        && offset[2] == b':'
                        && in_range(&offset[3..], 0..=59)
                }
                _ => false,
            },
            _ => false,
        }
    }

    // Work on bytes: every accepted character is ASCII, and byte indexing
    // cannot split a multi-byte character the way `str` slicing could.
    let bytes = s.as_bytes();

    // Minimum: "YYYY-MM-DD" = 10 bytes.
    if bytes.len() < 10 {
        return false;
    }
    let (date, rest) = bytes.split_at(10);

    let date_ok = date[..4].iter().all(u8::is_ascii_digit)
        && date[4] == b'-'
        && in_range(&date[5..7], 1..=12)
        && date[7] == b'-'
        && in_range(&date[8..10], 1..=31);
    if !date_ok {
        return false;
    }

    // "YYYY-MM-DD" alone is valid.
    if rest.is_empty() {
        return true;
    }

    // A time component needs a separator plus "HH:MM:SS" = 9 bytes; anything
    // shorter is a partial time and is rejected.
    if rest.len() < 9 || !matches!(rest[0], b'T' | b' ') {
        return false;
    }
    let (time, suffix) = rest[1..].split_at(8);

    let time_ok = in_range(&time[..2], 0..=23)
        && time[2] == b':'
        && in_range(&time[3..5], 0..=59)
        && time[5] == b':'
        && in_range(&time[6..8], 0..=60);
    if !time_ok {
        return false;
    }

    // Optional fractional seconds: a separator followed by one or more digits.
    let suffix = match suffix {
        [b'.' | b',', frac @ ..] => {
            let digits = frac.iter().take_while(|b| b.is_ascii_digit()).count();
            if digits == 0 {
                return false;
            }
            &frac[digits..]
        }
        _ => suffix,
    };

    // Whatever remains must be a timezone designator; trailing text that is
    // not one means the string merely starts with a timestamp.
    suffix.is_empty() || is_utc_offset(suffix)
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Runs type detection on an owned JSON value so each assertion can pass
    /// a `json!` literal directly.
    fn infer(value: Value) -> Option<FieldType> {
        detect_field_type(&value)
    }

    // --- Scalars other than strings ---

    #[test]
    fn detect_boolean() {
        for flag in [true, false] {
            assert_eq!(infer(json!(flag)), Some(FieldType::Boolean));
        }
    }

    #[test]
    fn detect_long() {
        for int in [42, -1, 0] {
            assert_eq!(infer(json!(int)), Some(FieldType::Long));
        }
    }

    #[test]
    fn large_integer_is_long() {
        // Beyond the i32 range but still within i64.
        assert_eq!(infer(json!(9_999_999_999_i64)), Some(FieldType::Long));
    }

    #[test]
    fn detect_double() {
        for float in [3.14, -0.5] {
            assert_eq!(infer(json!(float)), Some(FieldType::Double));
        }
    }

    // --- Strings: plain text vs. date-shaped ---

    #[test]
    fn detect_text() {
        for plain in ["hello world", "", "not-a-date"] {
            assert_eq!(infer(json!(plain)), Some(FieldType::Text));
        }
    }

    #[test]
    fn detect_date_date_only() {
        assert_eq!(infer(json!("2024-01-15")), Some(FieldType::Date));
    }

    #[test]
    fn detect_date_with_time() {
        assert_eq!(infer(json!("2024-01-15T10:30:00")), Some(FieldType::Date));
    }

    #[test]
    fn detect_date_with_timezone() {
        for stamp in ["2024-01-15T10:30:00Z", "2024-01-15T10:30:00+05:30"] {
            assert_eq!(infer(json!(stamp)), Some(FieldType::Date));
        }
    }

    #[test]
    fn detect_date_with_fractional_seconds() {
        assert_eq!(
            infer(json!("2024-01-15T10:30:00.123")),
            Some(FieldType::Date)
        );
    }

    #[test]
    fn detect_date_space_separator() {
        assert_eq!(infer(json!("2024-01-15 10:30:00")), Some(FieldType::Date));
    }

    #[test]
    fn reject_partial_dates() {
        // "2024-01" is not a full date.
        assert_eq!(infer(json!("2024-01")), Some(FieldType::Text));
    }

    // --- Values the detector leaves to the caller ---

    #[test]
    fn null_returns_none() {
        assert_eq!(infer(json!(null)), None);
    }

    #[test]
    fn array_returns_none() {
        assert_eq!(infer(json!([1, 2, 3])), None);
    }

    #[test]
    fn object_returns_none() {
        assert_eq!(infer(json!({"key": "val"})), None);
    }
}