laurus 0.7.0

Unified search library for lexical, vector, and semantic retrieval
pub mod query;
pub mod schema;
pub mod search;

use std::collections::{HashMap, HashSet};
use std::sync::Arc;

use parking_lot::RwLock;

use crate::analysis::analyzer::analyzer::Analyzer;
use crate::analysis::analyzer::keyword::KeywordAnalyzer;
use crate::analysis::analyzer::per_field::PerFieldAnalyzer;
use crate::analysis::analyzer::standard::StandardAnalyzer;
use crate::data::Document;
use crate::embedding::embedder::Embedder;
use crate::engine::search::{FusionAlgorithm, SearchResult};
use crate::error::Result;
use crate::lexical::store::LexicalStore;
use crate::lexical::store::config::LexicalIndexConfig;
use crate::storage::Storage;
use crate::storage::prefixed::PrefixedStorage;
use crate::store::log::{DocumentLog, LogEntry};
use crate::vector::store::VectorStore;
use crate::vector::store::config::VectorIndexConfig;

use self::schema::Schema;

/// Combined statistics from both the lexical and vector stores.
#[derive(Debug, Clone, Default)]
pub struct EngineStats {
    /// Total number of documents in the index (from the lexical store).
    pub document_count: u64,
    /// Per-field vector statistics, keyed by field name.
    /// Empty when the schema contains no vector fields.
    pub vector_fields: HashMap<String, crate::vector::index::field::VectorFieldStats>,
}

/// Unified Engine that manages both Lexical and Vector indices.
///
/// This engine acts as a facade, coordinating document ingestion and search
/// across the underlying specialized engines. All index mutations are
/// WAL-backed via [`DocumentLog`] for crash-recovery durability.
///
/// A system field `_id` is automatically injected into every indexed document
/// to track the external document identifier.
pub struct Engine {
    schema: RwLock<Schema>,
    lexical: LexicalStore,
    vector: VectorStore,
    log: Arc<DocumentLog>,
}

impl Engine {
    /// Create a new Unified Engine with the default analyzer and no embedder.
    ///
    /// For custom analyzer or embedder configuration, use [`Engine::builder`].
    ///
    /// # Errors
    ///
    /// Returns an error if storage initialization, index creation, or
    /// WAL recovery fails.
    pub async fn new(storage: Arc<dyn Storage>, schema: Schema) -> Result<Self> {
        EngineBuilder::new(storage, schema).build().await
    }

    /// Create an [`EngineBuilder`] for custom configuration.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let engine = Engine::builder(storage, schema)
    ///     .analyzer(Arc::new(StandardAnalyzer::default()))
    ///     .embedder(Arc::new(MyEmbedder))
    ///     .build()
    ///     .await?;
    /// ```
    pub fn builder(storage: Arc<dyn Storage>, schema: Schema) -> EngineBuilder {
        EngineBuilder::new(storage, schema)
    }

    /// Recover index state from the document log.
    async fn recover(&self) -> Result<()> {
        // read_all() internally syncs next_doc_id with doc_store segments.
        let records = self.log.read_all()?;

        if records.is_empty() {
            return Ok(());
        }

        let vector_last_seq = self.vector.last_wal_seq();
        let lexical_last_seq = self.lexical.last_wal_seq();

        for record in records {
            if record.seq <= vector_last_seq && record.seq <= lexical_last_seq {
                continue;
            }

            match record.entry {
                LogEntry::Upsert {
                    doc_id,
                    external_id: _,
                    document,
                } => {
                    // Restore document into document store
                    let stored_doc = self.filter_stored_fields(&document);
                    self.log.store_document(doc_id, stored_doc);

                    // Re-index into both stores using the recorded doc_id.
                    // Update seq only after BOTH stores succeed to maintain atomicity.
                    if record.seq > lexical_last_seq {
                        self.lexical.upsert_document(doc_id, document.clone())?;
                    }

                    if record.seq > vector_last_seq {
                        // Filter for vector fields
                        let mut vector_doc = Document::new();
                        {
                            let schema = self.schema.read();
                            for (name, val) in &document.fields {
                                if schema.fields.get(name).is_some_and(|fc| fc.is_vector()) {
                                    vector_doc.fields.insert(name.clone(), val.clone());
                                }
                            }
                        }
                        self.vector
                            .upsert_document_by_internal_id(doc_id, vector_doc)
                            .await?;
                    }

                    // Both stores succeeded — now update seq trackers
                    if record.seq > lexical_last_seq {
                        self.lexical.set_last_wal_seq(record.seq)?;
                    }
                    if record.seq > vector_last_seq {
                        self.vector.set_last_wal_seq(record.seq);
                    }
                }
                LogEntry::Delete {
                    doc_id,
                    external_id: _,
                } => {
                    if record.seq > lexical_last_seq {
                        self.lexical.delete_document_by_internal_id(doc_id)?;
                    }
                    if record.seq > vector_last_seq {
                        self.vector.delete_document_by_internal_id(doc_id).await?;
                    }

                    // Both stores succeeded — now update seq trackers
                    if record.seq > lexical_last_seq {
                        self.lexical.set_last_wal_seq(record.seq)?;
                    }
                    if record.seq > vector_last_seq {
                        self.vector.set_last_wal_seq(record.seq);
                    }
                }
            }
        }
        Ok(())
    }

    /// Put (upsert) a document.
    ///
    /// If a document with the same external ID exists, all its chunks are
    /// deleted before the new document is indexed. An `_id` field is
    /// automatically inserted into the document with the provided `id` value.
    /// A WAL entry is written before any index mutations to ensure durability.
    ///
    /// The document fields are routed to the appropriate underlying stores
    /// (lexical or vector) based on the schema field configuration. If the
    /// vector store indexing fails after the lexical store has already been
    /// updated, the lexical insert is rolled back to maintain cross-store
    /// consistency.
    ///
    /// # Parameters
    ///
    /// - `id` - The external document identifier.
    /// - `doc` - The document to index.
    ///
    /// # Errors
    ///
    /// Returns an error if the WAL write, deletion of existing documents,
    /// or indexing into either the lexical or vector store fails.
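    ///
    /// # Example
    ///
    /// A minimal sketch; `engine` is assumed to be an already-initialized
    /// [`Engine`]:
    ///
    /// ```ignore
    /// use laurus::data::{DataValue, Document};
    ///
    /// let mut doc = Document::new();
    /// doc.fields
    ///     .insert("title".to_string(), DataValue::Text("Hello, world".to_string()));
    /// // Replaces any existing chunks indexed under "doc-1".
    /// engine.put_document("doc-1", doc).await?;
    /// ```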
    pub async fn put_document(&self, id: &str, doc: Document) -> Result<()> {
        let _ = self.index_internal(id, doc, false).await?;
        Ok(())
    }

    /// Add a document as a new chunk (always appends, never deletes existing).
    ///
    /// Unlike [`put_document`](Self::put_document), this method does **not**
    /// delete existing documents with the same external ID. Multiple chunks
    /// can share the same ID, which is useful for indexing parts of a large
    /// document (e.g. paragraphs or pages) separately while keeping them
    /// associated with the same logical document.
    ///
    /// An `_id` field is automatically inserted into the document with the
    /// provided `id` value. A WAL entry is written before any index mutations
    /// to ensure durability.
    ///
    /// # Parameters
    ///
    /// - `id` - The external document identifier (may duplicate existing IDs).
    /// - `doc` - The document chunk to index.
    ///
    /// # Errors
    ///
    /// Returns an error if the WAL write or indexing into either the lexical
    /// or vector store fails.
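    ///
    /// # Example
    ///
    /// A sketch of chunked indexing; `engine` is assumed to be initialized:
    ///
    /// ```ignore
    /// use laurus::data::{DataValue, Document};
    ///
    /// // Both chunks are stored under the same external ID "doc-1".
    /// for text in ["first paragraph", "second paragraph"] {
    ///     let mut chunk = Document::new();
    ///     chunk.fields
    ///         .insert("body".to_string(), DataValue::Text(text.to_string()));
    ///     engine.add_document("doc-1", chunk).await?;
    /// }
    /// ```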
    pub async fn add_document(&self, id: &str, doc: Document) -> Result<()> {
        let _ = self.index_internal(id, doc, true).await?;
        Ok(())
    }

    async fn index_internal(&self, id: &str, mut doc: Document, as_chunk: bool) -> Result<u64> {
        // 1. Inject _id field
        use crate::data::DataValue;
        doc.fields
            .insert("_id".to_string(), DataValue::Text(id.to_string()));

        if !as_chunk {
            self.delete_documents(id).await?;
        }

        // 2. Write-Ahead Log: assign doc_id + persist (before any index updates)
        let (doc_id, seq) = self.log.append(id, doc.clone())?;

        // 3. Store only stored fields for retrieval (WAL has full data for recovery)
        let stored_doc = self.filter_stored_fields(&doc);
        self.log.store_document(doc_id, stored_doc);

        // 4. Prepare vector document (extract vector fields only)
        let mut vector_doc = Document::new();
        {
            let schema = self.schema.read();
            for (name, val) in &doc.fields {
                if schema.fields.get(name).is_some_and(|fc| fc.is_vector()) {
                    vector_doc.fields.insert(name.clone(), val.clone());
                }
            }
        }

        // 5. Index into Lexical and Vector stores
        self.lexical.upsert_document(doc_id, doc)?;
        if let Err(e) = self
            .vector
            .upsert_document_by_internal_id(doc_id, vector_doc)
            .await
        {
            // Rollback lexical insert to maintain consistency
            let _ = self.lexical.delete_document_by_internal_id(doc_id);
            return Err(e);
        }

        // 6. Update sub-stores sequence tracker AFTER both stores succeed.
        // This ensures failed index operations are retried on recovery.
        self.lexical.set_last_wal_seq(seq)?;
        self.vector.set_last_wal_seq(seq);

        Ok(doc_id)
    }

    /// Delete all documents (including chunks) by external ID.
    ///
    /// Looks up all internal document IDs associated with the given external
    /// `id` via the `_id` field in the lexical index, then removes each one
    /// from both the lexical and vector stores. A WAL delete entry is written
    /// for each matched document before mutation.
    ///
    /// If no documents match the given ID, the operation completes
    /// successfully without error (non-existent IDs are silently ignored).
    ///
    /// # Parameters
    ///
    /// - `id` - The external document identifier to delete.
    ///
    /// # Errors
    ///
    /// Returns an error if the WAL write, lexical deletion, or vector
    /// deletion fails for any matched document.
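    ///
    /// # Example
    ///
    /// A sketch; `engine` is assumed to be initialized:
    ///
    /// ```ignore
    /// // Removes every chunk indexed under "doc-1".
    /// engine.delete_documents("doc-1").await?;
    /// // Unknown IDs are silently ignored, so this also returns Ok(()).
    /// engine.delete_documents("never-indexed").await?;
    /// ```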
    pub async fn delete_documents(&self, id: &str) -> Result<()> {
        let doc_ids = self.lexical.find_doc_ids_by_term("_id", id)?;
        for doc_id in doc_ids {
            // 1. Write to log
            let seq = self.log.append_delete(doc_id, id)?;
            // 2. Delete from Lexical
            self.lexical.delete_document_by_internal_id(doc_id)?;
            // 3. Delete from Vector
            self.vector.delete_document_by_internal_id(doc_id).await?;
            // 4. Update trackers AFTER both deletes succeed.
            // This ensures failed deletes are retried on recovery.
            self.lexical.set_last_wal_seq(seq)?;
            self.vector.set_last_wal_seq(seq);
        }
        Ok(())
    }

    /// Commit changes to both stores and truncate the WAL.
    ///
    /// Persists all pending changes in the lexical store, vector store, and
    /// document store (in that order), then truncates the WAL. After a
    /// successful commit, the WAL is empty and all data is durable in the
    /// underlying storage.
    ///
    /// # Errors
    ///
    /// Returns an error if committing the lexical store, vector store,
    /// document store, or truncating the WAL fails.
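    ///
    /// # Example
    ///
    /// A sketch of an ingest-then-commit cycle; `engine` and `doc` are
    /// assumed to exist:
    ///
    /// ```ignore
    /// engine.put_document("doc-1", doc).await?;
    /// // Until commit, durability relies on the WAL alone.
    /// engine.commit().await?; // stores persisted, WAL truncated
    /// ```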
    pub async fn commit(&self) -> Result<()> {
        self.lexical.commit()?;
        self.vector.commit().await?;
        self.log.commit_documents()?;
        // After successful commit to all stores, truncate the log
        self.log.truncate()?;
        Ok(())
    }

    /// Get combined index statistics from both the lexical and vector stores.
    ///
    /// Returns an [`EngineStats`] containing:
    /// - `document_count` from the lexical index (authoritative source).
    /// - Per-field vector statistics from the vector store (empty when no
    ///   vector fields are defined in the schema).
    ///
    /// # Errors
    ///
    /// Returns an error if the lexical index statistics cannot be retrieved.
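    ///
    /// # Example
    ///
    /// A sketch; `engine` is assumed to be initialized:
    ///
    /// ```ignore
    /// let stats = engine.stats()?;
    /// println!("live documents: {}", stats.document_count);
    /// for (field, field_stats) in &stats.vector_fields {
    ///     println!("vector field {field}: {field_stats:?}");
    /// }
    /// ```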
    pub fn stats(&self) -> Result<EngineStats> {
        let lexical_stats = self.lexical.stats()?;

        let vector_fields = match self.vector.stats() {
            Ok(vs) => vs.fields,
            Err(_) => std::collections::HashMap::new(),
        };

        // doc_count includes deleted documents (soft-deleted, pending merge).
        // Subtract deleted_count for the live document count.
        let live_count = lexical_stats
            .doc_count
            .saturating_sub(lexical_stats.deleted_count);

        Ok(EngineStats {
            document_count: live_count,
            vector_fields,
        })
    }

    /// Return a clone of the current schema.
    ///
    /// This can be used to inspect the schema after dynamic field additions
    /// or to persist it to storage (e.g., `schema.toml`).
    pub fn schema(&self) -> Schema {
        self.schema.read().clone()
    }

    /// Returns the embedder used by the vector store.
    ///
    /// This is useful for constructing a [`VectorQueryParser`] or
    /// [`UnifiedQueryParser`] that shares the same embedder configuration
    /// as the engine.
    pub fn embedder(&self) -> Arc<dyn Embedder> {
        self.vector.embedder()
    }

    /// Create a [`UnifiedQueryParser`] configured for this engine.
    ///
    /// The returned parser uses the engine's analyzer for lexical queries
    /// and the engine's embedder for vector queries. Default fields are
    /// derived from the schema: `default_fields` for lexical queries, and
    /// all vector fields for vector queries.
    ///
    /// # Errors
    ///
    /// Returns an error if the lexical query parser cannot be constructed
    /// (e.g. the analyzer is misconfigured).
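    ///
    /// # Example
    ///
    /// A sketch of parsing a DSL string directly, outside of
    /// [`Engine::search`]:
    ///
    /// ```ignore
    /// let parser = engine.unified_query_parser()?;
    /// let parsed = parser.parse("rust async runtime").await?;
    /// // `parsed.query` is a lexical, vector, or hybrid SearchQuery.
    /// ```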
    pub fn unified_query_parser(&self) -> Result<self::query::UnifiedQueryParser> {
        let lexical_parser = self.lexical.query_parser()?;
        let embedder = self.embedder();

        let schema = self.schema.read();
        let vector_fields: Vec<String> = schema
            .fields
            .iter()
            .filter(|(_, opt)| opt.is_vector())
            .map(|(name, _)| name.clone())
            .collect();

        let vector_field_set: std::collections::HashSet<String> =
            vector_fields.iter().cloned().collect();

        let mut vector_parser = crate::vector::query::parser::VectorQueryParser::new(embedder);
        if !vector_fields.is_empty() {
            vector_parser = vector_parser.with_default_fields(vector_fields);
        }

        Ok(self::query::UnifiedQueryParser::new(
            lexical_parser,
            vector_parser,
            vector_field_set,
        ))
    }

    /// Dynamically add a new field to the engine at runtime.
    ///
    /// This method registers the field in both the engine schema and the
    /// appropriate underlying store (lexical or vector). Only field addition
    /// is supported; removal or type changes are not allowed.
    ///
    /// After adding a field, new documents can include values for this field
    /// and searches can target it. Existing documents are unaffected (they
    /// simply do not have a value for the new field).
    ///
    /// # Arguments
    ///
    /// * `name` - The field name. Must not collide with an existing field.
    /// * `option` - The field configuration (e.g., `FieldOption::Text`,
    ///   `FieldOption::Hnsw`, etc.).
    ///
    /// # Returns
    ///
    /// Returns the updated [`Schema`] on success. The caller is responsible
    /// for persisting it (e.g., writing `schema.toml`).
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - A field with the same name already exists.
    /// - The field references an unknown analyzer or embedder.
    /// - The underlying store rejects the field.
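    ///
    /// # Example
    ///
    /// A sketch; `text_option` stands in for a [`FieldOption`](schema::FieldOption)
    /// value built elsewhere, and `save_schema` is a hypothetical persistence
    /// helper:
    ///
    /// ```ignore
    /// let updated = engine.add_field("summary", text_option).await?;
    /// // The engine does not persist the schema itself.
    /// save_schema(&updated)?;
    /// ```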
    pub async fn add_field(&self, name: &str, option: schema::FieldOption) -> Result<Schema> {
        // 1. Check for duplicates.
        {
            let schema = self.schema.read();
            if schema.fields.contains_key(name) {
                return Err(crate::error::LaurusError::invalid_argument(format!(
                    "Field '{name}' already exists in the schema"
                )));
            }
        }

        // 2. Register in the appropriate store.
        if option.is_lexical() {
            // Resolve the per-field analyzer if configured.
            let field_analyzer = if let schema::FieldOption::Text(ref text_opt) = option
                && let Some(ref analyzer_name) = text_opt.analyzer
            {
                let schema = self.schema.read();
                let analyzer = match crate::analysis::analyzer::registry::create_analyzer_by_name(
                    analyzer_name,
                ) {
                    Ok(a) => a,
                    Err(_) => {
                        let def = schema.analyzers.get(analyzer_name).ok_or_else(|| {
                            crate::error::LaurusError::invalid_argument(format!(
                                "Unknown analyzer '{analyzer_name}' for field '{name}': \
                                     not a built-in and not defined in schema.analyzers"
                            ))
                        })?;
                        crate::analysis::analyzer::registry::create_analyzer_from_definition(
                            analyzer_name,
                            def,
                        )?
                    }
                };
                Some(analyzer)
            } else {
                None
            };

            let lexical_opt = option
                .to_lexical()
                .expect("is_lexical() was true but to_lexical() returned None");
            self.lexical.add_field(name, lexical_opt, field_analyzer)?;
        }

        if option.is_vector() {
            // Resolve the per-field embedder if configured.
            // Clone the embedder definition out of the schema lock before
            // calling the async factory so that the non-Send parking_lot
            // guard is not held across an await point.
            let field_embedder = if let Some(embedder_name) = option.embedder_name() {
                let embedder_def = {
                    let schema = self.schema.read();
                    schema.embedders.get(embedder_name).cloned()
                };
                if let Some(def) = embedder_def {
                    Some(
                        crate::embedding::registry::create_embedder_from_definition(
                            embedder_name,
                            &def,
                        )
                        .await?,
                    )
                } else {
                    None
                }
            } else {
                None
            };

            self.vector.add_field(name, field_embedder).await;
        }

        // 3. Update the schema.
        {
            let mut schema = self.schema.write();
            schema.fields.insert(name.to_string(), option);
        }

        Ok(self.schema.read().clone())
    }

    /// Dynamically remove a field from the engine schema at runtime.
    ///
    /// This removes the field definition from the schema so that it is no longer
    /// available for indexing or searching. Existing data already stored in the
    /// index is **not** deleted; it simply becomes inaccessible through the
    /// normal query path.
    ///
    /// For lexical fields, the field is also removed from the underlying
    /// [`LexicalStore`] (if it was dynamically added) and any per-field analyzer
    /// is unregistered. For vector fields, the per-field embedder is
    /// unregistered and writer/searcher caches are invalidated.
    ///
    /// If the deleted field appears in [`Schema::default_fields`], it is removed
    /// from that list as well.
    ///
    /// # Arguments
    ///
    /// * `name` - The name of the field to delete
    ///
    /// # Returns
    ///
    /// The updated [`Schema`] after the field has been removed.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - No field with the given name exists in the schema.
    /// - The underlying store rejects the deletion.
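    ///
    /// # Example
    ///
    /// A sketch; already-indexed data for the field is retained on disk but
    /// becomes unreachable through queries:
    ///
    /// ```ignore
    /// let updated = engine.delete_field("summary").await?;
    /// assert!(!updated.fields.contains_key("summary"));
    /// ```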
    pub async fn delete_field(&self, name: &str) -> Result<Schema> {
        // 1. Check that the field exists.
        let option = {
            let schema = self.schema.read();
            schema.fields.get(name).cloned().ok_or_else(|| {
                crate::error::LaurusError::invalid_argument(format!(
                    "Field '{name}' does not exist in the schema"
                ))
            })?
        };

        // 2. Remove from the appropriate store.
        if option.is_lexical() {
            self.lexical.delete_field(name)?;
        }

        if option.is_vector() {
            self.vector.delete_field(name).await;
        }

        // 3. Update the schema.
        {
            let mut schema = self.schema.write();
            schema.fields.remove(name);
            schema.default_fields.retain(|f| f != name);
        }

        Ok(self.schema.read().clone())
    }

    /// Resolve a [`LexicalSearchQuery`] into a concrete [`Query`] object.
    ///
    /// If the query is already an `Obj` variant, it is returned as-is.
    /// If it is a `Dsl` string, it is parsed using the lexical store's
    /// query parser (which includes the configured analyzer and default fields).
    ///
    /// # Arguments
    ///
    /// * `query` - The query to resolve.
    ///
    /// # Errors
    ///
    /// Returns an error if the DSL string cannot be parsed.
    fn resolve_query(
        &self,
        query: crate::lexical::search::searcher::LexicalSearchQuery,
    ) -> Result<Box<dyn crate::lexical::query::Query>> {
        match query {
            crate::lexical::search::searcher::LexicalSearchQuery::Obj(q) => Ok(q),
            crate::lexical::search::searcher::LexicalSearchQuery::Dsl(dsl) => {
                let parser = self.lexical.query_parser()?;
                parser.parse(&dsl)
            }
        }
    }

    /// Resolve a [`SearchQuery`](self::search::SearchQuery) into internal
    /// search request types for the lexical and vector stores.
    ///
    /// This method converts the public query enum variants into the
    /// internal `LexicalSearchRequest` and `VectorSearchRequest` types,
    /// applying the relevant options.
    ///
    /// # Parameters
    ///
    /// * `query` - The search query to resolve.
    /// * `offset` - The pagination offset from the search request.
    /// * `limit` - The result limit from the search request.
    /// * `fusion_algorithm` - The caller-specified fusion algorithm, if any.
    /// * `lexical_options` - Lexical search options.
    /// * `vector_options` - Vector search options.
    ///
    /// # Panics
    ///
    /// Panics (via `unreachable!`) if called with `SearchQuery::Dsl`; DSL
    /// queries must be resolved before calling this method.
    #[allow(clippy::type_complexity)]
    fn resolve_search_query_from_parts(
        &self,
        query: self::search::SearchQuery,
        offset: usize,
        limit: usize,
        fusion_algorithm: Option<FusionAlgorithm>,
        lexical_options: &self::search::LexicalSearchOptions,
        vector_options: &self::search::VectorSearchOptions,
    ) -> Result<(
        Option<crate::lexical::search::searcher::LexicalSearchRequest>,
        Option<crate::vector::store::request::VectorSearchRequest>,
        Option<FusionAlgorithm>,
        self::search::HybridMode,
    )> {
        let fetch_count = offset.saturating_add(limit);

        match query {
            self::search::SearchQuery::Dsl(_) => {
                // DSL should be parsed by UnifiedQueryParser before calling this
                unreachable!("DSL should be resolved before resolve_search_query_from_parts")
            }
            self::search::SearchQuery::Lexical(lexical_query) => {
                let lex_req = crate::lexical::search::searcher::LexicalSearchRequest {
                    query: lexical_query,
                    params: crate::lexical::search::searcher::LexicalSearchParams {
                        limit: 0, // Controlled by engine
                        min_score: lexical_options.min_score,
                        load_documents: true,
                        timeout_ms: lexical_options.timeout_ms,
                        parallel: lexical_options.parallel,
                        sort_by: lexical_options.sort_by.clone(),
                    },
                    field_boosts: lexical_options.field_boosts.clone(),
                };
                Ok((Some(lex_req), None, None, self::search::HybridMode::Union))
            }
            self::search::SearchQuery::Vector(vector_query) => {
                let vec_req = self.build_vector_request(vector_query, vector_options, fetch_count);
                Ok((None, Some(vec_req), None, self::search::HybridMode::Union))
            }
            self::search::SearchQuery::Hybrid {
                lexical,
                vector,
                mode,
            } => {
                let lex_req = crate::lexical::search::searcher::LexicalSearchRequest {
                    query: lexical,
                    params: crate::lexical::search::searcher::LexicalSearchParams {
                        limit: 0, // Controlled by engine
                        min_score: lexical_options.min_score,
                        load_documents: true,
                        timeout_ms: lexical_options.timeout_ms,
                        parallel: lexical_options.parallel,
                        sort_by: lexical_options.sort_by.clone(),
                    },
                    field_boosts: lexical_options.field_boosts.clone(),
                };
                let vec_req = self.build_vector_request(vector, vector_options, fetch_count);
                let fusion = fusion_algorithm.or(Some(FusionAlgorithm::RRF { k: 60.0 }));
                Ok((Some(lex_req), Some(vec_req), fusion, mode))
            }
        }
    }

    /// Build a [`VectorSearchRequest`](crate::vector::store::request::VectorSearchRequest)
    /// from a [`VectorSearchQuery`](self::search::VectorSearchQuery) and options.
    ///
    /// # Parameters
    ///
    /// * `query` - The vector search query (payloads or pre-embedded vectors).
    /// * `opts` - Vector search options (score mode, min score).
    /// * `limit` - Maximum number of results to fetch.
    fn build_vector_request(
        &self,
        query: self::search::VectorSearchQuery,
        opts: &self::search::VectorSearchOptions,
        limit: usize,
    ) -> crate::vector::store::request::VectorSearchRequest {
        crate::vector::store::request::VectorSearchRequest {
            query,
            params: crate::vector::search::searcher::VectorSearchParams {
                fields: None,
                limit,
                score_mode: opts.score_mode,
                overfetch: 2.0,
                min_score: opts.min_score,
                allowed_ids: None,
            },
        }
    }

    /// Get all documents (including chunks) by external ID.
    ///
    /// Only fields marked as stored in the schema are included in the
    /// returned documents. If no documents match the given ID, an empty
    /// `Vec` is returned (not an error).
    ///
    /// # Parameters
    ///
    /// - `id` - The external document identifier to look up.
    ///
    /// # Errors
    ///
    /// Returns an error if the internal ID lookup or document retrieval fails.
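    ///
    /// # Example
    ///
    /// A sketch; chunked documents come back as one entry per chunk:
    ///
    /// ```ignore
    /// let chunks = engine.get_documents("doc-1").await?;
    /// for chunk in &chunks {
    ///     // Only schema-stored fields (plus `_id`) are present.
    ///     println!("{:?}", chunk.fields.keys());
    /// }
    /// ```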
    pub async fn get_documents(&self, id: &str) -> Result<Vec<Document>> {
        let doc_ids = self.lexical.find_doc_ids_by_term("_id", id)?;
        let mut docs = Vec::with_capacity(doc_ids.len());
        for doc_id in doc_ids {
            if let Some(doc) = self.get_document_by_internal_id(doc_id)? {
                docs.push(doc);
            }
        }
        Ok(docs)
    }

    /// Check if a field should be stored based on the schema.
    ///
    /// - `_id`: always stored (system field)
    /// - Lexical fields: stored only if `stored=true`
    /// - Vector fields: always stored
    /// - Unknown fields: not stored
    fn is_field_stored(&self, name: &str) -> bool {
        use crate::engine::schema::FieldOption;

        if name == "_id" {
            return true;
        }
        let schema = self.schema.read();
        if let Some(field_opt) = schema.fields.get(name) {
            match field_opt {
                FieldOption::Text(o) => o.stored,
                FieldOption::Integer(o) => o.stored,
                FieldOption::Float(o) => o.stored,
                FieldOption::Boolean(o) => o.stored,
                FieldOption::DateTime(o) => o.stored,
                FieldOption::Geo(o) => o.stored,
                FieldOption::Bytes(o) => o.stored,
                // Vector fields are always stored
                FieldOption::Hnsw(_) | FieldOption::Flat(_) | FieldOption::Ivf(_) => true,
            }
        } else {
            false
        }
    }

    /// Filter a document to only include fields that should be stored.
    ///
    /// The document log (WAL) stores ALL fields for recovery, but the
    /// document store only keeps stored fields to save space.
    fn filter_stored_fields(&self, doc: &Document) -> Document {
        let mut stored_doc = Document::new();
        for (name, val) in &doc.fields {
            if self.is_field_stored(name) {
                stored_doc.fields.insert(name.clone(), val.clone());
            }
        }
        stored_doc
    }

    /// Get a document by its internal ID (private helper).
    ///
    /// Retrieves from the document log and filters out non-stored fields.
    fn get_document_by_internal_id(&self, doc_id: u64) -> Result<Option<Document>> {
        let doc = self.log.get_document(doc_id)?;

        if let Some(doc) = doc {
            Ok(Some(self.filter_stored_fields(&doc)))
        } else {
            Ok(None)
        }
    }

    /// Batch-resolve external IDs and documents for multiple internal IDs.
    ///
    /// Fetches all documents in one pass through the document store,
    /// reducing per-document lock acquisition overhead.
    ///
    /// # Arguments
    ///
    /// * `internal_ids` - Slice of internal document IDs.
    ///
    /// # Returns
    ///
    /// A map from internal ID to `(external_id, Option<Document>)`.
    fn resolve_ids_and_documents_batch(
        &self,
        internal_ids: &[u64],
    ) -> Result<HashMap<u64, (String, Option<Document>)>> {
        let mut results = HashMap::with_capacity(internal_ids.len());
        for &id in internal_ids {
            if let Some(doc) = self.log.get_document(id)? {
                let external_id = doc
                    .fields
                    .get("_id")
                    .and_then(|v| v.as_text())
                    .map(|s| s.to_string())
                    .unwrap_or_else(|| format!("unknown_{}", id));
                let filtered = self.filter_stored_fields(&doc);
                results.insert(id, (external_id, Some(filtered)));
            } else {
                results.insert(id, (format!("unknown_{}", id), None));
            }
        }
        Ok(results)
    }

    /// Split the unified schema into specialized configs.
    async fn split_schema(
        schema: &Schema,
        analyzer: Option<Arc<dyn Analyzer>>,
        embedder: Option<Arc<dyn Embedder>>,
    ) -> Result<(LexicalIndexConfig, VectorIndexConfig)> {
        // Construct Lexical Config
        let analyzer = match analyzer {
            Some(a) => a,
            None => Arc::new(StandardAnalyzer::new()?),
        };

        // If the user passed a PerFieldAnalyzer, clone it and ensure _id uses KeywordAnalyzer.
        // Otherwise, wrap the simple analyzer in a new PerFieldAnalyzer.
        let per_field_analyzer =
            if let Some(existing) = analyzer.as_any().downcast_ref::<PerFieldAnalyzer>() {
                let pfa = existing.clone();
                pfa.add_analyzer("_id", Arc::new(KeywordAnalyzer::new()));
                pfa
            } else {
                let pfa = PerFieldAnalyzer::new(analyzer);
                pfa.add_analyzer("_id", Arc::new(KeywordAnalyzer::new()));
                pfa
            };

        // Register per-field analyzers declared in the schema.
        // Resolution order: built-in name → custom definition in schema.analyzers.
        for (name, field_option) in &schema.fields {
            if let schema::FieldOption::Text(text_opt) = field_option
                && let Some(analyzer_name) = &text_opt.analyzer
            {
                let field_analyzer =
                    match crate::analysis::analyzer::registry::create_analyzer_by_name(
                        analyzer_name,
                    ) {
                        Ok(a) => a,
                        Err(_) => {
                            let def = schema.analyzers.get(analyzer_name).ok_or_else(|| {
                                crate::error::LaurusError::invalid_argument(format!(
                                    "Unknown analyzer '{analyzer_name}' for field '{name}': \
                                 not a built-in and not defined in schema.analyzers"
                                ))
                            })?;
                            crate::analysis::analyzer::registry::create_analyzer_from_definition(
                                analyzer_name,
                                def,
                            )?
                        }
                    };
                per_field_analyzer.add_analyzer(name, field_analyzer);
            }
        }

        let mut lexical_builder =
            LexicalIndexConfig::builder().analyzer(Arc::new(per_field_analyzer));

        if !schema.default_fields.is_empty() {
            lexical_builder = lexical_builder.default_fields(schema.default_fields.clone());
        }

        for (name, field_option) in &schema.fields {
            if let Some(lexical_opt) = field_option.to_lexical() {
                lexical_builder = lexical_builder.add_field(name, lexical_opt);
            }
        }

        let lexical_config = lexical_builder.build();

        // Construct Vector Config — resolve embedder from schema if not explicitly provided.
        let embedder = if embedder.is_some() {
            embedder
        } else if !schema.embedders.is_empty() {
            // Build a PerFieldEmbedder from schema.embedders declarations.
            let mut embedder_cache: HashMap<String, Arc<dyn crate::embedding::embedder::Embedder>> =
                HashMap::new();
            let default_embedder: Arc<dyn crate::embedding::embedder::Embedder> =
                Arc::new(crate::embedding::precomputed::PrecomputedEmbedder::new());
            let per_field = crate::embedding::per_field::PerFieldEmbedder::new(default_embedder);

            for (name, field_option) in &schema.fields {
                if let Some(embedder_name) = field_option.embedder_name() {
                    let emb = if let Some(cached) = embedder_cache.get(embedder_name) {
                        cached.clone()
                    } else {
                        let def = schema.embedders.get(embedder_name).ok_or_else(|| {
                            crate::error::LaurusError::invalid_argument(format!(
                                "Unknown embedder '{embedder_name}' for field '{name}': \
                                 not defined in schema.embedders"
                            ))
                        })?;
                        let emb = crate::embedding::registry::create_embedder_from_definition(
                            embedder_name,
                            def,
                        )
                        .await?;
                        embedder_cache.insert(embedder_name.to_string(), emb.clone());
                        emb
                    };
                    per_field.add_embedder(name, emb);
                }
            }

            let emb: Arc<dyn crate::embedding::embedder::Embedder> = Arc::new(per_field);
            Some(emb)
        } else {
            None
        };

        let mut vector_builder = VectorIndexConfig::builder();
        if let Some(embedder) = &embedder {
            vector_builder = vector_builder.embedder(embedder.clone());
        }

        for (name, field_option) in &schema.fields {
            if let Some(vector_opt) = field_option.to_vector() {
                vector_builder = vector_builder.add_field(name, vector_opt)?;
            }
        }

        let vector_config = vector_builder.build()?;

        Ok((lexical_config, vector_config))
    }

    /// Search the index.
    ///
    /// Supports three modes depending on how the
    /// [`SearchRequest`](self::search::SearchRequest) is configured:
    ///
    /// - **Unified query DSL** (via `query_dsl`): The query string is
    ///   parsed using [`UnifiedQueryParser`](self::query::UnifiedQueryParser)
    ///   to automatically extract lexical and/or vector components. This is
    ///   the recommended approach for external callers.
    /// - **Structured fields** (via `lexical_search_request` /
    ///   `vector_search_request`): Lower-level API for programmatic use.
    ///
    /// When `query_dsl` is set, it is parsed first, and the resulting
    /// lexical/vector components replace any explicitly set fields. The
    /// `fusion_algorithm`, `limit`, `offset`, and `filter_query` fields
    /// from the original request are preserved.
    ///
    /// After resolving the query source, the engine executes the
    /// appropriate search mode:
    ///
    /// - **Lexical only**: BM25-scored inverted index search.
    /// - **Vector only**: Nearest-neighbor vector search.
    /// - **Hybrid**: Both searches run and results are merged using the
    ///   configured `fusion_algorithm` (defaults to
    ///   [`RRF { k: 60.0 }`](FusionAlgorithm::RRF)).
    ///
    /// When a `filter_query` is present, it is evaluated first to determine
    /// the set of candidate documents. For lexical search, the filter is
    /// combined with the user query via a boolean `must` + `filter` clause.
    /// For vector search, the filter produces an `allowed_ids` list that
    /// restricts candidate scoring. If the filter matches zero documents,
    /// an empty result is returned immediately.
    ///
    /// When both lexical and vector search requests are present, both fetch
    /// limits are doubled (2x overfetch) to improve fusion quality.
    ///
    /// Results are paginated via `offset` and `limit` on the
    /// [`SearchRequest`](self::search::SearchRequest).
    ///
    /// # Parameters
    ///
    /// - `request` - The unified search request.
    ///
    /// # Errors
    ///
    /// Returns an error if the unified query parsing, filter query
    /// execution, lexical search, vector search, embedding, or document
    /// retrieval fails.
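    ///
    /// # Example
    ///
    /// A sketch of a DSL search. The struct-update construction assumes
    /// `SearchRequest` implements `Default`:
    ///
    /// ```ignore
    /// use laurus::engine::search::{SearchQuery, SearchRequest};
    ///
    /// let request = SearchRequest {
    ///     query: SearchQuery::Dsl("rust async runtime".to_string()),
    ///     limit: 10,
    ///     offset: 0,
    ///     ..Default::default()
    /// };
    /// for hit in engine.search(request).await? {
    ///     println!("{} ({:.3})", hit.id, hit.score);
    /// }
    /// ```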
    pub async fn search(
        &self,
        request: self::search::SearchRequest,
    ) -> Result<Vec<self::search::SearchResult>> {
        // 0a. Resolve query to internal search components
        //
        // When the query is a DSL string, parse it with UnifiedQueryParser to
        // extract both lexical and vector components. For other variants,
        // construct the internal request types from the query + options.
        //
        // Destructure the request upfront so that `query` can be moved
        // independently while the remaining fields stay available.
        let self::search::SearchRequest {
            query: request_query,
            limit: request_limit,
            offset: request_offset,
            fusion_algorithm: request_fusion,
            filter_query: request_filter,
            lexical_options,
            vector_options,
        } = request;

        let (lexical_search_request, vector_search_request, fusion_algorithm, hybrid_mode) =
            match request_query {
                self::search::SearchQuery::Dsl(ref dsl) => {
                    let parser = self.unified_query_parser()?;
                    let parser = if let Some(fusion) = request_fusion {
                        parser.with_fusion(fusion)
                    } else {
                        parser
                    };
                    let parsed = parser.parse(dsl).await?;
                    // UnifiedQueryParser returns Lexical/Vector/Hybrid variants.
                    self.resolve_search_query_from_parts(
                        parsed.query,
                        request_offset,
                        request_limit,
                        request_fusion,
                        &lexical_options,
                        &vector_options,
                    )?
                }
                other => self.resolve_search_query_from_parts(
                    other,
                    request_offset,
                    request_limit,
                    request_fusion,
                    &lexical_options,
                    &vector_options,
                )?,
            };

        // 0b. Pre-process Filter
        let (allowed_ids, lexical_query_override) = if let Some(filter_query) = &request_filter {
            let req = crate::lexical::search::searcher::LexicalSearchRequest::new(
                filter_query.clone_box(),
            )
            .limit(1_000_000)
            .load_documents(false);

            let filter_hits = self.lexical.search(req)?.hits;
            let ids: Vec<u64> = filter_hits.into_iter().map(|h| h.doc_id).collect();

            if ids.is_empty() {
                return Ok(Vec::new());
            }

            let new_lexical_query: Option<Box<dyn crate::lexical::query::Query>> =
                if let Some(lex_req) = &lexical_search_request {
                    use crate::lexical::query::boolean::BooleanQueryBuilder;
                    let user_query = self.resolve_query(lex_req.query.clone())?;
                    let bool_query = BooleanQueryBuilder::new()
                        .must(user_query)
                        .filter(filter_query.clone_box())
                        .build();
                    Some(Box::new(bool_query))
                } else {
                    None
                };

            (Some(ids), new_lexical_query)
        } else {
            (None, None)
        };

        // 1. Execute Lexical Search
        let mut lexical_query_to_use = if lexical_query_override.is_some() {
            lexical_query_override
        } else if let Some(lex_req) = &lexical_search_request {
            Some(self.resolve_query(lex_req.query.clone())?)
        } else {
            None
        };

        if let Some(query) = &mut lexical_query_to_use
            && let Some(lex_req) = &lexical_search_request
            && !lex_req.field_boosts.is_empty()
        {
            query.apply_field_boosts(&lex_req.field_boosts);
        }

        let fetch_count = request_offset.saturating_add(request_limit);

        let lexical_hits = if let Some(query) = &lexical_query_to_use {
            let q = query.clone_box();
            let overfetch_limit = if vector_search_request.is_some() {
                fetch_count.saturating_mul(2)
            } else {
                fetch_count
            };
            let req = crate::lexical::search::searcher::LexicalSearchRequest::new(q)
                .limit(overfetch_limit)
                .load_documents(false);

            self.lexical.search(req)?.hits
        } else {
            Vec::new()
        };

        // 2. Execute Vector Search
        let vector_hits = if let Some(vector_req) = &vector_search_request {
            let mut vreq = vector_req.clone();
            if lexical_search_request.is_some() && vreq.params.limit < fetch_count.saturating_mul(2)
            {
                vreq.params.limit = fetch_count.saturating_mul(2);
            }
            if let Some(ids) = &allowed_ids {
                vreq.params.allowed_ids = Some(ids.clone());
            }
            // Embed Payloads into Vectors before searching.
            // NOTE: When using VectorQueryParser, query is already Vectors
            // at parse time, so this block is skipped. This fallback remains for
            // VectorSearchRequestBuilder users who populate Payloads directly.
            if let crate::vector::search::searcher::VectorSearchQuery::Payloads(ref payloads) =
                vreq.query
            {
                use crate::data::DataValue;
                use crate::embedding::embedder::EmbedInput;
                use crate::embedding::per_field::PerFieldEmbedder;
                use crate::vector::store::request::QueryVector;

                let embedder = self.vector.embedder();
                let mut query_vectors = Vec::new();
                for payload in payloads {
                    let (text_owned, bytes_owned, mime_owned) = match &payload.payload {
                        DataValue::Text(t) => (Some(t.clone()), None, None),
                        DataValue::Bytes(b, m) => (None, Some(b.clone()), m.clone()),
                        _ => continue,
                    };
                    let field_name = payload.field.clone();
                    let input = if let Some(ref text) = text_owned {
                        EmbedInput::Text(text)
                    } else if let Some(ref bytes) = bytes_owned {
                        EmbedInput::Bytes(bytes, mime_owned.as_deref())
                    } else {
                        unreachable!()
                    };
                    let vector =
                        if let Some(pf) = embedder.as_any().downcast_ref::<PerFieldEmbedder>() {
                            pf.embed_field(&field_name, &input).await?
                        } else {
                            embedder.embed(&input).await?
                        };
                    query_vectors.push(QueryVector {
                        vector,
                        weight: payload.weight,
                        fields: Some(vec![payload.field.clone()]),
                    });
                }
                vreq.query =
                    crate::vector::search::searcher::VectorSearchQuery::Vectors(query_vectors);
            }
            self.vector.search(vreq)?.hits
        } else {
            Vec::new()
        };

        // 3. Fusion
        if lexical_search_request.is_some() && vector_search_request.is_some() {
            let algorithm = fusion_algorithm.unwrap_or(FusionAlgorithm::RRF { k: 60.0 });
            let mut results = self.fuse_results(
                lexical_hits,
                vector_hits,
                algorithm,
                hybrid_mode,
                fetch_count,
            )?;
            if request_offset > 0 {
                results = results.into_iter().skip(request_offset).collect();
            }
            results.truncate(request_limit);
            Ok(results)
        } else if !vector_hits.is_empty() {
            // Only vector results — batch-resolve external IDs and documents.
            let paginated: Vec<_> = vector_hits
                .into_iter()
                .skip(request_offset)
                .take(request_limit)
                .collect();
            let ids: Vec<u64> = paginated.iter().map(|h| h.doc_id).collect();
            let resolved = self.resolve_ids_and_documents_batch(&ids)?;
            let mut results = Vec::with_capacity(paginated.len());
            for hit in paginated {
                if let Some((external_id, document)) = resolved.get(&hit.doc_id) {
                    results.push(SearchResult {
                        id: external_id.clone(),
                        score: hit.score,
                        document: document.clone(),
                    });
                }
            }
            Ok(results)
        } else {
            // Only lexical results (or both empty)
            let paginated: Vec<_> = lexical_hits
                .into_iter()
                .skip(request_offset)
                .take(request_limit)
                .collect();
            let ids: Vec<u64> = paginated.iter().map(|h| h.doc_id).collect();
            let resolved = self.resolve_ids_and_documents_batch(&ids)?;
            let mut results = Vec::with_capacity(paginated.len());
            for hit in paginated {
                if let Some((external_id, document)) = resolved.get(&hit.doc_id) {
                    results.push(SearchResult {
                        id: external_id.clone(),
                        score: hit.score,
                        document: document.clone(),
                    });
                }
            }
            Ok(results)
        }
    }

    /// Combine results from lexical and vector engines.
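    ///
    /// With [`FusionAlgorithm::RRF`], each hit contributes `1.0 / (k + rank)`
    /// (rank is 1-based) to its document's fused score; with
    /// [`FusionAlgorithm::WeightedSum`], each engine's scores are min-max
    /// normalized to `[0, 1]` and combined using the configured weights.
    /// For example, under RRF with `k = 60.0`, a document ranked 1st by the
    /// lexical engine and 3rd by the vector engine scores
    /// `1/61 + 1/63 ≈ 0.0323`.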
    fn fuse_results(
        &self,
        lexical_hits: Vec<crate::lexical::query::SearchHit>,
        vector_hits: Vec<crate::vector::store::response::VectorHit>,
        fusion: FusionAlgorithm,
        mode: self::search::HybridMode,
        limit: usize,
    ) -> Result<Vec<SearchResult>> {
        // Collect doc_id sets upfront for intersection filtering.
        let lexical_ids: HashSet<u64> = lexical_hits.iter().map(|h| h.doc_id).collect();
        let vector_ids: HashSet<u64> = vector_hits.iter().map(|h| h.doc_id).collect();

        let mut fused_scores: HashMap<u64, (f32, Option<crate::data::Document>)> = HashMap::new();

        match fusion {
            FusionAlgorithm::RRF { k } => {
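                // Reciprocal Rank Fusion: rank positions, not raw scores,
                // drive the fused score, so the two engines' incomparable
                // score scales never mix directly.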
                for (rank, hit) in lexical_hits.into_iter().enumerate() {
                    let rrf_score = 1.0 / (k + (rank + 1) as f64);
                    let entry = fused_scores
                        .entry(hit.doc_id)
                        .or_insert((0.0, hit.document));
                    entry.0 += rrf_score as f32;
                }
                for (rank, hit) in vector_hits.into_iter().enumerate() {
                    let rrf_score = 1.0 / (k + (rank + 1) as f64);
                    let entry = fused_scores.entry(hit.doc_id).or_insert((0.0, None));
                    entry.0 += rrf_score as f32;
                }
            }
            FusionAlgorithm::WeightedSum {
                lexical_weight,
                vector_weight,
            } => {
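                // Min-max normalize each engine's scores to [0, 1] so that
                // lexical and vector scores, which live on different scales,
                // become comparable before weighting.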
                let lexical_min = lexical_hits
                    .iter()
                    .map(|h| h.score)
                    .fold(f32::INFINITY, f32::min);
                let lexical_max = lexical_hits
                    .iter()
                    .map(|h| h.score)
                    .fold(f32::NEG_INFINITY, f32::max);

                for hit in lexical_hits {
                    let norm_score = if lexical_max > lexical_min {
                        (hit.score - lexical_min) / (lexical_max - lexical_min)
                    } else {
                        1.0
                    };
                    let entry = fused_scores
                        .entry(hit.doc_id)
                        .or_insert((0.0, hit.document));
                    entry.0 += norm_score * lexical_weight;
                }

                let vector_min = vector_hits
                    .iter()
                    .map(|h| h.score)
                    .fold(f32::INFINITY, f32::min);
                let vector_max = vector_hits
                    .iter()
                    .map(|h| h.score)
                    .fold(f32::NEG_INFINITY, f32::max);

                for hit in vector_hits {
                    let norm_score = if vector_max > vector_min {
                        (hit.score - vector_min) / (vector_max - vector_min)
                    } else {
                        1.0
                    };
                    let entry = fused_scores.entry(hit.doc_id).or_insert((0.0, None));
                    entry.0 += norm_score * vector_weight;
                }
            }
        }

        // Intersection mode: keep only documents appearing in BOTH result sets.
        if mode == self::search::HybridMode::Intersection {
            fused_scores.retain(|id, _| lexical_ids.contains(id) && vector_ids.contains(id));
        }

        let mut intermediate: Vec<(u64, f32, Option<crate::data::Document>)> = fused_scores
            .into_iter()
            .map(|(doc_id, (score, document))| (doc_id, score, document))
            .collect();

        // Sort by fused score, descending. `partial_cmp` is `None` only for
        // NaN, which is treated as equal here.
        intermediate.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        // Keep only the top `limit` fused hits; `truncate` is a no-op when
        // there are fewer.
        intermediate.truncate(limit);

        // Batch-resolve external IDs for every fused hit. The same lookup
        // also returns documents, which fills in hits that came only from
        // the vector index (those carry no document).
        let ids_to_resolve: Vec<u64> = intermediate.iter().map(|(doc_id, _, _)| *doc_id).collect();
        let resolved = self.resolve_ids_and_documents_batch(&ids_to_resolve)?;

        let mut results = Vec::with_capacity(intermediate.len());
        for (doc_id, score, document) in intermediate {
            if let Some((external_id, resolved_doc)) = resolved.get(&doc_id) {
                // Prefer the document already fetched by the lexical search;
                // fall back to the batch-resolved copy.
                let final_doc = document.or_else(|| resolved_doc.clone());
                results.push(SearchResult {
                    id: external_id.clone(),
                    score,
                    document: final_doc,
                });
            }
        }

        Ok(results)
    }
}

/// Builder for constructing an [`Engine`] with custom configuration.
///
/// Use this when you need to specify a custom text analyzer or embedding
/// model. For simple cases with default settings (StandardAnalyzer, no
/// embedder), use [`Engine::new`] directly.
///
/// # Example
///
/// ```ignore
/// use std::sync::Arc;
///
/// let schema = Schema::builder()
///     .add_field("content", FieldOption::Text(TextOption::default()))
///     .add_field("content_vec", FieldOption::Flat(FlatOption { dimension: 384, ..Default::default() }))
///     .build();
///
/// let engine = Engine::builder(storage, schema)
///     .analyzer(Arc::new(StandardAnalyzer::default()))
///     .embedder(Arc::new(MyEmbedder))
///     .build()
///     .await?;
/// ```
pub struct EngineBuilder {
    storage: Arc<dyn Storage>,
    schema: Schema,
    analyzer: Option<Arc<dyn Analyzer>>,
    embedder: Option<Arc<dyn Embedder>>,
}

impl EngineBuilder {
    /// Create a new builder with the given storage and schema.
    pub fn new(storage: Arc<dyn Storage>, schema: Schema) -> Self {
        Self {
            storage,
            schema,
            analyzer: None,
            embedder: None,
        }
    }

    /// Set the analyzer for text fields.
    ///
    /// Both simple analyzers (e.g., [`StandardAnalyzer`]) and [`PerFieldAnalyzer`] are
    /// supported. When a `PerFieldAnalyzer` is passed, it is used directly (with `_id`
    /// automatically set to `KeywordAnalyzer` if not already configured).
    ///
    /// If not set, [`StandardAnalyzer`] is used as the default.
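    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `storage` and `schema` are already
    /// constructed; per-field overrides can be registered on the
    /// `PerFieldAnalyzer` before it is passed in:
    ///
    /// ```ignore
    /// use std::sync::Arc;
    ///
    /// // Falls back to StandardAnalyzer for fields without an override.
    /// let per_field = PerFieldAnalyzer::new(Arc::new(StandardAnalyzer::default()));
    /// let engine = Engine::builder(storage, schema)
    ///     .analyzer(Arc::new(per_field))
    ///     .build()
    ///     .await?;
    /// ```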
    pub fn analyzer(mut self, analyzer: Arc<dyn Analyzer>) -> Self {
        self.analyzer = Some(analyzer);
        self
    }

    /// Set the embedder for vector fields.
    ///
    /// Both simple embedders and [`PerFieldEmbedder`](crate::embedding::per_field::PerFieldEmbedder)
    /// are supported. When a `PerFieldEmbedder` is passed, each vector field will use
    /// the embedder registered for that field name, falling back to the default.
    ///
    /// If not set, no embedder is configured.
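    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `storage` and `schema` are already
    /// constructed; `PrecomputedEmbedder` stands in here for a real
    /// embedding model:
    ///
    /// ```ignore
    /// use std::sync::Arc;
    ///
    /// // Falls back to the default embedder for fields without an override.
    /// let per_field = PerFieldEmbedder::new(Arc::new(PrecomputedEmbedder::new()));
    /// let engine = Engine::builder(storage, schema)
    ///     .embedder(Arc::new(per_field))
    ///     .build()
    ///     .await?;
    /// ```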
    pub fn embedder(mut self, embedder: Arc<dyn Embedder>) -> Self {
        self.embedder = Some(embedder);
        self
    }

    /// Build the [`Engine`].
    ///
    /// Creates the lexical store, vector store, and document log (WAL),
    /// then runs WAL recovery to replay any uncommitted changes from a
    /// previous session.
    ///
    /// # Errors
    ///
    /// Returns an error if storage initialization, index creation, WAL
    /// opening, or recovery replay fails.
    pub async fn build(self) -> Result<Engine> {
        let (lexical_config, vector_config) =
            Engine::split_schema(&self.schema, self.analyzer, self.embedder).await?;

        let lexical_storage = Arc::new(PrefixedStorage::new("lexical", self.storage.clone()));
        let vector_storage = Arc::new(PrefixedStorage::new("vector", self.storage.clone()));
        let document_storage: Arc<dyn Storage> =
            Arc::new(PrefixedStorage::new("documents", self.storage.clone()));

        let lexical = LexicalStore::new(lexical_storage, lexical_config)?;
        let vector = VectorStore::new(vector_storage, vector_config)?;

        let log = Arc::new(DocumentLog::new(
            self.storage,
            "engine.wal",
            document_storage,
        )?);

        let engine = Engine {
            schema: RwLock::new(self.schema),
            lexical,
            vector,
            log,
        };

        engine.recover().await?;

        Ok(engine)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::per_field::PerFieldEmbedder;
    use crate::embedding::precomputed::PrecomputedEmbedder;
    use crate::storage::memory::MemoryStorage;

    #[tokio::test]
    async fn test_accepts_per_field_analyzer() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));
        let schema = Schema::new();

        let per_field = PerFieldAnalyzer::new(Arc::new(StandardAnalyzer::default()));

        let result = Engine::builder(storage, schema)
            .analyzer(Arc::new(per_field))
            .build()
            .await;

        assert!(result.is_ok(), "Should accept PerFieldAnalyzer");
    }

    #[tokio::test]
    async fn test_accepts_per_field_embedder() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));
        let schema = Schema::new();

        let dummy_embedder = Arc::new(PrecomputedEmbedder::new());
        let per_field = PerFieldEmbedder::new(dummy_embedder);

        let result = Engine::builder(storage, schema)
            .embedder(Arc::new(per_field))
            .build()
            .await;

        assert!(result.is_ok(), "Should accept PerFieldEmbedder");
    }

    #[tokio::test]
    async fn test_accepts_simple_analyzer() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));
        let schema = Schema::new();

        let result = Engine::builder(storage, schema)
            .analyzer(Arc::new(StandardAnalyzer::default()))
            .build()
            .await;

        assert!(result.is_ok(), "Should accept StandardAnalyzer");
    }

    #[tokio::test]
    async fn test_accepts_simple_embedder() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));
        let schema = Schema::new();

        let dummy_embedder = Arc::new(PrecomputedEmbedder::new());

        let result = Engine::builder(storage, schema)
            .embedder(dummy_embedder)
            .build()
            .await;

        assert!(result.is_ok(), "Should accept simple embedder");
    }

    #[tokio::test]
    async fn test_schema_per_field_analyzer() {
        use crate::data::DataValue;
        use crate::engine::schema::FieldOption;
        use crate::lexical::core::field::TextOption;
        use crate::lexical::search::searcher::LexicalSearchQuery;

        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));

        // "category" uses keyword analyzer (no tokenization).
        // "body" uses default (standard) analyzer.
        let schema = Schema::builder()
            .add_field(
                "category",
                FieldOption::Text(TextOption::default().analyzer("keyword")),
            )
            .add_field("body", FieldOption::Text(TextOption::default()))
            .build();

        let engine = Engine::new(storage, schema).await.unwrap();

        let mut doc = crate::data::Document::new();
        doc.fields
            .insert("category".into(), DataValue::Text("Rust Lang".into()));
        doc.fields.insert(
            "body".into(),
            DataValue::Text("Rust is a systems programming language".into()),
        );
        engine.put_document("doc1", doc).await.unwrap();
        engine.commit().await.unwrap();

        // "Rust Lang" as keyword — exact match required.
        let request = crate::engine::search::SearchRequestBuilder::new()
            .lexical_query(LexicalSearchQuery::from("category:\"Rust Lang\""))
            .limit(10)
            .build();
        let results = engine.search(request).await.unwrap();
        assert_eq!(
            results.len(),
            1,
            "Keyword analyzer should match exact phrase"
        );

        // Partial token "Rust" should NOT match keyword-analyzed category.
        let request = crate::engine::search::SearchRequestBuilder::new()
            .lexical_query(LexicalSearchQuery::from("category:Rust"))
            .limit(10)
            .build();
        let results = engine.search(request).await.unwrap();
        assert!(
            results.is_empty(),
            "Keyword analyzer should not match partial tokens"
        );

        // Standard-analyzed "body" field should match single token "rust".
        let request = crate::engine::search::SearchRequestBuilder::new()
            .lexical_query(LexicalSearchQuery::from("body:rust"))
            .limit(10)
            .build();
        let results = engine.search(request).await.unwrap();
        assert_eq!(
            results.len(),
            1,
            "Standard analyzer should tokenize and match"
        );
    }

    #[tokio::test]
    async fn test_custom_analyzer_definition_in_schema() {
        use crate::data::DataValue;
        use crate::engine::schema::FieldOption;
        use crate::engine::schema::analyzer::{
            AnalyzerDefinition, CharFilterConfig, TokenFilterConfig, TokenizerConfig,
        };
        use crate::lexical::core::field::TextOption;
        use crate::lexical::search::searcher::LexicalSearchQuery;

        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));

        // Define a custom analyzer: whitespace + NFKC normalization + lowercase.
        let schema = Schema::builder()
            .add_analyzer(
                "my_custom",
                AnalyzerDefinition {
                    char_filters: vec![CharFilterConfig::UnicodeNormalization {
                        form: "nfkc".into(),
                    }],
                    tokenizer: TokenizerConfig::Whitespace,
                    token_filters: vec![TokenFilterConfig::Lowercase],
                },
            )
            .add_field(
                "content",
                FieldOption::Text(TextOption::default().analyzer("my_custom")),
            )
            .build();

        let engine = Engine::new(storage, schema).await.unwrap();

        let mut doc = crate::data::Document::new();
        // Fullwidth "HELLO" should be normalized to "HELLO", then lowercased.
        doc.fields.insert(
            "content".into(),
            DataValue::Text("\u{ff28}\u{ff25}\u{ff2c}\u{ff2c}\u{ff2f} world".into()),
        );
        engine.put_document("doc1", doc).await.unwrap();
        engine.commit().await.unwrap();

        // Search for "hello" should match (NFKC + lowercase).
        let request = crate::engine::search::SearchRequestBuilder::new()
            .lexical_query(LexicalSearchQuery::from("content:hello"))
            .limit(10)
            .build();
        let results = engine.search(request).await.unwrap();
        assert_eq!(
            results.len(),
            1,
            "Custom analyzer (NFKC + lowercase) should match normalized text"
        );
    }

    #[tokio::test]
    async fn test_add_lexical_field() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));

        // Start with a schema containing only "title".
        let schema = Schema::builder()
            .add_field(
                "title",
                schema::FieldOption::Text(crate::lexical::core::field::TextOption::default()),
            )
            .build();

        let engine = Engine::new(storage, schema).await.unwrap();

        // Dynamically add a "category" field.
        let updated = engine
            .add_field(
                "category",
                schema::FieldOption::Text(crate::lexical::core::field::TextOption::default()),
            )
            .await
            .unwrap();

        assert!(updated.fields.contains_key("category"));
        assert!(updated.fields.contains_key("title"));

        // Index a document that uses the new field.
        engine
            .add_document(
                "doc1",
                Document::builder()
                    .add_text("title", "Rust Programming")
                    .add_text("category", "programming")
                    .build(),
            )
            .await
            .unwrap();
        engine.commit().await.unwrap();

        // Search on the dynamically added field.
        use crate::lexical::search::searcher::LexicalSearchQuery;
        let request = crate::engine::search::SearchRequestBuilder::new()
            .lexical_query(LexicalSearchQuery::from("category:programming"))
            .limit(10)
            .build();
        let results = engine.search(request).await.unwrap();
        assert_eq!(
            results.len(),
            1,
            "Should find doc via dynamically added field"
        );
    }

    #[tokio::test]
    async fn test_add_field_duplicate_rejected() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));

        let schema = Schema::builder()
            .add_field(
                "title",
                schema::FieldOption::Text(crate::lexical::core::field::TextOption::default()),
            )
            .build();

        let engine = Engine::new(storage, schema).await.unwrap();

        // Adding a field with the same name should fail.
        let result = engine
            .add_field(
                "title",
                schema::FieldOption::Text(crate::lexical::core::field::TextOption::default()),
            )
            .await;
        assert!(result.is_err(), "Duplicate field should be rejected");
    }

    #[tokio::test]
    async fn test_add_vector_field() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));

        let schema = Schema::builder()
            .add_field(
                "title",
                schema::FieldOption::Text(crate::lexical::core::field::TextOption::default()),
            )
            .build();

        let dummy_embedder = Arc::new(PrecomputedEmbedder::new());
        let per_field = PerFieldEmbedder::new(dummy_embedder);

        let engine = Engine::builder(storage, schema)
            .embedder(Arc::new(per_field))
            .build()
            .await
            .unwrap();

        // Dynamically add a vector field with dimension 128 (matching PrecomputedEmbedder default).
        let updated = engine
            .add_field(
                "embedding",
                schema::FieldOption::Flat(
                    crate::vector::core::field::FlatOption::default().dimension(128),
                ),
            )
            .await
            .unwrap();

        assert!(updated.fields.contains_key("embedding"));

        // Index a document with the vector field.
        let vec_data: Vec<f32> = (0..128).map(|i| i as f32 / 128.0).collect();
        engine
            .add_document(
                "doc1",
                Document::builder()
                    .add_text("title", "Hello")
                    .add_vector("embedding", vec_data)
                    .build(),
            )
            .await
            .unwrap();
        engine.commit().await.unwrap();

        // Verify document was indexed successfully.
        let docs = engine.get_documents("doc1").await.unwrap();
        assert_eq!(docs.len(), 1);
    }

    #[tokio::test]
    async fn test_schema_returns_current_state() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));
        let schema = Schema::new();

        let engine = Engine::new(storage, schema).await.unwrap();

        // Initially empty (no user fields).
        assert!(engine.schema().fields.is_empty());

        // Add a field.
        engine
            .add_field(
                "body",
                schema::FieldOption::Text(crate::lexical::core::field::TextOption::default()),
            )
            .await
            .unwrap();

        // schema() should reflect the addition.
        let current = engine.schema();
        assert!(current.fields.contains_key("body"));
    }

    #[tokio::test]
    async fn test_delete_lexical_field() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));

        let schema = Schema::builder()
            .add_field(
                "title",
                schema::FieldOption::Text(crate::lexical::core::field::TextOption::default()),
            )
            .build();

        let engine = Engine::new(storage, schema).await.unwrap();

        // Dynamically add a "category" field, then delete it.
        engine
            .add_field(
                "category",
                schema::FieldOption::Text(crate::lexical::core::field::TextOption::default()),
            )
            .await
            .unwrap();
        assert!(engine.schema().fields.contains_key("category"));

        let updated = engine.delete_field("category").await.unwrap();
        assert!(!updated.fields.contains_key("category"));
        assert!(updated.fields.contains_key("title"));
    }

    #[tokio::test]
    async fn test_delete_field_removes_from_default_fields() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));

        let schema = Schema::builder()
            .add_field(
                "title",
                schema::FieldOption::Text(crate::lexical::core::field::TextOption::default()),
            )
            .add_default_field("title")
            .build();

        let engine = Engine::new(storage, schema).await.unwrap();

        let updated = engine.delete_field("title").await.unwrap();
        assert!(!updated.fields.contains_key("title"));
        assert!(!updated.default_fields.contains(&"title".to_string()));
    }

    #[tokio::test]
    async fn test_delete_field_nonexistent_rejected() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));
        let schema = Schema::new();

        let engine = Engine::new(storage, schema).await.unwrap();

        let result = engine.delete_field("nonexistent").await;
        assert!(result.is_err(), "Deleting a nonexistent field should fail");
    }

    #[tokio::test]
    async fn test_delete_vector_field() {
        let storage: Arc<dyn Storage> = Arc::new(MemoryStorage::new(Default::default()));

        let schema = Schema::builder()
            .add_field(
                "title",
                schema::FieldOption::Text(crate::lexical::core::field::TextOption::default()),
            )
            .build();

        let dummy_embedder = Arc::new(PrecomputedEmbedder::new());
        let per_field = PerFieldEmbedder::new(dummy_embedder);

        let engine = Engine::builder(storage, schema)
            .embedder(Arc::new(per_field))
            .build()
            .await
            .unwrap();

        // Add then delete a vector field.
        engine
            .add_field(
                "embedding",
                schema::FieldOption::Hnsw(crate::vector::core::field::HnswOption {
                    dimension: 4,
                    ..Default::default()
                }),
            )
            .await
            .unwrap();
        assert!(engine.schema().fields.contains_key("embedding"));

        let updated = engine.delete_field("embedding").await.unwrap();
        assert!(!updated.fields.contains_key("embedding"));
    }
}