lucisearch 0.8.1

//! IndexWriter: buffer documents, analyze text, flush segments, commit.
//!
//! The public write API for Luci. Documents are validated against the
//! schema, text fields are analyzed, and results are accumulated in an
//! in-memory segment buffer. On commit (or memory budget exceeded), the
//! buffer is flushed to a segment and written to storage.
//!
//! See [[architecture-indexing-pipeline]] and [[architecture-overview#Step 9]].

use crate::analysis::{AnalyzerRegistry, Token};
use crate::core::{FieldId, LuciError, Result, SegmentId};
use crate::mapping::{DynamicMode, FieldType, Mapping};
use crate::storage::Storage;

use crate::columnar::writer::ColumnValue;
use crate::segment::builder::SegmentBuilder;
use crate::spatial::geo::GeoPoint;
use crate::vector::global::GlobalHnsw;
use crate::vector::hnsw::BuildThreads;

/// Buffered-source size, in bytes, at which `add` flushes on its own
/// unless overridden via `set_memory_budget` (64 MiB).
const DEFAULT_MEMORY_BUDGET: usize = 64 << 20;

/// Writes documents to a Luci index.
///
/// Usage:
/// 1. Create with `IndexWriter::new(storage, schema, analyzers)`
/// 2. Call `add(doc)` for each document
/// 3. Call `commit()` to make documents searchable
pub struct IndexWriter {
    /// Backing store that segments, vector indices and user metadata are
    /// written to.
    storage: Box<dyn Storage>,
    /// Field mappings used to decide how each document field is indexed.
    schema: Mapping,
    /// Named analyzers used to tokenize text fields.
    analyzers: AnalyzerRegistry,
    /// In-memory segment that buffers documents until the next flush.
    builder: SegmentBuilder,
    /// Next segment id to hand out (used by flush and by merges).
    next_segment_id: u64,
    /// Buffer size, in bytes, at which `add` flushes automatically.
    memory_budget: usize,
    /// Approximate bytes consumed by the current buffer.
    buffer_size: usize,
    /// Merge policy configuration.
    merge_policy: crate::merge_policy::MergePolicy,
    /// Pending deletions (applied during search, cleaned up on merge).
    pending_deletions: crate::deletion::DeletionMap,
    /// Analysis settings JSON to persist alongside the mapping.
    /// See [[feature-analysis-pipeline]].
    analysis_json: Option<serde_json::Value>,
    /// Single global HNSW per dense_vector field, decoupled from the
    /// segment model per [[global-vector-indices]] Alternative B.
    /// Vectors flow only here — segments no longer carry their own
    /// per-field HNSWs. Persisted via the storage layer's per-field
    /// `write_vector_index` API on commit.
    global_hnsw: GlobalHnsw,
    /// Thread budget for the parallel HNSW connect phase run at commit.
    /// `Ambient` (the production default) uses rayon's global pool;
    /// deterministic test/profile harnesses force `Fixed(1)` for a
    /// bit-identical graph. See [[optimization-concurrent-hnsw-insert]].
    build_threads: BuildThreads,
}

impl IndexWriter {
    /// Build a writer over `storage` using the given schema and analyzers.
    ///
    /// Segment numbering starts one past the storage's current
    /// generation. The writer begins with an empty segment buffer, an
    /// empty deletion map, a fresh global HNSW derived from the schema,
    /// the default memory budget and merge policy, and the `Ambient`
    /// build-thread setting.
    pub fn new(
        storage: impl Storage + 'static,
        schema: Mapping,
        analyzers: AnalyzerRegistry,
    ) -> Self {
        let first_segment = storage.generation() + 1;
        Self {
            // Both of these only borrow `schema`, so they are built
            // before it is moved into the struct below.
            builder: SegmentBuilder::new(SegmentId::new(first_segment), &schema),
            global_hnsw: GlobalHnsw::new(&schema),
            storage: Box::new(storage),
            schema,
            analyzers,
            next_segment_id: first_segment,
            memory_budget: DEFAULT_MEMORY_BUDGET,
            buffer_size: 0,
            merge_policy: crate::merge_policy::MergePolicy::default(),
            pending_deletions: crate::deletion::DeletionMap::new(),
            analysis_json: None,
            build_threads: BuildThreads::Ambient,
        }
    }

    /// Override the thread budget for the commit-time HNSW connect phase.
    /// Production leaves the `Ambient` default (rayon global pool);
    /// deterministic Rust tests/profile harnesses set `Fixed(1)` to get a
    /// bit-identical graph independent of the ambient pool size. Not
    /// exposed to the Python API. See [[optimization-concurrent-hnsw-insert]].
    ///
    /// The stored value is read by `commit`, which passes it to
    /// `connect_pending` for every non-empty vector field.
    pub fn set_build_threads(&mut self, threads: BuildThreads) {
        self.build_threads = threads;
    }

    /// Set the analysis settings JSON to persist alongside the mapping.
    ///
    /// When this is `Some`, `commit` stores the value under
    /// `settings.analysis` in the persisted mapping JSON. When it is
    /// `None`, `commit` writes the mapping JSON as the schema produces it.
    pub fn set_analysis_json(&mut self, json: Option<serde_json::Value>) {
        self.analysis_json = json;
    }

    /// Load persisted deletions (called on Index::open).
    ///
    /// Replaces the writer's current deletion map wholesale; anything
    /// recorded earlier through `mark_deleted` is dropped.
    pub fn load_deletions(&mut self, deletions: crate::deletion::DeletionMap) {
        self.pending_deletions = deletions;
    }

    /// Replace the in-memory global HNSW with one deserialized from
    /// persisted bytes (called on `Index::open`).
    ///
    /// The previous instance is dropped, including any vectors `add` had
    /// already stored in it.
    pub fn load_global_hnsw(&mut self, global_hnsw: GlobalHnsw) {
        self.global_hnsw = global_hnsw;
    }

    /// Access the global HNSW for serialization on commit and for
    /// taking a search-side snapshot via `to_bytes`.
    ///
    /// Returns a shared borrow of the writer's own instance; no copy is made.
    pub fn global_hnsw(&self) -> &GlobalHnsw {
        &self.global_hnsw
    }

    /// Mark a document as deleted. Takes effect on next search (no commit needed).
    ///
    /// The deletion is recorded in the in-memory deletion map, which the
    /// next `commit` serializes into the user metadata.
    pub fn mark_deleted(&mut self, segment_id: SegmentId, doc_id: crate::core::DocId) {
        self.pending_deletions.mark_deleted(segment_id, doc_id);
    }

    /// Get the current deletion map (for search-time filtering).
    ///
    /// Returns a shared borrow of the writer's in-memory map, which
    /// includes deletions that have not been committed yet.
    pub fn deletions(&self) -> &crate::deletion::DeletionMap {
        &self.pending_deletions
    }

    /// Set the memory budget for auto-flush.
    ///
    /// `budget` is in bytes. `add` flushes the buffer once the serialized
    /// source bytes accumulated since the last flush reach this value.
    pub fn set_memory_budget(&mut self, budget: usize) {
        self.memory_budget = budget;
    }

    /// Set the timeout for acquiring the cross-process write lock.
    ///
    /// Default: 5 seconds. If another process holds the write lock,
    /// retries until the timeout, then returns `WriterLocked`.
    ///
    /// The value is forwarded unchanged to the storage layer, which owns
    /// the lock handling.
    pub fn set_write_timeout(&mut self, timeout: std::time::Duration) {
        self.storage.set_write_timeout(timeout);
    }

    /// Add a JSON document to the index.
    ///
    /// The document must be a JSON object. Fields known to the schema are
    /// tokenized and/or converted to column values according to their
    /// mapping; fields the schema does not know are kept in the stored
    /// source only. The result is buffered in the current segment builder.
    /// Call `commit()` to make it searchable.
    ///
    /// If the accumulated source bytes reach the memory budget, the buffer
    /// is flushed to a segment before returning.
    ///
    /// # Errors
    ///
    /// Returns `LuciError::InvalidQuery` when `doc` is not a JSON object or
    /// cannot be serialized. Also propagates errors from
    /// `ColumnValue::keyword`, from `GlobalHnsw::store_vector`, and from
    /// the auto-flush.
    ///
    /// # Panics
    ///
    /// Panics if a `copy_to` target named in the mapping is absent from
    /// the schema.
    pub fn add(&mut self, doc: serde_json::Value) -> Result<()> {
        let obj = doc
            .as_object()
            .ok_or_else(|| LuciError::InvalidQuery("document must be a JSON object".into()))?;

        // The serialized document is stored verbatim as the source and its
        // length is what the memory budget is measured against.
        let source = serde_json::to_vec(&doc)
            .map_err(|e| LuciError::InvalidQuery(format!("JSON serialization failed: {e}")))?;

        let mut analyzed_fields: Vec<(FieldId, Vec<Token>)> = Vec::new();
        let mut column_values: Vec<(FieldId, ColumnValue)> = Vec::new();

        // Inject _id: use user-provided or auto-generate via UUID v4.
        // Random v4 (not v7) — our inverted index is FST-based, not a B-tree,
        // so we don't need time-sortable IDs for insert locality. v4 avoids
        // leaking the index time via the ID.
        // Only a string `_id` counts as user-provided; any other JSON type
        // falls through to the generated UUID.
        let doc_id_str = match obj.get("_id").and_then(|v| v.as_str()) {
            Some(id) => id.to_string(),
            None => uuid::Uuid::new_v4().to_string(),
        };
        if let Some(id_field_id) = self.schema.field_id("_id") {
            analyzed_fields.push((id_field_id, vec![Token::new(doc_id_str.clone(), 0, 0, 0)]));
            column_values.push((id_field_id, ColumnValue::keyword(doc_id_str)?));
        }
        let mut vector_fields: Vec<(FieldId, Vec<f32>)> = Vec::new();
        let mut geo_points: Vec<(FieldId, GeoPoint)> = Vec::new();
        let mut geo_shapes: Vec<(FieldId, ::geo::Geometry<f64>)> = Vec::new();
        let mut copy_to_pairs: Vec<(String, String)> = Vec::new(); // (target_field, source_text)

        for (field_name, value) in obj {
            // _id is handled above (explicit injection + columnar push).
            // Without this skip, the field loop would push _id to the
            // columnar store a second time, doubling the per-doc ordinal
            // stride and breaking `SearchHit.id` column reads.
            if field_name == "_id" {
                continue;
            }
            // Look up or dynamically map the field
            let field_id = match self.schema.field_id(field_name) {
                Some(id) => id,
                None => match self.schema.dynamic_mode() {
                    DynamicMode::False => continue, // store in _source but don't index
                    DynamicMode::True => {
                        // TODO: infer field type and add to mapping dynamically.
                        // For now, skip unknown fields (stored in _source only).
                        continue;
                    }
                },
            };

            let mapping = self.schema.field(field_id);

            // Build tokens for the inverted index (skipped if index: false)
            let tokens = match &mapping.field_type {
                FieldType::Text => {
                    // Non-string values are analyzed as the empty string.
                    let text = value.as_str().unwrap_or_default();
                    let analyzer_name = mapping.analyzer.as_deref().unwrap_or("standard");
                    let analyzer = self.analyzers.get(analyzer_name);
                    analyzer.analyze(text)
                }
                FieldType::Keyword => {
                    // Strings are indexed as-is; any other JSON value is
                    // indexed as its JSON text (e.g. `42`, `true`, `null`).
                    let text = match value {
                        serde_json::Value::String(s) => s.clone(),
                        other => other.to_string(),
                    };
                    vec![Token::new(text.clone(), 0, 0, 0)]
                }
                FieldType::Ip => {
                    // Index normalized IP string as keyword token
                    let text = value.as_str().unwrap_or_default();
                    let normalized = crate::ip::normalize_ip(text);
                    if normalized.is_empty() {
                        Vec::new()
                    } else {
                        vec![Token::new(normalized, 0, 0, 0)]
                    }
                }
                _ => {
                    // Numeric/boolean/date fields: only columnar, no inverted index
                    Vec::new()
                }
            };

            if !tokens.is_empty() && mapping.indexed {
                analyzed_fields.push((field_id, tokens));
            }

            // Store geo shape for geo_shape fields
            // (values that fail to parse as GeoJSON are skipped).
            if matches!(mapping.field_type, FieldType::GeoShape) {
                if let Some(geom) = crate::spatial::shape::parse_geojson(value) {
                    geo_shapes.push((field_id, geom));
                }
            }

            // Store geo point for geo_point fields
            // (values `GeoPoint::from_json` rejects are skipped).
            if matches!(mapping.field_type, FieldType::GeoPoint) {
                if let Some(point) = GeoPoint::from_json(value) {
                    geo_points.push((field_id, point));
                }
            }

            // Store vector for dense_vector fields
            if mapping.field_type.is_dense_vector() {
                if let serde_json::Value::Array(arr) = value {
                    // Non-numeric array elements are dropped, so the
                    // collected vector can be shorter than the input array.
                    let vec: Vec<f32> = arr
                        .iter()
                        .filter_map(|v| v.as_f64().map(|f| f as f32))
                        .collect();
                    if !vec.is_empty() {
                        vector_fields.push((field_id, vec));
                    }
                }
            }

            // Store column value for doc_values fields
            if mapping.doc_values {
                let col_val = match &mapping.field_type {
                    FieldType::Keyword => match value {
                        serde_json::Value::String(s) => ColumnValue::keyword(s.clone())?,
                        serde_json::Value::Null => ColumnValue::Null,
                        other => ColumnValue::keyword(other.to_string())?,
                    },
                    FieldType::Integer | FieldType::Long => match value {
                        // A number that does not fit `i64` is stored as 0.
                        serde_json::Value::Number(n) => ColumnValue::I64(n.as_i64().unwrap_or(0)),
                        _ => ColumnValue::Null,
                    },
                    FieldType::Float | FieldType::Double => match value {
                        serde_json::Value::Number(n) => ColumnValue::F64(n.as_f64().unwrap_or(0.0)),
                        _ => ColumnValue::Null,
                    },
                    FieldType::Boolean => match value {
                        serde_json::Value::Bool(b) => ColumnValue::Bool(*b),
                        _ => ColumnValue::Null,
                    },
                    FieldType::TokenCount => {
                        // The stored value is the number of tokens the
                        // field's analyzer produces for the text.
                        let text = value.as_str().unwrap_or_default();
                        let analyzer_name = mapping.analyzer.as_deref().unwrap_or("standard");
                        let analyzer = self.analyzers.get(analyzer_name);
                        ColumnValue::I64(analyzer.analyze(text).len() as i64)
                    }
                    FieldType::Ip => {
                        let text = value.as_str().unwrap_or_default();
                        match crate::ip::ip_to_i64(text) {
                            Some(v) => ColumnValue::I64(v),
                            None => ColumnValue::Null,
                        }
                    }
                    _ => ColumnValue::Null, // Text fields don't get doc_values
                };
                column_values.push((field_id, col_val));
            }

            // Collect copy_to pairs for post-loop processing
            if !mapping.copy_to.is_empty() {
                let source_text = match value {
                    serde_json::Value::String(s) => s.clone(),
                    other => other.to_string(),
                };
                for target in &mapping.copy_to {
                    copy_to_pairs.push((target.clone(), source_text.clone()));
                }
            }
        }

        // Process copy_to: analyze source text with target field's analyzer.
        // See [[feature-mapping-copy-to]]. Targets are validated at index
        // creation time via `Mapping::validate`, so an unknown target here
        // is an upstream invariant violation, not user input — panic loudly
        // rather than silently dropping the copy. See [[code-must-not-lie]].
        for (target_name, source_text) in &copy_to_pairs {
            let target_id = self.schema.field_id(target_name).unwrap_or_else(|| {
                panic!(
                    "copy_to target \"{target_name}\" missing from schema; \
                     Mapping::validate should have rejected this at index \
                     creation. This is an internal wiring bug, not user input."
                );
            });
            let target_mapping = self.schema.field(target_id);
            if !target_mapping.indexed {
                continue;
            }

            // Only text and keyword targets receive copied tokens; other
            // target types are skipped.
            let tokens = match &target_mapping.field_type {
                FieldType::Text => {
                    let analyzer_name = target_mapping.analyzer.as_deref().unwrap_or("standard");
                    let analyzer = self.analyzers.get(analyzer_name);
                    analyzer.analyze(source_text)
                }
                FieldType::Keyword => {
                    vec![Token::new(source_text.clone(), 0, 0, 0)]
                }
                _ => continue,
            };
            if !tokens.is_empty() {
                analyzed_fields.push((target_id, tokens));
            }
        }

        // Process multi-field sub-fields: route parent's source value
        // through the sub-field's analysis chain.
        // See [[feature-mapping-multi-fields]].
        for mapping in self.schema.fields() {
            if let Some(ref parent_name) = mapping.parent_field {
                // Get the parent's value from the document
                let parent_value = match obj.get(parent_name) {
                    Some(v) => v,
                    None => continue,
                };
                let field_id = match self.schema.field_id(&mapping.name) {
                    Some(id) => id,
                    None => continue,
                };

                if mapping.indexed {
                    // Note: the `continue` below skips the doc_values step
                    // as well for sub-field types other than text/keyword.
                    let tokens = match &mapping.field_type {
                        FieldType::Text => {
                            let text = parent_value.as_str().unwrap_or_default();
                            let analyzer_name = mapping.analyzer.as_deref().unwrap_or("standard");
                            let analyzer = self.analyzers.get(analyzer_name);
                            analyzer.analyze(text)
                        }
                        FieldType::Keyword => {
                            let text = match parent_value {
                                serde_json::Value::String(s) => s.clone(),
                                other => other.to_string(),
                            };
                            vec![Token::new(text, 0, 0, 0)]
                        }
                        _ => continue,
                    };
                    // Unlike the main field loop, an empty token list is
                    // still pushed here.
                    analyzed_fields.push((field_id, tokens));
                }

                if mapping.doc_values {
                    let col_val = match &mapping.field_type {
                        FieldType::Keyword => {
                            let text = match parent_value {
                                serde_json::Value::String(s) => s.clone(),
                                other => other.to_string(),
                            };
                            ColumnValue::keyword(text)?
                        }
                        _ => continue,
                    };
                    column_values.push((field_id, col_val));
                }
            }
        }

        // Budget accounting counts only the serialized source bytes.
        self.buffer_size += source.len();

        // Mark this as a parent doc if we have nested fields
        // (decided by the schema, not by whether this document actually
        // carries nested values).
        let has_nested = self
            .schema
            .fields()
            .iter()
            .any(|f| matches!(f.field_type, FieldType::Nested));

        // Snapshot the segment+doc id this document will land at
        // *before* `add_document` increments `doc_count`. The global
        // HNSW resolver uses this pair to route kNN hits back to the
        // owning segment.
        let segment_id = self.builder.segment_id();
        let local_doc_id = self.builder.doc_count();

        self.builder.add_document(&analyzed_fields, &source);

        if has_nested {
            self.builder.mark_parent();
        }

        // Add column values for doc_values fields
        for (field_id, col_val) in column_values {
            self.builder.add_column_value(field_id, col_val);
        }

        // Vectors flow only to the index-wide global HNSW per
        // [[global-vector-indices]] Alternative B. `store_vector` defers
        // graph linkage to the commit-time `connect_pending` (parallel),
        // but still normalizes and rejects zero/non-finite cosine inputs
        // up front — propagate so the bulk caller aborts the batch instead
        // of silently embedding a degenerate vector. See
        // [[optimize-cosine-norm-precompute]] §"Zero-vector policy" and
        // [[optimization-concurrent-hnsw-insert]] §Write model.
        // Note: when this returns an error the document has already been
        // added to the builder above.
        for (field_id, vec) in vector_fields {
            self.global_hnsw
                .store_vector(field_id, segment_id, local_doc_id, vec)?;
        }

        // Add geo points
        for (field_id, point) in geo_points {
            self.builder.add_geo_point(field_id, point);
        }

        // Add geo shapes
        for (field_id, geom) in &geo_shapes {
            self.builder.add_geo_shape(*field_id, geom);
        }

        // Index nested objects as hidden documents
        // (added after the parent document, one per array element that is
        // a JSON object; non-object elements are ignored).
        for mapping in self.schema.fields() {
            if !matches!(mapping.field_type, FieldType::Nested) {
                continue;
            }
            let field_name = &mapping.name;
            if let Some(serde_json::Value::Array(nested_arr)) = obj.get(field_name) {
                for nested_obj in nested_arr {
                    if let Some(nested_map) = nested_obj.as_object() {
                        // Index each nested object as a hidden document
                        let mut nested_fields: Vec<(FieldId, Vec<Token>)> = Vec::new();
                        for (nested_key, nested_val) in nested_map {
                            // Path-prefixed field name: "offers.seller"
                            let prefixed = format!("{field_name}.{nested_key}");
                            if let Some(fid) = self.schema.field_id(&prefixed) {
                                let m = self.schema.field(fid);
                                let tokens = match &m.field_type {
                                    FieldType::Text => {
                                        let text = nested_val.as_str().unwrap_or_default();
                                        let analyzer = self
                                            .analyzers
                                            .get(m.analyzer.as_deref().unwrap_or("standard"));
                                        analyzer.analyze(text)
                                    }
                                    FieldType::Keyword => {
                                        let text = match nested_val {
                                            serde_json::Value::String(s) => s.clone(),
                                            other => other.to_string(),
                                        };
                                        vec![Token::new(text, 0, 0, 0)]
                                    }
                                    _ => continue,
                                };
                                if !tokens.is_empty() {
                                    nested_fields.push((fid, tokens));
                                }
                            }
                        }
                        // Add as hidden doc (empty source)
                        self.builder.add_document(&nested_fields, b"{}");
                        self.builder.mark_nested();
                    }
                }
            }
        }

        // Auto-flush if memory budget exceeded
        if self.buffer_size >= self.memory_budget {
            self.flush()?;
        }

        Ok(())
    }

    /// Write the buffered documents out as one segment, without committing.
    ///
    /// Does nothing when the buffer holds no documents. Otherwise the
    /// buffer is written under the next free segment id, a fresh builder
    /// takes its place, and the byte counter is reset once the write
    /// succeeds.
    ///
    /// # Errors
    ///
    /// Propagates the storage error if the segment cannot be written. The
    /// builder has already been swapped out by then.
    fn flush(&mut self) -> Result<()> {
        if !self.builder.is_empty() {
            let flushed_id = SegmentId::new(self.next_segment_id);
            self.next_segment_id += 1;

            // Swap in an empty builder that already carries the id the
            // following flush will write under.
            let fresh = SegmentBuilder::new(SegmentId::new(self.next_segment_id), &self.schema);
            let full = std::mem::replace(&mut self.builder, fresh);

            self.storage.write_segment(flushed_id, &full.build())?;
            self.buffer_size = 0;
        }
        Ok(())
    }

    /// Flush the buffer and commit all pending segments to storage.
    ///
    /// Persists the current field mappings alongside segment metadata.
    /// After commit returns, all documents added via `add()` are searchable.
    /// If the merge policy triggers, a synchronous merge is executed before
    /// returning.
    pub fn commit(&mut self) -> Result<()> {
        self.flush()?;
        // Persist mapping (and analysis settings) as user metadata
        let mut mapping_json = self.schema.to_json();
        if let Some(ref analysis) = self.analysis_json {
            if let Some(obj) = mapping_json.as_object_mut() {
                let mut settings = serde_json::Map::new();
                settings.insert("analysis".to_string(), analysis.clone());
                obj.insert("settings".to_string(), serde_json::Value::Object(settings));
            }
        }
        let mapping_bytes = serde_json::to_vec(&mapping_json).map_err(|e| {
            LuciError::Io(std::io::Error::new(
                std::io::ErrorKind::Other,
                format!("failed to serialize mapping: {e}"),
            ))
        })?;

        // user_metadata stays small: mapping JSON + deletion bitmap,
        // length-prefixed. The global HNSW does NOT live here — it's
        // stored as a per-field vector index via the storage layer's
        // `write_vector_index` API. See [[global-vector-indices]].
        let deletion_bytes = self.pending_deletions.to_bytes();
        let mut metadata = Vec::with_capacity(4 + mapping_bytes.len() + 4 + deletion_bytes.len());
        metadata.extend_from_slice(&(mapping_bytes.len() as u32).to_le_bytes());
        metadata.extend_from_slice(&mapping_bytes);
        metadata.extend_from_slice(&(deletion_bytes.len() as u32).to_le_bytes());
        metadata.extend_from_slice(&deletion_bytes);
        self.storage.set_user_metadata(metadata);

        // Link every pending vector tail BEFORE persisting. This is the
        // load-bearing ordering invariant: `connect_pending` runs here,
        // ahead of both `field_to_bytes` below and the `maybe_merge`
        // re-persist further down, so no write-side persist can observe an
        // unlinked tail. A disconnected graph (every neighbor list empty)
        // would silently drop every kNN hit past the entry point. See
        // [[optimization-concurrent-hnsw-insert]] §Write model.
        for field_id in self.global_hnsw.non_empty_field_ids() {
            self.global_hnsw
                .connect_pending(field_id, self.build_threads);
        }

        // Persist each non-empty vector index as its own extent.
        // `write_vector_index` replaces the prior committed bytes for
        // the same FieldId, freeing the old extent during commit.
        for field_id in self.global_hnsw.non_empty_field_ids() {
            if let Some(bytes) = self.global_hnsw.field_to_bytes(field_id) {
                self.storage.write_vector_index(field_id, &bytes)?;
            }
        }

        self.storage.commit()?;
        self.maybe_merge()?;

        Ok(())
    }

    /// Ask the merge policy whether the current segments should be merged
    /// and, if it names a candidate, run that merge right away.
    ///
    /// The policy only sees each segment's id and byte size; document and
    /// deletion counts are reported as zero for now.
    ///
    /// # Errors
    ///
    /// Propagates any error from `execute_merge`.
    fn maybe_merge(&mut self) -> Result<()> {
        use crate::merge_policy::{SegmentInfo, find_merge};

        let mut infos: Vec<SegmentInfo> = Vec::new();
        for entry in self.storage.segments().iter() {
            infos.push(SegmentInfo {
                segment_id: entry.segment_id,
                size_bytes: entry.data_len,
                doc_count: 0, // TODO: store doc_count in SegmentEntry
                deletion_count: 0,
            });
        }

        let candidate = find_merge(&self.merge_policy, &infos);
        match candidate {
            Some(chosen) => self.execute_merge(&chosen.segment_ids),
            None => Ok(()),
        }
    }

    /// Execute a merge: read source segments, merge, write result, replace.
    fn execute_merge(&mut self, source_ids: &[SegmentId]) -> Result<()> {
        use crate::deletion::DeletionMap;
        use crate::segment::reader::SegmentReader;

        // Open source segment readers
        let mut readers = Vec::new();
        let mut segment_data = Vec::new();
        for &seg_id in source_ids {
            let data = self.storage.read_segment(seg_id)?;
            segment_data.push(data);
        }
        for data in &segment_data {
            readers.push(SegmentReader::open(data.clone())?);
        }
        let reader_refs: Vec<&SegmentReader> = readers.iter().collect();

        // Merge into a new segment
        let new_id = SegmentId::new(self.next_segment_id);
        self.next_segment_id += 1;

        let deletions = DeletionMap::new();
        let merge_output = crate::merge::merge_segments(
            new_id,
            &reader_refs,
            &deletions,
            &self.schema,
            &self.analyzers,
        )?;

        // Write the merged segment
        self.storage.write_segment(new_id, &merge_output.bytes)?;

        // Remove source segments
        self.storage.remove_segments(source_ids);

        // Rewrite the global HNSW resolver so existing vector ordinals
        // point at the merged segment with the new local doc ids.
        // Without this step the resolver would carry dangling entries
        // for the merged-away segment ids.
        self.global_hnsw
            .rewrite_after_merge(&merge_output.ord_remap);

        // Re-persist every vector index with the rewritten resolver.
        // `commit()` persisted the vector index *before* calling
        // `maybe_merge`, so the on-disk copy still routes the merged
        // docs to the now-removed source segments. The reader loads the
        // persisted copy (not the writer's in-memory resolver), so
        // without this re-persist it carries dangling entries and
        // silently drops every kNN hit that resolves to a merged-away
        // segment — a [[code-must-not-lie]] silent-drop that cost ~0.09
        // recall at 600k once the segment count crossed the merge
        // threshold. See [[vector-recall-investigation-audit]] H6.
        for field_id in self.global_hnsw.non_empty_field_ids() {
            if let Some(bytes) = self.global_hnsw.field_to_bytes(field_id) {
                self.storage.write_vector_index(field_id, &bytes)?;
            }
        }

        // Commit the replacement
        self.storage.commit()?;

        Ok(())
    }

    /// Merge segments until at most `max_segments` remain.
    ///
    /// Each round merges the first `max_merge_at_once` segments reported
    /// by storage into one. The loop ends when the target is reached or
    /// when a round would merge fewer than two segments. This is costly;
    /// run it once bulk indexing is finished rather than during normal
    /// operation.
    ///
    /// # Errors
    ///
    /// Propagates the first error returned by a merge round.
    pub fn force_merge(&mut self, max_segments: usize) -> Result<()> {
        loop {
            let live = self.storage.segments();
            if live.len() <= max_segments {
                return Ok(());
            }

            // One batch per round, capped by the merge policy.
            let batch: Vec<SegmentId> = live
                .iter()
                .take(self.merge_policy.max_merge_at_once)
                .map(|entry| entry.segment_id)
                .collect();

            // A batch of one cannot reduce the segment count; stop
            // instead of looping forever.
            if batch.len() < 2 {
                return Ok(());
            }

            self.execute_merge(&batch)?;
        }
    }

    /// Number of documents in the current (unflushed) buffer.
    ///
    /// Reports the builder's own document count.
    /// NOTE(review): `add` also inserts one hidden document per nested
    /// object through the same builder, so this presumably counts those
    /// too — confirm against `SegmentBuilder::doc_count`.
    pub fn buffered_doc_count(&self) -> u32 {
        self.builder.doc_count()
    }

    /// Throw away every buffered document without writing anything.
    ///
    /// Used for transaction rollback. The builder is replaced by an empty
    /// one under the same pending segment id and the byte counter returns
    /// to zero.
    ///
    /// NOTE(review): vectors that `add` already stored in the global HNSW
    /// for the discarded documents are not touched here — confirm whether
    /// the rollback path clears them elsewhere.
    pub fn discard_buffer(&mut self) {
        self.buffer_size = 0;
        self.builder = SegmentBuilder::new(SegmentId::new(self.next_segment_id), &self.schema);
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::mapping::FieldType;
    use crate::storage::SingleFileDirectory;
    use std::path::PathBuf;

    /// Build a per-process, per-test scratch path under the system temp
    /// directory and clear anything an earlier run left there.
    ///
    /// The leftover may be a directory or a single file, so both are
    /// handled; a missing path is not an error.
    fn test_dir(name: &str) -> PathBuf {
        let dir =
            std::env::temp_dir().join(format!("luci_writer_test_{}_{name}", std::process::id()));
        // `remove_dir_all` fails on a regular file, so pick the removal
        // call that matches what is actually on disk.
        if dir.is_dir() {
            let _ = std::fs::remove_dir_all(&dir);
        } else {
            let _ = std::fs::remove_file(&dir);
        }
        dir
    }

    /// Best-effort removal of a test's scratch path.
    ///
    /// Handles both a directory tree and a single file; failures and
    /// missing paths are ignored.
    fn cleanup(path: &std::path::Path) {
        // `remove_dir_all` fails on a regular file, so pick the removal
        // call that matches what is actually on disk.
        if path.is_dir() {
            let _ = std::fs::remove_dir_all(path);
        } else {
            let _ = std::fs::remove_file(path);
        }
    }

    /// Create a fresh on-disk index for the named test and return its
    /// path together with a writer over a two-field schema: `title`
    /// (text) and `status` (keyword).
    fn basic_setup(name: &str) -> (PathBuf, IndexWriter) {
        let path = test_dir(name);
        let schema = Mapping::builder()
            .field("title", FieldType::Text)
            .field("status", FieldType::Keyword)
            .build();
        let writer = IndexWriter::new(
            SingleFileDirectory::create(&path).unwrap(),
            schema,
            AnalyzerRegistry::new(),
        );
        (path, writer)
    }

    /// Adding one document and committing yields exactly one segment.
    #[test]
    fn put_single_doc_and_commit() {
        let (path, mut writer) = basic_setup("single");
        let doc = serde_json::json!({
            "title": "hello world",
            "status": "active"
        });
        writer.add(doc).unwrap();
        writer.commit().unwrap();

        // Reopen the storage independently and check what was persisted.
        let reopened = SingleFileDirectory::open(&path).unwrap();
        assert_eq!(reopened.segments().len(), 1);

        cleanup(&path);
    }

    /// Ten documents added before a single commit land in one segment.
    #[test]
    fn put_multiple_docs_and_commit() {
        let (path, mut writer) = basic_setup("multi");
        (0..10).for_each(|i| {
            let doc = serde_json::json!({
                "title": format!("document {i}"),
                "status": "published"
            });
            writer.add(doc).unwrap();
        });
        writer.commit().unwrap();

        let reopened = SingleFileDirectory::open(&path).unwrap();
        assert_eq!(reopened.segments().len(), 1);

        cleanup(&path);
    }

    /// Text written to the `title` field is indexed under lowercased terms
    /// only; the original casing must not appear in the postings.
    #[test]
    fn text_fields_analyzed() {
        use crate::segment::reader::SegmentReader;

        let (index_path, mut index_writer) = basic_setup("analyzed");
        let doc = serde_json::json!({
            "title": "The Quick Brown Fox",
            "status": "active"
        });
        index_writer.add(doc).unwrap();
        index_writer.commit().unwrap();

        // Load the only committed segment back from storage.
        let reopened = SingleFileDirectory::open(&index_path).unwrap();
        let first_segment = reopened.segments()[0].segment_id;
        let segment = SegmentReader::open(reopened.read_segment(first_segment).unwrap()).unwrap();

        // Field id 0 is the one this test uses for the analyzed text field.
        let title_has = |term: &str| segment.postings(FieldId::new(0), term).is_some();

        // Standard analyzer should lowercase
        assert!(title_has("the"));
        assert!(title_has("quick"));
        assert!(title_has("brown"));
        assert!(title_has("fox"));

        // Original casing should not be found
        assert!(!title_has("The"));
        assert!(!title_has("Quick"));

        cleanup(&index_path);
    }

    /// Keyword values are indexed verbatim: the exact-case term is found
    /// and its lowercased form is not.
    #[test]
    fn keyword_fields_exact() {
        use crate::segment::reader::SegmentReader;

        let (index_path, mut index_writer) = basic_setup("keyword");
        let doc = serde_json::json!({
            "title": "test",
            "status": "Active"
        });
        index_writer.add(doc).unwrap();
        index_writer.commit().unwrap();

        let reopened = SingleFileDirectory::open(&index_path).unwrap();
        let first_segment = reopened.segments()[0].segment_id;
        let segment_bytes = reopened.read_segment(first_segment).unwrap();
        let segment = SegmentReader::open(segment_bytes).unwrap();

        // Keyword field (field id 1 here) should preserve case
        let exact_case = segment.postings(FieldId::new(1), "Active");
        let lower_case = segment.postings(FieldId::new(1), "active");
        assert!(exact_case.is_some());
        assert!(lower_case.is_none());

        cleanup(&index_path);
    }

    /// Committing an empty buffer succeeds and writes no segment.
    #[test]
    fn commit_with_no_docs_is_noop() {
        let (index_path, mut index_writer) = basic_setup("empty_commit");
        index_writer.commit().unwrap();

        let reopened = SingleFileDirectory::open(&index_path).unwrap();
        let has_no_segments = reopened.segments().is_empty();
        assert!(has_no_segments);

        cleanup(&index_path);
    }

    /// With a 100-byte memory budget, adding five documents must produce
    /// more than one segment by the time the final commit returns.
    #[test]
    fn auto_flush_on_memory_budget() {
        let (index_path, mut index_writer) = basic_setup("autoflush");
        // Very small budget
        index_writer.set_memory_budget(100);

        let docs = (0..5).map(|n| {
            serde_json::json!({
                "title": format!("document number {n} with some extra text to exceed the budget"),
                "status": "active"
            })
        });
        for doc in docs {
            index_writer.add(doc).unwrap();
        }
        index_writer.commit().unwrap();

        // Multiple segments should have been created due to auto-flush
        let reopened = SingleFileDirectory::open(&index_path).unwrap();
        let segment_count = reopened.segments().len();
        assert!(
            segment_count > 1,
            "expected multiple segments from auto-flush, got {}",
            segment_count
        );

        cleanup(&index_path);
    }

    /// With `DynamicMode::False`, a document carrying a field that is not
    /// in the schema is still accepted by `add` and `commit`.
    #[test]
    fn dynamic_false_ignores_unknown() {
        let index_path = test_dir("dynamic_false");
        let mut index_writer = IndexWriter::new(
            SingleFileDirectory::create(&index_path).unwrap(),
            Mapping::builder()
                .field("title", FieldType::Text)
                .dynamic(DynamicMode::False)
                .build(),
            AnalyzerRegistry::new(),
        );

        // Should succeed — unknown field is silently ignored
        let doc = serde_json::json!({
            "title": "hello",
            "unknown_field": "value"
        });
        index_writer.add(doc).unwrap();
        index_writer.commit().unwrap();

        cleanup(&index_path);
    }

    /// Each non-empty commit writes its own segment: two add+commit rounds
    /// leave two segments in storage.
    #[test]
    fn multiple_commits() {
        let (index_path, mut index_writer) = basic_setup("multi_commit");

        // One document per round, committed immediately after it is added.
        for (title, status) in [("first", "a"), ("second", "b")] {
            index_writer
                .add(serde_json::json!({"title": title, "status": status}))
                .unwrap();
            index_writer.commit().unwrap();
        }

        let reopened = SingleFileDirectory::open(&index_path).unwrap();
        let segment_count = reopened.segments().len();
        assert_eq!(segment_count, 2);

        cleanup(&index_path);
    }

    /// The stored source of document 0 parses back to the same JSON value
    /// that was handed to `add`.
    #[test]
    fn source_stored_correctly() {
        use crate::segment::reader::SegmentReader;

        let (index_path, mut index_writer) = basic_setup("source");
        let original = serde_json::json!({"title": "hello world", "status": "active"});
        index_writer.add(original.clone()).unwrap();
        index_writer.commit().unwrap();

        let reopened = SingleFileDirectory::open(&index_path).unwrap();
        let first_segment = reopened.segments()[0].segment_id;
        let segment_bytes = reopened.read_segment(first_segment).unwrap();
        let segment = SegmentReader::open(segment_bytes).unwrap();

        // Fetch the raw stored bytes for doc 0 and decode them as JSON.
        let raw_source = segment.doc_store().get(0).unwrap();
        let round_tripped = serde_json::from_slice::<serde_json::Value>(&raw_source).unwrap();
        assert_eq!(round_tripped, original);

        cleanup(&index_path);
    }
}