lucisearch 0.8.0

Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
Documentation
//! Tiered merge policy: select segments to merge based on size tiers.
//!
//! Groups segments into tiers by size, selects merge candidates when a
//! tier has too many segments. Smaller segments are merged more aggressively.
//! Follows Lucene's TieredMergePolicy design.
//!
//! See [[architecture-segment-merge]] and [[architecture-merge-policy]].

use crate::core::SegmentId;

/// Lightweight segment metadata for merge decisions.
#[derive(Clone, Debug)]
pub struct SegmentInfo {
    pub segment_id: SegmentId,
    pub size_bytes: u64,
    pub doc_count: u32,
    pub deletion_count: u32,
}

impl SegmentInfo {
    /// Live (non-deleted) document count.
    pub fn live_doc_count(&self) -> u32 {
        self.doc_count.saturating_sub(self.deletion_count)
    }

    /// Fraction of docs that are deleted.
    pub fn deletion_ratio(&self) -> f64 {
        if self.doc_count == 0 {
            0.0
        } else {
            self.deletion_count as f64 / self.doc_count as f64
        }
    }
}

/// A proposed merge operation.
#[derive(Clone, Debug)]
pub struct MergeCandidate {
    pub segment_ids: Vec<SegmentId>,
    /// Total bytes of the source segments.
    pub total_bytes: u64,
}

/// Tiered merge policy parameters.
#[derive(Clone, Debug)]
pub struct MergePolicy {
    /// Maximum segments merged in a single operation.
    pub max_merge_at_once: usize,
    /// Target segment count per size tier.
    pub segments_per_tier: usize,
    /// Segments below this size are always eligible for merge.
    pub floor_segment_size: u64,
    /// Never produce a merged segment larger than this.
    pub max_merged_segment_size: u64,
}

impl Default for MergePolicy {
    fn default() -> Self {
        Self {
            max_merge_at_once: 10,
            segments_per_tier: 10,
            floor_segment_size: 2 * 1024 * 1024, // 2 MB
            max_merged_segment_size: 5 * 1024 * 1024 * 1024, // 5 GB
        }
    }
}

/// Given current segments, return the best merge candidate (if any).
///
/// Algorithm (simplified tiered merge):
/// 1. Sort segments by size ascending
/// 2. Slide a window of up to `max_merge_at_once` consecutive segments
/// 3. Score each window: prefer small total size, reward high deletion
///    ratios, penalize merging segments larger than the tier size
/// 4. Return the best-scoring candidate
///
/// A merge is considered if the total segment count exceeds
/// `segments_per_tier`, or if any segment has a deletion ratio above
/// the threshold.
///
/// Returns `None` if no merge is beneficial.
pub fn find_merge(policy: &MergePolicy, segments: &[SegmentInfo]) -> Option<MergeCandidate> {
    if segments.len() <= 1 {
        return None;
    }

    let needs_merge = segments.len() > policy.segments_per_tier
        || segments.iter().any(|s| s.deletion_ratio() > 0.0);

    if !needs_merge {
        return None;
    }

    // Sort by size ascending (merge smallest first)
    let mut sorted: Vec<&SegmentInfo> = segments.iter().collect();
    sorted.sort_by_key(|s| s.size_bytes);

    // Slide a window across sorted segments, score each candidate
    let mut best: Option<(f64, MergeCandidate)> = None;

    let max_window = policy.max_merge_at_once.min(sorted.len());

    for start in 0..sorted.len() {
        let end = sorted.len().min(start + max_window);
        if end - start < 2 {
            continue;
        }

        // Try all window sizes from 2 to max at this start position
        for window_end in (start + 2)..=end {
            let window = &sorted[start..window_end];
            let total_bytes: u64 = window.iter().map(|s| s.size_bytes).sum();

            // Skip if merged result would be too large
            if total_bytes > policy.max_merged_segment_size {
                break; // Larger windows will also exceed
            }

            let score = merge_score(policy, window);

            if best.as_ref().is_none_or(|(s, _)| score < *s) {
                best = Some((
                    score,
                    MergeCandidate {
                        segment_ids: window.iter().map(|s| s.segment_id).collect(),
                        total_bytes,
                    },
                ));
            }
        }
    }

    best.map(|(_, c)| c)
}

/// Score a merge candidate. Lower is better.
///
/// Prefers merging small segments (normalized by max_merged_segment_size)
/// and rewards high average deletion ratio.
fn merge_score(policy: &MergePolicy, segments: &[&SegmentInfo]) -> f64 {
    let total_bytes: u64 = segments.iter().map(|s| s.size_bytes).sum();

    // Size component: merging large segments is expensive
    let size_ratio = total_bytes as f64 / policy.max_merged_segment_size as f64;

    // Deletion component: high deletion = more space to reclaim
    let avg_deletion: f64 = if segments.is_empty() {
        0.0
    } else {
        segments.iter().map(|s| s.deletion_ratio()).sum::<f64>() / segments.len() as f64
    };

    // Score: size penalty minus deletion bonus
    size_ratio - avg_deletion
}

#[cfg(test)]
mod tests {
    use super::*;

    fn seg(id: u64, size_mb: u64, doc_count: u32, deletions: u32) -> SegmentInfo {
        SegmentInfo {
            segment_id: SegmentId::new(id),
            size_bytes: size_mb * 1024 * 1024,
            doc_count,
            deletion_count: deletions,
        }
    }

    #[test]
    fn no_merge_single_segment() {
        let policy = MergePolicy::default();
        let segments = vec![seg(1, 100, 10000, 0)];
        assert!(find_merge(&policy, &segments).is_none());
    }

    #[test]
    fn no_merge_under_threshold() {
        let policy = MergePolicy::default();
        // 5 segments, each 100MB, no deletions — under segments_per_tier
        let segments: Vec<_> = (0..5).map(|i| seg(i, 100, 10000, 0)).collect();
        assert!(find_merge(&policy, &segments).is_none());
    }

    #[test]
    fn merge_many_small_segments() {
        let policy = MergePolicy::default();
        // 20 segments of 1MB each — many below floor, should trigger merge
        let segments: Vec<_> = (0..20).map(|i| seg(i, 1, 1000, 0)).collect();
        let candidate = find_merge(&policy, &segments);
        assert!(candidate.is_some());
        let c = candidate.unwrap();
        assert!(c.segment_ids.len() >= 2);
        assert!(c.segment_ids.len() <= policy.max_merge_at_once);
    }

    #[test]
    fn prefers_small_segments() {
        let policy = MergePolicy::default();
        // Mix of sizes: many small + a few large
        let mut segments = vec![seg(100, 500, 100000, 0), seg(101, 400, 80000, 0)];
        for i in 0..15 {
            segments.push(seg(i, 1, 1000, 0));
        }
        let candidate = find_merge(&policy, &segments).unwrap();
        // Should pick small segments, not the large ones
        for id in &candidate.segment_ids {
            assert!(id.as_u64() < 100, "should not merge large segments");
        }
    }

    #[test]
    fn respects_max_merged_size() {
        let mut policy = MergePolicy::default();
        policy.max_merged_segment_size = 100 * 1024 * 1024; // 100 MB cap
        // 15 segments of 50MB each — merging all would exceed 100MB
        let segments: Vec<_> = (0..15).map(|i| seg(i, 50, 5000, 0)).collect();
        let candidate = find_merge(&policy, &segments).unwrap();
        assert!(
            candidate.total_bytes <= policy.max_merged_segment_size,
            "merged size {} exceeds cap {}",
            candidate.total_bytes,
            policy.max_merged_segment_size,
        );
    }

    #[test]
    fn deletion_ratio_affects_scoring() {
        let policy = MergePolicy::default();
        // Two groups of segments: same size, different deletion ratios
        let mut segments = Vec::new();
        // Group A: 10MB each, 50% deleted
        for i in 0..12 {
            segments.push(seg(i, 10, 10000, 5000));
        }
        // Group B: 10MB each, 0% deleted
        for i in 12..24 {
            segments.push(seg(i, 10, 10000, 0));
        }
        let candidate = find_merge(&policy, &segments).unwrap();
        // Candidate should include at least some high-deletion segments
        // (they score better due to deletion bonus)
        let has_deleted = candidate.segment_ids.iter().any(|id| id.as_u64() < 12);
        assert!(has_deleted, "should prefer segments with deletions");
    }

    #[test]
    fn max_merge_at_once_respected() {
        let mut policy = MergePolicy::default();
        policy.max_merge_at_once = 3;
        let segments: Vec<_> = (0..20).map(|i| seg(i, 1, 1000, 0)).collect();
        let candidate = find_merge(&policy, &segments).unwrap();
        assert!(
            candidate.segment_ids.len() <= 3,
            "should merge at most 3, got {}",
            candidate.segment_ids.len(),
        );
    }

    #[test]
    fn exactly_at_threshold_no_merge() {
        let policy = MergePolicy::default();
        // Exactly segments_per_tier segments, no deletions — should not merge
        let segments: Vec<_> = (0..10).map(|i| seg(i, 50, 5000, 0)).collect();
        assert!(find_merge(&policy, &segments).is_none());
    }

    #[test]
    fn empty_segments() {
        let policy = MergePolicy::default();
        assert!(find_merge(&policy, &[]).is_none());
    }
}