use crate::core::SegmentId;
#[derive(Clone, Debug)]
pub struct SegmentInfo {
pub segment_id: SegmentId,
pub size_bytes: u64,
pub doc_count: u32,
pub deletion_count: u32,
}
impl SegmentInfo {
pub fn live_doc_count(&self) -> u32 {
self.doc_count.saturating_sub(self.deletion_count)
}
pub fn deletion_ratio(&self) -> f64 {
if self.doc_count == 0 {
0.0
} else {
self.deletion_count as f64 / self.doc_count as f64
}
}
}
#[derive(Clone, Debug)]
pub struct MergeCandidate {
pub segment_ids: Vec<SegmentId>,
pub total_bytes: u64,
}
#[derive(Clone, Debug)]
pub struct MergePolicy {
pub max_merge_at_once: usize,
pub segments_per_tier: usize,
pub floor_segment_size: u64,
pub max_merged_segment_size: u64,
}
impl Default for MergePolicy {
fn default() -> Self {
Self {
max_merge_at_once: 10,
segments_per_tier: 10,
floor_segment_size: 2 * 1024 * 1024, max_merged_segment_size: 5 * 1024 * 1024 * 1024, }
}
}
pub fn find_merge(policy: &MergePolicy, segments: &[SegmentInfo]) -> Option<MergeCandidate> {
if segments.len() <= 1 {
return None;
}
let needs_merge = segments.len() > policy.segments_per_tier
|| segments.iter().any(|s| s.deletion_ratio() > 0.0);
if !needs_merge {
return None;
}
let mut sorted: Vec<&SegmentInfo> = segments.iter().collect();
sorted.sort_by_key(|s| s.size_bytes);
let mut best: Option<(f64, MergeCandidate)> = None;
let max_window = policy.max_merge_at_once.min(sorted.len());
for start in 0..sorted.len() {
let end = sorted.len().min(start + max_window);
if end - start < 2 {
continue;
}
for window_end in (start + 2)..=end {
let window = &sorted[start..window_end];
let total_bytes: u64 = window.iter().map(|s| s.size_bytes).sum();
if total_bytes > policy.max_merged_segment_size {
break; }
let score = merge_score(policy, window);
if best.as_ref().is_none_or(|(s, _)| score < *s) {
best = Some((
score,
MergeCandidate {
segment_ids: window.iter().map(|s| s.segment_id).collect(),
total_bytes,
},
));
}
}
}
best.map(|(_, c)| c)
}
fn merge_score(policy: &MergePolicy, segments: &[&SegmentInfo]) -> f64 {
let total_bytes: u64 = segments.iter().map(|s| s.size_bytes).sum();
let size_ratio = total_bytes as f64 / policy.max_merged_segment_size as f64;
let avg_deletion: f64 = if segments.is_empty() {
0.0
} else {
segments.iter().map(|s| s.deletion_ratio()).sum::<f64>() / segments.len() as f64
};
size_ratio - avg_deletion
}
#[cfg(test)]
mod tests {
use super::*;
fn seg(id: u64, size_mb: u64, doc_count: u32, deletions: u32) -> SegmentInfo {
SegmentInfo {
segment_id: SegmentId::new(id),
size_bytes: size_mb * 1024 * 1024,
doc_count,
deletion_count: deletions,
}
}
#[test]
fn no_merge_single_segment() {
let policy = MergePolicy::default();
let segments = vec![seg(1, 100, 10000, 0)];
assert!(find_merge(&policy, &segments).is_none());
}
#[test]
fn no_merge_under_threshold() {
let policy = MergePolicy::default();
let segments: Vec<_> = (0..5).map(|i| seg(i, 100, 10000, 0)).collect();
assert!(find_merge(&policy, &segments).is_none());
}
#[test]
fn merge_many_small_segments() {
let policy = MergePolicy::default();
let segments: Vec<_> = (0..20).map(|i| seg(i, 1, 1000, 0)).collect();
let candidate = find_merge(&policy, &segments);
assert!(candidate.is_some());
let c = candidate.unwrap();
assert!(c.segment_ids.len() >= 2);
assert!(c.segment_ids.len() <= policy.max_merge_at_once);
}
#[test]
fn prefers_small_segments() {
let policy = MergePolicy::default();
let mut segments = vec![seg(100, 500, 100000, 0), seg(101, 400, 80000, 0)];
for i in 0..15 {
segments.push(seg(i, 1, 1000, 0));
}
let candidate = find_merge(&policy, &segments).unwrap();
for id in &candidate.segment_ids {
assert!(id.as_u64() < 100, "should not merge large segments");
}
}
#[test]
fn respects_max_merged_size() {
let mut policy = MergePolicy::default();
policy.max_merged_segment_size = 100 * 1024 * 1024; let segments: Vec<_> = (0..15).map(|i| seg(i, 50, 5000, 0)).collect();
let candidate = find_merge(&policy, &segments).unwrap();
assert!(
candidate.total_bytes <= policy.max_merged_segment_size,
"merged size {} exceeds cap {}",
candidate.total_bytes,
policy.max_merged_segment_size,
);
}
#[test]
fn deletion_ratio_affects_scoring() {
let policy = MergePolicy::default();
let mut segments = Vec::new();
for i in 0..12 {
segments.push(seg(i, 10, 10000, 5000));
}
for i in 12..24 {
segments.push(seg(i, 10, 10000, 0));
}
let candidate = find_merge(&policy, &segments).unwrap();
let has_deleted = candidate.segment_ids.iter().any(|id| id.as_u64() < 12);
assert!(has_deleted, "should prefer segments with deletions");
}
#[test]
fn max_merge_at_once_respected() {
let mut policy = MergePolicy::default();
policy.max_merge_at_once = 3;
let segments: Vec<_> = (0..20).map(|i| seg(i, 1, 1000, 0)).collect();
let candidate = find_merge(&policy, &segments).unwrap();
assert!(
candidate.segment_ids.len() <= 3,
"should merge at most 3, got {}",
candidate.segment_ids.len(),
);
}
#[test]
fn exactly_at_threshold_no_merge() {
let policy = MergePolicy::default();
let segments: Vec<_> = (0..10).map(|i| seg(i, 50, 5000, 0)).collect();
assert!(find_merge(&policy, &segments).is_none());
}
#[test]
fn empty_segments() {
let policy = MergePolicy::default();
assert!(find_merge(&policy, &[]).is_none());
}
}