lucisearch 0.8.1

//! Top-K document collector with min-heap.
//!
//! Collects the highest-scoring documents up to a configurable limit,
//! using a min-heap to efficiently maintain the top-K set.
//!
//! See [[architecture-query-execution#Aggregation Collection]].

use std::cmp::Ordering;
use std::collections::BinaryHeap;

use crate::core::{DocId, SegmentId};

/// A scored document result (internal to the search pipeline).
///
/// This is the element type returned by
/// `TopDocsCollector::into_sorted_results`.
#[derive(Clone, Debug)]
pub(crate) struct ScoreDoc {
    /// Identifier of the collected document.
    pub(crate) doc_id: DocId,
    /// Segment identifier supplied together with `doc_id` at collection time.
    pub(crate) segment_id: SegmentId,
    /// Score of the document; higher scores rank first in sorted results.
    pub(crate) score: f32,
}

/// Wrapper for min-heap ordering (lowest score at top).
///
/// `BinaryHeap` is a max-heap, so the `Ord` impl of this wrapper reverses the
/// score comparison; the lowest-scoring retained hit is then what `peek()`
/// returns.
struct MinScoreDoc(ScoreDoc);

impl PartialEq for MinScoreDoc {
    fn eq(&self, other: &Self) -> bool {
        self.0.score == other.0.score
    }
}

impl Eq for MinScoreDoc {}

impl PartialOrd for MinScoreDoc {
    /// Always `Some`: delegates to [`Ord::cmp`] so the partial and total
    /// orderings can never disagree.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for MinScoreDoc {
    /// Orders entries so that the *worst* retained hit is the greatest
    /// element, i.e. the one at the top of the `BinaryHeap`.
    ///
    /// * Primary: score, reversed. `BinaryHeap` is a max-heap, so reversing
    ///   the score comparison makes the lowest score surface at the top.
    ///   Scores that cannot be compared (NaN) are treated as equal.
    /// * Tie-break: `doc_id`, *not* reversed. Among equal scores the larger
    ///   `doc_id` is the greater element and is therefore evicted first, so
    ///   the smaller `doc_id` is retained. This matches the final ordering
    ///   produced by `TopDocsCollector::into_sorted_results` (score
    ///   descending, then `doc_id` ascending), which keeps a top-K result a
    ///   prefix of the corresponding top-(K+1) result.
    fn cmp(&self, other: &Self) -> Ordering {
        other
            .0
            .score
            .partial_cmp(&self.0.score)
            .unwrap_or(Ordering::Equal)
            .then_with(|| self.0.doc_id.cmp(&other.0.doc_id))
    }
}

/// Collects the top-K documents by score.
///
/// Hits are offered through `collect`; the retained set is read back with
/// `into_sorted_results`.
pub struct TopDocsCollector {
    /// Retained hits. The entry returned by `peek()` is the one replaced
    /// when a more competitive hit arrives.
    heap: BinaryHeap<MinScoreDoc>,
    /// Maximum number of hits to retain (K). With 0 nothing is retained.
    limit: usize,
}

impl TopDocsCollector {
    /// Create a collector that keeps the top `limit` documents.
    pub fn new(limit: usize) -> Self {
        Self {
            heap: BinaryHeap::with_capacity(limit + 1),
            limit,
        }
    }

    /// Offer a scored document. It will be kept if it's in the top-K.
    ///
    /// Uses replace-top via `peek_mut()` for competitive documents — a single
    /// sift-down instead of pop (sift-up) + push (sift-down).
    /// See [[optimization-scoring-throughput#Phase 2]].
    pub fn collect(&mut self, doc_id: DocId, segment_id: SegmentId, score: f32) {
        if self.heap.len() < self.limit {
            self.heap.push(MinScoreDoc(ScoreDoc {
                doc_id,
                segment_id,
                score,
            }));
            return;
        }
        // Fast path: non-competitive (most common case).
        // peek() returns None when limit=0 (agg-only queries).
        if let Some(min) = self.heap.peek() {
            if score <= min.0.score {
                return;
            }
        } else {
            return;
        }
        // Replace top and sift down (1 sift instead of 2)
        let mut top = self.heap.peek_mut().unwrap();
        *top = MinScoreDoc(ScoreDoc {
            doc_id,
            segment_id,
            score,
        });
        // PeekMut::drop calls sift_down automatically
    }

    /// Current minimum score in the top-K heap.
    /// Returns 0.0 if the heap is not yet full.
    pub fn min_score(&self) -> f32 {
        if self.heap.len() < self.limit {
            0.0
        } else {
            self.heap.peek().map(|m| m.0.score).unwrap_or(0.0)
        }
    }

    /// Merge another collector's results into this one.
    pub fn merge(&mut self, other: TopDocsCollector) {
        for item in other.heap {
            self.collect(item.0.doc_id, item.0.segment_id, item.0.score);
        }
    }

    /// Drain the collector and return results sorted by score descending.
    pub(crate) fn into_sorted_results(self) -> Vec<ScoreDoc> {
        let mut results: Vec<ScoreDoc> = self.heap.into_iter().map(|m| m.0).collect();
        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(Ordering::Equal)
                .then_with(|| a.doc_id.cmp(&b.doc_id))
        });
        results
    }
}

// --- TopFieldCollector: sort by field values ---
// See [[feature-sort-by-field]].

use crate::search::{SortField, SortValue, compare_sort_values_cascade};

/// A collected document with its sort values.
///
/// This is the element type returned by
/// `TopFieldCollector::into_sorted_results`.
#[derive(Clone, Debug)]
pub(crate) struct FieldDoc {
    /// Identifier of the collected document; also used as the final
    /// tiebreak when all sort values compare equal.
    pub(crate) doc_id: DocId,
    /// Segment identifier supplied together with `doc_id` at collection time.
    pub(crate) segment_id: SegmentId,
    /// Score passed to `collect`; carried along, not used for ordering here.
    pub(crate) score: f32,
    /// Sort values, compared against other docs with
    /// `compare_sort_values_cascade` and the collector's sort fields.
    pub(crate) sort_values: Vec<SortValue>,
}

/// Wrapper for min-heap ordering by sort fields.
/// The "worst" doc (that should be evicted first) sits at the top.
struct MinFieldDoc {
    doc: FieldDoc,
    /// Shared reference to sort spec for comparison. We store a raw pointer
    /// because BinaryHeap requires Ord which can't borrow the collector.
    /// SAFETY: The pointer is valid for the lifetime of the TopFieldCollector.
    sort_fields: *const Vec<SortField>,
}

// SAFETY: MinFieldDoc is only used within TopFieldCollector which owns
// the sort_fields Vec and outlives all MinFieldDoc instances.
unsafe impl Send for MinFieldDoc {}

impl PartialEq for MinFieldDoc {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}
impl Eq for MinFieldDoc {}

impl PartialOrd for MinFieldDoc {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for MinFieldDoc {
    fn cmp(&self, other: &Self) -> Ordering {
        // Max-heap retention: peek returns the doc we evict first (the
        // "worst" for retention). `compare_sort_values_cascade` already
        // honors SortOrder — for ASC, 199 vs 79 returns Greater because
        // 199 sorts later, which is exactly the "worst to keep" signal.
        // Secondary by doc_id ascending matches Lucene's TopDocs tiebreak
        // (retain smaller doc_id on ties).
        let sort_fields = unsafe { &*self.sort_fields };
        compare_sort_values_cascade(&self.doc.sort_values, &other.doc.sort_values, sort_fields)
            .then_with(|| self.doc.doc_id.cmp(&other.doc.doc_id))
    }
}

/// Collects the top-K documents ordered by field sort values.
pub struct TopFieldCollector {
    heap: BinaryHeap<MinFieldDoc>,
    limit: usize,
    sort_fields: Vec<SortField>,
    /// Cursor for search_after pagination. Docs sorting <= this are skipped.
    search_after: Option<Vec<SortValue>>,
}

impl TopFieldCollector {
    pub fn new(limit: usize, sort_fields: Vec<SortField>) -> Self {
        Self {
            heap: BinaryHeap::with_capacity(limit + 1),
            limit,
            sort_fields,
            search_after: None,
        }
    }

    /// Set the search_after cursor. Docs sorting at or before this
    /// position are skipped. See [[feature-search-after]].
    pub fn set_search_after(&mut self, cursor: Vec<SortValue>) {
        self.search_after = Some(cursor);
    }

    /// Quick check: would a doc with this primary sort value be competitive?
    /// Returns true if the heap isn't full or the value beats the worst entry
    /// on the primary sort field. Used to avoid Vec allocation for non-competitive docs.
    #[inline]
    pub fn is_competitive_primary(&self, primary: &SortValue) -> bool {
        if self.heap.len() < self.limit {
            return true;
        }
        if let Some(top) = self.heap.peek() {
            if let Some(worst_primary) = top.doc.sort_values.first() {
                let cmp = primary.compare(worst_primary, &self.sort_fields[0]);
                // Competitive if this doc sorts BEFORE the worst
                return cmp == Ordering::Less;
            }
        }
        false
    }

    /// Quick check for keyword primary sort: compare &str without allocating.
    #[inline]
    pub fn is_competitive_keyword(&self, value: Option<&str>) -> bool {
        if self.heap.len() < self.limit {
            return true;
        }
        if let Some(top) = self.heap.peek() {
            if let Some(worst) = top.doc.sort_values.first() {
                let _sv = match value {
                    Some(_s) => SortValue::Str(String::new()), // placeholder, compare below
                    None => SortValue::Null,
                };
                // Direct comparison without allocating
                let natural = match (value, worst) {
                    (None, SortValue::Null) => std::cmp::Ordering::Equal,
                    (None, _) => match self.sort_fields[0].missing {
                        crate::search::MissingValue::Last => match self.sort_fields[0].order {
                            crate::search::SortOrder::Asc => std::cmp::Ordering::Greater,
                            crate::search::SortOrder::Desc => std::cmp::Ordering::Less,
                        },
                        crate::search::MissingValue::First => match self.sort_fields[0].order {
                            crate::search::SortOrder::Asc => std::cmp::Ordering::Less,
                            crate::search::SortOrder::Desc => std::cmp::Ordering::Greater,
                        },
                    },
                    (Some(_), SortValue::Null) => match self.sort_fields[0].missing {
                        crate::search::MissingValue::Last => match self.sort_fields[0].order {
                            crate::search::SortOrder::Asc => std::cmp::Ordering::Less,
                            crate::search::SortOrder::Desc => std::cmp::Ordering::Greater,
                        },
                        crate::search::MissingValue::First => match self.sort_fields[0].order {
                            crate::search::SortOrder::Asc => std::cmp::Ordering::Greater,
                            crate::search::SortOrder::Desc => std::cmp::Ordering::Less,
                        },
                    },
                    (Some(s), SortValue::Str(w)) => s.cmp(w.as_str()),
                    _ => std::cmp::Ordering::Equal,
                };
                let cmp = match self.sort_fields[0].order {
                    crate::search::SortOrder::Asc => natural,
                    crate::search::SortOrder::Desc => natural.reverse(),
                };
                return cmp == Ordering::Less;
            }
        }
        false
    }

    /// Offer a document with its sort values.
    pub fn collect(
        &mut self,
        doc_id: DocId,
        segment_id: SegmentId,
        score: f32,
        sort_values: Vec<SortValue>,
    ) {
        // search_after filter: skip docs at or before the cursor
        if let Some(ref after) = self.search_after {
            let cmp = compare_sort_values_cascade(&sort_values, after, &self.sort_fields);
            if cmp != Ordering::Greater {
                return;
            }
        }

        let doc = FieldDoc {
            doc_id,
            segment_id,
            score,
            sort_values,
        };

        if self.heap.len() < self.limit {
            self.heap.push(MinFieldDoc {
                doc,
                sort_fields: &self.sort_fields as *const Vec<SortField>,
            });
            return;
        }
        // Competitiveness check: candidate must sort strictly better
        // than the current heap top (the worst-retention doc). Primary
        // sort first, then doc_id ascending — matches Lucene's
        // TopDocs tiebreak, so on primary-ties the smaller doc_id wins
        // retention.
        if let Some(top) = self.heap.peek() {
            let cmp = compare_sort_values_cascade(
                &doc.sort_values,
                &top.doc.sort_values,
                &self.sort_fields,
            )
            .then_with(|| doc.doc_id.cmp(&top.doc.doc_id));
            if cmp != Ordering::Less {
                return;
            }
        } else {
            return;
        }
        // Replace top (worst) with this better doc
        let mut top = self.heap.peek_mut().unwrap();
        *top = MinFieldDoc {
            doc,
            sort_fields: &self.sort_fields as *const Vec<SortField>,
        };
    }

    /// Merge another collector's results into this one.
    pub fn merge(&mut self, other: TopFieldCollector) {
        for item in other.heap {
            self.collect(
                item.doc.doc_id,
                item.doc.segment_id,
                item.doc.score,
                item.doc.sort_values,
            );
        }
    }

    /// Drain and return results in sort order (best first).
    pub(crate) fn into_sorted_results(self) -> Vec<FieldDoc> {
        let sort_fields = self.sort_fields;
        let mut results: Vec<FieldDoc> = self.heap.into_iter().map(|m| m.doc).collect();
        results.sort_by(|a, b| {
            compare_sort_values_cascade(&a.sort_values, &b.sort_values, &sort_fields)
                .then_with(|| a.doc_id.cmp(&b.doc_id))
        });
        results
    }
}

// --- CollapsingCollector: deduplicate by field value ---
// See [[feature-search-collapse]].

use std::collections::HashMap;

/// A collapse group key.
///
/// Used as the `HashMap` key in `CollapsingCollector`, hence the
/// `Eq`/`Hash` derives.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub(crate) enum CollapseKey {
    /// No value; rendered as JSON `null` by `to_json`.
    Null,
    /// String-valued key.
    Keyword(String),
    /// Integer-valued key.
    Numeric(i64),
}

impl CollapseKey {
    pub fn to_json(&self) -> serde_json::Value {
        match self {
            CollapseKey::Null => serde_json::Value::Null,
            CollapseKey::Keyword(s) => serde_json::json!(s),
            CollapseKey::Numeric(n) => serde_json::json!(n),
        }
    }
}

/// A collected doc with its collapse key.
///
/// `CollapsingCollector` keeps one of these per distinct key.
#[derive(Clone, Debug)]
pub(crate) struct CollapsedDoc {
    /// Identifier of the document currently representing the group.
    pub(crate) doc_id: DocId,
    /// Segment identifier supplied together with `doc_id` at collection time.
    pub(crate) segment_id: SegmentId,
    /// Score of the representative document.
    pub(crate) score: f32,
    /// The group's key (same value as the map key it is stored under).
    pub(crate) collapse_key: CollapseKey,
}

/// Collects the highest-scoring document per unique collapse value.
pub struct CollapsingCollector {
    /// Best document seen so far for each collapse key.
    groups: HashMap<CollapseKey, CollapsedDoc>,
    /// Maximum number of groups returned by `into_sorted_results`; the map
    /// itself is not bounded by it.
    limit: usize,
}

impl CollapsingCollector {
    pub fn new(limit: usize) -> Self {
        Self {
            groups: HashMap::new(),
            limit,
        }
    }

    /// Offer a scored document with its collapse key.
    pub(crate) fn collect(
        &mut self,
        doc_id: DocId,
        segment_id: SegmentId,
        score: f32,
        key: CollapseKey,
    ) {
        let entry = self.groups.entry(key.clone()).or_insert(CollapsedDoc {
            doc_id,
            segment_id,
            score,
            collapse_key: key,
        });
        if score > entry.score {
            entry.doc_id = doc_id;
            entry.segment_id = segment_id;
            entry.score = score;
        }
    }

    /// Merge another collector's groups into this one.
    pub fn merge(&mut self, other: CollapsingCollector) {
        for (key, doc) in other.groups {
            self.collect(doc.doc_id, doc.segment_id, doc.score, key);
        }
    }

    /// Total number of unique groups (pre-collapse total_hits).
    pub fn group_count(&self) -> usize {
        self.groups.len()
    }

    /// Drain and return the top-K groups sorted by score descending.
    pub(crate) fn into_sorted_results(self) -> Vec<CollapsedDoc> {
        let mut results: Vec<CollapsedDoc> = self.groups.into_values().collect();
        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(Ordering::Equal)
                .then_with(|| a.doc_id.cmp(&b.doc_id))
        });
        results.truncate(self.limit);
        results
    }
}

// Unit tests for the score-based and field-sorted collectors.
#[cfg(test)]
mod tests {
    use super::*;

    // Fewer hits than the limit: everything is kept, best score first.
    #[test]
    fn collect_under_limit() {
        let mut collector = TopDocsCollector::new(10);
        collector.collect(DocId::new(0), SegmentId::new(1), 1.0);
        collector.collect(DocId::new(1), SegmentId::new(1), 2.0);
        collector.collect(DocId::new(2), SegmentId::new(1), 0.5);

        let results = collector.into_sorted_results();
        assert_eq!(results.len(), 3);
        assert_eq!(results[0].doc_id, DocId::new(1)); // highest score
        assert_eq!(results[1].doc_id, DocId::new(0));
        assert_eq!(results[2].doc_id, DocId::new(2)); // lowest score
    }

    // More hits than the limit: only the two best scores survive.
    #[test]
    fn top_k_limiting() {
        let mut collector = TopDocsCollector::new(2);
        collector.collect(DocId::new(0), SegmentId::new(1), 1.0);
        collector.collect(DocId::new(1), SegmentId::new(1), 3.0);
        collector.collect(DocId::new(2), SegmentId::new(1), 2.0);
        collector.collect(DocId::new(3), SegmentId::new(1), 0.5);

        let results = collector.into_sorted_results();
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].doc_id, DocId::new(1)); // score 3.0
        assert_eq!(results[1].doc_id, DocId::new(2)); // score 2.0
    }

    // Results come back in non-increasing score order.
    #[test]
    fn score_descending_order() {
        let mut collector = TopDocsCollector::new(5);
        for i in 0..5 {
            collector.collect(DocId::new(i), SegmentId::new(1), i as f32);
        }
        let results = collector.into_sorted_results();
        for i in 0..results.len() - 1 {
            assert!(results[i].score >= results[i + 1].score);
        }
    }

    // A collector that never saw a hit yields no results.
    #[test]
    fn empty_collector() {
        let collector = TopDocsCollector::new(10);
        let results = collector.into_sorted_results();
        assert!(results.is_empty());
    }

    // The same doc_id may appear in different segments; the segment id is
    // carried through to the results.
    #[test]
    fn multi_segment() {
        let mut collector = TopDocsCollector::new(3);
        collector.collect(DocId::new(0), SegmentId::new(1), 1.0);
        collector.collect(DocId::new(0), SegmentId::new(2), 2.0);
        collector.collect(DocId::new(1), SegmentId::new(1), 3.0);
        collector.collect(DocId::new(1), SegmentId::new(2), 0.5);

        let results = collector.into_sorted_results();
        assert_eq!(results.len(), 3);
        assert_eq!(results[0].score, 3.0);
        assert_eq!(results[0].segment_id, SegmentId::new(1));
    }

    // --- TopFieldCollector comparator tests ---
    // Regression: MinFieldDoc::cmp previously reversed natural order,
    // causing peek() to return the wrong heap element and retaining the
    // wrong extreme on top-K eviction. See
    // [[fix-sort-pagination-multi-segment]].

    use crate::search::{MissingValue, SortField, SortFieldType, SortOrder, SortValue};

    // Ascending sort on `name`, missing values last.
    fn sort_field_asc(name: &str) -> SortField {
        SortField {
            field: SortFieldType::Field(name.to_string()),
            order: SortOrder::Asc,
            missing: MissingValue::Last,
        }
    }

    // Descending sort on `name`, missing values last.
    fn sort_field_desc(name: &str) -> SortField {
        SortField {
            field: SortFieldType::Field(name.to_string()),
            order: SortOrder::Desc,
            missing: MissingValue::Last,
        }
    }

    // Extract the primary (F64) sort value of every result, in result order.
    fn prices(results: &[FieldDoc]) -> Vec<f64> {
        results
            .iter()
            .map(|r| match r.sort_values[0] {
                SortValue::F64(f) => f,
                _ => panic!("expected F64 sort value"),
            })
            .collect()
    }

    // Ascending sort with limit 2 keeps the two smallest prices.
    #[test]
    fn top_field_collector_asc_retains_smallest() {
        let mut c = TopFieldCollector::new(2, vec![sort_field_asc("price")]);
        for (i, price) in [199.0, 79.0, 9.99, 59.0, 25.0].into_iter().enumerate() {
            c.collect(
                DocId::new(i as u32),
                SegmentId::new(0),
                1.0,
                vec![SortValue::F64(price)],
            );
        }
        assert_eq!(prices(&c.into_sorted_results()), vec![9.99, 25.0]);
    }

    // Descending sort with limit 2 keeps the two largest prices.
    #[test]
    fn top_field_collector_desc_retains_largest() {
        let mut c = TopFieldCollector::new(2, vec![sort_field_desc("price")]);
        for (i, price) in [199.0, 79.0, 9.99, 59.0, 25.0].into_iter().enumerate() {
            c.collect(
                DocId::new(i as u32),
                SegmentId::new(0),
                1.0,
                vec![SortValue::F64(price)],
            );
        }
        assert_eq!(prices(&c.into_sorted_results()), vec![199.0, 79.0]);
    }

    #[test]
    fn top_field_collector_tiebreak_prefers_smaller_doc_id() {
        // Two docs with identical sort values; Lucene's TopDocs tiebreak
        // retains the doc with the smaller doc_id.
        let mut c = TopFieldCollector::new(1, vec![sort_field_asc("price")]);
        c.collect(
            DocId::new(7),
            SegmentId::new(0),
            1.0,
            vec![SortValue::F64(42.0)],
        );
        c.collect(
            DocId::new(3),
            SegmentId::new(0),
            1.0,
            vec![SortValue::F64(42.0)],
        );
        let results = c.into_sorted_results();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].doc_id, DocId::new(3));
    }
}