Skip to main content

lean_ctx/core/
context_packing.rs

1//! Submodular context packing — choosing *which* items to include under a
2//! budget so the selected set maximises coverage of what the task cares about.
3//!
4//! ## Why submodular?
5//!
6//! Picking a set of context items to maximise term coverage is the classic
7//! **maximum coverage** problem. Coverage is *monotone submodular*: each added
8//! item helps, but with diminishing returns (a second file covering the same
9//! terms adds little). For such objectives the simple **greedy** algorithm —
10//! repeatedly take the item with the largest marginal gain per unit cost — is
11//! provably within a `1 − 1/e ≈ 0.63` factor of the optimum (Nemhauser,
12//! Wolsey & Fisher 1978). That guarantee, plus `O(n·k)` cost, is exactly what
13//! we want for online context assembly.
14//!
15//! This module is deliberately generic and side-effect free: callers map their
16//! domain objects (symbol bodies, ranked files, chunks) onto [`CoverageItem`]s
17//! and get back the indices to keep, in selection order.
18
19use std::collections::HashSet;
20
21/// A candidate for inclusion: the set of weighted terms it covers and the
22/// budget cost of including it (e.g. its token count).
23#[derive(Debug, Clone)]
24pub struct CoverageItem {
25    /// Distinct terms this item covers. Weights come from `term_weight`.
26    pub terms: HashSet<String>,
27    /// Budget consumed if this item is selected (must be ≥ 1).
28    pub cost: usize,
29}
30
31/// Greedy submodular maximisation of weighted term coverage under `budget`.
32///
33/// Returns the indices of selected items in selection order. Items whose cost
34/// alone exceeds the remaining budget are skipped. `term_weight` lets callers
35/// prioritise rarer / more salient terms (uniform weight ⇒ plain coverage).
36///
37/// Guarantees: with unit costs this is the standard greedy max-coverage with a
38/// `1 − 1/e` approximation ratio. With costs it uses marginal-gain-per-cost,
39/// the cost-effective greedy heuristic.
40pub fn greedy_max_coverage<F>(items: &[CoverageItem], budget: usize, term_weight: F) -> Vec<usize>
41where
42    F: Fn(&str) -> f64,
43{
44    let mut covered: HashSet<&str> = HashSet::new();
45    let mut selected: Vec<usize> = Vec::new();
46    let mut remaining: Vec<usize> = (0..items.len()).collect();
47    let mut spent = 0usize;
48
49    while !remaining.is_empty() && spent < budget {
50        let mut best: Option<(usize, usize, f64)> = None; // (pos_in_remaining, item_idx, gain_per_cost)
51
52        for (pos, &idx) in remaining.iter().enumerate() {
53            let item = &items[idx];
54            if item.cost == 0 || spent + item.cost > budget {
55                continue;
56            }
57            // Marginal gain = weight of newly covered terms.
58            let gain: f64 = item
59                .terms
60                .iter()
61                .filter(|t| !covered.contains(t.as_str()))
62                .map(|t| term_weight(t))
63                .sum();
64            if gain <= 0.0 {
65                continue;
66            }
67            let gain_per_cost = gain / item.cost as f64;
68            let take = match best {
69                Some((_, _, best_gpc)) => gain_per_cost > best_gpc,
70                None => true,
71            };
72            if take {
73                best = Some((pos, idx, gain_per_cost));
74            }
75        }
76
77        match best {
78            Some((pos, idx, _)) => {
79                for t in &items[idx].terms {
80                    covered.insert(t.as_str());
81                }
82                spent += items[idx].cost;
83                selected.push(idx);
84                remaining.swap_remove(pos);
85            }
86            // No remaining item adds positive marginal coverage within budget.
87            None => break,
88        }
89    }
90
91    selected
92}
93
94#[cfg(test)]
95mod tests {
96    use super::*;
97
98    fn item(terms: &[&str], cost: usize) -> CoverageItem {
99        CoverageItem {
100            terms: terms.iter().map(|s| (*s).to_string()).collect(),
101            cost,
102        }
103    }
104
105    #[test]
106    fn picks_complementary_items_over_redundant_ones() {
107        // A and B together cover {x,y,z,w}; C is fully redundant with A.
108        let items = vec![
109            item(&["x", "y"], 1), // A
110            item(&["z", "w"], 1), // B
111            item(&["x", "y"], 1), // C (redundant with A)
112        ];
113        let sel = greedy_max_coverage(&items, 2, |_| 1.0);
114        assert_eq!(sel.len(), 2);
115        assert!(sel.contains(&0) || sel.contains(&2)); // one of the {x,y} items
116        assert!(sel.contains(&1)); // the complementary {z,w} item is always taken
117        assert!(
118            !(sel.contains(&0) && sel.contains(&2)),
119            "must not pick both redundant items"
120        );
121    }
122
123    #[test]
124    fn respects_budget() {
125        let items = vec![item(&["a"], 1), item(&["b"], 1), item(&["c"], 1)];
126        let sel = greedy_max_coverage(&items, 2, |_| 1.0);
127        assert_eq!(sel.len(), 2);
128    }
129
130    #[test]
131    fn skips_items_that_add_no_new_coverage() {
132        let items = vec![item(&["a", "b"], 1), item(&["a"], 1)];
133        let sel = greedy_max_coverage(&items, 5, |_| 1.0);
134        assert_eq!(sel, vec![0], "second item is fully subsumed → skipped");
135    }
136
137    #[test]
138    fn term_weight_prioritises_rare_terms() {
139        // Item 0 covers a common term; item 1 covers a rare (high-weight) one.
140        let items = vec![item(&["common"], 1), item(&["rare"], 1)];
141        let weight = |t: &str| if t == "rare" { 10.0 } else { 1.0 };
142        let sel = greedy_max_coverage(&items, 1, weight);
143        assert_eq!(sel, vec![1], "rare/high-weight term wins the single slot");
144    }
145
146    #[test]
147    fn empty_input_yields_empty_selection() {
148        let sel = greedy_max_coverage(&[], 5, |_| 1.0);
149        assert!(sel.is_empty());
150    }
151
152    #[test]
153    fn zero_cost_items_are_skipped() {
154        let items = vec![item(&["a"], 0), item(&["b"], 1)];
155        let sel = greedy_max_coverage(&items, 5, |_| 1.0);
156        assert_eq!(sel, vec![1]);
157    }
158}