1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
//! Submodular context packing — choosing *which* items to include under a
//! budget so the selected set maximises coverage of what the task cares about.
//!
//! ## Why submodular?
//!
//! Picking a set of context items to maximise term coverage is the classic
//! **maximum coverage** problem. Coverage is *monotone submodular*: each added
//! item helps, but with diminishing returns (a second file covering the same
//! terms adds little). For such objectives the simple **greedy** algorithm —
//! repeatedly take the item with the largest marginal gain per unit cost — is
//! provably within a `1 − 1/e ≈ 0.63` factor of the optimum (Nemhauser,
//! Wolsey & Fisher 1978). That guarantee, plus `O(n·k)` cost, is exactly what
//! we want for online context assembly.
//!
//! This module is deliberately generic and side-effect free: callers map their
//! domain objects (symbol bodies, ranked files, chunks) onto [`CoverageItem`]s
//! and get back the indices to keep, in selection order.
use std::collections::HashSet;
/// A candidate for inclusion: the set of weighted terms it covers and the
/// budget cost of including it (e.g. its token count).
#[derive(Debug, Clone)]
pub struct CoverageItem {
/// Distinct terms this item covers. Weights come from `term_weight`.
pub terms: HashSet<String>,
/// Budget consumed if this item is selected (must be ≥ 1).
pub cost: usize,
}
/// Greedy submodular maximisation of weighted term coverage under `budget`.
///
/// Returns the indices of selected items in selection order. Items whose cost
/// alone exceeds the remaining budget are skipped. `term_weight` lets callers
/// prioritise rarer / more salient terms (uniform weight ⇒ plain coverage).
///
/// Guarantees: with unit costs this is the standard greedy max-coverage with a
/// `1 − 1/e` approximation ratio. With costs it uses marginal-gain-per-cost,
/// the cost-effective greedy heuristic.
pub fn greedy_max_coverage<F>(items: &[CoverageItem], budget: usize, term_weight: F) -> Vec<usize>
where
F: Fn(&str) -> f64,
{
let mut covered: HashSet<&str> = HashSet::new();
let mut selected: Vec<usize> = Vec::new();
let mut remaining: Vec<usize> = (0..items.len()).collect();
let mut spent = 0usize;
while !remaining.is_empty() && spent < budget {
let mut best: Option<(usize, usize, f64)> = None; // (pos_in_remaining, item_idx, gain_per_cost)
for (pos, &idx) in remaining.iter().enumerate() {
let item = &items[idx];
if item.cost == 0 || spent + item.cost > budget {
continue;
}
// Marginal gain = weight of newly covered terms.
let gain: f64 = item
.terms
.iter()
.filter(|t| !covered.contains(t.as_str()))
.map(|t| term_weight(t))
.sum();
if gain <= 0.0 {
continue;
}
let gain_per_cost = gain / item.cost as f64;
let take = match best {
Some((_, _, best_gpc)) => gain_per_cost > best_gpc,
None => true,
};
if take {
best = Some((pos, idx, gain_per_cost));
}
}
match best {
Some((pos, idx, _)) => {
for t in &items[idx].terms {
covered.insert(t.as_str());
}
spent += items[idx].cost;
selected.push(idx);
remaining.swap_remove(pos);
}
// No remaining item adds positive marginal coverage within budget.
None => break,
}
}
selected
}
#[cfg(test)]
mod tests {
use super::*;
fn item(terms: &[&str], cost: usize) -> CoverageItem {
CoverageItem {
terms: terms.iter().map(|s| (*s).to_string()).collect(),
cost,
}
}
#[test]
fn picks_complementary_items_over_redundant_ones() {
// A and B together cover {x,y,z,w}; C is fully redundant with A.
let items = vec![
item(&["x", "y"], 1), // A
item(&["z", "w"], 1), // B
item(&["x", "y"], 1), // C (redundant with A)
];
let sel = greedy_max_coverage(&items, 2, |_| 1.0);
assert_eq!(sel.len(), 2);
assert!(sel.contains(&0) || sel.contains(&2)); // one of the {x,y} items
assert!(sel.contains(&1)); // the complementary {z,w} item is always taken
assert!(
!(sel.contains(&0) && sel.contains(&2)),
"must not pick both redundant items"
);
}
#[test]
fn respects_budget() {
let items = vec![item(&["a"], 1), item(&["b"], 1), item(&["c"], 1)];
let sel = greedy_max_coverage(&items, 2, |_| 1.0);
assert_eq!(sel.len(), 2);
}
#[test]
fn skips_items_that_add_no_new_coverage() {
let items = vec![item(&["a", "b"], 1), item(&["a"], 1)];
let sel = greedy_max_coverage(&items, 5, |_| 1.0);
assert_eq!(sel, vec![0], "second item is fully subsumed → skipped");
}
#[test]
fn term_weight_prioritises_rare_terms() {
// Item 0 covers a common term; item 1 covers a rare (high-weight) one.
let items = vec![item(&["common"], 1), item(&["rare"], 1)];
let weight = |t: &str| if t == "rare" { 10.0 } else { 1.0 };
let sel = greedy_max_coverage(&items, 1, weight);
assert_eq!(sel, vec![1], "rare/high-weight term wins the single slot");
}
#[test]
fn empty_input_yields_empty_selection() {
let sel = greedy_max_coverage(&[], 5, |_| 1.0);
assert!(sel.is_empty());
}
#[test]
fn zero_cost_items_are_skipped() {
let items = vec![item(&["a"], 0), item(&["b"], 1)];
let sel = greedy_max_coverage(&items, 5, |_| 1.0);
assert_eq!(sel, vec![1]);
}
}