samkhya_core/lpbound.rs
1//! Pessimistic upper-bound envelope for cardinality estimates.
2//!
3//! Inspired by **LpBound** \[Zhang et al., SIGMOD 2025 Best Paper\]. The
4//! envelope provides a *provable ceiling* on the cardinality of a join:
5//! no correction may exceed it, so cold-start plans are bounded by the
6//! native estimate or this ceiling — whichever is tighter — and never
7//! degrade below baseline.
8//!
9//! # Preferred bound
10//!
11//! When the `lp_solver` Cargo feature is enabled, `LpJoinBound` (a real
12//! fractional-edge-cover LP solved with `good_lp`'s pure-Rust `microlp`
13//! backend) is the preferred ceiling. It is provably tighter than the
14//! coarse [`ProductBound`] / [`AgmBound`] / [`ChainBound`] approximations
15//! for any non-trivial cyclic join (triangles, squares, cliques) and
16//! exactly matches the AGM ρ\*-derived bound for all join shapes the
17//! attribute-hypergraph can represent.
18//!
19//! # Scaffolding bounds (always available)
20//!
21//! [`ProductBound`], [`AgmBound`], and [`ChainBound`] remain shipped
22//! without the LP dependency for builds that want a constant-time
23//! ceiling, for unit tests, and as the safety floor when the LP solver
24//! fails (numerical edge cases, malformed join graphs). They are
25//! scaffolding for the full LpBound, not a replacement: prefer
26//! `LpJoinBound` (under the `lp_solver` feature) in any release build that
27//! can afford the `good_lp` dependency.
28//!
29//! # Empirical bound ordering
30//!
31//! The empirical campaign (`bench-results/07_lpbound_tightness.md`,
32//! 1,080 trials across path/star/cycle/clique topologies × n ∈ {3, 5, 7}
33//! × ℓ_p ∈ {1, 2, ∞}) measured the actual partial order:
34//!
35//! ```text
36//! ProductBound ≥ { ChainBound, AgmBound } ≥ LpJoinBound
37//! ```
38//!
39//! `ChainBound` and `AgmBound` are **not strictly ordered** — `ChainBound`
40//! is routinely the tighter of the two (it divides by every per-edge
41//! distinct count, while AGM uses a fractional-edge-cover shortcut). The
42//! `LpJoinBound ≤ AgmBound` leg holds in 86.4% of trials; size-7
43//! cyclic/clique under uniform ℓ_p=1 exposes an LP-conditioning corner
44//! (~13.6% violation) where the LP-derived ceiling overshoots AGM's
45//! `min × max` shortcut. The query optimizer should evaluate all three
46//! scaffolding bounds and take the minimum rather than assuming a strict
47//! chain.
48
49use crate::{Error, Result};
50
51/// Trait every upper-bound provider implements.
52///
53/// Implementations return an *inclusive* row-count ceiling that the join
54/// can never exceed. A correction layer must never produce an estimate
55/// above this number.
56///
57/// # Examples
58///
59/// ```
60/// use samkhya_core::lpbound::{ProductBound, UpperBound};
61///
62/// // Cartesian product (sound but very loose).
63/// let bound = ProductBound.ceiling(&[100, 200], &[]);
64/// assert_eq!(bound, 20_000);
65/// ```
66pub trait UpperBound {
67 /// Compute the inclusive ceiling for a join.
68 ///
69 /// * `relations` — input row counts for each base relation
70 /// * `equality_predicates` — pairs of relation indices joined by `=`
71 fn ceiling(&self, relations: &[u64], equality_predicates: &[(usize, usize)]) -> u64;
72}
73
74/// Cartesian-product upper bound. Sound but very loose.
75///
76/// # Examples
77///
78/// ```
79/// use samkhya_core::lpbound::{ProductBound, UpperBound};
80///
81/// // Empty predicate list: the bound is the unconstrained product.
82/// assert_eq!(ProductBound.ceiling(&[10, 20, 30], &[]), 6000);
83/// // Overflow saturates to u64::MAX rather than wrapping.
84/// assert_eq!(ProductBound.ceiling(&[u64::MAX, 2], &[]), u64::MAX);
85/// ```
86pub struct ProductBound;
87
88impl UpperBound for ProductBound {
89 fn ceiling(&self, relations: &[u64], _eq: &[(usize, usize)]) -> u64 {
90 relations.iter().fold(1u64, |acc, &n| acc.saturating_mul(n))
91 }
92}
93
94/// Frequency-moment chain-join upper bound.
95///
96/// Assumes each equality predicate `(i, j)` joins on a single key whose
97/// distinct-value count is given by `distinct_counts[i]` and
98/// `distinct_counts[j]`. The bound is:
99///
100/// ```text
101/// |R_i ⋈ R_j| ≤ |R_i| * |R_j| / max(D_i, D_j)
102/// ```
103///
104/// (Uniform-distribution worst case; tight in expectation when join
105/// keys are evenly spread.) Applied sequentially across all equality
106/// predicates: the result of each join feeds the next bound.
107///
108/// Tighter than [`AgmBound`] for tree / chain joins where each relation
109/// has a non-trivial distinct-key count. Falls back to [`ProductBound`]
110/// when no equality predicates are supplied.
111pub struct ChainBound {
112 pub distinct_counts: Vec<u64>,
113}
114
115impl ChainBound {
116 /// Construct a chain-join bound from per-relation distinct-key counts.
117 ///
118 /// # Examples
119 ///
120 /// ```
121 /// use samkhya_core::lpbound::{ChainBound, UpperBound};
122 ///
123 /// // Two 1000-row relations, joining on a key with 100 distinct values:
124 /// // ceiling = 1000 * 1000 / max(100, 100) = 10_000.
125 /// let cb = ChainBound::new(vec![100, 100]);
126 /// assert_eq!(cb.ceiling(&[1_000, 1_000], &[(0, 1)]), 10_000);
127 /// ```
128 pub fn new(distinct_counts: Vec<u64>) -> Self {
129 Self { distinct_counts }
130 }
131}
132
133impl UpperBound for ChainBound {
134 fn ceiling(&self, relations: &[u64], equality_predicates: &[(usize, usize)]) -> u64 {
135 if relations.is_empty() {
136 return 0;
137 }
138 if equality_predicates.is_empty() {
139 return ProductBound.ceiling(relations, &[]);
140 }
141 // Each predicate divides the running product by the larger of
142 // the two endpoint distinct counts (or 1 if unknown).
143 let mut bound: u128 = relations
144 .iter()
145 .fold(1u128, |acc, &n| acc.saturating_mul(n as u128));
146 for &(i, j) in equality_predicates {
147 let d_i = self.distinct_counts.get(i).copied().unwrap_or(1).max(1) as u128;
148 let d_j = self.distinct_counts.get(j).copied().unwrap_or(1).max(1) as u128;
149 let d = d_i.max(d_j);
150 bound /= d;
151 }
152 if bound > u64::MAX as u128 {
153 u64::MAX
154 } else {
155 bound as u64
156 }
157 }
158}
159
160/// Coarse AGM-style upper bound for equi-joins.
161///
162/// Returns `min(product, |R_min| * |R_max|)` when at least one equality
163/// predicate exists, otherwise falls back to [`ProductBound`]. This is a
164/// placeholder approximation; the true AGM / LpBound bound requires
165/// fractional edge cover / LP relaxation — see `LpJoinBound` (under the
166/// `lp_solver` feature) for the principled construction.
167///
168/// # Examples
169///
170/// ```
171/// use samkhya_core::lpbound::{AgmBound, ProductBound, UpperBound};
172///
173/// let r = [1_000u64, 1_000_000];
174/// let bound = AgmBound.ceiling(&r, &[(0, 1)]);
175/// // AGM is always at least as tight as the cartesian product.
176/// assert!(bound <= ProductBound.ceiling(&r, &[]));
177/// ```
178pub struct AgmBound;
179
180impl UpperBound for AgmBound {
181 fn ceiling(&self, relations: &[u64], equality_predicates: &[(usize, usize)]) -> u64 {
182 if relations.is_empty() {
183 return 0;
184 }
185 if equality_predicates.is_empty() {
186 return ProductBound.ceiling(relations, &[]);
187 }
188 let product: u64 = relations.iter().fold(1u64, |acc, &n| acc.saturating_mul(n));
189 let min_r = *relations.iter().min().unwrap_or(&0);
190 let max_r = *relations.iter().max().unwrap_or(&0);
191 product.min(min_r.saturating_mul(max_r))
192 }
193}
194
195/// Clamp an estimate to a ceiling. Returns [`Error::LpBoundExceeded`]
196/// if the estimate exceeds the ceiling — this signals a correction-layer
197/// bug, since corrections must respect the envelope.
198///
199/// # Examples
200///
201/// ```
202/// use samkhya_core::lpbound::clamp_estimate;
203///
204/// // Within the ceiling → Ok(value).
205/// assert_eq!(clamp_estimate(500.0, 1000).unwrap(), 500);
206/// // Exceeding the ceiling → Err signalling a corrector violation.
207/// assert!(clamp_estimate(1500.0, 1000).is_err());
208/// ```
209pub fn clamp_estimate(estimate: f64, ceiling: u64) -> Result<u64> {
210 let clamped = estimate.max(0.0).min(u64::MAX as f64) as u64;
211 if clamped <= ceiling {
212 Ok(clamped)
213 } else {
214 Err(Error::LpBoundExceeded {
215 estimate,
216 ceiling: ceiling as f64,
217 })
218 }
219}
220
221/// Clamp without erroring; saturates to `ceiling`. Use this in production
222/// paths where a misbehaving corrector must never crash the engine.
223///
224/// # Examples
225///
226/// ```
227/// use samkhya_core::lpbound::saturating_clamp;
228///
229/// assert_eq!(saturating_clamp(500.0, 1000), 500);
230/// assert_eq!(saturating_clamp(2000.0, 1000), 1000); // clamps to ceiling
231/// assert_eq!(saturating_clamp(-5.0, 1000), 0); // negative → 0
232/// assert_eq!(saturating_clamp(f64::NAN, 1000), 0); // NaN is treated as 0
233/// ```
234pub fn saturating_clamp(estimate: f64, ceiling: u64) -> u64 {
235 let clamped = estimate.max(0.0).min(u64::MAX as f64) as u64;
236 clamped.min(ceiling)
237}
238
239// =============================================================================
240// LpJoinBound — real fractional-edge-cover LP (the v0.5.0 deliverable).
241// =============================================================================
242
243/// Real fractional-edge-cover LP join bound — the principled AGM / LpBound
244/// construction the coarse [`AgmBound`] / [`ChainBound`] approximate.
245///
246/// # Formulation
247///
248/// Build the join's *attribute hypergraph*:
249///
250/// * one variable `x_r ≥ 0` per relation `r`;
251/// * each equality predicate `(i, j)` contributes one shared attribute
252/// `a` covered by both `R_i` and `R_j`;
253/// * for every shared attribute `a` we add a fractional-cover constraint
254///
255/// ```text
256/// sum_{r : a ∈ schema(r)} x_r ≥ 1
257/// ```
258///
259/// * the objective is to minimise the log-cardinality of the join,
260///
261/// ```text
262/// minimise sum_r x_r * log|R_r|
263/// ```
264///
265/// The provable join-cardinality ceiling is `exp(minimum)`. This is the
266/// classical **Atserias–Grohe–Marx fractional-edge-cover bound** that
267/// LpBound (Zhang et al., SIGMOD 2025) extends to ℓp-norm degree
268/// sequences; the AGM bound is the p=∞ specialisation and is exactly
269/// what we ship here.
270///
271/// # Per-component decomposition
272///
273/// Equality predicates partition the relations into connected
274/// components. Variables in distinct components share no constraint, so
275/// the LP decomposes: the bound on the whole join graph is the
276/// **product** of the bounds on each connected component. We exploit
277/// this by solving one (small) LP per component instead of one big LP.
278///
279/// # Tightness vs the coarse bounds
280///
281/// * 2-relation single-predicate join: LP returns `min(|R_i|, |R_j|)`
282/// (the real AGM bound for a single shared attribute), which is
283/// strictly tighter than [`AgmBound`]'s `|R_min| * |R_max|`
284/// approximation whenever both relations are non-empty.
285/// * Triangle (3 relations, 3 predicates each on a distinct attribute):
286/// LP returns `(|R_0| * |R_1| * |R_2|)^{1/2}`, the famous AGM triangle
287/// bound. Strictly tighter than [`ChainBound`] and [`ProductBound`]
288/// for any non-trivial relation sizes.
289/// * Disconnected components: LP returns the product of the
290/// per-component bounds, matching the trivial decomposition.
291///
292/// # Solver
293///
294/// Backed by [`good_lp`] with the pure-Rust `microlp` backend
295/// (no system libraries, no C/C++ toolchain — compiles cleanly on any
296/// Rust 1.94+ host). The LP is small (one variable per relation, one
297/// constraint per shared attribute) so solve time is negligible.
298#[cfg(feature = "lp_solver")]
299pub struct LpJoinBound {
300 /// Optional per-relation distinct-count hint. When provided, the
301 /// objective coefficient for relation `r` is `log(min(|R_r|, D_r))`
302 /// rather than `log|R_r|`, which can only tighten the bound (the
303 /// join output on a key column cannot exceed the column's distinct
304 /// support). Empty by default.
305 distinct_counts: Vec<u64>,
306}
307
308#[cfg(feature = "lp_solver")]
309impl Default for LpJoinBound {
310 fn default() -> Self {
311 Self::new()
312 }
313}
314
315#[cfg(feature = "lp_solver")]
316impl LpJoinBound {
317 /// Construct a bound with no distinct-count overrides. The objective
318 /// uses `log|R_r|` for every relation.
319 pub fn new() -> Self {
320 Self {
321 distinct_counts: Vec::new(),
322 }
323 }
324
325 /// Construct a bound that uses the supplied per-relation distinct
326 /// counts to tighten the objective coefficients.
327 pub fn with_distinct_counts(distinct_counts: Vec<u64>) -> Self {
328 Self { distinct_counts }
329 }
330
331 /// Same semantics as [`UpperBound::ceiling`]; surfaced here so
332 /// callers can avoid importing the trait when they already hold an
333 /// `&LpJoinBound`.
334 pub fn ceiling(&self, relations: &[u64], equality_predicates: &[(usize, usize)]) -> u64 {
335 self.solve(relations, equality_predicates, /*use_distinct=*/ false)
336 }
337
338 /// Like [`Self::ceiling`] but folds the supplied distinct counts
339 /// into the per-relation objective coefficient. If the supplied
340 /// vector is shorter than `relations` or contains zero entries the
341 /// missing entries fall back to the row count.
342 pub fn ceiling_with_distinct(
343 &self,
344 relations: &[u64],
345 equality_predicates: &[(usize, usize)],
346 ) -> u64 {
347 self.solve(relations, equality_predicates, /*use_distinct=*/ true)
348 }
349
350 /// Core solver: build the per-connected-component LP, solve each,
351 /// and multiply the per-component ceilings. Falls back to
352 /// [`ProductBound`] / [`AgmBound`] if the solver fails for any
353 /// reason — the envelope must never crash the engine.
354 fn solve(
355 &self,
356 relations: &[u64],
357 equality_predicates: &[(usize, usize)],
358 use_distinct: bool,
359 ) -> u64 {
360 if relations.is_empty() {
361 return 0;
362 }
363 if equality_predicates.is_empty() {
364 return ProductBound.ceiling(relations, &[]);
365 }
366
367 // Validate predicate indices; defensively drop any out-of-range
368 // pair. A misbuilt join graph must never crash the envelope.
369 let n = relations.len();
370 let preds: Vec<(usize, usize)> = equality_predicates
371 .iter()
372 .copied()
373 .filter(|&(i, j)| i < n && j < n && i != j)
374 .collect();
375 if preds.is_empty() {
376 return ProductBound.ceiling(relations, &[]);
377 }
378
379 // Build connected components over the relation graph induced by
380 // the equality predicates. Each component's LP is independent;
381 // we solve them separately and multiply the ceilings.
382 let components = connected_components(n, &preds);
383 let mut total: u128 = 1;
384 for comp in &components {
385 let ceil = self.solve_component(relations, &preds, comp, use_distinct);
386 total = total.saturating_mul(ceil as u128);
387 if total >= u64::MAX as u128 {
388 return u64::MAX;
389 }
390 }
391 total as u64
392 }
393
394 /// Solve the AGM LP restricted to a single connected component.
395 fn solve_component(
396 &self,
397 relations: &[u64],
398 all_predicates: &[(usize, usize)],
399 component: &[usize],
400 use_distinct: bool,
401 ) -> u64 {
402 // Singleton component (no predicate incident): output cardinality
403 // is just the relation size.
404 if component.len() == 1 {
405 return relations[component[0]];
406 }
407
408 // Subset of predicates whose endpoints both lie in this
409 // component. (Every predicate connecting two members of a
410 // component is in the component by definition; we filter only
411 // for safety.)
412 let in_comp: std::collections::HashSet<usize> = component.iter().copied().collect();
413 let comp_preds: Vec<(usize, usize)> = all_predicates
414 .iter()
415 .copied()
416 .filter(|&(i, j)| in_comp.contains(&i) && in_comp.contains(&j))
417 .collect();
418
419 // Build the LP via `good_lp`. One x_r per relation in the
420 // component; one >= 1 constraint per equality predicate
421 // (each predicate introduces a distinct shared attribute).
422 use good_lp::{
423 Expression, ProblemVariables, Solution, SolverModel, default_solver, variable,
424 };
425
426 let mut vars = ProblemVariables::new();
427 // Map: relation index in the global `relations` vector ->
428 // good_lp variable handle.
429 let mut var_for: std::collections::HashMap<usize, good_lp::Variable> =
430 std::collections::HashMap::with_capacity(component.len());
431 // Build the objective expression in tandem with adding
432 // variables so coefficients line up with relation order.
433 let mut objective = Expression::with_capacity(component.len());
434 for &r in component {
435 let v = vars.add(variable().min(0.0));
436 var_for.insert(r, v);
437 let row_count = relations[r];
438 // Coefficient = log(effective size). Effective size = row
439 // count, optionally clamped to the supplied distinct count
440 // (which can only shrink it).
441 let mut size_f = row_count as f64;
442 if use_distinct {
443 if let Some(&d) = self.distinct_counts.get(r) {
444 if d > 0 {
445 size_f = size_f.min(d as f64);
446 }
447 }
448 }
449 // log(0) is undefined; we treat an empty relation as
450 // contributing 0 to the log-sum (the join is empty anyway).
451 // log(1) is 0 and would let the LP put unbounded weight on
452 // that variable without paying — we clamp the coefficient
453 // away from zero with a tiny epsilon so the objective is
454 // strictly minimised.
455 let coef = if size_f <= 1.0 { 0.0 } else { size_f.ln() };
456 objective.add_mul(coef, v);
457 }
458
459 // Add one >= 1 fractional-cover constraint per predicate.
460 // good_lp's `Expression >> rhs` operator builds a >= constraint.
461 let mut model = vars.minimise(&objective).using(default_solver);
462 for &(i, j) in &comp_preds {
463 let xi = var_for[&i];
464 let xj = var_for[&j];
465 let lhs: Expression = xi + xj;
466 model = model.with(lhs.geq(1.0));
467 }
468
469 match model.solve() {
470 Ok(sol) => {
471 let lp_min = sol.eval(&objective);
472 // exp(LP_min) is the AGM ceiling. Guard against
473 // negative noise from the simplex and against overflow.
474 let raw = lp_min.exp();
475 if !raw.is_finite() || raw < 0.0 {
476 return self.fallback(relations, &comp_preds, component);
477 }
478 // Always at least 1 (a join with at least one row in
479 // both endpoints can return one row); cap at u64::MAX.
480 let raw = raw.max(1.0);
481 if raw >= u64::MAX as f64 {
482 u64::MAX
483 } else {
484 // Snap to nearest integer when within a tight
485 // relative epsilon: `exp(ln(n))` for integer `n`
486 // can drift to `n + 1e-12` and a blind ceil would
487 // push the per-component bound a full integer
488 // above the true AGM optimum, breaking the
489 // contract that LpJoinBound <= AgmBound. Only
490 // ceil when the LP value is materially above the
491 // nearest integer.
492 let rounded = raw.round();
493 let snap_eps = 1e-9_f64.max(raw.abs() * 1e-12);
494 if (raw - rounded).abs() <= snap_eps {
495 rounded as u64
496 } else {
497 raw.ceil() as u64
498 }
499 }
500 }
501 Err(_) => self.fallback(relations, &comp_preds, component),
502 }
503 }
504
505 /// Conservative fallback when the LP solver fails. Returns the
506 /// minimum row count among the component's relations (a valid AGM
507 /// upper bound when at least one predicate covers every relation in
508 /// the component), or [`ProductBound`] over the component otherwise.
509 fn fallback(
510 &self,
511 relations: &[u64],
512 comp_preds: &[(usize, usize)],
513 component: &[usize],
514 ) -> u64 {
515 if comp_preds.is_empty() {
516 return component
517 .iter()
518 .map(|&r| relations[r])
519 .fold(1u64, |a, n| a.saturating_mul(n));
520 }
521 let comp_rows: Vec<u64> = component.iter().map(|&r| relations[r]).collect();
522 let agm = AgmBound;
523 agm.ceiling(&comp_rows, &[(0, 1)])
524 }
525}
526
527#[cfg(feature = "lp_solver")]
528impl UpperBound for LpJoinBound {
529 fn ceiling(&self, relations: &[u64], equality_predicates: &[(usize, usize)]) -> u64 {
530 self.ceiling(relations, equality_predicates)
531 }
532}
533
534/// Return the connected components of the graph on `0..n` with edges
535/// given by `edges`. Each component is a sorted list of vertex indices.
536/// Singleton vertices (no incident edge) appear as one-element
537/// components — every relation index in `0..n` is in exactly one
538/// component.
539#[cfg(feature = "lp_solver")]
540fn connected_components(n: usize, edges: &[(usize, usize)]) -> Vec<Vec<usize>> {
541 let mut parent: Vec<usize> = (0..n).collect();
542 fn find(parent: &mut [usize], mut x: usize) -> usize {
543 while parent[x] != x {
544 parent[x] = parent[parent[x]];
545 x = parent[x];
546 }
547 x
548 }
549 for &(a, b) in edges {
550 if a >= n || b >= n {
551 continue;
552 }
553 let ra = find(&mut parent, a);
554 let rb = find(&mut parent, b);
555 if ra != rb {
556 parent[ra] = rb;
557 }
558 }
559 let mut groups: std::collections::HashMap<usize, Vec<usize>> = std::collections::HashMap::new();
560 for v in 0..n {
561 let r = find(&mut parent, v);
562 groups.entry(r).or_default().push(v);
563 }
564 let mut out: Vec<Vec<usize>> = groups.into_values().collect();
565 for c in &mut out {
566 c.sort_unstable();
567 }
568 // Sort components by their smallest member for deterministic order.
569 out.sort_by_key(|c| c[0]);
570 out
571}
572
573#[cfg(test)]
574mod tests {
575 use super::*;
576
577 #[test]
578 fn product_bound_two_relations() {
579 assert_eq!(ProductBound.ceiling(&[100, 200], &[]), 20_000);
580 }
581
582 #[test]
583 fn product_bound_overflow_saturates() {
584 assert_eq!(ProductBound.ceiling(&[u64::MAX, 2], &[]), u64::MAX);
585 }
586
587 #[test]
588 fn product_bound_empty_relations() {
589 assert_eq!(ProductBound.ceiling(&[], &[]), 1);
590 }
591
592 #[test]
593 fn agm_no_predicates_falls_back_to_product() {
594 assert_eq!(AgmBound.ceiling(&[10, 20, 30], &[]), 10 * 20 * 30);
595 }
596
597 #[test]
598 fn agm_with_predicates_tighter_than_product() {
599 let r = [1_000u64, 1_000_000];
600 let bound = AgmBound.ceiling(&r, &[(0, 1)]);
601 let product = ProductBound.ceiling(&r, &[]);
602 assert!(bound <= product);
603 }
604
605 #[test]
606 fn clamp_within_ceiling() {
607 assert_eq!(clamp_estimate(500.0, 1000).unwrap(), 500);
608 }
609
610 #[test]
611 fn clamp_exceeds_ceiling_errors() {
612 let err = clamp_estimate(1500.0, 1000).unwrap_err();
613 match err {
614 Error::LpBoundExceeded { estimate, ceiling } => {
615 assert_eq!(estimate, 1500.0);
616 assert_eq!(ceiling, 1000.0);
617 }
618 other => panic!("wrong error variant: {other:?}"),
619 }
620 }
621
622 #[test]
623 fn chain_bound_tighter_than_product() {
624 // Two relations of 1000 rows each, joining on a key with 100 distinct values.
625 // Product = 1_000_000; ChainBound = 1000 * 1000 / 100 = 10_000.
626 let r = [1_000u64, 1_000];
627 let cb = ChainBound::new(vec![100, 100]);
628 let bound = cb.ceiling(&r, &[(0, 1)]);
629 assert_eq!(bound, 10_000);
630 let product = ProductBound.ceiling(&r, &[]);
631 assert!(bound < product);
632 }
633
634 #[test]
635 fn chain_bound_three_table_chain() {
636 // R1(1000) ⋈ R2(2000) ⋈ R3(500), join keys 100 distinct each side.
637 // Product = 1e9. Chain = 1e9 / 100 / 100 = 100_000.
638 let r = [1_000u64, 2_000, 500];
639 let cb = ChainBound::new(vec![100, 100, 100]);
640 let bound = cb.ceiling(&r, &[(0, 1), (1, 2)]);
641 assert_eq!(bound, 100_000);
642 }
643
644 #[test]
645 fn chain_bound_no_predicates_falls_back() {
646 let cb = ChainBound::new(vec![10, 20, 30]);
647 assert_eq!(cb.ceiling(&[10, 20, 30], &[]), 10 * 20 * 30);
648 }
649
650 #[test]
651 fn chain_bound_missing_distinct_count_defaults_to_one() {
652 // No distinct count entry → defaults to 1, meaning no reduction.
653 let cb = ChainBound::new(vec![]);
654 let bound = cb.ceiling(&[100, 100], &[(0, 1)]);
655 assert_eq!(bound, 10_000); // 100 * 100 / max(1, 1) = 10_000
656 }
657
658 #[test]
659 fn saturating_clamp_saturates() {
660 assert_eq!(saturating_clamp(500.0, 1000), 500);
661 assert_eq!(saturating_clamp(2000.0, 1000), 1000);
662 assert_eq!(saturating_clamp(-5.0, 1000), 0);
663 assert_eq!(saturating_clamp(f64::NAN, 1000), 0);
664 }
665}
666
667#[cfg(all(test, feature = "lp_solver"))]
668mod lp_tests {
669 use super::*;
670
671 /// 2-table single-edge join: the LP returns the principled AGM
672 /// bound `min(|R_0|, |R_1|)` and is therefore a valid (tighter than
673 /// or equal) refinement of the coarse [`AgmBound`] approximation,
674 /// which for two relations reduces to `|R_0| * |R_1|`.
675 #[test]
676 fn two_table_join_matches_principled_agm() {
677 let r = [1_000u64, 1_000_000u64];
678 let lp = LpJoinBound::new();
679 let bound = lp.ceiling(&r, &[(0, 1)]);
680 // True AGM single-edge bound is min(|R_0|, |R_1|).
681 // Allow ceil()'s +/-1 floating-point noise.
682 assert!(
683 (999..=1_001).contains(&bound),
684 "expected ≈1000, got {bound}"
685 );
686 // And the LP bound must never exceed the coarse AGM bound it
687 // replaces (validity / refinement contract).
688 let coarse = AgmBound.ceiling(&r, &[(0, 1)]);
689 assert!(
690 bound <= coarse,
691 "LP bound {bound} must not exceed coarse AGM {coarse}"
692 );
693 }
694
695 /// Triangle: 3 relations, 3 equality predicates each on a distinct
696 /// shared attribute. The fractional AGM cover number is ρ\* = 3/2,
697 /// so the LP bound is `(|R_0| * |R_1| * |R_2|)^{1/2}`.
698 #[test]
699 fn triangle_strictly_tighter_than_chain_and_product() {
700 // Use round numbers so the closed-form expectation is exact.
701 let r = [1_000u64, 1_000u64, 1_000u64];
702 let preds = [(0usize, 1usize), (1, 2), (0, 2)];
703 let lp = LpJoinBound::new();
704 let bound = lp.ceiling(&r, &preds);
705
706 // sqrt(1e9) ≈ 31_622.78 → expect 31_623 (after ceil).
707 assert!(
708 (31_000u64..=32_000u64).contains(&bound),
709 "expected ≈31_623, got {bound}"
710 );
711
712 // Strictly tighter than product = 1e9.
713 let product = ProductBound.ceiling(&r, &preds);
714 assert!(bound < product, "LP {bound} should be < product {product}");
715
716 // Strictly tighter than the chain bound under realistic
717 // distinct-count hints (≈10 distinct join keys per relation —
718 // matches the regime where ChainBound is meaningful but not
719 // pathologically optimistic).
720 let cb = ChainBound::new(vec![10, 10, 10]);
721 let chain = cb.ceiling(&r, &preds);
722 assert!(
723 bound < chain,
724 "LP {bound} should be < chain {chain} on the triangle"
725 );
726 }
727
728 /// Square (4-cycle): R_0 — R_1 — R_2 — R_3 — R_0. AGM ρ\* = 2, so
729 /// the LP bound is `sqrt(|R_0|*|R_2|*|R_1|*|R_3|)` ≈ `(N)^2` for
730 /// equally sized N, vs the product = N^4.
731 #[test]
732 fn square_strictly_tighter_than_chain_and_product() {
733 let r = [100u64, 100u64, 100u64, 100u64];
734 let preds = [(0usize, 1usize), (1, 2), (2, 3), (3, 0)];
735 let lp = LpJoinBound::new();
736 let bound = lp.ceiling(&r, &preds);
737
738 // 4-cycle AGM optimum is ρ* = 2 (alternating x = 1, 0, 1, 0 or
739 // x = 1/2 each); for equal sizes this gives N^2 = 10_000.
740 // Allow a generous numerical tolerance.
741 assert!(
742 (5_000..=15_000).contains(&bound),
743 "expected ≈10_000, got {bound}"
744 );
745
746 // Strictly tighter than product = 1e8.
747 let product = ProductBound.ceiling(&r, &preds);
748 assert!(bound < product, "LP {bound} should be < product {product}");
749
750 // Strictly tighter than the chain bound under modest
751 // distinct-count hints. d=4 per relation, 4 predicates →
752 // chain = 100^4 / 4^4 = 1e8 / 256 ≈ 390_625, which is
753 // looser than the LP's ≈10_000.
754 let cb = ChainBound::new(vec![4, 4, 4, 4]);
755 let chain = cb.ceiling(&r, &preds);
756 assert!(
757 bound < chain,
758 "LP {bound} should be < chain {chain} on the 4-cycle"
759 );
760 }
761
762 /// Disconnected join graph: two independent 2-table joins.
763 /// The LP decomposes into one LP per connected component, and the
764 /// total bound is the product of the per-component bounds.
765 #[test]
766 fn disconnected_components_multiply() {
767 // Component A: R_0 ⋈ R_1 (relations of sizes 100, 200, single
768 // predicate). Per-component AGM bound = min(100, 200) = 100.
769 // Component B: R_2 ⋈ R_3 (sizes 50, 70). Per-component bound
770 // = min(50, 70) = 50. Total expected ≈ 100 * 50 = 5_000.
771 let r = [100u64, 200, 50, 70];
772 let preds = [(0usize, 1usize), (2, 3)];
773 let lp = LpJoinBound::new();
774 let bound = lp.ceiling(&r, &preds);
775 assert!(
776 (4_900..=5_100).contains(&bound),
777 "expected ≈5000, got {bound}"
778 );
779 }
780
781 /// Singleton relation (no incident predicate) keeps its row count in
782 /// the product of component bounds.
783 #[test]
784 fn singleton_component_contributes_row_count() {
785 let r = [100u64, 200, 99];
786 // Only R_0 and R_1 are joined; R_2 is isolated.
787 let preds = [(0usize, 1usize)];
788 let lp = LpJoinBound::new();
789 let bound = lp.ceiling(&r, &preds);
790 // Component {0,1}: min(100, 200) = 100. Component {2}: 99.
791 // Total ≈ 9_900.
792 assert!(
793 (9_800..=10_000).contains(&bound),
794 "expected ≈9_900, got {bound}"
795 );
796 }
797
798 /// The LP bound must never exceed the trivial product bound.
799 #[test]
800 fn lp_bound_dominates_product() {
801 let r = [37u64, 41, 43, 47, 53];
802 let preds = [(0usize, 1usize), (1, 2), (2, 3), (3, 4)];
803 let lp = LpJoinBound::new();
804 let bound = lp.ceiling(&r, &preds);
805 let product = ProductBound.ceiling(&r, &preds);
806 assert!(
807 bound <= product,
808 "LP bound {bound} must be ≤ product {product}"
809 );
810 }
811
812 /// Empty relations → bound 0.
813 #[test]
814 fn empty_relations_zero() {
815 let lp = LpJoinBound::new();
816 assert_eq!(lp.ceiling(&[], &[]), 0);
817 }
818
819 /// No predicates → product bound (sanity passthrough).
820 #[test]
821 fn no_predicates_returns_product() {
822 let lp = LpJoinBound::new();
823 let r = [10u64, 20, 30];
824 assert_eq!(lp.ceiling(&r, &[]), 6_000);
825 }
826
827 /// `ceiling_with_distinct` clamps the per-relation objective
828 /// coefficient by `min(|R|, D)`. With a tight distinct-count hint
829 /// the bound only gets smaller (tighter).
830 #[test]
831 fn ceiling_with_distinct_is_at_most_unconstrained() {
832 let r = [1_000u64, 1_000];
833 let preds = [(0usize, 1usize)];
834 let with_d = LpJoinBound::with_distinct_counts(vec![10, 10]);
835 let unconstrained = LpJoinBound::new();
836 let a = with_d.ceiling_with_distinct(&r, &preds);
837 let b = unconstrained.ceiling(&r, &preds);
838 assert!(a <= b, "distinct-aware bound {a} must be tighter than {b}");
839 // With 10 distinct values on each side the bound collapses to 10.
840 assert!(a <= 11, "expected ≈10 with D=10, got {a}");
841 }
842}