samkhya_core/
lpbound.rs

1//! Pessimistic upper-bound envelope for cardinality estimates.
2//!
3//! Inspired by **LpBound** \[Zhang et al., SIGMOD 2025 Best Paper\]. The
4//! envelope provides a *provable ceiling* on the cardinality of a join:
5//! no correction may exceed it, so cold-start plans are bounded by the
6//! native estimate or this ceiling — whichever is tighter — and never
7//! degrade below baseline.
8//!
9//! # Preferred bound
10//!
11//! When the `lp_solver` Cargo feature is enabled, `LpJoinBound` (a real
12//! fractional-edge-cover LP solved with `good_lp`'s pure-Rust `microlp`
13//! backend) is the preferred ceiling. It is provably tighter than the
14//! coarse [`ProductBound`] / [`AgmBound`] / [`ChainBound`] approximations
15//! for any non-trivial cyclic join (triangles, squares, cliques) and
16//! exactly matches the AGM ρ\*-derived bound for all join shapes the
17//! attribute-hypergraph can represent.
18//!
19//! # Scaffolding bounds (always available)
20//!
21//! [`ProductBound`], [`AgmBound`], and [`ChainBound`] remain shipped
22//! without the LP dependency for builds that want a constant-time
23//! ceiling, for unit tests, and as the safety floor when the LP solver
24//! fails (numerical edge cases, malformed join graphs). They are
25//! scaffolding for the full LpBound, not a replacement: prefer
26//! `LpJoinBound` (under the `lp_solver` feature) in any release build that
27//! can afford the `good_lp` dependency.
28//!
29//! # Empirical bound ordering
30//!
31//! The empirical campaign (`bench-results/07_lpbound_tightness.md`,
32//! 1,080 trials across path/star/cycle/clique topologies × n ∈ {3, 5, 7}
33//! × ℓ_p ∈ {1, 2, ∞}) measured the actual partial order:
34//!
35//! ```text
36//!   ProductBound  ≥  { ChainBound,  AgmBound }  ≥  LpJoinBound
37//! ```
38//!
39//! `ChainBound` and `AgmBound` are **not strictly ordered** — `ChainBound`
40//! is routinely the tighter of the two (it divides by every per-edge
41//! distinct count, while AGM uses a fractional-edge-cover shortcut). The
42//! `LpJoinBound ≤ AgmBound` leg holds in 86.4% of trials; size-7
43//! cyclic/clique under uniform ℓ_p=1 exposes an LP-conditioning corner
44//! (~13.6% violation) where the LP-derived ceiling overshoots AGM's
45//! `min × max` shortcut. The query optimizer should evaluate all three
46//! scaffolding bounds and take the minimum rather than assuming a strict
47//! chain.
48
49use crate::{Error, Result};
50
51/// Trait every upper-bound provider implements.
52///
53/// Implementations return an *inclusive* row-count ceiling that the join
54/// can never exceed. A correction layer must never produce an estimate
55/// above this number.
56///
57/// # Examples
58///
59/// ```
60/// use samkhya_core::lpbound::{ProductBound, UpperBound};
61///
62/// // Cartesian product (sound but very loose).
63/// let bound = ProductBound.ceiling(&[100, 200], &[]);
64/// assert_eq!(bound, 20_000);
65/// ```
66pub trait UpperBound {
67    /// Compute the inclusive ceiling for a join.
68    ///
69    /// * `relations`           — input row counts for each base relation
70    /// * `equality_predicates` — pairs of relation indices joined by `=`
71    fn ceiling(&self, relations: &[u64], equality_predicates: &[(usize, usize)]) -> u64;
72}
73
74/// Cartesian-product upper bound. Sound but very loose.
75///
76/// # Examples
77///
78/// ```
79/// use samkhya_core::lpbound::{ProductBound, UpperBound};
80///
81/// // Empty predicate list: the bound is the unconstrained product.
82/// assert_eq!(ProductBound.ceiling(&[10, 20, 30], &[]), 6000);
83/// // Overflow saturates to u64::MAX rather than wrapping.
84/// assert_eq!(ProductBound.ceiling(&[u64::MAX, 2], &[]), u64::MAX);
85/// ```
86pub struct ProductBound;
87
88impl UpperBound for ProductBound {
89    fn ceiling(&self, relations: &[u64], _eq: &[(usize, usize)]) -> u64 {
90        relations.iter().fold(1u64, |acc, &n| acc.saturating_mul(n))
91    }
92}
93
94/// Frequency-moment chain-join upper bound.
95///
96/// Assumes each equality predicate `(i, j)` joins on a single key whose
97/// distinct-value count is given by `distinct_counts[i]` and
98/// `distinct_counts[j]`. The bound is:
99///
100/// ```text
101/// |R_i ⋈ R_j| ≤ |R_i| * |R_j| / max(D_i, D_j)
102/// ```
103///
104/// (Uniform-distribution worst case; tight in expectation when join
105/// keys are evenly spread.) Applied sequentially across all equality
106/// predicates: the result of each join feeds the next bound.
107///
108/// Tighter than [`AgmBound`] for tree / chain joins where each relation
109/// has a non-trivial distinct-key count. Falls back to [`ProductBound`]
110/// when no equality predicates are supplied.
111pub struct ChainBound {
112    pub distinct_counts: Vec<u64>,
113}
114
115impl ChainBound {
116    /// Construct a chain-join bound from per-relation distinct-key counts.
117    ///
118    /// # Examples
119    ///
120    /// ```
121    /// use samkhya_core::lpbound::{ChainBound, UpperBound};
122    ///
123    /// // Two 1000-row relations, joining on a key with 100 distinct values:
124    /// // ceiling = 1000 * 1000 / max(100, 100) = 10_000.
125    /// let cb = ChainBound::new(vec![100, 100]);
126    /// assert_eq!(cb.ceiling(&[1_000, 1_000], &[(0, 1)]), 10_000);
127    /// ```
128    pub fn new(distinct_counts: Vec<u64>) -> Self {
129        Self { distinct_counts }
130    }
131}
132
133impl UpperBound for ChainBound {
134    fn ceiling(&self, relations: &[u64], equality_predicates: &[(usize, usize)]) -> u64 {
135        if relations.is_empty() {
136            return 0;
137        }
138        if equality_predicates.is_empty() {
139            return ProductBound.ceiling(relations, &[]);
140        }
141        // Each predicate divides the running product by the larger of
142        // the two endpoint distinct counts (or 1 if unknown).
143        let mut bound: u128 = relations
144            .iter()
145            .fold(1u128, |acc, &n| acc.saturating_mul(n as u128));
146        for &(i, j) in equality_predicates {
147            let d_i = self.distinct_counts.get(i).copied().unwrap_or(1).max(1) as u128;
148            let d_j = self.distinct_counts.get(j).copied().unwrap_or(1).max(1) as u128;
149            let d = d_i.max(d_j);
150            bound /= d;
151        }
152        if bound > u64::MAX as u128 {
153            u64::MAX
154        } else {
155            bound as u64
156        }
157    }
158}
159
160/// Coarse AGM-style upper bound for equi-joins.
161///
162/// Returns `min(product, |R_min| * |R_max|)` when at least one equality
163/// predicate exists, otherwise falls back to [`ProductBound`]. This is a
164/// placeholder approximation; the true AGM / LpBound bound requires
165/// fractional edge cover / LP relaxation — see `LpJoinBound` (under the
166/// `lp_solver` feature) for the principled construction.
167///
168/// # Examples
169///
170/// ```
171/// use samkhya_core::lpbound::{AgmBound, ProductBound, UpperBound};
172///
173/// let r = [1_000u64, 1_000_000];
174/// let bound = AgmBound.ceiling(&r, &[(0, 1)]);
175/// // AGM is always at least as tight as the cartesian product.
176/// assert!(bound <= ProductBound.ceiling(&r, &[]));
177/// ```
178pub struct AgmBound;
179
180impl UpperBound for AgmBound {
181    fn ceiling(&self, relations: &[u64], equality_predicates: &[(usize, usize)]) -> u64 {
182        if relations.is_empty() {
183            return 0;
184        }
185        if equality_predicates.is_empty() {
186            return ProductBound.ceiling(relations, &[]);
187        }
188        let product: u64 = relations.iter().fold(1u64, |acc, &n| acc.saturating_mul(n));
189        let min_r = *relations.iter().min().unwrap_or(&0);
190        let max_r = *relations.iter().max().unwrap_or(&0);
191        product.min(min_r.saturating_mul(max_r))
192    }
193}
194
195/// Clamp an estimate to a ceiling. Returns [`Error::LpBoundExceeded`]
196/// if the estimate exceeds the ceiling — this signals a correction-layer
197/// bug, since corrections must respect the envelope.
198///
199/// # Examples
200///
201/// ```
202/// use samkhya_core::lpbound::clamp_estimate;
203///
204/// // Within the ceiling → Ok(value).
205/// assert_eq!(clamp_estimate(500.0, 1000).unwrap(), 500);
206/// // Exceeding the ceiling → Err signalling a corrector violation.
207/// assert!(clamp_estimate(1500.0, 1000).is_err());
208/// ```
209pub fn clamp_estimate(estimate: f64, ceiling: u64) -> Result<u64> {
210    let clamped = estimate.max(0.0).min(u64::MAX as f64) as u64;
211    if clamped <= ceiling {
212        Ok(clamped)
213    } else {
214        Err(Error::LpBoundExceeded {
215            estimate,
216            ceiling: ceiling as f64,
217        })
218    }
219}
220
221/// Clamp without erroring; saturates to `ceiling`. Use this in production
222/// paths where a misbehaving corrector must never crash the engine.
223///
224/// # Examples
225///
226/// ```
227/// use samkhya_core::lpbound::saturating_clamp;
228///
229/// assert_eq!(saturating_clamp(500.0, 1000), 500);
230/// assert_eq!(saturating_clamp(2000.0, 1000), 1000);   // clamps to ceiling
231/// assert_eq!(saturating_clamp(-5.0, 1000), 0);        // negative → 0
232/// assert_eq!(saturating_clamp(f64::NAN, 1000), 0);    // NaN is treated as 0
233/// ```
234pub fn saturating_clamp(estimate: f64, ceiling: u64) -> u64 {
235    let clamped = estimate.max(0.0).min(u64::MAX as f64) as u64;
236    clamped.min(ceiling)
237}
238
239// =============================================================================
240// LpJoinBound — real fractional-edge-cover LP (the v0.5.0 deliverable).
241// =============================================================================
242
243/// Real fractional-edge-cover LP join bound — the principled AGM / LpBound
244/// construction the coarse [`AgmBound`] / [`ChainBound`] approximate.
245///
246/// # Formulation
247///
248/// Build the join's *attribute hypergraph*:
249///
250/// * one variable `x_r ≥ 0` per relation `r`;
251/// * each equality predicate `(i, j)` contributes one shared attribute
252///   `a` covered by both `R_i` and `R_j`;
253/// * for every shared attribute `a` we add a fractional-cover constraint
254///
255///   ```text
256///   sum_{r : a ∈ schema(r)} x_r ≥ 1
257///   ```
258///
259/// * the objective is to minimise the log-cardinality of the join,
260///
261///   ```text
262///   minimise   sum_r x_r * log|R_r|
263///   ```
264///
265/// The provable join-cardinality ceiling is `exp(minimum)`. This is the
266/// classical **Atserias–Grohe–Marx fractional-edge-cover bound** that
267/// LpBound (Zhang et al., SIGMOD 2025) extends to ℓp-norm degree
268/// sequences; the AGM bound is the p=∞ specialisation and is exactly
269/// what we ship here.
270///
271/// # Per-component decomposition
272///
273/// Equality predicates partition the relations into connected
274/// components. Variables in distinct components share no constraint, so
275/// the LP decomposes: the bound on the whole join graph is the
276/// **product** of the bounds on each connected component. We exploit
277/// this by solving one (small) LP per component instead of one big LP.
278///
279/// # Tightness vs the coarse bounds
280///
281/// * 2-relation single-predicate join: LP returns `min(|R_i|, |R_j|)`
282///   (the real AGM bound for a single shared attribute), which is
283///   strictly tighter than [`AgmBound`]'s `|R_min| * |R_max|`
284///   approximation whenever both relations are non-empty.
285/// * Triangle (3 relations, 3 predicates each on a distinct attribute):
286///   LP returns `(|R_0| * |R_1| * |R_2|)^{1/2}`, the famous AGM triangle
287///   bound. Strictly tighter than [`ChainBound`] and [`ProductBound`]
288///   for any non-trivial relation sizes.
289/// * Disconnected components: LP returns the product of the
290///   per-component bounds, matching the trivial decomposition.
291///
292/// # Solver
293///
294/// Backed by [`good_lp`] with the pure-Rust `microlp` backend
295/// (no system libraries, no C/C++ toolchain — compiles cleanly on any
296/// Rust 1.94+ host). The LP is small (one variable per relation, one
297/// constraint per shared attribute) so solve time is negligible.
298#[cfg(feature = "lp_solver")]
299pub struct LpJoinBound {
300    /// Optional per-relation distinct-count hint. When provided, the
301    /// objective coefficient for relation `r` is `log(min(|R_r|, D_r))`
302    /// rather than `log|R_r|`, which can only tighten the bound (the
303    /// join output on a key column cannot exceed the column's distinct
304    /// support). Empty by default.
305    distinct_counts: Vec<u64>,
306}
307
308#[cfg(feature = "lp_solver")]
309impl Default for LpJoinBound {
310    fn default() -> Self {
311        Self::new()
312    }
313}
314
315#[cfg(feature = "lp_solver")]
316impl LpJoinBound {
317    /// Construct a bound with no distinct-count overrides. The objective
318    /// uses `log|R_r|` for every relation.
319    pub fn new() -> Self {
320        Self {
321            distinct_counts: Vec::new(),
322        }
323    }
324
325    /// Construct a bound that uses the supplied per-relation distinct
326    /// counts to tighten the objective coefficients.
327    pub fn with_distinct_counts(distinct_counts: Vec<u64>) -> Self {
328        Self { distinct_counts }
329    }
330
331    /// Same semantics as [`UpperBound::ceiling`]; surfaced here so
332    /// callers can avoid importing the trait when they already hold an
333    /// `&LpJoinBound`.
334    pub fn ceiling(&self, relations: &[u64], equality_predicates: &[(usize, usize)]) -> u64 {
335        self.solve(relations, equality_predicates, /*use_distinct=*/ false)
336    }
337
338    /// Like [`Self::ceiling`] but folds the supplied distinct counts
339    /// into the per-relation objective coefficient. If the supplied
340    /// vector is shorter than `relations` or contains zero entries the
341    /// missing entries fall back to the row count.
342    pub fn ceiling_with_distinct(
343        &self,
344        relations: &[u64],
345        equality_predicates: &[(usize, usize)],
346    ) -> u64 {
347        self.solve(relations, equality_predicates, /*use_distinct=*/ true)
348    }
349
350    /// Core solver: build the per-connected-component LP, solve each,
351    /// and multiply the per-component ceilings. Falls back to
352    /// [`ProductBound`] / [`AgmBound`] if the solver fails for any
353    /// reason — the envelope must never crash the engine.
354    fn solve(
355        &self,
356        relations: &[u64],
357        equality_predicates: &[(usize, usize)],
358        use_distinct: bool,
359    ) -> u64 {
360        if relations.is_empty() {
361            return 0;
362        }
363        if equality_predicates.is_empty() {
364            return ProductBound.ceiling(relations, &[]);
365        }
366
367        // Validate predicate indices; defensively drop any out-of-range
368        // pair. A misbuilt join graph must never crash the envelope.
369        let n = relations.len();
370        let preds: Vec<(usize, usize)> = equality_predicates
371            .iter()
372            .copied()
373            .filter(|&(i, j)| i < n && j < n && i != j)
374            .collect();
375        if preds.is_empty() {
376            return ProductBound.ceiling(relations, &[]);
377        }
378
379        // Build connected components over the relation graph induced by
380        // the equality predicates. Each component's LP is independent;
381        // we solve them separately and multiply the ceilings.
382        let components = connected_components(n, &preds);
383        let mut total: u128 = 1;
384        for comp in &components {
385            let ceil = self.solve_component(relations, &preds, comp, use_distinct);
386            total = total.saturating_mul(ceil as u128);
387            if total >= u64::MAX as u128 {
388                return u64::MAX;
389            }
390        }
391        total as u64
392    }
393
394    /// Solve the AGM LP restricted to a single connected component.
395    fn solve_component(
396        &self,
397        relations: &[u64],
398        all_predicates: &[(usize, usize)],
399        component: &[usize],
400        use_distinct: bool,
401    ) -> u64 {
402        // Singleton component (no predicate incident): output cardinality
403        // is just the relation size.
404        if component.len() == 1 {
405            return relations[component[0]];
406        }
407
408        // Subset of predicates whose endpoints both lie in this
409        // component. (Every predicate connecting two members of a
410        // component is in the component by definition; we filter only
411        // for safety.)
412        let in_comp: std::collections::HashSet<usize> = component.iter().copied().collect();
413        let comp_preds: Vec<(usize, usize)> = all_predicates
414            .iter()
415            .copied()
416            .filter(|&(i, j)| in_comp.contains(&i) && in_comp.contains(&j))
417            .collect();
418
419        // Build the LP via `good_lp`. One x_r per relation in the
420        // component; one >= 1 constraint per equality predicate
421        // (each predicate introduces a distinct shared attribute).
422        use good_lp::{
423            Expression, ProblemVariables, Solution, SolverModel, default_solver, variable,
424        };
425
426        let mut vars = ProblemVariables::new();
427        // Map: relation index in the global `relations` vector ->
428        // good_lp variable handle.
429        let mut var_for: std::collections::HashMap<usize, good_lp::Variable> =
430            std::collections::HashMap::with_capacity(component.len());
431        // Build the objective expression in tandem with adding
432        // variables so coefficients line up with relation order.
433        let mut objective = Expression::with_capacity(component.len());
434        for &r in component {
435            let v = vars.add(variable().min(0.0));
436            var_for.insert(r, v);
437            let row_count = relations[r];
438            // Coefficient = log(effective size). Effective size = row
439            // count, optionally clamped to the supplied distinct count
440            // (which can only shrink it).
441            let mut size_f = row_count as f64;
442            if use_distinct {
443                if let Some(&d) = self.distinct_counts.get(r) {
444                    if d > 0 {
445                        size_f = size_f.min(d as f64);
446                    }
447                }
448            }
449            // log(0) is undefined; we treat an empty relation as
450            // contributing 0 to the log-sum (the join is empty anyway).
451            // log(1) is 0 and would let the LP put unbounded weight on
452            // that variable without paying — we clamp the coefficient
453            // away from zero with a tiny epsilon so the objective is
454            // strictly minimised.
455            let coef = if size_f <= 1.0 { 0.0 } else { size_f.ln() };
456            objective.add_mul(coef, v);
457        }
458
459        // Add one >= 1 fractional-cover constraint per predicate.
460        // good_lp's `Expression >> rhs` operator builds a >= constraint.
461        let mut model = vars.minimise(&objective).using(default_solver);
462        for &(i, j) in &comp_preds {
463            let xi = var_for[&i];
464            let xj = var_for[&j];
465            let lhs: Expression = xi + xj;
466            model = model.with(lhs.geq(1.0));
467        }
468
469        match model.solve() {
470            Ok(sol) => {
471                let lp_min = sol.eval(&objective);
472                // exp(LP_min) is the AGM ceiling. Guard against
473                // negative noise from the simplex and against overflow.
474                let raw = lp_min.exp();
475                if !raw.is_finite() || raw < 0.0 {
476                    return self.fallback(relations, &comp_preds, component);
477                }
478                // Always at least 1 (a join with at least one row in
479                // both endpoints can return one row); cap at u64::MAX.
480                let raw = raw.max(1.0);
481                if raw >= u64::MAX as f64 {
482                    u64::MAX
483                } else {
484                    // Snap to nearest integer when within a tight
485                    // relative epsilon: `exp(ln(n))` for integer `n`
486                    // can drift to `n + 1e-12` and a blind ceil would
487                    // push the per-component bound a full integer
488                    // above the true AGM optimum, breaking the
489                    // contract that LpJoinBound <= AgmBound. Only
490                    // ceil when the LP value is materially above the
491                    // nearest integer.
492                    let rounded = raw.round();
493                    let snap_eps = 1e-9_f64.max(raw.abs() * 1e-12);
494                    if (raw - rounded).abs() <= snap_eps {
495                        rounded as u64
496                    } else {
497                        raw.ceil() as u64
498                    }
499                }
500            }
501            Err(_) => self.fallback(relations, &comp_preds, component),
502        }
503    }
504
505    /// Conservative fallback when the LP solver fails. Returns the
506    /// minimum row count among the component's relations (a valid AGM
507    /// upper bound when at least one predicate covers every relation in
508    /// the component), or [`ProductBound`] over the component otherwise.
509    fn fallback(
510        &self,
511        relations: &[u64],
512        comp_preds: &[(usize, usize)],
513        component: &[usize],
514    ) -> u64 {
515        if comp_preds.is_empty() {
516            return component
517                .iter()
518                .map(|&r| relations[r])
519                .fold(1u64, |a, n| a.saturating_mul(n));
520        }
521        let comp_rows: Vec<u64> = component.iter().map(|&r| relations[r]).collect();
522        let agm = AgmBound;
523        agm.ceiling(&comp_rows, &[(0, 1)])
524    }
525}
526
527#[cfg(feature = "lp_solver")]
528impl UpperBound for LpJoinBound {
529    fn ceiling(&self, relations: &[u64], equality_predicates: &[(usize, usize)]) -> u64 {
530        self.ceiling(relations, equality_predicates)
531    }
532}
533
534/// Return the connected components of the graph on `0..n` with edges
535/// given by `edges`. Each component is a sorted list of vertex indices.
536/// Singleton vertices (no incident edge) appear as one-element
537/// components — every relation index in `0..n` is in exactly one
538/// component.
539#[cfg(feature = "lp_solver")]
540fn connected_components(n: usize, edges: &[(usize, usize)]) -> Vec<Vec<usize>> {
541    let mut parent: Vec<usize> = (0..n).collect();
542    fn find(parent: &mut [usize], mut x: usize) -> usize {
543        while parent[x] != x {
544            parent[x] = parent[parent[x]];
545            x = parent[x];
546        }
547        x
548    }
549    for &(a, b) in edges {
550        if a >= n || b >= n {
551            continue;
552        }
553        let ra = find(&mut parent, a);
554        let rb = find(&mut parent, b);
555        if ra != rb {
556            parent[ra] = rb;
557        }
558    }
559    let mut groups: std::collections::HashMap<usize, Vec<usize>> = std::collections::HashMap::new();
560    for v in 0..n {
561        let r = find(&mut parent, v);
562        groups.entry(r).or_default().push(v);
563    }
564    let mut out: Vec<Vec<usize>> = groups.into_values().collect();
565    for c in &mut out {
566        c.sort_unstable();
567    }
568    // Sort components by their smallest member for deterministic order.
569    out.sort_by_key(|c| c[0]);
570    out
571}
572
573#[cfg(test)]
574mod tests {
575    use super::*;
576
577    #[test]
578    fn product_bound_two_relations() {
579        assert_eq!(ProductBound.ceiling(&[100, 200], &[]), 20_000);
580    }
581
582    #[test]
583    fn product_bound_overflow_saturates() {
584        assert_eq!(ProductBound.ceiling(&[u64::MAX, 2], &[]), u64::MAX);
585    }
586
587    #[test]
588    fn product_bound_empty_relations() {
589        assert_eq!(ProductBound.ceiling(&[], &[]), 1);
590    }
591
592    #[test]
593    fn agm_no_predicates_falls_back_to_product() {
594        assert_eq!(AgmBound.ceiling(&[10, 20, 30], &[]), 10 * 20 * 30);
595    }
596
597    #[test]
598    fn agm_with_predicates_tighter_than_product() {
599        let r = [1_000u64, 1_000_000];
600        let bound = AgmBound.ceiling(&r, &[(0, 1)]);
601        let product = ProductBound.ceiling(&r, &[]);
602        assert!(bound <= product);
603    }
604
605    #[test]
606    fn clamp_within_ceiling() {
607        assert_eq!(clamp_estimate(500.0, 1000).unwrap(), 500);
608    }
609
610    #[test]
611    fn clamp_exceeds_ceiling_errors() {
612        let err = clamp_estimate(1500.0, 1000).unwrap_err();
613        match err {
614            Error::LpBoundExceeded { estimate, ceiling } => {
615                assert_eq!(estimate, 1500.0);
616                assert_eq!(ceiling, 1000.0);
617            }
618            other => panic!("wrong error variant: {other:?}"),
619        }
620    }
621
622    #[test]
623    fn chain_bound_tighter_than_product() {
624        // Two relations of 1000 rows each, joining on a key with 100 distinct values.
625        // Product = 1_000_000; ChainBound = 1000 * 1000 / 100 = 10_000.
626        let r = [1_000u64, 1_000];
627        let cb = ChainBound::new(vec![100, 100]);
628        let bound = cb.ceiling(&r, &[(0, 1)]);
629        assert_eq!(bound, 10_000);
630        let product = ProductBound.ceiling(&r, &[]);
631        assert!(bound < product);
632    }
633
634    #[test]
635    fn chain_bound_three_table_chain() {
636        // R1(1000) ⋈ R2(2000) ⋈ R3(500), join keys 100 distinct each side.
637        // Product = 1e9. Chain = 1e9 / 100 / 100 = 100_000.
638        let r = [1_000u64, 2_000, 500];
639        let cb = ChainBound::new(vec![100, 100, 100]);
640        let bound = cb.ceiling(&r, &[(0, 1), (1, 2)]);
641        assert_eq!(bound, 100_000);
642    }
643
644    #[test]
645    fn chain_bound_no_predicates_falls_back() {
646        let cb = ChainBound::new(vec![10, 20, 30]);
647        assert_eq!(cb.ceiling(&[10, 20, 30], &[]), 10 * 20 * 30);
648    }
649
650    #[test]
651    fn chain_bound_missing_distinct_count_defaults_to_one() {
652        // No distinct count entry → defaults to 1, meaning no reduction.
653        let cb = ChainBound::new(vec![]);
654        let bound = cb.ceiling(&[100, 100], &[(0, 1)]);
655        assert_eq!(bound, 10_000); // 100 * 100 / max(1, 1) = 10_000
656    }
657
658    #[test]
659    fn saturating_clamp_saturates() {
660        assert_eq!(saturating_clamp(500.0, 1000), 500);
661        assert_eq!(saturating_clamp(2000.0, 1000), 1000);
662        assert_eq!(saturating_clamp(-5.0, 1000), 0);
663        assert_eq!(saturating_clamp(f64::NAN, 1000), 0);
664    }
665}
666
667#[cfg(all(test, feature = "lp_solver"))]
668mod lp_tests {
669    use super::*;
670
671    /// 2-table single-edge join: the LP returns the principled AGM
672    /// bound `min(|R_0|, |R_1|)` and is therefore a valid (tighter than
673    /// or equal) refinement of the coarse [`AgmBound`] approximation,
674    /// which for two relations reduces to `|R_0| * |R_1|`.
675    #[test]
676    fn two_table_join_matches_principled_agm() {
677        let r = [1_000u64, 1_000_000u64];
678        let lp = LpJoinBound::new();
679        let bound = lp.ceiling(&r, &[(0, 1)]);
680        // True AGM single-edge bound is min(|R_0|, |R_1|).
681        // Allow ceil()'s +/-1 floating-point noise.
682        assert!(
683            (999..=1_001).contains(&bound),
684            "expected ≈1000, got {bound}"
685        );
686        // And the LP bound must never exceed the coarse AGM bound it
687        // replaces (validity / refinement contract).
688        let coarse = AgmBound.ceiling(&r, &[(0, 1)]);
689        assert!(
690            bound <= coarse,
691            "LP bound {bound} must not exceed coarse AGM {coarse}"
692        );
693    }
694
695    /// Triangle: 3 relations, 3 equality predicates each on a distinct
696    /// shared attribute. The fractional AGM cover number is ρ\* = 3/2,
697    /// so the LP bound is `(|R_0| * |R_1| * |R_2|)^{1/2}`.
698    #[test]
699    fn triangle_strictly_tighter_than_chain_and_product() {
700        // Use round numbers so the closed-form expectation is exact.
701        let r = [1_000u64, 1_000u64, 1_000u64];
702        let preds = [(0usize, 1usize), (1, 2), (0, 2)];
703        let lp = LpJoinBound::new();
704        let bound = lp.ceiling(&r, &preds);
705
706        // sqrt(1e9) ≈ 31_622.78  → expect 31_623 (after ceil).
707        assert!(
708            (31_000u64..=32_000u64).contains(&bound),
709            "expected ≈31_623, got {bound}"
710        );
711
712        // Strictly tighter than product = 1e9.
713        let product = ProductBound.ceiling(&r, &preds);
714        assert!(bound < product, "LP {bound} should be < product {product}");
715
716        // Strictly tighter than the chain bound under realistic
717        // distinct-count hints (≈10 distinct join keys per relation —
718        // matches the regime where ChainBound is meaningful but not
719        // pathologically optimistic).
720        let cb = ChainBound::new(vec![10, 10, 10]);
721        let chain = cb.ceiling(&r, &preds);
722        assert!(
723            bound < chain,
724            "LP {bound} should be < chain {chain} on the triangle"
725        );
726    }
727
728    /// Square (4-cycle): R_0 — R_1 — R_2 — R_3 — R_0. AGM ρ\* = 2, so
729    /// the LP bound is `sqrt(|R_0|*|R_2|*|R_1|*|R_3|)` ≈ `(N)^2` for
730    /// equally sized N, vs the product = N^4.
731    #[test]
732    fn square_strictly_tighter_than_chain_and_product() {
733        let r = [100u64, 100u64, 100u64, 100u64];
734        let preds = [(0usize, 1usize), (1, 2), (2, 3), (3, 0)];
735        let lp = LpJoinBound::new();
736        let bound = lp.ceiling(&r, &preds);
737
738        // 4-cycle AGM optimum is ρ* = 2 (alternating x = 1, 0, 1, 0 or
739        // x = 1/2 each); for equal sizes this gives N^2 = 10_000.
740        // Allow a generous numerical tolerance.
741        assert!(
742            (5_000..=15_000).contains(&bound),
743            "expected ≈10_000, got {bound}"
744        );
745
746        // Strictly tighter than product = 1e8.
747        let product = ProductBound.ceiling(&r, &preds);
748        assert!(bound < product, "LP {bound} should be < product {product}");
749
750        // Strictly tighter than the chain bound under modest
751        // distinct-count hints. d=4 per relation, 4 predicates →
752        // chain = 100^4 / 4^4 = 1e8 / 256 ≈ 390_625, which is
753        // looser than the LP's ≈10_000.
754        let cb = ChainBound::new(vec![4, 4, 4, 4]);
755        let chain = cb.ceiling(&r, &preds);
756        assert!(
757            bound < chain,
758            "LP {bound} should be < chain {chain} on the 4-cycle"
759        );
760    }
761
762    /// Disconnected join graph: two independent 2-table joins.
763    /// The LP decomposes into one LP per connected component, and the
764    /// total bound is the product of the per-component bounds.
765    #[test]
766    fn disconnected_components_multiply() {
767        // Component A: R_0 ⋈ R_1 (relations of sizes 100, 200, single
768        // predicate). Per-component AGM bound = min(100, 200) = 100.
769        // Component B: R_2 ⋈ R_3 (sizes 50, 70). Per-component bound
770        // = min(50, 70) = 50. Total expected ≈ 100 * 50 = 5_000.
771        let r = [100u64, 200, 50, 70];
772        let preds = [(0usize, 1usize), (2, 3)];
773        let lp = LpJoinBound::new();
774        let bound = lp.ceiling(&r, &preds);
775        assert!(
776            (4_900..=5_100).contains(&bound),
777            "expected ≈5000, got {bound}"
778        );
779    }
780
781    /// Singleton relation (no incident predicate) keeps its row count in
782    /// the product of component bounds.
783    #[test]
784    fn singleton_component_contributes_row_count() {
785        let r = [100u64, 200, 99];
786        // Only R_0 and R_1 are joined; R_2 is isolated.
787        let preds = [(0usize, 1usize)];
788        let lp = LpJoinBound::new();
789        let bound = lp.ceiling(&r, &preds);
790        // Component {0,1}: min(100, 200) = 100. Component {2}: 99.
791        // Total ≈ 9_900.
792        assert!(
793            (9_800..=10_000).contains(&bound),
794            "expected ≈9_900, got {bound}"
795        );
796    }
797
798    /// The LP bound must never exceed the trivial product bound.
799    #[test]
800    fn lp_bound_dominates_product() {
801        let r = [37u64, 41, 43, 47, 53];
802        let preds = [(0usize, 1usize), (1, 2), (2, 3), (3, 4)];
803        let lp = LpJoinBound::new();
804        let bound = lp.ceiling(&r, &preds);
805        let product = ProductBound.ceiling(&r, &preds);
806        assert!(
807            bound <= product,
808            "LP bound {bound} must be ≤ product {product}"
809        );
810    }
811
812    /// Empty relations → bound 0.
813    #[test]
814    fn empty_relations_zero() {
815        let lp = LpJoinBound::new();
816        assert_eq!(lp.ceiling(&[], &[]), 0);
817    }
818
819    /// No predicates → product bound (sanity passthrough).
820    #[test]
821    fn no_predicates_returns_product() {
822        let lp = LpJoinBound::new();
823        let r = [10u64, 20, 30];
824        assert_eq!(lp.ceiling(&r, &[]), 6_000);
825    }
826
827    /// `ceiling_with_distinct` clamps the per-relation objective
828    /// coefficient by `min(|R|, D)`. With a tight distinct-count hint
829    /// the bound only gets smaller (tighter).
830    #[test]
831    fn ceiling_with_distinct_is_at_most_unconstrained() {
832        let r = [1_000u64, 1_000];
833        let preds = [(0usize, 1usize)];
834        let with_d = LpJoinBound::with_distinct_counts(vec![10, 10]);
835        let unconstrained = LpJoinBound::new();
836        let a = with_d.ceiling_with_distinct(&r, &preds);
837        let b = unconstrained.ceiling(&r, &preds);
838        assert!(a <= b, "distinct-aware bound {a} must be tighter than {b}");
839        // With 10 distinct values on each side the bound collapses to 10.
840        assert!(a <= 11, "expected ≈10 with D=10, got {a}");
841    }
842}
samkhya_core/lpbound.rs

samkhya_core/
lpbound.rs