velesdb-core 1.13.1

High-performance vector database engine written in Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
//! Cost estimator for hybrid MATCH + NEAR query planning.
//!
//! Uses [`OperationCostFactors`] (calibrated or default) to compute I/O and
//! CPU costs for query plan nodes.
//!
//! # Transition from hard-coded constants (Issue #467)
//!
//! The former constants `FILTER_SCAN_IO_WEIGHT` (0.2), `FILTER_SCAN_CPU_WEIGHT`
//! (0.8), `HNSW_IO_WEIGHT` (0.5), and `HNSW_CPU_WEIGHT` (1.0) have been
//! removed. Cost computation now derives I/O and CPU weights from the fields
//! of [`OperationCostFactors`], which are calibrated dynamically during
//! `analyze()` based on collection statistics and histograms.
//!
//! Backward-compatible formulas (using `COMPAT_FILTER_IO`, `COMPAT_HNSW_IO`,
//! etc.) ensure that **default factors produce identical costs** to the old
//! hard-coded constants. When calibrated factors differ from defaults, costs
//! scale proportionally via `(calibrated / default)` ratios.
//!
//! # Module layout (Devin Finding F on PR #606)
//!
//! The estimator was split across three files to respect the 500 NLOC file
//! limit:
//!
//! - `mod.rs` (this file) — public types (`Cost`, `SelectivityMethod`,
//!   `CostEstimator`), internal `CostFactorsRef`, base selectivity helpers
//!   (`estimate_condition_selectivity`, comparison/in/between/like),
//!   filter-cost + HNSW-cost helpers.
//! - `selectivity_method.rs` — method-aware variants
//!   (`estimate_condition_selectivity_with_method` + `_with_method` helpers
//!   per predicate kind) + the `has_cardinality_data` guard.
//! - `plan_cost.rs` — `estimate_plan_cost` + per-node cost functions
//!   (vector search, table scan, limit/offset, index lookup, match
//!   traversal).
//!
//! All three files contribute `impl CostEstimator<'_>` blocks — Rust allows
//! multiple impl blocks across files for the same type, so there is no
//! public API change and no cross-module trait indirection.

// Reason: usize/u64 → f64 for selectivity ratios and log2 inputs; these are
// cardinalities where ±1 ULP has no operational impact on query planning.
#![allow(clippy::cast_precision_loss)]

mod plan_cost;
mod selectivity_method;

use crate::collection::query_cost::cost_model::OperationCostFactors;
use crate::collection::stats::next_after;
use crate::collection::stats::CollectionStats;
use crate::collection::stats::Histogram;
use crate::velesql::ast::{CompareOp, Condition, Value};

// ---------------------------------------------------------------------------
// Backward-compatibility constants
// ---------------------------------------------------------------------------
// These reproduce the historical I/O and CPU ratios when factors == default.
// The formulas multiply these by (factors.field / default.field) so that
// calibrated factors scale the cost proportionally while default factors
// yield the exact same costs as the old hard-coded constants.

/// Historical I/O ratio for filter scan cost.
pub(super) const COMPAT_FILTER_IO: f64 = 0.2;
/// Historical CPU ratio for filter scan cost.
pub(super) const COMPAT_FILTER_CPU: f64 = 0.8;
/// Historical I/O ratio for HNSW search cost.
pub(super) const COMPAT_HNSW_IO: f64 = 0.5;
/// Historical CPU ratio for HNSW search cost.
pub(super) const COMPAT_HNSW_CPU: f64 = 1.0;

/// Composite cost estimate.
#[derive(Debug, Clone, Copy, Default, PartialEq)]
pub struct Cost {
    /// Estimated I/O component (arbitrary units).
    pub io_cost: f64,
    /// Estimated CPU component (arbitrary units).
    pub cpu_cost: f64,
}

/// Source of a selectivity estimate, used by EXPLAIN to report how a
/// predicate's selectivity was computed (issue #471, Devin finding 2).
///
/// Ordered by increasing noise / decreasing confidence:
/// 1. `Histogram` — derived from calibrated histogram buckets (most accurate).
/// 2. `Cardinality` — derived from `distinct_count` only (no distribution).
/// 3. `Heuristic` — hard-coded constant (e.g. 0.1 for `Match`, 0.05 for
///    `ContainsText`) because the predicate type has no stats path at all.
///
/// For compound predicates (`And`/`Or`/`Not`/`Group`), the reported method is
/// the **worst case** among children in the order above, so EXPLAIN never
/// overstates its confidence.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum SelectivityMethod {
    /// Selectivity computed from histogram bucket data.
    Histogram,
    /// Selectivity computed from `distinct_count` cardinality (no histogram).
    Cardinality,
    /// Selectivity computed from a heuristic constant.
    Heuristic,
}

impl SelectivityMethod {
    /// Returns the EXPLAIN display label for this method.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Histogram => "histogram",
            Self::Cardinality => "cardinality",
            Self::Heuristic => "heuristic",
        }
    }

    /// Returns the worst (least confident) of two methods.
    ///
    /// Ordering: `Heuristic > Cardinality > Histogram`. Used to combine the
    /// methods of sub-predicates under `And`/`Or`/`Not`/`Group` so the
    /// reported method reflects the loosest child.
    #[must_use]
    pub const fn worst(self, other: Self) -> Self {
        match (self, other) {
            (Self::Heuristic, _) | (_, Self::Heuristic) => Self::Heuristic,
            (Self::Cardinality, _) | (_, Self::Cardinality) => Self::Cardinality,
            _ => Self::Histogram,
        }
    }
}

impl std::fmt::Display for SelectivityMethod {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}

impl Cost {
    #[must_use]
    /// Creates a new cost value from I/O and CPU components.
    pub const fn new(io_cost: f64, cpu_cost: f64) -> Self {
        Self { io_cost, cpu_cost }
    }

    #[must_use]
    /// Returns the total cost (I/O + CPU).
    pub const fn total(self) -> f64 {
        self.io_cost + self.cpu_cost
    }
}

/// Reference to cost factors — either calibrated from stats, or default.
///
/// Zero-allocation on cache-hit path: `Calibrated` borrows from
/// `CollectionStats`, `Default` is a unit variant resolved inline.
#[derive(Debug)]
enum CostFactorsRef<'a> {
    /// Calibrated factors stored in `CollectionStats` (zero-copy borrow).
    Calibrated(&'a OperationCostFactors),
    /// Default factors (no allocation needed).
    Default,
}

impl CostFactorsRef<'_> {
    /// Returns a reference to the effective factors.
    ///
    /// For `Calibrated`, returns the borrowed reference directly.
    /// For `Default`, returns a reference to a lazily-initialized static default.
    fn get(&self) -> &OperationCostFactors {
        match self {
            Self::Calibrated(f) => f,
            Self::Default => {
                use std::sync::LazyLock;
                static DEFAULT_FACTORS: LazyLock<OperationCostFactors> =
                    LazyLock::new(OperationCostFactors::default);
                &DEFAULT_FACTORS
            }
        }
    }
}

/// Cost estimator based on collection statistics.
///
/// Uses `OperationCostFactors` (calibrated or default) to compute I/O and
/// CPU costs. Zero-allocation on cache-hit path via `CostFactorsRef`.
#[derive(Debug)]
pub struct CostEstimator<'a> {
    pub(super) stats: &'a CollectionStats,
    factors: CostFactorsRef<'a>,
}

/// Converts a VelesQL `Value` to `f64` for histogram lookup.
///
/// Returns `Some(f64)` for Integer, `UnsignedInteger`, Float, and Boolean.
/// Returns `None` for Parameter, Null, String, Temporal, and Subquery.
pub(super) fn value_to_f64(value: &Value) -> Option<f64> {
    match value {
        Value::Integer(i) => Some(*i as f64),
        Value::UnsignedInteger(u) => Some(*u as f64),
        Value::Float(f) => Some(*f),
        Value::Boolean(b) => Some(if *b { 1.0 } else { 0.0 }),
        _ => None,
    }
}

/// Lazily-initialized default factors for ratio computation.
pub(super) fn default_factors() -> &'static OperationCostFactors {
    use std::sync::LazyLock;
    static DEFAULT: LazyLock<OperationCostFactors> = LazyLock::new(OperationCostFactors::default);
    &DEFAULT
}

impl<'a> CostEstimator<'a> {
    #[must_use]
    /// Creates a new estimator with calibrated factors from the collection (if available).
    ///
    /// If `stats.calibrated_cost_factors` is `Some`, uses the calibrated factors.
    /// Otherwise, uses `OperationCostFactors::default()`.
    pub fn new(stats: &'a CollectionStats) -> Self {
        let factors = match &stats.calibrated_cost_factors {
            Some(f) => CostFactorsRef::Calibrated(f),
            None => CostFactorsRef::Default,
        };
        Self { stats, factors }
    }

    /// Creates an estimator with explicit factors (for tests or override).
    #[must_use]
    pub fn with_factors(stats: &'a CollectionStats, factors: &'a OperationCostFactors) -> Self {
        Self {
            stats,
            factors: CostFactorsRef::Calibrated(factors),
        }
    }

    /// Returns the active `OperationCostFactors` for use in sibling modules.
    pub(super) fn factors(&self) -> &OperationCostFactors {
        self.factors.get()
    }

    /// Returns the histogram for a column, delegating to `CollectionStats`.
    pub(super) fn get_histogram(&self, column: &str) -> Option<&Histogram> {
        self.stats.get_column_histogram(column)
    }

    #[must_use]
    /// Estimates filter cost using selectivity derived from stats.
    ///
    /// Uses backward-compatible formulas:
    /// - `io_cost  = scan_rows * COMPAT_FILTER_IO  * (factors.seq_page_cost / default.seq_page_cost)`
    /// - `cpu_cost = scan_rows * COMPAT_FILTER_CPU * (factors.cpu_tuple_cost / default.cpu_tuple_cost)`
    ///
    /// With default factors, this produces identical costs to the old constants.
    pub fn estimate_filter_cost(&self, filter: &Condition) -> Cost {
        let selectivity = self.estimate_condition_selectivity(filter).clamp(0.0, 1.0);
        let total = self.stats.total_points.max(self.stats.row_count) as f64;
        let scan_rows = (total * selectivity).max(1.0);

        let f = self.factors.get();
        let d = default_factors();
        let io_ratio = f.seq_page_cost / d.seq_page_cost;
        let cpu_ratio = f.cpu_tuple_cost / d.cpu_tuple_cost;

        Cost::new(
            scan_rows * COMPAT_FILTER_IO * io_ratio,
            scan_rows * COMPAT_FILTER_CPU * cpu_ratio,
        )
    }

    #[must_use]
    /// Estimates HNSW search cost for top-k retrieval.
    ///
    /// Uses backward-compatible formulas:
    /// - `io_cost  = probe * COMPAT_HNSW_IO  * (factors.random_page_cost / default.random_page_cost)`
    /// - `cpu_cost = probe * COMPAT_HNSW_CPU * (factors.cpu_distance_cost / default.cpu_distance_cost)`
    ///
    /// With default factors, this produces identical costs to the old constants.
    pub fn estimate_hnsw_search_cost(&self, k: usize) -> Cost {
        let total = self.stats.total_points.max(self.stats.row_count).max(1) as f64;
        let probe = (k.max(1) as f64) * total.log2().max(1.0);
        self.hnsw_cost_from_probe(probe)
    }

    /// Estimates HNSW search cost parametrized by the actual `ef_search`
    /// (frontier size) and `candidates` (top-k request) — issue #471, Devin
    /// finding 4.
    ///
    /// Uses the same `(ef + k) * log2(total)` probe formula as
    /// `Self::estimate_vector_search_node_cost` (private), so callers that have
    /// `ef_search` / `candidates` available (e.g. pre/post-filter strategy
    /// comparison in `plan_builder`) get a cost that reflects the real query
    /// instead of a fixed `k = 10`.
    #[must_use]
    pub fn estimate_hnsw_search_cost_with_ef(&self, ef_search: u32, candidates: u32) -> Cost {
        let total = self.stats.total_points.max(self.stats.row_count).max(1);
        self.estimate_hnsw_search_cost_with_ef_on_size(ef_search, candidates, total)
    }

    /// Variant of [`Self::estimate_hnsw_search_cost_with_ef`] that takes the
    /// effective collection size explicitly.
    ///
    /// Callers use this when the HNSW pass runs over a **subset** of the
    /// collection — typically the surviving rows after a pre-filter. Modeling
    /// the cost as `(ef + k) * log2(collection_size)` preserves the
    /// logarithmic scaling HNSW actually exhibits, whereas multiplying the
    /// full-collection cost by the filter selectivity would imply linear
    /// scaling in the reduced size (Devin finding E on PR #606).
    #[must_use]
    pub fn estimate_hnsw_search_cost_with_ef_on_size(
        &self,
        ef_search: u32,
        candidates: u32,
        collection_size: u64,
    ) -> Cost {
        let total = collection_size.max(1) as f64;
        let ef = f64::from(ef_search.max(1));
        let k = f64::from(candidates.max(1));
        let probe = (ef + k) * total.log2().max(1.0);
        self.hnsw_cost_from_probe(probe)
    }

    /// Applies calibrated I/O and CPU weights to a raw HNSW probe count.
    ///
    /// Single source of truth for the `Cost::new(probe * io_w, probe * cpu_w)`
    /// formula used by every HNSW cost helper — avoids duplicating the
    /// factor-ratio resolution in three places.
    pub(super) fn hnsw_cost_from_probe(&self, probe: f64) -> Cost {
        let f = self.factors.get();
        let d = default_factors();
        let io_ratio = f.random_page_cost / d.random_page_cost;
        let cpu_ratio = f.cpu_distance_cost / d.cpu_distance_cost;
        Cost::new(
            probe * COMPAT_HNSW_IO * io_ratio,
            probe * COMPAT_HNSW_CPU * cpu_ratio,
        )
    }

    #[must_use]
    /// Estimates predicate selectivity in the `[0.0, 1.0]` range.
    ///
    /// Dispatches on every `Condition` variant — no catch-all. Comparison,
    /// In, Between, and Like use histogram data when available; text/geo
    /// predicates return explicit heuristic constants; compound predicates
    /// use product (AND), inclusion-exclusion (OR), and complement (NOT).
    pub fn estimate_condition_selectivity(&self, condition: &Condition) -> f64 {
        match condition {
            Condition::Comparison(cmp) => self.estimate_comparison_selectivity_with_histogram(
                &cmp.column,
                cmp.operator,
                &cmp.value,
            ),
            Condition::In(cond) => {
                self.estimate_in_selectivity(&cond.column, &cond.values, cond.negated)
            }
            Condition::Between(cond) => {
                self.estimate_between_selectivity(&cond.column, &cond.low, &cond.high)
            }
            Condition::Like(cond) => self.estimate_like_selectivity(&cond.column, &cond.pattern),
            Condition::IsNull(cond) => self
                .stats
                .field_stats
                .get(cond.column.as_str())
                .map_or(0.1, |s| {
                    s.null_count as f64 / self.stats.total_points.max(1) as f64
                }),
            Condition::Match(_) | Condition::Contains(_) | Condition::GeoDistance(_) => 0.1,
            Condition::ContainsText(_) => 0.05,
            Condition::GeoBbox(_) => 0.2,
            Condition::GraphMatch(_) => 0.5,
            Condition::And(left, right) => {
                self.estimate_condition_selectivity(left)
                    * self.estimate_condition_selectivity(right)
            }
            Condition::Or(left, right) => {
                let l = self.estimate_condition_selectivity(left);
                let r = self.estimate_condition_selectivity(right);
                (l + r - (l * r)).clamp(0.0, 1.0)
            }
            Condition::Not(inner) => 1.0 - self.estimate_condition_selectivity(inner),
            Condition::Group(inner) => self.estimate_condition_selectivity(inner),
            Condition::VectorSearch(_)
            | Condition::VectorFusedSearch(_)
            | Condition::SparseVectorSearch(_)
            | Condition::Similarity(_) => 1.0,
        }
    }

    /// Estimates selectivity for a `Comparison` condition using histogram data.
    ///
    /// Dispatches on `CompareOp`: Eq → histogram equality, NotEq → complement,
    /// Lt/Lte/Gt/Gte → histogram less-than with appropriate adjustments.
    /// Falls back to `CollectionStats::estimate_selectivity()` when no histogram
    /// is available or the value cannot be converted to `f64`.
    pub(super) fn estimate_comparison_selectivity_with_histogram(
        &self,
        column: &str,
        op: CompareOp,
        value: &Value,
    ) -> f64 {
        // Parameter values are unknown at plan time — use heuristic.
        if matches!(value, Value::Parameter(_)) {
            return 0.1;
        }

        let Some(v) = value_to_f64(value) else {
            return self.stats.estimate_selectivity(column);
        };

        let Some(hist) = self.get_histogram(column) else {
            return self.stats.estimate_selectivity(column);
        };

        let sel = match op {
            CompareOp::Eq => hist.estimate_eq_selectivity(v),
            CompareOp::NotEq => 1.0 - hist.estimate_eq_selectivity(v),
            CompareOp::Lt => hist.estimate_lt_selectivity(v),
            CompareOp::Lte => hist.estimate_lt_selectivity(next_after(v)),
            CompareOp::Gt => 1.0 - hist.estimate_lt_selectivity(next_after(v)),
            CompareOp::Gte => 1.0 - hist.estimate_lt_selectivity(v),
        };
        sel.clamp(0.0, 1.0)
    }

    /// Estimates selectivity for a `Between` condition using histogram range.
    ///
    /// Converts low/high to `f64` and delegates to `Histogram::estimate_range_selectivity`.
    /// Falls back to `0.3` when no histogram is available or conversion fails.
    pub(super) fn estimate_between_selectivity(
        &self,
        column: &str,
        low: &Value,
        high: &Value,
    ) -> f64 {
        let (Some(low_f), Some(high_f)) = (value_to_f64(low), value_to_f64(high)) else {
            return 0.3;
        };

        match self.get_histogram(column) {
            // BETWEEN is inclusive on both ends (low <= x <= high).
            // Use next_after(high_f) so bucket_range_fraction includes values
            // at the exact upper boundary — consistent with CompareOp::Lte.
            Some(h) => h.estimate_range_selectivity(low_f, next_after(high_f)),
            None => 0.3,
        }
    }

    /// Estimates selectivity for an `In` condition.
    ///
    /// Sums per-value equality selectivities via histogram lookups when available.
    /// Falls back to `base_selectivity × list_size` without a histogram.
    /// If negated (NOT IN), returns `1.0 - sel`.
    pub(super) fn estimate_in_selectivity(
        &self,
        column: &str,
        values: &[Value],
        negated: bool,
    ) -> f64 {
        let sel = if let Some(h) = self.get_histogram(column) {
            let numeric_sels: Vec<f64> = values
                .iter()
                .filter_map(value_to_f64)
                .map(|v| h.estimate_eq_selectivity(v))
                .collect();
            if numeric_sels.is_empty() {
                // All values are non-numeric (e.g. strings) — fall back to
                // cardinality-based estimate so we don't silently return 0.0.
                let base = self.stats.estimate_selectivity(column);
                (base * values.len() as f64).clamp(0.0, 1.0)
            } else {
                let sum: f64 = numeric_sels.into_iter().sum();
                sum.clamp(0.0, 1.0)
            }
        } else {
            let base = self.stats.estimate_selectivity(column);
            (base * values.len() as f64).clamp(0.0, 1.0)
        };

        if negated {
            1.0 - sel
        } else {
            sel
        }
    }

    /// Estimates the cost of applying a post-filter predicate to the
    /// candidate set returned by an HNSW search (issue #609).
    ///
    /// Unlike [`Self::estimate_filter_cost_from_selectivity`] (which scales
    /// with `total × selectivity`), the post-filter runs only on the set of
    /// candidates HNSW explored, not on the full collection.
    ///
    /// # Choice of cardinality
    ///
    /// VelesDB's actual post-filter execution (`search_post_filter` +
    /// `filter_and_hydrate` in `collection/search/vector_filter.rs`)
    /// evaluates the predicate on an *oversampled* candidate set
    /// (`compute_oversampled_k(k, filter) = clamp(k/selectivity, [k+10,
    /// 10_000])`) and truncates to `k` **after** the filter. The predicate
    /// therefore runs on more than `k` tuples in the common case — a
    /// `k`-only model (naive "top-k post-filter") under-estimates the
    /// cost for high-recall HNSW regimes (Devin review on PR #612).
    ///
    /// The model below uses `max(k, ef_search)` as the effective
    /// predicate-evaluation cardinality. This is a conservative upper
    /// bound on the HNSW-returned set size:
    /// - when `k ≥ ef_search` (unusual), the predicate runs on `k`;
    /// - when `ef_search > k` (typical: ef≈160, k≈10), it runs on
    ///   up to `ef_search` candidates before truncation.
    ///
    /// The true oversampled count
    /// (`clamp(k/selectivity, [k+10, 10_000])`) is not passed into the
    /// planner — the planner does not know the execution-time selectivity
    /// estimate that `compute_oversampled_k` uses. `max(k, ef_search)` is
    /// the closest bound available at plan time.
    ///
    /// Returns a zero-I/O cost: the candidates are already in memory
    /// after the HNSW pass, so no page reads are charged.
    #[must_use]
    pub fn estimate_post_filter_topk_cost(&self, k: u32, ef_search: u32) -> Cost {
        let n = f64::from(k.max(ef_search).max(1));
        let f = self.factors.get();
        let d = default_factors();
        let cpu_ratio = f.cpu_tuple_cost / d.cpu_tuple_cost;
        // Reason: max(k, ef_search) × default cpu_tuple_cost scaled by
        // ratio to the calibrated factor — the physical reality of
        // evaluating a predicate on the HNSW candidate set before top-k
        // truncation (see `search_post_filter` + `filter_and_hydrate`).
        Cost::new(0.0, n * d.cpu_tuple_cost * cpu_ratio)
    }

    /// Estimates filter cost from an already-computed selectivity value.
    ///
    /// Useful when the caller has a pre-computed selectivity (e.g. from
    /// `estimate_condition_selectivity` or a heuristic) and wants to translate
    /// it into a calibrated cost without building a `Condition` AST.
    ///
    /// Uses the same backward-compatible formula as `estimate_filter_cost`.
    #[must_use]
    pub fn estimate_filter_cost_from_selectivity(&self, selectivity: f64) -> Cost {
        let sel = selectivity.clamp(0.0, 1.0);
        let total = self.stats.total_points.max(self.stats.row_count) as f64;
        let scan_rows = (total * sel).max(1.0);

        let f = self.factors.get();
        let d = default_factors();
        let io_ratio = f.seq_page_cost / d.seq_page_cost;
        let cpu_ratio = f.cpu_tuple_cost / d.cpu_tuple_cost;

        Cost::new(
            scan_rows * COMPAT_FILTER_IO * io_ratio,
            scan_rows * COMPAT_FILTER_CPU * cpu_ratio,
        )
    }

    /// Estimates selectivity for a `Like` condition.
    ///
    /// Prefix patterns (ending with `%`, not starting with `%`) use histogram
    /// range estimation on the ordinal prefix range when available.
    /// Non-prefix patterns return `0.05`.
    pub(super) fn estimate_like_selectivity(&self, column: &str, pattern: &str) -> f64 {
        let is_prefix = pattern.ends_with('%') && !pattern.starts_with('%');
        if !is_prefix {
            return 0.05;
        }

        let Some(_hist) = self.get_histogram(column) else {
            return 0.1;
        };

        // For string columns the histogram is built on ordinal ranks.
        // A prefix pattern 'abc%' matches a contiguous range of ordinal
        // values. Without the full string→rank mapping at plan time we
        // approximate: the prefix covers roughly 1/distinct_count of the
        // domain, scaled by the number of buckets that span that range.
        // This is more accurate than the previous 1/bucket_count heuristic.
        let distinct = self
            .stats
            .column_stats
            .get(column)
            .or_else(|| self.stats.field_stats.get(column))
            .map_or(1, |cs| cs.distinct_count.max(1));
        (1.0 / distinct as f64).clamp(0.01, 1.0)
    }
}