vibesql_storage/statistics/
sampling.rs

1//! Statistical sampling for large table statistics
2//!
3//! Provides efficient sampling methods for computing statistics on large tables
4//! without requiring a full table scan. Implements various sampling strategies:
5//! - Random sampling: Randomly select rows
6//! - Systematic sampling: Every Nth row
7//! - Reservoir sampling: Single-pass streaming sample
8//! - Adaptive sampling: Choose sample size based on table size
9
10use rand::{prelude::IndexedRandom, Rng};
11
12/// Configuration for statistical sampling
13#[derive(Debug, Clone)]
14pub struct SamplingConfig {
15    /// Sample size strategy
16    pub sample_size: SampleSize,
17
18    /// Sampling method to use
19    pub method: SamplingMethod,
20
21    /// Statistical confidence level (e.g., 0.95 for 95% confidence)
22    pub confidence_level: f64,
23}
24
25impl Default for SamplingConfig {
26    fn default() -> Self {
27        Self {
28            sample_size: SampleSize::Adaptive,
29            method: SamplingMethod::Random,
30            confidence_level: 0.95,
31        }
32    }
33}
34
35/// Sample size specification
36#[derive(Debug, Clone, PartialEq)]
37pub enum SampleSize {
38    /// Exact number of rows to sample
39    RowCount(usize),
40
41    /// Percentage of table (0.0 to 1.0)
42    Percentage(f64),
43
44    /// Automatically determine sample size based on table size
45    ///
46    /// Strategy:
47    /// - Small tables (< 1000 rows): No sampling, use all rows
48    /// - Medium tables (1K-100K rows): 10% sample
49    /// - Large tables (> 100K rows): Fixed 10K row sample
50    Adaptive,
51}
52
53/// Sampling method
54#[derive(Debug, Clone, PartialEq)]
55pub enum SamplingMethod {
56    /// Pure random sampling (requires index-based access)
57    Random,
58
59    /// Every Nth row (systematic sampling)
60    Systematic,
61
62    /// Reservoir sampling (single-pass, constant memory)
63    Reservoir,
64}
65
66impl SamplingConfig {
67    /// Create a new sampling configuration
68    pub fn new(sample_size: SampleSize, method: SamplingMethod) -> Self {
69        Self { sample_size, method, confidence_level: 0.95 }
70    }
71
72    /// Create configuration for adaptive sampling (default)
73    pub fn adaptive() -> Self {
74        Self::default()
75    }
76
77    /// Create configuration for percentage-based sampling
78    pub fn percentage(pct: f64) -> Self {
79        Self {
80            sample_size: SampleSize::Percentage(pct),
81            method: SamplingMethod::Random,
82            confidence_level: 0.95,
83        }
84    }
85
86    /// Create configuration for fixed-size sampling
87    pub fn fixed(count: usize) -> Self {
88        Self {
89            sample_size: SampleSize::RowCount(count),
90            method: SamplingMethod::Random,
91            confidence_level: 0.95,
92        }
93    }
94
95    /// Determine actual sample size for a given table
96    ///
97    /// Returns (sample_size, should_sample) tuple
98    /// - sample_size: Number of rows to sample
99    /// - should_sample: Whether sampling is needed (false for small tables)
100    pub fn determine_sample_size(&self, total_rows: usize) -> (usize, bool) {
101        match &self.sample_size {
102            SampleSize::RowCount(count) => {
103                let sample_size = (*count).min(total_rows);
104                (sample_size, sample_size < total_rows)
105            }
106            SampleSize::Percentage(pct) => {
107                let sample_size = (total_rows as f64 * pct).ceil() as usize;
108                let sample_size = sample_size.min(total_rows).max(1);
109                (sample_size, sample_size < total_rows)
110            }
111            SampleSize::Adaptive => {
112                // Adaptive strategy based on table size
113                if total_rows < 1000 {
114                    // Small table: no sampling
115                    (total_rows, false)
116                } else if total_rows < 100_000 {
117                    // Medium table: 10% sample
118                    let sample_size = (total_rows / 10).max(1000);
119                    (sample_size, true)
120                } else {
121                    // Large table: fixed 10K sample
122                    (10_000, true)
123                }
124            }
125        }
126    }
127}
128
129/// Sample rows from a table using the specified sampling method
130pub fn sample_rows<T: Clone>(rows: &[T], config: &SamplingConfig, rng: &mut impl Rng) -> Vec<T> {
131    let (sample_size, should_sample) = config.determine_sample_size(rows.len());
132
133    if !should_sample {
134        // No sampling needed, return all rows
135        return rows.to_vec();
136    }
137
138    match config.method {
139        SamplingMethod::Random => random_sample(rows, sample_size, rng),
140        SamplingMethod::Systematic => systematic_sample(rows, sample_size),
141        SamplingMethod::Reservoir => reservoir_sample(rows, sample_size, rng),
142    }
143}
144
145/// Random sampling: randomly select n rows
146fn random_sample<T: Clone>(rows: &[T], n: usize, rng: &mut impl Rng) -> Vec<T> {
147    let n = n.min(rows.len());
148    let indices: Vec<usize> = (0..rows.len()).collect();
149    let sampled_indices = indices.choose_multiple(rng, n);
150
151    sampled_indices.map(|&i| rows[i].clone()).collect()
152}
153
154/// Systematic sampling: every Nth row
155fn systematic_sample<T: Clone>(rows: &[T], n: usize) -> Vec<T> {
156    let n = n.min(rows.len());
157    if n == 0 {
158        return Vec::new();
159    }
160
161    let step = rows.len() / n;
162    if step == 0 {
163        return rows.to_vec();
164    }
165
166    rows.iter().step_by(step).take(n).cloned().collect()
167}
168
169/// Reservoir sampling: single-pass, constant memory
170///
171/// Knuth's Algorithm R - processes stream in one pass,
172/// maintaining a reservoir of k samples with equal probability
173fn reservoir_sample<T: Clone>(rows: &[T], k: usize, rng: &mut impl Rng) -> Vec<T> {
174    let k = k.min(rows.len());
175    if k == 0 {
176        return Vec::new();
177    }
178
179    let mut reservoir: Vec<T> = rows.iter().take(k).cloned().collect();
180
181    for (i, row) in rows.iter().enumerate().skip(k) {
182        // Randomly decide whether to include this element
183        let j = rng.random_range(0..=i);
184        if j < k {
185            reservoir[j] = row.clone();
186        }
187    }
188
189    reservoir
190}
191
192/// Metadata about a sample
193#[derive(Debug, Clone)]
194pub struct SampleMetadata {
195    /// Total number of rows in the table
196    pub total_rows: usize,
197
198    /// Number of rows in the sample
199    pub sample_size: usize,
200
201    /// Sampling ratio (sample_size / total_rows)
202    pub sampling_ratio: f64,
203
204    /// Whether sampling was actually used
205    pub sampled: bool,
206
207    /// Confidence level for statistical estimates
208    pub confidence_level: f64,
209}
210
211impl SampleMetadata {
212    /// Create metadata for a sample
213    pub fn new(
214        total_rows: usize,
215        sample_size: usize,
216        sampled: bool,
217        confidence_level: f64,
218    ) -> Self {
219        let sampling_ratio =
220            if total_rows > 0 { sample_size as f64 / total_rows as f64 } else { 1.0 };
221
222        Self { total_rows, sample_size, sampling_ratio, sampled, confidence_level }
223    }
224
225    /// Extrapolate a count from sample to full table
226    ///
227    /// Given a count in the sample, estimate the count in the full table
228    pub fn extrapolate_count(&self, sample_count: usize) -> usize {
229        if !self.sampled || self.sampling_ratio == 0.0 {
230            return sample_count;
231        }
232
233        (sample_count as f64 / self.sampling_ratio).round() as usize
234    }
235
236    /// Extrapolate a frequency/selectivity from sample to full table
237    ///
238    /// Frequencies typically don't need adjustment, but we apply
239    /// a small confidence adjustment for small samples
240    pub fn extrapolate_frequency(&self, sample_frequency: f64) -> f64 {
241        if !self.sampled {
242            return sample_frequency;
243        }
244
245        // For small samples, be more conservative with estimates
246        // Apply Laplace smoothing for small samples
247        if self.sample_size < 100 {
248            let smoothing = 1.0 / self.sample_size as f64;
249            (sample_frequency + smoothing) / (1.0 + smoothing)
250        } else {
251            sample_frequency
252        }
253    }
254
255    /// Estimate the standard error for a proportion
256    ///
257    /// Used to provide confidence intervals for selectivity estimates
258    pub fn standard_error(&self, proportion: f64) -> f64 {
259        if !self.sampled || self.sample_size == 0 {
260            return 0.0;
261        }
262
263        // Standard error of proportion: sqrt(p(1-p)/n)
264        // With finite population correction
265        let fpc = if self.total_rows > self.sample_size {
266            ((self.total_rows - self.sample_size) as f64 / (self.total_rows - 1) as f64).sqrt()
267        } else {
268            1.0
269        };
270
271        (proportion * (1.0 - proportion) / self.sample_size as f64).sqrt() * fpc
272    }
273
274    /// Get confidence interval for a proportion estimate
275    ///
276    /// Returns (lower_bound, upper_bound) for the confidence level
277    pub fn confidence_interval(&self, proportion: f64) -> (f64, f64) {
278        if !self.sampled {
279            return (proportion, proportion);
280        }
281
282        // Use normal approximation (valid for n*p > 5 and n*(1-p) > 5)
283        // Z-score for 95% confidence is approximately 1.96
284        let z_score = match self.confidence_level {
285            x if x >= 0.99 => 2.576,
286            x if x >= 0.95 => 1.96,
287            x if x >= 0.90 => 1.645,
288            _ => 1.96, // default to 95%
289        };
290
291        let se = self.standard_error(proportion);
292        let margin = z_score * se;
293
294        let lower = (proportion - margin).max(0.0);
295        let upper = (proportion + margin).min(1.0);
296
297        (lower, upper)
298    }
299}
300
301#[cfg(test)]
302mod tests {
303    use rand::{rngs::StdRng, SeedableRng};
304
305    use super::*;
306
307    #[test]
308    fn test_sample_size_adaptive() {
309        let config = SamplingConfig::adaptive();
310
311        // Small table: no sampling
312        let (size, should_sample) = config.determine_sample_size(500);
313        assert_eq!(size, 500);
314        assert!(!should_sample);
315
316        // Medium table: 10% sample
317        let (size, should_sample) = config.determine_sample_size(50_000);
318        assert_eq!(size, 5_000);
319        assert!(should_sample);
320
321        // Large table: fixed 10K sample
322        let (size, should_sample) = config.determine_sample_size(1_000_000);
323        assert_eq!(size, 10_000);
324        assert!(should_sample);
325    }
326
327    #[test]
328    fn test_sample_size_percentage() {
329        let config = SamplingConfig::percentage(0.1);
330
331        let (size, should_sample) = config.determine_sample_size(1000);
332        assert_eq!(size, 100);
333        assert!(should_sample);
334
335        // Edge case: percentage results in 0
336        let (size, should_sample) = config.determine_sample_size(5);
337        assert_eq!(size, 1); // Clamped to minimum 1
338        assert!(should_sample);
339    }
340
341    #[test]
342    fn test_sample_size_fixed() {
343        let config = SamplingConfig::fixed(100);
344
345        let (size, should_sample) = config.determine_sample_size(1000);
346        assert_eq!(size, 100);
347        assert!(should_sample);
348
349        // Edge case: sample size larger than table
350        let (size, should_sample) = config.determine_sample_size(50);
351        assert_eq!(size, 50);
352        assert!(!should_sample);
353    }
354
355    #[test]
356    fn test_random_sample() {
357        let mut rng = StdRng::seed_from_u64(42);
358        let rows: Vec<i32> = (0..100).collect();
359
360        let config = SamplingConfig::fixed(10);
361        let sample = sample_rows(&rows, &config, &mut rng);
362
363        assert_eq!(sample.len(), 10);
364        // All sampled values should be in range
365        for val in sample {
366            assert!((0..100).contains(&val));
367        }
368    }
369
370    #[test]
371    fn test_systematic_sample() {
372        let rows: Vec<i32> = (0..100).collect();
373        let sample = systematic_sample(&rows, 10);
374
375        assert_eq!(sample.len(), 10);
376        // Systematic sampling should be evenly spaced
377        assert_eq!(sample[0], 0);
378        assert_eq!(sample[1], 10);
379        assert_eq!(sample[9], 90);
380    }
381
382    #[test]
383    fn test_reservoir_sample() {
384        let mut rng = StdRng::seed_from_u64(42);
385        let rows: Vec<i32> = (0..100).collect();
386        let sample = reservoir_sample(&rows, 10, &mut rng);
387
388        assert_eq!(sample.len(), 10);
389        // All sampled values should be valid
390        for val in sample {
391            assert!((0..100).contains(&val));
392        }
393    }
394
395    #[test]
396    fn test_sample_metadata_extrapolation() {
397        // 10% sample (1000 out of 10000)
398        let metadata = SampleMetadata::new(10_000, 1_000, true, 0.95);
399
400        // Count extrapolation
401        assert_eq!(metadata.extrapolate_count(100), 1_000);
402        assert_eq!(metadata.extrapolate_count(50), 500);
403
404        // Frequency extrapolation (should be mostly unchanged)
405        let freq = metadata.extrapolate_frequency(0.5);
406        assert!((freq - 0.5).abs() < 0.1);
407    }
408
409    #[test]
410    fn test_confidence_interval() {
411        let metadata = SampleMetadata::new(10_000, 1_000, true, 0.95);
412
413        // For p=0.5 with n=1000, standard error ≈ 0.0158
414        // 95% CI margin ≈ 1.96 * 0.0158 ≈ 0.031
415        let (lower, upper) = metadata.confidence_interval(0.5);
416
417        assert!(lower < 0.5);
418        assert!(upper > 0.5);
419        assert!((upper - lower) < 0.1); // Reasonable interval width
420    }
421
422    #[test]
423    fn test_no_sampling_for_small_table() {
424        let mut rng = StdRng::seed_from_u64(42);
425        let rows: Vec<i32> = (0..50).collect();
426
427        let config = SamplingConfig::adaptive();
428        let sample = sample_rows(&rows, &config, &mut rng);
429
430        // Should return all rows for small table
431        assert_eq!(sample.len(), 50);
432        assert_eq!(sample, rows);
433    }
434}