vibesql_storage/statistics/
sampling.rs1use rand::{prelude::IndexedRandom, Rng};
11
12#[derive(Debug, Clone)]
14pub struct SamplingConfig {
15 pub sample_size: SampleSize,
17
18 pub method: SamplingMethod,
20
21 pub confidence_level: f64,
23}
24
25impl Default for SamplingConfig {
26 fn default() -> Self {
27 Self {
28 sample_size: SampleSize::Adaptive,
29 method: SamplingMethod::Random,
30 confidence_level: 0.95,
31 }
32 }
33}
34
35#[derive(Debug, Clone, PartialEq)]
37pub enum SampleSize {
38 RowCount(usize),
40
41 Percentage(f64),
43
44 Adaptive,
51}
52
53#[derive(Debug, Clone, PartialEq)]
55pub enum SamplingMethod {
56 Random,
58
59 Systematic,
61
62 Reservoir,
64}
65
66impl SamplingConfig {
67 pub fn new(sample_size: SampleSize, method: SamplingMethod) -> Self {
69 Self { sample_size, method, confidence_level: 0.95 }
70 }
71
72 pub fn adaptive() -> Self {
74 Self::default()
75 }
76
77 pub fn percentage(pct: f64) -> Self {
79 Self {
80 sample_size: SampleSize::Percentage(pct),
81 method: SamplingMethod::Random,
82 confidence_level: 0.95,
83 }
84 }
85
86 pub fn fixed(count: usize) -> Self {
88 Self {
89 sample_size: SampleSize::RowCount(count),
90 method: SamplingMethod::Random,
91 confidence_level: 0.95,
92 }
93 }
94
95 pub fn determine_sample_size(&self, total_rows: usize) -> (usize, bool) {
101 match &self.sample_size {
102 SampleSize::RowCount(count) => {
103 let sample_size = (*count).min(total_rows);
104 (sample_size, sample_size < total_rows)
105 }
106 SampleSize::Percentage(pct) => {
107 let sample_size = (total_rows as f64 * pct).ceil() as usize;
108 let sample_size = sample_size.min(total_rows).max(1);
109 (sample_size, sample_size < total_rows)
110 }
111 SampleSize::Adaptive => {
112 if total_rows < 1000 {
114 (total_rows, false)
116 } else if total_rows < 100_000 {
117 let sample_size = (total_rows / 10).max(1000);
119 (sample_size, true)
120 } else {
121 (10_000, true)
123 }
124 }
125 }
126 }
127}
128
129pub fn sample_rows<T: Clone>(rows: &[T], config: &SamplingConfig, rng: &mut impl Rng) -> Vec<T> {
131 let (sample_size, should_sample) = config.determine_sample_size(rows.len());
132
133 if !should_sample {
134 return rows.to_vec();
136 }
137
138 match config.method {
139 SamplingMethod::Random => random_sample(rows, sample_size, rng),
140 SamplingMethod::Systematic => systematic_sample(rows, sample_size),
141 SamplingMethod::Reservoir => reservoir_sample(rows, sample_size, rng),
142 }
143}
144
145fn random_sample<T: Clone>(rows: &[T], n: usize, rng: &mut impl Rng) -> Vec<T> {
147 let n = n.min(rows.len());
148 let indices: Vec<usize> = (0..rows.len()).collect();
149 let sampled_indices = indices.choose_multiple(rng, n);
150
151 sampled_indices.map(|&i| rows[i].clone()).collect()
152}
153
154fn systematic_sample<T: Clone>(rows: &[T], n: usize) -> Vec<T> {
156 let n = n.min(rows.len());
157 if n == 0 {
158 return Vec::new();
159 }
160
161 let step = rows.len() / n;
162 if step == 0 {
163 return rows.to_vec();
164 }
165
166 rows.iter().step_by(step).take(n).cloned().collect()
167}
168
169fn reservoir_sample<T: Clone>(rows: &[T], k: usize, rng: &mut impl Rng) -> Vec<T> {
174 let k = k.min(rows.len());
175 if k == 0 {
176 return Vec::new();
177 }
178
179 let mut reservoir: Vec<T> = rows.iter().take(k).cloned().collect();
180
181 for (i, row) in rows.iter().enumerate().skip(k) {
182 let j = rng.random_range(0..=i);
184 if j < k {
185 reservoir[j] = row.clone();
186 }
187 }
188
189 reservoir
190}
191
192#[derive(Debug, Clone)]
194pub struct SampleMetadata {
195 pub total_rows: usize,
197
198 pub sample_size: usize,
200
201 pub sampling_ratio: f64,
203
204 pub sampled: bool,
206
207 pub confidence_level: f64,
209}
210
211impl SampleMetadata {
212 pub fn new(
214 total_rows: usize,
215 sample_size: usize,
216 sampled: bool,
217 confidence_level: f64,
218 ) -> Self {
219 let sampling_ratio =
220 if total_rows > 0 { sample_size as f64 / total_rows as f64 } else { 1.0 };
221
222 Self { total_rows, sample_size, sampling_ratio, sampled, confidence_level }
223 }
224
225 pub fn extrapolate_count(&self, sample_count: usize) -> usize {
229 if !self.sampled || self.sampling_ratio == 0.0 {
230 return sample_count;
231 }
232
233 (sample_count as f64 / self.sampling_ratio).round() as usize
234 }
235
236 pub fn extrapolate_frequency(&self, sample_frequency: f64) -> f64 {
241 if !self.sampled {
242 return sample_frequency;
243 }
244
245 if self.sample_size < 100 {
248 let smoothing = 1.0 / self.sample_size as f64;
249 (sample_frequency + smoothing) / (1.0 + smoothing)
250 } else {
251 sample_frequency
252 }
253 }
254
255 pub fn standard_error(&self, proportion: f64) -> f64 {
259 if !self.sampled || self.sample_size == 0 {
260 return 0.0;
261 }
262
263 let fpc = if self.total_rows > self.sample_size {
266 ((self.total_rows - self.sample_size) as f64 / (self.total_rows - 1) as f64).sqrt()
267 } else {
268 1.0
269 };
270
271 (proportion * (1.0 - proportion) / self.sample_size as f64).sqrt() * fpc
272 }
273
274 pub fn confidence_interval(&self, proportion: f64) -> (f64, f64) {
278 if !self.sampled {
279 return (proportion, proportion);
280 }
281
282 let z_score = match self.confidence_level {
285 x if x >= 0.99 => 2.576,
286 x if x >= 0.95 => 1.96,
287 x if x >= 0.90 => 1.645,
288 _ => 1.96, };
290
291 let se = self.standard_error(proportion);
292 let margin = z_score * se;
293
294 let lower = (proportion - margin).max(0.0);
295 let upper = (proportion + margin).min(1.0);
296
297 (lower, upper)
298 }
299}
300
301#[cfg(test)]
302mod tests {
303 use rand::{rngs::StdRng, SeedableRng};
304
305 use super::*;
306
307 #[test]
308 fn test_sample_size_adaptive() {
309 let config = SamplingConfig::adaptive();
310
311 let (size, should_sample) = config.determine_sample_size(500);
313 assert_eq!(size, 500);
314 assert!(!should_sample);
315
316 let (size, should_sample) = config.determine_sample_size(50_000);
318 assert_eq!(size, 5_000);
319 assert!(should_sample);
320
321 let (size, should_sample) = config.determine_sample_size(1_000_000);
323 assert_eq!(size, 10_000);
324 assert!(should_sample);
325 }
326
327 #[test]
328 fn test_sample_size_percentage() {
329 let config = SamplingConfig::percentage(0.1);
330
331 let (size, should_sample) = config.determine_sample_size(1000);
332 assert_eq!(size, 100);
333 assert!(should_sample);
334
335 let (size, should_sample) = config.determine_sample_size(5);
337 assert_eq!(size, 1); assert!(should_sample);
339 }
340
341 #[test]
342 fn test_sample_size_fixed() {
343 let config = SamplingConfig::fixed(100);
344
345 let (size, should_sample) = config.determine_sample_size(1000);
346 assert_eq!(size, 100);
347 assert!(should_sample);
348
349 let (size, should_sample) = config.determine_sample_size(50);
351 assert_eq!(size, 50);
352 assert!(!should_sample);
353 }
354
355 #[test]
356 fn test_random_sample() {
357 let mut rng = StdRng::seed_from_u64(42);
358 let rows: Vec<i32> = (0..100).collect();
359
360 let config = SamplingConfig::fixed(10);
361 let sample = sample_rows(&rows, &config, &mut rng);
362
363 assert_eq!(sample.len(), 10);
364 for val in sample {
366 assert!((0..100).contains(&val));
367 }
368 }
369
370 #[test]
371 fn test_systematic_sample() {
372 let rows: Vec<i32> = (0..100).collect();
373 let sample = systematic_sample(&rows, 10);
374
375 assert_eq!(sample.len(), 10);
376 assert_eq!(sample[0], 0);
378 assert_eq!(sample[1], 10);
379 assert_eq!(sample[9], 90);
380 }
381
382 #[test]
383 fn test_reservoir_sample() {
384 let mut rng = StdRng::seed_from_u64(42);
385 let rows: Vec<i32> = (0..100).collect();
386 let sample = reservoir_sample(&rows, 10, &mut rng);
387
388 assert_eq!(sample.len(), 10);
389 for val in sample {
391 assert!((0..100).contains(&val));
392 }
393 }
394
395 #[test]
396 fn test_sample_metadata_extrapolation() {
397 let metadata = SampleMetadata::new(10_000, 1_000, true, 0.95);
399
400 assert_eq!(metadata.extrapolate_count(100), 1_000);
402 assert_eq!(metadata.extrapolate_count(50), 500);
403
404 let freq = metadata.extrapolate_frequency(0.5);
406 assert!((freq - 0.5).abs() < 0.1);
407 }
408
409 #[test]
410 fn test_confidence_interval() {
411 let metadata = SampleMetadata::new(10_000, 1_000, true, 0.95);
412
413 let (lower, upper) = metadata.confidence_interval(0.5);
416
417 assert!(lower < 0.5);
418 assert!(upper > 0.5);
419 assert!((upper - lower) < 0.1); }
421
422 #[test]
423 fn test_no_sampling_for_small_table() {
424 let mut rng = StdRng::seed_from_u64(42);
425 let rows: Vec<i32> = (0..50).collect();
426
427 let config = SamplingConfig::adaptive();
428 let sample = sample_rows(&rows, &config, &mut rng);
429
430 assert_eq!(sample.len(), 50);
432 assert_eq!(sample, rows);
433 }
434}