1use crate::error::{StatsError, StatsResult};
7use crate::error_standardization::ErrorMessages;
8#[cfg(feature = "memmap")]
9use memmap2::Mmap;
10use scirs2_core::ndarray::{s, ArrayBase, ArrayViewMut1, Data, Ix1, Ix2};
11use scirs2_core::numeric::{Float, NumCast};
12use std::cmp::Ordering;
13use std::fs::File;
14use std::io::{BufRead, BufReader, Read};
15use std::path::Path;
16
/// Number of elements processed per batch by the streaming/chunked helpers below.
const CHUNK_SIZE: usize = 8192;
19
20#[allow(dead_code)]
34pub fn streaming_mean<F, I>(mut data_iter: I, totalcount: usize) -> StatsResult<F>
35where
36 F: Float + NumCast,
37 I: Iterator<Item = F> + std::fmt::Display,
38{
39 if totalcount == 0 {
40 return Err(ErrorMessages::empty_array("dataset"));
41 }
42
43 let mut sum = F::zero();
44 let mut _count = 0;
45
46 while _count < totalcount {
48 let chunk_sum = data_iter
49 .by_ref()
50 .take(CHUNK_SIZE)
51 .fold(F::zero(), |acc, val| acc + val);
52
53 sum = sum + chunk_sum;
54 _count += CHUNK_SIZE.min(totalcount - _count);
55 }
56
57 Ok(sum / F::from(totalcount).expect("Failed to convert to float"))
58}
59
60#[allow(dead_code)]
74pub fn welford_variance<F, D>(x: &ArrayBase<D, Ix1>, ddof: usize) -> StatsResult<(F, F)>
75where
76 F: Float + NumCast,
77 D: Data<Elem = F>,
78{
79 let n = x.len();
80 if n <= ddof {
81 return Err(StatsError::InvalidArgument(
82 "Not enough data points for the given degrees of freedom".to_string(),
83 ));
84 }
85
86 let mut mean = F::zero();
87 let mut m2 = F::zero();
88 let mut count = 0;
89
90 for &value in x.iter() {
91 count += 1;
92 let delta = value - mean;
93 mean = mean + delta / F::from(count).expect("Failed to convert to float");
94 let delta2 = value - mean;
95 m2 = m2 + delta * delta2;
96 }
97
98 let variance = m2 / F::from(n - ddof).expect("Failed to convert to float");
99 Ok((mean, variance))
100}
101
102#[allow(dead_code)]
112pub fn normalize_inplace<F>(data: &mut ArrayViewMut1<F>, ddof: usize) -> StatsResult<()>
113where
114 F: Float + NumCast + std::fmt::Display,
115{
116 let (mean, variance) = welford_variance(&data.to_owned(), ddof)?;
117
118 if variance <= F::epsilon() {
119 return Err(StatsError::InvalidArgument(
120 "Cannot normalize data with zero variance".to_string(),
121 ));
122 }
123
124 let std_dev = variance.sqrt();
125
126 for val in data.iter_mut() {
128 *val = (*val - mean) / std_dev;
129 }
130
131 Ok(())
132}
133
134#[allow(dead_code)]
148pub fn quantile_quickselect<F>(data: &mut [F], q: F) -> StatsResult<F>
149where
150 F: Float + NumCast + std::fmt::Display,
151{
152 if data.is_empty() {
153 return Err(StatsError::InvalidArgument(
154 "Cannot compute quantile of empty array".to_string(),
155 ));
156 }
157
158 if q < F::zero() || q > F::one() {
159 return Err(StatsError::domain("Quantile must be between 0 and 1"));
160 }
161
162 let n = data.len();
163 let pos = q * F::from(n - 1).expect("Failed to convert to float");
164 let k = NumCast::from(pos.floor()).expect("Operation failed");
165
166 quickselect(data, k);
168
169 let lower = data[k];
170
171 let frac = pos - pos.floor();
173 if frac > F::zero() && k + 1 < n {
174 quickselect(&mut data[(k + 1)..], 0);
175 let upper = data[k + 1];
176 Ok(lower + frac * (upper - lower))
177 } else {
178 Ok(lower)
179 }
180}
181
182#[allow(dead_code)]
184fn quickselect<F: Float>(data: &mut [F], k: usize) {
185 let len = data.len();
186 if len <= 1 {
187 return;
188 }
189
190 let mut left = 0;
191 let mut right = len - 1;
192
193 while left < right {
194 let pivot_idx = partition(data, left, right);
195
196 match k.cmp(&pivot_idx) {
197 Ordering::Less => right = pivot_idx - 1,
198 Ordering::Greater => left = pivot_idx + 1,
199 Ordering::Equal => return,
200 }
201 }
202}
203
204#[allow(dead_code)]
206fn partition<F: Float>(data: &mut [F], left: usize, right: usize) -> usize {
207 let pivot_idx = left + (right - left) / 2;
208 let pivot = data[pivot_idx];
209
210 data.swap(pivot_idx, right);
211
212 let mut store_idx = left;
213 for i in left..right {
214 if data[i] < pivot {
215 data.swap(i, store_idx);
216 store_idx += 1;
217 }
218 }
219
220 data.swap(store_idx, right);
221 store_idx
222}
223
224#[allow(dead_code)]
238pub fn covariance_chunked<F, D>(
239 data: &ArrayBase<D, Ix2>,
240 ddof: usize,
241) -> StatsResult<scirs2_core::ndarray::Array2<F>>
242where
243 F: Float + NumCast,
244 D: Data<Elem = F>,
245{
246 let n_obs = data.nrows();
247 let n_vars = data.ncols();
248
249 if n_obs <= ddof {
250 return Err(StatsError::InvalidArgument(
251 "Not enough observations for the given degrees of freedom".to_string(),
252 ));
253 }
254
255 let mut means = scirs2_core::ndarray::Array1::zeros(n_vars);
257 for j in 0..n_vars {
258 let col = data.slice(s![.., j]);
259 means[j] = col.iter().fold(F::zero(), |acc, &val| acc + val)
260 / F::from(n_obs).expect("Failed to convert to float");
261 }
262
263 let mut cov_matrix = scirs2_core::ndarray::Array2::zeros((n_vars, n_vars));
265
266 let chunksize = CHUNK_SIZE / n_vars;
268 for chunk_start in (0..n_obs).step_by(chunksize) {
269 let chunk_end = (chunk_start + chunksize).min(n_obs);
270 let chunk = data.slice(s![chunk_start..chunk_end, ..]);
271
272 for i in 0..n_vars {
274 for j in i..n_vars {
275 let mut sum = F::zero();
276 for k in 0..chunk.nrows() {
277 let xi = chunk[(k, i)] - means[i];
278 let xj = chunk[(k, j)] - means[j];
279 sum = sum + xi * xj;
280 }
281 cov_matrix[(i, j)] = cov_matrix[(i, j)] + sum;
282 }
283 }
284 }
285
286 let factor = F::from(n_obs - ddof).expect("Failed to convert to float");
288 for i in 0..n_vars {
289 for j in i..n_vars {
290 cov_matrix[(i, j)] = cov_matrix[(i, j)] / factor;
291 if i != j {
292 cov_matrix[(j, i)] = cov_matrix[(i, j)];
293 }
294 }
295 }
296
297 Ok(cov_matrix)
298}
299
#[allow(dead_code)]
/// Online accumulator for the Pearson correlation of a paired (x, y) stream.
/// Keeps O(1) state: a count plus raw running sums of x, y, x², y², and x·y.
pub struct StreamingCorrelation<F: Float> {
    // Number of (x, y) pairs seen so far.
    n: usize,
    // Σx
    sum_x: F,
    // Σy
    sum_y: F,
    // Σx²
    sum_xx: F,
    // Σy²
    sum_yy: F,
    // Σx·y
    sum_xy: F,
}
312
313#[allow(dead_code)]
314impl<F: Float + NumCast + std::fmt::Display> StreamingCorrelation<F> {
315 pub fn new() -> Self {
317 Self {
318 n: 0,
319 sum_x: F::zero(),
320 sum_y: F::zero(),
321 sum_xx: F::zero(),
322 sum_yy: F::zero(),
323 sum_xy: F::zero(),
324 }
325 }
326
327 pub fn update(&mut self, x: F, y: F) {
329 self.n += 1;
330 self.sum_x = self.sum_x + x;
331 self.sum_y = self.sum_y + y;
332 self.sum_xx = self.sum_xx + x * x;
333 self.sum_yy = self.sum_yy + y * y;
334 self.sum_xy = self.sum_xy + x * y;
335 }
336
337 pub fn update_batch<D>(&mut self, x: &ArrayBase<D, Ix1>, y: &ArrayBase<D, Ix1>)
339 where
340 D: Data<Elem = F>,
341 {
342 for (&xi, &yi) in x.iter().zip(y.iter()) {
343 self.update(xi, yi);
344 }
345 }
346
347 pub fn correlation(&self) -> StatsResult<F> {
349 if self.n < 2 {
350 return Err(StatsError::InvalidArgument(
351 "Need at least 2 observations to compute correlation".to_string(),
352 ));
353 }
354
355 let n = F::from(self.n).expect("Failed to convert to float");
356 let mean_x = self.sum_x / n;
357 let mean_y = self.sum_y / n;
358
359 let cov_xy = (self.sum_xy - n * mean_x * mean_y) / (n - F::one());
360 let var_x = (self.sum_xx - n * mean_x * mean_x) / (n - F::one());
361 let var_y = (self.sum_yy - n * mean_y * mean_y) / (n - F::one());
362
363 if var_x <= F::epsilon() || var_y <= F::epsilon() {
364 return Err(StatsError::InvalidArgument(
365 "Cannot compute correlation when one or both variables have zero variance"
366 .to_string(),
367 ));
368 }
369
370 Ok(cov_xy / (var_x * var_y).sqrt())
371 }
372
373 pub fn merge(&mut self, other: &Self) {
375 self.n += other.n;
376 self.sum_x = self.sum_x + other.sum_x;
377 self.sum_y = self.sum_y + other.sum_y;
378 self.sum_xx = self.sum_xx + other.sum_xx;
379 self.sum_yy = self.sum_yy + other.sum_yy;
380 self.sum_xy = self.sum_xy + other.sum_xy;
381 }
382}
383
#[allow(dead_code)]
/// Online (Welford-style) accumulator for the covariance matrix of a stream of
/// fixed-dimension observations.
pub struct IncrementalCovariance<F: Float> {
    // Number of observations folded in so far.
    n: usize,
    // Running per-variable means.
    means: scirs2_core::ndarray::Array1<F>,
    // Running co-moment matrix (unnormalized; divide by n - ddof for covariance).
    cov_matrix: scirs2_core::ndarray::Array2<F>,
    // Dimensionality every observation must have.
    n_vars: usize,
}
394
395#[allow(dead_code)]
396impl<F: Float + NumCast + scirs2_core::ndarray::ScalarOperand + std::fmt::Display>
397 IncrementalCovariance<F>
398{
399 pub fn new(_nvars: usize) -> Self {
401 Self {
402 n: 0,
403 means: scirs2_core::ndarray::Array1::zeros(_nvars),
404 cov_matrix: scirs2_core::ndarray::Array2::zeros((_nvars, _nvars)),
405 n_vars: _nvars,
406 }
407 }
408
409 pub fn update(&mut self, observation: &scirs2_core::ndarray::ArrayView1<F>) -> StatsResult<()> {
411 if observation.len() != self.n_vars {
412 return Err(StatsError::DimensionMismatch(
413 "Observation dimension doesn't match".to_string(),
414 ));
415 }
416
417 self.n += 1;
418 let n = F::from(self.n).expect("Failed to convert to float");
419
420 let mut delta = scirs2_core::ndarray::Array1::zeros(self.n_vars);
422
423 for i in 0..self.n_vars {
424 delta[i] = observation[i] - self.means[i];
425 self.means[i] = self.means[i] + delta[i] / n;
426 }
427
428 if self.n > 1 {
429 for i in 0..self.n_vars {
430 for j in i..self.n_vars {
431 let delta_new = observation[j] - self.means[j];
432 let cov_update = delta[i] * delta_new * (n - F::one()) / n;
433 self.cov_matrix[(i, j)] = self.cov_matrix[(i, j)] + cov_update;
434 if i != j {
435 self.cov_matrix[(j, i)] = self.cov_matrix[(i, j)];
436 }
437 }
438 }
439 }
440
441 Ok(())
442 }
443
444 pub fn covariance(&self, ddof: usize) -> StatsResult<scirs2_core::ndarray::Array2<F>> {
446 if self.n <= ddof {
447 return Err(StatsError::InvalidArgument(
448 "Not enough observations for the given degrees of freedom".to_string(),
449 ));
450 }
451
452 let factor = F::from(self.n - ddof).expect("Failed to convert to float");
453 Ok(&self.cov_matrix / factor)
454 }
455
456 pub fn means(&self) -> &scirs2_core::ndarray::Array1<F> {
458 &self.means
459 }
460}
461
#[allow(dead_code)]
/// Fixed-size rolling window over a scalar stream with O(1) mean/variance
/// queries, backed by a ring buffer plus running sum and sum of squares.
pub struct RollingStats<F: Float> {
    // Capacity of the window (number of most-recent values retained).
    windowsize: usize,
    // Ring buffer of the retained values; zero-initialized until full.
    buffer: Vec<F>,
    // Next write index; once full, also the index of the oldest element.
    position: usize,
    // True once the buffer has wrapped around at least once.
    is_full: bool,
    // Running sum of the values currently in the window.
    sum: F,
    // Running sum of squares of the values currently in the window.
    sum_squares: F,
}
474
475#[allow(dead_code)]
476impl<F: Float + NumCast + std::fmt::Display> RollingStats<F> {
477 pub fn new(_windowsize: usize) -> StatsResult<Self> {
479 if _windowsize == 0 {
480 return Err(StatsError::InvalidArgument(
481 "Window size must be positive".to_string(),
482 ));
483 }
484
485 Ok(Self {
486 windowsize: _windowsize,
487 buffer: vec![F::zero(); _windowsize],
488 position: 0,
489 is_full: false,
490 sum: F::zero(),
491 sum_squares: F::zero(),
492 })
493 }
494
495 pub fn push(&mut self, value: F) {
497 let old_value = self.buffer[self.position];
498
499 self.sum = self.sum - old_value + value;
501 self.sum_squares = self.sum_squares - old_value * old_value + value * value;
502
503 self.buffer[self.position] = value;
505 self.position = (self.position + 1) % self.windowsize;
506
507 if !self.is_full && self.position == 0 {
508 self.is_full = true;
509 }
510 }
511
512 pub fn len(&self) -> usize {
514 if self.is_full {
515 self.windowsize
516 } else {
517 self.position
518 }
519 }
520
521 pub fn mean(&self) -> F {
523 let n = self.len();
524 if n == 0 {
525 F::zero()
526 } else {
527 self.sum / F::from(n).expect("Failed to convert to float")
528 }
529 }
530
531 pub fn variance(&self, ddof: usize) -> StatsResult<F> {
533 let n = self.len();
534 if n <= ddof {
535 return Err(StatsError::InvalidArgument(
536 "Not enough data for the given degrees of freedom".to_string(),
537 ));
538 }
539
540 let mean = self.mean();
541 let n_f = F::from(n).expect("Failed to convert to float");
542 let variance = (self.sum_squares / n_f) - mean * mean;
543 Ok(variance * n_f / F::from(n - ddof).expect("Failed to convert to float"))
544 }
545
546 pub fn as_array(&self) -> scirs2_core::ndarray::Array1<F> {
548 if self.is_full {
549 scirs2_core::ndarray::Array1::from_vec(self.buffer.clone())
550 } else {
551 scirs2_core::ndarray::Array1::from_vec(self.buffer[..self.position].to_vec())
552 }
553 }
554}
555
/// Fixed-range, equal-width histogram that can be filled incrementally from a
/// stream; values outside `[min_val, max_val]` are ignored.
pub struct StreamingHistogram<F: Float> {
    // Bin edges: n_bins + 1 monotonically increasing values.
    bins: Vec<F>,
    // Per-bin occurrence counts (length n_bins).
    counts: Vec<usize>,
    // Lower bound of the histogram range (inclusive).
    min_val: F,
    // Upper bound of the histogram range (inclusive; clamped into the last bin).
    max_val: F,
    // Total number of values accepted so far.
    total_count: usize,
}
566
567impl<F: Float + NumCast + std::fmt::Display> StreamingHistogram<F> {
568 pub fn new(_n_bins: usize, min_val: F, maxval: F) -> Self {
570 let bin_width = (maxval - min_val) / F::from(_n_bins).expect("Failed to convert to float");
571 let bins: Vec<F> = (0..=_n_bins)
572 .map(|i| min_val + F::from(i).expect("Failed to convert to float") * bin_width)
573 .collect();
574
575 Self {
576 bins,
577 counts: vec![0; _n_bins],
578 min_val,
579 max_val: maxval,
580 total_count: 0,
581 }
582 }
583
584 pub fn add_value(&mut self, value: F) {
586 if value >= self.min_val && value <= self.max_val {
587 let n_bins = self.counts.len();
588 let bin_width = (self.max_val - self.min_val)
589 / F::from(n_bins).expect("Failed to convert to float");
590 let bin_idx = ((value - self.min_val) / bin_width).floor();
591 let bin_idx: usize = NumCast::from(bin_idx).unwrap_or(n_bins - 1).min(n_bins - 1);
592 self.counts[bin_idx] += 1;
593 self.total_count += 1;
594 }
595 }
596
597 pub fn add_values<D>(&mut self, values: &ArrayBase<D, Ix1>)
599 where
600 D: Data<Elem = F>,
601 {
602 for &value in values.iter() {
603 self.add_value(value);
604 }
605 }
606
607 pub fn get_histogram(&self) -> (Vec<F>, Vec<usize>) {
609 (self.bins.clone(), self.counts.clone())
610 }
611
612 pub fn get_density(&self) -> (Vec<F>, Vec<F>) {
614 let n_bins = self.counts.len();
615 let bin_width =
616 (self.max_val - self.min_val) / F::from(n_bins).expect("Failed to convert to float");
617 let total = F::from(self.total_count).expect("Failed to convert to float") * bin_width;
618
619 let density: Vec<F> = self
620 .counts
621 .iter()
622 .map(|&count| F::from(count).expect("Failed to convert to float") / total)
623 .collect();
624
625 (self.bins.clone(), density)
626 }
627}
628
#[allow(dead_code)]
/// Helpers for computing statistics over files too large to load into memory,
/// reading in chunks of `chunksize` elements at a time.
pub struct OutOfCoreStats<F: Float> {
    // Number of ELEMENTS (not bytes) per processing chunk.
    chunksize: usize,
    // Marker tying the element type `F` to the struct without storing one.
    _phantom: std::marker::PhantomData<F>,
}
637
638#[allow(dead_code)]
639impl<F: Float + NumCast + std::str::FromStr + std::fmt::Display> OutOfCoreStats<F> {
640 pub fn new(_chunksize: usize) -> Self {
642 Self {
643 chunksize: _chunksize,
644 _phantom: std::marker::PhantomData,
645 }
646 }
647
648 pub fn mean_from_csv<P: AsRef<Path>>(
650 &self,
651 path: P,
652 column: usize,
653 has_header: bool,
654 ) -> StatsResult<F> {
655 let file = File::open(path)
656 .map_err(|e| StatsError::InvalidArgument(format!("Failed to open file: {e}")))?;
657 let mut reader = BufReader::new(file);
658
659 let mut sum = F::zero();
660 let mut count = 0;
661 let mut line = String::new();
662 let mut line_num = 0;
663
664 if has_header {
666 reader
667 .read_line(&mut line)
668 .map_err(|e| StatsError::InvalidArgument(format!("Failed to read header: {e}")))?;
669 line.clear();
670 }
671
672 loop {
674 match reader.read_line(&mut line) {
675 Ok(0) => break, Ok(_) => {
677 line_num += 1;
678 let fields: Vec<&str> = line.trim().split(',').collect();
679
680 if fields.len() > column {
681 if let Ok(value) = fields[column].parse::<F>() {
682 sum = sum + value;
683 count += 1;
684 }
685 }
686
687 line.clear();
688 }
689 Err(e) => {
690 return Err(StatsError::ComputationError(format!(
691 "Error reading line {line_num}: {e}"
692 )))
693 }
694 }
695 }
696
697 if count == 0 {
698 return Err(StatsError::InvalidArgument(
699 "No valid data found".to_string(),
700 ));
701 }
702
703 Ok(sum / F::from(count).expect("Failed to convert to float"))
704 }
705
706 pub fn variance_from_csv<P: AsRef<Path>>(
708 &self,
709 path: P,
710 column: usize,
711 has_header: bool,
712 ddof: usize,
713 ) -> StatsResult<(F, F)> {
714 let mean = self.mean_from_csv(&path, column, has_header)?;
716
717 let file = File::open(path)
719 .map_err(|e| StatsError::InvalidArgument(format!("Failed to open file: {e}")))?;
720 let mut reader = BufReader::new(file);
721
722 let mut sum_sq = F::zero();
723 let mut count = 0;
724 let mut line = String::new();
725
726 if has_header {
728 reader
729 .read_line(&mut line)
730 .map_err(|e| StatsError::InvalidArgument(format!("Failed to read header: {e}")))?;
731 line.clear();
732 }
733
734 loop {
736 match reader.read_line(&mut line) {
737 Ok(0) => break, Ok(_) => {
739 let fields: Vec<&str> = line.trim().split(',').collect();
740
741 if fields.len() > column {
742 if let Ok(value) = fields[column].parse::<F>() {
743 let diff = value - mean;
744 sum_sq = sum_sq + diff * diff;
745 count += 1;
746 }
747 }
748
749 line.clear();
750 }
751 Err(e) => {
752 return Err(StatsError::ComputationError(format!(
753 "Error reading file: {}",
754 e
755 )))
756 }
757 }
758 }
759
760 if count <= ddof {
761 return Err(StatsError::InvalidArgument(
762 "Not enough data for the given degrees of freedom".to_string(),
763 ));
764 }
765
766 let variance = sum_sq
767 / F::from(count - ddof).ok_or_else(|| {
768 StatsError::ComputationError(
769 "Failed to convert count - ddof to target type".to_string(),
770 )
771 })?;
772 Ok((mean, variance))
773 }
774
775 pub fn process_binary_file<P: AsRef<Path>, G>(
777 &self,
778 path: P,
779 mut processor: G,
780 ) -> StatsResult<()>
781 where
782 G: FnMut(&[F]) -> StatsResult<()>,
783 {
784 use std::mem;
785
786 let file = File::open(path)
787 .map_err(|e| StatsError::InvalidArgument(format!("Failed to open file: {e}")))?;
788 let mut reader = BufReader::new(file);
789
790 let elementsize = mem::size_of::<F>();
791 let buffersize = self.chunksize * elementsize;
792 let mut buffer = vec![0u8; buffersize];
793
794 loop {
795 match reader.read(&mut buffer) {
796 Ok(0) => break, Ok(bytes_read) => {
798 let n_elements = bytes_read / elementsize;
799
800 let floats = unsafe {
802 std::slice::from_raw_parts(buffer.as_ptr() as *const F, n_elements)
803 };
804
805 processor(floats)?;
806 }
807 Err(e) => {
808 return Err(StatsError::ComputationError(format!(
809 "Error reading file: {}",
810 e
811 )))
812 }
813 }
814 }
815
816 Ok(())
817 }
818}
819
#[cfg(feature = "memmap")]
/// Statistics over binary files accessed through a read-only memory map
/// (only compiled with the `memmap` feature).
pub struct MemoryMappedStats<F: Float> {
    // Marker tying the element type `F` to the struct without storing one.
    _phantom: std::marker::PhantomData<F>,
}
827
#[cfg(feature = "memmap")]
impl<F: Float + NumCast + std::fmt::Display> MemoryMappedStats<F> {
    /// Creates a stateless processor; `F` fixes the element type of the files.
    pub fn new() -> Self {
        Self {
            _phantom: std::marker::PhantomData,
        }
    }

    /// Memory-maps the file at `path`, reinterprets its contents as a slice of
    /// `F`, and hands that slice to `processor`. Trailing bytes that do not
    /// fill a whole `F` are ignored by the length division.
    ///
    /// NOTE(review): reinterpreting the mapping assumes (a) every bit pattern
    /// is a valid `F` (true for `f32`/`f64`) and (b) the file was written with
    /// the same endianness/layout — confirm for other `Float` types.
    pub fn process_mmap_file<P: AsRef<Path>, G>(&self, path: P, processor: G) -> StatsResult<()>
    where
        G: FnOnce(&[F]) -> StatsResult<()>,
    {
        let file = File::open(path)
            .map_err(|e| StatsError::InvalidArgument(format!("Failed to open file: {e}")))?;

        // SAFETY: the map is read-only; soundness still requires that no other
        // process truncates/mutates the file while mapped (memmap2's
        // documented caveat).
        let mmap = unsafe {
            Mmap::map(&file)
                .map_err(|e| StatsError::ComputationError(format!("Failed to mmap file: {}", e)))?
        };

        // SAFETY: mmap regions are page-aligned, which satisfies `F`'s
        // alignment; the length is floored to whole elements so the slice
        // stays within the mapping.
        let data = unsafe {
            std::slice::from_raw_parts(
                mmap.as_ptr() as *const F,
                mmap.len() / std::mem::size_of::<F>(),
            )
        };

        processor(data)
    }
}