use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use thiserror::Error;
/// Summary statistics for one k-mer database file.
///
/// Values of this type are produced by `StreamingStatsProcessor::finalize` and can be
/// round-tripped through serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatabaseStatistics {
    /// Path of the database file, passed through unchanged from the caller of `finalize`.
    pub database_file: PathBuf,
    /// K-mer size reported by the caller of `finalize`.
    pub kmer_size: u8,
    /// Caller-supplied flag; presumably whether k-mers are stored in canonical form.
    pub canonical: bool,
    /// Caller-supplied flag; presumably whether the database is sorted.
    pub sorted: bool,
    /// Sum of all counts that were recorded.
    pub total_kmers: u64,
    /// Number of counts that were recorded (one per `add_count` call).
    pub unique_kmers: u64,
    /// Smallest recorded count, or `0` when nothing was recorded.
    pub min_count: u32,
    /// Largest recorded count, or `0` when nothing was recorded.
    pub max_count: u32,
    /// Arithmetic mean of the recorded counts (`0.0` when nothing was recorded).
    pub mean_count: f64,
    /// Median of the recorded counts as estimated by the t-digest.
    pub median_count: f64,
    /// Optional `(count, frequency)` pairs; left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub frequency_distribution: Option<Vec<(u32, u64)>>,
    /// Processing time supplied by the caller; (de)serialized as whole milliseconds (`u64`).
    #[serde(
        serialize_with = "serialize_duration",
        deserialize_with = "deserialize_duration"
    )]
    pub processing_time: Duration,
    /// Peak memory in bytes. `finalize` always stores `0` here.
    pub memory_peak_bytes: u64,
}
/// Serializes a [`Duration`] as a whole number of milliseconds (`u64`).
///
/// Sub-millisecond precision is dropped. A duration whose millisecond count does not fit
/// in a `u64` is clamped to `u64::MAX` instead of silently wrapping around.
///
/// # Errors
///
/// Returns whatever error the serializer reports from `serialize_u64`.
fn serialize_duration<S>(duration: &Duration, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    // `as_millis` yields a `u128`. Clamp first so the narrowing cast below is lossless;
    // a bare `as u64` would discard the high bits for very large durations.
    let millis = duration.as_millis().min(u128::from(u64::MAX)) as u64;
    serializer.serialize_u64(millis)
}
/// Deserializes a `u64` millisecond count into a [`Duration`].
///
/// # Errors
///
/// Propagates the deserializer's error when the input is not a valid `u64`.
fn deserialize_duration<'de, D>(deserializer: D) -> std::result::Result<Duration, D::Error>
where
    D: serde::Deserializer<'de>,
{
    u64::deserialize(deserializer).map(Duration::from_millis)
}
/// Options controlling how statistics are collected and reported.
///
/// Only `detailed` and `max_bins` are read by the code visible in this part of the file;
/// the remaining fields are presumably consumed by the output/CLI layer (confirm there).
#[derive(Debug, Clone)]
pub struct StatsConfiguration {
/// Requested output format (not read in the visible part of this file).
pub output_format: OutputFormat,
/// When `false`, `StreamingStatsProcessor::frequency_distribution` returns `None`.
pub detailed: bool,
/// Limit on the number of histogram bins that `StreamingStatsProcessor::add_count`
/// creates for counts above 1000; counts up to 1000 get a bin regardless of this limit.
pub max_bins: usize,
/// Not read in the visible part of this file; presumably enables approximate statistics.
pub approximate: bool,
/// Not read in the visible part of this file; presumably toggles progress reporting.
pub show_progress: bool,
/// Not read in the visible part of this file; presumably the destination for the report.
pub output_path: Option<PathBuf>,
/// Not read in the visible part of this file; presumably splits the report across files.
pub split_output: bool,
/// Not read in the visible part of this file; presumably the destination for the
/// frequency distribution when output is split.
pub freq_output_path: Option<PathBuf>,
}
/// Output formats selectable through `StatsConfiguration::output_format`.
///
/// The rendering for each format is not part of the visible section of this file.
#[derive(Debug, Clone)]
pub enum OutputFormat {
/// Plain text.
Text,
/// JSON.
Json,
/// Comma-separated values.
Csv,
/// Tab-separated values.
Tsv,
}
/// Errors reported by the statistics code.
///
/// `Display` and `std::error::Error` are generated by `thiserror`; variants marked
/// `#[from]` also get a `From` conversion so they can be produced with `?`.
#[derive(Error, Debug)]
pub enum StatsError {
/// The database file at `path` could not be found.
#[error("Database file not found: {path}")]
DatabaseNotFound { path: PathBuf },
/// The database content was rejected; `reason` carries the explanation.
#[error("Invalid database format: {reason}")]
InvalidFormat { reason: String },
/// The database contained no k-mers.
#[error("Database empty: no k-mers found")]
EmptyDatabase,
/// More memory was required than allowed; both values are rendered as megabytes.
#[error("Memory limit exceeded: required {required}MB, limit {limit}MB")]
MemoryLimitExceeded { required: u64, limit: u64 },
/// An I/O failure, converted automatically from `std::io::Error`.
#[error("I/O error: {source}")]
Io {
#[from]
source: std::io::Error,
},
/// A serialization failure for the named `format`, wrapping an arbitrary boxed error.
/// There is no `From` conversion for this variant; it has to be constructed explicitly.
#[error("Serialization error: {format} - {source}")]
Serialization {
format: String,
#[source]
source: Box<dyn std::error::Error + Send + Sync>,
},
/// An error from the `csv` crate, converted automatically.
#[error("CSV error: {0}")]
Csv(#[from] csv::Error),
/// An error from `serde_json`, converted automatically.
#[error("JSON error: {0}")]
Json(#[from] serde_json::Error),
}
/// Shorthand for `std::result::Result` with [`StatsError`] as the error type.
///
/// This alias shadows the prelude `Result` in this module, which is why the serde helper
/// functions spell out `std::result::Result` for their own error types.
pub type Result<T> = std::result::Result<T, StatsError>;
/// Accumulates k-mer count statistics one value at a time.
///
/// Feed counts through `add_count`, then call `finalize` to obtain a
/// `DatabaseStatistics` value.
pub struct StreamingStatsProcessor {
/// t-digest that receives every count; queried at quantile 0.5 for the median.
tdigest: tdigest::TDigest,
/// Running sum of all counts, kept as `u64`.
total_kmers: u64,
/// Number of counts recorded so far.
unique_kmers: u64,
/// Smallest count seen; starts at `u32::MAX` until the first count arrives.
min_count: u32,
/// Largest count seen; starts at `0`.
max_count: u32,
/// Running sum of all counts in `u128`, used to compute the mean.
sum_counts: u128,
/// Maps a count value to the number of times that value was recorded.
frequency_histogram: HashMap<u32, u64>,
/// Settings consulted while accumulating (`max_bins`) and reporting (`detailed`).
config: StatsConfiguration,
}
impl StreamingStatsProcessor {
    /// Creates an empty processor that uses `config` for histogram limits and reporting.
    pub fn new(config: StatsConfiguration) -> Self {
        Self {
            tdigest: tdigest::TDigest::default(),
            total_kmers: 0,
            unique_kmers: 0,
            // Sentinel: the first recorded count always replaces it as the minimum.
            min_count: u32::MAX,
            max_count: 0,
            sum_counts: 0,
            frequency_histogram: HashMap::new(),
            config,
        }
    }

    /// Records one count value.
    ///
    /// Updates the running totals, the min/max, the t-digest and the frequency histogram.
    /// A histogram bin is created for `count` when `count <= 1000` or while the histogram
    /// holds fewer than `config.max_bins` bins. A bin that already exists is always
    /// incremented, so a tracked count never stops being counted once the limit is hit.
    ///
    /// # Errors
    ///
    /// Currently always returns `Ok(())`; the `Result` is part of the public signature.
    pub fn add_count(&mut self, count: u32) -> Result<()> {
        /// Counts up to this value get a histogram bin regardless of `max_bins`.
        const ALWAYS_TRACKED_MAX: u32 = 1000;

        self.total_kmers += u64::from(count);
        self.unique_kmers += 1;
        self.min_count = self.min_count.min(count);
        self.max_count = self.max_count.max(count);
        self.sum_counts += u128::from(count);

        // NOTE(review): this allocates a one-element Vec and replaces the whole digest on
        // every call. Batching values before merging would need an extra buffer field on
        // the struct, so it is left for a follow-up change.
        self.tdigest = self.tdigest.merge_unsorted(vec![f64::from(count)]);

        let has_room = self.frequency_histogram.len() < self.config.max_bins;
        if let Some(frequency) = self.frequency_histogram.get_mut(&count) {
            // Existing bins keep counting even when the histogram is full; otherwise the
            // recorded frequency would depend on the order in which counts arrive.
            *frequency += 1;
        } else if count <= ALWAYS_TRACKED_MAX || has_room {
            self.frequency_histogram.insert(count, 1);
        }
        Ok(())
    }

    /// Returns the t-digest estimate of the median (quantile 0.5) of the recorded counts.
    pub fn median(&self) -> f64 {
        self.tdigest.estimate_quantile(0.5)
    }

    /// Returns the arithmetic mean of the recorded counts, or `0.0` when none were recorded.
    pub fn mean(&self) -> f64 {
        if self.unique_kmers == 0 {
            0.0
        } else {
            self.sum_counts as f64 / self.unique_kmers as f64
        }
    }

    /// Returns the recorded histogram as `(count, frequency)` pairs sorted by count.
    ///
    /// Returns `None` when `config.detailed` is off or when no counts were recorded.
    ///
    /// Only counts that actually have a histogram bin are listed. Expanding every integer
    /// in `min_count..=max_count` could allocate billions of entries for a single very
    /// large count, and would report a frequency of zero for counts that were observed
    /// but not binned because of `max_bins`.
    pub fn frequency_distribution(&self) -> Option<Vec<(u32, u64)>> {
        if !self.config.detailed || self.unique_kmers == 0 {
            return None;
        }
        let mut distribution: Vec<(u32, u64)> = self
            .frequency_histogram
            .iter()
            .map(|(&count, &frequency)| (count, frequency))
            .collect();
        // HashMap iteration order is unspecified; sort so the output is deterministic.
        distribution.sort_unstable_by_key(|&(count, _)| count);
        Some(distribution)
    }

    /// Consumes the processor and assembles the final `DatabaseStatistics`.
    ///
    /// `database_file`, `kmer_size`, `canonical`, `sorted` and `processing_time` are copied
    /// into the result unchanged. `min_count` is reported as `0` when nothing was recorded
    /// (instead of the internal `u32::MAX` sentinel). `memory_peak_bytes` is always `0`
    /// because this type does not measure memory usage.
    pub fn finalize(
        self,
        database_file: PathBuf,
        kmer_size: u8,
        canonical: bool,
        sorted: bool,
        processing_time: Duration,
    ) -> DatabaseStatistics {
        let min_count = if self.unique_kmers > 0 {
            self.min_count
        } else {
            0
        };
        DatabaseStatistics {
            database_file,
            kmer_size,
            canonical,
            sorted,
            total_kmers: self.total_kmers,
            unique_kmers: self.unique_kmers,
            min_count,
            max_count: self.max_count,
            mean_count: self.mean(),
            median_count: self.median(),
            frequency_distribution: self.frequency_distribution(),
            processing_time,
            memory_peak_bytes: 0,
        }
    }
}