use crate::fuzzy::{mutation, normalization, wildcard, FuzzyError, FuzzyResult};
use serde::{Deserialize, Serialize};
use std::time::Instant;
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum ExpansionMethod {
WildcardOnly { wildcard_count: usize },
MutationOnly { mutation_distance: usize },
Combined {
wildcard_count: usize,
mutation_distance: usize,
},
LengthNormalization {
original_length: usize,
target_length: usize,
},
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QueryExpansion {
pub concrete_kmers: Vec<String>,
pub expansion_method: ExpansionMethod,
pub combination_count: usize,
pub original_query: String,
pub generation_time_ms: u64,
}
pub fn generate_query_expansion(query: &crate::fuzzy::FuzzyQuery) -> FuzzyResult<QueryExpansion> {
let start_time = Instant::now();
let mut concrete_kmers = Vec::new();
let (normalized_query, expansion_method) = if query.query_string.len() != query.kmer_size {
let variants =
normalization::generate_normalized_variants(&query.query_string, query.kmer_size)?;
(
variants,
ExpansionMethod::LengthNormalization {
original_length: query.query_string.len(),
target_length: query.kmer_size,
},
)
} else {
let wildcard_count = query.query_string.chars().filter(|&c| c == 'N').count();
(
vec![query.query_string.clone()],
ExpansionMethod::WildcardOnly { wildcard_count },
)
};
for normalized in normalized_query {
let wildcard_variants = wildcard::expand_wildcards(&normalized, query.max_variants)?;
concrete_kmers.extend(wildcard_variants);
}
if query.mutation_tolerance > 0 {
let mut mutation_variants = Vec::new();
for concrete in &concrete_kmers {
let variants = mutation::generate_hybrid_mutation_variants(
concrete,
query.mutation_tolerance,
query.position_mutations.as_ref(),
query.max_variants,
)?;
mutation_variants.extend(variants);
}
concrete_kmers = mutation_variants;
if let ExpansionMethod::WildcardOnly { wildcard_count } = expansion_method {
ExpansionMethod::Combined {
wildcard_count,
mutation_distance: query.mutation_tolerance,
}
} else {
expansion_method
}
} else {
expansion_method
};
let final_expansion_method = if query.mutation_tolerance > 0 {
if let ExpansionMethod::WildcardOnly { wildcard_count } = expansion_method {
ExpansionMethod::Combined {
wildcard_count,
mutation_distance: query.mutation_tolerance,
}
} else {
expansion_method
}
} else {
expansion_method
};
concrete_kmers.sort();
concrete_kmers.dedup();
let combination_count = concrete_kmers.len();
let generation_time_ms = start_time.elapsed().as_millis() as u64;
Ok(QueryExpansion {
concrete_kmers,
expansion_method: final_expansion_method,
combination_count,
original_query: query.query_string.clone(),
generation_time_ms,
})
}
pub fn generate_batch_expansions(
queries: &[crate::fuzzy::FuzzyQuery],
) -> FuzzyResult<Vec<QueryExpansion>> {
let mut expansions = Vec::with_capacity(queries.len());
for query in queries {
expansions.push(generate_query_expansion(query)?);
}
Ok(expansions)
}
pub fn validate_expansion_params(
query_length: usize,
kmer_size: usize,
mutation_tolerance: usize,
max_variants: Option<usize>,
) -> FuzzyResult<()> {
if query_length > kmer_size + 3 || query_length < kmer_size - 3 {
return Err(FuzzyError::InvalidParameters(
"Query length differs too much from k-mer size (max difference: 3)".to_string(),
));
}
if mutation_tolerance > kmer_size / 2 {
return Err(FuzzyError::InvalidParameters(
"Mutation tolerance cannot exceed k-mer size / 2".to_string(),
));
}
if let Some(max_variants) = max_variants {
let wildcard_count = 0; let estimated_variants =
estimate_variant_count(wildcard_count, query_length, mutation_tolerance);
if estimated_variants > max_variants {
return Err(FuzzyError::TooManyVariants {
actual: estimated_variants,
limit: max_variants,
});
}
}
Ok(())
}
pub fn estimate_variant_count(
wildcard_count: usize,
sequence_length: usize,
mutation_tolerance: usize,
) -> usize {
let wildcard_variants = 4_usize.pow(wildcard_count as u32);
let mutation_variants = if mutation_tolerance == 0 {
1
} else {
sequence_length * mutation_tolerance * 3
};
wildcard_variants * mutation_variants
}
pub fn would_exceed_limits(
wildcard_count: usize,
sequence_length: usize,
mutation_tolerance: usize,
max_variants: usize,
) -> bool {
estimate_variant_count(wildcard_count, sequence_length, mutation_tolerance) > max_variants
}
#[cfg(test)]
mod tests {
use super::*;
use crate::fuzzy::FuzzyQuery;
#[test]
fn test_query_expansion_simple() {
let query = FuzzyQuery::new("ATGCGATGCTAGN", 13, 0);
let expansion = generate_query_expansion(&query).unwrap();
assert_eq!(expansion.original_query, "ATGCGATGCTAGN");
assert_eq!(expansion.combination_count, 4);
}
#[test]
fn test_query_expansion_multiple_wildcards() {
let query = FuzzyQuery::new("ATNNGATGCTAGC", 13, 0);
let expansion = generate_query_expansion(&query).unwrap();
assert_eq!(expansion.combination_count, 16); }
#[test]
fn test_query_expansion_with_mutations() {
let query = FuzzyQuery::new("ATGCGATGCTAGCG", 13, 1);
let expansion = generate_query_expansion(&query).unwrap();
assert!(expansion.combination_count > 13); }
#[test]
fn test_estimate_variant_count() {
assert_eq!(estimate_variant_count(1, 13, 0), 4);
assert_eq!(estimate_variant_count(2, 13, 0), 16);
assert!(estimate_variant_count(0, 13, 1) >= 13);
}
#[test]
fn test_would_exceed_limits() {
assert!(would_exceed_limits(3, 13, 0, 50)); assert!(!would_exceed_limits(2, 13, 0, 20)); }
#[test]
fn test_validate_expansion_params() {
assert!(validate_expansion_params(13, 13, 0, Some(100)).is_ok());
assert!(validate_expansion_params(20, 13, 0, Some(100)).is_err());
assert!(validate_expansion_params(13, 13, 8, Some(100)).is_err());
}
}