rustkmer 0.5.2

High-performance k-mer counting tool in Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
//! Wildcard expansion for N nucleotides
//!
//! This module handles the expansion of wildcard characters (N) into all possible
//! nucleotides (A, T, C, G) with efficient algorithms for combinatorial explosion management.

use crate::fuzzy::{constants, FuzzyError, FuzzyResult};
use std::collections::HashSet;

/// Nucleotide bases
const NUCLEOTIDES: [char; 4] = ['A', 'T', 'C', 'G'];

/// Expand wildcards (N) in a query string to all possible combinations
///
/// # Arguments
/// * `query` - Query string possibly containing 'N' wildcards
/// * `max_variants` - Maximum number of variants to generate (None for default)
///
/// # Returns
/// Vector of all concrete sequences without wildcards
///
/// # Examples
/// ```
/// use rustkmer::fuzzy::wildcard::expand_wildcards;
///
/// let variants = expand_wildcards("ATGCGATGCTAGCN", None).unwrap();
/// assert_eq!(variants.len(), 4);
/// assert!(variants.contains(&"ATGCGATGCTAGCA".to_string()));
/// assert!(variants.contains(&"ATGCGATGCTAGCT".to_string()));
/// assert!(variants.contains(&"ATGCGATGCTAGCC".to_string()));
/// assert!(variants.contains(&"ATGCGATGCTAGCG".to_string()));
/// ```
pub fn expand_wildcards(query: &str, max_variants: Option<usize>) -> FuzzyResult<Vec<String>> {
    let wildcard_positions: Vec<usize> = query
        .chars()
        .enumerate()
        .filter(|(_, c)| *c == 'N')
        .map(|(i, _)| i)
        .collect();

    if wildcard_positions.is_empty() {
        return Ok(vec![query.to_string()]);
    }

    // Check combinatorial explosion protection
    let combination_count = 4_usize.pow(wildcard_positions.len() as u32);
    let limit = max_variants.unwrap_or(constants::DEFAULT_MAX_VARIANTS);
    if combination_count > limit {
        return Err(FuzzyError::TooManyVariants {
            actual: combination_count,
            limit,
        });
    }

    let mut variants = Vec::with_capacity(combination_count);
    generate_wildcard_combinations(query, &wildcard_positions, 0, &mut variants);

    Ok(variants)
}

/// Generate all wildcard combinations recursively
fn generate_wildcard_combinations(
    query: &str,
    wildcard_positions: &[usize],
    current_index: usize,
    variants: &mut Vec<String>,
) {
    if current_index >= wildcard_positions.len() {
        // All wildcards processed, add the current combination
        variants.push(query.to_string());
        return;
    }

    let wildcard_pos = wildcard_positions[current_index];

    // Try each nucleotide at this position
    for &nucleotide in &NUCLEOTIDES {
        let mut query_chars: Vec<char> = query.chars().collect();
        query_chars[wildcard_pos] = nucleotide;
        let new_query: String = query_chars.iter().collect();

        // Recursively process remaining wildcards
        generate_wildcard_combinations(&new_query, wildcard_positions, current_index + 1, variants);
    }
}

/// Expand wildcards using an iterative approach (memory efficient)
///
/// This method generates variants iteratively to avoid deep recursion
/// for queries with many wildcards.
///
/// # Arguments
/// * `query` - Query string possibly containing 'N' wildcards
/// * `max_variants` - Maximum number of variants to generate (None for default)
///
/// # Returns
/// Vector of all concrete sequences without wildcards
pub fn expand_wildcards_iterative(
    query: &str,
    max_variants: Option<usize>,
) -> FuzzyResult<Vec<String>> {
    let wildcard_positions: Vec<usize> = query
        .chars()
        .enumerate()
        .filter(|(_, c)| *c == 'N')
        .map(|(i, _)| i)
        .collect();

    if wildcard_positions.is_empty() {
        return Ok(vec![query.to_string()]);
    }

    // Check combinatorial explosion protection
    let combination_count = 4_usize.pow(wildcard_positions.len() as u32);
    let limit = max_variants.unwrap_or(constants::DEFAULT_MAX_VARIANTS);
    if combination_count > limit {
        return Err(FuzzyError::TooManyVariants {
            actual: combination_count,
            limit,
        });
    }

    let mut variants = vec![query.to_string()];

    // Process each wildcard position iteratively
    for &pos in &wildcard_positions {
        let mut new_variants = Vec::with_capacity(variants.len() * 4);

        for variant in variants {
            for &nucleotide in &NUCLEOTIDES {
                let mut chars: Vec<char> = variant.chars().collect();
                chars[pos] = nucleotide;
                new_variants.push(chars.iter().collect::<String>());
            }
        }

        variants = new_variants;
    }

    Ok(variants)
}

/// Count the number of wildcards in a query
pub fn count_wildcards(query: &str) -> usize {
    query.chars().filter(|&c| c == 'N').count()
}

/// Estimate the number of variants that would be generated
pub fn estimate_wildcard_variants(query: &str) -> usize {
    4_usize.pow(count_wildcards(query) as u32)
}

/// Check if a query would exceed variant limits
pub fn would_exceed_wildcard_limit(query: &str, max_variants: usize) -> bool {
    estimate_wildcard_variants(query) > max_variants
}

/// Validate wildcard query parameters
pub fn validate_wildcard_query(query: &str, max_variants: Option<usize>) -> FuzzyResult<()> {
    // Check for invalid characters
    if !query
        .chars()
        .all(|c| matches!(c, 'A' | 'T' | 'C' | 'G' | 'N'))
    {
        return Err(FuzzyError::InvalidQuery(
            "Query contains invalid characters (only A,T,C,G,N allowed)".to_string(),
        ));
    }

    // Check combinatorial explosion
    if let Some(max_variants) = max_variants {
        if would_exceed_wildcard_limit(query, max_variants) {
            return Err(FuzzyError::TooManyVariants {
                actual: estimate_wildcard_variants(query),
                limit: max_variants,
            });
        }
    }

    Ok(())
}

/// Generate wildcard variants with streaming processing
///
/// This method processes variants in batches to reduce memory usage
/// for large wildcard expansions.
///
/// # Arguments
/// * `query` - Query string possibly containing 'N' wildcards
/// * `batch_size` - Size of each batch for processing
/// * `max_variants` - Maximum number of variants to generate (None for default)
/// * `processor` - Function to process each batch of variants
///
/// # Returns
/// Result indicating success or failure
pub fn expand_wildcards_streaming(
    query: &str,
    batch_size: usize,
    max_variants: Option<usize>,
    mut processor: impl FnMut(&[String]) -> FuzzyResult<()>,
) -> FuzzyResult<()> {
    let _wildcard_count = count_wildcards(query);
    let total_variants = estimate_wildcard_variants(query);

    let limit = max_variants.unwrap_or(constants::DEFAULT_MAX_VARIANTS);
    if total_variants > limit {
        return Err(FuzzyError::TooManyVariants {
            actual: total_variants,
            limit,
        });
    }

    // Generate variants in batches
    let mut batch = Vec::with_capacity(batch_size);
    let mut generated = 0;

    // Use iterative generation with batch processing
    generate_wildcard_combinations_batched(
        query,
        0,
        &mut batch,
        &mut generated,
        total_variants,
        batch_size,
        &mut processor,
    )?;

    // Process any remaining variants in the last batch
    if !batch.is_empty() {
        processor(&batch)?;
    }

    Ok(())
}

/// Generate wildcard combinations in batches for memory efficiency
fn generate_wildcard_combinations_batched(
    query: &str,
    start_wildcard: usize,
    batch: &mut Vec<String>,
    generated: &mut usize,
    total_variants: usize,
    batch_size: usize,
    processor: &mut impl FnMut(&[String]) -> FuzzyResult<()>,
) -> FuzzyResult<()> {
    let query_chars: Vec<char> = query.chars().collect();
    let wildcard_positions: Vec<usize> = query_chars
        .iter()
        .enumerate()
        .filter(|(_, c)| **c == 'N')
        .map(|(i, _)| i)
        .collect();

    if start_wildcard >= wildcard_positions.len() {
        // No more wildcards to process, add this variant
        // Build string from the current modified characters
        let variant: String = query_chars.iter().collect();
        batch.push(variant);
        *generated += 1;

        // Process batch if full
        if batch.len() >= batch_size {
            processor(batch)?;
            batch.clear();
        }
        return Ok(());
    }

    let wildcard_pos = wildcard_positions[start_wildcard];

    // Try each nucleotide at this position
    for &nucleotide in &NUCLEOTIDES {
        let mut new_chars = query_chars.clone();
        new_chars[wildcard_pos] = nucleotide;
        let new_query: String = new_chars.iter().collect();

        // Start fresh with the new query (recalculate wildcard positions)
        generate_wildcard_combinations_batched(
            &new_query,
            0, // Start fresh for the new query
            batch,
            generated,
            total_variants,
            batch_size,
            processor,
        )?;
    }

    Ok(())
}

/// Remove duplicate variants (maintain order)
pub fn remove_duplicate_variants(variants: &mut Vec<String>) {
    let mut seen = HashSet::new();
    variants.retain(|variant| seen.insert(variant.clone()));
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_expand_wildcards_single() {
        let variants = expand_wildcards("ATGCGATGCTAGCN", None).unwrap();
        assert_eq!(variants.len(), 4);

        let expected_variants = [
            "ATGCGATGCTAGCA",
            "ATGCGATGCTAGCT",
            "ATGCGATGCTAGCC",
            "ATGCGATGCTAGCG",
        ];

        for expected in &expected_variants {
            assert!(variants.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_expand_wildcards_multiple() {
        let variants = expand_wildcards("ATNNGATGCTAGCG", None).unwrap();
        assert_eq!(variants.len(), 16); // 4^2 = 16
    }

    #[test]
    fn test_expand_wildcards_none() {
        let variants = expand_wildcards("ATGCGATGCTAGCG", None).unwrap();
        assert_eq!(variants.len(), 1);
        assert_eq!(variants[0], "ATGCGATGCTAGCG");
    }

    #[test]
    fn test_expand_wildcards_iterative() {
        let variants_recursive = expand_wildcards("ATNNGATGCTAGCG", None).unwrap();
        let variants_iterative = expand_wildcards_iterative("ATNNGATGCTAGCG", None).unwrap();

        assert_eq!(variants_recursive.len(), variants_iterative.len());

        // Convert to sets for comparison (order may differ)
        let recursive_set: HashSet<String> = variants_recursive.into_iter().collect();
        let iterative_set: HashSet<String> = variants_iterative.into_iter().collect();
        assert_eq!(recursive_set, iterative_set);
    }

    #[test]
    fn test_count_wildcards() {
        assert_eq!(count_wildcards("ATGCGATGCTAGCN"), 1);
        assert_eq!(count_wildcards("ATNNGATGCTGCN"), 3);
        assert_eq!(count_wildcards("ATGCGATGCTAGCG"), 0);
    }

    #[test]
    fn test_estimate_wildcard_variants() {
        assert_eq!(estimate_wildcard_variants("ATGCGATGCTAGCN"), 4);
        assert_eq!(estimate_wildcard_variants("ATNNGATGCTGCN"), 64); // 4^3
        assert_eq!(estimate_wildcard_variants("ATGCGATGCTAGCG"), 1);
    }

    #[test]
    fn test_would_exceed_wildcard_limit() {
        assert!(would_exceed_wildcard_limit("ATNNNNATGCTNGCN", 1000)); // 4^6 = 4096 > 1000
        assert!(!would_exceed_wildcard_limit("ATNNGATGCTGCN", 100)); // 4^3 = 64 <= 100
    }

    #[test]
    fn test_validate_wildcard_query() {
        // Valid query
        assert!(validate_wildcard_query("ATGCGATGCTAGCN", Some(100)).is_ok());

        // Invalid characters
        assert!(validate_wildcard_query("ATGCGXATGCTAGC", Some(100)).is_err());

        // Too many variants
        assert!(validate_wildcard_query("ATNNNNATGCTNGCN", Some(1000)).is_err());
    }

    #[test]
    fn test_remove_duplicate_variants() {
        let mut variants = vec![
            "ATGCGATGCTAGCA".to_string(),
            "ATGCGATGCTAGCT".to_string(),
            "ATGCGATGCTAGCA".to_string(), // Duplicate
            "ATGCGATGCTAGCC".to_string(),
        ];

        remove_duplicate_variants(&mut variants);
        assert_eq!(variants.len(), 3);
        assert!(variants.contains(&"ATGCGATGCTAGCA".to_string()));
        assert!(variants.contains(&"ATGCGATGCTAGCT".to_string()));
        assert!(variants.contains(&"ATGCGATGCTAGCC".to_string()));
    }

    #[test]
    fn test_expand_wildcards_combinatorial_explosion() {
        // This should fail due to too many variants
        let result = expand_wildcards("NNNNNNNNNN", None); // 4^10 = 1,048,576 variants
        assert!(result.is_err());

        match result.unwrap_err() {
            FuzzyError::TooManyVariants { actual, limit } => {
                assert!(actual > limit);
            }
            _ => panic!("Expected TooManyVariants error"),
        }
    }

    #[test]
    fn test_expand_wildcards_streaming() {
        let mut processed_variants = Vec::new();
        let mut processor = |batch: &[String]| -> FuzzyResult<()> {
            for variant in batch {
                processed_variants.push(variant.clone());
            }
            Ok(())
        };

        expand_wildcards_streaming("ATNNGATGCTAGCG", 5, None, &mut processor).unwrap();

        // Should have processed all 16 variants
        assert_eq!(processed_variants.len(), 16);

        // All variants should be unique
        let unique_variants: HashSet<String> = processed_variants.into_iter().collect();
        assert_eq!(unique_variants.len(), 16);
    }
}