deepbiop_core/
kmer.rs

1use itertools::Itertools;
2use rayon::prelude::*;
3use std::ops::Range;
4
5use crate::{
6    error::DPError,
7    types::{Element, Id2KmerTable, Kmer2IdTable},
8};
9use anyhow::Error;
10use anyhow::Result;
11
12/// Convert a sequence of k-mer IDs back into a DNA sequence.
13///
14/// This function takes a slice of k-mer IDs and a lookup table mapping IDs to k-mers,
15/// and reconstructs the original DNA sequence by converting each ID back to its k-mer
16/// and joining them together.
17///
18/// # Arguments
19///
20/// * `kmer_ids` - A slice of integer IDs representing k-mers
21/// * `id2kmer_table` - A HashMap mapping k-mer IDs to their byte sequences
22///
23/// # Returns
24///
25/// The reconstructed DNA sequence as a vector of bytes, wrapped in a Result.
26/// Returns an error if any k-mer ID is not found in the lookup table.
27///
28/// # Errors
29///
30/// Returns `DPError::InvalidKmerId` if a k-mer ID is not found in the lookup table
31pub fn kmerids_to_seq(kmer_ids: &[Element], id2kmer_table: Id2KmerTable) -> Result<Vec<u8>> {
32    let result = kmer_ids
33        .par_iter()
34        .map(|&id| {
35            id2kmer_table
36                .get(&id)
37                .ok_or(Error::new(DPError::InvalidKmerId))
38                .map(|kmer| kmer.as_ref())
39        })
40        .collect::<Result<Vec<_>>>()?;
41
42    kmers_to_seq(result)
43}
44
45/// Convert a k-mer target region back to the original sequence target region.
46///
47/// This function takes a target region that was adjusted for k-mer calculations and converts it
48/// back to the corresponding region in the original sequence by reversing the k-mer adjustments.
49///
50/// # Arguments
51///
52/// * `kmer_target` - The target region adjusted for k-mers as a Range<usize>
53/// * `k` - The length of k-mers used
54///
55/// # Returns
56///
57/// A Range<usize> representing the target region in the original sequence
58///
59/// # Example
60///
61/// ```
62/// use std::ops::Range;
63/// use deepbiop_core::kmer::to_original_target_region;
64///
65/// let kmer_target = 0..3;  // A k-mer target region
66/// let k = 4;  // k-mer length
67/// let original_target = to_original_target_region(&kmer_target, k);
68/// assert_eq!(original_target, 0..6);  // Original sequence region
69/// ```
70pub fn to_original_target_region(kmer_target: &Range<usize>, k: usize) -> Range<usize> {
71    // The start of the target region remains the same
72    let original_start = kmer_target.start;
73
74    // Attempt to reverse the end adjustment by adding k - 1, assuming the adjustment was due to k-mer calculation
75    let original_end = if kmer_target.end > original_start {
76        kmer_target.end + k - 1
77    } else {
78        kmer_target.end
79    };
80
81    original_start..original_end
82}
83
84/// Convert an original sequence target region to a k-mer target region.
85///
86/// This function takes a target region from the original sequence and converts it to the corresponding
87/// region for k-mer calculations by adjusting for k-mer length and sequence boundaries.
88///
89/// # Arguments
90///
91/// * `original_target` - The target region in the original sequence as a Range<usize>
92/// * `k` - The length of k-mers to use
93/// * `seq_len` - Optional sequence length to validate target region bounds
94///
95/// # Returns
96///
97/// A Result containing a Range<usize> representing the adjusted k-mer target region
98///
99/// # Errors
100///
101/// Returns `DPError::TargetRegionInvalid` if:
102/// - The target region is invalid (start >= end)
103/// - k is 0
104/// - The target region extends beyond sequence length (if seq_len provided)
105///
106/// # Example
107///
108/// ```
109/// use std::ops::Range;
110/// use deepbiop_core::kmer::to_kmer_target_region;
111///
112/// let original_target = 0..6;  // Original sequence region
113/// let k = 4;  // k-mer length
114/// let kmer_target = to_kmer_target_region(&original_target, k, Some(10)).unwrap();
115/// assert_eq!(kmer_target, 0..3);  // Adjusted k-mer region
116/// ```
117pub fn to_kmer_target_region(
118    original_target: &Range<usize>,
119    k: usize,
120    seq_len: Option<usize>,
121) -> Result<Range<usize>> {
122    if original_target.start >= original_target.end || k == 0 {
123        return Err(Error::from(DPError::TargetRegionInvalid));
124    }
125
126    if let Some(seq_len) = seq_len {
127        // Ensure the target region is valid.
128        if original_target.end > seq_len {
129            return Err(Error::new(DPError::TargetRegionInvalid));
130        }
131    }
132
133    // Calculate how many k-mers can be formed starting within the original target region.
134    let num_kmers_in_target = if original_target.end - original_target.start >= k {
135        original_target.end - original_target.start - k + 1
136    } else {
137        0
138    };
139
140    // The new target region starts at the same position as the original target region.
141    let new_start = original_target.start;
142
143    // The end of the new target region needs to be adjusted based on the number of k-mers.
144    // It is the start position of the last k-mer that can be formed within the original target region.
145    let new_end = if num_kmers_in_target > 0 {
146        new_start + num_kmers_in_target
147    } else {
148        original_target.end
149    };
150
151    Ok(new_start..new_end)
152}
153
154/// Convert a DNA sequence into k-mers.
155///
156/// This function takes a DNA sequence and splits it into k-mers of specified length.
157/// The k-mers can be either overlapping or non-overlapping based on the `overlap` parameter.
158///
159/// # Arguments
160///
161/// * `seq` - A byte slice containing the DNA sequence
162/// * `k` - The length of each k-mer
163/// * `overlap` - Whether to generate overlapping k-mers
164///
165/// # Returns
166///
167/// A vector of k-mer byte slices
168///
169/// # Example
170///
171/// ```
172/// use deepbiop_core::kmer::seq_to_kmers;
173///
174/// let seq = b"ATCGATCG";
175///
176/// // Overlapping k-mers
177/// let kmers = seq_to_kmers(seq, 3, true);
178/// assert_eq!(kmers, vec![b"ATC", b"TCG", b"CGA", b"GAT", b"ATC", b"TCG"]);
179///
180/// // Non-overlapping k-mers
181/// let kmers = seq_to_kmers(seq, 3, false);
182/// assert_eq!(kmers, vec![b"ATC".as_slice(), b"GAT".as_slice(), b"CG".as_slice()]);
183/// ```
184pub fn seq_to_kmers(seq: &[u8], k: usize, overlap: bool) -> Vec<&[u8]> {
185    if overlap {
186        seq.par_windows(k).collect()
187    } else {
188        seq.par_chunks(k).collect()
189    }
190}
191/// Convert k-mers back into a DNA sequence.
192///
193/// This function takes a vector of k-mers and reconstructs the original DNA sequence.
194/// The k-mers are assumed to be in order and overlapping by k-1 bases.
195///
196/// # Arguments
197///
198/// * `kmers` - A vector of k-mer byte slices
199///
200/// # Returns
201///
202/// A Result containing the reconstructed DNA sequence as a byte vector
203///
204/// # Errors
205///
206/// Returns an error if any k-mer is invalid (empty)
207///
208/// # Example
209///
210/// ```
211/// use deepbiop_core::kmer::kmers_to_seq;
212///
213/// let kmers = vec![b"ATC".as_slice(), b"TCG".as_slice(), b"CGA".as_slice()];
214/// let seq = kmers_to_seq(kmers).unwrap();
215/// assert_eq!(seq, b"ATCGA");
216/// ```
217pub fn kmers_to_seq(kmers: Vec<&[u8]>) -> Result<Vec<u8>> {
218    // Early return for empty input
219    if kmers.is_empty() {
220        return Ok(Vec::new());
221    }
222
223    // Validate first kmer
224    let first_kmer = kmers[0];
225    if first_kmer.is_empty() {
226        return Err(DPError::InvalidKmerId.into());
227    }
228
229    // Initialize result with first kmer
230    let mut result = Vec::with_capacity(first_kmer.len() + kmers.len() - 1);
231    result.extend_from_slice(first_kmer);
232
233    // Process remaining kmers in parallel
234    let remaining_bases: Result<Vec<u8>> = kmers
235        .into_par_iter()
236        .skip(1)
237        .map(|kmer| {
238            kmer.last()
239                .ok_or_else(|| DPError::InvalidKmerId.into())
240                .copied()
241        })
242        .collect();
243
244    // Extend result with remaining bases
245    result.extend(remaining_bases?);
246
247    Ok(result)
248}
249
250/// Convert a DNA sequence into k-mers with their positions in the original sequence.
251///
252/// This function takes a DNA sequence and splits it into k-mers of specified length,
253/// returning both the k-mers and their start/end positions in the original sequence.
254///
255/// # Arguments
256///
257/// * `seq` - A DNA sequence as a byte slice
258/// * `kmer_size` - The length of each k-mer
259/// * `overlap` - Whether to generate overlapping k-mers
260///
261/// # Returns
262///
263/// A Result containing a vector of tuples, where each tuple contains:
264/// - A k-mer as a byte slice
265/// - A tuple of (start_position, end_position) indicating the k-mer's location in the original sequence
266///
267/// # Errors
268///
269/// Returns an error if:
270/// - `kmer_size` is 0
271/// - `kmer_size` is greater than the sequence length
272///
273/// # Example
274///
275/// ```
276/// use deepbiop_core::kmer::seq_to_kmers_and_offset;
277///
278/// let seq = b"ATCGA";
279/// let result = seq_to_kmers_and_offset(seq, 3, true).unwrap();
280/// assert_eq!(result.len(), 3);
281/// assert_eq!(result[0], (b"ATC".as_slice(), (0, 3)));
282/// assert_eq!(result[1], (b"TCG".as_slice(), (1, 4)));
283/// assert_eq!(result[2], (b"CGA".as_slice(), (2, 5)));
284/// ```
285#[allow(clippy::type_complexity)]
286pub fn seq_to_kmers_and_offset(
287    seq: &[u8],
288    kmer_size: usize,
289    overlap: bool,
290) -> Result<Vec<(&[u8], (usize, usize))>> {
291    // Check for invalid kmer_size
292    if kmer_size == 0 || kmer_size > seq.len() {
293        return Err(DPError::SeqShorterThanKmer.into());
294    }
295
296    if seq.is_empty() {
297        return Ok(Vec::new());
298    }
299
300    if overlap {
301        // Overlapping case: use .windows() with step of 1 (default behavior of .windows())
302        Ok(seq
303            .par_windows(kmer_size)
304            .enumerate()
305            .map(|(i, kmer)| (kmer, (i, i + kmer_size)))
306            .collect())
307    } else {
308        // Non-overlapping case: iterate with steps of kmer_size
309        Ok(seq
310            .par_chunks(kmer_size)
311            .enumerate()
312            .filter_map(|(i, chunk)| {
313                if chunk.len() == kmer_size {
314                    Some((chunk, (i * kmer_size, i * kmer_size + kmer_size)))
315                } else {
316                    // ignore the last chunk if it's shorter than kmer_size
317                    None
318                }
319            })
320            .collect())
321    }
322}
323
324/// Generate a lookup table mapping k-mers to unique IDs.
325///
326/// This function takes a slice of base characters and a k-mer length,
327/// and generates a HashMap mapping each possible k-mer to a unique integer ID.
328///
329/// # Arguments
330///
331/// * `base` - A slice containing the base characters to use (e.g. b"ATCG")
332/// * `k` - The length of k-mers to generate
333///
334/// # Returns
335///
336/// A HashMap mapping k-mer byte sequences to integer IDs
337///
338/// # Example
339///
340/// ```
341/// use deepbiop_core::kmer::generate_kmers_table;
342///
343/// let bases = b"AC";
344/// let k = 2;
345/// let table = generate_kmers_table(bases, k);
346/// assert_eq!(table.len(), 4); // AA, AC, CA, CC
347/// ```
348pub fn generate_kmers_table(base: &[u8], k: u8) -> Kmer2IdTable {
349    generate_kmers(base, k)
350        .into_par_iter()
351        .enumerate()
352        .map(|(id, kmer)| (kmer, id as Element))
353        .collect()
354}
355
356/// Generate all possible k-mers from a set of base characters.
357///
358/// This function takes a slice of base characters and a k-mer length,
359/// and generates all possible k-mer combinations of that length.
360///
361/// # Arguments
362///
363/// * `bases` - A slice containing the base characters to use (e.g. b"ATCG")
364/// * `k` - The length of k-mers to generate
365///
366/// # Returns
367///
368/// A vector containing all possible k-mer combinations as byte vectors
369///
370/// # Example
371///
372/// ```
373/// use deepbiop_core::kmer::generate_kmers;
374///
375/// let bases = b"AC";
376/// let k = 2;
377/// let kmers = generate_kmers(bases, 2);
378/// assert_eq!(kmers.len(), 4); // AA, AC, CA, CC
379/// ```
380pub fn generate_kmers(bases: &[u8], k: u8) -> Vec<Vec<u8>> {
381    // Convert u8 slice to char Vec directly where needed
382    (0..k)
383        .map(|_| bases.iter().map(|&c| c as char)) // Direct conversion to char iter
384        .multi_cartesian_product()
385        .map(|combo| combo.into_iter().map(|c| c as u8).collect::<Vec<_>>())
386        .collect::<Vec<_>>()
387}
388
389#[cfg(test)]
390mod tests {
391    use super::*;
392    use bio::utils::Interval;
393
394    #[test]
395    fn test_seq_to_kmers() {
396        let seq1 = b"ATCGT";
397        let k = 2;
398        let kmers = seq_to_kmers(seq1, k, true);
399        assert_eq!(kmers.len(), seq1.len() - k + 1);
400
401        let seq2 = b"AT";
402        let k = 3;
403        let kmers = seq_to_kmers(seq2, k, true);
404        println!("{:?}", kmers);
405        assert_eq!(kmers.len(), 0);
406    }
407
408    #[test]
409    fn test_generate_kmers() {
410        // Test case 1: bases = ['A', 'C', 'G', 'T'], k = 2
411        let bases1 = b"ACGT";
412        let k1 = 2;
413        let expected1 = vec![
414            "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", "GA", "GC", "GG", "GT", "TA", "TC",
415            "TG", "TT",
416        ]
417        .into_iter()
418        .map(|s| s.chars().map(|c| c as u8).collect::<Vec<_>>())
419        .collect::<Vec<_>>();
420
421        assert_eq!(generate_kmers(bases1, k1), expected1);
422
423        // Test case 2: bases = ['A', 'C'], k = 3
424        let bases2 = b"AC";
425        let k2 = 3;
426        let expected2 = vec!["AAA", "AAC", "ACA", "ACC", "CAA", "CAC", "CCA", "CCC"]
427            .into_iter()
428            .map(|s| s.chars().map(|c| c as u8).collect::<Vec<_>>())
429            .collect::<Vec<_>>();
430        assert_eq!(generate_kmers(bases2, k2), expected2);
431    }
432
433    #[test]
434    fn test_generate_kmers_table() {
435        let base = b"ACGT";
436        let k = 2;
437        let expected_table: Kmer2IdTable = [
438            ("AA", 0),
439            ("GC", 9),
440            ("GT", 11),
441            ("CA", 4),
442            ("TA", 12),
443            ("TC", 13),
444            ("CG", 6),
445            ("CT", 7),
446            ("GA", 8),
447            ("AG", 2),
448            ("AC", 1),
449            ("AT", 3),
450            ("CC", 5),
451            ("GG", 10),
452            ("TG", 14),
453            ("TT", 15),
454        ]
455        .iter()
456        .map(|&(kmer, id)| (kmer.chars().map(|c| c as u8).collect(), id))
457        .collect();
458
459        assert_eq!(generate_kmers_table(base, k), expected_table);
460    }
461
462    #[test]
463    fn test_generate_kmers_table_empty_base() {
464        use ahash::HashMap;
465        use ahash::HashMapExt;
466
467        let base = b"";
468        let k = 2;
469        let expected_table: Kmer2IdTable = HashMap::new();
470        assert_eq!(generate_kmers_table(base, k), expected_table);
471    }
472
473    #[test]
474    fn test_construct_seq_from_kmers() {
475        let k = 3;
476        let seq = b"AAACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT";
477        let kmers = seq_to_kmers(seq, k, true);
478        let kmers_as_bytes: Vec<&[u8]> = kmers.into_iter().collect();
479        let result = kmers_to_seq(kmers_as_bytes).unwrap();
480        assert_eq!(seq.to_vec(), result);
481    }
482
483    #[test]
484    fn test_update_target_region() {
485        let original_target: Interval<usize> = (2..6).into(); // Target region [2, 6)
486        let k = 3; // K-mer size
487        let new_target_region = to_kmer_target_region(&original_target, k, None).unwrap();
488        assert_eq!(new_target_region, 2..4);
489    }
490
491    #[test]
492    fn test_update_target_region_valid() {
493        let original_target = Interval::new(0..10).unwrap();
494        let k = 3;
495        let seq_len = Some(20);
496
497        let result = to_kmer_target_region(&original_target, k, seq_len);
498
499        assert!(result.is_ok());
500        let new_target = result.unwrap();
501
502        assert_eq!(new_target.start, original_target.start);
503        assert_eq!(new_target.end, original_target.start + 8);
504    }
505
506    #[test]
507    fn test_update_target_region_invalid_start_greater_than_end() {
508        let original_target = Interval::new(10..10).unwrap();
509        let k = 3;
510        let seq_len = Some(20);
511
512        let result = to_kmer_target_region(&original_target, k, seq_len);
513        assert!(result.is_err());
514
515        assert_eq!(
516            result.unwrap_err().to_string(),
517            DPError::TargetRegionInvalid.to_string()
518        );
519    }
520
521    #[test]
522    fn test_update_target_region_invalid_end_greater_than_seq_len() {
523        let original_target = Interval::new(0..25).unwrap();
524        let k = 3;
525        let seq_len = Some(20);
526
527        let result = to_kmer_target_region(&original_target, k, seq_len);
528
529        assert!(result.is_err());
530        assert_eq!(
531            result.unwrap_err().to_string(),
532            DPError::TargetRegionInvalid.to_string()
533        );
534    }
535
536    #[test]
537    fn test_to_original_target_region() {
538        // Test case 1: kmer_target.end > original_start
539        let kmer_target = 2..5;
540        let k = 3;
541        let expected = 2..7;
542
543        assert_eq!(
544            to_kmer_target_region(&expected, k, None).unwrap(),
545            kmer_target
546        );
547        assert_eq!(to_original_target_region(&kmer_target, k), expected);
548
549        // Test case 3: kmer_target.end == original_start
550        let kmer_target = 5..5;
551        let k = 3;
552        let expected = 5..5;
553        assert_eq!(to_original_target_region(&kmer_target, k), expected);
554    }
555
556    #[test]
557    fn test_seq_to_kmers_and_offset_overlap() {
558        let seq = b"ATCGATCGATCG";
559        let kmer_size = 4;
560        let overlap = true;
561        let result = seq_to_kmers_and_offset(seq, kmer_size, overlap).unwrap();
562        assert_eq!(result.len(), seq.len() - kmer_size + 1);
563        assert_eq!(result[0], (&b"ATCG"[..], (0, 4)));
564        assert_eq!(result[1], (&b"TCGA"[..], (1, 5)));
565        assert_eq!(result[result.len() - 1], (&b"ATCG"[..], (8, 12)));
566    }
567
568    #[test]
569    fn test_seq_to_kmers_and_offset_non_overlap() {
570        let seq = b"ATCGATCGATCG";
571        let kmer_size = 4;
572        let overlap = false;
573        let result = seq_to_kmers_and_offset(seq, kmer_size, overlap).unwrap();
574        assert_eq!(result.len(), seq.len() / kmer_size);
575        assert_eq!(result[0], (&b"ATCG"[..], (0, 4)));
576        assert_eq!(result[1], (&b"ATCG"[..], (4, 8)));
577    }
578}