deepbiop_core/kmer.rs
1use itertools::Itertools;
2use rayon::prelude::*;
3use std::ops::Range;
4
5use crate::{
6 error::DPError,
7 types::{Element, Id2KmerTable, Kmer2IdTable},
8};
9use anyhow::Error;
10use anyhow::Result;
11
12/// Convert a sequence of k-mer IDs back into a DNA sequence.
13///
14/// This function takes a slice of k-mer IDs and a lookup table mapping IDs to k-mers,
15/// and reconstructs the original DNA sequence by converting each ID back to its k-mer
16/// and joining them together.
17///
18/// # Arguments
19///
20/// * `kmer_ids` - A slice of integer IDs representing k-mers
21/// * `id2kmer_table` - A HashMap mapping k-mer IDs to their byte sequences
22///
23/// # Returns
24///
25/// The reconstructed DNA sequence as a vector of bytes, wrapped in a Result.
26/// Returns an error if any k-mer ID is not found in the lookup table.
27///
28/// # Errors
29///
30/// Returns `DPError::InvalidKmerId` if a k-mer ID is not found in the lookup table
31pub fn kmerids_to_seq(kmer_ids: &[Element], id2kmer_table: Id2KmerTable) -> Result<Vec<u8>> {
32 let result = kmer_ids
33 .par_iter()
34 .map(|&id| {
35 id2kmer_table
36 .get(&id)
37 .ok_or(Error::new(DPError::InvalidKmerId))
38 .map(|kmer| kmer.as_ref())
39 })
40 .collect::<Result<Vec<_>>>()?;
41
42 kmers_to_seq(result)
43}
44
45/// Convert a k-mer target region back to the original sequence target region.
46///
47/// This function takes a target region that was adjusted for k-mer calculations and converts it
48/// back to the corresponding region in the original sequence by reversing the k-mer adjustments.
49///
50/// # Arguments
51///
52/// * `kmer_target` - The target region adjusted for k-mers as a Range<usize>
53/// * `k` - The length of k-mers used
54///
55/// # Returns
56///
57/// A Range<usize> representing the target region in the original sequence
58///
59/// # Example
60///
61/// ```
62/// use std::ops::Range;
63/// use deepbiop_core::kmer::to_original_target_region;
64///
65/// let kmer_target = 0..3; // A k-mer target region
66/// let k = 4; // k-mer length
67/// let original_target = to_original_target_region(&kmer_target, k);
68/// assert_eq!(original_target, 0..6); // Original sequence region
69/// ```
70pub fn to_original_target_region(kmer_target: &Range<usize>, k: usize) -> Range<usize> {
71 // The start of the target region remains the same
72 let original_start = kmer_target.start;
73
74 // Attempt to reverse the end adjustment by adding k - 1, assuming the adjustment was due to k-mer calculation
75 let original_end = if kmer_target.end > original_start {
76 kmer_target.end + k - 1
77 } else {
78 kmer_target.end
79 };
80
81 original_start..original_end
82}
83
84/// Convert an original sequence target region to a k-mer target region.
85///
86/// This function takes a target region from the original sequence and converts it to the corresponding
87/// region for k-mer calculations by adjusting for k-mer length and sequence boundaries.
88///
89/// # Arguments
90///
91/// * `original_target` - The target region in the original sequence as a Range<usize>
92/// * `k` - The length of k-mers to use
93/// * `seq_len` - Optional sequence length to validate target region bounds
94///
95/// # Returns
96///
97/// A Result containing a Range<usize> representing the adjusted k-mer target region
98///
99/// # Errors
100///
101/// Returns `DPError::TargetRegionInvalid` if:
102/// - The target region is invalid (start >= end)
103/// - k is 0
104/// - The target region extends beyond sequence length (if seq_len provided)
105///
106/// # Example
107///
108/// ```
109/// use std::ops::Range;
110/// use deepbiop_core::kmer::to_kmer_target_region;
111///
112/// let original_target = 0..6; // Original sequence region
113/// let k = 4; // k-mer length
114/// let kmer_target = to_kmer_target_region(&original_target, k, Some(10)).unwrap();
115/// assert_eq!(kmer_target, 0..3); // Adjusted k-mer region
116/// ```
117pub fn to_kmer_target_region(
118 original_target: &Range<usize>,
119 k: usize,
120 seq_len: Option<usize>,
121) -> Result<Range<usize>> {
122 if original_target.start >= original_target.end || k == 0 {
123 return Err(Error::from(DPError::TargetRegionInvalid));
124 }
125
126 if let Some(seq_len) = seq_len {
127 // Ensure the target region is valid.
128 if original_target.end > seq_len {
129 return Err(Error::new(DPError::TargetRegionInvalid));
130 }
131 }
132
133 // Calculate how many k-mers can be formed starting within the original target region.
134 let num_kmers_in_target = if original_target.end - original_target.start >= k {
135 original_target.end - original_target.start - k + 1
136 } else {
137 0
138 };
139
140 // The new target region starts at the same position as the original target region.
141 let new_start = original_target.start;
142
143 // The end of the new target region needs to be adjusted based on the number of k-mers.
144 // It is the start position of the last k-mer that can be formed within the original target region.
145 let new_end = if num_kmers_in_target > 0 {
146 new_start + num_kmers_in_target
147 } else {
148 original_target.end
149 };
150
151 Ok(new_start..new_end)
152}
153
154/// Convert a DNA sequence into k-mers.
155///
156/// This function takes a DNA sequence and splits it into k-mers of specified length.
157/// The k-mers can be either overlapping or non-overlapping based on the `overlap` parameter.
158///
159/// # Arguments
160///
161/// * `seq` - A byte slice containing the DNA sequence
162/// * `k` - The length of each k-mer
163/// * `overlap` - Whether to generate overlapping k-mers
164///
165/// # Returns
166///
167/// A vector of k-mer byte slices
168///
169/// # Example
170///
171/// ```
172/// use deepbiop_core::kmer::seq_to_kmers;
173///
174/// let seq = b"ATCGATCG";
175///
176/// // Overlapping k-mers
177/// let kmers = seq_to_kmers(seq, 3, true);
178/// assert_eq!(kmers, vec![b"ATC", b"TCG", b"CGA", b"GAT", b"ATC", b"TCG"]);
179///
180/// // Non-overlapping k-mers
181/// let kmers = seq_to_kmers(seq, 3, false);
182/// assert_eq!(kmers, vec![b"ATC".as_slice(), b"GAT".as_slice(), b"CG".as_slice()]);
183/// ```
184pub fn seq_to_kmers(seq: &[u8], k: usize, overlap: bool) -> Vec<&[u8]> {
185 if overlap {
186 seq.par_windows(k).collect()
187 } else {
188 seq.par_chunks(k).collect()
189 }
190}
191/// Convert k-mers back into a DNA sequence.
192///
193/// This function takes a vector of k-mers and reconstructs the original DNA sequence.
194/// The k-mers are assumed to be in order and overlapping by k-1 bases.
195///
196/// # Arguments
197///
198/// * `kmers` - A vector of k-mer byte slices
199///
200/// # Returns
201///
202/// A Result containing the reconstructed DNA sequence as a byte vector
203///
204/// # Errors
205///
206/// Returns an error if any k-mer is invalid (empty)
207///
208/// # Example
209///
210/// ```
211/// use deepbiop_core::kmer::kmers_to_seq;
212///
213/// let kmers = vec![b"ATC".as_slice(), b"TCG".as_slice(), b"CGA".as_slice()];
214/// let seq = kmers_to_seq(kmers).unwrap();
215/// assert_eq!(seq, b"ATCGA");
216/// ```
217pub fn kmers_to_seq(kmers: Vec<&[u8]>) -> Result<Vec<u8>> {
218 // Early return for empty input
219 if kmers.is_empty() {
220 return Ok(Vec::new());
221 }
222
223 // Validate first kmer
224 let first_kmer = kmers[0];
225 if first_kmer.is_empty() {
226 return Err(DPError::InvalidKmerId.into());
227 }
228
229 // Initialize result with first kmer
230 let mut result = Vec::with_capacity(first_kmer.len() + kmers.len() - 1);
231 result.extend_from_slice(first_kmer);
232
233 // Process remaining kmers in parallel
234 let remaining_bases: Result<Vec<u8>> = kmers
235 .into_par_iter()
236 .skip(1)
237 .map(|kmer| {
238 kmer.last()
239 .ok_or_else(|| DPError::InvalidKmerId.into())
240 .copied()
241 })
242 .collect();
243
244 // Extend result with remaining bases
245 result.extend(remaining_bases?);
246
247 Ok(result)
248}
249
250/// Convert a DNA sequence into k-mers with their positions in the original sequence.
251///
252/// This function takes a DNA sequence and splits it into k-mers of specified length,
253/// returning both the k-mers and their start/end positions in the original sequence.
254///
255/// # Arguments
256///
257/// * `seq` - A DNA sequence as a byte slice
258/// * `kmer_size` - The length of each k-mer
259/// * `overlap` - Whether to generate overlapping k-mers
260///
261/// # Returns
262///
263/// A Result containing a vector of tuples, where each tuple contains:
264/// - A k-mer as a byte slice
265/// - A tuple of (start_position, end_position) indicating the k-mer's location in the original sequence
266///
267/// # Errors
268///
269/// Returns an error if:
270/// - `kmer_size` is 0
271/// - `kmer_size` is greater than the sequence length
272///
273/// # Example
274///
275/// ```
276/// use deepbiop_core::kmer::seq_to_kmers_and_offset;
277///
278/// let seq = b"ATCGA";
279/// let result = seq_to_kmers_and_offset(seq, 3, true).unwrap();
280/// assert_eq!(result.len(), 3);
281/// assert_eq!(result[0], (b"ATC".as_slice(), (0, 3)));
282/// assert_eq!(result[1], (b"TCG".as_slice(), (1, 4)));
283/// assert_eq!(result[2], (b"CGA".as_slice(), (2, 5)));
284/// ```
285#[allow(clippy::type_complexity)]
286pub fn seq_to_kmers_and_offset(
287 seq: &[u8],
288 kmer_size: usize,
289 overlap: bool,
290) -> Result<Vec<(&[u8], (usize, usize))>> {
291 // Check for invalid kmer_size
292 if kmer_size == 0 || kmer_size > seq.len() {
293 return Err(DPError::SeqShorterThanKmer.into());
294 }
295
296 if seq.is_empty() {
297 return Ok(Vec::new());
298 }
299
300 if overlap {
301 // Overlapping case: use .windows() with step of 1 (default behavior of .windows())
302 Ok(seq
303 .par_windows(kmer_size)
304 .enumerate()
305 .map(|(i, kmer)| (kmer, (i, i + kmer_size)))
306 .collect())
307 } else {
308 // Non-overlapping case: iterate with steps of kmer_size
309 Ok(seq
310 .par_chunks(kmer_size)
311 .enumerate()
312 .filter_map(|(i, chunk)| {
313 if chunk.len() == kmer_size {
314 Some((chunk, (i * kmer_size, i * kmer_size + kmer_size)))
315 } else {
316 // ignore the last chunk if it's shorter than kmer_size
317 None
318 }
319 })
320 .collect())
321 }
322}
323
324/// Generate a lookup table mapping k-mers to unique IDs.
325///
326/// This function takes a slice of base characters and a k-mer length,
327/// and generates a HashMap mapping each possible k-mer to a unique integer ID.
328///
329/// # Arguments
330///
331/// * `base` - A slice containing the base characters to use (e.g. b"ATCG")
332/// * `k` - The length of k-mers to generate
333///
334/// # Returns
335///
336/// A HashMap mapping k-mer byte sequences to integer IDs
337///
338/// # Example
339///
340/// ```
341/// use deepbiop_core::kmer::generate_kmers_table;
342///
343/// let bases = b"AC";
344/// let k = 2;
345/// let table = generate_kmers_table(bases, k);
346/// assert_eq!(table.len(), 4); // AA, AC, CA, CC
347/// ```
348pub fn generate_kmers_table(base: &[u8], k: u8) -> Kmer2IdTable {
349 generate_kmers(base, k)
350 .into_par_iter()
351 .enumerate()
352 .map(|(id, kmer)| (kmer, id as Element))
353 .collect()
354}
355
356/// Generate all possible k-mers from a set of base characters.
357///
358/// This function takes a slice of base characters and a k-mer length,
359/// and generates all possible k-mer combinations of that length.
360///
361/// # Arguments
362///
363/// * `bases` - A slice containing the base characters to use (e.g. b"ATCG")
364/// * `k` - The length of k-mers to generate
365///
366/// # Returns
367///
368/// A vector containing all possible k-mer combinations as byte vectors
369///
370/// # Example
371///
372/// ```
373/// use deepbiop_core::kmer::generate_kmers;
374///
375/// let bases = b"AC";
376/// let k = 2;
377/// let kmers = generate_kmers(bases, 2);
378/// assert_eq!(kmers.len(), 4); // AA, AC, CA, CC
379/// ```
380pub fn generate_kmers(bases: &[u8], k: u8) -> Vec<Vec<u8>> {
381 // Convert u8 slice to char Vec directly where needed
382 (0..k)
383 .map(|_| bases.iter().map(|&c| c as char)) // Direct conversion to char iter
384 .multi_cartesian_product()
385 .map(|combo| combo.into_iter().map(|c| c as u8).collect::<Vec<_>>())
386 .collect::<Vec<_>>()
387}
388
389#[cfg(test)]
390mod tests {
391 use super::*;
392 use bio::utils::Interval;
393
394 #[test]
395 fn test_seq_to_kmers() {
396 let seq1 = b"ATCGT";
397 let k = 2;
398 let kmers = seq_to_kmers(seq1, k, true);
399 assert_eq!(kmers.len(), seq1.len() - k + 1);
400
401 let seq2 = b"AT";
402 let k = 3;
403 let kmers = seq_to_kmers(seq2, k, true);
404 println!("{:?}", kmers);
405 assert_eq!(kmers.len(), 0);
406 }
407
408 #[test]
409 fn test_generate_kmers() {
410 // Test case 1: bases = ['A', 'C', 'G', 'T'], k = 2
411 let bases1 = b"ACGT";
412 let k1 = 2;
413 let expected1 = vec![
414 "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", "GA", "GC", "GG", "GT", "TA", "TC",
415 "TG", "TT",
416 ]
417 .into_iter()
418 .map(|s| s.chars().map(|c| c as u8).collect::<Vec<_>>())
419 .collect::<Vec<_>>();
420
421 assert_eq!(generate_kmers(bases1, k1), expected1);
422
423 // Test case 2: bases = ['A', 'C'], k = 3
424 let bases2 = b"AC";
425 let k2 = 3;
426 let expected2 = vec!["AAA", "AAC", "ACA", "ACC", "CAA", "CAC", "CCA", "CCC"]
427 .into_iter()
428 .map(|s| s.chars().map(|c| c as u8).collect::<Vec<_>>())
429 .collect::<Vec<_>>();
430 assert_eq!(generate_kmers(bases2, k2), expected2);
431 }
432
433 #[test]
434 fn test_generate_kmers_table() {
435 let base = b"ACGT";
436 let k = 2;
437 let expected_table: Kmer2IdTable = [
438 ("AA", 0),
439 ("GC", 9),
440 ("GT", 11),
441 ("CA", 4),
442 ("TA", 12),
443 ("TC", 13),
444 ("CG", 6),
445 ("CT", 7),
446 ("GA", 8),
447 ("AG", 2),
448 ("AC", 1),
449 ("AT", 3),
450 ("CC", 5),
451 ("GG", 10),
452 ("TG", 14),
453 ("TT", 15),
454 ]
455 .iter()
456 .map(|&(kmer, id)| (kmer.chars().map(|c| c as u8).collect(), id))
457 .collect();
458
459 assert_eq!(generate_kmers_table(base, k), expected_table);
460 }
461
462 #[test]
463 fn test_generate_kmers_table_empty_base() {
464 use ahash::HashMap;
465 use ahash::HashMapExt;
466
467 let base = b"";
468 let k = 2;
469 let expected_table: Kmer2IdTable = HashMap::new();
470 assert_eq!(generate_kmers_table(base, k), expected_table);
471 }
472
473 #[test]
474 fn test_construct_seq_from_kmers() {
475 let k = 3;
476 let seq = b"AAACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT";
477 let kmers = seq_to_kmers(seq, k, true);
478 let kmers_as_bytes: Vec<&[u8]> = kmers.into_iter().collect();
479 let result = kmers_to_seq(kmers_as_bytes).unwrap();
480 assert_eq!(seq.to_vec(), result);
481 }
482
483 #[test]
484 fn test_update_target_region() {
485 let original_target: Interval<usize> = (2..6).into(); // Target region [2, 6)
486 let k = 3; // K-mer size
487 let new_target_region = to_kmer_target_region(&original_target, k, None).unwrap();
488 assert_eq!(new_target_region, 2..4);
489 }
490
491 #[test]
492 fn test_update_target_region_valid() {
493 let original_target = Interval::new(0..10).unwrap();
494 let k = 3;
495 let seq_len = Some(20);
496
497 let result = to_kmer_target_region(&original_target, k, seq_len);
498
499 assert!(result.is_ok());
500 let new_target = result.unwrap();
501
502 assert_eq!(new_target.start, original_target.start);
503 assert_eq!(new_target.end, original_target.start + 8);
504 }
505
506 #[test]
507 fn test_update_target_region_invalid_start_greater_than_end() {
508 let original_target = Interval::new(10..10).unwrap();
509 let k = 3;
510 let seq_len = Some(20);
511
512 let result = to_kmer_target_region(&original_target, k, seq_len);
513 assert!(result.is_err());
514
515 assert_eq!(
516 result.unwrap_err().to_string(),
517 DPError::TargetRegionInvalid.to_string()
518 );
519 }
520
521 #[test]
522 fn test_update_target_region_invalid_end_greater_than_seq_len() {
523 let original_target = Interval::new(0..25).unwrap();
524 let k = 3;
525 let seq_len = Some(20);
526
527 let result = to_kmer_target_region(&original_target, k, seq_len);
528
529 assert!(result.is_err());
530 assert_eq!(
531 result.unwrap_err().to_string(),
532 DPError::TargetRegionInvalid.to_string()
533 );
534 }
535
536 #[test]
537 fn test_to_original_target_region() {
538 // Test case 1: kmer_target.end > original_start
539 let kmer_target = 2..5;
540 let k = 3;
541 let expected = 2..7;
542
543 assert_eq!(
544 to_kmer_target_region(&expected, k, None).unwrap(),
545 kmer_target
546 );
547 assert_eq!(to_original_target_region(&kmer_target, k), expected);
548
549 // Test case 3: kmer_target.end == original_start
550 let kmer_target = 5..5;
551 let k = 3;
552 let expected = 5..5;
553 assert_eq!(to_original_target_region(&kmer_target, k), expected);
554 }
555
556 #[test]
557 fn test_seq_to_kmers_and_offset_overlap() {
558 let seq = b"ATCGATCGATCG";
559 let kmer_size = 4;
560 let overlap = true;
561 let result = seq_to_kmers_and_offset(seq, kmer_size, overlap).unwrap();
562 assert_eq!(result.len(), seq.len() - kmer_size + 1);
563 assert_eq!(result[0], (&b"ATCG"[..], (0, 4)));
564 assert_eq!(result[1], (&b"TCGA"[..], (1, 5)));
565 assert_eq!(result[result.len() - 1], (&b"ATCG"[..], (8, 12)));
566 }
567
568 #[test]
569 fn test_seq_to_kmers_and_offset_non_overlap() {
570 let seq = b"ATCGATCGATCG";
571 let kmer_size = 4;
572 let overlap = false;
573 let result = seq_to_kmers_and_offset(seq, kmer_size, overlap).unwrap();
574 assert_eq!(result.len(), seq.len() / kmer_size);
575 assert_eq!(result[0], (&b"ATCG"[..], (0, 4)));
576 assert_eq!(result[1], (&b"ATCG"[..], (4, 8)));
577 }
578}