readfish_tools/paf.rs
1//! Paf file functions
2//! In this module we implement a Paf struct and functions to read and write Paf files.
3//! A lot of this was lifted from https://github.com/mrvollger/rustybam/blob/main/src/paf.rs
4//!
5
6use crate::{
7 readfish::Conf,
8 readfish_io::{reader, DynResult},
9 sequencing_summary::SeqSum,
10 Summary,
11};
12use lazy_static::lazy_static;
13use regex::Regex;
14use std::{
15 io::BufRead,
16 path::{Path, PathBuf},
17};
18
19lazy_static! {
20 static ref PAF_TAG: Regex = Regex::new("(..):(.):(.*)").unwrap();
21}
22
23/// Store metadata that is provided by a tuple in a call to parse_paf_by_iter in lib.rs.
24/// See also `[sequencing_summary::SeqSumInfo]`.
25#[derive(Debug)]
26pub struct Metadata {
27 /// The identifier for the read.
28 pub read_id: String,
29 /// The channel associated with the read.
30 pub channel: usize,
31 /// An optional barcode associated with the read, if available.
32 pub barcode: Option<String>,
33}
34
35impl From<(String, usize, Option<String>)> for Metadata {
36 fn from(value: (String, usize, Option<String>)) -> Self {
37 Metadata {
38 read_id: value.0,
39 channel: value.1,
40 barcode: value.2,
41 }
42 }
43}
44
45impl Metadata {
46 /// Get the identifier for the read.
47 pub fn read_id(&self) -> &String {
48 &self.read_id
49 }
50
51 /// Get the channel associated with the read.
52 pub fn channel(&self) -> usize {
53 self.channel
54 }
55
56 /// Get the optional barcode associated with the read, if available.
57 pub fn barcode(&self) -> Option<&String> {
58 self.barcode.as_ref()
59 }
60}
61
62/// Store a PafRecord for quick unpacking to update the summary
63#[derive(Debug, Clone)]
64pub struct PafRecord {
65 /// The name of the query sequence (read).
66 pub query_name: String,
67 /// The length of the query sequence (read).
68 pub query_length: usize,
69 /// The start position of the alignment on the query sequence (read).
70 pub query_start: usize,
71 /// The end position of the alignment on the query sequence (read).
72 pub query_end: usize,
73 /// The strand of the alignment ('+' or '-').
74 pub strand: char,
75 /// The name of the target sequence (reference).
76 pub target_name: String,
77 /// The length of the target sequence (reference).
78 pub target_length: usize,
79 /// The start position of the alignment on the target sequence (reference).
80 pub target_start: usize,
81 /// The end position of the alignment on the target sequence (reference).
82 pub target_end: usize,
83 /// The number of matching bases in the alignment.
84 pub nmatch: usize,
85 /// The total length of the alignment.
86 pub aln_len: usize,
87 /// The mapping quality of the alignment.
88 pub mapq: usize,
89 // pub cigar: CigarString,
90 // A vector of additional tags associated with the alignment.
91 // pub tags: Vec<String>,
92 // pub tpos_aln: Vec<u64>,
93 // pub qpos_aln: Vec<u64>,
94 // pub long_cigar: CigarString,
95 // pub id: String,
96 // pub order: u64,
97 // pub contained: bool,
98}
99/// Errors that can occur while parsing PAF (Pairwise mApping Format) files.
100#[derive(Debug)]
101pub enum Error {
102 /// An error occurred while parsing the CIGAR string in the PAF record.
103 PafParseCigar {
104 /// The error message.
105 msg: String,
106 },
107 /// An error occurred while parsing the CS (Coordinate System) tag in the PAF record.
108 PafParseCS {
109 /// The error message.
110 msg: String,
111 },
112 /// An error occurred while parsing an integer value in the PAF record.
113 ParseIntError {
114 /// The error message.
115 msg: String,
116 },
117 /// An error occurred while parsing a column in the PAF record.
118 ParsePafColumn {},
119}
120
121/// A type alias for a Result with the error type specialized to `crate::paf::Error`.
122type PafResult<T> = Result<T, crate::paf::Error>;
123
124impl PafRecord {
125 /// New paf record
126 pub fn new(t: Vec<&str>) -> PafResult<PafRecord> {
127 // make the record
128 let rec = PafRecord {
129 query_name: t[0].to_string(),
130 query_length: t[1]
131 .parse::<usize>()
132 .map_err(|_| Error::ParsePafColumn {})?,
133 query_start: t[2]
134 .parse::<usize>()
135 .map_err(|_| Error::ParsePafColumn {})?,
136 query_end: t[3]
137 .parse::<usize>()
138 .map_err(|_| Error::ParsePafColumn {})?,
139 strand: t[4].parse::<char>().map_err(|_| Error::ParsePafColumn {})?,
140 target_name: t[5].to_string(),
141 target_length: t[6]
142 .parse::<usize>()
143 .map_err(|_| Error::ParsePafColumn {})?,
144 target_start: t[7]
145 .parse::<usize>()
146 .map_err(|_| Error::ParsePafColumn {})?,
147 target_end: t[8]
148 .parse::<usize>()
149 .map_err(|_| Error::ParsePafColumn {})?,
150 nmatch: t[9]
151 .parse::<usize>()
152 .map_err(|_| Error::ParsePafColumn {})?,
153 aln_len: t[10]
154 .parse::<usize>()
155 .map_err(|_| Error::ParsePafColumn {})?,
156 mapq: t[11]
157 .parse::<usize>()
158 .map_err(|_| Error::ParsePafColumn {})?,
159 };
160 Ok(rec)
161 }
162}
163
164/// A struct representing a PAF record reader and writers for demultiplexing.
165///
166/// This struct holds a reader and a list of writers used for demultiplexing PAF records
167/// into different files. The `reader` field is a `Box<dyn BufRead + Send>` representing a
168/// buffered input reader from which PAF records are read. The `writers` field is a `Vec<Box<dyn Write>>`
169/// holding multiple output writers for writing the demultiplexed PAF records into different files.
170///
171/// # Fields
172///
173/// * `reader`: A boxed trait object implementing `BufRead` and `Send`, used as the input reader
174/// for reading PAF records.
175/// * `writers`: A vector of boxed trait objects implementing `Write`, used as the output writers
176/// for writing the demultiplexed PAF records into different files.
177/// * `paf_file`: The path to the PAF file.
178///
179/// # Examples
180///
181/// ```rust, ignore
182/// use std::fs::File;
183/// use std::io::{BufReader, BufWriter};
184/// use std::path::Path;
185///
186/// // Create a reader for the PAF file
187/// let file_path = Path::new("example.paf");
188/// let file = File::open(file_path).expect("Error: Failed to open file");
189/// let reader = Box::new(BufReader::new(file));
190///
191/// // Create multiple writers for demultiplexing the PAF records
192/// let writer1 = Box::new(BufWriter::new(File::create("output1.paf").unwrap()));
193/// let writer2 = Box::new(BufWriter::new(File::create("output2.paf").unwrap()));
194/// let writers = vec![writer1, writer2];
195///
196/// // Create a PAF object
197/// let paf = Paf { reader, writers };
198/// ```
199///
200pub struct Paf {
201 /// The provided PAF file.
202 pub paf_file: PathBuf,
203 /// Reader for the Paf file.
204 pub reader: Box<dyn BufRead + Send>,
205 // / Multiple writes, one for each demultiplexed file.
206 // pub writers: Vec<Box<dyn Write>>,
207}
208
209impl Paf {
210 /// Create a new `Paf` object with the given PAF file.
211 ///
212 /// This function creates a new `Paf` object by parsing the specified PAF file
213 /// and initializing the `reader` field with the resulting buffered input reader.
214 /// The `writers` field is initialized as an empty vector of output writers.
215 ///
216 /// # Arguments
217 ///
218 /// * `paf_file`: An implementation of the `AsRef<Path>` trait representing the path to the PAF file.
219 ///
220 /// # Returns
221 ///
222 /// A new `Paf` object with the parsed PAF file as the input reader and an empty vector of writers.
223 ///
224 /// # Panics
225 ///
226 /// This function will panic if there is an error while parsing the PAF file or creating the buffered input reader.
227 ///
228 /// # Examples
229 ///
230 /// ```rust,ignore
231 /// use std::path::Path;
232 /// use readfish_tools::Paf;
233 ///
234 /// // Create a new Paf object from the "example.paf" file
235 /// let paf_file_path = Path::new("example.paf");
236 /// let paf = Paf::new(paf_file_path);
237 /// ```
238 ///
239 pub fn new(paf_file: impl AsRef<Path>) -> Paf {
240 Paf {
241 paf_file: paf_file.as_ref().to_path_buf(),
242 reader: open_paf_for_reading(paf_file).unwrap(),
243 // writers: vec![],
244 }
245 }
246
247 /// Demultiplexes the PAF file by processing each line and obtaining corresponding sequencing summary records.
248 ///
249 /// This function reads the PAF file line by line, parses each line, and processes the custom tags present in the PAF format.
250 /// These custom tags are add by readfish's implementation summarise on the Aligner.
251 /// If the `sequencing_summary` argument is provided, it retrieves the sequencing summary record for each line's query name.
252 /// The function processes custom tags in the PAF file and ensures they are present. If `sequencing_summary` is None and custom tags are missing,
253 /// the function will panic.
254 ///
255 /// If `sequencing_summary` is provided, the function retrieves the sequencing summary record for each query name using the `get_record` function.
256 /// If a sequencing summary record is not found in the buffer, the function reads from the sequencing summary file until the record is found.
257 /// The function consumes the bytes in the PAF file and updates the `previous_read_id` to avoid removing multiple mappings from the `sequencing_summary`
258 /// only when the new Read Id is not the same as the old read_id.
259 ///
260 /// # Arguments
261 ///
262 /// - `toml`: A reference to the `Conf` struct, which contains configuration settings.
263 /// - `sequencing_summary`: An optional mutable reference to the `SeqSum` struct, representing the sequencing summary file.
264 ///
265 /// # Errors
266 ///
267 /// This function returns a `DynResult`, which is a specialized `Result` type with an error message.
268 /// An error is returned if there is any issue reading the PAF file or if the sequencing summary file is not found.
269 ///
270 /// # Examples
271 ///
272 /// ```rust,ignore
273 /// // Import necessary libraries
274 /// use std::error::Error;
275 /// use my_crate::{SeqSum, Conf};
276 ///
277 /// // Create a new sequencing summary instance
278 /// let mut sequencing_summary = SeqSum::from_file("path/to/sequencing_summary.toml")?;
279 ///
280 /// // Load the TOML configuration
281 /// let toml = Conf::from_file("path/to/config.toml")?;
282 ///
283 /// // Demultiplex the PAF file using the sequencing summary
284 /// sequencing_summary.demultiplex(&toml, Some(&mut sequencing_summary))?;
285 /// ```
286 pub fn demultiplex(
287 &mut self,
288 _toml: &mut Conf,
289 sequencing_summary: Option<&mut SeqSum>,
290 mut summary: Option<&mut Summary>,
291 ) -> DynResult<()> {
292 let seq_sum = sequencing_summary.unwrap();
293
294 // Remove multiple mappings from seq_sum dictionary only when the new Read Id is not the same as the old read_id
295 for line in open_paf_for_reading(self.paf_file.clone())?.lines() {
296 let (paf_record, read_on, condition_name) =
297 _parse_paf_line(line?, _toml, None, Some(seq_sum))?;
298
299 if let Some(summary) = summary.as_deref_mut() {
300 let condition_summary = summary.conditions(condition_name.as_str());
301 condition_summary.update(paf_record, read_on).unwrap();
302 }
303 }
304 Ok(())
305 }
306}
307
308/// Parses the PAF file and returns a buffered reader for further processing.
309///
310/// This function takes the `file_name` as an input and returns a `Result` containing
311/// a boxed buffered reader (`Box<dyn BufRead + Send>`) if the PAF file is valid.
312///
313/// # Arguments
314///
315/// * `file_name`: The path to the PAF file to be parsed. It should implement the `AsRef<Path>` trait.
316///
317/// # Returns
318///
319/// A `Result` containing the buffered reader (`Box<dyn BufRead + Send>`) if the PAF file is valid,
320/// otherwise an `Err` containing an error message.
321///
322/// # Errors
323///
324/// The function returns an `Err` in the following cases:
325/// - If the file is empty (contains no bytes), it returns an error with the message "Error: empty file".
326/// - If the file format is invalid, specifically if any of the first twelve columns contains a ':' character,
327/// it returns an error with the message "Error: invalid format for PAF file. Missing one of first twelve columns, or values contain a :."
328/// - If there are any I/O errors while reading the file, the function returns an error with the specific I/O error message.
329///
330/// # Example
331///
332/// ```rust,ignore
333/// use std::path::Path;
334/// use std::io::BufRead;
335///
336/// let file_name = Path::new("example.paf");
337/// match open_paf_for_reading(file_name) {
338/// Ok(reader) => {
339/// let mut lines = reader.lines();
340/// // Process the PAF file line by line
341/// for line in lines {
342/// match line {
343/// Ok(line_content) => {
344/// // Process the content of the PAF line
345/// // ...
346/// }
347/// Err(error) => {
348/// eprintln!("Error while reading PAF line: {}", error);
349/// }
350/// }
351/// }
352/// }
353/// Err(error) => {
354/// eprintln!("Error while parsing PAF file: {}", error);
355/// }
356/// }
357/// ```
358pub fn open_paf_for_reading(file_name: impl AsRef<Path>) -> DynResult<Box<dyn BufRead + Send>> {
359 // create reader to check file first line
360 let mut paf_file = reader(&file_name, None);
361
362 // Check the file isn't empty
363 let mut buffer = [0; 1];
364 let bytes_read = paf_file.read(&mut buffer)?;
365 if bytes_read == 0 {
366 return Err("Error: empty file".into());
367 }
368 let mut line = String::new();
369 paf_file.read_line(&mut line)?;
370 let t: Vec<&str> = line.split_ascii_whitespace().collect();
371 if t.iter().take(12).any(|item| item.contains(':')) {
372 return Err("Error: invalid format for Paf file. Missing one of first twelve columns, or values contain a :.".into());
373 }
374
375 let paf_file = reader(file_name, None);
376
377 Ok(paf_file)
378}
379
380/// Parses a line from the PAF file and extracts relevant information.
381///
382/// This function takes a PAF line (as a reference to a string) and attempts to parse it to extract
383/// relevant information, including creating a [`PafRecord`] and making decisions based on the provided
384/// metadata or sequencing summary. It returns a tuple containing the `PafRecord`, a boolean value
385/// indicating if the read is considered "on-target", and the condition name associated with the read.
386///
387/// # Arguments
388///
389/// * `paf_line`: A reference to a string slice representing a line from the PAF file.
390/// * `_toml`: A reference to a `Conf` struct, holding configuration information.
391/// * `meta_data`: An optional mutable reference to a `Metadata` struct containing read metadata.
392/// * `sequencing_summary`: An optional mutable reference to a `SeqSum` struct containing sequencing summary data.
393///
394/// # Returns
395///
396/// A `DynResult` holding a tuple containing the following elements:
397/// * `PafRecord`: The parsed PAF record representing the alignment information.
398/// * `bool`: A boolean value indicating if the read is considered "on-target".
399/// * `&'a String`: A reference to the condition name associated with the read.
400///
401/// # Panics
402///
403/// This function panics if the PAF line contains missing items in the first 12 columns or if both `meta_data`
404/// and `sequencing_summary` are `None`.
405///
406/// # Examples
407///
408/// ```rust, ignore
409/// # use your_crate::{PafRecord, Metadata, SeqSum, _parse_paf_line};
410///
411/// // Assuming we have valid inputs
412/// let paf_line = "read123 200 0 200 + contig123 300 0 300 200 200 50 ch=1";
413/// let _toml = Conf::default();
414/// let mut metadata = Metadata {
415/// read_id: "read123".to_string(),
416/// channel: 1,
417/// barcode: Some("sampleA".to_string()),
418/// };
419/// let mut seq_sum = SeqSum::default();
420///
421/// let result = _parse_paf_line(paf_line, &_toml, Some(&mut metadata), Some(&mut seq_sum));
422///
423/// match result {
424/// Ok((paf_record, read_on, condition_name)) => {
425/// // Do something with the parsed data
426/// }
427/// Err(err) => {
428/// // Handle the error
429/// }
430/// }
431/// ```
432pub fn _parse_paf_line<'a>(
433 paf_line: impl AsRef<str>,
434 _toml: &'a Conf,
435 meta_data: Option<&mut Metadata>,
436 sequencing_summary: Option<&mut SeqSum>,
437) -> DynResult<(PafRecord, bool, &'a String)> {
438 let line = paf_line.as_ref();
439 let t: Vec<&str> = line.split_ascii_whitespace().collect();
440 // Todo do without clone
441 let paf_record = PafRecord::new(t.clone()).unwrap();
442 // Check first 12 columns for missing items, assumes tags will have been brought forwards
443 assert!(
444 t.iter().take(12).all(|item| !item.contains(':')),
445 "Missing colon in PAF line: {}",
446 line
447 );
448 // check if we have custom tags from readfish aligner analyse
449 let channel: usize;
450 let barcode: Option<String>;
451 // for token in t.iter().skip(12) {
452 // debug_assert!(PAF_TAG.is_match(token));
453 // let caps = PAF_TAG.captures(token).unwrap();
454 // let tag = &caps[1];
455 // // let value = &caps[3];
456 // if (tag == "ch") | (tag == "ba") {
457 // has_tags = true;
458 // }
459 // }
460 // Break the Paf line into its components
461 let query_name = t[0];
462 // let query_length: usize = t[1].parse()?;
463 let strand = t[4];
464 let contig = t[5];
465 // let contig_length: usize = t[6].parse()?;
466 let mapping_start: usize = t[7].parse()?;
467 let read_on: bool;
468 if meta_data.is_none() & sequencing_summary.is_none() {
469 panic!("Cannot parse paf line without provided metdata or sequencing summary_file");
470 }
471 // If sequencing summary is provided, get the sequencing summary record for the query name
472 // Use it for things like barcodes and channels
473 if let Some(seq_sum_struct) = sequencing_summary {
474 let seq_sum_record = seq_sum_struct.get_record(query_name, None);
475 if let Ok(record) = seq_sum_record {
476 read_on = _toml.make_decision(
477 record.1.get_channel().unwrap(),
478 record.2.get_barcode().map(|x| x.as_str()),
479 contig,
480 strand,
481 mapping_start,
482 );
483 channel = record.1.get_channel().unwrap();
484 barcode = Some(record.2.get_barcode().unwrap_or(&"".to_string()).clone());
485 } else {
486 return Err("Error: sequencing summary record not found".into());
487 }
488 seq_sum_struct.previous_read_id = query_name.to_string();
489 // We must have metatdata
490 } else {
491 let metadata = meta_data.unwrap();
492 // println!("{contig}, {strand}, {mapping_start}");
493 read_on = _toml.make_decision(
494 metadata.channel(),
495 metadata.barcode().map(|x| x.as_str()),
496 contig,
497 strand,
498 mapping_start,
499 );
500 channel = metadata.channel();
501 barcode = Some(metadata.barcode().unwrap_or(&"".to_string()).clone());
502 }
503 // get the condition so we can access name etc.
504 let (_control, condition) = _toml.get_conditions(channel, barcode)?;
505 let condition = condition.get_condition();
506 let condition_name = &condition.name;
507
508 Ok((paf_record, read_on, condition_name))
509}
510
511#[cfg(test)]
512mod tests {
513 use super::*;
514
515 fn get_resource_dir() -> PathBuf {
516 let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
517 path.push("resources/");
518 path
519 }
520
521 fn get_test_file(file: &str) -> PathBuf {
522 let mut path = get_resource_dir();
523 path.push(file);
524 path
525 }
526
527 #[test]
528 fn test_from_tuple() {
529 let tuple = ("ABC123".to_string(), 1, Some("BCDE".to_string()));
530 let metadata = Metadata::from(tuple);
531
532 assert_eq!(metadata.read_id(), "ABC123");
533 assert_eq!(metadata.channel(), 1);
534 assert_eq!(metadata.barcode(), Some(&"BCDE".to_string()));
535 }
536
537 #[test]
538 fn test_from_tuple_no_barcode() {
539 let tuple = ("XYZ789".to_string(), 2, None);
540 let metadata = Metadata::from(tuple);
541
542 assert_eq!(metadata.read_id(), "XYZ789");
543 assert_eq!(metadata.channel(), 2);
544 assert_eq!(metadata.barcode(), None);
545 }
546
547 #[test]
548 fn test_read_id() {
549 let metadata = Metadata {
550 read_id: "ABC123".to_string(),
551 channel: 1,
552 barcode: None,
553 };
554
555 assert_eq!(metadata.read_id(), "ABC123");
556 }
557
558 #[test]
559 fn test_channel() {
560 let metadata = Metadata {
561 read_id: "ABC123".to_string(),
562 channel: 1,
563 barcode: Some("BCDE".to_string()),
564 };
565
566 assert_eq!(metadata.channel(), 1);
567 }
568
569 #[test]
570 fn test_barcode_present() {
571 let metadata = Metadata {
572 read_id: "ABC123".to_string(),
573 channel: 1,
574 barcode: Some("BCDE".to_string()),
575 };
576
577 assert_eq!(metadata.barcode(), Some(&"BCDE".to_string()));
578 }
579
580 #[test]
581 fn test_barcode_absent() {
582 let metadata = Metadata {
583 read_id: "ABC123".to_string(),
584 channel: 1,
585 barcode: None,
586 };
587
588 assert_eq!(metadata.barcode(), None);
589 }
590
591 #[test]
592 fn test_from_file_valid_paf() {
593 let file_name = get_test_file("test_hum_4000.paf");
594 let result = open_paf_for_reading(file_name);
595 assert!(
596 result.is_ok(),
597 "Expected Ok, but got an error: {:?}",
598 result.err()
599 );
600 }
601
602 #[test]
603 fn test_from_file_invalid_paf() {
604 let file_name = get_test_file("invalid_file.paf");
605 let result = open_paf_for_reading(file_name);
606 assert!(result.is_err(), "Expected Err, but got Ok");
607 }
608
609 #[test]
610 fn test_from_file_empty_file() {
611 let file_name = get_test_file("empty.paf");
612 let result = open_paf_for_reading(file_name);
613 assert!(result.is_err(), "Expected Err, but got Ok");
614 }
615
616 #[test]
617 #[should_panic]
618 fn test_from_file_nonexistent_file() {
619 let file_name = get_test_file("no_existo.paf");
620 let result = open_paf_for_reading(file_name);
621 assert!(result.is_err(), "Expected Err, but got Ok");
622 }
623
624 #[test]
625 fn test_paf_from_file() {
626 open_paf_for_reading(get_test_file("test_hum_4000.paf")).unwrap();
627 // assert_eq!(paf.records.len(), 4148usize);
628 }
629}