readfish_tools/
paf.rs

1//! Paf file functions
2//! In this module we implement a Paf struct and functions to read and write Paf files.
3//! A lot of this was lifted from https://github.com/mrvollger/rustybam/blob/main/src/paf.rs
4//!
5
6use crate::{
7    readfish::Conf,
8    readfish_io::{reader, DynResult},
9    sequencing_summary::SeqSum,
10    Summary,
11};
12use lazy_static::lazy_static;
13use regex::Regex;
14use std::{
15    io::BufRead,
16    path::{Path, PathBuf},
17};
18
19lazy_static! {
20    static ref PAF_TAG: Regex = Regex::new("(..):(.):(.*)").unwrap();
21}
22
23/// Store metadata that is provided by a tuple in a call to parse_paf_by_iter in lib.rs.
24/// See also `[sequencing_summary::SeqSumInfo]`.
25#[derive(Debug)]
26pub struct Metadata {
27    /// The identifier for the read.
28    pub read_id: String,
29    /// The channel associated with the read.
30    pub channel: usize,
31    /// An optional barcode associated with the read, if available.
32    pub barcode: Option<String>,
33}
34
35impl From<(String, usize, Option<String>)> for Metadata {
36    fn from(value: (String, usize, Option<String>)) -> Self {
37        Metadata {
38            read_id: value.0,
39            channel: value.1,
40            barcode: value.2,
41        }
42    }
43}
44
45impl Metadata {
46    /// Get the identifier for the read.
47    pub fn read_id(&self) -> &String {
48        &self.read_id
49    }
50
51    /// Get the channel associated with the read.
52    pub fn channel(&self) -> usize {
53        self.channel
54    }
55
56    /// Get the optional barcode associated with the read, if available.
57    pub fn barcode(&self) -> Option<&String> {
58        self.barcode.as_ref()
59    }
60}
61
62/// Store a PafRecord for quick unpacking to update the summary
63#[derive(Debug, Clone)]
64pub struct PafRecord {
65    /// The name of the query sequence (read).
66    pub query_name: String,
67    /// The length of the query sequence (read).
68    pub query_length: usize,
69    /// The start position of the alignment on the query sequence (read).
70    pub query_start: usize,
71    /// The end position of the alignment on the query sequence (read).
72    pub query_end: usize,
73    /// The strand of the alignment ('+' or '-').
74    pub strand: char,
75    /// The name of the target sequence (reference).
76    pub target_name: String,
77    /// The length of the target sequence (reference).
78    pub target_length: usize,
79    /// The start position of the alignment on the target sequence (reference).
80    pub target_start: usize,
81    /// The end position of the alignment on the target sequence (reference).
82    pub target_end: usize,
83    /// The number of matching bases in the alignment.
84    pub nmatch: usize,
85    /// The total length of the alignment.
86    pub aln_len: usize,
87    /// The mapping quality of the alignment.
88    pub mapq: usize,
89    // pub cigar: CigarString,
90    // A vector of additional tags associated with the alignment.
91    // pub tags: Vec<String>,
92    // pub tpos_aln: Vec<u64>,
93    // pub qpos_aln: Vec<u64>,
94    // pub long_cigar: CigarString,
95    // pub id: String,
96    // pub order: u64,
97    // pub contained: bool,
98}
99/// Errors that can occur while parsing PAF (Pairwise mApping Format) files.
100#[derive(Debug)]
101pub enum Error {
102    /// An error occurred while parsing the CIGAR string in the PAF record.
103    PafParseCigar {
104        /// The error message.
105        msg: String,
106    },
107    /// An error occurred while parsing the CS (Coordinate System) tag in the PAF record.
108    PafParseCS {
109        /// The error message.
110        msg: String,
111    },
112    /// An error occurred while parsing an integer value in the PAF record.
113    ParseIntError {
114        /// The error message.
115        msg: String,
116    },
117    /// An error occurred while parsing a column in the PAF record.
118    ParsePafColumn {},
119}
120
121/// A type alias for a Result with the error type specialized to `crate::paf::Error`.
122type PafResult<T> = Result<T, crate::paf::Error>;
123
124impl PafRecord {
125    /// New paf record
126    pub fn new(t: Vec<&str>) -> PafResult<PafRecord> {
127        // make the record
128        let rec = PafRecord {
129            query_name: t[0].to_string(),
130            query_length: t[1]
131                .parse::<usize>()
132                .map_err(|_| Error::ParsePafColumn {})?,
133            query_start: t[2]
134                .parse::<usize>()
135                .map_err(|_| Error::ParsePafColumn {})?,
136            query_end: t[3]
137                .parse::<usize>()
138                .map_err(|_| Error::ParsePafColumn {})?,
139            strand: t[4].parse::<char>().map_err(|_| Error::ParsePafColumn {})?,
140            target_name: t[5].to_string(),
141            target_length: t[6]
142                .parse::<usize>()
143                .map_err(|_| Error::ParsePafColumn {})?,
144            target_start: t[7]
145                .parse::<usize>()
146                .map_err(|_| Error::ParsePafColumn {})?,
147            target_end: t[8]
148                .parse::<usize>()
149                .map_err(|_| Error::ParsePafColumn {})?,
150            nmatch: t[9]
151                .parse::<usize>()
152                .map_err(|_| Error::ParsePafColumn {})?,
153            aln_len: t[10]
154                .parse::<usize>()
155                .map_err(|_| Error::ParsePafColumn {})?,
156            mapq: t[11]
157                .parse::<usize>()
158                .map_err(|_| Error::ParsePafColumn {})?,
159        };
160        Ok(rec)
161    }
162}
163
164/// A struct representing a PAF record reader and writers for demultiplexing.
165///
166/// This struct holds a reader and a list of writers used for demultiplexing PAF records
167/// into different files. The `reader` field is a `Box<dyn BufRead + Send>` representing a
168/// buffered input reader from which PAF records are read. The `writers` field is a `Vec<Box<dyn Write>>`
169/// holding multiple output writers for writing the demultiplexed PAF records into different files.
170///
171/// # Fields
172///
173/// * `reader`: A boxed trait object implementing `BufRead` and `Send`, used as the input reader
174///   for reading PAF records.
175/// * `writers`: A vector of boxed trait objects implementing `Write`, used as the output writers
176///   for writing the demultiplexed PAF records into different files.
177/// * `paf_file`: The path to the PAF file.
178///
179/// # Examples
180///
181/// ```rust, ignore
182/// use std::fs::File;
183/// use std::io::{BufReader, BufWriter};
184/// use std::path::Path;
185///
186/// // Create a reader for the PAF file
187/// let file_path = Path::new("example.paf");
188/// let file = File::open(file_path).expect("Error: Failed to open file");
189/// let reader = Box::new(BufReader::new(file));
190///
191/// // Create multiple writers for demultiplexing the PAF records
192/// let writer1 = Box::new(BufWriter::new(File::create("output1.paf").unwrap()));
193/// let writer2 = Box::new(BufWriter::new(File::create("output2.paf").unwrap()));
194/// let writers = vec![writer1, writer2];
195///
196/// // Create a PAF object
197/// let paf = Paf { reader, writers };
198/// ```
199///
200pub struct Paf {
201    /// The provided PAF file.
202    pub paf_file: PathBuf,
203    /// Reader for the Paf file.
204    pub reader: Box<dyn BufRead + Send>,
205    // / Multiple writes, one for each demultiplexed file.
206    // pub writers: Vec<Box<dyn Write>>,
207}
208
209impl Paf {
210    /// Create a new `Paf` object with the given PAF file.
211    ///
212    /// This function creates a new `Paf` object by parsing the specified PAF file
213    /// and initializing the `reader` field with the resulting buffered input reader.
214    /// The `writers` field is initialized as an empty vector of output writers.
215    ///
216    /// # Arguments
217    ///
218    /// * `paf_file`: An implementation of the `AsRef<Path>` trait representing the path to the PAF file.
219    ///
220    /// # Returns
221    ///
222    /// A new `Paf` object with the parsed PAF file as the input reader and an empty vector of writers.
223    ///
224    /// # Panics
225    ///
226    /// This function will panic if there is an error while parsing the PAF file or creating the buffered input reader.
227    ///
228    /// # Examples
229    ///
230    /// ```rust,ignore
231    /// use std::path::Path;
232    /// use readfish_tools::Paf;
233    ///
234    /// // Create a new Paf object from the "example.paf" file
235    /// let paf_file_path = Path::new("example.paf");
236    /// let paf = Paf::new(paf_file_path);
237    /// ```
238    ///
239    pub fn new(paf_file: impl AsRef<Path>) -> Paf {
240        Paf {
241            paf_file: paf_file.as_ref().to_path_buf(),
242            reader: open_paf_for_reading(paf_file).unwrap(),
243            // writers: vec![],
244        }
245    }
246
247    /// Demultiplexes the PAF file by processing each line and obtaining corresponding sequencing summary records.
248    ///
249    /// This function reads the PAF file line by line, parses each line, and processes the custom tags present in the PAF format.
250    /// These custom tags are add by readfish's implementation summarise on the Aligner.
251    /// If the `sequencing_summary` argument is provided, it retrieves the sequencing summary record for each line's query name.
252    /// The function processes custom tags in the PAF file and ensures they are present. If `sequencing_summary` is None and custom tags are missing,
253    /// the function will panic.
254    ///
255    /// If `sequencing_summary` is provided, the function retrieves the sequencing summary record for each query name using the `get_record` function.
256    /// If a sequencing summary record is not found in the buffer, the function reads from the sequencing summary file until the record is found.
257    /// The function consumes the bytes in the PAF file and updates the `previous_read_id` to avoid removing multiple mappings from the `sequencing_summary`
258    /// only when the new Read Id is not the same as the old read_id.
259    ///
260    /// # Arguments
261    ///
262    /// - `toml`: A reference to the `Conf` struct, which contains configuration settings.
263    /// - `sequencing_summary`: An optional mutable reference to the `SeqSum` struct, representing the sequencing summary file.
264    ///
265    /// # Errors
266    ///
267    /// This function returns a `DynResult`, which is a specialized `Result` type with an error message.
268    /// An error is returned if there is any issue reading the PAF file or if the sequencing summary file is not found.
269    ///
270    /// # Examples
271    ///
272    /// ```rust,ignore
273    /// // Import necessary libraries
274    /// use std::error::Error;
275    /// use my_crate::{SeqSum, Conf};
276    ///
277    /// // Create a new sequencing summary instance
278    /// let mut sequencing_summary = SeqSum::from_file("path/to/sequencing_summary.toml")?;
279    ///
280    /// // Load the TOML configuration
281    /// let toml = Conf::from_file("path/to/config.toml")?;
282    ///
283    /// // Demultiplex the PAF file using the sequencing summary
284    /// sequencing_summary.demultiplex(&toml, Some(&mut sequencing_summary))?;
285    /// ```
286    pub fn demultiplex(
287        &mut self,
288        _toml: &mut Conf,
289        sequencing_summary: Option<&mut SeqSum>,
290        mut summary: Option<&mut Summary>,
291    ) -> DynResult<()> {
292        let seq_sum = sequencing_summary.unwrap();
293
294        // Remove multiple mappings from seq_sum dictionary only when the new Read Id is not the same as the old read_id
295        for line in open_paf_for_reading(self.paf_file.clone())?.lines() {
296            let (paf_record, read_on, condition_name) =
297                _parse_paf_line(line?, _toml, None, Some(seq_sum))?;
298
299            if let Some(summary) = summary.as_deref_mut() {
300                let condition_summary = summary.conditions(condition_name.as_str());
301                condition_summary.update(paf_record, read_on).unwrap();
302            }
303        }
304        Ok(())
305    }
306}
307
308/// Parses the PAF file and returns a buffered reader for further processing.
309///
310/// This function takes the `file_name` as an input and returns a `Result` containing
311/// a boxed buffered reader (`Box<dyn BufRead + Send>`) if the PAF file is valid.
312///
313/// # Arguments
314///
315/// * `file_name`: The path to the PAF file to be parsed. It should implement the `AsRef<Path>` trait.
316///
317/// # Returns
318///
319/// A `Result` containing the buffered reader (`Box<dyn BufRead + Send>`) if the PAF file is valid,
320/// otherwise an `Err` containing an error message.
321///
322/// # Errors
323///
324/// The function returns an `Err` in the following cases:
325/// - If the file is empty (contains no bytes), it returns an error with the message "Error: empty file".
326/// - If the file format is invalid, specifically if any of the first twelve columns contains a ':' character,
327///   it returns an error with the message "Error: invalid format for PAF file. Missing one of first twelve columns, or values contain a :."
328/// - If there are any I/O errors while reading the file, the function returns an error with the specific I/O error message.
329///
330/// # Example
331///
332/// ```rust,ignore
333/// use std::path::Path;
334/// use std::io::BufRead;
335///
336/// let file_name = Path::new("example.paf");
337/// match open_paf_for_reading(file_name) {
338///     Ok(reader) => {
339///         let mut lines = reader.lines();
340///         // Process the PAF file line by line
341///         for line in lines {
342///             match line {
343///                 Ok(line_content) => {
344///                     // Process the content of the PAF line
345///                     // ...
346///                 }
347///                 Err(error) => {
348///                     eprintln!("Error while reading PAF line: {}", error);
349///                 }
350///             }
351///         }
352///     }
353///     Err(error) => {
354///         eprintln!("Error while parsing PAF file: {}", error);
355///     }
356/// }
357/// ```
358pub fn open_paf_for_reading(file_name: impl AsRef<Path>) -> DynResult<Box<dyn BufRead + Send>> {
359    // create reader to check file first line
360    let mut paf_file = reader(&file_name, None);
361
362    // Check the file isn't empty
363    let mut buffer = [0; 1];
364    let bytes_read = paf_file.read(&mut buffer)?;
365    if bytes_read == 0 {
366        return Err("Error: empty file".into());
367    }
368    let mut line = String::new();
369    paf_file.read_line(&mut line)?;
370    let t: Vec<&str> = line.split_ascii_whitespace().collect();
371    if t.iter().take(12).any(|item| item.contains(':')) {
372        return Err("Error: invalid format for Paf file. Missing one of first twelve columns, or values contain a :.".into());
373    }
374
375    let paf_file = reader(file_name, None);
376
377    Ok(paf_file)
378}
379
380/// Parses a line from the PAF file and extracts relevant information.
381///
382/// This function takes a PAF line (as a reference to a string) and attempts to parse it to extract
383/// relevant information, including creating a [`PafRecord`] and making decisions based on the provided
384/// metadata or sequencing summary. It returns a tuple containing the `PafRecord`, a boolean value
385/// indicating if the read is considered "on-target", and the condition name associated with the read.
386///
387/// # Arguments
388///
389/// * `paf_line`: A reference to a string slice representing a line from the PAF file.
390/// * `_toml`: A reference to a `Conf` struct, holding configuration information.
391/// * `meta_data`: An optional mutable reference to a `Metadata` struct containing read metadata.
392/// * `sequencing_summary`: An optional mutable reference to a `SeqSum` struct containing sequencing summary data.
393///
394/// # Returns
395///
396/// A `DynResult` holding a tuple containing the following elements:
397/// * `PafRecord`: The parsed PAF record representing the alignment information.
398/// * `bool`: A boolean value indicating if the read is considered "on-target".
399/// * `&'a String`: A reference to the condition name associated with the read.
400///
401/// # Panics
402///
403/// This function panics if the PAF line contains missing items in the first 12 columns or if both `meta_data`
404/// and `sequencing_summary` are `None`.
405///
406/// # Examples
407///
408/// ```rust, ignore
409/// # use your_crate::{PafRecord, Metadata, SeqSum, _parse_paf_line};
410///
411/// // Assuming we have valid inputs
412/// let paf_line = "read123 200 0 200 + contig123 300 0 300 200 200 50 ch=1";
413/// let _toml = Conf::default();
414/// let mut metadata = Metadata {
415///     read_id: "read123".to_string(),
416///     channel: 1,
417///     barcode: Some("sampleA".to_string()),
418/// };
419/// let mut seq_sum = SeqSum::default();
420///
421/// let result = _parse_paf_line(paf_line, &_toml, Some(&mut metadata), Some(&mut seq_sum));
422///
423/// match result {
424///     Ok((paf_record, read_on, condition_name)) => {
425///         // Do something with the parsed data
426///     }
427///     Err(err) => {
428///         // Handle the error
429///     }
430/// }
431/// ```
432pub fn _parse_paf_line<'a>(
433    paf_line: impl AsRef<str>,
434    _toml: &'a Conf,
435    meta_data: Option<&mut Metadata>,
436    sequencing_summary: Option<&mut SeqSum>,
437) -> DynResult<(PafRecord, bool, &'a String)> {
438    let line = paf_line.as_ref();
439    let t: Vec<&str> = line.split_ascii_whitespace().collect();
440    // Todo do without clone
441    let paf_record = PafRecord::new(t.clone()).unwrap();
442    // Check first 12 columns for missing items, assumes tags will have been brought forwards
443    assert!(
444        t.iter().take(12).all(|item| !item.contains(':')),
445        "Missing colon in PAF line: {}",
446        line
447    );
448    // check if we have custom tags from readfish aligner analyse
449    let channel: usize;
450    let barcode: Option<String>;
451    // for token in t.iter().skip(12) {
452    //     debug_assert!(PAF_TAG.is_match(token));
453    //     let caps = PAF_TAG.captures(token).unwrap();
454    //     let tag = &caps[1];
455    //     // let value = &caps[3];
456    //     if (tag == "ch") | (tag == "ba") {
457    //         has_tags = true;
458    //     }
459    // }
460    // Break the Paf line into its components
461    let query_name = t[0];
462    // let query_length: usize = t[1].parse()?;
463    let strand = t[4];
464    let contig = t[5];
465    // let contig_length: usize = t[6].parse()?;
466    let mapping_start: usize = t[7].parse()?;
467    let read_on: bool;
468    if meta_data.is_none() & sequencing_summary.is_none() {
469        panic!("Cannot parse paf line without provided metdata or sequencing summary_file");
470    }
471    // If sequencing summary is provided, get the sequencing summary record for the query name
472    // Use it for things like barcodes and channels
473    if let Some(seq_sum_struct) = sequencing_summary {
474        let seq_sum_record = seq_sum_struct.get_record(query_name, None);
475        if let Ok(record) = seq_sum_record {
476            read_on = _toml.make_decision(
477                record.1.get_channel().unwrap(),
478                record.2.get_barcode().map(|x| x.as_str()),
479                contig,
480                strand,
481                mapping_start,
482            );
483            channel = record.1.get_channel().unwrap();
484            barcode = Some(record.2.get_barcode().unwrap_or(&"".to_string()).clone());
485        } else {
486            return Err("Error: sequencing summary record not found".into());
487        }
488        seq_sum_struct.previous_read_id = query_name.to_string();
489    // We must have metatdata
490    } else {
491        let metadata = meta_data.unwrap();
492        // println!("{contig}, {strand}, {mapping_start}");
493        read_on = _toml.make_decision(
494            metadata.channel(),
495            metadata.barcode().map(|x| x.as_str()),
496            contig,
497            strand,
498            mapping_start,
499        );
500        channel = metadata.channel();
501        barcode = Some(metadata.barcode().unwrap_or(&"".to_string()).clone());
502    }
503    // get the condition so we can access name etc.
504    let (_control, condition) = _toml.get_conditions(channel, barcode)?;
505    let condition = condition.get_condition();
506    let condition_name = &condition.name;
507
508    Ok((paf_record, read_on, condition_name))
509}
510
511#[cfg(test)]
512mod tests {
513    use super::*;
514
515    fn get_resource_dir() -> PathBuf {
516        let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
517        path.push("resources/");
518        path
519    }
520
521    fn get_test_file(file: &str) -> PathBuf {
522        let mut path = get_resource_dir();
523        path.push(file);
524        path
525    }
526
527    #[test]
528    fn test_from_tuple() {
529        let tuple = ("ABC123".to_string(), 1, Some("BCDE".to_string()));
530        let metadata = Metadata::from(tuple);
531
532        assert_eq!(metadata.read_id(), "ABC123");
533        assert_eq!(metadata.channel(), 1);
534        assert_eq!(metadata.barcode(), Some(&"BCDE".to_string()));
535    }
536
537    #[test]
538    fn test_from_tuple_no_barcode() {
539        let tuple = ("XYZ789".to_string(), 2, None);
540        let metadata = Metadata::from(tuple);
541
542        assert_eq!(metadata.read_id(), "XYZ789");
543        assert_eq!(metadata.channel(), 2);
544        assert_eq!(metadata.barcode(), None);
545    }
546
547    #[test]
548    fn test_read_id() {
549        let metadata = Metadata {
550            read_id: "ABC123".to_string(),
551            channel: 1,
552            barcode: None,
553        };
554
555        assert_eq!(metadata.read_id(), "ABC123");
556    }
557
558    #[test]
559    fn test_channel() {
560        let metadata = Metadata {
561            read_id: "ABC123".to_string(),
562            channel: 1,
563            barcode: Some("BCDE".to_string()),
564        };
565
566        assert_eq!(metadata.channel(), 1);
567    }
568
569    #[test]
570    fn test_barcode_present() {
571        let metadata = Metadata {
572            read_id: "ABC123".to_string(),
573            channel: 1,
574            barcode: Some("BCDE".to_string()),
575        };
576
577        assert_eq!(metadata.barcode(), Some(&"BCDE".to_string()));
578    }
579
580    #[test]
581    fn test_barcode_absent() {
582        let metadata = Metadata {
583            read_id: "ABC123".to_string(),
584            channel: 1,
585            barcode: None,
586        };
587
588        assert_eq!(metadata.barcode(), None);
589    }
590
591    #[test]
592    fn test_from_file_valid_paf() {
593        let file_name = get_test_file("test_hum_4000.paf");
594        let result = open_paf_for_reading(file_name);
595        assert!(
596            result.is_ok(),
597            "Expected Ok, but got an error: {:?}",
598            result.err()
599        );
600    }
601
602    #[test]
603    fn test_from_file_invalid_paf() {
604        let file_name = get_test_file("invalid_file.paf");
605        let result = open_paf_for_reading(file_name);
606        assert!(result.is_err(), "Expected Err, but got Ok");
607    }
608
609    #[test]
610    fn test_from_file_empty_file() {
611        let file_name = get_test_file("empty.paf");
612        let result = open_paf_for_reading(file_name);
613        assert!(result.is_err(), "Expected Err, but got Ok");
614    }
615
616    #[test]
617    #[should_panic]
618    fn test_from_file_nonexistent_file() {
619        let file_name = get_test_file("no_existo.paf");
620        let result = open_paf_for_reading(file_name);
621        assert!(result.is_err(), "Expected Err, but got Ok");
622    }
623
624    #[test]
625    fn test_paf_from_file() {
626        open_paf_for_reading(get_test_file("test_hum_4000.paf")).unwrap();
627        // assert_eq!(paf.records.len(), 4148usize);
628    }
629}