paf/
reader.rs

1use std::collections::HashMap;
2use std::fs::File;
3use std::io::{self, BufRead};
4use std::path::Path;
5
6use crate::{Error, ErrorKind, Result};
7
8/// Enum representing the possible types of optional fields.
9#[derive(Debug)]
10pub enum Type {
11    Int(i64),
12    Float(f64),
13    String(String),
14    Char(char),
15}
16
17impl Type {
18    fn parse(field_type: &str, value: &str) -> Option<Self> {
19        match field_type {
20            "i" => value.parse::<i64>().ok().map(Type::Int),
21            "f" => value.parse::<f64>().ok().map(Type::Float),
22            "Z" => Some(Type::String(value.to_string())),
23            "A" => value.chars().next().map(Type::Char),
24            _ => Some(Type::String(value.to_string())), // Default to string
25        }
26    }
27
28    /// Get the inner integer out.
29    pub fn get_int(&self) -> Option<&i64> {
30        match self {
31            Type::Int(v) => Some(v),
32            _ => None,
33        }
34    }
35
36    /// Get the inner float out.
37    pub fn get_float(&self) -> Option<&f64> {
38        match self {
39            Type::Float(v) => Some(v),
40            _ => None,
41        }
42    }
43
44    /// Get the inner string out.
45    pub fn get_string(&self) -> Option<&String> {
46        match self {
47            Type::String(v) => Some(v),
48            _ => None,
49        }
50    }
51
52    /// Get the inner char out.
53    pub fn get_char(&self) -> Option<&char> {
54        match self {
55            Type::Char(v) => Some(v),
56            _ => None,
57        }
58    }
59}
60
61/// Enum representing the possible types of tags.
62#[derive(Debug)]
63#[allow(non_camel_case_types)]
64pub enum Tag {
65    /// Type of aln: P/primary, S/secondary and I,i/inversion.
66    tp(Type),
67    /// Number of minimizers on the chain.
68    cm(Type),
69    /// Chaining score.
70    s1(Type),
71    /// Chaining score of the best secondary chain.
72    s2(Type),
73    /// Total number of mismatches and gaps in the alignment.
74    NM(Type),
75    /// To generate the ref sequence in the alignment.
76    MD(Type),
77    /// DP alignment score.
78    AS(Type),
79    /// List of other supplementary alignments.
80    SA(Type),
81    /// DP score of the max scoring segment in the alignment.
82    ms(Type),
83    /// Number of ambiguous bases in the alignment.
84    nn(Type),
85    /// Transcript strand (splice mode only).
86    ts(Type),
87    /// CIGAR string.
88    cg(Type),
89    /// Difference string.
90    cs(Type),
91    /// Approximate per-base sequence divergence.
92    dv(Type),
93    /// Gap-compressed per-base sequence divergence.
94    de(Type),
95    /// Length of query regions harboring repetitive seeds.
96    rl(Type),
97    /// ZD?
98    zd(Type),
99}
100
101impl Tag {
102    /// Parse a tag from a string.
103    pub fn parse(tag: &str, value: Type) -> Result<Self> {
104        match tag {
105            "tp" => Ok(Tag::tp(value)),
106            "cm" => Ok(Tag::cm(value)),
107            "s1" => Ok(Tag::s1(value)),
108            "s2" => Ok(Tag::s2(value)),
109            "NM" => Ok(Tag::NM(value)),
110            "MD" => Ok(Tag::MD(value)),
111            "AS" => Ok(Tag::AS(value)),
112            "SA" => Ok(Tag::SA(value)),
113            "ms" => Ok(Tag::ms(value)),
114            "nn" => Ok(Tag::nn(value)),
115            "ts" => Ok(Tag::ts(value)),
116            "cg" => Ok(Tag::cg(value)),
117            "cs" => Ok(Tag::cs(value)),
118            "dv" => Ok(Tag::dv(value)),
119            "de" => Ok(Tag::de(value)),
120            "rl" => Ok(Tag::rl(value)),
121            "zd" => Ok(Tag::zd(value)),
122            _ => Err(Error::new(ErrorKind::ReadRecord(format!(
123                "Invalid PAF tag: {}",
124                tag
125            )))),
126        }
127    }
128
129    /// Tag to string function.
130    fn to_string(&self) -> String {
131        match self {
132            Tag::tp(_) => "tp".into(),
133            Tag::cm(_) => "cm".into(),
134            Tag::s1(_) => "s1".into(),
135            Tag::s2(_) => "s2".into(),
136            Tag::NM(_) => "NM".into(),
137            Tag::MD(_) => "MD".into(),
138            Tag::AS(_) => "AS".into(),
139            Tag::SA(_) => "SA".into(),
140            Tag::ms(_) => "ms".into(),
141            Tag::nn(_) => "nn".into(),
142            Tag::ts(_) => "ts".into(),
143            Tag::cg(_) => "cg".into(),
144            Tag::cs(_) => "cs".into(),
145            Tag::dv(_) => "dv".into(),
146            Tag::de(_) => "de".into(),
147            Tag::rl(_) => "rl".into(),
148            Tag::zd(_) => "zd".into(),
149        }
150    }
151}
152
153/// Struct representing a PAF record.
154#[derive(Debug)]
155pub struct PafRecord {
156    /// Query sequence name.
157    query_name: String,
158    /// Query sequence length.
159    query_len: u32,
160    /// Query start coordinate (0-based).
161    query_start: u32,
162    /// Query end coordinate (0-based).
163    query_end: u32,
164    /// ‘+’ if query/target on the same strand; ‘-’ if opposite.
165    strand: char,
166    /// Target sequence name.
167    target_name: String,
168    /// Target sequence length.
169    target_len: u32,
170    /// Target start coordinate on the original strand.
171    target_start: u32,
172    /// Target end coordinate on the original strand.
173    target_end: u32,
174    /// Number of matching bases in the mapping.
175    residue_matches: u32,
176    /// Number bases, including gaps, in the mapping.
177    alignment_block_len: u32,
178    /// Mapping quality (0-255 with 255 for missing).
179    mapping_quality: u8,
180
181    /// The optional fields.
182    optional: HashMap<String, Tag>,
183}
184
185impl PafRecord {
186    /// Create a new PAF record.
187    pub fn new(
188        query_name: String,
189        query_len: u32,
190        query_start: u32,
191        query_end: u32,
192        strand: char,
193        target_name: String,
194        target_len: u32,
195        target_start: u32,
196        target_end: u32,
197        residue_matches: u32,
198        alignment_block_len: u32,
199        mapping_quality: u8,
200        optional: HashMap<String, Tag>,
201    ) -> PafRecord {
202        PafRecord {
203            query_name,
204            query_len,
205            query_start,
206            query_end,
207            strand,
208            target_name,
209            target_len,
210            target_start,
211            target_end,
212            residue_matches,
213            alignment_block_len,
214            mapping_quality,
215            optional,
216        }
217    }
218
219    /// Get the query name.
220    pub fn query_name(&self) -> &str {
221        &self.query_name
222    }
223    /// Get the query length.
224    pub fn query_len(&self) -> u32 {
225        self.query_len
226    }
227    /// Get the query start position.
228    pub fn query_start(&self) -> u32 {
229        self.query_start
230    }
231    /// Get the query end position.
232    pub fn query_end(&self) -> u32 {
233        self.query_end
234    }
235    /// Get the target name.
236    pub fn target_name(&self) -> &str {
237        &self.target_name
238    }
239    /// Get the target length.
240    pub fn target_len(&self) -> u32 {
241        self.target_len
242    }
243    /// Get the target start position.
244    pub fn target_start(&self) -> u32 {
245        self.target_start
246    }
247    /// Get the target end position.
248    pub fn target_end(&self) -> u32 {
249        self.target_end
250    }
251    /// Get the number of residue matches.
252    pub fn residue_matches(&self) -> u32 {
253        self.residue_matches
254    }
255    /// Get the alignment block length.
256    pub fn alignment_block_len(&self) -> u32 {
257        self.alignment_block_len
258    }
259    /// Get the mapping quality.
260    pub fn mapping_quality(&self) -> u8 {
261        self.mapping_quality
262    }
263    /// Get the strand.
264    pub fn strand(&self) -> char {
265        self.strand
266    }
267    /// Get all the optional fields.
268    pub fn optional_fields(&self) -> &HashMap<String, Tag> {
269        &self.optional
270    }
271    /// Get type of aln: P/primary, S/secondary and I,i/inversion.
272    pub fn tp(&self) -> Option<&char> {
273        self.optional.get("tp").map(|tag| match tag {
274            Tag::tp(t) => t.get_char().unwrap(),
275            _ => panic!("Invalid tag"),
276        })
277    }
278    /// Get number of minimizers on the chain
279    pub fn cm(&self) -> Option<&i64> {
280        self.optional.get("cm").map(|tag| match tag {
281            Tag::cm(t) => t.get_int().unwrap(),
282            _ => panic!("Invalid tag"),
283        })
284    }
285    /// Get chaining score.
286    pub fn s1(&self) -> Option<&i64> {
287        self.optional.get("s1").map(|tag| match tag {
288            Tag::s1(t) => t.get_int().unwrap(),
289            _ => panic!("Invalid tag"),
290        })
291    }
292    /// Get chaining score of the best secondary chain.
293    pub fn s2(&self) -> Option<&i64> {
294        self.optional.get("s2").map(|tag| match tag {
295            Tag::s2(t) => t.get_int().unwrap(),
296            _ => panic!("Invalid tag"),
297        })
298    }
299    /// Get total number of mismatches and gaps in the alignment.
300    pub fn nm(&self) -> Option<&i64> {
301        self.optional.get("NM").map(|tag| match tag {
302            Tag::NM(t) => t.get_int().unwrap(),
303            _ => panic!("Invalid tag"),
304        })
305    }
306    /// Get the ref sequence in the alignment.
307    pub fn md(&self) -> Option<&String> {
308        self.optional.get("MD").map(|tag| match tag {
309            Tag::MD(t) => t.get_string().unwrap(),
310            _ => panic!("Invalid tag"),
311        })
312    }
313    /// Get DP alignment score.
314    pub fn as_(&self) -> Option<&i64> {
315        self.optional.get("AS").map(|tag| match tag {
316            Tag::AS(t) => t.get_int().unwrap(),
317            _ => panic!("Invalid tag"),
318        })
319    }
320    /// Get a list of other supplementary alignments.
321    pub fn sa(&self) -> Option<&String> {
322        self.optional.get("SA").map(|tag| match tag {
323            Tag::SA(t) => t.get_string().unwrap(),
324            _ => panic!("Invalid tag"),
325        })
326    }
327    /// Get DP score of the max scoring segment in the alignment.
328    pub fn ms(&self) -> Option<&i64> {
329        self.optional.get("ms").map(|tag| match tag {
330            Tag::ms(t) => t.get_int().unwrap(),
331            _ => panic!("Invalid tag"),
332        })
333    }
334    /// Get number of ambiguous bases in the alignment.
335    pub fn nn(&self) -> Option<&i64> {
336        self.optional.get("nn").map(|tag| match tag {
337            Tag::nn(t) => t.get_int().unwrap(),
338            _ => panic!("Invalid tag"),
339        })
340    }
341    /// Get transcript strand (splice mode only).
342    pub fn ts(&self) -> Option<&char> {
343        self.optional.get("ts").map(|tag| match tag {
344            Tag::ts(t) => t.get_char().unwrap(),
345            _ => panic!("Invalid tag"),
346        })
347    }
348    /// Get CIGAR string (only in PAF).
349    pub fn cg(&self) -> Option<&String> {
350        self.optional.get("cg").map(|tag| match tag {
351            Tag::cg(t) => t.get_string().unwrap(),
352            _ => panic!("Invalid tag"),
353        })
354    }
355    /// Get difference string.
356    pub fn cs(&self) -> Option<&String> {
357        self.optional.get("cs").map(|tag| match tag {
358            Tag::cs(t) => t.get_string().unwrap(),
359            _ => panic!("Invalid tag"),
360        })
361    }
362    /// Get approximate per-base sequence divergence.
363    pub fn dv(&self) -> Option<&f64> {
364        self.optional.get("dv").map(|tag| match tag {
365            Tag::dv(t) => t.get_float().unwrap(),
366            _ => panic!("Invalid tag"),
367        })
368    }
369    /// Get gap-compressed per-base sequence divergence.
370    pub fn de(&self) -> Option<&f64> {
371        self.optional.get("de").map(|tag| match tag {
372            Tag::de(t) => t.get_float().unwrap(),
373            _ => panic!("Invalid tag"),
374        })
375    }
376    /// Get length of query regions harboring repetitive seeds.
377    pub fn rl(&self) -> Option<&i64> {
378        self.optional.get("rl").map(|tag| match tag {
379            Tag::rl(t) => t.get_int().unwrap(),
380            _ => panic!("Invalid tag"),
381        })
382    }
383}
384
385/// Struct representing a PAF parser iterator.
386pub struct Reader<R> {
387    reader: io::BufReader<R>,
388    line: u64,
389}
390
391impl Reader<File> {
392    /// Creates a new PAF parser from a file path.
393    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Reader<File>> {
394        Ok(Reader::new(File::open(path)?))
395    }
396
397    /// Creates a new PAF parser from a reader.
398    pub fn from_reader<R: io::Read>(rdr: R) -> Reader<R> {
399        Reader::new(rdr)
400    }
401}
402
403/// Parse optional fields from the PAF line.
404fn parse_optional_fields(fields: &[&str]) -> Result<HashMap<String, Tag>> {
405    let mut map = HashMap::new();
406
407    // NM:i:48730
408    for field in fields {
409        let parts: Vec<&str> = field.split(':').collect();
410        if parts.len() < 3 {
411            return Err(Error::new(ErrorKind::ReadRecord(
412                "Invalid PAF line: invalid optional field - too few parts".into(),
413            )));
414        }
415
416        let tag = parts[0];
417        let type_ = parts[1];
418        let inner = parts[2];
419
420        let type_ = Type::parse(type_, inner).ok_or_else(|| {
421            Error::new(ErrorKind::ReadRecord(format!(
422                "Invalid PAF line: invalid optional field type: {}",
423                type_
424            )))
425        })?;
426
427        let tag = Tag::parse(tag, type_)?;
428
429        map.insert(tag.to_string(), tag);
430    }
431    Ok(map)
432}
433
434impl<R: io::Read> Reader<R> {
435    /// Creates a new PAF parser from a buffered reader.
436    pub fn new(rdr: R) -> Self {
437        Reader {
438            reader: io::BufReader::new(rdr),
439            line: 0,
440        }
441    }
442
443    /// A borrowed iterator over the records of a PAF file.
444    pub fn records(&mut self) -> RecordsIter<R> {
445        RecordsIter::new(self)
446    }
447
448    /// An owned iterator over the records of a PAF file.
449    pub fn into_records(self) -> RecordsIntoIter<R> {
450        RecordsIntoIter::new(self)
451    }
452
453    /// Read a single record.
454    pub fn read_record(&mut self) -> Result<Option<PafRecord>> {
455        let mut line = String::new();
456        let bytes_read = match self.reader.read_line(&mut line) {
457            Ok(b) => b,
458            Err(e) => return Err(Error::new(ErrorKind::Io(e))),
459        };
460
461        if bytes_read == 0 {
462            return Ok(None); // EOF
463        }
464
465        let columns: Vec<&str> = line.trim().split('\t').collect();
466        if columns.len() < 12 {
467            return Err(Error::new(ErrorKind::ReadRecord(format!(
468                "Invalid PAF at line {}: less than 12 mandatory fields",
469                self.line
470            ))));
471        }
472
473        // parse the mandatory fields
474        let query_name = columns[0].to_string();
475        let query_len = columns[1].parse::<u32>()?;
476        let query_start = columns[2].parse::<u32>()?;
477        let query_end = columns[3].parse::<u32>()?;
478        let strand = columns[4]
479            .chars()
480            .next()
481            .ok_or_else(|| Error::new(ErrorKind::ReadRecord("Empty strand field".into())))?;
482
483        if strand != '+' && strand != '-' {
484            return Err(Error::new(ErrorKind::ReadRecord(format!(
485                "Invalid strand field at line {}: {}",
486                self.line, strand
487            ))));
488        }
489
490        let target_name = columns[5].to_string();
491        let target_len = columns[6].parse::<u32>()?;
492        let target_start = columns[7].parse::<u32>()?;
493        let target_end = columns[8].parse::<u32>()?;
494        let residue_matches = columns[9].parse::<u32>()?;
495        let alignment_block_len = columns[10].parse::<u32>()?;
496        let mapping_quality = columns[11].parse::<u8>()?;
497
498        let optional = parse_optional_fields(&columns[12..])?;
499
500        let record = PafRecord {
501            query_name,
502            query_len,
503            query_start,
504            query_end,
505            strand,
506            target_name,
507            target_len,
508            target_start,
509            target_end,
510            residue_matches,
511            alignment_block_len,
512            mapping_quality,
513            optional,
514        };
515
516        Ok(Some(record))
517    }
518}
519
520/// A borrowed iterator over the records of a PAF file.
521pub struct RecordsIter<'r, R: 'r> {
522    /// The underlying reader
523    rdr: &'r mut Reader<R>,
524}
525
526impl<'r, R: io::Read> RecordsIter<'r, R> {
527    /// Create a new iterator.
528    fn new(rdr: &'r mut Reader<R>) -> RecordsIter<'r, R> {
529        RecordsIter { rdr }
530    }
531    /// Return a reference to the underlying reader.
532    pub fn reader(&self) -> &Reader<R> {
533        self.rdr
534    }
535
536    /// Return a mutable reference to the underlying reader.
537    pub fn reader_mut(&mut self) -> &mut Reader<R> {
538        self.rdr
539    }
540}
541
542impl<'r, R: io::Read> Iterator for RecordsIter<'r, R> {
543    type Item = Result<PafRecord>;
544
545    fn next(&mut self) -> Option<Result<PafRecord>> {
546        match self.rdr.read_record() {
547            Ok(Some(r)) => {
548                self.rdr.line += 1;
549                Some(Ok(r))
550            }
551            Ok(None) => None,
552            Err(e) => Some(Err(e)),
553        }
554    }
555}
556
557/// An owned iterator over the records of a PAF file.
558pub struct RecordsIntoIter<R> {
559    /// The underlying reader.
560    rdr: Reader<R>,
561}
562
563impl<R: io::Read> RecordsIntoIter<R> {
564    /// Create a new iterator.
565    fn new(rdr: Reader<R>) -> RecordsIntoIter<R> {
566        RecordsIntoIter { rdr }
567    }
568    /// Return a reference to the underlying reader.
569    pub fn reader(&self) -> &Reader<R> {
570        &self.rdr
571    }
572
573    /// Return a mutable reference to the underlying reader.
574    pub fn reader_mut(&mut self) -> &mut Reader<R> {
575        &mut self.rdr
576    }
577
578    /// Drop this iterator and return the underlying reader.
579    pub fn into_reader(self) -> Reader<R> {
580        self.rdr
581    }
582}
583
584impl<R: io::Read> Iterator for RecordsIntoIter<R> {
585    type Item = Result<PafRecord>;
586
587    fn next(&mut self) -> Option<Result<PafRecord>> {
588        match self.rdr.read_record() {
589            Ok(Some(r)) => {
590                self.rdr.line += 1;
591                Some(Ok(r))
592            }
593            Ok(None) => None,
594            Err(e) => Some(Err(e)),
595        }
596    }
597}
598
599#[cfg(test)]
600mod tests {
601    use super::Reader;
602
603    const PAF_RECORD_1: &[u8] = b"NC_041798.1	41841605	28850796	29394458	+	SUPER_10	44636193	31974877	32470190	495111	515145	60	NM:i:48730	ms:i:488389	AS:i:439775	nn:i:28696	tp:A:P	cm:i:46495	s1:i:466570	s2:i:10896	de:f:0.0003	zd:i:3	rl:i:3568165	cg:Z:770M1D945M1D389M1I9141M1I356M1D196M1I30268M2D789M3I992M2D1819M1D7M1D7M1I10M6D2922M1D17899M2D1010M4D12324M1I1376M1D5549M6D1839M1I2206M1D770M1D2287M1D16103M1D3238M1D2014M1D140M5I14M1D8496M2I2151M1I335M1D14424M1D1093M1I567M1D1835M2D1995M1D5257M1D639M1I699M1I133M1I52M1I99M2I26M1I195M1I1543M1I240M1I176M1I412M2D159M1I261M1D1158M1I933M2D12836M1D993M1D12263M2D4975M2I16452M3I396M1I3924M2D929M3I3015M1D225M1D4225M1D717M2D752M1D2051M1D5110M1D15073M1D1053M2D4369M1D619M3I13564M2I4386M1D1431M2D617M1I612M2I3445M2I252M1D220M1D237M1I903M1I145M1I53M1I197M1I1280M1D4201M1D1736M1D1289M1I3344M2D5456M1D488M1I1655M2D1830M1D796M1I19341M2D1165M1D1926M1D6041M1D2170M1D3917M1D926M1D759M1D400M2I8802M1I836M1I381M48451I166M1I4896M2D1522M49D2729M1D947M2D927M6D911M2D800M2D3040M1D13213M1D8999M3D847M1D220M1I673M1D165M1I901M1I2887M1I105M2I597M1I1201M1I53M2I494M1I23M1D99M1I146M1D29906M1D5661M1I27598M1D520M1I166M2D11600M1D388M1D844M1D4583M1D8390M1D5789M2D3773M1D4494M1D448M1D846M3D531M";
604
605    #[test]
606    fn test_read_record() {
607        let mut parser = Reader::from_reader(&PAF_RECORD_1[..]);
608        let record = parser.read_record().unwrap().unwrap();
609
610        assert_eq!(record.query_name(), "NC_041798.1");
611        assert_eq!(record.query_len(), 41841605);
612        assert_eq!(record.query_start(), 28850796);
613        assert_eq!(record.query_end(), 29394458);
614        assert_eq!(record.strand(), '+');
615        assert_eq!(record.target_name(), "SUPER_10");
616        assert_eq!(record.target_len(), 44636193);
617        assert_eq!(record.target_start(), 31974877);
618        assert_eq!(record.target_end(), 32470190);
619        assert_eq!(record.residue_matches(), 495111);
620        assert_eq!(record.alignment_block_len(), 515145);
621        assert_eq!(record.mapping_quality(), 60);
622
623        let nm = record.nm().unwrap();
624        assert_eq!(nm, &48730);
625    }
626}