Skip to main content

ref_solver/parsing/
fai.rs

1//! Parser for FASTA index (.fai) files using noodles.
2//!
3//! FAI format provides name and length for each contig, but no MD5 or aliases.
4//! Format: `name\tlength\toffset\tline_bases\tline_width`
5
6use std::io::BufReader;
7use std::path::Path;
8
9use crate::core::contig::Contig;
10use crate::core::header::QueryHeader;
11use crate::parsing::sam::ParseError;
12use crate::utils::validation::check_contig_limit;
13
14/// Parse a FASTA index (.fai) file using noodles
15///
16/// # Errors
17///
18/// Returns `ParseError::Io` if the file cannot be read, `ParseError::Noodles` if
19/// parsing fails, `ParseError::InvalidFormat` if no contigs are found, or
20/// `ParseError::TooManyContigs` if the limit is exceeded.
21pub fn parse_fai_file(path: &Path) -> Result<QueryHeader, ParseError> {
22    use noodles::fasta;
23
24    let reader = std::fs::File::open(path).map(BufReader::new)?;
25
26    let index = fasta::fai::io::Reader::new(reader)
27        .read_index()
28        .map_err(|e| ParseError::Noodles(format!("Failed to parse FAI file: {e}")))?;
29
30    index_to_query(&index)
31}
32
33/// Convert noodles FAI index to `QueryHeader`
34fn index_to_query(index: &noodles::fasta::fai::Index) -> Result<QueryHeader, ParseError> {
35    let mut contigs = Vec::new();
36
37    for record in index.as_ref() {
38        // Check contig limit for DOS protection
39        if check_contig_limit(contigs.len()).is_some() {
40            return Err(ParseError::TooManyContigs(contigs.len()));
41        }
42
43        let name = String::from_utf8_lossy(record.name()).to_string();
44        let length = record.length();
45
46        contigs.push(Contig::new(name, length));
47    }
48
49    if contigs.is_empty() {
50        return Err(ParseError::InvalidFormat(
51            "No contigs found in FAI file".to_string(),
52        ));
53    }
54
55    Ok(QueryHeader::new(contigs))
56}
57
58/// Parse FAI from text (fallback for raw text input)
59///
60/// # Errors
61///
62/// Returns `ParseError::InvalidFormat` if the text has invalid format or no contigs,
63/// or `ParseError::TooManyContigs` if the limit is exceeded.
64pub fn parse_fai_text(text: &str) -> Result<QueryHeader, ParseError> {
65    let mut contigs = Vec::new();
66
67    for line in text.lines() {
68        let line = line.trim();
69        if line.is_empty() || line.starts_with('#') {
70            continue;
71        }
72
73        let fields: Vec<&str> = line.split('\t').collect();
74        if fields.len() < 2 {
75            continue;
76        }
77
78        // Check contig limit for DOS protection
79        if check_contig_limit(contigs.len()).is_some() {
80            return Err(ParseError::TooManyContigs(contigs.len()));
81        }
82
83        let name = fields[0].to_string();
84        let length: u64 = fields[1].parse().map_err(|_| {
85            ParseError::InvalidFormat(format!(
86                "Invalid length for contig '{}': {}",
87                name, fields[1]
88            ))
89        })?;
90
91        contigs.push(Contig::new(name, length));
92    }
93
94    if contigs.is_empty() {
95        return Err(ParseError::InvalidFormat(
96            "No contigs found in FAI file".to_string(),
97        ));
98    }
99
100    Ok(QueryHeader::new(contigs))
101}
102
#[cfg(test)]
mod tests {
    use super::*;

    /// One full FAI record: contig name and length plus the three layout columns.
    #[derive(Debug, Clone)]
    pub struct FaiEntry {
        pub name: String,
        pub length: u64,
        pub offset: u64,
        pub line_bases: u32,
        pub line_width: u32,
    }

    /// Parse one numeric column, reporting `Invalid <label>: <raw>` on failure.
    fn numeric_field<T: std::str::FromStr>(raw: &str, label: &str) -> Result<T, ParseError> {
        raw.parse()
            .map_err(|_| ParseError::InvalidFormat(format!("Invalid {label}: {raw}")))
    }

    /// Parse FAI text into complete entries, requiring at least five
    /// tab-separated fields on every non-blank, non-`#` line.
    pub fn parse_fai_entries(text: &str) -> Result<Vec<FaiEntry>, ParseError> {
        let mut entries: Vec<FaiEntry> = Vec::new();

        for row in text.lines().map(str::trim) {
            if row.is_empty() || row.starts_with('#') {
                continue;
            }

            let columns: Vec<&str> = row.split('\t').collect();
            if columns.len() < 5 {
                return Err(ParseError::InvalidFormat(format!(
                    "FAI line has {} fields, expected 5: {}",
                    columns.len(),
                    row
                )));
            }

            // Check limit
            let seen = entries.len();
            if check_contig_limit(seen).is_some() {
                return Err(ParseError::TooManyContigs(seen));
            }

            // Field expressions run in source order, so the first bad column
            // (length, offset, line_bases, line_width) decides the error.
            entries.push(FaiEntry {
                name: columns[0].to_string(),
                length: numeric_field(columns[1], "length")?,
                offset: numeric_field(columns[2], "offset")?,
                line_bases: numeric_field(columns[3], "line_bases")?,
                line_width: numeric_field(columns[4], "line_width")?,
            });
        }

        if entries.is_empty() {
            Err(ParseError::InvalidFormat(
                "No entries found in FAI file".to_string(),
            ))
        } else {
            Ok(entries)
        }
    }

    #[test]
    fn test_parse_fai_text() {
        let fai = "chr1\t248956422\t112\t70\t71\n\
                   chr2\t242193529\t253404903\t70\t71\n\
                   chrM\t16569\t3099922541\t70\t71\n";

        let query = parse_fai_text(fai).unwrap();

        let expected = [
            ("chr1", 248_956_422),
            ("chr2", 242_193_529),
            ("chrM", 16569),
        ];
        assert_eq!(query.contigs.len(), expected.len());
        for (i, &(name, length)) in expected.iter().enumerate() {
            assert_eq!(query.contigs[i].name, name);
            assert_eq!(query.contigs[i].length, length);
        }

        // FAI doesn't have MD5
        assert!(query.contigs[0].md5.is_none());
    }

    #[test]
    fn test_parse_fai_entries() {
        let entries = parse_fai_entries("chr1\t248956422\t112\t70\t71\n").unwrap();
        assert_eq!(entries.len(), 1);

        let entry = &entries[0];
        assert_eq!(entry.name, "chr1");
        assert_eq!(entry.length, 248_956_422);
        assert_eq!(entry.offset, 112);
        assert_eq!(entry.line_bases, 70);
        assert_eq!(entry.line_width, 71);
    }

    #[test]
    fn test_parse_fai_empty() {
        assert!(parse_fai_text("").is_err());
    }
}