Skip to main content

cyanea_seq/
fasta_index.rs

1//! Indexed FASTA reading with random access via `.fai` index files.
2//!
3//! Provides [`FastaIndex`] for parsing, building, and writing samtools-compatible
4//! `.fai` index files, and [`IndexedFastaReader`] for seeking directly to named
5//! regions within a FASTA file without scanning the entire file.
6
7use std::collections::HashMap;
8use std::fs::File;
9use std::io::{BufRead, BufReader, Read, Seek, SeekFrom, Write};
10use std::path::Path;
11
12use cyanea_core::{CyaneaError, Result};
13
14/// A single entry in a FASTA index (.fai file).
15#[derive(Debug, Clone)]
16pub struct FastaIndexEntry {
17    /// Sequence name.
18    pub name: String,
19    /// Total length of the sequence in bases.
20    pub length: usize,
21    /// Byte offset of the first base in the FASTA file.
22    pub offset: u64,
23    /// Number of bases per line.
24    pub line_bases: usize,
25    /// Number of bytes per line (including newline).
26    pub line_width: usize,
27}
28
29/// FASTA index parsed from a `.fai` file.
30#[derive(Debug, Clone)]
31pub struct FastaIndex {
32    entries: Vec<FastaIndexEntry>,
33    name_to_idx: HashMap<String, usize>,
34}
35
36impl FastaIndex {
37    /// Parse a `.fai` index file (tab-separated: name, length, offset, line_bases, line_width).
38    pub fn from_file(fai_path: &Path) -> Result<Self> {
39        let file = File::open(fai_path)?;
40        let reader = BufReader::new(file);
41        let mut entries = Vec::new();
42        let mut name_to_idx = HashMap::new();
43
44        for (line_num, line) in reader.lines().enumerate() {
45            let line = line?;
46            let line = line.trim_end();
47            if line.is_empty() {
48                continue;
49            }
50            let fields: Vec<&str> = line.split('\t').collect();
51            if fields.len() < 5 {
52                return Err(CyaneaError::Parse(format!(
53                    "line {}: expected 5 tab-separated fields, got {}",
54                    line_num + 1,
55                    fields.len()
56                )));
57            }
58            let name = fields[0].to_string();
59            let length = fields[1].parse::<usize>().map_err(|e| {
60                CyaneaError::Parse(format!("line {}: invalid length: {}", line_num + 1, e))
61            })?;
62            let offset = fields[2].parse::<u64>().map_err(|e| {
63                CyaneaError::Parse(format!("line {}: invalid offset: {}", line_num + 1, e))
64            })?;
65            let line_bases = fields[3].parse::<usize>().map_err(|e| {
66                CyaneaError::Parse(format!("line {}: invalid line_bases: {}", line_num + 1, e))
67            })?;
68            let line_width = fields[4].parse::<usize>().map_err(|e| {
69                CyaneaError::Parse(format!("line {}: invalid line_width: {}", line_num + 1, e))
70            })?;
71
72            name_to_idx.insert(name.clone(), entries.len());
73            entries.push(FastaIndexEntry {
74                name,
75                length,
76                offset,
77                line_bases,
78                line_width,
79            });
80        }
81
82        Ok(Self {
83            entries,
84            name_to_idx,
85        })
86    }
87
88    /// Build an index by scanning a FASTA file.
89    pub fn build(fasta_path: &Path) -> Result<Self> {
90        let file = File::open(fasta_path)?;
91        let mut reader = BufReader::new(file);
92        let mut entries = Vec::new();
93        let mut name_to_idx = HashMap::new();
94        let mut byte_pos: u64 = 0;
95
96        // State for the current sequence being scanned.
97        let mut current_name: Option<String> = None;
98        let mut seq_offset: u64 = 0;
99        let mut seq_length: usize = 0;
100        let mut line_bases: usize = 0;
101        let mut line_width: usize = 0;
102        let mut first_seq_line = true;
103
104        let mut line_buf = String::new();
105        loop {
106            line_buf.clear();
107            let bytes_read = reader.read_line(&mut line_buf)?;
108            if bytes_read == 0 {
109                break;
110            }
111
112            if line_buf.starts_with('>') {
113                // Flush previous record.
114                if let Some(name) = current_name.take() {
115                    name_to_idx.insert(name.clone(), entries.len());
116                    entries.push(FastaIndexEntry {
117                        name,
118                        length: seq_length,
119                        offset: seq_offset,
120                        line_bases,
121                        line_width,
122                    });
123                }
124
125                let header = line_buf[1..].trim_end_matches(&['\n', '\r'][..]);
126                let name = header
127                    .split_whitespace()
128                    .next()
129                    .unwrap_or(header)
130                    .to_string();
131                current_name = Some(name);
132                seq_length = 0;
133                line_bases = 0;
134                line_width = 0;
135                first_seq_line = true;
136                seq_offset = byte_pos + bytes_read as u64;
137            } else if current_name.is_some() {
138                let bases = line_buf.trim_end_matches(&['\n', '\r'][..]).len();
139                seq_length += bases;
140                if first_seq_line {
141                    line_bases = bases;
142                    line_width = bytes_read;
143                    first_seq_line = false;
144                }
145            }
146
147            byte_pos += bytes_read as u64;
148        }
149
150        // Flush final record.
151        if let Some(name) = current_name.take() {
152            name_to_idx.insert(name.clone(), entries.len());
153            entries.push(FastaIndexEntry {
154                name,
155                length: seq_length,
156                offset: seq_offset,
157                line_bases,
158                line_width,
159            });
160        }
161
162        Ok(Self {
163            entries,
164            name_to_idx,
165        })
166    }
167
168    /// Write the index to a `.fai` file.
169    pub fn write(&self, path: &Path) -> Result<()> {
170        let mut file = File::create(path)?;
171        for entry in &self.entries {
172            writeln!(
173                file,
174                "{}\t{}\t{}\t{}\t{}",
175                entry.name, entry.length, entry.offset, entry.line_bases, entry.line_width
176            )?;
177        }
178        Ok(())
179    }
180
181    /// Look up an entry by sequence name.
182    pub fn get(&self, name: &str) -> Option<&FastaIndexEntry> {
183        self.name_to_idx.get(name).map(|&idx| &self.entries[idx])
184    }
185
186    /// List all sequence names in file order.
187    pub fn names(&self) -> Vec<&str> {
188        self.entries.iter().map(|e| e.name.as_str()).collect()
189    }
190
191    /// Number of indexed sequences.
192    pub fn len(&self) -> usize {
193        self.entries.len()
194    }
195}
196
197/// Reader for random access into an indexed FASTA file.
198pub struct IndexedFastaReader {
199    index: FastaIndex,
200    reader: BufReader<File>,
201}
202
203impl IndexedFastaReader {
204    /// Open a FASTA file with a pre-built index.
205    pub fn new(fasta_path: &Path, index: FastaIndex) -> Result<Self> {
206        let file = File::open(fasta_path)?;
207        Ok(Self {
208            index,
209            reader: BufReader::new(file),
210        })
211    }
212
213    /// Open a FASTA file and automatically load its `.fai` sidecar index.
214    pub fn open(fasta_path: &Path) -> Result<Self> {
215        let fai_path = fasta_path.with_extension({
216            let mut ext = fasta_path
217                .extension()
218                .unwrap_or_default()
219                .to_os_string();
220            ext.push(".fai");
221            ext
222        });
223        let index = FastaIndex::from_file(&fai_path)?;
224        Self::new(fasta_path, index)
225    }
226
227    /// Fetch bases `[start, end)` from the named sequence.
228    pub fn fetch(&mut self, name: &str, start: usize, end: usize) -> Result<Vec<u8>> {
229        let entry = self.index.get(name).ok_or_else(|| {
230            CyaneaError::InvalidInput(format!("sequence not found: {}", name))
231        })?;
232
233        if start > end || end > entry.length {
234            return Err(CyaneaError::InvalidInput(format!(
235                "region [{}..{}) out of bounds for {} (length {})",
236                start, end, name, entry.length
237            )));
238        }
239
240        let len = end - start;
241        if len == 0 {
242            return Ok(Vec::new());
243        }
244
245        let byte_start = entry.offset
246            + (start / entry.line_bases) as u64 * entry.line_width as u64
247            + (start % entry.line_bases) as u64;
248
249        self.reader.seek(SeekFrom::Start(byte_start))?;
250
251        let mut result = Vec::with_capacity(len);
252        let mut remaining = len;
253        let mut buf = [0u8; 8192];
254
255        while remaining > 0 {
256            let to_read = remaining.min(buf.len());
257            let n = self.reader.read(&mut buf[..to_read])?;
258            if n == 0 {
259                return Err(CyaneaError::Parse("unexpected end of FASTA file".into()));
260            }
261            for &b in &buf[..n] {
262                if b != b'\n' && b != b'\r' {
263                    result.push(b);
264                    remaining -= 1;
265                    if remaining == 0 {
266                        break;
267                    }
268                }
269            }
270        }
271
272        Ok(result)
273    }
274
275    /// Fetch the entire sequence by name.
276    pub fn fetch_all(&mut self, name: &str) -> Result<Vec<u8>> {
277        let length = self
278            .index
279            .get(name)
280            .ok_or_else(|| {
281                CyaneaError::InvalidInput(format!("sequence not found: {}", name))
282            })?
283            .length;
284        self.fetch(name, 0, length)
285    }
286}
287
288#[cfg(test)]
289mod tests {
290    use super::*;
291    use tempfile::NamedTempFile;
292
293    fn write_fasta(content: &str) -> NamedTempFile {
294        let mut f = NamedTempFile::new().unwrap();
295        f.write_all(content.as_bytes()).unwrap();
296        f.flush().unwrap();
297        f
298    }
299
300    const MULTI_FASTA: &str = "\
301>chr1
302ACGTACGT
303AAAACCCC
304>chr2
305TTTTGGGG
306";
307
308    #[test]
309    fn build_index_from_fasta() {
310        let f = write_fasta(MULTI_FASTA);
311        let idx = FastaIndex::build(f.path()).unwrap();
312        assert_eq!(idx.len(), 2);
313        assert_eq!(idx.names(), vec!["chr1", "chr2"]);
314
315        let e1 = idx.get("chr1").unwrap();
316        assert_eq!(e1.length, 16);
317        assert_eq!(e1.line_bases, 8);
318        assert_eq!(e1.line_width, 9); // 8 bases + newline
319
320        let e2 = idx.get("chr2").unwrap();
321        assert_eq!(e2.length, 8);
322    }
323
324    #[test]
325    fn write_and_read_roundtrip() {
326        let fa = write_fasta(MULTI_FASTA);
327        let idx = FastaIndex::build(fa.path()).unwrap();
328
329        let fai_file = NamedTempFile::new().unwrap();
330        idx.write(fai_file.path()).unwrap();
331
332        let idx2 = FastaIndex::from_file(fai_file.path()).unwrap();
333        assert_eq!(idx2.len(), idx.len());
334        for name in idx.names() {
335            let a = idx.get(name).unwrap();
336            let b = idx2.get(name).unwrap();
337            assert_eq!(a.length, b.length);
338            assert_eq!(a.offset, b.offset);
339            assert_eq!(a.line_bases, b.line_bases);
340            assert_eq!(a.line_width, b.line_width);
341        }
342    }
343
344    #[test]
345    fn parse_fai_format() {
346        let fai_content = "chr1\t16\t6\t8\t9\nchr2\t8\t31\t8\t9\n";
347        let f = write_fasta(fai_content);
348        let idx = FastaIndex::from_file(f.path()).unwrap();
349        assert_eq!(idx.len(), 2);
350        let e = idx.get("chr1").unwrap();
351        assert_eq!(e.length, 16);
352        assert_eq!(e.offset, 6);
353    }
354
355    #[test]
356    fn fetch_subsequence() {
357        let fa = write_fasta(MULTI_FASTA);
358        let idx = FastaIndex::build(fa.path()).unwrap();
359        let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
360
361        let sub = reader.fetch("chr1", 2, 6).unwrap();
362        assert_eq!(sub, b"GTAC");
363    }
364
365    #[test]
366    fn fetch_across_line_boundary() {
367        let fa = write_fasta(MULTI_FASTA);
368        let idx = FastaIndex::build(fa.path()).unwrap();
369        let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
370
371        // Spans the newline between the two lines of chr1.
372        let sub = reader.fetch("chr1", 6, 10).unwrap();
373        assert_eq!(sub, b"GTAA");
374    }
375
376    #[test]
377    fn fetch_entire_sequence() {
378        let fa = write_fasta(MULTI_FASTA);
379        let idx = FastaIndex::build(fa.path()).unwrap();
380        let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
381
382        let all = reader.fetch_all("chr2").unwrap();
383        assert_eq!(all, b"TTTTGGGG");
384    }
385
386    #[test]
387    fn error_unknown_sequence() {
388        let fa = write_fasta(MULTI_FASTA);
389        let idx = FastaIndex::build(fa.path()).unwrap();
390        let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
391
392        let err = reader.fetch("chrX", 0, 1).unwrap_err();
393        assert!(err.to_string().contains("sequence not found"));
394    }
395
396    #[test]
397    fn error_out_of_bounds() {
398        let fa = write_fasta(MULTI_FASTA);
399        let idx = FastaIndex::build(fa.path()).unwrap();
400        let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
401
402        let err = reader.fetch("chr2", 0, 100).unwrap_err();
403        assert!(err.to_string().contains("out of bounds"));
404    }
405
406    #[test]
407    fn multi_sequence_independent_fetch() {
408        let fa = write_fasta(MULTI_FASTA);
409        let idx = FastaIndex::build(fa.path()).unwrap();
410        let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
411
412        let s1 = reader.fetch_all("chr1").unwrap();
413        let s2 = reader.fetch_all("chr2").unwrap();
414        assert_eq!(s1, b"ACGTACGTAAAACCCC");
415        assert_eq!(s2, b"TTTTGGGG");
416
417        // Re-fetch chr1 after chr2 to confirm seeking works both ways.
418        let s1_again = reader.fetch("chr1", 0, 4).unwrap();
419        assert_eq!(s1_again, b"ACGT");
420    }
421}