1use std::collections::HashMap;
8use std::fs::File;
9use std::io::{BufRead, BufReader, Read, Seek, SeekFrom, Write};
10use std::path::Path;
11
12use cyanea_core::{CyaneaError, Result};
13
14#[derive(Debug, Clone)]
16pub struct FastaIndexEntry {
17 pub name: String,
19 pub length: usize,
21 pub offset: u64,
23 pub line_bases: usize,
25 pub line_width: usize,
27}
28
29#[derive(Debug, Clone)]
31pub struct FastaIndex {
32 entries: Vec<FastaIndexEntry>,
33 name_to_idx: HashMap<String, usize>,
34}
35
36impl FastaIndex {
37 pub fn from_file(fai_path: &Path) -> Result<Self> {
39 let file = File::open(fai_path)?;
40 let reader = BufReader::new(file);
41 let mut entries = Vec::new();
42 let mut name_to_idx = HashMap::new();
43
44 for (line_num, line) in reader.lines().enumerate() {
45 let line = line?;
46 let line = line.trim_end();
47 if line.is_empty() {
48 continue;
49 }
50 let fields: Vec<&str> = line.split('\t').collect();
51 if fields.len() < 5 {
52 return Err(CyaneaError::Parse(format!(
53 "line {}: expected 5 tab-separated fields, got {}",
54 line_num + 1,
55 fields.len()
56 )));
57 }
58 let name = fields[0].to_string();
59 let length = fields[1].parse::<usize>().map_err(|e| {
60 CyaneaError::Parse(format!("line {}: invalid length: {}", line_num + 1, e))
61 })?;
62 let offset = fields[2].parse::<u64>().map_err(|e| {
63 CyaneaError::Parse(format!("line {}: invalid offset: {}", line_num + 1, e))
64 })?;
65 let line_bases = fields[3].parse::<usize>().map_err(|e| {
66 CyaneaError::Parse(format!("line {}: invalid line_bases: {}", line_num + 1, e))
67 })?;
68 let line_width = fields[4].parse::<usize>().map_err(|e| {
69 CyaneaError::Parse(format!("line {}: invalid line_width: {}", line_num + 1, e))
70 })?;
71
72 name_to_idx.insert(name.clone(), entries.len());
73 entries.push(FastaIndexEntry {
74 name,
75 length,
76 offset,
77 line_bases,
78 line_width,
79 });
80 }
81
82 Ok(Self {
83 entries,
84 name_to_idx,
85 })
86 }
87
88 pub fn build(fasta_path: &Path) -> Result<Self> {
90 let file = File::open(fasta_path)?;
91 let mut reader = BufReader::new(file);
92 let mut entries = Vec::new();
93 let mut name_to_idx = HashMap::new();
94 let mut byte_pos: u64 = 0;
95
96 let mut current_name: Option<String> = None;
98 let mut seq_offset: u64 = 0;
99 let mut seq_length: usize = 0;
100 let mut line_bases: usize = 0;
101 let mut line_width: usize = 0;
102 let mut first_seq_line = true;
103
104 let mut line_buf = String::new();
105 loop {
106 line_buf.clear();
107 let bytes_read = reader.read_line(&mut line_buf)?;
108 if bytes_read == 0 {
109 break;
110 }
111
112 if line_buf.starts_with('>') {
113 if let Some(name) = current_name.take() {
115 name_to_idx.insert(name.clone(), entries.len());
116 entries.push(FastaIndexEntry {
117 name,
118 length: seq_length,
119 offset: seq_offset,
120 line_bases,
121 line_width,
122 });
123 }
124
125 let header = line_buf[1..].trim_end_matches(&['\n', '\r'][..]);
126 let name = header
127 .split_whitespace()
128 .next()
129 .unwrap_or(header)
130 .to_string();
131 current_name = Some(name);
132 seq_length = 0;
133 line_bases = 0;
134 line_width = 0;
135 first_seq_line = true;
136 seq_offset = byte_pos + bytes_read as u64;
137 } else if current_name.is_some() {
138 let bases = line_buf.trim_end_matches(&['\n', '\r'][..]).len();
139 seq_length += bases;
140 if first_seq_line {
141 line_bases = bases;
142 line_width = bytes_read;
143 first_seq_line = false;
144 }
145 }
146
147 byte_pos += bytes_read as u64;
148 }
149
150 if let Some(name) = current_name.take() {
152 name_to_idx.insert(name.clone(), entries.len());
153 entries.push(FastaIndexEntry {
154 name,
155 length: seq_length,
156 offset: seq_offset,
157 line_bases,
158 line_width,
159 });
160 }
161
162 Ok(Self {
163 entries,
164 name_to_idx,
165 })
166 }
167
168 pub fn write(&self, path: &Path) -> Result<()> {
170 let mut file = File::create(path)?;
171 for entry in &self.entries {
172 writeln!(
173 file,
174 "{}\t{}\t{}\t{}\t{}",
175 entry.name, entry.length, entry.offset, entry.line_bases, entry.line_width
176 )?;
177 }
178 Ok(())
179 }
180
181 pub fn get(&self, name: &str) -> Option<&FastaIndexEntry> {
183 self.name_to_idx.get(name).map(|&idx| &self.entries[idx])
184 }
185
186 pub fn names(&self) -> Vec<&str> {
188 self.entries.iter().map(|e| e.name.as_str()).collect()
189 }
190
191 pub fn len(&self) -> usize {
193 self.entries.len()
194 }
195}
196
197pub struct IndexedFastaReader {
199 index: FastaIndex,
200 reader: BufReader<File>,
201}
202
203impl IndexedFastaReader {
204 pub fn new(fasta_path: &Path, index: FastaIndex) -> Result<Self> {
206 let file = File::open(fasta_path)?;
207 Ok(Self {
208 index,
209 reader: BufReader::new(file),
210 })
211 }
212
213 pub fn open(fasta_path: &Path) -> Result<Self> {
215 let fai_path = fasta_path.with_extension({
216 let mut ext = fasta_path
217 .extension()
218 .unwrap_or_default()
219 .to_os_string();
220 ext.push(".fai");
221 ext
222 });
223 let index = FastaIndex::from_file(&fai_path)?;
224 Self::new(fasta_path, index)
225 }
226
227 pub fn fetch(&mut self, name: &str, start: usize, end: usize) -> Result<Vec<u8>> {
229 let entry = self.index.get(name).ok_or_else(|| {
230 CyaneaError::InvalidInput(format!("sequence not found: {}", name))
231 })?;
232
233 if start > end || end > entry.length {
234 return Err(CyaneaError::InvalidInput(format!(
235 "region [{}..{}) out of bounds for {} (length {})",
236 start, end, name, entry.length
237 )));
238 }
239
240 let len = end - start;
241 if len == 0 {
242 return Ok(Vec::new());
243 }
244
245 let byte_start = entry.offset
246 + (start / entry.line_bases) as u64 * entry.line_width as u64
247 + (start % entry.line_bases) as u64;
248
249 self.reader.seek(SeekFrom::Start(byte_start))?;
250
251 let mut result = Vec::with_capacity(len);
252 let mut remaining = len;
253 let mut buf = [0u8; 8192];
254
255 while remaining > 0 {
256 let to_read = remaining.min(buf.len());
257 let n = self.reader.read(&mut buf[..to_read])?;
258 if n == 0 {
259 return Err(CyaneaError::Parse("unexpected end of FASTA file".into()));
260 }
261 for &b in &buf[..n] {
262 if b != b'\n' && b != b'\r' {
263 result.push(b);
264 remaining -= 1;
265 if remaining == 0 {
266 break;
267 }
268 }
269 }
270 }
271
272 Ok(result)
273 }
274
275 pub fn fetch_all(&mut self, name: &str) -> Result<Vec<u8>> {
277 let length = self
278 .index
279 .get(name)
280 .ok_or_else(|| {
281 CyaneaError::InvalidInput(format!("sequence not found: {}", name))
282 })?
283 .length;
284 self.fetch(name, 0, length)
285 }
286}
287
288#[cfg(test)]
289mod tests {
290 use super::*;
291 use tempfile::NamedTempFile;
292
293 fn write_fasta(content: &str) -> NamedTempFile {
294 let mut f = NamedTempFile::new().unwrap();
295 f.write_all(content.as_bytes()).unwrap();
296 f.flush().unwrap();
297 f
298 }
299
300 const MULTI_FASTA: &str = "\
301>chr1
302ACGTACGT
303AAAACCCC
304>chr2
305TTTTGGGG
306";
307
308 #[test]
309 fn build_index_from_fasta() {
310 let f = write_fasta(MULTI_FASTA);
311 let idx = FastaIndex::build(f.path()).unwrap();
312 assert_eq!(idx.len(), 2);
313 assert_eq!(idx.names(), vec!["chr1", "chr2"]);
314
315 let e1 = idx.get("chr1").unwrap();
316 assert_eq!(e1.length, 16);
317 assert_eq!(e1.line_bases, 8);
318 assert_eq!(e1.line_width, 9); let e2 = idx.get("chr2").unwrap();
321 assert_eq!(e2.length, 8);
322 }
323
324 #[test]
325 fn write_and_read_roundtrip() {
326 let fa = write_fasta(MULTI_FASTA);
327 let idx = FastaIndex::build(fa.path()).unwrap();
328
329 let fai_file = NamedTempFile::new().unwrap();
330 idx.write(fai_file.path()).unwrap();
331
332 let idx2 = FastaIndex::from_file(fai_file.path()).unwrap();
333 assert_eq!(idx2.len(), idx.len());
334 for name in idx.names() {
335 let a = idx.get(name).unwrap();
336 let b = idx2.get(name).unwrap();
337 assert_eq!(a.length, b.length);
338 assert_eq!(a.offset, b.offset);
339 assert_eq!(a.line_bases, b.line_bases);
340 assert_eq!(a.line_width, b.line_width);
341 }
342 }
343
344 #[test]
345 fn parse_fai_format() {
346 let fai_content = "chr1\t16\t6\t8\t9\nchr2\t8\t31\t8\t9\n";
347 let f = write_fasta(fai_content);
348 let idx = FastaIndex::from_file(f.path()).unwrap();
349 assert_eq!(idx.len(), 2);
350 let e = idx.get("chr1").unwrap();
351 assert_eq!(e.length, 16);
352 assert_eq!(e.offset, 6);
353 }
354
355 #[test]
356 fn fetch_subsequence() {
357 let fa = write_fasta(MULTI_FASTA);
358 let idx = FastaIndex::build(fa.path()).unwrap();
359 let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
360
361 let sub = reader.fetch("chr1", 2, 6).unwrap();
362 assert_eq!(sub, b"GTAC");
363 }
364
365 #[test]
366 fn fetch_across_line_boundary() {
367 let fa = write_fasta(MULTI_FASTA);
368 let idx = FastaIndex::build(fa.path()).unwrap();
369 let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
370
371 let sub = reader.fetch("chr1", 6, 10).unwrap();
373 assert_eq!(sub, b"GTAA");
374 }
375
376 #[test]
377 fn fetch_entire_sequence() {
378 let fa = write_fasta(MULTI_FASTA);
379 let idx = FastaIndex::build(fa.path()).unwrap();
380 let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
381
382 let all = reader.fetch_all("chr2").unwrap();
383 assert_eq!(all, b"TTTTGGGG");
384 }
385
386 #[test]
387 fn error_unknown_sequence() {
388 let fa = write_fasta(MULTI_FASTA);
389 let idx = FastaIndex::build(fa.path()).unwrap();
390 let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
391
392 let err = reader.fetch("chrX", 0, 1).unwrap_err();
393 assert!(err.to_string().contains("sequence not found"));
394 }
395
396 #[test]
397 fn error_out_of_bounds() {
398 let fa = write_fasta(MULTI_FASTA);
399 let idx = FastaIndex::build(fa.path()).unwrap();
400 let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
401
402 let err = reader.fetch("chr2", 0, 100).unwrap_err();
403 assert!(err.to_string().contains("out of bounds"));
404 }
405
406 #[test]
407 fn multi_sequence_independent_fetch() {
408 let fa = write_fasta(MULTI_FASTA);
409 let idx = FastaIndex::build(fa.path()).unwrap();
410 let mut reader = IndexedFastaReader::new(fa.path(), idx).unwrap();
411
412 let s1 = reader.fetch_all("chr1").unwrap();
413 let s2 = reader.fetch_all("chr2").unwrap();
414 assert_eq!(s1, b"ACGTACGTAAAACCCC");
415 assert_eq!(s2, b"TTTTGGGG");
416
417 let s1_again = reader.fetch("chr1", 0, 4).unwrap();
419 assert_eq!(s1_again, b"ACGT");
420 }
421}