genomicframe_core/formats/sam/
reader.rs1use crate::core::{GenomicReader, GenomicRecordIterator};
52use crate::error::{Error, Result};
53use crate::formats::bam::{CigarOp, RefSeq, TagValue};
54use crate::io::Compression;
55use flate2::read::MultiGzDecoder;
56use std::collections::HashMap;
57use std::fs::File;
58use std::io::{BufRead, BufReader};
59use std::path::Path;
60
61
62#[derive(Debug, Clone)]
64pub struct SamHeader {
65 pub lines: Vec<String>,
67 pub references: Vec<RefSeq>,
69 ref_index: HashMap<String, usize>,
71}
72
73impl SamHeader {
74 pub fn new() -> Self {
76 Self {
77 lines: Vec::new(),
78 references: Vec::new(),
79 ref_index: HashMap::new(),
80 }
81 }
82
83 pub fn ref_index(&self, name: &str) -> Option<usize> {
85 self.ref_index.get(name).copied()
86 }
87
88 pub fn ref_name(&self, idx: usize) -> Option<&str> {
90 self.references.get(idx).map(|r| r.name.as_str())
91 }
92}
93
94impl Default for SamHeader {
95 fn default() -> Self {
96 Self::new()
97 }
98}
99
100#[derive(Debug, Clone)]
104pub struct SamRecord {
105 pub qname: String,
107 pub flag: u16,
109 pub rname: String,
111 pub pos: i32,
113 pub mapq: u8,
115 pub cigar: Vec<CigarOp>,
117 pub rnext: String,
119 pub pnext: i32,
121 pub tlen: i32,
123 pub seq: Vec<u8>,
125 pub qual: Vec<u8>,
127 pub tags: HashMap<String, TagValue>,
129}
130
131impl SamRecord {
132 pub fn is_paired(&self) -> bool {
134 (self.flag & 0x1) != 0
135 }
136
137 pub fn is_proper_pair(&self) -> bool {
139 (self.flag & 0x2) != 0
140 }
141
142 pub fn is_unmapped(&self) -> bool {
144 (self.flag & 0x4) != 0
145 }
146
147 pub fn is_mate_unmapped(&self) -> bool {
149 (self.flag & 0x8) != 0
150 }
151
152 pub fn is_reverse(&self) -> bool {
154 (self.flag & 0x10) != 0
155 }
156
157 pub fn is_mate_reverse(&self) -> bool {
159 (self.flag & 0x20) != 0
160 }
161
162 pub fn is_first_in_pair(&self) -> bool {
164 (self.flag & 0x40) != 0
165 }
166
167 pub fn is_second_in_pair(&self) -> bool {
169 (self.flag & 0x80) != 0
170 }
171
172 pub fn is_secondary(&self) -> bool {
174 (self.flag & 0x100) != 0
175 }
176
177 pub fn is_qc_fail(&self) -> bool {
179 (self.flag & 0x200) != 0
180 }
181
182 pub fn is_duplicate(&self) -> bool {
184 (self.flag & 0x400) != 0
185 }
186
187 pub fn is_supplementary(&self) -> bool {
189 (self.flag & 0x800) != 0
190 }
191
192 pub fn cigar_string(&self) -> String {
194 if self.cigar.is_empty() {
195 "*".to_string()
196 } else {
197 self.cigar.iter().map(|op| op.to_string()).collect()
198 }
199 }
200
201 pub fn seq_string(&self) -> String {
203 if self.seq.is_empty() {
204 "*".to_string()
205 } else {
206 String::from_utf8_lossy(&self.seq).to_string()
207 }
208 }
209}
210
211pub struct SamReader {
213 reader: Box<dyn BufRead>,
214 header: Option<SamHeader>,
215 line_buffer: String,
216 line_number: usize,
217}
218
219impl SamReader {
220 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
222 let path = path.as_ref();
223 let file = File::open(path)?;
224 let compression = Compression::from_path(path);
225
226 let reader: Box<dyn BufRead> = match compression {
227 Compression::Gzip => Box::new(BufReader::new(MultiGzDecoder::new(file))),
228 _ => Box::new(BufReader::new(file)),
229 };
230
231 Ok(Self {
232 reader,
233 header: None,
234 line_buffer: String::with_capacity(512),
235 line_number: 0,
236 })
237 }
238
239 pub fn new<R: BufRead + 'static>(reader: R) -> Self {
241 Self {
242 reader: Box::new(reader),
243 header: None,
244 line_buffer: String::with_capacity(512),
245 line_number: 0,
246 }
247 }
248
249 pub fn read_header(&mut self) -> Result<&SamHeader> {
254 if self.header.is_some() {
255 return Ok(self.header.as_ref().unwrap());
256 }
257
258 let mut header = SamHeader::new();
259
260 loop {
261 self.line_buffer.clear();
262 let bytes = self.reader.read_line(&mut self.line_buffer)?;
263 if bytes == 0 {
264 break;
266 }
267
268 self.line_number += 1;
269
270 let line = self.line_buffer.trim();
271
272 if line.is_empty() {
273 continue;
274 }
275
276 if !line.starts_with('@') {
277 self.line_number -= 1;
280 break;
281 }
282
283 header.lines.push(line.to_string());
284
285 if line.starts_with("@SQ") {
287 let mut name = None;
288 let mut length = None;
289
290 for field in line.split('\t').skip(1) {
292 if let Some((key, value)) = field.split_once(':') {
293 match key {
294 "SN" => name = Some(value.to_string()),
295 "LN" => length = value.parse::<u32>().ok(),
296 _ => {}
297 }
298 }
299 }
300
301 if let (Some(name), Some(length)) = (name, length) {
302 let idx = header.references.len();
303 header.ref_index.insert(name.clone(), idx);
304 header.references.push(RefSeq { name, length });
305 }
306 }
307 }
308
309 self.header = Some(header);
310 Ok(self.header.as_ref().unwrap())
311 }
312
313 pub fn header(&self) -> Option<&SamHeader> {
315 self.header.as_ref()
316 }
317
318 fn parse_cigar(cigar_str: &str) -> Result<Vec<CigarOp>> {
320 CigarOp::parse_cigar_string(cigar_str)
321 }
322
323 fn parse_record(&mut self, line: &str) -> Result<SamRecord> {
325 let fields: Vec<&str> = line.split('\t').collect();
326
327 if fields.len() < 11 {
328 return Err(Error::Parse(format!(
329 "Line {}: SAM record must have at least 11 fields, got {}",
330 self.line_number,
331 fields.len()
332 )));
333 }
334
335 let qname = fields[0].to_string();
337 let flag = fields[1].parse::<u16>().map_err(|_| {
338 Error::Parse(format!("Line {}: Invalid FLAG: {}", self.line_number, fields[1]))
339 })?;
340 let rname = fields[2].to_string();
341 let pos = fields[3].parse::<i32>().map_err(|_| {
342 Error::Parse(format!("Line {}: Invalid POS: {}", self.line_number, fields[3]))
343 })?;
344 let mapq = fields[4].parse::<u8>().map_err(|_| {
345 Error::Parse(format!("Line {}: Invalid MAPQ: {}", self.line_number, fields[4]))
346 })?;
347 let cigar = Self::parse_cigar(fields[5])?;
348 let rnext = fields[6].to_string();
349 let pnext = fields[7].parse::<i32>().map_err(|_| {
350 Error::Parse(format!("Line {}: Invalid PNEXT: {}", self.line_number, fields[7]))
351 })?;
352 let tlen = fields[8].parse::<i32>().map_err(|_| {
353 Error::Parse(format!("Line {}: Invalid TLEN: {}", self.line_number, fields[8]))
354 })?;
355
356 let seq = if fields[9] == "*" {
358 Vec::new()
359 } else {
360 fields[9].as_bytes().to_vec()
361 };
362
363 let qual = if fields[10] == "*" {
365 Vec::new()
366 } else {
367 fields[10].bytes().map(|b| b.saturating_sub(33)).collect()
369 };
370
371 let mut tags = HashMap::new();
373 for field in fields.iter().skip(11) {
374 if let Some((tag, rest)) = field.split_once(':') {
375 if let Some((type_str, value)) = rest.split_once(':') {
376 let tag_value = match type_str {
377 "A" => {
378 TagValue::Char(value.bytes().next().unwrap_or(0))
380 }
381 "i" => {
382 TagValue::Int(value.parse().unwrap_or(0))
384 }
385 "f" => {
386 TagValue::Float(value.parse().unwrap_or(0.0))
388 }
389 "Z" => {
390 TagValue::String(value.to_string())
392 }
393 "H" => {
394 TagValue::String(value.to_string())
396 }
397 "B" => {
398 TagValue::ByteArray(value.as_bytes().to_vec())
400 }
401 _ => continue,
402 };
403
404 tags.insert(tag.to_string(), tag_value);
405 }
406 }
407 }
408
409 Ok(SamRecord {
410 qname,
411 flag,
412 rname,
413 pos,
414 mapq,
415 cigar,
416 rnext,
417 pnext,
418 tlen,
419 seq,
420 qual,
421 tags,
422 })
423 }
424
425 pub fn compute_stats(
427 mut self,
428 _config: Option<crate::parallel::ParallelConfig>,
429 ) -> Result<super::stats::SamStats> {
430 self.read_header()?;
432
433 super::stats::SamStats::compute(&mut self)
435 }
436}
437
438impl GenomicRecordIterator for SamReader {
439 type Record = SamRecord;
440
441 fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
442 self.line_buffer.clear();
443 let bytes = self.reader.read_line(&mut self.line_buffer)?;
444
445 if bytes == 0 {
446 return Ok(None);
447 }
448
449 self.line_number += 1;
450
451 let line = self.line_buffer.trim();
452
453 if line.is_empty() {
455 return self.next_raw();
456 }
457
458 if line.starts_with('@') {
460 return self.next_raw();
461 }
462
463 Ok(Some(line.as_bytes().to_vec()))
464 }
465
466 fn next_record(&mut self) -> Result<Option<Self::Record>> {
467 self.line_buffer.clear();
468 let bytes = self.reader.read_line(&mut self.line_buffer)?;
469
470 if bytes == 0 {
471 return Ok(None);
472 }
473
474 self.line_number += 1;
475
476 let line = self.line_buffer.trim().to_string(); if line.is_empty() {
480 return self.next_record();
481 }
482
483 if line.starts_with('@') {
485 return self.next_record();
486 }
487
488 Ok(Some(self.parse_record(&line)?))
489 }
490}
491
492impl GenomicReader for SamReader {
493 type Metadata = SamHeader;
494
495 fn metadata(&self) -> &Self::Metadata {
496 self.header
497 .as_ref()
498 .expect("Header not read yet. Call read_header() first.")
499 }
500}
501
502#[cfg(test)]
503mod tests {
504 use super::*;
505
506 #[test]
507 fn test_cigar_parsing() {
508 let cigar = SamReader::parse_cigar("10M2I5D").unwrap();
509 assert_eq!(cigar.len(), 3);
510 assert_eq!(cigar[0], CigarOp::Match(10));
511 assert_eq!(cigar[1], CigarOp::Ins(2));
512 assert_eq!(cigar[2], CigarOp::Del(5));
513 }
514
515 #[test]
516 fn test_cigar_star() {
517 let cigar = SamReader::parse_cigar("*").unwrap();
518 assert!(cigar.is_empty());
519 }
520
521 #[test]
522 fn test_sam_record_flags() {
523 let record = SamRecord {
524 qname: "read1".to_string(),
525 flag: 0x1 | 0x2, rname: "chr1".to_string(),
527 pos: 100,
528 mapq: 60,
529 cigar: vec![CigarOp::Match(100)],
530 rnext: "=".to_string(),
531 pnext: 200,
532 tlen: 150,
533 seq: b"ACGT".to_vec(),
534 qual: vec![30, 30, 30, 30],
535 tags: HashMap::new(),
536 };
537
538 assert!(record.is_paired());
539 assert!(record.is_proper_pair());
540 assert!(!record.is_unmapped());
541 assert!(!record.is_duplicate());
542 }
543}