use std::path::Path;
use crate::core::contig::Contig;
use crate::core::header::QueryHeader;
use crate::parsing::sam::ParseError;
use crate::utils::validation::{check_contig_limit, normalize_md5};
pub fn parse_tsv_file(path: &Path, delimiter: char) -> Result<QueryHeader, ParseError> {
let content = std::fs::read_to_string(path)?;
parse_tsv_text(&content, delimiter)
}
/// Parses delimiter-separated contig records into a [`QueryHeader`].
///
/// Every data line must contain at least two `delimiter`-separated fields:
/// the contig name and its length (a `u64`). An optional third field is
/// passed through `normalize_md5` and stored as the contig's MD5.
///
/// Lines that are blank or start with `#` (after trimming surrounding
/// whitespace) are skipped. The first remaining line is treated as a column
/// header, and skipped, when its first field is `name`, `sn`, `contig` or
/// `chrom` (case-insensitive). A leading UTF-8 byte-order mark is ignored.
///
/// # Errors
///
/// * [`ParseError::InvalidFormat`] if a data line has fewer than two fields,
///   if a length cannot be parsed as `u64`, or if no contigs were found.
///   Line numbers in the messages are 1-based and count every input line,
///   including skipped ones.
/// * [`ParseError::TooManyContigs`] if `check_contig_limit` returns `Some`
///   for the number of contigs collected so far.
pub fn parse_tsv_text(text: &str, delimiter: char) -> Result<QueryHeader, ParseError> {
    // First-column labels that mark the first data line as a header row.
    const HEADER_NAMES: [&str; 4] = ["name", "sn", "contig", "chrom"];

    // A byte-order mark is not whitespace, so `trim` below would leave it in
    // place; it would then hide the header row or end up inside the first
    // contig name.
    let text = text.strip_prefix('\u{feff}').unwrap_or(text);

    let mut contigs = Vec::new();
    let mut first_data_line = true;

    for (i, line) in text.lines().enumerate() {
        let line = line.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        let fields: Vec<&str> = line.split(delimiter).collect();

        // Only the first non-comment, non-blank line may be a header.
        if first_data_line {
            first_data_line = false;
            // Trim the field the same way data rows trim the contig name, so
            // a padded header such as "name ,length" is still recognised.
            let first = fields
                .first()
                .map(|s| s.trim().to_lowercase())
                .unwrap_or_default();
            if HEADER_NAMES.contains(&first.as_str()) {
                continue;
            }
        }

        let line_num = i + 1;
        if fields.len() < 2 {
            return Err(ParseError::InvalidFormat(format!(
                "Line {line_num} has fewer than 2 fields"
            )));
        }

        let name = fields[0].trim().to_string();
        let length: u64 = fields[1].trim().parse().map_err(|_| {
            ParseError::InvalidFormat(format!(
                "Invalid length on line {}: '{}'",
                line_num, fields[1]
            ))
        })?;

        let mut contig = Contig::new(name, length);
        if let Some(md5) = fields.get(2) {
            contig.md5 = normalize_md5(md5.trim());
        }

        if check_contig_limit(contigs.len()).is_some() {
            return Err(ParseError::TooManyContigs(contigs.len()));
        }
        contigs.push(contig);
    }

    if contigs.is_empty() {
        return Err(ParseError::InvalidFormat(
            "No contigs found in file".to_string(),
        ));
    }

    Ok(QueryHeader::new(contigs))
}
// Unit tests for `parse_tsv_text`.
//
// Fixtures spell tabs as `\t` escapes rather than embedding literal tab
// characters, so the separators stay visible and cannot be silently turned
// into spaces by an editor or formatter.
#[cfg(test)]
mod tests {
    use super::*;

    /// Header row is skipped; the optional MD5 column may be missing.
    #[test]
    fn test_parse_tsv_text() {
        let tsv = "name\tlength\tmd5\n\
                   chr1\t248956422\t6aef897c3d6ff0c78aff06ac189178dd\n\
                   chr2\t242193529\tf98db672eb0993dcfdabafe2a882905c\n\
                   chrM\t16569\n";

        let query = parse_tsv_text(tsv, '\t').unwrap();
        assert_eq!(query.contigs.len(), 3);
        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(query.contigs[0].length, 248_956_422);
        assert!(query.contigs[0].md5.is_some());
        assert!(query.contigs[2].md5.is_none());
    }

    /// The same parser handles comma-separated input.
    #[test]
    fn test_parse_csv_text() {
        let csv = "chrom,length,md5\n\
                   chr1,248956422,6aef897c3d6ff0c78aff06ac189178dd\n\
                   chr2,242193529,f98db672eb0993dcfdabafe2a882905c\n";

        let query = parse_tsv_text(csv, ',').unwrap();
        assert_eq!(query.contigs.len(), 2);
    }

    /// Input without a header row is parsed from the first line.
    #[test]
    fn test_parse_tsv_no_header() {
        let tsv = "chr1\t248956422\nchr2\t242193529\n";

        let query = parse_tsv_text(tsv, '\t').unwrap();
        assert_eq!(query.contigs.len(), 2);
    }

    /// Comment lines ahead of the header do not stop header detection.
    #[test]
    fn test_parse_tsv_comments_before_header() {
        let tsv = "# This is a comment\n\
                   # Another comment\n\
                   name\tlength\tmd5\n\
                   chr1\t248956422\t6aef897c3d6ff0c78aff06ac189178dd\n\
                   chr2\t242193529\tf98db672eb0993dcfdabafe2a882905c\n";

        let query = parse_tsv_text(tsv, '\t').unwrap();
        assert_eq!(query.contigs.len(), 2);
        assert_eq!(query.contigs[0].name, "chr1");
    }

    /// A non-numeric length is rejected as a format error.
    #[test]
    fn test_parse_tsv_invalid_length() {
        let result = parse_tsv_text("chr1\tnot_a_number\n", '\t');
        assert!(matches!(result, Err(ParseError::InvalidFormat(_))));
    }

    /// Input with no data lines is rejected as a format error.
    #[test]
    fn test_parse_tsv_no_contigs() {
        let result = parse_tsv_text("# only a comment\n\n", '\t');
        assert!(matches!(result, Err(ParseError::InvalidFormat(_))));
    }
}