ref_solver/parsing/
tsv.rs1use std::path::Path;
2
3use crate::core::contig::Contig;
4use crate::core::header::QueryHeader;
5use crate::parsing::sam::ParseError;
6use crate::utils::validation::{check_contig_limit, normalize_md5};
7
8pub fn parse_tsv_file(path: &Path, delimiter: char) -> Result<QueryHeader, ParseError> {
15 let content = std::fs::read_to_string(path)?;
16 parse_tsv_text(&content, delimiter)
17}
18
19pub fn parse_tsv_text(text: &str, delimiter: char) -> Result<QueryHeader, ParseError> {
27 let mut contigs = Vec::new();
28 let mut first_data_line = true;
29
30 for (i, line) in text.lines().enumerate() {
31 let line = line.trim();
32 if line.is_empty() || line.starts_with('#') {
33 continue;
34 }
35
36 let fields: Vec<&str> = line.split(delimiter).collect();
37
38 if first_data_line {
40 first_data_line = false;
41 let first = fields.first().map(|s| s.to_lowercase()).unwrap_or_default();
42 if first == "name" || first == "sn" || first == "contig" || first == "chrom" {
43 continue;
44 }
45 }
46
47 let line_num = i + 1;
49
50 if fields.len() < 2 {
51 return Err(ParseError::InvalidFormat(format!(
52 "Line {line_num} has fewer than 2 fields"
53 )));
54 }
55
56 let name = fields[0].trim().to_string();
57 let length: u64 = fields[1].trim().parse().map_err(|_| {
58 ParseError::InvalidFormat(format!(
59 "Invalid length on line {}: '{}'",
60 line_num, fields[1]
61 ))
62 })?;
63
64 let mut contig = Contig::new(name, length);
65
66 if fields.len() > 2 {
68 contig.md5 = normalize_md5(fields[2].trim());
70 }
71
72 if check_contig_limit(contigs.len()).is_some() {
74 return Err(ParseError::TooManyContigs(contigs.len()));
75 }
76
77 contigs.push(contig);
78 }
79
80 if contigs.is_empty() {
81 return Err(ParseError::InvalidFormat(
82 "No contigs found in file".to_string(),
83 ));
84 }
85
86 Ok(QueryHeader::new(contigs))
87}
88
#[cfg(test)]
mod tests {
    use super::*;

    // Fixtures spell out `\t` and `\n` explicitly instead of relying on literal
    // tab characters inside raw strings, which are invisible and easily turned
    // into spaces by editors or formatters (that would break the '\t' split).

    /// Tab-separated input with a header row and an optional md5 column.
    #[test]
    fn test_parse_tsv_text() {
        let tsv = "name\tlength\tmd5\n\
                   chr1\t248956422\t6aef897c3d6ff0c78aff06ac189178dd\n\
                   chr2\t242193529\tf98db672eb0993dcfdabafe2a882905c\n\
                   chrM\t16569\n";

        let query = parse_tsv_text(tsv, '\t').unwrap();
        assert_eq!(query.contigs.len(), 3);
        assert_eq!(query.contigs[0].name, "chr1");
        assert_eq!(query.contigs[0].length, 248_956_422);
        assert!(query.contigs[0].md5.is_some());
        assert!(query.contigs[2].md5.is_none());
    }

    /// Comma-separated input goes through the same parser with a `,` delimiter.
    #[test]
    fn test_parse_csv_text() {
        let csv = "chrom,length,md5\n\
                   chr1,248956422,6aef897c3d6ff0c78aff06ac189178dd\n\
                   chr2,242193529,f98db672eb0993dcfdabafe2a882905c\n";

        let query = parse_tsv_text(csv, ',').unwrap();
        assert_eq!(query.contigs.len(), 2);
    }

    /// Input without a header row: the first line is parsed as a record.
    #[test]
    fn test_parse_tsv_no_header() {
        let tsv = "chr1\t248956422\nchr2\t242193529\n";
        let query = parse_tsv_text(tsv, '\t').unwrap();
        assert_eq!(query.contigs.len(), 2);
    }

    /// Comment and blank lines ahead of the header row must not stop the
    /// header from being recognised and skipped.
    #[test]
    fn test_parse_tsv_comments_before_header() {
        let tsv = "# This is a comment\n\
                   # Another comment\n\
                   \n\
                   name\tlength\tmd5\n\
                   chr1\t248956422\t6aef897c3d6ff0c78aff06ac189178dd\n\
                   chr2\t242193529\tf98db672eb0993dcfdabafe2a882905c\n";
        let query = parse_tsv_text(tsv, '\t').unwrap();
        assert_eq!(query.contigs.len(), 2);
        assert_eq!(query.contigs[0].name, "chr1");
    }
}