Skip to main content

ref_solver/parsing/
tsv.rs

1use std::path::Path;
2
3use crate::core::contig::Contig;
4use crate::core::header::QueryHeader;
5use crate::parsing::sam::ParseError;
6use crate::utils::validation::{check_contig_limit, normalize_md5};
7
8/// Parse a TSV/CSV file with columns: name, length, [md5]
9///
10/// # Errors
11///
12/// Returns `ParseError::Io` if the file cannot be read, or other parse errors
13/// if the content is invalid.
14pub fn parse_tsv_file(path: &Path, delimiter: char) -> Result<QueryHeader, ParseError> {
15    let content = std::fs::read_to_string(path)?;
16    parse_tsv_text(&content, delimiter)
17}
18
19/// Parse TSV/CSV text with columns: name, length, [md5]
20///
21/// # Errors
22///
23/// Returns `ParseError::InvalidFormat` if lines have fewer than 2 fields,
24/// contain invalid length values, or no contigs are found, or
25/// `ParseError::TooManyContigs` if the limit is exceeded.
26pub fn parse_tsv_text(text: &str, delimiter: char) -> Result<QueryHeader, ParseError> {
27    let mut contigs = Vec::new();
28    let mut first_data_line = true;
29
30    for (i, line) in text.lines().enumerate() {
31        let line = line.trim();
32        if line.is_empty() || line.starts_with('#') {
33            continue;
34        }
35
36        let fields: Vec<&str> = line.split(delimiter).collect();
37
38        // Check if first non-empty/non-comment line is a header
39        if first_data_line {
40            first_data_line = false;
41            let first = fields.first().map(|s| s.to_lowercase()).unwrap_or_default();
42            if first == "name" || first == "sn" || first == "contig" || first == "chrom" {
43                continue;
44            }
45        }
46
47        // Line numbers in errors are 1-based for user friendliness
48        let line_num = i + 1;
49
50        if fields.len() < 2 {
51            return Err(ParseError::InvalidFormat(format!(
52                "Line {line_num} has fewer than 2 fields"
53            )));
54        }
55
56        let name = fields[0].trim().to_string();
57        let length: u64 = fields[1].trim().parse().map_err(|_| {
58            ParseError::InvalidFormat(format!(
59                "Invalid length on line {}: '{}'",
60                line_num, fields[1]
61            ))
62        })?;
63
64        let mut contig = Contig::new(name, length);
65
66        // Optional MD5 in third column
67        if fields.len() > 2 {
68            // Validate and normalize MD5 using centralized helper
69            contig.md5 = normalize_md5(fields[2].trim());
70        }
71
72        // Check contig limit for DOS protection
73        if check_contig_limit(contigs.len()).is_some() {
74            return Err(ParseError::TooManyContigs(contigs.len()));
75        }
76
77        contigs.push(contig);
78    }
79
80    if contigs.is_empty() {
81        return Err(ParseError::InvalidFormat(
82            "No contigs found in file".to_string(),
83        ));
84    }
85
86    Ok(QueryHeader::new(contigs))
87}
88
#[cfg(test)]
mod tests {
    use super::*;

    // Inputs are spelled with explicit `\t` / `\n` escapes so the delimiters
    // stay visible and survive editors that rewrite whitespace.

    /// Tab-delimited input with a header row and an optional md5 column.
    #[test]
    fn test_parse_tsv_text() {
        let input = concat!(
            "name\tlength\tmd5\n",
            "chr1\t248956422\t6aef897c3d6ff0c78aff06ac189178dd\n",
            "chr2\t242193529\tf98db672eb0993dcfdabafe2a882905c\n",
            "chrM\t16569\n",
        );

        let header = parse_tsv_text(input, '\t').unwrap();
        let first = &header.contigs[0];

        assert_eq!(header.contigs.len(), 3);
        assert_eq!(first.name, "chr1");
        assert_eq!(first.length, 248_956_422);
        assert!(first.md5.is_some());
        assert!(header.contigs[2].md5.is_none());
    }

    /// Comma-delimited input using the `chrom` header spelling.
    #[test]
    fn test_parse_csv_text() {
        let input = concat!(
            "chrom,length,md5\n",
            "chr1,248956422,6aef897c3d6ff0c78aff06ac189178dd\n",
            "chr2,242193529,f98db672eb0993dcfdabafe2a882905c\n",
        );

        let header = parse_tsv_text(input, ',').unwrap();
        assert_eq!(header.contigs.len(), 2);
    }

    /// Input without any header row: every line is data.
    #[test]
    fn test_parse_tsv_no_header() {
        let input = concat!("chr1\t248956422\n", "chr2\t242193529\n");

        let header = parse_tsv_text(input, '\t').unwrap();
        assert_eq!(header.contigs.len(), 2);
    }

    /// Header detection must still work when comments and a blank line
    /// come before the header row.
    #[test]
    fn test_parse_tsv_comments_before_header() {
        let input = concat!(
            "# This is a comment\n",
            "# Another comment\n",
            "\n",
            "name\tlength\tmd5\n",
            "chr1\t248956422\t6aef897c3d6ff0c78aff06ac189178dd\n",
            "chr2\t242193529\tf98db672eb0993dcfdabafe2a882905c\n",
        );

        let header = parse_tsv_text(input, '\t').unwrap();
        assert_eq!(header.contigs.len(), 2);
        assert_eq!(header.contigs[0].name, "chr1");
    }
}