Skip to main content

ref_solver/parsing/
vcf.rs

1//! Parser for VCF header contig lines.
2//!
3//! VCF files have contig definitions in the header as:
4//! `##contig=<ID=chr1,length=248956422>`
5//!
6//! Additional fields like `md5` and `assembly` may also be present.
7//!
//! Contig lines are parsed with a small hand-written text parser in this
//! module, which ensures all fields (including MD5) are properly extracted.
10
11use std::path::Path;
12
13use crate::core::contig::Contig;
14use crate::core::header::QueryHeader;
15use crate::parsing::sam::ParseError;
16use crate::utils::validation::{check_contig_limit, normalize_md5};
17
18/// Parse VCF file and extract contig definitions from header
19///
20/// # Errors
21///
22/// Returns `ParseError::Io` if the file cannot be read, or other parse errors
23/// if the content is invalid.
24pub fn parse_vcf_file(path: &Path) -> Result<QueryHeader, ParseError> {
25    let content = std::fs::read_to_string(path)?;
26    parse_vcf_header_text(&content)
27}
28
29/// Parse VCF header text and extract contig definitions
30///
31/// # Errors
32///
33/// Returns `ParseError::InvalidFormat` if contig lines are malformed or no
34/// contig lines are found, or `ParseError::TooManyContigs` if the limit is exceeded.
35pub fn parse_vcf_header_text(text: &str) -> Result<QueryHeader, ParseError> {
36    let mut contigs = Vec::new();
37
38    for line in text.lines() {
39        // VCF contig lines start with ##contig=
40        if !line.starts_with("##contig=") {
41            // Stop at the header line (starts with #CHROM)
42            if line.starts_with("#CHROM") {
43                break;
44            }
45            continue;
46        }
47
48        if let Some(contig) = parse_contig_line(line)? {
49            // Check contig limit for DOS protection
50            if check_contig_limit(contigs.len()).is_some() {
51                return Err(ParseError::TooManyContigs(contigs.len()));
52            }
53            contigs.push(contig);
54        }
55    }
56
57    if contigs.is_empty() {
58        return Err(ParseError::InvalidFormat(
59            "No ##contig lines found in VCF header".to_string(),
60        ));
61    }
62
63    Ok(QueryHeader::new(contigs))
64}
65
66/// Parse a single ##contig=<...> line
67fn parse_contig_line(line: &str) -> Result<Option<Contig>, ParseError> {
68    // Format: ##contig=<ID=chr1,length=248956422,md5=abc123,...>
69    let content = line
70        .strip_prefix("##contig=<")
71        .and_then(|s| s.strip_suffix('>'))
72        .ok_or_else(|| ParseError::InvalidFormat(format!("Invalid contig line format: {line}")))?;
73
74    let mut name: Option<String> = None;
75    let mut length: Option<u64> = None;
76    let mut md5: Option<String> = None;
77    let mut assembly: Option<String> = None;
78
79    // Parse key=value pairs, handling quoted values
80    for part in split_contig_fields(content) {
81        if let Some((key, value)) = part.split_once('=') {
82            let key = key.trim();
83            // Remove quotes from value if present
84            let value = value.trim().trim_matches('"');
85
86            match key.to_lowercase().as_str() {
87                "id" => name = Some(value.to_string()),
88                "length" => length = value.parse().ok(),
89                "md5" => {
90                    // Validate and normalize MD5 using centralized helper
91                    md5 = normalize_md5(value);
92                }
93                "assembly" => assembly = Some(value.to_string()),
94                _ => {}
95            }
96        }
97    }
98
99    match (name, length) {
100        (Some(name), Some(length)) => {
101            let mut contig = Contig::new(name, length);
102            contig.md5 = md5;
103            contig.assembly = assembly;
104            Ok(Some(contig))
105        }
106        (Some(name), None) => Err(ParseError::InvalidFormat(format!(
107            "Contig '{name}' missing length"
108        ))),
109        _ => Ok(None), // Skip malformed lines without ID
110    }
111}
112
/// Break the inside of a `##contig=<...>` entry into its comma-separated
/// fields, treating commas that appear between double quotes as ordinary text.
///
/// Quotes are kept in the returned slices. At least one slice is always
/// returned (an empty input yields a single empty field), and an unbalanced
/// quote makes the remainder of the input one field.
///
/// Scanning bytes is UTF-8 safe here: `"` and `,` are single-byte ASCII and
/// can never occur inside a multi-byte sequence, so every cut lands on a
/// character boundary.
fn split_contig_fields(content: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    let mut field_begin = 0;
    let mut quoted = false;

    for (pos, byte) in content.bytes().enumerate() {
        if byte == b'"' {
            quoted = !quoted;
        } else if byte == b',' && !quoted {
            pieces.push(&content[field_begin..pos]);
            // Skip over the one-byte comma to the start of the next field.
            field_begin = pos + 1;
        }
    }

    // Whatever follows the last separator (possibly nothing) is the final field.
    pieces.push(&content[field_begin..]);
    pieces
}
143
#[cfg(test)]
mod tests {
    use super::*;

    /// A complete header: three contig lines, one unrelated `##INFO` line and
    /// the `#CHROM` column line.
    #[test]
    fn test_parse_vcf_header() {
        let header_text = r#"##fileformat=VCFv4.2
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529,md5=f98db672eb0993dcfdabafe2a882905c>
##contig=<ID=chrM,length=16569,assembly=GRCh38>
##INFO=<ID=DP,Number=1,Type=Integer,Description="Depth">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
"#;

        let parsed = parse_vcf_header_text(header_text).unwrap();
        assert_eq!(parsed.contigs.len(), 3);

        let first = &parsed.contigs[0];
        assert_eq!(first.name, "chr1");
        assert_eq!(first.length, 248_956_422);
        assert!(first.md5.is_none());

        let second = &parsed.contigs[1];
        assert_eq!(second.name, "chr2");
        assert_eq!(second.length, 242_193_529);
        assert_eq!(
            second.md5.as_deref(),
            Some("f98db672eb0993dcfdabafe2a882905c")
        );

        let third = &parsed.contigs[2];
        assert_eq!(third.name, "chrM");
        assert_eq!(third.length, 16569);
        assert_eq!(third.assembly.as_deref(), Some("GRCh38"));
    }

    /// A header without any `##contig` line must be rejected.
    #[test]
    fn test_parse_vcf_no_contigs() {
        let outcome = parse_vcf_header_text("##fileformat=VCFv4.2\n#CHROM\tPOS\tID\n");
        assert!(outcome.is_err());
    }

    /// A single contig line carrying ID, length and MD5.
    #[test]
    fn test_parse_contig_line() {
        let parsed = parse_contig_line(
            "##contig=<ID=chr1,length=248956422,md5=6aef897c3d6ff0c78aff06ac189178dd>",
        )
        .unwrap()
        .unwrap();

        assert_eq!(parsed.name, "chr1");
        assert_eq!(parsed.length, 248_956_422);
        assert_eq!(
            parsed.md5.as_deref(),
            Some("6aef897c3d6ff0c78aff06ac189178dd")
        );
    }

    /// Double quotes around a value are stripped from the stored field.
    #[test]
    fn test_parse_vcf_quoted_values() {
        let header_text = r#"##fileformat=VCFv4.2
##contig=<ID=chr1,length=248956422,assembly="GRCh38.p14">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
"#;

        let parsed = parse_vcf_header_text(header_text).unwrap();
        assert_eq!(parsed.contigs.len(), 1);

        let only = &parsed.contigs[0];
        assert_eq!(only.name, "chr1");
        assert_eq!(only.assembly.as_deref(), Some("GRCh38.p14"));
    }

    /// A comma inside a quoted value does not split the field.
    #[test]
    fn test_split_contig_fields() {
        let pieces = split_contig_fields(r#"ID=chr1,length=123,desc="foo,bar""#);
        assert_eq!(pieces, ["ID=chr1", "length=123", r#"desc="foo,bar""#]);
    }

    /// Multi-byte characters must not disturb the split positions.
    #[test]
    fn test_split_contig_fields_utf8() {
        let pieces = split_contig_fields("ID=chrα,length=123,desc=日本語");
        assert_eq!(pieces, ["ID=chrα", "length=123", "desc=日本語"]);
    }

    /// Edge cases: empty input and input without any separator.
    #[test]
    fn test_split_contig_fields_empty() {
        // Empty string still yields one (empty) field.
        assert_eq!(split_contig_fields(""), [""]);

        // A lone field is returned unchanged.
        assert_eq!(split_contig_fields("ID=chr1"), ["ID=chr1"]);
    }
}