ref_solver/parsing/
vcf.rs1use std::path::Path;
12
13use crate::core::contig::Contig;
14use crate::core::header::QueryHeader;
15use crate::parsing::sam::ParseError;
16use crate::utils::validation::{check_contig_limit, normalize_md5};
17
18pub fn parse_vcf_file(path: &Path) -> Result<QueryHeader, ParseError> {
25 let content = std::fs::read_to_string(path)?;
26 parse_vcf_header_text(&content)
27}
28
29pub fn parse_vcf_header_text(text: &str) -> Result<QueryHeader, ParseError> {
36 let mut contigs = Vec::new();
37
38 for line in text.lines() {
39 if !line.starts_with("##contig=") {
41 if line.starts_with("#CHROM") {
43 break;
44 }
45 continue;
46 }
47
48 if let Some(contig) = parse_contig_line(line)? {
49 if check_contig_limit(contigs.len()).is_some() {
51 return Err(ParseError::TooManyContigs(contigs.len()));
52 }
53 contigs.push(contig);
54 }
55 }
56
57 if contigs.is_empty() {
58 return Err(ParseError::InvalidFormat(
59 "No ##contig lines found in VCF header".to_string(),
60 ));
61 }
62
63 Ok(QueryHeader::new(contigs))
64}
65
66fn parse_contig_line(line: &str) -> Result<Option<Contig>, ParseError> {
68 let content = line
70 .strip_prefix("##contig=<")
71 .and_then(|s| s.strip_suffix('>'))
72 .ok_or_else(|| ParseError::InvalidFormat(format!("Invalid contig line format: {line}")))?;
73
74 let mut name: Option<String> = None;
75 let mut length: Option<u64> = None;
76 let mut md5: Option<String> = None;
77 let mut assembly: Option<String> = None;
78
79 for part in split_contig_fields(content) {
81 if let Some((key, value)) = part.split_once('=') {
82 let key = key.trim();
83 let value = value.trim().trim_matches('"');
85
86 match key.to_lowercase().as_str() {
87 "id" => name = Some(value.to_string()),
88 "length" => length = value.parse().ok(),
89 "md5" => {
90 md5 = normalize_md5(value);
92 }
93 "assembly" => assembly = Some(value.to_string()),
94 _ => {}
95 }
96 }
97 }
98
99 match (name, length) {
100 (Some(name), Some(length)) => {
101 let mut contig = Contig::new(name, length);
102 contig.md5 = md5;
103 contig.assembly = assembly;
104 Ok(Some(contig))
105 }
106 (Some(name), None) => Err(ParseError::InvalidFormat(format!(
107 "Contig '{name}' missing length"
108 ))),
109 _ => Ok(None), }
111}
112
/// Splits the inside of a `##contig=<...>` entry into its comma-separated
/// fields, treating commas between double quotes as part of the field.
///
/// Every `"` toggles the quoted state; quotes are kept in the returned
/// slices. The text after the last separating comma is always returned as a
/// field, so an empty input yields a single empty field and a trailing comma
/// yields a trailing empty field.
fn split_contig_fields(content: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    let mut field_begin = 0;
    let mut quoted = false;

    // Both delimiters are single-byte ASCII, and bytes of multi-byte UTF-8
    // sequences are never ASCII, so scanning bytes finds exactly the same
    // positions as scanning characters and every slice boundary is valid.
    for (pos, byte) in content.bytes().enumerate() {
        if byte == b'"' {
            quoted = !quoted;
        } else if byte == b',' && !quoted {
            pieces.push(&content[field_begin..pos]);
            field_begin = pos + 1;
        }
    }

    // Whatever follows the last separator (possibly nothing) is the final field.
    pieces.push(&content[field_begin..]);
    pieces
}
143
#[cfg(test)]
mod tests {
    use super::*;

    /// A header with three contig lines yields three contigs, carrying the
    /// optional `md5` / `assembly` values where they were given.
    #[test]
    fn test_parse_vcf_header() {
        let header_text = r#"##fileformat=VCFv4.2
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529,md5=f98db672eb0993dcfdabafe2a882905c>
##contig=<ID=chrM,length=16569,assembly=GRCh38>
##INFO=<ID=DP,Number=1,Type=Integer,Description="Depth">
#CHROM POS ID REF ALT QUAL FILTER INFO
"#;

        let header = parse_vcf_header_text(header_text).unwrap();
        let contigs = &header.contigs;
        assert_eq!(contigs.len(), 3);

        let chr1 = &contigs[0];
        assert_eq!(chr1.name, "chr1");
        assert_eq!(chr1.length, 248_956_422);
        assert!(chr1.md5.is_none());

        let chr2 = &contigs[1];
        assert_eq!(chr2.name, "chr2");
        assert_eq!(chr2.length, 242_193_529);
        assert_eq!(
            chr2.md5.as_deref(),
            Some("f98db672eb0993dcfdabafe2a882905c")
        );

        let chr_m = &contigs[2];
        assert_eq!(chr_m.name, "chrM");
        assert_eq!(chr_m.length, 16569);
        assert_eq!(chr_m.assembly.as_deref(), Some("GRCh38"));
    }

    /// A header without any `##contig` line is rejected.
    #[test]
    fn test_parse_vcf_no_contigs() {
        let header_text = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\n";
        assert!(parse_vcf_header_text(header_text).is_err());
    }

    /// A single well-formed contig line is parsed into name, length and md5.
    #[test]
    fn test_parse_contig_line() {
        let parsed = parse_contig_line(
            "##contig=<ID=chr1,length=248956422,md5=6aef897c3d6ff0c78aff06ac189178dd>",
        )
        .unwrap()
        .unwrap();

        assert_eq!(parsed.name, "chr1");
        assert_eq!(parsed.length, 248_956_422);
        assert_eq!(
            parsed.md5.as_deref(),
            Some("6aef897c3d6ff0c78aff06ac189178dd")
        );
    }

    /// Double quotes around a value are removed from the stored value.
    #[test]
    fn test_parse_vcf_quoted_values() {
        let header_text = r#"##fileformat=VCFv4.2
##contig=<ID=chr1,length=248956422,assembly="GRCh38.p14">
#CHROM POS ID REF ALT QUAL FILTER INFO
"#;
        let header = parse_vcf_header_text(header_text).unwrap();
        assert_eq!(header.contigs.len(), 1);

        let only = &header.contigs[0];
        assert_eq!(only.name, "chr1");
        assert_eq!(only.assembly.as_deref(), Some("GRCh38.p14"));
    }

    /// Commas inside double quotes do not split a field.
    #[test]
    fn test_split_contig_fields() {
        let pieces = split_contig_fields(r#"ID=chr1,length=123,desc="foo,bar""#);
        assert_eq!(pieces, ["ID=chr1", "length=123", r#"desc="foo,bar""#]);
    }

    /// Multi-byte characters are sliced on valid boundaries.
    #[test]
    fn test_split_contig_fields_utf8() {
        let pieces = split_contig_fields("ID=chrα,length=123,desc=日本語");
        assert_eq!(pieces, ["ID=chrα", "length=123", "desc=日本語"]);
    }

    /// Empty input yields one empty field; input without commas yields itself.
    #[test]
    fn test_split_contig_fields_empty() {
        assert_eq!(split_contig_fields(""), [""]);
        assert_eq!(split_contig_fields("ID=chr1"), ["ID=chr1"]);
    }
}