sshash_lib/builder/
cf_seg.rs1use anyhow::{Context, Result, bail};
17use std::io::{BufRead, BufReader};
18use std::path::Path;
19
/// Segment records loaded from a cf_seg file.
///
/// The two vectors are parallel: `segment_ids[i]` and `sequences[i]` describe the
/// same record. `parse_cf_seg` pushes to both in lockstep, in file order. The
/// fields are public, so code that builds a value by hand is responsible for
/// keeping the vectors the same length.
///
/// `Default` yields an empty data set; `PartialEq`/`Eq` compare both vectors
/// element-wise, which lets callers and tests compare whole parse results.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CfSegData {
    /// Numeric segment IDs, one per record.
    pub segment_ids: Vec<u64>,
    /// DNA sequences, one per record (`parse_cf_seg` stores them uppercased).
    pub sequences: Vec<String>,
}
34
35impl CfSegData {
36 pub fn len(&self) -> usize {
38 self.sequences.len()
39 }
40
41 pub fn is_empty(&self) -> bool {
43 self.sequences.is_empty()
44 }
45}
46
47pub fn parse_cf_seg<P: AsRef<Path>>(path: P) -> Result<CfSegData> {
56 let path = path.as_ref();
57 let file = std::fs::File::open(path)
58 .with_context(|| format!("Failed to open cf_seg file: {}", path.display()))?;
59 let reader = BufReader::new(file);
60
61 let mut segment_ids = Vec::new();
62 let mut sequences = Vec::new();
63
64 for (line_num, line) in reader.lines().enumerate() {
65 let line = line
66 .with_context(|| format!("Failed to read line {} of {}", line_num + 1, path.display()))?;
67 let line = line.trim();
68 if line.is_empty() {
69 continue;
70 }
71
72 let (id_str, seq) = line.split_once(|c: char| c.is_ascii_whitespace())
73 .with_context(|| {
74 format!(
75 "Malformed cf_seg line {} in {}: expected '<id> <sequence>'",
76 line_num + 1,
77 path.display()
78 )
79 })?;
80
81 let seg_id: u64 = id_str.parse()
82 .with_context(|| {
83 format!(
84 "Invalid segment ID '{}' on line {} of {}",
85 id_str,
86 line_num + 1,
87 path.display()
88 )
89 })?;
90
91 let seq = seq.trim();
92 if seq.is_empty() {
93 bail!(
94 "Empty sequence on line {} of {}",
95 line_num + 1,
96 path.display()
97 );
98 }
99
100 super::parse::validate_dna_sequence(seq.as_bytes())
101 .with_context(|| {
102 format!(
103 "Invalid DNA in segment {} on line {} of {}",
104 seg_id,
105 line_num + 1,
106 path.display()
107 )
108 })?;
109
110 segment_ids.push(seg_id);
111 sequences.push(seq.to_uppercase());
112 }
113
114 Ok(CfSegData {
115 segment_ids,
116 sequences,
117 })
118}
119
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file containing `lines`, each terminated by a newline,
    /// and flushes it so the parser can read it back through its path.
    fn fixture(lines: &[&str]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        for entry in lines {
            writeln!(file, "{}", entry).unwrap();
        }
        file.flush().unwrap();
        file
    }

    /// Tab-separated records come back in file order.
    #[test]
    fn test_parse_cf_seg_basic() {
        let file = fixture(&["100\tACGTACGT", "42\tTGCATGCA", "999\tAAAACCCC"]);

        let parsed = parse_cf_seg(file.path()).unwrap();
        assert_eq!(parsed.len(), 3);
        assert_eq!(parsed.segment_ids, vec![100, 42, 999]);
        assert_eq!(parsed.sequences, vec!["ACGTACGT", "TGCATGCA", "AAAACCCC"]);
    }

    /// A space works as the separator just like a tab.
    #[test]
    fn test_parse_cf_seg_spaces() {
        let file = fixture(&["1 ACGT", "2\tTGCA"]);

        let parsed = parse_cf_seg(file.path()).unwrap();
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed.segment_ids, vec![1, 2]);
    }

    /// Blank lines between records are ignored.
    #[test]
    fn test_parse_cf_seg_skips_blank_lines() {
        let file = fixture(&["1\tACGT", "", "2\tTGCA"]);

        let parsed = parse_cf_seg(file.path()).unwrap();
        assert_eq!(parsed.len(), 2);
    }

    /// Lowercase input is stored uppercased.
    #[test]
    fn test_parse_cf_seg_lowercase() {
        let file = fixture(&["1\tacgtacgt"]);

        let parsed = parse_cf_seg(file.path()).unwrap();
        assert_eq!(parsed.sequences[0], "ACGTACGT");
    }

    /// A line holding only an ID is rejected.
    #[test]
    fn test_parse_cf_seg_malformed_no_seq() {
        let file = fixture(&["42"]);

        assert!(parse_cf_seg(file.path()).is_err());
    }

    /// A non-numeric ID is rejected.
    #[test]
    fn test_parse_cf_seg_bad_id() {
        let file = fixture(&["notanumber\tACGT"]);

        assert!(parse_cf_seg(file.path()).is_err());
    }

    /// A sequence containing `N` is rejected.
    #[test]
    fn test_parse_cf_seg_invalid_dna() {
        let file = fixture(&["1\tACGTNACGT"]);

        assert!(parse_cf_seg(file.path()).is_err());
    }
}