Skip to main content

sshash_lib/builder/
cf_seg.rs

1//! Parser for cuttlefish `.cf_seg` segment files
2//!
3//! The `.cf_seg` format encodes the unitigs (segments) of a compacted de Bruijn
4//! graph. Each line contains a numeric segment identifier followed by the unitig
5//! DNA sequence, separated by whitespace:
6//!
7//! ```text
8//! 47966863    CGCACATCCGTATCATGAAG...
9//! 19947264    TATGAGGGTGGGAAGGTTGC...
10//! ```
11//!
12//! Segment IDs are unique within a file but are neither ordered nor contiguous.
13//! The sequences are the spectrum-preserving string set (SPSS) over which
14//! sshash builds its dictionary.
15
16use anyhow::{Context, Result, bail};
17use std::io::{BufRead, BufReader};
18use std::path::Path;
19
/// In-memory representation of a parsed `.cf_seg` file.
///
/// `segment_ids` and `sequences` are parallel vectors: position `i` of one
/// describes the same unitig as position `i` of the other. Feeding
/// `sequences` to
/// [`DictionaryBuilder::build_from_sequences`](super::DictionaryBuilder::build_from_sequences)
/// gives every sequence an sshash string ID equal to its position here, which
/// is what establishes the segment_id → string_id mapping.
#[derive(Debug, Clone)]
pub struct CfSegData {
    /// Cuttlefish segment identifier of each unitig, in file order.
    pub segment_ids: Vec<u64>,
    /// DNA sequence (unitig) of each segment, one per line, in file order.
    pub sequences: Vec<String>,
}

impl CfSegData {
    /// Returns how many segments are stored, counted on the `sequences` vector.
    pub fn len(&self) -> usize {
        let Self { sequences, .. } = self;
        sequences.len()
    }

    /// Returns `true` when no segment is stored.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
46
47/// Parse a `.cf_seg` file into segment IDs and sequences.
48///
49/// # Arguments
50/// * `path` - Path to the `.cf_seg` file
51///
52/// # Errors
53/// Returns an error if the file cannot be opened, a line is malformed
54/// (missing ID or sequence), or a segment ID cannot be parsed as `u64`.
55pub fn parse_cf_seg<P: AsRef<Path>>(path: P) -> Result<CfSegData> {
56    let path = path.as_ref();
57    let file = std::fs::File::open(path)
58        .with_context(|| format!("Failed to open cf_seg file: {}", path.display()))?;
59    let reader = BufReader::new(file);
60
61    let mut segment_ids = Vec::new();
62    let mut sequences = Vec::new();
63
64    for (line_num, line) in reader.lines().enumerate() {
65        let line = line
66            .with_context(|| format!("Failed to read line {} of {}", line_num + 1, path.display()))?;
67        let line = line.trim();
68        if line.is_empty() {
69            continue;
70        }
71
72        let (id_str, seq) = line.split_once(|c: char| c.is_ascii_whitespace())
73            .with_context(|| {
74                format!(
75                    "Malformed cf_seg line {} in {}: expected '<id> <sequence>'",
76                    line_num + 1,
77                    path.display()
78                )
79            })?;
80
81        let seg_id: u64 = id_str.parse()
82            .with_context(|| {
83                format!(
84                    "Invalid segment ID '{}' on line {} of {}",
85                    id_str,
86                    line_num + 1,
87                    path.display()
88                )
89            })?;
90
91        let seq = seq.trim();
92        if seq.is_empty() {
93            bail!(
94                "Empty sequence on line {} of {}",
95                line_num + 1,
96                path.display()
97            );
98        }
99
100        super::parse::validate_dna_sequence(seq.as_bytes())
101            .with_context(|| {
102                format!(
103                    "Invalid DNA in segment {} on line {} of {}",
104                    seg_id,
105                    line_num + 1,
106                    path.display()
107                )
108            })?;
109
110        segment_ids.push(seg_id);
111        sequences.push(seq.to_uppercase());
112    }
113
114    Ok(CfSegData {
115        segment_ids,
116        sequences,
117    })
118}
119
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file holding `lines`, each terminated by a newline,
    /// and flushes it so it can be read back through its path.
    fn cf_seg_fixture(lines: &[&str]) -> std::io::Result<NamedTempFile> {
        let mut file = NamedTempFile::new()?;
        for line in lines {
            writeln!(file, "{}", line)?;
        }
        file.flush()?;
        Ok(file)
    }

    /// Tab-separated lines yield IDs and sequences in file order.
    #[test]
    fn test_parse_cf_seg_basic() -> Result<()> {
        let file = cf_seg_fixture(&["100\tACGTACGT", "42\tTGCATGCA", "999\tAAAACCCC"])?;

        let parsed = parse_cf_seg(file.path())?;
        assert_eq!(parsed.len(), 3);
        assert_eq!(parsed.segment_ids, vec![100, 42, 999]);
        assert_eq!(parsed.sequences, vec!["ACGTACGT", "TGCATGCA", "AAAACCCC"]);
        Ok(())
    }

    /// A space is accepted as the separator alongside a tab.
    #[test]
    fn test_parse_cf_seg_spaces() -> Result<()> {
        let file = cf_seg_fixture(&["1 ACGT", "2\tTGCA"])?;

        let parsed = parse_cf_seg(file.path())?;
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed.segment_ids, vec![1, 2]);
        Ok(())
    }

    /// Empty lines between records are ignored.
    #[test]
    fn test_parse_cf_seg_skips_blank_lines() -> Result<()> {
        let file = cf_seg_fixture(&["1\tACGT", "", "2\tTGCA"])?;

        let parsed = parse_cf_seg(file.path())?;
        assert_eq!(parsed.len(), 2);
        Ok(())
    }

    /// Lowercase input is stored upper-cased.
    #[test]
    fn test_parse_cf_seg_lowercase() -> Result<()> {
        let file = cf_seg_fixture(&["1\tacgtacgt"])?;

        let parsed = parse_cf_seg(file.path())?;
        assert_eq!(parsed.sequences[0], "ACGTACGT");
        Ok(())
    }

    /// A line with an ID but no sequence is rejected.
    #[test]
    fn test_parse_cf_seg_malformed_no_seq() {
        let file = cf_seg_fixture(&["42"]).unwrap();

        assert!(parse_cf_seg(file.path()).is_err());
    }

    /// A non-numeric segment ID is rejected.
    #[test]
    fn test_parse_cf_seg_bad_id() {
        let file = cf_seg_fixture(&["notanumber\tACGT"]).unwrap();

        assert!(parse_cf_seg(file.path()).is_err());
    }

    /// A sequence containing `N` is rejected.
    #[test]
    fn test_parse_cf_seg_invalid_dna() {
        let file = cf_seg_fixture(&["1\tACGTNACGT"]).unwrap();

        assert!(parse_cf_seg(file.path()).is_err());
    }
}