Skip to main content

rsomics_bed_validate/
lib.rs

1use rsomics_common::{Result, RsomicsError};
2use std::io::BufRead;
3
/// Validation result for a BED file, as produced by `validate`.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so results can be logged,
/// duplicated and compared directly in tests and by callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BedValidation {
    /// Number of data records: non-header, non-blank lines that had at least
    /// three tab-separated fields. Records with invalid coordinates are still
    /// counted; lines rejected for having too few fields are not.
    pub records: u64,
    /// Human-readable error messages, one per problem found, in the order the
    /// problems were encountered. Each message is prefixed with `line N:`.
    pub errors: Vec<String>,
    /// True when `errors` is empty.
    pub is_valid: bool,
}
13
14/// Validate a BED stream for structural correctness.
15///
16/// Checks:
17/// - At least three tab-separated fields per record.
18/// - start and end are valid non-negative integers.
19/// - start <= end.
20///
21/// Lines starting with `#`, `track`, or `browser` are treated as headers and skipped.
22pub fn validate<R: BufRead>(reader: R) -> Result<BedValidation> {
23    let mut records: u64 = 0;
24    let mut errors: Vec<String> = Vec::new();
25    let mut line_num: u64 = 0;
26
27    for line in reader.lines() {
28        let line = line.map_err(RsomicsError::Io)?;
29        line_num += 1;
30
31        if line.starts_with('#')
32            || line.starts_with("track")
33            || line.starts_with("browser")
34            || line.is_empty()
35        {
36            continue;
37        }
38
39        let fields: Vec<&str> = line.split('\t').collect();
40        if fields.len() < 3 {
41            errors.push(format!(
42                "line {line_num}: need >= 3 fields, got {}",
43                fields.len()
44            ));
45            continue;
46        }
47
48        let start = fields[1].parse::<u64>();
49        let end = fields[2].parse::<u64>();
50
51        if start.is_err() {
52            errors.push(format!("line {line_num}: start is not a valid integer"));
53        }
54        if end.is_err() {
55            errors.push(format!("line {line_num}: end is not a valid integer"));
56        }
57        if let (Ok(s), Ok(e)) = (start, end)
58            && s > e
59        {
60            errors.push(format!("line {line_num}: start ({s}) > end ({e})"));
61        }
62
63        records += 1;
64    }
65
66    let is_valid = errors.is_empty();
67    Ok(BedValidation {
68        records,
69        errors,
70        is_valid,
71    })
72}
73
#[cfg(test)]
mod tests {
    //! Unit tests for `validate`, driven by in-memory BED snippets.

    use super::*;
    use std::io::Cursor;

    /// Run `validate` over an in-memory string.
    fn check(bed: &str) -> BedValidation {
        validate(Cursor::new(bed)).expect("reading from an in-memory buffer cannot fail")
    }

    #[test]
    fn valid_bed3_passes() {
        let v = check("chr1\t0\t100\nchr2\t50\t200\n");
        assert!(v.is_valid);
        assert_eq!(v.records, 2);
        assert!(v.errors.is_empty());
    }

    #[test]
    fn empty_input_is_valid() {
        let v = check("");
        assert!(v.is_valid);
        assert_eq!(v.records, 0);
        assert!(v.errors.is_empty());
    }

    #[test]
    fn extra_columns_are_accepted() {
        let v = check("chr1\t0\t100\tfeature\t0\t+\n");
        assert!(v.is_valid);
        assert_eq!(v.records, 1);
    }

    #[test]
    fn crlf_line_endings_are_accepted() {
        let v = check("chr1\t0\t100\r\nchr2\t5\t10\r\n");
        assert!(v.is_valid);
        assert_eq!(v.records, 2);
    }

    #[test]
    fn start_equal_to_end_passes() {
        let v = check("chr1\t100\t100\n");
        assert!(v.is_valid);
        assert_eq!(v.records, 1);
    }

    #[test]
    fn start_greater_than_end_fails() {
        let v = check("chr1\t200\t100\n");
        assert!(!v.is_valid);
        assert_eq!(v.errors.len(), 1);
        assert!(v.errors[0].contains("start (200) > end (100)"));
        // A record with bad coordinates is still counted.
        assert_eq!(v.records, 1);
    }

    #[test]
    fn too_few_fields_fails() {
        let v = check("chr1\t100\n");
        assert!(!v.is_valid);
        assert_eq!(v.errors, vec!["line 1: need >= 3 fields, got 2"]);
        // A line rejected for its field count is not counted as a record.
        assert_eq!(v.records, 0);
    }

    #[test]
    fn non_integer_coordinates_fail() {
        let v = check("chr1\tabc\t100\n");
        assert!(!v.is_valid);
        assert_eq!(v.errors, vec!["line 1: start is not a valid integer"]);
        assert_eq!(v.records, 1);
    }

    #[test]
    fn non_integer_end_fails() {
        let v = check("chr1\t0\txyz\n");
        assert!(!v.is_valid);
        assert_eq!(v.errors, vec!["line 1: end is not a valid integer"]);
    }

    #[test]
    fn both_coordinates_invalid_reports_two_errors() {
        let v = check("chr1\ta\tb\n");
        assert!(!v.is_valid);
        assert_eq!(
            v.errors,
            vec![
                "line 1: start is not a valid integer",
                "line 1: end is not a valid integer",
            ]
        );
    }

    #[test]
    fn negative_coordinate_fails() {
        let v = check("chr1\t-1\t100\n");
        assert!(!v.is_valid);
        assert_eq!(v.errors, vec!["line 1: start is not a valid integer"]);
    }

    #[test]
    fn headers_skipped() {
        let v = check("# comment\ntrack name=test\nbrowser position chr1:1-100\nchr1\t0\t100\n");
        assert!(v.is_valid);
        assert_eq!(v.records, 1);
    }

    #[test]
    fn blank_lines_skipped_and_line_numbers_count_every_line() {
        // Line 1 is a comment and line 2 is empty, so the bad record is line 3.
        let v = check("# comment\n\nchr1\t5\t1\n");
        assert!(!v.is_valid);
        assert_eq!(v.errors, vec!["line 3: start (5) > end (1)"]);
        assert_eq!(v.records, 1);
    }
}