rsomics-bed-validate 0.1.0

Validate BED file format: check field counts, coordinate ordering, and integer parsing
Documentation
use rsomics_common::{Result, RsomicsError};
use std::io::BufRead;

/// Outcome of validating a BED stream.
///
/// Format problems never abort validation; they are accumulated in `errors`
/// so a single pass reports everything that is wrong with the input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BedValidation {
    /// Number of data records seen.
    ///
    /// Only lines that are neither headers nor empty and that carry at least
    /// three tab-separated fields are counted. A counted record may still have
    /// coordinate problems listed in `errors`; lines with fewer than three
    /// fields are reported in `errors` but not counted here.
    pub records: u64,
    /// Human-readable error messages, one per problem found, each prefixed
    /// with the 1-based line number it refers to.
    pub errors: Vec<String>,
    /// True when `errors` is empty.
    pub is_valid: bool,
}

/// Returns `true` for lines that carry no BED record: `#` comments and
/// `track` / `browser` directive lines.
///
/// The directive keyword must be the complete first whitespace-delimited
/// token, so a record whose chromosome name merely begins with `track` or
/// `browser` (for example `tracker1`) is not mistaken for a header.
fn is_header_line(line: &str) -> bool {
    if line.starts_with('#') {
        return true;
    }
    // `split` always yields at least one (possibly empty) piece.
    let first_token = line.split(char::is_whitespace).next().unwrap_or("");
    matches!(first_token, "track" | "browser")
}

/// Validate a BED stream for structural correctness.
///
/// Checks, for every record:
/// - At least three tab-separated fields.
/// - start and end are valid non-negative integers (they must parse as `u64`).
/// - start <= end.
///
/// Lines starting with `#`, `track` / `browser` directive lines, and blank
/// (empty or whitespace-only) lines are skipped. Line numbers in error
/// messages are 1-based and count every line read, including skipped ones.
///
/// A record with fewer than three fields is reported but not counted in
/// `BedValidation::records`; a record with bad coordinates is reported and
/// counted.
///
/// # Errors
///
/// Returns `RsomicsError::Io` if reading a line from `reader` fails. Format
/// problems are never returned as `Err`; they are collected in
/// `BedValidation::errors`.
pub fn validate<R: BufRead>(reader: R) -> Result<BedValidation> {
    let mut records: u64 = 0;
    let mut errors: Vec<String> = Vec::new();
    let mut line_num: u64 = 0;

    for line in reader.lines() {
        let line = line.map_err(RsomicsError::Io)?;
        line_num += 1;

        if line.trim().is_empty() || is_header_line(&line) {
            continue;
        }

        let fields: Vec<&str> = line.split('\t').collect();
        // The slice pattern both checks the field count and extracts the two
        // coordinate columns, so no unchecked indexing is needed below.
        let (raw_start, raw_end) = match fields.as_slice() {
            [_, start, end, ..] => (*start, *end),
            short => {
                errors.push(format!(
                    "line {line_num}: need >= 3 fields, got {}",
                    short.len()
                ));
                continue;
            }
        };

        match (raw_start.parse::<u64>(), raw_end.parse::<u64>()) {
            (Ok(s), Ok(e)) => {
                if s > e {
                    errors.push(format!("line {line_num}: start ({s}) > end ({e})"));
                }
            }
            (start, end) => {
                // At least one coordinate failed to parse; report each failure
                // separately, start first.
                if start.is_err() {
                    errors.push(format!("line {line_num}: start is not a valid integer"));
                }
                if end.is_err() {
                    errors.push(format!("line {line_num}: end is not a valid integer"));
                }
            }
        }

        records += 1;
    }

    let is_valid = errors.is_empty();
    Ok(BedValidation {
        records,
        errors,
        is_valid,
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Runs `validate` over an in-memory BED document, panicking if it
    /// returns an error.
    fn check(input: &str) -> BedValidation {
        validate(Cursor::new(input)).unwrap()
    }

    /// Two well-formed BED3 records produce a clean report.
    #[test]
    fn valid_bed3_passes() {
        let report = check("chr1\t0\t100\nchr2\t50\t200\n");
        assert!(report.is_valid);
        assert_eq!(report.records, 2);
        assert!(report.errors.is_empty());
    }

    /// A record whose start exceeds its end yields exactly one error.
    #[test]
    fn start_greater_than_end_fails() {
        let report = check("chr1\t200\t100\n");
        assert!(!report.is_valid);
        assert_eq!(report.errors.len(), 1);
        assert!(report.errors[0].contains("start (200) > end (100)"));
    }

    /// A line with only two fields is rejected.
    #[test]
    fn too_few_fields_fails() {
        let report = check("chr1\t100\n");
        assert!(!report.is_valid);
        assert!(report.errors[0].contains("need >= 3 fields"));
    }

    /// A non-numeric start coordinate is reported first.
    #[test]
    fn non_integer_coordinates_fail() {
        let report = check("chr1\tabc\t100\n");
        assert!(!report.is_valid);
        assert!(report.errors[0].contains("start is not a valid integer"));
    }

    /// Comment, `track` and `browser` lines do not count as records.
    #[test]
    fn headers_skipped() {
        let report =
            check("# comment\ntrack name=test\nbrowser position chr1:1-100\nchr1\t0\t100\n");
        assert!(report.is_valid);
        assert_eq!(report.records, 1);
    }
}