cgats 0.2.0

Parse, transform, and write CGATS color files
Documentation
//! Parsing utilities

use crate::*;
use std::str::FromStr;

impl FromStr for Cgats {
    type Err = BoxErr;
    fn from_str(s: &str) -> Result<Self> {
        let mut cgats = Cgats::new();

        let mut lines = s.lines()
            .flat_map(|line| line.split('\r'))
            .map(|line| line.trim_end());

        cgats.vendor = lines.next().ok_or("NO DATA")?.parse()?;

        let mut push = Push::MetaData;
        let (mut hit_format, mut hit_data) = (false, false);
        for next in lines {
            match next.trim() {
                "BEGIN_DATA_FORMAT" => {
                    push = Push::DataFormat;
                    hit_format = true;
                    continue;
                }
                "END_DATA_FORMAT" => {
                    push = Push::MetaData;
                    cgats.data = Vec::with_capacity(cgats.len());
                    continue;
                }
                "BEGIN_DATA" => {
                    push = Push::Data;
                    hit_data = true;
                    continue;
                }
                "END_DATA" => {
                    push = Push::Stop;
                    continue;
                }
                _ => (),
            }

            match &push {
                Push::MetaData => cgats.metadata.push(next.parse()?),
                Push::DataFormat => cgats.data_format = next.parse()?,
                Push::Data => {
                    for val in next.split_whitespace() {
                        cgats.data.push(val.parse()?);
                    }
                }
                Push::Stop => (),
            }
        }

        // Set the implied ColorBurst DataFormat
        if cgats.vendor == Vendor::ColorBurst {
            cgats.data_format = DataFormat::colorburst();
        }

        // Force SAMPLE_ID into [`DataPoint::Int`]
        if let Some(index) = cgats.index_by_field(&SAMPLE_ID) {
            let id_column = cgats.get_col_mut(index);
            for id in id_column {
                *id = id.to_int().unwrap_or_else(|_| id.to_owned());
            }
        }

        // Return an error if we didn't hit BEGIN_DATA_FORMAT tag
        if !hit_format && cgats.vendor != Vendor::ColorBurst {
            return err!("DATA_FORMAT tag not found");
        }

        // Return an error if we didn't hit BEGIN_DATA tag
        if !hit_data {
            return err!("BEGIN_DATA tag not found");
        }

        // Return an error if there is data without a format
        if cgats.data_format.is_empty() && !cgats.is_empty() {
            return err!("DATA exists, but DATA_FORMAT is empty");
        }

        // Return an error if the DATA doesn't fit the DATA_FORMAT
        if cgats.n_rows() * cgats.n_cols() != cgats.len() {
            return err!("rows * cols != len");
        }

        Ok(cgats)
    }
}

/// Parser state for [`Cgats`] parsing: which section of the file the current
/// line belongs to, and therefore where it is stored.
#[derive(Debug)]
enum Push {
    /// Lines are parsed as metadata entries (the state before any section
    /// tag and again after `END_DATA_FORMAT`).
    MetaData,
    /// Lines are parsed as the data format (after `BEGIN_DATA_FORMAT`).
    DataFormat,
    /// Lines are split on whitespace and stored as data points (after
    /// `BEGIN_DATA`).
    Data,
    /// `END_DATA` has been seen; remaining lines are ignored.
    Stop,
}

impl FromStr for MetaData {
    type Err = BoxErr;

    /// Parse a single metadata line.
    ///
    /// * A line that is empty or only whitespace becomes [`MetaData::Blank`].
    /// * A line whose first non-whitespace character is `#` becomes
    ///   [`MetaData::Comment`], holding the line exactly as it was passed in.
    /// * Any other line becomes [`MetaData::KeyVal`]: the key is the first
    ///   whitespace-delimited token and the value is the rest of the line
    ///   with surrounding whitespace removed. Whitespace inside the value is
    ///   kept as written, so values such as `"June 1, 2020"` survive intact.
    ///   A line holding only a key gets an empty value.
    ///
    /// This conversion never returns an error.
    fn from_str(s: &str) -> Result<Self> {
        let trimmed = s.trim();

        if trimmed.is_empty() {
            return Ok(MetaData::Blank);
        }

        if trimmed.starts_with('#') {
            // Comments keep their original leading whitespace.
            return Ok(MetaData::Comment(s.to_owned()));
        }

        // Split only at the first run of whitespace. Collecting the
        // whitespace-split tokens straight into a string would glue them
        // together with no separator and corrupt multi-word values.
        let (key, val) = match trimmed.split_once(char::is_whitespace) {
            Some((key, rest)) => (key, rest.trim_start()),
            None => (trimmed, ""),
        };

        Ok(MetaData::KeyVal { key: key.into(), val: val.into() })
    }
}

impl FromStr for DataFormat {
    type Err = BoxErr;

    /// Build a [`DataFormat`] from a whitespace-separated list of field names.
    ///
    /// Each token is parsed into a field in order; the first token that fails
    /// to parse aborts the conversion with that error. An empty or
    /// whitespace-only string yields a format with no fields.
    fn from_str(s: &str) -> Result<Self> {
        // `split_whitespace` already ignores leading and trailing whitespace.
        let fields = s
            .split_whitespace()
            .map(|name| name.parse())
            .collect::<std::result::Result<Vec<_>, _>>()?;

        Ok(DataFormat { fields })
    }
}

/// Returns `true` when `c` is anything other than an ASCII digit, `.`, or `-`.
fn alpha(c: char) -> bool {
    !matches!(c, '0'..='9' | '.' | '-')
}

#[test]
fn test_has_alpha() {
    // Letters, including the exponent markers `e` and `E`, are alphabetic.
    for letter in ['a', 'e', 'E', 'Z'] {
        assert!(alpha(letter));
    }

    // ASCII digits are not.
    for digit in ['1', '2', '0', '9'] {
        assert!(!alpha(digit));
    }
}

impl FromStr for DataPoint {
    type Err = BoxErr;

    /// Classify a single token as an integer, a float, or text.
    ///
    /// The token is trimmed first. If it contains any character for which
    /// `alpha` returns `true`, it is kept as `Alpha` without attempting a
    /// numeric parse, so scientific notation such as `1E3` stays text.
    /// Otherwise an integer parse is tried, then a float parse, and `Alpha`
    /// is the fallback. This conversion never returns an error.
    fn from_str(s: &str) -> Result<Self> {
        let token = s.trim();

        // Only tokens made solely of digits, `.` and `-` are number candidates.
        if !token.contains(alpha) {
            if let Ok(int) = token.parse() {
                return Ok(Int(int));
            }
            if let Ok(float) = token.parse() {
                return Ok(Float(float));
            }
        }

        Ok(Alpha(token.into()))
    }
}

#[test]
fn data_point_from_str() -> Result<()> {
    // A plain integer parses as `Int`.
    match "42".parse()? {
        Int(42) => {}
        _ => panic!(),
    }

    // A decimal number parses as `Float`.
    match "42.0".parse()? {
        Float(value) if value == 42.0 => {}
        _ => panic!(),
    }

    // Digits mixed with letters stay text.
    match "1A1".parse()? {
        Alpha(text) if text == "1A1" => {}
        _ => panic!(),
    }

    // Scientific notation is deliberately kept as text.
    match "1E3".parse()? {
        Alpha(text) if text == "1E3" => {}
        Alpha(_) => panic!(),
        x => panic!("type is {x:?} but expected Alpha(\"1E3\")"),
    }

    Ok(())
}

#[test]
fn parse_file() {
    // Read the fixture into memory in one call.
    let raw = std::fs::read_to_string("test_files/cgats1.tsv").unwrap();

    let cgats: Cgats = raw.parse().unwrap();
    dbg!(&cgats, cgats.n_cols(), cgats.n_rows(), cgats.len());

    dbg!(&cgats.data_format);

    // Second value of the row at index 1.
    let mut second_row = cgats.get_row(1).unwrap();
    assert_eq!(second_row.nth(1), Some(&Alpha("Magenta".into())));

    // Fourth value of the column at index 1.
    let mut second_col = cgats.get_col(1);
    assert_eq!(second_col.nth(3), Some(&Alpha("Black".into())));
}

#[test]
fn parse_err() {
    // Reports whether `text` parses into a `Cgats` without error.
    let parses = |text: &str| text.parse::<Cgats>().is_ok();

    // Both sections present, both empty: accepted.
    assert!(parses(
        "CGATS.17\n    BEGIN_DATA_FORMAT\n    END_DATA_FORMAT\n    BEGIN_DATA\n    END_DATA"
    ));

    // DATA section missing: rejected.
    assert!(!parses("CGATS.17\n    BEGIN_DATA_FORMAT\n    END_DATA_FORMAT"));

    // DATA_FORMAT section missing: rejected.
    assert!(!parses("CGATS.17\n    BEGIN_DATA\n    END_DATA"));
}