// onelib 0.1.0 - Docs.rs

//! ONEcode schema definitions and parsing.
//!
//! A schema describes the line types in a ONEcode file: which letter is the
//! object type, which are auxiliary data lines, which is the group type, and
//! what fields each line type carries.

use std::collections::HashMap;

use crate::types::FieldType;

/// Per-line-type metadata, describing the fields and role of a line type.
///
/// Built by [`LineInfo::new`], which derives `list_field` and
/// `list_elt_size` from `field_types` and leaves `comment` as `None`.
#[derive(Debug, Clone)]
pub struct LineInfo {
    /// Field types in schema order.
    pub field_types: Vec<FieldType>,
    /// Index of the list field, if any (at most one per line type).
    ///
    /// When several fields are list-typed, this is the index of the first.
    pub list_field: Option<usize>,
    /// Size in bytes of a single list element (0 if no list).
    pub list_elt_size: usize,
    /// Whether this is an object type (indexed, counted).
    pub is_object: bool,
    /// Whether this is a group type.
    ///
    /// A line type with this flag set and no `field_types` is a bare group
    /// placeholder (e.g. from `G S 0`) that has not received an O/D
    /// definition.
    pub is_group: bool,
    /// Comment from the schema definition line, if any.
    pub comment: Option<String>,
}

impl LineInfo {
    /// Build the metadata for one line type from its ordered field types.
    ///
    /// The first list-typed field (if any) becomes `list_field` and its
    /// element size is stored in `list_elt_size`; without a list field these
    /// are `None` and `0`. The `comment` always starts out as `None`.
    pub fn new(field_types: Vec<FieldType>, is_object: bool, is_group: bool) -> Self {
        // Only the first list field counts: a line type carries at most one.
        let first_list = field_types
            .iter()
            .copied()
            .enumerate()
            .find(|&(_, ft)| ft.is_list());

        let (list_field, list_elt_size) = match first_list {
            Some((idx, ft)) => (Some(idx), ft.list_element_size()),
            None => (None, 0),
        };

        Self {
            field_types,
            list_field,
            list_elt_size,
            is_object,
            is_group,
            comment: None,
        }
    }
}

/// A single primary-type schema entry.
///
/// Collects everything declared under one `P` line: the secondary type
/// names from `S` lines and the line types defined by `O`/`D`/`G` lines.
#[derive(Debug, Clone)]
pub struct SchemaEntry {
    /// Primary type name (e.g. "seq", "foo").
    pub primary: String,
    /// Secondary type names, if any.
    pub secondary: Vec<String>,
    /// Per-line-type info, keyed by the line type character.
    pub info: HashMap<u8, LineInfo>,
    /// Order in which O/D/G definitions appear (for output).
    ///
    /// A line type can be listed more than once, e.g. when a bare group
    /// declaration and an O/D definition both name it.
    pub defn_order: Vec<u8>,
}

impl SchemaEntry {
    /// Create a new empty schema entry for a primary type.
    pub fn new(primary: impl Into<String>) -> Self {
        Self {
            primary: primary.into(),
            secondary: Vec::new(),
            info: HashMap::new(),
            defn_order: Vec::new(),
        }
    }

    /// Add an object or data line type definition.
    ///
    /// Returns an error string if the line type is already defined (unless
    /// it was previously registered as a group, in which case the group flag
    /// is merged).
    pub fn add_line_type(
        &mut self,
        line_type: u8,
        field_types: Vec<FieldType>,
        is_object: bool,
        is_group: bool,
        comment: Option<String>,
    ) -> Result<(), String> {
        if is_group && field_types.is_empty() {
            // A bare group declaration (e.g. "G S 0") just records the
            // grouping relationship — it doesn't create a full LineInfo.
            // The actual LineInfo is created when the O/D line appears.
            self.defn_order.push(line_type);
            // If the line type already exists, mark it as a group.
            if let Some(li) = self.info.get_mut(&line_type) {
                li.is_group = true;
            } else {
                // Store a placeholder that will be replaced by O/D.
                let mut li = LineInfo::new(Vec::new(), false, true);
                li.comment = comment;
                self.info.insert(line_type, li);
            }
            return Ok(());
        }

        if let Some(existing) = self.info.get(&line_type) {
            if existing.is_group && existing.field_types.is_empty() {
                // Replace the group placeholder with the full definition,
                // preserving the group flag.
                let mut li = LineInfo::new(field_types, is_object, true);
                li.comment = comment;
                self.info.insert(line_type, li);
                self.defn_order.push(line_type);
                return Ok(());
            }
            return Err(format!(
                "duplicate line type '{}' in schema for '{}'",
                line_type as char, self.primary
            ));
        }

        let mut li = LineInfo::new(field_types, is_object, is_group);
        li.comment = comment;
        self.info.insert(line_type, li);
        self.defn_order.push(line_type);
        Ok(())
    }
}

impl SchemaEntry {
    /// Write this schema entry as ONEcode schema text.
    ///
    /// Writes a `P` line, one `S` line per secondary type, and then one
    /// definition per distinct line type, in order of first appearance in
    /// `defn_order`. The output uses bare line prefixes (no `~`), suitable
    /// for standalone `.schema` files.
    ///
    /// For each line type:
    ///
    /// * a bare group placeholder (group flag set, no fields) is written as
    ///   `G <c> 0`; a comment stored on the placeholder is not written;
    /// * a definition with the group flag set is written as `G <c> 0`
    ///   followed by its `O`/`D` line;
    /// * any other definition is written as its `O`/`D` line.
    ///
    /// A comment on an `O`/`D` definition is appended after a fixed gap.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by the writer `w`.
    pub fn write_to(&self, w: &mut impl std::fmt::Write) -> std::fmt::Result {
        writeln!(w, "P {} {}", self.primary.len(), self.primary)?;
        for sec in &self.secondary {
            writeln!(w, "S {} {sec}", sec.len())?;
        }

        // `defn_order` can name the same line type several times (a bare
        // `G x 0` plus its O/D line, or a repeated `G x 0`), but `info`
        // holds a single merged state per line type. Emit that state once,
        // at the first occurrence, so nothing is written twice.
        let mut emitted = std::collections::HashSet::new();
        for &lt in &self.defn_order {
            let Some(li) = self.info.get(&lt) else {
                continue;
            };
            if !emitted.insert(lt) {
                continue;
            }

            // The C parser treats "G x 0" as a grouping declaration that is
            // independent of the O/D definition, so it always gets its own
            // line, ahead of the definition when there is one.
            if li.is_group {
                writeln!(w, "G {} 0", lt as char)?;
            }
            // Bare group placeholder: there is no O/D definition to write.
            if li.is_group && li.field_types.is_empty() {
                continue;
            }

            let kind = if li.is_object { 'O' } else { 'D' };
            write!(w, "{kind} {} {}", lt as char, li.field_types.len())?;
            for ft in &li.field_types {
                // The width hint written for each field is the name's length.
                let name = ft.name();
                write!(w, " {} {name}", name.len())?;
            }
            if let Some(ref comment) = li.comment {
                write!(w, "             {comment}")?;
            }
            writeln!(w)?;
        }
        Ok(())
    }
}

/// A collection of schema entries, one per primary file type.
///
/// The `Default` value is an empty schema with no entries.
#[derive(Debug, Clone, Default)]
pub struct Schema {
    /// Entries in the order their `P` lines were parsed.
    pub entries: Vec<SchemaEntry>,
}

impl Schema {
    /// Find the entry whose primary type name equals `primary`.
    ///
    /// Returns the first match in `entries` order, or `None` if there is none.
    pub fn find(&self, primary: &str) -> Option<&SchemaEntry> {
        for entry in &self.entries {
            if entry.primary == primary {
                return Some(entry);
            }
        }
        None
    }

    /// Write every entry, in order, as ONEcode schema text.
    ///
    /// Stops at and returns the first error reported by the writer.
    pub fn write_to(&self, w: &mut impl std::fmt::Write) -> std::fmt::Result {
        self.entries
            .iter()
            .try_for_each(|entry| entry.write_to(&mut *w))
    }

    /// Render the schema as a `String` of ONEcode schema text.
    pub fn to_text(&self) -> String {
        let mut text = String::new();
        self.write_to(&mut text).expect("write to String cannot fail");
        text
    }

    /// Build a schema from ONEcode schema text.
    ///
    /// Every `P` line opens a new primary type; the `S`, `O`, `D` and `G`
    /// lines that follow belong to it. A definition may also be written with
    /// a leading `~` (the embedded form). Blank lines, lines starting with
    /// `.`, and lines starting with any other character are skipped.
    ///
    /// # Format
    ///
    /// ```text
    /// P <len> <name>
    /// S <len> <name>
    /// O <char> <nfield> [<width> <type_name>]...    comment text
    /// D <char> <nfield> [<width> <type_name>]...    comment text
    /// G <char> <nfield> [<width> <type_name>]...    comment text
    /// ```
    ///
    /// Width values are display hints and are ignored during parsing.
    ///
    /// # Errors
    ///
    /// Returns a message prefixed with the 1-based line number when a line
    /// is malformed, when an `S`/`O`/`D`/`G`/`~` line appears before any `P`
    /// line, when a `~` line holds something other than an O/D/G definition,
    /// or when a line type is defined twice.
    pub fn from_text(text: &str) -> Result<Self, String> {
        let mut schema = Schema::default();

        for (idx, raw) in text.lines().enumerate() {
            let line = raw.trim();
            if line.is_empty() || line.starts_with('.') {
                continue;
            }

            let first = line.as_bytes()[0];

            // P and S lines are handled on the spot. O/D/G lines, bare or
            // `~`-prefixed, are reduced to the definition text plus a flag
            // saying which form they came in.
            let (defn, embedded) = match first {
                b'P' => {
                    let name = parse_length_prefixed_string(line, idx)?;
                    schema.entries.push(SchemaEntry::new(name));
                    continue;
                }
                b'S' => {
                    let name = parse_length_prefixed_string(line, idx)?;
                    // The entry opened by the latest P line is always last.
                    let Some(entry) = schema.entries.last_mut() else {
                        return Err(format!("line {}: S line before any P line", idx + 1));
                    };
                    entry.secondary.push(name);
                    continue;
                }
                b'O' | b'D' | b'G' => (line, false),
                b'~' => {
                    // Embedded schema line: drop the marker, keep the rest.
                    let rest = line[1..].trim_start();
                    match rest.as_bytes().first().copied() {
                        None => continue,
                        Some(b'O') | Some(b'D') | Some(b'G') => (rest, true),
                        Some(_) => {
                            return Err(format!(
                                "line {}: unexpected ~ line content: {}",
                                idx + 1,
                                rest
                            ));
                        }
                    }
                }
                // Unrecognised lines (comments, header metadata) are ignored.
                _ => continue,
            };

            let kind = defn.as_bytes()[0];
            let Some(entry) = schema.entries.last_mut() else {
                let message = if embedded {
                    format!(
                        "line {}: ~ line before any P line",
                        idx + 1
                    )
                } else {
                    format!("line {}: {} line before any P line", idx + 1, kind as char)
                };
                return Err(message);
            };

            let (lt, field_types, comment) = parse_definition_line(defn, idx)?;
            entry
                .add_line_type(lt, field_types, kind == b'O', kind == b'G', comment)
                .map_err(|e| format!("line {}: {}", idx + 1, e))?;
        }

        Ok(schema)
    }
}

/// Parse a `~ O/D/G` line embedded in a file header into its components.
///
/// Surrounding whitespace and whitespace after the `~` marker are ignored.
///
/// Returns `(kind, line_type_char, field_types, comment)`, where `kind` is
/// the first byte after the `~` marker (`O`, `D` or `G` for a well-formed
/// line; it is passed through without being checked here).
///
/// # Errors
///
/// Returns a message when the line is empty, does not start with `~`, has
/// nothing after the `~`, or when the definition itself fails to parse.
/// Definition errors are reported as line 1, since no line number is known.
pub fn parse_embedded_schema_line(
    line: &str,
) -> Result<(u8, u8, Vec<FieldType>, Option<String>), String> {
    let not_schema = || format!("not a schema line: {line}");

    let inner = line
        .trim()
        .strip_prefix('~')
        .map(str::trim_start)
        .ok_or_else(not_schema)?;
    // A lone `~` carries no definition; report it instead of indexing into
    // an empty string.
    let Some(&kind) = inner.as_bytes().first() else {
        return Err(not_schema());
    };

    let (lt, field_types, comment) = parse_definition_line(inner, 0)?;
    Ok((kind, lt, field_types, comment))
}

// --- parsing helpers ---

/// Parse an O/D/G definition line (public for use by the reader module).
///
/// This is the same as `parse_definition_line` but accessible from outside
/// the module.
///
/// `line_num` is the zero-based line index used in error messages (they
/// report `line_num + 1`). Negative values are treated as `0`, and values
/// too large for the platform are capped, so building an error message can
/// never overflow.
pub fn parse_definition_line_internal(
    line: &str,
    line_num: i64,
) -> std::result::Result<(u8, Vec<FieldType>, Option<String>), String> {
    // Leave room for the `+ 1` applied when an error message is formatted.
    let upper = (usize::MAX - 1) as u64;
    // `max(0)` makes the i64 -> u64 cast lossless; `min(upper)` makes the
    // u64 -> usize cast lossless.
    let clamped = (line_num.max(0) as u64).min(upper) as usize;
    parse_definition_line(line, clamped)
}

/// Extract the name from a `P` or `S` line.
/// Format: `P <len> <name>` or `S <len> <name>`
///
/// The name is the third whitespace-separated token. The `<len>` token is
/// not checked, and any tokens after the name are ignored. `line_num` is
/// zero-based; the error message reports `line_num + 1`.
fn parse_length_prefixed_string(line: &str, line_num: usize) -> Result<String, String> {
    // Skip the line marker and the length hint; the name comes next.
    match line.split_whitespace().nth(2) {
        Some(name) => Ok(name.to_owned()),
        None => Err(format!(
            "line {}: expected '<type> <len> <name>', got: {line}",
            line_num + 1
        )),
    }
}

/// Parse an O/D/G definition line.
/// Format: `O <char> <nfield> [<width> <type_name>]...    comment`
///
/// The line is split on whitespace. Only the first byte of the `<char>`
/// token is used as the line type. Width hints are skipped without being
/// validated. Tokens left over after the declared fields form the comment,
/// re-joined with single spaces.
///
/// `line_num` is zero-based; error messages report `line_num + 1`.
///
/// Returns `(line_type_char, field_types, optional_comment)`.
///
/// # Errors
///
/// Returns a message when the line has fewer than three tokens, the line
/// type is not an ASCII letter, the field count is not a non-negative
/// integer, a field's type name is unknown, or fewer `<width> <type_name>`
/// pairs are present than the field count declares.
fn parse_definition_line(
    line: &str,
    line_num: usize,
) -> Result<(u8, Vec<FieldType>, Option<String>), String> {
    let tokens: Vec<&str> = line.split_whitespace().collect();
    if tokens.len() < 3 {
        return Err(format!(
            "line {}: definition line too short: {line}",
            line_num + 1
        ));
    }

    // tokens[0] = O/D/G (interpreted by the caller)
    // tokens[1] = line type character; `split_whitespace` never yields an
    // empty token, so the first byte always exists.
    let lt = tokens[1].as_bytes()[0];
    if !lt.is_ascii_alphabetic() {
        return Err(format!(
            "line {}: line type must be alphabetic, got '{}'",
            line_num + 1,
            lt as char
        ));
    }

    // tokens[2] = field count
    let n_field: usize = tokens[2].parse().map_err(|_| {
        format!(
            "line {}: expected field count, got '{}'",
            line_num + 1,
            tokens[2]
        )
    })?;

    // Each field takes two tokens: width hint + type name.
    let mut pairs = tokens[3..].chunks_exact(2);
    // The declared count comes straight from the input, so reserve space
    // only for the pairs that are actually present. An absurd count then
    // yields the "ran out of tokens" error below, not an allocation failure.
    let mut field_types = Vec::with_capacity(n_field.min(pairs.len()));

    for _ in 0..n_field {
        let Some(pair) = pairs.next() else {
            return Err(format!(
                "line {}: expected {} fields but ran out of tokens",
                line_num + 1,
                n_field
            ));
        };
        // pair[0] is the width hint (ignored); pair[1] is the type name.
        let type_name = pair[1];
        let ft = FieldType::from_name(type_name).ok_or_else(|| {
            format!(
                "line {}: unknown field type '{type_name}'",
                line_num + 1
            )
        })?;
        field_types.push(ft);
    }

    // Anything remaining after the field definitions is a comment. The loop
    // consumed exactly two tokens per field, so this index cannot overflow.
    let comment_start = 3 + 2 * n_field;
    let comment = if comment_start < tokens.len() {
        Some(tokens[comment_start..].join(" "))
    } else {
        None
    };

    Ok((lt, field_types, comment))
}

// Unit tests for schema parsing (`Schema::from_text`) and writing
// (`Schema::to_text`). Fixtures are inline ONEcode schema text.
#[cfg(test)]
mod tests {
    use super::*;

    // Smallest useful schema: one primary type with one object line.
    #[test]
    fn parse_minimal_schema() {
        let text = "P 3 foo\nO B 1 3 INT\n";
        let schema = Schema::from_text(text).unwrap();
        assert_eq!(schema.entries.len(), 1);

        let entry = &schema.entries[0];
        assert_eq!(entry.primary, "foo");
        assert_eq!(entry.defn_order, vec![b'B']);

        let info = &entry.info[&b'B'];
        assert!(info.is_object);
        assert_eq!(info.field_types, vec![FieldType::Int]);
        assert!(info.list_field.is_none());
    }

    // An object line with a list field plus a data line; checks that the
    // list field index and element size are derived for the DNA field.
    #[test]
    fn parse_seq_schema() {
        let text = "\
P 3 seq
O S 1 3 DNA
D I 1 6 STRING
";
        let schema = Schema::from_text(text).unwrap();
        let entry = &schema.entries[0];
        assert_eq!(entry.primary, "seq");

        let s_info = &entry.info[&b'S'];
        assert!(s_info.is_object);
        assert_eq!(s_info.field_types, vec![FieldType::Dna]);
        assert_eq!(s_info.list_field, Some(0));
        assert_eq!(s_info.list_elt_size, 1);

        let i_info = &entry.info[&b'I'];
        assert!(!i_info.is_object);
        assert_eq!(i_info.field_types, vec![FieldType::String]);
    }

    // Trailing text after the field definitions is captured as the comment,
    // without the padding that precedes it.
    #[test]
    fn parse_schema_with_comments() {
        let text = "\
P 3 seq
O S 1 3 DNA             sequence: the DNA string
D I 1 6 STRING          id: sequence identifier
";
        let schema = Schema::from_text(text).unwrap();
        let entry = &schema.entries[0];

        assert_eq!(
            entry.info[&b'S'].comment.as_deref(),
            Some("sequence: the DNA string")
        );
        assert_eq!(
            entry.info[&b'I'].comment.as_deref(),
            Some("id: sequence identifier")
        );
    }

    // Definitions written in the embedded `~`-prefixed form are accepted.
    #[test]
    fn parse_embedded_tilde_lines() {
        let text = "\
P 3 seq
~ O S 1 3 DNA
~ D I 1 6 STRING
";
        let schema = Schema::from_text(text).unwrap();
        let entry = &schema.entries[0];
        assert!(entry.info.contains_key(&b'S'));
        assert!(entry.info.contains_key(&b'I'));
    }

    // `S` lines add secondary type names to the current primary type.
    #[test]
    fn parse_secondary_types() {
        let text = "\
P 3 seq
S 3 irp
O S 1 3 DNA
";
        let schema = Schema::from_text(text).unwrap();
        let entry = &schema.entries[0];
        assert_eq!(entry.secondary, vec!["irp"]);
    }

    // A data line with several non-list fields keeps them in order and has
    // no list field.
    #[test]
    fn parse_multi_field_line() {
        let text = "\
P 4 adna
O S 1 3 DNA
D N 3 3 INT 4 CHAR 3 INT
";
        let schema = Schema::from_text(text).unwrap();
        let entry = &schema.entries[0];
        let n_info = &entry.info[&b'N'];
        assert_eq!(
            n_info.field_types,
            vec![FieldType::Int, FieldType::Char, FieldType::Int]
        );
        assert!(n_info.list_field.is_none());
    }

    // A `G` line that carries fields is stored with the group flag set.
    #[test]
    fn parse_group_line() {
        let text = "\
P 3 seq
O S 1 3 DNA
G g 1 6 STRING
";
        let schema = Schema::from_text(text).unwrap();
        let entry = &schema.entries[0];
        let g_info = &entry.info[&b'g'];
        assert!(g_info.is_group);
    }

    // Defining the same line type twice (here as both O and D) is rejected.
    #[test]
    fn duplicate_line_type_is_error() {
        let text = "\
P 3 foo
O B 1 3 INT
D B 1 4 REAL
";
        let result = Schema::from_text(text);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("duplicate"));
    }

    // A bare `G S 0` declaration followed later by the `O S` definition:
    // the definition must be accepted rather than reported as a duplicate.
    #[test]
    fn t2_seq_schema() {
        // The t2.seq test file has a more complex schema with multiple
        // object types and a group.
        let text = "\
P 3 seq
O s 2 3 INT 6 STRING
G S 0
D n 1 3 INT
O S 1 3 DNA
";
        // Note: this has two O lines, which means two object types. The C
        // code allows this (each object type is independently indexed).
        let schema = Schema::from_text(text).unwrap();
        let entry = &schema.entries[0];

        assert!(entry.info[&b's'].is_object);
        assert_eq!(
            entry.info[&b's'].field_types,
            vec![FieldType::Int, FieldType::String]
        );

        assert!(entry.info[&b'S'].is_object);
        assert_eq!(entry.info[&b'S'].field_types, vec![FieldType::Dna]);

        assert_eq!(entry.info[&b'n'].field_types, vec![FieldType::Int]);
    }

    // A bare group declaration with no matching O/D line is kept as a
    // placeholder: group flag set, no fields.
    #[test]
    fn parse_group_with_no_fields() {
        // G lines can have 0 fields (e.g. "G S 0").
        let text = "\
P 3 seq
O S 1 3 DNA
G s 0
";
        let schema = Schema::from_text(text).unwrap();
        let entry = &schema.entries[0];
        let g_info = &entry.info[&b's'];
        assert!(g_info.is_group);
        assert!(g_info.field_types.is_empty());
    }

    // Writing a comment-free schema reproduces the input text exactly.
    #[test]
    fn write_simple_schema() {
        let text = "P 3 seq\nO S 1 3 DNA\nD I 1 6 STRING\n";
        let schema = Schema::from_text(text).unwrap();
        let output = schema.to_text();
        assert_eq!(output, text);
    }

    // Comments survive a write/re-parse cycle even though the padding in
    // front of them is not preserved.
    #[test]
    fn write_schema_with_comments() {
        let text = "\
P 3 seq
O S 1 3 DNA             sequence: the DNA string
D I 1 6 STRING          id: sequence identifier
";
        let schema = Schema::from_text(text).unwrap();
        let output = schema.to_text();
        // Comment spacing is normalised — the writer always uses a fixed
        // gap, so the output won't match the original whitespace exactly.
        let reparsed = Schema::from_text(&output).unwrap();
        assert_eq!(
            reparsed.entries[0].info[&b'S'].comment.as_deref(),
            Some("sequence: the DNA string")
        );
        assert_eq!(
            reparsed.entries[0].info[&b'I'].comment.as_deref(),
            Some("id: sequence identifier")
        );
    }

    // Secondary types are written as `S` lines directly after the `P` line.
    #[test]
    fn write_schema_with_secondary() {
        let text = "P 3 seq\nS 3 irp\nO S 1 3 DNA\n";
        let schema = Schema::from_text(text).unwrap();
        let output = schema.to_text();
        assert_eq!(output, text);
    }

    // A multi-field data line round-trips exactly; the widths in this
    // fixture equal the type-name lengths that the writer emits.
    #[test]
    fn write_multi_field_schema() {
        let text = "P 4 adna\nO S 1 3 DNA\nD N 3 3 INT 4 CHAR 3 INT\n";
        let schema = Schema::from_text(text).unwrap();
        let output = schema.to_text();
        assert_eq!(output, text);
    }

    // A bare group placeholder is written back as a single `G <c> 0` line.
    #[test]
    fn write_schema_with_bare_group() {
        let text = "P 3 seq\nO S 1 3 DNA\nG s 0\n";
        let schema = Schema::from_text(text).unwrap();
        let output = schema.to_text();
        assert_eq!(output, text);
    }

    // A `G` line with fields does not round-trip textually: it is written
    // as a bare group declaration followed by a `D` definition.
    #[test]
    fn write_schema_with_group_and_fields() {
        let text = "P 3 seq\nO S 1 3 DNA\nG g 1 6 STRING\n";
        let schema = Schema::from_text(text).unwrap();
        let output = schema.to_text();
        // A group with fields is emitted as a G placeholder + D definition.
        let expected = "P 3 seq\nO S 1 3 DNA\nG g 0\nD g 1 6 STRING\n";
        assert_eq!(output, expected);
    }

    // Semantic round trip for the t2 schema: the written text may order
    // lines differently, so only the parsed field types are compared.
    #[test]
    fn write_t2_schema_round_trip() {
        let text = "\
P 3 seq
O s 2 3 INT 6 STRING
G S 0
D n 1 3 INT
O S 1 3 DNA
";
        let schema = Schema::from_text(text).unwrap();
        // Parse back and verify semantics match.
        let output = schema.to_text();
        let reparsed = Schema::from_text(&output).unwrap();
        let orig = &schema.entries[0];
        let new = &reparsed.entries[0];

        assert_eq!(orig.primary, new.primary);
        assert_eq!(orig.info[&b's'].field_types, new.info[&b's'].field_types);
        assert_eq!(orig.info[&b'S'].field_types, new.info[&b'S'].field_types);
        assert_eq!(orig.info[&b'n'].field_types, new.info[&b'n'].field_types);
        assert!(new.info.contains_key(&b'S'));
    }
}