// nested-text 0.1.0
//
// A fully spec-compliant NestedText v3.8 parser and serializer.
// Documentation
use crate::error::{Error, ErrorKind};

/// The type of a non-blank, non-comment line.
///
/// The kind is assigned by `classify_line` from the text that follows the
/// line's indentation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineKind {
    /// List item: the text starts with `- ` or is exactly `-`.
    ListItem,
    /// Dictionary item: a non-empty key followed by `: ` or by a trailing `:`.
    DictItem,
    /// String item: the text starts with `> ` or is exactly `>`.
    StringItem,
    /// Key item: the text starts with `: ` or is exactly `:`.
    KeyItem,
    /// Inline dictionary: the text starts with `{`.
    InlineDict,
    /// Inline list: the text starts with `[`.
    InlineList,
    /// The text matched none of the patterns above.
    Unrecognized,
}

/// A classified line from the input.
///
/// Only lines that are neither blank nor comments are represented; the lexer
/// drops the others before classification.
#[derive(Debug, Clone)]
pub struct Line {
    /// The kind of line.
    pub kind: LineKind,
    /// 0-based line number in the original input.
    pub lineno: usize,
    /// Indentation depth (number of leading ASCII spaces).
    pub depth: usize,
    /// For DictItem: the key portion, with trailing whitespace removed.
    /// `None` for every other kind (KeyItem/StringItem text is stored in `value`).
    pub key: Option<String>,
    /// The value portion after the tag (for ListItem, DictItem, StringItem, KeyItem).
    /// For InlineList/InlineDict: the full content including brackets.
    /// `None` for Unrecognized lines.
    pub value: Option<String>,
    /// The original line text, indentation included (for error reporting).
    pub text: String,
}

/// Lexer that tokenizes NestedText input into classified lines.
///
/// Blank lines and comment lines are skipped. The remaining lines are classified
/// and stored for the parser to consume via peek/next.
#[derive(Debug)]
pub struct Lexer {
    /// Every significant line of the input, in input order.
    lines: Vec<Line>,
    /// Index into `lines` of the next line to hand out; equals `lines.len()`
    /// once everything has been consumed.
    pos: usize,
}

impl Lexer {
    /// Tokenize the input string into classified lines.
    ///
    /// A leading UTF-8 byte-order mark is dropped and `\r\n` / lone `\r` line
    /// endings are normalized to `\n` before the input is split into lines.
    ///
    /// Lines that are empty, that contain only spaces and tabs, or whose first
    /// character after any leading spaces and tabs is `#` are ignored, and their
    /// indentation is not validated. Every other line has its indentation
    /// checked and is then classified. Line numbers count every input line,
    /// including the ignored ones, starting at 0.
    ///
    /// # Errors
    ///
    /// Returns the error reported by `check_indentation` when a significant
    /// line is indented with a tab or another non-space whitespace character,
    /// or any error returned by `classify_line`.
    pub fn new(input: &str) -> Result<Self, Error> {
        // Strip UTF-8 BOM if present
        let input = input.strip_prefix('\u{feff}').unwrap_or(input);
        // Normalize line endings: \r\n -> \n, then \r -> \n
        let normalized = input.replace("\r\n", "\n").replace('\r', "\n");

        let mut lines = Vec::new();

        for (lineno, raw) in normalized.split('\n').enumerate() {
            // Decide blank/comment before validating indentation: indentation is
            // not significant on ignored lines, so a tab there must not be an error.
            let content = raw.trim_start_matches(|c: char| c == ' ' || c == '\t');
            if content.is_empty() || content.starts_with('#') {
                continue;
            }

            // Significant line: only ASCII spaces may be used to indent it.
            let depth = count_leading_spaces(raw);
            if let Some(err) = check_indentation(raw, depth, lineno) {
                return Err(err);
            }

            // Leading spaces are single bytes, so `depth` is a valid byte offset.
            lines.push(classify_line(&raw[depth..], depth, lineno, raw)?);
        }

        Ok(Lexer { lines, pos: 0 })
    }

    /// Peek at the next line without consuming it.
    ///
    /// Returns `None` once every line has been consumed.
    pub fn peek(&self) -> Option<&Line> {
        self.lines.get(self.pos)
    }

    /// Consume and return the next line.
    ///
    /// Returns `None`, leaving the position untouched, once every line has
    /// been consumed.
    pub fn next_line(&mut self) -> Option<&Line> {
        let line = self.lines.get(self.pos)?;
        self.pos += 1;
        Some(line)
    }

    /// Check if the next line has exactly the given depth and kind.
    ///
    /// Returns `false` when there is no next line.
    pub fn next_is(&self, depth: usize, kind: LineKind) -> bool {
        matches!(self.peek(), Some(line) if line.depth == depth && line.kind == kind)
    }
}

/// Number of ASCII space characters at the start of `line`.
///
/// Tabs and other whitespace are not counted and end the run. Because a space
/// is a single byte, the result is also the byte offset of the first
/// non-space character.
fn count_leading_spaces(line: &str) -> usize {
    let without_indent = line.trim_start_matches(' ');
    line.len() - without_indent.len()
}

/// Check for invalid characters (tabs, unicode spaces) in indentation.
fn check_indentation(raw: &str, _space_count: usize, lineno: usize) -> Option<Error> {
    // Look at the characters before the first non-whitespace character
    for (i, ch) in raw.chars().enumerate() {
        if ch == ' ' {
            continue;
        }
        if ch == '\t' {
            return Some(
                Error::new(
                    ErrorKind::TabInIndentation,
                    "invalid character in indentation: '\\t'.",
                )
                .with_lineno(lineno)
                .with_colno(i)
                .with_line(raw.to_string()),
            );
        }
        // Check for unicode whitespace characters in the indentation region
        if ch.is_whitespace() && ch != ' ' {
            let desc = match ch {
                '\u{a0}' => "'\\xa0' (NO-BREAK SPACE).".to_string(),
                _ => format!("'\\u{:04x}'.", ch as u32),
            };
            return Some(
                Error::new(
                    ErrorKind::InvalidIndentation,
                    format!("invalid character in indentation: {}", desc),
                )
                .with_lineno(lineno)
                .with_colno(i)
                .with_line(raw.to_string()),
            );
        }
        // Hit a non-whitespace character, done checking indentation
        break;
    }
    None
}

/// Classify a line based on its content after stripping indentation.
fn classify_line(rest: &str, depth: usize, lineno: usize, raw: &str) -> Result<Line, Error> {
    let text = raw.to_string();

    // List item: starts with "- " or is exactly "-"
    if rest.starts_with("- ") || rest == "-" {
        let value = if rest == "-" {
            String::new()
        } else {
            rest[2..].to_string()
        };
        return Ok(Line {
            kind: LineKind::ListItem,
            lineno,
            depth,
            key: None,
            value: Some(value),
            text,
        });
    }

    // String item: starts with "> " or is exactly ">"
    if rest.starts_with("> ") || rest == ">" {
        let value = if rest == ">" {
            String::new()
        } else {
            rest[2..].to_string()
        };
        return Ok(Line {
            kind: LineKind::StringItem,
            lineno,
            depth,
            key: None,
            value: Some(value),
            text,
        });
    }

    // Key item: starts with ": " or is exactly ":"
    if rest.starts_with(": ") || rest == ":" {
        let value = if rest == ":" {
            String::new()
        } else {
            rest[2..].to_string()
        };
        return Ok(Line {
            kind: LineKind::KeyItem,
            lineno,
            depth,
            key: None,
            value: Some(value),
            text,
        });
    }

    // Inline list: starts with "["
    if rest.starts_with('[') {
        return Ok(Line {
            kind: LineKind::InlineList,
            lineno,
            depth,
            key: None,
            value: Some(rest.to_string()),
            text,
        });
    }

    // Inline dict: starts with "{"
    if rest.starts_with('{') {
        return Ok(Line {
            kind: LineKind::InlineDict,
            lineno,
            depth,
            key: None,
            value: Some(rest.to_string()),
            text,
        });
    }

    // Dict item: look for ": " or trailing ":" in the line
    // The key must not start with any tag character: - > : # [ {
    // and must not be empty
    if let Some((key, value)) = try_parse_dict_item(rest) {
        return Ok(Line {
            kind: LineKind::DictItem,
            lineno,
            depth,
            key: Some(key),
            value: Some(value),
            text,
        });
    }

    // Unrecognized line
    Ok(Line {
        kind: LineKind::Unrecognized,
        lineno,
        depth,
        key: None,
        value: None,
        text,
    })
}

/// Try to parse a line as a dict item (`key: value` or `key:`).
///
/// The key is everything before the first `": "`; if the line contains no
/// `": "`, a trailing `:` ends the key and the value is empty. Trailing
/// whitespace is removed from the key, and the value is taken verbatim.
///
/// Returns `Some((key, value))` for a dict item and `None` when there is no
/// separator or when the key is empty once trimmed.
///
/// This function does not look at the first character of the key. It relies
/// on `classify_line` having already matched lines that begin with another
/// tag (`- `, `> `, `: `, `[`, `{`) before calling it.
fn try_parse_dict_item(rest: &str) -> Option<(String, String)> {
    let (raw_key, value) = match rest.split_once(": ") {
        Some(parts) => parts,
        // No ": " anywhere: only a trailing colon can still make this a key.
        None => (rest.strip_suffix(':')?, ""),
    };

    // Trim before the emptiness check so a whitespace-only key is rejected
    // the same way on both separator forms.
    let key = raw_key.trim_end();
    if key.is_empty() {
        return None;
    }

    Some((key.to_string(), value.to_string()))
}

// Unit tests for the lexer: each test feeds a small input to `Lexer::new` and
// inspects the classified lines (or the returned error) directly.
#[cfg(test)]
mod tests {
    use super::*;

    /// Comments (indented or not), empty lines and space-only lines produce no
    /// `Line`; only the final dict item survives.
    #[test]
    fn test_blank_and_comments_skipped() {
        let input = "# comment\n\n  # indented comment\n  \nkey: value";
        let lexer = Lexer::new(input).unwrap();
        assert_eq!(lexer.lines.len(), 1);
        assert_eq!(lexer.lines[0].kind, LineKind::DictItem);
    }

    /// `- text` and a bare `-` are both list items; the bare form has an empty value.
    #[test]
    fn test_list_item() {
        let lexer = Lexer::new("- hello\n-").unwrap();
        assert_eq!(lexer.lines.len(), 2);
        assert_eq!(lexer.lines[0].kind, LineKind::ListItem);
        assert_eq!(lexer.lines[0].value.as_deref(), Some("hello"));
        assert_eq!(lexer.lines[1].kind, LineKind::ListItem);
        assert_eq!(lexer.lines[1].value.as_deref(), Some(""));
    }

    /// `> text` and a bare `>` are both string items; the bare form has an empty value.
    #[test]
    fn test_string_item() {
        let lexer = Lexer::new("> hello\n>").unwrap();
        assert_eq!(lexer.lines.len(), 2);
        assert_eq!(lexer.lines[0].kind, LineKind::StringItem);
        assert_eq!(lexer.lines[0].value.as_deref(), Some("hello"));
        assert_eq!(lexer.lines[1].kind, LineKind::StringItem);
        assert_eq!(lexer.lines[1].value.as_deref(), Some(""));
    }

    /// `: text` and a bare `:` are both key items; the bare form has an empty value.
    #[test]
    fn test_key_item() {
        let lexer = Lexer::new(": hello\n:").unwrap();
        assert_eq!(lexer.lines.len(), 2);
        assert_eq!(lexer.lines[0].kind, LineKind::KeyItem);
        assert_eq!(lexer.lines[0].value.as_deref(), Some("hello"));
        assert_eq!(lexer.lines[1].kind, LineKind::KeyItem);
        assert_eq!(lexer.lines[1].value.as_deref(), Some(""));
    }

    /// `key: value` splits into key and value; `key:` yields an empty value.
    #[test]
    fn test_dict_item() {
        let lexer = Lexer::new("name: John\nage:").unwrap();
        assert_eq!(lexer.lines.len(), 2);
        assert_eq!(lexer.lines[0].kind, LineKind::DictItem);
        assert_eq!(lexer.lines[0].key.as_deref(), Some("name"));
        assert_eq!(lexer.lines[0].value.as_deref(), Some("John"));
        assert_eq!(lexer.lines[1].kind, LineKind::DictItem);
        assert_eq!(lexer.lines[1].key.as_deref(), Some("age"));
        assert_eq!(lexer.lines[1].value.as_deref(), Some(""));
    }

    /// A line starting with `[` is an inline list whose value keeps the brackets.
    #[test]
    fn test_inline_list() {
        let lexer = Lexer::new("[a, b, c]").unwrap();
        assert_eq!(lexer.lines.len(), 1);
        assert_eq!(lexer.lines[0].kind, LineKind::InlineList);
        assert_eq!(lexer.lines[0].value.as_deref(), Some("[a, b, c]"));
    }

    /// A line starting with `{` is an inline dict whose value keeps the braces.
    #[test]
    fn test_inline_dict() {
        let lexer = Lexer::new("{k: v}").unwrap();
        assert_eq!(lexer.lines.len(), 1);
        assert_eq!(lexer.lines[0].kind, LineKind::InlineDict);
        assert_eq!(lexer.lines[0].value.as_deref(), Some("{k: v}"));
    }

    /// `depth` records the number of leading spaces of each line.
    #[test]
    fn test_indentation() {
        let lexer = Lexer::new("key:\n  - item1\n  - item2").unwrap();
        assert_eq!(lexer.lines[0].depth, 0);
        assert_eq!(lexer.lines[1].depth, 2);
        assert_eq!(lexer.lines[2].depth, 2);
    }

    /// A tab used to indent a significant line is rejected with `TabInIndentation`.
    #[test]
    fn test_tab_in_indentation() {
        let result = Lexer::new("\tkey: value");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.kind, ErrorKind::TabInIndentation);
    }

    /// Tag detection takes precedence over dict-item detection, even when the
    /// remainder of the line contains a `": "`-free text that could be a key.
    #[test]
    fn test_dict_key_no_tag_start() {
        // Lines starting with a tag followed by a space must keep their tag kind
        // and never be classified as dict items.
        let lexer = Lexer::new("- not a dict\n> not a dict\n: not a dict").unwrap();
        assert_eq!(lexer.lines[0].kind, LineKind::ListItem);
        assert_eq!(lexer.lines[1].kind, LineKind::StringItem);
        assert_eq!(lexer.lines[2].kind, LineKind::KeyItem);
    }

    /// Text with no tag and no key separator is kept as an `Unrecognized` line
    /// rather than rejected by the lexer.
    #[test]
    fn test_unrecognized() {
        // A line with no tag pattern and no ": " / trailing ":".
        let lexer = Lexer::new("hello world").unwrap();
        assert_eq!(lexer.lines[0].kind, LineKind::Unrecognized);
    }
}