// djr 0.0.1
//
// Djot parser written in pure Rust.
// Documentation
use crate::string::unicode::*;
use std::collections::HashMap;

/// Attributes collected from a single attribute block.
///
/// Every string is a slice borrowed from the scanned input, so the value is cheap to clone and
/// cannot outlive that input.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct Attributes<'a> {
    /// Identifier given as `#name`, without the leading `#`.
    id: Option<&'a str>,
    /// Text of a `%...%` comment, without the delimiters and with surrounding whitespace trimmed.
    comment: Option<&'a str>,
    /// Class names given as `.name`, without the leading `.`, in order of appearance.
    classes: Vec<&'a str>,
    /// `key=value` pairs; quoted values are stored without their surrounding quotes.
    keyval: HashMap<&'a str, &'a str>,
}

/// This enum keeps track of the current scanner state.
///
/// Each variant names what the scanner in `parse_attributes` is in the middle of reading.
enum State {
    /// Nothing has been consumed yet; the first byte must be `{`.
    Start,
    /// Between tokens inside the braces; the next byte decides which token begins.
    Scanning,
    /// Inside a comment that was opened by `%`.
    ScanningComment,
    /// Inside an identifier that was opened by `#`.
    ScanningId,
    /// Inside a class name that was opened by `.`.
    ScanningClass,
    /// Inside a key, waiting for the `=` that introduces its value.
    ScanningKey,
    /// Just after the `=` of a key; the next byte selects a quoted or a bare value.
    ScanningValue,
    /// Inside an unquoted value.
    ScanningBareValue,
    /// Inside a `"`-delimited value.
    ScanningQuotedValue,
    /// Just after a backslash inside a quoted value; the next byte is skipped over.
    ScanningEscaped,
}

/// Scans a djot attribute block such as `{#id .class key=value key2="quoted" %comment%}`.
///
/// The scanner is a byte-wise state machine closely modelled on the official attribute scanner:
/// https://github.com/jgm/djot/blob/e0eedd3354cb4273fd5ddcc0898a74744e5e797a/djot/attributes.lua
///
/// Behaviour worth knowing:
/// - `text` must start with `{`, otherwise `None` is returned.
/// - Scanning stops at the closing `}`; anything after it is never interpreted as attributes.
///   A `}` inside a quoted value is part of that value.
/// - Input that ends before a closing `}` is tolerated: everything completed so far is kept,
///   while a token that is still being scanned is dropped.
/// - Empty ids and class names (a lone `#` or `.`) are ignored.
/// - A key that is not followed by `=` and a value is dropped.
/// - A later id or comment replaces an earlier one, classes accumulate in order, and a repeated
///   key keeps its last value.
/// - Quoted values are returned without the surrounding quotes but with backslash escapes left
///   untouched.
///
/// Returns `None` when nothing at all was collected (e.g. for `{}`).
pub(crate) fn parse_attributes(text: &str) -> Option<Attributes<'_>> {
    use State::*;

    // Bytes that end an id, a class name or a bare value without closing the block.
    let ends_token =
        |c: u8| _is_whitespace(c) || matches!(c, b'"' | b'\'' | b'=' | b'<' | b'>' | b'`');
    // Bytes that may continue a key or make up a bare value.
    let is_name_byte = |c: u8| is_alphanumeric(c) || matches!(c, b'_' | b'.' | b'-' | b':');

    let mut id = None;
    let mut comment = None;
    let mut classes = Vec::new();
    let mut keyval = HashMap::new();

    let mut state = Start;
    // Byte offset at which the token currently being scanned started.
    let mut begin = 0;
    // Key whose value is currently being scanned.
    let mut tmp_key = "";
    for (i, byte) in text.bytes().enumerate() {
        match (&state, byte) {
            // Start of scanning
            (Start, b'{') => state = Scanning,
            (Start, _) => return None,

            // Between tokens
            (Scanning, b' ' | b'\t' | b'\n' | b'\r') => {}
            (Scanning, b'}') => break,
            (Scanning, b'%') => {
                begin = i;
                state = ScanningComment;
            }
            (Scanning, b'#') => {
                begin = i;
                state = ScanningId;
            }
            (Scanning, b'.') => {
                begin = i;
                state = ScanningClass;
            }
            (Scanning, c) if is_alphabetic(c) || matches!(c, b'_' | b':') => {
                begin = i;
                state = ScanningKey;
            }

            // Comment: `%` closes only the comment, `}` closes the comment and the whole block.
            (ScanningComment, b'%') => {
                comment = Some(text[begin + 1..i].trim());
                state = Scanning;
            }
            (ScanningComment, b'}') => {
                comment = Some(text[begin + 1..i].trim());
                break;
            }

            // Id: an empty name leaves any previously recorded id in place.
            (ScanningId, c) if ends_token(c) => {
                id = non_empty(&text[begin + 1..i]).or(id);
                state = Scanning;
            }
            (ScanningId, b'}') => {
                id = non_empty(&text[begin + 1..i]).or(id);
                break;
            }

            // Class: an empty name is not recorded.
            (ScanningClass, c) if ends_token(c) => {
                classes.extend(non_empty(&text[begin + 1..i]));
                state = Scanning;
            }
            (ScanningClass, b'}') => {
                classes.extend(non_empty(&text[begin + 1..i]));
                break;
            }

            // Key
            (ScanningKey, c) if is_name_byte(c) => {}
            (ScanningKey, b'=') => {
                tmp_key = &text[begin..i];
                state = ScanningValue;
            }
            // The block is closed before the key received a value: stop instead of scanning on
            // into whatever follows the brace.
            (ScanningKey, b'}') => break,
            (ScanningKey, _) => state = Scanning,

            // First byte after `=`
            (ScanningValue, b'"') => {
                begin = i;
                state = ScanningQuotedValue;
            }
            (ScanningValue, c) if is_name_byte(c) => {
                begin = i;
                state = ScanningBareValue;
            }
            // `key=}`: the block is closed, the dangling key is dropped.
            (ScanningValue, b'}') => break,
            (ScanningValue, _) => state = Scanning,

            // Bare value
            (ScanningBareValue, c) if ends_token(c) => {
                keyval.insert(tmp_key, &text[begin..i]);
                state = Scanning;
            }
            (ScanningBareValue, b'}') => {
                keyval.insert(tmp_key, &text[begin..i]);
                break;
            }

            // Quoted value: `begin` points at the opening quote, which is not part of the value.
            (ScanningQuotedValue, b'"') => {
                keyval.insert(tmp_key, &text[begin + 1..i]);
                state = Scanning;
            }
            (ScanningQuotedValue, b'\\') => state = ScanningEscaped,
            // Whatever follows a backslash is skipped, so an escaped quote does not end the value.
            (ScanningEscaped, _) => state = ScanningQuotedValue,

            // Any other byte is ignored and leaves the state unchanged.
            _ => {}
        }
    }

    let attrs = Attributes {
        id,
        comment,
        classes,
        keyval,
    };

    // If all values are empty, return None
    if attrs == Attributes::default() {
        None
    } else {
        Some(attrs)
    }
}

/// Returns `name` unless it is empty.
fn non_empty(name: &str) -> Option<&str> {
    if name.is_empty() {
        None
    } else {
        Some(name)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Collects `(key, value)` pairs into the map type stored in `Attributes::keyval`.
    fn keyvals<'a>(pairs: &[(&'a str, &'a str)]) -> HashMap<&'a str, &'a str> {
        pairs.iter().copied().collect()
    }

    /// Ids, classes, bare values and a quoted value containing an escaped quote are all
    /// collected, across newlines and without a closing brace.
    #[test]
    fn find_basic_attributes() {
        let input = "{a=b #ident\n.class\nkey=val1\n .class key2=\"val two \\\" ok\" x";

        let expected = Attributes {
            id: Some("ident"),
            comment: None,
            classes: vec!["class", "class"],
            keyval: keyvals(&[("a", "b"), ("key", "val1"), ("key2", "val two \\\" ok")]),
        };

        assert_eq!(parse_attributes(input), Some(expected));
    }

    /// A comment next to a key/value pair is stored trimmed and separately from the pair.
    #[test]
    fn leave_out_comments() {
        let input = "{a=b % This is a comment %}";

        let expected = Attributes {
            id: None,
            comment: Some("This is a comment"),
            classes: Vec::new(),
            keyval: keyvals(&[("a", "b")]),
        };

        assert_eq!(parse_attributes(input), Some(expected));
    }

    /// A block holding nothing but a comment still yields attributes.
    #[test]
    fn only_comment() {
        let input = "{%This is a lonely comment%}";

        let expected = Attributes {
            comment: Some("This is a lonely comment"),
            ..Attributes::default()
        };

        assert_eq!(parse_attributes(input), Some(expected));
    }
}