// serde_sono 0.5.1

use std::{
    fmt::{Debug, Display, Write},
    iter::Peekable,
    num::NonZeroU16,
    ops::Deref,
    str::CharIndices,
};

/// Shorthand for a [`std::result::Result`] whose error type is [`Error`].
pub type Result<T> = std::result::Result<T, Error>;

/// An error together with the input position it is reported at.
#[derive(Debug)]
pub struct Error {
    /// Human readable description of the problem.
    pub message: String,
    /// Line the error is reported at.
    pub line: Line,
    /// Column the error is reported at.
    pub column: Column,
}

impl Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if f.alternate() {
            write!(
                f,
                "{} [line: {}, column: {}]",
                &self.message, self.line, self.column
            )
        } else {
            f.write_str(&self.message)
        }
    }
}

/// A line number; never zero (the scanner starts counting at one.)
pub type Line = NonZeroU16;
/// A column number; never zero (the scanner starts counting at one.)
pub type Column = NonZeroU16;

/// A piece of text borrowed from the scanned input.
///
/// The wrapper holds nothing but a string slice, hence cloning is
/// cheap.
#[derive(Debug, Clone)]
pub struct Lexem<'a>(&'a str);

impl<'a> Lexem<'a> {
    /// Returns the lexem without its surrounding pair of double
    /// quotes.
    ///
    /// A text which does not both start and end with a `"` (this
    /// includes a text consisting of a single `"` only) is returned
    /// unchanged.
    pub fn unquoted(&self) -> Lexem<'a> {
        let inner = self
            .0
            .strip_prefix('"')
            .and_then(|rest| rest.strip_suffix('"'))
            .unwrap_or(self.0);
        Lexem(inner)
    }

    /// Returns a copy of the text with all escape sequences resolved.
    ///
    /// A `\<newline>` is dropped entirely.  Invalid input, ie. an
    /// unknown escape sequence or a lone backslash at the very end,
    /// is copied to the output as is.
    pub fn unescaped(&self) -> String {
        let mut out = String::with_capacity(self.0.len());
        let mut rest = self.0;
        while let Some((plain, tail)) = rest.split_once('\\') {
            out.push_str(plain);
            let mut tail_chars = tail.chars();
            match tail_chars.next() {
                // ~ when parsed from a sono source we won't get
                // here; this is a backslash at the string's end, so
                // we just keep the input as is
                None => out.push('\\'),
                // ~ a `\<newline>` produces no output at all
                Some('\n') => {}
                Some(c) => match Self::resolve_escape(c) {
                    Some(resolved) => out.push(resolved),
                    None => {
                        // ~ when parsed from a sono source we won't
                        // get here; this is an invalid escape
                        // sequence, so we just keep the input as is
                        out.push('\\');
                        out.push(c);
                    }
                },
            }
            // ~ continue behind the character following the backslash
            rest = tail_chars.as_str();
        }
        out.push_str(rest);
        out
    }

    /// Returns the first and single character of a single character
    /// string without processing escape sequences (ie. a string whose
    /// length is exactly one.)
    pub fn single_char_raw(self) -> Option<char> {
        let mut chars = self.0.chars();
        match (chars.next(), chars.next()) {
            (first, None) => first,
            _ => None,
        }
    }

    /// Returns the first and single character of a single character
    /// string with escape sequences resolved (ie. a string whose
    /// [unescaped](Lexem::unescaped) length is exactly one.)
    ///
    /// This yields the same answer as inspecting the result of
    /// [Lexem::unescaped], but without allocating.
    pub fn single_char_unescaped(self) -> Option<char> {
        let mut chars = self.0.chars();
        let mut single = None;
        while let Some(c) = chars.next() {
            let resolved = if c != '\\' {
                c
            } else {
                match chars.next() {
                    // ~ edge case (see Lexem::unescaped): a lone
                    // trailing backslash is kept as is
                    None => '\\',
                    // ~ a `\<newline>` is effectively swallowed and
                    // has no length, no matter whether it appears
                    // before or after the character we are after
                    Some('\n') => continue,
                    // ~ an invalid escape is kept as is by
                    // Lexem::unescaped; this would be a
                    // longer-than-one string
                    Some(e) => Self::resolve_escape(e)?,
                }
            };
            if single.replace(resolved).is_some() {
                // ~ that's the second character already
                return None;
            }
        }
        single
    }

    /// Returns the underlying string slice.
    pub fn into_str(self) -> &'a str {
        self.0
    }

    /// Maps the character following a backslash to the character the
    /// escape sequence stands for; `None` for an unknown sequence.
    /// (`\<newline>` stands for no character at all and must be
    /// handled by the caller.)
    fn resolve_escape(c: char) -> Option<char> {
        Some(match c {
            '\\' | '"' => c,
            'b' => '\u{0008}',
            'f' => '\u{000c}',
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            _ => return None,
        })
    }
}

/// Lets a [`Lexem`] be used wherever a `&str` is expected.
impl Deref for Lexem<'_> {
    type Target = str;

    // ~ hands out the wrapped slice unchanged
    fn deref(&self) -> &str {
        self.0
    }
}

/// A boolean telling whether the lexem of a [`TType::String`]
/// contains at least one escape sequence, ie. at least one backslash.
pub type HasEscapes = bool;

/// The kind of a [`Token`]; variants with varying text carry that
/// text as a [`Lexem`] borrowed from the input.
#[derive(Debug)]
pub enum TType<'a> {
    /// `{`
    LeftCurly,
    /// `}`
    RightCurly,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// The keyword `true`
    True,
    /// The keyword `false`
    False,
    /// The keyword `null`
    Null,

    /// A word made of ASCII letters, ASCII digits and `_` which does
    /// not start with a digit and is none of the keywords above.
    Ident(Lexem<'a>),
    // ~ as parsed from the input stream including the surrounding
    // quotes; if `has-escapes == true`, the lexem contains at least
    // one of the following sequences:
    // - `\<newline>` (a newline preceded by the escape char), unescape: omit, effectively removing the <newline>
    // - `\\` (the escape char twice in a row), unescape: replace with a single backslash
    // - `\"`, unescape: replace with a single double quote
    // - `\b`, unescape: replace with a <backspace> char (ie. \u{0008})
    // - `\f`, unescape: replace with a <form feed> char (ie. \u{000c})
    // - `\n`, unescape: replace with a <newline> char (ie. \n)
    // - `\r`, unescape: replace with a <carriage return> char (ie. \r)
    // - `\t`, unescape: replace with a single <tab> char (ie. \t)
    /// A string literal; the lexem still includes the surrounding
    /// quotes and unresolved escape sequences.
    String(Lexem<'a>, HasEscapes),
    /// A number without a fraction, including an optional leading `-`.
    Int(Lexem<'a>),
    /// A number with a fraction, including an optional leading `-`.
    Float(Lexem<'a>),
}

impl Display for TType<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TType::LeftCurly => f.write_char('{'),
            TType::RightCurly => f.write_char('}'),
            TType::LeftParen => f.write_char('('),
            TType::RightParen => f.write_char(')'),
            TType::LeftBracket => f.write_char('['),
            TType::RightBracket => f.write_char(']'),
            TType::Colon => f.write_char(':'),
            TType::Comma => f.write_char(','),
            TType::True => f.write_str("true"),
            TType::False => f.write_str("false"),
            TType::Null => f.write_str("null"),
            TType::Ident(lexem) => f.write_str(lexem.0),
            // XXX don't we need to unescape here? what purposes are
            // we using this display implementation for exactly?
            TType::String(lexem, _) => f.write_str(lexem.0),
            TType::Int(lexem) => f.write_str(lexem.0),
            TType::Float(lexem) => f.write_str(lexem.0),
        }
    }
}

/// A token produced by the [`Scanner`] together with its position in
/// the input.
#[derive(Debug)]
pub struct Token<'a> {
    /// What kind of token this is (including its text where applicable.)
    pub ttype: TType<'a>,
    /// Line the token is reported at.
    pub line: Line,
    /// Column the token is reported at.
    pub column: Column,
}

/// Creates a [`Scanner`] over `input`, positioned at line 1, column 1.
///
/// Nothing is scanned yet; tokens are produced while iterating over
/// the returned scanner.
pub fn parse(input: &str) -> Scanner<'_> {
    let chars = input.char_indices().peekable();
    // ~ both counters are one-based
    let line = Line::new(1).unwrap();
    let column = Column::new(1).unwrap();
    Scanner {
        input,
        chars,
        line,
        column,
    }
}

// --------------------------------------------------------------------

/// An iterator splitting an input string into [`Token`]s; created by
/// [`parse`].
pub struct Scanner<'a> {
    // ~ the parsed input string; lexems are slices of it
    input: &'a str,
    // ~ points to the next char to be consumed (paired with its byte
    // offset into `input`)
    chars: Peekable<CharIndices<'a>>,
    // ~ tells the line number of the next to-be-consumed char
    line: Line,
    // ~ tells the column number of the next to-be-consumed char
    column: Column,
}

impl<'a> Iterator for Scanner<'a> {
    type Item = Result<Token<'a>>;

    fn next(&mut self) -> Option<Self::Item> {
        while let Some((i, c)) = self.chars.next() {
            let mut curr_column = self.column;
            self.column = self.column.saturating_add(1);

            macro_rules! return_token {
                ($ttype:expr) => {
                    return Some(Ok(Token {
                        ttype: $ttype,
                        line: self.line,
                        column: curr_column,
                    }))
                };
            }
            macro_rules! return_error {
                ($($arg:tt)*) => {
                    return Some(Err(Error {
                        message: format!($($arg)*),
                        line: self.line,
                        column: curr_column,
                    }))
                };
            }

            match c {
                '(' => return_token!(TType::LeftParen),
                ')' => return_token!(TType::RightParen),
                '[' => return_token!(TType::LeftBracket),
                ']' => return_token!(TType::RightBracket),
                '{' => return_token!(TType::LeftCurly),
                '}' => return_token!(TType::RightCurly),
                ':' => return_token!(TType::Colon),
                ',' => return_token!(TType::Comma),
                '/' => {
                    match self.chars.by_ref().next() {
                        None => {
                            curr_column = self.column;
                            // ~ do not increment `self.column` here (that column simply's not there)
                            return_error!("unexpected end-of-file");
                        }
                        Some((_, '/')) => {
                            // ~ consume the whole line
                            self.column = self.column.saturating_add(1);
                            let mut consumed = 0;
                            loop {
                                match self.chars.next() {
                                    None => {
                                        self.column = self.column.saturating_add(consumed);
                                        break;
                                    }
                                    Some((_, '\n')) => {
                                        self.line = self.line.saturating_add(1);
                                        self.column = Column::new(1).unwrap();
                                        break;
                                    }
                                    _ => {
                                        consumed += 1;
                                        // ~ keep going
                                    }
                                }
                            }
                        }
                        Some((_, c)) => {
                            // ~ keep `curr_column` still pointing to
                            // the first slash (for the error message)
                            self.column = self.column.saturating_add(1);
                            return_error!("expected '//', but encountered '/{c}'")
                        }
                    }
                }
                ' ' | '\t' | '\r' => {
                    // ~ no-op
                }
                '\n' => {
                    self.line = self.line.saturating_add(1);
                    self.column = Column::new(1).unwrap();
                }
                '"' => {
                    let mut has_escapes = false;
                    let mut columns_consumed = 0;
                    while let Some((j, c)) = self.chars.by_ref().next() {
                        match c {
                            '\n' => {
                                self.line = self.line.saturating_add(1);
                                self.column = Column::new(1).unwrap();
                                columns_consumed = 0;
                            }
                            '"' => {
                                columns_consumed += 1;
                                self.column = self.column.saturating_add(columns_consumed);
                                return_token!(TType::String(
                                    Lexem(&self.input[i..=j]),
                                    has_escapes
                                ));
                            }
                            '\\' => {
                                has_escapes = true;
                                columns_consumed += 1;
                                // ~ validate the next char
                                match self.chars.by_ref().next() {
                                    None => break,
                                    Some((_, c)) => {
                                        columns_consumed += 1;
                                        match c {
                                            '\n' => {
                                                self.line = self.line.saturating_add(1);
                                                self.column = Column::new(1).unwrap();
                                                columns_consumed = 0;
                                            }
                                            '\\' | '"' | 'b' | 'f' | 'n' | 'r' | 't' => {
                                                columns_consumed += 1;
                                            }
                                            c => {
                                                // try to skip to the end of the string literal,
                                                // effectively reporting only one error per token
                                                columns_consumed =
                                                    self.discard_until_eos(columns_consumed);
                                                self.column =
                                                    self.column.saturating_add(columns_consumed);
                                                // ~ keep `curr_column` still pointing to
                                                // the first slash (for the error message)
                                                return_error!(
                                                    "expected '\\<newline>', '\\\\', '\\\"', '\\b', '\\f', '\\n', '\\r', or '\\t', but got '\\{c}'");
                                            }
                                        }
                                    }
                                }
                            }
                            _ => {
                                columns_consumed += 1;
                                // ~ and keep going
                            }
                        }
                    }
                    self.column = self.column.saturating_add(columns_consumed);
                    return_error!("expected '\"', but reached end-of-file");
                }
                '-' | '0'..='9' => {
                    return Some(
                        self.parse_number(i, c)
                            .map(|(s, nt)| Token {
                                ttype: match nt {
                                    NumberType::Int => TType::Int(Lexem(s)),
                                    NumberType::Float => TType::Float(Lexem(s)),
                                },
                                line: self.line,
                                column: curr_column,
                            })
                            .map_err(|e| Error {
                                message: e.to_string(),
                                line: self.line,
                                column: curr_column,
                            }),
                    );
                }
                'a'..='z' | 'A'..='Z' | '_' => {
                    let mut chars_consumed = 0;
                    let mut last_char;
                    loop {
                        last_char = self.chars.peek().copied();
                        if let Some((_, nc)) = last_char {
                            if nc.is_ascii_alphanumeric() || nc == '_' {
                                let _ = self.chars.next();
                                chars_consumed += 1;
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    }
                    self.column = self.column.saturating_add(chars_consumed);
                    return_token!(match if let Some((j, _)) = last_char {
                        &self.input[i..j]
                    } else {
                        &self.input[i..]
                    } {
                        "null" => TType::Null,
                        "true" => TType::True,
                        "false" => TType::False,
                        s => TType::Ident(Lexem(s)),
                    });
                }
                _ => {
                    return_error!("invalid character '{c}'");
                }
            };
        }
        None
    }
}

/// Distinguishes the two kinds of numbers recognized by
/// `Scanner::parse_number`.
enum NumberType {
    /// A number without a fraction.
    Int,
    /// A number with a fraction (ie. containing a `.`)
    Float,
}

impl<'a> Scanner<'a> {
    /// Parses a number assuming `start_char` is its first character
    /// and was already consumed (and accounted for in `self.column`.)
    ///
    /// - `start_i`: the byte position of `start_char` in `self.input`
    /// - `start_char`: the number's first character (must be
    ///   '0'..='9' or '-')
    ///
    /// Returns the number's text and kind, or a message describing
    /// why the input is not a valid number.  On "followed by garbage"
    /// and "missing fraction" errors the offending character is left
    /// in the stream.
    ///
    /// Panics if `start_char` is not '-' or a digit.
    fn parse_number(
        &mut self,
        start_i: usize,
        start_char: char,
    ) -> std::result::Result<(&'a str, NumberType), &'static str> {
        // ~ final validation of the complete lexem; an optional sign
        // must not hide leading zeros
        fn to_number(
            s: &str,
            ntype: NumberType,
        ) -> std::result::Result<(&str, NumberType), &'static str> {
            let digits = s.strip_prefix('-').unwrap_or(s).as_bytes();
            if digits.len() > 1 && digits[0] == b'0' && digits[1] != b'.' {
                Err("invalid number; leading zeros not allowed")
            } else {
                Ok((s, ntype))
            }
        }
        // ~ tells whether `c` may directly follow a number
        fn is_terminator(c: char) -> bool {
            c.is_ascii_whitespace() || ",:{([])}".contains(c)
        }

        match start_char {
            '-' => {
                if !matches!(self.chars.peek(), Some((_, '0'..='9'))) {
                    return Err("invalid number; expected digit after '-'");
                }
                // ~ consume the first digit
                let _ = self.chars.next();
                self.column = self.column.saturating_add(1);
            }
            '0'..='9' => {}
            _ => {
                panic!("unexpected curr_char");
            }
        }
        // ~ consume the remaining digits of the integer part
        loop {
            match self.chars.peek().copied() {
                None => {
                    return to_number(&self.input[start_i..], NumberType::Int);
                }
                Some((_, '.')) => {
                    // ~ continue parsing the fraction below
                    break;
                }
                Some((_, '0'..='9')) => {
                    let _ = self.chars.next();
                    self.column = self.column.saturating_add(1);
                    // ~ continue trying to consume more
                }
                Some((j, nc)) if is_terminator(nc) => {
                    return to_number(&self.input[start_i..j], NumberType::Int);
                }
                Some(_) => {
                    return Err("invalid number; digits followed by garbage");
                }
            }
        }
        // ~ consume a fraction; the '.' is accounted for right away
        // such that the column stays correct even if we bail out
        {
            let nc = self.chars.next();
            debug_assert!(matches!(nc, Some((_, '.'))));
            self.column = self.column.saturating_add(1);
        }
        if !matches!(self.chars.peek(), Some((_, '0'..='9'))) {
            return Err("invalid number; missing fraction");
        }
        let last_char = loop {
            match self.chars.peek().copied() {
                Some((_, '0'..='9')) => {
                    let _ = self.chars.next();
                    self.column = self.column.saturating_add(1);
                }
                other => break other,
            }
        };
        match last_char {
            None => to_number(&self.input[start_i..], NumberType::Float),
            Some((j, nc)) if is_terminator(nc) => {
                to_number(&self.input[start_i..j], NumberType::Float)
            }
            Some(_) => Err("invalid number; fraction followed by garbage"),
        }
    }

    /// Assuming the stream to be positioned within a string, reads
    /// until encountering a closing quote, discarding everything in
    /// between (even faulty escapes.)  Stops at the end of the input
    /// if there is no closing quote.
    ///
    /// `start_column` is the number of columns the caller consumed on
    /// the current line but did not yet add to `self.column`.
    ///
    /// Returns the number of columns consumed from the current line
    /// (starting with `start_column`) which the caller is expected to
    /// add to `self.column`.  Newlines are accounted for right here:
    /// `self.line` is advanced, `self.column` reset and the count
    /// restarted at zero.
    fn discard_until_eos(&mut self, mut start_column: u16) -> u16 {
        // ~ true if the previous char was a backslash which itself
        // was not escaped
        let mut escaped = false;
        for (_, c) in self.chars.by_ref() {
            if c == '\n' {
                self.line = self.line.saturating_add(1);
                self.column = Column::new(1).unwrap();
                start_column = 0;
                escaped = false;
                continue;
            }
            start_column = start_column.saturating_add(1);
            match c {
                // ~ an escaped backslash does not escape what follows
                '\\' => escaped = !escaped,
                '"' if !escaped => break,
                _ => {
                    escaped = false;
                    // ~ and continue consuming
                }
            }
        }
        start_column
    }
}

#[cfg(test)]
mod tests {
    use super::Lexem;

    // ~ scans `s` and renders every result as a string: tokens as
    // `<token>:<line>,<column>`, errors as `<<message>>:<line>:<column>`
    fn parse(s: &str) -> Vec<String> {
        super::parse(s).fold(Vec::new(), |mut acc, t| {
            acc.push(match t {
                Ok(t) => format!("{}:{},{}", t.ttype, t.line, t.column),
                Err(e) => format!("<<{e}>>:{}:{}", e.line, e.column),
            });
            acc
        })
    }

    // ~ punctuation, a string, a comment, and position tracking
    // across newlines
    #[test]
    fn test_basics() {
        assert_eq!(
            vec![
                "(:1,1".to_string(),
                "[:1,2".to_string(),
                "{:1,3".to_string(),
                "}:2,1".to_string(),
                "]:2,2".to_string(),
                "):2,3".to_string(),
                r#""hello, world":2,6"#.to_string(),
                "::2,20".to_string(),
                ",:3,2".to_string(),
            ],
            parse("([{\n}])  \"hello, world\":  //asdfasdf\"foadsf\" \n\t,")
        );
    }

    // ~ strings are yielded with their escapes left untouched; an
    // invalid escape yields one error and scanning resumes behind
    // the string
    #[test]
    fn test_escapes() {
        assert_eq!(vec![r#""abc\\def":1,2"#], parse(r#" "abc\\def""#));
        assert_eq!(
            vec![r#""\r\n\t\f\b\\\"":1,3"#],
            parse(r#"  "\r\n\t\f\b\\\"""#)
        );
        // ~ invalid escapes
        assert_eq!(
            vec![
                r#"<<expected '\<newline>', '\\', '\"', '\b', '\f', '\n', '\r', or '\t', but got '\x'>>:1:1"#,
                "asdf:1,14",
            ],
            parse(r#""\x...\"..." asdf"#)
        );
    }

    // ~ the has-escapes flag is `false` for strings without a backslash
    #[test]
    fn test_ttype_string_without_escapes() {
        assert!(matches!(
            super::parse(r#""""#).next(),
            Some(Ok(super::Token {
                ttype: super::TType::String(Lexem(r#""""#), false),
                ..
            }))
        ));
        assert!(matches!(
            super::parse(r#""abc""#).next(),
            Some(Ok(super::Token {
                ttype: super::TType::String(Lexem(r#""abc""#), false),
                ..
            }))
        ));
    }

    // ~ the has-escapes flag is `true` for strings with a backslash
    #[test]
    fn test_ttype_string_with_escapes() {
        assert!(matches!(
            super::parse(r#""a\bc""#).next(),
            Some(Ok(super::Token {
                ttype: super::TType::String(Lexem(r#""a\bc""#), true),
                ..
            }))
        ));
    }

    // ~ valid integers and floats, including their positions
    #[test]
    fn test_numbers() {
        assert_eq!(vec!["0:1,1".to_string()], parse("0"));
        assert_eq!(vec!["1:1,1".to_string()], parse("1"));
        assert_eq!(vec!["10:1,1".to_string()], parse("10"));
        assert_eq!(vec!["123:1,1".to_string()], parse("123"));
        assert_eq!(vec!["-10:1,1".to_string()], parse("-10"));
        assert_eq!(vec!["-0:1,1".to_string()], parse("-0"));
        assert_eq!(vec!["123.0:1,1".to_string()], parse("123.0"));
        assert_eq!(vec!["123.00001:1,1".to_string()], parse("123.00001"));
        assert_eq!(
            vec![
                "123.1:1,1".to_string(),
                "234:1,7".to_string(),
                "99.80001:1,11".to_string()
            ],
            parse("123.1 234 99.80001")
        );
    }

    // ~ malformed numbers are reported at the number's first
    // character; garbage behind a number is left in the stream and
    // scanned on its own
    #[test]
    fn test_invalid_numbers() {
        for (s, errs) in [
            (
                "123.00001.",
                vec![
                    "<<invalid number; fraction followed by garbage>>:1:1",
                    "<<invalid character '.'>>:1:10",
                ],
            ),
            (
                " 0123",
                vec!["<<invalid number; leading zeros not allowed>>:1:2"],
            ),
        ] {
            assert_eq!(errs, parse(s), "on input: {s:?}");
        }
    }

    // ~ keywords vs. plain identifiers
    #[test]
    fn test_identifiers() {
        assert_eq!(
            vec![
                "null:1,1".to_string(),
                ",:1,5".to_string(),
                "true:1,6".to_string(),
                "false:2,2".to_string(),
                "hello_world:2,8".to_string()
            ],
            parse("null,true\n false\thello_world")
        );
    }

    // ~ Lexem::unescaped on valid and invalid escape sequences
    #[test]
    fn test_unescape() {
        assert_eq!("", Lexem(r#""#).unescaped());
        assert_eq!("\n", Lexem(r#"\n"#).unescaped());
        assert_eq!("abc\ndef", Lexem(r#"abc\ndef"#).unescaped());
        assert_eq!("abc\nd\te\rf", Lexem(r#"abc\nd\te\rf"#).unescaped());
        assert_eq!("\n\\\t\\\r\\", Lexem(r#"\n\\\t\\\r\\"#).unescaped());
        assert_eq!("\\\n\\\t\\\r\\", Lexem(r#"\\\n\\\t\\\r\\"#).unescaped());
        assert_eq!("\\", Lexem(r#"\\"#).unescaped());
        // ~ invalid, just check we don't crash and have a deterministic output
        assert_eq!("\\", Lexem(r#"\"#).unescaped());
        assert_eq!("\\🐂", Lexem(r#"\🐂"#).unescaped());
    }

    // ~ Lexem::single_char_raw does not interpret escapes
    #[test]
    fn test_single_char_raw() {
        assert_eq!(None, Lexem("").single_char_raw());
        assert_eq!(None, Lexem("ab").single_char_raw());
        assert_eq!(Some('a'), Lexem("a").single_char_raw());
        assert_eq!(Some('\n'), Lexem("\n").single_char_raw());
        assert_eq!(None, Lexem("\\\n").single_char_raw());
        assert_eq!(Some('\\'), Lexem("\\").single_char_raw());
    }

    // ~ Lexem::single_char_unescaped resolves escapes before counting
    #[test]
    fn test_single_char_unescaped() {
        assert_eq!(None, Lexem("").single_char_unescaped());
        assert_eq!(None, Lexem("ab").single_char_unescaped());
        assert_eq!(Some('a'), Lexem("a").single_char_unescaped());
        assert_eq!(Some('\n'), Lexem("\n").single_char_unescaped());
        assert_eq!(Some('.'), Lexem("\\\n\\\n.").single_char_unescaped());
        assert_eq!(Some('\t'), Lexem("\\t").single_char_unescaped());
        assert_eq!(Some('\\'), Lexem("\\\\").single_char_unescaped());
        assert_eq!(None, Lexem(r#"\\\\"#).single_char_unescaped());
        // ~ special case (an invalid escape actually)
        assert_eq!(Some('\\'), Lexem("\\").single_char_unescaped());
        assert_eq!(None, Lexem(r#"\\\"#).single_char_unescaped());
    }
}