use crate::error::{Error, Result};
use std::path::{Path, PathBuf};
use winnow::ascii::{line_ending, space0};
use winnow::combinator::{alt, delimited, opt, peek, preceded, terminated};
use winnow::error::{ContextError, ErrMode};
use winnow::prelude::*;
use winnow::stream::AsChar;
use winnow::token::{none_of, one_of, take_till, take_until, take_while};
/// A lexical token produced by [`WinnowLexer::tokenize`].
///
/// Payload-carrying variants hold the decoded value (for example, string
/// escapes are already resolved and numeric `_` separators removed), not the
/// raw source text.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Sigils, literals and identifiers.
    /// The `@` sigil.
    At,
    /// An identifier (any word that is not `true` or `false`).
    Ident(String),
    /// A double-quoted string literal with escapes resolved.
    String(String),
    /// An integer literal.
    Integer(i64),
    /// A floating-point literal (digits on both sides of the `.`).
    Float(f64),
    /// The keyword `true` or `false`.
    Boolean(bool),

    // Brackets.
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,

    // Punctuation and operators.
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `=`
    Equals,
    /// `->`
    Arrow,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `.`
    Dot,

    // Comments; the payload is the trimmed text after the marker.
    /// A `//` line comment.
    Comment(String),
    /// A `///` line comment.
    DocComment(String),
    /// A `//!` line comment.
    ModuleDoc(String),

    // `---` handling.
    /// A `---` that is not followed by a closing `---` later in the input.
    Delimiter,
    /// The text enclosed between two `---` markers, with trailing whitespace removed.
    RawContent(String),

    // Layout.
    /// A line break.
    Newline,
    /// End of input; always the last token emitted by the lexer.
    Eof,
}
impl Token {
pub fn description(&self) -> &'static str {
match self {
Token::At => "@",
Token::Ident(_) => "identifier",
Token::String(_) => "string",
Token::Integer(_) => "integer",
Token::Float(_) => "float",
Token::Boolean(_) => "boolean",
Token::LBrace => "{",
Token::RBrace => "}",
Token::LParen => "(",
Token::RParen => ")",
Token::LBracket => "[",
Token::RBracket => "]",
Token::Colon => ":",
Token::Comma => ",",
Token::Equals => "=",
Token::Arrow => "->",
Token::LessThan => "<",
Token::GreaterThan => ">",
Token::Dot => ".",
Token::Comment(_) => "comment",
Token::DocComment(_) => "doc comment",
Token::ModuleDoc(_) => "module doc",
Token::Delimiter => "---",
Token::RawContent(_) => "raw content",
Token::Newline => "newline",
Token::Eof => "end of file",
}
}
}
/// A [`Token`] together with the source position at which it starts.
#[derive(Debug, Clone, PartialEq)]
pub struct LocatedToken {
    /// The token itself.
    pub token: Token,
    /// 1-based line on which the token starts.
    pub line: usize,
    /// 1-based column, counted in characters, at which the token starts.
    pub column: usize,
}
/// A lexer that turns source text into a flat list of located tokens.
///
/// The lexer itself is stateless between calls; it only remembers the path of
/// the file being lexed so that errors can name it.
#[derive(Debug, Clone)]
pub struct WinnowLexer {
    /// Path reported in parse errors.
    path: PathBuf,
}
impl WinnowLexer {
pub fn new(path: &Path) -> Self {
Self {
path: path.to_path_buf(),
}
}
pub fn tokenize(&self, content: &str) -> Result<Vec<LocatedToken>> {
let mut tokens = Vec::new();
let mut input = content;
let mut line = 1;
let mut column = 1;
loop {
let before_skip = input;
if let Ok(_) = space0::<_, ErrMode<ContextError>>(&mut input) {
let skipped = before_skip.len() - input.len();
column += skipped;
}
if input.is_empty() {
tokens.push(LocatedToken {
token: Token::Eof,
line,
column,
});
break;
}
let tok_line = line;
let tok_column = column;
let before_token = input;
let token =
parse_token(&mut input).map_err(|e| self.format_error(e, tok_line, tok_column))?;
let consumed = before_token.len() - input.len();
for ch in before_token[..consumed].chars() {
if ch == '\n' {
line += 1;
column = 1;
} else {
column += 1;
}
}
tokens.push(LocatedToken {
token,
line: tok_line,
column: tok_column,
});
}
Ok(tokens)
}
fn format_error(&self, err: ErrMode<ContextError>, line: usize, column: usize) -> Error {
Error::Parse {
file: self.path.clone(),
line,
column,
message: format!("{}", err),
}
}
}
/// Parses exactly one token from the front of `input`.
///
/// The alternatives are tried in the order listed and the first one that
/// succeeds wins. Leading whitespace is not skipped here; the caller is
/// expected to have removed it.
fn parse_token(input: &mut &str) -> winnow::Result<Token, ErrMode<ContextError>> {
    let mut any_token = alt((
        parse_newline,
        parse_comment,
        parse_delimiter_or_raw,
        parse_arrow,
        parse_single_char,
        parse_string,
        parse_number,
        parse_ident_or_keyword,
    ));
    any_token.parse_next(input)
}
fn parse_newline<'a>(input: &mut &'a str) -> winnow::Result<Token, ErrMode<ContextError>> {
'\n'.value(Token::Newline).parse_next(input)
}
/// Parses a `//` line comment, stopping before the next `\n` (or at end of input).
///
/// The character right after `//` selects the kind: `!` yields
/// [`Token::ModuleDoc`], `/` yields [`Token::DocComment`], anything else a
/// plain [`Token::Comment`]. The stored text excludes that marker and is
/// trimmed on both sides. Note that `////text` is therefore a doc comment
/// whose text starts with `/`.
fn parse_comment(input: &mut &str) -> winnow::Result<Token, ErrMode<ContextError>> {
    let body: &str = preceded("//", take_till(0.., '\n')).parse_next(input)?;
    let text = |raw: &str| raw.trim().to_string();

    Ok(if let Some(doc) = body.strip_prefix('!') {
        Token::ModuleDoc(text(doc))
    } else if let Some(doc) = body.strip_prefix('/') {
        Token::DocComment(text(doc))
    } else {
        Token::Comment(text(body))
    })
}
/// Parses a `---` marker, producing either a raw block or a bare delimiter.
///
/// If another `---` occurs anywhere later in the input, everything between
/// the two markers becomes [`Token::RawContent`]: one line ending directly
/// after the opening marker is dropped and trailing whitespace is trimmed
/// (leading whitespace is kept). Both markers are consumed. Otherwise only
/// the opening `---` is consumed and [`Token::Delimiter`] is returned.
fn parse_delimiter_or_raw(input: &mut &str) -> winnow::Result<Token, ErrMode<ContextError>> {
    let block: Option<&str> = preceded(
        "---",
        opt(preceded(
            opt(line_ending),
            terminated(take_until(0.., "---"), "---"),
        )),
    )
    .parse_next(input)?;

    Ok(match block {
        Some(raw) => Token::RawContent(raw.trim_end().to_string()),
        None => Token::Delimiter,
    })
}
fn parse_arrow<'a>(input: &mut &'a str) -> winnow::Result<Token, ErrMode<ContextError>> {
"->".value(Token::Arrow).parse_next(input)
}
/// Parses one punctuation character into its token.
///
/// Recognized characters are `@ { } ( ) [ ] : , = < > .`; any other character
/// (or empty input) is a recoverable failure that consumes nothing.
fn parse_single_char(input: &mut &str) -> winnow::Result<Token, ErrMode<ContextError>> {
    winnow::token::any
        .verify_map(|ch: char| {
            Some(match ch {
                '@' => Token::At,
                '{' => Token::LBrace,
                '}' => Token::RBrace,
                '(' => Token::LParen,
                ')' => Token::RParen,
                '[' => Token::LBracket,
                ']' => Token::RBracket,
                ':' => Token::Colon,
                ',' => Token::Comma,
                '=' => Token::Equals,
                '<' => Token::LessThan,
                '>' => Token::GreaterThan,
                '.' => Token::Dot,
                _ => return None,
            })
        })
        .parse_next(input)
}
/// Parses a double-quoted string literal into [`Token::String`].
///
/// The surrounding quotes are consumed but not stored; the payload is the
/// decoded content returned by `parse_string_content`.
fn parse_string(input: &mut &str) -> winnow::Result<Token, ErrMode<ContextError>> {
    let content = delimited('"', parse_string_content, '"').parse_next(input)?;
    Ok(Token::String(content))
}
fn parse_string_content<'a>(input: &mut &'a str) -> winnow::Result<String, ErrMode<ContextError>> {
use winnow::combinator::repeat;
repeat(
0..,
alt((
preceded(
'\\',
alt((
'n'.value('\n'),
't'.value('\t'),
'r'.value('\r'),
'\\'.value('\\'),
'"'.value('"'),
)),
),
none_of('"'),
)),
)
.fold(String::new, |mut acc, ch| {
acc.push(ch);
acc
})
.parse_next(input)
}
fn parse_number<'a>(input: &mut &'a str) -> winnow::Result<Token, ErrMode<ContextError>> {
let sign = opt('-').parse_next(input)?;
let int_part = digit1_with_underscores.parse_next(input)?;
let checkpoint = input.checkpoint();
let dot_result: winnow::Result<_, ErrMode<ContextError>> = ('.').parse_next(input);
if dot_result.is_ok() {
if peek::<_, _, ErrMode<ContextError>, _>(one_of(AsChar::is_dec_digit))
.parse_next(input)
.is_ok()
{
let frac_part = digit1_with_underscores.parse_next(input)?;
let num_str = format!(
"{}{}{}{}",
sign.map(|c| c.to_string()).unwrap_or_default(),
int_part.replace('_', ""),
".",
frac_part.replace('_', "")
);
let val = num_str
.parse::<f64>()
.map_err(|_| ErrMode::Backtrack(ContextError::default()))?;
return Ok(Token::Float(val));
} else {
input.reset(&checkpoint);
}
}
let num_str = format!(
"{}{}",
sign.map(|c| c.to_string()).unwrap_or_default(),
int_part.replace('_', "")
);
let val = num_str
.parse::<i64>()
.map_err(|_| ErrMode::Backtrack(ContextError::default()))?;
Ok(Token::Integer(val))
}
fn digit1_with_underscores<'a>(
input: &mut &'a str,
) -> winnow::Result<&'a str, ErrMode<ContextError>> {
take_while(1.., |c: char| c.is_ascii_digit() || c == '_').parse_next(input)
}
fn parse_ident_or_keyword<'a>(input: &mut &'a str) -> winnow::Result<Token, ErrMode<ContextError>> {
use winnow::combinator::peek;
peek(one_of(|c: char| c.is_alphabetic() || c == '_')).parse_next(input)?;
take_while(1.., |c: char| c.is_alphanumeric() || c == '_')
.map(|s: &str| match s {
"true" => Token::Boolean(true),
"false" => Token::Boolean(false),
_ => Token::Ident(s.to_string()),
})
.parse_next(input)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `input` and returns the tokens with their positions, panicking on error.
    fn tokenize_located(input: &str) -> Vec<LocatedToken> {
        let lexer = WinnowLexer::new(Path::new("test.au"));
        lexer.tokenize(input).unwrap()
    }

    /// Lexes `input` and returns only the tokens, panicking on error.
    fn tokenize(input: &str) -> Vec<Token> {
        tokenize_located(input)
            .into_iter()
            .map(|lt| lt.token)
            .collect()
    }

    #[test]
    fn test_simple_tokens() {
        let tokens = tokenize("@ { } ( ) [ ] : , = -> < > .");
        assert_eq!(
            tokens,
            vec![
                Token::At,
                Token::LBrace,
                Token::RBrace,
                Token::LParen,
                Token::RParen,
                Token::LBracket,
                Token::RBracket,
                Token::Colon,
                Token::Comma,
                Token::Equals,
                Token::Arrow,
                Token::LessThan,
                Token::GreaterThan,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_identifier() {
        let tokens = tokenize("foo bar_baz hello123");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("foo".into()),
                Token::Ident("bar_baz".into()),
                Token::Ident("hello123".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_string_literal() {
        let tokens = tokenize(r#""hello world""#);
        assert_eq!(tokens[0], Token::String("hello world".into()));
    }

    #[test]
    fn test_string_with_escapes() {
        let tokens = tokenize(r#""line1\nline2\ttab""#);
        assert_eq!(tokens[0], Token::String("line1\nline2\ttab".into()));
    }

    #[test]
    fn test_integer() {
        let tokens = tokenize("42 -17 1_000_000");
        assert_eq!(tokens[0], Token::Integer(42));
        assert_eq!(tokens[1], Token::Integer(-17));
        assert_eq!(tokens[2], Token::Integer(1000000));
    }

    #[test]
    fn test_float() {
        let tokens = tokenize("3.14 -2.5");
        assert_eq!(tokens[0], Token::Float(3.14));
        assert_eq!(tokens[1], Token::Float(-2.5));
    }

    #[test]
    fn test_integer_followed_by_dot() {
        // A `.` without a digit after it is not part of the number.
        let tokens = tokenize("1.foo");
        assert_eq!(
            tokens,
            vec![
                Token::Integer(1),
                Token::Dot,
                Token::Ident("foo".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_boolean() {
        let tokens = tokenize("true false");
        assert_eq!(tokens[0], Token::Boolean(true));
        assert_eq!(tokens[1], Token::Boolean(false));
    }

    #[test]
    fn test_comment() {
        let tokens = tokenize("// this is a comment");
        assert_eq!(tokens[0], Token::Comment("this is a comment".into()));
    }

    #[test]
    fn test_doc_comment() {
        let tokens = tokenize("/// Documentation");
        assert_eq!(tokens[0], Token::DocComment("Documentation".into()));
    }

    #[test]
    fn test_module_doc() {
        let tokens = tokenize("//! Module documentation");
        assert_eq!(tokens[0], Token::ModuleDoc("Module documentation".into()));
    }

    #[test]
    fn test_delimiter() {
        let tokens = tokenize("---");
        assert_eq!(tokens[0], Token::Delimiter);
    }

    #[test]
    fn test_raw_content() {
        // Written with escapes so that source formatting cannot alter the input.
        let input = "---\nSELECT * FROM users\nWHERE id = 1\n---";
        let tokens = tokenize(input);
        assert_eq!(
            tokens[0],
            Token::RawContent("SELECT * FROM users\nWHERE id = 1".into())
        );
    }

    #[test]
    fn test_raw_content_inline() {
        let input = "---\nSELECT * FROM users\n---";
        let tokens = tokenize(input);
        assert_eq!(tokens[0], Token::RawContent("SELECT * FROM users".into()));
    }

    #[test]
    fn test_directive() {
        let tokens = tokenize("@schema User");
        assert_eq!(tokens[0], Token::At);
        assert_eq!(tokens[1], Token::Ident("schema".into()));
        assert_eq!(tokens[2], Token::Ident("User".into()));
    }

    #[test]
    fn test_complex_example() {
        let input = "@schema User {\nid: EntityId\nname: String\n}";
        let tokens = tokenize(input);
        assert_eq!(tokens[0], Token::At);
        assert_eq!(tokens[1], Token::Ident("schema".into()));
        assert_eq!(tokens[2], Token::Ident("User".into()));
        assert_eq!(tokens[3], Token::LBrace);
    }

    #[test]
    fn test_newline_tokens() {
        let tokens = tokenize("a\nb");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("a".into()),
                Token::Newline,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_token_positions() {
        let located = tokenize_located("foo\n  bar 42");
        assert_eq!(
            located,
            vec![
                LocatedToken {
                    token: Token::Ident("foo".into()),
                    line: 1,
                    column: 1,
                },
                LocatedToken {
                    token: Token::Newline,
                    line: 1,
                    column: 4,
                },
                LocatedToken {
                    token: Token::Ident("bar".into()),
                    line: 2,
                    column: 3,
                },
                LocatedToken {
                    token: Token::Integer(42),
                    line: 2,
                    column: 7,
                },
                LocatedToken {
                    token: Token::Eof,
                    line: 2,
                    column: 9,
                },
            ]
        );
    }

    #[test]
    fn test_multiline_token_advances_position() {
        // The string spans two lines; the following token must be located on line 2.
        let located = tokenize_located("\"a\nb\" x");
        assert_eq!(
            located,
            vec![
                LocatedToken {
                    token: Token::String("a\nb".into()),
                    line: 1,
                    column: 1,
                },
                LocatedToken {
                    token: Token::Ident("x".into()),
                    line: 2,
                    column: 4,
                },
                LocatedToken {
                    token: Token::Eof,
                    line: 2,
                    column: 5,
                },
            ]
        );
    }

    #[test]
    fn test_unexpected_character_is_an_error() {
        let lexer = WinnowLexer::new(Path::new("test.au"));
        assert!(lexer.tokenize("ok\n  #").is_err());
    }
}