use crate::parsing::positions::CharPositions;
use crate::parsing::syntax_error;
use crate::prelude::*;
use std::num::IntErrorKind;
use std::path::PathBuf;
use std::rc::Rc;
use std::result;
use std::str::FromStr;
/// A single lexical token: its payload together with the source span it covers.
#[derive(Debug)]
pub(crate) struct Token {
/// What kind of token this is, including any value it carries.
pub data: TokenData,
/// Span of source text attached to this token.
pub span: Span,
}
impl Token {
    /// Converts an atom token into an [`Argument::Atom`] that carries a clone
    /// of the atom and of this token's span.
    ///
    /// Returns `None` for every other kind of token.
    pub fn to_atom(&self) -> Option<Argument> {
        let TokenData::Atom(atom) = &self.data else {
            return None;
        };
        Some(Argument::Atom(atom.clone(), self.span.clone()))
    }

    /// Returns a copy of the identifier held by a [`TokenData::Name`] token.
    ///
    /// # Errors
    ///
    /// Yields the result of `syntax_error` for this token's span when the
    /// token is not a name.
    pub fn to_name(&self) -> Result<String> {
        match &self.data {
            TokenData::Name(name) => Ok(name.clone()),
            _ => syntax_error("expected atom or ident", &self.span),
        }
    }

    /// Whether this token is a `,` separator.
    pub const fn is_comma(&self) -> bool {
        matches!(&self.data, TokenData::Comma)
    }

    /// Whether this token is a comment.
    pub const fn is_comment(&self) -> bool {
        matches!(&self.data, TokenData::Comment(_))
    }
}
/// The payload of a [`Token`].
#[derive(Debug)]
pub enum TokenData {
/// The `(` character.
LeftParen,
/// The `,` character.
Comma,
/// The `)` character.
RightParen,
/// A literal value: a string, a char, `true`/`false`, `null` or an integer.
Atom(Atom),
/// A bare word that is not a literal (or any bare word directly followed by `(`).
Name(String),
/// The text following a `#`, up to (not including) the next newline or the end of input.
Comment(String),
}
/// Pulls characters from `chars` until `target` is seen.
///
/// On success returns the position of the `target` character together with
/// everything read before it; `target` itself is consumed but not included.
/// If the iterator runs dry first, the characters gathered so far are handed
/// back as the `Err` value.
fn take_until(
    mut chars: impl Iterator<Item = (Position, char)>,
    target: char,
) -> result::Result<(Position, String), String> {
    let mut collected = String::new();
    loop {
        match chars.next() {
            Some((pos, c)) if c == target => return Ok((pos, collected)),
            Some((_, c)) => collected.push(c),
            None => return Err(collected),
        }
    }
}
/// Initial capacity (in bytes) reserved for the buffer in which `tokenize`
/// accumulates the bare word currently being read.
const CAP: usize = 10;
/// Splits `code` into a flat list of [`Token`]s, each tagged with the span it
/// occupies in `file_path`.
///
/// Recognised input:
/// - `(`, `)` and `,` each become their own token;
/// - space, `\n` and `\t` separate tokens and are otherwise dropped;
/// - `"…"` becomes a string atom and `'…'` a char atom (no escape sequences
///   are interpreted: the body is everything up to the next matching quote);
/// - `#` starts a comment that runs to the next `\n` or to the end of input;
/// - any other run of characters is a bare word. A word directly followed by
///   `(` is always a [`TokenData::Name`]; every other word is classified by
///   `try_parse_atom`.
///
/// A bare word that is still pending when a `"`, `'` or `#` is reached is
/// emitted *before* the literal/comment, so tokens always come out in source
/// order and a word's span never extends over a following literal.
///
/// # Errors
///
/// Returns a `"Syntax"` exception for an unclosed string or char literal, for
/// a char literal whose body is not exactly one character, and for an integer
/// literal that does not fit in an `i64`.
pub fn tokenize(code: &str, file_path: Rc<PathBuf>) -> Result<Vec<Token>> {
    let mut tokens = vec![];
    // Characters of the bare word currently being read.
    let mut current = String::with_capacity(CAP);
    let mut chars = CharPositions::new(code);
    let mut add_token = |data, start, end| {
        tokens.push(Token {
            span: Span::new(start, end, file_path.clone()),
            data,
        });
    };
    // Position of the first character of `current`; `Some` exactly while
    // `current` is non-empty.
    let mut current_start_pos = None;
    // `while let` rather than `for`: the literal/comment arms need to pull
    // further characters from the same iterator via `by_ref`.
    while let Some((char_pos, c)) = chars.next() {
        let syntax_error = |msg| {
            Err(Exception::spanned(
                "Syntax",
                msg,
                &Span::single(char_pos, file_path.clone()),
            ))
        };
        // Every character below terminates a pending bare word. Flushing here
        // (instead of only in the paren/comma/whitespace arms) keeps a word
        // that directly precedes a literal or comment in source order.
        let ends_word = matches!(c, '(' | ')' | ',' | ' ' | '\n' | '\t' | '"' | '\'' | '#');
        if ends_word && !current.is_empty() {
            let word = std::mem::replace(&mut current, String::with_capacity(CAP));
            let data = if c == '(' {
                // A word directly followed by `(` is never treated as a literal.
                TokenData::Name(word)
            } else {
                try_parse_atom(word, char_pos, &file_path)?
            };
            add_token(
                data,
                current_start_pos
                    .take()
                    .expect("a non-empty word always has a start position"),
                char_pos.one_back(),
            );
        }
        match c {
            '(' => add_token(TokenData::LeftParen, char_pos, char_pos),
            ')' => add_token(TokenData::RightParen, char_pos, char_pos),
            ',' => add_token(TokenData::Comma, char_pos, char_pos),
            // Whitespace only separates tokens.
            ' ' | '\n' | '\t' => {}
            '"' => {
                let Ok((end_pos, body)) = take_until(chars.by_ref(), '"') else {
                    return syntax_error("unclosed string literal");
                };
                add_token(TokenData::Atom(Atom::new_string(&body)), char_pos, end_pos);
            }
            '\'' => {
                let Ok((end_pos, body)) = take_until(chars.by_ref(), '\'') else {
                    return syntax_error("unclosed char literal");
                };
                match char::from_str(&body) {
                    Ok(c) => add_token(TokenData::Atom(Atom::Char(c)), char_pos, end_pos),
                    Err(e) => return syntax_error(&format!("invalid char literal: {e}")),
                }
            }
            '#' => {
                // A comment without a trailing newline simply runs to the end
                // of the input.
                let (end_pos, body) =
                    take_until(chars.by_ref(), '\n').unwrap_or_else(|body| (last_pos(code), body));
                add_token(TokenData::Comment(body), char_pos, end_pos);
            }
            _ => {
                if current_start_pos.is_none() {
                    current_start_pos = Some(char_pos);
                }
                current.push(c);
            }
        }
    }
    // Input ended in the middle of a bare word.
    if !current.is_empty() {
        let p = last_pos(code);
        add_token(
            try_parse_atom(current, p, &file_path)?,
            current_start_pos.expect("a non-empty word always has a start position"),
            p,
        );
    }
    Ok(tokens)
}
/// Classifies the bare word `s`.
///
/// `true`, `false` and `null` become the corresponding atoms, anything that
/// parses as an `i64` becomes an integer atom, and every other word is
/// returned unchanged as a [`TokenData::Name`].
///
/// # Errors
///
/// A word that is an integer too large or too small for `i64` produces a
/// `"Syntax"` exception located at `pos` in `file_path`.
fn try_parse_atom(s: String, pos: Position, file_path: &Rc<PathBuf>) -> Result<TokenData> {
    let atom = match s.as_str() {
        "true" => Atom::Bool(true),
        "false" => Atom::Bool(false),
        "null" => Atom::Null,
        _ => match s.parse::<i64>() {
            Ok(int) => Atom::Int(int),
            Err(err)
                if matches!(
                    err.kind(),
                    IntErrorKind::PosOverflow | IntErrorKind::NegOverflow
                ) =>
            {
                return Err(Exception::spanned(
                    "Syntax",
                    format!("overflowing integer literal: {s}"),
                    &Span::single(pos, file_path.clone()),
                ));
            }
            // Not a number at all: treat the word as an identifier.
            Err(_) => return Ok(TokenData::Name(s)),
        },
    };
    Ok(TokenData::Atom(atom))
}
/// Returns the position of the final character of `code`.
///
/// # Panics
///
/// Panics if `code` yields no characters at all.
fn last_pos(code: &str) -> Position {
    let (pos, _) = CharPositions::new(code)
        .last()
        .expect("already found some code");
    pos
}
/// Returns the slice of `text` covered by `span`, with both the start and the
/// end position included.
///
/// Yields `None` when the end position is met before the start position, or
/// when the end position never occurs among the character positions of
/// `text`.
#[cfg_attr(not(test), expect(dead_code))]
pub fn extract(text: &str, span: &Span) -> Option<String> {
    // `None` until the start position has been seen, then the text so far.
    let mut collected: Option<String> = None;
    for (pos, c) in CharPositions::new(text) {
        if collected.is_none() && pos == span.start {
            collected = Some(String::new());
        }
        if let Some(buf) = collected.as_mut() {
            buf.push(c);
        }
        if pos == span.end {
            // Still `None` here means the end came before the start.
            return collected;
        }
    }
    None
}
// Unit tests for `extract` and for the spans produced by `tokenize`.
#[cfg(test)]
mod tests {
use super::*;
use crate::no_path;
// Builds a span from the given line/column pairs. Line arguments are
// decremented by one before being handed to `Position::new`; column
// arguments are passed through unchanged.
fn sp(start_line: u32, start_col: u32, end_line: u32, end_col: u32) -> Span {
Span::new(
Position::new(start_line - 1, start_col),
Position::new(end_line - 1, end_col),
no_path(),
)
}
// Shorthand for the expected `Some(String)` value in the assertions below;
// the wrap is intentional, hence the lint expectation.
#[expect(clippy::unnecessary_wraps)]
fn so(text: &str) -> Option<String> {
Some(text.to_string())
}
// `extract` returns the inclusive text between two positions, and `None`
// when the end precedes the start or either position does not exist.
#[test]
fn extract_1() {
let t = "abc\nde\nf\nghi\n";
assert_eq!(extract(t, &sp(1, 1, 1, 4)), so("abc\n"));
assert_eq!(extract(t, &sp(1, 1, 2, 2)), so("abc\nde"));
assert_eq!(extract(t, &sp(1, 1, 2, 1)), so("abc\nd"));
assert_eq!(extract(t, &sp(1, 3, 2, 2)), so("c\nde"));
assert_eq!(extract(t, &sp(1, 1, 1, 2)), so("ab"));
assert_eq!(extract(t, &sp(1, 1, 1, 1)), so("a"));
assert_eq!(extract(t, &sp(1, 1, 1, 1000)), None);
assert_eq!(extract(t, &sp(1, 2, 1, 1)), None);
assert_eq!(extract(t, &sp(2, 2, 3, 2)), so("e\nf\n"));
assert_eq!(extract(t, &sp(3, 1, 1, 1)), None);
assert_eq!(extract(t, &sp(2, 1, 1, 4)), None);
assert_eq!(extract(t, &sp(2, 2, 2, 2)), so("e"));
assert_eq!(extract(t, &sp(3, 2, 3, 2)), so("\n"));
assert_eq!(extract(t, &sp(4, 5, 4, 5)), None);
assert_eq!(extract(t, &sp(4, 5, 4, 4)), None);
assert_eq!(extract(t, &sp(3, 2, 4, 1)), so("\ng"));
assert_eq!(extract(t, &sp(3, 2, 6, 1)), None);
assert_eq!(extract(t, &sp(3, 3, 4, 1)), None);
}
// Every token's span, fed back through `extract`, must reproduce exactly
// the source text of that token.
#[test]
fn token_extraction() {
let code = "_(
def(double_and_print, x, print(*(2, x))),
)
";
let tokens = tokenize(code, no_path()).unwrap();
let parts = tokens
.into_iter()
.map(|t| extract(code, &t.span).unwrap())
.collect::<Vec<_>>();
assert_eq!(
parts,
[
"_",
"(",
"def",
"(",
"double_and_print",
",",
"x",
",",
"print",
"(",
"*",
"(",
"2",
",",
"x",
")",
")",
")",
",",
")"
]
.map(ToString::to_string)
);
// Joined back together, the tokens are the source minus its whitespace.
assert_eq!(parts.join(""), code.replace(['\n', '\t', ' '], ""));
}
}