use smol_str::SmolStr;
use crate::bib::syntax::SyntaxKind;
/// A single lexed token: its syntactic kind paired with the source text it covers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
/// Category assigned to this token by the lexer.
pub kind: SyntaxKind,
/// The exact slice of the input that this token spans.
pub text: SmolStr,
}
/// Looks up the token kind for a single-byte punctuation character.
///
/// Returns `None` for every byte that is not one of the recognised
/// punctuation characters (`@ { } ( ) , = # "`).
fn special_kind(c: u8) -> Option<SyntaxKind> {
    match c {
        b'@' => Some(SyntaxKind::AT),
        b'{' => Some(SyntaxKind::L_BRACE),
        b'}' => Some(SyntaxKind::R_BRACE),
        b'(' => Some(SyntaxKind::L_PAREN),
        b')' => Some(SyntaxKind::R_PAREN),
        b',' => Some(SyntaxKind::COMMA),
        b'=' => Some(SyntaxKind::EQ),
        b'#' => Some(SyntaxKind::HASH),
        b'"' => Some(SyntaxKind::QUOTE),
        _ => None,
    }
}
/// Reports whether `c` terminates a word run.
///
/// A byte ends a word when it is a space, tab, line feed, carriage return,
/// or any punctuation byte recognised by `special_kind`.
fn is_word_boundary(c: u8) -> bool {
    match c {
        b' ' | b'\t' | b'\n' | b'\r' => true,
        other => special_kind(other).is_some(),
    }
}
/// Splits `input` into a flat sequence of tokens.
///
/// Every byte of the input ends up in exactly one token, and tokens are
/// emitted in source order, so concatenating the `text` of all returned
/// tokens reproduces `input`.
///
/// Classification, decided by the first byte of each token:
/// - a punctuation byte known to `special_kind` becomes a one-byte token of that kind;
/// - `\n`, a lone `\r`, or the pair `\r\n` becomes a single `NEWLINE`;
/// - a run of spaces and tabs becomes one `WHITESPACE`;
/// - anything else starts a run extending to the next word boundary, which is
///   a `NUMBER` when it consists solely of ASCII digits and a `WORD` otherwise.
pub fn lex(input: &str) -> Vec<Token> {
    let bytes = input.as_bytes();
    let mut tokens = Vec::new();
    let mut start = 0;

    while let Some(&first) = bytes.get(start) {
        let (kind, len) = match (special_kind(first), first) {
            (Some(special), _) => (special, 1),
            (None, b'\n') => (SyntaxKind::NEWLINE, 1),
            (None, b'\r') => {
                // Fold a CRLF pair into one newline token.
                let width = match bytes.get(start + 1) {
                    Some(b'\n') => 2,
                    _ => 1,
                };
                (SyntaxKind::NEWLINE, width)
            }
            (None, b' ') | (None, b'\t') => {
                let width = run_len(bytes, start, |b| matches!(b, b' ' | b'\t'));
                (SyntaxKind::WHITESPACE, width)
            }
            (None, _) => {
                let width = run_len(bytes, start, |b| !is_word_boundary(b));
                let digits_only = bytes[start..start + width]
                    .iter()
                    .all(|b| b.is_ascii_digit());
                if digits_only {
                    (SyntaxKind::NUMBER, width)
                } else {
                    (SyntaxKind::WORD, width)
                }
            }
        };

        let end = start + len;
        tokens.push(Token {
            kind,
            text: SmolStr::new(&input[start..end]),
        });
        start = end;
    }

    tokens
}
/// Measures the length of the run that begins at `start`.
///
/// The byte at `start` is always counted without consulting `pred`; the run
/// then extends over each following byte for which `pred` returns `true`,
/// stopping at the first rejected byte or at the end of `bytes`. The result
/// is therefore always at least 1.
fn run_len(bytes: &[u8], start: usize, pred: impl Fn(u8) -> bool) -> usize {
    // `get` yields `None` when nothing follows `start`, leaving only the first byte.
    let following = bytes
        .get(start + 1..)
        .map_or(0, |tail| tail.iter().take_while(|&&b| pred(b)).count());
    1 + following
}