use super::error::ExprParseError;
/// A single lexeme produced by `tokenize`.
#[derive(Debug, Clone)]
pub struct Token {
/// Token text. For `StringLit` this is the unescaped content (surrounding
/// quotes stripped, `''` collapsed to a single `'`); for every other kind
/// it is the raw source text.
pub text: String,
/// Lexical class of this token.
pub kind: TokenKind,
}
/// Lexical classes recognised by `tokenize`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum TokenKind {
/// Identifier: ASCII letter or `_`, then ASCII alphanumerics/`_`.
Ident,
/// Numeric literal made of ASCII digits and `.`.
Number,
/// Single-quoted string literal (stored without the quotes).
StringLit,
/// `(`
LParen,
/// `)`
RParen,
/// `,`
Comma,
/// Operator: one of `+ - * / % = < >` or `<= >= != <> ||`.
Op,
}
/// Splits `input` into a flat token stream.
///
/// The scanner walks Unicode scalar values via `char_indices`, so multi-byte
/// characters (CJK, emoji, accented Latin) are safe inside string literals,
/// while numbers and identifiers are sliced out of `input` by byte offset.
///
/// # Errors
///
/// Returns [`ExprParseError::UnexpectedToken`] when a character cannot start
/// any token, or when a string literal is missing its closing quote. The
/// reported `pos` is a character index into `input`, not a byte offset.
pub fn tokenize(input: &str) -> Result<Vec<Token>, ExprParseError> {
    // (byte_offset, char) pairs: the byte offsets let us slice `input` directly.
    let chars: Vec<(usize, char)> = input.char_indices().collect();
    let mut tokens = Vec::new();
    let mut i = 0; // character index into `chars`
    while i < chars.len() {
        let (_, ch) = chars[i];

        // Skip whitespace.
        if ch.is_ascii_whitespace() {
            i += 1;
            continue;
        }

        // Single-character punctuation.
        let punct = match ch {
            '(' => Some(TokenKind::LParen),
            ')' => Some(TokenKind::RParen),
            ',' => Some(TokenKind::Comma),
            _ => None,
        };
        if let Some(kind) = punct {
            tokens.push(Token {
                text: ch.to_string(),
                kind,
            });
            i += 1;
            continue;
        }

        // Two-character operators must be tried before their one-character
        // prefixes (`<=` before `<`, etc.).
        if i + 1 < chars.len() {
            let two: String = [ch, chars[i + 1].1].iter().collect();
            if matches!(two.as_str(), "<=" | ">=" | "!=" | "<>" | "||") {
                tokens.push(Token {
                    text: two,
                    kind: TokenKind::Op,
                });
                i += 2;
                continue;
            }
        }

        // Single-character operators.
        if matches!(ch, '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>') {
            tokens.push(Token {
                text: ch.to_string(),
                kind: TokenKind::Op,
            });
            i += 1;
            continue;
        }

        // SQL-style string literal: `''` inside the literal is an escaped quote.
        if ch == '\'' {
            let start = i; // opening quote position, for error reporting
            let mut s = String::new();
            let mut terminated = false;
            i += 1;
            while i < chars.len() {
                let (_, c) = chars[i];
                if c == '\'' {
                    if i + 1 < chars.len() && chars[i + 1].1 == '\'' {
                        s.push('\'');
                        i += 2;
                        continue;
                    }
                    i += 1; // consume the closing quote
                    terminated = true;
                    break;
                }
                s.push(c);
                i += 1;
            }
            // BUG FIX: an unterminated literal used to fall out of the loop
            // and be silently accepted as a StringLit; reject it instead.
            // NOTE(review): a dedicated error variant would read better if
            // ExprParseError can grow one.
            if !terminated {
                return Err(ExprParseError::UnexpectedToken {
                    found: "unterminated string literal".to_string(),
                    pos: start,
                });
            }
            tokens.push(Token {
                text: s,
                kind: TokenKind::StringLit,
            });
            continue;
        }

        // Number: digits, or a leading `.` when a digit follows. The scan is
        // greedy over digits and dots, so malformed forms such as `1.2.3` lex
        // as a single Number token — presumably rejected later by the parser
        // (TODO confirm).
        if ch.is_ascii_digit()
            || (ch == '.' && i + 1 < chars.len() && chars[i + 1].1.is_ascii_digit())
        {
            let start_byte = chars[i].0;
            while i < chars.len() && (chars[i].1.is_ascii_digit() || chars[i].1 == '.') {
                i += 1;
            }
            let end_byte = chars.get(i).map_or(input.len(), |&(b, _)| b);
            tokens.push(Token {
                text: input[start_byte..end_byte].to_string(),
                kind: TokenKind::Number,
            });
            continue;
        }

        // Identifier: ASCII letter or `_`, then letters/digits/`_`.
        if ch.is_ascii_alphabetic() || ch == '_' {
            let start_byte = chars[i].0;
            while i < chars.len() && (chars[i].1.is_ascii_alphanumeric() || chars[i].1 == '_') {
                i += 1;
            }
            let end_byte = chars.get(i).map_or(input.len(), |&(b, _)| b);
            tokens.push(Token {
                text: input[start_byte..end_byte].to_string(),
                kind: TokenKind::Ident,
            });
            continue;
        }

        // Nothing matched: report the offending character and its char index.
        return Err(ExprParseError::UnexpectedToken {
            found: format!("'{ch}' (U+{:04X})", ch as u32),
            pos: i,
        });
    }
    Ok(tokens)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src`, panicking on any tokenizer error.
    fn lex(src: &str) -> Vec<Token> {
        tokenize(src).expect("input should tokenize cleanly")
    }

    #[test]
    fn ascii_expression() {
        assert_eq!(lex("price * (1 + tax_rate)").len(), 7);
    }

    #[test]
    fn cjk_string_literal() {
        let toks = lex("'你好' || name");
        assert_eq!(toks.len(), 3);
        assert_eq!(
            (toks[0].kind, toks[0].text.as_str()),
            (TokenKind::StringLit, "你好")
        );
    }

    #[test]
    fn emoji_string_literal() {
        let toks = lex("'🎉' || tag");
        assert_eq!(toks.len(), 3);
        assert_eq!(toks[0].text, "🎉");
    }

    #[test]
    fn two_char_op_after_multibyte_string() {
        let toks = lex("'你' || x");
        assert_eq!(toks.len(), 3);
        assert_eq!(toks[1].text, "||");
    }

    #[test]
    fn escaped_quote_in_string() {
        let toks = lex("'it''s'");
        assert_eq!(toks.len(), 1);
        assert_eq!(toks[0].text, "it's");
    }

    #[test]
    fn latin_diacritics_in_string() {
        assert_eq!(lex("'café'")[0].text, "café");
    }

    #[test]
    fn comparison_after_cjk() {
        let toks = lex("name != '禁止'");
        assert_eq!(toks.len(), 3);
        assert_eq!(toks[1].text, "!=");
        assert_eq!(toks[2].text, "禁止");
    }
}