use regex::Regex;
use std::collections::HashMap;
/// Classification assigned to a lexed [`Token`].
///
/// Only a single catch-all variant exists at present.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    /// A token the lexer could not (or did not try to) classify.
    Unknown,
}
/// A single lexical token together with its location in the source text.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// Classification of the token.
    pub kind: TokenKind,
    /// The source text that makes up the token.
    pub text: String,
    /// Offset into the lexer input at which the token starts
    /// (the lexer in this file records its byte position here).
    pub index: usize,
    /// Row on which the token starts (the lexer in this file starts counting at 1).
    pub row: usize,
    /// Column at which the token starts (the lexer in this file starts counting at 1).
    pub col: usize,
    /// Length of the token as supplied by the creator; the unit is not fixed by this type.
    pub length: usize,
    /// Indentation of the line the token sits on, as supplied by the creator.
    pub indent: usize,
    /// Auxiliary value; `Token::new` initialises it to `0`.
    pub tag: isize,
}
impl Token {
pub fn new(kind: TokenKind, text: String, index: usize, row: usize, col: usize, length: usize, indent: usize) -> Self {
Token {
kind,
text,
index,
row,
col,
length,
indent,
tag: 0,
}
}
}
/// Cursor-based lexer state over an owned input string.
pub struct Lexer {
    /// The complete text being tokenized.
    pub input: String,
    /// Byte offset into `input` of the next character to read.
    pub pos: usize,
    /// Current row; starts at 1 and grows by one for every `'\n'` consumed.
    pub row: usize,
    /// Current column; starts at 1 and is reset to 1 after every `'\n'` consumed.
    pub col: usize,
    /// Compiled patterns keyed by a `u32` id
    /// (`match_cached_pattern` looks entries up by `TokenKind as u32`).
    pub regex_cache: HashMap<u32, Regex>,
    /// Kind of the token most recently produced by `next_token`, or `None` before the first one.
    pub last_token_kind: Option<TokenKind>,
}
impl Lexer {
    /// Creates a lexer positioned at the start of `input` (row 1, column 1).
    ///
    /// The regex cache is seeded with one placeholder pattern, `__Unknown__`,
    /// stored under the key `u32::MAX`.
    pub fn new(input: String) -> Self {
        let mut regex_cache = HashMap::new();
        regex_cache.insert(
            u32::MAX,
            Regex::new("__Unknown__").expect("literal pattern is a valid regex"),
        );
        Self {
            input,
            pos: 0,
            row: 1,
            col: 1,
            regex_cache,
            last_token_kind: None,
        }
    }

    /// Convenience constructor that copies `input` into an owned `String`.
    pub fn from_str(input: &str) -> Self {
        Self::new(input.to_owned())
    }

    /// Consumes the rest of the input and returns every token produced.
    pub fn tokenize(&mut self) -> Vec<Token> {
        std::iter::from_fn(|| self.next_token()).collect()
    }

    /// Produces the next token, or `None` once the input is exhausted.
    ///
    /// Each call consumes exactly one character and emits it as a
    /// [`TokenKind::Unknown`] token with `length` 1, recording the byte
    /// offset, row, column and line indentation at which it started.
    ///
    /// # Panics
    ///
    /// Panics if `pos` has been set to an offset inside a multi-byte character.
    pub fn next_token(&mut self) -> Option<Token> {
        if self.pos >= self.input.len() {
            return None;
        }
        // The guard above guarantees a non-empty remainder, so `?` never fires here.
        let ch = self.input[self.pos..].chars().next()?;

        // Snapshot the start position before `advance` moves the cursor.
        let (index, row, col) = (self.pos, self.row, self.col);
        let indent = self.calculate_line_indent();

        let text = ch.to_string();
        self.advance(&text);

        let token = Token::new(TokenKind::Unknown, text, index, row, col, 1, indent);
        self.last_token_kind = Some(token.kind.clone());
        Some(token)
    }

    /// Returns the number of leading spaces (`' '` only) on the line that
    /// contains the current position.
    ///
    /// The line starts right after the last `'\n'` that precedes `pos`, or at
    /// the beginning of the input when there is none. Spaces are counted from
    /// the line start up to the first non-space character.
    pub fn calculate_line_indent(&self) -> usize {
        let bytes = self.input.as_bytes();
        // `pos` is a byte offset, so the search works on bytes. The byte 0x0A
        // never occurs inside a multi-byte UTF-8 sequence, which makes a raw
        // byte scan for '\n' exact. Clamping keeps an out-of-range `pos`
        // from panicking.
        let end = self.pos.min(bytes.len());
        let line_start = bytes[..end]
            .iter()
            .rposition(|&b| b == b'\n')
            .map_or(0, |newline| newline + 1);

        bytes[line_start..]
            .iter()
            .take_while(|&&b| b == b' ')
            .count()
    }

    /// Looks up the cached regex for `token_kind` and returns the text of its
    /// first match in `input`, or `None` when there is no cache entry or no match.
    ///
    /// The match may start anywhere in `input`; it is not anchored to the beginning.
    ///
    /// NOTE(review): `new` seeds the cache under `u32::MAX`, whereas
    /// `TokenKind::Unknown as u32` is `0`, so the seeded pattern is never
    /// reached through this lookup — confirm whether that is intended.
    pub fn match_cached_pattern(&self, input: &str, token_kind: TokenKind) -> Option<String> {
        self.regex_cache
            .get(&(token_kind as u32))
            .and_then(|regex| regex.find(input))
            .map(|found| found.as_str().to_string())
    }

    /// Moves the cursor past `matched`, updating `pos` (in bytes) and the
    /// row/column counters (in characters).
    fn advance(&mut self, matched: &str) {
        for ch in matched.chars() {
            self.pos += ch.len_utf8();
            match ch {
                '\n' => {
                    self.row += 1;
                    self.col = 1;
                }
                _ => self.col += 1,
            }
        }
    }
}