// klex 0.1.2
//
// A simple lexer (tokenizer) generator for Rust
// Documentation
// This file is auto-generated by build.rs
// Do not edit manually
// --------------------------------------------------------
//----<GENERATED_BY>----

use regex::Regex;
use std::collections::HashMap;

/// Kinds of token the lexer can produce.
///
/// `Lexer::match_cached_pattern` casts a kind with `as u32` to look up its
/// regex, so every variant must stay field-less.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
	/// Fallback kind: `Lexer::next_token` emits it for a single character
	/// when no rule consumed the input.
	Unknown,
// Template placeholder below — presumably replaced by build.rs with the
// generated variants; keep the marker line unchanged.
//----<TOKEN_KIND>----
}

/// A single lexical token produced by the lexer.
///
/// Besides its kind and text, a token records where it was found in the
/// input (byte offset, row, column), its length, the indentation of its
/// line and a free-form user tag.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
	/// Token type identifier
	pub kind: TokenKind,
	/// Actual string value of the token
	pub text: String,
	/// 0-based start position in the entire input.
	/// `Lexer::next_token` stores the lexer's `pos` here, which is a byte offset.
	pub index: usize,
	/// Row number where the token appears (1-based)
	pub row: usize,
	/// Column number where the token appears (1-based, advanced once per character)
	pub col: usize,
	/// Length of the token in characters
	pub length: usize,
	/// Indentation of the token's line: the number of leading `' '` characters
	/// (only spaces are counted by `Lexer::calculate_line_indent`)
	pub indent: usize,
	/// User-defined tag (for additional information); `Token::new` sets it to 0
	pub tag: isize,
}

impl Token {
	/// Creates a new token with the specified parameters
	/// The tag field is initialized to 0
	pub fn new(kind: TokenKind, text: String, index: usize, row: usize, col: usize, length: usize, indent: usize) -> Self {
		Token {
			kind,
			text,
			index,
			row,
			col,
			length,
			indent,
			tag: 0,
		}
	}

//----<TO_STRING_METHOD>----
}

/// Lexer state for scanning an input string into tokens.
///
/// `pos`, `row` and `col` are advanced together as text is consumed: `pos`
/// moves by the UTF-8 byte length of each character, `col` by one per
/// character, and a `'\n'` bumps `row` and resets `col` to 1.
pub struct Lexer {
	/// Input string to be analyzed
	pub input: String,
	/// Current parsing position (byte offset into `input`)
	pub pos: usize,
	/// Current row number (1-based)
	pub row: usize,
	/// Current column number (1-based, counted in characters)
	pub col: usize,
	/// Compiled regular expressions, looked up by `TokenKind as u32`.
	/// `Lexer::new` also stores a placeholder pattern under `u32::MAX`.
	pub regex_cache: HashMap<u32, Regex>,
	/// Kind of the last generated token; `None` in a freshly created lexer
	pub last_token_kind: Option<TokenKind>,
}

impl Lexer {
	/// Creates a new lexer over `input`, positioned at byte 0, row 1, column 1,
	/// with no previous token recorded.
	///
	/// The regex cache is seeded with a placeholder pattern under the key
	/// `u32::MAX`. The marker comment in the body is a template placeholder
	/// (presumably replaced by build.rs with the generated pattern
	/// registrations, which use the local `regex_cache`); keep it unchanged.
	///
	/// # Panics
	///
	/// Panics if the placeholder pattern fails to compile.
	pub fn new(input: String) -> Self {
		let mut regex_cache = HashMap::new();
		regex_cache.insert(u32::MAX, Regex::new("__Unknown__").unwrap());
		//----<REG_EX_CODE>----
		Lexer {
			input,
			pos: 0,
			row: 1,
			col: 1,
			regex_cache,
			last_token_kind: None,
		}
	}

	/// Creates a new lexer from a string slice.
	///
	/// Convenience wrapper around [`Lexer::new`] that copies `input` into an
	/// owned `String`.
	pub fn from_str(input: &str) -> Self {
		Self::new(input.to_owned())
	}

	/// Consumes the rest of the input and returns every token in order.
	///
	/// Calls [`Lexer::next_token`] repeatedly until it returns `None`.
	pub fn tokenize(&mut self) -> Vec<Token> {
		std::iter::from_fn(|| self.next_token()).collect()
	}

	/// Returns the next token from the input string.
	///
	/// Returns `None` once `pos` has reached the end of the input. When no
	/// rule consumes the input, exactly one character is consumed and
	/// returned as a `TokenKind::Unknown` token of length 1.
	///
	/// # Panics
	///
	/// Panics if `pos` is not on a UTF-8 character boundary of `input`
	/// (possible only if the public field was modified externally).
	pub fn next_token(&mut self) -> Option<Token> {
		if self.pos >= self.input.len() {
			return None;
		}

		// These locals are named for the code spliced in at the marker below;
		// do not rename them.
		let remaining = &self.input[self.pos..];
		let start_row = self.row;
		let start_col = self.col;

		// Calculate indent (spaces at the start of current line)
		let indent = self.calculate_line_indent();

		//----<RULE_MATCH_CODE>----

		// No pattern matched, consume one character.
		// `remaining` is non-empty here, so `?` never actually returns early.
		let ch = remaining.chars().next()?;
		let matched = ch.to_string();
		let current_pos = self.pos;
		self.advance(&matched);
		let token = Token::new(TokenKind::Unknown, matched, current_pos, start_row, start_col, 1, indent);
		self.last_token_kind = Some(token.kind.clone());
		Some(token)
	}

	/// Calculates the indentation of the line that contains `pos`.
	///
	/// Returns the number of `' '` characters at the very start of that line;
	/// tabs and other whitespace are not counted. The line start is the byte
	/// after the last `'\n'` located before `pos`, or 0 if there is none.
	///
	/// The search works on bytes: `pos` is a byte offset, and `'\n'` / `' '`
	/// are single-byte in UTF-8 and never occur inside a multi-byte character,
	/// so this is correct for non-ASCII input and never slices mid-character.
	pub fn calculate_line_indent(&self) -> usize {
		let bytes = self.input.as_bytes();
		// Clamp so an out-of-range `pos` (the field is public) cannot panic.
		let end = self.pos.min(bytes.len());

		// Find the beginning of the current line with one backward scan.
		let line_start = bytes[..end]
			.iter()
			.rposition(|&b| b == b'\n')
			.map_or(0, |newline| newline + 1);

		// Count spaces from the beginning of the line.
		bytes[line_start..]
			.iter()
			.take_while(|&&b| b == b' ')
			.count()
	}

	/// Attempts to match the cached regex for `token_kind` against `input`.
	///
	/// Returns the matched text, or `None` when no regex is cached under
	/// `token_kind as u32` or the regex does not match.
	///
	/// NOTE(review): `find` is not anchored by this method, so the cached
	/// patterns are presumably expected to carry their own `^` anchor —
	/// confirm against the generated patterns.
	pub fn match_cached_pattern(&self, input: &str, token_kind: TokenKind) -> Option<String> {
		let regex = self.regex_cache.get(&(token_kind as u32))?;
		regex.find(input).map(|mat| mat.as_str().to_string())
	}

	/// Advances the lexer past `matched`.
	///
	/// For every character, `pos` grows by its UTF-8 byte length. A `'\n'`
	/// increments `row` and resets `col` to 1; any other character increments
	/// `col` by one.
	fn advance(&mut self, matched: &str) {
		for ch in matched.chars() {
			self.pos += ch.len_utf8();
			if ch == '\n' {
				self.row += 1;
				self.col = 1;
			} else {
				self.col += 1;
			}
		}
	}
}