// Basic Ruchy lexer implemented in Ruchy
// This is a proof-of-concept for self-hosting capability
// Token types
// Every kind of token the lexer in this file can produce. Variants that
// carry a payload hold the decoded value of the lexeme, not its source text.
enum Token {
// Literals
// A run of decimal digits, parsed by Lexer::read_number.
Integer(i64),
// A run of decimal digits containing one '.', parsed by Lexer::read_number.
Float(f64),
// Contents of a double-quoted literal; the escapes \n, \t, \\ and \" are
// already decoded by Lexer::read_string.
String(String),
// Produced by Lexer::read_identifier for the words `true` and `false`.
Bool(bool),
// Keywords
// Each keyword is recognised by its exact lowercase spelling in
// Lexer::read_identifier (`let`, `fun`, `if`, ...).
Let,
Fun,
If,
Else,
Match,
For,
While,
Struct,
Enum,
Trait,
Import,
Export,
// Operators
Plus,
Minus,
Star,
Slash,
// NOTE: Lexer::next_token returns Equal for both `=` and `==`.
Equal,
// NOTE: Lexer::next_token returns NotEqual for both `!` and `!=`.
NotEqual,
// Only the single characters `<` and `>` are lexed; next_token has no
// handling for `<=` or `>=`.
Less,
Greater,
// Delimiters
LeftParen,
RightParen,
LeftBrace,
RightBrace,
LeftBracket,
RightBracket,
// Special
// The two-character sequence `->`.
Arrow,
Comma,
Semicolon,
Colon,
Dot,
// Identifiers
// Letters, digits and underscores that do not spell a keyword or a boolean.
Identifier(String),
// Control
// Returned by next_token for '\n'; Lexer::tokenize drops it from its output.
Newline,
// Returned once the input is exhausted; always the last element of the
// vector built by Lexer::tokenize.
Eof,
}
// Span information for error reporting
// A start/end pair locating a region of the input.
// NOTE(review): nothing in the visible code constructs or reads a Span, so
// whether these are byte or character offsets, and whether `end` is inclusive,
// is not established here — confirm before relying on it.
struct Span {
start: usize,
end: usize,
}
// Lexer state
// A cursor over `input` with one character of lookahead.
struct Lexer {
// The complete text being tokenized.
input: String,
// Index handed to `input.chars().nth(..)` by the next call to advance();
// advance() increments it each time it loads a character.
position: usize,
// The character currently under examination. None before the first
// advance() (see Lexer::new) and again once the input is exhausted.
current_char: Option<char>,
}
impl Lexer {
    // Builds a lexer over `input` and loads the first character into
    // `current_char`, so the read methods can be called straight away.
    fun new(input: String) -> Lexer {
        let mut lexer = Lexer {
            input: input,
            position: 0,
            current_char: None,
        };
        lexer.advance();
        lexer
    }

    // Moves the cursor one character forward.
    //
    // `position` is a character index (it is fed to `chars().nth`), so the
    // end of input is detected from the lookup itself rather than by
    // comparing against `input.len()`, which counts bytes and is larger than
    // the character count whenever the input holds a multi-byte character.
    // Once the input is exhausted `current_char` becomes None and `position`
    // stops moving.
    fun advance(self) {
        match self.input.chars().nth(self.position) {
            Some(ch) => {
                self.current_char = Some(ch);
                self.position = self.position + 1;
            },
            None => {
                self.current_char = None;
            }
        }
    }

    // Reports whether the character right after `current_char` is an ASCII
    // digit. Because advance() increments `position` after loading a
    // character, `position` already indexes that following character.
    fun next_is_digit(self) -> bool {
        match self.input.chars().nth(self.position) {
            Some(next) => match next {
                '0'..='9' => true,
                _ => false,
            },
            None => false,
        }
    }

    // Skips spaces and tabs. Newlines are left alone because next_token
    // reports them as Token::Newline.
    fun skip_whitespace(self) {
        while self.current_char == Some(' ') || self.current_char == Some('\t') {
            self.advance();
        }
    }

    // Reads an integer or floating-point literal starting at `current_char`.
    //
    // Returns Token::Float when the digits contain a fractional part and
    // Token::Integer otherwise. A '.' belongs to the number only when it is
    // the first one seen and a digit follows it; otherwise it is left in the
    // input so that text such as `42.foo` or `1..5` still yields Dot tokens.
    //
    // NOTE(review): a literal that does not fit the target type makes
    // `parse().unwrap()` fail; there is no error token to report it with.
    fun read_number(self) -> Token {
        let mut num_str = String::new();
        let mut is_float = false;
        while self.current_char.is_some() {
            let ch = self.current_char.unwrap();
            match ch {
                '0'..='9' => {
                    num_str.push(ch);
                    self.advance();
                },
                '.' => {
                    if is_float || !self.next_is_digit() {
                        break;
                    }
                    is_float = true;
                    num_str.push('.');
                    self.advance();
                },
                _ => break,
            }
        }
        if is_float {
            Token::Float(num_str.parse().unwrap())
        } else {
            Token::Integer(num_str.parse().unwrap())
        }
    }

    // Reads a run of alphanumeric characters and underscores, then returns
    // the matching keyword token, Token::Bool for `true`/`false`, or
    // Token::Identifier carrying the text.
    fun read_identifier(self) -> Token {
        let mut ident = String::new();
        while self.current_char.is_some() {
            let ch = self.current_char.unwrap();
            if ch.is_alphanumeric() || ch == '_' {
                ident.push(ch);
                self.advance();
            } else {
                break;
            }
        }
        // Check for keywords
        match ident.as_str() {
            "let" => Token::Let,
            "fun" => Token::Fun,
            "if" => Token::If,
            "else" => Token::Else,
            "match" => Token::Match,
            "for" => Token::For,
            "while" => Token::While,
            "struct" => Token::Struct,
            "enum" => Token::Enum,
            "trait" => Token::Trait,
            "import" => Token::Import,
            "export" => Token::Export,
            "true" => Token::Bool(true),
            "false" => Token::Bool(false),
            _ => Token::Identifier(ident),
        }
    }

    // Reads a double-quoted string literal; `current_char` must be the
    // opening quote. The escapes \n, \t, \\ and \" are decoded; any other
    // escape is kept verbatim as a backslash followed by the character.
    // A string that reaches end of input without a closing quote is returned
    // with whatever was collected so far.
    fun read_string(self) -> Token {
        self.advance(); // Skip opening quote
        let mut string = String::new();
        while self.current_char.is_some() && self.current_char != Some('"') {
            if self.current_char == Some('\\') {
                self.advance();
                // A backslash that is the very last character is dropped.
                if self.current_char.is_some() {
                    let escaped = self.current_char.unwrap();
                    match escaped {
                        'n' => string.push('\n'),
                        't' => string.push('\t'),
                        '\\' => string.push('\\'),
                        '"' => string.push('"'),
                        _ => {
                            string.push('\\');
                            string.push(escaped);
                        }
                    }
                    self.advance();
                }
            } else {
                string.push(self.current_char.unwrap());
                self.advance();
            }
        }
        if self.current_char == Some('"') {
            self.advance(); // Skip closing quote
        }
        Token::String(string)
    }

    // Returns the next token, or Token::Eof once the input is exhausted.
    // Leading spaces and tabs are skipped, `//` comments are skipped up to
    // (not including) the newline, and unrecognised characters are ignored.
    fun next_token(self) -> Token {
        self.skip_whitespace();
        if self.current_char.is_none() {
            return Token::Eof;
        }
        let ch = self.current_char.unwrap();
        match ch {
            '\n' => {
                self.advance();
                Token::Newline
            },
            '(' => {
                self.advance();
                Token::LeftParen
            },
            ')' => {
                self.advance();
                Token::RightParen
            },
            '{' => {
                self.advance();
                Token::LeftBrace
            },
            '}' => {
                self.advance();
                Token::RightBrace
            },
            '[' => {
                self.advance();
                Token::LeftBracket
            },
            ']' => {
                self.advance();
                Token::RightBracket
            },
            '+' => {
                self.advance();
                Token::Plus
            },
            '-' => {
                self.advance();
                // `->` is a single Arrow token; a lone `-` is Minus.
                if self.current_char == Some('>') {
                    self.advance();
                    Token::Arrow
                } else {
                    Token::Minus
                }
            },
            '*' => {
                self.advance();
                Token::Star
            },
            '/' => {
                self.advance();
                if self.current_char == Some('/') {
                    // Skip comment line; the terminating newline is left in
                    // place so the recursive call reports it.
                    while self.current_char.is_some() && self.current_char != Some('\n') {
                        self.advance();
                    }
                    self.next_token() // Recursively get next token
                } else {
                    Token::Slash
                }
            },
            '=' => {
                self.advance();
                // For now `=` and `==` are not distinguished: Token has no
                // separate assignment variant, so both yield Equal.
                if self.current_char == Some('=') {
                    self.advance();
                }
                Token::Equal
            },
            '!' => {
                self.advance();
                // Simplified: Token has no logical-not variant, so a lone
                // `!` yields NotEqual just like `!=`.
                if self.current_char == Some('=') {
                    self.advance();
                }
                Token::NotEqual
            },
            '<' => {
                self.advance();
                Token::Less
            },
            '>' => {
                self.advance();
                Token::Greater
            },
            ',' => {
                self.advance();
                Token::Comma
            },
            ';' => {
                self.advance();
                Token::Semicolon
            },
            ':' => {
                self.advance();
                Token::Colon
            },
            '.' => {
                self.advance();
                Token::Dot
            },
            '"' => self.read_string(),
            '0'..='9' => self.read_number(),
            'a'..='z' | 'A'..='Z' | '_' => self.read_identifier(),
            _ => {
                self.advance();
                self.next_token() // Skip unknown characters
            }
        }
    }

    // Lexes the remaining input into a vector. Newline tokens are dropped
    // and the vector always ends with Token::Eof.
    fun tokenize(self) -> Vec<Token> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token();
            if token == Token::Eof {
                tokens.push(token);
                break;
            }
            if token != Token::Newline { // Skip newlines for simplicity
                tokens.push(token);
            }
        }
        tokens
    }
}
// Demo entry point: lexes a fixed sample line and prints one line per token.
fun main() {
    let input = "let x = 42 + 3.14";
    let source = input.to_string();
    let mut lx = Lexer::new(source);
    let stream = lx.tokenize();
    println("Tokens for: {}", input);
    for tok in stream {
        match tok {
            // Tokens without a payload
            Token::Let => println(" Keyword: let"),
            Token::Equal => println(" Operator: ="),
            Token::Plus => println(" Operator: +"),
            Token::Eof => println(" EOF"),
            // Tokens that carry a value
            Token::Identifier(name) => println(" Identifier: {}", name),
            Token::Integer(value) => println(" Integer: {}", value),
            Token::Float(value) => println(" Float: {}", value),
            Token::String(text) => println(" String: {}", text),
            // Everything else is reported generically
            _ => println(" Other token"),
        }
    }
}