/// The kind (and payload) of a single lexical token.
#[derive(Clone)]
pub enum TokenType {
    // An identifier, stored as shared character data.
    Symbol(std::sync::Arc<[char]>),
    // A `#`-prefixed atom name.
    Atom(std::sync::Arc<[char]>),
    // A double-quoted string literal's contents.
    String(std::sync::Arc<[char]>),
    // A character literal written as `#<code point digits>`.
    Char(char),
    // A numeric literal, always parsed as f64.
    Number(f64),
    // `,` — NOTE(review): "Seperator" is a misspelling of "Separator",
    // kept as-is because it is part of the public interface.
    Seperator,
    // `;`
    SentenceSeperator,
    // `[`
    FuncListOpen,
    // `]`
    FuncListClose,
    // `{`
    ListOpen,
    // `}`
    ListClose,
}
32
33impl std::fmt::Debug for TokenType {
34 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
35 match self {
36 TokenType::Symbol(sym) => write!(f, "Id({})", sym.iter().collect::<String>()),
37 TokenType::Atom(a) => write!(f, "Atom({})", a.iter().collect::<String>()),
38 TokenType::String(s) => write!(f, "Str({})", s.iter().collect::<String>()),
39 TokenType::Char(c) => write!(f, "Ch({c})"),
40 TokenType::Number(n) => write!(f, "Num({n})"),
41 TokenType::Seperator => write!(f, "S"),
42 TokenType::SentenceSeperator => write!(f, "SS"),
43 TokenType::FuncListOpen => write!(f, "FnO"),
44 TokenType::FuncListClose => write!(f, "FnC"),
45 TokenType::ListOpen => write!(f, "LstO"),
46 TokenType::ListClose => write!(f, "LstC"),
47 }
48 }
49}
50
/// A single lexed token together with its source location.
#[derive(Clone)]
pub struct Token {
    // The token's kind and payload.
    pub value: TokenType,
    // ((start line, start column), (end line, end column)).
    // Lines and columns are 1-based (see Lexer::new / Lexer::current_pos).
    pub position: ((usize, usize), (usize, usize)),
}
57
58impl std::fmt::Debug for Token {
59 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
60 write!(
61 f,
62 "Token<{:?}, [({}, {}), ({}, {})]>",
63 self.value, self.position.0.0, self.position.0.1, self.position.1.0, self.position.1.1,
64 )
65 }
66}
67
/// Streaming character cursor over the source text with line/column tracking.
struct Lexer<'a> {
    // Peekable character stream over the input.
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    // Current line, starting at 1; incremented on each '\n'.
    line: usize,
    // Characters consumed so far on the current line (0 before the first).
    col: usize,
}
74
75impl<'a> Lexer<'a> {
76 fn new(input: &'a str) -> Self {
77 Self {
78 chars: input.chars().peekable(),
79 line: 1,
80 col: 0,
81 }
82 }
83
84 fn peek(&mut self) -> Option<char> { self.chars.peek().copied() }
86
87 fn next_char(&mut self) -> Option<char> {
89 let ch = self.chars.next()?;
90 if ch == '\n' {
91 self.line += 1;
92 self.col = 0;
93 } else {
94 self.col += 1;
95 }
96 Some(ch)
97 }
98
99 fn current_pos(&self) -> (usize, usize) { (self.line, self.col + 1) }
101
102 fn end_pos(&self) -> (usize, usize) { (self.line, self.col) }
104}
105
106pub fn source_to_token(source: &str) -> Result<Vec<Token>, std::sync::Arc<str>> {
130 let mut lexer = Lexer::new(source);
131 let mut tokens = Vec::new();
132 let mut comment_depth = 0;
133
134 while let Some(ch) = lexer.peek() {
135 let start = lexer.current_pos();
136
137 if ch == '(' && lexer.peek_second() == Some('*') {
139 lexer.next_char();
140 lexer.next_char(); comment_depth += 1;
142 continue;
143 }
144 if ch == '*' && comment_depth > 0 && lexer.peek_second() == Some(')') {
145 lexer.next_char();
146 lexer.next_char(); comment_depth -= 1;
148 continue;
149 }
150
151 if comment_depth > 0 {
153 lexer.next_char();
154 continue;
155 }
156
157 match ch {
159 ' ' | '\t' | '\r' | '\n' => {
160 lexer.next_char();
161 }
162 '"' => tokens.push(lex_string(&mut lexer)?),
163 '#' => tokens.push(lex_atom_or_char(&mut lexer)?),
164 ',' => tokens.push(make_token(&mut lexer, TokenType::Seperator)),
165 ';' => tokens.push(make_token(&mut lexer, TokenType::SentenceSeperator)),
166 '[' => tokens.push(make_token(&mut lexer, TokenType::FuncListOpen)),
167 ']' => tokens.push(make_token(&mut lexer, TokenType::FuncListClose)),
168 '{' => tokens.push(make_token(&mut lexer, TokenType::ListOpen)),
169 '}' => tokens.push(make_token(&mut lexer, TokenType::ListClose)),
170 c if c.is_ascii_digit() || c == '+' || c == '-' => tokens.push(lex_number(&mut lexer)?),
171 c if is_symbol_start(c) => tokens.push(lex_symbol(&mut lexer)?),
172 _ => {
173 return Err(std::sync::Arc::from(format!(
174 "Error[ksl::token::source_to_token]: Invalid token `{}` at `({}, {})`.",
175 ch, start.0, start.1
176 )));
177 }
178 }
179 }
180
181 if comment_depth == 0 {
182 Ok(tokens)
183 } else {
184 Err(std::sync::Arc::from(
185 "Error[ksl::token::source_to_token]: Unclosed comment.",
186 ))
187 }
188}
189
190impl<'a> Lexer<'a> {
191 fn peek_second(&self) -> Option<char> {
193 let mut it = self.chars.clone();
194 it.next();
195 it.next()
196 }
197}
198
199fn make_token(lexer: &mut Lexer, val: TokenType) -> Token {
201 let start = lexer.current_pos();
202 lexer.next_char();
203 Token {
204 value: val,
205 position: (start, lexer.end_pos()),
206 }
207}
208
/// True when `c` may begin a symbol: `_`, or any character that is neither
/// ASCII punctuation, whitespace, nor an ASCII digit.
fn is_symbol_start(c: char) -> bool {
    c == '_' || !(c.is_ascii_punctuation() || c.is_whitespace() || c.is_ascii_digit())
}
211
/// True when `c` may continue a symbol/atom: `_`, `'`, or any character
/// that is neither ASCII punctuation nor whitespace (this includes digits).
fn is_symbol_cont(c: char) -> bool {
    match c {
        '_' | '\'' => true,
        _ => !(c.is_ascii_punctuation() || c.is_whitespace()),
    }
}
214
215fn lex_string(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
216 let start = lexer.current_pos();
217 lexer.next_char(); let mut buf = Vec::new();
219 while let Some(c) = lexer.next_char() {
220 if c == '"' {
221 return Ok(Token {
222 value: TokenType::String(std::sync::Arc::from(buf)),
223 position: (start, lexer.end_pos()),
224 });
225 }
226 buf.push(c);
227 }
228 Err(std::sync::Arc::from(format!(
229 "Error[ksl::token::lex_string]: Unclosed string at `({}, {})`.",
230 start.0, start.1
231 )))
232}
233
234fn lex_number(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
235 let start = lexer.current_pos();
236 let mut s = String::new();
237
238 while let Some(c) = lexer.peek() {
240 if c.is_ascii_digit() || "+-.e".contains(c) {
241 s.push(lexer.next_char().unwrap());
242 } else {
243 break;
244 }
245 }
246
247 s.parse::<f64>()
248 .map(|n| Token {
249 value: TokenType::Number(n),
250 position: (start, lexer.end_pos()),
251 })
252 .map_err(|_| {
253 std::sync::Arc::from(format!(
254 concat!(
255 "Error[ksl::token::lex_number]: ",
256 "Invalid number string `{}` at `({}, {})`."
257 ),
258 s, start.0, start.1
259 ))
260 })
261}
262
263fn lex_atom_or_char(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
264 let start = lexer.current_pos();
265 lexer.next_char(); match lexer.peek() {
268 Some(c) if c.is_ascii_digit() => {
269 let mut s = String::new();
271 while let Some(digit) = lexer.peek() {
272 if digit.is_ascii_digit() {
273 s.push(lexer.next_char().unwrap());
274 } else {
275 break;
276 }
277 }
278 let code = s.parse::<u32>().map_err(|_| {
279 std::sync::Arc::from(format!(
280 "Error[ksl::token::lex_atom_or_char]: Invalid number string `{}` at `({}, {})`.",
281 s, start.0, start.1
282 ))
283 })?;
284 let ch = char::from_u32(code).ok_or_else(|| {
285 std::sync::Arc::from(format!(
286 "Error[ksl::token::lex_atom_or_char]: Invalid unicode `{}` at `({}, {})`.",
287 code, start.0, start.1
288 ))
289 })?;
290 Ok(Token {
291 value: TokenType::Char(ch),
292 position: (start, lexer.end_pos()),
293 })
294 }
295 Some(c) if !c.is_ascii_punctuation() && !c.is_whitespace() => {
296 let mut buf = Vec::new();
298 while let Some(cont) = lexer.peek() {
299 if is_symbol_cont(cont) {
300 buf.push(lexer.next_char().unwrap());
301 } else {
302 break;
303 }
304 }
305 Ok(Token {
306 value: TokenType::Atom(std::sync::Arc::from(buf)),
307 position: (start, lexer.end_pos()),
308 })
309 }
310 _ => Err(std::sync::Arc::from(format!(
311 "Error[ksl::token::lex_atom_or_char]: Invalid atom at `({}, {})`.",
312 start.0, start.1
313 ))),
314 }
315}
316
317fn lex_symbol(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
318 let start = lexer.current_pos();
319 let mut buf = Vec::new();
320 while let Some(c) = lexer.peek() {
321 if is_symbol_cont(c) {
322 buf.push(lexer.next_char().unwrap());
323 } else {
324 break;
325 }
326 }
327 Ok(Token {
328 value: TokenType::Symbol(std::sync::Arc::from(buf)),
329 position: (start, lexer.end_pos()),
330 })
331}