use logos::Logos;
use std::fmt;
use std::ops::Range;

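/// A byte range into the source text.
///
/// Stored as plain `start`/`end` offsets so it can be `Copy`; the `From`
/// impls below bridge to the `Range<usize>` spans that `logos` reports.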
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}

impl From<Range<usize>> for Span {
    fn from(range: Range<usize>) -> Self {
        Self {
            start: range.start,
            end: range.end,
        }
    }
}

impl From<Span> for Range<usize> {
    fn from(span: Span) -> Self {
        span.start..span.end
    }
}

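/// A single lexical token, borrowing its text from the source.
///
/// Horizontal whitespace is skipped by the lexer itself; newlines are
/// significant and surface as [`Token::Newline`]. `Indent`, `DeepIndent`,
/// and `Error` carry no `logos` rule — they are synthesized by [`tokenize`].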
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
#[logos(skip r"[ \t]+")]
pub enum Token<'src> {
38 #[regex(r"\d{4}[-/]\d{2}[-/]\d{2}")]
41 Date(&'src str),
42
43 #[regex(r"-?(\.\d+|(\d{1,3}(,\d{3})*|\d+)(\.\d*)?)")]
47 Number(&'src str),
48
49 #[regex(r#""([^"\\]|\\.)*""#)]
52 String(&'src str),
53
54 #[regex(r"([A-Z]|[^\x00-\x7F])([A-Za-z0-9-]|[^\x00-\x7F])*(:([A-Z0-9]|[^\x00-\x7F])([A-Za-z0-9-]|[^\x00-\x7F])*)+")]
63 Account(&'src str),
64
65 #[regex(r"/[A-Z][A-Z0-9'._-]*|[A-Z][A-Z0-9'._-]+")]
72 Currency(&'src str),
73
74 #[regex(r"#[a-zA-Z0-9-_/.]+")]
76 Tag(&'src str),
77
78 #[regex(r"\^[a-zA-Z0-9-_/.]+")]
80 Link(&'src str),
81
82 #[token("txn")]
86 Txn,
87 #[token("balance")]
89 Balance,
90 #[token("open")]
92 Open,
93 #[token("close")]
95 Close,
96 #[token("commodity")]
98 Commodity,
99 #[token("pad")]
101 Pad,
102 #[token("event")]
104 Event,
105 #[token("query")]
107 Query,
108 #[token("note")]
110 Note,
111 #[token("document")]
113 Document,
114 #[token("price")]
116 Price,
117 #[token("custom")]
119 Custom,
120 #[token("option")]
122 Option_,
123 #[token("include")]
125 Include,
126 #[token("plugin")]
128 Plugin,
129 #[token("pushtag")]
131 Pushtag,
132 #[token("poptag")]
134 Poptag,
135 #[token("pushmeta")]
137 Pushmeta,
138 #[token("popmeta")]
140 Popmeta,
141 #[token("TRUE")]
143 #[token("True")]
144 #[token("true")]
145 True,
146 #[token("FALSE")]
148 #[token("False")]
149 #[token("false")]
150 False,
151 #[token("NULL")]
153 Null,
154
155 #[token("{{")]
159 LDoubleBrace,
160 #[token("}}")]
162 RDoubleBrace,
163 #[token("{#")]
165 LBraceHash,
166 #[token("{")]
168 LBrace,
169 #[token("}")]
171 RBrace,
172 #[token("(")]
174 LParen,
175 #[token(")")]
177 RParen,
178 #[token("@@")]
180 AtAt,
181 #[token("@")]
183 At,
184 #[token(":")]
186 Colon,
187 #[token(",")]
189 Comma,
190 #[token("~")]
192 Tilde,
193 #[token("|")]
195 Pipe,
196 #[token("+")]
198 Plus,
199 #[token("-")]
201 Minus,
202 #[token("*")]
204 Star,
205 #[token("/")]
207 Slash,
208
209 #[token("!")]
212 Pending,
213
214 #[regex(r"[PSTCURM?&]")]
217 Flag(&'src str),
218
219 #[regex(r"\r?\n")]
222 Newline,
223
224 #[regex(r";[^\n\r]*", allow_greedy = true)]
227 Comment(&'src str),
228
229 #[token("#")]
234 Hash,
235
236 #[regex(r"%[^\n\r]*", allow_greedy = true)]
239 PercentComment(&'src str),
240
241 #[regex(r"#![^\n\r]*", allow_greedy = true)]
244 Shebang(&'src str),
245
246 #[regex(r"#\+[^\n\r]*", allow_greedy = true)]
249 EmacsDirective(&'src str),
250
251 #[regex(r"[a-zA-Z][a-zA-Z0-9_-]*:")]
255 MetaKey(&'src str),
256
257 Indent(usize),
261
262 DeepIndent(usize),
264
265 Error(&'src str),
268}
269
impl Token<'_> {
    /// Returns `true` for tokens that can flag a transaction:
    /// `*`, `!`, `#`, or a single-letter [`Token::Flag`].
    pub const fn is_txn_flag(&self) -> bool {
        matches!(
            self,
            Self::Star | Self::Pending | Self::Flag(_) | Self::Hash
        )
    }

    /// Returns `true` for keywords that can begin a directive.
    pub const fn is_directive_keyword(&self) -> bool {
        matches!(
            self,
            Self::Txn
                | Self::Balance
                | Self::Open
                | Self::Close
                | Self::Commodity
                | Self::Pad
                | Self::Event
                | Self::Query
                | Self::Note
                | Self::Document
                | Self::Price
                | Self::Custom
                | Self::Option_
                | Self::Include
                | Self::Plugin
                | Self::Pushtag
                | Self::Poptag
                | Self::Pushmeta
                | Self::Popmeta
        )
    }
}

impl fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Date(s) => write!(f, "{s}"),
            Self::Number(s) => write!(f, "{s}"),
            Self::String(s) => write!(f, "{s}"),
            Self::Account(s) => write!(f, "{s}"),
            Self::Currency(s) => write!(f, "{s}"),
            Self::Tag(s) => write!(f, "{s}"),
            Self::Link(s) => write!(f, "{s}"),
            Self::Txn => write!(f, "txn"),
            Self::Balance => write!(f, "balance"),
            Self::Open => write!(f, "open"),
            Self::Close => write!(f, "close"),
            Self::Commodity => write!(f, "commodity"),
            Self::Pad => write!(f, "pad"),
            Self::Event => write!(f, "event"),
            Self::Query => write!(f, "query"),
            Self::Note => write!(f, "note"),
            Self::Document => write!(f, "document"),
            Self::Price => write!(f, "price"),
            Self::Custom => write!(f, "custom"),
            Self::Option_ => write!(f, "option"),
            Self::Include => write!(f, "include"),
            Self::Plugin => write!(f, "plugin"),
            Self::Pushtag => write!(f, "pushtag"),
            Self::Poptag => write!(f, "poptag"),
            Self::Pushmeta => write!(f, "pushmeta"),
            Self::Popmeta => write!(f, "popmeta"),
            Self::True => write!(f, "TRUE"),
            Self::False => write!(f, "FALSE"),
            Self::Null => write!(f, "NULL"),
            Self::LDoubleBrace => write!(f, "{{{{"),
            Self::RDoubleBrace => write!(f, "}}}}"),
            Self::LBraceHash => write!(f, "{{#"),
            Self::LBrace => write!(f, "{{"),
            Self::RBrace => write!(f, "}}"),
            Self::LParen => write!(f, "("),
            Self::RParen => write!(f, ")"),
            Self::AtAt => write!(f, "@@"),
            Self::At => write!(f, "@"),
            Self::Colon => write!(f, ":"),
            Self::Comma => write!(f, ","),
            Self::Tilde => write!(f, "~"),
            Self::Pipe => write!(f, "|"),
            Self::Plus => write!(f, "+"),
            Self::Minus => write!(f, "-"),
            Self::Star => write!(f, "*"),
            Self::Slash => write!(f, "/"),
            Self::Pending => write!(f, "!"),
            Self::Flag(s) => write!(f, "{s}"),
            Self::Newline => write!(f, "\\n"),
            Self::Comment(s) => write!(f, "{s}"),
            Self::Hash => write!(f, "#"),
            Self::PercentComment(s) => write!(f, "{s}"),
            Self::Shebang(s) => write!(f, "{s}"),
            Self::EmacsDirective(s) => write!(f, "{s}"),
            Self::MetaKey(s) => write!(f, "{s}"),
            Self::Indent(n) => write!(f, "<indent:{n}>"),
            Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
            Self::Error(s) => write!(f, "{s}"),
        }
    }
}

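/// Tokenizes `source` into `(Token, Span)` pairs.
///
/// On top of the raw `logos` rules, this pass:
/// * synthesizes [`Token::Indent`]/[`Token::DeepIndent`] from leading
///   whitespace after a newline (a tab counts as four columns; four or more
///   columns make a `DeepIndent`),
/// * folds a line that starts with a bare `#` into a single
///   [`Token::Comment`], so org-mode style headings do not leak `Hash`
///   tokens into the stream, and
/// * wraps unrecognized input as [`Token::Error`] instead of aborting, so
///   callers can report every lexical error in one pass.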
pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);
    let mut at_line_start = true;
    let mut last_newline_end = 0usize;

    while let Some(result) = lexer.next() {
        let span = lexer.span();

        match result {
            Ok(Token::Newline) => {
                tokens.push((Token::Newline, span.clone().into()));
                at_line_start = true;
                last_newline_end = span.end;
            }
            // A bare `#` in column zero starts a comment line (org-mode
            // heading style): capture the rest of the line as one token.
            Ok(Token::Hash) if at_line_start && span.start == last_newline_end => {
                let comment_start = span.start;
                let line_end = source[span.end..]
                    .find('\n')
                    .map_or(source.len(), |i| span.end + i);
                let comment_text = &source[comment_start..line_end];
                tokens.push((
                    Token::Comment(comment_text),
                    Span {
                        start: comment_start,
                        end: line_end,
                    },
                ));
                // Discard whatever the lexer makes of the comment body,
                // resuming normal lexing at the next newline.
                while let Some(peek_result) = lexer.next() {
                    let peek_span = lexer.span();
                    let peek_end = peek_span.end;
                    if peek_result == Ok(Token::Newline) {
                        tokens.push((Token::Newline, peek_span.into()));
                        at_line_start = true;
                        last_newline_end = peek_end;
                        break;
                    }
                }
            }
            Ok(token) => {
                // First token on a line: if the lexer skipped leading
                // whitespace to reach it, synthesize an indentation token.
                if at_line_start && span.start > last_newline_end {
                    let leading = &source[last_newline_end..span.start];
                    let mut space_count = 0;
                    let mut char_count = 0;
                    for c in leading.chars() {
                        match c {
                            ' ' => {
                                space_count += 1;
                                char_count += 1;
                            }
                            '\t' => {
                                // A tab advances four columns.
                                space_count += 4;
                                char_count += 1;
                            }
                            _ => break,
                        }
                    }
                    if space_count >= 1 {
                        let indent_start = last_newline_end;
                        // Spaces and tabs are one byte each, so `char_count`
                        // doubles as a byte offset.
                        let indent_end = last_newline_end + char_count;
                        let indent_token = if space_count >= 4 {
                            Token::DeepIndent(space_count)
                        } else {
                            Token::Indent(space_count)
                        };
                        tokens.push((
                            indent_token,
                            Span {
                                start: indent_start,
                                end: indent_end,
                            },
                        ));
                    }
                }
                at_line_start = false;
                tokens.push((token, span.into()));
            }
            Err(()) => {
                // Keep the unmatched text so callers can report it.
                at_line_start = false;
                let invalid_text = &source[span.clone()];
                tokens.push((Token::Error(invalid_text), span.into()));
            }
        }
    }

    tokens
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_date() {
        let tokens = tokenize("2024-01-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = tokenize("1234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("1234.56")));

        let tokens = tokenize("-1,234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("-1,234.56")));
    }

    #[test]
    fn test_tokenize_account() {
        let tokens = tokenize("Assets:Bank:Checking");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Bank:Checking")
        ));
    }

    #[test]
    fn test_tokenize_account_unicode() {
        let tokens = tokenize("Assets:CORP✨");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:CORP✨")));

        let tokens = tokenize("Assets:沪深300");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:沪深300")));

        let tokens = tokenize("Assets:日本銀行");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:日本銀行")));

        let tokens = tokenize("Assets:Café");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:Café")));

        let tokens = tokenize("Assets:€uro");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:€uro")));

        let tokens = tokenize("Assets:Test💰Account");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Test💰Account")
        ));
    }

    #[test]
    fn test_tokenize_currency() {
        let tokens = tokenize("USD");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("USD")));
    }

    #[test]
    fn test_tokenize_string() {
        let tokens = tokenize(r#""Hello, World!""#);
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::String(r#""Hello, World!""#)));
    }

    #[test]
    fn test_tokenize_keywords() {
        let tokens = tokenize("txn balance open close");
        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].0, Token::Txn));
        assert!(matches!(tokens[1].0, Token::Balance));
        assert!(matches!(tokens[2].0, Token::Open));
        assert!(matches!(tokens[3].0, Token::Close));
    }

    #[test]
    fn test_tokenize_tag_and_link() {
        let tokens = tokenize("#my-tag ^my-link");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Tag("#my-tag")));
        assert!(matches!(tokens[1].0, Token::Link("^my-link")));
    }

    #[test]
    fn test_tokenize_comment() {
        let tokens = tokenize("; This is a comment");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Comment("; This is a comment")));
    }
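
    #[test]
    fn test_tokenize_hash_comment_line() {
        // A line-leading bare `#` is folded into one Comment token covering
        // the rest of the line; lexing resumes after the newline.
        let tokens = tokenize("# org heading\n2024-01-15");
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0].0, Token::Comment("# org heading")));
        assert!(matches!(tokens[1].0, Token::Newline));
        assert!(matches!(tokens[2].0, Token::Date("2024-01-15")));
    }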

    #[test]
    fn test_tokenize_indentation() {
        let tokens = tokenize("txn\n Assets:Bank 100 USD");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(_))));
    }
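
    #[test]
    fn test_tokenize_deep_indentation() {
        // Four or more columns of leading whitespace become DeepIndent.
        let tokens = tokenize("txn\n    Assets:Bank");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::DeepIndent(4))));
    }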

    #[test]
    fn test_tokenize_transaction_line() {
        let source = "2024-01-15 * \"Grocery Store\" #food\n Expenses:Food 50.00 USD";
        let tokens = tokenize(source);

        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Date(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Star)));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::String(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Tag(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Newline)));
        assert!(
            tokens
                .iter()
                .any(|(t, _)| matches!(t, Token::Indent(_) | Token::DeepIndent(_)))
        );
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Account(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Number(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Currency(_))));
    }
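
    #[test]
    fn test_token_predicates() {
        // `*`, `!`, `#`, and single-letter flags can all flag a transaction;
        // directive keywords are disjoint from flags.
        assert!(Token::Star.is_txn_flag());
        assert!(Token::Pending.is_txn_flag());
        assert!(Token::Hash.is_txn_flag());
        assert!(Token::Flag("P").is_txn_flag());
        assert!(!Token::Txn.is_txn_flag());
        assert!(Token::Txn.is_directive_keyword());
        assert!(!Token::Star.is_directive_keyword());
    }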

    #[test]
    fn test_tokenize_metadata_key() {
        let tokens = tokenize("filename:");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::MetaKey("filename:")));
    }

    #[test]
    fn test_tokenize_punctuation() {
        let tokens = tokenize("{ } @ @@ , ~");
        let token_types: Vec<_> = tokens.iter().map(|(t, _)| t.clone()).collect();
        assert!(token_types.contains(&Token::LBrace));
        assert!(token_types.contains(&Token::RBrace));
        assert!(token_types.contains(&Token::At));
        assert!(token_types.contains(&Token::AtAt));
        assert!(token_types.contains(&Token::Comma));
        assert!(token_types.contains(&Token::Tilde));
    }
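
    #[test]
    fn test_tokenize_error_recovery() {
        // Text matched by no rule is surfaced as Token::Error rather than
        // aborting the scan; exact chunking of longer invalid input is up to
        // logos' error recovery, so only a single character is checked here.
        let tokens = tokenize("`");
        assert!(!tokens.is_empty());
        assert!(matches!(tokens[0].0, Token::Error("`")));
    }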
}