use logos::Logos;
use std::fmt;
use std::ops::Range;

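/// A byte-offset span into the source text, convertible to and from
/// `std::ops::Range<usize>`.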
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}

impl From<Range<usize>> for Span {
    fn from(range: Range<usize>) -> Self {
        Self {
            start: range.start,
            end: range.end,
        }
    }
}

impl From<Span> for Range<usize> {
    fn from(span: Span) -> Self {
        span.start..span.end
    }
}

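/// Lexical tokens for a Beancount-style ledger grammar. Inline spaces and
/// tabs are skipped; newlines are significant, and the indentation and error
/// tokens at the end of this enum are synthesized by [`tokenize`] rather
/// than matched by the lexer itself.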
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
#[logos(skip r"[ \t]+")]
pub enum Token<'src> {
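    /// A date such as `2024-01-15`; both `-` and `/` separators are accepted.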
    #[regex(r"\d{4}[-/]\d{2}[-/]\d{2}")]
    Date(&'src str),

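    /// A decimal number with an optional sign, optional comma thousands
    /// separators, and an optional fractional part (e.g. `-1,234.56` or `.5`).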
    #[regex(r"-?(\.\d+|(\d{1,3}(,\d{3})*|\d+)(\.\d*)?)")]
    Number(&'src str),

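    /// A double-quoted string, including its quotes; backslash escapes are
    /// matched but not decoded.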
    #[regex(r#""([^"\\]|\\.)*""#)]
    String(&'src str),

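    /// A colon-separated account name such as `Assets:Bank:Checking`. The
    /// first segment starts with an uppercase ASCII letter or any non-ASCII
    /// character; later segments may also start with a digit, so Unicode
    /// names like `Assets:沪深300` are accepted.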
    #[regex(r"([A-Z]|[^\x00-\x7F])([A-Za-z0-9-]|[^\x00-\x7F])*(:([A-Z0-9]|[^\x00-\x7F])([A-Za-z0-9-]|[^\x00-\x7F])*)+")]
    Account(&'src str),

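    /// A currency or commodity code: either `/`-prefixed, or a bare code of
    /// at least two characters (e.g. `USD`), leaving single uppercase
    /// letters available to the flag rule.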
    #[regex(r"/[A-Z][A-Z0-9'._-]*|[A-Z][A-Z0-9'._-]+")]
    Currency(&'src str),

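    /// A `#`-prefixed tag such as `#my-tag`.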
    #[regex(r"#[a-zA-Z0-9-_/.]+")]
    Tag(&'src str),

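    /// A `^`-prefixed link such as `^my-link`.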
    #[regex(r"\^[a-zA-Z0-9-_/.]+")]
    Link(&'src str),

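    // Directive keywords and boolean/null literals.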
    #[token("txn")]
    Txn,
    #[token("balance")]
    Balance,
    #[token("open")]
    Open,
    #[token("close")]
    Close,
    #[token("commodity")]
    Commodity,
    #[token("pad")]
    Pad,
    #[token("event")]
    Event,
    #[token("query")]
    Query,
    #[token("note")]
    Note,
    #[token("document")]
    Document,
    #[token("price")]
    Price,
    #[token("custom")]
    Custom,
    #[token("option")]
    Option_,
    #[token("include")]
    Include,
    #[token("plugin")]
    Plugin,
    #[token("pushtag")]
    Pushtag,
    #[token("poptag")]
    Poptag,
    #[token("pushmeta")]
    Pushmeta,
    #[token("popmeta")]
    Popmeta,
    #[token("TRUE")]
    #[token("True")]
    #[token("true")]
    True,
    #[token("FALSE")]
    #[token("False")]
    #[token("false")]
    False,
    #[token("NULL")]
    Null,

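    // Punctuation and operators.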
    #[token("{{")]
    LDoubleBrace,
    #[token("}}")]
    RDoubleBrace,
    #[token("{#")]
    LBraceHash,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("@@")]
    AtAt,
    #[token("@")]
    At,
    #[token(":")]
    Colon,
    #[token(",")]
    Comma,
    #[token("~")]
    Tilde,
    #[token("|")]
    Pipe,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,

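    /// The `!` pending-transaction flag.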
    #[token("!")]
    Pending,

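    /// A single-letter transaction flag (`P`, `S`, `T`, `C`, `U`, `R`, `M`,
    /// `?`, or `&`).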
    #[regex(r"[PSTCURM?&]")]
    Flag(&'src str),

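    /// A line ending (`\n` or `\r\n`).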
    #[regex(r"\r?\n")]
    Newline,

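    /// A `;` comment running to the end of the line.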
    #[regex(r";[^\n\r]*")]
    Comment(&'src str),

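    /// A bare `#`. When it appears in column zero, [`tokenize`] expands it
    /// into a full-line [`Token::Comment`].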
    #[token("#")]
    Hash,

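    /// A `%` comment running to the end of the line.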
    #[regex(r"%[^\n\r]*")]
    PercentComment(&'src str),

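    /// A `#!` shebang line.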
    #[regex(r"#![^\n\r]*")]
    Shebang(&'src str),

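    /// An org-mode style `#+...` directive line.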
    #[regex(r"#\+[^\n\r]*")]
    EmacsDirective(&'src str),

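    /// A metadata key including its trailing colon, e.g. `filename:`.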
    #[regex(r"[a-zA-Z][a-zA-Z0-9_-]*:")]
    MetaKey(&'src str),

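    /// Synthesized by [`tokenize`] for leading whitespace of one or two
    /// effective spaces (a tab counts as four); never matched by the lexer.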
    Indent(usize),

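    /// Synthesized by [`tokenize`] for leading whitespace of three or more
    /// effective spaces.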
    DeepIndent(usize),

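    /// Synthesized by [`tokenize`] for source text the lexer could not match.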
    Error(&'src str),
}

impl Token<'_> {
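    /// Returns `true` for tokens accepted as a transaction flag: `*`, `!`,
    /// `#`, or a single-letter [`Token::Flag`].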
    pub const fn is_txn_flag(&self) -> bool {
        matches!(
            self,
            Self::Star | Self::Pending | Self::Flag(_) | Self::Hash
        )
    }

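    /// Returns `true` for any directive keyword (`txn`, `balance`, `open`,
    /// and so on).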
    pub const fn is_directive_keyword(&self) -> bool {
        matches!(
            self,
            Self::Txn
                | Self::Balance
                | Self::Open
                | Self::Close
                | Self::Commodity
                | Self::Pad
                | Self::Event
                | Self::Query
                | Self::Note
                | Self::Document
                | Self::Price
                | Self::Custom
                | Self::Option_
                | Self::Include
                | Self::Plugin
                | Self::Pushtag
                | Self::Poptag
                | Self::Pushmeta
                | Self::Popmeta
        )
    }
}

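/// Renders each token as its source text, with readable placeholders for
/// newlines and the synthesized indentation tokens.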
impl fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Date(s) => write!(f, "{s}"),
            Self::Number(s) => write!(f, "{s}"),
            Self::String(s) => write!(f, "{s}"),
            Self::Account(s) => write!(f, "{s}"),
            Self::Currency(s) => write!(f, "{s}"),
            Self::Tag(s) => write!(f, "{s}"),
            Self::Link(s) => write!(f, "{s}"),
            Self::Txn => write!(f, "txn"),
            Self::Balance => write!(f, "balance"),
            Self::Open => write!(f, "open"),
            Self::Close => write!(f, "close"),
            Self::Commodity => write!(f, "commodity"),
            Self::Pad => write!(f, "pad"),
            Self::Event => write!(f, "event"),
            Self::Query => write!(f, "query"),
            Self::Note => write!(f, "note"),
            Self::Document => write!(f, "document"),
            Self::Price => write!(f, "price"),
            Self::Custom => write!(f, "custom"),
            Self::Option_ => write!(f, "option"),
            Self::Include => write!(f, "include"),
            Self::Plugin => write!(f, "plugin"),
            Self::Pushtag => write!(f, "pushtag"),
            Self::Poptag => write!(f, "poptag"),
            Self::Pushmeta => write!(f, "pushmeta"),
            Self::Popmeta => write!(f, "popmeta"),
            Self::True => write!(f, "TRUE"),
            Self::False => write!(f, "FALSE"),
            Self::Null => write!(f, "NULL"),
            Self::LDoubleBrace => write!(f, "{{{{"),
            Self::RDoubleBrace => write!(f, "}}}}"),
            Self::LBraceHash => write!(f, "{{#"),
            Self::LBrace => write!(f, "{{"),
            Self::RBrace => write!(f, "}}"),
            Self::LParen => write!(f, "("),
            Self::RParen => write!(f, ")"),
            Self::AtAt => write!(f, "@@"),
            Self::At => write!(f, "@"),
            Self::Colon => write!(f, ":"),
            Self::Comma => write!(f, ","),
            Self::Tilde => write!(f, "~"),
            Self::Pipe => write!(f, "|"),
            Self::Plus => write!(f, "+"),
            Self::Minus => write!(f, "-"),
            Self::Star => write!(f, "*"),
            Self::Slash => write!(f, "/"),
            Self::Pending => write!(f, "!"),
            Self::Flag(s) => write!(f, "{s}"),
            Self::Newline => write!(f, "\\n"),
            Self::Comment(s) => write!(f, "{s}"),
            Self::Hash => write!(f, "#"),
            Self::PercentComment(s) => write!(f, "{s}"),
            Self::Shebang(s) => write!(f, "{s}"),
            Self::EmacsDirective(s) => write!(f, "{s}"),
            Self::MetaKey(s) => write!(f, "{s}"),
            Self::Indent(n) => write!(f, "<indent:{n}>"),
            Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
            Self::Error(s) => write!(f, "{s}"),
        }
    }
}

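/// Tokenizes `source` into `(Token, Span)` pairs.
///
/// This wraps the raw `logos` lexer with line-aware handling: a `#` in
/// column zero swallows the rest of its line as a [`Token::Comment`],
/// leading whitespace after a newline becomes [`Token::Indent`] or
/// [`Token::DeepIndent`], and unlexable input is surfaced as
/// [`Token::Error`] rather than dropped.
///
/// A minimal usage sketch (assertions illustrative, crate path elided):
///
/// ```ignore
/// let tokens = tokenize("2024-01-15 open Assets:Bank");
/// assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
/// assert!(matches!(tokens[1].0, Token::Open));
/// ```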
pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);
    let mut at_line_start = true;
    let mut last_newline_end = 0usize;

    while let Some(result) = lexer.next() {
        let span = lexer.span();

        match result {
            Ok(Token::Newline) => {
                tokens.push((Token::Newline, span.clone().into()));
                at_line_start = true;
                last_newline_end = span.end;
            }
            // A `#` in column zero (as in org-mode) begins a full-line
            // comment; collapse the rest of the line into one token.
            Ok(Token::Hash) if at_line_start && span.start == last_newline_end => {
                let comment_start = span.start;
                // Stop before the line terminator so a trailing `\r` from a
                // CRLF ending is not swallowed into the comment text.
                let line_end = source[span.end..]
                    .find(|c: char| c == '\r' || c == '\n')
                    .map_or(source.len(), |i| span.end + i);
                let comment_text = &source[comment_start..line_end];
                tokens.push((
                    Token::Comment(comment_text),
                    Span {
                        start: comment_start,
                        end: line_end,
                    },
                ));
                // Discard whatever the lexer makes of the comment body and
                // resume normal tokenization after the newline.
                while let Some(peek_result) = lexer.next() {
                    let peek_span = lexer.span();
                    let peek_end = peek_span.end;
                    if peek_result == Ok(Token::Newline) {
                        tokens.push((Token::Newline, peek_span.into()));
                        at_line_start = true;
                        last_newline_end = peek_end;
                        break;
                    }
                }
            }
            Ok(token) => {
                // Leading whitespace was skipped by the lexer; recover it
                // from the gap between the last newline and this token.
                if at_line_start && span.start > last_newline_end {
                    let leading = &source[last_newline_end..span.start];
                    let mut space_count = 0;
                    let mut char_count = 0;
                    for c in leading.chars() {
                        match c {
                            ' ' => {
                                space_count += 1;
                                char_count += 1;
                            }
                            '\t' => {
                                // A tab counts as four spaces of indentation.
                                space_count += 4;
                                char_count += 1;
                            }
                            _ => break,
                        }
                    }
                    if space_count >= 1 {
                        let indent_start = last_newline_end;
                        let indent_end = last_newline_end + char_count;
                        // Three or more effective spaces mark a deep indent;
                        // one or two mark a shallow indent.
                        let indent_token = if space_count >= 3 {
                            Token::DeepIndent(space_count)
                        } else {
                            Token::Indent(space_count)
                        };
                        tokens.push((
                            indent_token,
                            Span {
                                start: indent_start,
                                end: indent_end,
                            },
                        ));
                    }
                }
                at_line_start = false;
                tokens.push((token, span.into()));
            }
            Err(()) => {
                // Surface unlexable text as an `Error` token instead of
                // silently dropping it.
                at_line_start = false;
                let invalid_text = &source[span.clone()];
                tokens.push((Token::Error(invalid_text), span.into()));
            }
        }
    }

    tokens
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_date() {
        let tokens = tokenize("2024-01-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = tokenize("1234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("1234.56")));

        let tokens = tokenize("-1,234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("-1,234.56")));
    }

    #[test]
    fn test_tokenize_account() {
        let tokens = tokenize("Assets:Bank:Checking");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Bank:Checking")
        ));
    }

    #[test]
    fn test_tokenize_account_unicode() {
        let tokens = tokenize("Assets:CORP✨");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:CORP✨")));

        let tokens = tokenize("Assets:沪深300");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:沪深300")));

        let tokens = tokenize("Assets:日本銀行");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:日本銀行")));

        let tokens = tokenize("Assets:Café");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:Café")));

        let tokens = tokenize("Assets:€uro");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Account("Assets:€uro")));

        let tokens = tokenize("Assets:Test💰Account");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Test💰Account")
        ));
    }

    #[test]
    fn test_tokenize_currency() {
        let tokens = tokenize("USD");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("USD")));
    }

    #[test]
    fn test_tokenize_string() {
        let tokens = tokenize(r#""Hello, World!""#);
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::String(r#""Hello, World!""#)));
    }

    #[test]
    fn test_tokenize_keywords() {
        let tokens = tokenize("txn balance open close");
        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].0, Token::Txn));
        assert!(matches!(tokens[1].0, Token::Balance));
        assert!(matches!(tokens[2].0, Token::Open));
        assert!(matches!(tokens[3].0, Token::Close));
    }

    #[test]
    fn test_tokenize_tag_and_link() {
        let tokens = tokenize("#my-tag ^my-link");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Tag("#my-tag")));
        assert!(matches!(tokens[1].0, Token::Link("^my-link")));
    }

    #[test]
    fn test_tokenize_comment() {
        let tokens = tokenize("; This is a comment");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Comment("; This is a comment")));
    }

    #[test]
    fn test_tokenize_indentation() {
        let tokens = tokenize("txn\n Assets:Bank 100 USD");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(_))));
    }

    #[test]
    fn test_tokenize_transaction_line() {
        let source = "2024-01-15 * \"Grocery Store\" #food\n Expenses:Food 50.00 USD";
        let tokens = tokenize(source);

        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Date(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Star)));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::String(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Tag(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Newline)));
        assert!(
            tokens
                .iter()
                .any(|(t, _)| matches!(t, Token::Indent(_) | Token::DeepIndent(_)))
        );
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Account(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Number(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Currency(_))));
    }

    #[test]
    fn test_tokenize_metadata_key() {
        let tokens = tokenize("filename:");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::MetaKey("filename:")));
    }

    #[test]
    fn test_tokenize_punctuation() {
        let tokens = tokenize("{ } @ @@ , ~");
        let token_types: Vec<_> = tokens.iter().map(|(t, _)| t.clone()).collect();
        assert!(token_types.contains(&Token::LBrace));
        assert!(token_types.contains(&Token::RBrace));
        assert!(token_types.contains(&Token::At));
        assert!(token_types.contains(&Token::AtAt));
        assert!(token_types.contains(&Token::Comma));
        assert!(token_types.contains(&Token::Tilde));
    }
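
    // The two tests below are additional sketches for the synthesized token
    // paths (full-line `#` comments and `Error` tokens); the inputs are
    // illustrative, not drawn from a real ledger.
    #[test]
    fn test_tokenize_full_line_hash_comment() {
        let tokens = tokenize("# heading\n2024-01-15 open Assets:Bank");
        assert!(matches!(tokens[0].0, Token::Comment("# heading")));
        assert!(matches!(tokens[1].0, Token::Newline));
        assert!(matches!(tokens[2].0, Token::Date("2024-01-15")));
    }

    #[test]
    fn test_tokenize_error_token() {
        // A backtick matches no lexer rule and should surface as `Error`.
        let tokens = tokenize("`");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Error("`")));
    }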
}