use logos::Logos;
use std::fmt;
use std::ops::Range;

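/// A byte-offset span into the source string, convertible to and from
/// `Range<usize>`.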
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}

impl From<Range<usize>> for Span {
    fn from(range: Range<usize>) -> Self {
        Self {
            start: range.start,
            end: range.end,
        }
    }
}

impl From<Span> for Range<usize> {
    fn from(span: Span) -> Self {
        span.start..span.end
    }
}

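/// Lexical tokens for a Beancount-style ledger grammar. Variants carrying a
/// `&'src str` borrow the matched slice from the source text.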
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
#[logos(skip r"[ \t]+")]
pub enum Token<'src> {
38 #[regex(r"\d{4}[-/]\d{2}[-/]\d{2}")]
41 Date(&'src str),
42
43 #[regex(r"-?(\.\d+|(\d{1,3}(,\d{3})*|\d+)(\.\d+)?)")]
46 Number(&'src str),
47
48 #[regex(r#""([^"\\]|\\.)*""#)]
51 String(&'src str),
52
53 #[regex(r"(Assets|Liabilities|Equity|Income|Expenses)(:[A-Za-z0-9][a-zA-Z0-9-]*)+")]
57 Account(&'src str),
58
59 #[regex(r"/[A-Z0-9'._-]+|[A-Z][A-Z0-9'._-]+")]
65 Currency(&'src str),
66
67 #[regex(r"#[a-zA-Z0-9-_/.]+")]
69 Tag(&'src str),
70
71 #[regex(r"\^[a-zA-Z0-9-_/.]+")]
73 Link(&'src str),
74
75 #[token("txn")]
79 Txn,
80 #[token("balance")]
82 Balance,
83 #[token("open")]
85 Open,
86 #[token("close")]
88 Close,
89 #[token("commodity")]
91 Commodity,
92 #[token("pad")]
94 Pad,
95 #[token("event")]
97 Event,
98 #[token("query")]
100 Query,
101 #[token("note")]
103 Note,
104 #[token("document")]
106 Document,
107 #[token("price")]
109 Price,
110 #[token("custom")]
112 Custom,
113 #[token("option")]
115 Option_,
116 #[token("include")]
118 Include,
119 #[token("plugin")]
121 Plugin,
122 #[token("pushtag")]
124 Pushtag,
125 #[token("poptag")]
127 Poptag,
128 #[token("pushmeta")]
130 Pushmeta,
131 #[token("popmeta")]
133 Popmeta,
134 #[token("TRUE")]
136 #[token("True")]
137 #[token("true")]
138 True,
139 #[token("FALSE")]
141 #[token("False")]
142 #[token("false")]
143 False,
144 #[token("NULL")]
146 Null,
147
148 #[token("{{")]
152 LDoubleBrace,
153 #[token("}}")]
155 RDoubleBrace,
156 #[token("{#")]
158 LBraceHash,
159 #[token("{")]
161 LBrace,
162 #[token("}")]
164 RBrace,
165 #[token("(")]
167 LParen,
168 #[token(")")]
170 RParen,
171 #[token("@@")]
173 AtAt,
174 #[token("@")]
176 At,
177 #[token(":")]
179 Colon,
180 #[token(",")]
182 Comma,
183 #[token("~")]
185 Tilde,
186 #[token("+")]
188 Plus,
189 #[token("-")]
191 Minus,
192 #[token("*")]
194 Star,
195 #[token("/")]
197 Slash,
198
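    /// The `!` flag, marking a transaction as pending/incomplete.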
199 #[token("!")]
202 Pending,
203
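    /// Other single-character transaction flags, as accepted by the
    /// `[PSTCURM#?%&]` class.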
204 #[regex(r"[PSTCURM#?%&]")]
207 Flag(&'src str),
208
209 #[regex(r"\r?\n")]
212 Newline,
213
214 #[regex(r";[^\n\r]*", allow_greedy = true)]
217 Comment(&'src str),
218
219 #[regex(r"#![^\n\r]*", allow_greedy = true)]
222 Shebang(&'src str),
223
224 #[regex(r"#\+[^\n\r]*", allow_greedy = true)]
227 EmacsDirective(&'src str),
228
229 #[regex(r"[a-zA-Z][a-zA-Z0-9_-]*:")]
233 MetaKey(&'src str),
234
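    /// Synthesized by [`tokenize`] for two or three spaces of leading
    /// indentation; never produced by the Logos lexer itself.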
    Indent(usize),

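    /// Synthesized by [`tokenize`] for four or more spaces of leading
    /// indentation (a tab counts as four spaces).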
    DeepIndent(usize),

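    /// Synthesized by [`tokenize`] for input no rule matches, carrying the
    /// offending slice so callers can report it.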
243 Error(&'src str),
246}
247
impl Token<'_> {
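    /// Returns `true` for tokens usable as a transaction flag: `*`, `!`, or a
    /// single-character [`Token::Flag`].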
    pub const fn is_txn_flag(&self) -> bool {
        matches!(self, Self::Star | Self::Pending | Self::Flag(_))
    }

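    /// Returns `true` for any directive keyword (`txn`, `balance`, `open`, ...).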
    pub const fn is_directive_keyword(&self) -> bool {
        matches!(
            self,
            Self::Txn
                | Self::Balance
                | Self::Open
                | Self::Close
                | Self::Commodity
                | Self::Pad
                | Self::Event
                | Self::Query
                | Self::Note
                | Self::Document
                | Self::Price
                | Self::Custom
                | Self::Option_
                | Self::Include
                | Self::Plugin
                | Self::Pushtag
                | Self::Poptag
                | Self::Pushmeta
                | Self::Popmeta
        )
    }
}

impl fmt::Display for Token<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Date(s) => write!(f, "{s}"),
            Self::Number(s) => write!(f, "{s}"),
            Self::String(s) => write!(f, "{s}"),
            Self::Account(s) => write!(f, "{s}"),
            Self::Currency(s) => write!(f, "{s}"),
            Self::Tag(s) => write!(f, "{s}"),
            Self::Link(s) => write!(f, "{s}"),
            Self::Txn => write!(f, "txn"),
            Self::Balance => write!(f, "balance"),
            Self::Open => write!(f, "open"),
            Self::Close => write!(f, "close"),
            Self::Commodity => write!(f, "commodity"),
            Self::Pad => write!(f, "pad"),
            Self::Event => write!(f, "event"),
            Self::Query => write!(f, "query"),
            Self::Note => write!(f, "note"),
            Self::Document => write!(f, "document"),
            Self::Price => write!(f, "price"),
            Self::Custom => write!(f, "custom"),
            Self::Option_ => write!(f, "option"),
            Self::Include => write!(f, "include"),
            Self::Plugin => write!(f, "plugin"),
            Self::Pushtag => write!(f, "pushtag"),
            Self::Poptag => write!(f, "poptag"),
            Self::Pushmeta => write!(f, "pushmeta"),
            Self::Popmeta => write!(f, "popmeta"),
            Self::True => write!(f, "TRUE"),
            Self::False => write!(f, "FALSE"),
            Self::Null => write!(f, "NULL"),
            Self::LDoubleBrace => write!(f, "{{{{"),
            Self::RDoubleBrace => write!(f, "}}}}"),
            Self::LBraceHash => write!(f, "{{#"),
            Self::LBrace => write!(f, "{{"),
            Self::RBrace => write!(f, "}}"),
            Self::LParen => write!(f, "("),
            Self::RParen => write!(f, ")"),
            Self::AtAt => write!(f, "@@"),
            Self::At => write!(f, "@"),
            Self::Colon => write!(f, ":"),
            Self::Comma => write!(f, ","),
            Self::Tilde => write!(f, "~"),
            Self::Plus => write!(f, "+"),
            Self::Minus => write!(f, "-"),
            Self::Star => write!(f, "*"),
            Self::Slash => write!(f, "/"),
            Self::Pending => write!(f, "!"),
            Self::Flag(s) => write!(f, "{s}"),
            Self::Newline => write!(f, "\\n"),
            Self::Comment(s) => write!(f, "{s}"),
            Self::Shebang(s) => write!(f, "{s}"),
            Self::EmacsDirective(s) => write!(f, "{s}"),
            Self::MetaKey(s) => write!(f, "{s}"),
            Self::Indent(n) => write!(f, "<indent:{n}>"),
            Self::DeepIndent(n) => write!(f, "<deep-indent:{n}>"),
            Self::Error(s) => write!(f, "{s}"),
        }
    }
}

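/// Lexes `source` into a flat list of `(Token, Span)` pairs.
///
/// Beyond what the Logos-derived lexer produces, this pass synthesizes
/// [`Token::Indent`] / [`Token::DeepIndent`] from the whitespace at the start
/// of a line (which the lexer otherwise skips) and converts unlexable input
/// into [`Token::Error`] rather than stopping.
///
/// A minimal usage sketch (the `crate::lexer` path is an assumption about
/// where this module lives, hence `ignore`):
///
/// ```ignore
/// use crate::lexer::{tokenize, Token};
///
/// let tokens = tokenize("2024-01-15 open Assets:Cash");
/// assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
/// assert!(matches!(tokens[1].0, Token::Open));
/// assert!(matches!(tokens[2].0, Token::Account("Assets:Cash")));
/// ```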
pub fn tokenize(source: &str) -> Vec<(Token<'_>, Span)> {
    let mut tokens = Vec::new();
    let mut lexer = Token::lexer(source);
    let mut at_line_start = true;
    let mut last_newline_end = 0usize;

    while let Some(result) = lexer.next() {
        let span = lexer.span();

        match result {
            Ok(Token::Newline) => {
                tokens.push((Token::Newline, span.clone().into()));
                at_line_start = true;
                last_newline_end = span.end;
            }
            Ok(token) => {
                // The lexer skips `[ \t]+`, so leading whitespace never shows
                // up as a token. Recover it here by measuring the gap between
                // the end of the last newline and the start of this token.
                if at_line_start && span.start > last_newline_end {
                    let leading = &source[last_newline_end..span.start];
                    let mut space_count = 0;
                    let mut char_count = 0;
                    for c in leading.chars() {
                        match c {
                            ' ' => {
                                space_count += 1;
                                char_count += 1;
                            }
                            '\t' => {
                                // A tab counts as four spaces of indentation.
                                space_count += 4;
                                char_count += 1;
                            }
                            _ => break,
                        }
                    }
                    if space_count >= 2 {
                        let indent_start = last_newline_end;
                        let indent_end = last_newline_end + char_count;
                        let indent_token = if space_count >= 4 {
                            Token::DeepIndent(space_count)
                        } else {
                            Token::Indent(space_count)
                        };
                        tokens.push((
                            indent_token,
                            Span {
                                start: indent_start,
                                end: indent_end,
                            },
                        ));
                    }
                }
                at_line_start = false;
                tokens.push((token, span.into()));
            }
            Err(()) => {
                // Keep lexing past invalid input, recording it as an Error token.
                at_line_start = false;
                let invalid_text = &source[span.clone()];
                tokens.push((Token::Error(invalid_text), span.into()));
            }
        }
    }

    tokens
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_date() {
        let tokens = tokenize("2024-01-15");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Date("2024-01-15")));
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = tokenize("1234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("1234.56")));

        let tokens = tokenize("-1,234.56");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Number("-1,234.56")));
    }

    #[test]
    fn test_tokenize_account() {
        let tokens = tokenize("Assets:Bank:Checking");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(
            tokens[0].0,
            Token::Account("Assets:Bank:Checking")
        ));
    }

    #[test]
    fn test_tokenize_currency() {
        let tokens = tokenize("USD");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Currency("USD")));
    }

    #[test]
    fn test_tokenize_string() {
        let tokens = tokenize(r#""Hello, World!""#);
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::String(r#""Hello, World!""#)));
    }

    #[test]
    fn test_tokenize_keywords() {
        let tokens = tokenize("txn balance open close");
        assert_eq!(tokens.len(), 4);
        assert!(matches!(tokens[0].0, Token::Txn));
        assert!(matches!(tokens[1].0, Token::Balance));
        assert!(matches!(tokens[2].0, Token::Open));
        assert!(matches!(tokens[3].0, Token::Close));
    }

    #[test]
    fn test_tokenize_tag_and_link() {
        let tokens = tokenize("#my-tag ^my-link");
        assert_eq!(tokens.len(), 2);
        assert!(matches!(tokens[0].0, Token::Tag("#my-tag")));
        assert!(matches!(tokens[1].0, Token::Link("^my-link")));
    }

    #[test]
    fn test_tokenize_comment() {
        let tokens = tokenize("; This is a comment");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::Comment("; This is a comment")));
    }

    #[test]
    fn test_tokenize_indentation() {
        let tokens = tokenize("txn\n  Assets:Bank 100 USD");
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Indent(_))));
    }

    #[test]
    fn test_tokenize_transaction_line() {
        let source = "2024-01-15 * \"Grocery Store\" #food\n  Expenses:Food 50.00 USD";
        let tokens = tokenize(source);

        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Date(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Star)));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::String(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Tag(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Newline)));
        assert!(
            tokens
                .iter()
                .any(|(t, _)| matches!(t, Token::Indent(_) | Token::DeepIndent(_)))
        );
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Account(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Number(_))));
        assert!(tokens.iter().any(|(t, _)| matches!(t, Token::Currency(_))));
    }

    #[test]
    fn test_tokenize_metadata_key() {
        let tokens = tokenize("filename:");
        assert_eq!(tokens.len(), 1);
        assert!(matches!(tokens[0].0, Token::MetaKey("filename:")));
    }

    #[test]
    fn test_tokenize_punctuation() {
        let tokens = tokenize("{ } @ @@ , ~");
        let token_types: Vec<_> = tokens.iter().map(|(t, _)| t.clone()).collect();
        assert!(token_types.contains(&Token::LBrace));
        assert!(token_types.contains(&Token::RBrace));
        assert!(token_types.contains(&Token::At));
        assert!(token_types.contains(&Token::AtAt));
        assert!(token_types.contains(&Token::Comma));
        assert!(token_types.contains(&Token::Tilde));
    }
}