Skip to main content

agm_core/parser/
lexer.rs

1//! Lexer: classifies each line of an AGM source file into a `LineKind`.
2//!
3//! This is a hand-written, line-oriented lexer. No parser combinators are used.
4//! Rules are applied in a strict priority order; the first match wins.
5
6use crate::error::{AgmError, ErrorCode, ErrorLocation};
7
8// ---------------------------------------------------------------------------
9// Types
10// ---------------------------------------------------------------------------
11
/// Classification of a single line in an AGM source file.
///
/// Values are produced by `classify_line`; each variant carries only the text
/// that the lexer extracted for that kind of line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LineKind {
    /// Line that is empty or consists only of spaces.
    Blank,
    /// Line whose first non-space character is `#` and that is not an
    /// `# expect:` header.
    Comment,
    /// `node <id>` line; holds the trimmed id, which is empty for a bare `node`.
    NodeDeclaration(String),
    /// `key: value` line; holds the key and the trimmed, non-empty value.
    ScalarField(String, String),
    /// `key: [a, b]` line; holds the key and the trimmed, non-empty items.
    InlineListField(String, Vec<String>),
    /// `key:` line with nothing after the colon; holds the key.
    FieldStart(String),
    /// `- text` line; holds the text after the `- ` marker (empty for a bare `-`).
    ListItem(String),
    /// Any other non-blank line; holds the line with surrounding whitespace removed.
    IndentedLine(String),
    /// `body: |` line that introduces a body block.
    BodyMarker,
    /// `# expect: ...` line; holds the trimmed text after the prefix.
    TestExpectHeader(String),
}
26
27/// A single classified line from an AGM source file.
28#[derive(Debug, Clone, PartialEq)]
29pub struct Line {
30    pub kind: LineKind,
31    pub number: usize,
32    pub indent: usize,
33    pub raw: String,
34}
35
36// ---------------------------------------------------------------------------
37// Helper functions
38// ---------------------------------------------------------------------------
39
/// Locates the first tab character in `s`.
///
/// Returns the byte offset of that tab, or `None` when `s` contains no tab.
fn find_tab(s: &str) -> Option<usize> {
    s.find('\t')
}
44
/// Measures the indentation of `s` as its number of leading ASCII spaces.
///
/// Only the space character counts; any other character (including a tab)
/// ends the run.
fn count_indent(s: &str) -> usize {
    // A space is exactly one byte, so the byte-length difference equals the
    // number of spaces removed.
    let without_indent = s.trim_start_matches(' ');
    s.len() - without_indent.len()
}
49
/// Checks whether `key` is a valid field name.
///
/// A valid key is non-empty, starts with an ASCII letter or `_`, and continues
/// with ASCII letters, ASCII digits or `_` only (`[a-zA-Z_][a-zA-Z0-9_]*`).
fn is_valid_field_key(key: &str) -> bool {
    let is_head = |b: u8| b.is_ascii_alphabetic() || b == b'_';
    let is_tail = |b: u8| b.is_ascii_alphanumeric() || b == b'_';

    // Working on bytes is sufficient: every byte of a non-ASCII character is
    // >= 0x80 and therefore fails both predicates.
    match key.as_bytes().split_first() {
        Some((&head, tail)) => is_head(head) && tail.iter().all(|&b| is_tail(b)),
        None => false,
    }
}
59
60/// Parses an inline list value such as `[a, b, c]`.
61///
62/// Strips outer `[]`, splits by `,`, trims each item, and filters empty strings.
63/// Returns `Err` with P007 if the value starts with `[` but does not end with `]`.
64fn parse_inline_list(value: &str, line_number: usize) -> Result<Vec<String>, AgmError> {
65    // value has already been confirmed to start with '[' by caller
66    if !value.ends_with(']') {
67        return Err(AgmError::new(
68            ErrorCode::P007,
69            "Invalid inline list syntax",
70            ErrorLocation::new(None, Some(line_number), None),
71        ));
72    }
73    let inner = &value[1..value.len() - 1];
74    let items: Vec<String> = inner
75        .split(',')
76        .map(|s| s.trim().to_string())
77        .filter(|s| !s.is_empty())
78        .collect();
79    Ok(items)
80}
81
82// ---------------------------------------------------------------------------
83// Core classification
84// ---------------------------------------------------------------------------
85
86/// Classifies a single raw line.
87///
88/// Rules are applied in strict priority order; the first match wins.
89pub fn classify_line(raw: &str, line_number: usize) -> Result<Line, AgmError> {
90    // Rule 1 — Tab check
91    if find_tab(raw).is_some() {
92        return Err(AgmError::new(
93            ErrorCode::P004,
94            "Tab character in indentation (spaces required)",
95            ErrorLocation::new(None, Some(line_number), None),
96        ));
97    }
98
99    let trimmed = raw.trim();
100    let indent = count_indent(raw);
101
102    // Rule 2 — Blank
103    if trimmed.is_empty() {
104        return Ok(Line {
105            kind: LineKind::Blank,
106            number: line_number,
107            indent: 0,
108            raw: raw.to_string(),
109        });
110    }
111
112    // Rule 3 — TestExpectHeader: starts with "# expect:"
113    if let Some(rest) = trimmed.strip_prefix("# expect:") {
114        return Ok(Line {
115            kind: LineKind::TestExpectHeader(rest.trim().to_string()),
116            number: line_number,
117            indent: 0,
118            raw: raw.to_string(),
119        });
120    }
121
122    // Rule 4 — Comment: starts with '#'
123    if trimmed.starts_with('#') {
124        return Ok(Line {
125            kind: LineKind::Comment,
126            number: line_number,
127            indent,
128            raw: raw.to_string(),
129        });
130    }
131
132    // Rule 5 — NodeDeclaration: trimmed == "node" OR starts with "node "
133    if trimmed == "node" || trimmed.starts_with("node ") {
134        let id = if trimmed == "node" {
135            ""
136        } else {
137            trimmed["node ".len()..].trim()
138        };
139        return Ok(Line {
140            kind: LineKind::NodeDeclaration(id.to_string()),
141            number: line_number,
142            indent,
143            raw: raw.to_string(),
144        });
145    }
146
147    // Rule 6 — BodyMarker: starts with "body:" AND rest after "body:" trimmed == "|"
148    if let Some(rest) = trimmed.strip_prefix("body:") {
149        if rest.trim() == "|" {
150            return Ok(Line {
151                kind: LineKind::BodyMarker,
152                number: line_number,
153                indent,
154                raw: raw.to_string(),
155            });
156        }
157        // Fall through to field rules below
158    }
159
160    // Rules 7-9: colon-based field rules
161    if let Some(colon_pos) = raw.find(':') {
162        let key_raw = &raw[..colon_pos];
163        let key = key_raw.trim();
164        let value_raw = &raw[colon_pos + 1..];
165        let value = value_raw.trim();
166
167        if is_valid_field_key(key) {
168            // Rule 7 — InlineListField: value starts with '['
169            if value.starts_with('[') {
170                if !value.ends_with(']') {
171                    return Err(AgmError::new(
172                        ErrorCode::P007,
173                        "Invalid inline list syntax",
174                        ErrorLocation::new(None, Some(line_number), None),
175                    ));
176                }
177                let items = parse_inline_list(value, line_number)?;
178                return Ok(Line {
179                    kind: LineKind::InlineListField(key.to_string(), items),
180                    number: line_number,
181                    indent,
182                    raw: raw.to_string(),
183                });
184            }
185
186            // Rule 8 — ScalarField: value is non-empty
187            if !value.is_empty() {
188                return Ok(Line {
189                    kind: LineKind::ScalarField(key.to_string(), value.to_string()),
190                    number: line_number,
191                    indent,
192                    raw: raw.to_string(),
193                });
194            }
195
196            // Rule 9 — FieldStart: value is empty
197            return Ok(Line {
198                kind: LineKind::FieldStart(key.to_string()),
199                number: line_number,
200                indent,
201                raw: raw.to_string(),
202            });
203        }
204    }
205
206    // Rule 10 — ListItem: raw stripped of leading spaces starts with "- " or equals "-"
207    let stripped = raw.trim_start_matches(' ');
208    if stripped.starts_with("- ") || stripped == "-" {
209        let value = if stripped == "-" {
210            ""
211        } else {
212            &stripped["- ".len()..]
213        };
214        return Ok(Line {
215            kind: LineKind::ListItem(value.to_string()),
216            number: line_number,
217            indent,
218            raw: raw.to_string(),
219        });
220    }
221
222    // Rule 11 — IndentedLine: indent > 0
223    if indent > 0 {
224        return Ok(Line {
225            kind: LineKind::IndentedLine(trimmed.to_string()),
226            number: line_number,
227            indent,
228            raw: raw.to_string(),
229        });
230    }
231
232    // Rule 12 — Fallback
233    Ok(Line {
234        kind: LineKind::IndentedLine(trimmed.to_string()),
235        number: line_number,
236        indent: 0,
237        raw: raw.to_string(),
238    })
239}
240
241// ---------------------------------------------------------------------------
242// Lex function
243// ---------------------------------------------------------------------------
244
245/// Lexes an entire AGM source string into a vector of classified lines.
246///
247/// Returns `Ok(lines)` if all lines are valid, or `Err(errors)` listing every
248/// line that failed classification.
249pub fn lex(input: &str) -> Result<Vec<Line>, Vec<AgmError>> {
250    let mut lines = Vec::new();
251    let mut errors = Vec::new();
252    for (idx, raw_line) in input.lines().enumerate() {
253        match classify_line(raw_line, idx + 1) {
254            Ok(line) => lines.push(line),
255            Err(err) => errors.push(err),
256        }
257    }
258    if errors.is_empty() {
259        Ok(lines)
260    } else {
261        Err(errors)
262    }
263}
264
265// ---------------------------------------------------------------------------
266// Tests
267// ---------------------------------------------------------------------------
268
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::ErrorCode;

    // ---- Helpers ----

    /// Classifies `raw` as line 1, panicking if classification fails.
    #[track_caller]
    fn classified(raw: &str) -> Line {
        classify_line(raw, 1).unwrap()
    }

    /// Asserts that `raw` classifies to `expected`.
    #[track_caller]
    fn assert_kind(raw: &str, expected: LineKind) {
        assert_eq!(classified(raw).kind, expected);
    }

    /// Asserts that `raw` classifies to `expected` with the given indent.
    #[track_caller]
    fn assert_kind_and_indent(raw: &str, expected: LineKind, indent: usize) {
        let line = classified(raw);
        assert_eq!(line.kind, expected);
        assert_eq!(line.indent, indent);
    }

    /// Asserts that classifying `raw` fails with the `expected` error code.
    #[track_caller]
    fn assert_error_code(raw: &str, expected: ErrorCode) {
        let err = classify_line(raw, 1).unwrap_err();
        assert_eq!(err.code, expected);
    }

    // ---- A: Blank Lines ----

    #[test]
    fn test_classify_empty_string_returns_blank() {
        assert_kind_and_indent("", LineKind::Blank, 0);
    }

    #[test]
    fn test_classify_spaces_only_returns_blank() {
        assert_kind_and_indent("   ", LineKind::Blank, 0);
    }

    #[test]
    fn test_classify_single_space_returns_blank() {
        assert_kind_and_indent(" ", LineKind::Blank, 0);
    }

    // ---- B: Comments ----

    #[test]
    fn test_classify_hash_comment_returns_comment() {
        assert_kind("# comment", LineKind::Comment);
    }

    #[test]
    fn test_classify_hash_only_returns_comment() {
        assert_kind("#", LineKind::Comment);
    }

    #[test]
    fn test_classify_indented_comment_returns_comment_with_indent() {
        assert_kind_and_indent("  # indented comment", LineKind::Comment, 2);
    }

    // ---- C: TestExpectHeader ----

    #[test]
    fn test_classify_expect_header_with_content_returns_test_expect_header() {
        assert_kind(
            "# expect: error AGM-P004",
            LineKind::TestExpectHeader(String::from("error AGM-P004")),
        );
    }

    #[test]
    fn test_classify_expect_header_empty_rest_returns_test_expect_header() {
        assert_kind("# expect:", LineKind::TestExpectHeader(String::new()));
    }

    #[test]
    fn test_classify_expect_without_space_returns_comment_not_test_expect() {
        assert_kind("#expect: foo", LineKind::Comment);
    }

    // ---- D: Node Declarations ----

    #[test]
    fn test_classify_node_with_id_returns_node_declaration() {
        assert_kind(
            "node auth.login",
            LineKind::NodeDeclaration(String::from("auth.login")),
        );
    }

    #[test]
    fn test_classify_node_with_dotted_id_returns_node_declaration() {
        assert_kind(
            "node billing.invoice.create",
            LineKind::NodeDeclaration(String::from("billing.invoice.create")),
        );
    }

    #[test]
    fn test_classify_node_alone_returns_node_declaration_empty_id() {
        assert_kind("node", LineKind::NodeDeclaration(String::new()));
    }

    #[test]
    fn test_classify_node_with_extra_spaces_trims_id() {
        assert_kind(
            "node   auth.login  ",
            LineKind::NodeDeclaration(String::from("auth.login")),
        );
    }

    // ---- E: BodyMarker ----

    #[test]
    fn test_classify_body_pipe_returns_body_marker() {
        assert_kind("body: |", LineKind::BodyMarker);
    }

    #[test]
    fn test_classify_body_pipe_with_spaces_returns_body_marker() {
        assert_kind("body:  |  ", LineKind::BodyMarker);
    }

    #[test]
    fn test_classify_body_pipe_with_suffix_returns_scalar_field() {
        assert_kind(
            "body: |something",
            LineKind::ScalarField(String::from("body"), String::from("|something")),
        );
    }

    // ---- F: Inline Lists ----

    #[test]
    fn test_classify_inline_list_multiple_items_returns_inline_list_field() {
        let expected_items = vec![String::from("auth"), String::from("security")];
        assert_kind(
            "tags: [auth, security]",
            LineKind::InlineListField(String::from("tags"), expected_items),
        );
    }

    #[test]
    fn test_classify_inline_list_single_item_returns_inline_list_field() {
        assert_kind(
            "tags: [auth]",
            LineKind::InlineListField(String::from("tags"), vec![String::from("auth")]),
        );
    }

    #[test]
    fn test_classify_inline_list_empty_returns_inline_list_field_empty() {
        assert_kind(
            "tags: []",
            LineKind::InlineListField(String::from("tags"), Vec::new()),
        );
    }

    #[test]
    fn test_classify_inline_list_unclosed_returns_err_p007() {
        assert_error_code("tags: [auth, security", ErrorCode::P007);
    }

    // ---- G: Scalar Fields ----

    #[test]
    fn test_classify_scalar_field_simple_returns_scalar_field() {
        assert_kind(
            "type: workflow",
            LineKind::ScalarField(String::from("type"), String::from("workflow")),
        );
    }

    #[test]
    fn test_classify_scalar_field_with_colon_in_value_keeps_rest() {
        assert_kind(
            "summary: Rule: no tabs allowed",
            LineKind::ScalarField(
                String::from("summary"),
                String::from("Rule: no tabs allowed"),
            ),
        );
    }

    #[test]
    fn test_classify_scalar_field_trims_value_whitespace() {
        assert_kind(
            "type:   workflow  ",
            LineKind::ScalarField(String::from("type"), String::from("workflow")),
        );
    }

    // ---- H: Field Start ----

    #[test]
    fn test_classify_field_start_no_value_returns_field_start() {
        assert_kind("items:", LineKind::FieldStart(String::from("items")));
    }

    #[test]
    fn test_classify_field_start_with_trailing_spaces_returns_field_start() {
        assert_kind("items:   ", LineKind::FieldStart(String::from("items")));
    }

    // ---- I: List Items ----

    #[test]
    fn test_classify_list_item_with_content_returns_list_item_with_indent() {
        assert_kind_and_indent(
            "  - first item",
            LineKind::ListItem(String::from("first item")),
            2,
        );
    }

    #[test]
    fn test_classify_list_item_dash_only_returns_list_item_empty() {
        assert_kind_and_indent("  -", LineKind::ListItem(String::new()), 2);
    }

    #[test]
    fn test_classify_list_item_no_space_after_dash_returns_indented_line() {
        assert_kind_and_indent(
            "  -value",
            LineKind::IndentedLine(String::from("-value")),
            2,
        );
    }

    // ---- J: Indented Lines ----

    #[test]
    fn test_classify_indented_text_returns_indented_line_with_indent() {
        assert_kind_and_indent(
            "  Some block text",
            LineKind::IndentedLine(String::from("Some block text")),
            2,
        );
    }

    #[test]
    fn test_classify_deeply_indented_text_returns_indented_line() {
        assert_kind_and_indent(
            "      deep text",
            LineKind::IndentedLine(String::from("deep text")),
            6,
        );
    }

    // ---- K: Tab Rejection ----

    #[test]
    fn test_classify_tab_at_start_returns_err_p004() {
        assert_error_code("\ttype: workflow", ErrorCode::P004);
    }

    #[test]
    fn test_classify_tab_in_middle_returns_err_p004() {
        assert_error_code("type:\tworkflow", ErrorCode::P004);
    }

    #[test]
    fn test_classify_tab_only_returns_err_p004() {
        assert_error_code("\t", ErrorCode::P004);
    }

    // ---- L: lex() Integration ----

    #[test]
    fn test_lex_valid_snippet_returns_ok_with_correct_lines() {
        let lines = lex("node auth.login\ntype: workflow\nsummary: Login flow\n").unwrap();
        let expected_kinds = [
            LineKind::NodeDeclaration(String::from("auth.login")),
            LineKind::ScalarField(String::from("type"), String::from("workflow")),
            LineKind::ScalarField(String::from("summary"), String::from("Login flow")),
        ];

        assert_eq!(lines.len(), 3);
        for (line, expected) in lines.iter().zip(expected_kinds.iter()) {
            assert_eq!(&line.kind, expected);
        }
    }

    #[test]
    fn test_lex_two_tab_lines_returns_err_with_two_p004_errors() {
        let errors = lex("\ttype: workflow\nsummary: ok\n\tversion: 1\n").unwrap_err();
        assert_eq!(errors.len(), 2);
        for error in &errors {
            assert!(error.code == ErrorCode::P004);
        }
    }

    #[test]
    fn test_lex_empty_input_returns_ok_empty_vec() {
        let lines = lex("").unwrap();
        assert_eq!(lines, Vec::<Line>::new());
    }
}