// lex_core/lex/token/line.rs
1//! Line-based token types for the lexer pipeline
2//!
3//!     This module contains token types specific to the line-based lexer pipeline. Being line
4//!     based, all the grammar needs is to have line tokens in order to parse any level of elements.
5//!     Only annotations and end of verbatim blocks use data nodes, that means that pretty much all
6//!     of Lex needs to be parsed from naturally occurring text lines, indentation and blank lines.
7//!
8//!     Since this still is happening in the lexing stage, each line must be tokenized into one
9//!     category. In the real world, a line might be more than one possible category. For example a
10//!     line might have a sequence marker and a subject marker (for example "1. Recap:").
11//!
12//!     For this reason, line tokens can be OR tokens at times, and at other times the order of
13//!     line categorization is crucial to getting the right result. While there are only a few
//!     consequential marks in lines (blank, data, subject, list), having them denormalized is
//!     required to keep parsing simple.
16//!
17//!     The LineType enum is the definitive set: blank, annotation start/end, data, subject, list,
18//!     subject-or-list-item, paragraph, dialog, indent, dedent. Containers are a separate
19//!     structural node, not a line token.
20//!
21//! Line Types
22//!
23//!     These are the line tokens:
24//!
25//!         - BlankLine: empty or whitespace only
26//!         - AnnotationEndLine: a line starting with :: marker and having no further content
27//!         - AnnotationStartLine: a data node + lex marker
28//!         - DataLine: :: label params? (no closing :: marker)
29//!         - SubjectLine: Line ending with colon (could be subject/definition/session title)
30//!         - ListLine: Line starting with list marker (-, 1., a., I., etc.)
31//!         - SubjectOrListItemLine: Line starting with list marker and ending with colon
32//!         - ParagraphLine: Any other line (paragraph text)
33//!         - DialogLine: a line that starts with a dash, but is marked not to be a list item.
34//!         - Indent / Dedent: structural markers passed through from indentation handling.
35//!         - DocumentStart: synthetic marker for document content boundary.
36//!
37//!     And to represent a group of lines at the same level, there is a LineContainer.
38//!
39//!     See [classify_line_tokens](crate::lex::lexing::line_classification::classify_line_tokens)
40//!     for the classification logic and ordering.
41
42use std::fmt;
43
44use super::core::Token;
45
/// A line token represents one logical line created from grouped raw tokens.
///
/// Line tokens are produced by the line token transformation,
/// which groups raw tokens into semantic line units. Each line token stores:
/// - The original raw tokens that created it (for location information and AST construction)
/// - The line type (what kind of line this is)
/// - Individual token spans (to enable byte-accurate text extraction from token subsets)
///
/// By preserving raw tokens and their individual spans, we can later
/// pass them directly to existing AST constructors (using the same unified approach as
/// the parser), which handles all location tracking and AST node creation automatically.
///
/// Note: LineToken does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes from the individual token_spans when needed.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct LineToken {
    /// The original raw tokens that comprise this line
    pub source_tokens: Vec<Token>,

    /// The byte range in source code for each token.
    /// Invariant: must be the same length as source_tokens (parallel vectors);
    /// element i spans the bytes of source_tokens[i].
    pub token_spans: Vec<std::ops::Range<usize>>,

    /// The type/classification of this line
    pub line_type: LineType,
}
72
73impl LineToken {
74    /// Get source tokens as (Token, Range<usize>) pairs.
75    ///
76    /// This creates owned pairs from the separate source_tokens and token_spans vectors.
77    /// Used by the AST construction facade to get tokens in the format expected by
78    /// the token processing utilities.
79    ///
80    /// Note: LineToken stores tokens and spans separately for serialization efficiency.
81    /// This method creates the paired format needed for location tracking.
82    pub fn source_token_pairs(&self) -> Vec<(Token, std::ops::Range<usize>)> {
83        self.source_tokens
84            .iter()
85            .zip(self.token_spans.iter())
86            .map(|(token, span)| (token.clone(), span.clone()))
87            .collect()
88    }
89}
90
/// The classification of a line token
///
/// A line may syntactically match more than one category (e.g. "1. Recap:"
/// carries both a list marker and a subject colon), which is why the OR
/// variant `SubjectOrListItemLine` exists and why classification order
/// matters upstream.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum LineType {
    /// Blank line (empty or whitespace only)
    BlankLine,

    /// Annotation end line: a line starting with :: marker and having no further content
    AnnotationEndLine,

    /// Annotation start line: follows annotation grammar <lex-marker><space><label>(<space><parameters>)? <lex-marker> <content>?
    AnnotationStartLine,

    /// Data line: :: label params? (no closing :: marker)
    DataLine,

    /// Line ending with colon (could be subject/definition/session title)
    SubjectLine,

    /// Line starting with list marker (-, 1., a., I., etc.)
    ListLine,

    /// Line starting with list marker and ending with colon (subject and list item combined)
    SubjectOrListItemLine,

    /// Any other line (paragraph text)
    ParagraphLine,

    /// Line that is part of a dialog: starts with a dash but is marked not to be a list item
    DialogLine,

    /// Indentation marker (pass-through from prior transformation)
    Indent,

    /// Dedentation marker (pass-through from prior transformation)
    Dedent,

    /// Document start marker (synthetic)
    ///
    /// Marks the boundary between document-level metadata (annotations) and document content.
    /// Injected by DocumentStartMarker transformation at:
    /// - Position 0 if no document-level annotations
    /// - Immediately after the last document-level annotation otherwise
    ///
    /// This enables grammar rules to reason about document structure and position.
    DocumentStart,
}
137
138impl fmt::Display for LineType {
139    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
140        let name = match self {
141            LineType::BlankLine => "BLANK_LINE",
142            LineType::AnnotationEndLine => "ANNOTATION_END_LINE",
143            LineType::AnnotationStartLine => "ANNOTATION_START_LINE",
144            LineType::DataLine => "DATA_LINE",
145            LineType::SubjectLine => "SUBJECT_LINE",
146            LineType::ListLine => "LIST_LINE",
147            LineType::SubjectOrListItemLine => "SUBJECT_OR_LIST_ITEM_LINE",
148            LineType::ParagraphLine => "PARAGRAPH_LINE",
149            LineType::DialogLine => "DIALOG_LINE",
150            LineType::Indent => "INDENT",
151            LineType::Dedent => "DEDENT",
152            LineType::DocumentStart => "DOCUMENT_START",
153        };
154        write!(f, "{name}")
155    }
156}
157
158impl LineType {
159    /// Format token type as grammar notation: `<token-name>`
160    ///
161    /// Converts UPPER_CASE_WITH_UNDERSCORES to <lower-case-with-dashes>
162    ///
163    /// Examples:
164    /// - BlankLine -> `<blank-line>`
165    /// - AnnotationStartLine -> `<annotation-start-line>`
166    /// - SubjectLine -> `<subject-line>`
167    pub fn to_grammar_string(&self) -> String {
168        let name = match self {
169            LineType::BlankLine => "blank-line",
170            LineType::AnnotationEndLine => "annotation-end-line",
171            LineType::AnnotationStartLine => "annotation-start-line",
172            LineType::DataLine => "data-line",
173            LineType::SubjectLine => "subject-line",
174            LineType::ListLine => "list-line",
175            LineType::SubjectOrListItemLine => "subject-or-list-item-line",
176            LineType::ParagraphLine => "paragraph-line",
177            LineType::DialogLine => "dialog-line",
178            LineType::Indent => "indent",
179            LineType::Dedent => "dedent",
180            LineType::DocumentStart => "document-start-line",
181        };
182        format!("<{name}>")
183    }
184}
185
/// The primary tree structure for the lexer output.
///
/// This is a recursive enum representing the complete hierarchical structure of line tokens.
/// Every node in the tree is either a line token or a container of child nodes.
///
/// The tree is built by processing Indent/Dedent markers:
/// - Token variant: A single line token (e.g., SubjectLine, ParagraphLine, ListLine)
/// - Container variant: A grouped set of child nodes at a deeper indentation level
///
/// This structure allows the parser to match patterns by checking token types while
/// maintaining the complete source structure (source tokens, nesting).
///
/// Note: Container does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes by recursively unrolling children to their source tokens.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum LineContainer {
    /// A single line token (leaf node)
    Token(LineToken),

    /// A container of child nodes (represents indented content or grouped lines at same level)
    Container { children: Vec<LineContainer> },
}
208
209impl LineContainer {
210    /// Check if this container is empty (only valid for root containers)
211    pub fn is_empty(&self) -> bool {
212        match self {
213            LineContainer::Token(_) => false,
214            LineContainer::Container { children, .. } => children.is_empty(),
215        }
216    }
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_indented_marker() {
        use crate::lex::lexing::tokenize;
        // NOTE(review): this test only prints the tokens and asserts nothing;
        // once the expected token sequence for an indented marker is stable,
        // pin it with an assert_eq!.
        let source = "  ::";
        let tokens_with_range = tokenize(source);
        let tokens: Vec<crate::lex::token::Token> =
            tokens_with_range.into_iter().map(|(t, _)| t).collect();
        println!("Tokens: {tokens:?}");
    }

    #[test]
    fn test_token_type_to_grammar_string() {
        // Cover every LineType variant so a grammar string cannot silently
        // drift; previously DataLine and DialogLine were untested.
        assert_eq!(LineType::BlankLine.to_grammar_string(), "<blank-line>");
        assert_eq!(
            LineType::AnnotationStartLine.to_grammar_string(),
            "<annotation-start-line>"
        );
        assert_eq!(
            LineType::AnnotationEndLine.to_grammar_string(),
            "<annotation-end-line>"
        );
        assert_eq!(LineType::DataLine.to_grammar_string(), "<data-line>");
        assert_eq!(LineType::SubjectLine.to_grammar_string(), "<subject-line>");
        assert_eq!(LineType::ListLine.to_grammar_string(), "<list-line>");
        assert_eq!(
            LineType::SubjectOrListItemLine.to_grammar_string(),
            "<subject-or-list-item-line>"
        );
        assert_eq!(
            LineType::ParagraphLine.to_grammar_string(),
            "<paragraph-line>"
        );
        assert_eq!(LineType::DialogLine.to_grammar_string(), "<dialog-line>");
        assert_eq!(LineType::Indent.to_grammar_string(), "<indent>");
        assert_eq!(LineType::Dedent.to_grammar_string(), "<dedent>");
        assert_eq!(
            LineType::DocumentStart.to_grammar_string(),
            "<document-start-line>"
        );
    }

    #[test]
    fn test_token_sequence_formatting() {
        // Test creating a sequence of tokens and formatting them
        let tokens = [
            LineType::SubjectLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(formatted, "<subject-line><indent><paragraph-line><dedent>");
    }

    #[test]
    fn test_blank_line_group_formatting() {
        let tokens = [
            LineType::BlankLine,
            LineType::BlankLine,
            LineType::BlankLine,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(formatted, "<blank-line><blank-line><blank-line>");
    }

    #[test]
    fn test_complex_pattern_formatting() {
        // Session pattern: blank + content + blank + container
        let tokens = [
            LineType::BlankLine,
            LineType::SubjectLine,
            LineType::BlankLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(
            formatted,
            "<blank-line><subject-line><blank-line><indent><paragraph-line><dedent>"
        );
    }

    #[test]
    fn test_line_token_source_token_pairs() {
        // Test that LineToken can provide source tokens in paired format
        let line_token = LineToken {
            source_tokens: vec![
                Token::Text("hello".to_string()),
                Token::Whitespace(1),
                Token::Text("world".to_string()),
            ],
            token_spans: vec![0..5, 5..6, 6..11],
            line_type: LineType::ParagraphLine,
        };

        let pairs = line_token.source_token_pairs();
        assert_eq!(pairs.len(), 3);
        assert_eq!(pairs[0].1, 0..5);
        assert_eq!(pairs[1].1, 5..6);
        assert_eq!(pairs[2].1, 6..11);

        // Verify tokens match
        match &pairs[0].0 {
            Token::Text(s) => assert_eq!(s, "hello"),
            _ => panic!("Expected Text token"),
        }
    }
}