lex_core/lex/token/line.rs
1//! Line-based token types for the lexer pipeline
2//!
3//! This module contains token types specific to the line-based lexer pipeline. Being line
4//! based, all the grammar needs is to have line tokens in order to parse any level of elements.
5//! Only annotations and end of verbatim blocks use data nodes, that means that pretty much all
6//! of Lex needs to be parsed from naturally occurring text lines, indentation and blank lines.
7//!
8//! Since this still is happening in the lexing stage, each line must be tokenized into one
9//! category. In the real world, a line might be more than one possible category. For example a
10//! line might have a sequence marker and a subject marker (for example "1. Recap:").
11//!
12//! For this reason, line tokens can be OR tokens at times, and at other times the order of
13//! line categorization is crucial to getting the right result. While there are only a few
14//! consequential marks in lines (blank, data, subject, list) having them denormalized is
15//! required to have parsing simpler.
16//!
17//! The LineType enum is the definitive set: blank, annotation start, data, subject, list,
18//! subject-or-list-item, paragraph, dialog, indent, dedent. Containers are a separate
19//! structural node, not a line token.
20//!
21//! Line Types
22//!
23//! These are the line tokens:
24//!
25//! - BlankLine: empty or whitespace only
26//! - DataMarkerLine: a data marker in closed form (:: label params? ::)
27//! - DataLine: :: label params? (no closing :: marker)
28//! - SubjectLine: Line ending with colon (could be subject/definition/session title)
29//! - ListLine: Line starting with list marker (-, 1., a., I., etc.)
30//! - SubjectOrListItemLine: Line starting with list marker and ending with colon
31//! - ParagraphLine: Any other line (paragraph text)
32//! - DialogLine: a line that starts with a dash, but is marked not to be a list item.
33//! - Indent / Dedent: structural markers passed through from indentation handling.
34//! - DocumentStart: synthetic marker for document content boundary.
35//!
36//! And to represent a group of lines at the same level, there is a LineContainer.
37//!
38//! See [classify_line_tokens](crate::lex::lexing::line_classification::classify_line_tokens)
39//! for the classification logic and ordering.
40
41use std::fmt;
42
43use super::core::Token;
44
/// A line token represents one logical line created from grouped raw tokens.
///
/// Line tokens are produced by the line token transformation,
/// which groups raw tokens into semantic line units. Each line token stores:
/// - The original raw tokens that created it (for location information and AST construction)
/// - The line type (what kind of line this is)
/// - Individual token spans (to enable byte-accurate text extraction from token subsets)
///
/// By preserving raw tokens and their individual spans, we can later
/// pass them directly to existing AST constructors (using the same unified approach as
/// the parser), which handles all location tracking and AST node creation automatically.
///
/// Note: LineToken does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes from the individual token_spans when needed.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct LineToken {
    /// The original raw tokens that comprise this line
    pub source_tokens: Vec<Token>,

    /// The byte range in source code for each token.
    ///
    /// Invariant: must be the same length as `source_tokens`; entry `i` is the
    /// span of `source_tokens[i]`. `source_token_pairs` relies on this pairing.
    pub token_spans: Vec<std::ops::Range<usize>>,

    /// The type/classification of this line
    pub line_type: LineType,
}
71
72impl LineToken {
73 /// Get source tokens as (Token, Range<usize>) pairs.
74 ///
75 /// This creates owned pairs from the separate source_tokens and token_spans vectors.
76 /// Used by the AST construction facade to get tokens in the format expected by
77 /// the token processing utilities.
78 ///
79 /// Note: LineToken stores tokens and spans separately for serialization efficiency.
80 /// This method creates the paired format needed for location tracking.
81 pub fn source_token_pairs(&self) -> Vec<(Token, std::ops::Range<usize>)> {
82 self.source_tokens
83 .iter()
84 .zip(self.token_spans.iter())
85 .map(|(token, span)| (token.clone(), span.clone()))
86 .collect()
87 }
88}
89
/// The classification of a line token
///
/// Produced by line classification; see the module docs for the ordering rules
/// that decide between overlapping categories (e.g. list marker + trailing colon).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum LineType {
    /// Blank line (empty or whitespace only)
    BlankLine,

    /// Data marker line: a data marker in closed form (:: label params? ::).
    /// Used for both annotation headers and verbatim closing lines.
    DataMarkerLine,

    /// Data line: :: label params? (no closing :: marker)
    DataLine,

    /// Line ending with colon (could be subject/definition/session title)
    SubjectLine,

    /// Line starting with list marker (-, 1., a., I., etc.)
    ListLine,

    /// Line starting with list marker and ending with colon (subject and list item combined)
    SubjectOrListItemLine,

    /// Any other line (paragraph text)
    ParagraphLine,

    /// Line that is part of a dialog (starts with a dash but is marked not to be a list item)
    DialogLine,

    /// Indentation marker (pass-through from prior transformation)
    Indent,

    /// Dedentation marker (pass-through from prior transformation)
    Dedent,

    /// Document start marker (synthetic)
    ///
    /// Marks the boundary between document-level metadata (annotations) and document content.
    /// Injected by DocumentStartMarker transformation at:
    /// - Position 0 if no document-level annotations
    /// - Immediately after the last document-level annotation otherwise
    ///
    /// This enables grammar rules to reason about document structure and position.
    DocumentStart,
}
134
135impl fmt::Display for LineType {
136 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
137 let name = match self {
138 LineType::BlankLine => "BLANK_LINE",
139 LineType::DataMarkerLine => "DATA_MARKER_LINE",
140 LineType::DataLine => "DATA_LINE",
141 LineType::SubjectLine => "SUBJECT_LINE",
142 LineType::ListLine => "LIST_LINE",
143 LineType::SubjectOrListItemLine => "SUBJECT_OR_LIST_ITEM_LINE",
144 LineType::ParagraphLine => "PARAGRAPH_LINE",
145 LineType::DialogLine => "DIALOG_LINE",
146 LineType::Indent => "INDENT",
147 LineType::Dedent => "DEDENT",
148 LineType::DocumentStart => "DOCUMENT_START",
149 };
150 write!(f, "{name}")
151 }
152}
153
154impl LineType {
155 /// Format token type as grammar notation: `<token-name>`
156 ///
157 /// Converts UPPER_CASE_WITH_UNDERSCORES to <lower-case-with-dashes>
158 ///
159 /// Examples:
160 /// - BlankLine -> `<blank-line>`
161 /// - DataMarkerLine -> `<data-marker-line>`
162 /// - SubjectLine -> `<subject-line>`
163 pub fn to_grammar_string(&self) -> String {
164 let name = match self {
165 LineType::BlankLine => "blank-line",
166 LineType::DataMarkerLine => "data-marker-line",
167 LineType::DataLine => "data-line",
168 LineType::SubjectLine => "subject-line",
169 LineType::ListLine => "list-line",
170 LineType::SubjectOrListItemLine => "subject-or-list-item-line",
171 LineType::ParagraphLine => "paragraph-line",
172 LineType::DialogLine => "dialog-line",
173 LineType::Indent => "indent",
174 LineType::Dedent => "dedent",
175 LineType::DocumentStart => "document-start-line",
176 };
177 format!("<{name}>")
178 }
179}
180
/// The primary tree structure for the lexer output.
///
/// This is a recursive enum representing the complete hierarchical structure of line tokens.
/// Every node in the tree is either a line token or a container of child nodes.
///
/// The tree is built by processing Indent/Dedent markers:
/// - Token variant: A single line token (e.g., SubjectLine, ParagraphLine, ListLine)
/// - Container variant: A grouped set of child nodes at a deeper indentation level
///
/// This structure allows the parser to match patterns by checking token types while
/// maintaining the complete source structure (source tokens, nesting).
///
/// Note: Container does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes by recursively unrolling children to their source tokens.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum LineContainer {
    /// A single line token (leaf node)
    Token(LineToken),

    /// A container of child nodes (represents indented content or grouped lines at same level)
    Container { children: Vec<LineContainer> },
}
203
204impl LineContainer {
205 /// Check if this container is empty (only valid for root containers)
206 pub fn is_empty(&self) -> bool {
207 match self {
208 LineContainer::Token(_) => false,
209 LineContainer::Container { children, .. } => children.is_empty(),
210 }
211 }
212}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: tokenizing an indented data marker must not panic.
    ///
    /// Intentionally has no assertions on the resulting token sequence; it only
    /// exercises the tokenizer on indented input and prints the tokens for debugging.
    #[test]
    fn test_tokenize_indented_marker() {
        use crate::lex::lexing::tokenize;
        let source = " ::";
        let tokens_with_range = tokenize(source);
        let tokens: Vec<crate::lex::token::Token> =
            tokens_with_range.into_iter().map(|(t, _)| t).collect();
        println!("Tokens: {tokens:?}");
    }

    /// Every LineType variant must render its documented grammar notation.
    #[test]
    fn test_token_type_to_grammar_string() {
        assert_eq!(LineType::BlankLine.to_grammar_string(), "<blank-line>");
        assert_eq!(
            LineType::DataMarkerLine.to_grammar_string(),
            "<data-marker-line>"
        );
        assert_eq!(LineType::DataLine.to_grammar_string(), "<data-line>");
        assert_eq!(LineType::SubjectLine.to_grammar_string(), "<subject-line>");
        assert_eq!(LineType::ListLine.to_grammar_string(), "<list-line>");
        assert_eq!(
            LineType::SubjectOrListItemLine.to_grammar_string(),
            "<subject-or-list-item-line>"
        );
        assert_eq!(
            LineType::ParagraphLine.to_grammar_string(),
            "<paragraph-line>"
        );
        assert_eq!(LineType::DialogLine.to_grammar_string(), "<dialog-line>");
        assert_eq!(LineType::Indent.to_grammar_string(), "<indent>");
        assert_eq!(LineType::Dedent.to_grammar_string(), "<dedent>");
        // DocumentStart intentionally keeps a `-line` suffix in grammar notation.
        assert_eq!(
            LineType::DocumentStart.to_grammar_string(),
            "<document-start-line>"
        );
    }

    #[test]
    fn test_token_sequence_formatting() {
        // Test creating a sequence of tokens and formatting them
        let tokens = [
            LineType::SubjectLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(formatted, "<subject-line><indent><paragraph-line><dedent>");
    }

    #[test]
    fn test_blank_line_group_formatting() {
        let tokens = [
            LineType::BlankLine,
            LineType::BlankLine,
            LineType::BlankLine,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(formatted, "<blank-line><blank-line><blank-line>");
    }

    #[test]
    fn test_complex_pattern_formatting() {
        // Session pattern: blank + content + blank + container
        let tokens = [
            LineType::BlankLine,
            LineType::SubjectLine,
            LineType::BlankLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(
            formatted,
            "<blank-line><subject-line><blank-line><indent><paragraph-line><dedent>"
        );
    }

    /// source_token_pairs must preserve token order and pair each token with its span.
    #[test]
    fn test_line_token_source_token_pairs() {
        let line_token = LineToken {
            source_tokens: vec![
                Token::Text("hello".to_string()),
                Token::Whitespace(1),
                Token::Text("world".to_string()),
            ],
            token_spans: vec![0..5, 5..6, 6..11],
            line_type: LineType::ParagraphLine,
        };

        let pairs = line_token.source_token_pairs();
        assert_eq!(pairs.len(), 3);
        assert_eq!(pairs[0].1, 0..5);
        assert_eq!(pairs[1].1, 5..6);
        assert_eq!(pairs[2].1, 6..11);

        // Verify tokens match
        match &pairs[0].0 {
            Token::Text(s) => assert_eq!(s, "hello"),
            _ => panic!("Expected Text token"),
        }
    }
}
338}