lex_core/lex/token/
line.rs

1//! Line-based token types for the lexer pipeline
2//!
//!     This module contains token types specific to the line-based lexer pipeline. Because the
//!     pipeline is line based, the grammar only needs line tokens to parse elements at any level.
//!     Only annotations and the end of verbatim blocks use data nodes, which means that nearly all
//!     of Lex is parsed from naturally occurring text lines, indentation and blank lines.
7//!
//!     Since this still happens in the lexing stage, each line must be tokenized into one
//!     category. In the real world, a line might fit more than one category. For example, a
//!     line might have a sequence marker and a subject marker (as in "1. Recap:").
11//!
//!     For this reason, line tokens can sometimes be OR tokens, and at other times the order of
//!     line categorization is crucial to getting the right result. While there are only a few
//!     consequential marks in lines (blank, data, subject, list), having them denormalized is
//!     required to keep parsing simple.
16//!
//!     The LineType enum is the definitive set: blank, data marker, subject, list,
//!     subject-or-list-item, paragraph, dialog, indent, dedent, document start. Containers are a
//!     separate structural node, not a line token.
20//!
21//! Line Types
22//!
23//!     These are the line tokens:
24//!
25//!         - BlankLine: empty or whitespace only
26//!         - DataMarkerLine: a data marker in closed form (:: label params? ::)
27//!         - SubjectLine: Line ending with colon (could be subject/definition/session title)
28//!         - ListLine: Line starting with list marker (-, 1., a., I., etc.)
29//!         - SubjectOrListItemLine: Line starting with list marker and ending with colon
30//!         - ParagraphLine: Any other line (paragraph text)
31//!         - DialogLine: a line that starts with a dash, but is marked not to be a list item.
32//!         - Indent / Dedent: structural markers passed through from indentation handling.
33//!         - DocumentStart: synthetic marker for document content boundary.
34//!
35//!     And to represent a group of lines at the same level, there is a LineContainer.
36//!
37//!     See [classify_line_tokens](crate::lex::lexing::line_classification::classify_line_tokens)
38//!     for the classification logic and ordering.
39
40use std::fmt;
41
42use super::core::Token;
43
/// A line token represents one logical line created from grouped raw tokens.
///
/// Line tokens are produced by the line token transformation,
/// which groups raw tokens into semantic line units. Each line token stores:
/// - The original raw tokens that created it (for location information and AST construction)
/// - The line type (what kind of line this is)
/// - Individual token spans (to enable byte-accurate text extraction from token subsets)
///
/// By preserving raw tokens and their individual spans, we can later
/// pass them directly to existing AST constructors (using the same unified approach as
/// the parser), which handle all location tracking and AST node creation automatically.
///
/// Note: LineToken does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes from the individual token_spans when needed.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct LineToken {
    /// The original raw tokens that comprise this line
    pub source_tokens: Vec<Token>,

    /// The byte range in source code for each token.
    ///
    /// Must be the same length as `source_tokens`: the two vectors are index-aligned, so
    /// entry `i` here is the span of `source_tokens[i]`. The fields are public, so this
    /// invariant is the responsibility of whoever constructs the value.
    pub token_spans: Vec<std::ops::Range<usize>>,

    /// The type/classification of this line
    pub line_type: LineType,
}
70
71impl LineToken {
72    /// Get source tokens as (Token, Range<usize>) pairs.
73    ///
74    /// This creates owned pairs from the separate source_tokens and token_spans vectors.
75    /// Used by the AST construction facade to get tokens in the format expected by
76    /// the token processing utilities.
77    ///
78    /// Note: LineToken stores tokens and spans separately for serialization efficiency.
79    /// This method creates the paired format needed for location tracking.
80    pub fn source_token_pairs(&self) -> Vec<(Token, std::ops::Range<usize>)> {
81        self.source_tokens
82            .iter()
83            .zip(self.token_spans.iter())
84            .map(|(token, span)| (token.clone(), span.clone()))
85            .collect()
86    }
87}
88
/// The classification of a line token.
///
/// Each variant has an upper-case name via the `Display` impl and a `<lower-case>` grammar
/// name via `to_grammar_string`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum LineType {
    /// Blank line (empty or whitespace only)
    BlankLine,

    /// Data marker line: a data marker in closed form (:: label params? ::).
    /// Used for both annotation headers and verbatim closing lines.
    DataMarkerLine,

    /// Line ending with colon (could be subject/definition/session title)
    SubjectLine,

    /// Line starting with list marker (-, 1., a., I., etc.)
    ListLine,

    /// Line starting with list marker and ending with colon (subject and list item combined)
    SubjectOrListItemLine,

    /// Any other line (paragraph text)
    ParagraphLine,

    /// Line that is part of a dialog: it starts with a dash, but is marked not to be a
    /// list item.
    DialogLine,

    /// Indentation marker (pass-through from prior transformation)
    Indent,

    /// Dedentation marker (pass-through from prior transformation)
    Dedent,

    /// Document start marker (synthetic)
    ///
    /// Marks the boundary between document-level metadata (annotations) and document content.
    /// Injected by DocumentStartMarker transformation at:
    /// - Position 0 if no document-level annotations
    /// - Immediately after the last document-level annotation otherwise
    ///
    /// This enables grammar rules to reason about document structure and position.
    DocumentStart,
}
130
131impl fmt::Display for LineType {
132    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133        let name = match self {
134            LineType::BlankLine => "BLANK_LINE",
135            LineType::DataMarkerLine => "DATA_MARKER_LINE",
136            LineType::SubjectLine => "SUBJECT_LINE",
137            LineType::ListLine => "LIST_LINE",
138            LineType::SubjectOrListItemLine => "SUBJECT_OR_LIST_ITEM_LINE",
139            LineType::ParagraphLine => "PARAGRAPH_LINE",
140            LineType::DialogLine => "DIALOG_LINE",
141            LineType::Indent => "INDENT",
142            LineType::Dedent => "DEDENT",
143            LineType::DocumentStart => "DOCUMENT_START",
144        };
145        write!(f, "{name}")
146    }
147}
148
149impl LineType {
150    /// Format token type as grammar notation: `<token-name>`
151    ///
152    /// Converts UPPER_CASE_WITH_UNDERSCORES to <lower-case-with-dashes>
153    ///
154    /// Examples:
155    /// - BlankLine -> `<blank-line>`
156    /// - DataMarkerLine -> `<data-marker-line>`
157    /// - SubjectLine -> `<subject-line>`
158    pub fn to_grammar_string(&self) -> String {
159        let name = match self {
160            LineType::BlankLine => "blank-line",
161            LineType::DataMarkerLine => "data-marker-line",
162            LineType::SubjectLine => "subject-line",
163            LineType::ListLine => "list-line",
164            LineType::SubjectOrListItemLine => "subject-or-list-item-line",
165            LineType::ParagraphLine => "paragraph-line",
166            LineType::DialogLine => "dialog-line",
167            LineType::Indent => "indent",
168            LineType::Dedent => "dedent",
169            LineType::DocumentStart => "document-start-line",
170        };
171        format!("<{name}>")
172    }
173}
174
/// The primary tree structure for the lexer output.
///
/// This is a recursive enum representing the complete hierarchical structure of line tokens.
/// Every node in the tree is either a line token or a container of child nodes.
///
/// The tree is built by processing Indent/Dedent markers:
/// - Token variant: A single line token (e.g., SubjectLine, ParagraphLine, ListLine)
/// - Container variant: A grouped set of child nodes at a deeper indentation level
///
/// This structure allows the parser to match patterns by checking token types while
/// maintaining the complete source structure (source tokens, nesting).
///
/// Note: Container does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes by recursively unrolling children to their source tokens.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum LineContainer {
    /// A single line token (a leaf of the tree)
    Token(LineToken),

    /// A container of child nodes (represents indented content or grouped lines at same level).
    /// Children may themselves be tokens or nested containers.
    Container { children: Vec<LineContainer> },
}
197
198impl LineContainer {
199    /// Check if this container is empty (only valid for root containers)
200    pub fn is_empty(&self) -> bool {
201        match self {
202            LineContainer::Token(_) => false,
203            LineContainer::Container { children, .. } => children.is_empty(),
204        }
205    }
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test: makes no assertions about the token stream; it only checks that
    // tokenizing an indented, unterminated marker does not panic.
    #[test]
    fn test_tokenize_indented_marker() {
        use crate::lex::lexing::tokenize;
        let source = "  ::";
        let tokens_with_range = tokenize(source);
        let tokens: Vec<crate::lex::token::Token> =
            tokens_with_range.into_iter().map(|(t, _)| t).collect();
        println!("Tokens: {tokens:?}");
    }

    #[test]
    fn test_token_type_to_grammar_string() {
        assert_eq!(LineType::BlankLine.to_grammar_string(), "<blank-line>");
        assert_eq!(
            LineType::DataMarkerLine.to_grammar_string(),
            "<data-marker-line>"
        );
        assert_eq!(LineType::SubjectLine.to_grammar_string(), "<subject-line>");
        assert_eq!(LineType::ListLine.to_grammar_string(), "<list-line>");
        assert_eq!(
            LineType::SubjectOrListItemLine.to_grammar_string(),
            "<subject-or-list-item-line>"
        );
        assert_eq!(
            LineType::ParagraphLine.to_grammar_string(),
            "<paragraph-line>"
        );
        assert_eq!(LineType::DialogLine.to_grammar_string(), "<dialog-line>");
        assert_eq!(LineType::Indent.to_grammar_string(), "<indent>");
        assert_eq!(LineType::Dedent.to_grammar_string(), "<dedent>");
        assert_eq!(
            LineType::DocumentStart.to_grammar_string(),
            "<document-start-line>"
        );
    }

    #[test]
    fn test_line_type_display_names() {
        // Display uses the upper-case, underscore-separated names.
        assert_eq!(LineType::BlankLine.to_string(), "BLANK_LINE");
        assert_eq!(LineType::DataMarkerLine.to_string(), "DATA_MARKER_LINE");
        assert_eq!(LineType::SubjectLine.to_string(), "SUBJECT_LINE");
        assert_eq!(LineType::ListLine.to_string(), "LIST_LINE");
        assert_eq!(
            LineType::SubjectOrListItemLine.to_string(),
            "SUBJECT_OR_LIST_ITEM_LINE"
        );
        assert_eq!(LineType::ParagraphLine.to_string(), "PARAGRAPH_LINE");
        assert_eq!(LineType::DialogLine.to_string(), "DIALOG_LINE");
        assert_eq!(LineType::Indent.to_string(), "INDENT");
        assert_eq!(LineType::Dedent.to_string(), "DEDENT");
        assert_eq!(LineType::DocumentStart.to_string(), "DOCUMENT_START");
    }

    #[test]
    fn test_token_sequence_formatting() {
        // Test creating a sequence of tokens and formatting them
        let tokens = [
            LineType::SubjectLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(formatted, "<subject-line><indent><paragraph-line><dedent>");
    }

    #[test]
    fn test_blank_line_group_formatting() {
        let tokens = [
            LineType::BlankLine,
            LineType::BlankLine,
            LineType::BlankLine,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(formatted, "<blank-line><blank-line><blank-line>");
    }

    #[test]
    fn test_complex_pattern_formatting() {
        // Session pattern: blank + content + blank + container
        let tokens = [
            LineType::BlankLine,
            LineType::SubjectLine,
            LineType::BlankLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(
            formatted,
            "<blank-line><subject-line><blank-line><indent><paragraph-line><dedent>"
        );
    }

    #[test]
    fn test_line_token_source_token_pairs() {
        // Test that LineToken can provide source tokens in paired format
        let line_token = LineToken {
            source_tokens: vec![
                Token::Text("hello".to_string()),
                Token::Whitespace(1),
                Token::Text("world".to_string()),
            ],
            token_spans: vec![0..5, 5..6, 6..11],
            line_type: LineType::ParagraphLine,
        };

        let pairs = line_token.source_token_pairs();
        assert_eq!(pairs.len(), 3);
        assert_eq!(pairs[0].1, 0..5);
        assert_eq!(pairs[1].1, 5..6);
        assert_eq!(pairs[2].1, 6..11);

        // Verify tokens match
        match &pairs[0].0 {
            Token::Text(s) => assert_eq!(s, "hello"),
            _ => panic!("Expected Text token"),
        }
    }

    #[test]
    fn test_line_container_is_empty() {
        let blank = LineToken {
            source_tokens: vec![],
            token_spans: vec![],
            line_type: LineType::BlankLine,
        };

        // A container without children is empty.
        assert!(LineContainer::Container { children: vec![] }.is_empty());

        // A token node is never empty, even when it carries no source tokens.
        assert!(!LineContainer::Token(blank.clone()).is_empty());

        // A container with at least one child is not empty.
        let populated = LineContainer::Container {
            children: vec![LineContainer::Token(blank)],
        };
        assert!(!populated.is_empty());
    }
}
lex_core/lex/token/line.rs

lex_core/lex/token/
line.rs