lex_core/lex/token/line.rs
1//! Line-based token types for the lexer pipeline
2//!
3//! This module contains token types specific to the line-based lexer pipeline. Being line
4//! based, all the grammar needs is to have line tokens in order to parse any level of elements.
//! Only annotations and end of verbatim blocks use data nodes, which means that pretty much all
//! of Lex needs to be parsed from naturally occurring text lines, indentation and blank lines.
7//!
8//! Since this still is happening in the lexing stage, each line must be tokenized into one
9//! category. In the real world, a line might be more than one possible category. For example a
10//! line might have a sequence marker and a subject marker (for example "1. Recap:").
11//!
//! For this reason, line tokens can be OR tokens at times, and at other times the order of
//! line categorization is crucial to getting the right result. While there are only a few
//! consequential marks in lines (blank, data, subject, list), having them denormalized is
//! required to keep parsing simple.
16//!
17//! The LineType enum is the definitive set: blank, annotation start/end, data, subject, list,
18//! subject-or-list-item, paragraph, dialog, indent, dedent. Containers are a separate
19//! structural node, not a line token.
20//!
21//! Line Types
22//!
23//! These are the line tokens:
24//!
25//! - BlankLine: empty or whitespace only
26//! - AnnotationEndLine: a line starting with :: marker and having no further content
27//! - AnnotationStartLine: a data node + lex marker
28//! - DataLine: :: label params? (no closing :: marker)
29//! - SubjectLine: Line ending with colon (could be subject/definition/session title)
30//! - ListLine: Line starting with list marker (-, 1., a., I., etc.)
31//! - SubjectOrListItemLine: Line starting with list marker and ending with colon
32//! - ParagraphLine: Any other line (paragraph text)
33//! - DialogLine: a line that starts with a dash, but is marked not to be a list item.
34//! - Indent / Dedent: structural markers passed through from indentation handling.
35//! - DocumentStart: synthetic marker for document content boundary.
36//!
37//! And to represent a group of lines at the same level, there is a LineContainer.
38//!
39//! See [classify_line_tokens](crate::lex::lexing::line_classification::classify_line_tokens)
40//! for the classification logic and ordering.
41
42use std::fmt;
43
44use super::core::Token;
45
/// A line token represents one logical line created from grouped raw tokens.
///
/// Line tokens are produced by the line token transformation,
/// which groups raw tokens into semantic line units. Each line token stores:
/// - The original raw tokens that created it (for location information and AST construction)
/// - The line type (what kind of line this is)
/// - Individual token spans (to enable byte-accurate text extraction from token subsets)
///
/// By preserving raw tokens and their individual spans, we can later
/// pass them directly to existing AST constructors (using the same unified approach as
/// the parser), which handles all location tracking and AST node creation automatically.
///
/// Note: LineToken does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes from the individual token_spans when needed.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct LineToken {
    /// The original raw tokens that comprise this line
    pub source_tokens: Vec<Token>,

    /// The byte range in source code for each token.
    /// Invariant: must be the same length as `source_tokens` —
    /// `source_token_pairs` zips the two vectors positionally.
    pub token_spans: Vec<std::ops::Range<usize>>,

    /// The type/classification of this line
    pub line_type: LineType,
}
72
73impl LineToken {
74 /// Get source tokens as (Token, Range<usize>) pairs.
75 ///
76 /// This creates owned pairs from the separate source_tokens and token_spans vectors.
77 /// Used by the AST construction facade to get tokens in the format expected by
78 /// the token processing utilities.
79 ///
80 /// Note: LineToken stores tokens and spans separately for serialization efficiency.
81 /// This method creates the paired format needed for location tracking.
82 pub fn source_token_pairs(&self) -> Vec<(Token, std::ops::Range<usize>)> {
83 self.source_tokens
84 .iter()
85 .zip(self.token_spans.iter())
86 .map(|(token, span)| (token.clone(), span.clone()))
87 .collect()
88 }
89}
90
/// The classification of a line token.
///
/// See the module docs for the full list of line types and
/// [classify_line_tokens](crate::lex::lexing::line_classification::classify_line_tokens)
/// for how ambiguous lines (e.g. "1. Recap:") are resolved.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum LineType {
    /// Blank line (empty or whitespace only)
    BlankLine,

    /// Annotation end line: a line starting with :: marker and having no further content
    AnnotationEndLine,

    /// Annotation start line: follows annotation grammar <lex-marker><space><label>(<space><parameters>)? <lex-marker> <content>?
    AnnotationStartLine,

    /// Data line: :: label params? (no closing :: marker)
    DataLine,

    /// Line ending with colon (could be subject/definition/session title)
    SubjectLine,

    /// Line starting with list marker (-, 1., a., I., etc.)
    ListLine,

    /// Line starting with list marker and ending with colon (subject and list item combined)
    SubjectOrListItemLine,

    /// Any other line (paragraph text)
    ParagraphLine,

    /// Line that is part of a dialog
    /// (per the module docs: starts with a dash but is marked not to be a list item)
    DialogLine,

    /// Indentation marker (pass-through from prior transformation)
    Indent,

    /// Dedentation marker (pass-through from prior transformation)
    Dedent,

    /// Document start marker (synthetic)
    ///
    /// Marks the boundary between document-level metadata (annotations) and document content.
    /// Injected by DocumentStartMarker transformation at:
    /// - Position 0 if no document-level annotations
    /// - Immediately after the last document-level annotation otherwise
    ///
    /// This enables grammar rules to reason about document structure and position.
    DocumentStart,
}
137
138impl fmt::Display for LineType {
139 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
140 let name = match self {
141 LineType::BlankLine => "BLANK_LINE",
142 LineType::AnnotationEndLine => "ANNOTATION_END_LINE",
143 LineType::AnnotationStartLine => "ANNOTATION_START_LINE",
144 LineType::DataLine => "DATA_LINE",
145 LineType::SubjectLine => "SUBJECT_LINE",
146 LineType::ListLine => "LIST_LINE",
147 LineType::SubjectOrListItemLine => "SUBJECT_OR_LIST_ITEM_LINE",
148 LineType::ParagraphLine => "PARAGRAPH_LINE",
149 LineType::DialogLine => "DIALOG_LINE",
150 LineType::Indent => "INDENT",
151 LineType::Dedent => "DEDENT",
152 LineType::DocumentStart => "DOCUMENT_START",
153 };
154 write!(f, "{name}")
155 }
156}
157
158impl LineType {
159 /// Format token type as grammar notation: `<token-name>`
160 ///
161 /// Converts UPPER_CASE_WITH_UNDERSCORES to <lower-case-with-dashes>
162 ///
163 /// Examples:
164 /// - BlankLine -> `<blank-line>`
165 /// - AnnotationStartLine -> `<annotation-start-line>`
166 /// - SubjectLine -> `<subject-line>`
167 pub fn to_grammar_string(&self) -> String {
168 let name = match self {
169 LineType::BlankLine => "blank-line",
170 LineType::AnnotationEndLine => "annotation-end-line",
171 LineType::AnnotationStartLine => "annotation-start-line",
172 LineType::DataLine => "data-line",
173 LineType::SubjectLine => "subject-line",
174 LineType::ListLine => "list-line",
175 LineType::SubjectOrListItemLine => "subject-or-list-item-line",
176 LineType::ParagraphLine => "paragraph-line",
177 LineType::DialogLine => "dialog-line",
178 LineType::Indent => "indent",
179 LineType::Dedent => "dedent",
180 LineType::DocumentStart => "document-start-line",
181 };
182 format!("<{name}>")
183 }
184}
185
186/// The primary tree structure for the lexer output.
187///
188/// This is a recursive enum representing the complete hierarchical structure of line tokens.
189/// Every node in the tree is either a line token or a container of child nodes.
190///
191/// The tree is built by processing Indent/Dedent markers:
192/// - Token variant: A single line token (e.g., SubjectLine, ParagraphLine, ListLine)
193/// - Container variant: A grouped set of child nodes at a deeper indentation level
194///
195/// This structure allows the parser to match patterns by checking token types while
196/// maintaining the complete source structure (source tokens, nesting).
197///
198/// Note: Container does NOT store an aggregate source_span. The AST construction facade
199/// will compute bounding boxes by recursively unrolling children to their source tokens.
200#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
201pub enum LineContainer {
202 /// A single line token
203 Token(LineToken),
204
205 /// A container of child nodes (represents indented content or grouped lines at same level)
206 Container { children: Vec<LineContainer> },
207}
208
209impl LineContainer {
210 /// Check if this container is empty (only valid for root containers)
211 pub fn is_empty(&self) -> bool {
212 match self {
213 LineContainer::Token(_) => false,
214 LineContainer::Container { children, .. } => children.is_empty(),
215 }
216 }
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    /// Render a sequence of line types as concatenated grammar notation.
    /// Shared by the pattern-formatting tests below.
    fn grammar(tokens: &[LineType]) -> String {
        tokens.iter().map(|t| t.to_grammar_string()).collect()
    }

    #[test]
    fn test_tokenize_indented_marker() {
        use crate::lex::lexing::tokenize;
        let source = " ::";
        let tokens_with_range = tokenize(source);
        let tokens: Vec<crate::lex::token::Token> =
            tokens_with_range.into_iter().map(|(t, _)| t).collect();
        // Previously this test only printed; assert the minimum we can rely on:
        // a non-empty input must produce at least one raw token.
        assert!(!tokens.is_empty(), "tokenize produced no tokens for {source:?}");
        println!("Tokens: {tokens:?}");
    }

    #[test]
    fn test_token_type_to_grammar_string() {
        // Exhaustive table: the previous version missed DataLine and DialogLine.
        // Every variant appears here, so a new LineType without an expected
        // grammar string is caught by this test.
        let cases = [
            (LineType::BlankLine, "<blank-line>"),
            (LineType::AnnotationEndLine, "<annotation-end-line>"),
            (LineType::AnnotationStartLine, "<annotation-start-line>"),
            (LineType::DataLine, "<data-line>"),
            (LineType::SubjectLine, "<subject-line>"),
            (LineType::ListLine, "<list-line>"),
            (LineType::SubjectOrListItemLine, "<subject-or-list-item-line>"),
            (LineType::ParagraphLine, "<paragraph-line>"),
            (LineType::DialogLine, "<dialog-line>"),
            (LineType::Indent, "<indent>"),
            (LineType::Dedent, "<dedent>"),
            // Intentionally "-line" suffixed, unlike its Display name.
            (LineType::DocumentStart, "<document-start-line>"),
        ];
        for (line_type, expected) in cases {
            assert_eq!(line_type.to_grammar_string(), expected);
        }
    }

    #[test]
    fn test_line_type_display() {
        // The Display impl was previously untested; pin every variant's name.
        let cases = [
            (LineType::BlankLine, "BLANK_LINE"),
            (LineType::AnnotationEndLine, "ANNOTATION_END_LINE"),
            (LineType::AnnotationStartLine, "ANNOTATION_START_LINE"),
            (LineType::DataLine, "DATA_LINE"),
            (LineType::SubjectLine, "SUBJECT_LINE"),
            (LineType::ListLine, "LIST_LINE"),
            (LineType::SubjectOrListItemLine, "SUBJECT_OR_LIST_ITEM_LINE"),
            (LineType::ParagraphLine, "PARAGRAPH_LINE"),
            (LineType::DialogLine, "DIALOG_LINE"),
            (LineType::Indent, "INDENT"),
            (LineType::Dedent, "DEDENT"),
            (LineType::DocumentStart, "DOCUMENT_START"),
        ];
        for (line_type, expected) in cases {
            assert_eq!(line_type.to_string(), expected);
        }
    }

    #[test]
    fn test_token_sequence_formatting() {
        // Test creating a sequence of tokens and formatting them
        let tokens = [
            LineType::SubjectLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ];
        assert_eq!(
            grammar(&tokens),
            "<subject-line><indent><paragraph-line><dedent>"
        );
    }

    #[test]
    fn test_blank_line_group_formatting() {
        let tokens = [
            LineType::BlankLine,
            LineType::BlankLine,
            LineType::BlankLine,
        ];
        assert_eq!(grammar(&tokens), "<blank-line><blank-line><blank-line>");
    }

    #[test]
    fn test_complex_pattern_formatting() {
        // Session pattern: blank + content + blank + container
        let tokens = [
            LineType::BlankLine,
            LineType::SubjectLine,
            LineType::BlankLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ];
        assert_eq!(
            grammar(&tokens),
            "<blank-line><subject-line><blank-line><indent><paragraph-line><dedent>"
        );
    }

    #[test]
    fn test_line_token_source_token_pairs() {
        // Test that LineToken can provide source tokens in paired format
        let line_token = LineToken {
            source_tokens: vec![
                Token::Text("hello".to_string()),
                Token::Whitespace(1),
                Token::Text("world".to_string()),
            ],
            token_spans: vec![0..5, 5..6, 6..11],
            line_type: LineType::ParagraphLine,
        };

        let pairs = line_token.source_token_pairs();
        assert_eq!(pairs.len(), 3);
        assert_eq!(pairs[0].1, 0..5);
        assert_eq!(pairs[1].1, 5..6);
        assert_eq!(pairs[2].1, 6..11);

        // Verify tokens match
        match &pairs[0].0 {
            Token::Text(s) => assert_eq!(s, "hello"),
            _ => panic!("Expected Text token"),
        }

        // Edge case: a line token with no source tokens yields no pairs.
        let empty = LineToken {
            source_tokens: vec![],
            token_spans: vec![],
            line_type: LineType::BlankLine,
        };
        assert!(empty.source_token_pairs().is_empty());
    }
}
347}