lex_core/lex/token/line.rs
1//! Line-based token types for the lexer pipeline
2//!
3//! This module contains token types specific to the line-based lexer pipeline. Being line
4//! based, all the grammar needs is to have line tokens in order to parse any level of elements.
5//! Only annotations and end of verbatim blocks use data nodes, that means that pretty much all
6//! of Lex needs to be parsed from naturally occurring text lines, indentation and blank lines.
7//!
8//! Since this still is happening in the lexing stage, each line must be tokenized into one
9//! category. In the real world, a line might be more than one possible category. For example a
10//! line might have a sequence marker and a subject marker (for example "1. Recap:").
11//!
12//! For this reason, line tokens can be OR tokens at times, and at other times the order of
13//! line categorization is crucial to getting the right result. While there are only a few
14//! consequential marks in lines (blank, data, subject, list) having them denormalized is
15//! required to have parsing simpler.
16//!
17//! The LineType enum is the definitive set: blank, annotation start, data, subject, list,
18//! subject-or-list-item, paragraph, dialog, indent, dedent. Containers are a separate
19//! structural node, not a line token.
20//!
21//! Line Types
22//!
23//! These are the line tokens:
24//!
25//! - BlankLine: empty or whitespace only
26//! - DataMarkerLine: a data marker in closed form (:: label params? ::)
27//! - DataLine: :: label params? (no closing :: marker)
28//! - SubjectLine: Line ending with colon (could be subject/definition/session title)
29//! - ListLine: Line starting with list marker (-, 1., a., I., etc.)
30//! - SubjectOrListItemLine: Line starting with list marker and ending with colon
31//! - ParagraphLine: Any other line (paragraph text)
32//! - DialogLine: a line that starts with a dash, but is marked not to be a list item.
33//! - Indent / Dedent: structural markers passed through from indentation handling.
34//! - DocumentStart: synthetic marker for document content boundary.
35//!
36//! And to represent a group of lines at the same level, there is a LineContainer.
37//!
38//! See [classify_line_tokens](crate::lex::lexing::line_classification::classify_line_tokens)
39//! for the classification logic and ordering.
40
41use std::fmt;
42
43use super::core::Token;
44
/// A line token represents one logical line created from grouped raw tokens.
///
/// Line tokens are produced by the line token transformation,
/// which groups raw tokens into semantic line units. Each line token stores:
/// - The original raw tokens that created it (for location information and AST construction)
/// - The line type (what kind of line this is)
/// - Individual token spans (to enable byte-accurate text extraction from token subsets)
///
/// By preserving raw tokens and their individual spans, we can later
/// pass them directly to existing AST constructors (using the same unified approach as
/// the parser), which handles all location tracking and AST node creation automatically.
///
/// Note: LineToken does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes from the individual token_spans when needed.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct LineToken {
    /// The original raw tokens that comprise this line
    pub source_tokens: Vec<Token>,

    /// The byte range in source code for each token.
    ///
    /// Invariant: must be the same length as `source_tokens`; entry `i` is the
    /// span of `source_tokens[i]`. `source_token_pairs` relies on this pairing.
    pub token_spans: Vec<std::ops::Range<usize>>,

    /// The type/classification of this line
    pub line_type: LineType,
}
71
72impl LineToken {
73 /// Get source tokens as (Token, Range<usize>) pairs.
74 ///
75 /// This creates owned pairs from the separate source_tokens and token_spans vectors.
76 /// Used by the AST construction facade to get tokens in the format expected by
77 /// the token processing utilities.
78 ///
79 /// Note: LineToken stores tokens and spans separately for serialization efficiency.
80 /// This method creates the paired format needed for location tracking.
81 pub fn source_token_pairs(&self) -> Vec<(Token, std::ops::Range<usize>)> {
82 self.source_tokens
83 .iter()
84 .zip(self.token_spans.iter())
85 .map(|(token, span)| (token.clone(), span.clone()))
86 .collect()
87 }
88}
89
/// The classification of a line token
///
/// Produced by line classification; see the module docs for the ordering rules
/// that decide between overlapping categories (e.g. list marker + trailing colon).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum LineType {
    /// Blank line (empty or whitespace only)
    BlankLine,

    /// Data marker line: a data marker in closed form (:: label params? ::).
    /// Used for both annotation headers and verbatim closing lines.
    DataMarkerLine,

    /// Data line: :: label params? (no closing :: marker)
    DataLine,

    /// Line ending with colon (could be subject/definition/session title)
    SubjectLine,

    /// Line starting with list marker (-, 1., a., I., etc.)
    ListLine,

    /// Line starting with list marker and ending with colon (subject and list item combined)
    SubjectOrListItemLine,

    /// Any other line (paragraph text)
    ParagraphLine,

    /// Line that is part of a dialog (starts with a dash but is marked not to be a list item)
    DialogLine,

    /// Indentation marker (pass-through from prior transformation)
    Indent,

    /// Dedentation marker (pass-through from prior transformation)
    Dedent,

    /// Document start marker (synthetic)
    ///
    /// Marks the boundary between document-level metadata (annotations) and document content.
    /// Injected by DocumentStartMarker transformation at:
    /// - Position 0 if no document-level annotations
    /// - Immediately after the last document-level annotation otherwise
    ///
    /// This enables grammar rules to reason about document structure and position.
    DocumentStart,
}
134
135impl fmt::Display for LineType {
136 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
137 let name = match self {
138 LineType::BlankLine => "BLANK_LINE",
139 LineType::DataMarkerLine => "DATA_MARKER_LINE",
140 LineType::DataLine => "DATA_LINE",
141 LineType::SubjectLine => "SUBJECT_LINE",
142 LineType::ListLine => "LIST_LINE",
143 LineType::SubjectOrListItemLine => "SUBJECT_OR_LIST_ITEM_LINE",
144 LineType::ParagraphLine => "PARAGRAPH_LINE",
145 LineType::DialogLine => "DIALOG_LINE",
146 LineType::Indent => "INDENT",
147 LineType::Dedent => "DEDENT",
148 LineType::DocumentStart => "DOCUMENT_START",
149 };
150 write!(f, "{name}")
151 }
152}
153
154impl LineType {
155 /// Format token type as grammar notation: `<token-name>`
156 ///
157 /// Converts UPPER_CASE_WITH_UNDERSCORES to <lower-case-with-dashes>
158 ///
159 /// Examples:
160 /// - BlankLine -> `<blank-line>`
161 /// - DataMarkerLine -> `<data-marker-line>`
162 /// - SubjectLine -> `<subject-line>`
163 pub fn to_grammar_string(&self) -> String {
164 let name = match self {
165 LineType::BlankLine => "blank-line",
166 LineType::DataMarkerLine => "data-marker-line",
167 LineType::DataLine => "data-line",
168 LineType::SubjectLine => "subject-line",
169 LineType::ListLine => "list-line",
170 LineType::SubjectOrListItemLine => "subject-or-list-item-line",
171 LineType::ParagraphLine => "paragraph-line",
172 LineType::DialogLine => "dialog-line",
173 LineType::Indent => "indent",
174 LineType::Dedent => "dedent",
175 LineType::DocumentStart => "document-start-line",
176 };
177 format!("<{name}>")
178 }
179}
180
/// The primary tree structure for the lexer output.
///
/// This is a recursive enum representing the complete hierarchical structure of line tokens.
/// Every node in the tree is either a line token or a container of child nodes.
///
/// The tree is built by processing Indent/Dedent markers:
/// - Token variant: A single line token (e.g., SubjectLine, ParagraphLine, ListLine)
/// - Container variant: A grouped set of child nodes at a deeper indentation level
///
/// This structure allows the parser to match patterns by checking token types while
/// maintaining the complete source structure (source tokens, nesting).
///
/// Note: Container does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes by recursively unrolling children to their source tokens.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum LineContainer {
    /// A single line token (leaf node)
    Token(LineToken),

    /// A container of child nodes (represents indented content or grouped lines at same level)
    Container { children: Vec<LineContainer> },
}
203
204impl LineContainer {
205 /// Check if this container is empty (only valid for root containers)
206 pub fn is_empty(&self) -> bool {
207 match self {
208 LineContainer::Token(_) => false,
209 LineContainer::Container { children, .. } => children.is_empty(),
210 }
211 }
212}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: tokenizing an indented data marker must not panic.
    ///
    /// Intentionally has no assertions on the resulting token sequence; it only
    /// exercises the tokenizer on indented input and prints the tokens for debugging.
    #[test]
    fn test_tokenize_indented_marker() {
        use crate::lex::lexing::tokenize;
        let source = " ::";
        let tokens_with_range = tokenize(source);
        let tokens: Vec<crate::lex::token::Token> =
            tokens_with_range.into_iter().map(|(t, _)| t).collect();
        println!("Tokens: {tokens:?}");
    }

    /// Every LineType variant must render its documented grammar notation.
    #[test]
    fn test_token_type_to_grammar_string() {
        assert_eq!(LineType::BlankLine.to_grammar_string(), "<blank-line>");
        assert_eq!(
            LineType::DataMarkerLine.to_grammar_string(),
            "<data-marker-line>"
        );
        assert_eq!(LineType::DataLine.to_grammar_string(), "<data-line>");
        assert_eq!(LineType::SubjectLine.to_grammar_string(), "<subject-line>");
        assert_eq!(LineType::ListLine.to_grammar_string(), "<list-line>");
        assert_eq!(
            LineType::SubjectOrListItemLine.to_grammar_string(),
            "<subject-or-list-item-line>"
        );
        assert_eq!(
            LineType::ParagraphLine.to_grammar_string(),
            "<paragraph-line>"
        );
        assert_eq!(LineType::DialogLine.to_grammar_string(), "<dialog-line>");
        assert_eq!(LineType::Indent.to_grammar_string(), "<indent>");
        assert_eq!(LineType::Dedent.to_grammar_string(), "<dedent>");
        // DocumentStart intentionally keeps a `-line` suffix in grammar notation.
        assert_eq!(
            LineType::DocumentStart.to_grammar_string(),
            "<document-start-line>"
        );
    }

    #[test]
    fn test_token_sequence_formatting() {
        // Test creating a sequence of tokens and formatting them
        let tokens = [
            LineType::SubjectLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(formatted, "<subject-line><indent><paragraph-line><dedent>");
    }

    #[test]
    fn test_blank_line_group_formatting() {
        let tokens = [
            LineType::BlankLine,
            LineType::BlankLine,
            LineType::BlankLine,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(formatted, "<blank-line><blank-line><blank-line>");
    }

    #[test]
    fn test_complex_pattern_formatting() {
        // Session pattern: blank + content + blank + container
        let tokens = [
            LineType::BlankLine,
            LineType::SubjectLine,
            LineType::BlankLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ];

        let formatted = tokens
            .iter()
            .map(|t| t.to_grammar_string())
            .collect::<Vec<_>>()
            .join("");

        assert_eq!(
            formatted,
            "<blank-line><subject-line><blank-line><indent><paragraph-line><dedent>"
        );
    }

    /// source_token_pairs must preserve token order and pair each token with its span.
    #[test]
    fn test_line_token_source_token_pairs() {
        let line_token = LineToken {
            source_tokens: vec![
                Token::Text("hello".to_string()),
                Token::Whitespace(1),
                Token::Text("world".to_string()),
            ],
            token_spans: vec![0..5, 5..6, 6..11],
            line_type: LineType::ParagraphLine,
        };

        let pairs = line_token.source_token_pairs();
        assert_eq!(pairs.len(), 3);
        assert_eq!(pairs[0].1, 0..5);
        assert_eq!(pairs[1].1, 5..6);
        assert_eq!(pairs[2].1, 6..11);

        // Verify tokens match
        match &pairs[0].0 {
            Token::Text(s) => assert_eq!(s, "hello"),
            _ => panic!("Expected Text token"),
        }
    }
}
338}