lex_core/lex/token/line.rs
1//! Line-based token types for the lexer pipeline
2//!
3//! This module contains token types specific to the line-based lexer pipeline. Being line
4//! based, all the grammar needs is to have line tokens in order to parse any level of elements.
//! Only annotations and end of verbatim blocks use data nodes, which means that pretty much all
//! of Lex needs to be parsed from naturally occurring text lines, indentation and blank lines.
7//!
8//! Since this still is happening in the lexing stage, each line must be tokenized into one
9//! category. In the real world, a line might be more than one possible category. For example a
10//! line might have a sequence marker and a subject marker (for example "1. Recap:").
11//!
12//! For this reason, line tokens can be OR tokens at times, and at other times the order of
13//! line categorization is crucial to getting the right result. While there are only a few
//! consequential marks in lines (blank, data, subject, list), keeping them denormalized is
//! required to keep parsing simple.
16//!
//! The LineType enum is the definitive set: blank, data marker, subject, list,
//! subject-or-list-item, paragraph, dialog, indent, dedent, document start. Containers are a
//! separate structural node, not a line token.
20//!
21//! Line Types
22//!
23//! These are the line tokens:
24//!
25//! - BlankLine: empty or whitespace only
26//! - DataMarkerLine: a data marker in closed form (:: label params? ::)
27//! - SubjectLine: Line ending with colon (could be subject/definition/session title)
28//! - ListLine: Line starting with list marker (-, 1., a., I., etc.)
29//! - SubjectOrListItemLine: Line starting with list marker and ending with colon
30//! - ParagraphLine: Any other line (paragraph text)
31//! - DialogLine: a line that starts with a dash, but is marked not to be a list item.
32//! - Indent / Dedent: structural markers passed through from indentation handling.
33//! - DocumentStart: synthetic marker for document content boundary.
34//!
35//! And to represent a group of lines at the same level, there is a LineContainer.
36//!
37//! See [classify_line_tokens](crate::lex::lexing::line_classification::classify_line_tokens)
38//! for the classification logic and ordering.
39
40use std::fmt;
41
42use super::core::Token;
43
/// A line token represents one logical line created from grouped raw tokens.
///
/// Line tokens are produced by the line token transformation,
/// which groups raw tokens into semantic line units. Each line token stores:
/// - The original raw tokens that created it (for location information and AST construction)
/// - The line type (what kind of line this is)
/// - Individual token spans (to enable byte-accurate text extraction from token subsets)
///
/// By preserving raw tokens and their individual spans, we can later
/// pass them directly to existing AST constructors (using the same unified approach as
/// the parser), which handles all location tracking and AST node creation automatically.
///
/// Note: LineToken does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes from the individual token_spans when needed.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct LineToken {
    /// The original raw tokens that comprise this line
    pub source_tokens: Vec<Token>,

    /// The byte range in source code for each token.
    ///
    /// Must be the same length as `source_tokens`; the two vectors are paired
    /// up positionally, so entry `i` here belongs to `source_tokens[i]`.
    pub token_spans: Vec<std::ops::Range<usize>>,

    /// The type/classification of this line
    pub line_type: LineType,
}
70
71impl LineToken {
72 /// Get source tokens as (Token, Range<usize>) pairs.
73 ///
74 /// This creates owned pairs from the separate source_tokens and token_spans vectors.
75 /// Used by the AST construction facade to get tokens in the format expected by
76 /// the token processing utilities.
77 ///
78 /// Note: LineToken stores tokens and spans separately for serialization efficiency.
79 /// This method creates the paired format needed for location tracking.
80 pub fn source_token_pairs(&self) -> Vec<(Token, std::ops::Range<usize>)> {
81 self.source_tokens
82 .iter()
83 .zip(self.token_spans.iter())
84 .map(|(token, span)| (token.clone(), span.clone()))
85 .collect()
86 }
87}
88
/// The classification of a line token.
///
/// Each variant is a plain tag with no payload.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum LineType {
    /// Blank line (empty or whitespace only)
    BlankLine,

    /// Data marker line: a data marker in closed form (:: label params? ::).
    /// Used for both annotation headers and verbatim closing lines.
    DataMarkerLine,

    /// Line ending with colon (could be subject/definition/session title)
    SubjectLine,

    /// Line starting with list marker (-, 1., a., I., etc.)
    ListLine,

    /// Line starting with list marker and ending with colon (subject and list item combined)
    SubjectOrListItemLine,

    /// Any other line (paragraph text)
    ParagraphLine,

    /// Line that is part of a dialog: it starts with a dash, but is marked not to be a
    /// list item.
    DialogLine,

    /// Indentation marker (pass-through from prior transformation)
    Indent,

    /// Dedentation marker (pass-through from prior transformation)
    Dedent,

    /// Document start marker (synthetic)
    ///
    /// Marks the boundary between document-level metadata (annotations) and document content.
    /// Injected by DocumentStartMarker transformation at:
    /// - Position 0 if no document-level annotations
    /// - Immediately after the last document-level annotation otherwise
    ///
    /// This enables grammar rules to reason about document structure and position.
    DocumentStart,
}
130
131impl fmt::Display for LineType {
132 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133 let name = match self {
134 LineType::BlankLine => "BLANK_LINE",
135 LineType::DataMarkerLine => "DATA_MARKER_LINE",
136 LineType::SubjectLine => "SUBJECT_LINE",
137 LineType::ListLine => "LIST_LINE",
138 LineType::SubjectOrListItemLine => "SUBJECT_OR_LIST_ITEM_LINE",
139 LineType::ParagraphLine => "PARAGRAPH_LINE",
140 LineType::DialogLine => "DIALOG_LINE",
141 LineType::Indent => "INDENT",
142 LineType::Dedent => "DEDENT",
143 LineType::DocumentStart => "DOCUMENT_START",
144 };
145 write!(f, "{name}")
146 }
147}
148
149impl LineType {
150 /// Format token type as grammar notation: `<token-name>`
151 ///
152 /// Converts UPPER_CASE_WITH_UNDERSCORES to <lower-case-with-dashes>
153 ///
154 /// Examples:
155 /// - BlankLine -> `<blank-line>`
156 /// - DataMarkerLine -> `<data-marker-line>`
157 /// - SubjectLine -> `<subject-line>`
158 pub fn to_grammar_string(&self) -> String {
159 let name = match self {
160 LineType::BlankLine => "blank-line",
161 LineType::DataMarkerLine => "data-marker-line",
162 LineType::SubjectLine => "subject-line",
163 LineType::ListLine => "list-line",
164 LineType::SubjectOrListItemLine => "subject-or-list-item-line",
165 LineType::ParagraphLine => "paragraph-line",
166 LineType::DialogLine => "dialog-line",
167 LineType::Indent => "indent",
168 LineType::Dedent => "dedent",
169 LineType::DocumentStart => "document-start-line",
170 };
171 format!("<{name}>")
172 }
173}
174
/// The primary tree structure for the lexer output.
///
/// This is a recursive enum representing the complete hierarchical structure of line tokens.
/// Every node in the tree is either a line token or a container of child nodes.
///
/// The tree is built by processing Indent/Dedent markers:
/// - Token variant: A single line token (e.g., SubjectLine, ParagraphLine, ListLine)
/// - Container variant: A grouped set of child nodes at a deeper indentation level
///
/// This structure allows the parser to match patterns by checking token types while
/// maintaining the complete source structure (source tokens, nesting).
///
/// Note: Container does NOT store an aggregate source_span. The AST construction facade
/// will compute bounding boxes by recursively unrolling children to their source tokens.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum LineContainer {
    /// A single line token
    Token(LineToken),

    /// A container of child nodes (represents indented content or grouped lines at same level).
    ///
    /// `children` holds the nested nodes; each child is itself either a `Token` or another
    /// `Container`.
    Container { children: Vec<LineContainer> },
}
197
198impl LineContainer {
199 /// Check if this container is empty (only valid for root containers)
200 pub fn is_empty(&self) -> bool {
201 match self {
202 LineContainer::Token(_) => false,
203 LineContainer::Container { children, .. } => children.is_empty(),
204 }
205 }
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Concatenates the grammar notation of every line type, without a separator.
    fn grammar_sequence(line_types: &[LineType]) -> String {
        line_types
            .iter()
            .map(|line_type| line_type.to_grammar_string())
            .collect::<Vec<_>>()
            .join("")
    }

    /// Smoke test: tokenizing an indented `::` marker must not panic.
    ///
    /// There are no assertions on the produced tokens; they are only printed
    /// for inspection (visible with `--nocapture`).
    #[test]
    fn test_tokenize_indented_marker() {
        use crate::lex::lexing::tokenize;
        let source = "    ::";
        let tokens_with_range = tokenize(source);
        let tokens: Vec<crate::lex::token::Token> =
            tokens_with_range.into_iter().map(|(t, _)| t).collect();
        println!("Tokens: {tokens:?}");
    }

    /// Every `LineType` variant maps to its expected grammar notation.
    #[test]
    fn test_token_type_to_grammar_string() {
        assert_eq!(LineType::BlankLine.to_grammar_string(), "<blank-line>");
        assert_eq!(
            LineType::DataMarkerLine.to_grammar_string(),
            "<data-marker-line>"
        );
        assert_eq!(LineType::SubjectLine.to_grammar_string(), "<subject-line>");
        assert_eq!(LineType::ListLine.to_grammar_string(), "<list-line>");
        assert_eq!(
            LineType::SubjectOrListItemLine.to_grammar_string(),
            "<subject-or-list-item-line>"
        );
        assert_eq!(
            LineType::ParagraphLine.to_grammar_string(),
            "<paragraph-line>"
        );
        // DialogLine was previously the only variant without an assertion here.
        assert_eq!(LineType::DialogLine.to_grammar_string(), "<dialog-line>");
        assert_eq!(LineType::Indent.to_grammar_string(), "<indent>");
        assert_eq!(LineType::Dedent.to_grammar_string(), "<dedent>");
        assert_eq!(
            LineType::DocumentStart.to_grammar_string(),
            "<document-start-line>"
        );
    }

    #[test]
    fn test_token_sequence_formatting() {
        // Test creating a sequence of tokens and formatting them
        let formatted = grammar_sequence(&[
            LineType::SubjectLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ]);

        assert_eq!(formatted, "<subject-line><indent><paragraph-line><dedent>");
    }

    #[test]
    fn test_blank_line_group_formatting() {
        let formatted = grammar_sequence(&[
            LineType::BlankLine,
            LineType::BlankLine,
            LineType::BlankLine,
        ]);

        assert_eq!(formatted, "<blank-line><blank-line><blank-line>");
    }

    #[test]
    fn test_complex_pattern_formatting() {
        // Session pattern: blank + content + blank + container
        let formatted = grammar_sequence(&[
            LineType::BlankLine,
            LineType::SubjectLine,
            LineType::BlankLine,
            LineType::Indent,
            LineType::ParagraphLine,
            LineType::Dedent,
        ]);

        assert_eq!(
            formatted,
            "<blank-line><subject-line><blank-line><indent><paragraph-line><dedent>"
        );
    }

    #[test]
    fn test_line_token_source_token_pairs() {
        // Test that LineToken can provide source tokens in paired format
        let line_token = LineToken {
            source_tokens: vec![
                Token::Text("hello".to_string()),
                Token::Whitespace(1),
                Token::Text("world".to_string()),
            ],
            token_spans: vec![0..5, 5..6, 6..11],
            line_type: LineType::ParagraphLine,
        };

        let pairs = line_token.source_token_pairs();
        assert_eq!(pairs.len(), 3);
        assert_eq!(pairs[0].1, 0..5);
        assert_eq!(pairs[1].1, 5..6);
        assert_eq!(pairs[2].1, 6..11);

        // Verify tokens match
        match &pairs[0].0 {
            Token::Text(s) => assert_eq!(s, "hello"),
            _ => panic!("Expected Text token"),
        }
    }
}