lex_core/lex/token/normalization/
utilities.rs

1//! Token processing utilities for the Immutable Log Architecture
2//!
3//! This module provides the core utilities for token manipulation.
4//! All functions here are pure and thoroughly unit tested.
5//!
6//! # Architecture
7//!
8//! The Logos lexer produces `(Token, Range<usize>)` pairs - this is the ground truth.
9//! Transformations create aggregate tokens that store these original pairs in `source_tokens`.
10//! This module provides utilities to:
11//! 1. Unroll aggregate tokens back to flat lists
12//! 2. Flatten token vectors
13//! 3. Compute bounding boxes from token ranges
14//! 4. Extract text from ranges
15//!
16//! Note: Conversion from byte ranges to AST Range is handled in the `location` module.
17
18use crate::lex::token::core::Token;
19use std::ops::Range as ByteRange;
20
/// Trait that any token structure can implement to provide access to source tokens.
///
/// This enables the unrolling system to work with any parser's token representation.
///
/// Returned pairs are the original `(Token, byte_range)` values produced by the
/// Logos lexer; implementors should yield them in source order.
#[allow(dead_code)]
pub trait SourceTokenProvider {
    /// Get the original Logos tokens that comprise this token.
    ///
    /// For atomic tokens (direct from Logos), this returns a slice containing just that token.
    /// For aggregate tokens (from transformations), this returns all the original tokens.
    fn source_tokens(&self) -> &[(Token, ByteRange<usize>)];
}
32
33/// Unroll a collection of tokens to a flat list of original Logos tokens.
34///
35/// This recursively extracts all `source_tokens` from aggregate structures,
36/// returning a flat list of the original `(Token, Range<usize>)` pairs that
37/// came directly from the Logos lexer.
38///
39/// # Example
40///
41/// ```rust,ignore
42/// let line_tokens: Vec<LineToken> = /* ... parsed tokens ... */;
43/// let flat_tokens = unroll(&line_tokens);
44/// // flat_tokens now contains all original Logos tokens
45/// ```
46#[allow(dead_code)]
47pub fn unroll<T: SourceTokenProvider>(tokens: &[T]) -> Vec<(Token, ByteRange<usize>)> {
48    tokens
49        .iter()
50        .flat_map(|t| t.source_tokens().iter().cloned())
51        .collect()
52}
53
54/// Flatten a collection of token vectors into a single flat list.
55///
56/// This is useful for token types (like LineToken) that provide source tokens
57/// as owned Vec rather than borrowed slices. It simply concatenates all the vectors.
58///
59/// # Example
60///
61/// ```rust,ignore
62/// let line_tokens: Vec<LineToken> = /* ... parsed tokens ... */;
63/// let token_vecs: Vec<Vec<(Token, Range<usize>)>> = line_tokens.iter()
64///     .map(|lt| lt.source_token_pairs())
65///     .collect();
66/// let flat_tokens = flatten_token_vecs(&token_vecs);
67/// ```
68#[allow(dead_code)]
69pub fn flatten_token_vecs(
70    token_vecs: &[Vec<(Token, ByteRange<usize>)>],
71) -> Vec<(Token, ByteRange<usize>)> {
72    token_vecs.iter().flat_map(|v| v.iter().cloned()).collect()
73}
74
75/// Compute the bounding box (minimum start, maximum end) from a list of tokens.
76///
77/// Returns the smallest `Range<usize>` that encompasses all token ranges.
78///
79/// # Panics
80///
81/// Panics if the token list is empty. Callers should ensure tokens are non-empty.
82///
83/// # Example
84///
85/// ```rust,ignore
86/// let tokens = vec![
87///     (Token::Text("hello".into()), 0..5),
88///     (Token::Whitespace(1), 5..6),
89///     (Token::Text("world".into()), 6..11),
90/// ];
91/// let bbox = compute_bounding_box(&tokens);
92/// assert_eq!(bbox, 0..11);
93/// ```
94pub fn compute_bounding_box(tokens: &[(Token, ByteRange<usize>)]) -> ByteRange<usize> {
95    assert!(
96        !tokens.is_empty(),
97        "Cannot compute bounding box from empty token list"
98    );
99
100    let min_start = tokens
101        .iter()
102        .map(|(_, range)| range.start)
103        .min()
104        .unwrap_or(0);
105    let max_end = tokens.iter().map(|(_, range)| range.end).max().unwrap_or(0);
106
107    min_start..max_end
108}
109
/// Extract text from the source string at the given byte range.
///
/// # Arguments
///
/// * `range` - The byte offset range to extract
/// * `source` - The original source string
///
/// # Panics
///
/// Panics if the range is out of bounds for `source`, or if either endpoint
/// does not fall on a UTF-8 character boundary (standard `str` slicing rules).
///
/// # Example
///
/// ```rust,ignore
/// let text = extract_text(0..5, "hello world");
/// assert_eq!(text, "hello");
/// ```
pub fn extract_text(range: ByteRange<usize>, source: &str) -> String {
    source[range].to_string()
}
126
/// Compute the 0-indexed column number for a given byte offset in the source string.
///
/// The column is measured in *bytes* from the start of the line containing
/// `offset` (i.e. from just after the preceding `'\n'`, or from the start of
/// the source if there is none). For ASCII text this equals the character
/// column; multi-byte UTF-8 characters earlier on the line widen it.
///
/// # Arguments
///
/// * `offset` - The byte offset to get the column for
/// * `source` - The original source string
///
/// # Example
///
/// ```rust,ignore
/// let source = "hello\n world";
/// // 'w' is at byte offset 7 (the space after the newline is offset 6)
/// assert_eq!(compute_column(7, source), 1);
/// assert_eq!(compute_column(6, source), 0);
/// ```
pub fn compute_column(offset: usize, source: &str) -> usize {
    // Find the byte index just past the last '\n' preceding `offset`.
    // In UTF-8, 0x0A never occurs inside a multi-byte sequence, so scanning
    // raw bytes is safe and matches char-based scanning exactly.
    let line_start = source
        .bytes()
        .enumerate()
        .take(offset)
        .filter(|&(_, b)| b == b'\n')
        .map(|(i, _)| i + 1)
        .last()
        .unwrap_or(0);
    offset - line_start
}
154
155/// High-level convenience: extract text directly from tokens.
156///
157/// This combines `compute_bounding_box` and `extract_text` for convenience.
158///
159/// # Panics
160///
161/// Panics if tokens is empty.
162#[allow(dead_code)]
163pub fn tokens_to_text(tokens: &[(Token, ByteRange<usize>)], source: &str) -> String {
164    let range = compute_bounding_box(tokens);
165    extract_text(range, source)
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    // Mock token provider for testing
    struct MockToken {
        tokens: Vec<(Token, ByteRange<usize>)>,
    }

    impl SourceTokenProvider for MockToken {
        fn source_tokens(&self) -> &[(Token, ByteRange<usize>)] {
            &self.tokens
        }
    }

    // --- compute_bounding_box ---

    // A single token's bounding box is its own range.
    #[test]
    fn test_compute_bounding_box_single_token() {
        let tokens = vec![(
            Token::Text("hello".to_string()),
            ByteRange { start: 0, end: 5 },
        )];
        let bbox = compute_bounding_box(&tokens);
        assert_eq!(bbox, 0..5);
    }

    // Adjacent tokens merge into one contiguous span.
    #[test]
    fn test_compute_bounding_box_multiple_contiguous() {
        let tokens = vec![
            (
                Token::Text("hello".to_string()),
                ByteRange { start: 0, end: 5 },
            ),
            (Token::Whitespace(1), ByteRange { start: 5, end: 6 }),
            (
                Token::Text("world".to_string()),
                ByteRange { start: 6, end: 11 },
            ),
        ];
        let bbox = compute_bounding_box(&tokens);
        assert_eq!(bbox, 0..11);
    }

    // Gaps between tokens are included in the bounding box.
    #[test]
    fn test_compute_bounding_box_non_contiguous() {
        // In case tokens have gaps (shouldn't happen normally, but test it)
        let tokens = vec![
            (
                Token::Text("hello".to_string()),
                ByteRange { start: 0, end: 5 },
            ),
            (
                Token::Text("world".to_string()),
                ByteRange { start: 10, end: 15 },
            ),
        ];
        let bbox = compute_bounding_box(&tokens);
        assert_eq!(bbox, 0..15);
    }

    // Empty input violates the documented precondition and must panic.
    #[test]
    #[should_panic(expected = "Cannot compute bounding box from empty token list")]
    fn test_compute_bounding_box_empty_panics() {
        let tokens: Vec<(Token, ByteRange<usize>)> = vec![];
        compute_bounding_box(&tokens);
    }

    // --- extract_text ---

    #[test]
    fn test_extract_text_simple() {
        let source = "hello world";
        assert_eq!(
            extract_text(ByteRange { start: 0, end: 5 }, source),
            "hello"
        );
        assert_eq!(
            extract_text(ByteRange { start: 6, end: 11 }, source),
            "world"
        );
    }

    // Ranges may span within any line of a multi-line source.
    #[test]
    fn test_extract_text_multiline() {
        let source = "line one\nline two\nline three";
        assert_eq!(
            extract_text(ByteRange { start: 0, end: 8 }, source),
            "line one"
        );
        assert_eq!(
            extract_text(ByteRange { start: 9, end: 17 }, source),
            "line two"
        );
    }

    // Ranges are byte offsets: multi-byte UTF-8 characters count per byte.
    #[test]
    fn test_extract_text_unicode() {
        let source = "hello 世界";
        // "世界" is 6 bytes (3 bytes per character)
        let text = extract_text(ByteRange { start: 6, end: 12 }, source);
        assert_eq!(text, "世界");
    }

    // --- unroll ---

    #[test]
    fn test_unroll_single_token() {
        let mock = MockToken {
            tokens: vec![(
                Token::Text("hello".to_string()),
                ByteRange { start: 0, end: 5 },
            )],
        };
        let unrolled = unroll(&[mock]);
        assert_eq!(unrolled.len(), 1);
        assert_eq!(unrolled[0].1, 0..5);
    }

    // Providers are concatenated in order, preserving their internal order.
    #[test]
    fn test_unroll_multiple_tokens() {
        let mock1 = MockToken {
            tokens: vec![(
                Token::Text("hello".to_string()),
                ByteRange { start: 0, end: 5 },
            )],
        };
        let mock2 = MockToken {
            tokens: vec![
                (Token::Whitespace(1), ByteRange { start: 5, end: 6 }),
                (
                    Token::Text("world".to_string()),
                    ByteRange { start: 6, end: 11 },
                ),
            ],
        };
        let unrolled = unroll(&[mock1, mock2]);
        assert_eq!(unrolled.len(), 3);
        assert_eq!(unrolled[0].1, 0..5);
        assert_eq!(unrolled[1].1, 5..6);
        assert_eq!(unrolled[2].1, 6..11);
    }

    // --- tokens_to_text ---

    // The bounding box covers 0..6 here, so the trailing space is included.
    #[test]
    fn test_tokens_to_text_convenience() {
        let source = "hello world";
        let tokens = vec![
            (
                Token::Text("hello".to_string()),
                ByteRange { start: 0, end: 5 },
            ),
            (Token::Whitespace(1), ByteRange { start: 5, end: 6 }),
        ];
        let text = tokens_to_text(&tokens, source);
        assert_eq!(text, "hello ");
    }

    // --- flatten_token_vecs ---

    #[test]
    fn test_flatten_token_vecs_empty() {
        let vecs: Vec<Vec<(Token, ByteRange<usize>)>> = vec![];
        let flattened = flatten_token_vecs(&vecs);
        assert_eq!(flattened.len(), 0);
    }

    #[test]
    fn test_flatten_token_vecs_single() {
        let vecs = vec![vec![
            (
                Token::Text("hello".to_string()),
                ByteRange { start: 0, end: 5 },
            ),
            (Token::Whitespace(1), ByteRange { start: 5, end: 6 }),
        ]];
        let flattened = flatten_token_vecs(&vecs);
        assert_eq!(flattened.len(), 2);
        assert_eq!(flattened[0].1, 0..5);
        assert_eq!(flattened[1].1, 5..6);
    }

    // Inner vectors are concatenated in order.
    #[test]
    fn test_flatten_token_vecs_multiple() {
        let vecs = vec![
            vec![(
                Token::Text("hello".to_string()),
                ByteRange { start: 0, end: 5 },
            )],
            vec![
                (Token::Whitespace(1), ByteRange { start: 5, end: 6 }),
                (
                    Token::Text("world".to_string()),
                    ByteRange { start: 6, end: 11 },
                ),
            ],
        ];
        let flattened = flatten_token_vecs(&vecs);
        assert_eq!(flattened.len(), 3);
        assert_eq!(flattened[0].1, 0..5);
        assert_eq!(flattened[1].1, 5..6);
        assert_eq!(flattened[2].1, 6..11);
    }
}