Skip to main content

lex_core/lex/token/normalization/
utilities.rs

1//! Token processing utilities for the Immutable Log Architecture
2//!
3//! This module provides the core utilities for token manipulation.
4//! All functions here are pure and thoroughly unit tested.
5//!
6//! # Architecture
7//!
8//! The Logos lexer produces `(Token, Range<usize>)` pairs - this is the ground truth.
9//! Transformations create aggregate tokens that store these original pairs in `source_tokens`.
10//! This module provides utilities to:
11//! 1. Unroll aggregate tokens back to flat lists
12//! 2. Flatten token vectors
13//! 3. Compute bounding boxes from token ranges
14//! 4. Extract text from ranges
15//!
16//! Note: Conversion from byte ranges to AST Range is handled in the `location` module.
17
18use crate::lex::token::core::Token;
19use std::ops::Range as ByteRange;
20
/// Trait that any token structure can implement to provide access to source tokens.
///
/// This enables the unrolling system to work with any parser's token
/// representation: `unroll` only needs this one accessor, not a concrete type.
#[allow(dead_code)]
pub trait SourceTokenProvider {
    /// Get the original Logos tokens that comprise this token.
    ///
    /// For atomic tokens (direct from Logos), this returns a slice containing just that token.
    /// For aggregate tokens (from transformations), this returns all the original tokens.
    /// The returned pairs are the ground-truth `(Token, Range<usize>)` values
    /// produced by the lexer.
    fn source_tokens(&self) -> &[(Token, ByteRange<usize>)];
}
32
33/// Unroll a collection of tokens to a flat list of original Logos tokens.
34///
35/// This recursively extracts all `source_tokens` from aggregate structures,
36/// returning a flat list of the original `(Token, Range<usize>)` pairs that
37/// came directly from the Logos lexer.
38///
39/// # Example
40///
41/// ```rust,ignore
42/// let line_tokens: Vec<LineToken> = /* ... parsed tokens ... */;
43/// let flat_tokens = unroll(&line_tokens);
44/// // flat_tokens now contains all original Logos tokens
45/// ```
46#[allow(dead_code)]
47pub fn unroll<T: SourceTokenProvider>(tokens: &[T]) -> Vec<(Token, ByteRange<usize>)> {
48    tokens
49        .iter()
50        .flat_map(|t| t.source_tokens().iter().cloned())
51        .collect()
52}
53
54/// Flatten a collection of token vectors into a single flat list.
55///
56/// This is useful for token types (like LineToken) that provide source tokens
57/// as owned Vec rather than borrowed slices. It simply concatenates all the vectors.
58///
59/// # Example
60///
61/// ```rust,ignore
62/// let line_tokens: Vec<LineToken> = /* ... parsed tokens ... */;
63/// let token_vecs: Vec<Vec<(Token, Range<usize>)>> = line_tokens.iter()
64///     .map(|lt| lt.source_token_pairs())
65///     .collect();
66/// let flat_tokens = flatten_token_vecs(&token_vecs);
67/// ```
68#[allow(dead_code)]
69pub fn flatten_token_vecs(
70    token_vecs: &[Vec<(Token, ByteRange<usize>)>],
71) -> Vec<(Token, ByteRange<usize>)> {
72    token_vecs.iter().flat_map(|v| v.iter().cloned()).collect()
73}
74
75/// Compute the bounding box (minimum start, maximum end) from a list of tokens.
76///
77/// Returns the smallest `Range<usize>` that encompasses all token ranges.
78/// Returns `0..0` if the token list is empty.
79///
80/// # Example
81///
82/// ```rust,ignore
83/// let tokens = vec![
84///     (Token::Text("hello".into()), 0..5),
85///     (Token::Whitespace(1), 5..6),
86///     (Token::Text("world".into()), 6..11),
87/// ];
88/// let bbox = compute_bounding_box(&tokens);
89/// assert_eq!(bbox, 0..11);
90/// ```
91pub fn compute_bounding_box(tokens: &[(Token, ByteRange<usize>)]) -> ByteRange<usize> {
92    if tokens.is_empty() {
93        return 0..0;
94    }
95
96    let min_start = tokens
97        .iter()
98        .map(|(_, range)| range.start)
99        .min()
100        .unwrap_or(0);
101    let max_end = tokens.iter().map(|(_, range)| range.end).max().unwrap_or(0);
102
103    min_start..max_end
104}
105
/// Extract text from the source string at the given range.
///
/// # Arguments
///
/// * `range` - The byte offset range to extract
/// * `source` - The original source string
///
/// # Panics
///
/// Panics if `range` is out of bounds for `source` or does not fall on
/// UTF-8 character boundaries (standard string slice-indexing behavior).
///
/// # Example
///
/// ```rust,ignore
/// let text = extract_text(0..5, "hello world");
/// assert_eq!(text, "hello");
/// ```
pub fn extract_text(range: ByteRange<usize>, source: &str) -> String {
    let slice: &str = &source[range];
    String::from(slice)
}
122
/// Compute the 0-indexed column number for a given byte offset in the source string.
///
/// The column is measured in bytes from the start of the line containing
/// `offset` (i.e. from just past the most recent `'\n'` before `offset`).
///
/// # Arguments
///
/// * `offset` - The byte offset to get the column for
/// * `source` - The original source string
///
/// # Example
///
/// ```rust,ignore
/// let source = "hello\n world";
/// // The space after the newline is at byte offset 6; 'w' is at offset 7.
/// // (An earlier version of this doc placed 'w' at 6, which was wrong.)
/// let col = compute_column(7, source);
/// assert_eq!(col, 1);
/// ```
pub fn compute_column(offset: usize, source: &str) -> usize {
    // '\n' is ASCII, so scanning raw bytes is safe even in multi-byte UTF-8
    // text: a newline byte can never occur inside a multi-byte character.
    let mut line_start = 0;
    for (i, byte) in source.bytes().enumerate() {
        if i >= offset {
            break;
        }
        if byte == b'\n' {
            line_start = i + 1;
        }
    }
    offset - line_start
}
150
151/// High-level convenience: extract text directly from tokens.
152///
153/// This combines `compute_bounding_box` and `extract_text` for convenience.
154///
155/// # Panics
156///
157/// Panics if tokens is empty.
158#[allow(dead_code)]
159pub fn tokens_to_text(tokens: &[(Token, ByteRange<usize>)], source: &str) -> String {
160    let range = compute_bounding_box(tokens);
161    extract_text(range, source)
162}
163
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal `SourceTokenProvider` used to exercise `unroll` without a real parser type.
    struct MockToken {
        tokens: Vec<(Token, ByteRange<usize>)>,
    }

    impl SourceTokenProvider for MockToken {
        fn source_tokens(&self) -> &[(Token, ByteRange<usize>)] {
            &self.tokens
        }
    }

    /// Shorthand: a `Token::Text` pair with the given byte range.
    fn text(s: &str, range: ByteRange<usize>) -> (Token, ByteRange<usize>) {
        (Token::Text(s.to_string()), range)
    }

    /// Shorthand: a single-space `Token::Whitespace` pair.
    fn ws(range: ByteRange<usize>) -> (Token, ByteRange<usize>) {
        (Token::Whitespace(1), range)
    }

    #[test]
    fn test_compute_bounding_box_single_token() {
        assert_eq!(compute_bounding_box(&[text("hello", 0..5)]), 0..5);
    }

    #[test]
    fn test_compute_bounding_box_multiple_contiguous() {
        let tokens = [text("hello", 0..5), ws(5..6), text("world", 6..11)];
        assert_eq!(compute_bounding_box(&tokens), 0..11);
    }

    #[test]
    fn test_compute_bounding_box_non_contiguous() {
        // Gaps between tokens shouldn't happen normally, but must still work.
        let tokens = [text("hello", 0..5), text("world", 10..15)];
        assert_eq!(compute_bounding_box(&tokens), 0..15);
    }

    #[test]
    fn test_compute_bounding_box_empty_returns_zero_range() {
        assert_eq!(compute_bounding_box(&[]), 0..0);
    }

    #[test]
    fn test_extract_text_simple() {
        let source = "hello world";
        assert_eq!(extract_text(0..5, source), "hello");
        assert_eq!(extract_text(6..11, source), "world");
    }

    #[test]
    fn test_extract_text_multiline() {
        let source = "line one\nline two\nline three";
        assert_eq!(extract_text(0..8, source), "line one");
        assert_eq!(extract_text(9..17, source), "line two");
    }

    #[test]
    fn test_extract_text_unicode() {
        // "世界" occupies bytes 6..12 (3 bytes per character).
        assert_eq!(extract_text(6..12, "hello 世界"), "世界");
    }

    #[test]
    fn test_unroll_single_token() {
        let mock = MockToken {
            tokens: vec![text("hello", 0..5)],
        };
        let unrolled = unroll(&[mock]);
        assert_eq!(unrolled.len(), 1);
        assert_eq!(unrolled[0].1, 0..5);
    }

    #[test]
    fn test_unroll_multiple_tokens() {
        let mock1 = MockToken {
            tokens: vec![text("hello", 0..5)],
        };
        let mock2 = MockToken {
            tokens: vec![ws(5..6), text("world", 6..11)],
        };
        let unrolled = unroll(&[mock1, mock2]);
        assert_eq!(unrolled.len(), 3);
        let ranges: Vec<_> = unrolled.iter().map(|(_, r)| r.clone()).collect();
        assert_eq!(ranges, vec![0..5, 5..6, 6..11]);
    }

    #[test]
    fn test_tokens_to_text_convenience() {
        let tokens = [text("hello", 0..5), ws(5..6)];
        assert_eq!(tokens_to_text(&tokens, "hello world"), "hello ");
    }

    #[test]
    fn test_flatten_token_vecs_empty() {
        assert!(flatten_token_vecs(&[]).is_empty());
    }

    #[test]
    fn test_flatten_token_vecs_single() {
        let vecs = vec![vec![text("hello", 0..5), ws(5..6)]];
        let flattened = flatten_token_vecs(&vecs);
        assert_eq!(flattened.len(), 2);
        assert_eq!(flattened[0].1, 0..5);
        assert_eq!(flattened[1].1, 5..6);
    }

    #[test]
    fn test_flatten_token_vecs_multiple() {
        let vecs = vec![
            vec![text("hello", 0..5)],
            vec![ws(5..6), text("world", 6..11)],
        ];
        let flattened = flatten_token_vecs(&vecs);
        assert_eq!(flattened.len(), 3);
        let ranges: Vec<_> = flattened.iter().map(|(_, r)| r.clone()).collect();
        assert_eq!(ranges, vec![0..5, 5..6, 6..11]);
    }
}
357}