texlang/token/
trace.rs

1//! Tracing system for determining the origin of a token.
2//!
3//! This module implements a [Tracer] for tokens.
4//! When building helpful error messages we need to know the origin of tokens -
5//!     e.g., the file and line they came from.
6//! The tracing functionality here enables obtaining this information in the form of a [SourceCodeTrace].
7//!
8//! Rather than the custom system here,
9//!     a simpler solution would be to include this information on the token itself.
10//! The token could include a line position number and a reference
11//!     counting pointer to some data structure containing information about the line.
12//! The problem with this solution is that it makes the [Token] type very large, and
13//!     this causes unacceptably poor performance in Texlang's tight inner loops.
14//! With the tracer here, each token only needs to hold onto a 32-bit [Key] which is enough
15//!     to perform a full trace.
16//!
17//! # How the tracer works
18//!
19//! When adding source code to the input, the tracer is informed using the
20//!     [register_source_code](Tracer::register_source_code) method.
21//! The tracer allocates a contiguous range of [Keys](Key) that is large enough
22//!     to give each UTF-8 character in the input a unique key.
23//! These keys are returned using the opaque [KeyRange] type, which enables the caller to retrieve
24//!     these keys.
25//! It is assumed that the caller will assign keys in order to each UTF-8 character in the source code.
26//!
27//! In addition to returning the range, the tracer associates the key range with the source code in an
28//!     internal data structure.
29//!
30//! When tracing a token (using [trace](Tracer::trace)), the token's key is used to identify
31//!     which key range the key came from.
32//! This key range is then used to identify the source code associated to the token.
33//! The difference between the token's key and the first key for the source code is the UTF-8 offset
34//!     into the source code.
35//! Thus we can uniquely identify the UTF-8 character the token is a associated to.
36use crate::token::{CsNameInterner, Token, Value};
37use std::collections::BTreeMap;
38use std::ops::Bound::Included;
39use std::path::PathBuf;
40
41/// Key attached to tokens to enable tracing them.
42///
43/// This type is 32 bits.
44#[derive(Debug, PartialEq, Eq, Clone, Copy)]
45#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
46pub struct Key(u32);
47
48impl Key {
49    pub fn dummy() -> Key {
50        Key(u32::MAX)
51    }
52}
53
54/// Range of free keys that may be assigned to tokens.
55#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
56pub struct KeyRange {
57    next: u32,
58    limit: u32,
59}
60
61impl KeyRange {
62    /// Get the next trace [Key].
63    ///
64    /// Panics if all of the keys in this range have been used.
65    #[allow(clippy::should_implement_trait)]
66    pub fn next(&mut self) -> Key {
67        if self.next >= self.limit {
68            panic!["requested more trace keys than are in the range"]
69        }
70        let n = self.next;
71        self.next += 1;
72        Key(n)
73    }
74
75    /// Peek at the next trace [Key].
76    ///
77    /// Panics if all of the keys in this range have been used.
78    pub fn peek(&mut self) -> Key {
79        if self.next >= self.limit {
80            panic!["requested more trace keys than are in the range"]
81        }
82        Key(self.next)
83    }
84}
85
86impl KeyRange {
87    pub fn empty() -> KeyRange {
88        KeyRange { next: 0, limit: 0 }
89    }
90
91    #[cfg(test)]
92    pub fn for_testing() -> KeyRange {
93        KeyRange {
94            next: 0,
95            limit: u32::MAX,
96        }
97    }
98}
99
100/// A token trace
101#[derive(Debug, PartialEq, Eq)]
102pub struct SourceCodeTrace {
103    /// Name of the file this token came from.
104    pub file_name: PathBuf,
105    /// Content of the line this token came from.
106    pub line_content: String,
107    /// Number of the line within the file, starting at 1.
108    pub line_number: usize,
109    /// Index within the line that the token starts.
110    pub index: usize,
111    /// Value of the token.
112    pub value: String,
113    /// If this is for a token, the value of the token.
114    /// Otherwise this is an end of input snippet.
115    pub token: Option<Token>,
116}
117
118/// Data structure that records information for token tracing
119#[derive(Default)]
120#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
121pub struct Tracer {
122    checkpoints: BTreeMap<u32, Checkpoint>,
123    next_key: u32,
124
125    // A key use to get the last file that was inputted manually; i.e., not via an \input
126    // or other command in a TeX file
127    last_external_input: Option<u32>,
128}
129
130impl Tracer {
131    /// Registers source code with the tracer.
132    ///
133    /// The returned [KeyRange] should be used to assign [Keys](Key) to the tokens that
134    ///     are lexed from the referenced source code.
135    /// The tracer assumes that the first key is assigned to token corresponding to the
136    ///     first UTF-8 character in their source code,
137    ///     the second key to the second UTF-8 character, and so on.
138    pub fn register_source_code(
139        &mut self,
140        token: Option<Token>,
141        file_name: PathBuf,
142        source_code: &str,
143    ) -> KeyRange {
144        let len = match u32::try_from(source_code.len()) {
145            Err(_) => {
146                panic!(
147                    "source code too big ({} bytes); max is 2^32={} bytes",
148                    source_code.len(),
149                    u32::MAX
150                )
151            }
152            // For empty files, we still assign one key because this allows us to trace end of input errors.
153            Ok(0) => 1_u32,
154            Ok(limit) => limit,
155        };
156        let range = KeyRange {
157            next: self.next_key,
158            limit: self.next_key + len,
159        };
160        self.checkpoints.insert(
161            range.next,
162            Checkpoint::SourceCode {
163                file_name,
164                content: source_code.to_string(),
165            },
166        );
167        if token.is_none() {
168            self.last_external_input = Some(self.next_key);
169        }
170        self.next_key = range.limit;
171        range
172    }
173
174    /// Return a trace for the provided token.
175    pub fn trace(&self, token: Token, cs_name_interner: &CsNameInterner) -> SourceCodeTrace {
176        let value = match token.value() {
177            Value::ControlSequence(cs_name) => {
178                format!["\\{}", cs_name_interner.resolve(cs_name).unwrap()]
179            }
180            _ => token.char().unwrap().to_string(),
181        };
182
183        let (&first_key, checkpoint) = self
184            .checkpoints
185            .range((Included(&0), Included(&token.trace_key.0)))
186            .rev()
187            .next()
188            .unwrap();
189
190        match checkpoint {
191            Checkpoint::SourceCode { file_name, content } => {
192                let char_offset = (token.trace_key().0 - first_key) as usize;
193                let mut line_number = 1;
194                let mut byte_line_start = 0;
195                let mut char_line_start = 0;
196                for (char_index, (byte_index, c)) in content.char_indices().enumerate() {
197                    if char_index == char_offset {
198                        break;
199                    }
200                    if c == '\n' {
201                        byte_line_start = byte_index + 1;
202                        char_line_start = char_index + 1;
203                        line_number += 1;
204                    }
205                }
206                let position = char_offset - char_line_start;
207                let tail = &content[byte_line_start..];
208                let line_content = match tail.split_once('\n') {
209                    None => tail.to_string(),
210                    Some((a, _)) => a.to_string(),
211                };
212                SourceCodeTrace {
213                    file_name: file_name.clone(),
214                    line_content,
215                    line_number,
216                    index: position,
217                    value,
218                    token: Some(token),
219                }
220            }
221        }
222    }
223
224    pub fn trace_end_of_input(&self) -> SourceCodeTrace {
225        let f = self
226            .checkpoints
227            .get(&self.last_external_input.unwrap())
228            .unwrap();
229        match f {
230            Checkpoint::SourceCode { file_name, content } => {
231                // (line index, byte index of first character)
232                let mut last_line: (usize, usize) = (0, 0);
233                let mut last_non_empty_line: (usize, usize) = (0, 0);
234                for (i, c) in content.char_indices() {
235                    if !c.is_whitespace() {
236                        last_non_empty_line = last_line;
237                    } else if c == '\n' {
238                        last_line.0 += 1;
239                        last_line.1 = i + 1;
240                    }
241                }
242                let last_line = content[last_non_empty_line.1..].trim_end();
243                SourceCodeTrace {
244                    file_name: file_name.clone(),
245                    line_content: last_line.to_string(),
246                    line_number: last_non_empty_line.0 + 1,
247                    index: last_line.len(),
248                    value: " ".to_string(),
249                    token: None,
250                }
251            }
252        }
253    }
254}
255
256#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
257enum Checkpoint {
258    SourceCode { file_name: PathBuf, content: String },
259}
260
261#[cfg(test)]
262mod tests {
263    use super::*;
264
265    #[test]
266    fn one_source_code() {
267        let file_name: PathBuf = "input.tex".into();
268        let line_1 = "hël".to_string();
269        let line_2 = "wor\\cömmand".to_string();
270        let line_3 = "hël".to_string();
271        let source_code = format!("{}\n{}\n{}", line_1, line_2, line_3);
272
273        let mut tracer: Tracer = Default::default();
274        let mut interner: CsNameInterner = Default::default();
275        let command = interner.get_or_intern("command");
276        let mut range = tracer.register_source_code(None, file_name.clone(), &source_code);
277        let mut tokens = vec![
278            Token::new_letter('h', range.next()),
279            Token::new_letter('e', range.next()),
280            Token::new_letter('l', range.next()),
281            Token::new_space('\n', range.next()),
282            Token::new_letter('w', range.next()),
283            Token::new_letter('o', range.next()),
284            Token::new_letter('r', range.next()),
285            Token::new_control_sequence(command, range.next()),
286        ];
287        for _ in 0.."command".len() {
288            range.next();
289        }
290        let mut extra_tokens = vec![
291            Token::new_space('\n', range.next()),
292            Token::new_letter('h', range.next()),
293            Token::new_letter('e', range.next()),
294            Token::new_letter('l', range.next()),
295        ];
296        tokens.append(&mut extra_tokens);
297
298        let got_traces: Vec<SourceCodeTrace> = tokens
299            .iter()
300            .map(|token| tracer.trace(*token, &interner))
301            .collect();
302
303        let want_traces = vec![
304            SourceCodeTrace {
305                file_name: file_name.clone(),
306                line_content: line_1.clone(),
307                line_number: 1,
308                index: 0,
309                value: "h".to_string(),
310                token: Some(tokens[0]),
311            },
312            SourceCodeTrace {
313                file_name: file_name.clone(),
314                line_content: line_1.clone(),
315                line_number: 1,
316                index: 1,
317                value: "e".to_string(),
318                token: Some(tokens[1]),
319            },
320            SourceCodeTrace {
321                file_name: file_name.clone(),
322                line_content: line_1.clone(),
323                line_number: 1,
324                index: 2,
325                value: "l".to_string(),
326                token: Some(tokens[2]),
327            },
328            SourceCodeTrace {
329                file_name: file_name.clone(),
330                line_content: line_1.clone(),
331                line_number: 1,
332                index: 3,
333                value: "\n".to_string(),
334                token: Some(tokens[3]),
335            },
336            SourceCodeTrace {
337                file_name: file_name.clone(),
338                line_content: line_2.clone(),
339                line_number: 2,
340                index: 0,
341                value: "w".to_string(),
342                token: Some(tokens[4]),
343            },
344            SourceCodeTrace {
345                file_name: file_name.clone(),
346                line_content: line_2.clone(),
347                line_number: 2,
348                index: 1,
349                value: "o".to_string(),
350                token: Some(tokens[5]),
351            },
352            SourceCodeTrace {
353                file_name: file_name.clone(),
354                line_content: line_2.clone(),
355                line_number: 2,
356                index: 2,
357                value: "r".to_string(),
358                token: Some(tokens[6]),
359            },
360            SourceCodeTrace {
361                file_name: file_name.clone(),
362                line_content: line_2.clone(),
363                line_number: 2,
364                index: 3,
365                value: "\\command".to_string(),
366                token: Some(tokens[7]),
367            },
368            SourceCodeTrace {
369                file_name: file_name.clone(),
370                line_content: line_2.clone(),
371                line_number: 2,
372                index: 11,
373                value: "\n".to_string(),
374                token: Some(tokens[8]),
375            },
376            SourceCodeTrace {
377                file_name: file_name.clone(),
378                line_content: line_3.clone(),
379                line_number: 3,
380                index: 0,
381                value: "h".to_string(),
382                token: Some(tokens[9]),
383            },
384            SourceCodeTrace {
385                file_name: file_name.clone(),
386                line_content: line_3.clone(),
387                line_number: 3,
388                index: 1,
389                value: "e".to_string(),
390                token: Some(tokens[10]),
391            },
392            SourceCodeTrace {
393                file_name: file_name.clone(),
394                line_content: line_3.clone(),
395                line_number: 3,
396                index: 2,
397                value: "l".to_string(),
398                token: Some(tokens[11]),
399            },
400        ];
401        assert_eq!(want_traces, got_traces);
402    }
403
404    #[test]
405    fn multiple_source_code() {
406        let mut tokens = Vec::new();
407        let mut tracer: Tracer = Default::default();
408        let interner: CsNameInterner = Default::default();
409
410        let file_1: PathBuf = "a.tex".into();
411        let file_1_content = "a".to_string();
412        let mut range = tracer.register_source_code(None, file_1.clone(), &file_1_content);
413        tokens.push(Token::new_letter('a', range.next()));
414
415        let file_2: PathBuf = "b.tex".into();
416        let file_2_content = "b".to_string();
417        let mut range = tracer.register_source_code(None, file_2.clone(), &file_2_content);
418        tokens.push(Token::new_letter('b', range.next()));
419
420        let file_3: PathBuf = "c.tex".into();
421        let file_3_content = "c".to_string();
422        let mut range = tracer.register_source_code(None, file_3.clone(), &file_3_content);
423        tokens.push(Token::new_letter('c', range.next()));
424
425        let got_traces: Vec<SourceCodeTrace> = tokens
426            .iter()
427            .map(|token| tracer.trace(*token, &interner))
428            .collect();
429
430        let want_traces = vec![
431            SourceCodeTrace {
432                file_name: file_1,
433                line_content: file_1_content,
434                line_number: 1,
435                index: 0,
436                value: "a".to_string(),
437                token: Some(tokens[0]),
438            },
439            SourceCodeTrace {
440                file_name: file_2,
441                line_content: file_2_content,
442                line_number: 1,
443                index: 0,
444                value: "b".to_string(),
445                token: Some(tokens[1]),
446            },
447            SourceCodeTrace {
448                file_name: file_3,
449                line_content: file_3_content,
450                line_number: 1,
451                index: 0,
452                value: "c".to_string(),
453                token: Some(tokens[2]),
454            },
455        ];
456        assert_eq!(want_traces, got_traces);
457    }
458}