// oak_testing/lexing.rs

//! Lexer testing utilities for the Oak ecosystem.
//!
//! This module provides comprehensive testing infrastructure for lexers,
//! including file-based testing, expected output comparison, and
//! test result serialization.

use std::{
    path::{Path, PathBuf},
    sync::{Arc, Mutex},
    thread,
    time::{Duration, Instant},
};

use oak_core::{
    Language, Lexer, Source, TokenType,
    errors::{OakDiagnostics, OakError},
};
use serde::{Deserialize, Serialize};
use walkdir::WalkDir;

use crate::{create_file, json_from_path, source_from_path};

/// A lexer testing utility that can run tests against multiple files.
///
/// The `LexerTester` provides functionality to test lexers against a directory
/// of files with specific extensions, comparing actual output against expected
/// results stored in JSON files.
pub struct LexerTester {
    /// Root directory scanned recursively for test input files.
    root: PathBuf,
    /// File extensions (without the leading dot) that select test inputs.
    extensions: Vec<String>,
    /// Time budget for lexing a single file.
    timeout: Duration,
}

33/// Expected lexer test results for comparison.
34///
35/// This struct represents the expected output of a lexer test, including
36/// success status, token count, token data, and any expected errors.
37#[derive(Debug, PartialEq, Serialize, Deserialize)]
38pub struct LexerTestExpected {
39    /// Whether the lexing was expected to succeed.
40    pub success: bool,
41    /// The expected number of tokens.
42    pub count: usize,
43    /// The expected token data.
44    pub tokens: Vec<TokenData>,
45    /// Any expected error messages.
46    pub errors: Vec<String>,
47}
48
49/// Individual token data for lexer testing.
50///
51/// Represents a single token with its kind, text content, and position
52/// information used for testing lexer output.
53#[derive(Debug, PartialEq, Serialize, Deserialize)]
54pub struct TokenData {
55    /// The kind of the token as a string.
56    pub kind: String,
57    /// The text content of the token.
58    pub text: String,
59    /// The start position of the token in the source.
60    pub start: usize,
61    /// The end position of the token in the source.
62    pub end: usize,
63}
64
impl LexerTester {
    /// Creates a new lexer tester with the specified root directory.
    ///
    /// No extensions are registered initially; the per-file timeout
    /// defaults to 10 seconds.
    pub fn new<P: AsRef<Path>>(root: P) -> Self {
        Self { root: root.as_ref().to_path_buf(), extensions: vec![], timeout: Duration::from_secs(10) }
    }

71    /// Adds a file extension to test against.
72    pub fn with_extension(mut self, extension: impl ToString) -> Self {
73        self.extensions.push(extension.to_string());
74        self
75    }
76
77    /// Sets the timeout duration for each test.
78    pub fn with_timeout(mut self, time: Duration) -> Self {
79        self.timeout = time;
80        self
81    }
82
83    /// Run tests for the given lexer against all files in the root directory with the specified extensions.
84    pub fn run_tests<L, Lex>(self, lexer: &Lex) -> Result<(), OakError>
85    where
86        L: Language + Send + Sync,
87        L::TokenType: Serialize + std::fmt::Debug + Send + Sync,
88        Lex: Lexer<L> + Send + Sync + Clone,
89    {
90        let test_files = self.find_test_files()?;
91        let force_regenerated = std::env::var("REGENERATE_TESTS").unwrap_or("0".to_string()) == "1";
92        let mut regenerated_any = false;
93
94        for file_path in test_files {
95            println!("Testing file: {}", file_path.display());
96            regenerated_any |= self.test_single_file::<L, Lex>(&file_path, lexer, force_regenerated)?
97        }
98
99        if regenerated_any && force_regenerated {
100            println!("Tests regenerated for: {}", self.root.display());
101            Ok(())
102        }
103        else {
104            Ok(())
105        }
106    }
107
108    fn find_test_files(&self) -> Result<Vec<PathBuf>, OakError> {
109        let mut files = Vec::new();
110
111        for entry in WalkDir::new(&self.root) {
112            let entry = entry.unwrap();
113            let path = entry.path();
114
115            if path.is_file() {
116                if let Some(ext) = path.extension() {
117                    let ext_str = ext.to_str().unwrap_or("");
118                    if self.extensions.iter().any(|e| e == ext_str) {
119                        // Ignore output files generated by the Tester itself to prevent recursive inclusion
120                        let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
121                        let is_output_file = file_name.ends_with(".parsed.json") || file_name.ends_with(".lexed.json") || file_name.ends_with(".built.json") || file_name.ends_with(".expected.json");
122
123                        if !is_output_file {
124                            files.push(path.to_path_buf())
125                        }
126                    }
127                }
128            }
129        }
130
131        Ok(files)
132    }
133
134    fn test_single_file<L, Lex>(&self, file_path: &Path, lexer: &Lex, force_regenerated: bool) -> Result<bool, OakError>
135    where
136        L: Language + Send + Sync,
137        L::TokenType: Serialize + std::fmt::Debug + Send + Sync,
138        Lex: Lexer<L> + Send + Sync + Clone,
139    {
140        let source = source_from_path(file_path)?;
141
142        // Use Arc and Mutex to share results between threads
143        let result = Arc::new(Mutex::new(None));
144        let result_clone = Arc::clone(&result);
145
146        // Clone lexer for use in thread
147        let lexer_clone = lexer.clone();
148        // Wrap source in Arc for sharing between threads
149        let source_arc = Arc::new(source);
150        let source_clone = Arc::clone(&source_arc);
151
152        // Create a new thread to raise lexical analysis
153        std::thread::scope(|s| {
154            let handle = s.spawn(move || {
155                let mut cache = oak_core::parser::ParseSession::<L>::default();
156                let output = lexer_clone.lex(&*source_clone, &[], &mut cache);
157                let mut result = result_clone.lock().unwrap();
158                *result = Some(output)
159            });
160
161            // Wait for thread completion or timeout
162            let start_time = Instant::now();
163            let timeout_occurred = loop {
164                // Check if thread has finished
165                if handle.is_finished() {
166                    break false;
167                }
168
169                // Check for timeout
170                if start_time.elapsed() > self.timeout {
171                    break true;
172                }
173
174                // Sleep briefly to avoid busy waiting
175                thread::sleep(Duration::from_millis(10));
176            };
177
178            // Return error if timed out
179            if timeout_occurred {
180                return Err(OakError::custom_error(&format!("Lexer test timed out after {:?} for file: {}", self.timeout, file_path.display())));
181            }
182
183            Ok(())
184        })?;
185
186        // Get lexical analysis result
187        let OakDiagnostics { result: tokens_result, mut diagnostics } = {
188            let result_guard = result.lock().unwrap();
189            match result_guard.as_ref() {
190                Some(output) => output.clone(),
191                None => return Err(OakError::custom_error("Failed to get lexer result")),
192            }
193        };
194
195        // Construct test result
196        let mut success = true;
197        let tokens = match tokens_result {
198            Ok(tokens) => tokens,
199            Err(e) => {
200                success = false;
201                diagnostics.push(e);
202                oak_core::Tokens::default()
203            }
204        };
205
206        if !diagnostics.is_empty() {
207            success = false;
208        }
209
210        let tokens: Vec<TokenData> = tokens
211            .iter()
212            .filter(|token| !token.kind.is_ignored())
213            .map(|token| {
214                let len = source_arc.as_ref().length();
215                let start = token.span.start.min(len);
216                let end = token.span.end.min(len).max(start);
217                let text = source_arc.as_ref().get_text_in((start..end).into()).to_string();
218                TokenData { kind: format!("{:?}", token.kind), text, start: token.span.start, end: token.span.end }
219            })
220            .collect();
221
222        let errors: Vec<String> = diagnostics.iter().map(|e| e.to_string()).collect();
223        let test_result = LexerTestExpected { success, count: tokens.len(), tokens, errors };
224
225        // Process expected result file
226        let expected_file = file_path.with_extension(format!("{}.lexed.json", file_path.extension().unwrap_or_default().to_str().unwrap_or("")));
227
228        // Migration: If the new naming convention file doesn't exist, but the old one does, rename it
229        if !expected_file.exists() {
230            let legacy_file = file_path.with_extension("expected.json");
231            if legacy_file.exists() {
232                let _ = std::fs::rename(&legacy_file, &expected_file);
233            }
234        }
235
236        let mut regenerated = false;
237        if expected_file.exists() && !force_regenerated {
238            let expected_json = json_from_path(&expected_file)?;
239            let expected: LexerTestExpected = serde_json::from_value(expected_json).map_err(|e| OakError::custom_error(e.to_string()))?;
240            if test_result != expected {
241                return Err(OakError::test_failure(file_path.to_path_buf(), format!("{:#?}", expected), format!("{:#?}", test_result)));
242            }
243        }
244        else {
245            use std::io::Write;
246            let mut file = create_file(&expected_file)?;
247            let json_val = serde_json::to_string_pretty(&test_result).map_err(|e| OakError::custom_error(e.to_string()))?;
248            file.write_all(json_val.as_bytes()).map_err(|e| OakError::custom_error(e.to_string()))?;
249            regenerated = true;
250        }
251
252        Ok(regenerated)
253    }
254}