oak_core/helpers/lexing.rs

//! Lexer testing utilities for the Oak parsing framework.
//!
//! This module provides comprehensive testing infrastructure for lexers,
//! including file-based testing, expected output comparison, and
//! test result serialization.

use crate::{
    Language, Lexer, TokenType,
    errors::{OakDiagnostics, OakError},
    helpers::{create_file, json_from_path, source_from_path},
    source::Source,
};
use serde::{Deserialize, Serialize};
use serde_json::{Serializer, ser::PrettyFormatter};
use std::{
    path::{Path, PathBuf},
    sync::{Arc, Mutex},
    thread,
    time::{Duration, Instant},
};
use walkdir::WalkDir;

/// A lexer testing utility that can run tests against multiple files.
///
/// The `LexerTester` provides functionality to test lexers against a directory
/// of files with specific extensions, comparing actual output against expected
/// results stored in JSON files.
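///
/// # Examples
///
/// A minimal sketch of the builder-style configuration (the `"tests/lexer"`
/// directory and `"oak"` extension are placeholder values):
///
/// ```ignore
/// use std::time::Duration;
///
/// let tester = LexerTester::new("tests/lexer")
///     .with_extension("oak")
///     .with_timeout(Duration::from_secs(5));
/// ```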
pub struct LexerTester {
    root: PathBuf,
    extensions: Vec<String>,
    timeout: Duration,
}

/// Expected lexer test results for comparison.
///
/// This struct represents the expected output of a lexer test, including
/// success status, token count, token data, and any expected errors.
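///
/// On disk, a `.lexed.json` snapshot serializes to something like this
/// (illustrative values, not taken from a real run):
///
/// ```json
/// {
///     "success": true,
///     "count": 1,
///     "tokens": [
///         { "kind": "Identifier", "text": "foo", "start": 0, "end": 3 }
///     ],
///     "errors": []
/// }
/// ```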
#[derive(Debug, Serialize, Deserialize, PartialEq)]
pub struct LexerTestExpected {
    success: bool,
    count: usize,
    tokens: Vec<TokenData>,
    errors: Vec<String>,
}

/// Individual token data for lexer testing.
///
/// Represents a single token with its kind, text content, and position
/// information used for testing lexer output.
#[derive(Debug, Serialize, Deserialize, PartialEq)]
pub struct TokenData {
    kind: String,
    text: String,
    start: usize,
    end: usize,
}

impl LexerTester {
    /// Creates a new lexer tester with the specified root directory.
    pub fn new<P: AsRef<Path>>(root: P) -> Self {
        Self { root: root.as_ref().to_path_buf(), extensions: vec![], timeout: Duration::from_secs(10) }
    }

    /// Adds a file extension to test against.
    pub fn with_extension(mut self, extension: impl ToString) -> Self {
        self.extensions.push(extension.to_string());
        self
    }

    /// Sets the timeout duration for each test.
    pub fn with_timeout(mut self, time: Duration) -> Self {
        self.timeout = time;
        self
    }

    /// Runs the given lexer against every file under the root directory that
    /// matches one of the registered extensions.
    ///
    /// Each file's output is compared against a sibling `.lexed.json` snapshot.
    /// Setting the `REGENERATE_TESTS` environment variable to `1` rewrites the
    /// snapshots instead of comparing against them.
    ///
    /// # Arguments
    ///
    /// * `lexer`: The lexer to test.
    ///
    /// # Examples
    ///
    /// A minimal sketch, where `MyLang` and `MyLexer` stand in for concrete
    /// `Language` and `Lexer` implementations:
    ///
    /// ```ignore
    /// let lexer = MyLexer::default();
    /// LexerTester::new("tests/lexer").with_extension("oak").run_tests::<MyLang, _>(&lexer)?;
    /// ```
    pub fn run_tests<L, Lex>(self, lexer: &Lex) -> Result<(), OakError>
    where
        L: Language + Send + Sync + 'static,
        L::TokenType: Serialize + std::fmt::Debug + Send + Sync,
        Lex: Lexer<L> + Send + Sync + 'static + Clone,
    {
        let test_files = self.find_test_files()?;
        let force_regenerated = std::env::var("REGENERATE_TESTS").as_deref() == Ok("1");
        let mut regenerated_any = false;

        for file_path in test_files {
            println!("Testing file: {}", file_path.display());
            regenerated_any |= self.test_single_file::<L, Lex>(&file_path, lexer, force_regenerated)?;
        }

        if regenerated_any && force_regenerated { Err(OakError::test_regenerated(self.root)) } else { Ok(()) }
    }

    fn find_test_files(&self) -> Result<Vec<PathBuf>, OakError> {
        let mut files = Vec::new();

        for entry in WalkDir::new(&self.root) {
            let entry = entry.unwrap();
            let path = entry.path();

            if path.is_file() {
                if let Some(ext) = path.extension() {
                    let ext_str = ext.to_str().unwrap_or("");
                    if self.extensions.iter().any(|e| e == ext_str) {
                        // Skip output files generated by the tester itself, so
                        // they are never picked up recursively as test inputs.
                        let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
                        let is_output_file = file_name.ends_with(".parsed.json") || file_name.ends_with(".lexed.json") || file_name.ends_with(".built.json");

                        if !is_output_file {
                            files.push(path.to_path_buf());
                        }
                    }
                }
            }
        }

        Ok(files)
    }

    fn test_single_file<L, Lex>(&self, file_path: &Path, lexer: &Lex, force_regenerated: bool) -> Result<bool, OakError>
    where
        L: Language + Send + Sync + 'static,
        L::TokenType: Serialize + std::fmt::Debug + Send + Sync,
        Lex: Lexer<L> + Send + Sync + 'static + Clone,
    {
        let source = source_from_path(file_path)?;

        // Use Arc and Mutex to share results between threads
        let result = Arc::new(Mutex::new(None));
        let result_clone = Arc::clone(&result);

        // Clone lexer for use in thread
        let lexer_clone = lexer.clone();
        // Wrap source in Arc for sharing between threads
        let source_arc = Arc::new(source);
        let source_clone = Arc::clone(&source_arc);

        // Create a new thread to perform lexical analysis
        let handle = thread::spawn(move || {
            let mut cache = crate::parser::ParseSession::<L>::default();
            let output = lexer_clone.lex(&*source_clone, &[], &mut cache);
            let mut result = result_clone.lock().unwrap();
            *result = Some(output);
        });

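        // Note: Rust offers no way to cancel a running thread, so the timeout
        // below is detected by polling `is_finished`; on timeout the worker
        // thread is abandoned and left to finish in the background.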
        // Wait for thread completion or timeout
        let start_time = Instant::now();
        let timeout_occurred = loop {
            // Check if thread has finished
            if handle.is_finished() {
                break false;
            }

            // Check for timeout
            if start_time.elapsed() > self.timeout {
                break true;
            }

            // Sleep briefly to avoid busy waiting
            thread::sleep(Duration::from_millis(10));
        };

        // Return error if timed out
        if timeout_occurred {
            return Err(OakError::custom_error(&format!("Lexer test timed out after {:?} for file: {}", self.timeout, file_path.display())));
        }

        // Get lexical analysis result
        let OakDiagnostics { result: tokens_result, mut diagnostics } = {
            let result_guard = result.lock().unwrap();
            match result_guard.as_ref() {
                Some(output) => output.clone(),
                None => return Err(OakError::custom_error("Failed to get lexer result")),
            }
        };

        // Construct test result
        let mut success = true;
        let tokens = match tokens_result {
            Ok(tokens) => tokens,
            Err(e) => {
                success = false;
                diagnostics.push(e);
                triomphe::Arc::from_iter(Vec::new())
            }
        };

        if !diagnostics.is_empty() {
            success = false;
        }

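        // Spans are clamped to the source length before slicing, so a lexer that
        // emits an out-of-range span produces a snapshot diff rather than a panic;
        // the recorded `start`/`end` keep the raw span values. Only the first 100
        // non-ignored tokens are captured to keep snapshots small.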
        let tokens: Vec<TokenData> = tokens
            .iter()
            .filter(|token| !token.kind.is_ignored())
            .map(|token| {
                let len = source_arc.as_ref().length();
                let start = token.span.start.min(len);
                let end = token.span.end.min(len).max(start);
                let text = source_arc.as_ref().get_text_in((start..end).into()).to_string();
                TokenData { kind: format!("{:?}", token.kind), text, start: token.span.start, end: token.span.end }
            })
            .take(100)
            .collect();

        let errors: Vec<String> = diagnostics.iter().map(|e| e.to_string()).collect();
        let test_result = LexerTestExpected { success, count: tokens.len(), tokens, errors };

        // Process expected result file
        let expected_file = file_path.with_extension(format!("{}.lexed.json", file_path.extension().unwrap_or_default().to_str().unwrap_or("")));

        let mut regenerated = false;
        if expected_file.exists() && !force_regenerated {
            let expected: LexerTestExpected = json_from_path(&expected_file)?;

            if test_result != expected {
                return Err(OakError::test_failure(file_path.to_path_buf(), format!("{:#?}", expected), format!("{:#?}", test_result)));
            }
        }
        else {
            let file = create_file(&expected_file)?;
            let mut writer = Serializer::with_formatter(file, PrettyFormatter::with_indent(b"    "));
            test_result.serialize(&mut writer)?;

            if force_regenerated {
                regenerated = true;
            }
            else {
                return Err(OakError::test_regenerated(expected_file));
            }
        }

        Ok(regenerated)
    }
}