mati_core/analysis/parser/
mod.rs1mod c;
15mod cpp;
16mod elixir;
17mod go;
18mod haskell;
19pub mod import;
20mod java;
21mod python;
22mod ruby;
23mod rust;
24mod scala;
25mod typescript;
26
27use std::collections::HashMap;
28
29use anyhow::Result;
30use rayon::prelude::*;
31use sha2::{Digest, Sha256};
32
33use crate::analysis::walker::{Language, WalkedFile};
34use crate::store::record::{TodoComment, TodoKind};
35
36pub use import::{ImportKind, ImportStatement};
37
/// Everything the static parsers record about one source file.
///
/// Language-specific parsers fill in the extracted items; `analyze_file_bytes`
/// then stamps `content_hash` and `line_count` from the raw bytes.
/// [`StaticFileAnalysis::empty`] builds the all-blank value used when a file
/// is unsupported, unreadable, or failed to parse.
#[derive(Debug, Clone)]
pub struct StaticFileAnalysis {
    /// Path of the analysed file (`empty` copies it from `WalkedFile::rel_path`).
    pub path: String,
    /// Language the walker assigned to the file.
    pub language: Language,
    /// Names of entry points found in the file.
    /// NOTE(review): what counts as an entry point is decided by each language parser.
    pub entry_points: Vec<String>,
    /// Names of types the file exports (presumably public types; confirm per parser).
    pub exported_types: Vec<String>,
    /// Import statements found in the file.
    pub imports: Vec<ImportStatement>,
    /// Marker comments collected from the file (see `extract_todo`).
    pub todos: Vec<TodoComment>,
    /// Count of `unsafe` usages (presumably blocks/items; confirm in the parsers).
    pub unsafe_count: u32,
    /// Count of `unwrap`-style calls (exact matching rules live in the parsers).
    pub unwrap_count: u32,
    /// Count of panic sites (exact matching rules live in the parsers).
    pub panic_count: u32,
    /// Count of branching constructs (presumably a complexity signal; confirm).
    pub branch_count: u32,
    /// Module-level documentation, when the parser found any.
    pub module_doc: Option<String>,
    /// Lowercase hex SHA-256 of the file's raw bytes; `None` when the file was
    /// never read (see `empty`).
    pub content_hash: Option<String>,
    /// Number of lines in the file as computed by `count_lines`; `0` when the
    /// file was never read.
    pub line_count: u32,
}
74
75impl StaticFileAnalysis {
76 pub(crate) fn empty(file: &WalkedFile) -> Self {
77 Self {
78 path: file.rel_path.clone(),
79 language: file.language,
80 entry_points: Vec::new(),
81 exported_types: Vec::new(),
82 imports: Vec::new(),
83 todos: Vec::new(),
84 unsafe_count: 0,
85 unwrap_count: 0,
86 panic_count: 0,
87 branch_count: 0,
88 module_doc: None,
89 content_hash: None,
90 line_count: 0,
91 }
92 }
93}
94
95pub fn parse_file(file: &WalkedFile) -> Result<StaticFileAnalysis> {
104 if !is_parseable_language(file.language) {
106 return Ok(StaticFileAnalysis::empty(file));
107 }
108 let bytes = match read_source_bytes(file) {
109 Some(b) => b,
110 None => return Ok(StaticFileAnalysis::empty(file)),
111 };
112 analyze_file_bytes(file, &bytes)
113}
114
115pub fn parse_files_parallel(files: &[WalkedFile]) -> Vec<StaticFileAnalysis> {
120 files
121 .par_iter()
122 .map(|f| {
123 parse_file(f).unwrap_or_else(|e| {
124 tracing::warn!("parser: unexpected error on {}: {e}", f.rel_path);
125 StaticFileAnalysis::empty(f)
126 })
127 })
128 .collect()
129}
130
/// Result of [`hash_and_parse_parallel`].
pub struct HashParseOutput {
    /// Clones of the input files that were (re)analysed; index-aligned with
    /// `analyses`.
    pub parsed_files: Vec<WalkedFile>,
    /// One analysis per entry of `parsed_files`, in the same order.
    pub analyses: Vec<StaticFileAnalysis>,
    /// `rel_path -> mtime_secs` for every file in `parsed_files`.
    /// NOTE(review): presumably merged into the stored mtimes by the caller; confirm.
    pub new_mtimes: HashMap<String, u64>,
    /// Number of files in `parsed_files`.
    pub parse_count: usize,
    /// Number of files skipped because their stored mtime matched.
    pub skipped_count: usize,
}
145
146pub fn hash_and_parse_parallel(
156 files: &[WalkedFile],
157 stored_mtimes: &HashMap<String, u64>,
158) -> HashParseOutput {
159 enum Slot {
160 Changed(Box<(WalkedFile, StaticFileAnalysis)>),
161 Unchanged,
162 }
163
164 let slots: Vec<Option<Slot>> = files
165 .par_iter()
166 .map(|f| {
167 if f.mtime_secs != 0 && stored_mtimes.get(&f.rel_path) == Some(&f.mtime_secs) {
169 return Some(Slot::Unchanged);
170 }
171 if !is_parseable_language(f.language) {
173 return Some(Slot::Changed(Box::new((
174 f.clone(),
175 StaticFileAnalysis::empty(f),
176 ))));
177 }
178 let bytes = match std::fs::read(&f.abs_path) {
180 Ok(b) => b,
181 Err(_) => return None, };
183 let analysis = analyze_file_bytes(f, &bytes).unwrap_or_else(|e| {
184 tracing::warn!("parser: error on {}: {e}", f.rel_path);
185 StaticFileAnalysis::empty(f)
186 });
187 Some(Slot::Changed(Box::new((f.clone(), analysis))))
188 })
189 .collect();
190
191 let mut parsed_files = Vec::new();
192 let mut analyses = Vec::new();
193 let mut new_mtimes = HashMap::new();
194 let mut skipped_count = 0usize;
195
196 for slot in slots.into_iter().flatten() {
197 match slot {
198 Slot::Changed(boxed) => {
199 let (file, analysis) = *boxed;
200 new_mtimes.insert(file.rel_path.clone(), file.mtime_secs);
201 parsed_files.push(file);
202 analyses.push(analysis);
203 }
204 Slot::Unchanged => skipped_count += 1,
205 }
206 }
207
208 let parse_count = parsed_files.len();
209 HashParseOutput {
210 parsed_files,
211 analyses,
212 new_mtimes,
213 parse_count,
214 skipped_count,
215 }
216}
217
218fn is_parseable_language(language: Language) -> bool {
221 matches!(
222 language,
223 Language::Rust
224 | Language::TypeScript
225 | Language::JavaScript
226 | Language::Python
227 | Language::Go
228 | Language::Java
229 | Language::C
230 | Language::Cpp
231 | Language::Ruby
232 | Language::Scala
233 | Language::Elixir
234 | Language::Haskell
235 )
236}
237
238pub(crate) fn analyze_file_bytes(file: &WalkedFile, bytes: &[u8]) -> Result<StaticFileAnalysis> {
239 let source = String::from_utf8_lossy(bytes);
240 let mut analysis = parse_file_from_source(file, &source)?;
241 analysis.content_hash = Some(format!("{:x}", Sha256::digest(bytes)));
242 analysis.line_count = count_lines(bytes);
243 Ok(analysis)
244}
245
246fn parse_file_from_source(file: &WalkedFile, source: &str) -> Result<StaticFileAnalysis> {
248 match file.language {
249 Language::Rust => rust::parse_rust(file, source),
250 Language::TypeScript | Language::JavaScript => typescript::parse_typescript(file, source),
251 Language::Python => python::parse_python(file, source),
252 Language::Go => go::parse_go(file, source),
253 Language::Java => java::parse_java(file, source),
254 Language::C => c::parse_c(file, source),
255 Language::Cpp => cpp::parse_cpp(file, source),
256 Language::Ruby => ruby::parse_ruby(file, source),
257 Language::Scala => scala::parse_scala(file, source),
258 Language::Elixir => elixir::parse_elixir(file, source),
259 Language::Haskell => haskell::parse_haskell(file, source),
260 _ => Ok(StaticFileAnalysis::empty(file)),
261 }
262}
263
264fn read_source_bytes(file: &WalkedFile) -> Option<Vec<u8>> {
265 match std::fs::read(&file.abs_path) {
266 Ok(bytes) => Some(bytes),
267 Err(e) => {
268 tracing::warn!("parser: cannot read {}: {e}", file.rel_path);
269 None
270 }
271 }
272}
273
/// Counts the lines in `bytes`.
///
/// Every `\n` terminates a line; a final line without a trailing newline
/// still counts. Empty input has zero lines.
fn count_lines(bytes: &[u8]) -> u32 {
    match bytes.last() {
        None => 0,
        Some(&last) => {
            let newlines = bytes.iter().filter(|&&byte| byte == b'\n').count() as u32;
            // An unterminated final line adds one more.
            newlines + u32::from(last != b'\n')
        }
    }
}
285
286pub(crate) fn extract_todo(comment: &str, line: u32) -> Option<TodoComment> {
292 let inner = comment
293 .trim_start_matches('/')
294 .trim_start_matches('*')
295 .trim_start_matches('#')
296 .trim_end_matches('/')
297 .trim_end_matches('*')
298 .trim();
299
300 let b = inner.as_bytes();
301
302 let kind = if b.len() >= 4 && b[..4].eq_ignore_ascii_case(b"TODO") {
303 TodoKind::Todo
304 } else if b.len() >= 5 && b[..5].eq_ignore_ascii_case(b"FIXME") {
305 TodoKind::Fixme
306 } else if b.len() >= 4 && b[..4].eq_ignore_ascii_case(b"HACK") {
307 TodoKind::Hack
308 } else if b.len() >= 4 && b[..4].eq_ignore_ascii_case(b"NOTE") {
309 TodoKind::Note
310 } else if b.len() >= 10 && b[..10].eq_ignore_ascii_case(b"DEPRECATED") {
311 TodoKind::Deprecated
312 } else if b.len() >= 4 && b[..4].eq_ignore_ascii_case(b"@TS-") {
313 TodoKind::Note
315 } else if inner.contains("type: ignore") {
316 TodoKind::Note
318 } else {
319 return None;
320 };
321
322 Some(TodoComment {
323 text: inner.to_owned(),
324 line,
325 kind,
326 })
327}
328
/// Collapses every run of whitespace in `s` into a single space and removes
/// leading and trailing whitespace.
///
/// Newlines and tabs count as whitespace, so multi-line documentation becomes
/// one line.
pub(crate) fn normalize_doc(s: &str) -> String {
    let mut normalized = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
353
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;
    use tempfile::TempDir;

    /// Writes `contents` to `rel` inside `dir` and returns the matching Rust
    /// `WalkedFile` (mtime left at `0`, i.e. "unknown").
    fn rust_fixture(dir: &TempDir, rel: &str, contents: &str) -> WalkedFile {
        let abs_path = dir.path().join(rel);
        std::fs::write(&abs_path, contents).unwrap();
        WalkedFile {
            abs_path,
            rel_path: rel.to_owned(),
            language: Language::Rust,
            // Keep the recorded size in step with what was actually written.
            size_bytes: contents.len() as _,
            mtime_secs: 0,
        }
    }

    /// Describes a file that does not exist on disk; any attempt to read it fails.
    fn missing_fixture(rel: &str, language: Language, mtime_secs: u64) -> WalkedFile {
        WalkedFile {
            abs_path: PathBuf::from("/nonexistent").join(rel),
            rel_path: rel.to_owned(),
            language,
            size_bytes: 0,
            mtime_secs,
        }
    }

    // ---- extract_todo ----------------------------------------------------

    #[test]
    fn extract_todo_none_for_plain_comment() {
        assert!(extract_todo("// nothing special", 1).is_none());
    }

    #[test]
    fn extract_todo_rust_line_comment() {
        let t = extract_todo("// TODO: do something", 3).unwrap();
        assert_eq!(t.kind, TodoKind::Todo);
        assert_eq!(t.line, 3);
    }

    #[test]
    fn extract_todo_rust_block_comment() {
        let t = extract_todo("/* FIXME: clean up */", 10).unwrap();
        assert_eq!(t.kind, TodoKind::Fixme);
    }

    #[test]
    fn extract_todo_rust_doc_comment() {
        let t = extract_todo("/// TODO: document", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Todo);
    }

    #[test]
    fn extract_todo_python_hash_comment() {
        let t = extract_todo("# TODO: fix this", 5).unwrap();
        assert_eq!(t.kind, TodoKind::Todo);
    }

    #[test]
    fn extract_todo_ts_ignore() {
        let t = extract_todo("// @ts-ignore", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Note);
    }

    #[test]
    fn extract_todo_ts_expect_error() {
        let t = extract_todo("// @ts-expect-error", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Note);
    }

    #[test]
    fn extract_todo_python_type_ignore() {
        let t = extract_todo("# type: ignore", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Note);
    }

    #[test]
    fn extract_todo_python_type_ignore_with_code() {
        let t = extract_todo("# type: ignore[attr-defined]", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Note);
    }

    #[test]
    fn extract_todo_case_insensitive() {
        let t = extract_todo("// todo: lowercase", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Todo);
    }

    #[test]
    fn extract_todo_remaining_kinds() {
        assert_eq!(
            extract_todo("// HACK: workaround", 1).unwrap().kind,
            TodoKind::Hack
        );
        assert_eq!(
            extract_todo("// NOTE: careful here", 1).unwrap().kind,
            TodoKind::Note
        );
        assert_eq!(
            extract_todo("// DEPRECATED: use g()", 1).unwrap().kind,
            TodoKind::Deprecated
        );
    }

    #[test]
    fn extract_todo_strips_delimiters_from_text() {
        let t = extract_todo("/* FIXME: clean up */", 1).unwrap();
        assert_eq!(t.text, "FIXME: clean up");
    }

    // ---- normalize_doc / count_lines --------------------------------------

    #[test]
    fn normalize_doc_collapses_and_trims_whitespace() {
        assert_eq!(normalize_doc(""), "");
        assert_eq!(normalize_doc(" \n\t "), "");
        assert_eq!(normalize_doc("  hello \n\t world  "), "hello world");
    }

    #[test]
    fn count_lines_handles_edge_cases() {
        assert_eq!(count_lines(b""), 0);
        assert_eq!(count_lines(b"a"), 1);
        assert_eq!(count_lines(b"a\n"), 1);
        assert_eq!(count_lines(b"a\nb"), 2);
        assert_eq!(count_lines(b"\n\n"), 2);
    }

    // ---- parse_file / parse_files_parallel --------------------------------

    #[test]
    fn unsupported_language_skipped_without_disk_read() {
        // The path does not exist, so this only passes if no read is attempted
        // (a failed read would also yield an empty analysis, but must not error).
        let f = missing_fixture("notes.txt", Language::Unknown, 0);
        let a = parse_file(&f).unwrap();
        assert!(a.entry_points.is_empty());
        assert!(a.content_hash.is_none());
    }

    #[test]
    fn parse_files_parallel_preserves_order() {
        let dir = TempDir::new().unwrap();
        let files: Vec<WalkedFile> = (0..3)
            .map(|i| rust_fixture(&dir, &format!("f{i}.rs"), &format!("pub fn f{i}() {{}}")))
            .collect();

        let results = parse_files_parallel(&files);
        assert_eq!(results.len(), 3);
        assert_eq!(results[0].path, "f0.rs");
        assert_eq!(results[1].path, "f1.rs");
        assert_eq!(results[2].path, "f2.rs");
    }

    #[test]
    fn parse_file_populates_hash_and_line_count() {
        let dir = TempDir::new().unwrap();
        let file = rust_fixture(&dir, "f.rs", "pub fn f() {}\n");

        let analysis = parse_file(&file).unwrap();
        assert!(analysis.content_hash.is_some());
        assert_eq!(analysis.line_count, 1);
    }

    #[test]
    fn parse_file_counts_single_line_without_trailing_newline() {
        let dir = TempDir::new().unwrap();
        let file = rust_fixture(&dir, "f.rs", "pub fn f() {}");

        let analysis = parse_file(&file).unwrap();
        assert_eq!(analysis.line_count, 1);
    }

    #[test]
    fn parse_file_counts_multiple_lines_without_trailing_newline() {
        let dir = TempDir::new().unwrap();
        let file = rust_fixture(&dir, "f.rs", "pub fn f() {}\npub fn g() {}");

        let analysis = parse_file(&file).unwrap();
        assert_eq!(analysis.line_count, 2);
    }

    // ---- hash_and_parse_parallel -------------------------------------------

    #[test]
    fn hash_and_parse_skips_file_with_matching_mtime() {
        let file = missing_fixture("notes.txt", Language::Unknown, 42);
        let mut stored = HashMap::new();
        stored.insert("notes.txt".to_owned(), 42u64);

        let out = hash_and_parse_parallel(&[file], &stored);
        assert_eq!(out.skipped_count, 1);
        assert_eq!(out.parse_count, 0);
        assert!(out.parsed_files.is_empty());
        assert!(out.analyses.is_empty());
        assert!(out.new_mtimes.is_empty());
    }

    #[test]
    fn hash_and_parse_never_skips_zero_mtime() {
        // An mtime of 0 means "unknown", so a stored 0 must not count as a match.
        let file = missing_fixture("notes.txt", Language::Unknown, 0);
        let mut stored = HashMap::new();
        stored.insert("notes.txt".to_owned(), 0u64);

        let out = hash_and_parse_parallel(&[file], &stored);
        assert_eq!(out.skipped_count, 0);
        assert_eq!(out.parse_count, 1);
        assert_eq!(out.analyses[0].path, "notes.txt");
        assert_eq!(out.new_mtimes.get("notes.txt"), Some(&0u64));
    }

    #[test]
    fn hash_and_parse_drops_unreadable_file() {
        let file = missing_fixture("missing.rs", Language::Rust, 7);
        let stored = HashMap::new();

        let out = hash_and_parse_parallel(&[file], &stored);
        assert_eq!(out.parse_count, 0);
        assert_eq!(out.skipped_count, 0);
        assert!(out.new_mtimes.is_empty());
    }

    #[test]
    fn hash_and_parse_analyses_changed_file() {
        let dir = TempDir::new().unwrap();
        let mut file = rust_fixture(&dir, "f.rs", "pub fn f() {}\n");
        file.mtime_secs = 100;
        let mut stored = HashMap::new();
        stored.insert("f.rs".to_owned(), 99u64);

        let out = hash_and_parse_parallel(&[file], &stored);
        assert_eq!(out.parse_count, 1);
        assert_eq!(out.skipped_count, 0);
        assert_eq!(out.parsed_files[0].rel_path, "f.rs");
        assert_eq!(out.analyses[0].line_count, 1);
        assert!(out.analyses[0].content_hash.is_some());
        assert_eq!(out.new_mtimes.get("f.rs"), Some(&100u64));
    }
}