Skip to main content

mati_core/analysis/parser/
mod.rs

1//! Multi-language tree-sitter parser — Layer 0 static analysis.
2//!
3//! Each supported language lives in its own submodule with isolated statics:
4//! `LazyLock<Language>`, `LazyLock<Query>`, `LazyLock<Captures>`,
5//! `thread_local! Parser`. Adding a language = copy a module.
6//!
7//! # Performance
8//!
9//! - One combined query per language, single tree traversal per file.
10//! - Thread-local parsers: one per rayon worker, reused across files.
11//! - Disk read skipped for unsupported languages.
12//! - Count-only captures: no text allocated for counting signals.
13
14mod c;
15mod cpp;
16mod elixir;
17mod go;
18mod haskell;
19pub mod import;
20mod java;
21mod python;
22mod ruby;
23mod rust;
24mod scala;
25mod typescript;
26
27use std::collections::HashMap;
28
29use anyhow::Result;
30use rayon::prelude::*;
31use sha2::{Digest, Sha256};
32
33use crate::analysis::walker::{Language, WalkedFile};
34use crate::store::record::{TodoComment, TodoKind};
35
36pub use import::{ImportKind, ImportStatement};
37
38// ── Output type ───────────────────────────────────────────────────────────────
39
/// Structural signals extracted from a single source file by tree-sitter.
///
/// Intermediate representation for Layer 0. Maps onto `FileRecord` fields.
/// Git-derived fields (`change_frequency`, `last_author`, `is_hotspot`)
/// are filled later by M-06-D.
///
/// An "empty" value (see `StaticFileAnalysis::empty`) carries only `path` and
/// `language`; every collection is empty, every counter is zero and both
/// optional fields are `None`.
#[derive(Debug, Clone)]
pub struct StaticFileAnalysis {
    /// Repo-relative path with forward slashes.
    pub path: String,
    /// Language of the file, copied from the `WalkedFile` it was built from.
    pub language: Language,
    /// Public functions and modules (Rust: `pub fn`; TS: exported; Python: non-`_` top-level).
    pub entry_points: Vec<String>,
    /// Public types (Rust: `pub struct/enum/trait`; TS: exported class/interface/type/enum;
    /// Python: non-`_` top-level classes).
    pub exported_types: Vec<String>,
    /// Structured import statements with classification and source location.
    pub imports: Vec<ImportStatement>,
    /// TODO / FIXME / HACK / NOTE / DEPRECATED / @ts-ignore / type:ignore comments.
    pub todos: Vec<TodoComment>,
    /// `unsafe {}` blocks (Rust only).
    pub unsafe_count: u32,
    /// `.unwrap()` calls (Rust) or non-null assertions `!` (TypeScript).
    pub unwrap_count: u32,
    /// `panic!()` macro invocations (Rust only).
    pub panic_count: u32,
    /// Control-flow branches: if, match/switch, loop, while, for, ternary, try.
    pub branch_count: u32,
    /// Canonical module-level doc comment (language-specific — see ENRICHMENT.md §1.1).
    pub module_doc: Option<String>,
    /// Lowercase SHA-256 hex digest of the file bytes at parse time. Used for
    /// content-change detection (P3). `None` when the file bytes were never
    /// analysed (empty analysis).
    pub content_hash: Option<String>,
    /// Number of lines in the file: one per `\n` byte, plus one when the final
    /// line has no trailing newline (0 for an empty file). Used for line-count
    /// delta in staleness signals.
    pub line_count: u32,
}
74
75impl StaticFileAnalysis {
76    pub(crate) fn empty(file: &WalkedFile) -> Self {
77        Self {
78            path: file.rel_path.clone(),
79            language: file.language,
80            entry_points: Vec::new(),
81            exported_types: Vec::new(),
82            imports: Vec::new(),
83            todos: Vec::new(),
84            unsafe_count: 0,
85            unwrap_count: 0,
86            panic_count: 0,
87            branch_count: 0,
88            module_doc: None,
89            content_hash: None,
90            line_count: 0,
91        }
92    }
93}
94
95// ── Public API ────────────────────────────────────────────────────────────────
96
97/// Parse a single file and return its structural analysis.
98///
99/// Returns an empty analysis (never `Err`) when:
100/// - Language is unsupported (skips disk read entirely)
101/// - File cannot be read from disk
102/// - tree-sitter fails to produce a parse tree
103pub fn parse_file(file: &WalkedFile) -> Result<StaticFileAnalysis> {
104    // Guard: skip disk read for unsupported languages.
105    if !is_parseable_language(file.language) {
106        return Ok(StaticFileAnalysis::empty(file));
107    }
108    let bytes = match read_source_bytes(file) {
109        Some(b) => b,
110        None => return Ok(StaticFileAnalysis::empty(file)),
111    };
112    analyze_file_bytes(file, &bytes)
113}
114
115/// Parse a slice of files in parallel using rayon.
116///
117/// Parse errors are logged and produce an empty analysis — a single
118/// unreadable file never aborts the entire init pass.
119pub fn parse_files_parallel(files: &[WalkedFile]) -> Vec<StaticFileAnalysis> {
120    files
121        .par_iter()
122        .map(|f| {
123            parse_file(f).unwrap_or_else(|e| {
124                tracing::warn!("parser: unexpected error on {}: {e}", f.rel_path);
125                StaticFileAnalysis::empty(f)
126            })
127        })
128        .collect()
129}
130
/// Output of the combined mtime-check + parse pass
/// (`hash_and_parse_parallel`).
pub struct HashParseOutput {
    /// Files whose mtime changed (new or modified), in the same relative
    /// order as the input slice (unchanged and unreadable files removed).
    pub parsed_files: Vec<WalkedFile>,
    /// Analyses for each file in `parsed_files` (same order).
    pub analyses: Vec<StaticFileAnalysis>,
    /// Updated mtimes for changed/new files only (rel_path → mtime_secs).
    /// Merge these into the stored mtime index and write one blob record.
    pub new_mtimes: HashMap<String, u64>,
    /// Count of files that were (re)parsed; always equals `parsed_files.len()`,
    /// so it also includes non-parseable files recorded with an empty analysis.
    pub parse_count: usize,
    /// Count of files whose mtime matched the stored value — skipped (no read).
    pub skipped_count: usize,
}
145
146/// Combined mtime-check + parse pass.
147///
148/// For each file:
149/// - If `mtime_secs` matches the stored value → skip entirely (zero disk I/O).
150/// - Otherwise → read file bytes, run tree-sitter, record updated mtime.
151///
152/// This eliminates the full I/O sweep on re-init when files are unchanged:
153/// a re-init with no edits costs only the walk + mtime comparison (≈130ms),
154/// not a full disk read of all source files (≈2100ms on 58k-file repos).
155pub fn hash_and_parse_parallel(
156    files: &[WalkedFile],
157    stored_mtimes: &HashMap<String, u64>,
158) -> HashParseOutput {
159    enum Slot {
160        Changed(Box<(WalkedFile, StaticFileAnalysis)>),
161        Unchanged,
162    }
163
164    let slots: Vec<Option<Slot>> = files
165        .par_iter()
166        .map(|f| {
167            // Fast path: mtime unchanged → file is the same, skip entirely.
168            if f.mtime_secs != 0 && stored_mtimes.get(&f.rel_path) == Some(&f.mtime_secs) {
169                return Some(Slot::Unchanged);
170            }
171            // Non-parseable languages: record mtime from walker metadata — no disk read.
172            if !is_parseable_language(f.language) {
173                return Some(Slot::Changed(Box::new((
174                    f.clone(),
175                    StaticFileAnalysis::empty(f),
176                ))));
177            }
178            // Parseable, changed/new: read file bytes and run tree-sitter.
179            let bytes = match std::fs::read(&f.abs_path) {
180                Ok(b) => b,
181                Err(_) => return None, // unreadable — skip silently
182            };
183            let analysis = analyze_file_bytes(f, &bytes).unwrap_or_else(|e| {
184                tracing::warn!("parser: error on {}: {e}", f.rel_path);
185                StaticFileAnalysis::empty(f)
186            });
187            Some(Slot::Changed(Box::new((f.clone(), analysis))))
188        })
189        .collect();
190
191    let mut parsed_files = Vec::new();
192    let mut analyses = Vec::new();
193    let mut new_mtimes = HashMap::new();
194    let mut skipped_count = 0usize;
195
196    for slot in slots.into_iter().flatten() {
197        match slot {
198            Slot::Changed(boxed) => {
199                let (file, analysis) = *boxed;
200                new_mtimes.insert(file.rel_path.clone(), file.mtime_secs);
201                parsed_files.push(file);
202                analyses.push(analysis);
203            }
204            Slot::Unchanged => skipped_count += 1,
205        }
206    }
207
208    let parse_count = parsed_files.len();
209    HashParseOutput {
210        parsed_files,
211        analyses,
212        new_mtimes,
213        parse_count,
214        skipped_count,
215    }
216}
217
218// ── Shared utilities ──────────────────────────────────────────────────────────
219
220fn is_parseable_language(language: Language) -> bool {
221    matches!(
222        language,
223        Language::Rust
224            | Language::TypeScript
225            | Language::JavaScript
226            | Language::Python
227            | Language::Go
228            | Language::Java
229            | Language::C
230            | Language::Cpp
231            | Language::Ruby
232            | Language::Scala
233            | Language::Elixir
234            | Language::Haskell
235    )
236}
237
238pub(crate) fn analyze_file_bytes(file: &WalkedFile, bytes: &[u8]) -> Result<StaticFileAnalysis> {
239    let source = String::from_utf8_lossy(bytes);
240    let mut analysis = parse_file_from_source(file, &source)?;
241    analysis.content_hash = Some(format!("{:x}", Sha256::digest(bytes)));
242    analysis.line_count = count_lines(bytes);
243    Ok(analysis)
244}
245
246/// Dispatch parse to the language-specific parser using pre-read source text.
247fn parse_file_from_source(file: &WalkedFile, source: &str) -> Result<StaticFileAnalysis> {
248    match file.language {
249        Language::Rust => rust::parse_rust(file, source),
250        Language::TypeScript | Language::JavaScript => typescript::parse_typescript(file, source),
251        Language::Python => python::parse_python(file, source),
252        Language::Go => go::parse_go(file, source),
253        Language::Java => java::parse_java(file, source),
254        Language::C => c::parse_c(file, source),
255        Language::Cpp => cpp::parse_cpp(file, source),
256        Language::Ruby => ruby::parse_ruby(file, source),
257        Language::Scala => scala::parse_scala(file, source),
258        Language::Elixir => elixir::parse_elixir(file, source),
259        Language::Haskell => haskell::parse_haskell(file, source),
260        _ => Ok(StaticFileAnalysis::empty(file)),
261    }
262}
263
264fn read_source_bytes(file: &WalkedFile) -> Option<Vec<u8>> {
265    match std::fs::read(&file.abs_path) {
266        Ok(bytes) => Some(bytes),
267        Err(e) => {
268            tracing::warn!("parser: cannot read {}: {e}", file.rel_path);
269            None
270        }
271    }
272}
273
/// Count the lines in `bytes`.
///
/// Every `\n` byte terminates one line; a non-empty final segment without a
/// trailing newline counts as one more line. Empty input has zero lines.
fn count_lines(bytes: &[u8]) -> u32 {
    match bytes.last() {
        None => 0,
        Some(&last) => {
            let terminated = bytes.iter().filter(|&&byte| byte == b'\n').count() as u32;
            // An unterminated final line still counts.
            terminated + u32::from(last != b'\n')
        }
    }
}
285
286/// Scan a comment node for a TODO-family or type-suppression marker.
287///
288/// Handles all comment syntaxes: `//`, `///`, `/* */`, `#` (Python).
289/// Uses byte-level `eq_ignore_ascii_case` — no allocation until a match.
290/// Line number is 1-based (editor convention).
291pub(crate) fn extract_todo(comment: &str, line: u32) -> Option<TodoComment> {
292    let inner = comment
293        .trim_start_matches('/')
294        .trim_start_matches('*')
295        .trim_start_matches('#')
296        .trim_end_matches('/')
297        .trim_end_matches('*')
298        .trim();
299
300    let b = inner.as_bytes();
301
302    let kind = if b.len() >= 4 && b[..4].eq_ignore_ascii_case(b"TODO") {
303        TodoKind::Todo
304    } else if b.len() >= 5 && b[..5].eq_ignore_ascii_case(b"FIXME") {
305        TodoKind::Fixme
306    } else if b.len() >= 4 && b[..4].eq_ignore_ascii_case(b"HACK") {
307        TodoKind::Hack
308    } else if b.len() >= 4 && b[..4].eq_ignore_ascii_case(b"NOTE") {
309        TodoKind::Note
310    } else if b.len() >= 10 && b[..10].eq_ignore_ascii_case(b"DEPRECATED") {
311        TodoKind::Deprecated
312    } else if b.len() >= 4 && b[..4].eq_ignore_ascii_case(b"@TS-") {
313        // @ts-ignore, @ts-nocheck, @ts-expect-error
314        TodoKind::Note
315    } else if inner.contains("type: ignore") {
316        // Python mypy suppression: # type: ignore[code]
317        TodoKind::Note
318    } else {
319        return None;
320    };
321
322    Some(TodoComment {
323        text: inner.to_owned(),
324        line,
325        kind,
326    })
327}
328
/// Flatten a doc comment into a single line.
///
/// Every run of whitespace (spaces, tabs, newlines, any Unicode whitespace)
/// becomes exactly one ASCII space, and whitespace at either end is removed.
/// Language parsers call this on multi-line doc comments before storing them
/// as `module_doc`.
pub(crate) fn normalize_doc(s: &str) -> String {
    let mut normalized = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // Separator goes *between* words only, so nothing to trim afterwards.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
353
354// ── Tests ─────────────────────────────────────────────────────────────────────
355
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;
    use tempfile::TempDir;

    /// Write `contents` to `rel` inside `dir` and describe the result as a
    /// Rust `WalkedFile`. `size_bytes` is derived from the contents so the
    /// fixture metadata always matches what is on disk.
    fn write_rust_file(dir: &TempDir, rel: &str, contents: &str) -> WalkedFile {
        let abs = dir.path().join(rel);
        std::fs::write(&abs, contents).unwrap();
        WalkedFile {
            abs_path: abs,
            rel_path: rel.to_owned(),
            language: Language::Rust,
            // `as _` adopts the field's integer type; fixture sizes are tiny.
            size_bytes: contents.len() as _,
            mtime_secs: 0,
        }
    }

    // ── extract_todo ──────────────────────────────────────────────────────

    #[test]
    fn extract_todo_none_for_plain_comment() {
        assert!(extract_todo("// nothing special", 1).is_none());
    }

    #[test]
    fn extract_todo_rust_line_comment() {
        let t = extract_todo("// TODO: do something", 3).unwrap();
        assert_eq!(t.kind, TodoKind::Todo);
        assert_eq!(t.line, 3);
    }

    #[test]
    fn extract_todo_rust_block_comment() {
        let t = extract_todo("/* FIXME: clean up */", 10).unwrap();
        assert_eq!(t.kind, TodoKind::Fixme);
    }

    #[test]
    fn extract_todo_rust_doc_comment() {
        let t = extract_todo("/// TODO: document", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Todo);
    }

    #[test]
    fn extract_todo_python_hash_comment() {
        let t = extract_todo("# TODO: fix this", 5).unwrap();
        assert_eq!(t.kind, TodoKind::Todo);
    }

    #[test]
    fn extract_todo_ts_ignore() {
        let t = extract_todo("// @ts-ignore", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Note);
    }

    #[test]
    fn extract_todo_ts_expect_error() {
        let t = extract_todo("// @ts-expect-error", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Note);
    }

    #[test]
    fn extract_todo_python_type_ignore() {
        let t = extract_todo("# type: ignore", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Note);
    }

    #[test]
    fn extract_todo_python_type_ignore_with_code() {
        let t = extract_todo("# type: ignore[attr-defined]", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Note);
    }

    #[test]
    fn extract_todo_case_insensitive() {
        let t = extract_todo("// todo: lowercase", 1).unwrap();
        assert_eq!(t.kind, TodoKind::Todo);
    }

    // ── parse_file / parse_files_parallel ─────────────────────────────────

    #[test]
    fn unsupported_language_skipped_without_disk_read() {
        // The path does not exist: the test only passes if the file is
        // never opened.
        let f = WalkedFile {
            abs_path: PathBuf::from("/nonexistent/file.txt"),
            rel_path: "notes.txt".to_owned(),
            language: Language::Unknown,
            size_bytes: 0,
            mtime_secs: 0,
        };
        let a = parse_file(&f).unwrap();
        assert!(a.entry_points.is_empty());
    }

    #[test]
    fn parse_files_parallel_preserves_order() {
        let dir = TempDir::new().unwrap();
        let files: Vec<WalkedFile> = (0..3)
            .map(|i| write_rust_file(&dir, &format!("f{i}.rs"), &format!("pub fn f{i}() {{}}")))
            .collect();

        let results = parse_files_parallel(&files);
        assert_eq!(results[0].path, "f0.rs");
        assert_eq!(results[1].path, "f1.rs");
        assert_eq!(results[2].path, "f2.rs");
    }

    #[test]
    fn parse_file_populates_hash_and_line_count() {
        let dir = TempDir::new().unwrap();
        let file = write_rust_file(&dir, "f.rs", "pub fn f() {}\n");

        let analysis = parse_file(&file).unwrap();
        assert!(analysis.content_hash.is_some());
        assert_eq!(analysis.line_count, 1);
    }

    #[test]
    fn parse_file_counts_single_line_without_trailing_newline() {
        let dir = TempDir::new().unwrap();
        let file = write_rust_file(&dir, "f.rs", "pub fn f() {}");

        let analysis = parse_file(&file).unwrap();
        assert_eq!(analysis.line_count, 1);
    }

    #[test]
    fn parse_file_counts_multiple_lines_without_trailing_newline() {
        let dir = TempDir::new().unwrap();
        let file = write_rust_file(&dir, "f.rs", "pub fn f() {}\npub fn g() {}");

        let analysis = parse_file(&file).unwrap();
        assert_eq!(analysis.line_count, 2);
    }
}