Skip to main content

fallow_extract/
lib.rs

1//! Parsing and extraction engine for the fallow codebase analyzer.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8
9pub mod astro;
10pub mod cache;
11pub(crate) mod complexity;
12pub mod css;
13pub mod html;
14pub mod mdx;
15mod parse;
16pub mod sfc;
17pub mod suppress;
18pub mod visitor;
19
20use std::path::Path;
21
22use rayon::prelude::*;
23
24use cache::CacheStore;
25use fallow_types::discover::{DiscoveredFile, FileId};
26
27// Re-export all extract types from fallow-types
28pub use fallow_types::extract::{
29    DynamicImportInfo, DynamicImportPattern, ExportInfo, ExportName, ImportInfo, ImportedName,
30    MemberAccess, MemberInfo, MemberKind, ModuleInfo, ParseResult, ReExportInfo, RequireCallInfo,
31    compute_line_offsets,
32};
33
34// Re-export extraction functions for internal use and fuzzing
35pub use astro::extract_astro_frontmatter;
36pub use css::extract_css_module_exports;
37pub use mdx::extract_mdx_statements;
38pub use sfc::{extract_sfc_scripts, is_sfc_file};
39
40use parse::parse_source_to_module;
41
42/// Parse all files in parallel, extracting imports and exports.
43/// Uses the cache to skip reparsing files whose content hasn't changed.
44pub fn parse_all_files(files: &[DiscoveredFile], cache: Option<&CacheStore>) -> ParseResult {
45    use std::sync::atomic::{AtomicUsize, Ordering};
46    let cache_hits = AtomicUsize::new(0);
47    let cache_misses = AtomicUsize::new(0);
48
49    let modules: Vec<ModuleInfo> = files
50        .par_iter()
51        .filter_map(|file| parse_single_file_cached(file, cache, &cache_hits, &cache_misses))
52        .collect();
53
54    let hits = cache_hits.load(Ordering::Relaxed);
55    let misses = cache_misses.load(Ordering::Relaxed);
56    if hits > 0 || misses > 0 {
57        tracing::info!(
58            cache_hits = hits,
59            cache_misses = misses,
60            "incremental cache stats"
61        );
62    }
63
64    ParseResult {
65        modules,
66        cache_hits: hits,
67        cache_misses: misses,
68    }
69}
70
/// Read the modification time from `metadata` as whole seconds since the Unix epoch.
///
/// Yields `0` whenever the timestamp is unavailable: the platform cannot report it, or it
/// lies before the epoch.
fn mtime_secs(metadata: &std::fs::Metadata) -> u64 {
    let Ok(modified) = metadata.modified() else {
        return 0;
    };
    match modified.duration_since(std::time::SystemTime::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        // A timestamp earlier than the epoch has no non-negative representation.
        Err(_) => 0,
    }
}
80
81/// Parse a single file, consulting the cache first.
82///
83/// Cache validation strategy (fast path -> slow path):
84/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
85/// 2. If mtime+size match the cached entry -> cache hit, return immediately
86/// 3. If mtime+size differ -> read file, compute content hash
87/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
88/// 5. Otherwise -> cache miss, full parse
89fn parse_single_file_cached(
90    file: &DiscoveredFile,
91    cache: Option<&CacheStore>,
92    cache_hits: &std::sync::atomic::AtomicUsize,
93    cache_misses: &std::sync::atomic::AtomicUsize,
94) -> Option<ModuleInfo> {
95    use std::sync::atomic::Ordering;
96
97    // Fast path: check mtime+size before reading file content.
98    // A single stat() syscall is ~100x cheaper than read()+hash().
99    if let Some(store) = cache
100        && let Ok(metadata) = std::fs::metadata(&file.path)
101    {
102        let mt = mtime_secs(&metadata);
103        let sz = metadata.len();
104        if let Some(cached) = store.get_by_metadata(&file.path, mt, sz) {
105            cache_hits.fetch_add(1, Ordering::Relaxed);
106            return Some(cache::cached_to_module(cached, file.id));
107        }
108    }
109
110    // Slow path: read file content and compute content hash.
111    let source = std::fs::read_to_string(&file.path).ok()?;
112    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
113
114    // Check cache by content hash (handles touch/save-without-change)
115    if let Some(store) = cache
116        && let Some(cached) = store.get(&file.path, content_hash)
117    {
118        cache_hits.fetch_add(1, Ordering::Relaxed);
119        return Some(cache::cached_to_module(cached, file.id));
120    }
121    cache_misses.fetch_add(1, Ordering::Relaxed);
122
123    // Cache miss — do a full parse
124    Some(parse_source_to_module(
125        file.id,
126        &file.path,
127        &source,
128        content_hash,
129    ))
130}
131
132/// Parse a single file and extract module information.
133#[must_use]
134pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
135    let source = std::fs::read_to_string(&file.path).ok()?;
136    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
137    Some(parse_source_to_module(
138        file.id,
139        &file.path,
140        &source,
141        content_hash,
142    ))
143}
144
145/// Parse from in-memory content (for LSP).
146#[must_use]
147pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
148    let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
149    parse_source_to_module(file_id, path, content, content_hash)
150}
151
152// Parser integration tests invoke Oxc under Miri which is ~1000x slower.
153// Unit tests in individual modules (visitor, suppress, sfc, css, etc.) still run.
154#[cfg(all(test, not(miri)))]
155mod tests;