Skip to main content

fallow_extract/
lib.rs

1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8
9mod asset_url;
10pub mod astro;
11pub mod cache;
12pub(crate) mod complexity;
13pub mod css;
14pub mod flags;
15pub mod glimmer;
16pub mod graphql;
17pub mod html;
18pub mod iconify;
19pub mod inventory;
20pub mod mdx;
21mod parse;
22pub mod sfc;
23mod sfc_template;
24pub mod suppress;
25pub(crate) mod template_complexity;
26mod template_usage;
27pub mod visitor;
28
29use std::path::Path;
30
31use rayon::prelude::*;
32
33use cache::CacheStore;
34use fallow_types::discover::{DiscoveredFile, FileId};
35
36// Re-export all extract types from fallow-types
37pub use fallow_types::extract::{
38    ClassHeritageInfo, DynamicImportInfo, DynamicImportPattern, ExportInfo, ExportName, ImportInfo,
39    ImportedName, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind, ModuleInfo,
40    ParseResult, PublicSignatureTypeReference, ReExportInfo, RequireCallInfo, VisibilityTag,
41    compute_line_offsets,
42};
43
44// Re-export extraction functions for internal use and fuzzing
45pub use astro::extract_astro_frontmatter;
46pub use css::extract_css_module_exports;
47pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
48pub use mdx::extract_mdx_statements;
49pub use sfc::{extract_sfc_scripts, is_sfc_file};
50pub use sfc_template::angular::ANGULAR_TPL_SENTINEL;
51
/// Synthetic member-access object used to carry exported-instance bindings.
///
/// `MemberAccess { object: format!("{INSTANCE_EXPORT_SENTINEL}{export_name}"), member: target }`
/// means the exported value named `export_name` is an instance of the local
/// class/interface symbol named `target`.
///
/// The trailing `:` is part of the constant, so `export_name` follows the
/// prefix directly with no additional separator.
pub const INSTANCE_EXPORT_SENTINEL: &str = "__fallow_instance_export__:";

/// Synthetic member-access object prefix for typed Playwright fixtures.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_DEF_SENTINEL}{test}:{fixture}"), member: type_name }`
/// means the exported Playwright test object named `test` provides a fixture
/// named `fixture` whose declared type is `type_name`.
///
/// After the prefix (which already ends in `:`), `test` and `fixture` are
/// joined by a single `:`.
pub const PLAYWRIGHT_FIXTURE_DEF_SENTINEL: &str = "__fallow_playwright_fixture_def__:";

/// Synthetic member-access object prefix for Playwright fixture member uses.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_USE_SENTINEL}{test}:{fixture}"), member }`
/// means a callback passed to the Playwright test object named `test`
/// destructures `fixture` and accesses `fixture.member`.
///
/// The payload uses the same `{test}:{fixture}` layout as
/// `PLAYWRIGHT_FIXTURE_DEF_SENTINEL`.
pub const PLAYWRIGHT_FIXTURE_USE_SENTINEL: &str = "__fallow_playwright_fixture_use__:";

/// Synthetic member-access object prefix for static-factory call returns.
///
/// `MemberAccess { object: format!("{FACTORY_CALL_SENTINEL}{callee}:{method}"), member }`
/// means a local binding was assigned from `<callee>.<method>()` and a member
/// is accessed on the result. The analyze layer resolves `callee` through the
/// consumer module's imports to a class export and credits `member` on the
/// class when the matching method carries `is_instance_returning_static`.
/// See issue #346.
///
/// After the prefix (which already ends in `:`), `callee` and `method` are
/// joined by a single `:`.
pub const FACTORY_CALL_SENTINEL: &str = "__fallow_factory_call__:";

/// Synthetic member-access object prefix for fluent-builder chain credit.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_SENTINEL}{callee}:{root_method}:{chain}"), member }`
/// means a fluent chain `<callee>.<root_method>().<...chain>.<member>` was
/// observed. `chain` is a comma-separated list of method names (empty when
/// `member` is the first chained call after `root_method`). The analyze layer
/// resolves `callee` to a class export, validates `root_method` has
/// `is_instance_returning_static`, walks each `chain` segment requiring
/// `is_self_returning` on the class, and credits `member` on the class
/// when the chain remains on the class type. See issue #387.
///
/// The payload after the prefix is therefore three `:`-separated fields
/// (`callee`, `root_method`, `chain`), with `,` separating the method names
/// inside `chain`.
pub const FLUENT_CHAIN_SENTINEL: &str = "__fallow_fluent_chain__:";

/// Synthetic member-access object prefix for fluent chains rooted at a `new`
/// expression.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_NEW_SENTINEL}{class}:{chain}"), member }`
/// means a chain `new <class>(...).<...chain>.<member>` was observed. Unlike
/// `FLUENT_CHAIN_SENTINEL`, there is no root method: a constructor always
/// returns an instance of `class`, so no `is_instance_returning_static` check
/// applies. `chain` is a comma-separated list of the intermediate method names
/// between the constructor and `member` (it always contains at least the first
/// method, which must be `is_self_returning` to reach `member`). The analyze
/// layer resolves `class` to a class export, requires every `chain` segment to
/// be `is_self_returning` on the class, and credits `member` on the class.
/// The first method directly off the constructor is credited separately via
/// the `static_member_object_name` `NewExpression` arm. See issue #605.
///
/// The payload after the prefix is two `:`-separated fields (`class`,
/// `chain`); compared with `FLUENT_CHAIN_SENTINEL` the `root_method` field is
/// absent.
pub const FLUENT_CHAIN_NEW_SENTINEL: &str = "__fallow_fluent_chain_new__:";
110
111use parse::parse_source_to_module;
112
/// The byte order mark (`U+FEFF`) as it appears at the front of a decoded
/// UTF-8 string.
///
/// Windows editors (Notepad, older VS settings, some IDE plugins) emit a UTF-8
/// BOM at the start of source files. fallow's contract is "UTF-8 with or
/// without BOM; line offsets are computed against the post-BOM view; the BOM,
/// if present on input, is preserved on output by `fallow fix`."
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM.
///
/// Only a single BOM at the very start is removed; a string that does not
/// begin with one is returned unchanged, and BOM characters anywhere else are
/// left in place.
///
/// Every file-read entry point in this crate calls this so the rest of the
/// pipeline (content hash, `compute_line_offsets`, oxc parser, downstream
/// analyses) sees a consistent post-BOM view. Mirrors the
/// `fallow_config` layer (`config_writer.rs::BOM`) so config-shaped sources
/// and source-code-shaped sources are processed symmetrically. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    let mut rest = source.chars();
    match rest.next() {
        // The iterator has consumed exactly the BOM; hand back what remains.
        Some(BOM_CHAR) => rest.as_str(),
        // Empty input or any other first character: nothing to strip.
        _ => source,
    }
}
132
133/// Parse all files in parallel, extracting imports and exports.
134/// Uses the cache to skip reparsing files whose content hasn't changed.
135///
136/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
137/// metrics are computed during parsing (needed by the `health` command).
138/// Pass `false` for dead-code analysis where complexity data is unused.
139pub fn parse_all_files(
140    files: &[DiscoveredFile],
141    cache: Option<&CacheStore>,
142    need_complexity: bool,
143) -> ParseResult {
144    use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
145    let cache_hits = AtomicUsize::new(0);
146    let cache_misses = AtomicUsize::new(0);
147    // Summed nanoseconds spent in the actual AST parse (cache-miss path only).
148    // Lets the perf renderer report parse CPU time vs the stage's wall-clock.
149    let parse_cpu_nanos = AtomicU64::new(0);
150
151    let modules: Vec<ModuleInfo> = files
152        .par_iter()
153        .filter_map(|file| {
154            parse_single_file_cached(
155                file,
156                cache,
157                &cache_hits,
158                &cache_misses,
159                &parse_cpu_nanos,
160                need_complexity,
161            )
162        })
163        .collect();
164
165    let hits = cache_hits.load(Ordering::Relaxed);
166    let misses = cache_misses.load(Ordering::Relaxed);
167    if hits > 0 || misses > 0 {
168        tracing::info!(
169            cache_hits = hits,
170            cache_misses = misses,
171            "incremental cache stats"
172        );
173    }
174
175    ParseResult {
176        modules,
177        cache_hits: hits,
178        cache_misses: misses,
179        parse_cpu_ms: parse_cpu_nanos.load(Ordering::Relaxed) as f64 / 1_000_000.0,
180    }
181}
182
183/// Parse a single file, consulting the cache first.
184///
185/// Cache validation strategy (fast path -> slow path):
186/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
187/// 2. If mtime+size match the cached entry -> cache hit, return immediately
188/// 3. If mtime+size differ -> read file, compute content hash
189/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
190/// 5. Otherwise -> cache miss, full parse
191fn parse_single_file_cached(
192    file: &DiscoveredFile,
193    cache: Option<&CacheStore>,
194    cache_hits: &std::sync::atomic::AtomicUsize,
195    cache_misses: &std::sync::atomic::AtomicUsize,
196    parse_cpu_nanos: &std::sync::atomic::AtomicU64,
197    need_complexity: bool,
198) -> Option<ModuleInfo> {
199    use std::sync::atomic::Ordering;
200
201    // Fast path: check mtime+size before reading file content.
202    // A single stat() syscall is ~100x cheaper than read()+hash().
203    if let Some(store) = cache
204        && let Ok(metadata) = std::fs::metadata(&file.path)
205    {
206        let mt = mtime_secs(&metadata);
207        let sz = metadata.len();
208        if let Some(cached) = store.get_by_metadata(&file.path, mt, sz) {
209            // When complexity is requested but the cached entry lacks it
210            // (populated by a prior `check` run), skip the cache and re-parse.
211            if !need_complexity || !cached.complexity.is_empty() {
212                cache_hits.fetch_add(1, Ordering::Relaxed);
213                return Some(cache::cached_to_module_opts(
214                    cached,
215                    file.id,
216                    need_complexity,
217                ));
218            }
219        }
220    }
221
222    // Slow path: read file content and compute content hash.
223    // Strip the UTF-8 BOM, if present, before hashing AND before parsing so
224    // the content hash, `compute_line_offsets`, and the oxc parser all see
225    // the same byte sequence. Without this, hash matches that depend on
226    // BOM presence would silently miss the cache. Issue #475.
227    let raw = std::fs::read_to_string(&file.path).ok()?;
228    let source = strip_bom(&raw);
229    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
230
231    // Check cache by content hash (handles touch/save-without-change)
232    if let Some(store) = cache
233        && let Some(cached) = store.get(&file.path, content_hash)
234        && (!need_complexity || !cached.complexity.is_empty())
235    {
236        cache_hits.fetch_add(1, Ordering::Relaxed);
237        return Some(cache::cached_to_module_opts(
238            cached,
239            file.id,
240            need_complexity,
241        ));
242    }
243    cache_misses.fetch_add(1, Ordering::Relaxed);
244
245    // Cache miss, do a full parse. Time just this AST parse so the perf
246    // renderer can report parse CPU time (summed across workers) vs the
247    // stage's wall-clock. File read + hash above are deliberately excluded:
248    // the figure is "parse work", not IO.
249    let parse_start = std::time::Instant::now();
250    let module = parse_source_to_module(file.id, &file.path, source, content_hash, need_complexity);
251    parse_cpu_nanos.fetch_add(
252        u64::try_from(parse_start.elapsed().as_nanos()).unwrap_or(u64::MAX),
253        Ordering::Relaxed,
254    );
255    Some(module)
256}
257
/// Modification time of `metadata` as whole seconds since the Unix epoch.
///
/// Falls back to 0 whenever the timestamp is unavailable or cannot be
/// expressed relative to the epoch (pre-epoch mtime, unsupported OS, etc.).
fn mtime_secs(metadata: &std::fs::Metadata) -> u64 {
    let Ok(modified) = metadata.modified() else {
        return 0;
    };
    modified
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_secs())
}
267
268/// Parse a single file and extract module information (without complexity).
269#[must_use]
270pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
271    // BOM strip before hash + parse so downstream offsets stay aligned with
272    // the parser's view. See `parse_single_file_cached` and issue #475.
273    let raw = std::fs::read_to_string(&file.path).ok()?;
274    let source = strip_bom(&raw);
275    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
276    Some(parse_source_to_module(
277        file.id,
278        &file.path,
279        source,
280        content_hash,
281        false,
282    ))
283}
284
285/// Parse from in-memory content (for LSP, includes complexity).
286#[must_use]
287pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
288    // Editors normally strip a BOM before sending didOpen.text, but be
289    // defensive: an editor or test that hands us BOM-bearing content must
290    // produce the same offsets as the on-disk path. Issue #475.
291    let content = strip_bom(content);
292    let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
293    parse_source_to_module(file_id, path, content, content_hash, true)
294}
295
296// Parser integration tests invoke Oxc under Miri which is ~1000x slower.
297// Unit tests in individual modules (visitor, suppress, sfc, css, etc.) still run.
298#[cfg(all(test, not(miri)))]
299mod tests;