Skip to main content

fallow_extract/
lib.rs

1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8// fallow's analysis never executes the analyzed project's code, and this crate
9// spawns no external process at all. The deny (paired with the `.clippy.toml`
10// ban on `std::process::Command::new`) keeps it that way: any future process
11// spawn here fails the build. Test helpers are exempt via `not(test)`.
12#![cfg_attr(not(test), deny(clippy::disallowed_methods))]
13
14mod asset_url;
15pub mod astro;
16pub mod cache;
17pub(crate) mod complexity;
18pub mod css;
19pub mod flags;
20pub mod glimmer;
21pub mod graphql;
22pub mod html;
23pub mod iconify;
24pub mod inventory;
25pub mod mdx;
26mod parse;
27pub mod sfc;
28mod sfc_template;
29mod source_map;
30pub mod suppress;
31pub(crate) mod template_complexity;
32mod template_usage;
33pub mod visitor;
34
35use std::path::Path;
36
37use rayon::prelude::*;
38
39use cache::CacheStore;
40use fallow_types::discover::{DiscoveredFile, FileId};
41
42// Re-export all extract types from fallow-types
43pub use fallow_types::extract::{
44    ClassHeritageInfo, DynamicImportInfo, DynamicImportPattern, ExportInfo, ExportName, ImportInfo,
45    ImportedName, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind, ModuleInfo,
46    ParseResult, PublicSignatureTypeReference, ReExportInfo, RequireCallInfo, VisibilityTag,
47    compute_line_offsets,
48};
49
50// Re-export extraction functions for internal use and fuzzing
51pub use astro::extract_astro_frontmatter;
52pub use css::extract_css_module_exports;
53pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
54pub use mdx::extract_mdx_statements;
55pub use sfc::{extract_sfc_scripts, is_sfc_file};
56pub use sfc_template::angular::ANGULAR_TPL_SENTINEL;
57
/// Synthetic member-access object used to carry exported-instance bindings.
///
/// `MemberAccess { object: format!("{INSTANCE_EXPORT_SENTINEL}{export_name}"), member: target }`
/// means the exported value named `export_name` is an instance of the local
/// class/interface symbol named `target`.
///
/// The prefix already ends in `:`, so the export name is appended directly
/// with no extra separator: an export named `api` is encoded as the object
/// string `__fallow_instance_export__:api`.
pub const INSTANCE_EXPORT_SENTINEL: &str = "__fallow_instance_export__:";
64
/// Synthetic member-access object prefix for typed Playwright fixtures.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_DEF_SENTINEL}{test}:{fixture}"), member: type_name }`
/// means the exported Playwright test object named `test` provides a fixture
/// named `fixture` whose declared type is `type_name`.
///
/// The prefix ends in `:` and a second `:` separates the test object name
/// from the fixture name, e.g. `__fallow_playwright_fixture_def__:test:page`
/// for a fixture `page` on a test object `test`.
pub const PLAYWRIGHT_FIXTURE_DEF_SENTINEL: &str = "__fallow_playwright_fixture_def__:";
71
/// Synthetic member-access object prefix for Playwright fixture member uses.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_USE_SENTINEL}{test}:{fixture}"), member }`
/// means a callback passed to the Playwright test object named `test`
/// destructures `fixture` and accesses `fixture.member`.
///
/// The `{test}:{fixture}` suffix has the same shape as the one used with
/// `PLAYWRIGHT_FIXTURE_DEF_SENTINEL`; only the prefix differs, e.g.
/// `__fallow_playwright_fixture_use__:test:page`.
pub const PLAYWRIGHT_FIXTURE_USE_SENTINEL: &str = "__fallow_playwright_fixture_use__:";
78
/// Synthetic member-access object prefix for static-factory call returns.
///
/// `MemberAccess { object: format!("{FACTORY_CALL_SENTINEL}{callee}:{method}"), member }`
/// means a local binding was assigned from `<callee>.<method>()` and a member
/// is accessed on the result. The analyze layer resolves `callee` through the
/// consumer module's imports to a class export and credits `member` on the
/// class when the matching method carries `is_instance_returning_static`.
/// See issue #346.
///
/// Example encoding: a binding assigned from `Foo.create()` produces the
/// object string `__fallow_factory_call__:Foo:create`.
pub const FACTORY_CALL_SENTINEL: &str = "__fallow_factory_call__:";
88
/// Synthetic member-access object prefix for fluent-builder chain credit.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_SENTINEL}{callee}:{root_method}:{chain}"), member }`
/// means a fluent chain `<callee>.<root_method>().<...chain>.<member>` was
/// observed. `chain` is a comma-separated list of method names (empty when
/// `member` is the first chained call after `root_method`). The analyze layer
/// resolves `callee` to a class export, validates `root_method` has
/// `is_instance_returning_static`, walks each `chain` segment requiring
/// `is_self_returning` on the class, and credits `member` on the class
/// when the chain remains on the class type. See issue #387.
///
/// Example encodings, following the format above:
/// - `Foo.create().a().b().done()` with `member = done` yields
///   `__fallow_fluent_chain__:Foo:create:a,b`.
/// - `Foo.create().done()` with `member = done` yields
///   `__fallow_fluent_chain__:Foo:create:` (empty `chain`, trailing `:` kept).
pub const FLUENT_CHAIN_SENTINEL: &str = "__fallow_fluent_chain__:";
100
/// Synthetic member-access object prefix for fluent chains rooted at a `new`
/// expression.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_NEW_SENTINEL}{class}:{chain}"), member }`
/// means a chain `new <class>(...).<...chain>.<member>` was observed. Unlike
/// `FLUENT_CHAIN_SENTINEL`, there is no root method: a constructor always
/// returns an instance of `class`, so no `is_instance_returning_static` check
/// applies. `chain` is a comma-separated list of the intermediate method names
/// between the constructor and `member` (it always contains at least the first
/// method, which must be `is_self_returning` to reach `member`). The analyze
/// layer resolves `class` to a class export, requires every `chain` segment to
/// be `is_self_returning` on the class, and credits `member` on the class.
/// The first method directly off the constructor is credited separately via
/// the `static_member_object_name` `NewExpression` arm. See issue #605.
///
/// Example encoding, following the format above: `new Foo().a().b()` with
/// `member = b` yields `__fallow_fluent_chain_new__:Foo:a`.
pub const FLUENT_CHAIN_NEW_SENTINEL: &str = "__fallow_fluent_chain_new__:";
116
117use parse::parse_source_to_module;
118
/// The Unicode byte order mark (`U+FEFF`) as it appears at the head of a
/// UTF-8 source file.
///
/// Some Windows tooling (Notepad, older VS settings, certain IDE plugins)
/// writes this marker as the first character of a file. fallow's contract is:
/// input is UTF-8 with or without a BOM, line offsets are computed against
/// the post-BOM view, and a BOM that was present on input is preserved on
/// output by `fallow fix`.
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM, or unchanged if it has none.
///
/// At most one BOM is removed, and only when it is the very first character;
/// a BOM anywhere else in the text is left alone. The result borrows from
/// `source`, so no allocation takes place.
///
/// Every file-read entry point in this crate funnels its input through this
/// function so that the content hash, `compute_line_offsets`, the oxc parser,
/// and downstream analyses all agree on the same post-BOM text. This mirrors
/// the `fallow_config` layer (`config_writer.rs::BOM`) so that config-shaped
/// and source-code-shaped inputs are handled symmetrically. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    // `None` means the text did not start with a BOM; fall back to the input.
    let without_bom = source.strip_prefix(BOM_CHAR);
    without_bom.unwrap_or(source)
}
138
139/// Parse all files in parallel, extracting imports and exports.
140/// Uses the cache to skip reparsing files whose content hasn't changed.
141///
142/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
143/// metrics are computed during parsing (needed by the `health` command).
144/// Pass `false` for dead-code analysis where complexity data is unused.
145pub fn parse_all_files(
146    files: &[DiscoveredFile],
147    cache: Option<&CacheStore>,
148    need_complexity: bool,
149) -> ParseResult {
150    use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
151    let cache_hits = AtomicUsize::new(0);
152    let cache_misses = AtomicUsize::new(0);
153    // Summed nanoseconds spent in the actual AST parse (cache-miss path only).
154    // Lets the perf renderer report parse CPU time vs the stage's wall-clock.
155    let parse_cpu_nanos = AtomicU64::new(0);
156
157    let modules: Vec<ModuleInfo> = files
158        .par_iter()
159        .filter_map(|file| {
160            parse_single_file_cached(
161                file,
162                cache,
163                &cache_hits,
164                &cache_misses,
165                &parse_cpu_nanos,
166                need_complexity,
167            )
168        })
169        .collect();
170
171    let hits = cache_hits.load(Ordering::Relaxed);
172    let misses = cache_misses.load(Ordering::Relaxed);
173    if hits > 0 || misses > 0 {
174        tracing::info!(
175            cache_hits = hits,
176            cache_misses = misses,
177            "incremental cache stats"
178        );
179    }
180
181    ParseResult {
182        modules,
183        cache_hits: hits,
184        cache_misses: misses,
185        parse_cpu_ms: parse_cpu_nanos.load(Ordering::Relaxed) as f64 / 1_000_000.0,
186    }
187}
188
189/// Parse a single file, consulting the cache first.
190///
191/// Cache validation strategy (fast path -> slow path):
192/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
193/// 2. If mtime+size match the cached entry -> cache hit, return immediately
194/// 3. If mtime+size differ -> read file, compute content hash
195/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
196/// 5. Otherwise -> cache miss, full parse
197fn parse_single_file_cached(
198    file: &DiscoveredFile,
199    cache: Option<&CacheStore>,
200    cache_hits: &std::sync::atomic::AtomicUsize,
201    cache_misses: &std::sync::atomic::AtomicUsize,
202    parse_cpu_nanos: &std::sync::atomic::AtomicU64,
203    need_complexity: bool,
204) -> Option<ModuleInfo> {
205    use std::sync::atomic::Ordering;
206
207    // Fast path: check mtime+size before reading file content.
208    // A single stat() syscall is ~100x cheaper than read()+hash().
209    if let Some(store) = cache
210        && let Ok(metadata) = std::fs::metadata(&file.path)
211    {
212        let mt = mtime_secs(&metadata);
213        let sz = metadata.len();
214        if let Some(cached) = store.get_by_metadata(&file.path, mt, sz) {
215            // When complexity is requested but the cached entry lacks it
216            // (populated by a prior `check` run), skip the cache and re-parse.
217            if !need_complexity || !cached.complexity.is_empty() {
218                cache_hits.fetch_add(1, Ordering::Relaxed);
219                return Some(cache::cached_to_module_opts(
220                    cached,
221                    file.id,
222                    need_complexity,
223                ));
224            }
225        }
226    }
227
228    // Slow path: read file content and compute content hash.
229    // Strip the UTF-8 BOM, if present, before hashing AND before parsing so
230    // the content hash, `compute_line_offsets`, and the oxc parser all see
231    // the same byte sequence. Without this, hash matches that depend on
232    // BOM presence would silently miss the cache. Issue #475.
233    let raw = std::fs::read_to_string(&file.path).ok()?;
234    let source = strip_bom(&raw);
235    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
236
237    // Check cache by content hash (handles touch/save-without-change)
238    if let Some(store) = cache
239        && let Some(cached) = store.get(&file.path, content_hash)
240        && (!need_complexity || !cached.complexity.is_empty())
241    {
242        cache_hits.fetch_add(1, Ordering::Relaxed);
243        return Some(cache::cached_to_module_opts(
244            cached,
245            file.id,
246            need_complexity,
247        ));
248    }
249    cache_misses.fetch_add(1, Ordering::Relaxed);
250
251    // Cache miss, do a full parse. Time just this AST parse so the perf
252    // renderer can report parse CPU time (summed across workers) vs the
253    // stage's wall-clock. File read + hash above are deliberately excluded:
254    // the figure is "parse work", not IO.
255    let parse_start = std::time::Instant::now();
256    let module = parse_source_to_module(file.id, &file.path, source, content_hash, need_complexity);
257    parse_cpu_nanos.fetch_add(
258        u64::try_from(parse_start.elapsed().as_nanos()).unwrap_or(u64::MAX),
259        Ordering::Relaxed,
260    );
261    Some(module)
262}
263
/// Return the file's modification time as whole seconds since the Unix epoch.
///
/// Yields `0` when the modification time is unavailable from `metadata` or
/// when it lies before the epoch.
fn mtime_secs(metadata: &std::fs::Metadata) -> u64 {
    metadata.modified().map_or(0, |modified| {
        modified
            .duration_since(std::time::UNIX_EPOCH)
            .map_or(0, |since_epoch| since_epoch.as_secs())
    })
}
273
274/// Parse a single file and extract module information (without complexity).
275#[must_use]
276pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
277    // BOM strip before hash + parse so downstream offsets stay aligned with
278    // the parser's view. See `parse_single_file_cached` and issue #475.
279    let raw = std::fs::read_to_string(&file.path).ok()?;
280    let source = strip_bom(&raw);
281    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
282    Some(parse_source_to_module(
283        file.id,
284        &file.path,
285        source,
286        content_hash,
287        false,
288    ))
289}
290
291/// Parse from in-memory content (for LSP, includes complexity).
292#[must_use]
293pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
294    // Editors normally strip a BOM before sending didOpen.text, but be
295    // defensive: an editor or test that hands us BOM-bearing content must
296    // produce the same offsets as the on-disk path. Issue #475.
297    let content = strip_bom(content);
298    let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
299    parse_source_to_module(file_id, path, content, content_hash, true)
300}
301
302// Parser integration tests invoke Oxc under Miri which is ~1000x slower.
303// Unit tests in individual modules (visitor, suppress, sfc, css, etc.) still run.
304#[cfg(all(test, not(miri)))]
305mod tests;