fallow_extract/lib.rs
1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8// fallow's analysis never executes the analyzed project's code, and this crate
9// spawns no external process at all. The deny (paired with the `.clippy.toml`
10// ban on `std::process::Command::new`) keeps it that way: any future process
11// spawn here fails the build. Test helpers are exempt via `not(test)`.
12#![cfg_attr(not(test), deny(clippy::disallowed_methods))]
13
14mod asset_url;
15pub mod astro;
16pub mod cache;
17pub(crate) mod complexity;
18pub mod css;
19pub mod flags;
20pub mod glimmer;
21pub mod graphql;
22pub mod html;
23pub mod iconify;
24pub mod inventory;
25pub mod mdx;
26mod parse;
27pub mod sfc;
28mod sfc_template;
29mod source_map;
30pub mod suppress;
31pub(crate) mod template_complexity;
32mod template_usage;
33pub mod visitor;
34
35use std::path::Path;
36
37use rayon::prelude::*;
38
39use cache::CacheStore;
40use fallow_types::discover::{DiscoveredFile, FileId};
41
42// Re-export all extract types from fallow-types
43pub use fallow_types::extract::{
44 ClassHeritageInfo, DynamicImportInfo, DynamicImportPattern, ExportInfo, ExportName, ImportInfo,
45 ImportedName, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind, ModuleInfo,
46 ParseResult, PublicSignatureTypeReference, ReExportInfo, RequireCallInfo, VisibilityTag,
47 compute_line_offsets,
48};
49
50// Re-export extraction functions for internal use and fuzzing
51pub use astro::extract_astro_frontmatter;
52pub use css::extract_css_module_exports;
53pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
54pub use mdx::extract_mdx_statements;
55pub use sfc::{extract_sfc_scripts, is_sfc_file};
56pub use sfc_template::angular::ANGULAR_TPL_SENTINEL;
57
58/// Synthetic member-access object used to carry exported-instance bindings.
59///
60/// `MemberAccess { object: format!("{INSTANCE_EXPORT_SENTINEL}{export_name}"), member: target }`
61/// means the exported value named `export_name` is an instance of the local
62/// class/interface symbol named `target`.
63pub const INSTANCE_EXPORT_SENTINEL: &str = "__fallow_instance_export__:";
64
65/// Synthetic member-access object prefix for typed Playwright fixtures.
66///
67/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_DEF_SENTINEL}{test}:{fixture}"), member: type_name }`
68/// means the exported Playwright test object named `test` provides a fixture
69/// named `fixture` whose declared type is `type_name`.
70pub const PLAYWRIGHT_FIXTURE_DEF_SENTINEL: &str = "__fallow_playwright_fixture_def__:";
71
72/// Synthetic member-access object prefix for Playwright fixture member uses.
73///
74/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_USE_SENTINEL}{test}:{fixture}"), member }`
75/// means a callback passed to the Playwright test object named `test`
76/// destructures `fixture` and accesses `fixture.member`.
77pub const PLAYWRIGHT_FIXTURE_USE_SENTINEL: &str = "__fallow_playwright_fixture_use__:";
78
79/// Synthetic member-access object prefix for static-factory call returns.
80///
81/// `MemberAccess { object: format!("{FACTORY_CALL_SENTINEL}{callee}:{method}"), member }`
82/// means a local binding was assigned from `<callee>.<method>()` and a member
83/// is accessed on the result. The analyze layer resolves `callee` through the
84/// consumer module's imports to a class export and credits `member` on the
85/// class when the matching method carries `is_instance_returning_static`.
86/// See issue #346.
87pub const FACTORY_CALL_SENTINEL: &str = "__fallow_factory_call__:";
88
89/// Synthetic member-access object prefix for fluent-builder chain credit.
90///
91/// `MemberAccess { object: format!("{FLUENT_CHAIN_SENTINEL}{callee}:{root_method}:{chain}"), member }`
92/// means a fluent chain `<callee>.<root_method>().<...chain>.<member>` was
93/// observed. `chain` is a comma-separated list of method names (empty when
94/// `member` is the first chained call after `root_method`). The analyze layer
95/// resolves `callee` to a class export, validates `root_method` has
96/// `is_instance_returning_static`, walks each `chain` segment requiring
97/// `is_self_returning` on the class, and credits `member` on the class
98/// when the chain remains on the class type. See issue #387.
99pub const FLUENT_CHAIN_SENTINEL: &str = "__fallow_fluent_chain__:";
100
101/// Synthetic member-access object prefix for fluent chains rooted at a `new`
102/// expression.
103///
104/// `MemberAccess { object: format!("{FLUENT_CHAIN_NEW_SENTINEL}{class}:{chain}"), member }`
105/// means a chain `new <class>(...).<...chain>.<member>` was observed. Unlike
106/// `FLUENT_CHAIN_SENTINEL`, there is no root method: a constructor always
107/// returns an instance of `class`, so no `is_instance_returning_static` check
108/// applies. `chain` is a comma-separated list of the intermediate method names
109/// between the constructor and `member` (it always contains at least the first
110/// method, which must be `is_self_returning` to reach `member`). The analyze
111/// layer resolves `class` to a class export, requires every `chain` segment to
112/// be `is_self_returning` on the class, and credits `member` on the class.
113/// The first method directly off the constructor is credited separately via
114/// the `static_member_object_name` `NewExpression` arm. See issue #605.
115pub const FLUENT_CHAIN_NEW_SENTINEL: &str = "__fallow_fluent_chain_new__:";
116
117use parse::parse_source_to_module;
118
/// The UTF-8 byte order mark, expressed as a single `char`.
///
/// Some Windows tooling (Notepad, older VS settings, some IDE plugins) writes
/// this codepoint as the very first character of a source file. fallow's
/// contract is "UTF-8 with or without BOM; line offsets are computed against
/// the post-BOM view; the BOM, if present on input, is preserved on output by
/// `fallow fix`."
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM.
///
/// At most one BOM is removed, and only when it is the first character of
/// `source`; input that does not start with a BOM is returned unchanged.
///
/// Every file-read entry point in this crate calls this so that the content
/// hash, `compute_line_offsets`, the oxc parser, and downstream analyses all
/// work from the same post-BOM text. Mirrors the `fallow_config` layer
/// (`config_writer.rs::BOM`) so config-shaped and source-code-shaped inputs
/// are treated symmetrically. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    let Some(without_bom) = source.strip_prefix(BOM_CHAR) else {
        // No BOM at the start: hand the input back untouched.
        return source;
    };
    without_bom
}
138
139/// Parse all files in parallel, extracting imports and exports.
140/// Uses the cache to skip reparsing files whose content hasn't changed.
141///
142/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
143/// metrics are computed during parsing (needed by the `health` command).
144/// Pass `false` for dead-code analysis where complexity data is unused.
145pub fn parse_all_files(
146 files: &[DiscoveredFile],
147 cache: Option<&CacheStore>,
148 need_complexity: bool,
149) -> ParseResult {
150 use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
151 let cache_hits = AtomicUsize::new(0);
152 let cache_misses = AtomicUsize::new(0);
153 // Summed nanoseconds spent in the actual AST parse (cache-miss path only).
154 // Lets the perf renderer report parse CPU time vs the stage's wall-clock.
155 let parse_cpu_nanos = AtomicU64::new(0);
156
157 let modules: Vec<ModuleInfo> = files
158 .par_iter()
159 .filter_map(|file| {
160 parse_single_file_cached(
161 file,
162 cache,
163 &cache_hits,
164 &cache_misses,
165 &parse_cpu_nanos,
166 need_complexity,
167 )
168 })
169 .collect();
170
171 let hits = cache_hits.load(Ordering::Relaxed);
172 let misses = cache_misses.load(Ordering::Relaxed);
173 if hits > 0 || misses > 0 {
174 tracing::info!(
175 cache_hits = hits,
176 cache_misses = misses,
177 "incremental cache stats"
178 );
179 }
180
181 ParseResult {
182 modules,
183 cache_hits: hits,
184 cache_misses: misses,
185 parse_cpu_ms: parse_cpu_nanos.load(Ordering::Relaxed) as f64 / 1_000_000.0,
186 }
187}
188
189/// Parse a single file, consulting the cache first.
190///
191/// Cache validation strategy (fast path -> slow path):
192/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
193/// 2. If mtime+size match the cached entry -> cache hit, return immediately
194/// 3. If mtime+size differ -> read file, compute content hash
195/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
196/// 5. Otherwise -> cache miss, full parse
197fn parse_single_file_cached(
198 file: &DiscoveredFile,
199 cache: Option<&CacheStore>,
200 cache_hits: &std::sync::atomic::AtomicUsize,
201 cache_misses: &std::sync::atomic::AtomicUsize,
202 parse_cpu_nanos: &std::sync::atomic::AtomicU64,
203 need_complexity: bool,
204) -> Option<ModuleInfo> {
205 use std::sync::atomic::Ordering;
206
207 // Fast path: check mtime+size before reading file content.
208 // A single stat() syscall is ~100x cheaper than read()+hash().
209 if let Some(store) = cache
210 && let Ok(metadata) = std::fs::metadata(&file.path)
211 {
212 let mt = mtime_secs(&metadata);
213 let sz = metadata.len();
214 if let Some(cached) = store.get_by_metadata(&file.path, mt, sz) {
215 // When complexity is requested but the cached entry lacks it
216 // (populated by a prior `check` run), skip the cache and re-parse.
217 if !need_complexity || !cached.complexity.is_empty() {
218 cache_hits.fetch_add(1, Ordering::Relaxed);
219 return Some(cache::cached_to_module_opts(
220 cached,
221 file.id,
222 need_complexity,
223 ));
224 }
225 }
226 }
227
228 // Slow path: read file content and compute content hash.
229 // Strip the UTF-8 BOM, if present, before hashing AND before parsing so
230 // the content hash, `compute_line_offsets`, and the oxc parser all see
231 // the same byte sequence. Without this, hash matches that depend on
232 // BOM presence would silently miss the cache. Issue #475.
233 let raw = std::fs::read_to_string(&file.path).ok()?;
234 let source = strip_bom(&raw);
235 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
236
237 // Check cache by content hash (handles touch/save-without-change)
238 if let Some(store) = cache
239 && let Some(cached) = store.get(&file.path, content_hash)
240 && (!need_complexity || !cached.complexity.is_empty())
241 {
242 cache_hits.fetch_add(1, Ordering::Relaxed);
243 return Some(cache::cached_to_module_opts(
244 cached,
245 file.id,
246 need_complexity,
247 ));
248 }
249 cache_misses.fetch_add(1, Ordering::Relaxed);
250
251 // Cache miss, do a full parse. Time just this AST parse so the perf
252 // renderer can report parse CPU time (summed across workers) vs the
253 // stage's wall-clock. File read + hash above are deliberately excluded:
254 // the figure is "parse work", not IO.
255 let parse_start = std::time::Instant::now();
256 let module = parse_source_to_module(file.id, &file.path, source, content_hash, need_complexity);
257 parse_cpu_nanos.fetch_add(
258 u64::try_from(parse_start.elapsed().as_nanos()).unwrap_or(u64::MAX),
259 Ordering::Relaxed,
260 );
261 Some(module)
262}
263
/// Modification time of `metadata` as whole seconds since the Unix epoch.
///
/// Yields 0 whenever the timestamp is unavailable or lies before the epoch
/// (unsupported platform, pre-epoch mtime, etc.).
fn mtime_secs(metadata: &std::fs::Metadata) -> u64 {
    use std::time::UNIX_EPOCH;

    let Ok(modified) = metadata.modified() else {
        // The platform could not report a modification time.
        return 0;
    };
    // `duration_since` fails for pre-epoch timestamps; treat those as 0 too.
    modified
        .duration_since(UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_secs())
}
273
274/// Parse a single file and extract module information (without complexity).
275#[must_use]
276pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
277 // BOM strip before hash + parse so downstream offsets stay aligned with
278 // the parser's view. See `parse_single_file_cached` and issue #475.
279 let raw = std::fs::read_to_string(&file.path).ok()?;
280 let source = strip_bom(&raw);
281 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
282 Some(parse_source_to_module(
283 file.id,
284 &file.path,
285 source,
286 content_hash,
287 false,
288 ))
289}
290
291/// Parse from in-memory content (for LSP, includes complexity).
292#[must_use]
293pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
294 // Editors normally strip a BOM before sending didOpen.text, but be
295 // defensive: an editor or test that hands us BOM-bearing content must
296 // produce the same offsets as the on-disk path. Issue #475.
297 let content = strip_bom(content);
298 let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
299 parse_source_to_module(file_id, path, content, content_hash, true)
300}
301
302// Parser integration tests invoke Oxc under Miri which is ~1000x slower.
303// Unit tests in individual modules (visitor, suppress, sfc, css, etc.) still run.
304#[cfg(all(test, not(miri)))]
305mod tests;