fallow_extract/lib.rs
1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8
9mod asset_url;
10pub mod astro;
11pub mod cache;
12pub(crate) mod complexity;
13pub mod css;
14pub mod flags;
15pub mod glimmer;
16pub mod graphql;
17pub mod html;
18pub mod iconify;
19pub mod inventory;
20pub mod mdx;
21mod parse;
22pub mod sfc;
23mod sfc_template;
24pub mod suppress;
25pub(crate) mod template_complexity;
26mod template_usage;
27pub mod visitor;
28
29use std::path::Path;
30
31use rayon::prelude::*;
32
33use cache::CacheStore;
34use fallow_types::discover::{DiscoveredFile, FileId};
35
36// Re-export all extract types from fallow-types
37pub use fallow_types::extract::{
38 ClassHeritageInfo, DynamicImportInfo, DynamicImportPattern, ExportInfo, ExportName, ImportInfo,
39 ImportedName, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind, ModuleInfo,
40 ParseResult, PublicSignatureTypeReference, ReExportInfo, RequireCallInfo, VisibilityTag,
41 compute_line_offsets,
42};
43
44// Re-export extraction functions for internal use and fuzzing
45pub use astro::extract_astro_frontmatter;
46pub use css::extract_css_module_exports;
47pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
48pub use mdx::extract_mdx_statements;
49pub use sfc::{extract_sfc_scripts, is_sfc_file};
50pub use sfc_template::angular::ANGULAR_TPL_SENTINEL;
51
/// Synthetic member-access object prefix used to carry exported-instance
/// bindings.
///
/// `MemberAccess { object: format!("{INSTANCE_EXPORT_SENTINEL}{export_name}"), member: target }`
/// means the exported value named `export_name` is an instance of the local
/// class/interface symbol named `target`.
///
/// The export name is appended directly after the prefix's trailing `:`.
pub const INSTANCE_EXPORT_SENTINEL: &str = "__fallow_instance_export__:";
58
/// Synthetic member-access object prefix for typed Playwright fixtures.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_DEF_SENTINEL}{test}:{fixture}"), member: type_name }`
/// means the exported Playwright test object named `test` provides a fixture
/// named `fixture` whose declared type is `type_name`.
///
/// The text after the prefix is laid out as `<test>:<fixture>`; the fixture's
/// type name travels in the `member` field, not in the object string.
pub const PLAYWRIGHT_FIXTURE_DEF_SENTINEL: &str = "__fallow_playwright_fixture_def__:";
65
/// Synthetic member-access object prefix for Playwright fixture member uses.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_USE_SENTINEL}{test}:{fixture}"), member }`
/// means a callback passed to the Playwright test object named `test`
/// destructures `fixture` and accesses `fixture.member`.
///
/// The text after the prefix is laid out as `<test>:<fixture>`; the accessed
/// member name travels in the `member` field.
pub const PLAYWRIGHT_FIXTURE_USE_SENTINEL: &str = "__fallow_playwright_fixture_use__:";
72
/// Synthetic member-access object prefix for static-factory call returns.
///
/// `MemberAccess { object: format!("{FACTORY_CALL_SENTINEL}{callee}:{method}"), member }`
/// means a local binding was assigned from `<callee>.<method>()` and a member
/// is accessed on the result. The text after the prefix is laid out as
/// `<callee>:<method>`.
///
/// The analyze layer resolves `callee` through the consumer module's imports
/// to a class export and credits `member` on the class when the matching
/// method carries `is_instance_returning_static`. See issue #346.
pub const FACTORY_CALL_SENTINEL: &str = "__fallow_factory_call__:";
82
/// Synthetic member-access object prefix for fluent-builder chain credit.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_SENTINEL}{callee}:{root_method}:{chain}"), member }`
/// means a fluent chain `<callee>.<root_method>().<...chain>.<member>` was
/// observed. `chain` is a comma-separated list of method names; it is empty
/// when `member` is the first chained call after `root_method`, in which case
/// the object string ends with the `:` that follows `root_method`.
///
/// The analyze layer resolves `callee` to a class export, validates that
/// `root_method` has `is_instance_returning_static`, walks each `chain`
/// segment requiring `is_self_returning` on the class, and credits `member`
/// on the class when the chain remains on the class type. See issue #387.
pub const FLUENT_CHAIN_SENTINEL: &str = "__fallow_fluent_chain__:";
94
/// Synthetic member-access object prefix for fluent chains rooted at a `new`
/// expression.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_NEW_SENTINEL}{class}:{chain}"), member }`
/// means a chain `new <class>(...).<...chain>.<member>` was observed. The text
/// after the prefix is laid out as `<class>:<chain>`.
///
/// Unlike `FLUENT_CHAIN_SENTINEL`, there is no root method: a constructor
/// always returns an instance of `class`, so no `is_instance_returning_static`
/// check applies. `chain` is a comma-separated list of the intermediate method
/// names between the constructor and `member` (it always contains at least the
/// first method, which must be `is_self_returning` to reach `member`).
///
/// The analyze layer resolves `class` to a class export, requires every
/// `chain` segment to be `is_self_returning` on the class, and credits
/// `member` on the class. The first method directly off the constructor is
/// credited separately via the `static_member_object_name` `NewExpression`
/// arm. See issue #605.
pub const FLUENT_CHAIN_NEW_SENTINEL: &str = "__fallow_fluent_chain_new__:";
110
111use parse::parse_source_to_module;
112
/// The byte order mark codepoint (`U+FEFF`) that may lead a UTF-8 file.
///
/// Windows editors (Notepad, older VS settings, some IDE plugins) emit a UTF-8
/// BOM at the start of source files. fallow's contract is "UTF-8 with or
/// without BOM; line offsets are computed against the post-BOM view; the BOM,
/// if present on input, is preserved on output by `fallow fix`."
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM, or unchanged if it has none.
///
/// At most one BOM is removed, and only when it is the very first character;
/// a BOM anywhere else in the text is left alone.
///
/// Every file-read entry point in this crate calls this so the rest of the
/// pipeline (content hash, `compute_line_offsets`, oxc parser, downstream
/// analyses) sees a consistent post-BOM view. Mirrors the `fallow_config`
/// layer (`config_writer.rs::BOM`) so config-shaped sources and
/// source-code-shaped sources are processed symmetrically. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    // Consume one character; if it was the BOM, whatever the cursor has left
    // is the post-BOM view.
    let mut cursor = source.chars();
    if cursor.next() == Some(BOM_CHAR) {
        cursor.as_str()
    } else {
        source
    }
}
132
133/// Parse all files in parallel, extracting imports and exports.
134/// Uses the cache to skip reparsing files whose content hasn't changed.
135///
136/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
137/// metrics are computed during parsing (needed by the `health` command).
138/// Pass `false` for dead-code analysis where complexity data is unused.
139pub fn parse_all_files(
140 files: &[DiscoveredFile],
141 cache: Option<&CacheStore>,
142 need_complexity: bool,
143) -> ParseResult {
144 use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
145 let cache_hits = AtomicUsize::new(0);
146 let cache_misses = AtomicUsize::new(0);
147 // Summed nanoseconds spent in the actual AST parse (cache-miss path only).
148 // Lets the perf renderer report parse CPU time vs the stage's wall-clock.
149 let parse_cpu_nanos = AtomicU64::new(0);
150
151 let modules: Vec<ModuleInfo> = files
152 .par_iter()
153 .filter_map(|file| {
154 parse_single_file_cached(
155 file,
156 cache,
157 &cache_hits,
158 &cache_misses,
159 &parse_cpu_nanos,
160 need_complexity,
161 )
162 })
163 .collect();
164
165 let hits = cache_hits.load(Ordering::Relaxed);
166 let misses = cache_misses.load(Ordering::Relaxed);
167 if hits > 0 || misses > 0 {
168 tracing::info!(
169 cache_hits = hits,
170 cache_misses = misses,
171 "incremental cache stats"
172 );
173 }
174
175 ParseResult {
176 modules,
177 cache_hits: hits,
178 cache_misses: misses,
179 parse_cpu_ms: parse_cpu_nanos.load(Ordering::Relaxed) as f64 / 1_000_000.0,
180 }
181}
182
183/// Parse a single file, consulting the cache first.
184///
185/// Cache validation strategy (fast path -> slow path):
186/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
187/// 2. If mtime+size match the cached entry -> cache hit, return immediately
188/// 3. If mtime+size differ -> read file, compute content hash
189/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
190/// 5. Otherwise -> cache miss, full parse
191fn parse_single_file_cached(
192 file: &DiscoveredFile,
193 cache: Option<&CacheStore>,
194 cache_hits: &std::sync::atomic::AtomicUsize,
195 cache_misses: &std::sync::atomic::AtomicUsize,
196 parse_cpu_nanos: &std::sync::atomic::AtomicU64,
197 need_complexity: bool,
198) -> Option<ModuleInfo> {
199 use std::sync::atomic::Ordering;
200
201 // Fast path: check mtime+size before reading file content.
202 // A single stat() syscall is ~100x cheaper than read()+hash().
203 if let Some(store) = cache
204 && let Ok(metadata) = std::fs::metadata(&file.path)
205 {
206 let mt = mtime_secs(&metadata);
207 let sz = metadata.len();
208 if let Some(cached) = store.get_by_metadata(&file.path, mt, sz) {
209 // When complexity is requested but the cached entry lacks it
210 // (populated by a prior `check` run), skip the cache and re-parse.
211 if !need_complexity || !cached.complexity.is_empty() {
212 cache_hits.fetch_add(1, Ordering::Relaxed);
213 return Some(cache::cached_to_module_opts(
214 cached,
215 file.id,
216 need_complexity,
217 ));
218 }
219 }
220 }
221
222 // Slow path: read file content and compute content hash.
223 // Strip the UTF-8 BOM, if present, before hashing AND before parsing so
224 // the content hash, `compute_line_offsets`, and the oxc parser all see
225 // the same byte sequence. Without this, hash matches that depend on
226 // BOM presence would silently miss the cache. Issue #475.
227 let raw = std::fs::read_to_string(&file.path).ok()?;
228 let source = strip_bom(&raw);
229 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
230
231 // Check cache by content hash (handles touch/save-without-change)
232 if let Some(store) = cache
233 && let Some(cached) = store.get(&file.path, content_hash)
234 && (!need_complexity || !cached.complexity.is_empty())
235 {
236 cache_hits.fetch_add(1, Ordering::Relaxed);
237 return Some(cache::cached_to_module_opts(
238 cached,
239 file.id,
240 need_complexity,
241 ));
242 }
243 cache_misses.fetch_add(1, Ordering::Relaxed);
244
245 // Cache miss, do a full parse. Time just this AST parse so the perf
246 // renderer can report parse CPU time (summed across workers) vs the
247 // stage's wall-clock. File read + hash above are deliberately excluded:
248 // the figure is "parse work", not IO.
249 let parse_start = std::time::Instant::now();
250 let module = parse_source_to_module(file.id, &file.path, source, content_hash, need_complexity);
251 parse_cpu_nanos.fetch_add(
252 u64::try_from(parse_start.elapsed().as_nanos()).unwrap_or(u64::MAX),
253 Ordering::Relaxed,
254 );
255 Some(module)
256}
257
/// Read the modification time from `metadata` as whole seconds since the
/// Unix epoch.
///
/// Returns 0 when the mtime cannot be determined: the platform does not
/// report one, or the timestamp lies before the epoch.
fn mtime_secs(metadata: &std::fs::Metadata) -> u64 {
    let Ok(modified) = metadata.modified() else {
        return 0;
    };
    let Ok(since_epoch) = modified.duration_since(std::time::UNIX_EPOCH) else {
        return 0;
    };
    since_epoch.as_secs()
}
267
268/// Parse a single file and extract module information (without complexity).
269#[must_use]
270pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
271 // BOM strip before hash + parse so downstream offsets stay aligned with
272 // the parser's view. See `parse_single_file_cached` and issue #475.
273 let raw = std::fs::read_to_string(&file.path).ok()?;
274 let source = strip_bom(&raw);
275 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
276 Some(parse_source_to_module(
277 file.id,
278 &file.path,
279 source,
280 content_hash,
281 false,
282 ))
283}
284
285/// Parse from in-memory content (for LSP, includes complexity).
286#[must_use]
287pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
288 // Editors normally strip a BOM before sending didOpen.text, but be
289 // defensive: an editor or test that hands us BOM-bearing content must
290 // produce the same offsets as the on-disk path. Issue #475.
291 let content = strip_bom(content);
292 let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
293 parse_source_to_module(file_id, path, content, content_hash, true)
294}
295
// Parser integration tests invoke Oxc, which runs ~1000x slower under Miri, so
// they are compiled out there. Unit tests in individual modules (visitor,
// suppress, sfc, css, etc.) still run.
298#[cfg(all(test, not(miri)))]
299mod tests;