fallow_extract/lib.rs
1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8#![cfg_attr(not(test), deny(clippy::disallowed_methods))]
9#![cfg_attr(
10 test,
11 allow(
12 clippy::unwrap_used,
13 clippy::expect_used,
14 reason = "tests use unwrap and expect to keep fixture setup concise"
15 )
16)]
17
18mod asset_url;
19pub mod astro;
20pub mod cache;
21pub(crate) mod complexity;
22pub mod css;
23pub mod flags;
24pub mod glimmer;
25pub mod graphql;
26pub mod html;
27pub mod iconify;
28pub mod inventory;
29pub mod mdx;
30mod parse;
31pub mod sfc;
32mod sfc_template;
33mod source_map;
34pub mod suppress;
35pub(crate) mod template_complexity;
36mod template_usage;
37/// Visitor utilities for AST extraction.
38pub mod visitor;
39
40use std::path::Path;
41
42use rayon::prelude::*;
43
44use cache::CacheStore;
45use fallow_types::discover::{DiscoveredFile, FileId};
46
47pub use fallow_types::extract::{
48 ClassHeritageInfo, DynamicImportInfo, DynamicImportPattern, ExportInfo, ExportName, ImportInfo,
49 ImportedName, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind, ModuleInfo,
50 ParseResult, PublicSignatureTypeReference, ReExportInfo, RequireCallInfo, VisibilityTag,
51 compute_line_offsets,
52};
53
54pub use astro::extract_astro_frontmatter;
55pub use css::extract_css_module_exports;
56pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
57pub use mdx::extract_mdx_statements;
58pub use sfc::{extract_sfc_scripts, is_sfc_file};
59pub use sfc_template::angular::ANGULAR_TPL_SENTINEL;
60
61#[expect(
62 clippy::expect_used,
63 reason = "static regex patterns are hard-coded analyzer invariants covered by extraction tests"
64)]
65pub(crate) fn static_regex(pattern: &str) -> regex::Regex {
66 regex::Regex::new(pattern).expect("static regex pattern should compile")
67}
68
// Sentinel prefixes for synthetic `MemberAccess` objects.
//
// Every constant below ends with `:` and is used as the leading part of a
// `format!`-built `object` string, as shown in each constant's documentation.

/// Synthetic member-access object used to carry exported-instance bindings.
///
/// `MemberAccess { object: format!("{INSTANCE_EXPORT_SENTINEL}{export_name}"), member: target }`
/// means the exported value named `export_name` is an instance of the local
/// class/interface symbol named `target`.
pub const INSTANCE_EXPORT_SENTINEL: &str = "__fallow_instance_export__:";

/// Synthetic member-access object prefix for typed Playwright fixtures.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_DEF_SENTINEL}{test}:{fixture}"), member: type_name }`
/// means the exported Playwright test object named `test` provides a fixture
/// named `fixture` whose declared type is `type_name`.
pub const PLAYWRIGHT_FIXTURE_DEF_SENTINEL: &str = "__fallow_playwright_fixture_def__:";

/// Synthetic member-access object prefix for Playwright fixture member uses.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_USE_SENTINEL}{test}:{fixture}"), member }`
/// means a callback passed to the Playwright test object named `test`
/// destructures `fixture` and accesses `fixture.member`.
pub const PLAYWRIGHT_FIXTURE_USE_SENTINEL: &str = "__fallow_playwright_fixture_use__:";

/// Synthetic member-access object prefix for exported Playwright fixture type aliases.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_TYPE_SENTINEL}{alias}:{fixture_path}"), member: type_name }`
/// means a local type alias named `alias` contains a nested fixture path whose
/// declared type is `type_name`. The analyze layer uses this when a Playwright
/// fixture generic imports an object type alias from another module.
pub const PLAYWRIGHT_FIXTURE_TYPE_SENTINEL: &str = "__fallow_playwright_fixture_type__:";

/// Synthetic member-access object prefix for static-factory call returns.
///
/// `MemberAccess { object: format!("{FACTORY_CALL_SENTINEL}{callee}:{method}"), member }`
/// means a local binding was assigned from `<callee>.<method>()` and a member
/// is accessed on the result. The analyze layer resolves `callee` through the
/// consumer module's imports to a class export and credits `member` on the
/// class when the matching method carries `is_instance_returning_static`.
/// See issue #346.
pub const FACTORY_CALL_SENTINEL: &str = "__fallow_factory_call__:";

/// Synthetic member-access object prefix for fluent-builder chain credit.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_SENTINEL}{callee}:{root_method}:{chain}"), member }`
/// means a fluent chain `<callee>.<root_method>().<...chain>.<member>` was
/// observed. `chain` is a comma-separated list of method names (empty when
/// `member` is the first chained call after `root_method`). The analyze layer
/// resolves `callee` to a class export, validates `root_method` has
/// `is_instance_returning_static`, walks each `chain` segment requiring
/// `is_self_returning` on the class, and credits `member` on the class
/// when the chain remains on the class type. See issue #387.
pub const FLUENT_CHAIN_SENTINEL: &str = "__fallow_fluent_chain__:";

/// Synthetic member-access object prefix for fluent chains rooted at a `new`
/// expression.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_NEW_SENTINEL}{class}:{chain}"), member }`
/// means a chain `new <class>(...).<...chain>.<member>` was observed. Unlike
/// [`FLUENT_CHAIN_SENTINEL`], there is no root method: a constructor always
/// returns an instance of `class`, so no `is_instance_returning_static` check
/// applies. `chain` is a comma-separated list of the intermediate method names
/// between the constructor and `member` (it always contains at least the first
/// method, which must be `is_self_returning` to reach `member`). The analyze
/// layer resolves `class` to a class export, requires every `chain` segment to
/// be `is_self_returning` on the class, and credits `member` on the class.
/// The first method directly off the constructor is credited separately via
/// the `static_member_object_name` `NewExpression` arm. See issue #605.
pub const FLUENT_CHAIN_NEW_SENTINEL: &str = "__fallow_fluent_chain_new__:";
135
136pub use parse::parse_source_to_module;
137
/// The UTF-8 byte order mark (U+FEFF) as it appears at the start of a file.
///
/// Windows editors (Notepad, older VS settings, some IDE plugins) emit a UTF-8
/// BOM at the start of source files. fallow's contract is "UTF-8 with or
/// without BOM; line offsets are computed against the post-BOM view; the BOM,
/// if present on input, is preserved on output by `fallow fix`."
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM, or unchanged if it has none.
///
/// At most one BOM is removed, and only from the very start of the string.
///
/// Called at every file-read entry point in this crate so the rest of the
/// pipeline (content hash, `compute_line_offsets`, oxc parser, downstream
/// analyses) sees a consistent post-BOM view. Mirrors the
/// `fallow_config` layer (`config_writer.rs::BOM`) so config-shaped sources
/// and source-code-shaped sources are processed symmetrically. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    let without_bom = source.strip_prefix(BOM_CHAR);
    without_bom.unwrap_or(source)
}
157
158/// Parse all files in parallel, extracting imports and exports.
159/// Uses the cache to skip reparsing files whose content hasn't changed.
160///
161/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
162/// metrics are computed during parsing (needed by the `health` command).
163/// Pass `false` for dead-code analysis where complexity data is unused.
164pub fn parse_all_files(
165 files: &[DiscoveredFile],
166 cache: Option<&CacheStore>,
167 need_complexity: bool,
168) -> ParseResult {
169 use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
170 let cache_hits = AtomicUsize::new(0);
171 let cache_misses = AtomicUsize::new(0);
172 let parse_cpu_nanos = AtomicU64::new(0);
173
174 let modules: Vec<ModuleInfo> = files
175 .par_iter()
176 .filter_map(|file| {
177 parse_single_file_cached(
178 file,
179 cache,
180 &cache_hits,
181 &cache_misses,
182 &parse_cpu_nanos,
183 need_complexity,
184 )
185 })
186 .collect();
187
188 let hits = cache_hits.load(Ordering::Relaxed);
189 let misses = cache_misses.load(Ordering::Relaxed);
190 if hits > 0 || misses > 0 {
191 tracing::info!(
192 cache_hits = hits,
193 cache_misses = misses,
194 "incremental cache stats"
195 );
196 }
197
198 ParseResult {
199 modules,
200 cache_hits: hits,
201 cache_misses: misses,
202 parse_cpu_ms: parse_cpu_nanos.load(Ordering::Relaxed) as f64 / 1_000_000.0,
203 }
204}
205
206/// Parse a single file, consulting the cache first.
207///
208/// Cache validation strategy (fast path -> slow path):
209/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
210/// 2. If mtime+size match the cached entry -> cache hit, return immediately
211/// 3. If mtime+size differ -> read file, compute content hash
212/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
213/// 5. Otherwise -> cache miss, full parse
214fn parse_single_file_cached(
215 file: &DiscoveredFile,
216 cache: Option<&CacheStore>,
217 cache_hits: &std::sync::atomic::AtomicUsize,
218 cache_misses: &std::sync::atomic::AtomicUsize,
219 parse_cpu_nanos: &std::sync::atomic::AtomicU64,
220 need_complexity: bool,
221) -> Option<ModuleInfo> {
222 use std::sync::atomic::Ordering;
223
224 if let Some(store) = cache
225 && let Ok(metadata) = std::fs::metadata(&file.path)
226 {
227 let mt = mtime_secs(&metadata);
228 let sz = metadata.len();
229 if let Some(cached) = store.get_by_metadata(&file.path, mt, sz)
230 && (!need_complexity || !cached.complexity.is_empty())
231 {
232 cache_hits.fetch_add(1, Ordering::Relaxed);
233 return Some(cache::cached_to_module_opts(
234 cached,
235 file.id,
236 need_complexity,
237 ));
238 }
239 }
240
241 let raw = std::fs::read_to_string(&file.path).ok()?;
242 let source = strip_bom(&raw);
243 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
244
245 if let Some(store) = cache
246 && let Some(cached) = store.get(&file.path, content_hash)
247 && (!need_complexity || !cached.complexity.is_empty())
248 {
249 cache_hits.fetch_add(1, Ordering::Relaxed);
250 return Some(cache::cached_to_module_opts(
251 cached,
252 file.id,
253 need_complexity,
254 ));
255 }
256 cache_misses.fetch_add(1, Ordering::Relaxed);
257
258 let parse_start = std::time::Instant::now();
259 let module = parse_source_to_module(file.id, &file.path, source, content_hash, need_complexity);
260 parse_cpu_nanos.fetch_add(
261 u64::try_from(parse_start.elapsed().as_nanos()).unwrap_or(u64::MAX),
262 Ordering::Relaxed,
263 );
264 Some(module)
265}
266
/// Modification time of `metadata` as whole seconds since the Unix epoch.
///
/// Falls back to 0 when the modification time is unavailable or lies before
/// the epoch.
fn mtime_secs(metadata: &std::fs::Metadata) -> u64 {
    let Ok(modified) = metadata.modified() else {
        return 0;
    };
    modified
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_secs())
}
276
277/// Parse a single file and extract module information (without complexity).
278#[must_use]
279pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
280 let raw = std::fs::read_to_string(&file.path).ok()?;
281 let source = strip_bom(&raw);
282 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
283 Some(parse_source_to_module(
284 file.id,
285 &file.path,
286 source,
287 content_hash,
288 false,
289 ))
290}
291
292/// Parse from in-memory content (for LSP, includes complexity).
293#[must_use]
294pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
295 let content = strip_bom(content);
296 let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
297 parse_source_to_module(file_id, path, content, content_hash, true)
298}
299
300#[cfg(all(test, not(miri)))]
301mod tests;