fallow_extract/lib.rs
1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8#![cfg_attr(not(test), deny(clippy::disallowed_methods))]
9#![cfg_attr(
10 test,
11 allow(
12 clippy::unwrap_used,
13 clippy::expect_used,
14 reason = "tests use unwrap and expect to keep fixture setup concise"
15 )
16)]
17
18mod asset_url;
19pub mod astro;
20pub mod cache;
21pub(crate) mod complexity;
22pub mod css;
23pub mod css_classes;
24pub mod css_metrics;
25pub mod flags;
26pub mod glimmer;
27pub mod graphql;
28pub mod html;
29pub mod iconify;
30pub mod inventory;
31pub mod mdx;
32mod parse;
33pub mod sfc;
34pub mod sfc_css;
35mod sfc_props;
36mod sfc_template;
37mod source_map;
38pub mod suppress;
39/// Tailwind CSS arbitrary-value detection.
40pub mod tailwind;
41pub(crate) mod template_complexity;
42mod template_usage;
43/// Visitor utilities for AST extraction.
44pub mod visitor;
45
46use std::path::Path;
47
48use rayon::prelude::*;
49
50use cache::CacheStore;
51use fallow_types::discover::{DiscoveredFile, FileId};
52
53pub use fallow_types::extract::{
54 ClassHeritageInfo, DynamicImportInfo, DynamicImportPattern, ExportInfo, ExportName, ImportInfo,
55 ImportedName, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind, ModuleInfo,
56 ParseResult, PublicSignatureTypeReference, ReExportInfo, RequireCallInfo, VisibilityTag,
57 compute_line_offsets,
58};
59
60pub use astro::extract_astro_frontmatter;
61pub use css::{
62 ThemeScan, ThemeTokenDef, extract_apply_tokens, extract_css_module_exports, scan_theme_blocks,
63};
64pub use css_classes::{
65 MarkupClassScan, MarkupClassToken, is_edit_distance_one, is_typo_edit, scan_markup_class_tokens,
66};
67pub use css_metrics::compute_css_analytics;
68pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
69pub use mdx::extract_mdx_statements;
70pub use sfc::{extract_sfc_scripts, extract_sfc_styles, is_sfc_file};
71pub use sfc_css::{scoped_unused_classes, sfc_virtual_stylesheet};
72pub use sfc_template::angular::ANGULAR_TPL_SENTINEL;
73pub use tailwind::{TailwindArbitraryUse, scan_tailwind_arbitrary_values};
74
75#[expect(
76 clippy::expect_used,
77 reason = "static regex patterns are hard-coded analyzer invariants covered by extraction tests"
78)]
79pub(crate) fn static_regex(pattern: &str) -> regex::Regex {
80 regex::Regex::new(pattern).expect("static regex pattern should compile")
81}
82
/// Synthetic member-access object used to carry exported-instance bindings.
///
/// `MemberAccess { object: format!("{INSTANCE_EXPORT_SENTINEL}{export_name}"), member: target }`
/// means the exported value named `export_name` is an instance of the local
/// class/interface symbol named `target`.
///
/// The trailing `:` is part of the sentinel, so the export name follows it
/// directly: an export named `api` is encoded as the object string
/// `__fallow_instance_export__:api`.
pub const INSTANCE_EXPORT_SENTINEL: &str = "__fallow_instance_export__:";
89
/// Synthetic member-access object prefix for typed Playwright fixtures.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_DEF_SENTINEL}{test}:{fixture}"), member: type_name }`
/// means the exported Playwright test object named `test` provides a fixture
/// named `fixture` whose declared type is `type_name`.
///
/// For example, a test object `test` with a fixture `page` is encoded as the
/// object string `__fallow_playwright_fixture_def__:test:page`.
pub const PLAYWRIGHT_FIXTURE_DEF_SENTINEL: &str = "__fallow_playwright_fixture_def__:";
96
/// Synthetic member-access object prefix for Playwright fixture wrapper aliases.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_ALIAS_SENTINEL}{alias}:"), member: base }`
/// means the exported Playwright test object named `alias` inherits fixture
/// definitions from the exported test object named `base`.
///
/// Note the `:` that follows `alias` in the format string: an alias named
/// `authedTest` is encoded as the object string
/// `__fallow_playwright_fixture_alias__:authedTest:`.
pub const PLAYWRIGHT_FIXTURE_ALIAS_SENTINEL: &str = "__fallow_playwright_fixture_alias__:";
103
/// Synthetic member-access object prefix for Playwright fixture member uses.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_USE_SENTINEL}{test}:{fixture}"), member }`
/// means a callback passed to the Playwright test object named `test`
/// destructures `fixture` and accesses `fixture.member`.
///
/// For example, a callback passed to `test` that destructures `page` is
/// encoded as the object string `__fallow_playwright_fixture_use__:test:page`,
/// with the accessed property carried in `member`.
pub const PLAYWRIGHT_FIXTURE_USE_SENTINEL: &str = "__fallow_playwright_fixture_use__:";
110
/// Synthetic member-access object prefix for exported Playwright fixture type aliases.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_TYPE_SENTINEL}{alias}:{fixture_path}"), member: type_name }`
/// means a local type alias named `alias` contains a nested fixture path whose
/// declared type is `type_name`. The analyze layer uses this when a Playwright
/// fixture generic imports an object type alias from another module.
///
/// A `:` separates `alias` from `fixture_path`; how `fixture_path` itself is
/// spelled is not specified by this constant.
pub const PLAYWRIGHT_FIXTURE_TYPE_SENTINEL: &str = "__fallow_playwright_fixture_type__:";
118
/// Synthetic member-access object prefix for static-factory call returns.
///
/// `MemberAccess { object: format!("{FACTORY_CALL_SENTINEL}{callee}:{method}"), member }`
/// means a local binding was assigned from `<callee>.<method>()` and a member
/// is accessed on the result. The analyze layer resolves `callee` through the
/// consumer module's imports to a class export and credits `member` on the
/// class when the matching method carries `is_instance_returning_static`.
/// See issue #346.
///
/// For example, a binding assigned from `Foo.create()` is encoded as the
/// object string `__fallow_factory_call__:Foo:create`.
pub const FACTORY_CALL_SENTINEL: &str = "__fallow_factory_call__:";
128
/// Synthetic member-access object prefix for fluent-builder chain credit.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_SENTINEL}{callee}:{root_method}:{chain}"), member }`
/// means a fluent chain `<callee>.<root_method>().<...chain>.<member>` was
/// observed. `chain` is a comma-separated list of method names (empty when
/// `member` is the first chained call after `root_method`). The analyze layer
/// resolves `callee` to a class export, validates `root_method` has
/// `is_instance_returning_static`, walks each `chain` segment requiring
/// `is_self_returning` on the class, and credits `member` on the class
/// when the chain remains on the class type. See issue #387.
///
/// For example, `Builder.create().a().b().build` is encoded as the object
/// string `__fallow_fluent_chain__:Builder:create:a,b` with member `build`,
/// while `Builder.create().build` yields `__fallow_fluent_chain__:Builder:create:`
/// (empty `chain`).
pub const FLUENT_CHAIN_SENTINEL: &str = "__fallow_fluent_chain__:";
140
/// Synthetic member-access object prefix for fluent chains rooted at a `new`
/// expression.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_NEW_SENTINEL}{class}:{chain}"), member }`
/// means a chain `new <class>(...).<...chain>.<member>` was observed. Unlike
/// `FLUENT_CHAIN_SENTINEL`, there is no root method: a constructor always
/// returns an instance of `class`, so no `is_instance_returning_static` check
/// applies. `chain` is a comma-separated list of the intermediate method names
/// between the constructor and `member` (it always contains at least the first
/// method, which must be `is_self_returning` to reach `member`). The analyze
/// layer resolves `class` to a class export, requires every `chain` segment to
/// be `is_self_returning` on the class, and credits `member` on the class.
/// The first method directly off the constructor is credited separately via
/// the `static_member_object_name` `NewExpression` arm. See issue #605.
///
/// For example, `new Builder().a().b().build` is encoded as the object string
/// `__fallow_fluent_chain_new__:Builder:a,b` with member `build`.
pub const FLUENT_CHAIN_NEW_SENTINEL: &str = "__fallow_fluent_chain_new__:";
156
157pub use parse::parse_source_to_module;
158
/// The byte order mark codepoint (`U+FEFF`) that may lead a UTF-8 source file.
///
/// Windows editors (Notepad, older VS settings, some IDE plugins) emit a UTF-8
/// BOM at the start of source files. fallow's contract is "UTF-8 with or
/// without BOM; line offsets are computed against the post-BOM view; the BOM,
/// if present on input, is preserved on output by `fallow fix`."
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM, if it has one.
///
/// At most one BOM is removed, and only from the very start of the string;
/// input that does not begin with a BOM is returned unchanged.
///
/// Called at every file-read entry point in this crate so the rest of the
/// pipeline (content hash, `compute_line_offsets`, oxc parser, downstream
/// analyses) sees a consistent post-BOM view. Mirrors the
/// `fallow_config` layer (`config_writer.rs::BOM`) so config-shaped sources
/// and source-code-shaped sources are processed symmetrically. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    let Some(after_bom) = source.strip_prefix(BOM_CHAR) else {
        // No leading BOM: hand the input back untouched.
        return source;
    };
    after_bom
}
178
179/// Parse all files in parallel, extracting imports and exports.
180/// Uses the cache to skip reparsing files whose content hasn't changed.
181///
182/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
183/// metrics are computed during parsing (needed by the `health` command).
184/// Pass `false` for dead-code analysis where complexity data is unused.
185pub fn parse_all_files(
186 files: &[DiscoveredFile],
187 cache: Option<&CacheStore>,
188 need_complexity: bool,
189) -> ParseResult {
190 use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
191 let cache_hits = AtomicUsize::new(0);
192 let cache_misses = AtomicUsize::new(0);
193 let parse_cpu_nanos = AtomicU64::new(0);
194
195 let modules: Vec<ModuleInfo> = files
196 .par_iter()
197 .filter_map(|file| {
198 parse_single_file_cached(
199 file,
200 cache,
201 &cache_hits,
202 &cache_misses,
203 &parse_cpu_nanos,
204 need_complexity,
205 )
206 })
207 .collect();
208
209 let hits = cache_hits.load(Ordering::Relaxed);
210 let misses = cache_misses.load(Ordering::Relaxed);
211 if hits > 0 || misses > 0 {
212 tracing::info!(
213 cache_hits = hits,
214 cache_misses = misses,
215 "incremental cache stats"
216 );
217 }
218
219 ParseResult {
220 modules,
221 cache_hits: hits,
222 cache_misses: misses,
223 parse_cpu_ms: parse_cpu_nanos.load(Ordering::Relaxed) as f64 / 1_000_000.0,
224 }
225}
226
227/// Parse a single file, consulting the cache first.
228///
229/// Cache validation strategy (fast path -> slow path):
230/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
231/// 2. If mtime+size match the cached entry -> cache hit, return immediately
232/// 3. If mtime+size differ -> read file, compute content hash
233/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
234/// 5. Otherwise -> cache miss, full parse
235fn parse_single_file_cached(
236 file: &DiscoveredFile,
237 cache: Option<&CacheStore>,
238 cache_hits: &std::sync::atomic::AtomicUsize,
239 cache_misses: &std::sync::atomic::AtomicUsize,
240 parse_cpu_nanos: &std::sync::atomic::AtomicU64,
241 need_complexity: bool,
242) -> Option<ModuleInfo> {
243 use std::sync::atomic::Ordering;
244
245 if let Some(store) = cache
246 && let Ok(metadata) = std::fs::metadata(&file.path)
247 {
248 let mt = mtime_secs(&metadata);
249 let sz = metadata.len();
250 if let Some(cached) = store.get_by_metadata(&file.path, mt, sz)
251 && (!need_complexity || !cached.complexity.is_empty())
252 {
253 cache_hits.fetch_add(1, Ordering::Relaxed);
254 return Some(cache::cached_to_module_opts(
255 cached,
256 file.id,
257 need_complexity,
258 ));
259 }
260 }
261
262 let raw = std::fs::read_to_string(&file.path).ok()?;
263 let source = strip_bom(&raw);
264 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
265
266 if let Some(store) = cache
267 && let Some(cached) = store.get(&file.path, content_hash)
268 && (!need_complexity || !cached.complexity.is_empty())
269 {
270 cache_hits.fetch_add(1, Ordering::Relaxed);
271 return Some(cache::cached_to_module_opts(
272 cached,
273 file.id,
274 need_complexity,
275 ));
276 }
277 cache_misses.fetch_add(1, Ordering::Relaxed);
278
279 let parse_start = std::time::Instant::now();
280 let module = parse_source_to_module(file.id, &file.path, source, content_hash, need_complexity);
281 parse_cpu_nanos.fetch_add(
282 u64::try_from(parse_start.elapsed().as_nanos()).unwrap_or(u64::MAX),
283 Ordering::Relaxed,
284 );
285 Some(module)
286}
287
/// Return the file's modification time as whole seconds since the Unix epoch.
///
/// Falls back to `0` when the modification time is unavailable from
/// `metadata` or lies before the epoch.
fn mtime_secs(metadata: &std::fs::Metadata) -> u64 {
    let Ok(modified) = metadata.modified() else {
        return 0;
    };
    let Ok(since_epoch) = modified.duration_since(std::time::SystemTime::UNIX_EPOCH) else {
        return 0;
    };
    since_epoch.as_secs()
}
297
298/// Parse a single file and extract module information (without complexity).
299#[must_use]
300pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
301 let raw = std::fs::read_to_string(&file.path).ok()?;
302 let source = strip_bom(&raw);
303 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
304 Some(parse_source_to_module(
305 file.id,
306 &file.path,
307 source,
308 content_hash,
309 false,
310 ))
311}
312
313/// Parse from in-memory content (for LSP, includes complexity).
314#[must_use]
315pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
316 let content = strip_bom(content);
317 let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
318 parse_source_to_module(file_id, path, content, content_hash, true)
319}
320
321#[cfg(all(test, not(miri)))]
322mod tests;