fallow_extract/lib.rs
1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8#![cfg_attr(not(test), deny(clippy::disallowed_methods))]
9#![cfg_attr(
10 test,
11 allow(
12 clippy::unwrap_used,
13 clippy::expect_used,
14 reason = "tests use unwrap and expect to keep fixture setup concise"
15 )
16)]
17
18mod asset_url;
19pub mod astro;
20pub mod cache;
21pub(crate) mod complexity;
22pub mod css;
23pub mod css_classes;
24pub mod css_metrics;
25pub mod flags;
26pub mod glimmer;
27pub mod graphql;
28pub mod html;
29pub mod iconify;
30pub mod inventory;
31pub mod mdx;
32mod module_info;
33mod parse;
34pub mod sfc;
35pub mod sfc_css;
36mod sfc_props;
37mod sfc_template;
38mod source_map;
39pub mod suppress;
40/// Tailwind CSS arbitrary-value detection.
41pub mod tailwind;
42pub(crate) mod template_complexity;
43mod template_usage;
44/// Visitor utilities for AST extraction.
45pub mod visitor;
46
47use std::path::Path;
48
49use rayon::prelude::*;
50
51use cache::CacheStore;
52use fallow_types::discover::{DiscoveredFile, FileId};
53
54pub use fallow_types::extract::{
55 ClassHeritageInfo, DynamicImportInfo, DynamicImportPattern, ExportInfo, ExportName, ImportInfo,
56 ImportedName, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind, ModuleInfo,
57 ParseResult, PublicSignatureTypeReference, ReExportInfo, RequireCallInfo, VisibilityTag,
58 compute_line_offsets,
59};
60
61pub use astro::{
62 extract_astro_frontmatter, extract_astro_style_regions, extract_astro_template_regions,
63};
64pub use css::{
65 ThemeScan, ThemeTokenDef, extract_apply_tokens, extract_css_module_exports, scan_theme_blocks,
66};
67pub use css_classes::{
68 MarkupClassScan, MarkupClassToken, is_edit_distance_one, is_typo_edit, scan_markup_class_tokens,
69};
70pub use css_metrics::compute_css_analytics;
71pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
72pub use mdx::extract_mdx_statements;
73pub use sfc::{
74 SourceRegion, extract_sfc_scripts, extract_sfc_styles, extract_sfc_template_regions,
75 is_sfc_file,
76};
77pub use sfc_css::{scoped_unused_classes, sfc_virtual_stylesheet};
78pub use sfc_template::angular::{ANGULAR_THIS_SPREAD_SENTINEL, ANGULAR_TPL_SENTINEL};
79pub use tailwind::{TailwindArbitraryUse, scan_tailwind_arbitrary_values};
80
81#[expect(
82 clippy::expect_used,
83 reason = "static regex patterns are hard-coded analyzer invariants covered by extraction tests"
84)]
85pub(crate) fn static_regex(pattern: &str) -> regex::Regex {
86 regex::Regex::new(pattern).expect("static regex pattern should compile")
87}
88
/// Synthetic member-access object prefix used to carry exported-instance
/// bindings.
///
/// `MemberAccess { object: format!("{INSTANCE_EXPORT_SENTINEL}{export_name}"), member: target }`
/// means the exported value named `export_name` is an instance of the local
/// class/interface symbol named `target`.
pub const INSTANCE_EXPORT_SENTINEL: &str = "__fallow_instance_export__:";

/// Synthetic member-access object prefix for typed Playwright fixtures.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_DEF_SENTINEL}{test}:{fixture}"), member: type_name }`
/// means the exported Playwright test object named `test` provides a fixture
/// named `fixture` whose declared type is `type_name`.
pub const PLAYWRIGHT_FIXTURE_DEF_SENTINEL: &str = "__fallow_playwright_fixture_def__:";

/// Synthetic member-access object prefix for Playwright fixture wrapper aliases.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_ALIAS_SENTINEL}{alias}:"), member: base }`
/// means the exported Playwright test object named `alias` inherits fixture
/// definitions from the exported test object named `base`.
pub const PLAYWRIGHT_FIXTURE_ALIAS_SENTINEL: &str = "__fallow_playwright_fixture_alias__:";

/// Synthetic member-access object prefix for Playwright fixture member uses.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_USE_SENTINEL}{test}:{fixture}"), member }`
/// means a callback passed to the Playwright test object named `test`
/// destructures `fixture` and accesses `fixture.member`.
pub const PLAYWRIGHT_FIXTURE_USE_SENTINEL: &str = "__fallow_playwright_fixture_use__:";

/// Synthetic member-access object prefix for exported Playwright fixture type
/// aliases.
///
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_TYPE_SENTINEL}{alias}:{fixture_path}"), member: type_name }`
/// means a local type alias named `alias` contains a nested fixture path whose
/// declared type is `type_name`. The analyze layer uses this when a Playwright
/// fixture generic imports an object type alias from another module.
pub const PLAYWRIGHT_FIXTURE_TYPE_SENTINEL: &str = "__fallow_playwright_fixture_type__:";

/// Synthetic member-access object prefix for static-factory call returns.
///
/// `MemberAccess { object: format!("{FACTORY_CALL_SENTINEL}{callee}:{method}"), member }`
/// means a local binding was assigned from `<callee>.<method>()` and a member
/// is accessed on the result.
///
/// The analyze layer resolves `callee` through the consumer module's imports
/// to a class export and credits `member` on the class when the matching
/// method carries `is_instance_returning_static`. See issue #346.
pub const FACTORY_CALL_SENTINEL: &str = "__fallow_factory_call__:";

/// Synthetic member-access object prefix for fluent-builder chain credit.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_SENTINEL}{callee}:{root_method}:{chain}"), member }`
/// means a fluent chain `<callee>.<root_method>().<...chain>.<member>` was
/// observed. `chain` is a comma-separated list of method names (empty when
/// `member` is the first chained call after `root_method`).
///
/// The analyze layer resolves `callee` to a class export, validates that
/// `root_method` has `is_instance_returning_static`, walks each `chain`
/// segment requiring `is_self_returning` on the class, and credits `member`
/// on the class when the chain remains on the class type. See issue #387.
pub const FLUENT_CHAIN_SENTINEL: &str = "__fallow_fluent_chain__:";

/// Synthetic member-access object prefix for fluent chains rooted at a `new`
/// expression.
///
/// `MemberAccess { object: format!("{FLUENT_CHAIN_NEW_SENTINEL}{class}:{chain}"), member }`
/// means a chain `new <class>(...).<...chain>.<member>` was observed. Unlike
/// [`FLUENT_CHAIN_SENTINEL`], there is no root method: a constructor always
/// returns an instance of `class`, so no `is_instance_returning_static` check
/// applies.
///
/// `chain` is a comma-separated list of the intermediate method names between
/// the constructor and `member` (it always contains at least the first
/// method, which must be `is_self_returning` to reach `member`). The analyze
/// layer resolves `class` to a class export, requires every `chain` segment to
/// be `is_self_returning` on the class, and credits `member` on the class.
/// The first method directly off the constructor is credited separately via
/// the `static_member_object_name` `NewExpression` arm. See issue #605.
pub const FLUENT_CHAIN_NEW_SENTINEL: &str = "__fallow_fluent_chain_new__:";
162
163pub use parse::parse_source_to_module;
164
/// The UTF-8 byte order mark as a `char` (U+FEFF).
///
/// Windows editors (Notepad, older VS settings, some IDE plugins) emit a UTF-8
/// BOM at the start of source files. fallow's contract is "UTF-8 with or
/// without BOM; line offsets are computed against the post-BOM view; the BOM,
/// if present on input, is preserved on output by `fallow fix`."
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM, or `source` unchanged when
/// it does not start with one.
///
/// Only a single leading BOM is removed; BOM characters anywhere else in the
/// text are left alone.
///
/// Called at every file-read entry point in this crate so the rest of the
/// pipeline (content hash, `compute_line_offsets`, oxc parser, downstream
/// analyses) sees a consistent post-BOM view. Mirrors the
/// `fallow_config` layer (`config_writer.rs::BOM`) so config-shaped sources
/// and source-code-shaped sources are processed symmetrically. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    let Some(without_bom) = source.strip_prefix(BOM_CHAR) else {
        return source;
    };
    without_bom
}
184
185/// Parse all files in parallel, extracting imports and exports.
186/// Uses the cache to skip reparsing files whose content hasn't changed.
187///
188/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
189/// metrics are computed during parsing (needed by the `health` command).
190/// Pass `false` for dead-code analysis where complexity data is unused.
191pub fn parse_all_files(
192 files: &[DiscoveredFile],
193 cache: Option<&CacheStore>,
194 need_complexity: bool,
195) -> ParseResult {
196 use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
197 let cache_hits = AtomicUsize::new(0);
198 let cache_misses = AtomicUsize::new(0);
199 let parse_cpu_nanos = AtomicU64::new(0);
200
201 let modules: Vec<ModuleInfo> = files
202 .par_iter()
203 .filter_map(|file| {
204 parse_single_file_cached(
205 file,
206 cache,
207 &cache_hits,
208 &cache_misses,
209 &parse_cpu_nanos,
210 need_complexity,
211 )
212 })
213 .collect();
214
215 let hits = cache_hits.load(Ordering::Relaxed);
216 let misses = cache_misses.load(Ordering::Relaxed);
217 if hits > 0 || misses > 0 {
218 tracing::info!(
219 cache_hits = hits,
220 cache_misses = misses,
221 "incremental cache stats"
222 );
223 }
224
225 ParseResult {
226 modules,
227 cache_hits: hits,
228 cache_misses: misses,
229 parse_cpu_ms: parse_cpu_nanos.load(Ordering::Relaxed) as f64 / 1_000_000.0,
230 }
231}
232
233/// Parse a single file, consulting the cache first.
234///
235/// Cache validation strategy (fast path -> slow path):
236/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
237/// 2. If mtime+size match the cached entry -> cache hit, return immediately
238/// 3. If mtime+size differ -> read file, compute content hash
239/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
240/// 5. Otherwise -> cache miss, full parse
241fn parse_single_file_cached(
242 file: &DiscoveredFile,
243 cache: Option<&CacheStore>,
244 cache_hits: &std::sync::atomic::AtomicUsize,
245 cache_misses: &std::sync::atomic::AtomicUsize,
246 parse_cpu_nanos: &std::sync::atomic::AtomicU64,
247 need_complexity: bool,
248) -> Option<ModuleInfo> {
249 use std::sync::atomic::Ordering;
250
251 if let Some(store) = cache
252 && let Ok(metadata) = std::fs::metadata(&file.path)
253 {
254 let mt = mtime_secs(&metadata);
255 let sz = metadata.len();
256 if let Some(cached) = store.get_by_metadata(&file.path, mt, sz)
257 && (!need_complexity || !cached.complexity.is_empty())
258 {
259 cache_hits.fetch_add(1, Ordering::Relaxed);
260 return Some(cache::cached_to_module_opts(
261 cached,
262 file.id,
263 need_complexity,
264 ));
265 }
266 }
267
268 let raw = std::fs::read_to_string(&file.path).ok()?;
269 let source = strip_bom(&raw);
270 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
271
272 if let Some(store) = cache
273 && let Some(cached) = store.get(&file.path, content_hash)
274 && (!need_complexity || !cached.complexity.is_empty())
275 {
276 cache_hits.fetch_add(1, Ordering::Relaxed);
277 return Some(cache::cached_to_module_opts(
278 cached,
279 file.id,
280 need_complexity,
281 ));
282 }
283 cache_misses.fetch_add(1, Ordering::Relaxed);
284
285 let parse_start = std::time::Instant::now();
286 let module = parse_source_to_module(file.id, &file.path, source, content_hash, need_complexity);
287 parse_cpu_nanos.fetch_add(
288 u64::try_from(parse_start.elapsed().as_nanos()).unwrap_or(u64::MAX),
289 Ordering::Relaxed,
290 );
291 Some(module)
292}
293
/// Read the modification time from `metadata` as whole seconds since the
/// Unix epoch.
///
/// Falls back to 0 when the modification time is unavailable or lies before
/// the epoch.
fn mtime_secs(metadata: &std::fs::Metadata) -> u64 {
    let Ok(modified) = metadata.modified() else {
        return 0;
    };
    modified
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_secs())
}
303
304/// Parse a single file and extract module information (without complexity).
305#[must_use]
306pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
307 let raw = std::fs::read_to_string(&file.path).ok()?;
308 let source = strip_bom(&raw);
309 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
310 Some(parse_source_to_module(
311 file.id,
312 &file.path,
313 source,
314 content_hash,
315 false,
316 ))
317}
318
319/// Parse from in-memory content (for LSP, includes complexity).
320#[must_use]
321pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
322 let content = strip_bom(content);
323 let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
324 parse_source_to_module(file_id, path, content, content_hash, true)
325}
326
327#[cfg(all(test, not(miri)))]
328mod tests;