fallow_extract/lib.rs
1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8
9mod asset_url;
10pub mod astro;
11pub mod cache;
12pub(crate) mod complexity;
13pub mod css;
14pub mod flags;
15pub mod glimmer;
16pub mod graphql;
17pub mod html;
18pub mod inventory;
19pub mod mdx;
20mod parse;
21pub mod sfc;
22mod sfc_template;
23pub mod suppress;
24pub(crate) mod template_complexity;
25mod template_usage;
26pub mod visitor;
27
28use std::path::Path;
29
30use rayon::prelude::*;
31
32use cache::CacheStore;
33use fallow_types::discover::{DiscoveredFile, FileId};
34
35// Re-export all extract types from fallow-types
36pub use fallow_types::extract::{
37 ClassHeritageInfo, DynamicImportInfo, DynamicImportPattern, ExportInfo, ExportName, ImportInfo,
38 ImportedName, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind, ModuleInfo,
39 ParseResult, PublicSignatureTypeReference, ReExportInfo, RequireCallInfo, VisibilityTag,
40 compute_line_offsets,
41};
42
43// Re-export extraction functions for internal use and fuzzing
44pub use astro::extract_astro_frontmatter;
45pub use css::extract_css_module_exports;
46pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
47pub use mdx::extract_mdx_statements;
48pub use sfc::{extract_sfc_scripts, is_sfc_file};
49pub use sfc_template::angular::ANGULAR_TPL_SENTINEL;
50
/// Prefix placed in the `object` field of a synthetic `MemberAccess` that
/// encodes an exported-instance binding.
///
/// The encoding is
/// `MemberAccess { object: format!("{INSTANCE_EXPORT_SENTINEL}{export_name}"), member: target }`.
/// It states that the value exported as `export_name` is an instance of the
/// local class/interface symbol `target`.
pub const INSTANCE_EXPORT_SENTINEL: &str = "__fallow_instance_export__:";
57
/// Prefix of the synthetic member-access object that declares a typed
/// Playwright fixture.
///
/// The encoding is
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_DEF_SENTINEL}{test}:{fixture}"), member: type_name }`.
/// It records that the exported Playwright test object `test` provides the
/// fixture `fixture`, and that the fixture's declared type is `type_name`.
pub const PLAYWRIGHT_FIXTURE_DEF_SENTINEL: &str = "__fallow_playwright_fixture_def__:";
64
/// Prefix of the synthetic member-access object that records a use of a
/// Playwright fixture's member.
///
/// The encoding is
/// `MemberAccess { object: format!("{PLAYWRIGHT_FIXTURE_USE_SENTINEL}{test}:{fixture}"), member }`.
/// It records that a callback handed to the Playwright test object `test`
/// destructures `fixture` and then reads `fixture.member`.
pub const PLAYWRIGHT_FIXTURE_USE_SENTINEL: &str = "__fallow_playwright_fixture_use__:";
71
/// Prefix of the synthetic member-access object that tracks values returned
/// by static-factory calls.
///
/// The encoding is
/// `MemberAccess { object: format!("{FACTORY_CALL_SENTINEL}{callee}:{method}"), member }`.
/// It records that a local binding was initialised from `<callee>.<method>()`
/// and that `member` was then accessed on that binding.
///
/// The analyze layer resolves `callee` through the consumer module's imports
/// to a class export, and credits `member` on that class when the matching
/// method carries `is_instance_returning_static`. See issue #346.
pub const FACTORY_CALL_SENTINEL: &str = "__fallow_factory_call__:";
81
/// Prefix of the synthetic member-access object that carries fluent-builder
/// chain credit.
///
/// The encoding is
/// `MemberAccess { object: format!("{FLUENT_CHAIN_SENTINEL}{callee}:{root_method}:{chain}"), member }`.
/// It records that a fluent chain of the form
/// `<callee>.<root_method>().<...chain>.<member>` was observed.
///
/// `chain` is a comma-separated list of method names. It is empty when
/// `member` is the first call chained after `root_method`.
///
/// The analyze layer resolves `callee` to a class export, validates that
/// `root_method` has `is_instance_returning_static`, and walks each `chain`
/// segment requiring `is_self_returning` on the class. `member` is credited
/// on the class when the chain remains on the class type. See issue #387.
pub const FLUENT_CHAIN_SENTINEL: &str = "__fallow_fluent_chain__:";
93
94use parse::parse_source_to_module;
95
/// The UTF-8 byte order mark, expressed as a `char` (U+FEFF).
///
/// Windows editors (Notepad, older VS settings, some IDE plugins) write a
/// UTF-8 BOM as the very first character of a source file. fallow's contract
/// is "UTF-8 with or without BOM; line offsets are computed against the
/// post-BOM view; the BOM, if present on input, is preserved on output by
/// `fallow fix`."
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM, or unchanged when it does
/// not start with one.
///
/// At most one leading BOM is removed; the result always borrows from
/// `source`, so no allocation takes place.
///
/// Every file-read entry point in this crate calls this so the rest of the
/// pipeline (content hash, `compute_line_offsets`, oxc parser, downstream
/// analyses) works on one consistent post-BOM view. It mirrors the
/// `fallow_config` layer (`config_writer.rs::BOM`) so config-shaped and
/// source-code-shaped inputs are handled symmetrically. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    match source.strip_prefix(BOM_CHAR) {
        Some(without_bom) => without_bom,
        None => source,
    }
}
115
116/// Parse all files in parallel, extracting imports and exports.
117/// Uses the cache to skip reparsing files whose content hasn't changed.
118///
119/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
120/// metrics are computed during parsing (needed by the `health` command).
121/// Pass `false` for dead-code analysis where complexity data is unused.
122pub fn parse_all_files(
123 files: &[DiscoveredFile],
124 cache: Option<&CacheStore>,
125 need_complexity: bool,
126) -> ParseResult {
127 use std::sync::atomic::{AtomicUsize, Ordering};
128 let cache_hits = AtomicUsize::new(0);
129 let cache_misses = AtomicUsize::new(0);
130
131 let modules: Vec<ModuleInfo> = files
132 .par_iter()
133 .filter_map(|file| {
134 parse_single_file_cached(file, cache, &cache_hits, &cache_misses, need_complexity)
135 })
136 .collect();
137
138 let hits = cache_hits.load(Ordering::Relaxed);
139 let misses = cache_misses.load(Ordering::Relaxed);
140 if hits > 0 || misses > 0 {
141 tracing::info!(
142 cache_hits = hits,
143 cache_misses = misses,
144 "incremental cache stats"
145 );
146 }
147
148 ParseResult {
149 modules,
150 cache_hits: hits,
151 cache_misses: misses,
152 }
153}
154
155/// Parse a single file, consulting the cache first.
156///
157/// Cache validation strategy (fast path -> slow path):
158/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
159/// 2. If mtime+size match the cached entry -> cache hit, return immediately
160/// 3. If mtime+size differ -> read file, compute content hash
161/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
162/// 5. Otherwise -> cache miss, full parse
163fn parse_single_file_cached(
164 file: &DiscoveredFile,
165 cache: Option<&CacheStore>,
166 cache_hits: &std::sync::atomic::AtomicUsize,
167 cache_misses: &std::sync::atomic::AtomicUsize,
168 need_complexity: bool,
169) -> Option<ModuleInfo> {
170 use std::sync::atomic::Ordering;
171
172 // Fast path: check mtime+size before reading file content.
173 // A single stat() syscall is ~100x cheaper than read()+hash().
174 if let Some(store) = cache
175 && let Ok(metadata) = std::fs::metadata(&file.path)
176 {
177 let mt = mtime_secs(&metadata);
178 let sz = metadata.len();
179 if let Some(cached) = store.get_by_metadata(&file.path, mt, sz) {
180 // When complexity is requested but the cached entry lacks it
181 // (populated by a prior `check` run), skip the cache and re-parse.
182 if !need_complexity || !cached.complexity.is_empty() {
183 cache_hits.fetch_add(1, Ordering::Relaxed);
184 return Some(cache::cached_to_module_opts(
185 cached,
186 file.id,
187 need_complexity,
188 ));
189 }
190 }
191 }
192
193 // Slow path: read file content and compute content hash.
194 // Strip the UTF-8 BOM, if present, before hashing AND before parsing so
195 // the content hash, `compute_line_offsets`, and the oxc parser all see
196 // the same byte sequence. Without this, hash matches that depend on
197 // BOM presence would silently miss the cache. Issue #475.
198 let raw = std::fs::read_to_string(&file.path).ok()?;
199 let source = strip_bom(&raw);
200 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
201
202 // Check cache by content hash (handles touch/save-without-change)
203 if let Some(store) = cache
204 && let Some(cached) = store.get(&file.path, content_hash)
205 && (!need_complexity || !cached.complexity.is_empty())
206 {
207 cache_hits.fetch_add(1, Ordering::Relaxed);
208 return Some(cache::cached_to_module_opts(
209 cached,
210 file.id,
211 need_complexity,
212 ));
213 }
214 cache_misses.fetch_add(1, Ordering::Relaxed);
215
216 // Cache miss, do a full parse
217 Some(parse_source_to_module(
218 file.id,
219 &file.path,
220 source,
221 content_hash,
222 need_complexity,
223 ))
224}
225
/// Modification time of `metadata` as whole seconds since the Unix epoch.
///
/// Falls back to `0` when the modification time is unavailable or lies
/// before the epoch (unsupported OS, pre-epoch timestamp, etc.).
fn mtime_secs(metadata: &std::fs::Metadata) -> u64 {
    let modified = match metadata.modified() {
        Ok(timestamp) => timestamp,
        Err(_) => return 0,
    };
    match modified.duration_since(std::time::SystemTime::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
235
236/// Parse a single file and extract module information (without complexity).
237#[must_use]
238pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
239 // BOM strip before hash + parse so downstream offsets stay aligned with
240 // the parser's view. See `parse_single_file_cached` and issue #475.
241 let raw = std::fs::read_to_string(&file.path).ok()?;
242 let source = strip_bom(&raw);
243 let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
244 Some(parse_source_to_module(
245 file.id,
246 &file.path,
247 source,
248 content_hash,
249 false,
250 ))
251}
252
253/// Parse from in-memory content (for LSP, includes complexity).
254#[must_use]
255pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
256 // Editors normally strip a BOM before sending didOpen.text, but be
257 // defensive: an editor or test that hands us BOM-bearing content must
258 // produce the same offsets as the on-disk path. Issue #475.
259 let content = strip_bom(content);
260 let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
261 parse_source_to_module(file_id, path, content, content_hash, true)
262}
263
// Parser integration tests invoke Oxc, which runs ~1000x slower under Miri,
// so the `tests` module is compiled out there. Unit tests in individual
// modules (visitor, suppress, sfc, css, etc.) still run.
266#[cfg(all(test, not(miri)))]
267mod tests;