Skip to main content

fallow_extract/
lib.rs

1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8#![cfg_attr(not(test), deny(clippy::disallowed_methods))]
9#![cfg_attr(
10    test,
11    allow(
12        clippy::unwrap_used,
13        clippy::expect_used,
14        reason = "tests use unwrap and expect to keep fixture setup concise"
15    )
16)]
17
18mod asset_url;
19pub mod astro;
20pub mod cache;
21pub(crate) mod complexity;
22pub mod css;
23pub mod css_classes;
24pub mod css_in_js;
25pub mod css_metrics;
26pub mod flags;
27pub mod glimmer;
28pub mod graphql;
29pub mod html;
30pub mod iconify;
31pub mod inventory;
32pub mod mdx;
33mod module_info;
34mod parse;
35pub mod sfc;
36pub mod sfc_css;
37mod sfc_props;
38mod sfc_template;
39mod source_map;
40pub mod suppress;
41/// Tailwind CSS arbitrary-value detection.
42pub mod tailwind;
43pub(crate) mod template_complexity;
44mod template_usage;
45/// Visitor utilities for AST extraction.
46pub mod visitor;
47
48use std::path::Path;
49
50use rayon::prelude::*;
51
52use cache::CacheStore;
53use fallow_types::discover::{DiscoveredFile, FileId};
54
55pub use fallow_types::extract::{
56    AngularTemplateMemberAccessFact, AngularThisSpreadFact, ClassHeritageInfo,
57    DynamicCustomElementRenderFact, DynamicImportInfo, DynamicImportPattern, ExportInfo,
58    ExportName, FactoryCallMemberAccessFact, FactoryFnMemberAccessFact, FactoryReturnExport,
59    FluentChainMemberAccessFact, FluentChainNewMemberAccessFact, ImportInfo, ImportedName,
60    InstanceExportBindingFact, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind,
61    ModuleInfo, ParseResult, PlaywrightFixtureAliasFact, PlaywrightFixtureDefinitionFact,
62    PlaywrightFixtureTypeFact, PlaywrightFixtureUseFact, PublicSignatureTypeReference,
63    ReExportInfo, RequireCallInfo, SemanticFact, VisibilityTag, compute_line_offsets,
64};
65
66pub use astro::{
67    extract_astro_frontmatter, extract_astro_style_regions, extract_astro_template_regions,
68};
69pub use css::{
70    ThemeScan, ThemeTokenDef, extract_apply_tokens, extract_apply_tokens_located,
71    extract_css_module_exports, extract_css_var_reads_located, scan_theme_blocks,
72};
73pub use css_classes::{
74    MarkupClassScan, MarkupClassToken, is_edit_distance_one, is_typo_edit, scan_markup_class_tokens,
75};
76pub use css_in_js::{
77    CssInJsObjectSheets, CssInJsToken, CssInJsTokenDef, TokenConsumerHit, css_in_js_object_sheets,
78    css_in_js_token_consumers, css_in_js_token_defs, css_in_js_virtual_stylesheet,
79};
80pub use css_metrics::compute_css_analytics;
81pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
82pub use mdx::extract_mdx_statements;
83pub use sfc::{
84    SourceRegion, extract_sfc_scripts, extract_sfc_styles, extract_sfc_template_regions,
85    is_sfc_file,
86};
87pub use sfc_css::{scoped_unused_classes, sfc_virtual_stylesheet};
88pub use tailwind::{TailwindArbitraryUse, scan_tailwind_arbitrary_values};
89
90#[expect(
91    clippy::expect_used,
92    reason = "static regex patterns are hard-coded analyzer invariants covered by extraction tests"
93)]
94pub(crate) fn static_regex(pattern: &str) -> regex::Regex {
95    regex::Regex::new(pattern).expect("static regex pattern should compile")
96}
97
98pub use parse::parse_source_to_module;
99
/// The byte order mark codepoint (U+FEFF) that may lead a UTF-8 source file.
///
/// Windows editors (Notepad, older VS settings, some IDE plugins) write a
/// UTF-8 BOM at the start of source files. fallow's contract is "UTF-8 with or
/// without BOM; line offsets are computed against the post-BOM view; the BOM,
/// if present on input, is preserved on output by `fallow fix`."
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM, or unchanged if it has none.
///
/// Only a single BOM at the very start is removed; any later occurrence of
/// the codepoint is left in place.
///
/// Every file-read entry point in this crate calls this so that the rest of
/// the pipeline (content hash, `compute_line_offsets`, oxc parser, downstream
/// analyses) works on one consistent post-BOM view. It mirrors the
/// `fallow_config` layer (`config_writer.rs::BOM`) so that config-shaped and
/// source-code-shaped inputs are handled symmetrically. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    let mut chars = source.chars();
    if chars.next() == Some(BOM_CHAR) {
        // The iterator has consumed exactly the BOM; the rest is the payload.
        chars.as_str()
    } else {
        source
    }
}
119
120/// Parse all files in parallel, extracting imports and exports.
121/// Uses the cache to skip reparsing files whose content hasn't changed.
122///
123/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
124/// metrics are computed during parsing (needed by the `health` command).
125/// Pass `false` for dead-code analysis where complexity data is unused.
126pub fn parse_all_files(
127    files: &[DiscoveredFile],
128    cache: Option<&CacheStore>,
129    need_complexity: bool,
130) -> ParseResult {
131    let results: Vec<ParseFileResult> = files
132        .par_iter()
133        .map(|file| parse_single_file_cached(file, cache, need_complexity))
134        .collect();
135
136    let mut modules = Vec::with_capacity(results.len());
137    let mut hits = 0usize;
138    let mut misses = 0usize;
139    let mut parse_cpu_nanos = 0u64;
140
141    for result in results {
142        hits += result.cache_hits;
143        misses += result.cache_misses;
144        parse_cpu_nanos = parse_cpu_nanos.saturating_add(result.parse_cpu_nanos);
145        if let Some(module) = result.module {
146            modules.push(module);
147        }
148    }
149
150    if hits > 0 || misses > 0 {
151        tracing::info!(
152            cache_hits = hits,
153            cache_misses = misses,
154            "incremental cache stats"
155        );
156    }
157
158    ParseResult {
159        modules,
160        cache_hits: hits,
161        cache_misses: misses,
162        parse_cpu_ms: parse_cpu_nanos as f64 / 1_000_000.0,
163    }
164}
165
166struct ParseFileResult {
167    module: Option<ModuleInfo>,
168    cache_hits: usize,
169    cache_misses: usize,
170    parse_cpu_nanos: u64,
171}
172
173impl ParseFileResult {
174    fn cache_hit(module: ModuleInfo) -> Self {
175        Self {
176            module: Some(module),
177            cache_hits: 1,
178            cache_misses: 0,
179            parse_cpu_nanos: 0,
180        }
181    }
182
183    fn cache_miss(module: ModuleInfo, parse_cpu_nanos: u64) -> Self {
184        Self {
185            module: Some(module),
186            cache_hits: 0,
187            cache_misses: 1,
188            parse_cpu_nanos,
189        }
190    }
191
192    const fn skipped() -> Self {
193        Self {
194            module: None,
195            cache_hits: 0,
196            cache_misses: 0,
197            parse_cpu_nanos: 0,
198        }
199    }
200}
201
202/// Parse a single file, consulting the cache first.
203///
204/// Cache validation strategy (fast path -> slow path):
205/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
206/// 2. If mtime+size match the cached entry -> cache hit, return immediately
207/// 3. If mtime+size differ -> read file, compute content hash
208/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
209/// 5. Otherwise -> cache miss, full parse
210fn parse_single_file_cached(
211    file: &DiscoveredFile,
212    cache: Option<&CacheStore>,
213    need_complexity: bool,
214) -> ParseFileResult {
215    let cached_by_path = cache.and_then(|store| store.get_by_path_only(&file.path));
216
217    if let Some(cached) = cached_by_path
218        && cached.file_size == file.size_bytes
219        && let Ok(metadata) = std::fs::metadata(&file.path)
220        && metadata.len() == cached.file_size
221    {
222        let fingerprint =
223            fallow_types::source_fingerprint::SourceFingerprint::from_metadata(&metadata);
224        if cached.source_fingerprint() == fingerprint
225            && fingerprint.has_known_mtime()
226            && (!need_complexity || !cached.complexity.is_empty())
227        {
228            return ParseFileResult::cache_hit(cache::cached_to_module_opts(
229                cached,
230                file.id,
231                need_complexity,
232            ));
233        }
234    }
235
236    let Ok(raw) = std::fs::read_to_string(&file.path) else {
237        return ParseFileResult::skipped();
238    };
239    let source = strip_bom(&raw);
240    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
241
242    if let Some(cached) = cached_by_path
243        && cached.content_hash == content_hash
244        && (!need_complexity || !cached.complexity.is_empty())
245    {
246        return ParseFileResult::cache_hit(cache::cached_to_module_opts(
247            cached,
248            file.id,
249            need_complexity,
250        ));
251    }
252
253    let parse_start = std::time::Instant::now();
254    let module = parse_source_to_module(file.id, &file.path, source, content_hash, need_complexity);
255    let parse_cpu_nanos = u64::try_from(parse_start.elapsed().as_nanos()).unwrap_or(u64::MAX);
256    ParseFileResult::cache_miss(module, parse_cpu_nanos)
257}
258
259/// Parse a single file and extract module information (without complexity).
260#[must_use]
261pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
262    let raw = std::fs::read_to_string(&file.path).ok()?;
263    let source = strip_bom(&raw);
264    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
265    Some(parse_source_to_module(
266        file.id,
267        &file.path,
268        source,
269        content_hash,
270        false,
271    ))
272}
273
274/// Parse from in-memory content (for LSP, includes complexity).
275#[must_use]
276pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
277    let content = strip_bom(content);
278    let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
279    parse_source_to_module(file_id, path, content, content_hash, true)
280}
281
282#[cfg(all(test, not(miri)))]
283mod tests;