Skip to main content

fallow_extract/
lib.rs

1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8#![cfg_attr(not(test), deny(clippy::disallowed_methods))]
9#![cfg_attr(
10    test,
11    allow(
12        clippy::unwrap_used,
13        clippy::expect_used,
14        reason = "tests use unwrap and expect to keep fixture setup concise"
15    )
16)]
17
18mod asset_url;
19pub mod astro;
20pub mod cache;
21pub(crate) mod complexity;
22pub mod css;
23pub mod css_classes;
24pub mod css_metrics;
25pub mod flags;
26pub mod glimmer;
27pub mod graphql;
28pub mod html;
29pub mod iconify;
30pub mod inventory;
31pub mod mdx;
32mod module_info;
33mod parse;
34pub mod sfc;
35pub mod sfc_css;
36mod sfc_props;
37mod sfc_template;
38mod source_map;
39pub mod suppress;
40/// Tailwind CSS arbitrary-value detection.
41pub mod tailwind;
42pub(crate) mod template_complexity;
43mod template_usage;
44/// Visitor utilities for AST extraction.
45pub mod visitor;
46
47use std::path::Path;
48
49use rayon::prelude::*;
50
51use cache::CacheStore;
52use fallow_types::discover::{DiscoveredFile, FileId};
53
54pub use fallow_types::extract::{
55    AngularTemplateMemberAccessFact, AngularThisSpreadFact, ClassHeritageInfo,
56    DynamicCustomElementRenderFact, DynamicImportInfo, DynamicImportPattern, ExportInfo,
57    ExportName, FactoryCallMemberAccessFact, FactoryFnMemberAccessFact, FactoryReturnExport,
58    FluentChainMemberAccessFact, FluentChainNewMemberAccessFact, ImportInfo, ImportedName,
59    InstanceExportBindingFact, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind,
60    ModuleInfo, ParseResult, PlaywrightFixtureAliasFact, PlaywrightFixtureDefinitionFact,
61    PlaywrightFixtureTypeFact, PlaywrightFixtureUseFact, PublicSignatureTypeReference,
62    ReExportInfo, RequireCallInfo, SemanticFact, VisibilityTag, compute_line_offsets,
63};
64
65pub use astro::{
66    extract_astro_frontmatter, extract_astro_style_regions, extract_astro_template_regions,
67};
68pub use css::{
69    ThemeScan, ThemeTokenDef, extract_apply_tokens, extract_css_module_exports, scan_theme_blocks,
70};
71pub use css_classes::{
72    MarkupClassScan, MarkupClassToken, is_edit_distance_one, is_typo_edit, scan_markup_class_tokens,
73};
74pub use css_metrics::compute_css_analytics;
75pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
76pub use mdx::extract_mdx_statements;
77pub use sfc::{
78    SourceRegion, extract_sfc_scripts, extract_sfc_styles, extract_sfc_template_regions,
79    is_sfc_file,
80};
81pub use sfc_css::{scoped_unused_classes, sfc_virtual_stylesheet};
82pub use tailwind::{TailwindArbitraryUse, scan_tailwind_arbitrary_values};
83
84#[expect(
85    clippy::expect_used,
86    reason = "static regex patterns are hard-coded analyzer invariants covered by extraction tests"
87)]
88pub(crate) fn static_regex(pattern: &str) -> regex::Regex {
89    regex::Regex::new(pattern).expect("static regex pattern should compile")
90}
91
92pub use parse::parse_source_to_module;
93
/// The UTF-8 byte order mark (U+FEFF) as it appears at the start of a file.
///
/// Some Windows editors (Notepad, older VS settings, some IDE plugins) write
/// a UTF-8 BOM at the beginning of source files. fallow's contract is "UTF-8
/// with or without BOM; line offsets are computed against the post-BOM view;
/// the BOM, if present on input, is preserved on output by `fallow fix`."
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM, if it has one.
///
/// Only a single BOM at the very start is removed; input without a leading
/// BOM is returned unchanged.
///
/// Every file-read entry point in this crate calls this so that the content
/// hash, `compute_line_offsets`, the oxc parser, and downstream analyses all
/// operate on the same post-BOM view. This mirrors the `fallow_config` layer
/// (`config_writer.rs::BOM`), keeping config-shaped and source-code-shaped
/// inputs symmetric. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    let without_bom: Option<&str> = source.strip_prefix(BOM_CHAR);
    without_bom.unwrap_or(source)
}
113
114/// Parse all files in parallel, extracting imports and exports.
115/// Uses the cache to skip reparsing files whose content hasn't changed.
116///
117/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
118/// metrics are computed during parsing (needed by the `health` command).
119/// Pass `false` for dead-code analysis where complexity data is unused.
120pub fn parse_all_files(
121    files: &[DiscoveredFile],
122    cache: Option<&CacheStore>,
123    need_complexity: bool,
124) -> ParseResult {
125    let results: Vec<ParseFileResult> = files
126        .par_iter()
127        .map(|file| parse_single_file_cached(file, cache, need_complexity))
128        .collect();
129
130    let mut modules = Vec::with_capacity(results.len());
131    let mut hits = 0usize;
132    let mut misses = 0usize;
133    let mut parse_cpu_nanos = 0u64;
134
135    for result in results {
136        hits += result.cache_hits;
137        misses += result.cache_misses;
138        parse_cpu_nanos = parse_cpu_nanos.saturating_add(result.parse_cpu_nanos);
139        if let Some(module) = result.module {
140            modules.push(module);
141        }
142    }
143
144    if hits > 0 || misses > 0 {
145        tracing::info!(
146            cache_hits = hits,
147            cache_misses = misses,
148            "incremental cache stats"
149        );
150    }
151
152    ParseResult {
153        modules,
154        cache_hits: hits,
155        cache_misses: misses,
156        parse_cpu_ms: parse_cpu_nanos as f64 / 1_000_000.0,
157    }
158}
159
160struct ParseFileResult {
161    module: Option<ModuleInfo>,
162    cache_hits: usize,
163    cache_misses: usize,
164    parse_cpu_nanos: u64,
165}
166
167impl ParseFileResult {
168    fn cache_hit(module: ModuleInfo) -> Self {
169        Self {
170            module: Some(module),
171            cache_hits: 1,
172            cache_misses: 0,
173            parse_cpu_nanos: 0,
174        }
175    }
176
177    fn cache_miss(module: ModuleInfo, parse_cpu_nanos: u64) -> Self {
178        Self {
179            module: Some(module),
180            cache_hits: 0,
181            cache_misses: 1,
182            parse_cpu_nanos,
183        }
184    }
185
186    const fn skipped() -> Self {
187        Self {
188            module: None,
189            cache_hits: 0,
190            cache_misses: 0,
191            parse_cpu_nanos: 0,
192        }
193    }
194}
195
196/// Parse a single file, consulting the cache first.
197///
198/// Cache validation strategy (fast path -> slow path):
199/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
200/// 2. If mtime+size match the cached entry -> cache hit, return immediately
201/// 3. If mtime+size differ -> read file, compute content hash
202/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
203/// 5. Otherwise -> cache miss, full parse
204fn parse_single_file_cached(
205    file: &DiscoveredFile,
206    cache: Option<&CacheStore>,
207    need_complexity: bool,
208) -> ParseFileResult {
209    let cached_by_path = cache.and_then(|store| store.get_by_path_only(&file.path));
210
211    if let Some(cached) = cached_by_path
212        && cached.file_size == file.size_bytes
213        && let Ok(metadata) = std::fs::metadata(&file.path)
214        && metadata.len() == cached.file_size
215    {
216        let fingerprint =
217            fallow_types::source_fingerprint::SourceFingerprint::from_metadata(&metadata);
218        if cached.source_fingerprint() == fingerprint
219            && fingerprint.has_known_mtime()
220            && (!need_complexity || !cached.complexity.is_empty())
221        {
222            return ParseFileResult::cache_hit(cache::cached_to_module_opts(
223                cached,
224                file.id,
225                need_complexity,
226            ));
227        }
228    }
229
230    let Ok(raw) = std::fs::read_to_string(&file.path) else {
231        return ParseFileResult::skipped();
232    };
233    let source = strip_bom(&raw);
234    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
235
236    if let Some(cached) = cached_by_path
237        && cached.content_hash == content_hash
238        && (!need_complexity || !cached.complexity.is_empty())
239    {
240        return ParseFileResult::cache_hit(cache::cached_to_module_opts(
241            cached,
242            file.id,
243            need_complexity,
244        ));
245    }
246
247    let parse_start = std::time::Instant::now();
248    let module = parse_source_to_module(file.id, &file.path, source, content_hash, need_complexity);
249    let parse_cpu_nanos = u64::try_from(parse_start.elapsed().as_nanos()).unwrap_or(u64::MAX);
250    ParseFileResult::cache_miss(module, parse_cpu_nanos)
251}
252
253/// Parse a single file and extract module information (without complexity).
254#[must_use]
255pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
256    let raw = std::fs::read_to_string(&file.path).ok()?;
257    let source = strip_bom(&raw);
258    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
259    Some(parse_source_to_module(
260        file.id,
261        &file.path,
262        source,
263        content_hash,
264        false,
265    ))
266}
267
268/// Parse from in-memory content (for LSP, includes complexity).
269#[must_use]
270pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
271    let content = strip_bom(content);
272    let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
273    parse_source_to_module(file_id, path, content, content_hash, true)
274}
275
276#[cfg(all(test, not(miri)))]
277mod tests;