Skip to main content

fallow_extract/
lib.rs

1//! Parsing and extraction engine for fallow codebase intelligence.
2//!
3//! This crate handles all file parsing: JS/TS via Oxc, Vue/Svelte SFC extraction,
4//! Astro frontmatter, MDX import/export extraction, CSS Module class name extraction,
5//! HTML asset reference extraction, and incremental caching of parse results.
6
7#![warn(missing_docs)]
8#![cfg_attr(not(test), deny(clippy::disallowed_methods))]
9#![cfg_attr(
10    test,
11    allow(
12        clippy::unwrap_used,
13        clippy::expect_used,
14        reason = "tests use unwrap and expect to keep fixture setup concise"
15    )
16)]
17
18mod asset_url;
19pub mod astro;
20pub mod cache;
21pub(crate) mod complexity;
22pub mod css;
23pub mod css_classes;
24pub mod css_in_js;
25pub mod css_metrics;
26pub mod flags;
27pub mod glimmer;
28pub mod graphql;
29pub mod html;
30pub mod iconify;
31pub mod inventory;
32pub mod mdx;
33mod module_info;
34mod parse;
35pub mod sfc;
36pub mod sfc_css;
37mod sfc_props;
38mod sfc_template;
39mod source_map;
40pub mod suppress;
41/// Tailwind CSS arbitrary-value detection.
42pub mod tailwind;
43pub(crate) mod template_complexity;
44mod template_usage;
45/// Visitor utilities for AST extraction.
46pub mod visitor;
47
48use std::path::Path;
49
50use rayon::prelude::*;
51
52use cache::CacheStore;
53use fallow_types::discover::{DiscoveredFile, FileId};
54
55pub use fallow_types::extract::{
56    AngularTemplateMemberAccessFact, AngularThisSpreadFact, ClassHeritageInfo,
57    DynamicCustomElementRenderFact, DynamicImportInfo, DynamicImportPattern, ExportInfo,
58    ExportName, FactoryCallMemberAccessFact, FactoryFnMemberAccessFact, FactoryReturnExport,
59    FluentChainMemberAccessFact, FluentChainNewMemberAccessFact, ImportInfo, ImportedName,
60    InstanceExportBindingFact, LocalTypeDeclaration, MemberAccess, MemberInfo, MemberKind,
61    ModuleInfo, ParseResult, PlaywrightFixtureAliasFact, PlaywrightFixtureDefinitionFact,
62    PlaywrightFixtureTypeFact, PlaywrightFixtureUseFact, PublicSignatureTypeReference,
63    ReExportInfo, RequireCallInfo, SemanticFact, VisibilityTag, compute_line_offsets,
64};
65
66pub use astro::{
67    extract_astro_frontmatter, extract_astro_style_regions, extract_astro_template_regions,
68};
69pub use css::{
70    ThemeScan, ThemeTokenDef, extract_apply_tokens, extract_apply_tokens_located,
71    extract_css_module_exports, extract_css_var_reads_located, scan_theme_blocks,
72};
73pub use css_classes::{
74    MarkupClassScan, MarkupClassToken, is_edit_distance_one, is_typo_edit, scan_markup_class_tokens,
75};
76pub use css_in_js::{
77    CssInJsObjectSheets, CssInJsToken, CssInJsTokenDef, CssInJsTokenOrigin, TokenConsumerHit,
78    css_in_js_object_sheets, css_in_js_theme_consumers, css_in_js_theme_token_defs,
79    css_in_js_token_consumers, css_in_js_token_defs, css_in_js_virtual_stylesheet,
80    panda_style_value_consumers, panda_token_call_consumers,
81};
82pub use css_metrics::{compute_css_analytics, parse_css_color_rgb};
83pub use glimmer::{is_glimmer_file, strip_glimmer_templates};
84pub use mdx::extract_mdx_statements;
85pub use sfc::{
86    SourceRegion, extract_sfc_scripts, extract_sfc_styles, extract_sfc_template_regions,
87    is_sfc_file,
88};
89pub use sfc_css::{
90    scoped_unused_classes, sfc_preprocessor_virtual_stylesheet, sfc_virtual_stylesheet,
91};
92pub use tailwind::{TailwindArbitraryUse, scan_tailwind_arbitrary_values};
93
94#[expect(
95    clippy::expect_used,
96    reason = "static regex patterns are hard-coded analyzer invariants covered by extraction tests"
97)]
98pub(crate) fn static_regex(pattern: &str) -> regex::Regex {
99    regex::Regex::new(pattern).expect("static regex pattern should compile")
100}
101
102pub use parse::parse_source_to_module;
103
/// The byte order mark codepoint (U+FEFF) that may lead a UTF-8 source file.
///
/// Windows editors (Notepad, older VS settings, some IDE plugins) write a
/// UTF-8 BOM at the very start of source files. fallow's contract is "UTF-8
/// with or without BOM; line offsets are computed against the post-BOM view;
/// the BOM, if present on input, is preserved on output by `fallow fix`."
const BOM_CHAR: char = '\u{FEFF}';

/// Return `source` without its leading UTF-8 BOM, or unchanged if it has none.
///
/// At most one BOM is removed, and only when it is the very first character;
/// the result is always a sub-slice of `source` (no allocation).
///
/// Every file-read entry point in this crate calls this so the rest of the
/// pipeline (content hash, `compute_line_offsets`, oxc parser, downstream
/// analyses) works on one consistent post-BOM view. This mirrors the
/// `fallow_config` layer (`config_writer.rs::BOM`) so config-shaped and
/// source-code-shaped inputs are handled symmetrically. See issue #475.
#[must_use]
pub(crate) fn strip_bom(source: &str) -> &str {
    let Some(without_bom) = source.strip_prefix(BOM_CHAR) else {
        // No leading BOM: hand the input back untouched.
        return source;
    };
    without_bom
}
123
124/// Parse all files in parallel, extracting imports and exports.
125/// Uses the cache to skip reparsing files whose content hasn't changed.
126///
127/// When `need_complexity` is true, per-function cyclomatic/cognitive complexity
128/// metrics are computed during parsing (needed by the `health` command).
129/// Pass `false` for dead-code analysis where complexity data is unused.
130pub fn parse_all_files(
131    files: &[DiscoveredFile],
132    cache: Option<&CacheStore>,
133    need_complexity: bool,
134) -> ParseResult {
135    let results: Vec<ParseFileResult> = files
136        .par_iter()
137        .map(|file| parse_single_file_cached(file, cache, need_complexity))
138        .collect();
139
140    let mut modules = Vec::with_capacity(results.len());
141    let mut hits = 0usize;
142    let mut misses = 0usize;
143    let mut parse_cpu_nanos = 0u64;
144
145    for result in results {
146        hits += result.cache_hits;
147        misses += result.cache_misses;
148        parse_cpu_nanos = parse_cpu_nanos.saturating_add(result.parse_cpu_nanos);
149        if let Some(module) = result.module {
150            modules.push(module);
151        }
152    }
153
154    if hits > 0 || misses > 0 {
155        tracing::info!(
156            cache_hits = hits,
157            cache_misses = misses,
158            "incremental cache stats"
159        );
160    }
161
162    ParseResult {
163        modules,
164        cache_hits: hits,
165        cache_misses: misses,
166        parse_cpu_ms: parse_cpu_nanos as f64 / 1_000_000.0,
167    }
168}
169
/// Per-file outcome produced by `parse_single_file_cached` and folded into the
/// aggregate totals by `parse_all_files`.
struct ParseFileResult {
    /// The extracted module, or `None` when the file was skipped.
    module: Option<ModuleInfo>,
    /// This file's contribution to the cache-hit total (1 for a hit, else 0).
    cache_hits: usize,
    /// This file's contribution to the cache-miss total (1 for a miss, else 0).
    cache_misses: usize,
    /// Elapsed nanoseconds measured around the parse call; 0 unless the file
    /// was actually parsed.
    parse_cpu_nanos: u64,
}
176
177impl ParseFileResult {
178    fn cache_hit(module: ModuleInfo) -> Self {
179        Self {
180            module: Some(module),
181            cache_hits: 1,
182            cache_misses: 0,
183            parse_cpu_nanos: 0,
184        }
185    }
186
187    fn cache_miss(module: ModuleInfo, parse_cpu_nanos: u64) -> Self {
188        Self {
189            module: Some(module),
190            cache_hits: 0,
191            cache_misses: 1,
192            parse_cpu_nanos,
193        }
194    }
195
196    const fn skipped() -> Self {
197        Self {
198            module: None,
199            cache_hits: 0,
200            cache_misses: 0,
201            parse_cpu_nanos: 0,
202        }
203    }
204}
205
206/// Parse a single file, consulting the cache first.
207///
208/// Cache validation strategy (fast path -> slow path):
209/// 1. `stat()` the file to get mtime + size (single syscall, no file read)
210/// 2. If mtime+size match the cached entry -> cache hit, return immediately
211/// 3. If mtime+size differ -> read file, compute content hash
212/// 4. If content hash matches cached entry -> cache hit (file was `touch`ed but unchanged)
213/// 5. Otherwise -> cache miss, full parse
214fn parse_single_file_cached(
215    file: &DiscoveredFile,
216    cache: Option<&CacheStore>,
217    need_complexity: bool,
218) -> ParseFileResult {
219    let cached_by_path = cache.and_then(|store| store.get_by_path_only(&file.path));
220
221    if let Some(cached) = cached_by_path
222        && cached.file_size == file.size_bytes
223        && let Ok(metadata) = std::fs::metadata(&file.path)
224        && metadata.len() == cached.file_size
225    {
226        let fingerprint =
227            fallow_types::source_fingerprint::SourceFingerprint::from_metadata(&metadata);
228        if cached.source_fingerprint() == fingerprint
229            && fingerprint.has_known_mtime()
230            && (!need_complexity || !cached.complexity.is_empty())
231        {
232            return ParseFileResult::cache_hit(cache::cached_to_module_opts(
233                cached,
234                file.id,
235                need_complexity,
236            ));
237        }
238    }
239
240    let Ok(raw) = std::fs::read_to_string(&file.path) else {
241        return ParseFileResult::skipped();
242    };
243    let source = strip_bom(&raw);
244    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
245
246    if let Some(cached) = cached_by_path
247        && cached.content_hash == content_hash
248        && (!need_complexity || !cached.complexity.is_empty())
249    {
250        return ParseFileResult::cache_hit(cache::cached_to_module_opts(
251            cached,
252            file.id,
253            need_complexity,
254        ));
255    }
256
257    let parse_start = std::time::Instant::now();
258    let module = parse_source_to_module(file.id, &file.path, source, content_hash, need_complexity);
259    let parse_cpu_nanos = u64::try_from(parse_start.elapsed().as_nanos()).unwrap_or(u64::MAX);
260    ParseFileResult::cache_miss(module, parse_cpu_nanos)
261}
262
263/// Parse a single file and extract module information (without complexity).
264#[must_use]
265pub fn parse_single_file(file: &DiscoveredFile) -> Option<ModuleInfo> {
266    let raw = std::fs::read_to_string(&file.path).ok()?;
267    let source = strip_bom(&raw);
268    let content_hash = xxhash_rust::xxh3::xxh3_64(source.as_bytes());
269    Some(parse_source_to_module(
270        file.id,
271        &file.path,
272        source,
273        content_hash,
274        false,
275    ))
276}
277
278/// Parse from in-memory content (for LSP, includes complexity).
279#[must_use]
280pub fn parse_from_content(file_id: FileId, path: &Path, content: &str) -> ModuleInfo {
281    let content = strip_bom(content);
282    let content_hash = xxhash_rust::xxh3::xxh3_64(content.as_bytes());
283    parse_source_to_module(file_id, path, content, content_hash, true)
284}
285
286#[cfg(all(test, not(miri)))]
287mod tests;