Skip to main content

fallow_extract/
html.rs

1//! HTML file parsing for script, stylesheet, and Angular template references.
2//!
3//! Extracts `<script src="...">` and `<link rel="stylesheet" href="...">` references
4//! from HTML files, creating graph edges so that referenced JS/CSS assets (and their
5//! transitive imports) are reachable from the HTML entry point.
6//!
7//! Also scans for Angular template syntax (`{{ }}`, `[prop]`, `(event)`, `@if`, etc.)
8//! and stores referenced identifiers as `MemberAccess` entries with a sentinel object,
9//! enabling the analysis phase to credit component class members used in external templates.
10
11use std::path::Path;
12use std::sync::LazyLock;
13
14use oxc_span::Span;
15
16use crate::sfc_template::angular::{self, ANGULAR_TPL_SENTINEL};
17use crate::{ImportInfo, ImportedName, MemberAccess, ModuleInfo};
18use fallow_types::discover::FileId;
19
20/// Regex to match HTML comments (`<!-- ... -->`) for stripping before extraction.
21static HTML_COMMENT_RE: LazyLock<regex::Regex> =
22    LazyLock::new(|| regex::Regex::new(r"(?s)<!--.*?-->").expect("valid regex"));
23
24/// Regex to extract `src` attribute from `<script>` tags.
25/// Matches both `<script src="...">` and `<script type="module" src="...">`.
26/// Uses `(?s)` so `.` matches newlines (multi-line attributes).
27static SCRIPT_SRC_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
28    regex::Regex::new(r#"(?si)<script\b(?:[^>"']|"[^"]*"|'[^']*')*?\bsrc\s*=\s*["']([^"']+)["']"#)
29        .expect("valid regex")
30});
31
32/// Regex to extract `href` attribute from `<link>` tags with `rel="stylesheet"` or
33/// `rel="modulepreload"`.
34/// Handles attributes in any order (rel before or after href).
35static LINK_HREF_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
36    regex::Regex::new(
37        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["'](?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["']"#,
38    )
39    .expect("valid regex")
40});
41
42/// Regex for the reverse attribute order: href before rel.
43static LINK_HREF_REVERSE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
44    regex::Regex::new(
45        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["'](?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["']"#,
46    )
47    .expect("valid regex")
48});
49
/// Reports whether `path` has the exact extension `html`.
///
/// The comparison is case-sensitive, and a path whose extension is missing or is
/// not valid UTF-8 is never considered an HTML file.
// Keep in sync with fallow_core::analyze::predicates::is_html_file (crate boundary prevents sharing)
pub(crate) fn is_html_file(path: &Path) -> bool {
    let extension = path.extension().and_then(|ext| ext.to_str());
    matches!(extension, Some("html"))
}
57
/// Returns true if an HTML asset reference is a remote URL that should be skipped.
///
/// A reference is remote when it starts with `http://`, `https://`, the
/// protocol-relative prefix `//`, or a `data:` URI. URI schemes are
/// case-insensitive (RFC 3986 §3.1), so the scheme prefixes are compared without
/// regard to ASCII case: `HTTPS://…` and `Data:…` are remote as well.
fn is_remote_url(src: &str) -> bool {
    // Lowercase scheme prefixes; matched ASCII-case-insensitively below.
    const REMOTE_SCHEMES: [&str; 3] = ["http://", "https://", "data:"];

    let bytes = src.as_bytes();
    src.starts_with("//")
        || REMOTE_SCHEMES.iter().any(|scheme| {
            // Slicing the bytes with `get` cannot panic on short input or on a
            // multi-byte character that straddles the prefix length.
            bytes
                .get(..scheme.len())
                .is_some_and(|head| head.eq_ignore_ascii_case(scheme.as_bytes()))
        })
}
65
66/// Parse an HTML file, extracting script and stylesheet references as imports.
67pub(crate) fn parse_html_to_module(file_id: FileId, source: &str, content_hash: u64) -> ModuleInfo {
68    let suppressions = crate::suppress::parse_suppressions_from_source(source);
69
70    // Strip HTML comments before matching to avoid false positives.
71    let stripped = HTML_COMMENT_RE.replace_all(source, "");
72
73    let mut imports = Vec::new();
74
75    // Extract <script src="..."> references
76    for cap in SCRIPT_SRC_RE.captures_iter(&stripped) {
77        if let Some(m) = cap.get(1) {
78            let src = m.as_str().trim();
79            if !src.is_empty() && !is_remote_url(src) {
80                imports.push(ImportInfo {
81                    source: src.to_string(),
82                    imported_name: ImportedName::SideEffect,
83                    local_name: String::new(),
84                    is_type_only: false,
85                    span: Span::default(),
86                    source_span: Span::default(),
87                });
88            }
89        }
90    }
91
92    // Extract <link rel="stylesheet" href="..."> and <link rel="modulepreload" href="...">
93    // Handle both attribute orders: rel before href, and href before rel.
94    for cap in LINK_HREF_RE.captures_iter(&stripped) {
95        if let Some(m) = cap.get(2) {
96            let href = m.as_str().trim();
97            if !href.is_empty() && !is_remote_url(href) {
98                imports.push(ImportInfo {
99                    source: href.to_string(),
100                    imported_name: ImportedName::SideEffect,
101                    local_name: String::new(),
102                    is_type_only: false,
103                    span: Span::default(),
104                    source_span: Span::default(),
105                });
106            }
107        }
108    }
109    for cap in LINK_HREF_REVERSE_RE.captures_iter(&stripped) {
110        if let Some(m) = cap.get(1) {
111            let href = m.as_str().trim();
112            if !href.is_empty() && !is_remote_url(href) {
113                imports.push(ImportInfo {
114                    source: href.to_string(),
115                    imported_name: ImportedName::SideEffect,
116                    local_name: String::new(),
117                    is_type_only: false,
118                    span: Span::default(),
119                    source_span: Span::default(),
120                });
121            }
122        }
123    }
124
125    // Deduplicate: the same asset may be referenced by both <script src> and
126    // <link rel="modulepreload" href> for the same path.
127    imports.sort_unstable_by(|a, b| a.source.cmp(&b.source));
128    imports.dedup_by(|a, b| a.source == b.source);
129
130    // Scan for Angular template syntax ({{ }}, [prop], (event), @if, etc.).
131    // Referenced identifiers are stored as MemberAccess entries with a sentinel
132    // object name so the analysis phase can bridge them to the component class.
133    let template_refs = angular::collect_angular_template_refs(source);
134    let member_accesses: Vec<MemberAccess> = template_refs
135        .into_iter()
136        .map(|name| MemberAccess {
137            object: ANGULAR_TPL_SENTINEL.to_string(),
138            member: name,
139        })
140        .collect();
141
142    ModuleInfo {
143        file_id,
144        exports: Vec::new(),
145        imports,
146        re_exports: Vec::new(),
147        dynamic_imports: Vec::new(),
148        dynamic_import_patterns: Vec::new(),
149        require_calls: Vec::new(),
150        member_accesses,
151        whole_object_uses: Vec::new(),
152        has_cjs_exports: false,
153        content_hash,
154        suppressions,
155        unused_import_bindings: Vec::new(),
156        line_offsets: fallow_types::extract::compute_line_offsets(source),
157        complexity: Vec::new(),
158        flag_uses: Vec::new(),
159    }
160}
161
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html` with a fixed file id and content hash; only the markup varies per test.
    fn parse(html: &str) -> ModuleInfo {
        parse_html_to_module(FileId(0), html, 0)
    }

    /// Import sources extracted from `html`, in the order the parser reports them.
    fn import_sources(html: &str) -> Vec<String> {
        parse(html)
            .imports
            .into_iter()
            .map(|import| import.source)
            .collect()
    }

    /// True when parsing `html` yields no imports at all.
    fn has_no_imports(html: &str) -> bool {
        parse(html).imports.is_empty()
    }

    // ── is_html_file ─────────────────────────────────────────────

    #[test]
    fn is_html_file_html() {
        let path = Path::new("index.html");
        assert!(is_html_file(path));
    }

    #[test]
    fn is_html_file_nested() {
        let path = Path::new("pages/about.html");
        assert!(is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_htm() {
        let path = Path::new("index.htm");
        assert!(!is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_js() {
        let path = Path::new("app.js");
        assert!(!is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_ts() {
        let path = Path::new("app.ts");
        assert!(!is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_vue() {
        let path = Path::new("App.vue");
        assert!(!is_html_file(path));
    }

    // ── is_remote_url ────────────────────────────────────────────

    #[test]
    fn remote_url_http() {
        let url = "http://example.com/script.js";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_https() {
        let url = "https://cdn.example.com/style.css";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_protocol_relative() {
        let url = "//cdn.example.com/lib.js";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_data() {
        let url = "data:text/javascript;base64,abc";
        assert!(is_remote_url(url));
    }

    #[test]
    fn local_relative_not_remote() {
        let url = "./src/entry.js";
        assert!(!is_remote_url(url));
    }

    #[test]
    fn local_root_relative_not_remote() {
        let url = "/src/entry.js";
        assert!(!is_remote_url(url));
    }

    // ── parse_html_to_module: script src extraction ──────────────

    #[test]
    fn extracts_module_script_src() {
        let sources = import_sources(r#"<script type="module" src="./src/entry.js"></script>"#);
        assert_eq!(sources, ["./src/entry.js"]);
    }

    #[test]
    fn extracts_plain_script_src() {
        let sources = import_sources(r#"<script src="./src/polyfills.js"></script>"#);
        assert_eq!(sources, ["./src/polyfills.js"]);
    }

    #[test]
    fn extracts_multiple_scripts() {
        let html = r#"
            <script type="module" src="./src/entry.js"></script>
            <script src="./src/polyfills.js"></script>
            "#;
        assert_eq!(parse(html).imports.len(), 2);
    }

    #[test]
    fn skips_inline_script() {
        assert!(has_no_imports(
            r#"<script>console.log("hello");</script>"#
        ));
    }

    #[test]
    fn skips_remote_script() {
        assert!(has_no_imports(
            r#"<script src="https://cdn.example.com/lib.js"></script>"#
        ));
    }

    #[test]
    fn skips_protocol_relative_script() {
        assert!(has_no_imports(
            r#"<script src="//cdn.example.com/lib.js"></script>"#
        ));
    }

    // ── parse_html_to_module: link href extraction ───────────────

    #[test]
    fn extracts_stylesheet_link() {
        let sources = import_sources(r#"<link rel="stylesheet" href="./src/global.css" />"#);
        assert_eq!(sources, ["./src/global.css"]);
    }

    #[test]
    fn extracts_modulepreload_link() {
        let sources = import_sources(r#"<link rel="modulepreload" href="./src/vendor.js" />"#);
        assert_eq!(sources, ["./src/vendor.js"]);
    }

    #[test]
    fn extracts_link_with_reversed_attrs() {
        let sources = import_sources(r#"<link href="./src/global.css" rel="stylesheet" />"#);
        assert_eq!(sources, ["./src/global.css"]);
    }

    #[test]
    fn skips_preload_link() {
        assert!(has_no_imports(
            r#"<link rel="preload" href="./src/font.woff2" as="font" />"#
        ));
    }

    #[test]
    fn skips_icon_link() {
        assert!(has_no_imports(
            r#"<link rel="icon" href="./favicon.ico" />"#
        ));
    }

    #[test]
    fn skips_remote_stylesheet() {
        assert!(has_no_imports(
            r#"<link rel="stylesheet" href="https://fonts.googleapis.com/css" />"#
        ));
    }

    // ── HTML comment stripping ───────────────────────────────────

    #[test]
    fn skips_commented_out_script() {
        let sources = import_sources(
            r#"<!-- <script src="./old.js"></script> -->
            <script src="./new.js"></script>"#,
        );
        assert_eq!(sources, ["./new.js"]);
    }

    #[test]
    fn skips_commented_out_link() {
        let sources = import_sources(
            r#"<!-- <link rel="stylesheet" href="./old.css" /> -->
            <link rel="stylesheet" href="./new.css" />"#,
        );
        assert_eq!(sources, ["./new.css"]);
    }

    // ── Multi-line attributes ────────────────────────────────────

    #[test]
    fn handles_multiline_script_tag() {
        let sources =
            import_sources("<script\n  type=\"module\"\n  src=\"./src/entry.js\"\n></script>");
        assert_eq!(sources, ["./src/entry.js"]);
    }

    #[test]
    fn handles_multiline_link_tag() {
        let sources =
            import_sources("<link\n  rel=\"stylesheet\"\n  href=\"./src/global.css\"\n/>");
        assert_eq!(sources, ["./src/global.css"]);
    }

    // ── Full HTML document ───────────────────────────────────────

    #[test]
    fn full_vite_html() {
        let sources = import_sources(
            r#"<!doctype html>
<html>
  <head>
    <link rel="stylesheet" href="./src/global.css" />
    <link rel="icon" href="/favicon.ico" />
  </head>
  <body>
    <div id="app"></div>
    <script type="module" src="./src/entry.js"></script>
  </body>
</html>"#,
        );
        assert_eq!(sources.len(), 2);
        for expected in ["./src/global.css", "./src/entry.js"] {
            assert!(sources.iter().any(|source| source == expected));
        }
    }

    // ── Edge cases ───────────────────────────────────────────────

    #[test]
    fn empty_html() {
        assert!(has_no_imports(""));
    }

    #[test]
    fn html_with_no_assets() {
        assert!(has_no_imports(
            r"<!doctype html><html><body><h1>Hello</h1></body></html>"
        ));
    }

    #[test]
    fn single_quoted_attributes() {
        let sources = import_sources(r"<script src='./src/entry.js'></script>");
        assert_eq!(sources, ["./src/entry.js"]);
    }

    #[test]
    fn all_imports_are_side_effect() {
        let info = parse(
            r#"<script src="./entry.js"></script>
            <link rel="stylesheet" href="./style.css" />"#,
        );
        assert!(
            info.imports
                .iter()
                .all(|imp| matches!(imp.imported_name, ImportedName::SideEffect))
        );
        assert!(info.imports.iter().all(|imp| imp.local_name.is_empty()));
        assert!(info.imports.iter().all(|imp| !imp.is_type_only));
    }

    #[test]
    fn suppression_comments_extracted() {
        // HTML comments use <!-- --> not //, so suppression parsing
        // from source text won't find standard JS-style comments.
        // This is expected — HTML suppression is not supported.
        let html = "<!-- fallow-ignore-file -->\n<script src=\"./entry.js\"></script>";
        assert_eq!(parse(html).imports.len(), 1);
    }

    // ── Angular template scanning ──────────────────────────────

    #[test]
    fn angular_template_extracts_member_refs() {
        let info = parse(
            "<h1>{{ title() }}</h1>\n\
             <p [class.highlighted]=\"isHighlighted\">{{ greeting() }}</p>\n\
             <button (click)=\"onButtonClick()\">Toggle</button>",
        );
        // Only accesses recorded against the Angular template sentinel count.
        let referenced: Vec<&str> = info
            .member_accesses
            .iter()
            .filter(|access| access.object == ANGULAR_TPL_SENTINEL)
            .map(|access| access.member.as_str())
            .collect();
        for expected in ["title", "isHighlighted", "greeting", "onButtonClick"] {
            assert!(
                referenced.contains(&expected),
                "should contain '{}'",
                expected
            );
        }
    }

    #[test]
    fn plain_html_no_angular_refs() {
        let info = parse("<!doctype html><html><body><h1>Hello</h1></body></html>");
        assert!(info.member_accesses.is_empty());
    }
}