Skip to main content

fallow_extract/
html.rs

1//! HTML file parsing for script, stylesheet, and Angular template references.
2//!
3//! Extracts `<script src="...">` and `<link rel="stylesheet" href="...">` references
4//! from HTML files, creating graph edges so that referenced JS/CSS assets (and their
5//! transitive imports) are reachable from the HTML entry point.
6//!
7//! Also scans for Angular template syntax (`{{ }}`, `[prop]`, `(event)`, `@if`, etc.)
8//! and stores referenced identifiers as `MemberAccess` entries with a sentinel object,
9//! enabling the analysis phase to credit component class members used in external templates.
10
11use std::path::Path;
12use std::sync::LazyLock;
13
14use oxc_span::Span;
15
16use crate::sfc_template::angular::{self, ANGULAR_TPL_SENTINEL};
17use crate::{ImportInfo, ImportedName, MemberAccess, ModuleInfo};
18use fallow_types::discover::FileId;
19
20/// Regex to match HTML comments (`<!-- ... -->`) for stripping before extraction.
21static HTML_COMMENT_RE: LazyLock<regex::Regex> =
22    LazyLock::new(|| regex::Regex::new(r"(?s)<!--.*?-->").expect("valid regex"));
23
24/// Regex to extract `src` attribute from `<script>` tags.
25/// Matches both `<script src="...">` and `<script type="module" src="...">`.
26/// Uses `(?s)` so `.` matches newlines (multi-line attributes).
27static SCRIPT_SRC_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
28    regex::Regex::new(r#"(?si)<script\b(?:[^>"']|"[^"]*"|'[^']*')*?\bsrc\s*=\s*["']([^"']+)["']"#)
29        .expect("valid regex")
30});
31
32/// Regex to extract `href` attribute from `<link>` tags with `rel="stylesheet"` or
33/// `rel="modulepreload"`.
34/// Handles attributes in any order (rel before or after href).
35static LINK_HREF_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
36    regex::Regex::new(
37        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["'](?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["']"#,
38    )
39    .expect("valid regex")
40});
41
42/// Regex for the reverse attribute order: href before rel.
43static LINK_HREF_REVERSE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
44    regex::Regex::new(
45        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["'](?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["']"#,
46    )
47    .expect("valid regex")
48});
49
/// Report whether `path` ends in the extension `html`.
///
/// The comparison is exact and case-sensitive; a path with no extension, or with an
/// extension that is not valid UTF-8, is rejected.
// Keep in sync with fallow_core::analyze::predicates::is_html_file (crate boundary prevents sharing)
pub(crate) fn is_html_file(path: &Path) -> bool {
    let extension = path.extension().and_then(|raw| raw.to_str());
    matches!(extension, Some("html"))
}
57
/// Returns true if an HTML asset reference is a remote URL that should be skipped.
///
/// A reference counts as remote when it is protocol-relative (`//host/...`) or starts
/// with one of the schemes `http://`, `https://`, or `data:`. Schemes are compared
/// ASCII-case-insensitively because URI schemes are case-insensitive, so `HTTPS://...`
/// is skipped just like `https://...`.
fn is_remote_url(src: &str) -> bool {
    const REMOTE_SCHEMES: [&str; 3] = ["http://", "https://", "data:"];

    src.starts_with("//")
        || REMOTE_SCHEMES.iter().any(|scheme| {
            // `get` yields `None` when `src` is shorter than the scheme or the cut
            // would fall inside a multi-byte character, so this never panics.
            src.get(..scheme.len())
                .is_some_and(|prefix| prefix.eq_ignore_ascii_case(scheme))
        })
}
65
66/// Parse an HTML file, extracting script and stylesheet references as imports.
67pub(crate) fn parse_html_to_module(file_id: FileId, source: &str, content_hash: u64) -> ModuleInfo {
68    let suppressions = crate::suppress::parse_suppressions_from_source(source);
69
70    // Strip HTML comments before matching to avoid false positives.
71    let stripped = HTML_COMMENT_RE.replace_all(source, "");
72
73    let mut imports = Vec::new();
74
75    // Extract <script src="..."> references
76    for cap in SCRIPT_SRC_RE.captures_iter(&stripped) {
77        if let Some(m) = cap.get(1) {
78            let src = m.as_str().trim();
79            if !src.is_empty() && !is_remote_url(src) {
80                imports.push(ImportInfo {
81                    source: src.to_string(),
82                    imported_name: ImportedName::SideEffect,
83                    local_name: String::new(),
84                    is_type_only: false,
85                    span: Span::default(),
86                    source_span: Span::default(),
87                });
88            }
89        }
90    }
91
92    // Extract <link rel="stylesheet" href="..."> and <link rel="modulepreload" href="...">
93    // Handle both attribute orders: rel before href, and href before rel.
94    for cap in LINK_HREF_RE.captures_iter(&stripped) {
95        if let Some(m) = cap.get(2) {
96            let href = m.as_str().trim();
97            if !href.is_empty() && !is_remote_url(href) {
98                imports.push(ImportInfo {
99                    source: href.to_string(),
100                    imported_name: ImportedName::SideEffect,
101                    local_name: String::new(),
102                    is_type_only: false,
103                    span: Span::default(),
104                    source_span: Span::default(),
105                });
106            }
107        }
108    }
109    for cap in LINK_HREF_REVERSE_RE.captures_iter(&stripped) {
110        if let Some(m) = cap.get(1) {
111            let href = m.as_str().trim();
112            if !href.is_empty() && !is_remote_url(href) {
113                imports.push(ImportInfo {
114                    source: href.to_string(),
115                    imported_name: ImportedName::SideEffect,
116                    local_name: String::new(),
117                    is_type_only: false,
118                    span: Span::default(),
119                    source_span: Span::default(),
120                });
121            }
122        }
123    }
124
125    // Deduplicate: the same asset may be referenced by both <script src> and
126    // <link rel="modulepreload" href> for the same path.
127    imports.sort_unstable_by(|a, b| a.source.cmp(&b.source));
128    imports.dedup_by(|a, b| a.source == b.source);
129
130    // Scan for Angular template syntax ({{ }}, [prop], (event), @if, etc.).
131    // Referenced identifiers are stored as MemberAccess entries with a sentinel
132    // object name so the analysis phase can bridge them to the component class.
133    let template_refs = angular::collect_angular_template_refs(source);
134    let member_accesses: Vec<MemberAccess> = template_refs
135        .into_iter()
136        .map(|name| MemberAccess {
137            object: ANGULAR_TPL_SENTINEL.to_string(),
138            member: name,
139        })
140        .collect();
141
142    ModuleInfo {
143        file_id,
144        exports: Vec::new(),
145        imports,
146        re_exports: Vec::new(),
147        dynamic_imports: Vec::new(),
148        dynamic_import_patterns: Vec::new(),
149        require_calls: Vec::new(),
150        member_accesses,
151        whole_object_uses: Vec::new(),
152        has_cjs_exports: false,
153        content_hash,
154        suppressions,
155        unused_import_bindings: Vec::new(),
156        line_offsets: fallow_types::extract::compute_line_offsets(source),
157        complexity: Vec::new(),
158    }
159}
160
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `html` with a fixed file id and content hash; tests only inspect the result.
    fn parse(html: &str) -> ModuleInfo {
        parse_html_to_module(FileId(0), html, 0)
    }

    /// Import sources extracted from `html`, in the order the parser returns them.
    fn import_sources(html: &str) -> Vec<String> {
        parse(html)
            .imports
            .iter()
            .map(|import| import.source.clone())
            .collect()
    }

    // ── is_html_file ─────────────────────────────────────────────

    #[test]
    fn is_html_file_html() {
        assert!(is_html_file(Path::new("index.html")));
    }

    #[test]
    fn is_html_file_nested() {
        assert!(is_html_file(Path::new("pages/about.html")));
    }

    #[test]
    fn is_html_file_rejects_htm() {
        assert!(!is_html_file(Path::new("index.htm")));
    }

    #[test]
    fn is_html_file_rejects_js() {
        assert!(!is_html_file(Path::new("app.js")));
    }

    #[test]
    fn is_html_file_rejects_ts() {
        assert!(!is_html_file(Path::new("app.ts")));
    }

    #[test]
    fn is_html_file_rejects_vue() {
        assert!(!is_html_file(Path::new("App.vue")));
    }

    // ── is_remote_url ────────────────────────────────────────────

    #[test]
    fn remote_url_http() {
        assert!(is_remote_url("http://example.com/script.js"));
    }

    #[test]
    fn remote_url_https() {
        assert!(is_remote_url("https://cdn.example.com/style.css"));
    }

    #[test]
    fn remote_url_protocol_relative() {
        assert!(is_remote_url("//cdn.example.com/lib.js"));
    }

    #[test]
    fn remote_url_data() {
        assert!(is_remote_url("data:text/javascript;base64,abc"));
    }

    #[test]
    fn local_relative_not_remote() {
        assert!(!is_remote_url("./src/entry.js"));
    }

    #[test]
    fn local_root_relative_not_remote() {
        assert!(!is_remote_url("/src/entry.js"));
    }

    // ── parse_html_to_module: script src extraction ──────────────

    #[test]
    fn extracts_module_script_src() {
        assert_eq!(
            import_sources(r#"<script type="module" src="./src/entry.js"></script>"#),
            ["./src/entry.js"]
        );
    }

    #[test]
    fn extracts_plain_script_src() {
        assert_eq!(
            import_sources(r#"<script src="./src/polyfills.js"></script>"#),
            ["./src/polyfills.js"]
        );
    }

    #[test]
    fn extracts_multiple_scripts() {
        let info = parse(
            r#"
            <script type="module" src="./src/entry.js"></script>
            <script src="./src/polyfills.js"></script>
            "#,
        );
        assert_eq!(info.imports.len(), 2);
    }

    #[test]
    fn extracts_uppercase_tags_and_attributes() {
        // Tag and attribute names are matched case-insensitively.
        let sources = import_sources(
            r#"<SCRIPT SRC="./src/entry.js"></SCRIPT>
            <LINK REL="STYLESHEET" HREF="./src/global.css">"#,
        );
        assert_eq!(sources, ["./src/entry.js", "./src/global.css"]);
    }

    #[test]
    fn skips_inline_script() {
        assert!(import_sources(r#"<script>console.log("hello");</script>"#).is_empty());
    }

    #[test]
    fn skips_remote_script() {
        assert!(
            import_sources(r#"<script src="https://cdn.example.com/lib.js"></script>"#).is_empty()
        );
    }

    #[test]
    fn skips_protocol_relative_script() {
        assert!(import_sources(r#"<script src="//cdn.example.com/lib.js"></script>"#).is_empty());
    }

    #[test]
    fn skips_data_uri_script() {
        assert!(
            import_sources(r#"<script src="data:text/javascript,void 0"></script>"#).is_empty()
        );
    }

    #[test]
    fn skips_blank_script_src() {
        // A whitespace-only value trims to an empty reference and is dropped.
        assert!(import_sources(r#"<script src="  "></script>"#).is_empty());
    }

    // ── parse_html_to_module: link href extraction ───────────────

    #[test]
    fn extracts_stylesheet_link() {
        assert_eq!(
            import_sources(r#"<link rel="stylesheet" href="./src/global.css" />"#),
            ["./src/global.css"]
        );
    }

    #[test]
    fn extracts_modulepreload_link() {
        assert_eq!(
            import_sources(r#"<link rel="modulepreload" href="./src/vendor.js" />"#),
            ["./src/vendor.js"]
        );
    }

    #[test]
    fn extracts_link_with_reversed_attrs() {
        assert_eq!(
            import_sources(r#"<link href="./src/global.css" rel="stylesheet" />"#),
            ["./src/global.css"]
        );
    }

    #[test]
    fn skips_preload_link() {
        assert!(
            import_sources(r#"<link rel="preload" href="./src/font.woff2" as="font" />"#)
                .is_empty()
        );
    }

    #[test]
    fn skips_icon_link() {
        assert!(import_sources(r#"<link rel="icon" href="./favicon.ico" />"#).is_empty());
    }

    #[test]
    fn skips_remote_stylesheet() {
        assert!(
            import_sources(r#"<link rel="stylesheet" href="https://fonts.googleapis.com/css" />"#)
                .is_empty()
        );
    }

    // ── Deduplication ────────────────────────────────────────────

    #[test]
    fn deduplicates_script_and_modulepreload_for_same_path() {
        let sources = import_sources(
            r#"<link rel="modulepreload" href="./src/entry.js" />
            <script type="module" src="./src/entry.js"></script>"#,
        );
        assert_eq!(sources, ["./src/entry.js"]);
    }

    // ── HTML comment stripping ───────────────────────────────────

    #[test]
    fn skips_commented_out_script() {
        let sources = import_sources(
            r#"<!-- <script src="./old.js"></script> -->
            <script src="./new.js"></script>"#,
        );
        assert_eq!(sources, ["./new.js"]);
    }

    #[test]
    fn skips_commented_out_link() {
        let sources = import_sources(
            r#"<!-- <link rel="stylesheet" href="./old.css" /> -->
            <link rel="stylesheet" href="./new.css" />"#,
        );
        assert_eq!(sources, ["./new.css"]);
    }

    // ── Multi-line attributes ────────────────────────────────────

    #[test]
    fn handles_multiline_script_tag() {
        assert_eq!(
            import_sources("<script\n  type=\"module\"\n  src=\"./src/entry.js\"\n></script>"),
            ["./src/entry.js"]
        );
    }

    #[test]
    fn handles_multiline_link_tag() {
        assert_eq!(
            import_sources("<link\n  rel=\"stylesheet\"\n  href=\"./src/global.css\"\n/>"),
            ["./src/global.css"]
        );
    }

    // ── Full HTML document ───────────────────────────────────────

    #[test]
    fn full_vite_html() {
        let info = parse(
            r#"<!doctype html>
<html>
  <head>
    <link rel="stylesheet" href="./src/global.css" />
    <link rel="icon" href="/favicon.ico" />
  </head>
  <body>
    <div id="app"></div>
    <script type="module" src="./src/entry.js"></script>
  </body>
</html>"#,
        );
        assert_eq!(info.imports.len(), 2);
        let sources: Vec<&str> = info.imports.iter().map(|i| i.source.as_str()).collect();
        assert!(sources.contains(&"./src/global.css"));
        assert!(sources.contains(&"./src/entry.js"));
    }

    // ── Edge cases ───────────────────────────────────────────────

    #[test]
    fn empty_html() {
        assert!(parse("").imports.is_empty());
    }

    #[test]
    fn html_with_no_assets() {
        let info = parse(r"<!doctype html><html><body><h1>Hello</h1></body></html>");
        assert!(info.imports.is_empty());
    }

    #[test]
    fn single_quoted_attributes() {
        assert_eq!(
            import_sources(r"<script src='./src/entry.js'></script>"),
            ["./src/entry.js"]
        );
    }

    #[test]
    fn all_imports_are_side_effect() {
        let info = parse(
            r#"<script src="./entry.js"></script>
            <link rel="stylesheet" href="./style.css" />"#,
        );
        for imp in &info.imports {
            assert!(matches!(imp.imported_name, ImportedName::SideEffect));
            assert!(imp.local_name.is_empty());
            assert!(!imp.is_type_only);
        }
    }

    #[test]
    fn suppression_comments_extracted() {
        let info = parse("<!-- fallow-ignore-file -->\n<script src=\"./entry.js\"></script>");
        // HTML comments use <!-- --> not //, so suppression parsing
        // from source text won't find standard JS-style comments.
        // This is expected — HTML suppression is not supported.
        assert_eq!(info.imports.len(), 1);
    }

    // ── Angular template scanning ──────────────────────────────

    #[test]
    fn angular_template_extracts_member_refs() {
        let info = parse(
            "<h1>{{ title() }}</h1>\n\
             <p [class.highlighted]=\"isHighlighted\">{{ greeting() }}</p>\n\
             <button (click)=\"onButtonClick()\">Toggle</button>",
        );
        let names: rustc_hash::FxHashSet<&str> = info
            .member_accesses
            .iter()
            .filter(|a| a.object == ANGULAR_TPL_SENTINEL)
            .map(|a| a.member.as_str())
            .collect();
        assert!(names.contains("title"), "should contain 'title'");
        assert!(
            names.contains("isHighlighted"),
            "should contain 'isHighlighted'"
        );
        assert!(names.contains("greeting"), "should contain 'greeting'");
        assert!(
            names.contains("onButtonClick"),
            "should contain 'onButtonClick'"
        );
    }

    #[test]
    fn plain_html_no_angular_refs() {
        let info = parse("<!doctype html><html><body><h1>Hello</h1></body></html>");
        assert!(info.member_accesses.is_empty());
    }
}