Skip to main content

fallow_extract/
html.rs

1//! HTML file parsing for script, stylesheet, and Angular template references.
2//!
3//! Extracts `<script src="...">` and `<link rel="stylesheet" href="...">` references
4//! from HTML files, creating graph edges so that referenced JS/CSS assets (and their
5//! transitive imports) are reachable from the HTML entry point.
6//!
7//! Also scans for Angular template syntax (`{{ }}`, `[prop]`, `(event)`, `@if`, etc.)
8//! and stores referenced identifiers as `MemberAccess` entries with a sentinel object,
9//! enabling the analysis phase to credit component class members used in external templates.
10
11use std::path::Path;
12use std::sync::LazyLock;
13
14use oxc_span::Span;
15
16use crate::asset_url::normalize_asset_url;
17use crate::sfc_template::angular::{self, ANGULAR_TPL_SENTINEL};
18use crate::{ImportInfo, ImportedName, MemberAccess, ModuleInfo};
19use fallow_types::discover::FileId;
20
21/// Regex to match HTML comments (`<!-- ... -->`) for stripping before extraction.
22static HTML_COMMENT_RE: LazyLock<regex::Regex> =
23    LazyLock::new(|| regex::Regex::new(r"(?s)<!--.*?-->").expect("valid regex"));
24
25/// Regex to extract `src` attribute from `<script>` tags.
26/// Matches both `<script src="...">` and `<script type="module" src="...">`.
27/// Uses `(?s)` so `.` matches newlines (multi-line attributes).
28static SCRIPT_SRC_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
29    regex::Regex::new(r#"(?si)<script\b(?:[^>"']|"[^"]*"|'[^']*')*?\bsrc\s*=\s*["']([^"']+)["']"#)
30        .expect("valid regex")
31});
32
33/// Regex to extract `href` attribute from `<link>` tags with `rel="stylesheet"` or
34/// `rel="modulepreload"`.
35/// Handles attributes in any order (rel before or after href).
36static LINK_HREF_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
37    regex::Regex::new(
38        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["'](?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["']"#,
39    )
40    .expect("valid regex")
41});
42
43/// Regex for the reverse attribute order: href before rel.
44static LINK_HREF_REVERSE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
45    regex::Regex::new(
46        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["'](?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["']"#,
47    )
48    .expect("valid regex")
49});
50
/// Reports whether `path` has the extension `html`.
///
/// The comparison is exact: `index.htm` and `INDEX.HTML` are both rejected, as
/// are paths without an extension or with a non-UTF-8 one.
// Keep in sync with fallow_core::analyze::predicates::is_html_file (crate boundary prevents sharing)
pub(crate) fn is_html_file(path: &Path) -> bool {
    matches!(
        path.extension().and_then(|ext| ext.to_str()),
        Some("html")
    )
}
58
/// Returns true if an HTML asset reference is a remote URL that should be skipped.
///
/// A reference counts as remote when it starts with `http://`, `https://`, the
/// protocol-relative marker `//`, or the `data:` scheme. URI schemes are
/// case-insensitive, so `HTTPS://cdn.example.com/x.js` and `Data:...` are
/// recognised as well; everything else (relative, root-relative, bare names,
/// the empty string) is treated as local.
pub(crate) fn is_remote_url(src: &str) -> bool {
    const REMOTE_PREFIXES: [&str; 4] = ["http://", "https://", "//", "data:"];

    REMOTE_PREFIXES.iter().any(|&prefix| {
        // `get` yields `None` when `src` is shorter than the prefix or when the
        // cut would land inside a multi-byte character; neither can be a match
        // because every prefix is pure ASCII.
        src.get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix))
    })
}
66
67/// Parse an HTML file, extracting script and stylesheet references as imports.
68pub(crate) fn parse_html_to_module(file_id: FileId, source: &str, content_hash: u64) -> ModuleInfo {
69    let suppressions = crate::suppress::parse_suppressions_from_source(source);
70
71    // Strip HTML comments before matching to avoid false positives.
72    let stripped = HTML_COMMENT_RE.replace_all(source, "");
73
74    let mut imports = Vec::new();
75
76    // Extract <script src="..."> references.
77    // Bare filenames (e.g., `src="app.js"`) are normalized to `./app.js` so
78    // the resolver doesn't misclassify them as npm packages.
79    for cap in SCRIPT_SRC_RE.captures_iter(&stripped) {
80        if let Some(m) = cap.get(1) {
81            let src = m.as_str().trim();
82            if !src.is_empty() && !is_remote_url(src) {
83                imports.push(ImportInfo {
84                    source: normalize_asset_url(src),
85                    imported_name: ImportedName::SideEffect,
86                    local_name: String::new(),
87                    is_type_only: false,
88                    span: Span::default(),
89                    source_span: Span::default(),
90                });
91            }
92        }
93    }
94
95    // Extract <link rel="stylesheet" href="..."> and <link rel="modulepreload" href="...">.
96    // Handle both attribute orders: rel before href, and href before rel.
97    // Bare filenames are normalized identically to `<script src>`.
98    for cap in LINK_HREF_RE.captures_iter(&stripped) {
99        if let Some(m) = cap.get(2) {
100            let href = m.as_str().trim();
101            if !href.is_empty() && !is_remote_url(href) {
102                imports.push(ImportInfo {
103                    source: normalize_asset_url(href),
104                    imported_name: ImportedName::SideEffect,
105                    local_name: String::new(),
106                    is_type_only: false,
107                    span: Span::default(),
108                    source_span: Span::default(),
109                });
110            }
111        }
112    }
113    for cap in LINK_HREF_REVERSE_RE.captures_iter(&stripped) {
114        if let Some(m) = cap.get(1) {
115            let href = m.as_str().trim();
116            if !href.is_empty() && !is_remote_url(href) {
117                imports.push(ImportInfo {
118                    source: normalize_asset_url(href),
119                    imported_name: ImportedName::SideEffect,
120                    local_name: String::new(),
121                    is_type_only: false,
122                    span: Span::default(),
123                    source_span: Span::default(),
124                });
125            }
126        }
127    }
128
129    // Deduplicate: the same asset may be referenced by both <script src> and
130    // <link rel="modulepreload" href> for the same path.
131    imports.sort_unstable_by(|a, b| a.source.cmp(&b.source));
132    imports.dedup_by(|a, b| a.source == b.source);
133
134    // Scan for Angular template syntax ({{ }}, [prop], (event), @if, etc.).
135    // Referenced identifiers are stored as MemberAccess entries with a sentinel
136    // object name so the analysis phase can bridge them to the component class.
137    let template_refs = angular::collect_angular_template_refs(source);
138    let member_accesses: Vec<MemberAccess> = template_refs
139        .into_iter()
140        .map(|name| MemberAccess {
141            object: ANGULAR_TPL_SENTINEL.to_string(),
142            member: name,
143        })
144        .collect();
145
146    ModuleInfo {
147        file_id,
148        exports: Vec::new(),
149        imports,
150        re_exports: Vec::new(),
151        dynamic_imports: Vec::new(),
152        dynamic_import_patterns: Vec::new(),
153        require_calls: Vec::new(),
154        member_accesses,
155        whole_object_uses: Vec::new(),
156        has_cjs_exports: false,
157        content_hash,
158        suppressions,
159        unused_import_bindings: Vec::new(),
160        line_offsets: fallow_types::extract::compute_line_offsets(source),
161        complexity: Vec::new(),
162        flag_uses: Vec::new(),
163    }
164}
165
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html` with a fixed file id and content hash.
    fn parse(html: &str) -> ModuleInfo {
        parse_html_to_module(FileId(0), html, 0)
    }

    /// Asserts that `html` produces exactly one import whose source is `expected`.
    #[track_caller]
    fn assert_single_import(html: &str, expected: &str) {
        let module = parse(html);
        assert_eq!(module.imports.len(), 1);
        assert_eq!(module.imports[0].source, expected);
    }

    /// Asserts that `html` produces no imports.
    #[track_caller]
    fn assert_no_imports(html: &str) {
        let module = parse(html);
        assert!(module.imports.is_empty());
    }

    // ── is_html_file ─────────────────────────────────────────────

    #[test]
    fn is_html_file_html() {
        let path = Path::new("index.html");
        assert!(is_html_file(path));
    }

    #[test]
    fn is_html_file_nested() {
        let path = Path::new("pages/about.html");
        assert!(is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_htm() {
        let path = Path::new("index.htm");
        assert!(!is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_js() {
        let path = Path::new("app.js");
        assert!(!is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_ts() {
        let path = Path::new("app.ts");
        assert!(!is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_vue() {
        let path = Path::new("App.vue");
        assert!(!is_html_file(path));
    }

    // ── is_remote_url ────────────────────────────────────────────

    #[test]
    fn remote_url_http() {
        let url = "http://example.com/script.js";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_https() {
        let url = "https://cdn.example.com/style.css";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_protocol_relative() {
        let url = "//cdn.example.com/lib.js";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_data() {
        let url = "data:text/javascript;base64,abc";
        assert!(is_remote_url(url));
    }

    #[test]
    fn local_relative_not_remote() {
        let url = "./src/entry.js";
        assert!(!is_remote_url(url));
    }

    #[test]
    fn local_root_relative_not_remote() {
        let url = "/src/entry.js";
        assert!(!is_remote_url(url));
    }

    // ── parse_html_to_module: script src extraction ──────────────

    #[test]
    fn extracts_module_script_src() {
        assert_single_import(
            r#"<script type="module" src="./src/entry.js"></script>"#,
            "./src/entry.js",
        );
    }

    #[test]
    fn extracts_plain_script_src() {
        assert_single_import(
            r#"<script src="./src/polyfills.js"></script>"#,
            "./src/polyfills.js",
        );
    }

    #[test]
    fn extracts_multiple_scripts() {
        let module = parse(
            r#"
            <script type="module" src="./src/entry.js"></script>
            <script src="./src/polyfills.js"></script>
            "#,
        );
        assert_eq!(module.imports.len(), 2);
    }

    #[test]
    fn skips_inline_script() {
        assert_no_imports(r#"<script>console.log("hello");</script>"#);
    }

    #[test]
    fn skips_remote_script() {
        assert_no_imports(r#"<script src="https://cdn.example.com/lib.js"></script>"#);
    }

    #[test]
    fn skips_protocol_relative_script() {
        assert_no_imports(r#"<script src="//cdn.example.com/lib.js"></script>"#);
    }

    // ── parse_html_to_module: link href extraction ───────────────

    #[test]
    fn extracts_stylesheet_link() {
        assert_single_import(
            r#"<link rel="stylesheet" href="./src/global.css" />"#,
            "./src/global.css",
        );
    }

    #[test]
    fn extracts_modulepreload_link() {
        assert_single_import(
            r#"<link rel="modulepreload" href="./src/vendor.js" />"#,
            "./src/vendor.js",
        );
    }

    #[test]
    fn extracts_link_with_reversed_attrs() {
        assert_single_import(
            r#"<link href="./src/global.css" rel="stylesheet" />"#,
            "./src/global.css",
        );
    }

    // ── Bare asset references normalized to relative paths ──────
    // Regression tests for the same class of bug as #99 (Angular templateUrl).
    // Browsers resolve `src="app.js"` and `href="styles.css"` relative to the
    // HTML file, so emitting these as bare specifiers would misclassify them
    // as unlisted npm packages.

    #[test]
    fn bare_script_src_normalized_to_relative() {
        assert_single_import(r#"<script src="app.js"></script>"#, "./app.js");
    }

    #[test]
    fn bare_module_script_src_normalized_to_relative() {
        assert_single_import(
            r#"<script type="module" src="main.ts"></script>"#,
            "./main.ts",
        );
    }

    #[test]
    fn bare_stylesheet_link_href_normalized_to_relative() {
        assert_single_import(
            r#"<link rel="stylesheet" href="styles.css" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_link_href_reversed_attrs_normalized_to_relative() {
        assert_single_import(
            r#"<link href="styles.css" rel="stylesheet" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_modulepreload_link_href_normalized_to_relative() {
        assert_single_import(
            r#"<link rel="modulepreload" href="vendor.js" />"#,
            "./vendor.js",
        );
    }

    #[test]
    fn bare_asset_with_subdir_normalized_to_relative() {
        assert_single_import(
            r#"<script src="assets/app.js"></script>"#,
            "./assets/app.js",
        );
    }

    #[test]
    fn root_absolute_script_src_unchanged() {
        // `/src/main.ts` is a web convention (Vite root-relative) and must
        // stay absolute so the resolver's HTML special case applies.
        assert_single_import(r#"<script src="/src/main.ts"></script>"#, "/src/main.ts");
    }

    #[test]
    fn parent_relative_script_src_unchanged() {
        assert_single_import(
            r#"<script src="../shared/vendor.js"></script>"#,
            "../shared/vendor.js",
        );
    }

    #[test]
    fn skips_preload_link() {
        assert_no_imports(r#"<link rel="preload" href="./src/font.woff2" as="font" />"#);
    }

    #[test]
    fn skips_icon_link() {
        assert_no_imports(r#"<link rel="icon" href="./favicon.ico" />"#);
    }

    #[test]
    fn skips_remote_stylesheet() {
        assert_no_imports(r#"<link rel="stylesheet" href="https://fonts.googleapis.com/css" />"#);
    }

    // ── HTML comment stripping ───────────────────────────────────

    #[test]
    fn skips_commented_out_script() {
        assert_single_import(
            r#"<!-- <script src="./old.js"></script> -->
            <script src="./new.js"></script>"#,
            "./new.js",
        );
    }

    #[test]
    fn skips_commented_out_link() {
        assert_single_import(
            r#"<!-- <link rel="stylesheet" href="./old.css" /> -->
            <link rel="stylesheet" href="./new.css" />"#,
            "./new.css",
        );
    }

    // ── Multi-line attributes ────────────────────────────────────

    #[test]
    fn handles_multiline_script_tag() {
        assert_single_import(
            "<script\n  type=\"module\"\n  src=\"./src/entry.js\"\n></script>",
            "./src/entry.js",
        );
    }

    #[test]
    fn handles_multiline_link_tag() {
        assert_single_import(
            "<link\n  rel=\"stylesheet\"\n  href=\"./src/global.css\"\n/>",
            "./src/global.css",
        );
    }

    // ── Full HTML document ───────────────────────────────────────

    #[test]
    fn full_vite_html() {
        let module = parse(
            r#"<!doctype html>
<html>
  <head>
    <link rel="stylesheet" href="./src/global.css" />
    <link rel="icon" href="/favicon.ico" />
  </head>
  <body>
    <div id="app"></div>
    <script type="module" src="./src/entry.js"></script>
  </body>
</html>"#,
        );
        assert_eq!(module.imports.len(), 2);
        let has_source = |wanted: &str| module.imports.iter().any(|imp| imp.source == wanted);
        assert!(has_source("./src/global.css"));
        assert!(has_source("./src/entry.js"));
    }

    // ── Edge cases ───────────────────────────────────────────────

    #[test]
    fn empty_html() {
        assert_no_imports("");
    }

    #[test]
    fn html_with_no_assets() {
        assert_no_imports(r"<!doctype html><html><body><h1>Hello</h1></body></html>");
    }

    #[test]
    fn single_quoted_attributes() {
        assert_single_import(r"<script src='./src/entry.js'></script>", "./src/entry.js");
    }

    #[test]
    fn all_imports_are_side_effect() {
        let module = parse(
            r#"<script src="./entry.js"></script>
            <link rel="stylesheet" href="./style.css" />"#,
        );
        for import in &module.imports {
            assert!(matches!(import.imported_name, ImportedName::SideEffect));
            assert!(import.local_name.is_empty());
            assert!(!import.is_type_only);
        }
    }

    #[test]
    fn suppression_comments_extracted() {
        let module = parse("<!-- fallow-ignore-file -->\n<script src=\"./entry.js\"></script>");
        // HTML comments use <!-- --> not //, so suppression parsing
        // from source text won't find standard JS-style comments.
        // This is expected — HTML suppression is not supported.
        assert_eq!(module.imports.len(), 1);
    }

    // ── Angular template scanning ──────────────────────────────

    #[test]
    fn angular_template_extracts_member_refs() {
        let module = parse(
            "<h1>{{ title() }}</h1>\n\
             <p [class.highlighted]=\"isHighlighted\">{{ greeting() }}</p>\n\
             <button (click)=\"onButtonClick()\">Toggle</button>",
        );
        let members: rustc_hash::FxHashSet<&str> = module
            .member_accesses
            .iter()
            .filter(|access| access.object == ANGULAR_TPL_SENTINEL)
            .map(|access| access.member.as_str())
            .collect();
        for expected in ["title", "isHighlighted", "greeting", "onButtonClick"] {
            assert!(members.contains(expected), "should contain '{expected}'");
        }
    }

    #[test]
    fn plain_html_no_angular_refs() {
        let module = parse("<!doctype html><html><body><h1>Hello</h1></body></html>");
        assert!(module.member_accesses.is_empty());
    }
}