Skip to main content

fallow_extract/
html.rs

1//! HTML file parsing for script, stylesheet, and Angular template references.
2//!
3//! Extracts `<script src="...">` and `<link rel="stylesheet" href="...">` references
4//! from HTML files, creating graph edges so that referenced JS/CSS assets (and their
5//! transitive imports) are reachable from the HTML entry point.
6//!
7//! Also scans for Angular template syntax (`{{ }}`, `[prop]`, `(event)`, `@if`, etc.)
8//! and stores referenced identifiers as `MemberAccess` entries with a sentinel object,
9//! enabling the analysis phase to credit component class members used in external templates.
10
11use std::path::Path;
12use std::sync::LazyLock;
13
14use oxc_span::Span;
15
16use crate::asset_url::normalize_asset_url;
17use crate::sfc_template::angular::{self, ANGULAR_TPL_SENTINEL};
18use crate::{ImportInfo, ImportedName, MemberAccess, ModuleInfo};
19use fallow_types::discover::FileId;
20
21/// Regex to match HTML comments (`<!-- ... -->`) for stripping before extraction.
22static HTML_COMMENT_RE: LazyLock<regex::Regex> =
23    LazyLock::new(|| regex::Regex::new(r"(?s)<!--.*?-->").expect("valid regex"));
24
25/// Regex to extract `src` attribute from `<script>` tags.
26/// Matches both `<script src="...">` and `<script type="module" src="...">`.
27/// Uses `(?s)` so `.` matches newlines (multi-line attributes).
28static SCRIPT_SRC_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
29    regex::Regex::new(r#"(?si)<script\b(?:[^>"']|"[^"]*"|'[^']*')*?\bsrc\s*=\s*["']([^"']+)["']"#)
30        .expect("valid regex")
31});
32
33/// Regex to extract `href` attribute from `<link>` tags with `rel="stylesheet"` or
34/// `rel="modulepreload"`.
35/// Handles attributes in any order (rel before or after href).
36static LINK_HREF_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
37    regex::Regex::new(
38        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["'](?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["']"#,
39    )
40    .expect("valid regex")
41});
42
43/// Regex for the reverse attribute order: href before rel.
44static LINK_HREF_REVERSE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
45    regex::Regex::new(
46        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["'](?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["']"#,
47    )
48    .expect("valid regex")
49});
50
/// Reports whether `path` has the exact extension `html`.
///
/// The comparison is case-sensitive; other extensions (including `htm`) and
/// paths without a UTF-8 extension yield `false`.
// Keep in sync with fallow_core::analyze::predicates::is_html_file (crate boundary prevents sharing)
pub(crate) fn is_html_file(path: &Path) -> bool {
    matches!(
        path.extension().and_then(|ext| ext.to_str()),
        Some("html")
    )
}
58
/// Returns true if an HTML asset reference is a remote URL that should be skipped.
///
/// A reference counts as remote when it is protocol-relative (`//host/...`) or
/// starts with one of the `http://`, `https://`, or `data:` prefixes. URL
/// schemes are case-insensitive, so `HTTPS://cdn.example.com/x.js` is treated
/// the same as its lowercase form instead of being mistaken for a local path.
pub(crate) fn is_remote_url(src: &str) -> bool {
    const REMOTE_PREFIXES: [&str; 3] = ["http://", "https://", "data:"];

    src.starts_with("//")
        || REMOTE_PREFIXES.iter().any(|prefix| {
            // Compare raw bytes so a multi-byte character straddling the
            // prefix length can never cause a slicing panic.
            src.as_bytes()
                .get(..prefix.len())
                .is_some_and(|head| head.eq_ignore_ascii_case(prefix.as_bytes()))
        })
}
66
67/// Extract local (non-remote) asset references from HTML-like markup.
68///
69/// Returns the raw `src`/`href` strings (trimmed, remote URLs filtered). Shared
70/// between the HTML file parser and the JS/TS visitor's tagged template
71/// literal override so `` html`<script src="...">` `` in Hono/lit-html/htm
72/// layouts emits the same asset edges as a real `.html` file.
73pub(crate) fn collect_asset_refs(source: &str) -> Vec<String> {
74    let stripped = HTML_COMMENT_RE.replace_all(source, "");
75    let mut refs: Vec<String> = Vec::new();
76
77    for cap in SCRIPT_SRC_RE.captures_iter(&stripped) {
78        if let Some(m) = cap.get(1) {
79            let src = m.as_str().trim();
80            if !src.is_empty() && !is_remote_url(src) {
81                refs.push(src.to_string());
82            }
83        }
84    }
85
86    for cap in LINK_HREF_RE.captures_iter(&stripped) {
87        if let Some(m) = cap.get(2) {
88            let href = m.as_str().trim();
89            if !href.is_empty() && !is_remote_url(href) {
90                refs.push(href.to_string());
91            }
92        }
93    }
94    for cap in LINK_HREF_REVERSE_RE.captures_iter(&stripped) {
95        if let Some(m) = cap.get(1) {
96            let href = m.as_str().trim();
97            if !href.is_empty() && !is_remote_url(href) {
98                refs.push(href.to_string());
99            }
100        }
101    }
102
103    refs
104}
105
/// Test-only convenience wrapper around
/// [`parse_html_to_module_with_complexity`] that never requests template
/// complexity.
#[cfg(test)]
pub(crate) fn parse_html_to_module(file_id: FileId, source: &str, content_hash: u64) -> ModuleInfo {
    let need_complexity = false;
    parse_html_to_module_with_complexity(file_id, source, content_hash, need_complexity)
}
111
112/// Parse an HTML file and optionally compute Angular template complexity.
113pub(crate) fn parse_html_to_module_with_complexity(
114    file_id: FileId,
115    source: &str,
116    content_hash: u64,
117    need_complexity: bool,
118) -> ModuleInfo {
119    let suppressions = crate::suppress::parse_suppressions_from_source(source);
120
121    // Bare filenames (e.g., `src="app.js"`) are normalized to `./app.js` so
122    // the resolver doesn't misclassify them as npm packages.
123    let mut imports: Vec<ImportInfo> = collect_asset_refs(source)
124        .into_iter()
125        .map(|raw| ImportInfo {
126            source: normalize_asset_url(&raw),
127            imported_name: ImportedName::SideEffect,
128            local_name: String::new(),
129            is_type_only: false,
130            span: Span::default(),
131            source_span: Span::default(),
132        })
133        .collect();
134
135    // Deduplicate: the same asset may be referenced by both <script src> and
136    // <link rel="modulepreload" href> for the same path.
137    imports.sort_unstable_by(|a, b| a.source.cmp(&b.source));
138    imports.dedup_by(|a, b| a.source == b.source);
139
140    // Scan for Angular template syntax ({{ }}, [prop], (event), @if, etc.).
141    //
142    // Bare identifier refs (e.g. `title`, `dataService`, pipe names) are stored
143    // as `MemberAccess` entries with a sentinel object name so the analysis
144    // phase can credit them as members of the component class.
145    //
146    // Static member-access chains (`dataService.getTotal`) where `dataService`
147    // is an unresolved identifier are stored as regular (non-sentinel)
148    // `MemberAccess` entries. The analysis phase resolves these through the
149    // importing component's typed instance bindings (from
150    // `ClassHeritageInfo.instance_bindings`) to credit the target class's
151    // member as used.
152    let template_refs = angular::collect_angular_template_refs(source);
153    let mut member_accesses: Vec<MemberAccess> = template_refs
154        .identifiers
155        .into_iter()
156        .map(|name| MemberAccess {
157            object: ANGULAR_TPL_SENTINEL.to_string(),
158            member: name,
159        })
160        .collect();
161    member_accesses.extend(template_refs.member_accesses);
162
163    let complexity = if need_complexity {
164        crate::template_complexity::compute_angular_template_complexity(source)
165            .into_iter()
166            .collect()
167    } else {
168        Vec::new()
169    };
170
171    ModuleInfo {
172        file_id,
173        exports: Vec::new(),
174        imports,
175        re_exports: Vec::new(),
176        dynamic_imports: Vec::new(),
177        dynamic_import_patterns: Vec::new(),
178        require_calls: Vec::new(),
179        member_accesses,
180        whole_object_uses: Vec::new(),
181        has_cjs_exports: false,
182        content_hash,
183        suppressions,
184        unused_import_bindings: Vec::new(),
185        type_referenced_import_bindings: Vec::new(),
186        value_referenced_import_bindings: Vec::new(),
187        line_offsets: fallow_types::extract::compute_line_offsets(source),
188        complexity,
189        flag_uses: Vec::new(),
190        class_heritage: vec![],
191    }
192}
193
// Unit tests for the HTML helpers in this file: extension detection, remote
// URL filtering, script/link extraction through `parse_html_to_module`, and
// Angular template reference scanning.
#[cfg(test)]
mod tests {
    use super::*;

    // ── is_html_file ─────────────────────────────────────────────

    #[test]
    fn is_html_file_html() {
        assert!(is_html_file(Path::new("index.html")));
    }

    #[test]
    fn is_html_file_nested() {
        assert!(is_html_file(Path::new("pages/about.html")));
    }

    // Only the exact `html` extension is accepted; `htm` is not.
    #[test]
    fn is_html_file_rejects_htm() {
        assert!(!is_html_file(Path::new("index.htm")));
    }

    #[test]
    fn is_html_file_rejects_js() {
        assert!(!is_html_file(Path::new("app.js")));
    }

    #[test]
    fn is_html_file_rejects_ts() {
        assert!(!is_html_file(Path::new("app.ts")));
    }

    #[test]
    fn is_html_file_rejects_vue() {
        assert!(!is_html_file(Path::new("App.vue")));
    }

    // ── is_remote_url ────────────────────────────────────────────

    #[test]
    fn remote_url_http() {
        assert!(is_remote_url("http://example.com/script.js"));
    }

    #[test]
    fn remote_url_https() {
        assert!(is_remote_url("https://cdn.example.com/style.css"));
    }

    #[test]
    fn remote_url_protocol_relative() {
        assert!(is_remote_url("//cdn.example.com/lib.js"));
    }

    #[test]
    fn remote_url_data() {
        assert!(is_remote_url("data:text/javascript;base64,abc"));
    }

    #[test]
    fn local_relative_not_remote() {
        assert!(!is_remote_url("./src/entry.js"));
    }

    // A single leading slash is root-relative, not protocol-relative.
    #[test]
    fn local_root_relative_not_remote() {
        assert!(!is_remote_url("/src/entry.js"));
    }

    // ── parse_html_to_module: script src extraction ──────────────

    #[test]
    fn extracts_module_script_src() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<script type="module" src="./src/entry.js"></script>"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./src/entry.js");
    }

    #[test]
    fn extracts_plain_script_src() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<script src="./src/polyfills.js"></script>"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./src/polyfills.js");
    }

    #[test]
    fn extracts_multiple_scripts() {
        let info = parse_html_to_module(
            FileId(0),
            r#"
            <script type="module" src="./src/entry.js"></script>
            <script src="./src/polyfills.js"></script>
            "#,
            0,
        );
        assert_eq!(info.imports.len(), 2);
    }

    // A `<script>` without a `src` attribute produces no import.
    #[test]
    fn skips_inline_script() {
        let info = parse_html_to_module(FileId(0), r#"<script>console.log("hello");</script>"#, 0);
        assert!(info.imports.is_empty());
    }

    #[test]
    fn skips_remote_script() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<script src="https://cdn.example.com/lib.js"></script>"#,
            0,
        );
        assert!(info.imports.is_empty());
    }

    #[test]
    fn skips_protocol_relative_script() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<script src="//cdn.example.com/lib.js"></script>"#,
            0,
        );
        assert!(info.imports.is_empty());
    }

    // ── parse_html_to_module: link href extraction ───────────────

    #[test]
    fn extracts_stylesheet_link() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<link rel="stylesheet" href="./src/global.css" />"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./src/global.css");
    }

    #[test]
    fn extracts_modulepreload_link() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<link rel="modulepreload" href="./src/vendor.js" />"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./src/vendor.js");
    }

    // `href` before `rel` is handled by the reverse-order regex.
    #[test]
    fn extracts_link_with_reversed_attrs() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<link href="./src/global.css" rel="stylesheet" />"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./src/global.css");
    }

    // ── Bare asset references normalized to relative paths ──────
    // Regression tests for the same class of bug as #99 (Angular templateUrl).
    // Browsers resolve `src="app.js"` and `href="styles.css"` relative to the
    // HTML file, so emitting these as bare specifiers would misclassify them
    // as unlisted npm packages.

    #[test]
    fn bare_script_src_normalized_to_relative() {
        let info = parse_html_to_module(FileId(0), r#"<script src="app.js"></script>"#, 0);
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./app.js");
    }

    #[test]
    fn bare_module_script_src_normalized_to_relative() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<script type="module" src="main.ts"></script>"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./main.ts");
    }

    #[test]
    fn bare_stylesheet_link_href_normalized_to_relative() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<link rel="stylesheet" href="styles.css" />"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./styles.css");
    }

    #[test]
    fn bare_link_href_reversed_attrs_normalized_to_relative() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<link href="styles.css" rel="stylesheet" />"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./styles.css");
    }

    #[test]
    fn bare_modulepreload_link_href_normalized_to_relative() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<link rel="modulepreload" href="vendor.js" />"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./vendor.js");
    }

    #[test]
    fn bare_asset_with_subdir_normalized_to_relative() {
        let info = parse_html_to_module(FileId(0), r#"<script src="assets/app.js"></script>"#, 0);
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./assets/app.js");
    }

    #[test]
    fn root_absolute_script_src_unchanged() {
        // `/src/main.ts` is a web convention (Vite root-relative) and must
        // stay absolute so the resolver's HTML special case applies.
        let info = parse_html_to_module(FileId(0), r#"<script src="/src/main.ts"></script>"#, 0);
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "/src/main.ts");
    }

    #[test]
    fn parent_relative_script_src_unchanged() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<script src="../shared/vendor.js"></script>"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "../shared/vendor.js");
    }

    // Only `stylesheet` and `modulepreload` rel values are extracted.
    #[test]
    fn skips_preload_link() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<link rel="preload" href="./src/font.woff2" as="font" />"#,
            0,
        );
        assert!(info.imports.is_empty());
    }

    #[test]
    fn skips_icon_link() {
        let info =
            parse_html_to_module(FileId(0), r#"<link rel="icon" href="./favicon.ico" />"#, 0);
        assert!(info.imports.is_empty());
    }

    #[test]
    fn skips_remote_stylesheet() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<link rel="stylesheet" href="https://fonts.googleapis.com/css" />"#,
            0,
        );
        assert!(info.imports.is_empty());
    }

    // ── HTML comment stripping ───────────────────────────────────

    #[test]
    fn skips_commented_out_script() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<!-- <script src="./old.js"></script> -->
            <script src="./new.js"></script>"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./new.js");
    }

    #[test]
    fn skips_commented_out_link() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<!-- <link rel="stylesheet" href="./old.css" /> -->
            <link rel="stylesheet" href="./new.css" />"#,
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./new.css");
    }

    // ── Multi-line attributes ────────────────────────────────────

    #[test]
    fn handles_multiline_script_tag() {
        let info = parse_html_to_module(
            FileId(0),
            "<script\n  type=\"module\"\n  src=\"./src/entry.js\"\n></script>",
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./src/entry.js");
    }

    #[test]
    fn handles_multiline_link_tag() {
        let info = parse_html_to_module(
            FileId(0),
            "<link\n  rel=\"stylesheet\"\n  href=\"./src/global.css\"\n/>",
            0,
        );
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./src/global.css");
    }

    // ── Full HTML document ───────────────────────────────────────

    // The icon link is ignored, leaving the stylesheet and the module script.
    #[test]
    fn full_vite_html() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<!doctype html>
<html>
  <head>
    <link rel="stylesheet" href="./src/global.css" />
    <link rel="icon" href="/favicon.ico" />
  </head>
  <body>
    <div id="app"></div>
    <script type="module" src="./src/entry.js"></script>
  </body>
</html>"#,
            0,
        );
        assert_eq!(info.imports.len(), 2);
        let sources: Vec<&str> = info.imports.iter().map(|i| i.source.as_str()).collect();
        assert!(sources.contains(&"./src/global.css"));
        assert!(sources.contains(&"./src/entry.js"));
    }

    // ── Edge cases ───────────────────────────────────────────────

    #[test]
    fn empty_html() {
        let info = parse_html_to_module(FileId(0), "", 0);
        assert!(info.imports.is_empty());
    }

    #[test]
    fn html_with_no_assets() {
        let info = parse_html_to_module(
            FileId(0),
            r"<!doctype html><html><body><h1>Hello</h1></body></html>",
            0,
        );
        assert!(info.imports.is_empty());
    }

    #[test]
    fn single_quoted_attributes() {
        let info = parse_html_to_module(FileId(0), r"<script src='./src/entry.js'></script>", 0);
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, "./src/entry.js");
    }

    // Every HTML-derived import is a side-effect import with no local binding.
    #[test]
    fn all_imports_are_side_effect() {
        let info = parse_html_to_module(
            FileId(0),
            r#"<script src="./entry.js"></script>
            <link rel="stylesheet" href="./style.css" />"#,
            0,
        );
        for imp in &info.imports {
            assert!(matches!(imp.imported_name, ImportedName::SideEffect));
            assert!(imp.local_name.is_empty());
            assert!(!imp.is_type_only);
        }
    }

    #[test]
    fn suppression_comments_extracted() {
        let info = parse_html_to_module(
            FileId(0),
            "<!-- fallow-ignore-file -->\n<script src=\"./entry.js\"></script>",
            0,
        );
        // HTML comments use <!-- --> not //, so suppression parsing
        // from source text won't find standard JS-style comments.
        // This is expected — HTML suppression is not supported.
        // NOTE(review): despite its name, this test only checks the import
        // count and never inspects `info.suppressions` — confirm that is the
        // intended coverage.
        assert_eq!(info.imports.len(), 1);
    }

    // ── Angular template scanning ──────────────────────────────

    // Interpolations, property bindings and event bindings should all surface
    // as sentinel-object member accesses.
    #[test]
    fn angular_template_extracts_member_refs() {
        let info = parse_html_to_module(
            FileId(0),
            "<h1>{{ title() }}</h1>\n\
             <p [class.highlighted]=\"isHighlighted\">{{ greeting() }}</p>\n\
             <button (click)=\"onButtonClick()\">Toggle</button>",
            0,
        );
        let names: rustc_hash::FxHashSet<&str> = info
            .member_accesses
            .iter()
            .filter(|a| a.object == ANGULAR_TPL_SENTINEL)
            .map(|a| a.member.as_str())
            .collect();
        assert!(names.contains("title"), "should contain 'title'");
        assert!(
            names.contains("isHighlighted"),
            "should contain 'isHighlighted'"
        );
        assert!(names.contains("greeting"), "should contain 'greeting'");
        assert!(
            names.contains("onButtonClick"),
            "should contain 'onButtonClick'"
        );
    }

    // Markup without Angular syntax yields no member accesses at all.
    #[test]
    fn plain_html_no_angular_refs() {
        let info = parse_html_to_module(
            FileId(0),
            "<!doctype html><html><body><h1>Hello</h1></body></html>",
            0,
        );
        assert!(info.member_accesses.is_empty());
    }
}
633            0,
634        );
635        assert!(info.member_accesses.is_empty());
636    }
637}