Skip to main content

fallow_extract/
html.rs

1//! HTML file parsing for script, stylesheet, and Angular template references.
2//!
3//! Extracts `<script src="...">` and `<link rel="stylesheet" href="...">` references
4//! from HTML files, creating graph edges so that referenced JS/CSS assets (and their
5//! transitive imports) are reachable from the HTML entry point.
6//!
7//! Also scans for Angular template syntax (`{{ }}`, `[prop]`, `(event)`, `@if`, etc.)
8//! and stores referenced identifiers as `MemberAccess` entries with a sentinel object,
9//! enabling the analysis phase to credit component class members used in external templates.
10
11use std::path::Path;
12use std::sync::LazyLock;
13
14use oxc_span::Span;
15
16use crate::asset_url::normalize_asset_url;
17use crate::sfc_template::angular::{self, ANGULAR_TPL_SENTINEL};
18use crate::{ImportInfo, ImportedName, MemberAccess, ModuleInfo};
19use fallow_types::discover::FileId;
20
21/// Regex to match HTML comments (`<!-- ... -->`) for stripping before extraction.
22static HTML_COMMENT_RE: LazyLock<regex::Regex> =
23    LazyLock::new(|| regex::Regex::new(r"(?s)<!--.*?-->").expect("valid regex"));
24
25/// Regex to extract `src` attribute from `<script>` tags.
26/// Matches both `<script src="...">` and `<script type="module" src="...">`.
27/// Uses `(?s)` so `.` matches newlines (multi-line attributes).
28static SCRIPT_SRC_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
29    regex::Regex::new(r#"(?si)<script\b(?:[^>"']|"[^"]*"|'[^']*')*?\bsrc\s*=\s*["']([^"']+)["']"#)
30        .expect("valid regex")
31});
32
33/// Regex to extract `href` attribute from `<link>` tags with `rel="stylesheet"` or
34/// `rel="modulepreload"`.
35/// Handles attributes in any order (rel before or after href).
36static LINK_HREF_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
37    regex::Regex::new(
38        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["'](?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["']"#,
39    )
40    .expect("valid regex")
41});
42
43/// Regex for the reverse attribute order: href before rel.
44static LINK_HREF_REVERSE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
45    regex::Regex::new(
46        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["'](?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["']"#,
47    )
48    .expect("valid regex")
49});
50
/// Reports whether `path` has exactly the extension `html`.
///
/// The comparison is case-sensitive, and paths whose extension is missing or not
/// valid UTF-8 are rejected.
// Keep in sync with fallow_core::analyze::predicates::is_html_file (crate boundary prevents sharing)
pub(crate) fn is_html_file(path: &Path) -> bool {
    let extension = path.extension().and_then(|raw| raw.to_str());
    matches!(extension, Some("html"))
}
58
/// Returns true if an HTML asset reference is a remote URL that should be skipped.
///
/// A reference counts as remote when it starts with `http://`, `https://`, the
/// protocol-relative `//`, or `data:`. URI schemes are case-insensitive, so the
/// prefixes are compared ignoring ASCII case: `HTTPS://cdn.example.com/x.js` is
/// remote as well and must not be emitted as a local import.
pub(crate) fn is_remote_url(src: &str) -> bool {
    const REMOTE_PREFIXES: [&str; 4] = ["http://", "https://", "//", "data:"];

    REMOTE_PREFIXES.iter().any(|prefix| {
        // `str::get` returns `None` when `src` is shorter than the prefix or the cut
        // would land inside a multi-byte character, so this cannot panic.
        src.get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix))
    })
}
66
67/// Extract local (non-remote) asset references from HTML-like markup.
68///
69/// Returns the raw `src`/`href` strings (trimmed, remote URLs filtered). Shared
70/// between the HTML file parser and the JS/TS visitor's tagged template
71/// literal override so `` html`<script src="...">` `` in Hono/lit-html/htm
72/// layouts emits the same asset edges as a real `.html` file.
73pub(crate) fn collect_asset_refs(source: &str) -> Vec<String> {
74    let stripped = HTML_COMMENT_RE.replace_all(source, "");
75    let mut refs: Vec<String> = Vec::new();
76
77    for cap in SCRIPT_SRC_RE.captures_iter(&stripped) {
78        if let Some(m) = cap.get(1) {
79            let src = m.as_str().trim();
80            if !src.is_empty() && !is_remote_url(src) {
81                refs.push(src.to_string());
82            }
83        }
84    }
85
86    for cap in LINK_HREF_RE.captures_iter(&stripped) {
87        if let Some(m) = cap.get(2) {
88            let href = m.as_str().trim();
89            if !href.is_empty() && !is_remote_url(href) {
90                refs.push(href.to_string());
91            }
92        }
93    }
94    for cap in LINK_HREF_REVERSE_RE.captures_iter(&stripped) {
95        if let Some(m) = cap.get(1) {
96            let href = m.as_str().trim();
97            if !href.is_empty() && !is_remote_url(href) {
98                refs.push(href.to_string());
99            }
100        }
101    }
102
103    refs
104}
105
/// Test-only shorthand for [`parse_html_to_module_with_complexity`] that never
/// requests Angular template complexity.
#[cfg(test)]
pub(crate) fn parse_html_to_module(file_id: FileId, source: &str, content_hash: u64) -> ModuleInfo {
    let need_complexity = false;
    parse_html_to_module_with_complexity(file_id, source, content_hash, need_complexity)
}
111
112/// Parse an HTML file and optionally compute Angular template complexity.
113pub(crate) fn parse_html_to_module_with_complexity(
114    file_id: FileId,
115    source: &str,
116    content_hash: u64,
117    need_complexity: bool,
118) -> ModuleInfo {
119    let suppressions = crate::suppress::parse_suppressions_from_source(source);
120
121    // Bare filenames (e.g., `src="app.js"`) are normalized to `./app.js` so
122    // the resolver doesn't misclassify them as npm packages.
123    let mut imports: Vec<ImportInfo> = collect_asset_refs(source)
124        .into_iter()
125        .map(|raw| ImportInfo {
126            source: normalize_asset_url(&raw),
127            imported_name: ImportedName::SideEffect,
128            local_name: String::new(),
129            is_type_only: false,
130            from_style: false,
131            span: Span::default(),
132            source_span: Span::default(),
133        })
134        .collect();
135
136    // Deduplicate: the same asset may be referenced by both <script src> and
137    // <link rel="modulepreload" href> for the same path.
138    imports.sort_unstable_by(|a, b| a.source.cmp(&b.source));
139    imports.dedup_by(|a, b| a.source == b.source);
140
141    // Scan for Angular template syntax ({{ }}, [prop], (event), @if, etc.).
142    //
143    // Bare identifier refs (e.g. `title`, `dataService`, pipe names) are stored
144    // as `MemberAccess` entries with a sentinel object name so the analysis
145    // phase can credit them as members of the component class.
146    //
147    // Static member-access chains (`dataService.getTotal`) where `dataService`
148    // is an unresolved identifier are stored as regular (non-sentinel)
149    // `MemberAccess` entries. The analysis phase resolves these through the
150    // importing component's typed instance bindings (from
151    // `ClassHeritageInfo.instance_bindings`) to credit the target class's
152    // member as used.
153    let template_refs = angular::collect_angular_template_refs(source);
154    let mut member_accesses: Vec<MemberAccess> = template_refs
155        .identifiers
156        .into_iter()
157        .map(|name| MemberAccess {
158            object: ANGULAR_TPL_SENTINEL.to_string(),
159            member: name,
160        })
161        .collect();
162    member_accesses.extend(template_refs.member_accesses);
163
164    let complexity = if need_complexity {
165        crate::template_complexity::compute_angular_template_complexity(source)
166            .into_iter()
167            .collect()
168    } else {
169        Vec::new()
170    };
171
172    ModuleInfo {
173        file_id,
174        exports: Vec::new(),
175        imports,
176        re_exports: Vec::new(),
177        dynamic_imports: Vec::new(),
178        dynamic_import_patterns: Vec::new(),
179        require_calls: Vec::new(),
180        member_accesses,
181        whole_object_uses: Vec::new(),
182        has_cjs_exports: false,
183        content_hash,
184        suppressions,
185        unused_import_bindings: Vec::new(),
186        type_referenced_import_bindings: Vec::new(),
187        value_referenced_import_bindings: Vec::new(),
188        line_offsets: fallow_types::extract::compute_line_offsets(source),
189        complexity,
190        flag_uses: Vec::new(),
191        class_heritage: vec![],
192        local_type_declarations: Vec::new(),
193        public_signature_type_references: Vec::new(),
194        namespace_object_aliases: Vec::new(),
195    }
196}
197
#[cfg(test)]
mod tests {
    use super::*;

    // ── Shared helpers ───────────────────────────────────────────

    /// Parses `html` as file 0 with a zero content hash.
    fn parse(html: &str) -> ModuleInfo {
        parse_html_to_module(FileId(0), html, 0)
    }

    /// Asserts that `html` yields exactly one import whose source is `expected`.
    #[track_caller]
    fn assert_sole_import(html: &str, expected: &str) {
        let module = parse(html);
        assert_eq!(module.imports.len(), 1);
        assert_eq!(module.imports[0].source, expected);
    }

    /// Asserts that `html` yields no imports at all.
    #[track_caller]
    fn assert_no_imports(html: &str) {
        let module = parse(html);
        assert!(module.imports.is_empty());
    }

    /// Runs `is_html_file` on a string path.
    fn html_path(path: &str) -> bool {
        is_html_file(Path::new(path))
    }

    // ── is_html_file ─────────────────────────────────────────────

    #[test]
    fn is_html_file_html() {
        assert!(html_path("index.html"));
    }

    #[test]
    fn is_html_file_nested() {
        assert!(html_path("pages/about.html"));
    }

    #[test]
    fn is_html_file_rejects_htm() {
        assert!(!html_path("index.htm"));
    }

    #[test]
    fn is_html_file_rejects_js() {
        assert!(!html_path("app.js"));
    }

    #[test]
    fn is_html_file_rejects_ts() {
        assert!(!html_path("app.ts"));
    }

    #[test]
    fn is_html_file_rejects_vue() {
        assert!(!html_path("App.vue"));
    }

    // ── is_remote_url ────────────────────────────────────────────

    #[test]
    fn remote_url_http() {
        let url = "http://example.com/script.js";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_https() {
        let url = "https://cdn.example.com/style.css";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_protocol_relative() {
        let url = "//cdn.example.com/lib.js";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_data() {
        let url = "data:text/javascript;base64,abc";
        assert!(is_remote_url(url));
    }

    #[test]
    fn local_relative_not_remote() {
        let url = "./src/entry.js";
        assert!(!is_remote_url(url));
    }

    #[test]
    fn local_root_relative_not_remote() {
        let url = "/src/entry.js";
        assert!(!is_remote_url(url));
    }

    // ── parse_html_to_module: script src extraction ──────────────

    #[test]
    fn extracts_module_script_src() {
        assert_sole_import(
            r#"<script type="module" src="./src/entry.js"></script>"#,
            "./src/entry.js",
        );
    }

    #[test]
    fn extracts_plain_script_src() {
        assert_sole_import(
            r#"<script src="./src/polyfills.js"></script>"#,
            "./src/polyfills.js",
        );
    }

    #[test]
    fn extracts_multiple_scripts() {
        let module = parse(
            r#"
            <script type="module" src="./src/entry.js"></script>
            <script src="./src/polyfills.js"></script>
            "#,
        );
        assert_eq!(module.imports.len(), 2);
    }

    #[test]
    fn skips_inline_script() {
        assert_no_imports(r#"<script>console.log("hello");</script>"#);
    }

    #[test]
    fn skips_remote_script() {
        assert_no_imports(r#"<script src="https://cdn.example.com/lib.js"></script>"#);
    }

    #[test]
    fn skips_protocol_relative_script() {
        assert_no_imports(r#"<script src="//cdn.example.com/lib.js"></script>"#);
    }

    // ── parse_html_to_module: link href extraction ───────────────

    #[test]
    fn extracts_stylesheet_link() {
        assert_sole_import(
            r#"<link rel="stylesheet" href="./src/global.css" />"#,
            "./src/global.css",
        );
    }

    #[test]
    fn extracts_modulepreload_link() {
        assert_sole_import(
            r#"<link rel="modulepreload" href="./src/vendor.js" />"#,
            "./src/vendor.js",
        );
    }

    #[test]
    fn extracts_link_with_reversed_attrs() {
        assert_sole_import(
            r#"<link href="./src/global.css" rel="stylesheet" />"#,
            "./src/global.css",
        );
    }

    // ── Bare asset references normalized to relative paths ──────
    // Regression tests for the same class of bug as #99 (Angular templateUrl).
    // Browsers resolve `src="app.js"` and `href="styles.css"` relative to the
    // HTML file, so emitting these as bare specifiers would misclassify them
    // as unlisted npm packages.

    #[test]
    fn bare_script_src_normalized_to_relative() {
        assert_sole_import(r#"<script src="app.js"></script>"#, "./app.js");
    }

    #[test]
    fn bare_module_script_src_normalized_to_relative() {
        assert_sole_import(
            r#"<script type="module" src="main.ts"></script>"#,
            "./main.ts",
        );
    }

    #[test]
    fn bare_stylesheet_link_href_normalized_to_relative() {
        assert_sole_import(
            r#"<link rel="stylesheet" href="styles.css" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_link_href_reversed_attrs_normalized_to_relative() {
        assert_sole_import(
            r#"<link href="styles.css" rel="stylesheet" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_modulepreload_link_href_normalized_to_relative() {
        assert_sole_import(
            r#"<link rel="modulepreload" href="vendor.js" />"#,
            "./vendor.js",
        );
    }

    #[test]
    fn bare_asset_with_subdir_normalized_to_relative() {
        assert_sole_import(
            r#"<script src="assets/app.js"></script>"#,
            "./assets/app.js",
        );
    }

    #[test]
    fn root_absolute_script_src_unchanged() {
        // `/src/main.ts` is a web convention (Vite root-relative) and must
        // stay absolute so the resolver's HTML special case applies.
        assert_sole_import(r#"<script src="/src/main.ts"></script>"#, "/src/main.ts");
    }

    #[test]
    fn parent_relative_script_src_unchanged() {
        assert_sole_import(
            r#"<script src="../shared/vendor.js"></script>"#,
            "../shared/vendor.js",
        );
    }

    #[test]
    fn skips_preload_link() {
        assert_no_imports(r#"<link rel="preload" href="./src/font.woff2" as="font" />"#);
    }

    #[test]
    fn skips_icon_link() {
        assert_no_imports(r#"<link rel="icon" href="./favicon.ico" />"#);
    }

    #[test]
    fn skips_remote_stylesheet() {
        assert_no_imports(r#"<link rel="stylesheet" href="https://fonts.googleapis.com/css" />"#);
    }

    // ── HTML comment stripping ───────────────────────────────────

    #[test]
    fn skips_commented_out_script() {
        assert_sole_import(
            r#"<!-- <script src="./old.js"></script> -->
            <script src="./new.js"></script>"#,
            "./new.js",
        );
    }

    #[test]
    fn skips_commented_out_link() {
        assert_sole_import(
            r#"<!-- <link rel="stylesheet" href="./old.css" /> -->
            <link rel="stylesheet" href="./new.css" />"#,
            "./new.css",
        );
    }

    // ── Multi-line attributes ────────────────────────────────────

    #[test]
    fn handles_multiline_script_tag() {
        assert_sole_import(
            "<script\n  type=\"module\"\n  src=\"./src/entry.js\"\n></script>",
            "./src/entry.js",
        );
    }

    #[test]
    fn handles_multiline_link_tag() {
        assert_sole_import(
            "<link\n  rel=\"stylesheet\"\n  href=\"./src/global.css\"\n/>",
            "./src/global.css",
        );
    }

    // ── Full HTML document ───────────────────────────────────────

    #[test]
    fn full_vite_html() {
        let module = parse(
            r#"<!doctype html>
<html>
  <head>
    <link rel="stylesheet" href="./src/global.css" />
    <link rel="icon" href="/favicon.ico" />
  </head>
  <body>
    <div id="app"></div>
    <script type="module" src="./src/entry.js"></script>
  </body>
</html>"#,
        );
        assert_eq!(module.imports.len(), 2);
        let found: Vec<&str> = module
            .imports
            .iter()
            .map(|import| import.source.as_str())
            .collect();
        assert!(found.contains(&"./src/global.css"));
        assert!(found.contains(&"./src/entry.js"));
    }

    // ── Edge cases ───────────────────────────────────────────────

    #[test]
    fn empty_html() {
        assert_no_imports("");
    }

    #[test]
    fn html_with_no_assets() {
        assert_no_imports(r"<!doctype html><html><body><h1>Hello</h1></body></html>");
    }

    #[test]
    fn single_quoted_attributes() {
        assert_sole_import(
            r"<script src='./src/entry.js'></script>",
            "./src/entry.js",
        );
    }

    #[test]
    fn all_imports_are_side_effect() {
        let module = parse(
            r#"<script src="./entry.js"></script>
            <link rel="stylesheet" href="./style.css" />"#,
        );
        for import in &module.imports {
            assert!(matches!(import.imported_name, ImportedName::SideEffect));
            assert!(import.local_name.is_empty());
            assert!(!import.is_type_only);
        }
    }

    #[test]
    fn suppression_comments_extracted() {
        let module = parse("<!-- fallow-ignore-file -->\n<script src=\"./entry.js\"></script>");
        // HTML comments use <!-- --> not //, so suppression parsing
        // from source text won't find standard JS-style comments.
        // This is expected — HTML suppression is not supported.
        assert_eq!(module.imports.len(), 1);
    }

    // ── Angular template scanning ──────────────────────────────

    #[test]
    fn angular_template_extracts_member_refs() {
        let module = parse(
            "<h1>{{ title() }}</h1>\n\
             <p [class.highlighted]=\"isHighlighted\">{{ greeting() }}</p>\n\
             <button (click)=\"onButtonClick()\">Toggle</button>",
        );
        // Only sentinel-object entries represent bare template identifiers.
        let names: rustc_hash::FxHashSet<&str> = module
            .member_accesses
            .iter()
            .filter(|access| access.object == ANGULAR_TPL_SENTINEL)
            .map(|access| access.member.as_str())
            .collect();
        for expected in ["title", "isHighlighted", "greeting", "onButtonClick"] {
            assert!(names.contains(expected), "should contain '{expected}'");
        }
    }

    #[test]
    fn plain_html_no_angular_refs() {
        let module = parse("<!doctype html><html><body><h1>Hello</h1></body></html>");
        assert!(module.member_accesses.is_empty());
    }
}