Skip to main content

fallow_extract/
html.rs

1//! HTML file parsing for script, stylesheet, and Angular template references.
2//!
//! Extracts `<script src="...">`, `<link rel="stylesheet" href="...">`, and
//! `<link rel="modulepreload" href="...">` references from HTML files, creating graph
//! edges so that referenced JS/CSS assets (and their transitive imports) are reachable
//! from the HTML entry point.
6//!
7//! Also scans for Angular template syntax (`{{ }}`, `[prop]`, `(event)`, `@if`, etc.)
8//! and stores referenced identifiers as `MemberAccess` entries with a sentinel object,
9//! enabling the analysis phase to credit component class members used in external templates.
10
11use std::path::Path;
12use std::sync::LazyLock;
13
14use oxc_span::Span;
15
16use crate::asset_url::normalize_asset_url;
17use crate::sfc_template::angular::{self, ANGULAR_TPL_SENTINEL};
18use crate::{ImportInfo, ImportedName, MemberAccess, ModuleInfo};
19use fallow_types::discover::FileId;
20
21/// Regex to match HTML comments (`<!-- ... -->`) for stripping before extraction.
22static HTML_COMMENT_RE: LazyLock<regex::Regex> =
23    LazyLock::new(|| crate::static_regex(r"(?s)<!--.*?-->"));
24
25/// Regex to extract `src` attribute from `<script>` tags.
26/// Matches both `<script src="...">` and `<script type="module" src="...">`.
27/// Uses `(?s)` so `.` matches newlines (multi-line attributes).
28static SCRIPT_SRC_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
29    crate::static_regex(r#"(?si)<script\b(?:[^>"']|"[^"]*"|'[^']*')*?\bsrc\s*=\s*["']([^"']+)["']"#)
30});
31
32/// Regex to extract `href` attribute from `<link>` tags with `rel="stylesheet"` or
33/// `rel="modulepreload"`.
34/// Handles attributes in any order (rel before or after href).
35static LINK_HREF_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
36    crate::static_regex(
37        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["'](?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["']"#,
38    )
39});
40
41/// Regex for the reverse attribute order: href before rel.
42static LINK_HREF_REVERSE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
43    crate::static_regex(
44        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["'](?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["']"#,
45    )
46});
47
/// Returns `true` when `path` has the exact extension `html`.
///
/// The comparison is case-sensitive, so `index.HTML` and `index.htm` are both
/// rejected. Paths with no extension or a non-UTF-8 extension return `false`.
pub(crate) fn is_html_file(path: &Path) -> bool {
    matches!(path.extension().and_then(|ext| ext.to_str()), Some("html"))
}
54
/// Returns true if an HTML asset reference is a remote URL that should be skipped.
///
/// A reference counts as remote when it is protocol-relative (`//host/...`)
/// or starts with one of the `http://`, `https://`, or `data:` prefixes.
/// URI schemes are case-insensitive, so the prefix is compared ignoring ASCII
/// case: `HTTPS://cdn.example.com/x.js` is remote just like its lowercase
/// form. No trimming is done here; callers pass an already-trimmed value.
pub(crate) fn is_remote_url(src: &str) -> bool {
    const REMOTE_PREFIXES: [&str; 3] = ["http://", "https://", "data:"];

    src.starts_with("//")
        || REMOTE_PREFIXES.iter().any(|prefix| {
            // `get` yields `None` when `src` is shorter than the prefix or the
            // cut would split a multi-byte character; neither can be a match.
            src.get(..prefix.len())
                .is_some_and(|head| head.eq_ignore_ascii_case(prefix))
        })
}
62
/// Detects `src` / `href` values that still contain a build-time template
/// placeholder and therefore can never resolve to a real file.
///
/// Such values are skipped at extraction time so they do not enter the import
/// graph as unresolvable specifiers. Two markers are recognised anywhere in
/// the value:
///
/// - `{{`: the opening of a `{{ ... }}` interpolation, as used by Handlebars
///   (Ember `index.html`'s `{{rootURL}}`, `{{config.assetsPath}}`), Mustache
///   (Jekyll, Hugo), Jinja2 (Pelican / 11ty plugins), and Vue / Angular
///   interpolation that has leaked into a checked-in HTML scaffold.
/// - `###`: ember-cli blueprint scaffold placeholders (`###APPNAME###`,
///   `###DUMMY###`) checked in as addon-fixture templates.
///
/// The check is deliberately generic across frameworks rather than gated on a
/// plugin. Returns `true` when `value` contains either marker.
pub(crate) fn is_template_placeholder(value: &str) -> bool {
    const PLACEHOLDER_MARKERS: [&str; 2] = ["{{", "###"];

    PLACEHOLDER_MARKERS
        .iter()
        .any(|marker| value.contains(marker))
}
80
81/// Extract local (non-remote) asset references from HTML-like markup.
82///
83/// Returns the raw `src`/`href` strings (trimmed, remote URLs filtered). Shared
84/// between the HTML file parser and the JS/TS visitor's tagged template
85/// literal override so `` html`<script src="...">` `` in Hono/lit-html/htm
86/// layouts emits the same asset edges as a real `.html` file.
87pub(crate) fn collect_asset_refs(source: &str) -> Vec<String> {
88    let stripped = HTML_COMMENT_RE.replace_all(source, "");
89    let mut refs: Vec<String> = Vec::new();
90
91    for cap in SCRIPT_SRC_RE.captures_iter(&stripped) {
92        if let Some(m) = cap.get(1) {
93            let src = m.as_str().trim();
94            if !src.is_empty() && !is_remote_url(src) && !is_template_placeholder(src) {
95                refs.push(src.to_string());
96            }
97        }
98    }
99
100    for cap in LINK_HREF_RE.captures_iter(&stripped) {
101        if let Some(m) = cap.get(2) {
102            let href = m.as_str().trim();
103            if !href.is_empty() && !is_remote_url(href) && !is_template_placeholder(href) {
104                refs.push(href.to_string());
105            }
106        }
107    }
108    for cap in LINK_HREF_REVERSE_RE.captures_iter(&stripped) {
109        if let Some(m) = cap.get(1) {
110            let href = m.as_str().trim();
111            if !href.is_empty() && !is_remote_url(href) && !is_template_placeholder(href) {
112                refs.push(href.to_string());
113            }
114        }
115    }
116
117    refs
118}
119
/// Test-only convenience wrapper around
/// [`parse_html_to_module_with_complexity`] that never requests Angular
/// template complexity.
#[cfg(test)]
pub(crate) fn parse_html_to_module(file_id: FileId, source: &str, content_hash: u64) -> ModuleInfo {
    let need_complexity = false;
    parse_html_to_module_with_complexity(file_id, source, content_hash, need_complexity)
}
125
126/// Parse an HTML file and optionally compute Angular template complexity.
127pub(crate) fn parse_html_to_module_with_complexity(
128    file_id: FileId,
129    source: &str,
130    content_hash: u64,
131    need_complexity: bool,
132) -> ModuleInfo {
133    let parsed_suppressions = crate::suppress::parse_suppressions_from_source(source);
134
135    let mut imports: Vec<ImportInfo> = collect_asset_refs(source)
136        .into_iter()
137        .map(|raw| ImportInfo {
138            source: normalize_asset_url(&raw),
139            imported_name: ImportedName::SideEffect,
140            local_name: String::new(),
141            is_type_only: false,
142            from_style: false,
143            span: Span::default(),
144            source_span: Span::default(),
145        })
146        .collect();
147
148    imports.sort_unstable_by(|a, b| a.source.cmp(&b.source));
149    imports.dedup_by(|a, b| a.source == b.source);
150
151    let template_refs = angular::collect_angular_template_refs(source);
152    let mut member_accesses: Vec<MemberAccess> = template_refs
153        .identifiers
154        .into_iter()
155        .map(|name| MemberAccess {
156            object: ANGULAR_TPL_SENTINEL.to_string(),
157            member: name,
158        })
159        .collect();
160    member_accesses.extend(template_refs.member_accesses);
161
162    let complexity = if need_complexity {
163        crate::template_complexity::compute_angular_template_complexity(source)
164            .into_iter()
165            .collect()
166    } else {
167        Vec::new()
168    };
169
170    ModuleInfo {
171        file_id,
172        exports: Vec::new(),
173        imports,
174        re_exports: Vec::new(),
175        dynamic_imports: Vec::new(),
176        dynamic_import_patterns: Vec::new(),
177        require_calls: Vec::new(),
178        member_accesses,
179        whole_object_uses: Vec::new(),
180        has_cjs_exports: false,
181        has_angular_component_template_url: false,
182        content_hash,
183        suppressions: parsed_suppressions.suppressions,
184        unknown_suppression_kinds: parsed_suppressions.unknown_kinds,
185        unused_import_bindings: Vec::new(),
186        type_referenced_import_bindings: Vec::new(),
187        value_referenced_import_bindings: Vec::new(),
188        line_offsets: fallow_types::extract::compute_line_offsets(source),
189        complexity,
190        flag_uses: Vec::new(),
191        class_heritage: vec![],
192        injection_tokens: vec![],
193        local_type_declarations: Vec::new(),
194        public_signature_type_references: Vec::new(),
195        namespace_object_aliases: Vec::new(),
196        iconify_prefixes: Vec::new(),
197        auto_import_candidates: Vec::new(),
198        directives: Vec::new(),
199        security_sinks: Vec::new(),
200        security_sinks_skipped: 0,
201        tainted_bindings: Vec::new(),
202        sanitized_sink_args: Vec::new(),
203    }
204}
205
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `html` as file 0 with a zero content hash.
    fn parse(html: &str) -> ModuleInfo {
        parse_html_to_module(FileId(0), html, 0)
    }

    /// Assert that `html` yields exactly one import whose specifier is `expected`.
    fn assert_single_import(html: &str, expected: &str) {
        let info = parse(html);
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, expected);
    }

    /// Assert that `html` yields no imports at all.
    fn assert_no_imports(html: &str) {
        assert!(parse(html).imports.is_empty());
    }

    // ---- is_html_file ----

    #[test]
    fn is_html_file_html() {
        assert!(is_html_file(Path::new("index.html")));
    }

    #[test]
    fn is_html_file_nested() {
        assert!(is_html_file(Path::new("pages/about.html")));
    }

    #[test]
    fn is_html_file_rejects_htm() {
        assert!(!is_html_file(Path::new("index.htm")));
    }

    #[test]
    fn is_html_file_rejects_js() {
        assert!(!is_html_file(Path::new("app.js")));
    }

    #[test]
    fn is_html_file_rejects_ts() {
        assert!(!is_html_file(Path::new("app.ts")));
    }

    #[test]
    fn is_html_file_rejects_vue() {
        assert!(!is_html_file(Path::new("App.vue")));
    }

    // ---- is_remote_url ----

    #[test]
    fn remote_url_http() {
        assert!(is_remote_url("http://example.com/script.js"));
    }

    #[test]
    fn remote_url_https() {
        assert!(is_remote_url("https://cdn.example.com/style.css"));
    }

    #[test]
    fn remote_url_protocol_relative() {
        assert!(is_remote_url("//cdn.example.com/lib.js"));
    }

    #[test]
    fn remote_url_data() {
        assert!(is_remote_url("data:text/javascript;base64,abc"));
    }

    #[test]
    fn local_relative_not_remote() {
        assert!(!is_remote_url("./src/entry.js"));
    }

    #[test]
    fn local_root_relative_not_remote() {
        assert!(!is_remote_url("/src/entry.js"));
    }

    // ---- script extraction ----

    #[test]
    fn extracts_module_script_src() {
        assert_single_import(
            r#"<script type="module" src="./src/entry.js"></script>"#,
            "./src/entry.js",
        );
    }

    #[test]
    fn extracts_plain_script_src() {
        assert_single_import(
            r#"<script src="./src/polyfills.js"></script>"#,
            "./src/polyfills.js",
        );
    }

    #[test]
    fn extracts_multiple_scripts() {
        let html = r#"
            <script type="module" src="./src/entry.js"></script>
            <script src="./src/polyfills.js"></script>
            "#;
        assert_eq!(parse(html).imports.len(), 2);
    }

    #[test]
    fn skips_inline_script() {
        assert_no_imports(r#"<script>console.log("hello");</script>"#);
    }

    // ---- template placeholders ----

    #[test]
    fn skips_handlebars_placeholder_in_script_src() {
        let info = parse(
            r#"<script src="{{rootURL}}assets/app.js"></script>
               <script src="{{config.assetsPath}}vendor.js"></script>"#,
        );
        assert!(
            info.imports.is_empty(),
            "Handlebars-placeholder script srcs should not enter the import graph; got {:?}",
            info.imports
        );
    }

    #[test]
    fn skips_handlebars_placeholder_in_link_href() {
        assert_no_imports(r#"<link rel="stylesheet" href="{{rootURL}}assets/app.css">"#);
    }

    #[test]
    fn skips_ember_cli_blueprint_placeholder() {
        assert_no_imports(r####"<script src="###APPNAME###/app.js"></script>"####);
    }

    #[test]
    fn extracts_normal_specifier_alongside_placeholders() {
        assert_single_import(
            r#"<script src="{{rootURL}}assets/app.js"></script>
               <script src="./src/main.ts"></script>"#,
            "./src/main.ts",
        );
    }

    // ---- remote references ----

    #[test]
    fn skips_remote_script() {
        assert_no_imports(r#"<script src="https://cdn.example.com/lib.js"></script>"#);
    }

    #[test]
    fn skips_protocol_relative_script() {
        assert_no_imports(r#"<script src="//cdn.example.com/lib.js"></script>"#);
    }

    // ---- link extraction ----

    #[test]
    fn extracts_stylesheet_link() {
        assert_single_import(
            r#"<link rel="stylesheet" href="./src/global.css" />"#,
            "./src/global.css",
        );
    }

    #[test]
    fn extracts_modulepreload_link() {
        assert_single_import(
            r#"<link rel="modulepreload" href="./src/vendor.js" />"#,
            "./src/vendor.js",
        );
    }

    #[test]
    fn extracts_link_with_reversed_attrs() {
        assert_single_import(
            r#"<link href="./src/global.css" rel="stylesheet" />"#,
            "./src/global.css",
        );
    }

    // ---- specifier normalization ----

    #[test]
    fn bare_script_src_normalized_to_relative() {
        assert_single_import(r#"<script src="app.js"></script>"#, "./app.js");
    }

    #[test]
    fn bare_module_script_src_normalized_to_relative() {
        assert_single_import(
            r#"<script type="module" src="main.ts"></script>"#,
            "./main.ts",
        );
    }

    #[test]
    fn bare_stylesheet_link_href_normalized_to_relative() {
        assert_single_import(
            r#"<link rel="stylesheet" href="styles.css" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_link_href_reversed_attrs_normalized_to_relative() {
        assert_single_import(
            r#"<link href="styles.css" rel="stylesheet" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_modulepreload_link_href_normalized_to_relative() {
        assert_single_import(
            r#"<link rel="modulepreload" href="vendor.js" />"#,
            "./vendor.js",
        );
    }

    #[test]
    fn bare_asset_with_subdir_normalized_to_relative() {
        assert_single_import(
            r#"<script src="assets/app.js"></script>"#,
            "./assets/app.js",
        );
    }

    #[test]
    fn root_absolute_script_src_unchanged() {
        assert_single_import(r#"<script src="/src/main.ts"></script>"#, "/src/main.ts");
    }

    #[test]
    fn parent_relative_script_src_unchanged() {
        assert_single_import(
            r#"<script src="../shared/vendor.js"></script>"#,
            "../shared/vendor.js",
        );
    }

    // ---- ignored link kinds ----

    #[test]
    fn skips_preload_link() {
        assert_no_imports(r#"<link rel="preload" href="./src/font.woff2" as="font" />"#);
    }

    #[test]
    fn skips_icon_link() {
        assert_no_imports(r#"<link rel="icon" href="./favicon.ico" />"#);
    }

    #[test]
    fn skips_remote_stylesheet() {
        assert_no_imports(r#"<link rel="stylesheet" href="https://fonts.googleapis.com/css" />"#);
    }

    // ---- comments ----

    #[test]
    fn skips_commented_out_script() {
        assert_single_import(
            r#"<!-- <script src="./old.js"></script> -->
            <script src="./new.js"></script>"#,
            "./new.js",
        );
    }

    #[test]
    fn skips_commented_out_link() {
        assert_single_import(
            r#"<!-- <link rel="stylesheet" href="./old.css" /> -->
            <link rel="stylesheet" href="./new.css" />"#,
            "./new.css",
        );
    }

    // ---- multi-line tags ----

    #[test]
    fn handles_multiline_script_tag() {
        assert_single_import(
            "<script\n  type=\"module\"\n  src=\"./src/entry.js\"\n></script>",
            "./src/entry.js",
        );
    }

    #[test]
    fn handles_multiline_link_tag() {
        assert_single_import(
            "<link\n  rel=\"stylesheet\"\n  href=\"./src/global.css\"\n/>",
            "./src/global.css",
        );
    }

    // ---- whole documents ----

    #[test]
    fn full_vite_html() {
        let info = parse(
            r#"<!doctype html>
<html>
  <head>
    <link rel="stylesheet" href="./src/global.css" />
    <link rel="icon" href="/favicon.ico" />
  </head>
  <body>
    <div id="app"></div>
    <script type="module" src="./src/entry.js"></script>
  </body>
</html>"#,
        );
        assert_eq!(info.imports.len(), 2);
        let sources: Vec<&str> = info.imports.iter().map(|i| i.source.as_str()).collect();
        assert!(sources.contains(&"./src/global.css"));
        assert!(sources.contains(&"./src/entry.js"));
    }

    #[test]
    fn empty_html() {
        assert_no_imports("");
    }

    #[test]
    fn html_with_no_assets() {
        assert_no_imports(r"<!doctype html><html><body><h1>Hello</h1></body></html>");
    }

    #[test]
    fn single_quoted_attributes() {
        assert_single_import(r"<script src='./src/entry.js'></script>", "./src/entry.js");
    }

    // ---- import shape ----

    #[test]
    fn all_imports_are_side_effect() {
        let info = parse(
            r#"<script src="./entry.js"></script>
            <link rel="stylesheet" href="./style.css" />"#,
        );
        for imp in &info.imports {
            assert!(matches!(imp.imported_name, ImportedName::SideEffect));
            assert!(imp.local_name.is_empty());
            assert!(!imp.is_type_only);
        }
    }

    #[test]
    fn suppression_comments_extracted() {
        let info = parse("<!-- fallow-ignore-file -->\n<script src=\"./entry.js\"></script>");
        assert_eq!(info.imports.len(), 1);
    }

    // ---- Angular template references ----

    #[test]
    fn angular_template_extracts_member_refs() {
        let info = parse(
            "<h1>{{ title() }}</h1>\n\
             <p [class.highlighted]=\"isHighlighted\">{{ greeting() }}</p>\n\
             <button (click)=\"onButtonClick()\">Toggle</button>",
        );
        let names: rustc_hash::FxHashSet<&str> = info
            .member_accesses
            .iter()
            .filter(|a| a.object == ANGULAR_TPL_SENTINEL)
            .map(|a| a.member.as_str())
            .collect();
        assert!(names.contains("title"), "should contain 'title'");
        assert!(
            names.contains("isHighlighted"),
            "should contain 'isHighlighted'"
        );
        assert!(names.contains("greeting"), "should contain 'greeting'");
        assert!(
            names.contains("onButtonClick"),
            "should contain 'onButtonClick'"
        );
    }

    #[test]
    fn plain_html_no_angular_refs() {
        let info = parse("<!doctype html><html><body><h1>Hello</h1></body></html>");
        assert!(info.member_accesses.is_empty());
    }
}