Skip to main content

fallow_extract/
html.rs

1//! HTML file parsing for script, stylesheet, and Angular template references.
2//!
3//! Extracts `<script src="...">` and `<link rel="stylesheet" href="...">` references
4//! from HTML files, creating graph edges so that referenced JS/CSS assets (and their
5//! transitive imports) are reachable from the HTML entry point.
6//!
7//! Also scans for Angular template syntax (`{{ }}`, `[prop]`, `(event)`, `@if`, etc.)
8//! and stores referenced identifiers as `MemberAccess` entries with a sentinel object,
9//! enabling the analysis phase to credit component class members used in external templates.
10
11use std::path::Path;
12use std::sync::LazyLock;
13
14use oxc_span::Span;
15
16use crate::asset_url::normalize_asset_url;
17use crate::sfc_template::angular::{self, ANGULAR_TPL_SENTINEL};
18use crate::{ImportInfo, ImportedName, MemberAccess, ModuleInfo};
19use fallow_types::discover::FileId;
20
21/// Regex to match HTML comments (`<!-- ... -->`) for stripping before extraction.
22static HTML_COMMENT_RE: LazyLock<regex::Regex> =
23    LazyLock::new(|| crate::static_regex(r"(?s)<!--.*?-->"));
24
25/// Regex to extract `src` attribute from `<script>` tags.
26/// Matches both `<script src="...">` and `<script type="module" src="...">`.
27/// Uses `(?s)` so `.` matches newlines (multi-line attributes).
28static SCRIPT_SRC_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
29    crate::static_regex(r#"(?si)<script\b(?:[^>"']|"[^"]*"|'[^']*')*?\bsrc\s*=\s*["']([^"']+)["']"#)
30});
31
32/// Regex to extract `href` attribute from `<link>` tags with `rel="stylesheet"` or
33/// `rel="modulepreload"`.
34/// Handles attributes in any order (rel before or after href).
35static LINK_HREF_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
36    crate::static_regex(
37        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["'](?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["']"#,
38    )
39});
40
41/// Regex for the reverse attribute order: href before rel.
42static LINK_HREF_REVERSE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
43    crate::static_regex(
44        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["'](?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["']"#,
45    )
46});
47
/// Returns `true` when `path` has the exact extension `html`.
///
/// The comparison is case-sensitive, and other spellings such as `htm` are
/// rejected. Paths without a UTF-8 extension yield `false`.
pub(crate) fn is_html_file(path: &Path) -> bool {
    let extension = path.extension().and_then(|raw| raw.to_str());
    matches!(extension, Some("html"))
}
54
/// Returns `true` if an HTML asset reference is a remote URL that should be skipped.
///
/// A reference counts as remote when it is protocol-relative (`//host/...`) or
/// starts with one of the `http://`, `https://`, or `data:` scheme prefixes.
/// Scheme prefixes are compared ASCII case-insensitively, because URI schemes
/// are case-insensitive: `HTTPS://cdn.example.com/x.js` is just as remote as its
/// lowercase spelling and must not enter the import graph as a local path.
///
/// The value is inspected as-is; callers are expected to trim it first.
pub(crate) fn is_remote_url(src: &str) -> bool {
    // Scheme prefixes that mark a reference as non-local.
    const REMOTE_SCHEMES: [&str; 3] = ["http://", "https://", "data:"];

    // Compare on bytes so slicing can never land inside a multi-byte character.
    let bytes = src.as_bytes();
    src.starts_with("//")
        || REMOTE_SCHEMES.iter().any(|scheme| {
            bytes
                .get(..scheme.len())
                .is_some_and(|head| head.eq_ignore_ascii_case(scheme.as_bytes()))
        })
}
62
/// Detects build-time template placeholders inside a `src` / `href` value.
///
/// Such values are not valid import specifiers and never resolve to a real
/// file, so they are dropped at extraction time instead of entering the import
/// graph as unresolvable specifiers. Two marker shapes are recognized:
///
/// - `{{ ... }}`: Handlebars (Ember `index.html`'s `{{rootURL}}`,
///   `{{config.assetsPath}}`), Mustache (Jekyll, Hugo), Jinja2 (Pelican /
///   11ty plugins), and pre-compiled Vue / Angular templates whose
///   interpolation has leaked into a checked-in HTML scaffold.
/// - `###...###`: ember-cli blueprint scaffold placeholders
///   (`###APPNAME###`, `###DUMMY###`) checked in as addon-fixture templates.
///
/// Neither marker is a legal URL or path character sequence outside template
/// engines, so the check is framework-agnostic rather than gated on a plugin.
/// Returns `true` when `value` contains either marker anywhere.
pub(crate) fn is_template_placeholder(value: &str) -> bool {
    const MARKERS: [&str; 2] = ["{{", "###"];
    MARKERS.iter().any(|&marker| value.contains(marker))
}
80
81/// Extract local (non-remote) asset references from HTML-like markup.
82///
83/// Returns the raw `src`/`href` strings (trimmed, remote URLs filtered). Shared
84/// between the HTML file parser and the JS/TS visitor's tagged template
85/// literal override so `` html`<script src="...">` `` in Hono/lit-html/htm
86/// layouts emits the same asset edges as a real `.html` file.
87pub(crate) fn collect_asset_refs(source: &str) -> Vec<String> {
88    let stripped = HTML_COMMENT_RE.replace_all(source, "");
89    let mut refs: Vec<String> = Vec::new();
90
91    for cap in SCRIPT_SRC_RE.captures_iter(&stripped) {
92        if let Some(m) = cap.get(1) {
93            let src = m.as_str().trim();
94            if !src.is_empty() && !is_remote_url(src) && !is_template_placeholder(src) {
95                refs.push(src.to_string());
96            }
97        }
98    }
99
100    for cap in LINK_HREF_RE.captures_iter(&stripped) {
101        if let Some(m) = cap.get(2) {
102            let href = m.as_str().trim();
103            if !href.is_empty() && !is_remote_url(href) && !is_template_placeholder(href) {
104                refs.push(href.to_string());
105            }
106        }
107    }
108    for cap in LINK_HREF_REVERSE_RE.captures_iter(&stripped) {
109        if let Some(m) = cap.get(1) {
110            let href = m.as_str().trim();
111            if !href.is_empty() && !is_remote_url(href) && !is_template_placeholder(href) {
112                refs.push(href.to_string());
113            }
114        }
115    }
116
117    refs
118}
119
/// Test-only shortcut for `parse_html_to_module_with_complexity` that never
/// requests template complexity.
#[cfg(test)]
pub(crate) fn parse_html_to_module(file_id: FileId, source: &str, content_hash: u64) -> ModuleInfo {
    let need_complexity = false;
    parse_html_to_module_with_complexity(file_id, source, content_hash, need_complexity)
}
125
126/// Parse an HTML file and optionally compute Angular template complexity.
127pub(crate) fn parse_html_to_module_with_complexity(
128    file_id: FileId,
129    source: &str,
130    content_hash: u64,
131    need_complexity: bool,
132) -> ModuleInfo {
133    let parsed_suppressions = crate::suppress::parse_suppressions_from_source(source);
134
135    let mut imports: Vec<ImportInfo> = collect_asset_refs(source)
136        .into_iter()
137        .map(|raw| ImportInfo {
138            source: normalize_asset_url(&raw),
139            imported_name: ImportedName::SideEffect,
140            local_name: String::new(),
141            is_type_only: false,
142            from_style: false,
143            span: Span::default(),
144            source_span: Span::default(),
145        })
146        .collect();
147
148    imports.sort_unstable_by(|a, b| a.source.cmp(&b.source));
149    imports.dedup_by(|a, b| a.source == b.source);
150
151    let angular::AngularTemplateRefs {
152        identifiers,
153        member_accesses: template_member_accesses,
154        security_sinks,
155    } = angular::collect_angular_template_refs(source);
156    let mut member_accesses: Vec<MemberAccess> = identifiers
157        .into_iter()
158        .map(|name| MemberAccess {
159            object: ANGULAR_TPL_SENTINEL.to_string(),
160            member: name,
161        })
162        .collect();
163    member_accesses.extend(template_member_accesses);
164
165    let complexity = if need_complexity {
166        crate::template_complexity::compute_angular_template_complexity(source)
167            .into_iter()
168            .collect()
169    } else {
170        Vec::new()
171    };
172
173    ModuleInfo {
174        file_id,
175        exports: Vec::new(),
176        imports,
177        re_exports: Vec::new(),
178        dynamic_imports: Vec::new(),
179        dynamic_import_patterns: Vec::new(),
180        require_calls: Vec::new(),
181        package_path_references: Vec::new(),
182        member_accesses,
183        whole_object_uses: Vec::new(),
184        has_cjs_exports: false,
185        has_angular_component_template_url: false,
186        content_hash,
187        suppressions: parsed_suppressions.suppressions,
188        unknown_suppression_kinds: parsed_suppressions.unknown_kinds,
189        unused_import_bindings: Vec::new(),
190        type_referenced_import_bindings: Vec::new(),
191        value_referenced_import_bindings: Vec::new(),
192        line_offsets: fallow_types::extract::compute_line_offsets(source),
193        complexity,
194        flag_uses: Vec::new(),
195        class_heritage: vec![],
196        injection_tokens: vec![],
197        local_type_declarations: Vec::new(),
198        public_signature_type_references: Vec::new(),
199        namespace_object_aliases: Vec::new(),
200        iconify_prefixes: Vec::new(),
201        iconify_icon_names: Vec::new(),
202        auto_import_candidates: Vec::new(),
203        directives: Vec::new(),
204        client_only_dynamic_import_spans: Vec::new(),
205        security_sinks,
206        security_sinks_skipped: 0,
207        security_unresolved_callee_sites: Vec::new(),
208        tainted_bindings: Vec::new(),
209        sanitized_sink_args: Vec::new(),
210        security_control_sites: Vec::new(),
211        callee_uses: Vec::new(),
212        misplaced_directives: Vec::new(),
213        di_key_sites: Vec::new(),
214        has_dynamic_provide: false,
215        referenced_import_bindings: Vec::new(),
216        component_props: Vec::new(),
217        has_props_attrs_fallthrough: false,
218        has_define_expose: false,
219        has_define_model: false,
220        has_unharvestable_props: false,
221        component_emits: Vec::new(),
222        has_unharvestable_emits: false,
223        has_dynamic_emit: false,
224        has_emit_whole_object_use: false,
225        load_return_keys: Vec::new(),
226        has_unharvestable_load: false,
227        has_load_data_whole_use: false,
228        has_page_data_store_whole_use: false,
229        component_functions: Vec::new(),
230        react_props: Vec::new(),
231        hook_uses: Vec::new(),
232        render_edges: Vec::new(),
233    }
234}
235
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html` with a fixed file id and content hash.
    fn parse(html: &str) -> ModuleInfo {
        parse_html_to_module(FileId(0), html, 0)
    }

    /// Asserts that `html` yields exactly one import whose source is `expected`.
    fn assert_single_import(html: &str, expected: &str) {
        let info = parse(html);
        assert_eq!(info.imports.len(), 1);
        assert_eq!(info.imports[0].source, expected);
    }

    /// Asserts that `html` yields no imports at all.
    fn assert_no_imports(html: &str) {
        let info = parse(html);
        assert!(info.imports.is_empty());
    }

    /// Returns the import sources of `info` as string slices.
    fn import_sources(info: &ModuleInfo) -> Vec<&str> {
        info.imports.iter().map(|i| i.source.as_str()).collect()
    }

    // ── is_html_file ────────────────────────────────────────────────

    #[test]
    fn is_html_file_html() {
        assert!(is_html_file(Path::new("index.html")));
    }

    #[test]
    fn is_html_file_nested() {
        assert!(is_html_file(Path::new("pages/about.html")));
    }

    #[test]
    fn is_html_file_rejects_htm() {
        assert!(!is_html_file(Path::new("index.htm")));
    }

    #[test]
    fn is_html_file_rejects_js() {
        assert!(!is_html_file(Path::new("app.js")));
    }

    #[test]
    fn is_html_file_rejects_ts() {
        assert!(!is_html_file(Path::new("app.ts")));
    }

    #[test]
    fn is_html_file_rejects_vue() {
        assert!(!is_html_file(Path::new("App.vue")));
    }

    // ── is_remote_url ───────────────────────────────────────────────

    #[test]
    fn remote_url_http() {
        assert!(is_remote_url("http://example.com/script.js"));
    }

    #[test]
    fn remote_url_https() {
        assert!(is_remote_url("https://cdn.example.com/style.css"));
    }

    #[test]
    fn remote_url_protocol_relative() {
        assert!(is_remote_url("//cdn.example.com/lib.js"));
    }

    #[test]
    fn remote_url_data() {
        assert!(is_remote_url("data:text/javascript;base64,abc"));
    }

    #[test]
    fn local_relative_not_remote() {
        assert!(!is_remote_url("./src/entry.js"));
    }

    #[test]
    fn local_root_relative_not_remote() {
        assert!(!is_remote_url("/src/entry.js"));
    }

    // ── script extraction ───────────────────────────────────────────

    #[test]
    fn extracts_module_script_src() {
        assert_single_import(
            r#"<script type="module" src="./src/entry.js"></script>"#,
            "./src/entry.js",
        );
    }

    #[test]
    fn extracts_plain_script_src() {
        assert_single_import(
            r#"<script src="./src/polyfills.js"></script>"#,
            "./src/polyfills.js",
        );
    }

    #[test]
    fn extracts_multiple_scripts() {
        let info = parse(
            r#"
            <script type="module" src="./src/entry.js"></script>
            <script src="./src/polyfills.js"></script>
            "#,
        );
        assert_eq!(info.imports.len(), 2);
    }

    #[test]
    fn skips_inline_script() {
        assert_no_imports(r#"<script>console.log("hello");</script>"#);
    }

    // ── template placeholders ───────────────────────────────────────

    #[test]
    fn skips_handlebars_placeholder_in_script_src() {
        let info = parse(
            r#"<script src="{{rootURL}}assets/app.js"></script>
               <script src="{{config.assetsPath}}vendor.js"></script>"#,
        );
        assert!(
            info.imports.is_empty(),
            "Handlebars-placeholder script srcs should not enter the import graph; got {:?}",
            info.imports
        );
    }

    #[test]
    fn skips_handlebars_placeholder_in_link_href() {
        assert_no_imports(r#"<link rel="stylesheet" href="{{rootURL}}assets/app.css">"#);
    }

    #[test]
    fn skips_ember_cli_blueprint_placeholder() {
        assert_no_imports(r####"<script src="###APPNAME###/app.js"></script>"####);
    }

    #[test]
    fn extracts_normal_specifier_alongside_placeholders() {
        assert_single_import(
            r#"<script src="{{rootURL}}assets/app.js"></script>
               <script src="./src/main.ts"></script>"#,
            "./src/main.ts",
        );
    }

    // ── remote scripts ──────────────────────────────────────────────

    #[test]
    fn skips_remote_script() {
        assert_no_imports(r#"<script src="https://cdn.example.com/lib.js"></script>"#);
    }

    #[test]
    fn skips_protocol_relative_script() {
        assert_no_imports(r#"<script src="//cdn.example.com/lib.js"></script>"#);
    }

    // ── link extraction ─────────────────────────────────────────────

    #[test]
    fn extracts_stylesheet_link() {
        assert_single_import(
            r#"<link rel="stylesheet" href="./src/global.css" />"#,
            "./src/global.css",
        );
    }

    #[test]
    fn extracts_modulepreload_link() {
        assert_single_import(
            r#"<link rel="modulepreload" href="./src/vendor.js" />"#,
            "./src/vendor.js",
        );
    }

    #[test]
    fn extracts_link_with_reversed_attrs() {
        assert_single_import(
            r#"<link href="./src/global.css" rel="stylesheet" />"#,
            "./src/global.css",
        );
    }

    // ── specifier normalization ─────────────────────────────────────

    #[test]
    fn bare_script_src_normalized_to_relative() {
        assert_single_import(r#"<script src="app.js"></script>"#, "./app.js");
    }

    #[test]
    fn bare_module_script_src_normalized_to_relative() {
        assert_single_import(
            r#"<script type="module" src="main.ts"></script>"#,
            "./main.ts",
        );
    }

    #[test]
    fn bare_stylesheet_link_href_normalized_to_relative() {
        assert_single_import(
            r#"<link rel="stylesheet" href="styles.css" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_link_href_reversed_attrs_normalized_to_relative() {
        assert_single_import(
            r#"<link href="styles.css" rel="stylesheet" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_modulepreload_link_href_normalized_to_relative() {
        assert_single_import(
            r#"<link rel="modulepreload" href="vendor.js" />"#,
            "./vendor.js",
        );
    }

    #[test]
    fn bare_asset_with_subdir_normalized_to_relative() {
        assert_single_import(r#"<script src="assets/app.js"></script>"#, "./assets/app.js");
    }

    #[test]
    fn root_absolute_script_src_unchanged() {
        assert_single_import(r#"<script src="/src/main.ts"></script>"#, "/src/main.ts");
    }

    #[test]
    fn parent_relative_script_src_unchanged() {
        assert_single_import(
            r#"<script src="../shared/vendor.js"></script>"#,
            "../shared/vendor.js",
        );
    }

    // ── ignored link kinds ──────────────────────────────────────────

    #[test]
    fn skips_preload_link() {
        assert_no_imports(r#"<link rel="preload" href="./src/font.woff2" as="font" />"#);
    }

    #[test]
    fn skips_icon_link() {
        assert_no_imports(r#"<link rel="icon" href="./favicon.ico" />"#);
    }

    #[test]
    fn skips_remote_stylesheet() {
        assert_no_imports(r#"<link rel="stylesheet" href="https://fonts.googleapis.com/css" />"#);
    }

    // ── comments ────────────────────────────────────────────────────

    #[test]
    fn skips_commented_out_script() {
        assert_single_import(
            r#"<!-- <script src="./old.js"></script> -->
            <script src="./new.js"></script>"#,
            "./new.js",
        );
    }

    #[test]
    fn skips_commented_out_link() {
        assert_single_import(
            r#"<!-- <link rel="stylesheet" href="./old.css" /> -->
            <link rel="stylesheet" href="./new.css" />"#,
            "./new.css",
        );
    }

    // ── multi-line tags ─────────────────────────────────────────────

    #[test]
    fn handles_multiline_script_tag() {
        assert_single_import(
            "<script\n  type=\"module\"\n  src=\"./src/entry.js\"\n></script>",
            "./src/entry.js",
        );
    }

    #[test]
    fn handles_multiline_link_tag() {
        assert_single_import(
            "<link\n  rel=\"stylesheet\"\n  href=\"./src/global.css\"\n/>",
            "./src/global.css",
        );
    }

    // ── whole documents ─────────────────────────────────────────────

    #[test]
    fn full_vite_html() {
        let info = parse(
            r#"<!doctype html>
<html>
  <head>
    <link rel="stylesheet" href="./src/global.css" />
    <link rel="icon" href="/favicon.ico" />
  </head>
  <body>
    <div id="app"></div>
    <script type="module" src="./src/entry.js"></script>
  </body>
</html>"#,
        );
        assert_eq!(info.imports.len(), 2);
        let sources = import_sources(&info);
        assert!(sources.contains(&"./src/global.css"));
        assert!(sources.contains(&"./src/entry.js"));
    }

    #[test]
    fn empty_html() {
        assert_no_imports("");
    }

    #[test]
    fn html_with_no_assets() {
        assert_no_imports(r"<!doctype html><html><body><h1>Hello</h1></body></html>");
    }

    #[test]
    fn single_quoted_attributes() {
        assert_single_import(r"<script src='./src/entry.js'></script>", "./src/entry.js");
    }

    #[test]
    fn all_imports_are_side_effect() {
        let info = parse(
            r#"<script src="./entry.js"></script>
            <link rel="stylesheet" href="./style.css" />"#,
        );
        for imp in &info.imports {
            assert!(matches!(imp.imported_name, ImportedName::SideEffect));
            assert!(imp.local_name.is_empty());
            assert!(!imp.is_type_only);
        }
    }

    // Only checks that a leading suppression comment does not swallow the
    // script tag that follows it.
    #[test]
    fn suppression_comments_extracted() {
        let info = parse("<!-- fallow-ignore-file -->\n<script src=\"./entry.js\"></script>");
        assert_eq!(info.imports.len(), 1);
    }

    // ── Angular template references ─────────────────────────────────

    #[test]
    fn angular_template_extracts_member_refs() {
        let info = parse(
            "<h1>{{ title() }}</h1>\n\
             <p [class.highlighted]=\"isHighlighted\">{{ greeting() }}</p>\n\
             <button (click)=\"onButtonClick()\">Toggle</button>",
        );
        let names: rustc_hash::FxHashSet<&str> = info
            .member_accesses
            .iter()
            .filter(|a| a.object == ANGULAR_TPL_SENTINEL)
            .map(|a| a.member.as_str())
            .collect();
        for expected in ["title", "isHighlighted", "greeting", "onButtonClick"] {
            assert!(names.contains(expected), "should contain '{expected}'");
        }
    }

    #[test]
    fn plain_html_no_angular_refs() {
        let info = parse("<!doctype html><html><body><h1>Hello</h1></body></html>");
        assert!(info.member_accesses.is_empty());
    }
}
693            0,
694        );
695        assert!(info.member_accesses.is_empty());
696    }
697}