Skip to main content

fallow_extract/
html.rs

1//! HTML file parsing for script, stylesheet, and Angular template references.
2//!
3//! Extracts `<script src="...">` and `<link rel="stylesheet" href="...">` references
4//! from HTML files, creating graph edges so that referenced JS/CSS assets (and their
5//! transitive imports) are reachable from the HTML entry point.
6//!
7//! Also scans for Angular template syntax (`{{ }}`, `[prop]`, `(event)`, `@if`, etc.)
8//! and stores referenced identifiers as `MemberAccess` entries with a sentinel object,
9//! enabling the analysis phase to credit component class members used in external templates.
10
11use std::path::Path;
12use std::sync::LazyLock;
13
14use oxc_span::Span;
15
16use crate::asset_url::normalize_asset_url;
17use crate::sfc_template::angular::{self, ANGULAR_TPL_SENTINEL};
18use crate::{ImportInfo, ImportedName, MemberAccess, ModuleInfo};
19use fallow_types::discover::FileId;
20
21/// Regex to match HTML comments (`<!-- ... -->`) for stripping before extraction.
22static HTML_COMMENT_RE: LazyLock<regex::Regex> =
23    LazyLock::new(|| regex::Regex::new(r"(?s)<!--.*?-->").expect("valid regex"));
24
25/// Regex to extract `src` attribute from `<script>` tags.
26/// Matches both `<script src="...">` and `<script type="module" src="...">`.
27/// Uses `(?s)` so `.` matches newlines (multi-line attributes).
28static SCRIPT_SRC_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
29    regex::Regex::new(r#"(?si)<script\b(?:[^>"']|"[^"]*"|'[^']*')*?\bsrc\s*=\s*["']([^"']+)["']"#)
30        .expect("valid regex")
31});
32
33/// Regex to extract `href` attribute from `<link>` tags with `rel="stylesheet"` or
34/// `rel="modulepreload"`.
35/// Handles attributes in any order (rel before or after href).
36static LINK_HREF_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
37    regex::Regex::new(
38        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["'](?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["']"#,
39    )
40    .expect("valid regex")
41});
42
43/// Regex for the reverse attribute order: href before rel.
44static LINK_HREF_REVERSE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
45    regex::Regex::new(
46        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["'](?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["']"#,
47    )
48    .expect("valid regex")
49});
50
/// Reports whether `path` has the extension `html`.
///
/// The comparison is exact and case-sensitive: `index.htm` and `INDEX.HTML`
/// are rejected, as are paths with no extension or a non-UTF-8 extension.
// Keep in sync with fallow_core::analyze::predicates::is_html_file (crate boundary prevents sharing)
pub(crate) fn is_html_file(path: &Path) -> bool {
    let extension = path.extension().and_then(|ext| ext.to_str());
    matches!(extension, Some("html"))
}
58
/// Returns true if an HTML asset reference is a remote URL that should be skipped.
///
/// A reference counts as remote when it starts with `http://`, `https://`,
/// `data:` or the protocol-relative prefix `//`. URI schemes are
/// case-insensitive, so `HTTPS://cdn.example.com/x.js` is recognised as well.
///
/// The caller is expected to have trimmed `src`; leading whitespace is not
/// skipped here.
pub(crate) fn is_remote_url(src: &str) -> bool {
    const REMOTE_SCHEMES: [&str; 3] = ["http://", "https://", "data:"];

    src.starts_with("//")
        || REMOTE_SCHEMES.iter().any(|scheme| {
            // `get` yields `None` when `src` is too short or the cut would
            // split a multi-byte character; neither case can be an ASCII
            // scheme, so it never panics and never produces a false match.
            src.get(..scheme.len())
                .is_some_and(|prefix| prefix.eq_ignore_ascii_case(scheme))
        })
}
66
67/// Extract local (non-remote) asset references from HTML-like markup.
68///
69/// Returns the raw `src`/`href` strings (trimmed, remote URLs filtered). Shared
70/// between the HTML file parser and the JS/TS visitor's tagged template
71/// literal override so `` html`<script src="...">` `` in Hono/lit-html/htm
72/// layouts emits the same asset edges as a real `.html` file.
73pub(crate) fn collect_asset_refs(source: &str) -> Vec<String> {
74    let stripped = HTML_COMMENT_RE.replace_all(source, "");
75    let mut refs: Vec<String> = Vec::new();
76
77    for cap in SCRIPT_SRC_RE.captures_iter(&stripped) {
78        if let Some(m) = cap.get(1) {
79            let src = m.as_str().trim();
80            if !src.is_empty() && !is_remote_url(src) {
81                refs.push(src.to_string());
82            }
83        }
84    }
85
86    for cap in LINK_HREF_RE.captures_iter(&stripped) {
87        if let Some(m) = cap.get(2) {
88            let href = m.as_str().trim();
89            if !href.is_empty() && !is_remote_url(href) {
90                refs.push(href.to_string());
91            }
92        }
93    }
94    for cap in LINK_HREF_REVERSE_RE.captures_iter(&stripped) {
95        if let Some(m) = cap.get(1) {
96            let href = m.as_str().trim();
97            if !href.is_empty() && !is_remote_url(href) {
98                refs.push(href.to_string());
99            }
100        }
101    }
102
103    refs
104}
105
106/// Parse an HTML file, extracting script and stylesheet references as imports.
107pub(crate) fn parse_html_to_module(file_id: FileId, source: &str, content_hash: u64) -> ModuleInfo {
108    let suppressions = crate::suppress::parse_suppressions_from_source(source);
109
110    // Bare filenames (e.g., `src="app.js"`) are normalized to `./app.js` so
111    // the resolver doesn't misclassify them as npm packages.
112    let mut imports: Vec<ImportInfo> = collect_asset_refs(source)
113        .into_iter()
114        .map(|raw| ImportInfo {
115            source: normalize_asset_url(&raw),
116            imported_name: ImportedName::SideEffect,
117            local_name: String::new(),
118            is_type_only: false,
119            span: Span::default(),
120            source_span: Span::default(),
121        })
122        .collect();
123
124    // Deduplicate: the same asset may be referenced by both <script src> and
125    // <link rel="modulepreload" href> for the same path.
126    imports.sort_unstable_by(|a, b| a.source.cmp(&b.source));
127    imports.dedup_by(|a, b| a.source == b.source);
128
129    // Scan for Angular template syntax ({{ }}, [prop], (event), @if, etc.).
130    //
131    // Bare identifier refs (e.g. `title`, `dataService`, pipe names) are stored
132    // as `MemberAccess` entries with a sentinel object name so the analysis
133    // phase can credit them as members of the component class.
134    //
135    // Static member-access chains (`dataService.getTotal`) where `dataService`
136    // is an unresolved identifier are stored as regular (non-sentinel)
137    // `MemberAccess` entries. The analysis phase resolves these through the
138    // importing component's typed instance bindings (from
139    // `ClassHeritageInfo.instance_bindings`) to credit the target class's
140    // member as used.
141    let template_refs = angular::collect_angular_template_refs(source);
142    let mut member_accesses: Vec<MemberAccess> = template_refs
143        .identifiers
144        .into_iter()
145        .map(|name| MemberAccess {
146            object: ANGULAR_TPL_SENTINEL.to_string(),
147            member: name,
148        })
149        .collect();
150    member_accesses.extend(template_refs.member_accesses);
151
152    ModuleInfo {
153        file_id,
154        exports: Vec::new(),
155        imports,
156        re_exports: Vec::new(),
157        dynamic_imports: Vec::new(),
158        dynamic_import_patterns: Vec::new(),
159        require_calls: Vec::new(),
160        member_accesses,
161        whole_object_uses: Vec::new(),
162        has_cjs_exports: false,
163        content_hash,
164        suppressions,
165        unused_import_bindings: Vec::new(),
166        type_referenced_import_bindings: Vec::new(),
167        value_referenced_import_bindings: Vec::new(),
168        line_offsets: fallow_types::extract::compute_line_offsets(source),
169        complexity: Vec::new(),
170        flag_uses: Vec::new(),
171        class_heritage: vec![],
172    }
173}
174
#[cfg(test)]
mod tests {
    use super::*;

    // ── Shared helpers ───────────────────────────────────────────

    /// Parses `html` as file 0 with a zero content hash.
    fn parse(html: &str) -> ModuleInfo {
        parse_html_to_module(FileId(0), html, 0)
    }

    /// Asserts that `html` yields exactly one import whose source is `expected`.
    #[track_caller]
    fn assert_single_import(html: &str, expected: &str) {
        let module = parse(html);
        assert_eq!(module.imports.len(), 1);
        assert_eq!(module.imports[0].source, expected);
    }

    /// Asserts that `html` yields no imports at all.
    #[track_caller]
    fn assert_no_imports(html: &str) {
        assert!(parse(html).imports.is_empty());
    }

    // ── is_html_file ─────────────────────────────────────────────

    #[test]
    fn is_html_file_html() {
        assert!(is_html_file(Path::new("index.html")));
    }

    #[test]
    fn is_html_file_nested() {
        assert!(is_html_file(Path::new("pages/about.html")));
    }

    #[test]
    fn is_html_file_rejects_htm() {
        assert!(!is_html_file(Path::new("index.htm")));
    }

    #[test]
    fn is_html_file_rejects_js() {
        assert!(!is_html_file(Path::new("app.js")));
    }

    #[test]
    fn is_html_file_rejects_ts() {
        assert!(!is_html_file(Path::new("app.ts")));
    }

    #[test]
    fn is_html_file_rejects_vue() {
        assert!(!is_html_file(Path::new("App.vue")));
    }

    // ── is_remote_url ────────────────────────────────────────────

    #[test]
    fn remote_url_http() {
        assert!(is_remote_url("http://example.com/script.js"));
    }

    #[test]
    fn remote_url_https() {
        assert!(is_remote_url("https://cdn.example.com/style.css"));
    }

    #[test]
    fn remote_url_protocol_relative() {
        assert!(is_remote_url("//cdn.example.com/lib.js"));
    }

    #[test]
    fn remote_url_data() {
        assert!(is_remote_url("data:text/javascript;base64,abc"));
    }

    #[test]
    fn local_relative_not_remote() {
        assert!(!is_remote_url("./src/entry.js"));
    }

    #[test]
    fn local_root_relative_not_remote() {
        assert!(!is_remote_url("/src/entry.js"));
    }

    // ── parse_html_to_module: script src extraction ──────────────

    #[test]
    fn extracts_module_script_src() {
        assert_single_import(
            r#"<script type="module" src="./src/entry.js"></script>"#,
            "./src/entry.js",
        );
    }

    #[test]
    fn extracts_plain_script_src() {
        assert_single_import(
            r#"<script src="./src/polyfills.js"></script>"#,
            "./src/polyfills.js",
        );
    }

    #[test]
    fn extracts_multiple_scripts() {
        let module = parse(
            r#"
            <script type="module" src="./src/entry.js"></script>
            <script src="./src/polyfills.js"></script>
            "#,
        );
        assert_eq!(module.imports.len(), 2);
    }

    #[test]
    fn skips_inline_script() {
        assert_no_imports(r#"<script>console.log("hello");</script>"#);
    }

    #[test]
    fn skips_remote_script() {
        assert_no_imports(r#"<script src="https://cdn.example.com/lib.js"></script>"#);
    }

    #[test]
    fn skips_protocol_relative_script() {
        assert_no_imports(r#"<script src="//cdn.example.com/lib.js"></script>"#);
    }

    // ── parse_html_to_module: link href extraction ───────────────

    #[test]
    fn extracts_stylesheet_link() {
        assert_single_import(
            r#"<link rel="stylesheet" href="./src/global.css" />"#,
            "./src/global.css",
        );
    }

    #[test]
    fn extracts_modulepreload_link() {
        assert_single_import(
            r#"<link rel="modulepreload" href="./src/vendor.js" />"#,
            "./src/vendor.js",
        );
    }

    #[test]
    fn extracts_link_with_reversed_attrs() {
        assert_single_import(
            r#"<link href="./src/global.css" rel="stylesheet" />"#,
            "./src/global.css",
        );
    }

    // ── Bare asset references normalized to relative paths ──────
    // Regression tests for the same class of bug as #99 (Angular templateUrl).
    // Browsers resolve `src="app.js"` and `href="styles.css"` relative to the
    // HTML file, so emitting these as bare specifiers would misclassify them
    // as unlisted npm packages.

    #[test]
    fn bare_script_src_normalized_to_relative() {
        assert_single_import(r#"<script src="app.js"></script>"#, "./app.js");
    }

    #[test]
    fn bare_module_script_src_normalized_to_relative() {
        assert_single_import(
            r#"<script type="module" src="main.ts"></script>"#,
            "./main.ts",
        );
    }

    #[test]
    fn bare_stylesheet_link_href_normalized_to_relative() {
        assert_single_import(
            r#"<link rel="stylesheet" href="styles.css" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_link_href_reversed_attrs_normalized_to_relative() {
        assert_single_import(
            r#"<link href="styles.css" rel="stylesheet" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_modulepreload_link_href_normalized_to_relative() {
        assert_single_import(
            r#"<link rel="modulepreload" href="vendor.js" />"#,
            "./vendor.js",
        );
    }

    #[test]
    fn bare_asset_with_subdir_normalized_to_relative() {
        assert_single_import(
            r#"<script src="assets/app.js"></script>"#,
            "./assets/app.js",
        );
    }

    #[test]
    fn root_absolute_script_src_unchanged() {
        // `/src/main.ts` is a web convention (Vite root-relative) and must
        // stay absolute so the resolver's HTML special case applies.
        assert_single_import(r#"<script src="/src/main.ts"></script>"#, "/src/main.ts");
    }

    #[test]
    fn parent_relative_script_src_unchanged() {
        assert_single_import(
            r#"<script src="../shared/vendor.js"></script>"#,
            "../shared/vendor.js",
        );
    }

    #[test]
    fn skips_preload_link() {
        assert_no_imports(r#"<link rel="preload" href="./src/font.woff2" as="font" />"#);
    }

    #[test]
    fn skips_icon_link() {
        assert_no_imports(r#"<link rel="icon" href="./favicon.ico" />"#);
    }

    #[test]
    fn skips_remote_stylesheet() {
        assert_no_imports(r#"<link rel="stylesheet" href="https://fonts.googleapis.com/css" />"#);
    }

    // ── HTML comment stripping ───────────────────────────────────

    #[test]
    fn skips_commented_out_script() {
        assert_single_import(
            r#"<!-- <script src="./old.js"></script> -->
            <script src="./new.js"></script>"#,
            "./new.js",
        );
    }

    #[test]
    fn skips_commented_out_link() {
        assert_single_import(
            r#"<!-- <link rel="stylesheet" href="./old.css" /> -->
            <link rel="stylesheet" href="./new.css" />"#,
            "./new.css",
        );
    }

    // ── Multi-line attributes ────────────────────────────────────

    #[test]
    fn handles_multiline_script_tag() {
        assert_single_import(
            "<script\n  type=\"module\"\n  src=\"./src/entry.js\"\n></script>",
            "./src/entry.js",
        );
    }

    #[test]
    fn handles_multiline_link_tag() {
        assert_single_import(
            "<link\n  rel=\"stylesheet\"\n  href=\"./src/global.css\"\n/>",
            "./src/global.css",
        );
    }

    // ── Full HTML document ───────────────────────────────────────

    #[test]
    fn full_vite_html() {
        let module = parse(
            r#"<!doctype html>
<html>
  <head>
    <link rel="stylesheet" href="./src/global.css" />
    <link rel="icon" href="/favicon.ico" />
  </head>
  <body>
    <div id="app"></div>
    <script type="module" src="./src/entry.js"></script>
  </body>
</html>"#,
        );
        assert_eq!(module.imports.len(), 2);
        let sources: Vec<&str> = module.imports.iter().map(|i| i.source.as_str()).collect();
        assert!(sources.contains(&"./src/global.css"));
        assert!(sources.contains(&"./src/entry.js"));
    }

    // ── Edge cases ───────────────────────────────────────────────

    #[test]
    fn empty_html() {
        assert_no_imports("");
    }

    #[test]
    fn html_with_no_assets() {
        assert_no_imports(r"<!doctype html><html><body><h1>Hello</h1></body></html>");
    }

    #[test]
    fn single_quoted_attributes() {
        assert_single_import(r"<script src='./src/entry.js'></script>", "./src/entry.js");
    }

    #[test]
    fn all_imports_are_side_effect() {
        let module = parse(
            r#"<script src="./entry.js"></script>
            <link rel="stylesheet" href="./style.css" />"#,
        );
        for import in &module.imports {
            assert!(matches!(import.imported_name, ImportedName::SideEffect));
            assert!(import.local_name.is_empty());
            assert!(!import.is_type_only);
        }
    }

    #[test]
    fn suppression_comments_extracted() {
        let module = parse("<!-- fallow-ignore-file -->\n<script src=\"./entry.js\"></script>");
        // HTML comments use <!-- --> not //, so suppression parsing
        // from source text won't find standard JS-style comments.
        // This is expected — HTML suppression is not supported.
        assert_eq!(module.imports.len(), 1);
    }

    // ── Angular template scanning ──────────────────────────────

    #[test]
    fn angular_template_extracts_member_refs() {
        let module = parse(
            "<h1>{{ title() }}</h1>\n\
             <p [class.highlighted]=\"isHighlighted\">{{ greeting() }}</p>\n\
             <button (click)=\"onButtonClick()\">Toggle</button>",
        );
        // Only entries recorded against the template sentinel count here.
        for expected in ["title", "isHighlighted", "greeting", "onButtonClick"] {
            let present = module
                .member_accesses
                .iter()
                .any(|access| {
                    access.object == ANGULAR_TPL_SENTINEL && access.member.as_str() == expected
                });
            assert!(present, "should contain '{expected}'");
        }
    }

    #[test]
    fn plain_html_no_angular_refs() {
        let module = parse("<!doctype html><html><body><h1>Hello</h1></body></html>");
        assert!(module.member_accesses.is_empty());
    }
}