Skip to main content

fallow_extract/
html.rs

1//! HTML file parsing for script, stylesheet, and Angular template references.
2//!
3//! Extracts `<script src="...">` and `<link rel="stylesheet" href="...">` references
4//! from HTML files, creating graph edges so that referenced JS/CSS assets (and their
5//! transitive imports) are reachable from the HTML entry point.
6//!
7//! Also scans for Angular template syntax (`{{ }}`, `[prop]`, `(event)`, `@if`, etc.)
8//! and stores referenced identifiers as `MemberAccess` entries with a sentinel object,
9//! enabling the analysis phase to credit component class members used in external templates.
10
11use std::path::Path;
12use std::sync::LazyLock;
13
14use oxc_span::Span;
15
16use crate::asset_url::normalize_asset_url;
17use crate::sfc_template::angular::{self, ANGULAR_TPL_SENTINEL};
18use crate::{ImportInfo, ImportedName, MemberAccess, ModuleInfo};
19use fallow_types::discover::FileId;
20
21/// Regex to match HTML comments (`<!-- ... -->`) for stripping before extraction.
22static HTML_COMMENT_RE: LazyLock<regex::Regex> =
23    LazyLock::new(|| regex::Regex::new(r"(?s)<!--.*?-->").expect("valid regex"));
24
25/// Regex to extract `src` attribute from `<script>` tags.
26/// Matches both `<script src="...">` and `<script type="module" src="...">`.
27/// Uses `(?s)` so `.` matches newlines (multi-line attributes).
28static SCRIPT_SRC_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
29    regex::Regex::new(r#"(?si)<script\b(?:[^>"']|"[^"]*"|'[^']*')*?\bsrc\s*=\s*["']([^"']+)["']"#)
30        .expect("valid regex")
31});
32
33/// Regex to extract `href` attribute from `<link>` tags with `rel="stylesheet"` or
34/// `rel="modulepreload"`.
35/// Handles attributes in any order (rel before or after href).
36static LINK_HREF_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
37    regex::Regex::new(
38        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["'](?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["']"#,
39    )
40    .expect("valid regex")
41});
42
43/// Regex for the reverse attribute order: href before rel.
44static LINK_HREF_REVERSE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
45    regex::Regex::new(
46        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["'](?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["']"#,
47    )
48    .expect("valid regex")
49});
50
/// Reports whether `path` has the exact (case-sensitive) extension `html`.
///
/// Paths without an extension, or whose extension is not valid UTF-8, are not
/// considered HTML files.
// Keep in sync with fallow_core::analyze::predicates::is_html_file (crate boundary prevents sharing)
pub(crate) fn is_html_file(path: &Path) -> bool {
    let extension = path.extension().and_then(|ext| ext.to_str());
    matches!(extension, Some("html"))
}
58
/// Returns true if an HTML asset reference is a remote URL that should be skipped.
///
/// A reference counts as remote when it is protocol-relative (`//host/...`) or
/// starts with the `http://`, `https://` or `data:` scheme. URL schemes are
/// case-insensitive, so `HTTPS://...` and `Data:...` are recognised as well.
///
/// `src` is expected to be trimmed already; leading whitespace is not skipped.
pub(crate) fn is_remote_url(src: &str) -> bool {
    const REMOTE_SCHEME_PREFIXES: [&str; 3] = ["http://", "https://", "data:"];

    src.starts_with("//")
        || REMOTE_SCHEME_PREFIXES.iter().any(|prefix| {
            // `get` yields `None` when `src` is shorter than the prefix or the cut
            // would split a multi-byte character; neither case can match an
            // all-ASCII prefix.
            src.get(..prefix.len())
                .is_some_and(|head| head.eq_ignore_ascii_case(prefix))
        })
}
66
67/// Extract local (non-remote) asset references from HTML-like markup.
68///
69/// Returns the raw `src`/`href` strings (trimmed, remote URLs filtered). Shared
70/// between the HTML file parser and the JS/TS visitor's tagged template
71/// literal override so `` html`<script src="...">` `` in Hono/lit-html/htm
72/// layouts emits the same asset edges as a real `.html` file.
73pub(crate) fn collect_asset_refs(source: &str) -> Vec<String> {
74    let stripped = HTML_COMMENT_RE.replace_all(source, "");
75    let mut refs: Vec<String> = Vec::new();
76
77    for cap in SCRIPT_SRC_RE.captures_iter(&stripped) {
78        if let Some(m) = cap.get(1) {
79            let src = m.as_str().trim();
80            if !src.is_empty() && !is_remote_url(src) {
81                refs.push(src.to_string());
82            }
83        }
84    }
85
86    for cap in LINK_HREF_RE.captures_iter(&stripped) {
87        if let Some(m) = cap.get(2) {
88            let href = m.as_str().trim();
89            if !href.is_empty() && !is_remote_url(href) {
90                refs.push(href.to_string());
91            }
92        }
93    }
94    for cap in LINK_HREF_REVERSE_RE.captures_iter(&stripped) {
95        if let Some(m) = cap.get(1) {
96            let href = m.as_str().trim();
97            if !href.is_empty() && !is_remote_url(href) {
98                refs.push(href.to_string());
99            }
100        }
101    }
102
103    refs
104}
105
106/// Parse an HTML file, extracting script and stylesheet references as imports.
107pub(crate) fn parse_html_to_module(file_id: FileId, source: &str, content_hash: u64) -> ModuleInfo {
108    let suppressions = crate::suppress::parse_suppressions_from_source(source);
109
110    // Bare filenames (e.g., `src="app.js"`) are normalized to `./app.js` so
111    // the resolver doesn't misclassify them as npm packages.
112    let mut imports: Vec<ImportInfo> = collect_asset_refs(source)
113        .into_iter()
114        .map(|raw| ImportInfo {
115            source: normalize_asset_url(&raw),
116            imported_name: ImportedName::SideEffect,
117            local_name: String::new(),
118            is_type_only: false,
119            span: Span::default(),
120            source_span: Span::default(),
121        })
122        .collect();
123
124    // Deduplicate: the same asset may be referenced by both <script src> and
125    // <link rel="modulepreload" href> for the same path.
126    imports.sort_unstable_by(|a, b| a.source.cmp(&b.source));
127    imports.dedup_by(|a, b| a.source == b.source);
128
129    // Scan for Angular template syntax ({{ }}, [prop], (event), @if, etc.).
130    // Referenced identifiers are stored as MemberAccess entries with a sentinel
131    // object name so the analysis phase can bridge them to the component class.
132    let template_refs = angular::collect_angular_template_refs(source);
133    let member_accesses: Vec<MemberAccess> = template_refs
134        .into_iter()
135        .map(|name| MemberAccess {
136            object: ANGULAR_TPL_SENTINEL.to_string(),
137            member: name,
138        })
139        .collect();
140
141    ModuleInfo {
142        file_id,
143        exports: Vec::new(),
144        imports,
145        re_exports: Vec::new(),
146        dynamic_imports: Vec::new(),
147        dynamic_import_patterns: Vec::new(),
148        require_calls: Vec::new(),
149        member_accesses,
150        whole_object_uses: Vec::new(),
151        has_cjs_exports: false,
152        content_hash,
153        suppressions,
154        unused_import_bindings: Vec::new(),
155        line_offsets: fallow_types::extract::compute_line_offsets(source),
156        complexity: Vec::new(),
157        flag_uses: Vec::new(),
158        class_heritage: vec![],
159    }
160}
161
#[cfg(test)]
mod tests {
    use super::*;

    // ── Shared helpers ───────────────────────────────────────────

    /// Runs the HTML parser on `markup` with a fixed file id and content hash.
    fn parse(markup: &str) -> ModuleInfo {
        parse_html_to_module(FileId(0), markup, 0)
    }

    /// Asserts that `markup` yields exactly one import whose source is `expected`.
    fn assert_single_import(markup: &str, expected: &str) {
        let module = parse(markup);
        assert_eq!(module.imports.len(), 1);
        assert_eq!(module.imports[0].source, expected);
    }

    /// Asserts that `markup` yields no imports at all.
    fn assert_no_imports(markup: &str) {
        let module = parse(markup);
        assert!(module.imports.is_empty());
    }

    // ── is_html_file ─────────────────────────────────────────────

    #[test]
    fn is_html_file_html() {
        let path = Path::new("index.html");
        assert!(is_html_file(path));
    }

    #[test]
    fn is_html_file_nested() {
        let path = Path::new("pages/about.html");
        assert!(is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_htm() {
        let path = Path::new("index.htm");
        assert!(!is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_js() {
        let path = Path::new("app.js");
        assert!(!is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_ts() {
        let path = Path::new("app.ts");
        assert!(!is_html_file(path));
    }

    #[test]
    fn is_html_file_rejects_vue() {
        let path = Path::new("App.vue");
        assert!(!is_html_file(path));
    }

    // ── is_remote_url ────────────────────────────────────────────

    #[test]
    fn remote_url_http() {
        let url = "http://example.com/script.js";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_https() {
        let url = "https://cdn.example.com/style.css";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_protocol_relative() {
        let url = "//cdn.example.com/lib.js";
        assert!(is_remote_url(url));
    }

    #[test]
    fn remote_url_data() {
        let url = "data:text/javascript;base64,abc";
        assert!(is_remote_url(url));
    }

    #[test]
    fn local_relative_not_remote() {
        let url = "./src/entry.js";
        assert!(!is_remote_url(url));
    }

    #[test]
    fn local_root_relative_not_remote() {
        let url = "/src/entry.js";
        assert!(!is_remote_url(url));
    }

    // ── parse_html_to_module: script src extraction ──────────────

    #[test]
    fn extracts_module_script_src() {
        assert_single_import(
            r#"<script type="module" src="./src/entry.js"></script>"#,
            "./src/entry.js",
        );
    }

    #[test]
    fn extracts_plain_script_src() {
        assert_single_import(
            r#"<script src="./src/polyfills.js"></script>"#,
            "./src/polyfills.js",
        );
    }

    #[test]
    fn extracts_multiple_scripts() {
        let module = parse(
            r#"
            <script type="module" src="./src/entry.js"></script>
            <script src="./src/polyfills.js"></script>
            "#,
        );
        assert_eq!(module.imports.len(), 2);
    }

    #[test]
    fn skips_inline_script() {
        assert_no_imports(r#"<script>console.log("hello");</script>"#);
    }

    #[test]
    fn skips_remote_script() {
        assert_no_imports(r#"<script src="https://cdn.example.com/lib.js"></script>"#);
    }

    #[test]
    fn skips_protocol_relative_script() {
        assert_no_imports(r#"<script src="//cdn.example.com/lib.js"></script>"#);
    }

    // ── parse_html_to_module: link href extraction ───────────────

    #[test]
    fn extracts_stylesheet_link() {
        assert_single_import(
            r#"<link rel="stylesheet" href="./src/global.css" />"#,
            "./src/global.css",
        );
    }

    #[test]
    fn extracts_modulepreload_link() {
        assert_single_import(
            r#"<link rel="modulepreload" href="./src/vendor.js" />"#,
            "./src/vendor.js",
        );
    }

    #[test]
    fn extracts_link_with_reversed_attrs() {
        assert_single_import(
            r#"<link href="./src/global.css" rel="stylesheet" />"#,
            "./src/global.css",
        );
    }

    // ── Bare asset references normalized to relative paths ──────
    // Regression tests for the same class of bug as #99 (Angular templateUrl).
    // Browsers resolve `src="app.js"` and `href="styles.css"` relative to the
    // HTML file, so emitting these as bare specifiers would misclassify them
    // as unlisted npm packages.

    #[test]
    fn bare_script_src_normalized_to_relative() {
        assert_single_import(r#"<script src="app.js"></script>"#, "./app.js");
    }

    #[test]
    fn bare_module_script_src_normalized_to_relative() {
        assert_single_import(
            r#"<script type="module" src="main.ts"></script>"#,
            "./main.ts",
        );
    }

    #[test]
    fn bare_stylesheet_link_href_normalized_to_relative() {
        assert_single_import(
            r#"<link rel="stylesheet" href="styles.css" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_link_href_reversed_attrs_normalized_to_relative() {
        assert_single_import(
            r#"<link href="styles.css" rel="stylesheet" />"#,
            "./styles.css",
        );
    }

    #[test]
    fn bare_modulepreload_link_href_normalized_to_relative() {
        assert_single_import(
            r#"<link rel="modulepreload" href="vendor.js" />"#,
            "./vendor.js",
        );
    }

    #[test]
    fn bare_asset_with_subdir_normalized_to_relative() {
        assert_single_import(
            r#"<script src="assets/app.js"></script>"#,
            "./assets/app.js",
        );
    }

    #[test]
    fn root_absolute_script_src_unchanged() {
        // `/src/main.ts` is a web convention (Vite root-relative) and must
        // stay absolute so the resolver's HTML special case applies.
        assert_single_import(r#"<script src="/src/main.ts"></script>"#, "/src/main.ts");
    }

    #[test]
    fn parent_relative_script_src_unchanged() {
        assert_single_import(
            r#"<script src="../shared/vendor.js"></script>"#,
            "../shared/vendor.js",
        );
    }

    #[test]
    fn skips_preload_link() {
        assert_no_imports(r#"<link rel="preload" href="./src/font.woff2" as="font" />"#);
    }

    #[test]
    fn skips_icon_link() {
        assert_no_imports(r#"<link rel="icon" href="./favicon.ico" />"#);
    }

    #[test]
    fn skips_remote_stylesheet() {
        assert_no_imports(r#"<link rel="stylesheet" href="https://fonts.googleapis.com/css" />"#);
    }

    // ── HTML comment stripping ───────────────────────────────────

    #[test]
    fn skips_commented_out_script() {
        assert_single_import(
            r#"<!-- <script src="./old.js"></script> -->
            <script src="./new.js"></script>"#,
            "./new.js",
        );
    }

    #[test]
    fn skips_commented_out_link() {
        assert_single_import(
            r#"<!-- <link rel="stylesheet" href="./old.css" /> -->
            <link rel="stylesheet" href="./new.css" />"#,
            "./new.css",
        );
    }

    // ── Multi-line attributes ────────────────────────────────────

    #[test]
    fn handles_multiline_script_tag() {
        assert_single_import(
            "<script\n  type=\"module\"\n  src=\"./src/entry.js\"\n></script>",
            "./src/entry.js",
        );
    }

    #[test]
    fn handles_multiline_link_tag() {
        assert_single_import(
            "<link\n  rel=\"stylesheet\"\n  href=\"./src/global.css\"\n/>",
            "./src/global.css",
        );
    }

    // ── Full HTML document ───────────────────────────────────────

    #[test]
    fn full_vite_html() {
        let module = parse(
            r#"<!doctype html>
<html>
  <head>
    <link rel="stylesheet" href="./src/global.css" />
    <link rel="icon" href="/favicon.ico" />
  </head>
  <body>
    <div id="app"></div>
    <script type="module" src="./src/entry.js"></script>
  </body>
</html>"#,
        );
        assert_eq!(module.imports.len(), 2);
        for expected in ["./src/global.css", "./src/entry.js"] {
            assert!(
                module
                    .imports
                    .iter()
                    .any(|imp| imp.source.as_str() == expected)
            );
        }
    }

    // ── Edge cases ───────────────────────────────────────────────

    #[test]
    fn empty_html() {
        assert_no_imports("");
    }

    #[test]
    fn html_with_no_assets() {
        assert_no_imports(r"<!doctype html><html><body><h1>Hello</h1></body></html>");
    }

    #[test]
    fn single_quoted_attributes() {
        assert_single_import(r"<script src='./src/entry.js'></script>", "./src/entry.js");
    }

    #[test]
    fn all_imports_are_side_effect() {
        let module = parse(
            r#"<script src="./entry.js"></script>
            <link rel="stylesheet" href="./style.css" />"#,
        );
        for import in &module.imports {
            assert!(matches!(import.imported_name, ImportedName::SideEffect));
            assert!(import.local_name.is_empty());
            assert!(!import.is_type_only);
        }
    }

    #[test]
    fn suppression_comments_extracted() {
        let module = parse("<!-- fallow-ignore-file -->\n<script src=\"./entry.js\"></script>");
        // HTML comments use <!-- --> not //, so suppression parsing
        // from source text won't find standard JS-style comments.
        // This is expected — HTML suppression is not supported.
        assert_eq!(module.imports.len(), 1);
    }

    // ── Angular template scanning ──────────────────────────────

    #[test]
    fn angular_template_extracts_member_refs() {
        let module = parse(
            "<h1>{{ title() }}</h1>\n\
             <p [class.highlighted]=\"isHighlighted\">{{ greeting() }}</p>\n\
             <button (click)=\"onButtonClick()\">Toggle</button>",
        );
        let names: rustc_hash::FxHashSet<&str> = module
            .member_accesses
            .iter()
            .filter(|access| access.object == ANGULAR_TPL_SENTINEL)
            .map(|access| access.member.as_str())
            .collect();
        for expected in ["title", "isHighlighted", "greeting", "onButtonClick"] {
            assert!(names.contains(expected), "should contain '{expected}'");
        }
    }

    #[test]
    fn plain_html_no_angular_refs() {
        let module = parse("<!doctype html><html><body><h1>Hello</h1></body></html>");
        assert!(module.member_accesses.is_empty());
    }
}