Skip to main content

fallow_extract/
html.rs

//! HTML file parsing for script and stylesheet asset references.
//!
//! Extracts `<script src="...">`, `<link rel="stylesheet" href="...">` and
//! `<link rel="modulepreload" href="...">` references from HTML files, creating graph
//! edges so that referenced JS/CSS assets (and their transitive imports) are reachable
//! from the HTML entry point.
6
7use std::path::Path;
8use std::sync::LazyLock;
9
10use oxc_span::Span;
11
12use crate::{ImportInfo, ImportedName, ModuleInfo};
13use fallow_types::discover::FileId;
14
15/// Regex to match HTML comments (`<!-- ... -->`) for stripping before extraction.
16static HTML_COMMENT_RE: LazyLock<regex::Regex> =
17    LazyLock::new(|| regex::Regex::new(r"(?s)<!--.*?-->").expect("valid regex"));
18
19/// Regex to extract `src` attribute from `<script>` tags.
20/// Matches both `<script src="...">` and `<script type="module" src="...">`.
21/// Uses `(?s)` so `.` matches newlines (multi-line attributes).
22static SCRIPT_SRC_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
23    regex::Regex::new(r#"(?si)<script\b(?:[^>"']|"[^"]*"|'[^']*')*?\bsrc\s*=\s*["']([^"']+)["']"#)
24        .expect("valid regex")
25});
26
27/// Regex to extract `href` attribute from `<link>` tags with `rel="stylesheet"` or
28/// `rel="modulepreload"`.
29/// Handles attributes in any order (rel before or after href).
30static LINK_HREF_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
31    regex::Regex::new(
32        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["'](?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["']"#,
33    )
34    .expect("valid regex")
35});
36
37/// Regex for the reverse attribute order: href before rel.
38static LINK_HREF_REVERSE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
39    regex::Regex::new(
40        r#"(?si)<link\b(?:[^>"']|"[^"]*"|'[^']*')*?\bhref\s*=\s*["']([^"']+)["'](?:[^>"']|"[^"]*"|'[^']*')*?\brel\s*=\s*["'](stylesheet|modulepreload)["']"#,
41    )
42    .expect("valid regex")
43});
44
/// Reports whether `path` names an HTML file, i.e. its extension is exactly
/// `html` (case-sensitive; `.htm` is not accepted).
// Keep in sync with fallow_core::analyze::predicates::is_html_file (crate boundary prevents sharing)
pub(crate) fn is_html_file(path: &Path) -> bool {
    matches!(path.extension().and_then(|ext| ext.to_str()), Some("html"))
}
52
/// Returns true if an HTML asset reference is a remote URL that should be skipped.
///
/// A reference is remote when it starts with `http://`, `https://`, the
/// protocol-relative marker `//`, or the `data:` scheme. URI schemes are
/// case-insensitive, so the prefixes are compared ignoring ASCII case
/// (`HTTPS://…` and `Data:…` are remote as well).
fn is_remote_url(src: &str) -> bool {
    const REMOTE_PREFIXES: [&str; 4] = ["http://", "https://", "//", "data:"];

    // Compare raw bytes: slicing the `&str` at `prefix.len()` could land inside
    // a multi-byte character and panic, whereas a byte slice cannot.
    let bytes = src.as_bytes();
    REMOTE_PREFIXES.iter().any(|prefix| {
        bytes
            .get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    })
}
60
61/// Parse an HTML file, extracting script and stylesheet references as imports.
62pub(crate) fn parse_html_to_module(file_id: FileId, source: &str, content_hash: u64) -> ModuleInfo {
63    let suppressions = crate::suppress::parse_suppressions_from_source(source);
64
65    // Strip HTML comments before matching to avoid false positives.
66    let stripped = HTML_COMMENT_RE.replace_all(source, "");
67
68    let mut imports = Vec::new();
69
70    // Extract <script src="..."> references
71    for cap in SCRIPT_SRC_RE.captures_iter(&stripped) {
72        if let Some(m) = cap.get(1) {
73            let src = m.as_str().trim();
74            if !src.is_empty() && !is_remote_url(src) {
75                imports.push(ImportInfo {
76                    source: src.to_string(),
77                    imported_name: ImportedName::SideEffect,
78                    local_name: String::new(),
79                    is_type_only: false,
80                    span: Span::default(),
81                    source_span: Span::default(),
82                });
83            }
84        }
85    }
86
87    // Extract <link rel="stylesheet" href="..."> and <link rel="modulepreload" href="...">
88    // Handle both attribute orders: rel before href, and href before rel.
89    for cap in LINK_HREF_RE.captures_iter(&stripped) {
90        if let Some(m) = cap.get(2) {
91            let href = m.as_str().trim();
92            if !href.is_empty() && !is_remote_url(href) {
93                imports.push(ImportInfo {
94                    source: href.to_string(),
95                    imported_name: ImportedName::SideEffect,
96                    local_name: String::new(),
97                    is_type_only: false,
98                    span: Span::default(),
99                    source_span: Span::default(),
100                });
101            }
102        }
103    }
104    for cap in LINK_HREF_REVERSE_RE.captures_iter(&stripped) {
105        if let Some(m) = cap.get(1) {
106            let href = m.as_str().trim();
107            if !href.is_empty() && !is_remote_url(href) {
108                imports.push(ImportInfo {
109                    source: href.to_string(),
110                    imported_name: ImportedName::SideEffect,
111                    local_name: String::new(),
112                    is_type_only: false,
113                    span: Span::default(),
114                    source_span: Span::default(),
115                });
116            }
117        }
118    }
119
120    // Deduplicate: the same asset may be referenced by both <script src> and
121    // <link rel="modulepreload" href> for the same path.
122    imports.sort_unstable_by(|a, b| a.source.cmp(&b.source));
123    imports.dedup_by(|a, b| a.source == b.source);
124
125    ModuleInfo {
126        file_id,
127        exports: Vec::new(),
128        imports,
129        re_exports: Vec::new(),
130        dynamic_imports: Vec::new(),
131        dynamic_import_patterns: Vec::new(),
132        require_calls: Vec::new(),
133        member_accesses: Vec::new(),
134        whole_object_uses: Vec::new(),
135        has_cjs_exports: false,
136        content_hash,
137        suppressions,
138        unused_import_bindings: Vec::new(),
139        line_offsets: fallow_types::extract::compute_line_offsets(source),
140        complexity: Vec::new(),
141    }
142}
143
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the parser on `html` with a placeholder file id and content hash.
    fn parse(html: &str) -> ModuleInfo {
        parse_html_to_module(FileId(0), html, 0)
    }

    /// Collects the import sources produced for `html`, in the order returned.
    fn sources(html: &str) -> Vec<String> {
        parse(html)
            .imports
            .iter()
            .map(|import| import.source.clone())
            .collect()
    }

    /// Shorthand for calling `is_html_file` with a string path.
    fn html_path(path: &str) -> bool {
        is_html_file(Path::new(path))
    }

    // ── is_html_file ─────────────────────────────────────────────

    #[test]
    fn is_html_file_html() {
        assert!(html_path("index.html"));
    }

    #[test]
    fn is_html_file_nested() {
        assert!(html_path("pages/about.html"));
    }

    #[test]
    fn is_html_file_rejects_htm() {
        assert!(!html_path("index.htm"));
    }

    #[test]
    fn is_html_file_rejects_js() {
        assert!(!html_path("app.js"));
    }

    #[test]
    fn is_html_file_rejects_ts() {
        assert!(!html_path("app.ts"));
    }

    #[test]
    fn is_html_file_rejects_vue() {
        assert!(!html_path("App.vue"));
    }

    // ── is_remote_url ────────────────────────────────────────────

    #[test]
    fn remote_url_http() {
        assert!(is_remote_url("http://example.com/script.js"));
    }

    #[test]
    fn remote_url_https() {
        assert!(is_remote_url("https://cdn.example.com/style.css"));
    }

    #[test]
    fn remote_url_protocol_relative() {
        assert!(is_remote_url("//cdn.example.com/lib.js"));
    }

    #[test]
    fn remote_url_data() {
        assert!(is_remote_url("data:text/javascript;base64,abc"));
    }

    #[test]
    fn local_relative_not_remote() {
        assert!(!is_remote_url("./src/entry.js"));
    }

    #[test]
    fn local_root_relative_not_remote() {
        assert!(!is_remote_url("/src/entry.js"));
    }

    // ── parse_html_to_module: script src extraction ──────────────

    #[test]
    fn extracts_module_script_src() {
        let found = sources(r#"<script type="module" src="./src/entry.js"></script>"#);
        assert_eq!(found, ["./src/entry.js"]);
    }

    #[test]
    fn extracts_plain_script_src() {
        let found = sources(r#"<script src="./src/polyfills.js"></script>"#);
        assert_eq!(found, ["./src/polyfills.js"]);
    }

    #[test]
    fn extracts_multiple_scripts() {
        let found = sources(
            r#"
            <script type="module" src="./src/entry.js"></script>
            <script src="./src/polyfills.js"></script>
            "#,
        );
        assert_eq!(found.len(), 2);
    }

    #[test]
    fn skips_inline_script() {
        assert!(sources(r#"<script>console.log("hello");</script>"#).is_empty());
    }

    #[test]
    fn skips_remote_script() {
        assert!(sources(r#"<script src="https://cdn.example.com/lib.js"></script>"#).is_empty());
    }

    #[test]
    fn skips_protocol_relative_script() {
        assert!(sources(r#"<script src="//cdn.example.com/lib.js"></script>"#).is_empty());
    }

    // ── parse_html_to_module: link href extraction ───────────────

    #[test]
    fn extracts_stylesheet_link() {
        let found = sources(r#"<link rel="stylesheet" href="./src/global.css" />"#);
        assert_eq!(found, ["./src/global.css"]);
    }

    #[test]
    fn extracts_modulepreload_link() {
        let found = sources(r#"<link rel="modulepreload" href="./src/vendor.js" />"#);
        assert_eq!(found, ["./src/vendor.js"]);
    }

    #[test]
    fn extracts_link_with_reversed_attrs() {
        let found = sources(r#"<link href="./src/global.css" rel="stylesheet" />"#);
        assert_eq!(found, ["./src/global.css"]);
    }

    #[test]
    fn skips_preload_link() {
        assert!(sources(r#"<link rel="preload" href="./src/font.woff2" as="font" />"#).is_empty());
    }

    #[test]
    fn skips_icon_link() {
        assert!(sources(r#"<link rel="icon" href="./favicon.ico" />"#).is_empty());
    }

    #[test]
    fn skips_remote_stylesheet() {
        let found = sources(r#"<link rel="stylesheet" href="https://fonts.googleapis.com/css" />"#);
        assert!(found.is_empty());
    }

    // ── HTML comment stripping ───────────────────────────────────

    #[test]
    fn skips_commented_out_script() {
        let found = sources(
            r#"<!-- <script src="./old.js"></script> -->
            <script src="./new.js"></script>"#,
        );
        assert_eq!(found, ["./new.js"]);
    }

    #[test]
    fn skips_commented_out_link() {
        let found = sources(
            r#"<!-- <link rel="stylesheet" href="./old.css" /> -->
            <link rel="stylesheet" href="./new.css" />"#,
        );
        assert_eq!(found, ["./new.css"]);
    }

    // ── Multi-line attributes ────────────────────────────────────

    #[test]
    fn handles_multiline_script_tag() {
        let found = sources("<script\n  type=\"module\"\n  src=\"./src/entry.js\"\n></script>");
        assert_eq!(found, ["./src/entry.js"]);
    }

    #[test]
    fn handles_multiline_link_tag() {
        let found = sources("<link\n  rel=\"stylesheet\"\n  href=\"./src/global.css\"\n/>");
        assert_eq!(found, ["./src/global.css"]);
    }

    // ── Full HTML document ───────────────────────────────────────

    #[test]
    fn full_vite_html() {
        let found = sources(
            r#"<!doctype html>
<html>
  <head>
    <link rel="stylesheet" href="./src/global.css" />
    <link rel="icon" href="/favicon.ico" />
  </head>
  <body>
    <div id="app"></div>
    <script type="module" src="./src/entry.js"></script>
  </body>
</html>"#,
        );
        assert_eq!(found.len(), 2);
        for expected in ["./src/global.css", "./src/entry.js"] {
            assert!(found.iter().any(|source| source == expected));
        }
    }

    // ── Edge cases ───────────────────────────────────────────────

    #[test]
    fn empty_html() {
        assert!(sources("").is_empty());
    }

    #[test]
    fn html_with_no_assets() {
        let found = sources(r"<!doctype html><html><body><h1>Hello</h1></body></html>");
        assert!(found.is_empty());
    }

    #[test]
    fn single_quoted_attributes() {
        let found = sources(r"<script src='./src/entry.js'></script>");
        assert_eq!(found, ["./src/entry.js"]);
    }

    #[test]
    fn all_imports_are_side_effect() {
        let info = parse(
            r#"<script src="./entry.js"></script>
            <link rel="stylesheet" href="./style.css" />"#,
        );
        assert!(info.imports.iter().all(|import| {
            matches!(import.imported_name, ImportedName::SideEffect)
                && import.local_name.is_empty()
                && !import.is_type_only
        }));
    }

    #[test]
    fn suppression_comments_extracted() {
        let info = parse("<!-- fallow-ignore-file -->\n<script src=\"./entry.js\"></script>");
        // Only checks that a suppression-style HTML comment does not stop the
        // script from being extracted. Per the original note, suppressions
        // written as `<!-- -->` are not expected to be recognised, because the
        // suppression parser looks for JS-style `//` comments.
        assert_eq!(info.imports.len(), 1);
    }
}