Skip to main content

lychee_lib/extract/
css.rs

1//! Extract URLs from CSS content
2//!
3//! This module extracts URLs from CSS files and `<style>` tags.
4//! It looks for `url()` functions which are commonly used for:
5//! - background-image
6//! - background
7//! - @import statements
8//! - font-face src
9//! - etc.
10// NOTE: this is a regular-expression based extractor and may not cover all edge
11// cases of CSS parsing. Specifically, it does not handle escape sequences
12// within URLs or nested functions, such as `url("image\"name.png")`.
13//
14// A more bespoke CSS parser, such as Servo's
15// [cssparser](https://github.com/servo/rust-cssparser) crate might or might not
16// cover these cases better, but it would come with the additional burden of
17// adding multiple dependencies.
18//
19// For the time being, we accept these limitations, but we may revisit this
20// decision in the future if needed.
21
22use std::sync::LazyLock;
23
24use regex::Regex;
25
26use crate::types::uri::raw::{RawUri, SourceSpanProvider, SpanProvider};
27
28/// Regular expression to match CSS `url()` functions
29///
30/// This regex matches:
31/// - url("...")
32/// - url('...')
33/// - url(...)
34///
35/// It captures the URL inside the parentheses, handling:
36/// - Single quotes
37/// - Double quotes
38/// - No quotes
39/// - Escaped quotes within the URL
40///
41/// Examples:
42/// - `background-image: url("./image.png");`
43/// - `background: url('/path/to/image.jpg');`
44/// - `@import url(https://example.com/style.css);`
45/// - `src: url(../fonts/font.woff2);`
46static CSS_URL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
47    Regex::new(
48        r#"(?x)                     # Enable extended mode for whitespace and comments
49        url\s*\(                    # Match 'url(' with optional whitespace
50        \s*                         # Optional whitespace
51        (?:                         # Non-capturing group for the URL
52            "(?P<double>[^"]*)"     # Double-quoted URL
53            |                       # OR
54            '(?P<single>[^']*)'     # Single-quoted URL
55            |                       # OR
56            (?P<unquoted>[^)]+)     # Unquoted URL (anything until ')')
57        )
58        \s*                         # Optional whitespace
59        \)                          # Match closing ')'
60        "#,
61    )
62    .expect("CSS URL regex should be valid")
63});
64
65/// Extract all URLs from CSS content
66///
67/// This function finds all `url()` occurrences in CSS and extracts the URLs.
68///
69/// # Arguments
70///
71/// * `input` - The CSS content to extract URLs from
72/// * `span_provider` - Provides source location information for extracted URLs
73///
74/// # Returns
75///
76/// A vector of `RawUri` objects representing the extracted URLs
77///
78/// # Examples
79///
80/// CSS input:
81/// ```css
82/// .example {
83///     background-image: url("./image.png");
84///     background: url('/absolute/path.jpg');
85/// }
86/// @import url(https://example.com/style.css);
87/// ```
88///
89/// Extracts 3 URLs: `./image.png`, `/absolute/path.jpg`, and `https://example.com/style.css`
90pub(crate) fn extract_css<S: SpanProvider>(input: &str, span_provider: &S) -> Vec<RawUri> {
91    CSS_URL_REGEX
92        .captures_iter(input)
93        .filter_map(|cap| {
94            // Try to extract the URL from any of the three capture groups
95            let url = cap
96                .name("double")
97                .or_else(|| cap.name("single"))
98                .or_else(|| cap.name("unquoted"))
99                .map(|m| m.as_str().trim())?;
100
101            // Skip empty URLs. Example input: `url("")`
102            if url.is_empty() {
103                return None;
104            }
105
106            // Get the position of the entire match (for span information)
107            let match_start = cap.get(0)?.start();
108
109            Some(RawUri {
110                text: url.to_string(),
111                element: Some("style".to_string()),
112                attribute: Some("url".to_string()),
113                span: span_provider.span(match_start),
114            })
115        })
116        .collect()
117}
118
119/// Extract URLs from CSS content with default span
120pub(crate) fn extract_css_with_default_span(input: &str) -> Vec<RawUri> {
121    extract_css(input, &SourceSpanProvider::from_input(input))
122}
123
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the extractor over `css` and returns the text of every extracted
    /// URL, in extraction order.
    ///
    /// Comparing this vector against an expected list checks both the number
    /// of extracted URLs and their contents in one assertion.
    fn url_texts(css: &str) -> Vec<String> {
        extract_css_with_default_span(css)
            .iter()
            .map(|uri| uri.text.clone())
            .collect()
    }

    // Tests based on MDN documentation:
    // https://developer.mozilla.org/en-US/docs/Web/CSS/url_function

    // Basic usage examples

    #[test]
    fn test_basic_usage_double_quotes() {
        let found = url_texts(r#"url("https://example.com/images/myImg.jpg");"#);
        assert_eq!(found, ["https://example.com/images/myImg.jpg"]);
    }

    #[test]
    fn test_basic_usage_single_quotes() {
        let found = url_texts(r"url('https://example.com/images/myImg.jpg');");
        assert_eq!(found, ["https://example.com/images/myImg.jpg"]);
    }

    #[test]
    fn test_basic_usage_unquoted() {
        let found = url_texts(r"url(https://example.com/images/myImg.jpg);");
        assert_eq!(found, ["https://example.com/images/myImg.jpg"]);
    }

    #[test]
    fn test_data_url() {
        let found = url_texts(r#"url("data:image/jpeg;base64,iRxVB0…");"#);
        assert_eq!(found.len(), 1);
        assert!(found[0].starts_with("data:image/jpeg"));
    }

    #[test]
    fn test_relative_url() {
        let found = url_texts(r"url(myImg.jpg);");
        assert_eq!(found, ["myImg.jpg"]);
    }

    #[test]
    fn test_svg_fragment() {
        let found = url_texts(r"url(#IDofSVGpath);");
        assert_eq!(found, ["#IDofSVGpath"]);
    }

    // Associated properties

    #[test]
    fn test_background_image() {
        let found = url_texts(r#"background-image: url("star.gif");"#);
        assert_eq!(found, ["star.gif"]);
    }

    #[test]
    fn test_list_style_image() {
        let found = url_texts(r"list-style-image: url('../images/bullet.jpg');");
        assert_eq!(found, ["../images/bullet.jpg"]);
    }

    #[test]
    fn test_content_property() {
        let found = url_texts(r#"content: url("my-icon.jpg");"#);
        assert_eq!(found, ["my-icon.jpg"]);
    }

    #[test]
    fn test_cursor_property() {
        let found = url_texts(r"cursor: url(my-cursor.cur);");
        assert_eq!(found, ["my-cursor.cur"]);
    }

    #[test]
    fn test_border_image_source() {
        let found = url_texts(r"border-image-source: url(/media/diamonds.png);");
        assert_eq!(found, ["/media/diamonds.png"]);
    }

    #[test]
    fn test_font_src() {
        let found = url_texts(r"src: url('fantastic-font.woff');");
        assert_eq!(found, ["fantastic-font.woff"]);
    }

    #[test]
    fn test_offset_path() {
        let found = url_texts(r"offset-path: url(#path);");
        assert_eq!(found, ["#path"]);
    }

    #[test]
    fn test_mask_image_with_fragment() {
        let found = url_texts(r#"mask-image: url("masks.svg#mask1");"#);
        assert_eq!(found, ["masks.svg#mask1"]);
    }

    // Properties with fallbacks

    #[test]
    fn test_cursor_with_fallback() {
        let found = url_texts(r"cursor: url(pointer.cur), pointer;");
        assert_eq!(found, ["pointer.cur"]);
    }

    // Shorthand properties

    #[test]
    fn test_background_shorthand() {
        let found = url_texts(r"background: url('star.gif') bottom right repeat-x blue;");
        assert_eq!(found, ["star.gif"]);
    }

    #[test]
    fn test_border_image_shorthand() {
        let found =
            url_texts(r#"border-image: url("/media/diamonds.png") 30 fill / 30px / 30px space;"#);
        assert_eq!(found, ["/media/diamonds.png"]);
    }

    // As parameter in CSS functions

    #[test]
    fn test_cross_fade_function() {
        let found =
            url_texts(r"background-image: cross-fade(20% url(first.png), url(second.png));");
        assert_eq!(found, ["first.png", "second.png"]);
    }

    #[test]
    fn test_image_function() {
        let found = url_texts(
            r"mask-image: image(url(mask.png), skyblue, linear-gradient(black, transparent));",
        );
        assert_eq!(found, ["mask.png"]);
    }

    // Multiple values

    #[test]
    fn test_multiple_urls_in_content() {
        let found = url_texts(
            r"content: url(star.svg) url(star.svg) url(star.svg) url(star.svg) url(star.svg);",
        );
        // Five occurrences in, five identical entries out
        assert_eq!(found, ["star.svg"; 5]);
    }

    // At-rules

    #[test]
    fn test_document_rule() {
        let found = url_texts(r#"@document url("https://www.example.com/") { }"#);
        assert_eq!(found, ["https://www.example.com/"]);
    }

    #[test]
    fn test_import_rule() {
        let found = url_texts(r#"@import url("https://www.example.com/style.css");"#);
        assert_eq!(found, ["https://www.example.com/style.css"]);
    }

    #[test]
    fn test_namespace_rule() {
        let found = url_texts(r"@namespace url(http://www.w3.org/1999/xhtml);");
        assert_eq!(found, ["http://www.w3.org/1999/xhtml"]);
    }

    // Complex real-world examples

    #[test]
    fn test_data_url_svg_embedded() {
        // The embedded SVG contains single quotes, which must not terminate
        // the double-quoted URL early.
        let found = url_texts(
            r#"background: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='90' height='45'%3E%3Cpath d='M10 10h60' stroke='%2300F' stroke-width='5'/%3E%3Cpath d='M10 20h60' stroke='%230F0' stroke-width='5'/%3E%3Cpath d='M10 30h60' stroke='red' stroke-width='5'/%3E%3C/svg%3E");"#,
        );
        assert_eq!(found.len(), 1);
        let data_url = &found[0];
        assert!(data_url.starts_with("data:image/svg+xml"));
        assert!(data_url.contains("%3Csvg"));
    }

    #[test]
    fn test_filter_svg_file() {
        let found = url_texts(r#"filter: url("my-file.svg#svg-blur");"#);
        assert_eq!(found, ["my-file.svg#svg-blur"]);
    }

    #[test]
    fn test_filter_svg_inline() {
        let found = url_texts(r##"filter: url("#svg-blur");"##);
        assert_eq!(found, ["#svg-blur"]);
    }

    #[test]
    fn test_extract_multiple_urls() {
        let css = r#"
        .example {
            background-image: url("./image.png");
            background: url('/absolute/path.jpg');
        }
        @import url(https://example.com/style.css);
        @font-face {
            src: url(../fonts/font.woff2);
        }
        "#;
        assert_eq!(
            url_texts(css),
            [
                "./image.png",
                "/absolute/path.jpg",
                "https://example.com/style.css",
                "../fonts/font.woff2",
            ]
        );
    }

    #[test]
    fn test_extract_url_with_spaces() {
        let found = url_texts(r#"background: url(  "./image.png"  );"#);
        assert_eq!(found, ["./image.png"]);
    }

    #[test]
    fn test_empty_url() {
        // Empty URLs should be skipped
        let found = url_texts(r#"background: url("");"#);
        assert!(found.is_empty());
    }

    #[test]
    fn test_no_urls() {
        let css = r"
        .example {
            color: red;
            font-size: 16px;
        }
        ";
        assert!(url_texts(css).is_empty());
    }

    #[test]
    fn test_url_in_style_tag_content() {
        // This simulates content that would be inside a <style> tag in HTML
        let css = r#"
        div {
            background-image: url("./lychee.png");
        }
        "#;
        assert_eq!(url_texts(css), ["./lychee.png"]);
    }

    #[test]
    fn test_data_url_is_extracted() {
        // Data URLs should still be extracted (even though they might be filtered later)
        let found = url_texts(
            r#"background: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==");"#,
        );
        assert_eq!(found.len(), 1);
        assert!(found[0].starts_with("data:image/png"));
    }

    #[test]
    fn test_element_and_attribute_metadata() {
        let uris = extract_css_with_default_span(r#"background: url("./image.png");"#);
        assert_eq!(uris.len(), 1);

        let uri = &uris[0];
        assert_eq!(uri.element.as_deref(), Some("style"));
        assert_eq!(uri.attribute.as_deref(), Some("url"));
    }
}