html_generator/
utils.rs

1// Copyright © 2025 HTML Generator. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! Utility functions for HTML and Markdown processing.
5//!
6//! This module provides various utility functions for tasks such as
7//! extracting front matter from Markdown content and formatting HTML headers.
8
9use crate::error::{HtmlError, Result};
10use once_cell::sync::Lazy;
11use regex::Regex;
12use scraper::ElementRef;
13use std::collections::HashMap;
14
/// Matches a front matter block delimited by two `---` lines.
///
/// Capture group 1 holds the text between the opening and closing
/// delimiter lines. The `s` flag lets `.` span newlines so the body may
/// cover several lines; the `m` flag makes `^` match at the start of any
/// line, so a match is not guaranteed to begin at offset 0 of the input.
/// The closing delimiter must be followed by a newline.
static FRONT_MATTER_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?ms)^---\s*\n(.*?)\n---\s*\n")
        .expect("Failed to compile FRONT_MATTER_REGEX")
});
19
/// Matches an `<h1>`..`<h6>` element, with optional attributes.
///
/// Capture group 1 is the opening tag name (`h1`..`h6`) and group 2 is
/// the non-empty inner content, matched lazily. `.` does not match
/// newlines here, so the content must sit on a single line. The closing
/// tag level is matched independently of the opening one.
static HEADER_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"<(h[1-6])(?:\s[^>]*)?>(.+?)</h[1-6]>")
        .expect("Failed to compile HEADER_REGEX")
});
24
/// Matches runs of two or more hyphens so they can be collapsed into one.
static CONSECUTIVE_HYPHENS_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"-{2,}")
        .expect("Failed to compile CONSECUTIVE_HYPHENS_REGEX")
});
29
/// Maximum allowed input size (in bytes), used to reject oversized input
/// and so limit exposure to denial-of-service (DoS) attacks.
const MAX_INPUT_SIZE: usize = 1_000_000; // 1 MB
32
33/// Extracts front matter from Markdown content.
34///
35/// # Arguments
36///
37/// * `content` - A string slice that holds the content to process.
38///
39/// # Returns
40///
41/// * `Result<String>` - The content with front matter removed, or an error.
42///
43/// # Errors
44///
45/// This function will return an error if:
46/// * The input is empty or exceeds the maximum allowed size.
47/// * The front matter is invalidly formatted.
48///
49/// # Examples
50///
51/// ```
52/// use html_generator::utils::extract_front_matter;
53///
54/// let content = "---\ntitle: My Page\n---\n# Hello, world!\n\nThis is a test.";
55/// let result = extract_front_matter(content).unwrap();
56/// assert_eq!(result, "# Hello, world!\n\nThis is a test.");
57/// ```
58pub fn extract_front_matter(content: &str) -> Result<String> {
59    if content.is_empty() {
60        return Err(HtmlError::InvalidInput("Empty input".to_string()));
61    }
62    if content.len() > MAX_INPUT_SIZE {
63        return Err(HtmlError::InputTooLarge(content.len()));
64    }
65
66    if content.starts_with("---") {
67        if let Some(captures) = FRONT_MATTER_REGEX.captures(content) {
68            let front_matter = captures
69                .get(1)
70                .ok_or_else(|| {
71                    HtmlError::InvalidFrontMatterFormat(
72                        "Missing front matter match".to_string(),
73                    )
74                })?
75                .as_str();
76
77            for line in front_matter.lines() {
78                if !line.trim().contains(':') {
79                    return Err(HtmlError::InvalidFrontMatterFormat(
80                        format!(
81                            "Invalid line in front matter: {}",
82                            line
83                        ),
84                    ));
85                }
86            }
87
88            let remaining_content =
89                &content[captures.get(0).unwrap().end()..];
90            Ok(remaining_content.trim().to_string())
91        } else {
92            Err(HtmlError::InvalidFrontMatterFormat(
93                "Invalid front matter format".to_string(),
94            ))
95        }
96    } else {
97        Ok(content.to_string())
98    }
99}
100
101/// Formats a header with an ID and class.
102///
103/// # Arguments
104///
105/// * `header` - A string slice that holds the HTML header to process.
106/// * `id_generator` - An optional function that generates the ID from the header content.
107/// * `class_generator` - An optional function that generates the class from the header content.
108///
109/// # Returns
110///
111/// * `Result<String>` - The formatted HTML header, or an error.
112///
113/// # Examples
114///
115/// ```
116/// use html_generator::utils::format_header_with_id_class;
117///
118/// let header = "<h2>Hello, World!</h2>";
119/// let result = format_header_with_id_class(header, None, None).unwrap();
120/// assert_eq!(result, "<h2 id=\"hello-world\" class=\"hello-world\">Hello, World!</h2>");
121/// ```
122pub fn format_header_with_id_class(
123    header: &str,
124    id_generator: Option<fn(&str) -> String>,
125    class_generator: Option<fn(&str) -> String>,
126) -> Result<String> {
127    let captures = HEADER_REGEX.captures(header).ok_or_else(|| {
128        HtmlError::InvalidHeaderFormat(
129            "Invalid header format".to_string(),
130        )
131    })?;
132
133    let tag = captures
134        .get(1)
135        .ok_or_else(|| {
136            HtmlError::InvalidHeaderFormat(
137                "Missing header tag".to_string(),
138            )
139        })?
140        .as_str();
141
142    let text_content = captures
143        .get(2)
144        .ok_or_else(|| {
145            HtmlError::InvalidHeaderFormat(
146                "Missing header content".to_string(),
147            )
148        })?
149        .as_str();
150
151    let id = id_generator.map_or_else(
152        || generate_id(text_content),
153        |generator| generator(text_content),
154    );
155    let class = class_generator.map_or_else(
156        || generate_id(text_content),
157        |generator| generator(text_content),
158    );
159
160    Ok(format!(
161        r#"<{} id="{}" class="{}">{}</{}>"#,
162        tag, id, class, text_content, tag
163    ))
164}
165
166/// Generates a table of contents from HTML content.
167///
168/// # Arguments
169///
170/// * `html` - A string slice that holds the HTML content to process.
171///
172/// # Returns
173///
174/// * `Result<String>` - The generated table of contents as an HTML string, or an error.
175///
176/// # Examples
177///
178/// ```
179/// use html_generator::utils::generate_table_of_contents;
180///
181/// let html = "<h1>Title</h1><p>Some content</p><h2>Subtitle</h2><p>More content</p>";
182/// let result = generate_table_of_contents(html).unwrap();
183/// assert_eq!(result, r#"<ul><li class="toc-h1"><a href="\#title">Title</a></li><li class="toc-h2"><a href="\#subtitle">Subtitle</a></li></ul>"#);
184/// ```
185pub fn generate_table_of_contents(html: &str) -> Result<String> {
186    if html.is_empty() {
187        return Err(HtmlError::InvalidInput("Empty input".to_string()));
188    }
189    if html.len() > MAX_INPUT_SIZE {
190        return Err(HtmlError::InputTooLarge(html.len()));
191    }
192
193    let mut toc = String::new();
194    toc.push_str("<ul>");
195
196    for captures in HEADER_REGEX.captures_iter(html) {
197        if let Some(tag) = captures.get(1) {
198            let content = captures.get(2).map_or("", |m| m.as_str());
199            let id = generate_id(content);
200            toc.push_str(&format!(
201                r#"<li class="toc-{}"><a href="\#{}">{}</a></li>"#,
202                tag.as_str(),
203                id,
204                content
205            ));
206        }
207    }
208
209    toc.push_str("</ul>");
210    Ok(toc)
211}
212
213/// Check if an ARIA role is valid for a given element.
214///
215/// # Arguments
216///
217/// * `role` - The ARIA role to validate.
218/// * `element` - The HTML element to validate.
219///
220/// # Returns
221///
222/// * `bool` - Whether the role is valid for the element.
223pub fn is_valid_aria_role(role: &str, element: &ElementRef) -> bool {
224    static VALID_ROLES: Lazy<HashMap<&'static str, Vec<&'static str>>> =
225        Lazy::new(|| {
226            let mut roles = HashMap::new();
227            let _ =
228                roles.insert("a", vec!["link", "button", "menuitem"]);
229            let _ = roles.insert("button", vec!["button"]);
230            let _ =
231                roles.insert("div", vec!["alert", "tooltip", "dialog"]);
232            let _ = roles.insert(
233                "input",
234                vec!["textbox", "radio", "checkbox", "searchbox"],
235            );
236            roles
237        });
238
239    if let Some(valid_roles) = VALID_ROLES.get(element.value().name()) {
240        valid_roles.contains(&role)
241    } else {
242        false
243    }
244}
245
/// Validates the shape of a hyphen-separated language code.
///
/// The check is structural only; subtags are not looked up in any
/// registry. A code is accepted when:
///
/// * the primary subtag (before the first `-`) is 2 or 3 lowercase ASCII
///   letters, e.g. `en` or `fil`; and
/// * every following subtag is 1 to 8 ASCII letters or digits, e.g. the
///   `US` in `en-US` or the `Hans` in `zh-Hans-CN`.
///
/// Empty subtags (`"en-"`, `"en--US"`) and subtags containing other
/// characters (`"en-U$"`) are rejected.
///
/// # Arguments
///
/// * `lang` - The language code to validate.
///
/// # Returns
///
/// * `bool` - Whether the language code is valid.
pub fn is_valid_language_code(lang: &str) -> bool {
    let mut subtags = lang.split('-');

    // `split` always yields at least one item, possibly an empty one.
    let primary = subtags.next().unwrap_or("");
    let primary_is_valid = (2..=3).contains(&primary.len())
        && primary.bytes().all(|b| b.is_ascii_lowercase());

    primary_is_valid
        && subtags.all(|subtag| {
            (1..=8).contains(&subtag.len())
                && subtag.bytes().all(|b| b.is_ascii_alphanumeric())
        })
}
262
263/// Generates an ID from the given content.
264///
265/// # Arguments
266///
267/// * `content` - The content to generate the ID from.
268///
269/// # Returns
270///
271/// * `String` - The generated ID.
272fn generate_id(content: &str) -> String {
273    CONSECUTIVE_HYPHENS_REGEX
274        .replace_all(
275            &content
276                .to_lowercase()
277                .replace(|c: char| !c.is_alphanumeric(), "-"),
278            "-",
279        )
280        .trim_matches('-')
281        .to_string()
282}
283
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::Html;

    /// Front matter extraction: happy paths and input validation.
    mod extract_front_matter_tests {
        use super::*;

        #[test]
        fn test_valid_front_matter() {
            let content = "---\ntitle: My Page\n---\n# Hello, world!\n\nThis is a test.";
            let outcome = extract_front_matter(content);
            assert!(
                outcome.is_ok(),
                "Expected Ok, got Err: {:?}",
                outcome
            );
            assert_eq!(
                outcome.unwrap(),
                "# Hello, world!\n\nThis is a test."
            );
        }

        #[test]
        fn test_no_front_matter() {
            let content = "# Hello, world!\n\nThis is a test without front matter.";
            let outcome = extract_front_matter(content);
            assert!(
                outcome.is_ok(),
                "Expected Ok, got Err: {:?}",
                outcome
            );
            assert_eq!(outcome.unwrap(), content);
        }

        #[test]
        fn test_empty_input() {
            assert!(matches!(
                extract_front_matter(""),
                Err(HtmlError::InvalidInput(_))
            ));
        }

        #[test]
        fn test_exceeding_max_input_size() {
            let oversized = "a".repeat(MAX_INPUT_SIZE + 1);
            assert!(matches!(
                extract_front_matter(&oversized),
                Err(HtmlError::InputTooLarge(_))
            ));
        }

        #[test]
        fn test_invalid_front_matter_format() {
            let content =
                "---\ntitle: value\ninvalid_line\n---\nContent";
            assert!(matches!(
                extract_front_matter(content),
                Err(HtmlError::InvalidFrontMatterFormat(_))
            ));
        }

        #[test]
        fn test_valid_front_matter_with_extra_content() {
            let content = "---\ntitle: Page\n---\n\n# Title\n\nContent";
            let outcome = extract_front_matter(content);
            assert!(outcome.is_ok());
            assert_eq!(outcome.unwrap(), "# Title\n\nContent");
        }

        #[test]
        fn test_extract_front_matter_with_mid_document_delimiter() {
            let content = "# Title\nContent\n---\nkey: value\n---";
            let outcome = extract_front_matter(content);
            assert!(outcome.is_ok());
            assert_eq!(outcome.unwrap(), content);
        }
    }

    /// Header formatting with default and custom generators.
    mod format_header_with_id_class_tests {
        use super::*;

        #[test]
        fn test_valid_header_default_generators() {
            let outcome = format_header_with_id_class(
                "<h2>Hello, World!</h2>",
                None,
                None,
            );
            assert!(
                outcome.is_ok(),
                "Expected Ok, got Err: {:?}",
                outcome
            );
            assert_eq!(outcome.unwrap(), "<h2 id=\"hello-world\" class=\"hello-world\">Hello, World!</h2>");
        }

        #[test]
        fn test_custom_id_and_class_generators() {
            fn prefixed_id(text: &str) -> String {
                format!(
                    "custom-{}",
                    text.to_lowercase().replace(' ', "-")
                )
            }
            fn fixed_class(_: &str) -> String {
                "custom-class".to_string()
            }

            let outcome = format_header_with_id_class(
                "<h3>Test Header</h3>",
                Some(prefixed_id),
                Some(fixed_class),
            );
            assert!(
                outcome.is_ok(),
                "Expected Ok, got Err: {:?}",
                outcome
            );
            assert_eq!(outcome.unwrap(), "<h3 id=\"custom-test-header\" class=\"custom-class\">Test Header</h3>");
        }

        #[test]
        fn test_invalid_header_format() {
            let outcome = format_header_with_id_class(
                "<p>Not a header</p>",
                None,
                None,
            );
            assert!(matches!(
                outcome,
                Err(HtmlError::InvalidHeaderFormat(_))
            ));
        }

        #[test]
        fn test_header_with_nested_tags() {
            let outcome = format_header_with_id_class(
                "<h2><span>Nested Header</span></h2>",
                None,
                None,
            );
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                "<h2 id=\"span-nested-header-span\" class=\"span-nested-header-span\"><span>Nested Header</span></h2>"
            );
        }

        #[test]
        fn test_format_header_with_long_content() {
            let header = format!("<h1>{}</h1>", "a".repeat(300));
            assert!(
                format_header_with_id_class(&header, None, None).is_ok()
            );
        }

        #[test]
        fn test_header_with_special_characters() {
            let outcome = format_header_with_id_class(
                "<h3>Special & Header!</h3>",
                None,
                None,
            );
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                "<h3 id=\"special-header\" class=\"special-header\">Special & Header!</h3>"
            );
        }
    }

    /// Table-of-contents generation.
    mod generate_table_of_contents_tests {
        use super::*;

        #[test]
        fn test_valid_html_with_headers() {
            let outcome =
                generate_table_of_contents("<h1>Title</h1><h2>Subtitle</h2>");
            assert!(
                outcome.is_ok(),
                "Expected Ok, got Err: {:?}",
                outcome
            );
            assert_eq!(
                outcome.unwrap(),
                r#"<ul><li class="toc-h1"><a href="\#title">Title</a></li><li class="toc-h2"><a href="\#subtitle">Subtitle</a></li></ul>"#
            );
        }

        #[test]
        fn test_html_without_headers() {
            let outcome =
                generate_table_of_contents("<p>No headers here.</p>");
            assert!(
                outcome.is_ok(),
                "Expected Ok, got Err: {:?}",
                outcome
            );
            assert_eq!(outcome.unwrap(), "<ul></ul>");
        }

        #[test]
        fn test_empty_html() {
            assert!(matches!(
                generate_table_of_contents(""),
                Err(HtmlError::InvalidInput(_))
            ));
        }

        #[test]
        fn test_large_html_content() {
            let html = "<h1>Header</h1>".repeat(1000);
            assert!(generate_table_of_contents(&html).is_ok());
        }

        #[test]
        fn test_generate_table_of_contents_with_malformed_html() {
            let outcome =
                generate_table_of_contents("<h1>Title<h2>Subtitle");
            assert!(outcome.is_ok());
            assert_eq!(outcome.unwrap(), "<ul></ul>");
        }

        #[test]
        fn test_generate_table_of_contents_with_attributes() {
            let html = r#"<h1 class="header-class">Header</h1>"#;
            let outcome = generate_table_of_contents(html);
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                r#"<ul><li class="toc-h1"><a href="\#header">Header</a></li></ul>"#
            );
        }
    }

    /// ARIA role validation and required-property lookups.
    mod aria_validation_tests {
        use super::*;

        #[test]
        fn test_valid_aria_role_for_button() {
            let document =
                Html::parse_fragment("<button role='button'></button>");
            let selector = scraper::Selector::parse("button").unwrap();
            let element = document.select(&selector).next().unwrap();
            assert!(is_valid_aria_role("button", &element));
        }

        #[test]
        fn test_invalid_aria_role_for_button() {
            let document =
                Html::parse_fragment("<button role='link'></button>");
            let selector = scraper::Selector::parse("button").unwrap();
            let element = document.select(&selector).next().unwrap();
            assert!(!is_valid_aria_role("link", &element));
        }

        #[test]
        fn test_missing_required_aria_properties() {
            let document =
                Html::parse_fragment(r#"<div role="slider"></div>"#);
            let selector = scraper::Selector::parse("div").unwrap();
            let element = document.select(&selector).next().unwrap();
            let missing = crate::accessibility::utils::get_missing_required_aria_properties(&element);
            assert_eq!(
                missing.unwrap(),
                vec![
                    "aria-valuenow".to_string(),
                    "aria-valuemin".to_string(),
                    "aria-valuemax".to_string()
                ]
            );
        }

        #[test]
        fn test_get_missing_required_aria_properties_valid_role() {
            let document = Html::parse_fragment(
                r#"<div role="slider" aria-valuenow="10" aria-valuemin="0" aria-valuemax="100"></div>"#,
            );
            let selector = scraper::Selector::parse("div").unwrap();
            let element = document.select(&selector).next().unwrap();
            let missing = crate::accessibility::utils::get_missing_required_aria_properties(&element);
            assert!(missing.is_none());
        }

        #[test]
        fn test_get_missing_required_aria_properties_unknown_role() {
            let document =
                Html::parse_fragment(r#"<div role="unknown"></div>"#);
            let selector = scraper::Selector::parse("div").unwrap();
            let element = document.select(&selector).next().unwrap();
            let missing = crate::accessibility::utils::get_missing_required_aria_properties(&element);
            assert!(missing.is_none());
        }
    }

    /// Slug generation, language codes and assorted edge cases.
    mod utility_function_tests {
        use super::*;

        #[test]
        fn test_generate_id() {
            assert_eq!(generate_id("Test Header!"), "test-header");
        }

        #[test]
        fn test_generate_id_with_special_characters() {
            assert_eq!(
                generate_id("Header--with??special**chars"),
                "header-with-special-chars"
            );
        }

        #[test]
        fn test_generate_id_with_leading_trailing_whitespace() {
            assert_eq!(generate_id("  Test Header  "), "test-header");
        }

        #[test]
        fn test_generate_id_with_numeric_content() {
            assert_eq!(generate_id("12345"), "12345");
        }

        #[test]
        fn test_is_valid_language_code() {
            assert!(is_valid_language_code("en"));
            assert!(is_valid_language_code("en-US"));
            assert!(!is_valid_language_code("E"));
            assert!(!is_valid_language_code("123"));
        }

        #[test]
        fn test_is_valid_language_code_long_code() {
            assert!(is_valid_language_code("en-US-variant-123"));
        }

        #[test]
        fn test_is_valid_language_code_non_ascii() {
            assert!(!is_valid_language_code("日本語"));
        }

        /// `extract_front_matter`: a run of hyphens is not a delimiter pair.
        #[test]
        fn test_extract_front_matter_empty_delimiters() {
            let content = "------\n# Missing proper front matter";
            assert!(matches!(
                extract_front_matter(content),
                Err(HtmlError::InvalidFrontMatterFormat(_))
            ));
        }

        #[test]
        fn test_extract_front_matter_large_content_valid_front_matter()
        {
            let large_content = format!(
                "---\nkey: value\n---\n{}",
                "Content".repeat(5000)
            );
            assert!(extract_front_matter(&large_content).is_ok());
        }

        /// `format_header_with_id_class`: an unclosed header is rejected.
        #[test]
        fn test_format_header_with_malformed_html() {
            let outcome = format_header_with_id_class(
                "<h2 Missing closing>",
                None,
                None,
            );
            assert!(matches!(
                outcome,
                Err(HtmlError::InvalidHeaderFormat(_))
            ));
        }

        #[test]
        fn test_format_header_with_inline_styles() {
            let header =
                r#"<h2 style="color: red;">Styled Header</h2>"#;
            let outcome =
                format_header_with_id_class(header, None, None);
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                "<h2 id=\"styled-header\" class=\"styled-header\">Styled Header</h2>"
            );
        }

        /// `generate_table_of_contents`: headers nested in other elements.
        #[test]
        fn test_toc_with_nested_headers() {
            let outcome = generate_table_of_contents(
                "<div><h1>Outer</h1><h2>Inner</h2></div>",
            );
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                r#"<ul><li class="toc-h1"><a href="\#outer">Outer</a></li><li class="toc-h2"><a href="\#inner">Inner</a></li></ul>"#
            );
        }

        #[test]
        fn test_toc_with_malformed_and_valid_headers() {
            let outcome = generate_table_of_contents(
                "<h1>Valid</h1><h2 Malformed>",
            );
            assert!(outcome.is_ok());
            assert_eq!(
                outcome.unwrap(),
                r#"<ul><li class="toc-h1"><a href="\#valid">Valid</a></li></ul>"#
            );
        }

        /// `is_valid_aria_role`: elements without a role table entry.
        #[test]
        fn test_unsupported_html_element() {
            let document = Html::parse_fragment(
                "<unsupported role='custom'></unsupported>",
            );
            let selector =
                scraper::Selector::parse("unsupported").unwrap();
            let element = document.select(&selector).next().unwrap();
            assert!(!is_valid_aria_role("custom", &element));
        }

        /// `is_valid_language_code`: the primary subtag must be lowercase.
        #[test]
        fn test_is_valid_language_code_with_mixed_case() {
            assert!(!is_valid_language_code("eN-uS"));
            assert!(!is_valid_language_code("En#Us"));
        }

        /// `generate_id`: inputs that leave nothing to slugify.
        #[test]
        fn test_generate_id_empty_content() {
            assert_eq!(generate_id(""), "");
        }

        #[test]
        fn test_generate_id_whitespace_content() {
            assert_eq!(generate_id("   "), "");
        }

        #[test]
        fn test_generate_id_symbols_only() {
            assert_eq!(generate_id("!@#$%^&*()"), "");
        }
    }
}