1use crate::error::{HtmlError, Result};
10use once_cell::sync::Lazy;
11use regex::Regex;
12use scraper::ElementRef;
13use std::collections::HashMap;
14
15static FRONT_MATTER_REGEX: Lazy<Regex> = Lazy::new(|| {
16 Regex::new(r"(?ms)^---\s*\n(.*?)\n---\s*\n")
17 .expect("Failed to compile FRONT_MATTER_REGEX")
18});
19
20static HEADER_REGEX: Lazy<Regex> = Lazy::new(|| {
21 Regex::new(r"<(h[1-6])(?:\s[^>]*)?>(.+?)</h[1-6]>")
22 .expect("Failed to compile HEADER_REGEX")
23});
24
25static CONSECUTIVE_HYPHENS_REGEX: Lazy<Regex> = Lazy::new(|| {
26 Regex::new(r"-{2,}")
27 .expect("Failed to compile CONSECUTIVE_HYPHENS_REGEX")
28});
29
30const MAX_INPUT_SIZE: usize = 1_000_000; pub fn extract_front_matter(content: &str) -> Result<String> {
59 if content.is_empty() {
60 return Err(HtmlError::InvalidInput("Empty input".to_string()));
61 }
62 if content.len() > MAX_INPUT_SIZE {
63 return Err(HtmlError::InputTooLarge(content.len()));
64 }
65
66 if content.starts_with("---") {
67 if let Some(captures) = FRONT_MATTER_REGEX.captures(content) {
68 let front_matter = captures
69 .get(1)
70 .ok_or_else(|| {
71 HtmlError::InvalidFrontMatterFormat(
72 "Missing front matter match".to_string(),
73 )
74 })?
75 .as_str();
76
77 for line in front_matter.lines() {
78 if !line.trim().contains(':') {
79 return Err(HtmlError::InvalidFrontMatterFormat(
80 format!(
81 "Invalid line in front matter: {}",
82 line
83 ),
84 ));
85 }
86 }
87
88 let remaining_content =
89 &content[captures.get(0).unwrap().end()..];
90 Ok(remaining_content.trim().to_string())
91 } else {
92 Err(HtmlError::InvalidFrontMatterFormat(
93 "Invalid front matter format".to_string(),
94 ))
95 }
96 } else {
97 Ok(content.to_string())
98 }
99}
100
101pub fn format_header_with_id_class(
123 header: &str,
124 id_generator: Option<fn(&str) -> String>,
125 class_generator: Option<fn(&str) -> String>,
126) -> Result<String> {
127 let captures = HEADER_REGEX.captures(header).ok_or_else(|| {
128 HtmlError::InvalidHeaderFormat(
129 "Invalid header format".to_string(),
130 )
131 })?;
132
133 let tag = captures
134 .get(1)
135 .ok_or_else(|| {
136 HtmlError::InvalidHeaderFormat(
137 "Missing header tag".to_string(),
138 )
139 })?
140 .as_str();
141
142 let text_content = captures
143 .get(2)
144 .ok_or_else(|| {
145 HtmlError::InvalidHeaderFormat(
146 "Missing header content".to_string(),
147 )
148 })?
149 .as_str();
150
151 let id = id_generator.map_or_else(
152 || generate_id(text_content),
153 |generator| generator(text_content),
154 );
155 let class = class_generator.map_or_else(
156 || generate_id(text_content),
157 |generator| generator(text_content),
158 );
159
160 Ok(format!(
161 r#"<{} id="{}" class="{}">{}</{}>"#,
162 tag, id, class, text_content, tag
163 ))
164}
165
166pub fn generate_table_of_contents(html: &str) -> Result<String> {
186 if html.is_empty() {
187 return Err(HtmlError::InvalidInput("Empty input".to_string()));
188 }
189 if html.len() > MAX_INPUT_SIZE {
190 return Err(HtmlError::InputTooLarge(html.len()));
191 }
192
193 let mut toc = String::new();
194 toc.push_str("<ul>");
195
196 for captures in HEADER_REGEX.captures_iter(html) {
197 if let Some(tag) = captures.get(1) {
198 let content = captures.get(2).map_or("", |m| m.as_str());
199 let id = generate_id(content);
200 toc.push_str(&format!(
201 r#"<li class="toc-{}"><a href="\#{}">{}</a></li>"#,
202 tag.as_str(),
203 id,
204 content
205 ));
206 }
207 }
208
209 toc.push_str("</ul>");
210 Ok(toc)
211}
212
213pub fn is_valid_aria_role(role: &str, element: &ElementRef) -> bool {
224 static VALID_ROLES: Lazy<HashMap<&'static str, Vec<&'static str>>> =
225 Lazy::new(|| {
226 let mut roles = HashMap::new();
227 let _ =
228 roles.insert("a", vec!["link", "button", "menuitem"]);
229 let _ = roles.insert("button", vec!["button"]);
230 let _ =
231 roles.insert("div", vec!["alert", "tooltip", "dialog"]);
232 let _ = roles.insert(
233 "input",
234 vec!["textbox", "radio", "checkbox", "searchbox"],
235 );
236 roles
237 });
238
239 if let Some(valid_roles) = VALID_ROLES.get(element.value().name()) {
240 valid_roles.contains(&role)
241 } else {
242 false
243 }
244}
245
/// Performs a lightweight shape check of a language tag such as `en` or
/// `en-US`.
///
/// The tag is split on `-`. The first (primary) subtag must be two or three
/// lowercase ASCII letters. Every following subtag must be one to eight
/// ASCII letters or digits, so empty or punctuated subtags such as `en-`,
/// `en--US` or `en-U$` are rejected.
///
/// This validates the shape only; it does not check subtags against any
/// registry.
pub fn is_valid_language_code(lang: &str) -> bool {
    let mut subtags = lang.split('-');

    // `split` always yields at least one item, possibly an empty string.
    let primary_ok = subtags.next().map_or(false, |primary| {
        (2..=3).contains(&primary.len())
            && primary.bytes().all(|b| b.is_ascii_lowercase())
    });

    primary_ok
        && subtags.all(|subtag| {
            (1..=8).contains(&subtag.len())
                && subtag.bytes().all(|b| b.is_ascii_alphanumeric())
        })
}
262
263fn generate_id(content: &str) -> String {
273 CONSECUTIVE_HYPHENS_REGEX
274 .replace_all(
275 &content
276 .to_lowercase()
277 .replace(|c: char| !c.is_alphanumeric(), "-"),
278 "-",
279 )
280 .trim_matches('-')
281 .to_string()
282}
283
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::Html;

    /// Tests for `extract_front_matter`.
    mod extract_front_matter_tests {
        use super::*;

        #[test]
        fn test_valid_front_matter() {
            let content = "---\ntitle: My Page\n---\n# Hello, world!\n\nThis is a test.";
            let extracted = extract_front_matter(content)
                .expect("valid front matter should be stripped");
            assert_eq!(extracted, "# Hello, world!\n\nThis is a test.");
        }

        #[test]
        fn test_no_front_matter() {
            let content =
                "# Hello, world!\n\nThis is a test without front matter.";
            let extracted = extract_front_matter(content)
                .expect("content without front matter should pass through");
            assert_eq!(extracted, content);
        }

        #[test]
        fn test_empty_input() {
            assert!(matches!(
                extract_front_matter(""),
                Err(HtmlError::InvalidInput(_))
            ));
        }

        #[test]
        fn test_exceeding_max_input_size() {
            let oversized = "a".repeat(MAX_INPUT_SIZE + 1);
            assert!(matches!(
                extract_front_matter(&oversized),
                Err(HtmlError::InputTooLarge(_))
            ));
        }

        #[test]
        fn test_invalid_front_matter_format() {
            let content = "---\ntitle: value\ninvalid_line\n---\nContent";
            assert!(matches!(
                extract_front_matter(content),
                Err(HtmlError::InvalidFrontMatterFormat(_))
            ));
        }

        #[test]
        fn test_valid_front_matter_with_extra_content() {
            let content = "---\ntitle: Page\n---\n\n# Title\n\nContent";
            let extracted = extract_front_matter(content);
            assert_eq!(extracted.unwrap(), "# Title\n\nContent");
        }

        #[test]
        fn test_extract_front_matter_with_mid_document_delimiter() {
            let content = "# Title\nContent\n---\nkey: value\n---";
            let extracted = extract_front_matter(content);
            assert_eq!(extracted.unwrap(), content);
        }
    }

    /// Tests for `format_header_with_id_class`.
    mod format_header_with_id_class_tests {
        use super::*;

        #[test]
        fn test_valid_header_default_generators() {
            let formatted = format_header_with_id_class(
                "<h2>Hello, World!</h2>",
                None,
                None,
            )
            .expect("a plain heading should be accepted");
            assert_eq!(
                formatted,
                r#"<h2 id="hello-world" class="hello-world">Hello, World!</h2>"#
            );
        }

        #[test]
        fn test_custom_id_and_class_generators() {
            fn id_gen(content: &str) -> String {
                format!("custom-{}", content.to_lowercase().replace(' ', "-"))
            }
            fn class_gen(_: &str) -> String {
                "custom-class".to_string()
            }

            let formatted = format_header_with_id_class(
                "<h3>Test Header</h3>",
                Some(id_gen),
                Some(class_gen),
            )
            .expect("custom generators should be applied");
            assert_eq!(
                formatted,
                r#"<h3 id="custom-test-header" class="custom-class">Test Header</h3>"#
            );
        }

        #[test]
        fn test_invalid_header_format() {
            let outcome =
                format_header_with_id_class("<p>Not a header</p>", None, None);
            assert!(matches!(
                outcome,
                Err(HtmlError::InvalidHeaderFormat(_))
            ));
        }

        #[test]
        fn test_header_with_nested_tags() {
            let outcome = format_header_with_id_class(
                "<h2><span>Nested Header</span></h2>",
                None,
                None,
            );
            // Nested markup is part of the captured content, so it also
            // feeds the generated id and class.
            assert_eq!(
                outcome.unwrap(),
                r#"<h2 id="span-nested-header-span" class="span-nested-header-span"><span>Nested Header</span></h2>"#
            );
        }

        #[test]
        fn test_format_header_with_long_content() {
            let header = format!("<h1>{}</h1>", "a".repeat(300));
            assert!(format_header_with_id_class(&header, None, None).is_ok());
        }

        #[test]
        fn test_header_with_special_characters() {
            let outcome = format_header_with_id_class(
                "<h3>Special & Header!</h3>",
                None,
                None,
            );
            assert_eq!(
                outcome.unwrap(),
                r#"<h3 id="special-header" class="special-header">Special & Header!</h3>"#
            );
        }
    }

    /// Tests for `generate_table_of_contents`.
    mod generate_table_of_contents_tests {
        use super::*;

        #[test]
        fn test_valid_html_with_headers() {
            let toc =
                generate_table_of_contents("<h1>Title</h1><h2>Subtitle</h2>")
                    .expect("headings should produce a table of contents");
            assert_eq!(
                toc,
                r#"<ul><li class="toc-h1"><a href="\#title">Title</a></li><li class="toc-h2"><a href="\#subtitle">Subtitle</a></li></ul>"#
            );
        }

        #[test]
        fn test_html_without_headers() {
            let toc = generate_table_of_contents("<p>No headers here.</p>")
                .expect("input without headings should still succeed");
            assert_eq!(toc, "<ul></ul>");
        }

        #[test]
        fn test_empty_html() {
            assert!(matches!(
                generate_table_of_contents(""),
                Err(HtmlError::InvalidInput(_))
            ));
        }

        #[test]
        fn test_large_html_content() {
            let html = "<h1>Header</h1>".repeat(1000);
            assert!(generate_table_of_contents(&html).is_ok());
        }

        #[test]
        fn test_generate_table_of_contents_with_malformed_html() {
            let toc = generate_table_of_contents("<h1>Title<h2>Subtitle");
            assert_eq!(toc.unwrap(), "<ul></ul>");
        }

        #[test]
        fn test_generate_table_of_contents_with_attributes() {
            let toc = generate_table_of_contents(
                r#"<h1 class="header-class">Header</h1>"#,
            );
            assert_eq!(
                toc.unwrap(),
                r#"<ul><li class="toc-h1"><a href="\#header">Header</a></li></ul>"#
            );
        }
    }

    /// Tests for ARIA role and property validation.
    mod aria_validation_tests {
        use super::*;

        #[test]
        fn test_valid_aria_role_for_button() {
            let document =
                Html::parse_fragment("<button role='button'></button>");
            let selector = scraper::Selector::parse("button").unwrap();
            let element = document.select(&selector).next().unwrap();
            assert!(is_valid_aria_role("button", &element));
        }

        #[test]
        fn test_invalid_aria_role_for_button() {
            let document =
                Html::parse_fragment("<button role='link'></button>");
            let selector = scraper::Selector::parse("button").unwrap();
            let element = document.select(&selector).next().unwrap();
            assert!(!is_valid_aria_role("link", &element));
        }

        #[test]
        fn test_missing_required_aria_properties() {
            let document =
                Html::parse_fragment(r#"<div role="slider"></div>"#);
            let selector = scraper::Selector::parse("div").unwrap();
            let element = document.select(&selector).next().unwrap();

            let missing =
                crate::accessibility::utils::get_missing_required_aria_properties(&element);
            let expected: Vec<String> =
                ["aria-valuenow", "aria-valuemin", "aria-valuemax"]
                    .iter()
                    .map(|name| name.to_string())
                    .collect();
            assert_eq!(missing.unwrap(), expected);
        }

        #[test]
        fn test_get_missing_required_aria_properties_valid_role() {
            let document = Html::parse_fragment(
                r#"<div role="slider" aria-valuenow="10" aria-valuemin="0" aria-valuemax="100"></div>"#,
            );
            let selector = scraper::Selector::parse("div").unwrap();
            let element = document.select(&selector).next().unwrap();

            let missing =
                crate::accessibility::utils::get_missing_required_aria_properties(&element);
            assert!(missing.is_none());
        }

        #[test]
        fn test_get_missing_required_aria_properties_unknown_role() {
            let document =
                Html::parse_fragment(r#"<div role="unknown"></div>"#);
            let selector = scraper::Selector::parse("div").unwrap();
            let element = document.select(&selector).next().unwrap();

            let missing =
                crate::accessibility::utils::get_missing_required_aria_properties(&element);
            assert!(missing.is_none());
        }
    }

    /// Tests for the small helpers plus assorted edge cases.
    mod utility_function_tests {
        use super::*;

        #[test]
        fn test_generate_id() {
            assert_eq!(generate_id("Test Header!"), "test-header");
        }

        #[test]
        fn test_generate_id_with_special_characters() {
            assert_eq!(
                generate_id("Header--with??special**chars"),
                "header-with-special-chars"
            );
        }

        #[test]
        fn test_generate_id_with_leading_trailing_whitespace() {
            assert_eq!(generate_id(" Test Header "), "test-header");
        }

        #[test]
        fn test_generate_id_with_numeric_content() {
            assert_eq!(generate_id("12345"), "12345");
        }

        #[test]
        fn test_is_valid_language_code() {
            assert!(is_valid_language_code("en"));
            assert!(is_valid_language_code("en-US"));
            assert!(!is_valid_language_code("E"));
            assert!(!is_valid_language_code("123"));
        }

        #[test]
        fn test_is_valid_language_code_long_code() {
            assert!(is_valid_language_code("en-US-variant-123"));
        }

        #[test]
        fn test_is_valid_language_code_non_ascii() {
            assert!(!is_valid_language_code("日本語"));
        }

        #[test]
        fn test_extract_front_matter_empty_delimiters() {
            let outcome =
                extract_front_matter("------\n# Missing proper front matter");
            assert!(matches!(
                outcome,
                Err(HtmlError::InvalidFrontMatterFormat(_))
            ));
        }

        #[test]
        fn test_extract_front_matter_large_content_valid_front_matter() {
            let large_content =
                format!("---\nkey: value\n---\n{}", "Content".repeat(5000));
            assert!(extract_front_matter(&large_content).is_ok());
        }

        #[test]
        fn test_format_header_with_malformed_html() {
            let outcome =
                format_header_with_id_class("<h2 Missing closing>", None, None);
            assert!(matches!(
                outcome,
                Err(HtmlError::InvalidHeaderFormat(_))
            ));
        }

        #[test]
        fn test_format_header_with_inline_styles() {
            let outcome = format_header_with_id_class(
                r#"<h2 style="color: red;">Styled Header</h2>"#,
                None,
                None,
            );
            // The original `style` attribute is not carried over.
            assert_eq!(
                outcome.unwrap(),
                r#"<h2 id="styled-header" class="styled-header">Styled Header</h2>"#
            );
        }

        #[test]
        fn test_toc_with_nested_headers() {
            let toc = generate_table_of_contents(
                "<div><h1>Outer</h1><h2>Inner</h2></div>",
            );
            assert_eq!(
                toc.unwrap(),
                r#"<ul><li class="toc-h1"><a href="\#outer">Outer</a></li><li class="toc-h2"><a href="\#inner">Inner</a></li></ul>"#
            );
        }

        #[test]
        fn test_toc_with_malformed_and_valid_headers() {
            let toc =
                generate_table_of_contents("<h1>Valid</h1><h2 Malformed>");
            assert_eq!(
                toc.unwrap(),
                r#"<ul><li class="toc-h1"><a href="\#valid">Valid</a></li></ul>"#
            );
        }

        #[test]
        fn test_unsupported_html_element() {
            let document = Html::parse_fragment(
                "<unsupported role='custom'></unsupported>",
            );
            let selector = scraper::Selector::parse("unsupported").unwrap();
            let element = document.select(&selector).next().unwrap();
            assert!(!is_valid_aria_role("custom", &element));
        }

        #[test]
        fn test_is_valid_language_code_with_mixed_case() {
            assert!(!is_valid_language_code("eN-uS"));
            assert!(!is_valid_language_code("En#Us"));
        }

        #[test]
        fn test_generate_id_empty_content() {
            assert_eq!(generate_id(""), "");
        }

        #[test]
        fn test_generate_id_whitespace_content() {
            assert_eq!(generate_id(" "), "");
        }

        #[test]
        fn test_generate_id_symbols_only() {
            assert_eq!(generate_id("!@#$%^&*()"), "");
        }
    }
}