1use kuchiki::{parse_html, parse_html_with_options, Attributes, NodeRef, ParseOpts};
379use kuchiki::{traits::*, ExpandedName};
380use std::cell::RefCell;
381use std::collections::{BTreeMap, BTreeSet};
382use std::ops::Deref;
383use std::rc::Rc;
384
385pub struct Pattern(NodeRef);
418
419impl Pattern {
420 pub fn new(pattern_str: &str) -> Result<Pattern, String> {
422 let doc = filter_whitespace(parse_html_strict(pattern_str)?).unwrap();
423 Ok(Pattern(doc))
424 }
425
426 pub fn matches(&self, html: &str) -> Vec<BTreeMap<String, String>> {
429 let doc = filter_whitespace(parse_html().one(html)).unwrap();
430 match_subtree(&doc, &self.0, false)
431 }
432}
433
434fn parse_html_strict(s: &str) -> Result<NodeRef, String> {
435 let errs = Rc::new(RefCell::new(vec![]));
436 let ret = {
437 let mut opts = ParseOpts::default();
438 let errs = Rc::clone(&errs);
439 opts.on_parse_error = Some(Box::new(move |err| {
440 if err != "Unexpected token" {
442 errs.borrow_mut().push(err.to_string())
443 }
444 }));
445 parse_html_with_options(opts).one(s)
446 };
447 let errs = errs.deref().borrow().clone();
448 let mut m = BTreeSet::new();
449 let mut errs_uniq = vec![];
450 for err in errs {
451 if !m.contains(&err) {
452 m.insert(err.clone());
453 errs_uniq.push(err);
454 }
455 }
456
457 if errs_uniq.is_empty() {
458 Ok(ret)
459 } else {
460 Err(errs_uniq.join(", "))
461 }
462}
463
464fn match_subtree(doc: &NodeRef, pattern: &NodeRef, exact: bool) -> Vec<BTreeMap<String, String>> {
465 let mut ret = vec![];
466
467 if let (Some(_), Some(_)) = (doc.as_doctype(), pattern.as_doctype()) {
468 let doc_cs = doc.children().collect::<Vec<_>>();
469 let pat_cs = pattern.children().collect::<Vec<_>>();
470 ret.append(&mut match_siblings(&doc_cs, &pat_cs, false));
471 }
472
473 if let (Some(_), Some(_)) = (doc.as_document(), pattern.as_document()) {
474 let doc_cs = doc.children().collect::<Vec<_>>();
475 let pat_cs = pattern.children().collect::<Vec<_>>();
476 ret.append(&mut match_siblings(&doc_cs, &pat_cs, false));
477 }
478
479 if let (Some(e1), Some(e2)) = (doc.as_element(), pattern.as_element()) {
480 if e1.name == e2.name {
481 if let Some(m1) = match_attributes(
482 e1.attributes.borrow().deref(),
483 e2.attributes.borrow().deref(),
484 ) {
485 let subseq = e2
486 .attributes
487 .borrow()
488 .deref()
489 .map
490 .keys()
491 .any(|k| k.local.as_ref() == "subseq");
492
493 let doc_cs = doc.children().collect::<Vec<_>>();
494 let pat_cs = pattern.children().collect::<Vec<_>>();
495
496 let pat_cs = if e2.name.local.as_ref() == "table"
498 && pat_cs.len() == 1
499 && pat_cs[0].as_element().map(|r| r.name.local.as_ref()) == Some("tbody")
500 {
501 pat_cs[0].children().collect::<Vec<_>>()
502 } else {
503 pat_cs
504 };
505
506 let m2 = match_siblings(&doc_cs, &pat_cs, subseq);
507
508 ret.append(&mut map_product(vec![m1], m2));
509 }
510 }
511 }
512
513 if let Some(pat_text) = pattern.as_text() {
514 if let Some(var) = is_var(pat_text.borrow().as_ref()) {
515 assert!(!var.whole);
516
517 if let Some(doc_text) = doc.as_text() {
518 return vec![singleton(var.name, doc_text.borrow().trim().to_owned())];
519 }
520
521 return vec![];
522 }
523
524 if let Some(doc_text) = doc.as_text() {
525 if let Some(m) = match_text(doc_text.borrow().trim(), pat_text.borrow().trim()) {
526 return vec![m];
527 }
528 }
529
530 return vec![];
532 }
533
534 if !exact {
535 for doc_child in doc.children() {
536 ret.append(&mut match_subtree(&doc_child, pattern, false));
537 }
538 }
539
540 ret
541}
542
543fn match_siblings(
544 doc: &[NodeRef],
545 pattern: &[NodeRef],
546 subseq: bool,
547) -> Vec<BTreeMap<String, String>> {
548 if pattern.is_empty() {
549 return vec![BTreeMap::new()];
550 }
551
552 if doc.is_empty() {
553 return vec![];
554 }
555
556 if pattern.len() == 1 {
558 if let Some(pat_text) = pattern[0].as_text() {
559 if let Some(var) = is_var(pat_text.borrow().as_ref()) {
560 if var.whole {
561 let texts = doc.iter().map(|r| r.to_string()).collect::<Vec<_>>();
562 return vec![singleton(var.name, texts.concat())];
563 }
564 }
565 }
566 }
567
568 let mut ret = vec![];
569
570 if subseq {
572 ret.append(&mut match_siblings_direct(&doc[..], pattern, subseq));
573 } else {
574 for i in 0..doc.len() {
575 ret.append(&mut match_siblings_direct(&doc[i..], pattern, subseq));
576 }
577 }
578
579 for d in doc.iter() {
581 ret.append(&mut match_descendants(d, pattern, subseq));
582 }
583
584 ret
585}
586
587fn match_siblings_direct(
591 doc: &[NodeRef],
592 pattern: &[NodeRef],
593 subseq: bool,
594) -> Vec<BTreeMap<String, String>> {
595 let non_skip_len = pattern
596 .iter()
597 .filter(|r| {
598 if let Some(text) = r.as_text() {
599 !is_skip(text.borrow().as_ref())
600 } else {
601 true
602 }
603 })
604 .count();
605
606 if non_skip_len == 0 {
607 return vec![BTreeMap::new()];
608 }
609
610 if non_skip_len > doc.len() {
611 return vec![];
612 }
613
614 if let Some(text) = pattern[0].as_text() {
615 if is_skip(text.borrow().as_ref()) {
616 let mut ret = vec![];
617 for i in 0..doc.len() {
618 ret.append(&mut match_siblings_direct(&doc[i..], &pattern[1..], subseq));
619 }
620 return ret;
621 }
622 }
623
624 let a = match_subtree(&doc[0], &pattern[0], true);
625
626 let mut ret = if !a.is_empty() {
627 map_product(a, match_siblings_direct(&doc[1..], &pattern[1..], subseq))
628 } else {
629 vec![]
630 };
631
632 if subseq {
633 ret.append(&mut match_siblings_direct(&doc[1..], pattern, subseq));
634 }
635
636 ret
637}
638
639fn match_descendants(
640 doc: &NodeRef,
641 pattern: &[NodeRef],
642 subseq: bool,
643) -> Vec<BTreeMap<String, String>> {
644 if pattern.is_empty() {
645 return vec![BTreeMap::new()];
646 }
647
648 let mut ret = vec![];
649 let cs = doc.children().collect::<Vec<_>>();
650 ret.append(&mut match_siblings(&cs, pattern, subseq));
651 ret
652}
653
654fn match_text(doc: &str, pat: &str) -> Option<BTreeMap<String, String>> {
655 if pat.find("{{").is_some() && pat.find("}}").is_some() {
656 let mut re_str = String::new();
658
659 re_str += "^";
660
661 let mut vars = vec![];
662
663 let mut cur = pat;
664
665 while let Some(ix) = cur.find("{{") {
666 re_str += &cur[0..ix];
667 cur = &cur[ix + 2..];
668 let close = cur.find("}}");
669 assert!(close.is_some(), "Invalid text pattern: \"{}\"", pat);
670 let close = close.unwrap();
671 vars.push(&cur[0..close]);
672 re_str += "(.*)";
673 cur = &cur[close + 2..];
674 }
675
676 re_str += cur;
677 re_str += "$";
678
679 let re = regex::Regex::new(&re_str).unwrap();
680
681 if let Some(caps) = re.captures(doc) {
682 let mut ret = BTreeMap::new();
683 for i in 0..vars.len() {
684 ret.insert(vars[i].to_owned(), caps[i + 1].to_string());
685 }
686 Some(ret)
687 } else {
688 None
689 }
690 } else {
691 if doc == pat {
692 Some(BTreeMap::new())
693 } else {
694 None
695 }
696 }
697}
698
/// Builds the cartesian product of two lists of binding maps.
///
/// For every map in `a` (outer order) and every map in `b` (inner order) the
/// result holds their union; on a key clash the value from `b` is kept.
fn map_product(
    a: Vec<BTreeMap<String, String>>,
    b: Vec<BTreeMap<String, String>>,
) -> Vec<BTreeMap<String, String>> {
    a.into_iter()
        .flat_map(|left| {
            b.iter().map(move |right| {
                let mut merged = left.clone();
                merged.extend(right.clone());
                merged
            })
        })
        .collect()
}
713
/// A placeholder parsed from pattern text of the form `{{name}}` or `{{name:*}}`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Variable {
    /// Text between `{{` and the first `:` (or the closing `}}`).
    name: String,
    /// `true` when the placeholder carries the `*` qualifier.
    whole: bool,
}

/// Recognizes text that consists of exactly one placeholder.
///
/// After trimming, `s` must start with `{{` and end with `}}`, and the text in
/// between must not contain another `{{` or `}}`. Without that check a string
/// such as `{{a}}, {{b}}` would be taken for a single variable named
/// `a}}, {{b` instead of being left to the multi-placeholder text matcher.
///
/// Returns `None` for anything else, including a qualifier other than `*`.
fn is_var(s: &str) -> Option<Variable> {
    let s = s.trim();
    if !(s.starts_with("{{") && s.ends_with("}}")) {
        return None;
    }

    // Both delimiters are present and cannot overlap, so `s` has at least four bytes.
    let inner = &s[2..s.len() - 2];
    if inner.contains("{{") || inner.contains("}}") {
        return None;
    }

    let mut parts = inner.split(':');
    let name = parts.next()?;
    let whole = match parts.next() {
        None => false,
        Some("*") => true,
        Some(_) => return None,
    };

    Some(Variable {
        name: name.to_owned(),
        whole,
    })
}
745
/// Tells whether `s`, ignoring surrounding whitespace, is the skip marker `...`.
fn is_skip(s: &str) -> bool {
    matches!(s.trim(), "...")
}
749
750fn is_special_attr(n: &ExpandedName) -> bool {
751 let s = n.local.as_ref();
752 s == "subseq"
753}
754
/// Returns a map holding only the entry `key -> val`.
fn singleton(key: String, val: String) -> BTreeMap<String, String> {
    std::iter::once((key, val)).collect()
}
760
761fn match_attributes(a1: &Attributes, a2: &Attributes) -> Option<BTreeMap<String, String>> {
762 let a1 = &a1.map;
763 let a2 = &a2.map;
764
765 let mut ret = BTreeMap::new();
766
767 for (k2, v2) in a2.iter() {
768 if is_special_attr(k2) {
769 continue;
770 }
771
772 if let Some(v1) = a1.get(k2) {
773 if let Some(var) = is_var(&v2.value) {
774 assert!(!var.whole);
776 ret.insert(var.name, v1.value.trim().to_owned());
777 } else if {
778 let x = v2.value.find("{{");
779 let y = v2.value.find("}}");
780 x.is_some() && y.is_some() && x < y
781 } {
782 let t = match_text(&v1.value, &v2.value);
784 if t.is_none() {
785 return None;
786 }
787 ret.append(&mut t.unwrap())
788 } else if !is_subset(&v1.value, &v2.value) {
789 return None;
791 }
792 } else {
793 return None;
794 }
795 }
796
797 Some(ret)
798}
799
/// Tells whether every whitespace-separated word of `s2` also occurs among
/// the whitespace-separated words of `s1`.
fn is_subset(s1: &str, s2: &str) -> bool {
    let available: Vec<&str> = s1.split_whitespace().collect();
    s2.split_whitespace().all(|word| available.contains(&word))
}
809
810fn filter_whitespace(node: NodeRef) -> Option<NodeRef> {
811 if let Some(dt) = node.as_doctype() {
812 assert!(node.first_child().is_none());
813
814 Some(NodeRef::new_doctype(&dt.name, &dt.public_id, &dt.system_id))
815 } else if let Some(_) = node.as_document() {
816 let ret = NodeRef::new_document();
817 for child in node.children() {
818 if let Some(child) = filter_whitespace(child) {
819 ret.append(child);
820 }
821 }
822 Some(ret)
823 } else if let Some(element) = node.as_element() {
824 let ret = NodeRef::new_element(
825 element.name.clone(),
826 element.attributes.borrow().map.clone(),
827 );
828
829 for child in node.children() {
830 if let Some(child) = filter_whitespace(child) {
831 ret.append(child);
832 }
833 }
834
835 Some(ret)
836 } else if let Some(text) = node.as_text() {
837 assert!(node.first_child().is_none());
838
839 let text = text.borrow();
840 let text = text.trim();
841
842 if text == "" {
843 None
844 } else {
845 Some(NodeRef::new_text(text.to_owned()))
846 }
847 } else if let Some(_) = node.as_comment() {
848 assert!(node.first_child().is_none());
849 None
850 } else {
851 unreachable!()
852 }
853}
854
#[test]
fn test_basic() {
    let html = r#"
<!DOCTYPE html>
<html lang="en">
 <head>
 </head>
 <body>
 <ul>
 <li>1</li>
 <li>2</li>
 <li>3</li>
 </ul>
 </body>
</html>
"#;

    // A one-item pattern matches each <li> on its own.
    let one_item = Pattern::new(
        r#"
<ul>
 <li>{{hoge}}</li>
</ul>
"#,
    )
    .unwrap();

    let found = one_item.matches(html);
    let hoge: Vec<&str> = found.iter().map(|m| m["hoge"].as_str()).collect();
    assert_eq!(hoge, ["1", "2", "3"]);

    // A two-item pattern matches each pair of adjacent <li> elements.
    let two_items = Pattern::new(
        r#"
<ul>
 <li>{{hoge}}</li>
 <li>{{moge}}</li>
</ul>
"#,
    )
    .unwrap();

    let found = two_items.matches(html);
    let pairs: Vec<(&str, &str)> = found
        .iter()
        .map(|m| (m["hoge"].as_str(), m["moge"].as_str()))
        .collect();
    assert_eq!(pairs, [("1", "2"), ("2", "3")]);
}
904
#[test]
fn test_attribute() {
    let html = r#"
<!DOCTYPE html>
<html lang="en">
 <head>
 </head>
 <body>
 <div class="foo bar baz">
 hello
 </div>
 </body>
</html>
"#;

    // Patterns whose attributes are absent, empty, or a subset of the
    // document's class list all match the single <div>.
    let accepted = [
        r#"<div>{{foo}}</div>"#,
        r#"<div class="">{{foo}}</div>"#,
        r#"<div class="foo">{{foo}}</div>"#,
        r#"<div class="foo bar">{{foo}}</div>"#,
        r#"<div class="foo bar baz">{{foo}}</div>"#,
        r#"<div class="baz foo bar">{{foo}}</div>"#,
    ];
    for src in accepted.iter() {
        let found = Pattern::new(src).unwrap().matches(html);
        let foo: Vec<&str> = found.iter().map(|m| m["foo"].as_str()).collect();
        assert_eq!(foo, ["hello"], "pattern: {}", src);
    }

    // An unknown class, or an attribute the document lacks, prevents a match.
    let rejected = [
        r#"<div class="hoge">{{foo}}</div>"#,
        r#"<div id="">{{foo}}</div>"#,
    ];
    for src in rejected.iter() {
        let found = Pattern::new(src).unwrap().matches(html);
        assert_eq!(found.len(), 0, "pattern: {}", src);
    }
}
958
#[test]
fn test_attribute_pattern() {
    let html = r#"
<!DOCTYPE html>
<html lang="en">
 <head>
 </head>
 <body>
 <a href="https://www.google.com">Google</a>
 <p>
 <a href="https://github.com">GitHub</a>
 </p>
 </body>
</html>
"#;

    // Variables may appear both in an attribute value and in the text.
    let anchor = Pattern::new(r#"<a href="{{url}}">{{link}}</a>"#).unwrap();
    let found = anchor.matches(html);
    let links: Vec<(&str, &str)> = found
        .iter()
        .map(|m| (m["url"].as_str(), m["link"].as_str()))
        .collect();
    assert_eq!(
        links,
        [
            ("https://www.google.com", "Google"),
            ("https://github.com", "GitHub"),
        ]
    );
}
983
#[test]
fn test_skip() {
    let html = r#"
<!DOCTYPE html>
<html lang="en">
 <head>
 </head>
 <body>
 <ul>
 <li>1</li>
 <li>2</li>
 <li>3</li>
 </ul>
 </body>
</html>
"#;

    // `...` between two items allows any number of items in between.
    let with_gap = Pattern::new(
        r#"
<ul>
 <li>{{hoge}}</li>
 ...
 <li>{{moge}}</li>
</ul>
"#,
    )
    .unwrap();

    let found = with_gap.matches(html);
    let pairs: Vec<(&str, &str)> = found
        .iter()
        .map(|m| (m["hoge"].as_str(), m["moge"].as_str()))
        .collect();
    assert_eq!(pairs, [("1", "2"), ("1", "3"), ("2", "3")]);
}
1021
#[test]
fn test_all_match() {
    let html = r#"
<!DOCTYPE html>
<html lang="en">
 <head>
 </head>
 <body>
 Hello
 <span>hoge</span>
 World
 </body>
</html>
"#;

    // `{{name:*}}` captures the serialized form of all children at once.
    let whole_body = Pattern::new(r#"<body>{{body:*}}</body>"#).unwrap();

    let found = whole_body.matches(html);
    let bodies: Vec<&str> = found.iter().map(|m| m["body"].as_str()).collect();
    assert_eq!(bodies, ["Hello<span>hoge</span>World"]);
}
1043
#[test]
fn test_partial() {
    let html = r#"
<!DOCTYPE html>
<html lang="en">
 <head>
 </head>
 <body>
 <ul>
 <li>Test 1, 2</li>
 <li>Test 3, 4</li>
 <li>Test 5, 6</li>
 </ul>
 </body>
</html>
"#;

    // Several variables may be embedded in surrounding literal text.
    let embedded = Pattern::new(r#"<ul>Test {{foo}}, {{bar}}</ul>"#).unwrap();

    let found = embedded.matches(html);
    let pairs: Vec<(&str, &str)> = found
        .iter()
        .map(|m| (m["foo"].as_str(), m["bar"].as_str()))
        .collect();
    assert_eq!(pairs, [("1", "2"), ("3", "4"), ("5", "6")]);
}
1072
#[test]
fn test_attr_partial() {
    let html = r#"
<!DOCTYPE html>
<html lang="en">
 <head>
 </head>
 <body>
 <a href="/users/foo/info"></a>
 <a href="/users/bar/info"></a>
 <a href="/users/baz/info"></a>
 </body>
</html>
"#;

    // A variable may cover just a part of an attribute value.
    let user_link = Pattern::new(r#"<a href="/users/{{user}}/info"></a>"#).unwrap();

    let found = user_link.matches(html);
    let users: Vec<&str> = found.iter().map(|m| m["user"].as_str()).collect();
    assert_eq!(users, ["foo", "bar", "baz"]);
}
1096
#[test]
fn test_table_skip() {
    let html = r#"
<!DOCTYPE html>
<html lang="en">
 <head>
 </head>
 <body>
 <table>
 <tr><th>AAA</th><td>aaa</td></tr>
 <tr><th>BBB</th><td>bbb</td></tr>
 <tr><th>CCC</th><td>ccc</td></tr>
 <tr><th>DDD</th><td>ddd</td></tr>
 <tr><th>EEE</th><td>eee</td></tr>
 </table>
 </body>
</html>
"#;

    // With `subseq`, the pattern rows only need to appear in order; rows in
    // between (CCC) and after (EEE) are ignored.
    let selected_rows = Pattern::new(
        r#"
<table subseq>
 <tr><th>AAA</th><td>{{a}}</td></tr>
 <tr><th>BBB</th><td>{{b}}</td></tr>
 <tr><th>DDD</th><td>{{d}}</td></tr>
</table>
"#,
    )
    .unwrap();

    let found = selected_rows.matches(html);
    let rows: Vec<(&str, &str, &str)> = found
        .iter()
        .map(|m| (m["a"].as_str(), m["b"].as_str(), m["d"].as_str()))
        .collect();
    assert_eq!(rows, [("aaa", "bbb", "ddd")]);
}
1132}