is_html/
lib.rs

1use regex::Regex;
2
/// Names of the HTML elements recognised by [`is_html`], in alphabetical order.
///
/// The length is part of the type, so adding or removing a name also requires
/// updating the `117` in the declaration.
const HTML_TAGS: [&str; 117] = [
    // a
    "a", "abbr", "address", "area", "article", "aside", "audio",
    // b
    "b", "base", "bdi", "bdo", "blockquote", "body", "br", "button",
    // c
    "canvas", "caption", "cite", "code", "col", "colgroup",
    // d
    "data", "datalist", "dd", "del", "details", "dfn", "dialog", "div", "dl", "dt",
    // e
    "em", "embed",
    // f
    "fieldset", "figcaption", "figure", "footer", "form",
    // h
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
    // i
    "i", "iframe", "img", "input", "ins",
    // k
    "kbd",
    // l
    "label", "legend", "li", "link",
    // m
    "main", "map", "mark", "math", "menu", "menuitem", "meta", "meter",
    // n
    "nav", "noscript",
    // o
    "object", "ol", "optgroup", "option", "output",
    // p
    "p", "param", "picture", "pre", "progress",
    // q
    "q",
    // r
    "rb", "rp", "rt", "rtc", "ruby",
    // s
    "s", "samp", "script", "section", "select", "slot", "small", "source", "span", "strong",
    "style", "sub", "summary", "sup", "svg",
    // t
    "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time", "title",
    "tr", "track",
    // u
    "u", "ul",
    // v
    "var", "video",
    // w
    "wbr",
];
122
123/// Check if a string is HTML
124pub fn is_html(str: &str) -> bool {
125    let re = Regex::new(r"\s?<!doctype html>|(<html\b[^>]*>|<body\b[^>]*>|<x-[^>]+>)+").unwrap();
126    let re_full_str = HTML_TAGS.map(|x| format!("<{}\\b[^>]*>", x)).join("|");
127    let re_full = Regex::new(re_full_str.as_str()).unwrap();
128    re.is_match(str) || re_full.is_match(str)
129}
130
#[cfg(test)]
mod tests {
    use crate::is_html;

    /// Inputs containing a doctype or a known HTML tag must be detected.
    #[test]
    fn it_works() {
        let html_inputs = [
            // Doctype and document-level tags.
            "<!doctype html>",
            "\n\n<!doctype html><html>",
            "<html>",
            "<html></html>",
            "<html lang='en'></html>",
            "<html><body></html>",
            "<html><body class='no-js'></html>",
            // Ordinary elements, with and without attributes.
            "<p>foo</p>",
            "<a href='#'>foo</a>",
        ];

        for input in html_inputs {
            assert!(is_html(input), "expected {:?} to be detected as HTML", input);
        }
    }

    /// Unknown tags, including ones that merely start with a known tag name,
    /// must be rejected.
    #[test]
    fn it_not_works() {
        let non_html_inputs = [
            "<cake>foo</cake>",
            "<any>rocks</any>",
            "<bodyx>not</bodyx>",
        ];

        for input in non_html_inputs {
            assert!(!is_html(input), "expected {:?} not to be detected as HTML", input);
        }
    }
}