1#![forbid(unsafe_code)]
2#![doc = include_str!("../README.md")]
3
/// A single attribute parsed from an HTML tag.
///
/// When produced by [`extract_attributes`], `name` is ASCII-lowercased and
/// `value` is `None` for attributes written without an `=` (for example
/// `disabled`), or `Some` (possibly empty) when an `=` was present.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlAttribute {
    /// The attribute name.
    pub name: String,
    /// The attribute value, if one was given.
    pub value: Option<String>,
}
10
11#[derive(Debug, Clone, PartialEq, Eq)]
13pub struct HtmlElement {
14 pub name: String,
15 pub attributes: Vec<HtmlAttribute>,
16}
17
/// A hyperlink: its visible text and its target.
///
/// When produced by [`extract_links`], `text` is the anchor's content with
/// tags stripped and surrounding whitespace trimmed, and `href` is the value
/// of the anchor's `href` attribute exactly as written.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlLink {
    /// The link text.
    pub text: String,
    /// The link target.
    pub href: String,
}
24
/// A heading: its level and its text.
///
/// When produced by [`extract_headings`], `level` is in `1..=6` (for `<h1>`
/// through `<h6>`) and `text` is the heading content with tags stripped and
/// surrounding whitespace trimmed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlHeading {
    /// The heading level.
    pub level: u8,
    /// The heading text.
    pub text: String,
}
31
/// Cheap heuristic for "does this text contain HTML markup?".
///
/// Returns `true` when the trimmed input contains a `>` anywhere and also a
/// `<` that is immediately followed by an ASCII letter, `/`, or `!`
/// (the start of an open tag, a close tag, or a comment/doctype).
/// Empty or whitespace-only input is never considered HTML.
#[must_use]
pub fn looks_like_html(input: &str) -> bool {
    let candidate = input.trim();
    if candidate.is_empty() || !candidate.contains('>') {
        return false;
    }

    candidate.as_bytes().windows(2).any(|pair| match pair {
        [b'<', next] => next.is_ascii_alphabetic() || *next == b'/' || *next == b'!',
        _ => false,
    })
}
45
/// Removes every `<!-- ... -->` comment from `input`.
///
/// Text outside comments is copied through unchanged. If a comment is opened
/// but never closed, everything from its `<!--` to the end of the input is
/// dropped.
#[must_use]
pub fn strip_html_comments(input: &str) -> String {
    let mut output = String::new();
    let mut rest = input;

    loop {
        // No further comment opener: keep the tail verbatim and finish.
        let Some((before, after_open)) = rest.split_once("<!--") else {
            output.push_str(rest);
            break;
        };
        output.push_str(before);

        match after_open.split_once("-->") {
            Some((_comment, after_close)) => rest = after_close,
            // Unterminated comment swallows the remainder of the input.
            None => break,
        }
    }

    output
}
66
/// Drops everything between `<` and `>` (inclusive) and returns the rest.
///
/// This is a character-level filter, not a parser: every `<` starts a
/// skipped region, every `>` ends one, and neither character is ever emitted.
/// A `<` with no later `>` therefore discards the remainder of the input.
#[must_use]
pub fn strip_tags_basic(input: &str) -> String {
    let mut within_tag = false;

    input
        .chars()
        .filter(|&symbol| {
            if symbol == '<' {
                within_tag = true;
                false
            } else if symbol == '>' {
                within_tag = false;
                false
            } else {
                !within_tag
            }
        })
        .collect()
}
84
/// Escapes the five HTML-significant characters in `input`.
///
/// | character | replacement |
/// |-----------|-------------|
/// | `&`       | `&amp;`     |
/// | `<`       | `&lt;`      |
/// | `>`       | `&gt;`      |
/// | `"`       | `&quot;`    |
/// | `'`       | `&#39;`     |
///
/// All other characters are copied through unchanged. The input is processed
/// in a single pass, so an `&` that is already part of an entity is escaped
/// again (`&amp;` becomes `&amp;amp;`).
#[must_use]
pub fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());

    for character in input.chars() {
        match character {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            _ => escaped.push(character),
        }
    }

    escaped
}
95
/// Decodes the basic HTML entities in `input` back to their characters.
///
/// Recognised entities: `&lt;`, `&gt;`, `&quot;`, `&amp;`, and three
/// spellings of the apostrophe (`&#39;`, `&#x27;`, `&apos;`). Matching is
/// case-sensitive. Any other `&` is copied through unchanged.
///
/// Decoding is single-pass: output is never re-scanned, so `&amp;lt;`
/// becomes the text `&lt;` rather than `<`.
#[must_use]
pub fn unescape_html(input: &str) -> String {
    const ENTITIES: [(&str, char); 7] = [
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&#39;", '\''),
        ("&#x27;", '\''),
        ("&apos;", '\''),
        ("&amp;", '&'),
    ];

    let mut result = String::with_capacity(input.len());
    let mut remainder = input;

    while let Some(position) = remainder.find('&') {
        result.push_str(&remainder[..position]);
        let candidate = &remainder[position..];

        let matched = ENTITIES
            .iter()
            .find(|&&(entity, _)| candidate.starts_with(entity));

        if let Some(&(entity, replacement)) = matched {
            result.push(replacement);
            remainder = &candidate[entity.len()..];
        } else {
            // Not a known entity: keep the ampersand literally and move on.
            result.push('&');
            remainder = &candidate[1..];
        }
    }

    result.push_str(remainder);
    result
}
106
107#[must_use]
109pub fn extract_links(input: &str) -> Vec<HtmlLink> {
110 let lower = input.to_ascii_lowercase();
111 let mut results = Vec::new();
112 let mut search_start = 0;
113
114 while let Some(start_offset) = lower[search_start..].find("<a") {
115 let start = search_start + start_offset;
116 let Some(open_end_offset) = lower[start..].find('>') else {
117 break;
118 };
119 let open_end = start + open_end_offset;
120 let Some(close_offset) = lower[open_end + 1..].find("</a>") else {
121 break;
122 };
123 let close_start = open_end + 1 + close_offset;
124 let element = &input[start..=open_end];
125 if let Some(href) = get_attribute(element, "href") {
126 let text = strip_tags_basic(&input[open_end + 1..close_start])
127 .trim()
128 .to_string();
129 results.push(HtmlLink { text, href });
130 }
131
132 search_start = close_start + 4;
133 }
134
135 results
136}
137
138#[must_use]
140pub fn extract_headings(input: &str) -> Vec<HtmlHeading> {
141 let lower = input.to_ascii_lowercase();
142 let mut results = Vec::new();
143 let mut search_start = 0;
144
145 while let Some(start_offset) = lower[search_start..].find("<h") {
146 let start = search_start + start_offset;
147 let bytes = lower.as_bytes();
148 let Some(level_byte) = bytes.get(start + 2) else {
149 break;
150 };
151 if !(b'1'..=b'6').contains(level_byte) {
152 search_start = start + 2;
153 continue;
154 }
155
156 let after_level = bytes.get(start + 3).copied();
157 if let Some(after_level) = after_level {
158 if after_level != b'>' && !after_level.is_ascii_whitespace() {
159 search_start = start + 2;
160 continue;
161 }
162 }
163
164 let Some(open_end_offset) = lower[start..].find('>') else {
165 break;
166 };
167 let open_end = start + open_end_offset;
168 let level = level_byte - b'0';
169 let close_tag = format!("</h{level}>");
170 let Some(close_offset) = lower[open_end + 1..].find(&close_tag) else {
171 break;
172 };
173 let close_start = open_end + 1 + close_offset;
174 let text = strip_tags_basic(&input[open_end + 1..close_start])
175 .trim()
176 .to_string();
177 results.push(HtmlHeading { level, text });
178 search_start = close_start + close_tag.len();
179 }
180
181 results
182}
183
184#[must_use]
186pub fn extract_title(input: &str) -> Option<String> {
187 let lower = input.to_ascii_lowercase();
188 let start = lower.find("<title>")? + 7;
189 let end = lower[start..].find("</title>")? + start;
190 Some(strip_tags_basic(&input[start..end]).trim().to_string())
191}
192
193#[must_use]
195pub fn extract_meta_content(input: &str, name: &str) -> Option<String> {
196 let lower = input.to_ascii_lowercase();
197 let mut search_start = 0;
198
199 while let Some(start_offset) = lower[search_start..].find("<meta") {
200 let start = search_start + start_offset;
201 let end_offset = lower[start..].find('>')?;
202 let end = start + end_offset;
203 let element = &input[start..=end];
204 if get_attribute(element, "name")
205 .as_deref()
206 .is_some_and(|value| value.eq_ignore_ascii_case(name))
207 {
208 return get_attribute(element, "content");
209 }
210 search_start = end + 1;
211 }
212
213 None
214}
215
216#[must_use]
218pub fn extract_attributes(element: &str) -> Vec<HtmlAttribute> {
219 let trimmed = element.trim();
220 if !trimmed.starts_with('<') {
221 return Vec::new();
222 }
223
224 let mut inner = trimmed.trim_start_matches('<').trim_end_matches('>').trim();
225 inner = inner.strip_suffix('/').unwrap_or(inner).trim_end();
226
227 let mut index = 0;
228 let bytes = inner.as_bytes();
229 while index < bytes.len() && !bytes[index].is_ascii_whitespace() {
230 index += 1;
231 }
232
233 let mut attributes = Vec::new();
234 while index < bytes.len() {
235 while index < bytes.len() && bytes[index].is_ascii_whitespace() {
236 index += 1;
237 }
238 if index >= bytes.len() {
239 break;
240 }
241
242 let name_start = index;
243 while index < bytes.len() && !bytes[index].is_ascii_whitespace() && bytes[index] != b'=' {
244 index += 1;
245 }
246 let name = inner[name_start..index].trim();
247 if name.is_empty() {
248 break;
249 }
250
251 while index < bytes.len() && bytes[index].is_ascii_whitespace() {
252 index += 1;
253 }
254
255 let value = if index < bytes.len() && bytes[index] == b'=' {
256 index += 1;
257 while index < bytes.len() && bytes[index].is_ascii_whitespace() {
258 index += 1;
259 }
260 if index >= bytes.len() {
261 Some(String::new())
262 } else {
263 let quote = bytes[index];
264 if quote == b'\'' || quote == b'"' {
265 index += 1;
266 let value_start = index;
267 while index < bytes.len() && bytes[index] != quote {
268 index += 1;
269 }
270 let parsed = inner[value_start..index].to_string();
271 if index < bytes.len() {
272 index += 1;
273 }
274 Some(parsed)
275 } else {
276 let value_start = index;
277 while index < bytes.len() && !bytes[index].is_ascii_whitespace() {
278 index += 1;
279 }
280 Some(inner[value_start..index].to_string())
281 }
282 }
283 } else {
284 None
285 };
286
287 attributes.push(HtmlAttribute {
288 name: name.to_ascii_lowercase(),
289 value,
290 });
291 }
292
293 attributes
294}
295
296#[must_use]
298pub fn get_attribute(element: &str, name: &str) -> Option<String> {
299 let requested = name.trim().to_ascii_lowercase();
300 extract_attributes(element)
301 .into_iter()
302 .find(|attribute| attribute.name == requested)
303 .and_then(|attribute| attribute.value)
304}