// Source file: use_html/lib.rs

1#![forbid(unsafe_code)]
2#![doc = include_str!("../README.md")]
3
/// A lightweight HTML attribute.
///
/// Derives `Hash` in addition to equality so attributes can be deduplicated
/// in a `HashSet` or used as map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct HtmlAttribute {
    /// Attribute name.
    pub name: String,
    /// Attribute value; `None` when the attribute has no `=value` part.
    pub value: Option<String>,
}

/// A lightweight HTML element view.
///
/// Derives `Hash` in addition to equality so elements can be deduplicated
/// in a `HashSet` or used as map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct HtmlElement {
    /// Element (tag) name.
    pub name: String,
    /// Attributes attached to the element.
    pub attributes: Vec<HtmlAttribute>,
}
17
/// A lightweight link extracted from an anchor tag.
///
/// Derives `Hash` in addition to equality so extracted links can be
/// deduplicated in a `HashSet` or used as map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct HtmlLink {
    /// Text content of the anchor.
    pub text: String,
    /// Value of the anchor's `href` attribute.
    pub href: String,
}
24
/// A lightweight heading extracted from `h1` through `h6`.
///
/// Derives `Hash` in addition to equality so headings can be deduplicated
/// in a `HashSet` or used as map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct HtmlHeading {
    /// Heading level, i.e. the digit of the `h1`..`h6` tag name.
    pub level: u8,
    /// Text content of the heading.
    pub text: String,
}
31
/// Returns `true` when the input contains tag-like HTML markup.
///
/// Markup is considered tag-like when a `<` is immediately followed by an
/// ASCII letter, `/`, or `!`, and a `>` appears somewhere *after* that opener.
/// A `>` that only occurs before the opener (as in `"1 > 0 and <b"`) does not
/// count, because no tag is ever closed in such input.
///
/// Empty and whitespace-only input yields `false`.
#[must_use]
pub fn looks_like_html(input: &str) -> bool {
    let bytes = input.as_bytes();

    // Without any `>` there is nothing that could terminate a tag.
    let Some(last_close) = bytes.iter().rposition(|&byte| byte == b'>') else {
        return false;
    };

    // Only openers that sit entirely before the last `>` can be closed by it.
    bytes[..last_close].windows(2).any(|window| {
        window[0] == b'<' && (window[1].is_ascii_alphabetic() || matches!(window[1], b'/' | b'!'))
    })
}
45
/// Removes HTML comments from the input.
///
/// Every `<!-- ... -->` span is dropped and the surrounding text is kept
/// verbatim. A comment that is opened but never closed swallows everything
/// up to the end of the input.
#[must_use]
pub fn strip_html_comments(input: &str) -> String {
    const COMMENT_OPEN: &str = "<!--";
    const COMMENT_CLOSE: &str = "-->";

    let mut output = String::new();
    let mut rest = input;

    loop {
        // No further comment opener: the tail is plain text.
        let Some((before, after_open)) = rest.split_once(COMMENT_OPEN) else {
            output.push_str(rest);
            return output;
        };
        output.push_str(before);

        match after_open.split_once(COMMENT_CLOSE) {
            Some((_comment_body, after_close)) => rest = after_close,
            // Unterminated comment: nothing after the opener survives.
            None => return output,
        }
    }
}
66
/// Removes tag-like markup with simple angle-bracket stripping.
///
/// Everything from a `<` up to and including the next `>` is discarded.
/// The brackets themselves are never emitted, so a stray `>` outside a tag
/// is dropped too, and an unmatched `<` discards the rest of the input.
#[must_use]
pub fn strip_tags_basic(input: &str) -> String {
    let mut in_tag = false;

    input
        .chars()
        .filter(|&current| match current {
            '<' => {
                in_tag = true;
                false
            }
            '>' => {
                in_tag = false;
                false
            }
            _ => !in_tag,
        })
        .collect()
}
84
/// Escapes common HTML-sensitive characters.
///
/// Replaces `&`, `<`, `>`, `"` and `'` with `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#39;` respectively; every other character is copied as is.
#[must_use]
pub fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());

    for current in input.chars() {
        match current {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }

    escaped
}
95
/// Unescapes common HTML entities.
///
/// Handles `&lt;`, `&gt;`, `&quot;`, `&#39;` and `&amp;`. The replacements
/// are applied one after another in that order; `&amp;` goes last so that a
/// doubly escaped entity such as `&amp;lt;` is only unescaped one level.
#[must_use]
pub fn unescape_html(input: &str) -> String {
    // Order matters: `&amp;` must stay the final entry.
    const ENTITIES: [(&str, &str); 5] = [
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&amp;", "&"),
    ];

    ENTITIES
        .iter()
        .fold(input.to_string(), |text, (entity, plain)| {
            text.replace(entity, plain)
        })
}
106
107/// Extracts anchor tags with `href` attributes.
108#[must_use]
109pub fn extract_links(input: &str) -> Vec<HtmlLink> {
110    let lower = input.to_ascii_lowercase();
111    let mut results = Vec::new();
112    let mut search_start = 0;
113
114    while let Some(start_offset) = lower[search_start..].find("<a") {
115        let start = search_start + start_offset;
116        let Some(open_end_offset) = lower[start..].find('>') else {
117            break;
118        };
119        let open_end = start + open_end_offset;
120        let Some(close_offset) = lower[open_end + 1..].find("</a>") else {
121            break;
122        };
123        let close_start = open_end + 1 + close_offset;
124        let element = &input[start..=open_end];
125        if let Some(href) = get_attribute(element, "href") {
126            let text = strip_tags_basic(&input[open_end + 1..close_start])
127                .trim()
128                .to_string();
129            results.push(HtmlLink { text, href });
130        }
131
132        search_start = close_start + 4;
133    }
134
135    results
136}
137
138/// Extracts heading tags in document order.
139#[must_use]
140pub fn extract_headings(input: &str) -> Vec<HtmlHeading> {
141    let lower = input.to_ascii_lowercase();
142    let mut results = Vec::new();
143    let mut search_start = 0;
144
145    while let Some(start_offset) = lower[search_start..].find("<h") {
146        let start = search_start + start_offset;
147        let bytes = lower.as_bytes();
148        let Some(level_byte) = bytes.get(start + 2) else {
149            break;
150        };
151        if !(b'1'..=b'6').contains(level_byte) {
152            search_start = start + 2;
153            continue;
154        }
155
156        let after_level = bytes.get(start + 3).copied();
157        if let Some(after_level) = after_level {
158            if after_level != b'>' && !after_level.is_ascii_whitespace() {
159                search_start = start + 2;
160                continue;
161            }
162        }
163
164        let Some(open_end_offset) = lower[start..].find('>') else {
165            break;
166        };
167        let open_end = start + open_end_offset;
168        let level = level_byte - b'0';
169        let close_tag = format!("</h{level}>");
170        let Some(close_offset) = lower[open_end + 1..].find(&close_tag) else {
171            break;
172        };
173        let close_start = open_end + 1 + close_offset;
174        let text = strip_tags_basic(&input[open_end + 1..close_start])
175            .trim()
176            .to_string();
177        results.push(HtmlHeading { level, text });
178        search_start = close_start + close_tag.len();
179    }
180
181    results
182}
183
184/// Extracts the title text from the first `<title>` tag.
185#[must_use]
186pub fn extract_title(input: &str) -> Option<String> {
187    let lower = input.to_ascii_lowercase();
188    let start = lower.find("<title>")? + 7;
189    let end = lower[start..].find("</title>")? + start;
190    Some(strip_tags_basic(&input[start..end]).trim().to_string())
191}
192
193/// Extracts a `<meta name="..." content="...">` value.
194#[must_use]
195pub fn extract_meta_content(input: &str, name: &str) -> Option<String> {
196    let lower = input.to_ascii_lowercase();
197    let mut search_start = 0;
198
199    while let Some(start_offset) = lower[search_start..].find("<meta") {
200        let start = search_start + start_offset;
201        let end_offset = lower[start..].find('>')?;
202        let end = start + end_offset;
203        let element = &input[start..=end];
204        if get_attribute(element, "name")
205            .as_deref()
206            .is_some_and(|value| value.eq_ignore_ascii_case(name))
207        {
208            return get_attribute(element, "content");
209        }
210        search_start = end + 1;
211    }
212
213    None
214}
215
216/// Extracts attributes from an opening tag.
217#[must_use]
218pub fn extract_attributes(element: &str) -> Vec<HtmlAttribute> {
219    let trimmed = element.trim();
220    if !trimmed.starts_with('<') {
221        return Vec::new();
222    }
223
224    let mut inner = trimmed.trim_start_matches('<').trim_end_matches('>').trim();
225    inner = inner.strip_suffix('/').unwrap_or(inner).trim_end();
226
227    let mut index = 0;
228    let bytes = inner.as_bytes();
229    while index < bytes.len() && !bytes[index].is_ascii_whitespace() {
230        index += 1;
231    }
232
233    let mut attributes = Vec::new();
234    while index < bytes.len() {
235        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
236            index += 1;
237        }
238        if index >= bytes.len() {
239            break;
240        }
241
242        let name_start = index;
243        while index < bytes.len() && !bytes[index].is_ascii_whitespace() && bytes[index] != b'=' {
244            index += 1;
245        }
246        let name = inner[name_start..index].trim();
247        if name.is_empty() {
248            break;
249        }
250
251        while index < bytes.len() && bytes[index].is_ascii_whitespace() {
252            index += 1;
253        }
254
255        let value = if index < bytes.len() && bytes[index] == b'=' {
256            index += 1;
257            while index < bytes.len() && bytes[index].is_ascii_whitespace() {
258                index += 1;
259            }
260            if index >= bytes.len() {
261                Some(String::new())
262            } else {
263                let quote = bytes[index];
264                if quote == b'\'' || quote == b'"' {
265                    index += 1;
266                    let value_start = index;
267                    while index < bytes.len() && bytes[index] != quote {
268                        index += 1;
269                    }
270                    let parsed = inner[value_start..index].to_string();
271                    if index < bytes.len() {
272                        index += 1;
273                    }
274                    Some(parsed)
275                } else {
276                    let value_start = index;
277                    while index < bytes.len() && !bytes[index].is_ascii_whitespace() {
278                        index += 1;
279                    }
280                    Some(inner[value_start..index].to_string())
281                }
282            }
283        } else {
284            None
285        };
286
287        attributes.push(HtmlAttribute {
288            name: name.to_ascii_lowercase(),
289            value,
290        });
291    }
292
293    attributes
294}
295
296/// Returns the named attribute value from an element when present.
297#[must_use]
298pub fn get_attribute(element: &str, name: &str) -> Option<String> {
299    let requested = name.trim().to_ascii_lowercase();
300    extract_attributes(element)
301        .into_iter()
302        .find(|attribute| attribute.name == requested)
303        .and_then(|attribute| attribute.value)
304}