Skip to main content

browser_cat/
html.rs

//! HTML head/body parsing and plain-text-to-HTML conversion.
//!
//! Ported from bcat's lib/bcat/html.rb by Ryan Tomayko.
4
5// ── HeadParser ────────────────────────────────────────────────────────────────
6
/// Incrementally parses an HTML stream, separating the `<head>` from the
/// `<body>`. Feed chunks via [`HeadParser::feed`] until [`HeadParser::complete`]
/// returns true, then call [`HeadParser::head`] and [`HeadParser::take_body`].
///
/// Also detects whether the input is HTML at all: if the first non-whitespace
/// character is not `<` the document is treated as plain text.
///
/// Chunk boundaries may fall anywhere. A head construct that is only partly
/// present (a tag without its `>`, a `<title>` without its `</title>`, a
/// fragment such as `<he`) stays buffered until a later chunk settles it. If
/// the stream ends first, [`HeadParser::take_body`] hands the undecided
/// remainder back as body text so no input is lost.
#[derive(Debug, Default)]
pub struct HeadParser {
    /// Input received so far that has not yet been classified as head or body.
    buf: String,
    /// Accumulated `<head>` inner content (script/style/meta/title/link/base
    /// elements and comments), one entry per construct, kept verbatim.
    head_parts: Vec<String>,
    /// Everything from the first body character onward. `Some` from the moment
    /// the body starts; it stays `Some` afterwards.
    body_start: Option<String>,
    /// Whether we've determined the input is HTML. `None` until the first
    /// non-whitespace character has been seen.
    is_html: Option<bool>,
}

/// Classification of whatever sits at the front of the pending buffer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum HeadScan {
    /// The first `n` bytes are head content that must be preserved.
    Keep(usize),
    /// The first `n` bytes are a structural tag (DOCTYPE, html, head) to drop.
    Discard(usize),
    /// The front may be a head construct, but more input is needed to tell.
    Incomplete,
    /// The front is not a head construct: the body starts here.
    Body,
}

/// How a tag opener such as `<meta` relates to the front of the buffer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Lead {
    /// The buffer starts with the opener (and a tag-name delimiter if required).
    Present,
    /// The buffer is too short to decide; everything seen so far matches.
    Partial,
    /// The buffer definitely does not start with the opener.
    Absent,
}

impl HeadParser {
    /// Openers of structural tags, which are recognised and thrown away.
    const STRUCTURAL_OPENERS: [&'static str; 4] = ["<html", "</html", "<head", "</head"];
    /// Openers of head elements that have no closing tag; they end at `>`.
    const VOID_OPENERS: [&'static str; 3] = ["<meta", "<link", "<base"];
    /// `(opener, closing tag)` of head elements that carry content.
    const PAIRED_TAGS: [(&'static str, &'static str); 3] = [
        ("<title", "</title>"),
        ("<script", "</script>"),
        ("<style", "</style>"),
    ];

    /// Creates an empty parser.
    pub fn new() -> Self {
        Self::default()
    }

    /// Feed the next chunk of input. Returns `true` once the body has started
    /// (i.e. [`HeadParser::complete`] would return `true`).
    ///
    /// After the body has started, chunks are appended to the body untouched.
    pub fn feed(&mut self, data: &str) -> bool {
        if let Some(body) = self.body_start.as_mut() {
            // Already complete — accumulate in body directly.
            body.push_str(data);
            return true;
        }
        self.buf.push_str(data);
        self.parse();
        self.complete()
    }

    /// True once the first body character has been seen. Never reverts to
    /// `false`, not even after [`HeadParser::take_body`].
    pub fn complete(&self) -> bool {
        self.body_start.is_some()
    }

    /// True if the input looks like HTML (first non-whitespace char is `<`).
    /// Returns `false` while no non-whitespace input has been seen.
    pub fn is_html(&self) -> bool {
        self.is_html.unwrap_or(false)
    }

    /// Inner content of `<head>` (stripped of structural tags like DOCTYPE,
    /// `<html>`, `<head>`), with the preserved constructs joined by newlines.
    pub fn head(&self) -> String {
        self.head_parts.join("\n")
    }

    /// Take and return all body content seen so far.
    /// The caller is responsible for injecting a `<body>` wrapper if needed.
    ///
    /// The parser remains complete afterwards, so later chunks keep flowing
    /// into the body instead of being re-parsed as head content.
    ///
    /// If the body has not started but an undecided head construct is still
    /// buffered (typically because the stream ended in the middle of it), that
    /// remainder is returned as body text and the parser becomes complete.
    /// Otherwise an empty string is returned.
    pub fn take_body(&mut self) -> String {
        if let Some(body) = self.body_start.as_mut() {
            return std::mem::take(body);
        }
        if self.is_html.is_some() && !self.buf.is_empty() {
            self.body_start = Some(String::new());
            return std::mem::take(&mut self.buf);
        }
        String::new()
    }

    // ── internal ──────────────────────────────────────────────────────────────

    /// Consumes as many head constructs as possible from the front of `buf`,
    /// stopping when the body starts or when more input is required.
    fn parse(&mut self) {
        // Determine html-ness from first non-whitespace character.
        if self.is_html.is_none() {
            let first = self.buf.trim_start();
            if first.is_empty() {
                return; // need more data
            }
            self.is_html = Some(first.starts_with('<'));
        }

        if self.is_html != Some(true) {
            // Plain text — everything is body, leading whitespace included.
            self.body_start = Some(std::mem::take(&mut self.buf));
            return;
        }

        loop {
            // Whitespace between head constructs carries no meaning; drop it.
            let blank = self.buf.len() - self.buf.trim_start().len();
            self.buf.replace_range(..blank, "");
            if self.buf.is_empty() {
                return;
            }

            match Self::scan(&self.buf) {
                HeadScan::Keep(len) => {
                    let rest = self.buf.split_off(len);
                    let token = std::mem::replace(&mut self.buf, rest);
                    self.head_parts.push(token);
                }
                HeadScan::Discard(len) => self.buf.replace_range(..len, ""),
                // Leave the buffer alone; the next chunk may complete it.
                HeadScan::Incomplete => return,
                HeadScan::Body => {
                    self.body_start = Some(std::mem::take(&mut self.buf));
                    return;
                }
            }
        }
    }

    /// Classifies the construct at the front of `buf`, which must be non-empty
    /// and must not start with whitespace. Tag names match case-insensitively.
    ///
    /// Tags end at the first `>`; a `>` inside an attribute value is not
    /// recognised as such.
    fn scan(buf: &str) -> HeadScan {
        // Becomes true when `buf` is a strict prefix of some head opener, in
        // which case "not a head tag" cannot be concluded yet.
        let mut undecided = false;
        let mut at_front = |literal: &str, delimited: bool| -> bool {
            match Self::lead(buf, literal, delimited) {
                Lead::Present => true,
                Lead::Partial => {
                    undecided = true;
                    false
                }
                Lead::Absent => false,
            }
        };

        // DOCTYPE — structural, discard.
        if at_front("<!doctype", false) {
            return Self::through(buf, ">").map_or(HeadScan::Incomplete, HeadScan::Discard);
        }

        // Comments <!-- ... --> in head are preserved.
        if at_front("<!--", false) {
            return Self::through(buf, "-->").map_or(HeadScan::Incomplete, HeadScan::Keep);
        }

        // <html ...>, </html>, <head ...>, </head> — structural, discard.
        // The delimiter check keeps `<header>` / `</header>` out of this branch.
        for &opener in Self::STRUCTURAL_OPENERS.iter() {
            if at_front(opener, true) {
                return Self::through(buf, ">").map_or(HeadScan::Incomplete, HeadScan::Discard);
            }
        }

        // <meta>, <link>, <base> have no content: the tag ends at its own `>`,
        // whether or not it is written self-closing.
        for &opener in Self::VOID_OPENERS.iter() {
            if at_front(opener, true) {
                return Self::through(buf, ">").map_or(HeadScan::Incomplete, HeadScan::Keep);
            }
        }

        // <title>, <script>, <style> run through their closing tag.
        for &(opener, closer) in Self::PAIRED_TAGS.iter() {
            if at_front(opener, true) {
                if let Some(at) = Self::find_ignore_ascii_case(buf, closer) {
                    return HeadScan::Keep(at + closer.len());
                }
                // XHTML-style `<script ... />` is accepted as a whole element.
                return match buf.find('>') {
                    Some(gt) if buf[..gt].ends_with('/') => HeadScan::Keep(gt + 1),
                    _ => HeadScan::Incomplete,
                };
            }
        }

        if undecided {
            HeadScan::Incomplete
        } else {
            HeadScan::Body
        }
    }

    /// Compares the front of `buf` with the ASCII `literal`, ignoring ASCII
    /// case. With `delimited`, the literal must additionally be followed by
    /// whitespace, `>` or `/` so that `<head` does not match `<header`.
    fn lead(buf: &str, literal: &str, delimited: bool) -> Lead {
        let (have, want) = (buf.as_bytes(), literal.as_bytes());
        if have.len() < want.len() {
            return if want[..have.len()].eq_ignore_ascii_case(have) {
                Lead::Partial
            } else {
                Lead::Absent
            };
        }
        if !have[..want.len()].eq_ignore_ascii_case(want) {
            return Lead::Absent;
        }
        if !delimited {
            return Lead::Present;
        }
        // The matched prefix is pure ASCII, so this slice is on a char boundary.
        match buf[want.len()..].chars().next() {
            None => Lead::Partial,
            Some(c) if c.is_whitespace() || c == '>' || c == '/' => Lead::Present,
            Some(_) => Lead::Absent,
        }
    }

    /// Byte length of `buf` up to and including the first `terminator`.
    fn through(buf: &str, terminator: &str) -> Option<usize> {
        buf.find(terminator).map(|at| at + terminator.len())
    }

    /// Byte offset of the first ASCII-case-insensitive occurrence of the
    /// non-empty ASCII `needle` in `haystack`.
    fn find_ignore_ascii_case(haystack: &str, needle: &str) -> Option<usize> {
        let needle = needle.as_bytes();
        haystack
            .as_bytes()
            .windows(needle.len())
            .position(|window| window.eq_ignore_ascii_case(needle))
    }
}
206
207// ── TextFilter ────────────────────────────────────────────────────────────────
208
209/// Wraps plain-text chunks in `<pre>` / `</pre>` HTML.
210///
211/// Produces the opening `<pre>` on the first chunk and `</pre>` only when
212/// [`TextFilter::finish`] is called.
213pub struct TextFilter {
214    opened: bool,
215}
216
217impl TextFilter {
218    pub fn new() -> Self {
219        Self { opened: false }
220    }
221
222    /// Convert a plain-text chunk to HTML. HTML-escapes entities.
223    pub fn filter(&mut self, chunk: &str) -> String {
224        let escaped = html_escape::encode_text(chunk).into_owned();
225        if !self.opened {
226            self.opened = true;
227            format!("<pre>{}", escaped)
228        } else {
229            escaped
230        }
231    }
232
233    /// Returns the closing `</pre>` tag (call once at end of stream).
234    pub fn finish(&self) -> &'static str {
235        if self.opened { "</pre>" } else { "" }
236    }
237}
238
239impl Default for TextFilter {
240    fn default() -> Self {
241        Self::new()
242    }
243}
244
245// ── Tests ─────────────────────────────────────────────────────────────────────
246
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns a parser that has consumed `input` as a single chunk.
    fn fed(input: &str) -> HeadParser {
        let mut parser = HeadParser::new();
        parser.feed(input);
        parser
    }

    /// Runs `input` through a fresh [`TextFilter`] as its first chunk.
    fn first_chunk(input: &str) -> String {
        TextFilter::new().filter(input)
    }

    // ── HeadParser ──

    /// Input that does not start with `<` is body text in its entirety.
    #[test]
    fn detects_plain_text() {
        let mut parser = fed("hello world");
        assert!(!parser.is_html());
        assert!(parser.complete());
        assert_eq!(parser.take_body(), "hello world");
    }

    /// Input that starts with `<` is recognised as HTML.
    #[test]
    fn detects_html() {
        assert!(fed("<p>hello</p>").is_html());
    }

    /// The DOCTYPE is dropped from both head and body.
    #[test]
    fn strips_doctype() {
        let mut parser = fed("<!DOCTYPE html>\n<p>body</p>");
        assert!(parser.is_html());
        assert!(parser.complete());
        assert!(!parser.head().contains("DOCTYPE"));
        assert!(parser.take_body().contains("<p>body</p>"));
    }

    /// `<html>` and `<head>` wrappers do not prevent the body from being found.
    #[test]
    fn strips_html_head_tags() {
        let mut parser = fed("<html><head></head><body><p>content</p></body></html>");
        assert!(parser.complete());
        assert!(parser.take_body().contains("<p>content</p>"));
    }

    /// Title and style elements end up verbatim in the head.
    #[test]
    fn preserves_head_content_tags() {
        let parser = fed(
            "<html><head><title>My Page</title><style>body{}</style></head><body>hi</body></html>",
        );
        assert!(parser.complete());
        let head = parser.head();
        for expected in ["<title>My Page</title>", "<style>body{}</style>"].iter() {
            assert!(head.contains(expected));
        }
    }

    /// A bare fragment is HTML whose body starts immediately.
    #[test]
    fn fragment_with_no_head() {
        let mut parser = fed("<p>just a fragment</p>");
        assert!(parser.is_html());
        assert!(parser.complete());
        assert!(parser.take_body().contains("<p>just a fragment</p>"));
    }

    /// Leading whitespace is ignored when sniffing for HTML.
    #[test]
    fn whitespace_before_html() {
        assert!(fed("  \n  <p>text</p>").is_html());
    }

    // ── TextFilter ──

    /// The first chunk is prefixed with the opening tag.
    #[test]
    fn text_filter_wraps_in_pre() {
        let html = first_chunk("hello");
        assert!(html.starts_with("<pre>"));
        assert!(html.contains("hello"));
    }

    /// Markup characters in the text are entity-escaped.
    #[test]
    fn text_filter_escapes_entities() {
        let html = first_chunk("<b>&</b>");
        assert!(html.contains("&lt;b&gt;"));
        assert!(html.contains("&amp;"));
    }

    /// Only the first chunk carries the opening tag.
    #[test]
    fn text_filter_no_double_pre() {
        let mut filter = TextFilter::new();
        let first = filter.filter("first");
        let second = filter.filter("second");
        assert!(first.starts_with("<pre>"));
        assert!(!second.starts_with("<pre>"));
    }

    /// Once a chunk has been filtered, `finish` yields the closing tag.
    #[test]
    fn text_filter_finish() {
        let mut filter = TextFilter::new();
        filter.filter("x");
        assert_eq!(filter.finish(), "</pre>");
    }
}