browser_cat/
html.rs

1// HTML head/body parsing and plain-text-to-HTML conversion.
2// Ported from bcat's lib/bcat/html.rb by Ryan Tomayko.
3
4// ── HeadParser ────────────────────────────────────────────────────────────────
5
/// Incrementally parses an HTML stream, separating the `<head>` from the
/// `<body>`. Feed chunks via [`HeadParser::feed`] until [`HeadParser::complete`]
/// returns true, then call [`HeadParser::head`] and [`HeadParser::take_body`].
///
/// Also detects whether the input is HTML at all: if the first non-whitespace
/// character is not `<` the document is treated as plain text.
///
/// Chunk boundaries may fall anywhere, including in the middle of a tag name
/// or between an opening tag and its closing tag. A construct that is not yet
/// complete stays buffered until a later chunk completes it; it is never
/// guessed to be body content.
///
/// Tag names are matched ASCII case-insensitively. Tag ends are located by
/// searching for the next `>`, so a `>` inside an attribute value ends the tag
/// early.
#[derive(Debug, Default)]
pub struct HeadParser {
    /// Input received so far that has not yet been classified as head or body.
    buf: String,
    /// Accumulated `<head>` inner content (script/style/meta/title/link/base
    /// elements and comments), in document order.
    head_parts: Vec<String>,
    /// Everything from the first body character onward.
    body_start: Option<String>,
    /// Whether the input is HTML; `None` until a non-whitespace character has
    /// been seen.
    is_html: Option<bool>,
}

/// Classification of whatever sits at the front of the pending buffer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum HeadToken {
    /// The first `n` bytes are head content that must be preserved.
    Keep(usize),
    /// The first `n` bytes are structural markup (DOCTYPE, `<html>`, `<head>`,
    /// stray closing tags) that is dropped.
    Discard(usize),
    /// The front may be a head construct, but more input is needed to decide
    /// or to find its end.
    Incomplete,
    /// The front is not a head construct: the body starts here.
    Body,
}

impl HeadParser {
    /// Creates an empty parser.
    pub fn new() -> Self {
        Self::default()
    }

    /// Feed the next chunk of input. Returns `true` once the body has started
    /// (i.e. [`HeadParser::complete`] would return `true`).
    ///
    /// After the body has started, further chunks are appended to the body
    /// verbatim without being parsed.
    pub fn feed(&mut self, data: &str) -> bool {
        if let Some(body) = self.body_start.as_mut() {
            // Already complete — accumulate in body directly.
            body.push_str(data);
            return true;
        }
        self.buf.push_str(data);
        self.parse();
        self.complete()
    }

    /// True once the first body character has been seen.
    pub fn complete(&self) -> bool {
        self.body_start.is_some()
    }

    /// True if the input looks like HTML (first non-whitespace char is `<`).
    /// Returns `false` while no non-whitespace input has been seen yet.
    pub fn is_html(&self) -> bool {
        self.is_html.unwrap_or(false)
    }

    /// Inner content of `<head>` (stripped of structural tags like DOCTYPE,
    /// `<html>`, `<head>`), with the preserved pieces joined by newlines.
    pub fn head(&self) -> String {
        self.head_parts.join("\n")
    }

    /// Take and return all body content seen so far, leaving the parser empty.
    /// The caller is responsible for injecting a `<body>` wrapper if needed.
    ///
    /// If the body never started, any input that is still pending (for
    /// example an unterminated tag at the end of the stream) is returned
    /// instead so that no data is lost. A pending buffer holding only
    /// whitespace yields an empty string and is left untouched.
    pub fn take_body(&mut self) -> String {
        if let Some(body) = self.body_start.take() {
            return body;
        }
        if self.buf.trim().is_empty() {
            return String::new();
        }
        std::mem::take(&mut self.buf)
    }

    // ── internal ──────────────────────────────────────────────────────────────

    /// Consumes as many head constructs from the front of `buf` as possible.
    ///
    /// Stops when the buffer is exhausted, when the front is an incomplete
    /// construct (more data needed), or when the body starts — in which case
    /// the rest of the buffer is moved into `body_start`.
    fn parse(&mut self) {
        // Determine html-ness from the first non-whitespace character.
        if self.is_html.is_none() {
            match self.buf.trim_start().chars().next() {
                Some(first) => self.is_html = Some(first == '<'),
                None => return, // only whitespace so far — need more data
            }
        }

        if self.is_html != Some(true) {
            // Plain text — everything (leading whitespace included) is body.
            self.body_start = Some(std::mem::take(&mut self.buf));
            return;
        }

        loop {
            // Whitespace between head constructs carries no meaning.
            let leading_ws = self.buf.len() - self.buf.trim_start().len();
            drop(self.buf.drain(..leading_ws));
            if self.buf.is_empty() {
                return;
            }

            match Self::classify(&self.buf) {
                HeadToken::Keep(len) => {
                    let kept: String = self.buf.drain(..len).collect();
                    self.head_parts.push(kept);
                }
                HeadToken::Discard(len) => drop(self.buf.drain(..len)),
                // Leave the partial construct buffered for the next `feed`.
                HeadToken::Incomplete => return,
                HeadToken::Body => {
                    self.body_start = Some(std::mem::take(&mut self.buf));
                    return;
                }
            }
        }
    }

    /// Classifies the construct at the front of `buf`.
    ///
    /// `buf` must be non-empty and must not start with whitespace. Every
    /// length carried by the returned token is a byte offset on a char
    /// boundary of `buf`.
    fn classify(buf: &str) -> HeadToken {
        const COMMENT_OPEN: &str = "<!--";
        const DOCTYPE_OPEN: &str = "<!DOCTYPE";

        // Text directly after the head constructs starts the body.
        if !buf.starts_with('<') {
            return HeadToken::Body;
        }

        // Comments are preserved as head content.
        if buf.starts_with(COMMENT_OPEN) {
            return match buf.find("-->") {
                Some(at) => HeadToken::Keep(at + "-->".len()),
                None => HeadToken::Incomplete,
            };
        }
        if Self::has_prefix_ignore_ascii_case(buf, DOCTYPE_OPEN) {
            return Self::through_first_gt(buf, HeadToken::Discard);
        }
        // `<`, `<!`, `<!-`, `<!DOC`, …: cannot tell yet what this will become.
        if Self::is_partial_prefix(buf, COMMENT_OPEN) || Self::is_partial_prefix(buf, DOCTYPE_OPEN)
        {
            return HeadToken::Incomplete;
        }

        // Split `<name` / `</name` into its parts.
        let after_lt = &buf[1..];
        let (is_closing, rest) = match after_lt.strip_prefix('/') {
            Some(stripped) => (true, stripped),
            None => (false, after_lt),
        };
        let name_len = rest
            .bytes()
            .take_while(|byte| byte.is_ascii_alphanumeric())
            .count();
        if name_len == rest.len() {
            // The tag name runs to the end of the buffer and may continue in
            // the next chunk (`<he` + `ad>`).
            return HeadToken::Incomplete;
        }
        // The name must be followed by whitespace, `>` or `/`; this keeps
        // `<header>` and `</header>` from being mistaken for `<head>`.
        let at_name_boundary =
            rest[name_len..].starts_with(|c: char| c.is_whitespace() || c == '>' || c == '/');
        if !at_name_boundary {
            return HeadToken::Body;
        }
        let name = rest[..name_len].to_ascii_lowercase();

        match (is_closing, name.as_str()) {
            // Paired elements: keep everything through the closing tag.
            (false, "title" | "script" | "style") => {
                let open_end = match buf.find('>') {
                    Some(at) => at + 1,
                    None => return HeadToken::Incomplete,
                };
                // XHTML-style `<script ... />` has no separate closing tag.
                if buf[..open_end].ends_with("/>") {
                    return HeadToken::Keep(open_end);
                }
                let close = format!("</{}>", name);
                match Self::find_ignore_ascii_case(&buf[open_end..], &close) {
                    Some(at) => HeadToken::Keep(open_end + at + close.len()),
                    None => HeadToken::Incomplete,
                }
            }
            // Void elements end at their own `>`; searching any further would
            // pull body markup (e.g. a later `<br/>`) into the head.
            (false, "meta" | "link" | "base") => Self::through_first_gt(buf, HeadToken::Keep),
            // Structural tags, and stray closing tags of void elements.
            (_, "html" | "head") | (true, "meta" | "link" | "base") => {
                Self::through_first_gt(buf, HeadToken::Discard)
            }
            _ => HeadToken::Body,
        }
    }

    /// Builds a token covering `buf` up to and including its first `>`, or
    /// [`HeadToken::Incomplete`] if the buffer contains no `>` yet.
    fn through_first_gt(buf: &str, make: fn(usize) -> HeadToken) -> HeadToken {
        buf.find('>')
            .map_or(HeadToken::Incomplete, |at| make(at + 1))
    }

    /// True if `text` starts with `prefix`, ignoring ASCII case.
    fn has_prefix_ignore_ascii_case(text: &str, prefix: &str) -> bool {
        text.len() >= prefix.len()
            && text.as_bytes()[..prefix.len()].eq_ignore_ascii_case(prefix.as_bytes())
    }

    /// True if `text` is strictly shorter than `pattern` and matches its
    /// beginning, ignoring ASCII case.
    fn is_partial_prefix(text: &str, pattern: &str) -> bool {
        text.len() < pattern.len()
            && pattern.as_bytes()[..text.len()].eq_ignore_ascii_case(text.as_bytes())
    }

    /// Byte offset of the first ASCII-case-insensitive occurrence of `needle`
    /// in `haystack`. `needle` must be non-empty ASCII.
    fn find_ignore_ascii_case(haystack: &str, needle: &str) -> Option<usize> {
        haystack
            .as_bytes()
            .windows(needle.len())
            .position(|window| window.eq_ignore_ascii_case(needle.as_bytes()))
    }
}
205
206// ── TextFilter ────────────────────────────────────────────────────────────────
207
208/// Wraps plain-text chunks in `<pre>` / `</pre>` HTML.
209///
210/// Produces the opening `<pre>` on the first chunk and `</pre>` only when
211/// [`TextFilter::finish`] is called.
212pub struct TextFilter {
213    opened: bool,
214}
215
216impl TextFilter {
217    pub fn new() -> Self {
218        Self { opened: false }
219    }
220
221    /// Convert a plain-text chunk to HTML. HTML-escapes entities.
222    pub fn filter(&mut self, chunk: &str) -> String {
223        let escaped = html_escape::encode_text(chunk).into_owned();
224        if !self.opened {
225            self.opened = true;
226            format!("<pre>{}", escaped)
227        } else {
228            escaped
229        }
230    }
231
232    /// Returns the closing `</pre>` tag (call once at end of stream).
233    pub fn finish(&self) -> &'static str {
234        if self.opened { "</pre>" } else { "" }
235    }
236}
237
238impl Default for TextFilter {
239    fn default() -> Self {
240        Self::new()
241    }
242}
243
244// ── Tests ─────────────────────────────────────────────────────────────────────
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a parser that has been fed `input` as one single chunk.
    fn parser_fed(input: &str) -> HeadParser {
        let mut parser = HeadParser::new();
        parser.feed(input);
        parser
    }

    // ── HeadParser ──

    #[test]
    fn detects_plain_text() {
        let mut parser = parser_fed("hello world");
        assert!(!parser.is_html());
        assert!(parser.complete());
        assert_eq!(parser.take_body(), "hello world");
    }

    #[test]
    fn detects_html() {
        assert!(parser_fed("<p>hello</p>").is_html());
    }

    #[test]
    fn strips_doctype() {
        let mut parser = parser_fed("<!DOCTYPE html>\n<p>body</p>");
        assert!(parser.is_html());
        assert!(parser.complete());
        assert!(!parser.head().contains("DOCTYPE"));
        let body = parser.take_body();
        assert!(body.contains("<p>body</p>"));
    }

    #[test]
    fn strips_html_head_tags() {
        let mut parser = parser_fed("<html><head></head><body><p>content</p></body></html>");
        assert!(parser.complete());
        let body = parser.take_body();
        assert!(body.contains("<p>content</p>"));
    }

    #[test]
    fn preserves_head_content_tags() {
        let parser = parser_fed(
            "<html><head><title>My Page</title><style>body{}</style></head><body>hi</body></html>",
        );
        assert!(parser.complete());
        let head = parser.head();
        for expected in ["<title>My Page</title>", "<style>body{}</style>"] {
            assert!(head.contains(expected));
        }
    }

    #[test]
    fn fragment_with_no_head() {
        let mut parser = parser_fed("<p>just a fragment</p>");
        assert!(parser.is_html());
        assert!(parser.complete());
        let body = parser.take_body();
        assert!(body.contains("<p>just a fragment</p>"));
    }

    #[test]
    fn whitespace_before_html() {
        assert!(parser_fed("  \n  <p>text</p>").is_html());
    }

    // ── TextFilter ──

    #[test]
    fn text_filter_wraps_in_pre() {
        let html = TextFilter::new().filter("hello");
        assert!(html.starts_with("<pre>"));
        assert!(html.contains("hello"));
    }

    #[test]
    fn text_filter_escapes_entities() {
        let html = TextFilter::new().filter("<b>&</b>");
        assert!(html.contains("&lt;b&gt;"));
        assert!(html.contains("&amp;"));
    }

    #[test]
    fn text_filter_no_double_pre() {
        let mut filter = TextFilter::new();
        let first = filter.filter("first");
        let second = filter.filter("second");
        assert!(first.starts_with("<pre>"));
        assert!(!second.starts_with("<pre>"));
    }

    #[test]
    fn text_filter_finish() {
        let mut filter = TextFilter::new();
        let _ = filter.filter("x");
        assert_eq!(filter.finish(), "</pre>");
    }
}
browser_cat/html.rs

browser_cat/
html.rs