/// A hyperlink collected while rendering a document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Link {
    /// Visible anchor text, stored with surrounding whitespace trimmed.
    pub text: String,
    /// Link target taken from the anchor's `href` attribute.
    pub href: String,
}
16
/// Result of converting an HTML document with [`parse`].
#[derive(Debug, Clone)]
pub struct HtmlDoc {
    /// Text of the `<title>` element, or `None` when it is missing or blank.
    pub title: Option<String>,
    /// Markdown rendering of the selected content.
    pub markdown: String,
    /// Links emitted into `markdown`, in the order they were rendered.
    pub links: Vec<Link>,
}
24
25pub fn parse(html: &str) -> HtmlDoc {
27 let title = extract_title(html);
28 let content = select_main(html);
29 let mut renderer = Renderer::default();
30 for token in tokenize(content) {
31 renderer.consume(&token);
32 }
33 let markdown = normalize(&renderer.out);
34 HtmlDoc {
35 title,
36 markdown,
37 links: renderer.links,
38 }
39}
40
/// Returns the document title without rendering the body.
///
/// Public wrapper around `extract_title`; the result is whatever that helper
/// reports for `html`.
pub fn title(html: &str) -> Option<String> {
    extract_title(html)
}
45
46pub fn markdown_to_text(markdown: &str) -> String {
48 let mut out = String::with_capacity(markdown.len());
49 let mut in_fence = false;
50 for line in markdown.lines() {
51 if line.trim_start().starts_with("```") {
52 in_fence = !in_fence;
53 continue;
54 }
55 if in_fence {
56 out.push_str(line);
57 out.push('\n');
58 continue;
59 }
60 let stripped = strip_inline_markup(line);
61 out.push_str(&stripped);
62 out.push('\n');
63 }
64 out.trim().to_string()
65}
66
67fn strip_inline_markup(line: &str) -> String {
68 let without_heading = line.trim_start().trim_start_matches('#').trim_start();
69 replace_links_with_text(without_heading)
70}
71
/// Rewrites every `[label](target)` occurrence in `s` to just `label`.
///
/// A `[` that is not followed by `](` and a closing `)` is copied through
/// unchanged, as is every other character.
fn replace_links_with_text(s: &str) -> String {
    /// Given the text right after a `[`, returns `(label, text after the link)`
    /// when a complete `label](target)` follows.
    fn split_link(after_bracket: &str) -> Option<(&str, &str)> {
        let (label, after_label) = after_bracket.split_once(']')?;
        let target = after_label.strip_prefix('(')?;
        let (_, tail) = target.split_once(')')?;
        Some((label, tail))
    }

    let mut plain = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(ch) = rest.chars().next() {
        if ch == '[' {
            if let Some((label, tail)) = split_link(&rest[1..]) {
                plain.push_str(label);
                rest = tail;
                continue;
            }
        }
        plain.push(ch);
        rest = &rest[ch.len_utf8()..];
    }
    plain
}
97
/// Length in bytes of the UTF-8 sequence introduced by the lead byte `first`.
///
/// Any byte that is not a one-, two- or three-byte lead (including
/// continuation bytes) is reported as 4.
fn utf8_len(first: u8) -> usize {
    if first & 0x80 == 0 {
        1
    } else if first & 0xE0 == 0xC0 {
        2
    } else if first & 0xF0 == 0xE0 {
        3
    } else {
        4
    }
}
106
107fn select_main(html: &str) -> &str {
110 if let Some(inner) = first_element_inner(html, "main") {
111 return inner;
112 }
113 if let Some(inner) = first_element_inner(html, "body") {
114 return inner;
115 }
116 html
117}
118
/// Returns the text between the first `<tag ...>` and the next `</tag` in
/// `html`, matching tag names ASCII-case-insensitively.
///
/// `tag` is expected in lowercase. Candidates whose name merely starts with
/// `tag` (`<mainframe>` when looking for `main`) are skipped rather than
/// aborting the search, and the same rule applies to the closing tag.
/// Returns `None` when no opening tag, no `>` ending it, or no closing tag
/// is found.
fn first_element_inner<'a>(html: &'a str, tag: &str) -> Option<&'a str> {
    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid offsets into `html`.
    let lower = html.to_ascii_lowercase();
    let hay = lower.as_bytes();
    let open_marker = format!("<{tag}");
    let close_marker = format!("</{tag}");

    // Find the first opening tag whose name ends right after `tag`.
    let mut from = 0;
    let open_pos = loop {
        let pos = from + lower[from..].find(&open_marker)?;
        let after_name = pos + open_marker.len();
        if matches!(
            hay.get(after_name),
            Some(b'>' | b' ' | b'\t' | b'\n' | b'\r' | b'/')
        ) {
            break pos;
        }
        from = after_name;
    };
    let gt = open_pos + lower[open_pos..].find('>')?;

    // Find the matching closing tag; end of input right after the name is
    // tolerated so a truncated `</tag` still closes the element.
    let mut from = gt + 1;
    let close_pos = loop {
        let pos = from + lower[from..].find(&close_marker)?;
        let after_name = pos + close_marker.len();
        if matches!(
            hay.get(after_name),
            None | Some(b'>' | b' ' | b'\t' | b'\n' | b'\r' | b'/')
        ) {
            break pos;
        }
        from = after_name;
    };

    Some(&html[gt + 1..close_pos])
}
138
139fn extract_title(html: &str) -> Option<String> {
140 let inner = first_element_inner(html, "title")?;
141 let decoded = decode_entities(inner);
142 let collapsed = collapse_ws(&decoded);
143 let trimmed = collapsed.trim();
144 if trimmed.is_empty() {
145 None
146 } else {
147 Some(trimmed.to_string())
148 }
149}
150
/// One lexical unit produced by `tokenize`.
enum Token<'a> {
    /// An opening tag such as `<a href="x">`.
    Open {
        /// Lowercased tag name.
        name: String,
        /// Raw text following the tag name inside the tag (attributes, plus any
        /// trailing `/`), borrowed from the input.
        attrs: &'a str,
        /// Whether the tag text ends with `/`.
        self_closing: bool,
    },
    /// A closing tag such as `</a>`.
    Close {
        /// Lowercased tag name.
        name: String,
    },
    /// A run of raw text between tags, borrowed from the input.
    Text(&'a str),
}
164
165fn tokenize(html: &str) -> Vec<Token<'_>> {
166 let bytes = html.as_bytes();
167 let n = bytes.len();
168 let mut tokens = Vec::new();
169 let mut i = 0;
170
171 while i < n {
172 if bytes[i] == b'<' {
173 if html[i..].starts_with("<!--") {
174 match html[i + 4..].find("-->") {
175 Some(end) => i = i + 4 + end + 3,
176 None => break,
177 }
178 continue;
179 }
180 if i + 1 < n && bytes[i + 1] == b'!' {
181 match html[i..].find('>') {
182 Some(end) => i += end + 1,
183 None => break,
184 }
185 continue;
186 }
187 if let Some(end) = tag_end(bytes, i) {
188 parse_tag(&html[i + 1..end], &mut tokens);
189 i = end + 1;
190 } else {
191 tokens.push(Token::Text(&html[i..]));
192 break;
193 }
194 } else {
195 let start = i;
196 while i < n && bytes[i] != b'<' {
197 i += 1;
198 }
199 tokens.push(Token::Text(&html[start..i]));
200 }
201 }
202 tokens
203}
204
/// Finds the index of the `>` that ends the tag starting at `bytes[start]`.
///
/// A `>` inside a single- or double-quoted span does not count. Returns
/// `None` when the input ends before an unquoted `>` is seen.
fn tag_end(bytes: &[u8], start: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;
    for (idx, &byte) in bytes.iter().enumerate().skip(start + 1) {
        match open_quote {
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(idx),
                _ => {}
            },
        }
    }
    None
}
224
225fn parse_tag<'a>(inner: &'a str, tokens: &mut Vec<Token<'a>>) {
226 let trimmed = inner.trim_start();
227 if let Some(rest) = trimmed.strip_prefix('/') {
228 let name = take_name(rest);
229 if !name.is_empty() {
230 tokens.push(Token::Close { name });
231 }
232 return;
233 }
234 let name = take_name(trimmed);
235 if name.is_empty() {
236 return;
237 }
238 let attrs = &trimmed[name.len()..];
239 let self_closing = trimmed.trim_end().ends_with('/');
240 tokens.push(Token::Open {
241 name,
242 attrs,
243 self_closing,
244 });
245}
246
/// Reads a tag name from the start of `s`: the longest prefix made of ASCII
/// alphanumerics, `-` and `:`, returned in lowercase.
fn take_name(s: &str) -> String {
    let end = s
        .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-' || c == ':'))
        .unwrap_or(s.len());
    s[..end].to_ascii_lowercase()
}
253
254fn get_attr(attrs: &str, key: &str) -> Option<String> {
255 let lower = attrs.to_ascii_lowercase();
256 let mut from = 0;
257 while let Some(pos) = lower[from..].find(key) {
258 let idx = from + pos;
259 let boundary = idx == 0 || lower.as_bytes()[idx - 1].is_ascii_whitespace();
260 let after = idx + key.len();
261 let rest = attrs[after..].trim_start();
262 if boundary && rest.starts_with('=') {
263 return Some(parse_attr_value(rest[1..].trim_start()));
264 }
265 from = after;
266 }
267 None
268}
269
/// Reads one attribute value from the start of `s`.
///
/// * Quoted values run to the matching quote, or to the end of `s` when the
///   quote is never closed.
/// * Unquoted values run to the first whitespace. A single trailing `/` is
///   removed only when the value is the last thing in `s`, where it is the
///   tag's self-closing slash. A slash in front of further attributes belongs
///   to the value (`href=https://example.com/ class=x`).
fn parse_attr_value(s: &str) -> String {
    if let Some(quote @ ('"' | '\'')) = s.chars().next() {
        let body = &s[1..];
        let end = body.find(quote).unwrap_or(body.len());
        return body[..end].to_string();
    }

    let mut words = s.split_whitespace();
    let value = words.next().unwrap_or("");
    let is_last = words.next().is_none();
    if is_last {
        value.strip_suffix('/').unwrap_or(value).to_string()
    } else {
        value.to_string()
    }
}
287
/// State for one open list (`<ul>` or `<ol>`) on the renderer's list stack.
struct ListCtx {
    /// `true` for `<ol>`, `false` for `<ul>`.
    ordered: bool,
    /// Number of the most recently emitted item; starts at 0 and is
    /// incremented before each ordered item is rendered.
    index: usize,
}
294
/// Accumulates Markdown output while tokens are consumed one at a time.
#[derive(Default)]
struct Renderer {
    /// Markdown produced so far (not yet normalized).
    out: String,
    /// Links that have been written to `out`.
    links: Vec<Link>,
    /// Nesting depth of elements whose content is discarded; 0 means visible.
    skip_depth: usize,
    /// Nesting depth of `<pre>` elements; text is copied verbatim while > 0.
    pre_depth: usize,
    /// The anchor currently being collected, as `(href, text so far)`.
    anchor: Option<(String, String)>,
    /// Lists that are currently open, innermost last.
    list_stack: Vec<ListCtx>,
}
304
305impl Renderer {
306 fn consume(&mut self, token: &Token<'_>) {
307 match token {
308 Token::Text(t) => self.text(t),
309 Token::Open {
310 name,
311 attrs,
312 self_closing,
313 } => self.open(name, attrs, *self_closing),
314 Token::Close { name } => self.close(name),
315 }
316 }
317
318 fn text(&mut self, raw: &str) {
319 if self.skip_depth > 0 {
320 return;
321 }
322 let decoded = decode_entities(raw);
323 if self.pre_depth > 0 {
324 self.out.push_str(&decoded);
325 return;
326 }
327 let collapsed = collapse_ws(&decoded);
328 if collapsed.is_empty() {
329 return;
330 }
331 match self.anchor {
332 Some((_, ref mut buf)) => buf.push_str(&collapsed),
333 None => self.out.push_str(&collapsed),
334 }
335 }
336
337 fn open(&mut self, name: &str, attrs: &str, self_closing: bool) {
338 if self.skip_depth > 0 {
339 if is_skip(name) && !self_closing && !is_void(name) {
340 self.skip_depth += 1;
341 }
342 return;
343 }
344 if is_skip(name) {
345 if !self_closing && !is_void(name) {
346 self.skip_depth += 1;
347 }
348 return;
349 }
350 if self_closing || is_void(name) {
351 self.open_void(name);
352 return;
353 }
354
355 match name {
356 "a" => self.open_anchor(attrs),
357 "pre" => {
358 self.block_break();
359 self.out.push_str("```");
360 self.newline();
361 self.pre_depth += 1;
362 }
363 "code" if self.pre_depth == 0 => self.out.push('`'),
364 "ul" => {
365 self.list_stack.push(ListCtx {
366 ordered: false,
367 index: 0,
368 });
369 self.block_break();
370 }
371 "ol" => {
372 self.list_stack.push(ListCtx {
373 ordered: true,
374 index: 0,
375 });
376 self.block_break();
377 }
378 "li" => {
379 self.newline();
380 let marker = match self.list_stack.last_mut() {
381 Some(ctx) if ctx.ordered => {
382 ctx.index += 1;
383 format!("{}. ", ctx.index)
384 }
385 _ => "- ".to_string(),
386 };
387 self.out.push_str(&marker);
388 }
389 "tr" => self.newline(),
390 "blockquote" => {
391 self.block_break();
392 self.out.push_str("> ");
393 }
394 h if is_heading(h) => {
395 self.block_break();
396 for _ in 0..heading_level(h) {
397 self.out.push('#');
398 }
399 self.out.push(' ');
400 }
401 b if is_block(b) => self.block_break(),
402 _ => {}
403 }
404 }
405
406 fn open_void(&mut self, name: &str) {
407 match name {
408 "br" => self.newline(),
409 "hr" => {
410 self.block_break();
411 self.out.push_str("---");
412 self.block_break();
413 }
414 _ => {}
415 }
416 }
417
418 fn open_anchor(&mut self, attrs: &str) {
419 if self.anchor.is_some() {
420 return;
421 }
422 if let Some(href) = get_attr(attrs, "href") {
423 let href = href.trim();
424 if !href.is_empty() && !href.starts_with("javascript:") && !href.starts_with('#') {
425 self.anchor = Some((href.to_string(), String::new()));
426 }
427 }
428 }
429
430 fn close(&mut self, name: &str) {
431 if self.skip_depth > 0 {
432 if is_skip(name) {
433 self.skip_depth -= 1;
434 }
435 return;
436 }
437 match name {
438 "a" => {
439 if let Some((href, text)) = self.anchor.take() {
440 let text = text.trim();
441 if !text.is_empty() {
442 self.out.push_str(&format!("[{text}]({href})"));
443 self.links.push(Link {
444 text: text.to_string(),
445 href,
446 });
447 }
448 }
449 }
450 "pre" => {
451 self.pre_depth = self.pre_depth.saturating_sub(1);
452 self.newline();
453 self.out.push_str("```");
454 self.block_break();
455 }
456 "code" if self.pre_depth == 0 => self.out.push('`'),
457 "ul" | "ol" => {
458 self.list_stack.pop();
459 self.block_break();
460 }
461 "td" | "th" => self.out.push_str(" | "),
462 h if is_heading(h) => self.block_break(),
463 b if is_block(b) => self.block_break(),
464 _ => {}
465 }
466 }
467
468 fn newline(&mut self) {
469 if !self.out.ends_with('\n') {
470 self.out.push('\n');
471 }
472 }
473
474 fn block_break(&mut self) {
475 while self.out.ends_with(' ') {
476 self.out.pop();
477 }
478 if self.out.is_empty() {
479 return;
480 }
481 if self.out.ends_with("\n\n") {
482 return;
483 }
484 if self.out.ends_with('\n') {
485 self.out.push('\n');
486 } else {
487 self.out.push_str("\n\n");
488 }
489 }
490}
491
/// Reports whether `name` is one of the tag names this module treats as
/// "skip" elements (scripts, styles, embedded media and the like).
///
/// The comparison is exact, so callers pass lowercase names.
fn is_skip(name: &str) -> bool {
    const SKIPPED: [&str; 11] = [
        "script", "style", "noscript", "svg", "template", "iframe", "head", "object", "embed",
        "canvas", "math",
    ];
    SKIPPED.iter().any(|&tag| tag == name)
}
508
/// Reports whether `name` is an HTML void element, i.e. one that never has a
/// closing tag or content.
///
/// `embed` is part of the list: it is void in HTML, and treating it as a
/// container would leave a lone `<embed>` waiting forever for an `</embed>`
/// that never comes.
fn is_void(name: &str) -> bool {
    matches!(
        name,
        "br" | "hr"
            | "img"
            | "input"
            | "meta"
            | "link"
            | "source"
            | "col"
            | "area"
            | "base"
            | "wbr"
            | "track"
            | "param"
            | "embed"
    )
}
526
/// Reports whether `name` is one of the tag names this module treats as
/// generic block-level elements.
///
/// The comparison is exact, so callers pass lowercase names.
fn is_block(name: &str) -> bool {
    const BLOCKS: [&str; 23] = [
        "p",
        "div",
        "section",
        "article",
        "main",
        "header",
        "footer",
        "aside",
        "nav",
        "dl",
        "dd",
        "dt",
        "table",
        "thead",
        "tbody",
        "tfoot",
        "figure",
        "figcaption",
        "address",
        "form",
        "fieldset",
        "details",
        "summary",
    ];
    BLOCKS.iter().any(|&tag| tag == name)
}
554
/// Reports whether `name` is exactly `h1` through `h6` (lowercase).
fn is_heading(name: &str) -> bool {
    matches!(name.as_bytes(), [b'h', b'1'..=b'6'])
}
558
/// Numeric level of a heading tag name, read from its second byte.
///
/// Only meaningful for names accepted by `is_heading`; shorter names panic on
/// the index.
fn heading_level(name: &str) -> usize {
    let digit = name.as_bytes()[1];
    usize::from(digit - b'0')
}
562
/// Replaces every run of whitespace in `s` with a single ASCII space.
///
/// Leading and trailing runs are collapsed too, not removed.
fn collapse_ws(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for ch in s.chars() {
        if !ch.is_whitespace() {
            collapsed.push(ch);
        } else if !collapsed.ends_with(' ') {
            // A space is only ever pushed for whitespace, so a trailing space
            // means the current run has already been represented.
            collapsed.push(' ');
        }
    }
    collapsed
}
581
/// Tidies rendered Markdown.
///
/// Outside code fences each line is trimmed and runs of blank lines shrink to
/// one. A line that is exactly three backticks (after trimming) toggles fence
/// mode; lines inside a fence are kept verbatim. The result is trimmed at
/// both ends.
fn normalize(s: &str) -> String {
    let mut buf = String::with_capacity(s.len());
    let mut fenced = false;
    let mut blanks_seen = 0;

    for raw in s.lines() {
        let text = raw.trim();
        if text == "```" {
            fenced = !fenced;
            blanks_seen = 0;
            buf.push_str("```\n");
        } else if fenced {
            buf.push_str(raw);
            buf.push('\n');
        } else if text.is_empty() {
            blanks_seen += 1;
            // Only the first blank line of a run is kept.
            if blanks_seen <= 1 {
                buf.push('\n');
            }
        } else {
            blanks_seen = 0;
            buf.push_str(text);
            buf.push('\n');
        }
    }

    buf.trim().to_string()
}
613
614pub fn decode_entities(s: &str) -> String {
619 if !s.contains('&') {
620 return s.to_string();
621 }
622 let mut out = String::with_capacity(s.len());
623 let bytes = s.as_bytes();
624 let mut i = 0;
625 while i < bytes.len() {
626 if bytes[i] == b'&' {
627 if let Some(rel_end) = s[i..].find(';') {
628 let end = i + rel_end;
629 let entity = &s[i + 1..end];
630 if let Some(decoded) = decode_one(entity) {
631 out.push_str(&decoded);
632 i = end + 1;
633 continue;
634 }
635 }
636 out.push('&');
637 i += 1;
638 } else {
639 let ch_len = utf8_len(bytes[i]);
640 out.push_str(&s[i..i + ch_len]);
641 i += ch_len;
642 }
643 }
644 out
645}
646
/// Decodes the body of a single character reference: the text between `&`
/// and `;`.
///
/// Supports decimal (`#65`) and hexadecimal (`#x41`, `#X41`) references and a
/// fixed table of named entities. Returns `None` for anything else, including
/// numeric references that are not a valid Unicode scalar value.
///
/// Numeric bodies must consist of digits only; the integer parsers would
/// otherwise accept a leading `+`, turning `&#+65;` into `A`. A reference to
/// code point 0 yields U+FFFD rather than a raw NUL, as the HTML standard
/// specifies.
fn decode_one(entity: &str) -> Option<String> {
    if let Some(num) = entity.strip_prefix('#') {
        let (digits, radix) = match num.strip_prefix(['x', 'X']) {
            Some(hex) => (hex, 16),
            None => (num, 10),
        };
        if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
            return None;
        }
        // Hex letters in a decimal reference are rejected by the radix here.
        let code = u32::from_str_radix(digits, radix).ok()?;
        let ch = match code {
            0 => '\u{FFFD}',
            _ => char::from_u32(code)?,
        };
        return Some(ch.to_string());
    }
    let named = match entity {
        "amp" => "&",
        "lt" => "<",
        "gt" => ">",
        "quot" => "\"",
        "apos" => "'",
        "nbsp" => " ",
        "mdash" => "—",
        "ndash" => "–",
        "hellip" => "…",
        "copy" => "©",
        "reg" => "®",
        "trade" => "™",
        "laquo" => "«",
        "raquo" => "»",
        "lsquo" => "‘",
        "rsquo" => "’",
        "ldquo" => "“",
        "rdquo" => "”",
        "bull" => "•",
        "middot" => "·",
        "euro" => "€",
        "pound" => "£",
        "deg" => "°",
        "times" => "×",
        "divide" => "÷",
        _ => return None,
    };
    Some(named.to_string())
}
686
/// Unit tests for the HTML → Markdown conversion.
#[cfg(test)]
mod tests {
    use super::*;

    // The title contains a real entity so that decoding is exercised.
    #[test]
    fn extracts_title_and_decodes() {
        let doc = parse(
            "<html><head><title>Foo &amp; Bar</title></head><body><p>Hi</p></body></html>",
        );
        assert_eq!(doc.title.as_deref(), Some("Foo & Bar"));
    }

    #[test]
    fn drops_script_and_style() {
        let html = "<body><script>var x=1;</script><style>.a{}</style><p>Visible</p></body>";
        let doc = parse(html);
        assert_eq!(doc.markdown, "Visible");
        assert!(!doc.markdown.contains("var x"));
    }

    #[test]
    fn renders_headings_and_paragraphs() {
        let html = "<body><h1>Title</h1><p>First.</p><p>Second.</p></body>";
        let doc = parse(html);
        assert_eq!(doc.markdown, "# Title\n\nFirst.\n\nSecond.");
    }

    #[test]
    fn renders_links_and_collects_them() {
        let html = r#"<body><p>See <a href="https://x.com/a">the site</a> now.</p></body>"#;
        let doc = parse(html);
        assert!(doc.markdown.contains("[the site](https://x.com/a)"));
        assert_eq!(doc.links.len(), 1);
        assert_eq!(doc.links[0].href, "https://x.com/a");
        assert_eq!(doc.links[0].text, "the site");
    }

    #[test]
    fn renders_unordered_and_ordered_lists() {
        let html = "<body><ul><li>one</li><li>two</li></ul><ol><li>a</li><li>b</li></ol></body>";
        let doc = parse(html);
        assert!(doc.markdown.contains("- one"));
        assert!(doc.markdown.contains("- two"));
        assert!(doc.markdown.contains("1. a"));
        assert!(doc.markdown.contains("2. b"));
    }

    #[test]
    fn prefers_main_over_chrome() {
        let html = "<body><nav><a href=/x>menu</a></nav><main><p>Core content</p></main><footer>foot</footer></body>";
        let doc = parse(html);
        assert_eq!(doc.markdown, "Core content");
    }

    #[test]
    fn preserves_pre_as_fenced_code() {
        let html = "<body><pre>line1\n line2</pre></body>";
        let doc = parse(html);
        assert!(doc.markdown.contains("```"));
        assert!(doc.markdown.contains("line1\n line2"));
    }

    #[test]
    fn markdown_to_text_strips_markup() {
        let md = "# Heading\n\nSee [link](https://x.com) here.";
        let text = markdown_to_text(md);
        assert_eq!(text, "Heading\n\nSee link here.");
    }

    #[test]
    fn handles_unterminated_tag_gracefully() {
        let doc = parse("<body><p>ok</p><broken");
        assert!(doc.markdown.contains("ok"));
    }

    // Covers a decimal, a named and a hexadecimal reference plus plain text.
    #[test]
    fn decodes_numeric_entities() {
        assert_eq!(decode_entities("&#65;&amp;&#x42;A"), "A&BA");
    }
}