use ferogram_tl_types as tl;
use crate::rich_common::*;
/// Parses an HTML fragment into a flat list of page blocks.
///
/// A fresh [`RichHtmlParser`] is created over an owned copy of `html` and
/// consumed by its `parse` method.
pub fn parse_rich_html(html: &str) -> Vec<tl::enums::PageBlock> {
    let parser = RichHtmlParser::new(html);
    parser.parse()
}
/// Cursor-based parser that walks an HTML string and emits page blocks.
struct RichHtmlParser {
// Owned copy of the input being parsed.
html: String,
// Byte offset into `html` of the next unread position.
pos: usize,
}
impl RichHtmlParser {
/// Builds a parser over an owned copy of `html`, with the cursor at offset 0.
fn new(html: &str) -> Self {
    let html = String::from(html);
    Self { html, pos: 0 }
}
/// Returns the unread tail of the input, starting at the cursor.
///
/// Yields an empty string instead of panicking when `pos` is not a valid
/// slice start (past the end of the input or inside a multi-byte character),
/// so a cursor that overshot by one after an unterminated tag simply reads as
/// "nothing left".
fn remaining(&self) -> &str {
    self.html.get(self.pos..).unwrap_or("")
}
/// Advances the cursor past any run of ASCII whitespace bytes.
fn skip_whitespace(&mut self) {
    let blanks = self
        .html
        .as_bytes()
        .iter()
        .skip(self.pos)
        .take_while(|byte| byte.is_ascii_whitespace())
        .count();
    self.pos += blanks;
}
/// Consumes the parser and returns every block found in the input.
///
/// Each iteration skips whitespace, then tries a block-level tag when the
/// cursor sits on `<`, and otherwise (or when that fails) a text paragraph.
///
/// Neither sub-parser is guaranteed to advance the cursor:
/// `try_parse_block_tag` returns `None` without consuming anything when the
/// opening tag has no `>`, and `parse_text_paragraph` returns `None` without
/// consuming anything when it stops immediately at a tag that
/// `is_block_html_tag` classifies as block-level. To guarantee termination,
/// an iteration that made no progress skips one character.
fn parse(mut self) -> Vec<tl::enums::PageBlock> {
    let mut blocks = Vec::new();
    loop {
        self.skip_whitespace();
        if self.pos >= self.html.len() {
            break;
        }
        let before = self.pos;
        if self.remaining().starts_with('<')
            && let Some(block) = self.try_parse_block_tag()
        {
            blocks.extend(block);
        } else if let Some(para) = self.parse_text_paragraph() {
            blocks.push(para);
        }
        if self.pos == before {
            // Stalled on malformed input: step over one whole character so
            // the cursor stays on a UTF-8 boundary and the loop terminates.
            let step = self
                .html
                .get(before..)
                .and_then(|rest| rest.chars().next())
                .map_or(1, char::len_utf8);
            self.pos += step;
        }
    }
    blocks
}
fn try_parse_block_tag(&mut self) -> Option<Vec<tl::enums::PageBlock>> {
let rem = self.remaining();
let lower = rem.to_ascii_lowercase();
macro_rules! heading {
($tag:literal, $level:expr) => {
if lower.starts_with(concat!("<", $tag, ">"))
|| lower.starts_with(concat!("<", $tag, " "))
{
let body = self.consume_tag($tag)?;
return Some(vec![heading_block($level, parse_rich_html_inline(&body))]);
}
};
}
heading!("h1", 1);
heading!("h2", 2);
heading!("h3", 3);
heading!("h4", 4);
heading!("h5", 5);
heading!("h6", 6);
if lower.starts_with("<p>") || lower.starts_with("<p ") {
let body = self.consume_tag("p")?;
return Some(vec![tl::enums::PageBlock::Paragraph(
tl::types::PageBlockParagraph {
text: parse_rich_html_inline(&body),
},
)]);
}
if lower.starts_with("<pre>") || lower.starts_with("<pre>") || lower.starts_with("<pre ") {
let body = self.consume_tag("pre")?;
let (lang, code) = extract_pre_content_from_body(&body);
return Some(vec![tl::enums::PageBlock::Preformatted(
tl::types::PageBlockPreformatted {
text: rt_plain(code),
language: lang,
},
)]);
}
if lower.starts_with("<footer>") || lower.starts_with("<footer ") {
let body = self.consume_tag("footer")?;
return Some(vec![tl::enums::PageBlock::Footer(
tl::types::PageBlockFooter {
text: parse_rich_html_inline(&body),
},
)]);
}
if lower.starts_with("<hr") {
self.consume_until('>');
self.pos += 1;
return Some(vec![tl::enums::PageBlock::Divider]);
}
if lower.starts_with("<blockquote") {
let body = self.consume_tag("blockquote")?;
let (text, credit) = split_cite(&body);
return Some(vec![tl::enums::PageBlock::Blockquote(
tl::types::PageBlockBlockquote {
text: parse_rich_html_inline(&text),
caption: parse_rich_html_inline(&credit),
},
)]);
}
if lower.starts_with("<aside") {
let body = self.consume_tag("aside")?;
let (text, credit) = split_cite(&body);
return Some(vec![tl::enums::PageBlock::Pullquote(
tl::types::PageBlockPullquote {
text: parse_rich_html_inline(&text),
caption: parse_rich_html_inline(&credit),
},
)]);
}
if lower.starts_with("<ul") {
let body = self.consume_tag("ul")?;
let items = parse_html_list_items(&body, false);
return Some(vec![tl::enums::PageBlock::List(tl::types::PageBlockList {
items,
})]);
}
if lower.starts_with("<ol") {
let tag_open = rem.split('>').next().unwrap_or("").to_string();
let (_, attrs) = parse_tag(tag_open.trim_start_matches('<'));
let start: Option<i32> = attrs
.iter()
.find(|(k, _)| k == "start")
.and_then(|(_, v)| v.parse().ok());
let reversed = attrs.iter().any(|(k, _)| k == "reversed");
let ol_type: Option<String> = attrs
.iter()
.find(|(k, _)| k == "type")
.map(|(_, v)| v.clone());
let body = self.consume_tag("ol")?;
let items = parse_html_ordered_list_items(&body, ol_type.as_deref());
return Some(vec![tl::enums::PageBlock::OrderedList(
tl::types::PageBlockOrderedList {
reversed,
items,
start,
r#type: ol_type,
},
)]);
}
if lower.starts_with("<table") {
let tag_open = rem.split('>').next().unwrap_or("").to_string();
let (_, attrs) = parse_tag(tag_open.trim_start_matches('<'));
let bordered = attrs.iter().any(|(k, _)| k == "bordered");
let striped = attrs.iter().any(|(k, _)| k == "striped");
let body = self.consume_tag("table")?;
let (title, rows) = parse_html_table(&body);
return Some(vec![tl::enums::PageBlock::Table(
tl::types::PageBlockTable {
bordered,
striped,
title,
rows,
},
)]);
}
if lower.starts_with("<details") {
let is_open_hint = rem.to_ascii_lowercase().starts_with("<details open");
let full = self.consume_tag("details")?;
let is_open = is_open_hint || full.starts_with("open");
let summary = extract_between(&full, "<summary>", "</summary>").unwrap_or_default();
let body_start = full
.find("</summary>")
.map(|i| i + "</summary>".len())
.unwrap_or(full.len());
let inner = parse_rich_html(full[body_start..].trim());
return Some(vec![tl::enums::PageBlock::Details(
tl::types::PageBlockDetails {
open: is_open,
blocks: inner,
title: parse_rich_html_inline(&summary),
},
)]);
}
if lower.starts_with("<img ") {
let tag_raw = self.consume_self_closing_tag();
let (_, attrs) = parse_tag(&tag_raw);
let src = attrs
.iter()
.find(|(k, _)| k == "src")
.map(|(_, v)| v.clone())
.unwrap_or_default();
let spoiler = attrs.iter().any(|(k, _)| k == "tg-spoiler");
if !src.is_empty() {
return Some(vec![media_block(&src, empty_caption(), spoiler)]);
}
return Some(vec![]);
}
if lower.starts_with("<video ") {
let tag_raw = self.consume_self_closing_or_pair("video");
let (_, attrs) = parse_tag(&tag_raw);
let src = attrs
.iter()
.find(|(k, _)| k == "src")
.map(|(_, v)| v.clone())
.unwrap_or_default();
let spoiler = attrs.iter().any(|(k, _)| k == "tg-spoiler");
if !src.is_empty() {
return Some(vec![media_block(&src, empty_caption(), spoiler)]);
}
return Some(vec![]);
}
if lower.starts_with("<audio ") {
let tag_raw = self.consume_self_closing_or_pair("audio");
let (_, attrs) = parse_tag(&tag_raw);
let src = attrs
.iter()
.find(|(k, _)| k == "src")
.map(|(_, v)| v.clone())
.unwrap_or_default();
if !src.is_empty() {
return Some(vec![media_block(&src, empty_caption(), false)]);
}
return Some(vec![]);
}
if lower.starts_with("<figure") {
let body = self.consume_tag("figure")?;
let caption_raw =
extract_between(&body, "<figcaption>", "</figcaption>").unwrap_or_default();
let (cap_t, cap_cr) = split_cite(&caption_raw);
let cap = if cap_t.is_empty() {
empty_caption()
} else {
caption_text_credit(
parse_rich_html_inline(&cap_t),
parse_rich_html_inline(&cap_cr),
)
};
let spoiler = body.contains("tg-spoiler");
if body.to_ascii_lowercase().contains("<tg-map") {
let map_inner = extract_between(&body, "<tg-map", "/>").unwrap_or_default();
let (_, attrs) = parse_tag(&format!("tg-map {map_inner}"));
let lat: f64 = attrs
.iter()
.find(|(k, _)| k == "lat")
.and_then(|(_, v)| v.parse().ok())
.unwrap_or(0.0);
let long: f64 = attrs
.iter()
.find(|(k, _)| k == "long")
.and_then(|(_, v)| v.parse().ok())
.unwrap_or(0.0);
let zoom: i32 = attrs
.iter()
.find(|(k, _)| k == "zoom")
.and_then(|(_, v)| v.parse().ok())
.unwrap_or(15);
return Some(vec![tl::enums::PageBlock::Map(tl::types::PageBlockMap {
geo: tl::enums::GeoPoint::GeoPoint(tl::types::GeoPoint {
lat,
long,
access_hash: 0,
accuracy_radius: None,
}),
zoom,
w: 400,
h: 300,
caption: cap,
})]);
}
let src = extract_src_from_figure(&body);
if let Some(url) = src {
return Some(vec![media_block(&url, cap, spoiler)]);
}
return Some(vec![]);
}
if lower.starts_with("<tg-collage") {
let body = self.consume_tag("tg-collage")?;
let (items, cap) = extract_collage_items(&body);
return Some(vec![tl::enums::PageBlock::Collage(
tl::types::PageBlockCollage {
items,
caption: cap.unwrap_or_else(empty_caption),
},
)]);
}
if lower.starts_with("<tg-slideshow") {
let body = self.consume_tag("tg-slideshow")?;
let (items, cap) = extract_collage_items(&body);
return Some(vec![tl::enums::PageBlock::Slideshow(
tl::types::PageBlockSlideshow {
items,
caption: cap.unwrap_or_else(empty_caption),
},
)]);
}
if lower.starts_with("<tg-map") {
let tag_raw = self.consume_self_closing_tag();
let (_, attrs) = parse_tag(&tag_raw);
let lat: f64 = attrs
.iter()
.find(|(k, _)| k == "lat")
.and_then(|(_, v)| v.parse().ok())
.unwrap_or(0.0);
let long: f64 = attrs
.iter()
.find(|(k, _)| k == "long")
.and_then(|(_, v)| v.parse().ok())
.unwrap_or(0.0);
let zoom: i32 = attrs
.iter()
.find(|(k, _)| k == "zoom")
.and_then(|(_, v)| v.parse().ok())
.unwrap_or(15);
return Some(vec![tl::enums::PageBlock::Map(tl::types::PageBlockMap {
geo: tl::enums::GeoPoint::GeoPoint(tl::types::GeoPoint {
lat,
long,
access_hash: 0,
accuracy_radius: None,
}),
zoom,
w: 400,
h: 300,
caption: empty_caption(),
})]);
}
if lower.starts_with("<tg-math-block") {
let body = self.consume_tag("tg-math-block")?;
return Some(vec![tl::enums::PageBlock::Math(tl::types::PageBlockMath {
source: body,
})]);
}
if lower.starts_with("<a ") && lower.contains("name=") {
let tag_raw = self.consume_self_closing_or_pair("a");
let (_, attrs) = parse_tag(&tag_raw);
let name = attrs
.iter()
.find(|(k, _)| k == "name")
.map(|(_, v)| v.clone())
.unwrap_or_default();
if !name.is_empty() {
return Some(vec![tl::enums::PageBlock::Anchor(
tl::types::PageBlockAnchor { name },
)]);
}
return Some(vec![]);
}
if lower.starts_with("<!--") || lower.starts_with("<!") {
self.consume_until('>');
self.pos = (self.pos + 1).min(self.html.len());
return Some(vec![]);
}
None
}
/// Consumes a `<tag ...>body</tag>` element starting at the cursor and
/// returns `body`.
///
/// The closing tag is matched ASCII-case-insensitively against `</{tag}>`,
/// and the first occurrence wins (nested elements of the same name are not
/// balanced). Returns `None` when the opening tag has no `>` (cursor
/// untouched) or when no closing tag follows (cursor left just after the
/// opening tag).
fn consume_tag(&mut self, tag: &str) -> Option<String> {
    // Offset of the first byte after the opening tag's `>`.
    let body_start = {
        let (_, after_open) = self.remaining().split_once('>')?;
        self.html.len() - after_open.len()
    };
    self.pos = body_start;

    let closer = format!("</{tag}>");
    let rest = self.remaining();
    // ASCII lowercasing preserves byte offsets, so the index applies to `rest`.
    let body_len = rest.to_ascii_lowercase().find(&closer)?;
    let body = rest[..body_len].to_owned();
    self.pos = body_start + body_len + closer.len();
    Some(body)
}
/// Consumes a single tag (through its `>`) and returns its inner text: the
/// text between the leading `<` and the `>`, with any trailing `/` and
/// surrounding whitespace removed.
///
/// When the tag is unterminated, the rest of the input is consumed and the
/// cursor stops exactly at the end rather than one past it.
fn consume_self_closing_tag(&mut self) -> String {
    let rem = self.remaining();
    let (inner_end, consumed) = match rem.find('>') {
        Some(gt) => (gt, gt + 1),
        None => (rem.len(), rem.len()),
    };
    // `get` avoids a panic if the input is empty or does not start with a
    // single-byte `<`.
    let tag_raw = rem
        .get(1..inner_end)
        .unwrap_or("")
        .trim_end_matches('/')
        .trim()
        .to_string();
    self.pos += consumed;
    tag_raw
}
/// Consumes either a self-closing tag (`<tag ... />`) or a full
/// `<tag ...>...</tag>` pair, returning the inner text of the opening tag
/// (without `<`, `>`, a trailing `/`, or surrounding whitespace).
///
/// For a non-self-closing tag, everything up to and including the first
/// `</{tag}>` (matched ASCII-case-insensitively) is consumed as well; if no
/// closing tag exists, only the opening tag is consumed. An unterminated
/// opening tag consumes the rest of the input without moving the cursor past
/// the end.
fn consume_self_closing_or_pair(&mut self, tag: &str) -> String {
    let rem = self.remaining();
    let (open_end, consumed) = match rem.find('>') {
        Some(gt) => (gt, gt + 1),
        None => (rem.len(), rem.len()),
    };
    let is_self = rem[..open_end].ends_with('/');
    let tag_raw = rem
        .get(1..open_end)
        .unwrap_or("")
        .trim_end_matches('/')
        .trim()
        .to_string();
    self.pos += consumed;
    if !is_self {
        let close_tag = format!("</{tag}>");
        if let Some(end) = self.remaining().to_ascii_lowercase().find(&close_tag) {
            self.pos += end + close_tag.len();
        }
    }
    tag_raw
}
/// Moves the cursor forward to the next byte equal to `ch as u8`, or to the
/// end of the input when there is none. The matching byte is not consumed.
///
/// The comparison is byte-wise against `ch` truncated to `u8`.
fn consume_until(&mut self, ch: char) {
    let target = ch as u8;
    if let Some(tail) = self.html.as_bytes().get(self.pos..) {
        let offset = tail
            .iter()
            .position(|&byte| byte == target)
            .unwrap_or(tail.len());
        self.pos += offset;
    }
}
/// Consumes a run of text and inline tags starting at the cursor and returns
/// it as a paragraph block.
///
/// The run ends at the end of input or just before a tag that
/// `is_block_html_tag` accepts. Returns `None` when nothing was consumed or
/// when the consumed text is blank once entities are decoded.
fn parse_text_paragraph(&mut self) -> Option<tl::enums::PageBlock> {
    let start = self.pos;
    let total = self.html.len();
    while self.pos < total {
        let rem = self.remaining();
        let advance = if rem.starts_with('<') {
            if is_block_html_tag(&rem.to_ascii_lowercase()) {
                break;
            }
            // Step over the whole inline tag; an unterminated tag takes the
            // rest of the input without moving past its end.
            rem.find('>').map_or(rem.len(), |gt| gt + 1)
        } else {
            // Jump straight to the next tag. `find` returns a char boundary,
            // so multi-byte UTF-8 text is never split.
            rem.find('<').unwrap_or(rem.len())
        };
        self.pos += advance;
    }
    if self.pos == start {
        return None;
    }
    let text_raw = &self.html[start..self.pos];
    // Entities are decoded here only to test for blank content.
    if decode_html_entities(text_raw).trim().is_empty() {
        return None;
    }
    // `parse_rich_html_inline` decodes entities itself, so it gets the raw
    // text (as the `<p>` branch does); pre-decoding would turn escaped markup
    // such as `&lt;b&gt;` into real tags.
    Some(tl::enums::PageBlock::Paragraph(
        tl::types::PageBlockParagraph {
            text: parse_rich_html_inline(text_raw),
        },
    ))
}
}
pub fn parse_rich_html_inline(html: &str) -> tl::enums::RichText {
let chars: Vec<char> = html.chars().collect();
let mut parts = Vec::new();
let mut buf = String::new();
let mut i = 0;
let n = chars.len();
macro_rules! flush {
() => {
if !buf.is_empty() {
parts.push(rt_plain(decode_html_entities(&std::mem::take(&mut buf))));
}
};
}
while i < n {
if chars[i] == '&' {
let mut j = i + 1;
while j < n && chars[j] != ';' && chars[j] != ' ' {
j += 1;
}
if j < n && chars[j] == ';' {
let entity: String = chars[i..=j].iter().collect();
buf.push_str(&decode_html_entities(&entity));
i = j + 1;
continue;
}
}
if chars[i] != '<' {
buf.push(chars[i]);
i += 1;
continue;
}
let remaining: String = chars[i..].iter().collect();
if let Some((consumed, rt)) = try_parse_html_inline_tag(&chars, i, n) {
flush!();
parts.push(rt);
i = consumed;
continue;
}
buf.push(chars[i]);
i += 1;
let _ = remaining;
}
flush!();
rt_concat(parts)
}
/// Splits the body of a `<pre>` element into `(language, code)`.
///
/// When the body contains a `<code ...>` element (matched
/// ASCII-case-insensitively), `code` is the entity-decoded text between that
/// opening tag and the first following `</code>` (or the end of the body if
/// it is never closed), and `language` is taken from a
/// `class="language-..."` attribute on the opening tag, empty when absent.
/// Without a usable `<code>` tag the whole body is decoded and the language
/// is empty.
fn extract_pre_content_from_body(body: &str) -> (String, String) {
    // ASCII lowercasing keeps byte offsets unchanged, so indices found in
    // `lo` are valid slice positions in `body`.
    let lo = body.to_ascii_lowercase();
    let Some(open) = lo.find("<code") else {
        return (String::new(), decode_html_entities(body));
    };
    let Some(tag_len) = lo[open..].find('>') else {
        // `<code` is never closed with `>`: nothing sensible to strip.
        return (String::new(), decode_html_entities(body));
    };
    let content_start = open + tag_len + 1;
    let content_end = lo[content_start..]
        .find("</code>")
        .map_or(body.len(), |i| content_start + i);
    // Look for the language only inside the opening tag, so code text that
    // happens to contain `class="language-` cannot be mistaken for it.
    let lang = extract_between(&body[open..content_start], "class=\"language-", "\"")
        .unwrap_or_default();
    (lang, decode_html_entities(&body[content_start..content_end]))
}