use regex::Regex;
use std::sync::LazyLock;
/// Fallback chunk size, in bytes, used by `chunk_text` and `chunk_html` when
/// the caller passes `max_len == 0`.
const MAX_MESSAGE_LEN: usize = 4000;
/// Escapes the characters that are significant in HTML text and attribute
/// values.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#x27;` respectively; every other character is copied
/// through unchanged.
///
/// The input is scanned once, so an `&` that is produced by an earlier
/// replacement can never be escaped a second time.
pub fn html_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }
    escaped
}
pub fn md_to_telegram_html(md: &str) -> String {
static CODE_BLOCK: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"```(\w*)\n?([\s\S]*?)```").expect("invalid regex"));
static INLINE_CODE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"`([^`]+)`").expect("invalid regex"));
static BOLD: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\*\*(.+?)\*\*").expect("invalid regex"));
static ITALIC: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"([^*]|^)\*([^*]+)\*([^*]|$)").expect("invalid regex"));
static LINK: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").expect("invalid regex"));
static HEADING: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?m)^#{1,6}\s+(.+)$").expect("invalid regex"));
let mut protected: Vec<String> = Vec::new();
let text = CODE_BLOCK.replace_all(md, |caps: ®ex::Captures<'_>| {
let lang = &caps[1];
let code = html_escape(&caps[2]);
let placeholder = format!("\x00CODE{}\x00", protected.len());
let _ = lang; protected.push(format!("<pre>{code}</pre>"));
placeholder
});
let text = INLINE_CODE.replace_all(&text, |caps: ®ex::Captures<'_>| {
let code = html_escape(&caps[1]);
let placeholder = format!("\x00CODE{}\x00", protected.len());
protected.push(format!("<code>{code}</code>"));
placeholder
});
let text = html_escape(&text);
let text = BOLD.replace_all(&text, "<b>$1</b>");
let text = ITALIC.replace_all(&text, "$1<i>$2</i>$3");
let text = LINK.replace_all(&text, |caps: ®ex::Captures<'_>| {
let label = &caps[1];
let url = &caps[2];
if url.starts_with("http://")
|| url.starts_with("https://")
|| url.starts_with("tg://")
|| url.starts_with("mailto:")
{
format!("<a href=\"{url}\">{label}</a>")
} else {
format!("{label} ({url})")
}
});
let text = HEADING.replace_all(&text, "<b>$1</b>");
let mut text = text.into_owned();
for (i, block) in protected.iter().enumerate() {
let placeholder = format!("\x00CODE{i}\x00");
text = text.replace(&placeholder, block);
}
text
}
/// Splits `text` into chunks of at most `max_len` bytes each.
///
/// * `max_len == 0` selects the default limit, `MAX_MESSAGE_LEN`.
/// * Text that already fits is returned as a single chunk (this includes the
///   empty string, which yields one empty chunk).
/// * Otherwise each chunk ends, in order of preference, just after the last
///   blank line (`"\n\n"`) inside the limit, just after the last newline
///   inside the limit, or at the last character boundary inside the limit.
/// * Newlines at the start of the remainder are dropped before the next chunk
///   is cut, so joining the chunks does not necessarily reproduce `text`.
///
/// A chunk never splits a UTF-8 character. If `max_len` is smaller than the
/// encoded width of the next character, that single character is emitted as
/// its own chunk (exceeding `max_len`) so that the loop always makes
/// progress.
pub fn chunk_text(text: &str, max_len: usize) -> Vec<String> {
    let max_len = if max_len == 0 {
        MAX_MESSAGE_LEN
    } else {
        max_len
    };

    if text.len() <= max_len {
        return vec![text.to_string()];
    }

    let mut chunks = Vec::new();
    let mut remaining = text;

    while !remaining.is_empty() {
        if remaining.len() <= max_len {
            chunks.push(remaining.to_string());
            break;
        }

        let hard_cut = remaining.floor_char_boundary(max_len);
        let mut split_at = find_split_point(remaining, hard_cut, "\n\n")
            .or_else(|| find_split_point(remaining, hard_cut, "\n"))
            .unwrap_or(hard_cut);

        if split_at == 0 {
            // `max_len` is narrower than the first character, so no boundary
            // fits. Take exactly one character; an empty chunk would leave
            // `remaining` unchanged and loop forever.
            split_at = remaining
                .chars()
                .next()
                .map_or(remaining.len(), char::len_utf8);
        }

        let (chunk, rest) = remaining.split_at(split_at);
        chunks.push(chunk.to_string());
        remaining = rest.trim_start_matches('\n');
    }

    chunks
}
fn find_split_point(text: &str, boundary: usize, delimiter: &str) -> Option<usize> {
let search_region = &text[..boundary];
search_region
.rfind(delimiter)
.map(|pos| pos.saturating_add(delimiter.len()))
}
pub(crate) fn find_safe_html_boundary(html: &str, max_len: usize) -> usize {
let mut boundary = html.floor_char_boundary(max_len.min(html.len()));
while boundary > 0 {
let bytes = &html.as_bytes()[..boundary];
let last_open = bytes.iter().rposition(|&b| b == b'<');
let last_close = bytes.iter().rposition(|&b| b == b'>');
let inside_tag = match (last_open, last_close) {
(Some(lt), Some(gt)) => lt > gt,
(Some(_), None) => true,
_ => false,
};
let last_amp = bytes.iter().rposition(|&b| b == b'&');
let last_semi = bytes.iter().rposition(|&b| b == b';');
let inside_entity = match (last_amp, last_semi) {
(Some(amp), Some(semi)) => amp > semi,
(Some(_), None) => true,
_ => false,
};
if !inside_tag && !inside_entity {
break;
}
boundary = html.floor_char_boundary(boundary.saturating_sub(1));
}
boundary
}
/// Appends closing tags for every element in `html` that is opened but not
/// closed, innermost first.
///
/// Tags are recognised with a simple pattern (`<name ...>` / `</name>`);
/// names are compared case-insensitively and emitted in lower case. A closing
/// tag cancels the most recent still-open tag with the same name. Tag names
/// may contain hyphens (for example `tg-spoiler`), so the generated closer
/// repeats the full name rather than only the part before the first `-`.
///
/// Returns a copy of `html`, unchanged when all tags are already balanced.
pub(crate) fn close_open_tags(html: &str) -> String {
    static TAG_RE: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"<(/?)([\w-]+)[^>]*>").expect("invalid regex"));

    // Stack of currently open tag names, outermost first.
    let mut open_tags: Vec<String> = Vec::new();
    for cap in TAG_RE.captures_iter(html) {
        let tag_name = cap[2].to_lowercase();
        if &cap[1] == "/" {
            if let Some(pos) = open_tags.iter().rposition(|open| *open == tag_name) {
                open_tags.remove(pos);
            }
        } else {
            open_tags.push(tag_name);
        }
    }

    // Room for "</" + name + ">" per tag that still needs closing.
    let extra: usize = open_tags.iter().map(|tag| tag.len() + 3).sum();
    let mut result = String::with_capacity(html.len() + extra);
    result.push_str(html);
    for tag in open_tags.iter().rev() {
        result.push_str("</");
        result.push_str(tag);
        result.push('>');
    }
    result
}
/// Splits an HTML string into chunks of roughly `max_len` bytes without
/// cutting through a tag, a character entity, or a UTF-8 character.
///
/// * `max_len == 0` selects the default limit, `MAX_MESSAGE_LEN`.
/// * Input that already fits is returned as a single, unmodified chunk.
/// * Otherwise the cut position is searched within `max_len` minus
///   `CLOSING_TAG_HEADROOM` bytes, leaving room for the closing tags that
///   `close_open_tags` appends to each non-final chunk. Within that window
///   the split prefers a blank line, then a newline, then the last safe
///   HTML boundary.
/// * Newlines at the start of the remainder are dropped before the next chunk
///   is cut.
///
/// When no safe boundary exists (for example a single tag longer than the
/// window) the chunk is cut hard at `max_len`; if even that is narrower than
/// the next character, exactly one character is emitted so the loop always
/// advances.
///
/// NOTE(review): tags closed at the end of a chunk are not re-opened at the
/// start of the next one, so formatting that spans a split ends at the split
/// and the following chunk may begin with an unmatched closing tag.
pub fn chunk_html(html: &str, max_len: usize) -> Vec<String> {
    const CLOSING_TAG_HEADROOM: usize = 50;

    let max_len = if max_len == 0 {
        MAX_MESSAGE_LEN
    } else {
        max_len
    };
    let split_limit = max_len.saturating_sub(CLOSING_TAG_HEADROOM);

    if html.len() <= max_len {
        return vec![html.to_string()];
    }

    let mut chunks = Vec::new();
    let mut remaining = html;

    while !remaining.is_empty() {
        if remaining.len() <= max_len {
            chunks.push(remaining.to_string());
            break;
        }

        let hard_cut = find_safe_html_boundary(remaining, split_limit);
        let preferred = find_split_point(remaining, hard_cut, "\n\n")
            .or_else(|| find_split_point(remaining, hard_cut, "\n"))
            .unwrap_or(hard_cut);

        let split_at = if preferred == 0 {
            // No safe position inside the window: cut hard at `max_len`.
            // That can still be 0 when the first character is wider than
            // `max_len`, in which case take that one character; an empty
            // chunk would never shrink `remaining`.
            match remaining.floor_char_boundary(max_len) {
                0 => remaining
                    .chars()
                    .next()
                    .map_or(remaining.len(), char::len_utf8),
                cut => cut,
            }
        } else {
            preferred
        };

        let (chunk, rest) = remaining.split_at(split_at);
        chunks.push(close_open_tags(chunk));
        remaining = rest.trim_start_matches('\n');
    }

    chunks
}
/// Unit tests for escaping, Markdown conversion and the chunking helpers.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- html_escape -----------------------------------------------------

    #[test]
    fn html_escape_ampersand() {
        assert_eq!(html_escape("a & b"), "a &amp; b");
    }

    #[test]
    fn html_escape_angle_brackets() {
        assert_eq!(
            html_escape("<script>alert(1)</script>"),
            "&lt;script&gt;alert(1)&lt;/script&gt;"
        );
    }

    #[test]
    fn html_escape_all_special_chars() {
        assert_eq!(html_escape("<b>&test</b>"), "&lt;b&gt;&amp;test&lt;/b&gt;");
    }

    #[test]
    fn html_escape_quotes() {
        assert_eq!(html_escape(r#"a"b'c"#), "a&quot;b&#x27;c");
    }

    #[test]
    fn html_escape_empty_string() {
        assert_eq!(html_escape(""), "");
    }

    #[test]
    fn html_escape_no_special_chars() {
        assert_eq!(html_escape("hello world"), "hello world");
    }

    // ---- md_to_telegram_html ---------------------------------------------

    #[test]
    fn md_bold() {
        let result = md_to_telegram_html("Hello **world**");
        assert!(result.contains("<b>world</b>"));
    }

    #[test]
    fn md_bold_multiple() {
        let result = md_to_telegram_html("**a** and **b**");
        assert!(result.contains("<b>a</b>"));
        assert!(result.contains("<b>b</b>"));
    }

    #[test]
    fn md_code_block_with_lang() {
        let result = md_to_telegram_html("```rust\nfn main() {}\n```");
        assert!(result.contains("<pre>"));
        assert!(result.contains("fn main()"));
        // The language tag is matched but must not be emitted.
        assert!(!result.contains("language-"));
    }

    #[test]
    fn md_code_block_without_lang() {
        let result = md_to_telegram_html("```\nhello\n```");
        assert!(result.contains("<pre>hello"));
        assert!(!result.contains("language-"));
    }

    #[test]
    fn md_code_block_escapes_html() {
        let result = md_to_telegram_html("```\n<div>test</div>\n```");
        assert!(result.contains("&lt;div&gt;"));
    }

    #[test]
    fn md_inline_code() {
        let result = md_to_telegram_html("Use `cargo build` here");
        assert!(result.contains("<code>cargo build</code>"));
    }

    #[test]
    fn md_inline_code_preserves_content() {
        let result = md_to_telegram_html("Run `ls -la`");
        assert!(result.contains("<code>ls -la</code>"));
    }

    #[test]
    fn md_inline_code_not_affected_by_bold() {
        let result = md_to_telegram_html("Use `**not bold**` here");
        assert!(result.contains("<code>**not bold**</code>"));
        assert!(!result.contains("<b>not bold</b>"));
    }

    #[test]
    fn md_inline_code_escapes_html() {
        let result = md_to_telegram_html("Use `<div>` tag");
        assert!(result.contains("<code>&lt;div&gt;</code>"));
    }

    #[test]
    fn md_link() {
        let result = md_to_telegram_html("Visit [Google](https://google.com)");
        assert!(result.contains(r#"<a href="https://google.com">Google</a>"#));
    }

    #[test]
    fn md_link_unsafe_scheme_rejected() {
        let result = md_to_telegram_html("Click [here](javascript:alert(1))");
        assert!(!result.contains("<a href"));
        assert!(result.contains("here"));
    }

    #[test]
    fn md_heading() {
        let result = md_to_telegram_html("# Title");
        assert!(result.contains("<b>Title</b>"));
    }

    #[test]
    fn md_heading_h3() {
        let result = md_to_telegram_html("### Subtitle");
        assert!(result.contains("<b>Subtitle</b>"));
    }

    #[test]
    fn md_plain_text_escapes_html() {
        let result = md_to_telegram_html("1 < 2 & 3 > 0");
        assert!(result.contains("&lt;"));
        assert!(result.contains("&amp;"));
        assert!(result.contains("&gt;"));
    }

    #[test]
    fn md_empty_string() {
        assert_eq!(md_to_telegram_html(""), "");
    }

    // ---- chunk_text ------------------------------------------------------

    #[test]
    fn chunk_text_short_message() {
        let chunks = chunk_text("short text", 100);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], "short text");
    }

    #[test]
    fn chunk_text_exact_limit() {
        let text = "x".repeat(100);
        let chunks = chunk_text(&text, 100);
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn chunk_text_splits_at_paragraph() {
        let text = "a".repeat(50) + "\n\n" + &"b".repeat(50);
        let chunks = chunk_text(&text, 60);
        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].starts_with('a'));
        assert!(chunks[1].starts_with('b'));
    }

    #[test]
    fn chunk_text_splits_at_newline_when_no_paragraph() {
        let text = "a".repeat(30) + "\n" + &"b".repeat(30);
        let chunks = chunk_text(&text, 40);
        assert_eq!(chunks.len(), 2);
    }

    #[test]
    fn chunk_text_hard_split_when_no_breaks() {
        let text = "x".repeat(200);
        let chunks = chunk_text(&text, 100);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].len(), 100);
        assert_eq!(chunks[1].len(), 100);
    }

    #[test]
    fn chunk_text_zero_max_uses_default() {
        let text = "short";
        let chunks = chunk_text(text, 0);
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn chunk_text_empty_string() {
        let chunks = chunk_text("", 100);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], "");
    }

    #[test]
    fn chunk_text_multibyte_safe() {
        // Each 'あ' is 3 bytes, so 50 is not a multiple of the char width.
        let text = "あ".repeat(100);
        let chunks = chunk_text(&text, 50);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk.is_empty());
        }
    }

    #[test]
    fn chunk_text_preserves_all_content() {
        let text = "a".repeat(150) + "\n\n" + &"b".repeat(150);
        let chunks = chunk_text(&text, 200);
        let reassembled: String = chunks.join("");
        assert!(reassembled.contains(&"a".repeat(150)));
        assert!(reassembled.contains(&"b".repeat(150)));
    }

    // ---- find_split_point ------------------------------------------------

    #[test]
    fn find_split_at_double_newline() {
        let text = "hello\n\nworld and more text here";
        let point = find_split_point(text, 20, "\n\n");
        // The delimiter starts at byte 5; the split lands just after it.
        assert_eq!(point, Some(7));
    }

    #[test]
    fn find_split_no_delimiter() {
        let text = "no breaks at all in this text";
        let point = find_split_point(text, 15, "\n\n");
        assert!(point.is_none());
    }

    // ---- find_safe_html_boundary -----------------------------------------

    #[test]
    fn html_boundary_avoids_mid_tag() {
        let html = "x".repeat(95) + "<b>bold</b>";
        let b = find_safe_html_boundary(&html, 96);
        // Byte 96 is inside "<b>", so the boundary backs up to the '<'.
        assert_eq!(b, 95);
    }

    #[test]
    fn html_boundary_avoids_mid_entity() {
        let html = "x".repeat(95) + "&amp; rest";
        let b = find_safe_html_boundary(&html, 98);
        // Byte 98 is inside "&amp;", so the boundary backs up to the '&'.
        assert_eq!(b, 95);
    }

    #[test]
    fn html_boundary_after_complete_tag() {
        let html = "x".repeat(90) + "<b>y</b>" + &"z".repeat(10);
        let b = find_safe_html_boundary(&html, 102);
        assert_eq!(b, 102);
    }

    // ---- chunk_html ------------------------------------------------------

    #[test]
    fn chunk_html_short() {
        let chunks = chunk_html("<b>hello</b>", 100);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], "<b>hello</b>");
    }

    #[test]
    fn chunk_html_does_not_split_mid_entity() {
        // 200 * 6 = 1200 bytes of back-to-back entities.
        let html = "&amp; ".repeat(200);
        let chunks = chunk_html(&html, 100);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(
                !chunk.ends_with('&'),
                "chunk ends with partial entity: {chunk}"
            );
            assert!(
                !chunk.contains("&am\n") && !chunk.ends_with("&am"),
                "chunk has partial entity: {chunk}"
            );
        }
    }

    #[test]
    fn chunk_html_does_not_split_mid_tag() {
        let padding = "x".repeat(95);
        let html = format!("{padding}<b>bold</b>{padding}<i>ital</i>");
        let chunks = chunk_html(&html, 100);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            let trimmed = chunk.trim_end();
            if let Some(last_lt) = trimmed.rfind('<') {
                assert!(
                    trimmed[last_lt..].contains('>'),
                    "chunk ends inside a tag: {chunk}"
                );
            }
        }
    }

    #[test]
    fn chunk_html_handles_expanded_entities() {
        // 1000 * 5 = 5000 bytes, which exceeds the 4000-byte limit.
        let html = "&amp;".repeat(1000);
        let chunks = chunk_html(&html, 4000);
        assert!(chunks.len() >= 2);
        for chunk in &chunks {
            assert!(chunk.len() <= 4000);
        }
    }

    #[test]
    fn chunk_html_prefers_newline_split() {
        let part_a = "a".repeat(200);
        let part_b = "b".repeat(200);
        let html = format!("{part_a}\n\n{part_b}");
        let chunks = chunk_html(&html, 300);
        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].starts_with('a'));
        assert!(chunks[1].starts_with('b'));
    }

    // ---- close_open_tags -------------------------------------------------

    #[test]
    fn close_open_tags_balanced() {
        assert_eq!(close_open_tags("<b>bold</b>"), "<b>bold</b>");
    }

    #[test]
    fn close_open_tags_unclosed_b() {
        assert_eq!(close_open_tags("<b>bold"), "<b>bold</b>");
    }

    #[test]
    fn close_open_tags_nested() {
        assert_eq!(close_open_tags("<b><i>text"), "<b><i>text</i></b>");
    }

    #[test]
    fn close_open_tags_no_tags() {
        assert_eq!(close_open_tags("plain text"), "plain text");
    }

    #[test]
    fn close_open_tags_partial_close() {
        assert_eq!(close_open_tags("<b><i>text</i>"), "<b><i>text</i></b>");
    }
}