use html2md;
/// Default content size limit, expressed in characters.
///
/// NOTE(review): presumably meant as the `max_chars` argument of
/// `truncate_for_context`; no use of this constant is visible in this file.
pub const DEFAULT_CONTENT_MAX_CHARS: usize = 100_000;
/// Converts an HTML document into cleaned-up Markdown.
///
/// The HTML is first converted with `html2md::rewrite_html`, then runs of
/// blank lines are collapsed, and finally the Markdown-specific cleanup in
/// `clean_markdown` is applied.
pub fn html_to_markdown(html: &str) -> String {
    // NOTE(review): the second argument is passed as `false`; its exact
    // meaning is defined by the html2md crate — confirm against its docs.
    let converted = html2md::rewrite_html(html, false);
    let collapsed = collapse_blank_lines(&converted);
    clean_markdown(&collapsed)
}
/// Prepares plain text for use as Markdown.
///
/// The text is only passed through `collapse_blank_lines`; no other
/// transformation or escaping is applied here.
pub fn plain_to_markdown(value: &str) -> String {
collapse_blank_lines(value)
}
/// Truncates `value` to at most `max_chars` characters (Unicode scalar values).
///
/// # Returns
///
/// A `(text, truncated)` pair:
/// - If `value` has at most `max_chars` characters, it is returned unchanged
///   and `truncated` is `false`.
/// - Otherwise the first `max_chars` characters are kept, followed by the
///   marker `...(truncated, N total)`, and `truncated` is `true`. `N` is the
///   total number of characters in `value`, i.e. the same unit as `max_chars`.
/// - If `max_chars` is `0`, the text is empty (no marker is appended) and
///   `truncated` is `true` exactly when `value` is non-empty.
pub fn truncate_for_context(value: &str, max_chars: usize) -> (String, bool) {
    if max_chars == 0 {
        return (String::new(), !value.is_empty());
    }
    // The byte offset of the character at index `max_chars` is exactly the
    // end of the prefix to keep; if no such character exists, `value` fits.
    match value.char_indices().nth(max_chars) {
        Some((byte_end, _)) => {
            // Report the total in characters so it is comparable with the
            // limit; the byte length would overstate it for non-ASCII text.
            let total_chars = value.chars().count();
            (
                format!(
                    "{}...(truncated, {} total)",
                    &value[..byte_end],
                    total_chars
                ),
                true,
            )
        }
        None => (value.to_string(), false),
    }
}
/// Normalizes line endings and limits consecutive blank lines.
///
/// - `\r\n` and lone `\r` are converted to `\n`.
/// - Trailing whitespace is removed from every non-blank line.
/// - Whitespace-only lines count as blank; at most two blank lines in a row
///   are kept, any further ones in the same run are dropped.
/// - Leading and trailing whitespace of the whole result is trimmed.
pub fn collapse_blank_lines(value: &str) -> String {
    let unified = value.replace("\r\n", "\n").replace('\r', "\n");
    let mut buffer = String::new();
    let mut consecutive_blanks = 0usize;
    for raw in unified.lines() {
        // A line is blank exactly when nothing survives trimming its end.
        let kept = raw.trim_end();
        if kept.is_empty() {
            consecutive_blanks += 1;
            if consecutive_blanks > 2 {
                continue;
            }
        } else {
            consecutive_blanks = 0;
            buffer.push_str(kept);
        }
        buffer.push('\n');
    }
    buffer.trim().to_string()
}
/// Applies Markdown-specific cleanup to already converted text.
///
/// Steps, in order:
/// 1. Image references are removed via `strip_markdown_images`.
/// 2. Links are filtered via `strip_tracking_links`.
/// 3. Every line is trimmed, has leading/trailing `|` characters removed,
///    is trimmed again, and has the `&amp;` entity decoded to `&`.
/// 4. The result is passed through `collapse_blank_lines`.
fn clean_markdown(value: &str) -> String {
    let no_images = strip_markdown_images(value);
    let no_tracking = strip_tracking_links(&no_images);
    let cleaned: Vec<String> = no_tracking
        .lines()
        .map(|line| {
            let stripped = line.trim().trim_matches('|').trim();
            // Decode the escaped ampersand entity; replacing "&" with itself
            // would be a no-op. Other entities are left untouched.
            stripped.replace("&amp;", "&")
        })
        .collect();
    collapse_blank_lines(&cleaned.join("\n"))
}
fn strip_markdown_images(value: &str) -> String {
let mut out = String::new();
let chars: Vec<char> = value.chars().collect();
let mut i = 0usize;
while i < chars.len() {
if chars[i] == '!'
&& i + 1 < chars.len()
&& chars[i + 1] == '['
&& let Some(end) = skip_markdown_link(&chars, i + 1)
{
i = end;
continue;
}
out.push(chars[i]);
i += 1;
}
out
}
fn strip_tracking_links(value: &str) -> String {
let mut out = String::new();
let chars: Vec<char> = value.chars().collect();
let mut i = 0usize;
while i < chars.len() {
if chars[i] == '['
&& let Some((link_text, _url, url_len, end)) = parse_markdown_link(&chars, i)
&& (link_text.trim().is_empty() || url_len > 150)
{
let clean = link_text.trim();
if !clean.is_empty() {
out.push_str(clean);
}
i = end;
continue;
}
out.push(chars[i]);
i += 1;
}
out
}
/// Parses a Markdown link `[text](url)` whose `[` is at index `start`.
///
/// The character at `start` itself is not inspected; callers are expected to
/// have checked that it is `[`. The text ends at the first `]` after `start`,
/// which must be immediately followed by `(`; the URL ends at the first `)`
/// after that. Nested brackets or parentheses are not handled.
///
/// # Returns
///
/// `Some((link_text, url, url_len, end))` where `url_len` is the length of
/// `url` in bytes and `end` is the index just past the closing `)`.
/// `None` if no well-formed link starts at `start`, including when `start`
/// lies outside `chars`.
fn parse_markdown_link(chars: &[char], start: usize) -> Option<(String, String, usize, usize)> {
    let text_start = start + 1;
    // Checked slicing: an out-of-range `start` yields `None` instead of a panic.
    let close_bracket = chars.get(text_start..)?.iter().position(|&c| c == ']')?;
    let text_end = text_start + close_bracket;
    let open_paren = text_end + 1;
    if chars.get(open_paren) != Some(&'(') {
        return None;
    }
    // `open_paren` is in bounds here, so `url_start <= chars.len()`.
    let url_start = open_paren + 1;
    let close_paren = chars[url_start..].iter().position(|&c| c == ')')?;
    let url_end = url_start + close_paren;

    let link_text: String = chars[text_start..text_end].iter().collect();
    let url: String = chars[url_start..url_end].iter().collect();
    // Take the length before moving `url` into the tuple; no clone needed.
    let url_len = url.len();
    Some((link_text, url, url_len, url_end + 1))
}
/// Finds the end of a Markdown link `[text](url)` whose `[` is at `start`.
///
/// The character at `start` itself is not inspected; callers are expected to
/// have checked that it is `[`. The text ends at the first `]` after `start`,
/// which must be immediately followed by `(`; the URL ends at the first `)`
/// after that. Nested brackets or parentheses are not handled.
///
/// # Returns
///
/// `Some(end)` with the index just past the closing `)`, or `None` if no
/// well-formed link starts at `start`, including when `start` lies outside
/// `chars`.
fn skip_markdown_link(chars: &[char], start: usize) -> Option<usize> {
    // Checked slicing: an out-of-range `start` yields `None` instead of a panic.
    let close_bracket = chars.get(start + 1..)?.iter().position(|&c| c == ']')?;
    let open_paren = start + 1 + close_bracket + 1;
    if chars.get(open_paren) != Some(&'(') {
        return None;
    }
    // `open_paren` is in bounds here, so `url_start <= chars.len()`.
    let url_start = open_paren + 1;
    let close_paren = chars[url_start..].iter().position(|&c| c == ')')?;
    Some(url_start + close_paren + 1)
}