use regex::Regex;
use crate::macros::process_confluence_macros;
use crate::types::AnyPage;
/// Removes every HTML tag from `html` and returns the remaining text on a single line.
///
/// Processing order:
/// 1. every `<...>` span is dropped,
/// 2. the entities `&nbsp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;` and `&amp;` are decoded,
/// 3. runs of whitespace are collapsed to one space and the ends are trimmed.
///
/// Returns an empty string for empty input.
pub fn strip_html(html: &str) -> String {
    // Drop each span from a `<` up to the next `>`. A `<` that is never closed is
    // ordinary text and is kept, exactly like the pattern `<[^>]*>` would behave.
    let mut text = String::with_capacity(html.len());
    let mut rest = html;
    while let Some(open) = rest.find('<') {
        match rest[open..].find('>') {
            Some(offset) => {
                text.push_str(&rest[..open]);
                // `>` is a single byte, so `+ 1` lands on a char boundary.
                rest = &rest[open + offset + 1..];
            }
            None => break,
        }
    }
    text.push_str(rest);

    // `&amp;` is decoded last so that an escaped entity such as `&amp;lt;` yields the
    // literal text `&lt;` instead of being decoded twice into `<`.
    let decoded = text
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&amp;", "&");

    decoded.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Converts Confluence HTML into Markdown, optionally truncating the result.
///
/// The input is first passed through `process_confluence_macros`, then through
/// `html_to_md_basic`. Runs of three or more newlines are collapsed to a blank line and
/// the text is trimmed. When `truncate` is set, `max_length` is non-zero and the Markdown
/// is longer than `max_length` bytes, the result is shortened with `smart_truncate`.
///
/// Returns an empty string for empty input.
pub fn html_to_markdown(html: &str, truncate: bool, max_length: usize) -> String {
    if html.is_empty() {
        return String::new();
    }

    let expanded = process_confluence_macros(html);
    let raw_markdown = html_to_md_basic(&expanded);

    // Collapse excessive vertical whitespace before measuring the length.
    let blank_runs = Regex::new(r"\n{3,}").unwrap();
    let collapsed = blank_runs.replace_all(&raw_markdown, "\n\n");
    let markdown = collapsed.trim();

    let over_budget = truncate && max_length > 0 && markdown.len() > max_length;
    if over_budget {
        smart_truncate(markdown, max_length)
    } else {
        markdown.to_string()
    }
}
fn html_to_md_basic(html: &str) -> String {
let mut s = html.to_string();
for level in (1..=6).rev() {
let hashes = "#".repeat(level);
let open_re = Regex::new(&format!(r"(?i)<h{level}[^>]*>")).unwrap();
let close_re = Regex::new(&format!(r"(?i)</h{level}>")).unwrap();
s = open_re.replace_all(&s, &format!("\n{hashes} ")).to_string();
s = close_re.replace_all(&s, "\n").to_string();
}
let strong_re = Regex::new(r"(?is)<strong>([\s\S]*?)</strong>").unwrap();
s = strong_re.replace_all(&s, "**$1**").to_string();
let b_re = Regex::new(r"(?is)<b>([\s\S]*?)</b>").unwrap();
s = b_re.replace_all(&s, "**$1**").to_string();
let em_re = Regex::new(r"(?is)<em>([\s\S]*?)</em>").unwrap();
s = em_re.replace_all(&s, "*$1*").to_string();
let i_re = Regex::new(r"(?is)<i>([\s\S]*?)</i>").unwrap();
s = i_re.replace_all(&s, "*$1*").to_string();
let code_re = Regex::new(r"(?is)<code>([\s\S]*?)</code>").unwrap();
s = code_re.replace_all(&s, "`$1`").to_string();
let pre_re = Regex::new(r"(?is)<pre[^>]*>([\s\S]*?)</pre>").unwrap();
s = pre_re.replace_all(&s, "\n```\n$1\n```\n").to_string();
let a_re = Regex::new(r#"(?is)<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)</a>"#).unwrap();
s = a_re.replace_all(&s, "[$2]($1)").to_string();
let img_re = Regex::new(r#"(?i)<img[^>]*alt="([^"]*)"[^>]*/?>"#).unwrap();
s = img_re.replace_all(&s, "[Image: $1]").to_string();
let img_re2 = Regex::new(r#"(?i)<img[^>]*src="([^"]*)"[^>]*/?>"#).unwrap();
s = img_re2.replace_all(&s, "[Image: $1]").to_string();
let th_re = Regex::new(r"(?is)<th[^>]*>([\s\S]*?)</th>").unwrap();
s = th_re.replace_all(&s, "| **$1** ").to_string();
let td_re = Regex::new(r"(?is)<td[^>]*>([\s\S]*?)</td>").unwrap();
s = td_re.replace_all(&s, "| $1 ").to_string();
let tr_re = Regex::new(r"(?is)<tr[^>]*>([\s\S]*?)</tr>").unwrap();
s = tr_re.replace_all(&s, "$1|\n").to_string();
let li_re = Regex::new(r"(?is)<li[^>]*>([\s\S]*?)</li>").unwrap();
s = li_re.replace_all(&s, "- $1\n").to_string();
let p_re = Regex::new(r"(?i)<p[^>]*>").unwrap();
s = p_re.replace_all(&s, "\n").to_string();
let p_close = Regex::new(r"(?i)</p>").unwrap();
s = p_close.replace_all(&s, "\n").to_string();
let br_re = Regex::new(r"(?i)<br\s*/?\s*>").unwrap();
s = br_re.replace_all(&s, "\n").to_string();
let hr_re = Regex::new(r"(?i)<hr\s*/?\s*>").unwrap();
s = hr_re.replace_all(&s, "\n---\n").to_string();
let tag_re = Regex::new(r"<[^>]*>").unwrap();
s = tag_re.replace_all(&s, "").to_string();
s = s
.replace(" ", " ")
.replace("&", "&")
.replace("<", "<")
.replace(">", ">")
.replace(""", "\"")
.replace("'", "'");
s
}
/// Shortens `text` to at most `max_length` bytes, preferring a natural break point, and
/// appends a notice telling the reader that content was cut.
///
/// Text that already fits is returned unchanged. Otherwise the cut position is chosen,
/// in order of preference, from the part of `text` before the limit:
/// 1. the last blank line (`"\n\n"`), if it lies beyond 70% of `max_length`;
/// 2. the end of the last sentence (`". "`, `"! "`, `"? "`), if beyond 70%;
/// 3. the last space, if beyond 80%;
/// 4. the limit itself.
///
/// `max_length` is measured in bytes. If it falls inside a multi-byte UTF-8 character the
/// limit is moved back to the start of that character, so the function never panics on
/// non-ASCII text.
pub fn smart_truncate(text: &str, max_length: usize) -> String {
    if text.len() <= max_length {
        return text.to_string();
    }

    // Slicing a `str` off a char boundary panics; back off to the nearest boundary.
    // Index 0 is always a boundary, so the loop terminates.
    let mut limit = max_length;
    while !text.is_char_boundary(limit) {
        limit -= 1;
    }
    let window = &text[..limit];

    let threshold_70 = (max_length as f64 * 0.7) as usize;
    let threshold_80 = (max_length as f64 * 0.8) as usize;

    let truncate_at = window
        .rfind("\n\n")
        .filter(|&pos| pos > threshold_70)
        .or_else(|| {
            [". ", "! ", "? "]
                .iter()
                // `+ 1` keeps the punctuation mark and drops the following space.
                .filter_map(|sep| window.rfind(*sep).map(|pos| pos + 1))
                .max()
                .filter(|&pos| pos > threshold_70)
        })
        .or_else(|| window.rfind(' ').filter(|&pos| pos > threshold_80))
        .unwrap_or(limit);

    let truncated = text[..truncate_at].trim();
    let remaining = text.len() - truncate_at;
    let remaining_k = (remaining as f64 / 1000.0).round() as usize;
    format!(
        "{truncated}\n\n---\n⚠️ **Content truncated** (~{remaining_k}k characters remaining)\n\n\
         Use `read_page_outline` to see the page structure, then `read_page_section` to read specific sections."
    )
}
/// Renders a space as a short Markdown summary.
///
/// The first line is `**KEY**: name`, followed by bullet lines for the type and status.
/// A description bullet is appended only when a non-empty plain description is present.
pub fn format_space(space: &crate::types::ConfluenceSpace) -> String {
    let mut summary = format!(
        "**{}**: {}\n- **Type**: {}\n- **Status**: {}",
        space.key, space.name, space.space_type, space.status
    );

    let description = space
        .description
        .as_ref()
        .and_then(|d| d.plain.as_ref())
        .map(|p| &p.value)
        .filter(|text| !text.is_empty());

    if let Some(desc) = description {
        summary.push_str(&format!("\n- **Description**: {desc}"));
    }
    summary
}
/// Renders a page as a short Markdown summary.
///
/// The output starts with `**title** (ID: id)` and is followed by bullet lines for the
/// status, the space (when `space_info` returns one), the version number and the last
/// modification date (when `version_date` returns one).
pub fn format_page(page: &AnyPage) -> String {
    let mut bullets = Vec::with_capacity(5);

    bullets.push(format!("**{}** (ID: {})", page.title(), page.id()));
    bullets.push(format!("- **Status**: {}", page.status()));
    // `extend` with an `Option` appends the line only when the value exists.
    bullets.extend(
        page.space_info()
            .map(|(key, name)| format!("- **Space**: {name} ({key})")),
    );
    bullets.push(format!("- **Version**: {}", page.version_number()));
    bullets.extend(
        page.version_date()
            .map(|date| format!("- **Last Modified**: {date}")),
    );

    bullets.join("\n")
}
/// Renders a page as a full Markdown document: title, metadata table, converted body
/// and a link to the page.
///
/// * `host` - base URL prepended to a relative `webui` link; a trailing `/` is ignored.
/// * `is_cloud` - when set, `/wiki` is inserted between `host` and a relative link.
/// * `max_content_length` - byte budget handed to `html_to_markdown` for the body.
///
/// The `## Content` section is omitted when the page has no storage body, and the URL
/// footer is omitted when the page has no `webui` link. Values placed in the metadata
/// table have `|` escaped so they cannot break the table layout.
pub fn format_page_detailed(page: &AnyPage, host: &str, is_cloud: bool, max_content_length: usize) -> String {
    /// Formats a value for use inside a Markdown table cell.
    fn cell(value: impl std::fmt::Display) -> String {
        value.to_string().replace('|', "\\|")
    }

    let mut sections: Vec<String> = Vec::new();
    sections.push(format!("# {}", page.title()));
    sections.push(String::new());

    // Metadata table.
    let mut info = vec![
        "| Field | Value |".to_string(),
        "|-------|-------|".to_string(),
        format!("| **ID** | {} |", cell(page.id())),
        format!("| **Status** | {} |", cell(page.status())),
    ];
    if let Some((key, name)) = page.space_info() {
        info.push(format!("| **Space** | {} ({}) |", cell(name), cell(key)));
    }
    if let Some(sid) = page.space_id() {
        info.push(format!("| **Space ID** | {} |", cell(sid)));
    }
    info.push(format!("| **Version** | {} |", cell(page.version_number())));
    if let Some(date) = page.version_date() {
        info.push(format!("| **Last Modified** | {} |", cell(date)));
    }
    if let Some(author) = page.version_author() {
        info.push(format!("| **Modified By** | {} |", cell(author)));
    }
    if let Some(ancs) = page.ancestors() {
        if !ancs.is_empty() {
            let breadcrumb: Vec<&str> = ancs.iter().map(|a| a.title.as_str()).collect();
            info.push(format!("| **Parent Path** | {} |", cell(breadcrumb.join(" > "))));
        }
    }
    sections.push(info.join("\n"));
    sections.push(String::new());

    // Body, converted to Markdown and truncated to the caller's budget.
    let body = page.storage_value();
    if !body.is_empty() {
        sections.push("## Content".to_string());
        sections.push(String::new());
        sections.push(html_to_markdown(body, true, max_content_length));
    }

    // Footer with an absolute link to the page.
    if let Some(webui) = page.webui_link() {
        let full_url = if webui.starts_with('/') {
            let prefix = if is_cloud { "/wiki" } else { "" };
            // Avoid `//` in the URL when the configured host ends with a slash.
            let base = host.trim_end_matches('/');
            format!("{base}{prefix}{webui}")
        } else {
            webui.to_string()
        };
        sections.push(String::new());
        sections.push("---".to_string());
        sections.push(format!("**URL**: {full_url}"));
    }

    sections.join("\n")
}
/// Formats a byte count as a human-readable size with two decimals.
///
/// Units advance in steps of 1024 through `Bytes`, `KB`, `MB` and `GB`; anything larger
/// is still expressed in `GB`. Zero is rendered as `"0 Bytes"` without decimals.
pub fn format_bytes(bytes: u64) -> String {
    const UNITS: [&str; 4] = ["Bytes", "KB", "MB", "GB"];
    const STEP: f64 = 1024.0;

    if bytes == 0 {
        return "0 Bytes".to_string();
    }

    // Scale down until the value fits the current unit or the largest unit is reached.
    let mut size = bytes as f64;
    let mut unit = 0;
    while size >= STEP && unit + 1 < UNITS.len() {
        size /= STEP;
        unit += 1;
    }

    format!("{size:.2} {}", UNITS[unit])
}
// Unit tests for the HTML conversion, truncation and formatting helpers in this file.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::*;

    // ---- strip_html ----

    #[test]
    fn strip_html_basic() {
        assert_eq!(strip_html("<p>Hello <strong>world</strong></p>"), "Hello world");
    }

    #[test]
    fn strip_html_empty() {
        assert_eq!(strip_html(""), "");
    }

    #[test]
    fn strip_html_entities() {
        // The input must contain real entities, otherwise decoding is not exercised.
        assert_eq!(strip_html("A &amp; B &lt; C"), "A & B < C");
    }

    #[test]
    fn strip_html_collapses_whitespace() {
        assert_eq!(strip_html("<p> lots of spaces </p>"), "lots of spaces");
    }

    #[test]
    fn strip_html_nested_tags() {
        assert_eq!(
            strip_html("<div><ul><li>Item</li></ul></div>"),
            "Item"
        );
    }

    // ---- html_to_markdown ----

    #[test]
    fn html_to_markdown_empty() {
        assert_eq!(html_to_markdown("", false, 0), "");
    }

    #[test]
    fn html_to_markdown_headings() {
        let html = "<h1>Title</h1><h2>Subtitle</h2>";
        let md = html_to_markdown(html, false, 0);
        assert!(md.contains("# Title"));
        assert!(md.contains("## Subtitle"));
    }

    #[test]
    fn html_to_markdown_bold_and_italic() {
        let html = "<strong>bold</strong> and <em>italic</em>";
        let md = html_to_markdown(html, false, 0);
        assert!(md.contains("**bold**"));
        assert!(md.contains("*italic*"));
    }

    #[test]
    fn html_to_markdown_inline_code() {
        let html = "<code>foo()</code>";
        let md = html_to_markdown(html, false, 0);
        assert!(md.contains("`foo()`"));
    }

    #[test]
    fn html_to_markdown_link() {
        let html = r#"<a href="https://example.com">click</a>"#;
        let md = html_to_markdown(html, false, 0);
        assert!(md.contains("[click](https://example.com)"));
    }

    #[test]
    fn html_to_markdown_table() {
        let html = "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>";
        let md = html_to_markdown(html, false, 0);
        assert!(md.contains("**Name**"));
        assert!(md.contains("Alice"));
        assert!(md.contains("|"));
    }

    #[test]
    fn html_to_markdown_list() {
        let html = "<ul><li>One</li><li>Two</li></ul>";
        let md = html_to_markdown(html, false, 0);
        assert!(md.contains("- One"));
        assert!(md.contains("- Two"));
    }

    #[test]
    fn html_to_markdown_hr() {
        let html = "<p>Before</p><hr/><p>After</p>";
        let md = html_to_markdown(html, false, 0);
        assert!(md.contains("---"));
    }

    #[test]
    fn html_to_markdown_with_truncation() {
        let html = "<p>A</p>".repeat(1000);
        let md = html_to_markdown(&html, true, 100);
        // The truncation notice is appended after the cut, hence the slack above 100.
        assert!(md.len() < 300);
        assert!(md.contains("truncated"));
    }

    #[test]
    fn html_to_markdown_no_truncation_when_short() {
        let html = "<p>Short</p>";
        let md = html_to_markdown(html, true, 10000);
        assert!(!md.contains("truncated"));
    }

    #[test]
    fn html_to_markdown_pre_block() {
        let html = "<pre>code block</pre>";
        let md = html_to_markdown(html, false, 0);
        assert!(md.contains("```"));
        assert!(md.contains("code block"));
    }

    // ---- smart_truncate ----

    #[test]
    fn smart_truncate_short_text() {
        let text = "Hello world";
        assert_eq!(smart_truncate(text, 100), text);
    }

    #[test]
    fn smart_truncate_at_paragraph() {
        let text = "First paragraph.\n\nSecond paragraph that makes it longer.";
        let result = smart_truncate(text, 30);
        assert!(result.contains("First paragraph."));
        assert!(result.contains("truncated"));
    }

    #[test]
    fn smart_truncate_at_sentence() {
        let text = "First sentence. Second sentence that pushes past the limit here.";
        let result = smart_truncate(text, 50);
        assert!(result.contains("truncated"));
    }

    #[test]
    fn smart_truncate_at_word() {
        let text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10";
        let result = smart_truncate(text, 30);
        assert!(result.contains("truncated"));
        assert!(result.starts_with("word1 word2 word3 word4 word5"));
    }

    // ---- format_bytes ----

    #[test]
    fn format_bytes_zero() {
        assert_eq!(format_bytes(0), "0 Bytes");
    }

    #[test]
    fn format_bytes_bytes() {
        assert_eq!(format_bytes(500), "500.00 Bytes");
    }

    #[test]
    fn format_bytes_kb() {
        assert_eq!(format_bytes(1024), "1.00 KB");
    }

    #[test]
    fn format_bytes_mb() {
        assert_eq!(format_bytes(1024 * 1024), "1.00 MB");
    }

    #[test]
    fn format_bytes_gb() {
        assert_eq!(format_bytes(1024 * 1024 * 1024), "1.00 GB");
    }

    #[test]
    fn format_bytes_fractional() {
        assert_eq!(format_bytes(1536), "1.50 KB");
    }

    // ---- format_space ----

    #[test]
    fn format_space_basic() {
        let space = ConfluenceSpace {
            id: "1".to_string(),
            key: "DEV".to_string(),
            name: "Development".to_string(),
            space_type: "global".to_string(),
            status: "current".to_string(),
            description: None,
            links: None,
        };
        let result = format_space(&space);
        assert!(result.contains("**DEV**: Development"));
        assert!(result.contains("global"));
        assert!(result.contains("current"));
    }

    #[test]
    fn format_space_with_description() {
        let space = ConfluenceSpace {
            id: "1".to_string(),
            key: "QA".to_string(),
            name: "Quality Assurance".to_string(),
            space_type: "global".to_string(),
            status: "current".to_string(),
            description: Some(SpaceDescription {
                plain: Some(PlainValue {
                    value: "QA team space".to_string(),
                }),
            }),
            links: None,
        };
        let result = format_space(&space);
        assert!(result.contains("QA team space"));
    }

    // ---- format_page ----

    #[test]
    fn format_page_v1() {
        let page = ConfluencePageV1 {
            id: "42".to_string(),
            page_type: "page".to_string(),
            status: "current".to_string(),
            title: "Test Page".to_string(),
            space: Some(SpaceRef {
                key: "TS".to_string(),
                name: "Test Space".to_string(),
            }),
            version: Some(PageVersionV1 {
                number: 5,
                when: Some("2024-01-01".to_string()),
                by: None,
            }),
            body: None,
            ancestors: None,
            links: None,
        };
        let any = AnyPage::V1(page);
        let result = format_page(&any);
        assert!(result.contains("**Test Page** (ID: 42)"));
        assert!(result.contains("Test Space (TS)"));
        assert!(result.contains("Version**: 5"));
        assert!(result.contains("2024-01-01"));
    }

    #[test]
    fn format_page_v2() {
        let page = ConfluencePage {
            id: "99".to_string(),
            status: "current".to_string(),
            title: "Cloud Page".to_string(),
            space_id: Some("space-1".to_string()),
            parent_id: None,
            version: Some(PageVersion {
                number: 3,
                message: None,
                created_at: Some("2024-06-15".to_string()),
            }),
            body: None,
            links: None,
        };
        let any = AnyPage::V2(page);
        let result = format_page(&any);
        assert!(result.contains("**Cloud Page** (ID: 99)"));
        assert!(result.contains("Version**: 3"));
    }

    // ---- format_page_detailed ----

    #[test]
    fn format_page_detailed_with_content() {
        let page = ConfluencePageV1 {
            id: "10".to_string(),
            page_type: "page".to_string(),
            status: "current".to_string(),
            title: "Detailed Page".to_string(),
            space: Some(SpaceRef {
                key: "DS".to_string(),
                name: "Detail Space".to_string(),
            }),
            version: Some(PageVersionV1 {
                number: 2,
                when: None,
                by: Some(VersionAuthor {
                    display_name: "Jane Doe".to_string(),
                }),
            }),
            body: Some(PageBody {
                storage: Some(StorageBody {
                    value: "<p>Hello world</p>".to_string(),
                    representation: Some("storage".to_string()),
                }),
                view: None,
            }),
            ancestors: Some(vec![AncestorRef {
                id: "1".to_string(),
                title: "Root".to_string(),
            }]),
            links: Some(PageLinksV1 {
                webui: Some("/pages/10".to_string()),
                edit: None,
                self_link: None,
            }),
        };
        let any = AnyPage::V1(page);
        let result = format_page_detailed(&any, "https://wiki.example.com", false, 30000);
        assert!(result.contains("# Detailed Page"));
        assert!(result.contains("Hello world"));
        assert!(result.contains("Jane Doe"));
        assert!(result.contains("Root"));
        assert!(result.contains("https://wiki.example.com/pages/10"));
    }

    #[test]
    fn format_page_detailed_cloud_url() {
        let page = ConfluencePage {
            id: "20".to_string(),
            status: "current".to_string(),
            title: "Cloud Detailed".to_string(),
            space_id: Some("sp-1".to_string()),
            parent_id: None,
            version: Some(PageVersion {
                number: 1,
                message: None,
                created_at: None,
            }),
            body: None,
            links: Some(PageLinks {
                webui: Some("/pages/20".to_string()),
                editui: None,
            }),
        };
        let any = AnyPage::V2(page);
        let result = format_page_detailed(&any, "https://mysite.atlassian.net", true, 30000);
        // Cloud instances serve pages under the `/wiki` prefix.
        assert!(result.contains("https://mysite.atlassian.net/wiki/pages/20"));
    }
}