use sha2::{Digest, Sha256};
use thiserror::Error;
use crate::DocEntry;
/// Errors returned by [`fetch_doc_site`].
#[derive(Debug, Error)]
pub enum DocSiteError {
/// The HTTP request failed; wraps the underlying `reqwest::Error`
/// (converted automatically through `#[from]`, so `?` works on reqwest results).
#[error("HTTP error: {0}")]
Http(#[from] reqwest::Error),
/// The response body was larger than the caller-supplied limit.
/// `limit_kb` is the limit that was exceeded, expressed in KB as shown in the message.
#[error("Response body too large (limit: {limit_kb} KB)")]
TooLarge { limit_kb: u64 },
}
/// Converts an HTML document into plain text.
///
/// * Tags are removed; the text between them is kept verbatim (entities are
///   **not** decoded).
/// * `<script>` / `<style>` elements are dropped together with their content.
///   Only an *opening* tag starts such a skip, so a stray `</style>` no longer
///   swallows the text that follows it.
/// * HTML comments (`<!-- ... -->`) are dropped as a whole, even when they
///   contain `>`.
/// * Opening and closing block-level tags (`p`, `div`, `br`, headings, table
///   cells, ...) become line breaks. Self-closing spellings such as `<br/>`
///   are recognised too.
/// * Finally whitespace is normalised: runs of line breaks collapse to a
///   single `\n`, runs of other whitespace collapse to a single space, and the
///   result is trimmed.
///
/// An unterminated tag, comment, or raw-text element consumes the rest of the
/// input.
pub fn strip_html_to_text(html: &str) -> String {
    /// Tags whose boundaries are rendered as a line break.
    const BLOCK_TAGS: &[&str] = &[
        "p",
        "div",
        "br",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "li",
        "tr",
        "td",
        "th",
        "blockquote",
        "pre",
        "article",
        "section",
        "header",
        "footer",
        "nav",
        "main",
    ];

    let mut out = String::with_capacity(html.len());
    let mut rest = html;

    while let Some(lt) = rest.find('<') {
        // Everything before the tag is plain text.
        out.push_str(&rest[..lt]);
        let after_lt = &rest[lt + 1..];

        // Comments end at `-->`, not at the first `>`. Searching from offset 1
        // (i.e. inside the leading `--`) also accepts the abruptly closed
        // forms `<!-->` and `<!--->`.
        if after_lt.starts_with("!--") {
            rest = match after_lt[1..].find("-->") {
                Some(end) => &after_lt[1 + end + 3..],
                None => "",
            };
            continue;
        }

        let (tag, after_tag) = match after_lt.find('>') {
            Some(gt) => (&after_lt[..gt], &after_lt[gt + 1..]),
            None => (after_lt, ""),
        };

        let is_closing = tag.starts_with('/');
        // The name ends at whitespace or at the `/` of a self-closing tag.
        let tag_name = tag
            .trim_start_matches('/')
            .split(|c: char| c.is_whitespace() || c == '/')
            .next()
            .unwrap_or("")
            .to_ascii_lowercase();

        if !is_closing && (tag_name == "script" || tag_name == "style") {
            rest = skip_raw_text_element(after_tag, &tag_name);
            continue;
        }

        if BLOCK_TAGS.contains(&tag_name.as_str()) {
            out.push('\n');
        }
        rest = after_tag;
    }
    out.push_str(rest);

    collapse_whitespace(&out)
}

/// Returns the part of `rest` that follows the closing tag of a raw-text
/// element (`tag_name` must be lowercase ASCII, e.g. `"script"`).
///
/// The closer is matched ASCII-case-insensitively as `</name`, then everything
/// up to and including the next `>` is skipped. Returns `""` when no closer
/// exists.
fn skip_raw_text_element<'a>(rest: &'a str, tag_name: &str) -> &'a str {
    let haystack = rest.as_bytes();
    let name = tag_name.as_bytes();
    let mut from = 0;

    while let Some(rel) = rest[from..].find("</") {
        let name_start = from + rel + 2;
        let name_end = name_start + name.len();
        let is_closer = haystack
            .get(name_start..name_end)
            .map_or(false, |candidate| candidate.eq_ignore_ascii_case(name));
        if is_closer {
            // The matched bytes are ASCII, so `name_end` is a char boundary.
            return match rest[name_end..].find('>') {
                Some(gt) => &rest[name_end + gt + 1..],
                None => "",
            };
        }
        // `name_start` sits right after the ASCII `</`, also a char boundary.
        from = name_start;
    }
    ""
}

/// Collapses runs of line breaks into one `\n` and runs of other whitespace
/// into one space (dropping whitespace that directly follows a line break),
/// then trims both ends.
fn collapse_whitespace(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut last_was_newline = false;
    let mut last_was_space = false;

    for ch in text.chars() {
        match ch {
            '\n' | '\r' => {
                if !last_was_newline {
                    result.push('\n');
                }
                last_was_newline = true;
                last_was_space = false;
            }
            c if c.is_whitespace() => {
                // `last_was_newline` is deliberately left set so that
                // "\n   \n" still collapses to a single line break.
                if !last_was_space && !last_was_newline {
                    result.push(' ');
                }
                last_was_space = true;
            }
            c => {
                result.push(c);
                last_was_newline = false;
                last_was_space = false;
            }
        }
    }

    result.trim().to_string()
}
/// Removes fenced code blocks (delimited by three backticks) whose body is longer than
/// `max_lines` lines.
///
/// * Blocks with at most `max_lines` body lines are copied through unchanged,
///   including whatever follows the closing fence on its line and the line
///   break itself.
/// * Longer blocks are dropped entirely, from the opening fence through the end
///   of the closing fence's line.
/// * An opening fence without a matching closing fence is left untouched,
///   together with everything after it.
///
/// Everything between the opening backticks and the end of that line is
/// treated as the fence's info string; the next three backticks found after that
/// line close the block.
pub fn strip_long_code_examples(text: &str, max_lines: usize) -> String {
    const FENCE: &str = "```";

    let mut out = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(open_pos) = rest.find(FENCE) {
        out.push_str(&rest[..open_pos]);
        let after_open = &rest[open_pos + FENCE.len()..];

        // The body starts on the line after the opening fence.
        let Some(body_start) = after_open.find('\n').map(|nl| nl + 1) else {
            // Fence on the last line: nothing to analyse, keep it verbatim.
            out.push_str(&rest[open_pos..]);
            return out;
        };
        let (fence_info, block_body) = after_open.split_at(body_start);

        let Some(close_pos) = block_body.find(FENCE) else {
            // Unterminated block: keep the remainder verbatim.
            out.push_str(&rest[open_pos..]);
            return out;
        };
        let content = &block_body[..close_pos];
        let after_close = &block_body[close_pos + FENCE.len()..];

        // Split off the rest of the closing fence's line (with its newline).
        let (closing_line_tail, remainder) = match after_close.find('\n') {
            Some(nl) => after_close.split_at(nl + 1),
            None => (after_close, ""),
        };

        if content.lines().count() <= max_lines {
            out.push_str(FENCE);
            out.push_str(fence_info);
            out.push_str(content);
            out.push_str(FENCE);
            // Keep the closing line intact so the fence is not glued to the
            // text that follows the block.
            out.push_str(closing_line_tail);
        }
        rest = remainder;
    }

    out.push_str(rest);
    out
}
/// Downloads `url`, converts the body from HTML to plain text and wraps it in
/// a [`DocEntry`].
///
/// The entry's `name` is the last non-empty `/`-separated segment of `url`
/// (falling back to `"doc"`), `source_url` is `url` itself, and `content_hash`
/// is the lowercase hex SHA-256 of the extracted text.
///
/// `max_size_kb` caps the response body at `max_size_kb * 1024` bytes. The cap
/// is checked against the declared `Content-Length` up front and enforced
/// again while the body is streamed, so an oversized or mislabelled response
/// is abandoned without being buffered in full.
///
/// # Errors
///
/// * [`DocSiteError::Http`] if the request fails, the server answers with an
///   error status, or reading the body fails.
/// * [`DocSiteError::TooLarge`] if the body exceeds the limit.
pub async fn fetch_doc_site(url: &str, max_size_kb: u64) -> Result<DocEntry, DocSiteError> {
    // Saturate instead of overflowing for absurdly large limits.
    let limit_bytes = max_size_kb.saturating_mul(1024);

    let mut response = reqwest::get(url).await?.error_for_status()?;

    // Fast path: reject early when the server admits the body is too big.
    if matches!(response.content_length(), Some(declared) if declared > limit_bytes) {
        return Err(DocSiteError::TooLarge {
            limit_kb: max_size_kb,
        });
    }

    // Content-Length may be missing or wrong, so enforce the limit per chunk.
    let mut body: Vec<u8> = Vec::new();
    while let Some(chunk) = response.chunk().await? {
        // usize -> u64 is lossless on supported targets.
        let received = (body.len() as u64).saturating_add(chunk.len() as u64);
        if received > limit_bytes {
            return Err(DocSiteError::TooLarge {
                limit_kb: max_size_kb,
            });
        }
        body.extend_from_slice(&chunk);
    }

    // Invalid UTF-8 sequences are replaced rather than treated as an error.
    let raw = String::from_utf8_lossy(&body);
    let content = strip_html_to_text(&raw);

    let mut hasher = Sha256::new();
    hasher.update(content.as_bytes());
    let content_hash = format!("{:x}", hasher.finalize());

    let name = url
        .rsplit('/')
        .find(|segment| !segment.is_empty())
        .unwrap_or("doc")
        .to_string();

    Ok(DocEntry {
        name,
        content,
        source_url: Some(url.to_string()),
        content_hash,
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Markup and script bodies disappear while visible text survives.
    #[test]
    fn test_strip_html_tags() {
        let text = strip_html_to_text(
            "<html><body><h1>Title</h1><p>Content here</p><script>evil()</script></body></html>",
        );

        for expected in ["Title", "Content here"] {
            assert!(text.contains(expected));
        }
        for forbidden in ["<h1>", "evil()"] {
            assert!(!text.contains(forbidden));
        }
    }

    /// An eleven-line block exceeds a limit of ten and is removed, leaving the
    /// surrounding prose in place.
    #[test]
    fn test_strip_code_examples() {
        let input = "Some text\n```\nline1\nline2\nline3\nline4\nline5\nline6\nline7\nline8\nline9\nline10\nline11\n```\nMore text";
        let output = strip_long_code_examples(input, 10);

        assert!(output.contains("Some text"));
        assert!(output.contains("More text"));
        assert!(!output.contains("line11"));
    }

    /// A block within the line limit is retained.
    #[test]
    fn test_short_code_examples_kept() {
        let output = strip_long_code_examples("Before\n```\nshort example\n```\nAfter", 10);

        assert!(output.contains("short example"));
    }
}