use std::path::Path;
use anyhow::Result;
/// Name of the subdirectory, relative to a web help root directory, that is
/// joined onto the root and searched for HTML pages.
const WEBHELP_CONTENT_DIR: &str = "content";
/// Reports whether `dir` looks like a web help export.
///
/// A directory qualifies when it is not itself a symbolic link and its
/// [`WEBHELP_CONTENT_DIR`] subdirectory contains at least one regular file
/// whose extension is `html` or `htm` (compared ASCII case-insensitively).
/// Entries that fail to be read during the walk are logged and skipped.
pub fn is_webhelp_dir(dir: &Path) -> bool {
    // A symlinked root is never treated as web help.
    if dir.is_symlink() {
        return false;
    }

    let content_root = dir.join(WEBHELP_CONTENT_DIR);
    if !content_root.is_dir() {
        return false;
    }

    // Stop at the first HTML file; there is no need to walk the whole tree.
    for walked in walkdir::WalkDir::new(&content_root) {
        let entry = match walked {
            Ok(entry) => entry,
            Err(err) => {
                tracing::warn!(error = %err, "Skipping entry during webhelp detection due to walkdir error");
                continue;
            }
        };

        if !entry.file_type().is_file() {
            continue;
        }

        let looks_like_html = entry
            .path()
            .extension()
            .and_then(|ext| ext.to_str())
            .is_some_and(|ext| ext.eq_ignore_ascii_case("html") || ext.eq_ignore_ascii_case("htm"));
        if looks_like_html {
            return true;
        }
    }

    false
}
/// Converts the HTML pages of a web help directory into one Markdown document.
///
/// Every regular `.html` / `.htm` file (extension compared ASCII
/// case-insensitively) found under `dir`'s [`WEBHELP_CONTENT_DIR`]
/// subdirectory is converted with `super::html::html_to_markdown`. Pages are
/// processed in path order and the non-empty results are joined with a
/// `---` separator line.
///
/// Symbolic links are not traversed: a symlinked content directory is
/// rejected outright and symlinked entries inside it are skipped. Pages that
/// cannot be read, fail to convert, or convert to blank output are logged and
/// skipped rather than aborting the whole run.
///
/// Two limits bound the work done:
/// * at most `MAX_PAGES` pages (the first ones in path order) are converted;
/// * conversion stops once the merged output exceeds `MAX_WEBHELP_BYTES`.
///   The size is checked after a page has been appended, so the returned
///   string may exceed the cap by up to one page.
///
/// # Errors
///
/// Returns an error when:
/// * the content subdirectory is a symbolic link;
/// * the content subdirectory does not exist or is not a directory;
/// * no HTML files are found beneath it;
/// * none of the pages produced any non-blank Markdown.
pub fn webhelp_to_markdown(dir: &Path) -> Result<String> {
    // Maximum number of pages converted from a single directory.
    const MAX_PAGES: usize = 1000;
    // Soft cap on the size of the merged Markdown output.
    const MAX_WEBHELP_BYTES: usize = 50 * 1024 * 1024;

    let _span = tracing::info_span!("webhelp_to_markdown", dir = %dir.display()).entered();

    let content_dir = dir.join(WEBHELP_CONTENT_DIR);
    if content_dir.symlink_metadata().is_ok_and(|m| m.is_symlink()) {
        anyhow::bail!(
            "Web help content/ directory is a symlink (rejected for security): {}",
            content_dir.display()
        );
    }
    if !content_dir.is_dir() {
        anyhow::bail!(
            "Web help directory has no content/ subdirectory: {}",
            dir.display()
        );
    }

    let mut pages: Vec<_> = walkdir::WalkDir::new(&content_dir)
        .into_iter()
        // Prune symlinks so the walk never yields or descends into them.
        .filter_entry(|e| !e.path_is_symlink())
        .filter_map(|e| match e {
            Ok(entry) => Some(entry),
            Err(err) => {
                tracing::warn!(error = %err, "Skipping web help page due to walkdir error");
                None
            }
        })
        .filter(|e| e.file_type().is_file())
        .filter(|e| {
            e.path()
                .extension()
                .and_then(|ext| ext.to_str())
                .is_some_and(|ext| ext.eq_ignore_ascii_case("html") || ext.eq_ignore_ascii_case("htm"))
        })
        .collect();

    // Compare borrowed paths directly; building an owned `PathBuf` key for
    // every comparison would allocate needlessly. Ordering is unchanged.
    pages.sort_by(|a, b| a.path().cmp(b.path()));

    if pages.is_empty() {
        anyhow::bail!("No HTML files found in {}", content_dir.display());
    }
    if pages.len() > MAX_PAGES {
        tracing::warn!(
            dir = %dir.display(),
            total = pages.len(),
            limit = MAX_PAGES,
            "Web help page count exceeds limit, truncating"
        );
        pages.truncate(MAX_PAGES);
    }
    tracing::info!(
        dir = %dir.display(),
        pages = pages.len(),
        "Found web help pages"
    );

    let mut merged = String::new();
    let mut page_count = 0usize;
    for entry in &pages {
        let bytes = match std::fs::read(entry.path()) {
            Ok(b) => b,
            Err(e) => {
                tracing::warn!(
                    path = %entry.path().display(),
                    error = %e,
                    "Failed to read web help page"
                );
                continue;
            }
        };
        // Pages are not guaranteed to be valid UTF-8; invalid sequences are
        // replaced instead of failing the page.
        let html = String::from_utf8_lossy(&bytes);
        match super::html::html_to_markdown(&html) {
            Ok(md) if !md.trim().is_empty() => {
                if !merged.is_empty() {
                    merged.push_str("\n\n---\n\n");
                }
                merged.push_str(&md);
                page_count += 1;
                // Checked after appending: the page that crosses the cap is
                // kept whole and no further pages are added.
                if merged.len() > MAX_WEBHELP_BYTES {
                    tracing::warn!(
                        bytes = merged.len(),
                        pages = page_count,
                        "Webhelp output exceeds 50MB limit, truncating"
                    );
                    break;
                }
            }
            // Converted successfully but produced only whitespace.
            Ok(_) => {
                tracing::debug!(
                    path = %entry.path().display(),
                    "Skipping empty web help page"
                );
            }
            // Conversion itself failed; report it as such rather than as an
            // empty page so the log reflects what actually happened.
            Err(e) => {
                tracing::debug!(
                    path = %entry.path().display(),
                    error = %e,
                    "Skipping web help page that failed to convert"
                );
            }
        }
    }

    if merged.is_empty() {
        anyhow::bail!("Web help produced no content from {} pages", pages.len());
    }
    tracing::info!(
        dir = %dir.display(),
        pages = page_count,
        bytes = merged.len(),
        "Web help converted"
    );
    Ok(merged)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A directory with no `content/` subdirectory must not be detected.
    #[test]
    fn test_is_webhelp_dir_returns_false_for_empty_dir() {
        let root = tempfile::tempdir().expect("should create temp dir");

        let detected = is_webhelp_dir(root.path());

        assert!(
            !detected,
            "empty directory should not be detected as a webhelp dir"
        );
    }

    /// A `content/` subdirectory holding only non-HTML files must not be detected.
    #[test]
    fn test_is_webhelp_dir_returns_false_for_dir_without_html() {
        let root = tempfile::tempdir().expect("should create temp dir");
        let content_dir = root.path().join("content");
        std::fs::create_dir(&content_dir).expect("should create content dir");
        let text_file = content_dir.join("readme.txt");
        std::fs::write(text_file, "not html").expect("should write file");

        let detected = is_webhelp_dir(root.path());

        assert!(
            !detected,
            "directory with content/ but no HTML files should not be detected as webhelp"
        );
    }

    /// A `content/` subdirectory holding an `.html` file must be detected.
    #[test]
    fn test_is_webhelp_dir_returns_true_for_webhelp_layout() {
        let root = tempfile::tempdir().expect("should create temp dir");
        let content_dir = root.path().join("content");
        std::fs::create_dir(&content_dir).expect("should create content dir");
        let html_file = content_dir.join("index.html");
        std::fs::write(html_file, "<p>hello</p>").expect("should write html");

        let detected = is_webhelp_dir(root.path());

        assert!(
            detected,
            "directory with content/*.html should be detected as webhelp"
        );
    }
}