#[cfg(feature = "convert")]
pub mod chm;
#[cfg(feature = "convert")]
pub mod cleaning;
#[cfg(feature = "convert")]
pub mod html;
pub mod naming;
#[cfg(feature = "convert")]
pub mod pdf;
#[cfg(feature = "convert")]
pub mod webhelp;
#[cfg(feature = "convert")]
use std::path::{Path, PathBuf};
#[cfg(feature = "convert")]
use anyhow::Context;
/// Finds a Python interpreter by probing well-known command names.
///
/// The candidates `python3`, `python` and `py` are tried in that order. Each one is
/// run as `<name> --version` with stdout and stderr discarded; the first candidate
/// that can be spawned and exits successfully is returned.
///
/// # Errors
///
/// Returns an error carrying installation hints when no candidate responds.
pub fn find_python() -> anyhow::Result<String> {
    /// Returns `true` when `program --version` could be spawned and exited successfully.
    fn responds(program: &str) -> bool {
        std::process::Command::new(program)
            .arg("--version")
            .stdout(std::process::Stdio::null())
            .stderr(std::process::Stdio::null())
            .status()
            // A spawn failure (e.g. command not found) counts as "not available".
            .map(|exit| exit.success())
            .unwrap_or(false)
    }

    const CANDIDATES: [&str; 3] = ["python3", "python", "py"];

    CANDIDATES
        .iter()
        .find(|candidate| responds(candidate))
        .map(|found| found.to_string())
        .ok_or_else(|| {
            anyhow::anyhow!(
                "Python not found. Install `python3` (Linux: `sudo apt install python3`, macOS: `brew install python`)"
            )
        })
}
/// Source document formats known to the converter.
///
/// Each variant has exactly one row in `FORMAT_TABLE`, which supplies its display
/// name, its file extensions and (for file-based formats) its converter function.
#[cfg(feature = "convert")]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocFormat {
/// PDF file, detected by the `pdf` extension.
Pdf,
/// HTML file, detected by the `html` or `htm` extension.
Html,
/// CHM file, detected by the `chm` extension.
Chm,
/// Markdown file, detected by the `md` or `markdown` extension; read as-is.
Markdown,
/// Directory-based web help. It has no file extension and no per-file converter;
/// it is recognised via `webhelp::is_webhelp_dir` and converted as a whole directory.
WebHelp,
}
/// Signature shared by all per-file converters: takes the path of the source file
/// and returns the converted text (treated as raw Markdown by `convert_file`) or an error.
#[cfg(feature = "convert")]
type FileConverter = fn(&Path) -> anyhow::Result<String>;
/// One row of `FORMAT_TABLE`: everything the module knows about a single `DocFormat`.
#[cfg(feature = "convert")]
struct FormatEntry {
// The format this row describes.
variant: DocFormat,
// Human-readable name; used by the `Display` impl and in error messages.
display_name: &'static str,
// File extensions (without the dot) that map to this format in `detect_format`.
// The rows in this file list them in lowercase.
extensions: &'static [&'static str],
// Per-file converter, or `None` for formats that are converted as a directory.
converter: Option<FileConverter>,
}
/// Single source of truth that maps each `DocFormat` to its display name, file
/// extensions and converter.
///
/// `detect_format` returns the first row whose extension list contains the file's
/// lowercased extension, so row order only matters if two rows ever share an extension.
#[cfg(feature = "convert")]
static FORMAT_TABLE: &[FormatEntry] = &[
FormatEntry {
variant: DocFormat::Pdf,
display_name: "PDF",
extensions: &["pdf"],
converter: Some(pdf::pdf_to_markdown),
},
FormatEntry {
variant: DocFormat::Html,
display_name: "HTML",
extensions: &["html", "htm"],
converter: Some(html::html_file_to_markdown),
},
FormatEntry {
variant: DocFormat::Chm,
display_name: "CHM",
extensions: &["chm"],
converter: Some(chm::chm_to_markdown),
},
FormatEntry {
variant: DocFormat::Markdown,
display_name: "Markdown",
extensions: &["md", "markdown"],
converter: Some(markdown_passthrough),
},
// Directory-based format: with no extensions this row is never returned by
// `detect_format`, and with no converter `convert_file` rejects it. It exists so
// that `Display` and the table-completeness test cover every variant.
FormatEntry {
variant: DocFormat::WebHelp,
display_name: "WebHelp",
extensions: &[],
converter: None,
},
];
/// Converter for files that are already Markdown: returns the file contents unchanged.
///
/// # Errors
///
/// Fails when the file cannot be stat'ed, when its reported size exceeds 100 MB,
/// or when it cannot be read as a UTF-8 string.
#[cfg(feature = "convert")]
fn markdown_passthrough(path: &Path) -> anyhow::Result<String> {
    /// Upper bound on the size of a Markdown file that will be loaded into memory.
    const MAX_FILE_SIZE: u64 = 100 * 1024 * 1024;

    let _span = tracing::info_span!("markdown_passthrough", path = %path.display()).entered();

    // Check the size reported by the filesystem before reading anything.
    let byte_len = match std::fs::metadata(path) {
        Ok(meta) => meta.len(),
        Err(e) => {
            return Err(anyhow::anyhow!(
                "Failed to stat {}: {}",
                path.display(),
                e
            ))
        }
    };
    if byte_len > MAX_FILE_SIZE {
        return Err(anyhow::anyhow!(
            "File {} exceeds {} MB size limit",
            path.display(),
            MAX_FILE_SIZE / 1024 / 1024,
        ));
    }

    match std::fs::read_to_string(path) {
        Ok(text) => Ok(text),
        Err(e) => Err(anyhow::anyhow!(
            "Failed to read {}: {}",
            path.display(),
            e
        )),
    }
}
/// Formats a `DocFormat` as the display name recorded for it in `FORMAT_TABLE`.
#[cfg(feature = "convert")]
impl std::fmt::Display for DocFormat {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = FORMAT_TABLE
            .iter()
            .find_map(|row| {
                if row.variant == *self {
                    Some(row.display_name)
                } else {
                    None
                }
            })
            // Fallback for a variant that has no table row.
            .unwrap_or("Unknown");
        f.write_str(label)
    }
}
/// Settings that control how converted documents are cleaned and written.
#[cfg(feature = "convert")]
pub struct ConvertOptions {
/// Directory the output files are written into; created on demand unless `dry_run` is set.
pub output_dir: PathBuf,
/// When `true`, an existing output file is replaced; when `false`, an existing
/// output file makes the conversion fail.
pub overwrite: bool,
/// When `true`, the conversion runs and a result is returned, but no directory is
/// created and no file is written.
pub dry_run: bool,
/// Tags forwarded to `cleaning::clean_markdown`.
/// NOTE(review): presumably names of tags to strip from the Markdown — confirm against `cleaning`.
pub clean_tags: Vec<String>,
}
/// Outcome of converting one document (a single file or a whole web help directory).
#[cfg(feature = "convert")]
pub struct ConvertResult {
/// Path of the input file or directory that was converted.
pub source: PathBuf,
/// Path of the output file (`output_dir` joined with the generated file name).
/// In dry-run mode this path is reported but nothing is written to it.
pub output: PathBuf,
/// Format the input was converted from.
pub format: DocFormat,
/// Title obtained from `naming::extract_title` on the cleaned Markdown.
pub title: String,
/// Number of lines in the cleaned Markdown that start with `#`.
pub sections: usize,
}
/// Determines the document format of `path` from its file extension.
///
/// The comparison ignores ASCII case. Returns `None` when the path has no extension,
/// the extension is not valid UTF-8, or no row of `FORMAT_TABLE` lists it.
#[cfg(feature = "convert")]
pub fn detect_format(path: &Path) -> Option<DocFormat> {
    let ext = path.extension().and_then(|raw| raw.to_str())?;
    for row in FORMAT_TABLE.iter() {
        // Table extensions are all lowercase ASCII, so an ASCII-case-insensitive
        // comparison matches exactly what lowercasing `ext` first would match.
        let listed = row
            .extensions
            .iter()
            .any(|known| known.eq_ignore_ascii_case(ext));
        if listed {
            return Some(row.variant);
        }
    }
    None
}
/// Converts `path`, which may be a single document or a directory of documents.
///
/// A directory is handed to `convert_directory`; anything else is treated as a single
/// file and its result is returned as a one-element vector.
///
/// # Errors
///
/// Propagates the error of the underlying file or directory conversion.
#[cfg(feature = "convert")]
pub fn convert_path(path: &Path, opts: &ConvertOptions) -> anyhow::Result<Vec<ConvertResult>> {
    let _span = tracing::info_span!("convert_path", path = %path.display()).entered();

    if !path.is_dir() {
        let single = convert_file(path, opts)?;
        return Ok(vec![single]);
    }
    convert_directory(path, opts)
}
/// Converts a single file to cleaned Markdown and hands it to `finalize_output`.
///
/// Steps: detect the format from the extension, run the matching converter from
/// `FORMAT_TABLE`, clean the Markdown with `opts.clean_tags`, derive a title and a
/// conflict-free file name, and count the lines that start with `#`.
///
/// # Errors
///
/// Fails when the extension is not recognised, when the format has no per-file
/// converter, when the converter itself fails, or when writing the output fails.
#[cfg(feature = "convert")]
fn convert_file(path: &Path, opts: &ConvertOptions) -> anyhow::Result<ConvertResult> {
    let _span = tracing::info_span!("convert_file", path = %path.display()).entered();

    let format = match detect_format(path) {
        Some(detected) => detected,
        None => return Err(anyhow::anyhow!("Unsupported format: {}", path.display())),
    };
    let row = match FORMAT_TABLE.iter().find(|candidate| candidate.variant == format) {
        Some(found) => found,
        None => return Err(anyhow::anyhow!("Unsupported format {:?}", format)),
    };

    // Formats without a converter are directory-based and cannot be handled per file.
    let run_converter = row.converter.ok_or_else(|| {
        anyhow::anyhow!(
            "{} is a directory format — use convert_path() on the directory",
            row.display_name
        )
    })?;
    let raw_markdown = run_converter(path)?;

    let tag_refs: Vec<&str> = opts.clean_tags.iter().map(String::as_str).collect();
    let cleaned = cleaning::clean_markdown(&raw_markdown, &tag_refs);

    let title = naming::extract_title(&cleaned, path);
    let base_name = naming::title_to_filename(&title);
    let filename = naming::resolve_conflict(&base_name, path, &opts.output_dir);

    let sections = cleaned
        .lines()
        .filter(|line| line.starts_with('#'))
        .count();

    finalize_output(path, &cleaned, &filename, &title, sections, format, opts)
}
/// Writes the cleaned Markdown to `opts.output_dir/filename` and builds the result.
///
/// In dry-run mode nothing touches the filesystem; the result is returned as if the
/// file had been written. Otherwise the output directory is created, the write is
/// refused if it would land on the source file itself, and the file is written
/// either unconditionally (`opts.overwrite`) or only if it does not exist yet.
///
/// # Errors
///
/// Fails when the output directory cannot be created, when the output would overwrite
/// the source, when the output already exists and `opts.overwrite` is `false`, or when
/// the file cannot be written.
#[cfg(feature = "convert")]
fn finalize_output(
    source: &Path,
    cleaned: &str,
    filename: &str,
    title: &str,
    sections: usize,
    format: DocFormat,
    opts: &ConvertOptions,
) -> anyhow::Result<ConvertResult> {
    let result = ConvertResult {
        source: source.to_path_buf(),
        output: opts.output_dir.join(filename),
        format,
        title: title.to_string(),
        sections,
    };
    if opts.dry_run {
        return Ok(result);
    }

    let target = &result.output;
    // Shared context message for every failure while creating or writing the file.
    let write_failure = || format!("Failed to write output file: {}", target.display());

    std::fs::create_dir_all(&opts.output_dir).with_context(|| {
        format!(
            "Failed to create output directory: {}",
            opts.output_dir.display()
        )
    })?;

    // Guard against clobbering the input. If the target path itself cannot be
    // canonicalized, fall back to the canonical output directory joined with the
    // file name. The guard is skipped when either side cannot be resolved.
    let resolved_source = dunce::canonicalize(source);
    let resolved_target = dunce::canonicalize(target)
        .or_else(|_| dunce::canonicalize(&opts.output_dir).map(|dir| dir.join(filename)));
    if let (Ok(src), Ok(dst)) = (resolved_source, resolved_target) {
        if src == dst {
            tracing::warn!(path = %source.display(), "Skipping: output would overwrite source");
            anyhow::bail!(
                "Output would overwrite source file: {} (use a different --output directory)",
                source.display()
            );
        }
    }

    if opts.overwrite {
        std::fs::write(target, cleaned).with_context(write_failure)?;
    } else {
        use std::io::Write;
        // `create_new` makes "create only if absent" a single atomic open call.
        let opened = std::fs::OpenOptions::new()
            .write(true)
            .create_new(true)
            .open(target);
        let mut file = match opened {
            Ok(handle) => handle,
            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
                anyhow::bail!(
                    "Output file already exists: {} (use --overwrite to replace)",
                    target.display()
                );
            }
            Err(e) => return Err(anyhow::Error::new(e).context(write_failure())),
        };
        file.write_all(cleaned.as_bytes())
            .with_context(write_failure)?;
    }

    tracing::info!(
        source = %source.display(),
        output = %target.display(),
        title = %title,
        sections = sections,
        "Converted document"
    );
    Ok(result)
}
/// Converts every supported document found under `dir`.
///
/// * If `dir` itself is a web help directory, it is converted as one document.
/// * Otherwise each immediate child that is a web help directory is converted as one
///   document, and the rest of the tree (up to 50 levels deep) is walked for files
///   whose extension `detect_format` recognises.
///
/// Conversion is best-effort: a document that fails to convert, or a directory entry
/// that cannot be read, is logged as a warning and skipped. Only the successful
/// conversions are returned.
///
/// # Errors
///
/// Returns an error only when `dir` itself is a web help directory and converting it
/// fails; all other failures are logged and skipped.
#[cfg(feature = "convert")]
fn convert_directory(dir: &Path, opts: &ConvertOptions) -> anyhow::Result<Vec<ConvertResult>> {
    let _span = tracing::info_span!("convert_directory", dir = %dir.display()).entered();
    if webhelp::is_webhelp_dir(dir) {
        return convert_webhelp(dir, opts).map(|r| vec![r]);
    }

    let mut results = Vec::new();

    // Pass 1: find web help directories among the immediate children of `dir`.
    let mut webhelp_dirs: Vec<PathBuf> = Vec::new();
    match std::fs::read_dir(dir) {
        Ok(entries) => {
            for entry in entries.filter_map(|e| match e {
                Ok(entry) => Some(entry),
                Err(err) => {
                    tracing::warn!(error = %err, "Skipping directory entry due to read_dir error");
                    None
                }
            }) {
                let path = entry.path();
                if path.is_dir() && webhelp::is_webhelp_dir(&path) {
                    webhelp_dirs.push(path);
                }
            }
        }
        Err(e) => {
            tracing::warn!(dir = %dir.display(), error = %e, "Failed to read directory for webhelp detection");
        }
    }

    // Pass 2: convert each web help directory as a single document.
    for wh_dir in &webhelp_dirs {
        match convert_webhelp(wh_dir, opts) {
            Ok(r) => results.push(r),
            Err(e) => tracing::warn!(
                path = %wh_dir.display(),
                error = %e,
                "Failed to convert web help directory"
            ),
        }
    }

    // Pass 3: walk the remaining tree for individual files.
    const MAX_WALK_DEPTH: usize = 50;
    let walker = walkdir::WalkDir::new(dir)
        .max_depth(MAX_WALK_DEPTH)
        .into_iter()
        // Prune here rather than filtering afterwards: a rejected directory is not
        // descended into, so the web help trees handled in pass 2 are never
        // traversed and their files never reach `detect_format`.
        .filter_entry(|e| {
            !e.path_is_symlink() && !webhelp_dirs.iter().any(|wh| e.path().starts_with(wh))
        })
        .filter_map(|e| match e {
            Ok(entry) => Some(entry),
            Err(err) => {
                tracing::warn!(error = %err, "Skipping directory entry due to walkdir error");
                None
            }
        })
        .filter(|e| e.file_type().is_file())
        .filter(|e| detect_format(e.path()).is_some());

    for entry in walker {
        match convert_file(entry.path(), opts) {
            Ok(r) => results.push(r),
            Err(e) => tracing::warn!(
                path = %entry.path().display(),
                error = %e,
                "Failed to convert document"
            ),
        }
    }

    tracing::info!(
        dir = %dir.display(),
        converted = results.len(),
        "Directory conversion complete"
    );
    Ok(results)
}
/// Converts a whole web help directory into one cleaned Markdown document.
///
/// Mirrors the post-processing of `convert_file` (clean, title, file name, count of
/// lines starting with `#`), but obtains the Markdown from
/// `webhelp::webhelp_to_markdown` and always reports `DocFormat::WebHelp`.
///
/// # Errors
///
/// Fails when the directory cannot be converted or the output cannot be written.
#[cfg(feature = "convert")]
fn convert_webhelp(dir: &Path, opts: &ConvertOptions) -> anyhow::Result<ConvertResult> {
    let _span = tracing::info_span!("convert_webhelp", dir = %dir.display()).entered();

    let raw_markdown = webhelp::webhelp_to_markdown(dir)?;

    let tag_refs: Vec<&str> = opts.clean_tags.iter().map(String::as_str).collect();
    let cleaned = cleaning::clean_markdown(&raw_markdown, &tag_refs);

    let title = naming::extract_title(&cleaned, dir);
    let base_name = naming::title_to_filename(&title);
    let filename = naming::resolve_conflict(&base_name, dir, &opts.output_dir);

    let sections = cleaned
        .lines()
        .filter(|line| line.starts_with('#'))
        .count();

    let format = DocFormat::WebHelp;
    finalize_output(dir, &cleaned, &filename, &title, sections, format, opts)
}
// Every test here exercises items that exist only with the `convert` feature, so the
// whole module is gated on it. Gating only the individual tests would leave
// `use super::*;` unused (and warning) in test builds without the feature.
#[cfg(all(test, feature = "convert"))]
mod tests {
    use super::*;

    /// Every `DocFormat` variant has a table row with a non-empty display name, and
    /// every row that has a per-file converter lists at least one extension.
    #[test]
    fn test_format_table_complete() {
        // Keep in sync with the `DocFormat` enum.
        let all = [
            DocFormat::Pdf,
            DocFormat::Html,
            DocFormat::Chm,
            DocFormat::Markdown,
            DocFormat::WebHelp,
        ];
        for v in &all {
            let entry = FORMAT_TABLE
                .iter()
                .find(|e| e.variant == *v)
                .unwrap_or_else(|| panic!("FORMAT_TABLE missing entry for {:?}", v));
            assert!(
                !entry.display_name.is_empty(),
                "Empty display_name for {:?}",
                v
            );
            if entry.converter.is_some() {
                assert!(
                    !entry.extensions.is_empty(),
                    "File-based format {:?} must have at least one extension",
                    v
                );
            }
        }
    }

    /// Each extension listed for a file-based format is detected as that format, and
    /// unknown or missing extensions are detected as nothing.
    #[test]
    fn test_detect_format_roundtrips() {
        for entry in FORMAT_TABLE.iter().filter(|e| e.converter.is_some()) {
            for ext in entry.extensions {
                let path = std::path::Path::new("test").with_extension(ext);
                assert_eq!(
                    detect_format(&path),
                    Some(entry.variant),
                    "detect_format failed for .{} (expected {:?})",
                    ext,
                    entry.variant
                );
            }
        }
        assert_eq!(detect_format(std::path::Path::new("doc.rs")), None);
        assert_eq!(detect_format(std::path::Path::new("doc")), None);
    }

    /// Extension matching ignores ASCII case.
    #[test]
    fn test_detect_format_case_insensitive() {
        assert_eq!(
            detect_format(std::path::Path::new("doc.PDF")),
            Some(DocFormat::Pdf)
        );
        assert_eq!(
            detect_format(std::path::Path::new("doc.HTM")),
            Some(DocFormat::Html)
        );
    }
}