use std::collections::HashMap;
use std::fs;
use std::path::Path;
/// Upper bound, in bytes (256 MiB), on the size of a Markdown input file that
/// is accepted before its contents are read into memory.
const MAX_MD_FILE_SIZE: u64 = 256 * 1024 * 1024;
use crate::error::Hwp2MdError;
use crate::hwp;
use crate::hwpx;
use crate::ir;
use crate::md;
/// Converts an HWP 5.0 (`.hwp`) or HWPX (`.hwpx`) document to Markdown.
///
/// The parser is selected from the lower-cased extension of `input`. When
/// `assets_dir` is given, the document's assets are written there before the
/// Markdown is rendered. The rendered text goes to `output` (its parent
/// directory is created if needed) or to standard output when `output` is
/// `None`. `frontmatter` is forwarded to the Markdown writer.
///
/// # Errors
///
/// Returns [`Hwp2MdError::UnsupportedFormat`] for any other extension and
/// propagates parser and filesystem errors.
pub fn to_markdown(
    input: &Path,
    output: Option<&Path>,
    assets_dir: Option<&Path>,
    frontmatter: bool,
) -> Result<(), Hwp2MdError> {
    let ext = input
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    let doc = if ext == "hwp" {
        tracing::info!("Parsing HWP 5.0: {:?}", input);
        hwp::read_hwp(input)?
    } else if ext == "hwpx" {
        tracing::info!("Parsing HWPX: {:?}", input);
        hwpx::read_hwpx(input)?
    } else {
        return Err(Hwp2MdError::UnsupportedFormat(format!(
            ".{ext}. Expected .hwp or .hwpx"
        )));
    };

    // Assets are extracted first so a failure there aborts before any
    // Markdown is emitted.
    if let Some(dir) = assets_dir {
        write_assets(&doc, dir)?;
    }

    let markdown = md::write_markdown(&doc, frontmatter);

    if let Some(dest) = output {
        if let Some(parent) = dest.parent() {
            fs::create_dir_all(parent)?;
        }
        fs::write(dest, &markdown)?;
        tracing::info!("Written to {:?}", dest);
    } else {
        print!("{markdown}");
    }

    Ok(())
}
pub fn to_hwpx(
input: &Path,
output: Option<&Path>,
style: Option<&Path>,
) -> Result<(), Hwp2MdError> {
let ext = input
.extension()
.and_then(|e| e.to_str())
.unwrap_or("")
.to_lowercase();
if ext != "md" && ext != "markdown" {
return Err(Hwp2MdError::UnsupportedFormat(format!(
"Expected .md or .markdown file, got .{ext}"
)));
}
let content = fs::read_to_string(input)?;
let doc = md::parse_markdown(&content);
let out_path = output.map_or_else(
|| input.with_extension("hwpx"),
std::path::Path::to_path_buf,
);
if let Some(parent) = out_path.parent() {
fs::create_dir_all(parent)?;
}
hwpx::write_hwpx(&doc, &out_path, style)?;
tracing::info!("Written to {:?}", out_path);
Ok(())
}
/// Parses an HWP 5.0 or HWPX file and prints a summary of it to standard
/// output.
///
/// # Errors
///
/// Returns [`Hwp2MdError::UnsupportedFormat`] when the lower-cased extension
/// of `input` is neither `hwp` nor `hwpx`, and propagates parser errors.
pub fn show_info(input: &Path) -> Result<(), Hwp2MdError> {
    let ext = input
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    // Both readers produce the same document type, so parse first and
    // print once.
    let doc = match ext.as_str() {
        "hwp" => hwp::read_hwp(input)?,
        "hwpx" => hwpx::read_hwpx(input)?,
        _ => return Err(Hwp2MdError::UnsupportedFormat(format!(".{ext}"))),
    };
    print_info(&doc, input);
    Ok(())
}
/// Converts `input` to `output`, inferring the direction from the two file
/// extensions.
///
/// `.hwp`/`.hwpx` → `.md`/`.markdown` runs [`to_markdown`] (no asset
/// extraction, no front matter); `.md`/`.markdown` → `.hwpx` runs
/// [`to_hwpx`] (no style file).
///
/// # Errors
///
/// * [`Hwp2MdError::OutputExists`] if `output` already exists and `force` is
///   `false`.
/// * [`Hwp2MdError::UnsupportedFormat`] if the extension pair matches neither
///   supported direction.
/// * Any error from the selected conversion.
pub fn convert_auto(input: &Path, output: &Path, force: bool) -> Result<(), Hwp2MdError> {
    // `force` is tested first so the filesystem is only probed when needed.
    if !force && output.exists() {
        return Err(Hwp2MdError::OutputExists {
            path: output.to_path_buf(),
        });
    }

    let lower_ext = |p: &Path| -> String {
        p.extension()
            .and_then(|e| e.to_str())
            .unwrap_or("")
            .to_lowercase()
    };
    let in_ext = lower_ext(input);
    let out_ext = lower_ext(output);

    let source = classify_format(&in_ext);
    let target = classify_format(&out_ext);

    let document_to_markdown =
        matches!(source, FormatKind::Hwp | FormatKind::Hwpx) && target == FormatKind::Markdown;
    if document_to_markdown {
        return to_markdown(input, Some(output), None, false);
    }

    if source == FormatKind::Markdown && target == FormatKind::Hwpx {
        return to_hwpx(input, Some(output), None);
    }

    Err(Hwp2MdError::UnsupportedFormat(format!(
        "cannot infer conversion direction from .{in_ext} -> .{out_ext}; \
         expected .hwp/.hwpx -> .md/.markdown or .md/.markdown -> .hwpx"
    )))
}
/// Coarse file-format category derived from a file extension by
/// `classify_format`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum FormatKind {
/// Extension `hwp`.
Hwp,
/// Extension `hwpx`.
Hwpx,
/// Extension `md` or `markdown`.
Markdown,
/// Any other extension, including none.
Unknown,
}
/// Maps a file extension to its [`FormatKind`].
///
/// The comparison is exact (case-sensitive); callers pass an already
/// lower-cased extension without the leading dot. Anything not listed maps
/// to [`FormatKind::Unknown`].
fn classify_format(ext: &str) -> FormatKind {
    const KNOWN: [(&str, FormatKind); 4] = [
        ("hwp", FormatKind::Hwp),
        ("hwpx", FormatKind::Hwpx),
        ("md", FormatKind::Markdown),
        ("markdown", FormatKind::Markdown),
    ];

    KNOWN
        .iter()
        .find(|(name, _)| *name == ext)
        .map_or(FormatKind::Unknown, |&(_, kind)| kind)
}
/// Validates that `input` can be parsed, without producing any output file.
///
/// `.hwp` and `.hwpx` files are run through their readers. `.md` and
/// `.markdown` files are size-checked against `MAX_MD_FILE_SIZE`, read as
/// UTF-8 text and parsed. The parsed result is discarded in every case.
///
/// # Errors
///
/// * [`Hwp2MdError::UnsupportedFormat`] for any other extension.
/// * [`Hwp2MdError::FileTooLarge`] for a Markdown file over the size limit.
/// * Parser and filesystem errors are propagated.
pub fn check(input: &Path) -> Result<(), Hwp2MdError> {
    let ext = input
        .extension()
        .and_then(|e| e.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    if ext == "hwp" {
        tracing::info!("Checking HWP 5.0: {:?}", input);
        hwp::read_hwp(input)?;
    } else if ext == "hwpx" {
        tracing::info!("Checking HWPX: {:?}", input);
        hwpx::read_hwpx(input)?;
    } else if ext == "md" || ext == "markdown" {
        tracing::info!("Checking Markdown: {:?}", input);

        // Reject oversized files before reading them into memory.
        let file_size = fs::metadata(input)?.len();
        if file_size > MAX_MD_FILE_SIZE {
            return Err(Hwp2MdError::FileTooLarge {
                path: input.to_path_buf(),
                size: file_size,
                limit: MAX_MD_FILE_SIZE,
            });
        }

        let content = fs::read_to_string(input)?;
        let _doc = md::parse_markdown(&content);
    } else {
        return Err(Hwp2MdError::UnsupportedFormat(format!(
            ".{ext}. Expected .hwp, .hwpx, .md, or .markdown"
        )));
    }

    Ok(())
}
fn print_info(doc: &ir::Document, path: &Path) {
println!("File: {}", path.display());
println!(
"Format: {}",
path.extension()
.and_then(|e| e.to_str())
.unwrap_or("unknown")
);
if let Some(ref title) = doc.metadata.title {
println!("Title: {title}");
}
if let Some(ref author) = doc.metadata.author {
println!("Author: {author}");
}
println!("Sections: {}", doc.sections.len());
let block_count: usize = doc.sections.iter().map(|s| s.blocks.len()).sum();
println!("Blocks: {block_count}");
let char_count: usize = doc
.sections
.iter()
.flat_map(|s| &s.blocks)
.map(count_chars)
.sum();
println!("Characters: ~{char_count}");
println!("Assets: {}", doc.assets.len());
}
/// Counts the Unicode scalar values of text contained in `block`.
///
/// Headings and paragraphs contribute their inline text; code blocks and
/// math contribute their source text; block quotes, lists, tables and
/// footnotes are counted recursively. Images, horizontal rules and page
/// breaks count as zero.
fn count_chars(block: &ir::Block) -> usize {
    /// Sums `count_chars` over a sequence of blocks.
    fn total<'a>(blocks: impl Iterator<Item = &'a ir::Block>) -> usize {
        blocks.map(count_chars).sum()
    }

    match block {
        ir::Block::Heading { inlines, .. } | ir::Block::Paragraph { inlines } => inlines
            .iter()
            .fold(0, |acc, inline| acc + inline.text.chars().count()),
        ir::Block::CodeBlock { code, .. } => code.chars().count(),
        ir::Block::Math { tex, .. } => tex.chars().count(),
        ir::Block::BlockQuote { blocks } => total(blocks.iter()),
        ir::Block::Footnote { content, .. } => total(content.iter()),
        ir::Block::List { items, .. } => {
            total(items.iter().flat_map(|item| &item.blocks))
        }
        ir::Block::Table { rows, .. } => total(
            rows.iter()
                .flat_map(|row| &row.cells)
                .flat_map(|cell| &cell.blocks),
        ),
        ir::Block::Image { .. } | ir::Block::HorizontalRule | ir::Block::PageBreak => 0,
    }
}
/// Reduces an untrusted asset name to a single safe file name.
///
/// Steps, in order:
///
/// 1. Keep only the final path component, which discards directory parts.
/// 2. Replace `/`, `\` and ASCII control characters (U+0000–U+001F, U+007F)
///    with `_`.
/// 3. Strip trailing dots and spaces.
/// 4. Prefix `_` when the name refers to a reserved Windows device (`CON`,
///    `PRN`, `AUX`, `NUL`, `COM1`–`COM9`, `LPT1`–`LPT9`, compared
///    case-insensitively). The device name is everything before the *first*
///    dot, so `NUL.tar.gz` is caught as well as `NUL.txt`.
/// 5. Fall back to `"asset"` when nothing is left.
#[must_use]
pub fn sanitize_asset_name(raw: &str) -> String {
    const RESERVED: &[&str] = &[
        "CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7",
        "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
    ];

    let cleaned: String = Path::new(raw)
        .file_name()
        .map(|s| s.to_string_lossy().into_owned())
        .unwrap_or_default()
        .chars()
        .map(|c| {
            // `\` is not a separator on Unix, so it can survive `file_name`.
            if c == '/' || c == '\\' || c.is_ascii_control() {
                '_'
            } else {
                c
            }
        })
        .collect();

    // Trailing dots and spaces are dropped so the name is stable across
    // platforms; this also reduces dot-only names to the empty string.
    let base = cleaned.trim_end_matches(['.', ' ']);

    // Match the device name on the part before the FIRST dot: splitting on the
    // last dot would let multi-extension names such as `CON.tar.gz` through.
    let device = base.split_once('.').map_or(base, |(head, _)| head);
    let is_reserved = RESERVED.iter().any(|&r| device.eq_ignore_ascii_case(r));

    if base.is_empty() {
        "asset".to_string()
    } else if is_reserved {
        format!("_{base}")
    } else {
        base.to_string()
    }
}
/// Returns a name that has not been handed out before, recording it in
/// `seen`.
///
/// The first request for `name` returns it unchanged. Later requests return
/// `"{stem} ({n}).{ext}"` (or `"{name} ({n})"` when there is no usable
/// stem/extension pair), with `n` starting at the occurrence count.
///
/// Generated names are recorded in `seen` too, and `n` is advanced past any
/// suffix that is already taken. Without this, the inputs `a.png`, `a.png`,
/// `a (2).png` would yield `a (2).png` twice and the second file would
/// overwrite the first.
///
/// `seen` maps every name handed out so far to a counter: for a requested
/// name it is the highest suffix tried, for a generated name it starts at 1.
fn next_available_name(name: &str, seen: &mut HashMap<String, u32>) -> String {
    let occurrence = {
        let slot = seen.entry(name.to_string()).or_insert(0);
        *slot += 1;
        *slot
    };
    if occurrence == 1 {
        return name.to_string();
    }

    let path = std::path::Path::new(name);
    let parts = match (
        path.file_stem().and_then(|s| s.to_str()),
        path.extension().and_then(|e| e.to_str()),
    ) {
        (Some(stem), Some(ext)) if !stem.is_empty() => Some((stem, ext)),
        _ => None,
    };

    let mut n = occurrence;
    loop {
        let candidate = match parts {
            Some((stem, ext)) => format!("{stem} ({n}).{ext}"),
            None => format!("{name} ({n})"),
        };
        if !seen.contains_key(&candidate) {
            // Reserve the generated name so a later asset with that literal
            // name is itself renamed.
            seen.insert(candidate.clone(), 1);
            // Remember how far we advanced so the next duplicate of `name`
            // resumes after this suffix.
            if let Some(slot) = seen.get_mut(name) {
                *slot = n;
            }
            return candidate;
        }
        n += 1;
    }
}
/// Writes every asset of `doc` into `dir`.
///
/// Does nothing (and does not create `dir`) when the document has no assets.
/// Each asset name is passed through `sanitize_asset_name` and then
/// `next_available_name`, so repeated names within one document get distinct
/// output files.
///
/// # Errors
///
/// Propagates any filesystem error from creating `dir` or writing a file;
/// assets written before the failure are left in place.
fn write_assets(doc: &ir::Document, dir: &Path) -> Result<(), Hwp2MdError> {
    if doc.assets.is_empty() {
        return Ok(());
    }
    fs::create_dir_all(dir)?;

    let mut used_names: HashMap<String, u32> = HashMap::new();
    doc.assets
        .iter()
        .try_for_each(|asset| -> Result<(), Hwp2MdError> {
            let safe_name = sanitize_asset_name(&asset.name);
            let unique_name = next_available_name(&safe_name, &mut used_names);
            let target = dir.join(&unique_name);
            fs::write(&target, &asset.data)?;
            tracing::info!("Extracted: {:?}", target);
            Ok(())
        })
}
/// Builder-style options for a single conversion whose direction is inferred
/// from the input and output file extensions.
#[derive(Debug)]
pub struct ConvertOptions<'a> {
/// File to convert.
input: &'a Path,
/// Destination file.
output: &'a Path,
/// Directory passed to `to_markdown` for asset extraction (document → Markdown only).
assets_dir: Option<&'a Path>,
/// Front-matter flag passed to `to_markdown` (document → Markdown only).
frontmatter: bool,
/// Style path passed to `to_hwpx` (Markdown → HWPX only).
style: Option<&'a Path>,
/// When `false`, `execute` refuses to run if `output` already exists.
force: bool,
}
impl<'a> ConvertOptions<'a> {
    /// Creates options for converting `input` to `output` with no asset
    /// directory, no front matter, no style file and `force` disabled.
    #[must_use]
    pub fn new(input: &'a Path, output: &'a Path) -> Self {
        Self {
            input,
            output,
            assets_dir: None,
            frontmatter: false,
            style: None,
            force: false,
        }
    }

    /// Sets the directory that assets are extracted into.
    #[must_use]
    pub fn assets_dir(self, dir: &'a Path) -> Self {
        Self {
            assets_dir: Some(dir),
            ..self
        }
    }

    /// Sets the front-matter flag forwarded to the Markdown conversion.
    #[must_use]
    pub fn frontmatter(self, enabled: bool) -> Self {
        Self {
            frontmatter: enabled,
            ..self
        }
    }

    /// Sets the style path forwarded to the HWPX conversion.
    #[must_use]
    pub fn style(self, path: &'a Path) -> Self {
        Self {
            style: Some(path),
            ..self
        }
    }

    /// Sets whether an existing output file may be replaced.
    #[must_use]
    pub fn force(self, enabled: bool) -> Self {
        Self {
            force: enabled,
            ..self
        }
    }

    /// Runs the conversion described by these options.
    ///
    /// `.hwp`/`.hwpx` → `.md`/`.markdown` calls `to_markdown` with the asset
    /// directory and front-matter flag; `.md`/`.markdown` → `.hwpx` calls
    /// `to_hwpx` with the style path.
    ///
    /// # Errors
    ///
    /// * [`Hwp2MdError::OutputExists`] if the output exists and `force` is
    ///   `false`.
    /// * [`Hwp2MdError::UnsupportedFormat`] if the extension pair matches
    ///   neither supported direction.
    /// * Any error from the selected conversion.
    pub fn execute(self) -> Result<(), Hwp2MdError> {
        let Self {
            input,
            output,
            assets_dir,
            frontmatter,
            style,
            force,
        } = self;

        // `force` is tested first so the filesystem is only probed when needed.
        if !force && output.exists() {
            return Err(Hwp2MdError::OutputExists {
                path: output.to_path_buf(),
            });
        }

        let lower_ext = |p: &Path| -> String {
            p.extension()
                .and_then(|e| e.to_str())
                .unwrap_or("")
                .to_lowercase()
        };
        let in_ext = lower_ext(input);
        let out_ext = lower_ext(output);

        let source = classify_format(&in_ext);
        let target = classify_format(&out_ext);

        if matches!(source, FormatKind::Hwp | FormatKind::Hwpx) && target == FormatKind::Markdown {
            return to_markdown(input, Some(output), assets_dir, frontmatter);
        }
        if source == FormatKind::Markdown && target == FormatKind::Hwpx {
            return to_hwpx(input, Some(output), style);
        }

        Err(Hwp2MdError::UnsupportedFormat(format!(
            "cannot infer conversion direction from .{in_ext} -> .{out_ext}; \
             expected .hwp/.hwpx -> .md/.markdown or .md/.markdown -> .hwpx"
        )))
    }
}
// Unit tests are split across sibling files and compiled only for test builds.
#[cfg(test)]
#[path = "convert_tests.rs"]
mod tests;
#[cfg(test)]
#[path = "convert_tests_count.rs"]
mod tests_count;
#[cfg(test)]
#[path = "convert_tests_builder.rs"]
mod tests_builder;
#[cfg(test)]
#[path = "convert_tests_sanitize.rs"]
mod tests_sanitize;