pub mod cleanup;
pub mod detect;
pub mod equation;
pub mod error;
pub mod model;
pub mod parse_options;
pub mod render;
#[cfg(feature = "ffi")]
pub mod ffi;
#[cfg(feature = "hwp5")]
pub mod hwp5;
#[cfg(feature = "hwpx")]
pub mod hwpx;
#[cfg(feature = "hwp3")]
pub mod hwp3;
#[cfg(feature = "async")]
pub mod async_api;
pub use cleanup::{cleanup, CleanupOptions};
pub use detect::{detect_format, detect_format_from_bytes, detect_format_from_path, FormatType};
pub use error::{Error, Result};
pub use model::Document;
pub use parse_options::{ErrorMode, ExtractMode, ParseOptions};
pub use render::{RenderOptions, TableFallback};
use std::io::{Read, Seek};
use std::path::Path;
pub fn parse_file(path: impl AsRef<Path>) -> Result<Document> {
let path = path.as_ref();
let format = detect_format_from_path(path)?;
match format {
#[cfg(feature = "hwp5")]
FormatType::Hwp5 => {
let mut parser = hwp5::Hwp5Parser::open(path)?;
parser.parse()
}
#[cfg(feature = "hwpx")]
FormatType::Hwpx => {
let mut parser = hwpx::HwpxParser::open(path)?;
parser.parse()
}
#[cfg(feature = "hwp3")]
FormatType::Hwp3 => {
let mut parser = hwp3::Hwp3Parser::open(path)?;
parser.parse()
}
#[cfg(not(feature = "hwp3"))]
FormatType::Hwp3 => Err(Error::UnsupportedFormat(
"HWP 3.x support requires 'hwp3' feature".into(),
)),
#[allow(unreachable_patterns)]
_ => Err(Error::UnsupportedFormat(format.to_string())),
}
}
pub fn parse_reader<R: Read + Seek>(reader: R) -> Result<Document> {
let mut buf_reader = std::io::BufReader::new(reader);
let format = detect::detect_format(&mut buf_reader)?;
match format {
#[cfg(feature = "hwp5")]
FormatType::Hwp5 => {
let mut parser = hwp5::Hwp5Parser::from_reader(buf_reader)?;
parser.parse()
}
#[cfg(feature = "hwpx")]
FormatType::Hwpx => {
let mut parser = hwpx::HwpxParser::from_reader(buf_reader)?;
parser.parse()
}
#[cfg(feature = "hwp3")]
FormatType::Hwp3 => {
let mut parser = hwp3::Hwp3Parser::from_reader(buf_reader)?;
parser.parse()
}
#[cfg(not(feature = "hwp3"))]
FormatType::Hwp3 => Err(Error::UnsupportedFormat(
"HWP 3.x support requires 'hwp3' feature".into(),
)),
#[allow(unreachable_patterns)]
_ => Err(Error::UnsupportedFormat(format.to_string())),
}
}
pub fn parse_bytes(data: &[u8]) -> Result<Document> {
let cursor = std::io::Cursor::new(data);
parse_reader(cursor)
}
pub fn extract_text(path: impl AsRef<Path>) -> Result<String> {
let document = parse_file(path)?;
Ok(document.plain_text())
}
pub fn to_markdown(path: impl AsRef<Path>) -> Result<String> {
let document = parse_file(path)?;
render::render_markdown(&document, &RenderOptions::default())
}
pub fn to_markdown_with_options(path: impl AsRef<Path>, options: &RenderOptions) -> Result<String> {
let document = parse_file(path)?;
render::render_markdown(&document, options)
}
pub struct Unhwp {
render_options: RenderOptions,
parse_options: ParseOptions,
extract_images: bool,
}
impl Default for Unhwp {
fn default() -> Self {
Self::new()
}
}
impl Unhwp {
pub fn new() -> Self {
Self {
render_options: RenderOptions::default(),
parse_options: ParseOptions::default(),
extract_images: false,
}
}
pub fn with_images(mut self, extract: bool) -> Self {
self.extract_images = extract;
self
}
pub fn with_image_dir(mut self, dir: impl Into<std::path::PathBuf>) -> Self {
self.render_options.image_dir = Some(dir.into());
self
}
pub fn with_table_fallback(mut self, fallback: TableFallback) -> Self {
self.render_options.table_fallback = fallback;
self
}
pub fn with_frontmatter(mut self) -> Self {
self.render_options.include_frontmatter = true;
self
}
pub fn lenient(mut self) -> Self {
self.parse_options = self.parse_options.lenient();
self
}
pub fn text_only(mut self) -> Self {
self.parse_options = self.parse_options.text_only();
self
}
pub fn with_memory_limit_mb(mut self, mb: usize) -> Self {
self.parse_options = self.parse_options.with_memory_limit_mb(mb);
self
}
pub fn sequential(mut self) -> Self {
self.parse_options = self.parse_options.sequential();
self
}
pub fn parse(self, path: impl AsRef<Path>) -> Result<ParsedDocument> {
let document = parse_file(path)?;
Ok(ParsedDocument {
document,
render_options: self.render_options,
extract_images: self.extract_images,
})
}
}
pub struct ParsedDocument {
document: Document,
render_options: RenderOptions,
extract_images: bool,
}
impl ParsedDocument {
pub fn document(&self) -> &Document {
&self.document
}
pub fn to_markdown(&self) -> Result<String> {
if self.extract_images {
if let Some(ref image_dir) = self.render_options.image_dir {
std::fs::create_dir_all(image_dir)?;
for (name, resource) in &self.document.resources {
let path = image_dir.join(name);
std::fs::write(path, &resource.data)?;
}
}
}
render::render_markdown(&self.document, &self.render_options)
}
pub fn to_text(&self) -> String {
self.document.plain_text()
}
pub fn section_count(&self) -> usize {
self.document.sections.len()
}
pub fn paragraph_count(&self) -> usize {
self.document.paragraph_count()
}
pub fn into_document(self) -> Document {
self.document
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_format_detection_hwp5() {
let data = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1, 0x00, 0x00];
let format = detect_format_from_bytes(&data).unwrap();
assert_eq!(format, FormatType::Hwp5);
}
#[test]
fn test_format_detection_hwpx() {
let data = [0x50, 0x4B, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00];
let format = detect_format_from_bytes(&data).unwrap();
assert_eq!(format, FormatType::Hwpx);
}
#[test]
fn test_render_options_builder() {
let options = RenderOptions::default()
.with_image_dir("./images")
.with_table_fallback(TableFallback::Html)
.with_frontmatter();
assert!(options.image_dir.is_some());
assert_eq!(options.table_fallback, TableFallback::Html);
assert!(options.include_frontmatter);
}
#[test]
fn test_format_detection_empty_data() {
let data: [u8; 0] = [];
let result = detect_format_from_bytes(&data);
assert!(result.is_err());
match result {
Err(Error::InvalidData(_)) => {} _ => panic!("Expected InvalidData error for empty data"),
}
}
#[test]
fn test_format_detection_too_short() {
let data = [0xD0, 0xCF]; let result = detect_format_from_bytes(&data);
assert!(result.is_err());
match result {
Err(Error::InvalidData(_)) => {} _ => panic!("Expected InvalidData error for data too short"),
}
}
#[test]
fn test_format_detection_unknown_magic() {
let data = [0xFF, 0xFE, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05];
let result = detect_format_from_bytes(&data);
assert!(result.is_err());
match result {
Err(Error::UnknownFormat) => {}
_ => panic!("Expected UnknownFormat error for unknown magic bytes"),
}
}
#[test]
fn test_format_detection_hwp3_signature() {
let data = b"HWP Document File V3.00 \x1A\x01\x02\x03\x04";
let format = detect_format_from_bytes(data).unwrap();
assert_eq!(format, FormatType::Hwp3);
}
#[test]
fn test_render_options_empty_image_dir() {
use std::path::PathBuf;
let options = RenderOptions::default().with_image_dir("");
assert_eq!(options.image_dir, Some(PathBuf::from("")));
}
#[test]
fn test_render_options_special_chars_in_path() {
use std::path::PathBuf;
let options = RenderOptions::default().with_image_dir("./이미지 폴더/test images");
assert_eq!(
options.image_dir,
Some(PathBuf::from("./이미지 폴더/test images"))
);
}
#[test]
fn test_document_model_empty() {
let doc = model::Document::new();
assert!(doc.sections.is_empty());
assert!(doc.resources.is_empty());
assert!(doc.metadata.title.is_none());
assert!(!doc.metadata.is_distribution);
}
#[test]
fn test_table_fallback_variants() {
let options_html = RenderOptions::default().with_table_fallback(TableFallback::Html);
let options_simplified =
RenderOptions::default().with_table_fallback(TableFallback::SimplifiedMarkdown);
let options_skip = RenderOptions::default().with_table_fallback(TableFallback::Skip);
assert_eq!(options_html.table_fallback, TableFallback::Html);
assert_eq!(
options_simplified.table_fallback,
TableFallback::SimplifiedMarkdown
);
assert_eq!(options_skip.table_fallback, TableFallback::Skip);
}
#[test]
fn test_paragraph_style_heading_level() {
use model::ParagraphStyle;
let h1 = ParagraphStyle::heading(1);
assert_eq!(h1.heading_level, 1);
assert!(h1.is_heading());
let h6 = ParagraphStyle::heading(6);
assert_eq!(h6.heading_level, 6);
let h_overflow = ParagraphStyle::heading(10);
assert_eq!(h_overflow.heading_level, 6);
let normal = ParagraphStyle::heading(0);
assert_eq!(normal.heading_level, 0);
assert!(!normal.is_heading());
}
#[test]
fn test_resource_type_variants() {
use model::{Resource, ResourceType};
let image_resource = Resource {
resource_type: ResourceType::Image,
filename: Some("test.png".to_string()),
mime_type: Some("image/png".to_string()),
data: vec![0x89, 0x50, 0x4E, 0x47], size: 4,
};
let ole_resource = Resource {
resource_type: ResourceType::OleObject,
filename: Some("object.ole".to_string()),
mime_type: None,
data: vec![0xD0, 0xCF, 0x11, 0xE0],
size: 4,
};
let other_resource = Resource::new(ResourceType::Other, vec![0x00, 0x01]);
assert!(matches!(image_resource.resource_type, ResourceType::Image));
assert!(matches!(
ole_resource.resource_type,
ResourceType::OleObject
));
assert!(matches!(other_resource.resource_type, ResourceType::Other));
}
#[test]
fn test_style_registry_empty() {
let registry = model::StyleRegistry::new();
assert!(registry.get_char_style(0).is_none());
assert!(registry.get_para_style(0).is_none());
assert!(registry.get_bindata_filename(0).is_none());
assert!(registry.get_named_style(0).is_none());
assert!(registry.get_named_style_by_name("NonExistent").is_none());
}
#[test]
fn test_paragraph_default() {
let para = model::Paragraph::default();
assert!(para.content.is_empty());
assert!(!para.style.is_heading());
assert!(para.is_empty());
}
#[test]
fn test_text_run_with_unicode() {
let run = model::TextRun::new("한글 테스트 🎉 Unicode");
assert!(run.text.contains("한글"));
assert!(run.text.contains("🎉"));
assert!(!run.is_empty());
}
#[test]
fn test_text_run_with_style() {
use model::{TextRun, TextStyle};
let style = TextStyle {
bold: true,
italic: true,
font_size: Some(14.0),
..Default::default()
};
let run = TextRun::with_style("Formatted text", style);
assert_eq!(run.text, "Formatted text");
assert!(run.style.bold);
assert!(run.style.italic);
assert_eq!(run.style.font_size, Some(14.0));
}
#[test]
fn test_section_new() {
let section = model::Section::new(42);
assert_eq!(section.index, 42);
assert!(section.content.is_empty());
assert!(section.header.is_none());
assert!(section.footer.is_none());
}
#[test]
fn test_resource_extension() {
use model::{Resource, ResourceType};
let png = Resource::image(vec![], "image/png");
assert_eq!(png.extension(), "png");
let jpg = Resource::image(vec![], "image/jpeg");
assert_eq!(jpg.extension(), "jpg");
let unknown = Resource::new(ResourceType::Image, vec![]);
assert_eq!(unknown.extension(), "bin");
let ole = Resource::new(ResourceType::OleObject, vec![]);
assert_eq!(ole.extension(), "ole");
}
#[test]
fn test_text_style_has_formatting() {
use model::TextStyle;
let plain = TextStyle::default();
assert!(!plain.has_formatting());
let bold = TextStyle::bold();
assert!(bold.has_formatting());
let italic = TextStyle::italic();
assert!(italic.has_formatting());
let complex = TextStyle {
underline: true,
strikethrough: true,
..Default::default()
};
assert!(complex.has_formatting());
}
#[test]
fn test_cleanup_options_presets() {
use crate::cleanup::CleanupOptions;
let default = CleanupOptions::default();
assert!(default.normalize_strings);
assert!(default.clean_lines);
assert!(default.filter_structure);
let minimal = CleanupOptions::minimal();
assert!(minimal.normalize_strings);
assert!(!minimal.clean_lines); assert!(!minimal.filter_structure);
let aggressive = CleanupOptions::aggressive();
assert!(aggressive.normalize_strings);
assert!(aggressive.clean_lines);
assert!(aggressive.filter_structure);
assert!(aggressive.header_footer_threshold < default.header_footer_threshold);
}
#[test]
fn test_render_options_cleanup_chain() {
let with_cleanup = RenderOptions::default().with_cleanup();
assert!(with_cleanup.cleanup.is_some());
let with_minimal = RenderOptions::default().with_minimal_cleanup();
assert!(with_minimal.cleanup.is_some());
let with_aggressive = RenderOptions::default().with_aggressive_cleanup();
assert!(with_aggressive.cleanup.is_some());
}
#[test]
fn test_render_options_max_heading_level() {
let level_0 = RenderOptions::default().with_max_heading_level(0);
assert_eq!(level_0.max_heading_level, 1);
let level_10 = RenderOptions::default().with_max_heading_level(10);
assert_eq!(level_10.max_heading_level, 6);
let level_4 = RenderOptions::default().with_max_heading_level(4);
assert_eq!(level_4.max_heading_level, 4); }
#[test]
fn test_paragraph_plain_text_extraction() {
use model::{InlineContent, Paragraph, TextRun};
let mut para = Paragraph::new();
para.content
.push(InlineContent::Text(TextRun::new("Hello ")));
para.content.push(InlineContent::LineBreak);
para.content
.push(InlineContent::Text(TextRun::new("World")));
para.content.push(InlineContent::Link {
text: " Link".to_string(),
url: "https://example.com".to_string(),
});
let text = para.plain_text();
assert!(text.contains("Hello"));
assert!(text.contains("\n")); assert!(text.contains("World"));
assert!(text.contains("Link"));
}
}