import ctypes
import json
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Union, Iterator
from . import _native as native
# Document format identifiers, re-exported from the native binding module so
# callers can reference them without importing ``_native`` themselves.
FORMAT_UNKNOWN = native.FORMAT_UNKNOWN
FORMAT_HWP5 = native.FORMAT_HWP5
FORMAT_HWPX = native.FORMAT_HWPX
FORMAT_HWP3 = native.FORMAT_HWP3
# Human-readable label for each format identifier; looked up by format_name().
_FORMAT_NAMES = {
    FORMAT_UNKNOWN: "Unknown",
    FORMAT_HWP5: "HWP 5.0",
    FORMAT_HWPX: "HWPX",
    FORMAT_HWP3: "HWP 3.x",
}
class UnhwpError(Exception):
    """Base class for every exception defined by this module."""


class FileNotFoundError(UnhwpError):
    """Module-specific "file not found" error.

    NOTE(review): this name shadows the builtin ``FileNotFoundError`` inside
    this module and is not a subclass of it (its only base is ``UnhwpError``).
    """


class ParseError(UnhwpError):
    """Raised when the native library fails to parse a document."""


class RenderError(UnhwpError):
    """Raised when the native library fails to render a parsed document."""


class UnsupportedFormatError(UnhwpError):
    """Error type for document formats that cannot be handled."""
def _get_last_error() -> str:
    """Return the last error message reported by the native library.

    Calls ``native.lib.unhwp_last_error()`` and decodes the result as UTF-8.
    When the call returns a falsy value the literal ``"Unknown error"`` is
    returned instead.

    Returns:
        The decoded error message, or ``"Unknown error"``.
    """
    message = native.lib.unhwp_last_error()
    if not message:
        return "Unknown error"
    # This helper only runs while an error is already being reported, so a
    # malformed byte sequence must not raise UnicodeDecodeError and hide the
    # original failure; undecodable bytes become U+FFFD instead.
    return message.decode("utf-8", errors="replace")
def _ptr_to_string(ptr: Optional[int]) -> Optional[str]:
if not ptr:
return None
return ctypes.string_at(ptr).decode("utf-8")
@dataclass
class Image:
    """A named binary resource taken from a parsed document."""

    # Identifier of the resource.
    name: str
    # Raw content of the resource.
    data: bytes

    def save(self, path: Union[str, Path]) -> None:
        """Write ``data`` to ``path``.

        Args:
            path: Destination file, as a string or ``Path``.
        """
        destination = Path(path)
        destination.write_bytes(self.data)
@dataclass
class RenderOptions:
    """Options that control how a parsed document is rendered.

    Only the three boolean fields are turned into native flag bits by
    ``_to_flags``; ``image_path_prefix`` and ``table_fallback`` are stored on
    the instance but are not read by any code in this class.
    """

    # Sets native.UNHWP_FLAG_FRONTMATTER when true.
    include_frontmatter: bool = False
    # Stored only; not consulted by _to_flags().
    image_path_prefix: str = ""
    # Stored only; not consulted by _to_flags().
    table_fallback: int = 0
    # Sets native.UNHWP_FLAG_PARAGRAPH_SPACING when true.
    # NOTE(review): the field name and the flag name differ; confirm against
    # the native API that this mapping is intended.
    preserve_line_breaks: bool = False
    # Sets native.UNHWP_FLAG_ESCAPE_SPECIAL when true.
    escape_special_chars: bool = True

    def _to_flags(self) -> int:
        """Combine the boolean options into the native flag bitmask.

        Returns:
            The bitwise OR of the native flag constants for every enabled
            option, or ``0`` when none is enabled.
        """
        flags = 0
        if self.include_frontmatter:
            flags |= native.UNHWP_FLAG_FRONTMATTER
        if self.escape_special_chars:
            flags |= native.UNHWP_FLAG_ESCAPE_SPECIAL
        if self.preserve_line_breaks:
            flags |= native.UNHWP_FLAG_PARAGRAPH_SPACING
        return flags
@dataclass
class CleanupOptions:
    """Settings describing how rendered output should be cleaned up.

    The alternate constructors map to the ``preset`` values 0 (minimal),
    1 (default) and 2 (aggressive); ``disabled()`` only turns ``enabled`` off.
    """

    # Whether cleanup is requested at all.
    enabled: bool = True
    # Preset level: 0, 1 or 2 (see the alternate constructors below).
    preset: int = 1
    detect_mojibake: bool = True
    preserve_frontmatter: bool = True

    @classmethod
    def minimal(cls) -> "CleanupOptions":
        """Return enabled options with ``preset=0``."""
        return cls(enabled=True, preset=0)

    @classmethod
    def default(cls) -> "CleanupOptions":
        """Return enabled options with ``preset=1``."""
        return cls(enabled=True, preset=1)

    @classmethod
    def aggressive(cls) -> "CleanupOptions":
        """Return enabled options with ``preset=2``."""
        return cls(enabled=True, preset=2)

    @classmethod
    def disabled(cls) -> "CleanupOptions":
        """Return options with ``enabled=False`` and every other default."""
        return cls(enabled=False)
class ParseResult:
    """Owner of a native document handle produced by the parser.

    The handle is released through ``native.lib.unhwp_free_document`` by
    ``close()``, which also runs on context-manager exit and from ``__del__``.
    Every accessor that calls into the native library first checks that the
    result is still open and raises ``ValueError`` otherwise.
    """

    def __init__(self, handle: int, flags: int = 0):
        """Store the native handle and the flags used when rendering Markdown.

        Args:
            handle: Native document handle to take ownership of.
            flags: Bitmask passed to ``unhwp_to_markdown``.
        """
        self._handle = handle
        self._flags = flags
        self._closed = False

    def __enter__(self) -> "ParseResult":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.close()

    def __del__(self) -> None:
        self.close()

    def close(self) -> None:
        """Release the native document; safe to call more than once."""
        # ``__del__`` can run on an instance whose ``__init__`` never executed
        # (for example when construction failed), so a missing attribute is
        # treated as "already closed" rather than raising AttributeError.
        if getattr(self, "_closed", True):
            return
        handle, self._handle = self._handle, None
        # Mark closed before calling into native code so the handle can never
        # be freed twice, and so a falsy handle still ends up closed.
        self._closed = True
        if handle:
            native.lib.unhwp_free_document(handle)

    def _ensure_open(self) -> None:
        """Raise ``ValueError`` when this result has already been closed."""
        if self._closed:
            raise ValueError("ParseResult has been closed")

    @staticmethod
    def _take_string(ptr) -> Optional[str]:
        """Decode a native string and free it with ``unhwp_free_string``."""
        try:
            return _ptr_to_string(ptr)
        finally:
            native.lib.unhwp_free_string(ptr)

    @classmethod
    def _take_rendered(cls, ptr, action: str) -> str:
        """Turn a render call's result into text, raising ``RenderError`` on null."""
        if not ptr:
            raise RenderError(f"Failed to {action}: {_get_last_error()}")
        return cls._take_string(ptr) or ""

    @staticmethod
    def _checked_count(count: int, what: str) -> int:
        """Return ``count``, raising ``UnhwpError`` when it is negative."""
        if count < 0:
            raise UnhwpError(f"Failed to get {what} count: {_get_last_error()}")
        return count

    @property
    def markdown(self) -> str:
        """Markdown rendering of the document, using the stored flags.

        Raises:
            ValueError: If the result has been closed.
            RenderError: If the native call returns a null pointer.
        """
        self._ensure_open()
        ptr = native.lib.unhwp_to_markdown(self._handle, self._flags)
        return self._take_rendered(ptr, "convert to markdown")

    @property
    def text(self) -> str:
        """Result of ``unhwp_to_text`` for the document.

        Raises:
            ValueError: If the result has been closed.
            RenderError: If the native call returns a null pointer.
        """
        self._ensure_open()
        ptr = native.lib.unhwp_to_text(self._handle)
        return self._take_rendered(ptr, "convert to text")

    @property
    def plain_text(self) -> str:
        """Result of ``unhwp_plain_text`` for the document.

        Raises:
            ValueError: If the result has been closed.
            RenderError: If the native call returns a null pointer.
        """
        self._ensure_open()
        ptr = native.lib.unhwp_plain_text(self._handle)
        return self._take_rendered(ptr, "get plain text")

    @property
    def json(self) -> str:
        """JSON rendering of the document, requested with ``UNHWP_JSON_PRETTY``.

        Raises:
            ValueError: If the result has been closed.
            RenderError: If the native call returns a null pointer.
        """
        self._ensure_open()
        ptr = native.lib.unhwp_to_json(self._handle, native.UNHWP_JSON_PRETTY)
        return self._take_rendered(ptr, "convert to JSON")

    @property
    def section_count(self) -> int:
        """Number of sections reported by the native library.

        Raises:
            ValueError: If the result has been closed.
            UnhwpError: If the native call reports a negative count.
        """
        self._ensure_open()
        count = native.lib.unhwp_section_count(self._handle)
        return self._checked_count(count, "section")

    @property
    def paragraph_count(self) -> int:
        """Currently identical to ``section_count``.

        NOTE(review): no paragraph-level native call is made here; the section
        count is returned as-is. Confirm whether that is intended.
        """
        self._ensure_open()
        return self.section_count

    @property
    def is_distribution(self) -> bool:
        """Always ``False``; no native call is made and no open check is done."""
        return False

    @property
    def image_count(self) -> int:
        """Number of resources reported by ``unhwp_resource_count``.

        Raises:
            ValueError: If the result has been closed.
            UnhwpError: If the native call reports a negative count.
        """
        self._ensure_open()
        count = native.lib.unhwp_resource_count(self._handle)
        return self._checked_count(count, "resource")

    @property
    def title(self) -> Optional[str]:
        """Document title, or ``None`` when the native call returns null."""
        self._ensure_open()
        ptr = native.lib.unhwp_get_title(self._handle)
        return self._take_string(ptr) if ptr else None

    @property
    def author(self) -> Optional[str]:
        """Document author, or ``None`` when the native call returns null."""
        self._ensure_open()
        ptr = native.lib.unhwp_get_author(self._handle)
        return self._take_string(ptr) if ptr else None

    def _resource_ids(self) -> List[str]:
        """Return the resource identifiers decoded from the native JSON list."""
        self._ensure_open()
        ids_ptr = native.lib.unhwp_get_resource_ids(self._handle)
        if not ids_ptr:
            return []
        return json.loads(self._take_string(ids_ptr) or "[]")

    def _read_resource(self, resource_id: str) -> Optional[bytes]:
        """Copy one resource's bytes out of native memory, or return ``None``."""
        size = ctypes.c_size_t(0)
        data_ptr = native.lib.unhwp_get_resource_data(
            self._handle, resource_id.encode("utf-8"), ctypes.byref(size)
        )
        # A null pointer or a zero length means "no data"; as before, nothing
        # is freed in that case.
        if not data_ptr or size.value == 0:
            return None
        try:
            # One C-level copy instead of building a Python list of ints.
            return ctypes.string_at(data_ptr, size.value)
        finally:
            # Free the native buffer even if the copy above raises.
            native.lib.unhwp_free_bytes(data_ptr, size)

    @property
    def images(self) -> "List[Image]":
        """All resources that have data, loaded eagerly into a list.

        Raises:
            ValueError: If the result has been closed.
        """
        return list(self.iter_images())

    def iter_images(self) -> "Iterator[Image]":
        """Yield resources that have data one at a time.

        This is a generator, so nothing runs until iteration starts. The
        open check is repeated before every native read so that closing the
        result mid-iteration raises ``ValueError`` instead of passing a freed
        handle to the native library.
        """
        for resource_id in self._resource_ids():
            self._ensure_open()
            data = self._read_resource(resource_id)
            if data is not None:
                yield Image(name=resource_id, data=data)
def version() -> str:
    """Return the native library's version string.

    Returns:
        The UTF-8 decoded result of ``native.lib.unhwp_version()``, or
        ``"unknown"`` when that call returns a falsy value.
    """
    raw = native.lib.unhwp_version()
    if not raw:
        return "unknown"
    return raw.decode("utf-8")
def supported_formats() -> str:
    """Return a comma-separated, human-readable list of supported formats."""
    labels = ("HWP 5.0", "HWPX")
    return ", ".join(labels)
def detect_format(path: Union[str, Path]) -> int:
    """Guess a document's format from its file extension.

    The file content is not inspected: the path must exist, and the result
    depends only on the lower-cased suffix.

    Args:
        path: Location of the document.

    Returns:
        ``FORMAT_HWP5`` for ``.hwp``, ``FORMAT_HWPX`` for ``.hwpx``, and
        ``FORMAT_UNKNOWN`` for any other suffix or a path that does not exist.
    """
    candidate = Path(str(path))
    if not candidate.exists():
        return FORMAT_UNKNOWN
    by_suffix = {
        ".hwp": FORMAT_HWP5,
        ".hwpx": FORMAT_HWPX,
    }
    return by_suffix.get(candidate.suffix.lower(), FORMAT_UNKNOWN)
def format_name(fmt: int) -> str:
    """Return the display name for a format identifier.

    Args:
        fmt: One of the ``FORMAT_*`` identifiers.

    Returns:
        The label stored in ``_FORMAT_NAMES``, or ``"Unknown"`` when ``fmt``
        is not a key of that table.
    """
    try:
        return _FORMAT_NAMES[fmt]
    except KeyError:
        return "Unknown"
def parse(
    path: Union[str, Path],
    *,
    render_options: Optional[RenderOptions] = None,
) -> ParseResult:
    """Parse the document at ``path`` with the native library.

    Args:
        path: Location of the document; passed to the native parser as the
            UTF-8 encoding of ``str(path)``.
        render_options: Rendering options whose flags are stored on the
            result. A default ``RenderOptions()`` is used when omitted.

    Returns:
        A ``ParseResult`` owning the native document handle.

    Raises:
        ParseError: If the native parser returns a null handle.
    """
    encoded_path = str(path).encode("utf-8")
    options = render_options or RenderOptions()
    flags = options._to_flags()
    handle = native.lib.unhwp_parse_file(encoded_path)
    if handle:
        return ParseResult(handle, flags)
    raise ParseError(f"Failed to parse {path}: {_get_last_error()}")
def parse_bytes(
    data: bytes,
    *,
    render_options: Optional[RenderOptions] = None,
) -> ParseResult:
    """Parse a document held in memory with the native library.

    Args:
        data: Raw document content. It is copied into a ctypes byte array
            before being handed to the native parser.
        render_options: Rendering options whose flags are stored on the
            result. A default ``RenderOptions()`` is used when omitted.

    Returns:
        A ``ParseResult`` owning the native document handle.

    Raises:
        ParseError: If the native parser returns a null handle.
    """
    options = render_options or RenderOptions()
    flags = options._to_flags()
    size = len(data)
    buffer_type = ctypes.c_uint8 * size
    buffer = buffer_type.from_buffer_copy(data)
    handle = native.lib.unhwp_parse_bytes(buffer, size)
    if handle:
        return ParseResult(handle, flags)
    raise ParseError(f"Failed to parse bytes: {_get_last_error()}")
def to_markdown(path: Union[str, Path]) -> str:
    """Parse ``path`` with default options and return its Markdown rendering.

    The parsed document is closed before this function returns.
    """
    with parse(path) as document:
        rendered = document.markdown
    return rendered
def to_markdown_with_cleanup(
    path: Union[str, Path],
    cleanup_options: Optional[CleanupOptions] = None,
) -> str:
    """Return the Markdown rendering of the document at ``path``.

    NOTE(review): ``cleanup_options`` is accepted but never read; the result
    is exactly what ``to_markdown(path)`` returns.
    """
    markdown = to_markdown(path)
    return markdown
def extract_text(path: Union[str, Path]) -> str:
    """Parse ``path`` with default options and return its ``text`` rendering.

    The parsed document is closed before this function returns.
    """
    with parse(path) as document:
        extracted = document.text
    return extracted