use super::{decode_euckr, Hwp3Header};
use crate::error::{Error, Result};
use crate::model::{Block, Document, InlineContent, Paragraph, Section, TextRun, TextStyle};
use flate2::read::ZlibDecoder;
use std::io::{Read, Seek, SeekFrom};
/// Returns `true` when `byte` lies in `0x81..=0xFE`, the range this parser
/// accepts as the first byte of a two-byte CP949 character.
#[inline]
fn is_cp949_lead_byte(byte: u8) -> bool {
    matches!(byte, 0x81..=0xFE)
}
/// Returns `true` when `byte` is accepted as the second byte of a two-byte
/// CP949 character: `0x41..=0x5A`, `0x61..=0x7A` or `0x81..=0xFE`.
#[inline]
fn is_cp949_trail_byte(byte: u8) -> bool {
    matches!(byte, 0x41..=0x5A | 0x61..=0x7A | 0x81..=0xFE)
}
/// Byte values that the body parser treats specially instead of as plain text.
mod control {
    /// Ends the current paragraph (carriage return).
    pub const PARA_END: u8 = b'\r';
    /// Inserts a line break inside the current paragraph (line feed).
    pub const LINE_BREAK: u8 = b'\n';
    /// Emitted into the text as an ordinary space.
    pub const HARD_SPACE: u8 = 0xA0;
    /// Emitted into the text as a tab character.
    pub const TAB: u8 = b'\t';
    /// Introduces a two-byte control sequence; the following byte is the code.
    pub const CTRL_START: u8 = 0x1B;
    /// Control code (after `CTRL_START`) that toggles bold.
    pub const BOLD: u8 = 0x01;
    /// Control code (after `CTRL_START`) that toggles italic.
    pub const ITALIC: u8 = 0x02;
    /// Control code (after `CTRL_START`) that toggles underline.
    pub const UNDERLINE: u8 = 0x03;
}
/// Parser for the body of an HWP3 file, configured from the file header.
#[derive(Debug, Clone)]
pub struct BodyParser {
    /// Whether the body bytes must be decompressed before they are parsed.
    compressed: bool,
    /// Absolute offset of the body from the start of the reader; `0` makes
    /// `parse` fall back to a fixed default offset.
    body_offset: u32,
    /// Number of body bytes to read; `0` makes `parse` read to the end of the reader.
    body_size: u32,
}
impl BodyParser {
    /// Offset used when the header reports a body offset of zero.
    // NOTE(review): presumably the size of the fixed HWP3 file header — confirm against the format spec.
    const DEFAULT_BODY_OFFSET: u64 = 128;

    /// Creates a parser from the header's compression flag, body offset and body size.
    pub fn new(header: &Hwp3Header) -> Self {
        Self {
            compressed: header.compressed,
            body_offset: header.body_offset,
            body_size: header.body_size,
        }
    }

    /// Reads the body from `reader`, decompresses it if the header said so,
    /// parses it and appends the resulting section to `document`.
    ///
    /// A `body_size` of zero means "read until the end of the reader".
    ///
    /// # Errors
    ///
    /// Returns an error if seeking or reading fails, if the reader holds fewer
    /// bytes than the header's `body_size`, or if decompression fails.
    pub fn parse<R: Read + Seek>(&self, reader: &mut R, document: &mut Document) -> Result<()> {
        let start = if self.body_offset == 0 {
            Self::DEFAULT_BODY_OFFSET
        } else {
            u64::from(self.body_offset)
        };
        reader.seek(SeekFrom::Start(start))?;

        let mut body_data = Vec::new();
        if self.body_size > 0 {
            // The size comes from the file header and is therefore untrusted:
            // read through a bounded adapter so memory grows with the bytes
            // actually present instead of pre-allocating up to 4 GiB.
            let declared = u64::from(self.body_size);
            reader.by_ref().take(declared).read_to_end(&mut body_data)?;
            if (body_data.len() as u64) < declared {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::UnexpectedEof,
                    "HWP3 body is shorter than the size declared in the header",
                )
                .into());
            }
        } else {
            reader.read_to_end(&mut body_data)?;
        }

        let content = if self.compressed && !body_data.is_empty() {
            decompress_body(&body_data)?
        } else {
            body_data
        };

        let section = self.parse_content(&content)?;
        document.sections.push(section);
        Ok(())
    }

    /// Converts raw body bytes into a [`Section`] of paragraphs.
    ///
    /// Text bytes are buffered and turned into styled runs whenever a
    /// paragraph end, line break or control sequence is met. Paragraphs with
    /// no content are not emitted.
    fn parse_content(&self, data: &[u8]) -> Result<Section> {
        let mut section = Section::new(0);
        let mut current_para = Paragraph::default();
        let mut current_text = Vec::new();
        let mut current_style = TextStyle::default();
        let mut i = 0;

        while i < data.len() {
            let byte = data[i];
            match byte {
                control::PARA_END => {
                    flush_text(&mut current_text, &current_style, &mut current_para);
                    if !current_para.content.is_empty() {
                        section
                            .content
                            .push(Block::Paragraph(std::mem::take(&mut current_para)));
                    }
                    i += 1;
                }
                control::LINE_BREAK => {
                    flush_text(&mut current_text, &current_style, &mut current_para);
                    current_para.content.push(InlineContent::LineBreak);
                    i += 1;
                }
                control::TAB => {
                    current_text.push(b'\t');
                    i += 1;
                }
                // This arm precedes the lead-byte check below, so 0xA0 is never
                // treated as a CP949 lead byte even though it is inside that range.
                // NOTE(review): confirm this precedence is intended.
                control::HARD_SPACE => {
                    current_text.push(b' ');
                    i += 1;
                }
                control::CTRL_START => {
                    // Flush first so the style change applies only to the text that follows.
                    flush_text(&mut current_text, &current_style, &mut current_para);
                    let Some(&ctrl_code) = data.get(i + 1) else {
                        // Dangling escape byte at the very end of the data: skip it.
                        i += 1;
                        continue;
                    };
                    match ctrl_code {
                        control::BOLD => current_style.bold = !current_style.bold,
                        control::ITALIC => current_style.italic = !current_style.italic,
                        control::UNDERLINE => current_style.underline = !current_style.underline,
                        // Unknown control codes are skipped together with the escape byte.
                        _ => {}
                    }
                    i += 2;
                }
                _ if is_cp949_lead_byte(byte) => match data.get(i + 1) {
                    Some(&second) if is_cp949_trail_byte(second) => {
                        current_text.push(byte);
                        current_text.push(second);
                        i += 2;
                    }
                    // A lead byte without a valid trail byte is dropped; the
                    // following byte is re-examined on the next iteration.
                    _ => i += 1,
                },
                _ => {
                    // Keep everything from 0x20 upwards except DEL; other bytes are discarded.
                    if byte >= 0x20 && byte != 0x7F {
                        current_text.push(byte);
                    }
                    i += 1;
                }
            }
        }

        // Emit whatever is left when the data does not end with a paragraph terminator.
        flush_text(&mut current_text, &current_style, &mut current_para);
        if !current_para.content.is_empty() {
            section.content.push(Block::Paragraph(current_para));
        }
        Ok(section)
    }
}
/// Decodes the buffered bytes with `decode_euckr`, appends them to `para` as a
/// text run carrying a clone of `style`, and empties the buffer.
///
/// Nothing is appended when the buffer is empty or decodes to an empty string.
fn flush_text(text_bytes: &mut Vec<u8>, style: &TextStyle, para: &mut Paragraph) {
    if text_bytes.is_empty() {
        return;
    }
    let decoded = decode_euckr(text_bytes);
    if !decoded.is_empty() {
        para.content
            .push(InlineContent::Text(TextRun::with_style(decoded, style.clone())));
    }
    text_bytes.clear();
}
fn decompress_body(data: &[u8]) -> Result<Vec<u8>> {
let mut decoder = ZlibDecoder::new(data);
let mut decompressed = Vec::new();
decoder
.read_to_end(&mut decompressed)
.map_err(|e| Error::Decompression(e.to_string()))?;
Ok(decompressed)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two-byte encoded Korean text followed by a paragraph end yields one
    /// paragraph holding one decoded text run.
    #[test]
    fn test_body_parser_simple() {
        let header = Hwp3Header::default();
        let parser = BodyParser::new(&header);
        let data = [0xC5, 0xD7, 0xBD, 0xBA, 0xC6, 0xAE, 0x0D];
        let section = parser.parse_content(&data).unwrap();
        assert_eq!(section.content.len(), 1);
        let Block::Paragraph(p) = &section.content[0] else {
            unreachable!("Expected Paragraph block, got {:?}", section.content[0]);
        };
        assert_eq!(p.content.len(), 1);
        let InlineContent::Text(run) = &p.content[0] else {
            unreachable!("Expected Text inline, got {:?}", p.content[0]);
        };
        assert_eq!(run.text, "테스트");
    }

    /// Plain ASCII followed by a paragraph end yields one paragraph holding
    /// one text run. Uses `let … else` so a wrong block or inline kind fails
    /// the test instead of silently skipping the assertion.
    #[test]
    fn test_body_parser_ascii() {
        let header = Hwp3Header::default();
        let parser = BodyParser::new(&header);
        let data = b"Hello World\x0D";
        let section = parser.parse_content(data).unwrap();
        assert_eq!(section.content.len(), 1);
        let Block::Paragraph(p) = &section.content[0] else {
            unreachable!("Expected Paragraph block, got {:?}", section.content[0]);
        };
        assert_eq!(p.content.len(), 1);
        let InlineContent::Text(run) = &p.content[0] else {
            unreachable!("Expected Text inline, got {:?}", p.content[0]);
        };
        assert_eq!(run.text, "Hello World");
    }
}