unhwp 0.3.1

A high-performance library for extracting HWP/HWPX documents into structured Markdown
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
//! HWP 5.0 binary format parser.
//!
//! This module handles parsing of HWP 5.0 documents stored in OLE containers.

mod bodytext;
mod container;
mod control;
mod docinfo;
mod header;
mod record;

pub use container::Hwp5Container;
pub use header::FileHeader;
pub use record::{Record, RecordHeader, RecordIterator, TagId};

use crate::error::Result;
use crate::model::Document;
use crate::streaming::{ParseEvent, SectionStreamOptions};
use std::io::{Read, Seek};
use std::ops::ControlFlow;
use std::path::Path;

/// HWP 5.0 document parser.
///
/// Pairs an opened [`Hwp5Container`] with the [`FileHeader`] that was read
/// from it when the parser was constructed.
pub struct Hwp5Parser {
    /// Container from which all document streams are read.
    container: Hwp5Container,
    /// File header read from `container` at construction time; consulted for
    /// the compression, encryption, distribution and version information.
    header: FileHeader,
}

impl Hwp5Parser {
    /// Opens an HWP 5.0 document from a file path.
    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
        let container = Hwp5Container::open(path)?;
        let header = container.read_file_header()?;
        Ok(Self { container, header })
    }

    /// Opens an HWP 5.0 document from a reader.
    pub fn from_reader<R: Read + Seek>(reader: R) -> Result<Self> {
        let container = Hwp5Container::from_reader(reader)?;
        let header = container.read_file_header()?;
        Ok(Self { container, header })
    }

    /// Returns the file header.
    ///
    /// This is the header that was read from the container when the parser
    /// was constructed; no I/O happens here.
    pub fn header(&self) -> &FileHeader {
        &self.header
    }

    /// Returns true if the document is compressed.
    ///
    /// Delegates to the file header's compression flag.
    pub fn is_compressed(&self) -> bool {
        self.header.is_compressed()
    }

    /// Returns true if the document is encrypted.
    ///
    /// Delegates to the file header's encryption flag. Both `parse` and
    /// `for_each_section` refuse to process a document for which this is true.
    pub fn is_encrypted(&self) -> bool {
        self.header.is_encrypted()
    }

    /// Parses the whole document into the unified document model.
    ///
    /// Steps, in order: header-derived metadata, summary-information metadata
    /// (best-effort), DocInfo styles, BodyText sections, BinData resources.
    ///
    /// # Errors
    ///
    /// Returns `Error::Encrypted` for encrypted documents, and propagates any
    /// error from the DocInfo, BodyText or BinData steps. Failures while
    /// reading the summary-information stream are deliberately ignored.
    pub fn parse(&mut self) -> Result<Document> {
        if self.header.is_encrypted() {
            return Err(crate::error::Error::Encrypted);
        }

        let mut doc = Document::new();

        // Facts that come straight from the file header.
        doc.metadata.format_version = Some(self.header.version_string());
        doc.metadata.is_distribution = self.header.is_distribution();

        // Title/author/etc. are optional: a missing or malformed summary
        // stream must not prevent the body from being parsed.
        let _ = self.parse_metadata(&mut doc);

        // Styles first, because body parsing consults them.
        self.parse_docinfo(&mut doc)?;
        self.parse_bodytext(&mut doc)?;

        // Embedded binary resources last.
        self.extract_bindata(&mut doc)?;

        Ok(doc)
    }

    /// Parses the document in streaming mode, emitting events for each section.
    ///
    /// This is the bounded-memory alternative to [`parse`](Self::parse). Each
    /// section is parsed and emitted individually; its memory is freed before
    /// the next section is loaded.
    ///
    /// See [`crate::streaming::parse_file_streaming`] for the public API.
    pub fn for_each_section<F>(&mut self, opts: SectionStreamOptions, mut f: F) -> Result<()>
    where
        F: FnMut(ParseEvent<'_>) -> ControlFlow<()>,
    {
        if self.is_encrypted() {
            return Err(crate::error::Error::Encrypted);
        }

        // Phase 1: parse prerequisites using a temporary Document so we can
        // call the existing (document-oriented) helper methods without
        // refactoring them. We then move metadata and styles out of the
        // temporary document so they become owned stack locals, giving us
        // stack-frame lifetimes that satisfy the `'doc` bound on ParseEvent.
        let (metadata, styles) = {
            let mut tmp = Document::new();
            tmp.metadata.format_version = Some(self.header.version_string());
            tmp.metadata.is_distribution = self.header.is_distribution();
            let _ = self.parse_metadata(&mut tmp); // best-effort
            self.parse_docinfo(&mut tmp)?;
            (tmp.metadata, tmp.styles)
        };

        let section_names = self.container.list_bodytext_sections()?;
        let section_count = section_names.len();

        // Emit DocumentStart — metadata and styles live on the stack in this
        // function frame; references into them are valid for the whole loop.
        if f(ParseEvent::DocumentStart {
            metadata: &metadata,
            styles: &styles,
            section_count,
        }) == ControlFlow::Break(())
        {
            return Ok(());
        }

        // Phase 2: parse and emit sections one at a time.
        // picture_counter is shared across sections to correctly track BinId
        // references in multi-section documents.
        let is_compressed = self.is_compressed();
        let mut picture_counter: u32 = 0;

        for (index, name) in section_names.iter().enumerate() {
            let data = self.container.read_stream_decompressed(name, is_compressed);

            match data {
                Err(e) if opts.error_mode == crate::parse_options::ErrorMode::Lenient => {
                    if f(ParseEvent::SectionFailed { index, error: e }) == ControlFlow::Break(()) {
                        return Ok(());
                    }
                }
                Err(e) => return Err(e),
                Ok(bytes) => {
                    match bodytext::parse_section(&bytes, index, &styles, &mut picture_counter) {
                        Err(e) if opts.error_mode == crate::parse_options::ErrorMode::Lenient => {
                            if f(ParseEvent::SectionFailed { index, error: e })
                                == ControlFlow::Break(())
                            {
                                return Ok(());
                            }
                        }
                        Err(e) => return Err(e),
                        Ok(section) => {
                            if f(ParseEvent::SectionParsed(&section)) == ControlFlow::Break(()) {
                                return Ok(());
                            }
                            // `section` is dropped here — memory reclaimed before
                            // the next section is loaded.
                        }
                    }
                }
            }
        }

        // Phase 3: emit DocumentEnd before resources.
        if f(ParseEvent::DocumentEnd) == ControlFlow::Break(()) {
            return Ok(());
        }

        // Phase 4: resource extraction (after DocumentEnd so section memory
        // is fully freed before any large binary data arrives).
        if opts.extract_resources {
            if let Ok(resources) = self.container.list_bindata() {
                for name in resources {
                    if let Ok(data) = self.container.read_bindata(&name, is_compressed) {
                        if f(ParseEvent::ResourceExtracted { name, data }) == ControlFlow::Break(())
                        {
                            return Ok(());
                        }
                    }
                }
            }
        }

        Ok(())
    }

    /// Loads the `DocInfo` stream and fills `document.styles` from it.
    ///
    /// The stream is decompressed first when the file header marks the
    /// document as compressed. Read and parse errors are propagated.
    fn parse_docinfo(&self, document: &mut Document) -> Result<()> {
        let compressed = self.is_compressed();
        let raw = self
            .container
            .read_stream_decompressed("DocInfo", compressed)?;

        docinfo::parse_docinfo(&raw, &mut document.styles)?;
        Ok(())
    }

    /// Parses BodyText sections sequentially so the picture counter is shared
    /// across sections.
    ///
    /// Sections are read and parsed one at a time, so only one section's raw
    /// bytes are held in memory at any moment. A section whose stream cannot
    /// be read or whose contents fail to parse is skipped (best-effort); only
    /// a failure to list the sections is returned as an error.
    fn parse_bodytext(&self, document: &mut Document) -> Result<()> {
        let section_names = self.container.list_bodytext_sections()?;
        let is_compressed = self.is_compressed();

        // Borrow the styles instead of cloning them: the borrow ends before
        // `document.sections` is assigned below.
        let styles = &document.styles;

        // The counter is threaded through every section in order so BinId
        // references remain correct in multi-section documents.
        let mut picture_counter: u32 = 0;
        let mut sections: Vec<_> = section_names
            .iter()
            .enumerate()
            .filter_map(|(index, name)| {
                let data = self
                    .container
                    .read_stream_decompressed(name, is_compressed)
                    .ok()?;
                bodytext::parse_section(&data, index, styles, &mut picture_counter).ok()
            })
            .collect();

        // Sort by index to maintain order
        sections.sort_by_key(|s| s.index);

        document.sections = sections;
        Ok(())
    }

    /// Copies every readable BinData entry into `document.resources`.
    ///
    /// Entries are keyed by their name. Each one is recorded as an image
    /// resource with a MIME type guessed from the file extension. Entries that
    /// fail to read are skipped; only a failure to list the storage is
    /// returned as an error.
    fn extract_bindata(&self, document: &mut Document) -> Result<()> {
        let compressed = self.is_compressed();

        for name in self.container.list_bindata()? {
            let Ok(data) = self.container.read_bindata(&name, compressed) else {
                continue;
            };

            let resource = crate::model::Resource {
                resource_type: crate::model::ResourceType::Image,
                filename: Some(name.clone()),
                mime_type: guess_mime_type(&name),
                // Record the length before `data` is moved into the struct.
                size: data.len(),
                data,
            };
            document.resources.insert(name, resource);
        }

        Ok(())
    }

    /// Extracts document metadata from the `\x05HwpSummaryInformation` OLE stream.
    ///
    /// Falls back to a stream named `SummaryInformation` when the prefixed
    /// name cannot be read. The stream follows the Microsoft OLE Property Set
    /// format (FMTID_SummaryInformation). Only the first property set is
    /// examined. Properties of interest:
    ///   PID_TITLE    = 2  → title
    ///   PID_SUBJECT  = 3  → subject
    ///   PID_AUTHOR   = 4  → author
    ///   PID_KEYWORDS = 5  → keywords (comma-separated)
    ///   PID_CREATE   = 12 → created date (FILETIME)
    ///   PID_LASTSAVE = 13 → modified date (FILETIME)
    ///
    /// A structurally invalid stream leaves `document` untouched (or partially
    /// filled) and still returns `Ok(())`; the only error returned is a
    /// failure to read the stream. All offsets come from the file and are
    /// validated with overflow-checked arithmetic before use.
    fn parse_metadata(&self, document: &mut Document) -> Result<()> {
        // Variant tag carried by the two timestamp properties.
        const VT_FILETIME: u16 = 0x0040;

        // The OLE compound file stores it with the special prefix byte 0x05
        let data = self
            .container
            .read_stream_raw("\x05HwpSummaryInformation")
            .or_else(|_| self.container.read_stream_raw("SummaryInformation"))?;

        // OLE Property Set header:
        // 0x00-0x01: byte order (FE FF = little endian)
        // 0x02-0x03: version
        // 0x04-0x07: OS version
        // 0x08-0x17: FMTID (16 bytes)
        // 0x18-0x1B: section offset count (u32)
        // Then for each section: FMTID (16 bytes) + offset (u32)
        // At the section offset: size (u32) + property count (u32) + [propId (u32) + offset (u32)]*
        if data.len() < 0x30 {
            return Ok(());
        }

        // Byte order mark: FE FF = little endian
        if data[0] != 0xFE || data[1] != 0xFF {
            return Ok(());
        }

        // Number of property sets (typically 1 or 2)
        let num_sets = u32::from_le_bytes(get4(&data, 0x18)) as usize;
        if num_sets == 0 {
            return Ok(());
        }

        // First set FMTID is at 0x1C, offset at 0x2C. The section must hold
        // at least its 8-byte header (size + property count).
        let section_offset = u32::from_le_bytes(get4(&data, 0x2C)) as usize;
        let section_data = match data.get(section_offset..) {
            Some(section) if section.len() >= 8 => section,
            _ => return Ok(()),
        };

        // True when `offset..offset + len` lies inside the section; never
        // overflows, even on targets where `usize` is 32 bits wide.
        let in_bounds = |offset: usize, len: usize| {
            offset
                .checked_add(len)
                .is_some_and(|end| end <= section_data.len())
        };

        let prop_count = u32::from_le_bytes(get4(section_data, 4)) as usize;
        let Some(table_len) = prop_count.checked_mul(8) else {
            return Ok(());
        };
        if !in_bounds(8, table_len) {
            return Ok(());
        }

        // Walk the (property id, offset) table that follows the section header.
        for entry in section_data[8..8 + table_len].chunks_exact(8) {
            let prop_id = u32::from_le_bytes(get4(entry, 0));
            let prop_off = u32::from_le_bytes(get4(entry, 4)) as usize;

            // Every property value starts with a 4-byte type field.
            if !in_bounds(prop_off, 4) {
                continue;
            }

            let vt_type = u16::from_le_bytes([section_data[prop_off], section_data[prop_off + 1]]);

            match prop_id {
                2..=5 => {
                    // VT_LPSTR = 0x001E, VT_LPWSTR = 0x001F
                    if let Some(s) = read_ole_string(section_data, prop_off, vt_type) {
                        match prop_id {
                            2 => document.metadata.title = Some(s),
                            3 => document.metadata.subject = Some(s),
                            4 => document.metadata.author = Some(s),
                            5 => {
                                document.metadata.keywords = s
                                    .split(',')
                                    .map(|k| k.trim().to_string())
                                    .filter(|k| !k.is_empty())
                                    .collect();
                            }
                            _ => {}
                        }
                    }
                }
                // 4-byte type field followed by the 8-byte FILETIME value.
                12 | 13 if vt_type == VT_FILETIME && in_bounds(prop_off, 12) => {
                    let lo = u32::from_le_bytes(get4(section_data, prop_off + 4)) as u64;
                    let hi = u32::from_le_bytes(get4(section_data, prop_off + 8)) as u64;
                    let filetime = (hi << 32) | lo;
                    let iso = filetime_to_iso8601(filetime);
                    match prop_id {
                        12 => document.metadata.created = Some(iso),
                        13 => document.metadata.modified = Some(iso),
                        _ => {}
                    }
                }
                _ => {}
            }
        }

        Ok(())
    }
}

/// Copies the 4 bytes starting at `offset` out of `data`.
///
/// The bytes are returned as-is (callers typically feed them to
/// `u32::from_le_bytes`). When fewer than 4 bytes are available at `offset`,
/// including when `offset` is past the end or so large that `offset + 4`
/// would overflow, an all-zero array is returned instead of panicking.
#[inline]
fn get4(data: &[u8], offset: usize) -> [u8; 4] {
    // `get(offset..)` rejects out-of-range offsets without any arithmetic on
    // `offset`, so there is nothing here that can overflow.
    match data.get(offset..) {
        Some([a, b, c, d, ..]) => [*a, *b, *c, *d],
        _ => [0u8; 4],
    }
}

/// Reads a string property from an OLE Property Set section.
///
/// `prop_off` is the offset of the property value inside `section_data` and
/// `vt_type` the variant tag already read from that position. Both supported
/// variants share the layout: 2-byte type, 2-byte padding, 4-byte count,
/// payload.
///
/// * VT_LPSTR (0x001E): `count` payload bytes, cut at the first NUL and
///   decoded as UTF-8 with invalid sequences replaced by U+FFFD. No code-page
///   conversion is performed, so text stored in a legacy 8-bit code page will
///   not round-trip.
/// * VT_LPWSTR (0x001F): `count` UTF-16LE code units including the trailing
///   NUL, which is dropped; decoded lossily.
///
/// Returns `None` for any other variant type, or when the declared payload
/// does not fit inside `section_data`. All offset arithmetic is
/// overflow-checked so hostile counts cannot panic or wrap.
fn read_ole_string(section_data: &[u8], prop_off: usize, vt_type: u16) -> Option<String> {
    let count_off = prop_off.checked_add(4)?;
    let start = prop_off.checked_add(8)?;

    match vt_type {
        0x001E => {
            let count = u32::from_le_bytes(get4(section_data, count_off)) as usize;
            let end = start.checked_add(count)?;
            let bytes = section_data.get(start..end)?;
            // Keep only the part before the first NUL byte.
            let trimmed = bytes
                .iter()
                .position(|&b| b == 0)
                .map_or(bytes, |n| &bytes[..n]);
            Some(String::from_utf8_lossy(trimmed).into_owned())
        }
        0x001F => {
            let wchar_count = u32::from_le_bytes(get4(section_data, count_off)) as usize;
            // Exclude the trailing NUL; `checked_mul` guards 32-bit targets.
            let byte_count = wchar_count.saturating_sub(1).checked_mul(2)?;
            let end = start.checked_add(byte_count)?;
            let u16_vals: Vec<u16> = section_data
                .get(start..end)?
                .chunks_exact(2)
                .map(|c| u16::from_le_bytes([c[0], c[1]]))
                .collect();
            Some(String::from_utf16_lossy(&u16_vals))
        }
        _ => None,
    }
}

/// Formats a Windows FILETIME (count of 100-nanosecond ticks since
/// 1601-01-01) as an ISO 8601 UTC timestamp with second precision.
///
/// Sub-second ticks are truncated. Any value that falls before the Unix epoch
/// is reported as the fixed string `1601-01-01T00:00:00Z`.
fn filetime_to_iso8601(filetime: u64) -> String {
    // FILETIME value of 1970-01-01T00:00:00Z.
    const UNIX_EPOCH_AS_FILETIME: u64 = 116_444_736_000_000_000;
    const TICKS_PER_SECOND: u64 = 10_000_000;
    const SECONDS_PER_DAY: u64 = 86400;

    let Some(ticks_since_unix_epoch) = filetime.checked_sub(UNIX_EPOCH_AS_FILETIME) else {
        return String::from("1601-01-01T00:00:00Z");
    };

    // Calendar arithmetic is done by hand to avoid an external date crate.
    let total_secs = ticks_since_unix_epoch / TICKS_PER_SECOND;
    let day_number = total_secs / SECONDS_PER_DAY;
    let second_of_day = total_secs % SECONDS_PER_DAY;

    let hour = second_of_day / 3600;
    let minute = second_of_day / 60 % 60;
    let second = second_of_day % 60;

    let (year, month, day) = days_to_ymd(day_number);

    format!("{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}Z")
}

/// Converts a day count since 1970-01-01 into a `(year, month, day)` triple.
///
/// Month and day are 1-based. Whole years are peeled off first, then whole
/// months of the resulting year; whatever remains is the zero-based day of
/// the month.
fn days_to_ymd(mut days: u64) -> (u32, u32, u32) {
    let year_length = |y: u32| -> u64 {
        if is_leap(y) {
            366
        } else {
            365
        }
    };

    let mut year = 1970u32;
    while days >= year_length(year) {
        days -= year_length(year);
        year += 1;
    }

    let february: u64 = if is_leap(year) { 29 } else { 28 };
    let lengths: [u64; 12] = [31, february, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];

    // Number of complete months that fit into the remaining days.
    let mut completed_months = 0usize;
    while completed_months < lengths.len() && days >= lengths[completed_months] {
        days -= lengths[completed_months];
        completed_months += 1;
    }

    (year, completed_months as u32 + 1, days as u32 + 1)
}

/// Returns true when `year` is a leap year in the Gregorian calendar.
#[inline]
fn is_leap(year: u32) -> bool {
    // Century years are leap only when divisible by 400; every other year is
    // leap when divisible by 4.
    if year.is_multiple_of(100) {
        year.is_multiple_of(400)
    } else {
        year.is_multiple_of(4)
    }
}

/// Guesses MIME type from filename extension.
///
/// The extension is the text after the last `.` and is compared
/// ASCII-case-insensitively. Returns `None` when the name contains no `.` at
/// all (a bare name such as `png` is not treated as an extension) or when the
/// extension is not one of the recognised image formats.
fn guess_mime_type(filename: &str) -> Option<String> {
    let (_, ext) = filename.rsplit_once('.')?;
    let mime = match ext.to_ascii_lowercase().as_str() {
        "bmp" => "image/bmp",
        "jpg" | "jpeg" => "image/jpeg",
        "png" => "image/png",
        "gif" => "image/gif",
        "tif" | "tiff" => "image/tiff",
        "wmf" => "image/x-wmf",
        "emf" => "image/x-emf",
        _ => return None,
    };
    Some(mime.to_string())
}