Skip to main content

djvu_iff/
lib.rs

1//! IFF (Interchange File Format) container parser for DjVu files.
2//!
3//! This module provides two APIs:
4//!
5//! 1. **New spec-based parser** (`parse_form`) — zero-copy, borrowing slices from
6//!    the input byte buffer. Written from the sndjvu.org specification.
7//!
8//! 2. **Legacy API** (`parse`, `Chunk`, `DjvuFile`) — the original tree-based parser
9//!    kept for internal backward compatibility while the rewrite is in progress.
10//!
11//! ## DjVu IFF layout
12//!
13//! ```text
14//! [4] magic   = "AT&T"
15//! [4] id      = "FORM"
16//! [4] length  (big-endian u32, covers form_type + all chunks)
17//! [4] form_type = "DJVU" | "DJVM" | "BM44" | "PM44"
18//! ... chunks
19//! ```
20//!
21//! Each inner chunk:
22//! ```text
23//! [4] id
24//! [4] length  (big-endian u32)
25//! [n] data    (padded to even number of bytes if length is odd)
26//! ```
27
28#![cfg_attr(not(feature = "std"), no_std)]
29#![deny(unsafe_code)]
30
31#[cfg(not(feature = "std"))]
32extern crate alloc;
33
34#[cfg(not(feature = "std"))]
35use alloc::{string::String, vec::Vec};
36#[cfg(feature = "std")]
37use std::{string::String, vec::Vec};
38
39// ---- Error types ------------------------------------------------------------
40
41/// Errors that can occur while parsing the IFF container.
42#[derive(Debug, thiserror::Error, PartialEq, Eq)]
43pub enum IffError {
44    /// Input data is too short to contain a valid IFF file.
45    #[error("input is too short to be a valid IFF file")]
46    TooShort,
47
48    /// The `AT&T` magic bytes were not found at the start of the file.
49    #[error("bad magic bytes: expected AT&T, got {got:?}")]
50    BadMagic { got: [u8; 4] },
51
52    /// The FORM type identifier is not a recognised DjVu type.
53    ///
54    /// Note: this is *not* an error — callers may encounter unknown form types
55    /// in bundled documents and should handle them gracefully.
56    #[error("unknown FORM type: {id:?}")]
57    UnknownFormType { id: [u8; 4] },
58
59    /// A chunk header claims more bytes than are available in the buffer.
60    #[error(
61        "chunk {:?} claims {} bytes but only {} are available",
62        id,
63        claimed,
64        available
65    )]
66    ChunkTooLong {
67        id: [u8; 4],
68        claimed: u32,
69        available: usize,
70    },
71
72    /// The input ended unexpectedly in the middle of a chunk.
73    #[error("unexpected end of input (truncated IFF data)")]
74    Truncated,
75}
76
/// Error type of the original (legacy) tree-based implementation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LegacyError {
    /// The input ended before the expected data was complete.
    UnexpectedEof,
    /// A required magic number or tag is absent.
    InvalidMagic,
    /// A chunk or field carries a length that cannot be valid.
    InvalidLength,
    /// A chunk that must be present was not found; the payload names it.
    MissingChunk(&'static str),
    /// The data uses a feature or version that is not supported.
    Unsupported(&'static str),
    /// Any other violation of the format, described by the message.
    FormatError(String),
}

impl core::fmt::Display for LegacyError {
    /// Render the human-readable message for this error.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            // Fixed messages need no formatting machinery.
            Self::UnexpectedEof => f.write_str("unexpected end of input"),
            Self::InvalidMagic => f.write_str("invalid magic number"),
            Self::InvalidLength => f.write_str("invalid length"),
            Self::MissingChunk(id) => write!(f, "missing required chunk: {id}"),
            Self::Unsupported(msg) => write!(f, "unsupported: {msg}"),
            Self::FormatError(msg) => write!(f, "format error: {msg}"),
        }
    }
}

#[cfg(feature = "std")]
impl std::error::Error for LegacyError {}
109
110/// Alias for [`LegacyError`].
111pub use LegacyError as Error;
112
113// ---- IFF chunk types --------------------------------------------------------
114
/// A 4-byte chunk identifier (e.g., b"FORM", b"INFO", b"Sjbz").
pub type ChunkId = [u8; 4];

/// The 4-byte magic that prefixes every on-disk DjVu IFF stream.
///
/// This constant is the single spelling of the literal: writers prepend
/// `&MAGIC` instead of repeating `b"AT&T"`, which keeps the framing bytes owned
/// by the emission seam. (A guard test rejects raw `b"AT&T"`/`b"FORM"`
/// assembly outside this crate.)
pub const MAGIC: ChunkId = *b"AT&T";
124
125/// A parsed IFF chunk — either a FORM container or a leaf data chunk.
126#[derive(Debug, Clone)]
127pub enum Chunk {
128    /// A FORM container with a secondary ID and child chunks.
129    Form {
130        /// The secondary ID (e.g., b"DJVU", b"DJVM", b"DJVI", b"THUM").
131        secondary_id: ChunkId,
132        /// Total byte length of the FORM payload (from the IFF length field).
133        /// Includes the 4-byte secondary ID and all child chunk bytes.
134        length: u32,
135        /// Child chunks within this FORM.
136        children: Vec<Chunk>,
137    },
138    /// A leaf chunk with raw data.
139    Leaf {
140        /// The chunk ID (e.g., b"INFO", b"Sjbz", b"BG44").
141        id: ChunkId,
142        /// The raw chunk payload bytes.
143        data: Vec<u8>,
144    },
145}
146
147impl Chunk {
148    /// For leaf chunks, return the data slice. For FORM chunks, returns empty slice.
149    pub fn data(&self) -> &[u8] {
150        match self {
151            Chunk::Form { .. } => &[],
152            Chunk::Leaf { data, .. } => data,
153        }
154    }
155
156    /// For FORM chunks, return children. For leaf chunks, returns empty slice.
157    pub fn children(&self) -> &[Chunk] {
158        match self {
159            Chunk::Form { children, .. } => children,
160            Chunk::Leaf { .. } => &[],
161        }
162    }
163
164    /// Return the declared payload length from the IFF length field.
165    ///
166    /// For `Form` chunks, this is the value read from the IFF header — it
167    /// covers the secondary ID (4 bytes) and all children.  For `Leaf`
168    /// chunks, this equals `data().len()`.
169    pub fn payload_length(&self) -> u32 {
170        match self {
171            Chunk::Form { length, .. } => *length,
172            Chunk::Leaf { data, .. } => data.len() as u32,
173        }
174    }
175
176    /// Find the first leaf chunk with the given ID in direct children.
177    pub fn find_first(&self, target_id: &[u8; 4]) -> Option<&Chunk> {
178        self.children().iter().find(|c| match c {
179            Chunk::Leaf { id, .. } => id == target_id,
180            _ => false,
181        })
182    }
183
184    /// Find all leaf chunks with the given ID in direct children.
185    pub fn find_all(&self, target_id: &[u8; 4]) -> Vec<&Chunk> {
186        self.children()
187            .iter()
188            .filter(|c| match c {
189                Chunk::Leaf { id, .. } => id == target_id,
190                _ => false,
191            })
192            .collect()
193    }
194}
195
196/// A parsed DjVu document (the root FORM chunk).
197#[derive(Debug, Clone)]
198pub struct DjvuFile {
199    pub root: Chunk,
200}
201
202/// Parse a DjVu file from raw bytes (legacy tree-based parser).
203///
204/// Expects the file to begin with "AT&T" magic followed by a root FORM chunk.
205pub fn parse(data: &[u8]) -> Result<DjvuFile, Error> {
206    if data.len() < 4 {
207        return Err(Error::UnexpectedEof);
208    }
209    // Check for "AT&T" magic
210    let (magic, rest) = if &data[..4] == b"AT&T" {
211        (&data[..4], &data[4..])
212    } else {
213        // Some files may not have AT&T prefix (bare FORM)
214        (&data[..0], data)
215    };
216    let _ = magic;
217
218    let (root, _) = parse_chunk(rest, 0)?;
219    Ok(DjvuFile { root })
220}
221
222/// Parse a single chunk starting at `offset` within `data`.
223/// Returns the parsed chunk and the number of bytes consumed (including padding).
224fn parse_chunk(data: &[u8], offset: usize) -> Result<(Chunk, usize), Error> {
225    if offset + 8 > data.len() {
226        return Err(Error::UnexpectedEof);
227    }
228
229    let id: ChunkId = [
230        data[offset],
231        data[offset + 1],
232        data[offset + 2],
233        data[offset + 3],
234    ];
235    let length = u32::from_be_bytes([
236        data[offset + 4],
237        data[offset + 5],
238        data[offset + 6],
239        data[offset + 7],
240    ]);
241
242    let payload_start = offset + 8;
243    let payload_end = payload_start + length as usize;
244
245    if payload_end > data.len() {
246        return Err(Error::UnexpectedEof);
247    }
248
249    // Word-align: next chunk starts at even offset
250    let total = 8 + length as usize;
251    let padded_total = total + (total % 2);
252
253    if &id == b"FORM" {
254        if length < 4 {
255            return Err(Error::InvalidLength);
256        }
257        let secondary_id: ChunkId = [
258            data[payload_start],
259            data[payload_start + 1],
260            data[payload_start + 2],
261            data[payload_start + 3],
262        ];
263
264        let children_start = payload_start + 4;
265        let children = parse_children(data, children_start, payload_end)?;
266
267        Ok((
268            Chunk::Form {
269                secondary_id,
270                length,
271                children,
272            },
273            padded_total,
274        ))
275    } else {
276        let chunk_data = data[payload_start..payload_end].to_vec();
277        Ok((
278            Chunk::Leaf {
279                id,
280                data: chunk_data,
281            },
282            padded_total,
283        ))
284    }
285}
286
287/// Parse sequential chunks within a range of bytes.
288fn parse_children(data: &[u8], start: usize, end: usize) -> Result<Vec<Chunk>, Error> {
289    let mut chunks = Vec::new();
290    let mut pos = start;
291
292    while pos < end {
293        if pos + 8 > end {
294            // Trailing bytes — some files have junk at end; tolerate it
295            break;
296        }
297        let (chunk, consumed) = parse_chunk(data, pos)?;
298        chunks.push(chunk);
299        pos += consumed;
300    }
301
302    Ok(chunks)
303}
304
305// ---- Legacy emitter (round-trip support, #195) ------------------------------
306
307/// Serialise a `DjvuFile` (legacy parser) back into the on-disk IFF byte
308/// stream, including the leading "AT&T" magic.
309///
310/// Parser/emitter contract: `parse(emit(file)) == file` for any tree
311/// previously produced by `parse(...)`. This is used by property-based
312/// round-trip tests under `tests/proptest_codecs.rs` (#195) and is small
313/// enough to keep alongside the parser; not intended as a general-purpose
314/// DjVu writer.
315pub fn emit(file: &DjvuFile) -> Vec<u8> {
316    let mut out = Vec::with_capacity(64);
317    out.extend_from_slice(&MAGIC);
318    emit_chunk(&file.root, &mut out);
319    out
320}
321
322fn emit_chunk(chunk: &Chunk, out: &mut Vec<u8>) {
323    emit_chunk_inner(chunk, out, false);
324}
325
326fn emit_chunk_inner(chunk: &Chunk, out: &mut Vec<u8>, suppress_inner_pad: bool) {
327    match chunk {
328        Chunk::Form {
329            secondary_id,
330            length: stored_length,
331            children,
332        } => {
333            // Two valid IFF layouts exist for a FORM whose last child has odd
334            // payload length:
335            //   (A) FORM declared length is odd, no pad after last child;
336            //       the outer/parent loop writes the alignment byte.
337            //   (B) FORM declared length is even, includes a pad byte after
338            //       the last child inside the FORM body.
339            // Real DjVu files mix both styles. Preserve the parser's stored
340            // length parity so unmutated subtrees round-trip byte-identical.
341            let suppress_last_pad = (*stored_length & 1) == 1;
342            let mut payload: Vec<u8> = Vec::new();
343            payload.extend_from_slice(secondary_id);
344            let n = children.len();
345            for (i, child) in children.iter().enumerate() {
346                let last = i + 1 == n;
347                emit_chunk_inner(child, &mut payload, last && suppress_last_pad);
348            }
349            let len = payload.len() as u32;
350            out.extend_from_slice(b"FORM");
351            out.extend_from_slice(&len.to_be_bytes());
352            out.extend_from_slice(&payload);
353            // Outer pad to align the next sibling in our parent. Skip when
354            // our parent told us they'll provide alignment for us.
355            let total = 8 + payload.len();
356            if !suppress_inner_pad && total % 2 == 1 {
357                out.push(0);
358            }
359        }
360        Chunk::Leaf { id, data } => {
361            let len = data.len() as u32;
362            out.extend_from_slice(id);
363            out.extend_from_slice(&len.to_be_bytes());
364            out.extend_from_slice(data);
365            let total = 8 + data.len();
366            if !suppress_inner_pad && total % 2 == 1 {
367                out.push(0);
368            }
369        }
370    }
371}
372
373/// Number of bytes [`emit`] writes for `chunk`: the 8-byte header, the payload,
374/// and any word-alignment pad byte.
375///
376/// This is the single source of the framing/size arithmetic. It walks the same
377/// `suppress_last_pad` parity rule as [`emit_chunk_inner`], so `emitted_size`
378/// and `emit` can never disagree — a guarantee callers that pre-compute byte
379/// offsets (e.g. DIRM offset recomputation in the document mutator) rely on for
380/// correctness.
381pub fn emitted_size(chunk: &Chunk) -> usize {
382    emitted_size_inner(chunk, false)
383}
384
385fn emitted_size_inner(chunk: &Chunk, suppress_inner_pad: bool) -> usize {
386    match chunk {
387        Chunk::Form {
388            length: stored_length,
389            children,
390            ..
391        } => {
392            let suppress_last_pad = (*stored_length & 1) == 1;
393            let n = children.len();
394            let mut payload = 4usize; // secondary_id
395            for (i, child) in children.iter().enumerate() {
396                let last = i + 1 == n;
397                payload += emitted_size_inner(child, last && suppress_last_pad);
398            }
399            let total = 8 + payload;
400            total + usize::from(!suppress_inner_pad && total % 2 == 1)
401        }
402        Chunk::Leaf { data, .. } => {
403            let total = 8 + data.len();
404            total + usize::from(!suppress_inner_pad && total % 2 == 1)
405        }
406    }
407}
408
409/// One child for [`partial_emit`]: a parsed [`Chunk`] to re-frame, a verbatim
410/// byte slice copied as-is, or a nested `FORM` container framed from its body.
411pub enum EmitPart<'a> {
412    /// Re-frame this chunk through the canonical emitter (8-byte header,
413    /// payload, word-alignment pad).
414    Chunk(&'a Chunk),
415    /// Copy these bytes into the FORM payload verbatim. Use this for children
416    /// whose bytes must be preserved exactly (the byte-preserving path); any
417    /// word-alignment pad is added by [`partial_emit`] if the slice has odd
418    /// length, so callers may pass either padded or unpadded child blocks.
419    Verbatim(&'a [u8]),
420    /// Frame a nested `FORM` container whose *body* is given verbatim. `body`
421    /// starts with the 4-byte secondary id (`DJVU`/`DJVI`/`THUM`/…); the seam
422    /// writes the `FORM` tag, the big-endian length, the body, and the
423    /// word-alignment pad. Use this for the component sub-FORMs of a bundle so
424    /// the `FORM` framing is never hand-rolled at the call site (and so the
425    /// component's start offset is reported by [`partial_emit_with_offsets`]).
426    Form(&'a [u8]),
427}
428
429/// Emit a complete DjVu file (`AT&T` magic + one root `FORM`) whose children
430/// are a mix of re-framed chunks and verbatim original slices.
431///
432/// This is the byte-preserving counterpart to [`emit`]: untouched children pass
433/// through as [`EmitPart::Verbatim`] (their original bytes), while edited
434/// children are re-framed as [`EmitPart::Chunk`]. Every child is word-aligned
435/// inside the payload, and the FORM length is computed here — through the same
436/// framing rules as [`emit`] / [`emitted_size`], so the three can't drift.
437///
438/// Returns `None` if the assembled FORM payload exceeds `u32::MAX`.
439pub fn partial_emit(secondary_id: ChunkId, parts: &[EmitPart<'_>]) -> Option<Vec<u8>> {
440    partial_emit_with_offsets(secondary_id, parts).map(|(bytes, _)| bytes)
441}
442
443/// Like [`partial_emit`], but also returns the absolute file-byte offset of
444/// each part within the returned buffer: `offsets[i]` is the index at which
445/// `parts[i]`'s framing begins, measured from the start of the leading `AT&T`
446/// magic.
447///
448/// This is the seam for writers that must record an external index of where
449/// each component landed — most notably a bundled `FORM:DJVM`, whose `DIRM`
450/// offset table stores the file offset of every component `FORM`. Those
451/// offsets live *inside* one part (the `DIRM`) yet describe the *others*, so
452/// such a writer is inherently two-pass: emit once to learn the offsets, write
453/// them into the `DIRM`, then emit again. The second pass yields identical
454/// offsets — a part's position depends only on the sizes of the parts before
455/// it, and a fixed-width offset table does not change size when its values
456/// change — so the two passes cannot disagree.
457///
458/// Returns `None` if the assembled FORM payload (or any [`EmitPart::Form`]
459/// body) exceeds `u32::MAX`.
460pub fn partial_emit_with_offsets(
461    secondary_id: ChunkId,
462    parts: &[EmitPart<'_>],
463) -> Option<(Vec<u8>, Vec<usize>)> {
464    // The file prologue before the payload is AT&T(4) + FORM(4) + length(4) =
465    // 12 bytes, so a part written while the payload already holds `k` bytes
466    // begins at file offset 12 + k.
467    const PROLOGUE: usize = 12;
468    let mut payload = Vec::new();
469    payload.extend_from_slice(&secondary_id); // even start (4 bytes)
470    let mut offsets = Vec::with_capacity(parts.len());
471    for part in parts {
472        offsets.push(PROLOGUE + payload.len());
473        match part {
474            EmitPart::Chunk(chunk) => emit_chunk(chunk, &mut payload),
475            EmitPart::Verbatim(bytes) => {
476                payload.extend_from_slice(bytes);
477                if payload.len() % 2 == 1 {
478                    payload.push(0);
479                }
480            }
481            EmitPart::Form(body) => {
482                let len = u32::try_from(body.len()).ok()?;
483                payload.extend_from_slice(b"FORM");
484                payload.extend_from_slice(&len.to_be_bytes());
485                payload.extend_from_slice(body);
486                if payload.len() % 2 == 1 {
487                    payload.push(0);
488                }
489            }
490        }
491    }
492    let len = u32::try_from(payload.len()).ok()?;
493    let mut out = Vec::with_capacity(8 + payload.len());
494    out.extend_from_slice(&MAGIC);
495    out.extend_from_slice(b"FORM");
496    out.extend_from_slice(&len.to_be_bytes());
497    out.extend_from_slice(&payload);
498    // Payload stays even (even start + self-aligned parts), so no outer pad is
499    // ever needed; guard defensively to keep the invariant explicit.
500    if (8 + payload.len()) % 2 == 1 {
501        out.push(0);
502    }
503    Some((out, offsets))
504}
505
506// ---- New spec-based IFF parser (phase 1) ------------------------------------
507//
508// `parse_form` is a new zero-copy parser written from the sndjvu.org spec.
509// It returns `Form` and `IffChunk` types (distinct from the legacy `Chunk`).
510
/// One chunk as seen by the spec-based parser: its 4-byte identifier and a
/// zero-copy view of its payload inside the caller's buffer.
#[derive(Debug, Clone, Copy)]
pub struct IffChunk<'a> {
    /// The 4-byte ASCII chunk identifier.
    pub id: [u8; 4],
    /// The payload bytes only — the id and length header are not included.
    pub data: &'a [u8],
}
520
521/// The top-level FORM structure parsed by the spec-based parser.
522#[derive(Debug)]
523pub struct Form<'a> {
524    /// The 4-byte FORM type (e.g. `DJVU`, `DJVM`, `BM44`, `PM44`).
525    pub form_type: [u8; 4],
526    /// All chunks contained within the FORM, in order.
527    pub chunks: Vec<IffChunk<'a>>,
528}
529
530/// Parse a DjVu IFF byte stream into a [`Form`].
531///
532/// This is the new spec-based zero-copy parser. It returns borrowed data
533/// from the input slice.
534///
535/// # Errors
536///
537/// Returns [`IffError`] if:
538/// - The data does not begin with the `AT&T` magic bytes
539/// - The FORM chunk header is missing or malformed
540/// - Any chunk extends beyond the available data
541pub fn parse_form(data: &[u8]) -> Result<Form<'_>, IffError> {
542    // Need at least: magic(4) + FORM id(4) + length(4) + form_type(4) = 16 bytes
543    if data.len() < 16 {
544        return Err(IffError::TooShort);
545    }
546
547    // Verify AT&T magic prefix
548    let magic = read_4(data, 0)?;
549    if &magic != b"AT&T" {
550        return Err(IffError::BadMagic { got: magic });
551    }
552
553    // Read FORM chunk id
554    let form_id = read_4(data, 4)?;
555    if &form_id != b"FORM" {
556        return Err(IffError::Truncated);
557    }
558
559    // Read FORM length (big-endian u32)
560    let form_len = read_u32_be(data, 8)? as usize;
561
562    // FORM data starts at byte 12 and must fit within the buffer
563    let form_data_end = 12_usize.checked_add(form_len).ok_or(IffError::Truncated)?;
564    if form_data_end > data.len() {
565        return Err(IffError::ChunkTooLong {
566            id: *b"FORM",
567            claimed: form_len as u32,
568            available: data.len().saturating_sub(12),
569        });
570    }
571
572    // Read form_type (first 4 bytes of FORM data)
573    if form_len < 4 {
574        return Err(IffError::Truncated);
575    }
576    let form_type = read_4(data, 12)?;
577
578    // Parse chunks from the FORM body (after form_type)
579    let body = data.get(16..form_data_end).ok_or(IffError::Truncated)?;
580
581    let chunks = parse_form_body(body)?;
582
583    Ok(Form { form_type, chunks })
584}
585
586/// Parse a sequence of IFF chunks from a FORM body (the bytes *after* the
587/// 4-byte form type), returning zero-copy [`IffChunk`] slices.
588///
589/// Each chunk is: `[4-byte id][4-byte big-endian length][length bytes data]`,
590/// with data padded to an even byte boundary. This is the single chunk-walker
591/// shared by the document reader, the mutator, and DJVM merge/split — callers
592/// that already stripped the `AT&T`/`FORM`/length/form-type prologue (e.g. a
593/// sub-FORM body, or a `FORM:DJVU` page extracted from a bundle) pass the
594/// remaining bytes here instead of re-implementing the walk.
595pub fn parse_form_body(mut buf: &[u8]) -> Result<Vec<IffChunk<'_>>, IffError> {
596    let mut chunks = Vec::new();
597
598    while buf.len() >= 8 {
599        let id = read_4(buf, 0)?;
600        let data_len = read_u32_be(buf, 4)? as usize;
601
602        let data_start = 8_usize;
603        let data_end = data_start
604            .checked_add(data_len)
605            .ok_or(IffError::Truncated)?;
606
607        if data_end > buf.len() {
608            return Err(IffError::ChunkTooLong {
609                id,
610                claimed: data_len as u32,
611                available: buf.len().saturating_sub(data_start),
612            });
613        }
614
615        let chunk_data = buf.get(data_start..data_end).ok_or(IffError::Truncated)?;
616        chunks.push(IffChunk {
617            id,
618            data: chunk_data,
619        });
620
621        // Advance past this chunk; pad to even boundary
622        let padded_len = data_len + (data_len & 1);
623        let next = data_start
624            .checked_add(padded_len)
625            .ok_or(IffError::Truncated)?;
626
627        // Clamp to buf length to handle trailing padding gracefully
628        buf = buf.get(next.min(buf.len())..).ok_or(IffError::Truncated)?;
629    }
630
631    Ok(chunks)
632}
633
634/// Read 4 bytes from `data` at `offset` as a `[u8; 4]`.
635#[inline]
636fn read_4(data: &[u8], offset: usize) -> Result<[u8; 4], IffError> {
637    data.get(offset..offset + 4)
638        .and_then(|s| s.try_into().ok())
639        .ok_or(IffError::Truncated)
640}
641
642/// Read a big-endian `u32` from `data` at `offset`.
643#[inline]
644fn read_u32_be(data: &[u8], offset: usize) -> Result<u32, IffError> {
645    let b = read_4(data, offset)?;
646    Ok(u32::from_be_bytes(b))
647}
648
649// ---- Legacy dump helper (tests only) ----------------------------------------
650
/// Render the chunk tree of `file` as an indented structural listing, one
/// line per chunk, with the root at indent level 1.
#[cfg(test)]
pub fn dump(file: &DjvuFile) -> String {
    let mut listing = String::new();
    dump_chunk(&file.root, 1, &mut listing);
    listing
}
658
/// Append the listing line for `chunk` to `out`, indented two spaces per
/// `depth` level, then recurse into the children of a `FORM` one level deeper.
///
/// Identifiers that are not valid UTF-8 are shown as `????`.
#[cfg(test)]
fn dump_chunk(chunk: &Chunk, depth: usize, out: &mut String) {
    let pad = "  ".repeat(depth);
    match chunk {
        Chunk::Leaf { id, data } => {
            let tag = std::str::from_utf8(id).unwrap_or("????");
            out.push_str(&format!("{}{} [{}] \n", pad, tag, data.len()));
        }
        Chunk::Form {
            secondary_id,
            length,
            children,
        } => {
            let tag = std::str::from_utf8(secondary_id).unwrap_or("????");
            out.push_str(&format!("{}FORM:{} [{}] \n", pad, tag, length));
            for child in children.iter() {
                dump_chunk(child, depth + 1, out);
            }
        }
    }
}
680
681#[cfg(test)]
682mod tests {
683    use super::*;
684
685    fn assets_path() -> std::path::PathBuf {
686        std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
687            .join("../../references/djvujs/library/assets")
688    }
689
690    fn golden_path() -> std::path::PathBuf {
691        std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../tests/golden/iff")
692    }
693
694    // ---- Legacy parser tests ------------------------------------------------
695
696    /// Parse our structural dump and djvudump output to comparable lines.
697    fn normalize_dump(input: &str) -> Vec<String> {
698        input
699            .lines()
700            .filter(|l| !l.trim().is_empty())
701            .map(|line| {
702                let trimmed = line.trim_end();
703                if let Some(bracket_end) = trimmed.find(']') {
704                    let structural = &trimmed[..=bracket_end];
705                    structural.trim_end().to_string()
706                } else {
707                    trimmed.to_string()
708                }
709            })
710            .collect()
711    }
712
713    fn assert_structure_matches(djvu_file: &str, golden_file: &str) {
714        let data = std::fs::read(assets_path().join(djvu_file)).unwrap();
715        let file = parse(&data).unwrap();
716        let actual = dump(&file);
717        let expected = std::fs::read_to_string(golden_path().join(golden_file)).unwrap();
718
719        let actual_lines = normalize_dump(&actual);
720        let expected_lines = normalize_dump(&expected);
721
722        assert_eq!(
723            actual_lines.len(),
724            expected_lines.len(),
725            "Line count mismatch for {} ({} vs {})",
726            djvu_file,
727            actual_lines.len(),
728            expected_lines.len()
729        );
730
731        for (i, (a, e)) in actual_lines.iter().zip(expected_lines.iter()).enumerate() {
732            assert_eq!(
733                a,
734                e,
735                "Line {} mismatch for {}\n  actual:   {:?}\n  expected: {:?}",
736                i + 1,
737                djvu_file,
738                a,
739                e
740            );
741        }
742    }
743
744    #[test]
745    fn parse_boy_jb2_legacy() {
746        let data = std::fs::read(assets_path().join("boy_jb2.djvu")).unwrap();
747        let file = parse(&data).unwrap();
748
749        match &file.root {
750            Chunk::Form {
751                secondary_id,
752                children,
753                ..
754            } => {
755                assert_eq!(secondary_id, b"DJVU");
756                assert_eq!(children.len(), 2);
757            }
758            _ => panic!("expected FORM root"),
759        }
760    }
761
762    #[test]
763    fn structure_boy_jb2() {
764        assert_structure_matches("boy_jb2.djvu", "boy_jb2.dump");
765    }
766
767    #[test]
768    fn structure_boy() {
769        assert_structure_matches("boy.djvu", "boy.dump");
770    }
771
772    #[test]
773    fn structure_chicken() {
774        assert_structure_matches("chicken.djvu", "chicken.dump");
775    }
776
777    #[test]
778    fn structure_carte() {
779        assert_structure_matches("carte.djvu", "carte.dump");
780    }
781
782    #[test]
783    fn structure_navm_fgbz() {
784        assert_structure_matches("navm_fgbz.djvu", "navm_fgbz.dump");
785    }
786
787    #[test]
788    fn structure_colorbook() {
789        assert_structure_matches("colorbook.djvu", "colorbook.dump");
790    }
791
792    #[test]
793    fn structure_djvu3spec_bundled() {
794        assert_structure_matches("DjVu3Spec_bundled.djvu", "djvu3spec_bundled.dump");
795    }
796
797    #[test]
798    fn structure_big_scanned_page() {
799        assert_structure_matches("big-scanned-page.djvu", "big_scanned_page.dump");
800    }
801
802    // ---- emitted_size / partial_emit ----------------------------------------
803
804    /// `emitted_size(root)` must equal the bytes `emit` writes for that root
805    /// (the whole file minus the 4-byte `AT&T` magic) — the invariant DIRM
806    /// offset recomputation relies on. Checked across the real-asset corpus,
807    /// which mixes odd- and even-length FORM declarations.
808    fn assert_emitted_size_matches_emit(name: &str) {
809        let Ok(data) = std::fs::read(assets_path().join(name)) else {
810            return; // asset not vendored in this checkout
811        };
812        let file = parse(&data).unwrap();
813        let emitted = emit(&file);
814        assert_eq!(
815            emitted_size(&file.root),
816            emitted.len() - 4,
817            "emitted_size disagrees with emit() for {name}"
818        );
819    }
820
821    #[test]
822    fn emitted_size_matches_emit_corpus() {
823        for name in [
824            "boy_jb2.djvu",
825            "boy.djvu",
826            "chicken.djvu",
827            "carte.djvu",
828            "navm_fgbz.djvu",
829            "colorbook.djvu",
830            "DjVu3Spec_bundled.djvu",
831            "big-scanned-page.djvu",
832        ] {
833            assert_emitted_size_matches_emit(name);
834        }
835    }
836
837    #[test]
838    fn partial_emit_verbatim_matches_chunk_framing() {
839        // A child copied verbatim from a canonical emit must produce the same
840        // bytes as re-framing that child through EmitPart::Chunk — i.e. the
841        // byte-preserving path and the re-emit path agree. Build an even-parity
842        // tree (root length 0) so emit word-aligns every child, the convention
843        // partial_emit also uses.
844        let tree = DjvuFile {
845            root: Chunk::Form {
846                secondary_id: *b"DJVU",
847                length: 0,
848                children: vec![
849                    Chunk::Leaf {
850                        id: *b"INFO",
851                        data: vec![0xAA; 5], // odd → forces a pad
852                    },
853                    Chunk::Leaf {
854                        id: *b"Sjbz",
855                        data: vec![0xBB; 4], // even
856                    },
857                ],
858            },
859        };
860        let canonical = emit(&tree); // AT&T + FORM + DJVU + framed children
861
862        let Chunk::Form { children, .. } = &tree.root else {
863            unreachable!()
864        };
865        // Re-emit each child into its own framed block to slice verbatim spans.
866        let mut info_bytes = Vec::new();
867        emit_chunk(&children[0], &mut info_bytes);
868        let mut sjbz_bytes = Vec::new();
869        emit_chunk(&children[1], &mut sjbz_bytes);
870
871        let via_verbatim = partial_emit(
872            *b"DJVU",
873            &[
874                EmitPart::Verbatim(&info_bytes),
875                EmitPart::Verbatim(&sjbz_bytes),
876            ],
877        )
878        .expect("fits in u32");
879        let via_chunks = partial_emit(
880            *b"DJVU",
881            &[EmitPart::Chunk(&children[0]), EmitPart::Chunk(&children[1])],
882        )
883        .expect("fits in u32");
884
885        assert_eq!(via_verbatim, canonical, "verbatim path must match emit");
886        assert_eq!(via_chunks, canonical, "chunk path must match emit");
887    }
888
889    #[test]
890    fn partial_emit_pads_odd_verbatim_child() {
891        // A 3-byte verbatim child must be padded to an even boundary inside the
892        // payload, exactly like an emitted odd-length chunk.
893        let parts = [EmitPart::Verbatim(&[1u8, 2, 3])];
894        let out = partial_emit(*b"DJVU", &parts).unwrap();
895        // AT&T(4) FORM(4) len(4) DJVU(4) + 3 data + 1 pad = 20 bytes.
896        assert_eq!(out.len(), 20);
897        assert_eq!(&out[..8], b"AT&TFORM");
898        // FORM length = DJVU(4) + 3 + 1 pad = 8.
899        assert_eq!(u32::from_be_bytes(out[8..12].try_into().unwrap()), 8);
900        assert_eq!(&out[12..16], b"DJVU");
901        assert_eq!(&out[16..19], &[1, 2, 3]);
902        assert_eq!(out[19], 0);
903    }
904
905    #[test]
906    fn partial_emit_form_part_frames_nested_form() {
907        // An `EmitPart::Form` body must be framed as `FORM` + len + body + pad,
908        // identical to copying a pre-framed FORM chunk verbatim.
909        let body: &[u8] = b"DJVUxyz"; // 7 bytes (odd) → forces a pad
910        let via_form = partial_emit(*b"DJVM", &[EmitPart::Form(body)]).unwrap();
911
912        // Hand-frame the same component to compare against the seam output.
913        let mut framed = Vec::new();
914        framed.extend_from_slice(b"FORM");
915        framed.extend_from_slice(&(body.len() as u32).to_be_bytes());
916        framed.extend_from_slice(body);
917        framed.push(0); // odd body → pad
918        let via_verbatim = partial_emit(*b"DJVM", &[EmitPart::Verbatim(&framed)]).unwrap();
919
920        assert_eq!(via_form, via_verbatim, "Form part must match framed FORM");
921        // Spot-check the literal bytes too.
922        assert_eq!(&via_form[..8], b"AT&TFORM");
923        assert_eq!(&via_form[12..16], b"DJVM");
924        assert_eq!(&via_form[16..20], b"FORM");
925        assert_eq!(u32::from_be_bytes(via_form[20..24].try_into().unwrap()), 7);
926        assert_eq!(&via_form[24..31], body);
927        assert_eq!(via_form[31], 0); // pad
928    }
929
930    #[test]
931    fn partial_emit_with_offsets_reports_part_starts() {
932        // Each reported offset must point at the byte where that part's framing
933        // begins (the `FORM`/leaf-id tag), measured from the `AT&T` magic.
934        let dirm = Chunk::Leaf {
935            id: *b"DIRM",
936            data: vec![0xAB; 5], // odd → the DIRM chunk gets a pad
937        };
938        let comp0: &[u8] = b"DJVU0000"; // 8 bytes (even)
939        let comp1: &[u8] = b"DJVIaa"; // 6 bytes (even)
940        let parts = [
941            EmitPart::Chunk(&dirm),
942            EmitPart::Form(comp0),
943            EmitPart::Form(comp1),
944        ];
945        let (bytes, offsets) = partial_emit_with_offsets(*b"DJVM", &parts).unwrap();
946
947        assert_eq!(offsets.len(), 3);
948        // DIRM: AT&T(4)+FORM(4)+len(4)+DJVM(4) = 16.
949        assert_eq!(offsets[0], 16);
950        assert_eq!(&bytes[offsets[0]..offsets[0] + 4], b"DIRM");
951        // Component FORM tags land exactly where the offset table says.
952        for &off in &offsets[1..] {
953            assert_eq!(&bytes[off..off + 4], b"FORM", "offset must point at FORM");
954        }
955        // comp1 sits after comp0's full framing: 8 (header) + 8 (even body).
956        assert_eq!(offsets[2] - offsets[1], 16);
957    }
958
959    // ---- New spec-based parser tests ----------------------------------------
960
961    /// Build a minimal valid single-page DjVu file in memory for testing.
962    fn minimal_djvu_bytes() -> Vec<u8> {
963        let info_data: &[u8] = &[
964            0x00, 0xB5, // width = 181
965            0x00, 0xF0, // height = 240
966            0x18, // minor version
967            0x00, // major version
968            0x64, 0x00, // dpi = 100 (little-endian)
969            0x16, // gamma byte = 22 → 2.2
970            0x00, // flags: no rotation
971        ];
972        let info_len = info_data.len() as u32;
973
974        let mut chunk = Vec::new();
975        chunk.extend_from_slice(b"INFO");
976        chunk.extend_from_slice(&info_len.to_be_bytes());
977        chunk.extend_from_slice(info_data);
978
979        let mut form_body = Vec::new();
980        form_body.extend_from_slice(b"DJVU");
981        form_body.extend_from_slice(&chunk);
982
983        let form_len = form_body.len() as u32;
984
985        let mut file = Vec::new();
986        file.extend_from_slice(b"AT&T");
987        file.extend_from_slice(b"FORM");
988        file.extend_from_slice(&form_len.to_be_bytes());
989        file.extend_from_slice(&form_body);
990
991        file
992    }
993
994    #[test]
995    fn empty_input_is_error() {
996        let result = parse_form(&[]);
997        assert!(result.is_err());
998        assert_eq!(result.unwrap_err(), IffError::TooShort);
999    }
1000
1001    #[test]
1002    fn short_input_is_error() {
1003        let result = parse_form(&[0u8; 10]);
1004        assert!(result.is_err());
1005        assert_eq!(result.unwrap_err(), IffError::TooShort);
1006    }
1007
1008    #[test]
1009    fn bad_magic_is_error() {
1010        let mut data = minimal_djvu_bytes();
1011        data[0] = 0xFF;
1012        data[1] = 0xFF;
1013        data[2] = 0xFF;
1014        data[3] = 0xFF;
1015
1016        let result = parse_form(&data);
1017        assert!(result.is_err());
1018        assert_eq!(
1019            result.unwrap_err(),
1020            IffError::BadMagic {
1021                got: [0xFF, 0xFF, 0xFF, 0xFF]
1022            }
1023        );
1024    }
1025
1026    #[test]
1027    fn valid_single_page_parses() {
1028        let data = minimal_djvu_bytes();
1029        let form = parse_form(&data).expect("should parse successfully");
1030
1031        assert_eq!(&form.form_type, b"DJVU");
1032        assert_eq!(form.chunks.len(), 1);
1033        assert_eq!(&form.chunks[0].id, b"INFO");
1034        assert_eq!(form.chunks[0].data.len(), 10);
1035    }
1036
1037    #[test]
1038    fn truncated_chunk_is_error() {
1039        let mut data = minimal_djvu_bytes();
1040        let new_len = data.len() - 4;
1041        data.truncate(new_len);
1042
1043        let result = parse_form(&data);
1044        assert!(result.is_err());
1045        match result.unwrap_err() {
1046            IffError::ChunkTooLong { .. } | IffError::Truncated => {}
1047            other => panic!("expected ChunkTooLong or Truncated, got {:?}", other),
1048        }
1049    }
1050
1051    #[test]
1052    fn unknown_form_type_allowed() {
1053        let mut data = minimal_djvu_bytes();
1054        data[12] = b'X';
1055        data[13] = b'X';
1056        data[14] = b'X';
1057        data[15] = b'X';
1058
1059        let form = parse_form(&data).expect("unknown form type should still parse");
1060        assert_eq!(&form.form_type, b"XXXX");
1061    }
1062
1063    #[test]
1064    fn real_chicken_djvu_parses() {
1065        let path = assets_path().join("chicken.djvu");
1066        let data = std::fs::read(&path).expect("chicken.djvu must exist");
1067        let form = parse_form(&data).expect("chicken.djvu should parse");
1068
1069        assert_eq!(&form.form_type, b"DJVU");
1070        assert!(!form.chunks.is_empty(), "must have at least one chunk");
1071        assert_eq!(&form.chunks[0].id, b"INFO");
1072        assert!(form.chunks[0].data.len() >= 10);
1073    }
1074
1075    #[test]
1076    fn real_multipage_djvu_parses() {
1077        let path = assets_path().join("navm_fgbz.djvu");
1078        let data = std::fs::read(&path).expect("navm_fgbz.djvu must exist");
1079        let form = parse_form(&data).expect("navm_fgbz.djvu should parse");
1080
1081        assert_eq!(&form.form_type, b"DJVM");
1082        assert!(!form.chunks.is_empty());
1083    }
1084
1085    #[test]
1086    fn odd_length_chunk_padding() {
1087        let chunk1_data: &[u8] = &[0xAA, 0xBB, 0xCC, 0xDD, 0xEE]; // 5 bytes → padded to 6
1088        let chunk2_data: &[u8] = &[0x01, 0x02]; // 2 bytes
1089
1090        let mut form_body: Vec<u8> = Vec::new();
1091        form_body.extend_from_slice(b"DJVU");
1092
1093        form_body.extend_from_slice(b"TST1");
1094        form_body.extend_from_slice(&5u32.to_be_bytes());
1095        form_body.extend_from_slice(chunk1_data);
1096        form_body.push(0x00); // padding byte
1097
1098        form_body.extend_from_slice(b"TST2");
1099        form_body.extend_from_slice(&2u32.to_be_bytes());
1100        form_body.extend_from_slice(chunk2_data);
1101
1102        let form_len = form_body.len() as u32;
1103
1104        let mut file: Vec<u8> = Vec::new();
1105        file.extend_from_slice(b"AT&T");
1106        file.extend_from_slice(b"FORM");
1107        file.extend_from_slice(&form_len.to_be_bytes());
1108        file.extend_from_slice(&form_body);
1109
1110        let form = parse_form(&file).expect("should parse padded chunk");
1111        assert_eq!(form.chunks.len(), 2);
1112        assert_eq!(&form.chunks[0].id, b"TST1");
1113        assert_eq!(form.chunks[0].data, chunk1_data);
1114        assert_eq!(&form.chunks[1].id, b"TST2");
1115        assert_eq!(form.chunks[1].data, chunk2_data);
1116    }
1117}