djvu_rs/
djvu_document.rs

1//! New document model for DjVu files — phase 3.
2//!
3//! This module provides the high-level `DjVuDocument` API built on top of the
4//! clean-room IFF parser (phase 1), BZZ decompressor (phase 2a), and IW44 decoder
5//! (phase 2c).
6//!
7//! ## Key public types
8//!
9//! - [`DjVuDocument`] — opened DjVu document (single-page or multi-page)
10//! - [`DjVuPage`] — lazy page handle (raw chunks stored until `thumbnail()` is called)
11//! - [`DjVuBookmark`] — table-of-contents entry from the NAVM chunk
12//! - [`DocError`] — typed errors for this module
13//!
14//! ## Document kinds
15//!
16//! - **FORM:DJVU** — single-page document
17//! - **FORM:DJVM + DIRM** — bundled multi-page document with an in-file page index
18//! - **FORM:DJVM + DIRM (indirect)** — pages live in separate files; a resolver
19//!   callback `fn(name: &str) -> Result<Vec<u8>, DocError>` is required
20//!
21//! ## Lazy decoding contract
22//!
23//! `DjVuPage` stores only the raw chunk bytes. No image decoding happens until
24//! the caller explicitly calls `thumbnail()` (which invokes the IW44 decoder).
25
26#[cfg(not(feature = "std"))]
27use alloc::{
28    string::{String, ToString},
29    vec,
30    vec::Vec,
31};
32
33use crate::{
34    annotation::{Annotation, AnnotationError, MapArea},
35    bzz_new::bzz_decode,
36    error::{BzzError, IffError, Iw44Error, Jb2Error},
37    iff::{IffChunk, parse_form},
38    info::PageInfo,
39    iw44_new::Iw44Image,
40    metadata::{DjVuMetadata, MetadataError},
41    pixmap::Pixmap,
42    text::{TextError, TextLayer},
43};
44
45// ---- Error type -------------------------------------------------------------
46
47/// Errors that can occur when working with the DjVuDocument API.
48#[derive(Debug, thiserror::Error)]
49pub enum DocError {
50    /// IFF container parse error.
51    #[error("IFF error: {0}")]
52    Iff(#[from] IffError),
53
54    /// BZZ decompression error.
55    #[error("BZZ error: {0}")]
56    Bzz(#[from] BzzError),
57
58    /// IW44 wavelet decoding error.
59    #[error("IW44 error: {0}")]
60    Iw44(#[from] Iw44Error),
61
62    /// JB2 bilevel image decoding error.
63    #[error("JB2 error: {0}")]
64    Jb2(#[from] Jb2Error),
65
66    /// The file is not a supported DjVu format.
67    #[error("not a DjVu file: found form type {0:?}")]
68    NotDjVu([u8; 4]),
69
70    /// A required chunk is missing.
71    #[error("missing required chunk: {0}")]
72    MissingChunk(&'static str),
73
74    /// The document is malformed (description included).
75    #[error("malformed DjVu document: {0}")]
76    Malformed(&'static str),
77
78    /// An indirect page reference could not be resolved.
79    #[error("failed to resolve indirect page '{0}'")]
80    IndirectResolve(String),
81
82    /// Page index is out of range.
83    #[error("page index {index} is out of range (document has {count} pages)")]
84    PageOutOfRange { index: usize, count: usize },
85
86    /// Invalid UTF-8 in a string field.
87    #[error("invalid UTF-8 in DjVu metadata")]
88    InvalidUtf8,
89
90    /// The resolver callback is required for indirect documents but was not provided.
91    #[error("indirect DjVu document requires a resolver callback")]
92    NoResolver,
93
94    /// I/O error when reading file data (only with `std` feature).
95    #[cfg(feature = "std")]
96    #[error("I/O error: {0}")]
97    Io(#[from] std::io::Error),
98
99    /// Text layer parse error.
100    #[error("text layer error: {0}")]
101    Text(#[from] TextError),
102
103    /// Annotation parse error.
104    #[error("annotation error: {0}")]
105    Annotation(#[from] AnnotationError),
106
107    /// Metadata parse error.
108    #[error("metadata error: {0}")]
109    Metadata(#[from] MetadataError),
110}
111
112// ---- Bookmark ---------------------------------------------------------------
113
/// One node of the document outline (table of contents) stored in NAVM.
///
/// Bookmarks form a tree: each node owns its sub-entries in `children`.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct DjVuBookmark {
    /// Text shown for this entry.
    pub title: String,
    /// Link target, in DjVu's internal URL format.
    pub url: String,
    /// Sub-entries nested below this one (empty for a leaf).
    pub children: Vec<DjVuBookmark>,
}
125
126// ---- Page -------------------------------------------------------------------
127
/// Kind of a component listed in the DIRM directory.
///
/// The document parser turns `Page` entries into [`DjVuPage`]s and reads
/// shared symbol dictionaries out of `Shared` entries.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ComponentType {
    /// Shared component (FORM:DJVI) that pages can reference via INCL.
    Shared,
    /// A page of the document.
    Page,
    /// Thumbnail component — presumably FORM:THUM; confirm against the DIRM parser.
    Thumbnail,
}
135
/// An owned, undecoded IFF chunk: its 4-byte ID plus a copy of its payload.
#[derive(Debug, Clone)]
struct RawChunk {
    /// Four-character chunk identifier (e.g. `INFO`, `BG44`).
    id: [u8; 4],
    /// Payload bytes copied out of the source buffer.
    data: Vec<u8>,
}
142
143/// A lazy DjVu page handle.
144///
145/// Raw chunk data is stored on construction. No image decoding is performed
146/// until the caller invokes `thumbnail()` or a render function.
147///
148/// The fully decoded BG44 wavelet image is cached after the first render so
149/// that subsequent renders skip the expensive ZP arithmetic decode and only
150/// run the wavelet inverse-transform and compositor.
151#[derive(Debug, Clone)]
152pub struct DjVuPage {
153    /// Page info parsed from the INFO chunk.
154    info: PageInfo,
155    /// All raw chunks from this page's FORM:DJVU, in order.
156    chunks: Vec<RawChunk>,
157    /// Page index within the document (0-based).
158    index: usize,
159    /// Raw Djbz data from the DJVI shared dictionary component referenced via
160    /// the page's INCL chunk, if present.  Stored here so that `extract_mask`
161    /// can decode it without access to the parent document.
162    shared_djbz: Option<Vec<u8>>,
163    /// Lazily decoded BG44 background wavelet image.  Populated on first use;
164    /// subsequent renders call `to_rgb_subsample` directly on the cached image.
165    /// Only available when the `std` feature is enabled (`OnceLock` requires std).
166    #[cfg(feature = "std")]
167    bg44_decoded: std::sync::OnceLock<Option<Iw44Image>>,
168}
169
170impl DjVuPage {
171    /// Page width in pixels.
172    pub fn width(&self) -> u16 {
173        self.info.width
174    }
175
176    /// Page height in pixels.
177    pub fn height(&self) -> u16 {
178        self.info.height
179    }
180
181    /// Page resolution in dots per inch.
182    pub fn dpi(&self) -> u16 {
183        self.info.dpi
184    }
185
186    /// Display gamma from the INFO chunk.
187    pub fn gamma(&self) -> f32 {
188        self.info.gamma
189    }
190
191    /// Page rotation from the INFO chunk.
192    pub fn rotation(&self) -> crate::info::Rotation {
193        self.info.rotation
194    }
195
196    /// 0-based page index within the document.
197    pub fn index(&self) -> usize {
198        self.index
199    }
200
201    /// Dimensions as `(width, height)`.
202    pub fn dimensions(&self) -> (u16, u16) {
203        (self.info.width, self.info.height)
204    }
205
206    /// Decode the thumbnail for this page from TH44 chunks, if present.
207    ///
208    /// No image data is decoded until this method is called (lazy contract).
209    ///
210    /// Returns `Ok(None)` if the page has no TH44 thumbnail.
211    pub fn thumbnail(&self) -> Result<Option<Pixmap>, DocError> {
212        let th44_chunks: Vec<&[u8]> = self
213            .chunks
214            .iter()
215            .filter(|c| &c.id == b"TH44")
216            .map(|c| c.data.as_slice())
217            .collect();
218
219        if th44_chunks.is_empty() {
220            return Ok(None);
221        }
222
223        let mut img = Iw44Image::new();
224        for chunk_data in &th44_chunks {
225            img.decode_chunk(chunk_data)?;
226        }
227        let pixmap = img.to_rgb()?;
228        Ok(Some(pixmap))
229    }
230
231    /// Return the raw bytes of the first chunk with the given 4-byte ID.
232    ///
233    /// Returns `None` if no chunk with that ID exists.  The returned slice
234    /// points into the owned chunk storage — zero copy.
235    ///
236    /// # Example
237    ///
238    /// ```ignore
239    /// let sjbz = page.raw_chunk(b"Sjbz").expect("page must have a JB2 chunk");
240    /// ```
241    pub fn raw_chunk(&self, id: &[u8; 4]) -> Option<&[u8]> {
242        self.chunks
243            .iter()
244            .find(|c| &c.id == id)
245            .map(|c| c.data.as_slice())
246    }
247
248    /// Return the raw bytes of all chunks with the given 4-byte ID, in order.
249    ///
250    /// Returns an empty `Vec` if no such chunk exists.
251    ///
252    /// # Example
253    ///
254    /// ```ignore
255    /// let bg44_chunks = page.all_chunks(b"BG44");
256    /// assert!(!bg44_chunks.is_empty(), "colour page must have BG44 data");
257    /// ```
258    pub fn all_chunks(&self, id: &[u8; 4]) -> Vec<&[u8]> {
259        self.chunks
260            .iter()
261            .filter(|c| &c.id == id)
262            .map(|c| c.data.as_slice())
263            .collect()
264    }
265
266    /// Return the IDs of all chunks present on this page, in order.
267    ///
268    /// Duplicate IDs appear multiple times (once per chunk).
269    pub fn chunk_ids(&self) -> Vec<[u8; 4]> {
270        self.chunks.iter().map(|c| c.id).collect()
271    }
272
273    /// Find the first chunk with the given 4-byte ID.
274    ///
275    /// Equivalent to [`Self::raw_chunk`]; kept for internal use.
276    pub fn find_chunk(&self, id: &[u8; 4]) -> Option<&[u8]> {
277        self.raw_chunk(id)
278    }
279
280    /// Find all chunks with the given 4-byte ID.
281    ///
282    /// Equivalent to [`Self::all_chunks`]; kept for internal use.
283    pub fn find_chunks(&self, id: &[u8; 4]) -> Vec<&[u8]> {
284        self.all_chunks(id)
285    }
286
287    /// Return all BG44 background chunk data slices, in order.
288    pub fn bg44_chunks(&self) -> Vec<&[u8]> {
289        self.find_chunks(b"BG44")
290    }
291
292    /// Return the fully decoded BG44 wavelet image, decoding and caching on first call.
293    ///
294    /// Returns `None` if the page has no BG44 chunks.  On decode error the error
295    /// is swallowed and `None` is returned (same semantics as the permissive render
296    /// path), so this method is infallible.
297    ///
298    /// The result is computed once (all ZP arithmetic decode + block assembly) and
299    /// then cached inside the page.  Subsequent calls return the cached value
300    /// immediately.  The wavelet inverse-transform and YCbCr→RGB conversion are
301    /// **not** cached; they are applied at each render at the appropriate subsample
302    /// level via [`Iw44Image::to_rgb_subsample`].
303    #[cfg(feature = "std")]
304    pub fn decoded_bg44(&self) -> Option<&Iw44Image> {
305        self.bg44_decoded
306            .get_or_init(|| {
307                let chunks = self.bg44_chunks();
308                if chunks.is_empty() {
309                    return None;
310                }
311                let mut img = Iw44Image::new();
312                for chunk_data in &chunks {
313                    if img.decode_chunk(chunk_data).is_err() {
314                        break;
315                    }
316                }
317                if img.width == 0 { None } else { Some(img) }
318            })
319            .as_ref()
320    }
321
322    #[cfg(not(feature = "std"))]
323    pub fn decoded_bg44(&self) -> Option<&Iw44Image> {
324        None
325    }
326
327    /// Return all FG44 foreground chunk data slices, in order.
328    pub fn fg44_chunks(&self) -> Vec<&[u8]> {
329        self.find_chunks(b"FG44")
330    }
331
332    /// Extract the text layer from TXTz (BZZ-compressed) or TXTa (plain) chunks.
333    ///
334    /// Returns `Ok(None)` if the page has no text layer.
335    pub fn text_layer(&self) -> Result<Option<TextLayer>, DocError> {
336        let page_height = self.info.height as u32;
337
338        if let Some(txtz) = self.find_chunk(b"TXTz") {
339            if txtz.is_empty() {
340                return Ok(None);
341            }
342            let layer = crate::text::parse_text_layer_bzz(txtz, page_height)?;
343            return Ok(Some(layer));
344        }
345
346        if let Some(txta) = self.find_chunk(b"TXTa") {
347            if txta.is_empty() {
348                return Ok(None);
349            }
350            let layer = crate::text::parse_text_layer(txta, page_height)?;
351            return Ok(Some(layer));
352        }
353
354        Ok(None)
355    }
356
357    /// Parse the text layer and transform all zone rectangles to match a
358    /// rendered page of size `render_w × render_h`.
359    ///
360    /// This is a convenience wrapper around [`Self::text_layer`] followed by
361    /// [`TextLayer::transform`].  It applies the page's own rotation (from the
362    /// INFO chunk) and scales coordinates proportionally to the requested
363    /// render size, so callers can use the returned rects directly for text
364    /// selection / copy-paste overlays without any additional maths.
365    ///
366    /// Returns `Ok(None)` if the page has no text layer.
367    pub fn text_layer_at_size(
368        &self,
369        render_w: u32,
370        render_h: u32,
371    ) -> Result<Option<TextLayer>, DocError> {
372        let page_w = self.info.width as u32;
373        let page_h = self.info.height as u32;
374        let rotation = self.info.rotation;
375        Ok(self
376            .text_layer()?
377            .map(|tl| tl.transform(page_w, page_h, rotation, render_w, render_h)))
378    }
379
380    /// Extract the plain text content of the page (convenience wrapper).
381    ///
382    /// Returns `Ok(None)` if the page has no text layer.
383    pub fn text(&self) -> Result<Option<String>, DocError> {
384        Ok(self.text_layer()?.map(|tl| tl.text))
385    }
386
387    /// Parse the annotation layer from ANTz (BZZ-compressed) or ANTa (plain) chunks.
388    ///
389    /// Returns `Ok(None)` if the page has no annotation chunk.
390    pub fn annotations(&self) -> Result<Option<(Annotation, Vec<MapArea>)>, DocError> {
391        if let Some(antz) = self.find_chunk(b"ANTz") {
392            if antz.is_empty() {
393                return Ok(None);
394            }
395            let result = crate::annotation::parse_annotations_bzz(antz)?;
396            return Ok(Some(result));
397        }
398
399        if let Some(anta) = self.find_chunk(b"ANTa") {
400            if anta.is_empty() {
401                return Ok(None);
402            }
403            let result = crate::annotation::parse_annotations(anta)?;
404            return Ok(Some(result));
405        }
406
407        Ok(None)
408    }
409
410    /// Return all hyperlinks (MapAreas with a non-empty URL) on this page.
411    pub fn hyperlinks(&self) -> Result<Vec<MapArea>, DocError> {
412        match self.annotations()? {
413            None => Ok(Vec::new()),
414            Some((_, mapareas)) => Ok(mapareas.into_iter().filter(|m| !m.url.is_empty()).collect()),
415        }
416    }
417
418    /// Decode the JB2 foreground mask as a 1-bit [`Bitmap`](crate::bitmap::Bitmap).
419    ///
420    /// Returns `Ok(None)` if the page has no Sjbz (JB2 mask) chunk.
421    pub fn extract_mask(&self) -> Result<Option<crate::bitmap::Bitmap>, DocError> {
422        let sjbz = match self.find_chunk(b"Sjbz") {
423            Some(data) => data,
424            None => return Ok(None),
425        };
426
427        // Prefer an inline Djbz chunk; fall back to the shared DJVI dictionary
428        // that was resolved from the INCL chunk during document parse.
429        let dict = if let Some(djbz) = self.find_chunk(b"Djbz") {
430            Some(crate::jb2_new::decode_dict(djbz, None)?)
431        } else if let Some(djbz) = self.shared_djbz.as_deref() {
432            Some(crate::jb2_new::decode_dict(djbz, None)?)
433        } else {
434            None
435        };
436
437        let bm = crate::jb2_new::decode(sjbz, dict.as_ref())?;
438        Ok(Some(bm))
439    }
440
441    /// Decode the IW44 foreground layer (FG44 chunks) if present.
442    ///
443    /// Returns `Ok(None)` if the page has no FG44 chunks.
444    pub fn extract_foreground(&self) -> Result<Option<Pixmap>, DocError> {
445        let chunks = self.fg44_chunks();
446        if chunks.is_empty() {
447            return Ok(None);
448        }
449
450        let mut img = Iw44Image::new();
451        for chunk_data in &chunks {
452            img.decode_chunk(chunk_data)?;
453        }
454        let pixmap = img.to_rgb()?;
455        Ok(Some(pixmap))
456    }
457
458    /// Decode the IW44 background layer (BG44 chunks) if present.
459    ///
460    /// Returns `Ok(None)` if the page has no BG44 chunks.
461    pub fn extract_background(&self) -> Result<Option<Pixmap>, DocError> {
462        let chunks = self.bg44_chunks();
463        if chunks.is_empty() {
464            return Ok(None);
465        }
466
467        let mut img = Iw44Image::new();
468        for chunk_data in &chunks {
469            img.decode_chunk(chunk_data)?;
470        }
471        let pixmap = img.to_rgb()?;
472        Ok(Some(pixmap))
473    }
474
475    /// Render this page into a pre-allocated RGBA buffer using the given options.
476    ///
477    /// This is the zero-allocation render path: no heap allocation occurs when
478    /// `buf` is already sized to `opts.width * opts.height * 4` bytes.
479    ///
480    /// # Errors
481    ///
482    /// - [`crate::djvu_render::RenderError::BufTooSmall`] if buffer is too small
483    /// - [`crate::djvu_render::RenderError::InvalidDimensions`] if width/height is 0
484    /// - Propagates IW44 / JB2 decode errors
485    pub fn render_into(
486        &self,
487        opts: &crate::djvu_render::RenderOptions,
488        buf: &mut [u8],
489    ) -> Result<(), crate::djvu_render::RenderError> {
490        crate::djvu_render::render_into(self, opts, buf)
491    }
492}
493
494// ---- Document ---------------------------------------------------------------
495
496/// An opened DjVu document.
497///
498/// Supports single-page FORM:DJVU, bundled multi-page FORM:DJVM, and indirect
499/// multi-page FORM:DJVM (via resolver callback).
500#[derive(Debug)]
501pub struct DjVuDocument {
502    /// All pages, indexed by 0-based page number.
503    pages: Vec<DjVuPage>,
504    /// Parsed NAVM bookmarks, or empty if none.
505    bookmarks: Vec<DjVuBookmark>,
506    /// Raw document-level chunks (NAVM, DIRM, etc.) from the DJVM container,
507    /// or from the top-level DJVU form for single-page documents.
508    global_chunks: Vec<RawChunk>,
509}
510
511impl DjVuDocument {
512    /// Parse a DjVu document from a byte slice.
513    ///
514    /// For indirect documents (INCL references to external files), a resolver
515    /// must be supplied via [`DjVuDocument::parse_with_resolver`].
516    ///
517    /// # Errors
518    ///
519    /// Returns `DocError::NoResolver` if the document is indirect and no resolver
520    /// was provided.
521    pub fn parse(data: &[u8]) -> Result<Self, DocError> {
522        Self::parse_with_resolver(data, None::<fn(&str) -> Result<Vec<u8>, DocError>>)
523    }
524
525    /// Parse a DjVu document with an optional resolver for indirect pages.
526    ///
527    /// The resolver receives the `name` field from each INCL chunk and must
528    /// return the raw bytes of that external component file.
529    pub fn parse_with_resolver<R>(data: &[u8], resolver: Option<R>) -> Result<Self, DocError>
530    where
531        R: Fn(&str) -> Result<Vec<u8>, DocError>,
532    {
533        let form = parse_form(data)?;
534
535        match &form.form_type {
536            b"DJVU" => {
537                // Single-page document — expose all top-level chunks as global
538                let global_chunks: Vec<RawChunk> = form
539                    .chunks
540                    .iter()
541                    .map(|c| RawChunk {
542                        id: c.id,
543                        data: c.data.to_vec(),
544                    })
545                    .collect();
546                let page = parse_page_from_chunks(&form.chunks, 0, None)?;
547                Ok(DjVuDocument {
548                    pages: vec![page],
549                    bookmarks: vec![],
550                    global_chunks,
551                })
552            }
553            b"DJVM" => {
554                // Multi-page document — parse DIRM first
555                let dirm_chunk = form
556                    .chunks
557                    .iter()
558                    .find(|c| &c.id == b"DIRM")
559                    .ok_or(DocError::MissingChunk("DIRM"))?;
560
561                let (entries, is_bundled) = parse_dirm(dirm_chunk.data)?;
562
563                // Collect NAVM bookmarks (BZZ-compressed)
564                let bookmarks = parse_navm_bookmarks(&form.chunks)?;
565
566                // Store non-FORM global chunks (DIRM, NAVM, etc.)
567                let global_chunks: Vec<RawChunk> = form
568                    .chunks
569                    .iter()
570                    .filter(|c| &c.id != b"FORM")
571                    .map(|c| RawChunk {
572                        id: c.id,
573                        data: c.data.to_vec(),
574                    })
575                    .collect();
576
577                if is_bundled {
578                    // Bundled: FORM:DJVU / FORM:DJVI sub-forms follow DIRM in sequence.
579                    let sub_forms: Vec<&IffChunk<'_>> =
580                        form.chunks.iter().filter(|c| &c.id == b"FORM").collect();
581
582                    // Build a map of DJVI component ID → raw Djbz bytes for
583                    // shared symbol dictionaries (referenced via INCL chunks).
584                    // Use BTreeMap so this compiles in no_std (alloc::collections::BTreeMap
585                    // is available; std::collections::HashMap is not).
586                    #[cfg(not(feature = "std"))]
587                    use alloc::collections::BTreeMap;
588                    #[cfg(feature = "std")]
589                    use std::collections::BTreeMap;
590                    let djvi_djbz: BTreeMap<String, Vec<u8>> = entries
591                        .iter()
592                        .enumerate()
593                        .filter(|(_, e)| e.comp_type == ComponentType::Shared)
594                        .filter_map(|(comp_idx, entry)| {
595                            let sf = sub_forms.get(comp_idx)?;
596                            let chunks = parse_sub_form(sf.data).ok()?;
597                            let djbz = chunks.iter().find(|c| &c.id == b"Djbz")?;
598                            Some((entry.id.clone(), djbz.data.to_vec()))
599                        })
600                        .collect();
601
602                    let mut pages = Vec::new();
603                    let mut page_idx = 0usize;
604                    for (comp_idx, entry) in entries.iter().enumerate() {
605                        if entry.comp_type != ComponentType::Page {
606                            continue;
607                        }
608                        let sub_form = sub_forms.get(comp_idx).ok_or(DocError::Malformed(
609                            "DIRM entry count exceeds FORM children",
610                        ))?;
611                        let sub_chunks = parse_sub_form(sub_form.data)?;
612
613                        // Resolve INCL reference to a shared DJVI dictionary.
614                        let shared_djbz = sub_chunks
615                            .iter()
616                            .find(|c| &c.id == b"INCL")
617                            .and_then(|incl| core::str::from_utf8(incl.data.trim_ascii_end()).ok())
618                            .and_then(|name| djvi_djbz.get(name))
619                            .cloned();
620
621                        let page = parse_page_from_chunks(&sub_chunks, page_idx, shared_djbz)?;
622                        pages.push(page);
623                        page_idx += 1;
624                    }
625
626                    Ok(DjVuDocument {
627                        pages,
628                        bookmarks,
629                        global_chunks,
630                    })
631                } else {
632                    // Indirect: pages must be resolved by name
633                    let resolver = resolver.ok_or(DocError::NoResolver)?;
634
635                    let mut pages = Vec::new();
636                    let mut page_idx = 0usize;
637                    for entry in &entries {
638                        if entry.comp_type != ComponentType::Page {
639                            continue;
640                        }
641                        let resolved_data = resolver(&entry.id)
642                            .map_err(|_| DocError::IndirectResolve(entry.id.clone()))?;
643                        let sub_form = parse_form(&resolved_data)?;
644                        let page = parse_page_from_chunks(&sub_form.chunks, page_idx, None)?;
645                        pages.push(page);
646                        page_idx += 1;
647                    }
648
649                    Ok(DjVuDocument {
650                        pages,
651                        bookmarks,
652                        global_chunks,
653                    })
654                }
655            }
656            other => Err(DocError::NotDjVu(*other)),
657        }
658    }
659
660    /// Number of pages.
661    pub fn page_count(&self) -> usize {
662        self.pages.len()
663    }
664
665    /// Access a page by 0-based index.
666    ///
667    /// # Errors
668    ///
669    /// Returns `DocError::PageOutOfRange` if `index >= page_count()`.
670    pub fn page(&self, index: usize) -> Result<&DjVuPage, DocError> {
671        self.pages.get(index).ok_or(DocError::PageOutOfRange {
672            index,
673            count: self.pages.len(),
674        })
675    }
676
677    /// The NAVM table of contents, or an empty slice if not present.
678    pub fn bookmarks(&self) -> &[DjVuBookmark] {
679        &self.bookmarks
680    }
681
682    /// Parse document-level metadata from a METz (BZZ-compressed) or METa
683    /// (plain text) chunk.
684    ///
685    /// Returns `Ok(None)` if no METa/METz chunk is present.
686    pub fn metadata(&self) -> Result<Option<DjVuMetadata>, DocError> {
687        if let Some(metz) = self.raw_chunk(b"METz") {
688            if metz.is_empty() {
689                return Ok(None);
690            }
691            return Ok(Some(crate::metadata::parse_metadata_bzz(metz)?));
692        }
693        if let Some(meta) = self.raw_chunk(b"METa") {
694            if meta.is_empty() {
695                return Ok(None);
696            }
697            return Ok(Some(crate::metadata::parse_metadata(meta)?));
698        }
699        Ok(None)
700    }
701
702    /// Return the raw bytes of the first document-level chunk with the given
703    /// 4-byte ID.
704    ///
705    /// For single-page DJVU files this covers all top-level chunks (INFO,
706    /// Sjbz, BG44, …).  For multi-page DJVM files this covers non-page chunks
707    /// such as DIRM and NAVM.  Per-page chunks are accessed via
708    /// [`DjVuPage::raw_chunk`].
709    ///
710    /// Returns `None` if no such chunk exists.
711    pub fn raw_chunk(&self, id: &[u8; 4]) -> Option<&[u8]> {
712        self.global_chunks
713            .iter()
714            .find(|c| &c.id == id)
715            .map(|c| c.data.as_slice())
716    }
717
718    /// Return the raw bytes of all document-level chunks with the given ID.
719    ///
720    /// Returns an empty `Vec` if no such chunk exists.
721    pub fn all_chunks(&self, id: &[u8; 4]) -> Vec<&[u8]> {
722        self.global_chunks
723            .iter()
724            .filter(|c| &c.id == id)
725            .map(|c| c.data.as_slice())
726            .collect()
727    }
728
729    /// Return the IDs of all document-level chunks, in order.
730    ///
731    /// For multi-page DJVM files this is the sequence of non-page chunks
732    /// (DIRM, NAVM, …).  Duplicate IDs appear once per chunk.
733    pub fn chunk_ids(&self) -> Vec<[u8; 4]> {
734        self.global_chunks.iter().map(|c| c.id).collect()
735    }
736}
737
738// ---- Memory-mapped document -------------------------------------------------
739
/// A DjVu document parsed from a memory-mapped file.
///
/// Rather than reading the whole file into a `Vec<u8>`, the file is mapped
/// into the process address space through the OS virtual-memory subsystem.
/// The kernel pages data in from disk on demand, which can significantly
/// reduce peak memory usage for large multi-volume scans (100+ MB).
///
/// # Safety contract
///
/// **The underlying file must not be modified or truncated while the mapping
/// is alive.**  Mutating a memory-mapped file is undefined behaviour on most
/// platforms (SIGBUS on Linux/macOS, access violation on Windows).  Keeping
/// the file immutable for the lifetime of this struct is the caller's
/// responsibility.
///
/// Requires the `mmap` feature flag.
#[cfg(feature = "mmap")]
pub struct MmapDocument {
    /// The mapping, held for as long as this struct lives.  `DjVuDocument`
    /// stores owned copies of all chunk data, so the mapped bytes are only
    /// read while parsing.
    _mmap: memmap2::Mmap,
    /// The document parsed from the mapped bytes.
    doc: DjVuDocument,
}
764
#[cfg(feature = "mmap")]
impl MmapDocument {
    /// Map the file at `path` into memory and parse it as a DjVu document.
    ///
    /// # Safety contract
    ///
    /// The file at `path` **must not be modified or truncated** for as long
    /// as the returned `MmapDocument` exists.  The struct-level documentation
    /// explains why.
    ///
    /// # Errors
    ///
    /// Returns `DocError::Io` when opening or mapping the file fails, and any
    /// parse error produced by [`DjVuDocument::parse`].
    pub fn open(path: impl AsRef<std::path::Path>) -> Result<Self, DocError> {
        let file = std::fs::File::open(path)?;

        // SAFETY: The caller guarantees the file is not modified while mapped.
        // memmap2::Mmap provides a &[u8] view of the file contents.
        #[allow(unsafe_code)]
        let mmap = unsafe { memmap2::Mmap::map(&file) }?;

        let doc = DjVuDocument::parse(&mmap)?;
        Ok(Self { _mmap: mmap, doc })
    }

    /// Borrow the parsed [`DjVuDocument`].
    pub fn document(&self) -> &DjVuDocument {
        &self.doc
    }

    /// Number of pages in the document.
    pub fn page_count(&self) -> usize {
        self.document().page_count()
    }

    /// Access a page by 0-based index.
    ///
    /// # Errors
    ///
    /// Returns `DocError::PageOutOfRange` if `index` is not a valid page number.
    pub fn page(&self, index: usize) -> Result<&DjVuPage, DocError> {
        self.document().page(index)
    }
}
806
/// Lets a `MmapDocument` be used wherever a `&DjVuDocument` is expected.
#[cfg(feature = "mmap")]
impl core::ops::Deref for MmapDocument {
    type Target = DjVuDocument;

    /// Borrow the wrapped document.
    fn deref(&self) -> &Self::Target {
        &self.doc
    }
}
814
815// ---- Internal parsing helpers -----------------------------------------------
816
817/// Parse a `DjVuPage` from the chunks of a FORM:DJVU.
818///
819/// `shared_djbz` is the raw `Djbz` data from a referenced DJVI component
820/// (resolved from the page's INCL chunk by the caller); pass `None` if no
821/// shared dictionary is available.
822fn parse_page_from_chunks(
823    chunks: &[IffChunk<'_>],
824    index: usize,
825    shared_djbz: Option<Vec<u8>>,
826) -> Result<DjVuPage, DocError> {
827    let info_chunk = chunks
828        .iter()
829        .find(|c| &c.id == b"INFO")
830        .ok_or(DocError::MissingChunk("INFO"))?;
831
832    let info = PageInfo::parse(info_chunk.data)?;
833
834    // Copy all chunks to owned storage for lazy decode later.
835    let raw_chunks: Vec<RawChunk> = chunks
836        .iter()
837        .map(|c| RawChunk {
838            id: c.id,
839            data: c.data.to_vec(),
840        })
841        .collect();
842
843    Ok(DjVuPage {
844        info,
845        chunks: raw_chunks,
846        index,
847        shared_djbz,
848        #[cfg(feature = "std")]
849        bg44_decoded: std::sync::OnceLock::new(),
850    })
851}
852
853/// Parse sub-form chunks from the data portion of a FORM chunk.
854///
855/// The `data` bytes start with a 4-byte form type (e.g. `DJVU`), followed by
856/// sequential IFF chunks.
857fn parse_sub_form(data: &[u8]) -> Result<Vec<IffChunk<'_>>, DocError> {
858    if data.len() < 4 {
859        return Err(DocError::Malformed("sub-form data too short"));
860    }
861    // data[0..4] = form type (DJVU / DJVI / THUM …)
862    // data[4..] = sequential chunks
863    let body = data
864        .get(4..)
865        .ok_or(DocError::Malformed("sub-form body missing"))?;
866    let chunks = parse_iff_body_chunks(body)?;
867    Ok(chunks)
868}
869
870/// Parse sequential IFF chunks from a raw byte slice (no AT&T / FORM wrapper).
871fn parse_iff_body_chunks(mut buf: &[u8]) -> Result<Vec<IffChunk<'_>>, DocError> {
872    let mut chunks = Vec::new();
873
874    while buf.len() >= 8 {
875        let id: [u8; 4] = buf
876            .get(0..4)
877            .and_then(|s| s.try_into().ok())
878            .ok_or(IffError::Truncated)?;
879        let data_len = buf
880            .get(4..8)
881            .and_then(|b| b.try_into().ok())
882            .map(u32::from_be_bytes)
883            .map(|n| n as usize)
884            .ok_or(IffError::Truncated)?;
885
886        let data_start = 8usize;
887        let data_end = data_start
888            .checked_add(data_len)
889            .ok_or(IffError::Truncated)?;
890
891        if data_end > buf.len() {
892            return Err(DocError::Iff(IffError::ChunkTooLong {
893                id,
894                claimed: data_len as u32,
895                available: buf.len().saturating_sub(data_start),
896            }));
897        }
898
899        let chunk_data = buf.get(data_start..data_end).ok_or(IffError::Truncated)?;
900
901        // If this is a nested FORM, expose it as a FORM chunk with raw data
902        // (form_type + children) so callers can handle FORM:DJVU sub-forms.
903        chunks.push(IffChunk {
904            id,
905            data: chunk_data,
906        });
907
908        let padded_len = data_len + (data_len & 1);
909        let next = data_start
910            .checked_add(padded_len)
911            .ok_or(IffError::Truncated)?;
912        buf = buf.get(next.min(buf.len())..).ok_or(IffError::Truncated)?;
913    }
914
915    Ok(chunks)
916}
917
/// A DIRM component entry.
///
/// One entry is produced for each component listed in the DIRM directory of
/// a FORM:DJVM.
#[derive(Debug, Clone)]
struct DirmEntry {
    /// Kind of component, decoded from the low six bits of the entry's flag byte.
    comp_type: ComponentType,
    /// Component identifier: the first null-terminated string of the entry.
    id: String,
}
924
925/// Parse the DIRM chunk (directory of files in FORM:DJVM).
926///
927/// Returns `(entries, is_bundled)`.
928fn parse_dirm(data: &[u8]) -> Result<(Vec<DirmEntry>, bool), DocError> {
929    if data.len() < 3 {
930        return Err(DocError::Malformed("DIRM chunk too short"));
931    }
932
933    let dflags = *data.first().ok_or(DocError::Malformed("DIRM empty"))?;
934    let is_bundled = (dflags >> 7) != 0;
935    let nfiles = u16::from_be_bytes([
936        *data.get(1).ok_or(DocError::Malformed("DIRM too short"))?,
937        *data.get(2).ok_or(DocError::Malformed("DIRM too short"))?,
938    ]) as usize;
939
940    let mut pos = 3usize;
941
942    // Bundled documents embed 4-byte offsets (skipped; we rely on in-order FORM children).
943    if is_bundled {
944        let offsets_size = nfiles * 4;
945        pos = pos
946            .checked_add(offsets_size)
947            .ok_or(DocError::Malformed("DIRM offset arithmetic overflow"))?;
948        if pos > data.len() {
949            return Err(DocError::Malformed("DIRM offset table truncated"));
950        }
951    }
952
953    // Remaining bytes are BZZ-compressed metadata.
954    let bzz_data = data
955        .get(pos..)
956        .ok_or(DocError::Malformed("DIRM bzz data missing"))?;
957    let meta = bzz_decode(bzz_data)?;
958
959    // Layout: sizes(3 bytes × N), flags(1 byte × N), then null-terminated IDs…
960    let mut mpos = nfiles * 3; // skip per-component sizes
961
962    if mpos + nfiles > meta.len() {
963        return Err(DocError::Malformed("DIRM meta too short for flags"));
964    }
965    let flags: Vec<u8> = meta
966        .get(mpos..mpos + nfiles)
967        .ok_or(DocError::Malformed("DIRM flags truncated"))?
968        .to_vec();
969    mpos += nfiles;
970
971    let mut entries = Vec::with_capacity(nfiles);
972    for &flag in flags.iter().take(nfiles) {
973        let id = read_str_nt(&meta, &mut mpos)?;
974
975        // Optional name and title fields
976        if (flag & 0x80) != 0 {
977            let _ = read_str_nt(&meta, &mut mpos)?;
978        }
979        if (flag & 0x40) != 0 {
980            let _ = read_str_nt(&meta, &mut mpos)?;
981        }
982
983        let comp_type = match flag & 0x3f {
984            1 => ComponentType::Page,
985            2 => ComponentType::Thumbnail,
986            _ => ComponentType::Shared,
987        };
988
989        entries.push(DirmEntry { comp_type, id });
990    }
991
992    Ok((entries, is_bundled))
993}
994
995/// Read a null-terminated UTF-8 string from `data` at `*pos`, advancing `*pos`.
996fn read_str_nt(data: &[u8], pos: &mut usize) -> Result<String, DocError> {
997    let start = *pos;
998    while *pos < data.len() && *data.get(*pos).ok_or(DocError::Malformed("str read OOB"))? != 0 {
999        *pos += 1;
1000    }
1001    if *pos >= data.len() {
1002        return Err(DocError::Malformed(
1003            "null terminator missing in DIRM string",
1004        ));
1005    }
1006    let s = core::str::from_utf8(
1007        data.get(start..*pos)
1008            .ok_or(DocError::Malformed("str slice OOB"))?,
1009    )
1010    .map_err(|_| DocError::InvalidUtf8)?
1011    .to_string();
1012    *pos += 1; // consume null terminator
1013    Ok(s)
1014}
1015
1016/// Parse NAVM bookmarks from the chunk list of a FORM:DJVM.
1017///
1018/// Returns an empty Vec if there is no NAVM chunk.
1019fn parse_navm_bookmarks(chunks: &[IffChunk<'_>]) -> Result<Vec<DjVuBookmark>, DocError> {
1020    let navm_data = match chunks.iter().find(|c| &c.id == b"NAVM") {
1021        Some(c) => c.data,
1022        None => return Ok(vec![]),
1023    };
1024
1025    let decoded = bzz_decode(navm_data)?;
1026
1027    if decoded.len() < 2 {
1028        return Ok(vec![]);
1029    }
1030
1031    let b0 = *decoded
1032        .first()
1033        .ok_or(DocError::Malformed("NAVM total count byte 0"))?;
1034    let b1 = *decoded
1035        .get(1)
1036        .ok_or(DocError::Malformed("NAVM total count byte 1"))?;
1037    let total_count = u16::from_be_bytes([b0, b1]) as usize;
1038
1039    let mut pos = 2usize;
1040    let mut bookmarks = Vec::new();
1041    let mut decoded_count = 0usize;
1042
1043    while decoded_count < total_count {
1044        let bm = parse_bookmark_entry(&decoded, &mut pos, &mut decoded_count)?;
1045        bookmarks.push(bm);
1046    }
1047
1048    Ok(bookmarks)
1049}
1050
1051/// Recursively parse one bookmark entry and its children.
1052///
1053/// `total_counter` is a shared counter for ALL bookmark nodes across all recursion
1054/// levels, matching the DjVu NAVM format's flat total-count field.
1055fn parse_bookmark_entry(
1056    data: &[u8],
1057    pos: &mut usize,
1058    total_counter: &mut usize,
1059) -> Result<DjVuBookmark, DocError> {
1060    if *pos >= data.len() {
1061        return Err(DocError::Malformed("NAVM bookmark entry truncated"));
1062    }
1063
1064    // n_children is a single byte in the NAVM format
1065    let n_children = *data
1066        .get(*pos)
1067        .ok_or(DocError::Malformed("NAVM children count"))? as usize;
1068    *pos += 1;
1069
1070    let title = read_navm_str(data, pos)?;
1071    let url = read_navm_str(data, pos)?;
1072    *total_counter += 1;
1073
1074    // Children: fixed count, recurse with the same global total_counter
1075    let mut children = Vec::with_capacity(n_children);
1076    for _ in 0..n_children {
1077        let child = parse_bookmark_entry(data, pos, total_counter)?;
1078        children.push(child);
1079    }
1080
1081    Ok(DjVuBookmark {
1082        title,
1083        url,
1084        children,
1085    })
1086}
1087
1088/// Read a length-prefixed UTF-8 string from NAVM data.
1089///
1090/// Format: `[be_u24 length][utf8 bytes]`
1091fn read_navm_str(data: &[u8], pos: &mut usize) -> Result<String, DocError> {
1092    if *pos + 3 > data.len() {
1093        return Err(DocError::Malformed("NAVM string length truncated"));
1094    }
1095    let len = ((*data.get(*pos).ok_or(DocError::Malformed("NAVM str"))? as usize) << 16)
1096        | ((*data.get(*pos + 1).ok_or(DocError::Malformed("NAVM str"))? as usize) << 8)
1097        | (*data.get(*pos + 2).ok_or(DocError::Malformed("NAVM str"))? as usize);
1098    *pos += 3;
1099
1100    let bytes = data
1101        .get(*pos..*pos + len)
1102        .ok_or(DocError::Malformed("NAVM string bytes truncated"))?;
1103    *pos += len;
1104
1105    core::str::from_utf8(bytes)
1106        .map(|s| s.to_string())
1107        .map_err(|_| DocError::InvalidUtf8)
1108}
1109
1110// ---- Tests ------------------------------------------------------------------
1111
#[cfg(test)]
mod tests {
    use super::*;

    /// Directory holding the reference `.djvu` fixtures, resolved relative to
    /// the crate manifest directory.
    fn assets_path() -> std::path::PathBuf {
        std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("references/djvujs/library/assets")
    }

    // ---- Core document API: parsing, page access, bookmarks, resolver -------

    /// Single-page FORM:DJVU — basic parse, page count, dimensions, DPI.
    #[test]
    fn single_page_parse_and_metadata() {
        let data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse should succeed");

        assert_eq!(doc.page_count(), 1);
        let page = doc.page(0).expect("page 0 must exist");
        assert_eq!(page.width(), 181);
        assert_eq!(page.height(), 240);
        assert_eq!(page.dpi(), 100);
        assert!((page.gamma() - 2.2).abs() < 0.01, "gamma should be ~2.2");
    }

    /// Single-page document: page index out of range.
    #[test]
    fn single_page_out_of_range() {
        let data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse should succeed");
        let err = doc.page(1).expect_err("page 1 should be out of range");
        assert!(
            matches!(err, DocError::PageOutOfRange { index: 1, count: 1 }),
            "unexpected error: {err:?}"
        );
    }

    /// Single-page document: no thumbnails expected.
    #[test]
    fn single_page_no_thumbnail() {
        let data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse should succeed");
        let page = doc.page(0).expect("page 0 must exist");
        // Data is not decoded until thumbnail() is called — verify lazy contract
        let thumb = page.thumbnail().expect("thumbnail() should not error");
        assert!(
            thumb.is_none(),
            "single-page chicken.djvu has no TH44 chunks"
        );
    }

    /// Single-page: dimensions helper.
    #[test]
    fn single_page_dimensions() {
        let data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse should succeed");
        let page = doc.page(0).unwrap();
        assert_eq!(page.dimensions(), (181, 240));
    }

    /// Bundled multi-page FORM:DJVM — page count and DIRM parsing.
    #[test]
    fn multipage_bundled_page_count() {
        let data = std::fs::read(assets_path().join("DjVu3Spec_bundled.djvu"))
            .expect("DjVu3Spec_bundled.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("bundled parse should succeed");
        // The bundled spec document has many pages — just check > 1
        assert!(
            doc.page_count() > 1,
            "bundled document should have more than 1 page, got {}",
            doc.page_count()
        );
    }

    /// Bundled multi-page: the first page should have valid metadata.
    #[test]
    fn multipage_bundled_page_metadata() {
        let data = std::fs::read(assets_path().join("DjVu3Spec_bundled.djvu"))
            .expect("DjVu3Spec_bundled.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("bundled parse should succeed");

        let page0 = doc.page(0).expect("page 0 must exist");
        assert!(page0.width() > 0, "page width must be non-zero");
        assert!(page0.height() > 0, "page height must be non-zero");
        assert!(page0.dpi() > 0, "page dpi must be non-zero");
    }

    /// NAVM bookmarks from a document that contains them.
    #[test]
    fn navm_bookmarks_present() {
        let data =
            std::fs::read(assets_path().join("navm_fgbz.djvu")).expect("navm_fgbz.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse should succeed");
        // navm_fgbz.djvu has NAVM chunk — should return at least one bookmark
        let bm = doc.bookmarks();
        assert!(
            !bm.is_empty(),
            "navm_fgbz.djvu should have at least one bookmark"
        );
    }

    /// Documents without NAVM should return empty bookmark list.
    #[test]
    fn no_navm_returns_empty_bookmarks() {
        let data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse should succeed");
        assert!(
            doc.bookmarks().is_empty(),
            "chicken.djvu has no NAVM — bookmarks should be empty"
        );
    }

    /// Indirect document: parse with resolver callback.
    ///
    /// We simulate an indirect document by constructing a DJVM DIRM that marks
    /// entries as non-bundled and supplying a resolver that returns the bytes of
    /// the real chicken.djvu page.
    #[test]
    fn indirect_document_with_resolver() {
        // Load chicken.djvu — we'll use it as the "resolved" page.
        let chicken_data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        // Build a minimal indirect DJVM document referencing "chicken.djvu"
        let djvm_data = build_indirect_djvm_bytes("chicken.djvu");

        // Resolver that only knows the single component name used above.
        let resolver = |name: &str| -> Result<Vec<u8>, DocError> {
            if name == "chicken.djvu" {
                Ok(chicken_data.clone())
            } else {
                Err(DocError::IndirectResolve(name.to_string()))
            }
        };

        let doc = DjVuDocument::parse_with_resolver(&djvm_data, Some(resolver))
            .expect("indirect parse should succeed");

        assert_eq!(doc.page_count(), 1);
        let page = doc.page(0).unwrap();
        assert_eq!(page.width(), 181);
        assert_eq!(page.height(), 240);
    }

    /// Indirect document without resolver must return NoResolver error.
    #[test]
    fn indirect_document_no_resolver_returns_error() {
        let djvm_data = build_indirect_djvm_bytes("chicken.djvu");
        let err = DjVuDocument::parse(&djvm_data).expect_err("should fail without resolver");
        assert!(
            matches!(err, DocError::NoResolver),
            "expected NoResolver, got {err:?}"
        );
    }

    /// Page must not decode image data before thumbnail() is called.
    ///
    /// We verify laziness by confirming that constructing the document and
    /// accessing `page()` without calling `thumbnail()` does not involve
    /// any IW44 decoder side-effects.  We test this by calling thumbnail()
    /// on a page with no TH44 chunks and verifying we get Ok(None).
    #[test]
    fn page_is_lazy_no_decode_before_thumbnail() {
        let data =
            std::fs::read(assets_path().join("boy_jb2.djvu")).expect("boy_jb2.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse should succeed");
        let page = doc.page(0).expect("page 0 must exist");

        // page.chunks should be populated but no decoding has happened
        assert!(!page.chunks.is_empty(), "chunks must be stored (lazy)");

        // thumbnail() triggers decode — but there's no TH44 chunk in boy_jb2.djvu
        let thumb = page.thumbnail().expect("thumbnail() should not error");
        assert!(thumb.is_none());
    }

    /// Non-DjVu file returns NotDjVu error.
    #[test]
    fn not_djvu_returns_error() {
        // Construct a valid IFF with a non-DjVu form type
        let mut data = Vec::new();
        data.extend_from_slice(b"AT&T");
        data.extend_from_slice(b"FORM");
        data.extend_from_slice(&8u32.to_be_bytes());
        data.extend_from_slice(b"XXXXXXXX"); // form_type = XXXX + 4 dummy bytes
        let err = DjVuDocument::parse(&data).expect_err("should fail");
        // Either error is accepted: the rejection may come from the IFF layer
        // or from the form-type check.
        assert!(
            matches!(err, DocError::NotDjVu(_) | DocError::Iff(_)),
            "expected NotDjVu or Iff error, got {err:?}"
        );
    }

    // ---- Helpers: build minimal DJVM documents for indirect tests -----------

    /// Build a minimal indirect FORM:DJVM with 1 page component named "chicken.djvu".
    ///
    /// DIRM format: flags=0x00 (not bundled), nfiles=1, followed by BZZ-compressed
    /// metadata. The BZZ bytes below were pre-computed using the reference `bzz -e`
    /// tool encoding the metadata:
    ///   `\x00\x00\x00` (size, 3 bytes) + `\x01` (Page flag) + `chicken.djvu\x00`
    ///
    /// NOTE: `_page_name` is not used. The component name is baked into the
    /// pre-computed BZZ bytes, so this helper always describes "chicken.djvu"
    /// regardless of the argument.
    fn build_indirect_djvm_bytes(_page_name: &str) -> Vec<u8> {
        // BZZ-encoded DIRM metadata for 1 Page component named "chicken.djvu".
        // Generated with: printf '\x00\x00\x00\x01chicken.djvu\x00' | bzz -e - file.bzz
        // Verified to decode back to the original 17-byte meta block.
        let bzz_meta: &[u8] = &[
            0xff, 0xff, 0xed, 0xbf, 0x8a, 0x1f, 0xbe, 0xad, 0x14, 0x57, 0x10, 0xc9, 0x63, 0x19,
            0x11, 0xf0, 0x85, 0x28, 0x12, 0x8a, 0xbf,
        ];

        let mut dirm_data = Vec::new();
        dirm_data.push(0x00); // flags: not bundled (is_bundled bit = 0)
        dirm_data.push(0x00); // nfiles high byte
        dirm_data.push(0x01); // nfiles low byte = 1
        dirm_data.extend_from_slice(bzz_meta);

        build_djvm_with_dirm(&dirm_data)
    }

    /// Wrap a raw DIRM payload in a minimal `AT&T` + `FORM:DJVM` file whose only
    /// child chunk is that DIRM (padded to an even length).
    fn build_djvm_with_dirm(dirm_data: &[u8]) -> Vec<u8> {
        // DIRM chunk
        let mut dirm_chunk = Vec::new();
        dirm_chunk.extend_from_slice(b"DIRM");
        dirm_chunk.extend_from_slice(&(dirm_data.len() as u32).to_be_bytes());
        dirm_chunk.extend_from_slice(dirm_data);
        if !dirm_data.len().is_multiple_of(2) {
            dirm_chunk.push(0); // pad to even
        }

        // FORM:DJVM body
        let mut form_body = Vec::new();
        form_body.extend_from_slice(b"DJVM");
        form_body.extend_from_slice(&dirm_chunk);

        // Full file
        let mut file = Vec::new();
        file.extend_from_slice(b"AT&T");
        file.extend_from_slice(b"FORM");
        file.extend_from_slice(&(form_body.len() as u32).to_be_bytes());
        file.extend_from_slice(&form_body);
        file
    }

    // ── raw chunk API (Issue #43) ────────────────────────────────────────────

    /// `DjVuPage::raw_chunk` returns bytes for known chunk types.
    #[test]
    fn page_raw_chunk_info_present() {
        let data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse must succeed");
        let page = doc.page(0).expect("page 0 must exist");

        // INFO chunk must be present
        let info = page.raw_chunk(b"INFO").expect("INFO chunk must be present");
        assert_eq!(info.len(), 10, "INFO chunk is always 10 bytes");
    }

    /// `DjVuPage::raw_chunk` returns None for absent chunk types.
    #[test]
    fn page_raw_chunk_absent() {
        let data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse must succeed");
        let page = doc.page(0).expect("page 0 must exist");

        assert!(
            page.raw_chunk(b"XXXX").is_none(),
            "unknown chunk type must return None"
        );
    }

    /// `DjVuPage::all_chunks` returns every BG44 chunk of a page.
    #[test]
    fn page_all_chunks_bg44_multiple() {
        // big-scanned-page.djvu has 4 progressive BG44 chunks
        let data = std::fs::read(
            std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
                .join("tests/fixtures/big-scanned-page.djvu"),
        )
        .expect("big-scanned-page.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse must succeed");
        let page = doc.page(0).expect("page 0 must exist");

        let bg44 = page.all_chunks(b"BG44");
        assert!(
            bg44.len() >= 2,
            "colour page must have ≥2 BG44 chunks, got {}",
            bg44.len()
        );

        // Chunks must be non-empty
        for (i, chunk) in bg44.iter().enumerate() {
            assert!(!chunk.is_empty(), "BG44 chunk {i} must not be empty");
        }
    }

    /// `DjVuPage::chunk_ids` includes the INFO chunk id.
    #[test]
    fn page_chunk_ids_includes_info() {
        let data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse must succeed");
        let page = doc.page(0).expect("page 0 must exist");

        let ids = page.chunk_ids();
        assert!(!ids.is_empty(), "chunk_ids must not be empty");
        assert!(
            ids.contains(b"INFO"),
            "chunk_ids must include INFO, got: {:?}",
            ids.iter()
                .map(|id| std::str::from_utf8(id).unwrap_or("????"))
                .collect::<Vec<_>>()
        );
    }

    /// `DjVuDocument::raw_chunk` works for single-page DJVU files.
    #[test]
    fn document_raw_chunk_single_page() {
        let data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse must succeed");

        // Single-page DJVU exposes all top-level chunks at document level too
        let info = doc
            .raw_chunk(b"INFO")
            .expect("document must expose INFO chunk");
        assert_eq!(info.len(), 10);
    }

    // ── DJVI shared dictionary / INCL chunks (Issue #45) ────────────────────

    /// DjVu3Spec_bundled.djvu has shared DJVI symbol dictionaries.
    /// Parsing must succeed and pages with INCL references must carry the dict.
    #[test]
    fn djvi_shared_dict_parsed_from_bundled_djvm() {
        let path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("tests/fixtures/DjVu3Spec_bundled.djvu");
        let data = std::fs::read(&path).expect("DjVu3Spec_bundled.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse must succeed");

        assert!(doc.page_count() > 0, "document must have pages");

        // At least one page should have a shared dict loaded (shared_djbz Some)
        let pages_with_dict = doc.pages.iter().filter(|p| p.shared_djbz.is_some()).count();
        assert!(
            pages_with_dict > 0,
            "at least one page must have a resolved shared DJVI dict"
        );
    }

    /// Pages with INCL references must render their mask without error.
    #[test]
    fn djvi_incl_page_mask_renders_ok() {
        let path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("tests/fixtures/DjVu3Spec_bundled.djvu");
        let data = std::fs::read(&path).expect("DjVu3Spec_bundled.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse must succeed");

        // Find first page with a shared dict and render its mask
        let page = doc
            .pages
            .iter()
            .find(|p| p.shared_djbz.is_some())
            .expect("at least one page must have a shared dict");

        let mask = page
            .extract_mask()
            .expect("extract_mask must succeed for INCL page");
        assert!(mask.is_some(), "INCL page must have a JB2 mask");
        let bm = mask.unwrap();
        assert!(
            bm.width > 0 && bm.height > 0,
            "mask must have non-zero dimensions"
        );
    }

    /// Pages without INCL still render correctly (no regression).
    #[test]
    fn no_regression_non_incl_pages() {
        // boy_jb2.djvu has a Sjbz mask and no INCL reference
        let data = std::fs::read(
            std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
                .join("tests/fixtures/boy_jb2.djvu"),
        )
        .expect("boy_jb2.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse must succeed");
        let page = doc.page(0).expect("page 0 must exist");
        assert!(
            page.shared_djbz.is_none(),
            "single-page DJVU has no shared dict"
        );
        let mask = page.extract_mask().expect("extract_mask must succeed");
        assert!(mask.is_some(), "boy_jb2.djvu page must have a JB2 mask");
    }

    /// Round-trip: bytes from `raw_chunk` re-parse to the same metadata.
    #[test]
    fn page_raw_chunk_info_roundtrip() {
        let data =
            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
        let doc = DjVuDocument::parse(&data).expect("parse must succeed");
        let page = doc.page(0).expect("page 0 must exist");

        let raw_info = page.raw_chunk(b"INFO").expect("INFO chunk must be present");
        let reparsed = crate::info::PageInfo::parse(raw_info).expect("re-parse must succeed");
        assert_eq!(reparsed.width, page.width() as u16);
        assert_eq!(reparsed.height, page.height() as u16);
        assert_eq!(reparsed.dpi, page.dpi());
    }

    /// MmapDocument opens a file and reports the same page count and per-page
    /// width, height and DPI as an in-memory parse of the same file.
    #[test]
    #[cfg(feature = "mmap")]
    fn mmap_document_matches_parse() {
        let path = assets_path().join("chicken.djvu");
        let mmap_doc = MmapDocument::open(&path).expect("mmap open should succeed");
        let data = std::fs::read(&path).expect("read should succeed");
        let mem_doc = DjVuDocument::parse(&data).expect("parse should succeed");

        assert_eq!(mmap_doc.page_count(), mem_doc.page_count());
        for i in 0..mmap_doc.page_count() {
            let mp = mmap_doc.page(i).unwrap();
            let pp = mem_doc.page(i).unwrap();
            assert_eq!(mp.width(), pp.width());
            assert_eq!(mp.height(), pp.height());
            assert_eq!(mp.dpi(), pp.dpi());
        }
    }
}
djvu_rs/djvu_document.rs

djvu_rs/
djvu_document.rs