djvu_rs/
djvu_document.rs

1//! New document model for DjVu files — phase 3.
2//!
3//! This module provides the high-level `DjVuDocument` API built on top of the
4//! clean-room IFF parser (phase 1), BZZ decompressor (phase 2a), and IW44 decoder
5//! (phase 2c).
6//!
7//! ## Key public types
8//!
9//! - [`DjVuDocument`] — opened DjVu document (single-page or multi-page)
10//! - [`DjVuPage`] — lazy page handle (raw chunks stored until `thumbnail()` is called)
11//! - [`DjVuBookmark`] — table-of-contents entry from the NAVM chunk
12//! - [`DocError`] — typed errors for this module
13//!
14//! ## Document kinds
15//!
16//! - **FORM:DJVU** — single-page document
17//! - **FORM:DJVM + DIRM** — bundled multi-page document with an in-file page index
18//! - **FORM:DJVM + DIRM (indirect)** — pages live in separate files; a resolver
19//!   callback `fn(name: &str) -> Result<Vec<u8>, DocError>` is required
20//!
21//! ## Lazy decoding contract
22//!
23//! `DjVuPage` stores only the raw chunk bytes. No image decoding happens until
24//! the caller explicitly calls `thumbnail()` (which invokes the IW44 decoder).
25
26#[cfg(not(feature = "std"))]
27use alloc::{
28    format,
29    string::{String, ToString},
30    vec,
31    vec::Vec,
32};
33
34use crate::{
35    annotation::{Annotation, AnnotationError, MapArea},
36    bzz_new::bzz_decode,
37    error::{BzzError, IffError, Iw44Error, Jb2Error},
38    iff::{IffChunk, parse_form},
39    info::PageInfo,
40    iw44_new::Iw44Image,
41    jb2::Jb2Dict,
42    metadata::{DjVuMetadata, MetadataError},
43    pixmap::Pixmap,
44    text::{TextError, TextLayer},
45};
46
47#[cfg(feature = "std")]
48use std::sync::Arc;
49
50// ---- Error type -------------------------------------------------------------
51
52/// Errors that can occur when working with the DjVuDocument API.
53#[derive(Debug, thiserror::Error)]
54pub enum DocError {
55    /// IFF container parse error.
56    #[error("IFF error: {0}")]
57    Iff(#[from] IffError),
58
59    /// BZZ decompression error.
60    #[error("BZZ error: {0}")]
61    Bzz(#[from] BzzError),
62
63    /// IW44 wavelet decoding error.
64    #[error("IW44 error: {0}")]
65    Iw44(#[from] Iw44Error),
66
67    /// JB2 bilevel image decoding error.
68    #[error("JB2 error: {0}")]
69    Jb2(#[from] Jb2Error),
70
71    /// The file is not a supported DjVu format.
72    #[error("not a DjVu file: found form type {0:?}")]
73    NotDjVu([u8; 4]),
74
75    /// A required chunk is missing.
76    #[error("missing required chunk: {0}")]
77    MissingChunk(&'static str),
78
79    /// The document is malformed (description included).
80    #[error("malformed DjVu document: {0}")]
81    Malformed(&'static str),
82
83    /// An indirect page reference could not be resolved.
84    #[error("failed to resolve indirect page '{0}'")]
85    IndirectResolve(String),
86
87    /// Page index is out of range.
88    #[error("page index {index} is out of range (document has {count} pages)")]
89    PageOutOfRange { index: usize, count: usize },
90
91    /// Invalid UTF-8 in a string field.
92    #[error("invalid UTF-8 in DjVu metadata")]
93    InvalidUtf8,
94
95    /// The resolver callback is required for indirect documents but was not provided.
96    #[error("indirect DjVu document requires a resolver callback")]
97    NoResolver,
98
99    /// I/O error when reading file data (only with `std` feature).
100    #[cfg(feature = "std")]
101    #[error("I/O error: {0}")]
102    Io(#[from] std::io::Error),
103
104    /// G4/MMR mask decoding error.
105    #[error("Smmr decode error: {0}")]
106    Smmr(String),
107
108    /// Text layer parse error.
109    #[error("text layer error: {0}")]
110    Text(#[from] TextError),
111
112    /// Annotation parse error.
113    #[error("annotation error: {0}")]
114    Annotation(#[from] AnnotationError),
115
116    /// Metadata parse error.
117    #[error("metadata error: {0}")]
118    Metadata(#[from] MetadataError),
119}
120
121// ---- Bookmark ---------------------------------------------------------------
122
/// One node of the document outline (table of contents) read from the NAVM
/// chunk.
///
/// Bookmarks form a tree: every entry owns its nested entries in
/// [`DjVuBookmark::children`].
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct DjVuBookmark {
    /// Title shown to the user for this entry.
    pub title: String,
    /// Link target, in DjVu's internal URL format.
    pub url: String,
    /// Entries nested directly beneath this one (empty for a leaf).
    pub children: Vec<DjVuBookmark>,
}
134
135// ---- Page -------------------------------------------------------------------
136
/// Kind of a component listed in the DIRM directory.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ComponentType {
    /// Shared component; these are the entries searched for a `Djbz` shared
    /// symbol dictionary when a bundled document is parsed.
    Shared,
    /// A page component.
    Page,
    /// A thumbnail component.
    Thumbnail,
}
144
/// An undecoded chunk copied out of a FORM: its 4-byte ID plus an owned copy
/// of its payload bytes.
#[derive(Debug, Clone)]
struct RawChunk {
    /// 4-byte chunk identifier (e.g. `b"BG44"`).
    id: [u8; 4],
    /// Chunk payload, exactly as stored in the file.
    data: Vec<u8>,
}
151
152/// A lazy DjVu page handle.
153///
154/// Raw chunk data is stored on construction. No image decoding is performed
155/// until the caller invokes `thumbnail()` or a render function.
156///
157/// The fully decoded BG44 wavelet image is cached after the first render so
158/// that subsequent renders skip the expensive ZP arithmetic decode and only
159/// run the wavelet inverse-transform and compositor.
160pub struct DjVuPage {
161    /// Page info parsed from the INFO chunk.
162    info: PageInfo,
163    /// All raw chunks from this page's FORM:DJVU, in order.
164    chunks: Vec<RawChunk>,
165    /// Page index within the document (0-based).
166    index: usize,
167    /// Raw Djbz data from the DJVI shared dictionary component referenced via
168    /// the page's INCL chunk, if present.  Stored here so that `extract_mask`
169    /// can decode it without access to the parent document.
170    ///
171    /// Wrapped in `Arc` so that multi-page documents share one allocation
172    /// instead of cloning the bytes per page.
173    #[cfg(feature = "std")]
174    shared_djbz: Option<Arc<Vec<u8>>>,
175    #[cfg(not(feature = "std"))]
176    shared_djbz: Option<Vec<u8>>,
177    /// Lazily decoded BG44 background wavelet image (all chunks).  Used for
178    /// full-resolution and half-resolution renders.  Populated on first use.
179    /// Only available when the `std` feature is enabled (`OnceLock` requires std).
180    #[cfg(feature = "std")]
181    bg44_decoded: std::sync::OnceLock<Option<Iw44Image>>,
182    /// Lazily decoded BG44 background wavelet image from the first chunk only.
183    /// Used for sub=4 and sub=8 downscaled renders to avoid decoding the
184    /// high-frequency refinement chunks whose detail is invisible at 1/4 scale.
185    #[cfg(feature = "std")]
186    bg44_decoded_partial: std::sync::OnceLock<Option<Iw44Image>>,
187    /// Lazily decoded JB2 foreground mask (Sjbz chunk → full-resolution Bitmap).
188    /// Populated on the first call to `decoded_mask()`.  Subsequent renders at
189    /// any scale reuse the same Bitmap via the compositor's coordinate division.
190    #[cfg(feature = "std")]
191    mask_decoded: std::sync::OnceLock<Option<crate::bitmap::Bitmap>>,
192    /// Lazily computed 1/4-resolution downsampled mask.  Populated on first call
193    /// to `decoded_mask_sub4()`.  Used by the compositor for sub=4 renders so
194    /// that each output pixel needs only one bit lookup instead of 4–9.
195    #[cfg(feature = "std")]
196    mask_decoded_sub4: std::sync::OnceLock<Option<crate::bitmap::Bitmap>>,
197    /// Lazily decoded FG44 foreground color image (all FG44 chunks → Pixmap).
198    /// Populated on the first call to `decoded_fg44()`.
199    #[cfg(feature = "std")]
200    fg44_decoded: std::sync::OnceLock<Option<Pixmap>>,
201    /// Lazily decoded JB2 shared dictionary.  Populated on first use by
202    /// `decoded_shared_dict()` and reused on subsequent renders, avoiding
203    /// repeated multi-megabyte allocations.
204    #[cfg(feature = "std")]
205    jb2_dict_decoded: std::sync::OnceLock<Option<Jb2Dict>>,
206}
207
208impl Clone for DjVuPage {
209    fn clone(&self) -> Self {
210        DjVuPage {
211            info: self.info.clone(),
212            chunks: self.chunks.clone(),
213            index: self.index,
214            shared_djbz: self.shared_djbz.clone(),
215            // Caches are not cloned — they will be lazily recomputed.
216            #[cfg(feature = "std")]
217            bg44_decoded: std::sync::OnceLock::new(),
218            #[cfg(feature = "std")]
219            bg44_decoded_partial: std::sync::OnceLock::new(),
220            #[cfg(feature = "std")]
221            mask_decoded: std::sync::OnceLock::new(),
222            #[cfg(feature = "std")]
223            mask_decoded_sub4: std::sync::OnceLock::new(),
224            #[cfg(feature = "std")]
225            fg44_decoded: std::sync::OnceLock::new(),
226            #[cfg(feature = "std")]
227            jb2_dict_decoded: std::sync::OnceLock::new(),
228        }
229    }
230}
231
232impl core::fmt::Debug for DjVuPage {
233    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
234        f.debug_struct("DjVuPage")
235            .field("info", &self.info)
236            .field("chunks", &self.chunks)
237            .field("index", &self.index)
238            .field("shared_djbz", &self.shared_djbz.as_ref().map(|v| v.len()))
239            .finish_non_exhaustive()
240    }
241}
242
243impl DjVuPage {
244    /// Page width in pixels.
245    pub fn width(&self) -> u16 {
246        self.info.width
247    }
248
249    /// Page height in pixels.
250    pub fn height(&self) -> u16 {
251        self.info.height
252    }
253
254    /// Page resolution in dots per inch.
255    pub fn dpi(&self) -> u16 {
256        self.info.dpi
257    }
258
259    /// Display gamma from the INFO chunk.
260    pub fn gamma(&self) -> f32 {
261        self.info.gamma
262    }
263
264    /// Page rotation from the INFO chunk.
265    pub fn rotation(&self) -> crate::info::Rotation {
266        self.info.rotation
267    }
268
269    /// 0-based page index within the document.
270    pub fn index(&self) -> usize {
271        self.index
272    }
273
274    /// Dimensions as `(width, height)`.
275    pub fn dimensions(&self) -> (u16, u16) {
276        (self.info.width, self.info.height)
277    }
278
279    /// Decode the thumbnail for this page from TH44 chunks, if present.
280    ///
281    /// No image data is decoded until this method is called (lazy contract).
282    ///
283    /// Returns `Ok(None)` if the page has no TH44 thumbnail.
284    pub fn thumbnail(&self) -> Result<Option<Pixmap>, DocError> {
285        let th44_chunks: Vec<&[u8]> = self
286            .chunks
287            .iter()
288            .filter(|c| &c.id == b"TH44")
289            .map(|c| c.data.as_slice())
290            .collect();
291
292        if th44_chunks.is_empty() {
293            return Ok(None);
294        }
295
296        let mut img = Iw44Image::new();
297        for chunk_data in &th44_chunks {
298            img.decode_chunk(chunk_data)?;
299        }
300        let pixmap = img.to_rgb()?;
301        Ok(Some(pixmap))
302    }
303
304    /// Return the raw bytes of the first chunk with the given 4-byte ID.
305    ///
306    /// Returns `None` if no chunk with that ID exists.  The returned slice
307    /// points into the owned chunk storage — zero copy.
308    ///
309    /// # Example
310    ///
311    /// ```ignore
312    /// let sjbz = page.raw_chunk(b"Sjbz").expect("page must have a JB2 chunk");
313    /// ```
314    pub fn raw_chunk(&self, id: &[u8; 4]) -> Option<&[u8]> {
315        self.chunks
316            .iter()
317            .find(|c| &c.id == id)
318            .map(|c| c.data.as_slice())
319    }
320
321    /// Return the raw bytes of all chunks with the given 4-byte ID, in order.
322    ///
323    /// Returns an empty `Vec` if no such chunk exists.
324    ///
325    /// # Example
326    ///
327    /// ```ignore
328    /// let bg44_chunks = page.all_chunks(b"BG44");
329    /// assert!(!bg44_chunks.is_empty(), "colour page must have BG44 data");
330    /// ```
331    pub fn all_chunks(&self, id: &[u8; 4]) -> Vec<&[u8]> {
332        self.chunks
333            .iter()
334            .filter(|c| &c.id == id)
335            .map(|c| c.data.as_slice())
336            .collect()
337    }
338
339    /// Return the IDs of all chunks present on this page, in order.
340    ///
341    /// Duplicate IDs appear multiple times (once per chunk).
342    pub fn chunk_ids(&self) -> Vec<[u8; 4]> {
343        self.chunks.iter().map(|c| c.id).collect()
344    }
345
346    /// Find the first chunk with the given 4-byte ID.
347    ///
348    /// Equivalent to [`Self::raw_chunk`]; kept for internal use.
349    pub fn find_chunk(&self, id: &[u8; 4]) -> Option<&[u8]> {
350        self.raw_chunk(id)
351    }
352
353    /// Find all chunks with the given 4-byte ID.
354    ///
355    /// Equivalent to [`Self::all_chunks`]; kept for internal use.
356    pub fn find_chunks(&self, id: &[u8; 4]) -> Vec<&[u8]> {
357        self.all_chunks(id)
358    }
359
360    /// Return all BG44 background chunk data slices, in order.
361    pub fn bg44_chunks(&self) -> Vec<&[u8]> {
362        self.find_chunks(b"BG44")
363    }
364
365    /// Return the fully decoded BG44 wavelet image, decoding and caching on first call.
366    ///
367    /// Returns `None` if the page has no BG44 chunks.  On decode error the error
368    /// is swallowed and `None` is returned (same semantics as the permissive render
369    /// path), so this method is infallible.
370    ///
371    /// The result is computed once (all ZP arithmetic decode + block assembly) and
372    /// then cached inside the page.  Subsequent calls return the cached value
373    /// immediately.  The wavelet inverse-transform and YCbCr→RGB conversion are
374    /// **not** cached; they are applied at each render at the appropriate subsample
375    /// level via [`Iw44Image::to_rgb_subsample`].
376    #[cfg(feature = "std")]
377    pub fn decoded_bg44(&self) -> Option<&Iw44Image> {
378        self.bg44_decoded
379            .get_or_init(|| {
380                let chunks = self.bg44_chunks();
381                if chunks.is_empty() {
382                    return None;
383                }
384                let mut img = Iw44Image::new();
385                for chunk_data in &chunks {
386                    if img.decode_chunk(chunk_data).is_err() {
387                        break;
388                    }
389                }
390                if img.width == 0 { None } else { Some(img) }
391            })
392            .as_ref()
393    }
394
395    #[cfg(not(feature = "std"))]
396    pub fn decoded_bg44(&self) -> Option<&Iw44Image> {
397        None
398    }
399
400    /// Return a partially-decoded BG44 background image, decoding and caching
401    /// on first call.  Only the first BG44 chunk is decoded — subsequent
402    /// refinement chunks are skipped.  This gives roughly 4× lower ZP decode
403    /// cost at the expense of coarser quantization, which is imperceptible at
404    /// sub=4 (quarter-resolution) or sub=8 output.
405    ///
406    /// Use this instead of [`Self::decoded_bg44`] when `subsample >= 4`.
407    #[cfg(feature = "std")]
408    pub fn decoded_bg44_partial(&self) -> Option<&Iw44Image> {
409        self.bg44_decoded_partial
410            .get_or_init(|| {
411                let chunks = self.bg44_chunks();
412                if chunks.is_empty() {
413                    return None;
414                }
415                let mut img = Iw44Image::new();
416                // Decode only the first chunk; skip high-frequency refinement.
417                if img.decode_chunk(chunks[0]).is_err() {
418                    return None;
419                }
420                if img.width == 0 { None } else { Some(img) }
421            })
422            .as_ref()
423    }
424
425    #[cfg(not(feature = "std"))]
426    pub fn decoded_bg44_partial(&self) -> Option<&Iw44Image> {
427        None
428    }
429
430    /// Return the decoded JB2 shared dictionary, decoding and caching on first call.
431    ///
432    /// Returns `None` if the page has no shared dictionary (no INCL reference).
433    /// The result is computed once and then cached so that repeated renders
434    /// do not re-decode the dictionary each time.
435    #[cfg(feature = "std")]
436    pub fn decoded_shared_dict(&self) -> Option<&Jb2Dict> {
437        self.jb2_dict_decoded
438            .get_or_init(|| {
439                let djbz = self.shared_djbz.as_deref()?;
440                crate::jb2::decode_dict(djbz, None).ok()
441            })
442            .as_ref()
443    }
444
445    #[cfg(not(feature = "std"))]
446    pub fn decoded_shared_dict(&self) -> Option<&Jb2Dict> {
447        None
448    }
449
450    /// Return all FG44 foreground chunk data slices, in order.
451    pub fn fg44_chunks(&self) -> Vec<&[u8]> {
452        self.find_chunks(b"FG44")
453    }
454
455    /// Extract the text layer from TXTz (BZZ-compressed) or TXTa (plain) chunks.
456    ///
457    /// Returns `Ok(None)` if the page has no text layer.
458    pub fn text_layer(&self) -> Result<Option<TextLayer>, DocError> {
459        let page_height = self.info.height as u32;
460
461        if let Some(txtz) = self.find_chunk(b"TXTz") {
462            if txtz.is_empty() {
463                return Ok(None);
464            }
465            let layer = crate::text::parse_text_layer_bzz(txtz, page_height)?;
466            return Ok(Some(layer));
467        }
468
469        if let Some(txta) = self.find_chunk(b"TXTa") {
470            if txta.is_empty() {
471                return Ok(None);
472            }
473            let layer = crate::text::parse_text_layer(txta, page_height)?;
474            return Ok(Some(layer));
475        }
476
477        Ok(None)
478    }
479
480    /// Parse the text layer and transform all zone rectangles to match a
481    /// rendered page of size `render_w × render_h`.
482    ///
483    /// This is a convenience wrapper around [`Self::text_layer`] followed by
484    /// [`TextLayer::transform`].  It applies the page's own rotation (from the
485    /// INFO chunk) and scales coordinates proportionally to the requested
486    /// render size, so callers can use the returned rects directly for text
487    /// selection / copy-paste overlays without any additional maths.
488    ///
489    /// Returns `Ok(None)` if the page has no text layer.
490    pub fn text_layer_at_size(
491        &self,
492        render_w: u32,
493        render_h: u32,
494    ) -> Result<Option<TextLayer>, DocError> {
495        let page_w = self.info.width as u32;
496        let page_h = self.info.height as u32;
497        let rotation = self.info.rotation;
498        Ok(self
499            .text_layer()?
500            .map(|tl| tl.transform(page_w, page_h, rotation, render_w, render_h)))
501    }
502
503    /// Extract the plain text content of the page (convenience wrapper).
504    ///
505    /// Returns `Ok(None)` if the page has no text layer.
506    pub fn text(&self) -> Result<Option<String>, DocError> {
507        Ok(self.text_layer()?.map(|tl| tl.text))
508    }
509
510    /// Parse the annotation layer from ANTz (BZZ-compressed) or ANTa (plain) chunks.
511    ///
512    /// Returns `Ok(None)` if the page has no annotation chunk.
513    pub fn annotations(&self) -> Result<Option<(Annotation, Vec<MapArea>)>, DocError> {
514        if let Some(antz) = self.find_chunk(b"ANTz") {
515            if antz.is_empty() {
516                return Ok(None);
517            }
518            let result = crate::annotation::parse_annotations_bzz(antz)?;
519            return Ok(Some(result));
520        }
521
522        if let Some(anta) = self.find_chunk(b"ANTa") {
523            if anta.is_empty() {
524                return Ok(None);
525            }
526            let result = crate::annotation::parse_annotations(anta)?;
527            return Ok(Some(result));
528        }
529
530        Ok(None)
531    }
532
533    /// Return all hyperlinks (MapAreas with a non-empty URL) on this page.
534    pub fn hyperlinks(&self) -> Result<Vec<MapArea>, DocError> {
535        match self.annotations()? {
536            None => Ok(Vec::new()),
537            Some((_, mapareas)) => Ok(mapareas.into_iter().filter(|m| !m.url.is_empty()).collect()),
538        }
539    }
540
541    /// Decode the JB2 foreground mask as a 1-bit [`Bitmap`](crate::bitmap::Bitmap).
542    ///
543    /// Returns `Ok(None)` if the page has no Sjbz (JB2 mask) chunk.
544    /// Decode the foreground mask layer.
545    ///
546    /// Handles both JB2 (`Sjbz`) and G4/MMR (`Smmr`) encoded masks.
547    /// Returns `Ok(None)` if the page has neither chunk.
548    pub fn extract_mask(&self) -> Result<Option<crate::bitmap::Bitmap>, DocError> {
549        if let Some(sjbz) = self.find_chunk(b"Sjbz") {
550            // Prefer an inline Djbz chunk (decoded fresh — rare, usually small).
551            // Otherwise use the cached shared dictionary to avoid repeated multi-MB
552            // allocations on every render.
553            let inline_dict;
554            let dict_ref = if let Some(djbz) = self.find_chunk(b"Djbz") {
555                inline_dict = crate::jb2::decode_dict(djbz, None)?;
556                Some(&inline_dict)
557            } else {
558                self.decoded_shared_dict()
559            };
560            let bm = crate::jb2::decode(sjbz, dict_ref)?;
561            return Ok(Some(bm));
562        }
563        if let Some(smmr) = self.find_chunk(b"Smmr") {
564            let bm = crate::smmr::decode_smmr(smmr).map_err(|e| DocError::Smmr(e.to_string()))?;
565            return Ok(Some(bm));
566        }
567        Ok(None)
568    }
569
570    /// Decode the foreground mask with per-pixel blit index tracking.
571    ///
572    /// Falls back to a plain `Smmr` mask (without blit indices) when only an
573    /// `Smmr` chunk is present; in that case all blit indices are set to `0`.
574    /// Returns `Ok(None)` if the page has neither chunk.
575    pub fn extract_mask_indexed(
576        &self,
577    ) -> Result<Option<(crate::bitmap::Bitmap, Vec<i32>)>, DocError> {
578        if let Some(sjbz) = self.find_chunk(b"Sjbz") {
579            let inline_dict;
580            let dict_ref = if let Some(djbz) = self.find_chunk(b"Djbz") {
581                inline_dict = crate::jb2::decode_dict(djbz, None)?;
582                Some(&inline_dict)
583            } else {
584                self.decoded_shared_dict()
585            };
586            let (bm, blit_map) = crate::jb2::decode_indexed(sjbz, dict_ref)?;
587            return Ok(Some((bm, blit_map)));
588        }
589        if let Some(smmr) = self.find_chunk(b"Smmr") {
590            let bm = crate::smmr::decode_smmr(smmr).map_err(|e| DocError::Smmr(e.to_string()))?;
591            let len = (bm.width * bm.height) as usize;
592            return Ok(Some((bm, vec![0i32; len])));
593        }
594        Ok(None)
595    }
596
597    /// Decode the IW44 foreground layer (FG44 chunks) if present.
598    ///
599    /// Returns `Ok(None)` if the page has no FG44 chunks.
600    pub fn extract_foreground(&self) -> Result<Option<Pixmap>, DocError> {
601        let chunks = self.fg44_chunks();
602        if chunks.is_empty() {
603            return Ok(None);
604        }
605
606        let mut img = Iw44Image::new();
607        for chunk_data in &chunks {
608            img.decode_chunk(chunk_data)?;
609        }
610        let pixmap = img.to_rgb()?;
611        Ok(Some(pixmap))
612    }
613
614    /// Return the decoded JB2 mask (Sjbz), decoding and caching on first call.
615    ///
616    /// Unlike [`Self::extract_mask`] this method caches the result so that repeated
617    /// renders of the same page — e.g. at different DPI levels — do not re-run
618    /// the ZP arithmetic + symbol decode.
619    ///
620    /// Returns `None` if the page has no Sjbz chunk or if decoding fails.
621    #[cfg(feature = "std")]
622    pub fn decoded_mask(&self) -> Option<&crate::bitmap::Bitmap> {
623        self.mask_decoded
624            .get_or_init(|| self.extract_mask().ok().flatten())
625            .as_ref()
626    }
627
628    #[cfg(not(feature = "std"))]
629    pub fn decoded_mask(&self) -> Option<&crate::bitmap::Bitmap> {
630        None
631    }
632
633    /// Return a 1/4-resolution downsampled version of the JB2 mask.
634    ///
635    /// Each bit in the result is 1 if ANY of the corresponding 4×4 block in the
636    /// full-resolution mask is 1 (max-pool downsample).  The compositor can use
637    /// this instead of the full-resolution mask for sub=4 renders, replacing
638    /// 4–9 bit lookups per output pixel with a single lookup.
639    ///
640    /// Computed once and cached alongside `mask_decoded`.
641    #[cfg(feature = "std")]
642    pub fn decoded_mask_sub4(&self) -> Option<&crate::bitmap::Bitmap> {
643        self.mask_decoded_sub4
644            .get_or_init(|| {
645                let src = self.decoded_mask()?;
646                Some(downsample_mask_4x(src))
647            })
648            .as_ref()
649    }
650
651    #[cfg(not(feature = "std"))]
652    pub fn decoded_mask_sub4(&self) -> Option<&crate::bitmap::Bitmap> {
653        None
654    }
655
656    /// Return the decoded FG44 foreground color layer, decoding and caching on
657    /// first call.  Subsequent renders reuse the cached `Pixmap`.
658    ///
659    /// Returns `None` if the page has no FG44 chunks or if decoding fails.
660    #[cfg(feature = "std")]
661    pub fn decoded_fg44(&self) -> Option<&Pixmap> {
662        self.fg44_decoded
663            .get_or_init(|| self.extract_foreground().ok().flatten())
664            .as_ref()
665    }
666
667    #[cfg(not(feature = "std"))]
668    pub fn decoded_fg44(&self) -> Option<&Pixmap> {
669        None
670    }
671
672    /// Decode the IW44 background layer (BG44 chunks) if present.
673    ///
674    /// Returns `Ok(None)` if the page has no BG44 chunks.
675    pub fn extract_background(&self) -> Result<Option<Pixmap>, DocError> {
676        let chunks = self.bg44_chunks();
677        if chunks.is_empty() {
678            return Ok(None);
679        }
680
681        let mut img = Iw44Image::new();
682        for chunk_data in &chunks {
683            img.decode_chunk(chunk_data)?;
684        }
685        let pixmap = img.to_rgb()?;
686        Ok(Some(pixmap))
687    }
688
689    /// Render this page into a pre-allocated RGBA buffer using the given options.
690    ///
691    /// This is the zero-allocation render path: no heap allocation occurs when
692    /// `buf` is already sized to `opts.width * opts.height * 4` bytes.
693    ///
694    /// # Errors
695    ///
696    /// - [`crate::djvu_render::RenderError::BufTooSmall`] if buffer is too small
697    /// - [`crate::djvu_render::RenderError::InvalidDimensions`] if width/height is 0
698    /// - Propagates IW44 / JB2 decode errors
699    pub fn render_into(
700        &self,
701        opts: &crate::djvu_render::RenderOptions,
702        buf: &mut [u8],
703    ) -> Result<(), crate::djvu_render::RenderError> {
704        crate::djvu_render::render_into(self, opts, buf)
705    }
706}
707
708// ---- Document ---------------------------------------------------------------
709
710/// An opened DjVu document.
711///
712/// Supports single-page FORM:DJVU, bundled multi-page FORM:DJVM, and indirect
713/// multi-page FORM:DJVM (via resolver callback).
714#[derive(Debug)]
715pub struct DjVuDocument {
716    /// All pages, indexed by 0-based page number.
717    pages: Vec<DjVuPage>,
718    /// Parsed NAVM bookmarks, or empty if none.
719    bookmarks: Vec<DjVuBookmark>,
720    /// Raw document-level chunks (NAVM, DIRM, etc.) from the DJVM container,
721    /// or from the top-level DJVU form for single-page documents.
722    global_chunks: Vec<RawChunk>,
723    /// Byte ranges of each page's outer FORM chunk inside the original
724    /// document buffer, in page order. Populated only for bundled DJVM
725    /// documents parsed from a contiguous slice; empty otherwise (single-page
726    /// DJVU, indirect DJVM, or when offsets were unavailable).
727    ///
728    /// Used by [`DjVuDocument::page_byte_range`] (#196 Phase 2). Lets a
729    /// future HTTP-Range fetcher (#196 Phase 3) request exactly the bytes
730    /// for a given page.
731    page_byte_ranges: Vec<core::ops::Range<u64>>,
732}
733
734impl DjVuDocument {
735    /// Parse a DjVu document from a byte slice.
736    ///
737    /// For indirect documents (INCL references to external files), a resolver
738    /// must be supplied via [`DjVuDocument::parse_with_resolver`].
739    ///
740    /// # Errors
741    ///
742    /// Returns `DocError::NoResolver` if the document is indirect and no resolver
743    /// was provided.
744    pub fn parse(data: &[u8]) -> Result<Self, DocError> {
745        Self::parse_with_resolver(data, None::<fn(&str) -> Result<Vec<u8>, DocError>>)
746    }
747
748    /// Parse a DjVu document with an optional resolver for indirect pages.
749    ///
750    /// The resolver receives the `name` field from each INCL chunk and must
751    /// return the raw bytes of that external component file.
752    pub fn parse_with_resolver<R>(data: &[u8], resolver: Option<R>) -> Result<Self, DocError>
753    where
754        R: Fn(&str) -> Result<Vec<u8>, DocError>,
755    {
756        let form = parse_form(data)?;
757
758        match &form.form_type {
759            b"DJVU" => {
760                // Single-page document — expose all top-level chunks as global
761                let global_chunks: Vec<RawChunk> = form
762                    .chunks
763                    .iter()
764                    .map(|c| RawChunk {
765                        id: c.id,
766                        data: c.data.to_vec(),
767                    })
768                    .collect();
769                let page = parse_page_from_chunks(&form.chunks, 0, None)?;
770                // Single-page document spans the entire buffer.
771                #[allow(clippy::single_range_in_vec_init)]
772                let page_byte_ranges = vec![0u64..(data.len() as u64)];
773                Ok(DjVuDocument {
774                    pages: vec![page],
775                    bookmarks: vec![],
776                    global_chunks,
777                    page_byte_ranges,
778                })
779            }
780            b"DJVM" => {
781                // Multi-page document — parse DIRM first
782                let dirm_chunk = form
783                    .chunks
784                    .iter()
785                    .find(|c| &c.id == b"DIRM")
786                    .ok_or(DocError::MissingChunk("DIRM"))?;
787
788                let (entries, is_bundled, comp_offsets) = parse_dirm(dirm_chunk.data)?;
789
790                // Collect NAVM bookmarks (BZZ-compressed)
791                let bookmarks = parse_navm_bookmarks(&form.chunks)?;
792
793                // Store non-FORM global chunks (DIRM, NAVM, etc.)
794                let global_chunks: Vec<RawChunk> = form
795                    .chunks
796                    .iter()
797                    .filter(|c| &c.id != b"FORM")
798                    .map(|c| RawChunk {
799                        id: c.id,
800                        data: c.data.to_vec(),
801                    })
802                    .collect();
803
804                if is_bundled {
805                    // Bundled: FORM:DJVU / FORM:DJVI sub-forms follow DIRM in sequence.
806                    let sub_forms: Vec<&IffChunk<'_>> =
807                        form.chunks.iter().filter(|c| &c.id == b"FORM").collect();
808
809                    // Build a map of DJVI component ID → raw Djbz bytes for
810                    // shared symbol dictionaries (referenced via INCL chunks).
811                    // Use BTreeMap so this compiles in no_std (alloc::collections::BTreeMap
812                    // is available; std::collections::HashMap is not).
813                    #[cfg(not(feature = "std"))]
814                    use alloc::collections::BTreeMap;
815                    #[cfg(feature = "std")]
816                    use std::collections::BTreeMap;
817                    // Wrap shared dict bytes in Arc (std) so all pages that
818                    // reference the same DJVI component share one allocation.
819                    #[cfg(feature = "std")]
820                    let djvi_djbz: BTreeMap<String, Arc<Vec<u8>>> = entries
821                        .iter()
822                        .enumerate()
823                        .filter(|(_, e)| e.comp_type == ComponentType::Shared)
824                        .filter_map(|(comp_idx, entry)| {
825                            let sf = sub_forms.get(comp_idx)?;
826                            let chunks = parse_sub_form(sf.data).ok()?;
827                            let djbz = chunks.iter().find(|c| &c.id == b"Djbz")?;
828                            Some((entry.id.clone(), Arc::new(djbz.data.to_vec())))
829                        })
830                        .collect();
831                    #[cfg(not(feature = "std"))]
832                    let djvi_djbz: BTreeMap<String, Vec<u8>> = entries
833                        .iter()
834                        .enumerate()
835                        .filter(|(_, e)| e.comp_type == ComponentType::Shared)
836                        .filter_map(|(comp_idx, entry)| {
837                            let sf = sub_forms.get(comp_idx)?;
838                            let chunks = parse_sub_form(sf.data).ok()?;
839                            let djbz = chunks.iter().find(|c| &c.id == b"Djbz")?;
840                            Some((entry.id.clone(), djbz.data.to_vec()))
841                        })
842                        .collect();
843
844                    let mut pages = Vec::new();
845                    let mut page_byte_ranges = Vec::new();
846                    let mut page_idx = 0usize;
847                    for (comp_idx, entry) in entries.iter().enumerate() {
848                        if entry.comp_type != ComponentType::Page {
849                            continue;
850                        }
851                        let sub_form = sub_forms.get(comp_idx).ok_or(DocError::Malformed(
852                            "DIRM entry count exceeds FORM children",
853                        ))?;
854                        let sub_chunks = parse_sub_form(sub_form.data)?;
855
856                        // Resolve INCL reference to a shared DJVI dictionary.
857                        #[cfg(feature = "std")]
858                        let shared_djbz = sub_chunks
859                            .iter()
860                            .find(|c| &c.id == b"INCL")
861                            .and_then(|incl| core::str::from_utf8(incl.data.trim_ascii_end()).ok())
862                            .and_then(|name| djvi_djbz.get(name))
863                            .cloned();
864                        #[cfg(not(feature = "std"))]
865                        let shared_djbz = sub_chunks
866                            .iter()
867                            .find(|c| &c.id == b"INCL")
868                            .and_then(|incl| core::str::from_utf8(incl.data.trim_ascii_end()).ok())
869                            .and_then(|name| djvi_djbz.get(name))
870                            .cloned();
871
872                        let page = parse_page_from_chunks(&sub_chunks, page_idx, shared_djbz)?;
873                        pages.push(page);
874
875                        // Record the byte range of this page's outer FORM. The DIRM
876                        // offset points at the 4 bytes `b"FORM"`; the size sits at
877                        // offset+4 (BE u32) and covers the form_type + payload bytes,
878                        // so the full container is `8 + size` bytes long.
879                        if let Some(off) = comp_offsets.get(comp_idx) {
880                            let start = *off as usize;
881                            if let Some(size_bytes) = data.get(start + 4..start + 8) {
882                                let size = u32::from_be_bytes([
883                                    size_bytes[0],
884                                    size_bytes[1],
885                                    size_bytes[2],
886                                    size_bytes[3],
887                                ]) as u64;
888                                let begin = start as u64;
889                                let end = begin.saturating_add(8).saturating_add(size);
890                                page_byte_ranges.push(begin..end);
891                            }
892                        }
893                        page_idx += 1;
894                    }
895
896                    // Only expose offsets if we got one for every page; partial
897                    // tables would surprise callers iterating by page index.
898                    if page_byte_ranges.len() != pages.len() {
899                        page_byte_ranges.clear();
900                    }
901
902                    Ok(DjVuDocument {
903                        pages,
904                        bookmarks,
905                        global_chunks,
906                        page_byte_ranges,
907                    })
908                } else {
909                    // Indirect: pages must be resolved by name
910                    let resolver = resolver.ok_or(DocError::NoResolver)?;
911
912                    let mut pages = Vec::new();
913                    let mut page_idx = 0usize;
914                    for entry in &entries {
915                        if entry.comp_type != ComponentType::Page {
916                            continue;
917                        }
918                        let resolved_data = resolver(&entry.id)
919                            .map_err(|_| DocError::IndirectResolve(entry.id.clone()))?;
920                        let sub_form = parse_form(&resolved_data)?;
921                        let page = parse_page_from_chunks(&sub_form.chunks, page_idx, None)?;
922                        pages.push(page);
923                        page_idx += 1;
924                    }
925
926                    Ok(DjVuDocument {
927                        pages,
928                        bookmarks,
929                        global_chunks,
930                        // Indirect: per-page bytes live in external files, not the
931                        // index buffer — no meaningful range to expose here.
932                        page_byte_ranges: Vec::new(),
933                    })
934                }
935            }
936            other => Err(DocError::NotDjVu(*other)),
937        }
938    }
939
940    #[cfg(all(feature = "std", feature = "async"))]
941    pub(crate) fn parse_single_page_with_shared(
942        data: &[u8],
943        index: usize,
944        shared_djbz: Option<Arc<Vec<u8>>>,
945    ) -> Result<DjVuPage, DocError> {
946        let form = parse_form(data)?;
947        if form.form_type != *b"DJVU" {
948            return Err(DocError::NotDjVu(form.form_type));
949        }
950        parse_page_from_chunks(&form.chunks, index, shared_djbz)
951    }
952
    /// Number of pages in the document.
    ///
    /// This is the length of the page list built while parsing, so valid
    /// indices for [`DjVuDocument::page`] are `0..page_count()`.
    pub fn page_count(&self) -> usize {
        self.pages.len()
    }
957
958    /// Byte range of `page`'s outer FORM chunk inside the original document
959    /// buffer (#196 Phase 2).
960    ///
961    /// Returns `Some(start..end)` where `start` is the absolute offset of the
962    /// 4-byte `FORM` magic and `end` is one past the last byte of the chunk
963    /// payload. The range is suitable for an HTTP `Range:` request that
964    /// fetches exactly the bytes needed to decode that page (assuming any
965    /// referenced shared `DJVI` dictionaries are already in hand — those
966    /// have their own ranges too, but `page_byte_range` only covers pages).
967    ///
968    /// Returns `None` for:
969    /// - `index >= page_count()`
970    /// - Indirect DJVM documents (per-page bytes live in external files)
971    /// - Bundled DJVM documents whose DIRM offset table couldn't be matched
972    ///   to every page
973    ///
974    /// Single-page DJVU documents always return the full buffer range.
975    pub fn page_byte_range(&self, index: usize) -> Option<core::ops::Range<u64>> {
976        self.page_byte_ranges.get(index).cloned()
977    }
978
979    /// Access a page by 0-based index.
980    ///
981    /// # Errors
982    ///
983    /// Returns `DocError::PageOutOfRange` if `index >= page_count()`.
984    pub fn page(&self, index: usize) -> Result<&DjVuPage, DocError> {
985        self.pages.get(index).ok_or(DocError::PageOutOfRange {
986            index,
987            count: self.pages.len(),
988        })
989    }
990
991    /// The NAVM table of contents, or an empty slice if not present.
992    pub fn bookmarks(&self) -> &[DjVuBookmark] {
993        &self.bookmarks
994    }
995
996    /// Parse document-level metadata from a METz (BZZ-compressed) or METa
997    /// (plain text) chunk.
998    ///
999    /// Returns `Ok(None)` if no METa/METz chunk is present.
1000    pub fn metadata(&self) -> Result<Option<DjVuMetadata>, DocError> {
1001        if let Some(metz) = self.raw_chunk(b"METz") {
1002            if metz.is_empty() {
1003                return Ok(None);
1004            }
1005            return Ok(Some(crate::metadata::parse_metadata_bzz(metz)?));
1006        }
1007        if let Some(meta) = self.raw_chunk(b"METa") {
1008            if meta.is_empty() {
1009                return Ok(None);
1010            }
1011            return Ok(Some(crate::metadata::parse_metadata(meta)?));
1012        }
1013        Ok(None)
1014    }
1015
1016    /// Return the raw bytes of the first document-level chunk with the given
1017    /// 4-byte ID.
1018    ///
1019    /// For single-page DJVU files this covers all top-level chunks (INFO,
1020    /// Sjbz, BG44, …).  For multi-page DJVM files this covers non-page chunks
1021    /// such as DIRM and NAVM.  Per-page chunks are accessed via
1022    /// [`DjVuPage::raw_chunk`].
1023    ///
1024    /// Returns `None` if no such chunk exists.
1025    pub fn raw_chunk(&self, id: &[u8; 4]) -> Option<&[u8]> {
1026        self.global_chunks
1027            .iter()
1028            .find(|c| &c.id == id)
1029            .map(|c| c.data.as_slice())
1030    }
1031
1032    /// Return the raw bytes of all document-level chunks with the given ID.
1033    ///
1034    /// Returns an empty `Vec` if no such chunk exists.
1035    pub fn all_chunks(&self, id: &[u8; 4]) -> Vec<&[u8]> {
1036        self.global_chunks
1037            .iter()
1038            .filter(|c| &c.id == id)
1039            .map(|c| c.data.as_slice())
1040            .collect()
1041    }
1042
1043    /// Return the IDs of all document-level chunks, in order.
1044    ///
1045    /// For multi-page DJVM files this is the sequence of non-page chunks
1046    /// (DIRM, NAVM, …).  Duplicate IDs appear once per chunk.
1047    pub fn chunk_ids(&self) -> Vec<[u8; 4]> {
1048        self.global_chunks.iter().map(|c| c.id).collect()
1049    }
1050
1051    /// Parse an indirect DjVu document from bytes, resolving component files
1052    /// relative to `base_dir`.
1053    ///
1054    /// For bundled documents this is equivalent to [`DjVuDocument::parse`].
1055    /// For indirect documents, component names from the DIRM are resolved as
1056    /// paths under `base_dir`, and each referenced file is read from disk.
1057    ///
1058    /// # Errors
1059    ///
1060    /// Returns `DocError::Io` if a component file cannot be read, or any parse
1061    /// error from the component data.
1062    #[cfg(feature = "std")]
1063    pub fn parse_from_dir(
1064        data: &[u8],
1065        base_dir: impl AsRef<std::path::Path>,
1066    ) -> Result<Self, DocError> {
1067        let base = base_dir.as_ref().to_path_buf();
1068        let resolver = move |name: &str| -> Result<Vec<u8>, DocError> {
1069            // Strip any "file://" prefix
1070            let name = name.strip_prefix("file://").unwrap_or(name);
1071            let path = if std::path::Path::new(name).is_absolute() {
1072                std::path::PathBuf::from(name)
1073            } else {
1074                base.join(name)
1075            };
1076            std::fs::read(&path).map_err(|_| DocError::IndirectResolve(name.to_string()))
1077        };
1078        Self::parse_with_resolver(data, Some(resolver))
1079    }
1080}
1081
1082// ---- Memory-mapped document -------------------------------------------------
1083
/// A DjVu document backed by a memory-mapped file.
///
/// Instead of copying the entire file into a `Vec<u8>`, this type maps the file
/// into the process address space using the OS virtual-memory subsystem.  The
/// kernel pages data from disk on demand, which can significantly reduce peak
/// memory usage for large multi-volume scans (100+ MB).
///
/// # Safety contract
///
/// **The underlying file must not be modified or truncated while the mapping is
/// alive.**  Mutating a memory-mapped file is undefined behaviour on most
/// platforms (SIGBUS on Linux/macOS, access violation on Windows).  The caller
/// is responsible for ensuring file immutability for the lifetime of this
/// struct.
///
/// Requires the `mmap` feature flag.
#[cfg(feature = "mmap")]
pub struct MmapDocument {
    /// The memory mapping.  `DjVuDocument` owns `Vec` copies of all chunk
    /// data, so in practice the mapping is only read during parsing; it is
    /// stored here so the file stays mapped for as long as this struct lives.
    _mmap: memmap2::Mmap,
    /// The document parsed from the mapped bytes; the accessors on this type
    /// all delegate to it.
    doc: DjVuDocument,
}
1108
#[cfg(feature = "mmap")]
impl MmapDocument {
    /// Memory-map the file at `path` and parse it as a DjVu document.
    ///
    /// # Safety contract
    ///
    /// The file at `path` **must not be modified or truncated** while the
    /// returned `MmapDocument` is alive.  See the struct-level documentation
    /// for details.
    ///
    /// # Errors
    ///
    /// Returns `DocError::Io` if the file cannot be opened or mapped, or any
    /// parse error from [`DjVuDocument::parse`].
    pub fn open(path: impl AsRef<std::path::Path>) -> Result<Self, DocError> {
        let handle = std::fs::File::open(path.as_ref())?;

        // SAFETY: The caller guarantees the file is not modified while mapped.
        // memmap2::Mmap provides a &[u8] view of the file contents.
        #[allow(unsafe_code)]
        let mapping = unsafe { memmap2::Mmap::map(&handle) }?;

        let doc = DjVuDocument::parse(&mapping)?;
        Ok(Self {
            _mmap: mapping,
            doc,
        })
    }

    /// Memory-map the file at `path` and parse it, resolving the pages of an
    /// indirect document from the filesystem.
    ///
    /// For bundled documents this is identical to [`MmapDocument::open`].
    /// For indirect DJVM documents, component files named in the DIRM are
    /// resolved relative to the directory containing `path` (or `.` when
    /// `path` has no parent).
    ///
    /// # Safety contract
    ///
    /// The file at `path` **must not be modified or truncated** while the
    /// returned `MmapDocument` is alive.
    ///
    /// # Errors
    ///
    /// Fails if the file cannot be opened or mapped, or with any error from
    /// [`DjVuDocument::parse_from_dir`].
    pub fn open_indirect(path: impl AsRef<std::path::Path>) -> Result<Self, DocError> {
        let index_path = path.as_ref();
        let handle = std::fs::File::open(index_path)?;

        // SAFETY: The caller guarantees the file is not modified while mapped
        // (see the safety contract above).
        #[allow(unsafe_code)]
        let mapping = unsafe { memmap2::Mmap::map(&handle) }?;

        let base_dir = match index_path.parent() {
            Some(dir) => dir.to_path_buf(),
            None => std::path::PathBuf::from("."),
        };
        let doc = DjVuDocument::parse_from_dir(&mapping, &base_dir)?;
        Ok(Self {
            _mmap: mapping,
            doc,
        })
    }

    /// Borrow the parsed [`DjVuDocument`].
    pub fn document(&self) -> &DjVuDocument {
        &self.doc
    }

    /// Number of pages in the document (delegates to the parsed document).
    pub fn page_count(&self) -> usize {
        self.document().page_count()
    }

    /// Borrow a page by 0-based index (delegates to the parsed document).
    pub fn page(&self, index: usize) -> Result<&DjVuPage, DocError> {
        self.document().page(index)
    }
}
1174
/// Lets every `DjVuDocument` method be called directly on an `MmapDocument`.
#[cfg(feature = "mmap")]
impl core::ops::Deref for MmapDocument {
    type Target = DjVuDocument;

    /// Borrow the wrapped document.
    fn deref(&self) -> &Self::Target {
        &self.doc
    }
}
1182
1183// ---- Internal parsing helpers -----------------------------------------------
1184
/// Build a `DjVuPage` from the chunks of a FORM:DJVU (`std` variant).
///
/// The INFO chunk is parsed eagerly; every chunk is then copied into owned
/// storage so it can be decoded lazily later, and all decode caches start
/// out empty.
///
/// `shared_djbz` is the raw `Djbz` data from a referenced DJVI component
/// (resolved from the page's INCL chunk by the caller); pass `None` if no
/// shared dictionary is available.
///
/// # Errors
///
/// Returns `DocError::MissingChunk("INFO")` when there is no INFO chunk, or
/// the error from parsing the INFO payload.
#[cfg(feature = "std")]
fn parse_page_from_chunks(
    chunks: &[IffChunk<'_>],
    index: usize,
    shared_djbz: Option<Arc<Vec<u8>>>,
) -> Result<DjVuPage, DocError> {
    let Some(info_chunk) = chunks.iter().find(|chunk| chunk.id == *b"INFO") else {
        return Err(DocError::MissingChunk("INFO"));
    };
    let info = PageInfo::parse(info_chunk.data)?;

    // Detach the chunk payloads from the input buffer.
    let mut owned_chunks: Vec<RawChunk> = Vec::with_capacity(chunks.len());
    for chunk in chunks {
        owned_chunks.push(RawChunk {
            id: chunk.id,
            data: chunk.data.to_vec(),
        });
    }

    Ok(DjVuPage {
        info,
        chunks: owned_chunks,
        index,
        shared_djbz,
        bg44_decoded: std::sync::OnceLock::new(),
        bg44_decoded_partial: std::sync::OnceLock::new(),
        mask_decoded: std::sync::OnceLock::new(),
        mask_decoded_sub4: std::sync::OnceLock::new(),
        fg44_decoded: std::sync::OnceLock::new(),
        jb2_dict_decoded: std::sync::OnceLock::new(),
    })
}
1225
1226#[cfg(not(feature = "std"))]
1227fn parse_page_from_chunks(
1228    chunks: &[IffChunk<'_>],
1229    index: usize,
1230    shared_djbz: Option<Vec<u8>>,
1231) -> Result<DjVuPage, DocError> {
1232    let info_chunk = chunks
1233        .iter()
1234        .find(|c| &c.id == b"INFO")
1235        .ok_or(DocError::MissingChunk("INFO"))?;
1236
1237    let info = PageInfo::parse(info_chunk.data)?;
1238
1239    let raw_chunks: Vec<RawChunk> = chunks
1240        .iter()
1241        .map(|c| RawChunk {
1242            id: c.id,
1243            data: c.data.to_vec(),
1244        })
1245        .collect();
1246
1247    Ok(DjVuPage {
1248        info,
1249        chunks: raw_chunks,
1250        index,
1251        shared_djbz,
1252    })
1253}
1254
1255/// Parse sub-form chunks from the data portion of a FORM chunk.
1256///
1257/// The `data` bytes start with a 4-byte form type (e.g. `DJVU`), followed by
1258/// sequential IFF chunks.
1259fn parse_sub_form(data: &[u8]) -> Result<Vec<IffChunk<'_>>, DocError> {
1260    if data.len() < 4 {
1261        return Err(DocError::Malformed("sub-form data too short"));
1262    }
1263    // data[0..4] = form type (DJVU / DJVI / THUM …)
1264    // data[4..] = sequential chunks
1265    let body = data
1266        .get(4..)
1267        .ok_or(DocError::Malformed("sub-form body missing"))?;
1268    let chunks = parse_iff_body_chunks(body)?;
1269    Ok(chunks)
1270}
1271
1272/// Parse sequential IFF chunks from a raw byte slice (no AT&T / FORM wrapper).
1273fn parse_iff_body_chunks(mut buf: &[u8]) -> Result<Vec<IffChunk<'_>>, DocError> {
1274    let mut chunks = Vec::new();
1275
1276    while buf.len() >= 8 {
1277        let id: [u8; 4] = buf
1278            .get(0..4)
1279            .and_then(|s| s.try_into().ok())
1280            .ok_or(IffError::Truncated)?;
1281        let data_len = buf
1282            .get(4..8)
1283            .and_then(|b| b.try_into().ok())
1284            .map(u32::from_be_bytes)
1285            .map(|n| n as usize)
1286            .ok_or(IffError::Truncated)?;
1287
1288        let data_start = 8usize;
1289        let data_end = data_start
1290            .checked_add(data_len)
1291            .ok_or(IffError::Truncated)?;
1292
1293        if data_end > buf.len() {
1294            return Err(DocError::Iff(IffError::ChunkTooLong {
1295                id,
1296                claimed: data_len as u32,
1297                available: buf.len().saturating_sub(data_start),
1298            }));
1299        }
1300
1301        let chunk_data = buf.get(data_start..data_end).ok_or(IffError::Truncated)?;
1302
1303        // If this is a nested FORM, expose it as a FORM chunk with raw data
1304        // (form_type + children) so callers can handle FORM:DJVU sub-forms.
1305        chunks.push(IffChunk {
1306            id,
1307            data: chunk_data,
1308        });
1309
1310        let padded_len = data_len + (data_len & 1);
1311        let next = data_start
1312            .checked_add(padded_len)
1313            .ok_or(IffError::Truncated)?;
1314        buf = buf.get(next.min(buf.len())..).ok_or(IffError::Truncated)?;
1315    }
1316
1317    Ok(chunks)
1318}
1319
/// A DIRM component entry: one component listed in the directory of a
/// FORM:DJVM.
#[derive(Debug, Clone)]
struct DirmEntry {
    /// Kind of component (page, thumbnail, or shared data). Decoded from the
    /// low six bits of the entry's flag byte, or `Page` for entries that were
    /// synthesised because the directory metadata was unavailable.
    comp_type: ComponentType,
    /// Component ID string. Indirect documents pass it to the resolver;
    /// bundled documents use it to match `INCL` references against shared
    /// components.
    id: String,
}
1326
1327/// Parse the DIRM chunk (directory of files in FORM:DJVM).
1328///
1329/// Returns `(entries, is_bundled, offsets)`. `offsets` is non-empty only for
1330/// bundled documents; each entry is the absolute byte offset of the
1331/// corresponding component's outer `b"FORM"` header within the original
1332/// document buffer.
1333fn parse_dirm(data: &[u8]) -> Result<(Vec<DirmEntry>, bool, Vec<u32>), DocError> {
1334    if data.len() < 3 {
1335        return Err(DocError::Malformed("DIRM chunk too short"));
1336    }
1337
1338    let dflags = *data.first().ok_or(DocError::Malformed("DIRM empty"))?;
1339    let is_bundled = (dflags >> 7) != 0;
1340    let nfiles = u16::from_be_bytes([
1341        *data.get(1).ok_or(DocError::Malformed("DIRM too short"))?,
1342        *data.get(2).ok_or(DocError::Malformed("DIRM too short"))?,
1343    ]) as usize;
1344
1345    let mut pos = 3usize;
1346
1347    // Bundled documents embed 4-byte BE offsets to each component's FORM header.
1348    let mut offsets: Vec<u32> = Vec::new();
1349    if is_bundled {
1350        let offsets_size = nfiles * 4;
1351        let end = pos
1352            .checked_add(offsets_size)
1353            .ok_or(DocError::Malformed("DIRM offset arithmetic overflow"))?;
1354        if end > data.len() {
1355            return Err(DocError::Malformed("DIRM offset table truncated"));
1356        }
1357        offsets.reserve(nfiles);
1358        for i in 0..nfiles {
1359            let base = pos + i * 4;
1360            let bytes = data
1361                .get(base..base + 4)
1362                .ok_or(DocError::Malformed("DIRM offset slice OOB"))?;
1363            offsets.push(u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]));
1364        }
1365        pos = end;
1366    }
1367
1368    // Remaining bytes are BZZ-compressed metadata.
1369    let bzz_data = data
1370        .get(pos..)
1371        .ok_or(DocError::Malformed("DIRM bzz data missing"))?;
1372    let meta = bzz_decode(bzz_data).unwrap_or_default();
1373
1374    // If BZZ metadata is too short (e.g. from a minimal DIRM without full
1375    // metadata), generate synthetic entries — callers derive types from FORM.
1376    // Layout: sizes(3 bytes × N), flags(1 byte × N), then null-terminated IDs…
1377    let mut mpos = nfiles * 3; // skip per-component sizes
1378
1379    if mpos + nfiles > meta.len() {
1380        // Generate synthetic entries with unknown type — the caller will
1381        // reassign types based on the actual FORM type (DJVU/DJVI/etc.)
1382        let entries: Vec<DirmEntry> = (0..nfiles)
1383            .map(|i| DirmEntry {
1384                comp_type: ComponentType::Page,
1385                id: format!("p{:04}", i),
1386            })
1387            .collect();
1388        return Ok((entries, is_bundled, offsets));
1389    }
1390    let flags: Vec<u8> = meta
1391        .get(mpos..mpos + nfiles)
1392        .ok_or(DocError::Malformed("DIRM flags truncated"))?
1393        .to_vec();
1394    mpos += nfiles;
1395
1396    let mut entries = Vec::with_capacity(nfiles);
1397    for &flag in flags.iter().take(nfiles) {
1398        let id = read_str_nt(&meta, &mut mpos)?;
1399
1400        // Optional name and title fields
1401        if (flag & 0x80) != 0 {
1402            let _ = read_str_nt(&meta, &mut mpos)?;
1403        }
1404        if (flag & 0x40) != 0 {
1405            let _ = read_str_nt(&meta, &mut mpos)?;
1406        }
1407
1408        let comp_type = match flag & 0x3f {
1409            1 => ComponentType::Page,
1410            2 => ComponentType::Thumbnail,
1411            _ => ComponentType::Shared,
1412        };
1413
1414        entries.push(DirmEntry { comp_type, id });
1415    }
1416
1417    Ok((entries, is_bundled, offsets))
1418}
1419
1420/// Read a null-terminated UTF-8 string from `data` at `*pos`, advancing `*pos`.
1421fn read_str_nt(data: &[u8], pos: &mut usize) -> Result<String, DocError> {
1422    let start = *pos;
1423    while *pos < data.len() && *data.get(*pos).ok_or(DocError::Malformed("str read OOB"))? != 0 {
1424        *pos += 1;
1425    }
1426    if *pos >= data.len() {
1427        return Err(DocError::Malformed(
1428            "null terminator missing in DIRM string",
1429        ));
1430    }
1431    let s = core::str::from_utf8(
1432        data.get(start..*pos)
1433            .ok_or(DocError::Malformed("str slice OOB"))?,
1434    )
1435    .map_err(|_| DocError::InvalidUtf8)?
1436    .to_string();
1437    *pos += 1; // consume null terminator
1438    Ok(s)
1439}
1440
1441/// Parse NAVM bookmarks from the chunk list of a FORM:DJVM.
1442///
1443/// Returns an empty Vec if there is no NAVM chunk.
1444fn parse_navm_bookmarks(chunks: &[IffChunk<'_>]) -> Result<Vec<DjVuBookmark>, DocError> {
1445    let navm_data = match chunks.iter().find(|c| &c.id == b"NAVM") {
1446        Some(c) => c.data,
1447        None => return Ok(vec![]),
1448    };
1449
1450    let decoded = bzz_decode(navm_data)?;
1451
1452    if decoded.len() < 2 {
1453        return Ok(vec![]);
1454    }
1455
1456    let b0 = *decoded
1457        .first()
1458        .ok_or(DocError::Malformed("NAVM total count byte 0"))?;
1459    let b1 = *decoded
1460        .get(1)
1461        .ok_or(DocError::Malformed("NAVM total count byte 1"))?;
1462    let total_count = u16::from_be_bytes([b0, b1]) as usize;
1463
1464    let mut pos = 2usize;
1465    let mut bookmarks = Vec::new();
1466    let mut decoded_count = 0usize;
1467
1468    while decoded_count < total_count {
1469        let bm = parse_bookmark_entry(&decoded, &mut pos, &mut decoded_count)?;
1470        bookmarks.push(bm);
1471    }
1472
1473    Ok(bookmarks)
1474}
1475
1476/// Recursively parse one bookmark entry and its children.
1477///
1478/// `total_counter` is a shared counter for ALL bookmark nodes across all recursion
1479/// levels, matching the DjVu NAVM format's flat total-count field.
1480fn parse_bookmark_entry(
1481    data: &[u8],
1482    pos: &mut usize,
1483    total_counter: &mut usize,
1484) -> Result<DjVuBookmark, DocError> {
1485    if *pos >= data.len() {
1486        return Err(DocError::Malformed("NAVM bookmark entry truncated"));
1487    }
1488
1489    // n_children is a single byte in the NAVM format
1490    let n_children = *data
1491        .get(*pos)
1492        .ok_or(DocError::Malformed("NAVM children count"))? as usize;
1493    *pos += 1;
1494
1495    let title = read_navm_str(data, pos)?;
1496    let url = read_navm_str(data, pos)?;
1497    *total_counter += 1;
1498
1499    // Children: fixed count, recurse with the same global total_counter
1500    let mut children = Vec::with_capacity(n_children);
1501    for _ in 0..n_children {
1502        let child = parse_bookmark_entry(data, pos, total_counter)?;
1503        children.push(child);
1504    }
1505
1506    Ok(DjVuBookmark {
1507        title,
1508        url,
1509        children,
1510    })
1511}
1512
1513/// Read a length-prefixed UTF-8 string from NAVM data.
1514///
1515/// Format: `[be_u24 length][utf8 bytes]`
1516fn read_navm_str(data: &[u8], pos: &mut usize) -> Result<String, DocError> {
1517    if *pos + 3 > data.len() {
1518        return Err(DocError::Malformed("NAVM string length truncated"));
1519    }
1520    let len = ((*data.get(*pos).ok_or(DocError::Malformed("NAVM str"))? as usize) << 16)
1521        | ((*data.get(*pos + 1).ok_or(DocError::Malformed("NAVM str"))? as usize) << 8)
1522        | (*data.get(*pos + 2).ok_or(DocError::Malformed("NAVM str"))? as usize);
1523    *pos += 3;
1524
1525    let bytes = data
1526        .get(*pos..*pos + len)
1527        .ok_or(DocError::Malformed("NAVM string bytes truncated"))?;
1528    *pos += len;
1529
1530    core::str::from_utf8(bytes)
1531        .map(|s| s.to_string())
1532        .map_err(|_| DocError::InvalidUtf8)
1533}
1534
/// Shrink a bilevel mask by 4× in each dimension using max-pooling.
///
/// An output pixel is set when ANY pixel of the matching 4×4 block of `src`
/// is set; blocks on the right/bottom edge are clipped to the source size.
/// Used by [`DjVuPage::decoded_mask_sub4`] to build the 1/4-resolution
/// mask cache, which lets the compositor avoid `mask_box_any` for sub=4 renders.
#[cfg(feature = "std")]
fn downsample_mask_4x(src: &crate::bitmap::Bitmap) -> crate::bitmap::Bitmap {
    let (out_w, out_h) = (src.width.div_ceil(4), src.height.div_ceil(4));
    let mut out = crate::bitmap::Bitmap::new(out_w, out_h);

    // True when any in-bounds source pixel of output cell (ox, oy) is set.
    // Rows are scanned top to bottom and the scan stops at the first hit.
    let block_has_set_pixel = |ox: u32, oy: u32| {
        (0..4u32).any(|dy| {
            (0..4u32).any(|dx| {
                let sx = ox * 4 + dx;
                let sy = oy * 4 + dy;
                sx < src.width && sy < src.height && src.get(sx, sy)
            })
        })
    };

    for oy in 0..out_h {
        for ox in 0..out_w {
            if block_has_set_pixel(ox, oy) {
                out.set(ox, oy, true);
            }
        }
    }
    out
}
1561
1562// ---- Tests ------------------------------------------------------------------
1563
1564#[cfg(test)]
1565mod tests {
1566    use super::*;
1567
1568    fn assets_path() -> std::path::PathBuf {
1569        std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
1570            .join("references/djvujs/library/assets")
1571    }
1572
1573    // ---- TDD: failing tests written first (Red phase) -----------------------
1574
1575    /// Single-page FORM:DJVU — basic parse, page count, dimensions, DPI.
1576    #[test]
1577    fn single_page_parse_and_metadata() {
1578        let data =
1579            std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
1580        let doc = DjVuDocument::parse(&data).expect("parse should succeed");
1581
1582        assert_eq!(doc.page_count(), 1);
1583        let page = doc.page(0).expect("page 0 must exist");
1584        assert_eq!(page.width(), 181);
1585        assert_eq!(page.height(), 240);
1586        assert_eq!(page.dpi(), 100);
1587        assert!((page.gamma() - 2.2).abs() < 0.01, "gamma should be ~2.2");
1588    }
1589
/// Asking a one-page document for page 1 must fail with
/// `DocError::PageOutOfRange { index: 1, count: 1 }`.
#[test]
fn single_page_out_of_range() {
    let path = assets_path().join("chicken.djvu");
    let bytes = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse should succeed");

    let lookup = document.page(1);
    let err = lookup.expect_err("page 1 should be out of range");
    let is_out_of_range = matches!(err, DocError::PageOutOfRange { index: 1, count: 1 });
    assert!(is_out_of_range, "unexpected error: {err:?}");
}
1602
/// `thumbnail()` on the page of `chicken.djvu` must succeed and yield `None`,
/// since that file carries no TH44 chunks.
#[test]
fn single_page_no_thumbnail() {
    let path = assets_path().join("chicken.djvu");
    let bytes = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse should succeed");
    let first = document.page(0).expect("page 0 must exist");

    // Per the module's lazy contract, `thumbnail()` is the call that would
    // trigger decoding; nothing has been decoded before this point.
    let maybe_thumb = first.thumbnail().expect("thumbnail() should not error");
    assert!(
        maybe_thumb.is_none(),
        "single-page chicken.djvu has no TH44 chunks"
    );
}
1617
/// `dimensions()` must report `(181, 240)` for the page of `chicken.djvu`.
#[test]
fn single_page_dimensions() {
    let path = assets_path().join("chicken.djvu");
    let bytes = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse should succeed");

    let first = document.page(0).unwrap();
    let dims = first.dimensions();
    assert_eq!(dims, (181, 240));
}
1627
/// The bundled multi-page asset `DjVu3Spec_bundled.djvu` must parse and report
/// more than one page.
#[test]
fn multipage_bundled_page_count() {
    let path = assets_path().join("DjVu3Spec_bundled.djvu");
    let bytes = std::fs::read(path).expect("DjVu3Spec_bundled.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("bundled parse should succeed");

    // The exact count is deliberately not pinned; anything above one passes.
    let is_multi_page = document.page_count() > 1;
    assert!(
        is_multi_page,
        "bundled document should have more than 1 page, got {}",
        document.page_count()
    );
}
1641
/// The first page of the bundled multi-page asset must report a non-zero
/// width, height and DPI.
#[test]
fn multipage_bundled_page_metadata() {
    let path = assets_path().join("DjVu3Spec_bundled.djvu");
    let bytes = std::fs::read(path).expect("DjVu3Spec_bundled.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("bundled parse should succeed");
    let first = document.page(0).expect("page 0 must exist");

    let has_width = first.width() > 0;
    assert!(has_width, "page width must be non-zero");
    let has_height = first.height() > 0;
    assert!(has_height, "page height must be non-zero");
    let has_dpi = first.dpi() > 0;
    assert!(has_dpi, "page dpi must be non-zero");
}
1654
/// `navm_fgbz.djvu` contains a NAVM chunk, so `bookmarks()` must return a
/// non-empty list for it.
#[test]
fn navm_bookmarks_present() {
    let path = assets_path().join("navm_fgbz.djvu");
    let bytes = std::fs::read(path).expect("navm_fgbz.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse should succeed");

    let has_bookmarks = !document.bookmarks().is_empty();
    assert!(
        has_bookmarks,
        "navm_fgbz.djvu should have at least one bookmark"
    );
}
1668
/// A document with no NAVM chunk (`chicken.djvu`) must yield an empty
/// bookmark list.
#[test]
fn no_navm_returns_empty_bookmarks() {
    let path = assets_path().join("chicken.djvu");
    let bytes = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse should succeed");

    let bookmarks = document.bookmarks();
    assert!(
        bookmarks.is_empty(),
        "chicken.djvu has no NAVM — bookmarks should be empty"
    );
}
1680
/// Indirect document resolved through a caller-supplied callback.
///
/// A synthetic DJVM whose DIRM is flagged as non-bundled references
/// "chicken.djvu". The resolver hands back the bytes of the real
/// `chicken.djvu` asset, and the resulting single page must report that
/// file's width and height.
#[test]
fn indirect_document_with_resolver() {
    // Bytes the resolver serves for the one referenced component.
    let page_bytes =
        std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu must exist");
    // Minimal indirect DJVM container that references "chicken.djvu".
    let container = build_indirect_djvm_bytes("chicken.djvu");

    // Any name other than the expected one is reported as unresolvable.
    let resolve = |requested: &str| -> Result<Vec<u8>, DocError> {
        match requested {
            "chicken.djvu" => Ok(page_bytes.clone()),
            other => Err(DocError::IndirectResolve(other.to_string())),
        }
    };

    let document = DjVuDocument::parse_with_resolver(&container, Some(resolve))
        .expect("indirect parse should succeed");
    assert_eq!(document.page_count(), 1);

    let first = document.page(0).unwrap();
    assert_eq!(first.width(), 181);
    assert_eq!(first.height(), 240);
}
1710
/// Parsing an indirect document through plain `parse` (no resolver supplied)
/// must fail with `DocError::NoResolver`.
#[test]
fn indirect_document_no_resolver_returns_error() {
    let container = build_indirect_djvm_bytes("chicken.djvu");

    let outcome = DjVuDocument::parse(&container);
    let err = outcome.expect_err("should fail without resolver");
    let is_no_resolver = matches!(err, DocError::NoResolver);
    assert!(is_no_resolver, "expected NoResolver, got {err:?}");
}
1721
/// Lazy-decoding check on `boy_jb2.djvu`.
///
/// After parsing, the page must already hold its raw chunks. Calling
/// `thumbnail()` must then succeed and return `None`, because the file has no
/// TH44 chunk to decode.
#[test]
fn page_is_lazy_no_decode_before_thumbnail() {
    let path = assets_path().join("boy_jb2.djvu");
    let bytes = std::fs::read(path).expect("boy_jb2.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse should succeed");
    let first = document.page(0).expect("page 0 must exist");

    // Raw chunks are stored on the page even though nothing was decoded.
    let has_chunks = !first.chunks.is_empty();
    assert!(has_chunks, "chunks must be stored (lazy)");

    // `thumbnail()` is the decode trigger, but there is no TH44 chunk here.
    let maybe_thumb = first.thumbnail().expect("thumbnail() should not error");
    assert!(maybe_thumb.is_none());
}
1742
/// An IFF container whose FORM type is not a DjVu type must be rejected with
/// either `DocError::NotDjVu` or `DocError::Iff`.
#[test]
fn not_djvu_returns_error() {
    // "AT&T" magic, a FORM chunk of length 8, then "XXXX" as the form type
    // followed by four dummy bytes.
    let data: Vec<u8> = [
        b"AT&T".as_slice(),
        b"FORM".as_slice(),
        8u32.to_be_bytes().as_slice(),
        b"XXXXXXXX".as_slice(),
    ]
    .concat();

    let err = DjVuDocument::parse(&data).expect_err("should fail");
    let is_expected = matches!(err, DocError::NotDjVu(_) | DocError::Iff(_));
    assert!(is_expected, "expected NotDjVu or Iff error, got {err:?}");
}
1758
1759    // ---- Helpers: build minimal DJVM documents for indirect tests -----------
1760
/// Build a minimal indirect FORM:DJVM with 1 page component named "chicken.djvu".
///
/// DIRM layout produced here: `flags = 0x00` (not bundled), `nfiles = 1`
/// (two bytes, high byte first), followed by BZZ-compressed metadata. The BZZ
/// bytes were pre-computed with the reference `bzz -e` tool from the 17-byte
/// metadata block:
///   `\x00\x00\x00` (size, 3 bytes) + `\x01` (Page flag) + `chicken.djvu\x00`
///
/// # Panics
///
/// Panics if `page_name` is anything other than `"chicken.djvu"`. The payload
/// is a fixed byte string, so no other component name can be encoded here;
/// failing loudly prevents a caller from silently getting a DIRM that names a
/// different file than the one requested.
fn build_indirect_djvm_bytes(page_name: &str) -> Vec<u8> {
    // BZZ-encoded DIRM metadata for 1 Page component named "chicken.djvu".
    // Generated with: printf '\x00\x00\x00\x01chicken.djvu\x00' | bzz -e - file.bzz
    // Verified to decode back to the original 17-byte meta block.
    const BZZ_META: &[u8] = &[
        0xff, 0xff, 0xed, 0xbf, 0x8a, 0x1f, 0xbe, 0xad, 0x14, 0x57, 0x10, 0xc9, 0x63, 0x19,
        0x11, 0xf0, 0x85, 0x28, 0x12, 0x8a, 0xbf,
    ];

    assert_eq!(
        page_name, "chicken.djvu",
        "build_indirect_djvm_bytes only has a precomputed BZZ payload for \"chicken.djvu\""
    );

    let mut dirm_data = Vec::with_capacity(3 + BZZ_META.len());
    dirm_data.push(0x00); // flags: not bundled (is_bundled bit = 0)
    dirm_data.push(0x00); // nfiles high byte
    dirm_data.push(0x01); // nfiles low byte = 1
    dirm_data.extend_from_slice(BZZ_META);

    build_djvm_with_dirm(&dirm_data)
}

/// Wrap `dirm_data` in a `DIRM` chunk inside an `AT&T` + `FORM:DJVM` container.
///
/// Both lengths are written as big-endian `u32`. An odd-length DIRM payload is
/// followed by one zero pad byte; the pad is counted in the FORM length but
/// not in the DIRM chunk length.
///
/// # Panics
///
/// Panics if the DIRM payload length or the FORM body length does not fit in
/// a `u32` (checked conversion instead of a silently truncating cast).
fn build_djvm_with_dirm(dirm_data: &[u8]) -> Vec<u8> {
    let dirm_len =
        u32::try_from(dirm_data.len()).expect("DIRM payload length must fit in u32");
    // 1 when the payload length is odd and needs a pad byte, otherwise 0.
    let pad = dirm_data.len() & 1;

    // FORM body = form type (4) + DIRM chunk header (8) + payload + pad.
    let form_body_len = 4 + 8 + dirm_data.len() + pad;
    let form_len = u32::try_from(form_body_len).expect("FORM body length must fit in u32");

    // "AT&T" + "FORM" + length field precede the FORM body.
    let mut file = Vec::with_capacity(12 + form_body_len);
    file.extend_from_slice(b"AT&T");
    file.extend_from_slice(b"FORM");
    file.extend_from_slice(&form_len.to_be_bytes());
    file.extend_from_slice(b"DJVM");
    file.extend_from_slice(b"DIRM");
    file.extend_from_slice(&dirm_len.to_be_bytes());
    file.extend_from_slice(dirm_data);
    if pad == 1 {
        file.push(0); // pad to even
    }
    file
}
1808
1809    // ── raw chunk API (Issue #43) ────────────────────────────────────────────
1810
/// `DjVuPage::raw_chunk(b"INFO")` must return the 10-byte INFO chunk of
/// `chicken.djvu`.
#[test]
fn page_raw_chunk_info_present() {
    let path = assets_path().join("chicken.djvu");
    let bytes = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse must succeed");
    let first = document.page(0).expect("page 0 must exist");

    // The page is required to expose an INFO chunk.
    let info_bytes = first.raw_chunk(b"INFO").expect("INFO chunk must be present");
    assert_eq!(info_bytes.len(), 10, "INFO chunk is always 10 bytes");
}
1823
/// `DjVuPage::raw_chunk` must return `None` for a chunk ID the page does not
/// contain (`XXXX`).
#[test]
fn page_raw_chunk_absent() {
    let path = assets_path().join("chicken.djvu");
    let bytes = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse must succeed");
    let first = document.page(0).expect("page 0 must exist");

    let lookup = first.raw_chunk(b"XXXX");
    assert!(lookup.is_none(), "unknown chunk type must return None");
}
1837
/// `DjVuPage::all_chunks(b"BG44")` on the `big-scanned-page.djvu` fixture must
/// return at least two chunks, none of them empty.
#[test]
fn page_all_chunks_bg44_multiple() {
    // Fixture noted as having 4 progressive BG44 chunks.
    let fixture = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("tests/fixtures/big-scanned-page.djvu");
    let bytes = std::fs::read(fixture).expect("big-scanned-page.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse must succeed");
    let first = document.page(0).expect("page 0 must exist");

    let bg44 = first.all_chunks(b"BG44");
    let has_several = bg44.len() >= 2;
    assert!(
        has_several,
        "colour page must have ≥2 BG44 chunks, got {}",
        bg44.len()
    );

    // Every returned chunk must carry some payload.
    for (i, payload) in bg44.iter().enumerate() {
        assert!(!payload.is_empty(), "BG44 chunk {i} must not be empty");
    }
}
1862
/// `DjVuPage::chunk_ids` must return a non-empty list that contains `INFO`
/// for the page of `chicken.djvu`.
#[test]
fn page_chunk_ids_includes_info() {
    let path = assets_path().join("chicken.djvu");
    let bytes = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse must succeed");
    let first = document.page(0).expect("page 0 must exist");

    let ids = first.chunk_ids();
    let has_ids = !ids.is_empty();
    assert!(has_ids, "chunk_ids must not be empty");

    // On failure the IDs are printed as text, with "????" standing in for any
    // ID that is not valid UTF-8.
    assert!(
        ids.contains(b"INFO"),
        "chunk_ids must include INFO, got: {:?}",
        ids.iter()
            .map(|raw_id| std::str::from_utf8(raw_id).unwrap_or("????"))
            .collect::<Vec<_>>()
    );
}
1881
/// `DjVuDocument::raw_chunk(b"INFO")` must return the 10-byte INFO chunk for
/// the single-page `chicken.djvu`.
#[test]
fn document_raw_chunk_single_page() {
    let path = assets_path().join("chicken.djvu");
    let bytes = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse must succeed");

    // For a single-page DJVU the document-level lookup must find INFO as well.
    let lookup = document.raw_chunk(b"INFO");
    let info_bytes = lookup.expect("document must expose INFO chunk");
    assert_eq!(info_bytes.len(), 10);
}
1895
1896    // ── DJVI shared dictionary / INCL chunks (Issue #45) ────────────────────
1897
/// Parsing the `DjVu3Spec_bundled.djvu` fixture must succeed, yield at least
/// one page, and leave at least one page with `shared_djbz` populated (a
/// resolved shared DJVI dictionary).
#[test]
fn djvi_shared_dict_parsed_from_bundled_djvm() {
    let mut fixture = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixture.push("tests/fixtures/DjVu3Spec_bundled.djvu");
    let bytes = std::fs::read(&fixture).expect("DjVu3Spec_bundled.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse must succeed");

    let has_pages = document.page_count() > 0;
    assert!(has_pages, "document must have pages");

    // One page carrying a shared dictionary is enough to pass.
    let any_shared_dict = document
        .pages
        .iter()
        .any(|candidate| candidate.shared_djbz.is_some());
    assert!(
        any_shared_dict,
        "at least one page must have a resolved shared DJVI dict"
    );
}
1916
/// The first page of `DjVu3Spec_bundled.djvu` that carries a shared dictionary
/// must produce a mask through `extract_mask()`, and that mask must have a
/// non-zero width and height.
#[test]
fn djvi_incl_page_mask_renders_ok() {
    let mut fixture = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixture.push("tests/fixtures/DjVu3Spec_bundled.djvu");
    let bytes = std::fs::read(&fixture).expect("DjVu3Spec_bundled.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse must succeed");

    // Pick the first page whose shared dictionary was resolved.
    let mut candidates = document.pages.iter();
    let target = candidates
        .find(|candidate| candidate.shared_djbz.is_some())
        .expect("at least one page must have a shared dict");

    let mask = target
        .extract_mask()
        .expect("extract_mask must succeed for INCL page");
    assert!(mask.is_some(), "INCL page must have a JB2 mask");

    let bitmap = mask.unwrap();
    let has_area = bitmap.width > 0 && bitmap.height > 0;
    assert!(has_area, "mask must have non-zero dimensions");
}
1942
/// A page without an INCL reference (`boy_jb2.djvu`) must have no shared
/// dictionary and must still produce a mask through `extract_mask()`.
#[test]
fn no_regression_non_incl_pages() {
    // Fixture with a Sjbz mask and no INCL reference.
    let mut fixture = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    fixture.push("tests/fixtures/boy_jb2.djvu");
    let bytes = std::fs::read(fixture).expect("boy_jb2.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse must succeed");
    let first = document.page(0).expect("page 0 must exist");

    let lacks_shared_dict = first.shared_djbz.is_none();
    assert!(lacks_shared_dict, "single-page DJVU has no shared dict");

    let mask = first.extract_mask().expect("extract_mask must succeed");
    assert!(mask.is_some(), "boy_jb2.djvu page must have a JB2 mask");
}
1961
/// Round-trip: the INFO bytes returned by `raw_chunk` must re-parse through
/// `PageInfo::parse` to the same width, height and DPI the page reports.
#[test]
fn page_raw_chunk_info_roundtrip() {
    let path = assets_path().join("chicken.djvu");
    let bytes = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse must succeed");
    let first = document.page(0).expect("page 0 must exist");

    let info_bytes = first.raw_chunk(b"INFO").expect("INFO chunk must be present");
    let reparsed = crate::info::PageInfo::parse(info_bytes).expect("re-parse must succeed");

    // Page accessors are narrowed to `u16` for comparison with the parsed fields.
    let (page_width, page_height) = (first.width() as u16, first.height() as u16);
    assert_eq!(reparsed.width, page_width);
    assert_eq!(reparsed.height, page_height);
    assert_eq!(reparsed.dpi, first.dpi());
}
1976
1977    // ── #196 Phase 2: page_byte_range ────────────────────────────────────────
1978
/// For the single-page `chicken.djvu`, `page_byte_range(0)` must span the whole
/// input buffer (`0..data.len()`), and index 1 must yield `None`.
#[test]
fn page_byte_range_single_page_covers_full_buffer() {
    let path = assets_path().join("chicken.djvu");
    let data = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&data).expect("parse must succeed");

    let span = document
        .page_byte_range(0)
        .expect("page 0 must have a range");
    assert_eq!(span.start, 0);
    assert_eq!(span.end, data.len() as u64);

    let past_the_end = document.page_byte_range(1);
    assert!(past_the_end.is_none(), "out-of-range index returns None");
}
1995
/// Bundled DJVM: every page's byte range must be non-empty, lie inside the
/// input buffer, start at or after the previous page's end, and begin with the
/// `FORM` magic.
///
/// The test returns early (printing a skip note to stderr) when the asset file
/// cannot be read.
#[test]
fn page_byte_range_bundled_djvm_round_trips() {
    let path = assets_path().join("DjVu3Spec_bundled.djvu");
    let data = match std::fs::read(&path) {
        Ok(bytes) => bytes,
        Err(_) => {
            eprintln!("skip: {} missing", path.display());
            return;
        }
    };
    let document = DjVuDocument::parse(&data).expect("bundled DJVM parse must succeed");

    // End offset of the previously visited page; ranges may not dip below it.
    let mut previous_end = 0u64;
    for i in 0..document.page_count() {
        let Some(span) = document.page_byte_range(i) else {
            panic!("page {i} must have a range");
        };
        assert!(span.end <= data.len() as u64, "page {i} range OOB");
        assert!(span.start < span.end, "page {i} range empty");
        assert!(span.start >= previous_end, "page {i} overlaps previous");
        previous_end = span.end;

        // Each range is expected to open with the `FORM` chunk identifier.
        let page_bytes = &data[span.start as usize..span.end as usize];
        let magic = &page_bytes[..4];
        assert_eq!(magic, b"FORM", "page {i} range must start with FORM");
    }
}
2022
/// `page_byte_range` must return `None` for an index far beyond the page count
/// (99 on the one-page `chicken.djvu`).
#[test]
fn page_byte_range_out_of_range() {
    let path = assets_path().join("chicken.djvu");
    let bytes = std::fs::read(path).expect("chicken.djvu must exist");
    let document = DjVuDocument::parse(&bytes).expect("parse must succeed");

    let lookup = document.page_byte_range(99);
    assert!(lookup.is_none());
}
2031
/// Opening `chicken.djvu` through `MmapDocument` must yield the same page count
/// and the same per-page width, height and DPI as parsing the file's bytes in
/// memory. Only compiled when the `mmap` feature is enabled.
#[test]
#[cfg(feature = "mmap")]
fn mmap_document_matches_parse() {
    let path = assets_path().join("chicken.djvu");
    let mapped = MmapDocument::open(&path).expect("mmap open should succeed");
    let bytes = std::fs::read(&path).expect("read should succeed");
    let parsed = DjVuDocument::parse(&bytes).expect("parse should succeed");

    assert_eq!(mapped.page_count(), parsed.page_count());

    // Compare the two documents page by page.
    for index in 0..mapped.page_count() {
        let from_mmap = mapped.page(index).unwrap();
        let from_memory = parsed.page(index).unwrap();
        assert_eq!(from_mmap.width(), from_memory.width());
        assert_eq!(from_mmap.height(), from_memory.height());
        assert_eq!(from_mmap.dpi(), from_memory.dpi());
    }
}
2050}
djvu_rs/djvu_document.rs

djvu_rs/
djvu_document.rs