djvu_rs/
document.rs

1use crate::bitmap::Bitmap;
2use crate::error::Error;
3use crate::iff::{Chunk, DjvuFile};
4use crate::iw44::IW44Image;
5use crate::jb2::JB2Dict;
6use crate::pixmap::Pixmap;
7use std::collections::HashMap;
8use std::sync::{Arc, RwLock};
9
10#[cfg(test)]
11pub use crate::iw44::NormalizedPlanes;
12
/// One table-of-contents entry decoded from the NAVM chunk.
///
/// Entries form a tree: every bookmark owns the bookmarks nested beneath it.
#[derive(Clone, Debug)]
pub struct Bookmark {
    /// Title string stored with the entry.
    pub title: String,
    /// URL string stored with the entry.
    pub url: String,
    /// Bookmarks nested directly under this one, in the order they were parsed.
    pub children: Vec<Bookmark>,
}
20
/// Page rotation, decoded from the low three bits of the INFO chunk flags byte.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Rotation {
    /// No rotation; also used for any flag value that is not 2, 5 or 6.
    None,
    /// Selected by flag value 5.
    Cw90,
    /// Selected by flag value 2.
    Cw180,
    /// Selected by flag value 6.
    Cw270,
}
29
30/// Page metadata from the INFO chunk.
31#[derive(Debug, Clone)]
32pub struct PageInfo {
33    pub width: u16,
34    pub height: u16,
35    pub dpi: u16,
36    /// Display gamma (e.g. 2.2). Defaults to 2.2 when the INFO byte is 0.
37    pub gamma: f32,
38    pub rotation: Rotation,
39}
40
/// Foreground palette from an FGbz chunk: an RGB colour table plus the
/// per-blit indices that select entries from it.
#[derive(Clone, Debug)]
pub struct Palette {
    /// Colour table as `(r, g, b)` tuples.
    pub colors: Vec<(u8, u8, u8)>,
    /// One colour index per blit; empty when the chunk carries no index data.
    pub indices: Vec<i16>,
}
47
/// Level of a zone within the DjVu text-layer hierarchy.
///
/// The discriminants equal the type byte stored in the text chunk.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TextZoneKind {
    /// Type byte 1.
    Page = 1,
    /// Type byte 2.
    Column = 2,
    /// Type byte 3.
    Region = 3,
    /// Type byte 4.
    Paragraph = 4,
    /// Type byte 5.
    Line = 5,
    /// Type byte 6.
    Word = 6,
    /// Type byte 7.
    Character = 7,
}
59
60/// A text zone with bounding box and text span within the page text.
61///
62/// Coordinates are in the DjVu coordinate system (origin at bottom-left, y increases upward).
63/// Use `text_start` and `text_len` to index into `TextLayer::text`.
64#[derive(Debug, Clone)]
65pub struct TextZone {
66    pub kind: TextZoneKind,
67    pub x: i32,
68    pub y: i32,
69    pub width: i32,
70    pub height: i32,
71    pub text_start: usize,
72    pub text_len: usize,
73    pub children: Vec<TextZone>,
74}
75
76/// The text layer of a DjVu page (from TXTz or TXTa chunks).
77#[derive(Debug, Clone)]
78pub struct TextLayer {
79    /// The full UTF-8 text content of the page.
80    pub text: String,
81    /// The zone hierarchy (None if the text has no zone structure).
82    pub root: Option<TextZone>,
83}
84
85impl TextLayer {
86    /// Get the text content of a specific zone.
87    pub fn zone_text(&self, zone: &TextZone) -> &str {
88        let end = (zone.text_start + zone.text_len).min(self.text.len());
89        let start = zone.text_start.min(end);
90        // Ensure we don't split multi-byte UTF-8 characters
91        if self.text.is_char_boundary(start) && self.text.is_char_boundary(end) {
92            &self.text[start..end]
93        } else {
94            ""
95        }
96    }
97}
98
/// Kind of component listed in the DIRM directory.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum ComponentType {
    /// Shared include component (type code 0, DJVI); also the fallback for
    /// unrecognised type codes.
    Shared,
    /// Page component (type code 1, DJVU).
    Page,
    /// Thumbnail component (type code 2, THUM).
    Thumbnail,
}
106
107/// A component entry from the DIRM directory.
108#[derive(Debug, Clone)]
109struct DirmEntry {
110    comp_type: ComponentType,
111    id: String,
112}
113
114/// A parsed DjVu document (single-page or multi-page bundled).
115pub struct Document {
116    file: DjvuFile,
117    /// For DJVM: DIRM entries and FORM children (indexed by order in DIRM).
118    dirm_entries: Vec<DirmEntry>,
119    /// Indices into dirm_entries for page-type components only.
120    page_indices: Vec<usize>,
121    /// For single-page DJVU: true.
122    is_single_page: bool,
123    /// Cache of decoded shared JB2 dictionaries, keyed by pointer to raw Djbz chunk bytes.
124    ///
125    /// All pages that INCL the same DJVI component share one pointer → one decoded dict.
126    /// The pointer is stable because `DjvuFile` owns the underlying byte buffer for the
127    /// lifetime of `Document`.
128    dict_cache: RwLock<HashMap<usize, Arc<JB2Dict>>>,
129}
130
131impl Document {
132    /// Parse a DjVu document from raw bytes.
133    pub fn parse(data: &[u8]) -> Result<Self, Error> {
134        let file = crate::iff::parse(data)?;
135        match &file.root {
136            Chunk::Form {
137                secondary_id: [b'D', b'J', b'V', b'U'],
138                ..
139            } => {
140                Ok(Document {
141                    file,
142                    dirm_entries: vec![],
143                    page_indices: vec![0], // single page at index 0
144                    is_single_page: true,
145                    dict_cache: RwLock::new(HashMap::new()),
146                })
147            }
148            Chunk::Form {
149                secondary_id: [b'D', b'J', b'V', b'M'],
150                children,
151                ..
152            } => {
153                // Find and parse DIRM chunk
154                let dirm_chunk = children
155                    .iter()
156                    .find_map(|c| match c {
157                        Chunk::Leaf {
158                            id: [b'D', b'I', b'R', b'M'],
159                            data,
160                        } => Some(data.as_slice()),
161                        _ => None,
162                    })
163                    .ok_or(Error::MissingChunk("DIRM"))?;
164
165                let (dirm_entries, is_bundled) = parse_dirm(dirm_chunk)?;
166                if !is_bundled {
167                    return Err(Error::Unsupported("indirect DJVM not supported"));
168                }
169
170                let page_indices: Vec<usize> = dirm_entries
171                    .iter()
172                    .enumerate()
173                    .filter(|(_, e)| e.comp_type == ComponentType::Page)
174                    .map(|(i, _)| i)
175                    .collect();
176
177                Ok(Document {
178                    file,
179                    dirm_entries,
180                    page_indices,
181                    is_single_page: false,
182                    dict_cache: RwLock::new(HashMap::new()),
183                })
184            }
185            _ => Err(Error::Unsupported("not a DJVU or DJVM document")),
186        }
187    }
188
189    /// Number of pages (excluding thumbnails and shared components).
190    pub fn page_count(&self) -> usize {
191        self.page_indices.len()
192    }
193
194    /// Access a page by 0-based index.
195    pub fn page(&self, index: usize) -> Result<Page<'_>, Error> {
196        if index >= self.page_count() {
197            return Err(Error::FormatError(format!(
198                "page index {} out of range ({})",
199                index,
200                self.page_count()
201            )));
202        }
203
204        if self.is_single_page {
205            return Page::from_form(&self.file.root, self);
206        }
207
208        // Multi-page: find the FORM child corresponding to this page
209        let dirm_index = self.page_indices[index];
210        let form = self.get_component_form(dirm_index)?;
211        Page::from_form(form, self)
212    }
213
214    /// Get the FORM chunk for a DIRM component by its dirm index.
215    /// In bundled documents, FORM children after DIRM/NAVM correspond to DIRM entries in order.
216    fn get_component_form(&self, dirm_index: usize) -> Result<&Chunk, Error> {
217        let forms: Vec<&Chunk> = self
218            .file
219            .root
220            .children()
221            .iter()
222            .filter(|c| matches!(c, Chunk::Form { .. }))
223            .collect();
224
225        forms
226            .get(dirm_index)
227            .copied()
228            .ok_or(Error::FormatError(format!(
229                "component {} not found",
230                dirm_index
231            )))
232    }
233
234    /// Parse the NAVM bookmarks (table of contents).
235    ///
236    /// Returns an empty Vec if there is no NAVM chunk.
237    pub fn bookmarks(&self) -> Result<Vec<Bookmark>, Error> {
238        let navm_data = match self.file.root.find_first(b"NAVM") {
239            Some(c) => c.data(),
240            None => return Ok(vec![]),
241        };
242
243        let decoded = crate::bzz_new::bzz_decode(navm_data)
244            .map_err(|e| Error::FormatError(format!("NAVM BZZ decode: {}", e)))?;
245
246        if decoded.len() < 2 {
247            return Ok(vec![]);
248        }
249
250        let total_count = u16::from_be_bytes([decoded[0], decoded[1]]) as usize;
251        let mut pos = 2usize;
252        let mut bookmarks = Vec::new();
253        let mut decoded_count = 0usize;
254
255        while decoded_count < total_count {
256            let bm = parse_bookmark(&decoded, &mut pos, &mut decoded_count)?;
257            bookmarks.push(bm);
258        }
259
260        Ok(bookmarks)
261    }
262
263    /// Decode a thumbnail for the given page (0-based index).
264    ///
265    /// Thumbnails are stored in FORM:THUM components with TH44 (IW44) chunks.
266    /// Returns `Ok(None)` if no thumbnail exists for this page.
267    pub fn thumbnail(&self, page_index: usize) -> Result<Option<Pixmap>, Error> {
268        if self.is_single_page {
269            return Ok(None);
270        }
271
272        let mut thumb_idx: usize = 0;
273        for (i, entry) in self.dirm_entries.iter().enumerate() {
274            if entry.comp_type != ComponentType::Thumbnail {
275                continue;
276            }
277            let form = self.get_component_form(i)?;
278            let th44_chunks: Vec<&[u8]> = form
279                .find_all(b"TH44")
280                .into_iter()
281                .map(|c| c.data())
282                .collect();
283
284            let mut img = IW44Image::new();
285            for chunk_data in &th44_chunks {
286                if chunk_data.is_empty() {
287                    continue;
288                }
289                let serial = chunk_data[0];
290                if serial == 0 && img.width() > 0 {
291                    // Previous thumbnail is complete
292                    if thumb_idx == page_index {
293                        let pm = img
294                            .to_pixmap()
295                            .map_err(|e| Error::FormatError(e.to_string()))?;
296                        return Ok(Some(pm));
297                    }
298                    thumb_idx += 1;
299                    img = IW44Image::new();
300                }
301                img.decode_chunk(chunk_data)
302                    .map_err(|e| Error::FormatError(e.to_string()))?;
303            }
304            // Handle last thumbnail in this THUM
305            if img.width() > 0 {
306                if thumb_idx == page_index {
307                    let pm = img
308                        .to_pixmap()
309                        .map_err(|e| Error::FormatError(e.to_string()))?;
310                    return Ok(Some(pm));
311                }
312                thumb_idx += 1;
313            }
314        }
315
316        Ok(None)
317    }
318
319    /// Resolve an INCL reference to a shared DJVI component's children.
320    fn resolve_incl(&self, ref_id: &str) -> Result<&Chunk, Error> {
321        if self.is_single_page {
322            return Err(Error::FormatError("INCL in single-page document".into()));
323        }
324
325        for (i, entry) in self.dirm_entries.iter().enumerate() {
326            if entry.id == ref_id {
327                return self.get_component_form(i);
328            }
329        }
330
331        Err(Error::FormatError(format!(
332            "INCL target '{}' not found",
333            ref_id
334        )))
335    }
336
337    /// Return a shared reference to the decoded JB2 dictionary for the given Djbz chunk,
338    /// decoding and caching it on the first call.
339    ///
340    /// The cache is keyed by the address of the raw Djbz bytes, which is stable for the
341    /// lifetime of `Document` (the bytes are owned by `DjvuFile`).  All pages that INCL
342    /// the same DJVI component point to the same bytes, so they share one cached decode.
343    fn get_or_decode_dict(&self, djbz_data: &[u8]) -> Result<Arc<JB2Dict>, Error> {
344        let key = djbz_data.as_ptr() as usize;
345
346        // Fast path: already cached (read lock only).
347        {
348            let cache = self.dict_cache.read().unwrap();
349            if let Some(dict) = cache.get(&key) {
350                return Ok(Arc::clone(dict));
351            }
352        }
353
354        // Slow path: decode and insert (write lock).
355        let dict = crate::jb2::decode_dict(djbz_data, None)
356            .map_err(|e| Error::FormatError(e.to_string()))?;
357        let arc = Arc::new(dict);
358        self.dict_cache
359            .write()
360            .unwrap()
361            .insert(key, Arc::clone(&arc));
362        Ok(arc)
363    }
364}
365
366/// A single page within a DjVu document.
367pub struct Page<'a> {
368    pub info: PageInfo,
369    form: &'a Chunk,
370    doc: &'a Document,
371}
372
373impl<'a> Page<'a> {
374    fn from_form(form: &'a Chunk, doc: &'a Document) -> Result<Self, Error> {
375        let info_chunk = form
376            .find_first(b"INFO")
377            .ok_or(Error::MissingChunk("INFO"))?;
378        let info = parse_info(info_chunk.data())?;
379        Ok(Page { info, form, doc })
380    }
381
382    #[cfg(test)]
383    pub fn has_mask(&self) -> bool {
384        self.form.find_first(b"Sjbz").is_some()
385    }
386
387    #[cfg(test)]
388    pub fn has_background(&self) -> bool {
389        self.form.find_first(b"BG44").is_some()
390    }
391
392    /// Returns `true` when the page has an IW44-encoded foreground colour layer (`FG44` chunk).
393    ///
394    /// Does **not** account for JPEG-encoded foreground (`FGjp`) or palette foreground
395    /// (`FGbz`). Use [`has_palette`] to check for `FGbz`. The legacy `render` path does
396    /// not decode `FGjp`, so this check is consistent with [`decode_foreground`] for that path.
397    ///
398    /// [`has_palette`]: Self::has_palette
399    /// [`decode_foreground`]: Self::decode_foreground
400    pub fn has_foreground(&self) -> bool {
401        self.form.find_first(b"FG44").is_some()
402    }
403
404    pub fn has_palette(&self) -> bool {
405        self.form.find_first(b"FGbz").is_some()
406    }
407
408    /// Decode the JB2 mask layer, resolving shared dictionaries via INCL.
409    pub fn decode_mask(&self) -> Result<Option<Bitmap>, Error> {
410        let sjbz = match self.form.find_first(b"Sjbz") {
411            Some(c) => c.data(),
412            None => return Ok(None),
413        };
414
415        let shared_dict = self.resolve_shared_dict()?;
416
417        let bitmap = crate::jb2::decode(sjbz, shared_dict.as_deref())
418            .map_err(|e| Error::FormatError(e.to_string()))?;
419        Ok(Some(bitmap))
420    }
421
422    /// Decode the JB2 mask with per-pixel blit index map (for FGbz palette compositing).
423    pub fn decode_mask_indexed(&self) -> Result<Option<(Bitmap, Vec<i32>)>, Error> {
424        let sjbz = match self.form.find_first(b"Sjbz") {
425            Some(c) => c.data(),
426            None => return Ok(None),
427        };
428
429        let shared_dict = self.resolve_shared_dict()?;
430
431        let result = crate::jb2::decode_indexed(sjbz, shared_dict.as_deref())
432            .map_err(|e| Error::FormatError(e.to_string()))?;
433        Ok(Some(result))
434    }
435
436    /// Decode the IW44 background layer.
437    pub fn decode_background(&self) -> Result<Option<Pixmap>, Error> {
438        let img = match self.decode_iw44_layer(b"BG44")? {
439            Some(img) => img,
440            None => return Ok(None),
441        };
442        let pm = img
443            .to_pixmap()
444            .map_err(|e| Error::FormatError(e.to_string()))?;
445        Ok(Some(pm))
446    }
447
448    /// Number of BG44 chunks in this page (0 = no background layer).
449    pub fn bg44_chunk_count(&self) -> usize {
450        self.form.find_all(b"BG44").len()
451    }
452
453    /// Decode the IW44 background progressively: return a pixmap after each
454    /// BG44 chunk.  The first entry is a coarse (blurry) preview decoded from
455    /// just the first chunk; each subsequent entry is a more refined image.
456    /// The last entry is identical to `decode_background()`.
457    ///
458    /// Returns `Ok(None)` if there are no BG44 chunks.
459    pub fn decode_background_progressive(&self) -> Result<Option<Vec<Pixmap>>, Error> {
460        let chunks: Vec<&[u8]> = self
461            .form
462            .find_all(b"BG44")
463            .into_iter()
464            .map(|c| c.data())
465            .collect();
466
467        if chunks.is_empty() {
468            return Ok(None);
469        }
470
471        let mut img = IW44Image::new();
472        let mut frames = Vec::with_capacity(chunks.len());
473
474        for chunk_data in &chunks {
475            img.decode_chunk(chunk_data)
476                .map_err(|e| Error::FormatError(e.to_string()))?;
477            let pm = img
478                .to_pixmap()
479                .map_err(|e| Error::FormatError(e.to_string()))?;
480            frames.push(pm);
481        }
482
483        Ok(Some(frames))
484    }
485
486    /// Decode only the first BG44 chunk — a coarse (blurry) preview.
487    ///
488    /// Much faster than `decode_background()` because it skips refinement
489    /// chunks and only does one inverse wavelet transform. Returns `None`
490    /// if there are no BG44 chunks, or if there is only one chunk (in
491    /// which case `decode_background()` is already fast enough).
492    pub fn decode_background_coarse(&self) -> Result<Option<Pixmap>, Error> {
493        let chunks: Vec<&[u8]> = self
494            .form
495            .find_all(b"BG44")
496            .into_iter()
497            .map(|c| c.data())
498            .collect();
499
500        // Only worth it for multi-chunk backgrounds.
501        if chunks.len() <= 1 {
502            return Ok(None);
503        }
504
505        let mut img = IW44Image::new();
506        img.decode_chunk(chunks[0])
507            .map_err(|e| Error::FormatError(e.to_string()))?;
508        let pm = img
509            .to_pixmap()
510            .map_err(|e| Error::FormatError(e.to_string()))?;
511        Ok(Some(pm))
512    }
513
514    /// Decode the IW44 foreground layer.
515    pub fn decode_foreground(&self) -> Result<Option<Pixmap>, Error> {
516        let img = match self.decode_iw44_layer(b"FG44")? {
517            Some(img) => img,
518            None => return Ok(None),
519        };
520        let pm = img
521            .to_pixmap()
522            .map_err(|e| Error::FormatError(e.to_string()))?;
523        Ok(Some(pm))
524    }
525
526    #[cfg(test)]
527    pub fn decode_background_planes(&self) -> Result<Option<NormalizedPlanes>, Error> {
528        let img = match self.decode_iw44_layer(b"BG44")? {
529            Some(img) => img,
530            None => return Ok(None),
531        };
532        let planes = img
533            .to_normalized_planes_subsample(1)
534            .map_err(|e| Error::FormatError(e.to_string()))?;
535        Ok(Some(planes))
536    }
537
538    /// Parse the FGbz palette chunk.
539    pub fn decode_palette(&self) -> Result<Option<Palette>, Error> {
540        let fgbz = match self.form.find_first(b"FGbz") {
541            Some(c) => c.data(),
542            None => return Ok(None),
543        };
544        let palette = parse_fgbz(fgbz)?;
545        Ok(Some(palette))
546    }
547
548    /// Decode the text layer (TXTz or TXTa chunk).
549    ///
550    /// Returns `Ok(None)` if the page has no text layer.
551    pub fn text_layer(&self) -> Result<Option<TextLayer>, Error> {
552        // Try TXTz (BZZ-compressed) first, then TXTa (uncompressed)
553        let data = if let Some(txtz) = self.form.find_first(b"TXTz") {
554            let compressed = txtz.data();
555            if compressed.is_empty() {
556                return Ok(None);
557            }
558            crate::bzz_new::bzz_decode(compressed)
559                .map_err(|e| Error::FormatError(format!("TXTz BZZ decode: {}", e)))?
560        } else if let Some(txta) = self.form.find_first(b"TXTa") {
561            txta.data().to_vec()
562        } else {
563            return Ok(None);
564        };
565
566        parse_text_layer(&data)
567    }
568
569    /// Return the shared JB2 dictionary for this page, if any.
570    ///
571    /// Decodes the Djbz chunk on the first call and caches the result via
572    /// [`Document::get_or_decode_dict`].  Subsequent calls for pages that share
573    /// the same INCL component (or the same inline Djbz) are O(1) cache lookups.
574    fn resolve_shared_dict(&self) -> Result<Option<Arc<JB2Dict>>, Error> {
575        // Check all INCL chunks for an external DJVI component with Djbz
576        for incl in self.form.find_all(b"INCL") {
577            let ref_id = std::str::from_utf8(incl.data())
578                .map_err(|_| Error::FormatError("invalid INCL UTF-8".into()))?
579                .trim_end_matches('\0')
580                .trim();
581
582            let shared_form = self.doc.resolve_incl(ref_id)?;
583            if let Some(djbz) = shared_form.find_first(b"Djbz") {
584                return Ok(Some(self.doc.get_or_decode_dict(djbz.data())?));
585            }
586        }
587
588        // Then check for inline Djbz in the same FORM as Sjbz
589        if let Some(djbz) = self.form.find_first(b"Djbz") {
590            return Ok(Some(self.doc.get_or_decode_dict(djbz.data())?));
591        }
592
593        Ok(None)
594    }
595
596    fn decode_iw44_layer(&self, chunk_id: &[u8; 4]) -> Result<Option<IW44Image>, Error> {
597        let chunks: Vec<&[u8]> = self
598            .form
599            .find_all(chunk_id)
600            .into_iter()
601            .map(|c| c.data())
602            .collect();
603
604        if chunks.is_empty() {
605            return Ok(None);
606        }
607
608        let mut img = IW44Image::new();
609        for chunk_data in &chunks {
610            img.decode_chunk(chunk_data)
611                .map_err(|e| Error::FormatError(e.to_string()))?;
612        }
613        Ok(Some(img))
614    }
615}
616
617// ============================================================
618// INFO chunk parser
619// ============================================================
620
621fn parse_info(data: &[u8]) -> Result<PageInfo, Error> {
622    if data.len() < 5 {
623        return Err(Error::InvalidLength);
624    }
625
626    let width = u16::from_be_bytes([data[0], data[1]]);
627    let height = u16::from_be_bytes([data[2], data[3]]);
628    let _minver = data[4];
629    let _majver = if data.len() > 5 { data[5] } else { 0 };
630
631    // DPI is little-endian (unusual for IFF)
632    let raw_dpi = if data.len() >= 8 {
633        u16::from_le_bytes([data[6], data[7]])
634    } else {
635        300
636    };
637    let dpi = if (25..=6000).contains(&raw_dpi) {
638        raw_dpi
639    } else {
640        300
641    };
642
643    let gamma_byte = if data.len() >= 9 { data[8] } else { 0 };
644    let gamma = if gamma_byte == 0 {
645        2.2_f32
646    } else {
647        gamma_byte as f32 / 10.0
648    };
649
650    let flags = if data.len() >= 10 { data[9] } else { 0 };
651    let rotation = match flags & 0x07 {
652        5 => Rotation::Cw90,
653        2 => Rotation::Cw180,
654        6 => Rotation::Cw270,
655        _ => Rotation::None,
656    };
657
658    Ok(PageInfo {
659        width,
660        height,
661        dpi,
662        gamma,
663        rotation,
664    })
665}
666
667// ============================================================
668// DIRM chunk parser
669// ============================================================
670
671fn parse_dirm(data: &[u8]) -> Result<(Vec<DirmEntry>, bool), Error> {
672    if data.len() < 3 {
673        return Err(Error::InvalidLength);
674    }
675
676    let dflags = data[0];
677    let is_bundled = (dflags >> 7) != 0;
678    let nfiles = u16::from_be_bytes([data[1], data[2]]) as usize;
679
680    let mut pos = 3;
681
682    // Skip offsets array for bundled documents
683    if is_bundled {
684        let offsets_size = nfiles * 4;
685        if pos + offsets_size > data.len() {
686            return Err(Error::UnexpectedEof);
687        }
688        pos += offsets_size;
689    }
690
691    // Remaining bytes are BZZ-compressed metadata
692    let bzz_data = &data[pos..];
693    let meta =
694        crate::bzz_new::bzz_decode(bzz_data).map_err(|e| Error::FormatError(e.to_string()))?;
695
696    // Parse metadata: for each component, read size(3), flags(1), id(strNT), name?(strNT), title?(strNT)
697    let mut mpos = 0;
698    // First: skip sizes (3 bytes each)
699    mpos += nfiles * 3;
700
701    // Read flags for all components
702    if mpos + nfiles > meta.len() {
703        return Err(Error::UnexpectedEof);
704    }
705    let flags: Vec<u8> = meta[mpos..mpos + nfiles].to_vec();
706    mpos += nfiles;
707
708    // Read IDs and optional name/title strings
709    let mut entries = Vec::with_capacity(nfiles);
710    for &flag in flags.iter().take(nfiles) {
711        let id = read_str_nt(&meta, &mut mpos)?;
712        let has_name = (flag & 0x80) != 0;
713        let has_title = (flag & 0x40) != 0;
714        if has_name {
715            let _ = read_str_nt(&meta, &mut mpos)?;
716        }
717        if has_title {
718            let _ = read_str_nt(&meta, &mut mpos)?;
719        }
720
721        let comp_type = match flag & 0x3f {
722            1 => ComponentType::Page,
723            2 => ComponentType::Thumbnail,
724            _ => ComponentType::Shared,
725        };
726
727        entries.push(DirmEntry { comp_type, id });
728    }
729
730    Ok((entries, is_bundled))
731}
732
733fn read_str_nt(data: &[u8], pos: &mut usize) -> Result<String, Error> {
734    let start = *pos;
735    while *pos < data.len() && data[*pos] != 0 {
736        *pos += 1;
737    }
738    if *pos >= data.len() {
739        return Err(Error::UnexpectedEof);
740    }
741    let s = std::str::from_utf8(&data[start..*pos])
742        .map_err(|_| Error::FormatError("invalid UTF-8 in DIRM".into()))?;
743    *pos += 1; // skip null terminator
744    Ok(s.to_string())
745}
746
747// ============================================================
748// FGbz palette parser
749// ============================================================
750
751fn parse_fgbz(data: &[u8]) -> Result<Palette, Error> {
752    if data.len() < 3 {
753        return Err(Error::InvalidLength);
754    }
755
756    let version = data[0];
757    if (version & 0x7f) != 0 {
758        return Err(Error::Unsupported("unsupported FGbz version"));
759    }
760
761    let palette_size = u16::from_be_bytes([data[1], data[2]]) as usize;
762    let color_bytes = palette_size * 3;
763    if data.len() < 3 + color_bytes {
764        return Err(Error::UnexpectedEof);
765    }
766
767    // Colors are stored as BGR triplets
768    let mut colors = Vec::with_capacity(palette_size);
769    for i in 0..palette_size {
770        let base = 3 + i * 3;
771        let b = data[base];
772        let g = data[base + 1];
773        let r = data[base + 2];
774        colors.push((r, g, b));
775    }
776
777    let mut indices = Vec::new();
778    if (version & 0x80) != 0 {
779        let idx_start = 3 + color_bytes;
780        if idx_start + 3 > data.len() {
781            return Err(Error::UnexpectedEof);
782        }
783        let data_size = ((data[idx_start] as u32) << 16)
784            | ((data[idx_start + 1] as u32) << 8)
785            | (data[idx_start + 2] as u32);
786
787        let bzz_data = &data[idx_start + 3..];
788        let decoded =
789            crate::bzz_new::bzz_decode(bzz_data).map_err(|e| Error::FormatError(e.to_string()))?;
790
791        // Each index is i16be
792        let num_indices = data_size as usize;
793        if decoded.len() < num_indices * 2 {
794            return Err(Error::UnexpectedEof);
795        }
796        indices.reserve(num_indices);
797        for i in 0..num_indices {
798            let idx = i16::from_be_bytes([decoded[i * 2], decoded[i * 2 + 1]]);
799            indices.push(idx);
800        }
801    }
802
803    Ok(Palette { colors, indices })
804}
805
806// ============================================================
807// NAVM bookmark parser
808// ============================================================
809
810fn parse_bookmark(data: &[u8], pos: &mut usize, counter: &mut usize) -> Result<Bookmark, Error> {
811    if *pos >= data.len() {
812        return Err(Error::UnexpectedEof);
813    }
814    let children_count = data[*pos] as usize;
815    *pos += 1;
816
817    let title = read_navm_string(data, pos)?;
818    let url = read_navm_string(data, pos)?;
819    *counter += 1;
820
821    let mut children = Vec::with_capacity(children_count);
822    for _ in 0..children_count {
823        children.push(parse_bookmark(data, pos, counter)?);
824    }
825
826    Ok(Bookmark {
827        title,
828        url,
829        children,
830    })
831}
832
833fn read_navm_string(data: &[u8], pos: &mut usize) -> Result<String, Error> {
834    if *pos + 3 > data.len() {
835        return Err(Error::UnexpectedEof);
836    }
837    let len = ((data[*pos] as usize) << 16)
838        | ((data[*pos + 1] as usize) << 8)
839        | (data[*pos + 2] as usize);
840    *pos += 3;
841
842    if *pos + len > data.len() {
843        return Err(Error::UnexpectedEof);
844    }
845    let s = std::str::from_utf8(&data[*pos..*pos + len])
846        .map_err(|_| Error::FormatError("invalid UTF-8 in NAVM bookmark".into()))?;
847    *pos += len;
848    Ok(s.to_string())
849}
850
851// ============================================================
852// TXTz / TXTa text layer parser
853// ============================================================
854
855fn parse_text_layer(data: &[u8]) -> Result<Option<TextLayer>, Error> {
856    if data.len() < 3 {
857        return Ok(None);
858    }
859
860    let mut pos = 0;
861
862    // Read text length (u24be)
863    let text_len = read_text_u24(data, &mut pos)?;
864
865    // Read UTF-8 text
866    if pos + text_len > data.len() {
867        return Err(Error::UnexpectedEof);
868    }
869    let text = std::str::from_utf8(&data[pos..pos + text_len])
870        .map_err(|_| Error::FormatError("invalid UTF-8 in text layer".into()))?
871        .to_string();
872    pos += text_len;
873
874    // Read version byte
875    if pos >= data.len() {
876        return Ok(Some(TextLayer { text, root: None }));
877    }
878    let _version = data[pos];
879    pos += 1;
880
881    // Parse zone tree if there's more data
882    if pos >= data.len() {
883        return Ok(Some(TextLayer { text, root: None }));
884    }
885
886    let root = parse_text_zone(data, &mut pos, None, None)?;
887    Ok(Some(TextLayer {
888        text,
889        root: Some(root),
890    }))
891}
892
/// Already-resolved geometry and text span of a zone, kept while parsing so
/// that the delta-encoded fields of a child or following sibling can be
/// turned into absolute values.
struct ZoneCtx {
    /// Resolved horizontal position.
    x: i32,
    /// Resolved vertical position.
    y: i32,
    /// Zone width.
    width: i32,
    /// Zone height.
    height: i32,
    /// Resolved start offset of the zone's text.
    text_start: i32,
    /// Length of the zone's text.
    text_len: i32,
}
902
903fn parse_text_zone(
904    data: &[u8],
905    pos: &mut usize,
906    parent: Option<&ZoneCtx>,
907    prev: Option<&ZoneCtx>,
908) -> Result<TextZone, Error> {
909    if *pos >= data.len() {
910        return Err(Error::UnexpectedEof);
911    }
912
913    let type_byte = data[*pos];
914    *pos += 1;
915
916    let kind = match type_byte {
917        1 => TextZoneKind::Page,
918        2 => TextZoneKind::Column,
919        3 => TextZoneKind::Region,
920        4 => TextZoneKind::Paragraph,
921        5 => TextZoneKind::Line,
922        6 => TextZoneKind::Word,
923        7 => TextZoneKind::Character,
924        _ => {
925            return Err(Error::FormatError(format!(
926                "unknown text zone type {}",
927                type_byte
928            )));
929        }
930    };
931
932    // Read raw delta-encoded values
933    let mut x = read_text_i16_biased(data, pos)?;
934    let mut y = read_text_i16_biased(data, pos)?;
935    let width = read_text_i16_biased(data, pos)?;
936    let height = read_text_i16_biased(data, pos)?;
937    let mut text_start = read_text_i16_biased(data, pos)?;
938    let text_len = read_text_i24(data, pos)?;
939
940    // Apply delta encoding (matches djvujs DjVuText.js decodeZone)
941    if let Some(prev) = prev {
942        match type_byte {
943            1 | 4 | 5 => {
944                // PAGE, PARAGRAPH, LINE
945                x += prev.x;
946                y = prev.y - (y + height);
947            }
948            _ => {
949                // COLUMN, REGION, WORD, CHARACTER
950                x += prev.x + prev.width;
951                y += prev.y;
952            }
953        }
954        text_start += prev.text_start + prev.text_len;
955    } else if let Some(parent) = parent {
956        x += parent.x;
957        y = parent.y + parent.height - (y + height);
958        text_start += parent.text_start;
959    }
960
961    // Read children count (i24)
962    let children_count = read_text_i24(data, pos)?.max(0) as usize;
963
964    let ctx = ZoneCtx {
965        x,
966        y,
967        width,
968        height,
969        text_start,
970        text_len,
971    };
972
973    let mut children = Vec::with_capacity(children_count);
974    let mut prev_child: Option<ZoneCtx> = None;
975
976    for _ in 0..children_count {
977        let child = parse_text_zone(data, pos, Some(&ctx), prev_child.as_ref())?;
978        prev_child = Some(ZoneCtx {
979            x: child.x,
980            y: child.y,
981            width: child.width,
982            height: child.height,
983            text_start: child.text_start as i32,
984            text_len: child.text_len as i32,
985        });
986        children.push(child);
987    }
988
989    Ok(TextZone {
990        kind,
991        x,
992        y,
993        width,
994        height,
995        text_start: text_start.max(0) as usize,
996        text_len: text_len.max(0) as usize,
997        children,
998    })
999}
1000
1001fn read_text_u24(data: &[u8], pos: &mut usize) -> Result<usize, Error> {
1002    if *pos + 3 > data.len() {
1003        return Err(Error::UnexpectedEof);
1004    }
1005    let val = ((data[*pos] as usize) << 16)
1006        | ((data[*pos + 1] as usize) << 8)
1007        | (data[*pos + 2] as usize);
1008    *pos += 3;
1009    Ok(val)
1010}
1011
1012fn read_text_i16_biased(data: &[u8], pos: &mut usize) -> Result<i32, Error> {
1013    if *pos + 2 > data.len() {
1014        return Err(Error::UnexpectedEof);
1015    }
1016    let raw = u16::from_be_bytes([data[*pos], data[*pos + 1]]);
1017    *pos += 2;
1018    Ok(raw as i32 - 0x8000)
1019}
1020
1021fn read_text_i24(data: &[u8], pos: &mut usize) -> Result<i32, Error> {
1022    if *pos + 3 > data.len() {
1023        return Err(Error::UnexpectedEof);
1024    }
1025    let val =
1026        ((data[*pos] as i32) << 16) | ((data[*pos + 1] as i32) << 8) | (data[*pos + 2] as i32);
1027    *pos += 3;
1028    Ok(val)
1029}
1030
#[cfg(test)]
mod tests {
    use super::*;

    /// Directory holding the sample DjVu files (from the djvujs reference checkout).
    fn assets_path() -> std::path::PathBuf {
        std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("references/djvujs/library/assets")
    }

    /// Directory holding golden outputs for document-level tests.
    fn golden_path() -> std::path::PathBuf {
        std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/golden/document")
    }

    /// Reads a sample file from `assets_path()`, panicking with the file name
    /// if it cannot be read.
    fn read_asset(name: &str) -> Vec<u8> {
        std::fs::read(assets_path().join(name))
            .unwrap_or_else(|e| panic!("cannot read asset {}: {}", name, e))
    }

    /// Checks every page size of `asset` against the golden file, which holds
    /// one `width=W height=H` line per page. Blank lines are ignored.
    fn assert_page_sizes_match_golden(asset: &str, golden_file: &str, label: &str) {
        let data = read_asset(asset);
        let doc = Document::parse(&data).unwrap();

        let golden = std::fs::read_to_string(golden_path().join(golden_file)).unwrap();
        for (i, line) in golden.lines().enumerate() {
            let expected = line.trim();
            if expected.is_empty() {
                continue;
            }
            let page = doc.page(i).unwrap();
            let actual = format!("width={} height={}", page.info.width, page.info.height);
            assert_eq!(
                actual,
                expected,
                "size mismatch for {} page {}",
                label,
                i + 1
            );
        }
    }

    #[test]
    fn page_counts() {
        let cases: &[(&str, usize)] = &[
            ("boy_jb2.djvu", 1),
            ("boy.djvu", 1),
            ("chicken.djvu", 1),
            ("navm_fgbz.djvu", 6),
            ("DjVu3Spec_bundled.djvu", 71),
            ("colorbook.djvu", 62),
        ];
        for (file, expected) in cases {
            let data = read_asset(file);
            let doc = Document::parse(&data).unwrap();
            assert_eq!(
                doc.page_count(),
                *expected,
                "page count mismatch for {}",
                file
            );
        }
    }

    #[test]
    fn page_dimensions_navm_fgbz() {
        assert_page_sizes_match_golden("navm_fgbz.djvu", "navm_fgbz_sizes.txt", "navm_fgbz");
    }

    #[test]
    fn page_dimensions_djvu3spec() {
        assert_page_sizes_match_golden(
            "DjVu3Spec_bundled.djvu",
            "djvu3spec_bundled_sizes.txt",
            "djvu3spec",
        );
    }

    #[test]
    fn layer_availability() {
        // boy_jb2: mask only
        let data = read_asset("boy_jb2.djvu");
        let doc = Document::parse(&data).unwrap();
        let p = doc.page(0).unwrap();
        assert!(p.has_mask());
        assert!(!p.has_background());
        assert!(!p.has_foreground());
        assert!(!p.has_palette());

        // chicken: background only
        let data = read_asset("chicken.djvu");
        let doc = Document::parse(&data).unwrap();
        let p = doc.page(0).unwrap();
        assert!(!p.has_mask());
        assert!(p.has_background());
        assert!(!p.has_foreground());
        assert!(!p.has_palette());

        // navm_fgbz p1: mask + palette + background
        let data = read_asset("navm_fgbz.djvu");
        let doc = Document::parse(&data).unwrap();
        let p = doc.page(0).unwrap();
        assert!(p.has_mask());
        assert!(p.has_background());
        assert!(!p.has_foreground());
        assert!(p.has_palette());
    }

    #[test]
    fn decode_mask_matches_direct_boy_jb2() {
        let data = read_asset("boy_jb2.djvu");

        // Via document API
        let doc = Document::parse(&data).unwrap();
        let mask_via_doc = doc.page(0).unwrap().decode_mask().unwrap().unwrap();

        // Via direct JB2 decode
        let file = crate::iff::parse(&data).unwrap();
        let sjbz = file.root.find_first(b"Sjbz").unwrap();
        let mask_direct = crate::jb2::decode(sjbz.data(), None).unwrap();

        assert_eq!(mask_via_doc.data, mask_direct.data, "mask data mismatch");
    }

    #[test]
    fn decode_mask_with_shared_dict() {
        // navm_fgbz page 1 uses INCL → dict0006.iff
        let data = read_asset("navm_fgbz.djvu");
        let doc = Document::parse(&data).unwrap();
        let mask = doc.page(0).unwrap().decode_mask().unwrap();
        assert!(mask.is_some(), "expected mask for navm_fgbz p1");
        let bm = mask.unwrap();
        assert_eq!(bm.width, 2550);
        assert_eq!(bm.height, 3300);
    }

    /// Calling decode_mask() twice on the same page must return bit-for-bit identical
    /// bitmaps.  This verifies that the shared-dict cache is not corrupted across
    /// repeated calls — a regression that would be invisible if the dict were
    /// re-decoded from scratch each time (both calls would independently produce the
    /// same output, masking any cache-poisoning bug).
    #[test]
    fn decode_mask_repeated_calls_identical() {
        let data = read_asset("navm_fgbz.djvu");
        let doc = Document::parse(&data).unwrap();
        let bm1 = doc.page(0).unwrap().decode_mask().unwrap().unwrap();
        let bm2 = doc.page(0).unwrap().decode_mask().unwrap().unwrap();
        assert_eq!(
            bm1.data, bm2.data,
            "decode_mask repeated calls must be identical (cache must not corrupt dict state)"
        );
    }

    #[test]
    fn decode_background_chicken() {
        let data = read_asset("chicken.djvu");
        let doc = Document::parse(&data).unwrap();
        let bg = doc.page(0).unwrap().decode_background().unwrap();
        assert!(bg.is_some());
        let pm = bg.unwrap();
        assert_eq!(pm.width, 181);
        assert_eq!(pm.height, 240);
    }

    #[test]
    fn decode_palette_navm_fgbz() {
        let data = read_asset("navm_fgbz.djvu");
        let doc = Document::parse(&data).unwrap();
        let pal = doc.page(0).unwrap().decode_palette().unwrap();
        assert!(pal.is_some());
        let p = pal.unwrap();
        assert_eq!(p.colors.len(), 2); // FGbz with 2 colors per dump
    }

    #[test]
    fn page_info_dpi() {
        let data = read_asset("navm_fgbz.djvu");
        let doc = Document::parse(&data).unwrap();
        let p = doc.page(0).unwrap();
        assert_eq!(p.info.dpi, 300);
    }

    /// Returns the pixel payload of a binary PPM: everything after the three
    /// newline-terminated header lines (magic, dimensions, maxval).
    /// Returns `None` if the header is incomplete.
    fn ppm_payload(ppm: &[u8]) -> Option<&[u8]> {
        let mut rest = ppm;
        for _ in 0..3 {
            let newline = rest.iter().position(|&b| b == b'\n')?;
            rest = &rest[newline + 1..];
        }
        Some(rest)
    }

    /// Manual diagnostic: prints how far the decoded background is from a
    /// reference PPM. Cases whose reference file is absent are skipped.
    #[test]
    #[ignore]
    fn debug_bg_lowres_vs_ddjvu() {
        let cases = [
            ("carte.djvu", 0usize, "/tmp/rdjvu_debug/carte_bg_sub3.ppm"),
            (
                "colorbook.djvu",
                0usize,
                "/tmp/rdjvu_debug/colorbook_p1_bg_sub3.ppm",
            ),
            (
                "navm_fgbz.djvu",
                3usize,
                "/tmp/rdjvu_debug/navm_p4_bg_sub3.ppm",
            ),
        ];
        for (file, page_idx, ref_file) in cases {
            let ref_path = std::path::Path::new(ref_file);
            if !ref_path.exists() {
                continue;
            }
            let data = read_asset(file);
            let doc = Document::parse(&data).unwrap();
            let page = doc.page(page_idx).unwrap();
            let bg = page.decode_background().unwrap().unwrap();
            let actual = bg.to_ppm();
            let expected = std::fs::read(ref_path).unwrap();

            // Skip each header independently: the two files may report
            // different dimensions and therefore have headers of different length.
            let a = ppm_payload(&actual).expect("decoded PPM has an incomplete header");
            let e = ppm_payload(&expected).expect("reference PPM has an incomplete header");

            let px = a.len().min(e.len()) / 3;
            let mut diff_px = 0usize;
            let mut abs = [0u64; 3];
            for (pa, pe) in a.chunks_exact(3).zip(e.chunks_exact(3)) {
                if pa != pe {
                    diff_px += 1;
                }
                for (acc, (&ca, &ce)) in abs.iter_mut().zip(pa.iter().zip(pe)) {
                    *acc += u64::from((i32::from(ca) - i32::from(ce)).unsigned_abs());
                }
            }
            // Avoid printing NaN when there is nothing to compare.
            let denom = px.max(1) as f64;
            eprintln!(
                "{} p{} bg-lowres mismatch_px={} mean_abs=({:.3},{:.3},{:.3}) dims_a={} dims_e={}",
                file,
                page_idx + 1,
                diff_px,
                abs[0] as f64 / denom,
                abs[1] as f64 / denom,
                abs[2] as f64 / denom,
                a.len() / 3,
                e.len() / 3
            );
        }
    }

    #[test]
    fn bookmarks_navm_fgbz() {
        let data = read_asset("navm_fgbz.djvu");
        let doc = Document::parse(&data).unwrap();
        let bm = doc.bookmarks().unwrap();

        // 4 top-level bookmarks
        assert_eq!(bm.len(), 4);

        assert_eq!(bm[0].title, "Links");
        assert_eq!(bm[0].url, "#1");
        assert!(bm[0].children.is_empty());

        assert_eq!(bm[1].title, "Ink, Rectangles, Ellipses, Lines");
        assert_eq!(bm[1].url, "#2");
        assert!(bm[1].children.is_empty());

        assert_eq!(bm[2].title, "Stamps");
        assert_eq!(bm[2].url, "#3");
        assert_eq!(bm[2].children.len(), 2);
        assert_eq!(bm[2].children[0].title, "Stamps - Faces");
        assert_eq!(bm[2].children[0].url, "#4");
        assert_eq!(bm[2].children[1].title, "Stamps - Pointers");
        assert_eq!(bm[2].children[1].url, "#5");

        assert_eq!(bm[3].title, "Last Page");
        assert_eq!(bm[3].url, "#6");
        assert!(bm[3].children.is_empty());
    }

    #[test]
    fn bookmarks_empty_for_single_page() {
        let data = read_asset("boy_jb2.djvu");
        let doc = Document::parse(&data).unwrap();
        let bm = doc.bookmarks().unwrap();
        assert!(bm.is_empty());
    }

    #[test]
    fn bookmarks_empty_for_no_navm() {
        let data = read_asset("colorbook.djvu");
        let doc = Document::parse(&data).unwrap();
        let bm = doc.bookmarks().unwrap();
        assert!(bm.is_empty());
    }

    // --- Phase 6.2: Edge case tests ---

    #[test]
    fn document_empty_input() {
        assert!(Document::parse(&[]).is_err());
    }

    #[test]
    fn document_truncated_file() {
        // Just the AT&T magic, not enough for a FORM
        assert!(Document::parse(b"AT&T").is_err());
    }

    #[test]
    fn document_missing_info_chunk() {
        // Valid IFF structure but no INFO chunk — page should have sensible error
        let mut data = b"AT&TFORM".to_vec();
        let form_size = 4 + 4 + 4 + 4; // secondary + chunk_id + size + data(4)
        data.extend_from_slice(&(form_size as u32).to_be_bytes());
        data.extend_from_slice(b"DJVU");
        data.extend_from_slice(b"Sjbz");
        data.extend_from_slice(&4u32.to_be_bytes());
        data.extend_from_slice(&[0u8; 4]);
        let result = Document::parse(&data);
        // Should either fail to parse or fail when accessing page info
        match result {
            Err(_) => {} // expected
            Ok(doc) => {
                assert!(doc.page(0).is_err());
            }
        }
    }

    #[test]
    fn document_page_out_of_bounds() {
        let data = read_asset("boy_jb2.djvu");
        let doc = Document::parse(&data).unwrap();
        assert_eq!(doc.page_count(), 1);
        assert!(doc.page(1).is_err());
        assert!(doc.page(100).is_err());
    }

    #[test]
    fn document_missing_optional_chunks() {
        // boy_jb2.djvu has no BG44 or FG44 — decode should return None
        let data = read_asset("boy_jb2.djvu");
        let doc = Document::parse(&data).unwrap();
        let page = doc.page(0).unwrap();
        assert!(page.decode_background().unwrap().is_none());
        assert!(page.decode_foreground().unwrap().is_none());
        assert!(!page.has_palette());
    }

    // --- Text extraction tests ---

    /// Directory holding golden `djvused print-txt` outputs.
    fn text_golden_path() -> std::path::PathBuf {
        std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/golden/text")
    }

    /// Format a TextZone tree as djvused print-txt output for comparison.
    fn format_zone(layer: &TextLayer, zone: &TextZone, indent: usize) -> String {
        let mut out = String::new();
        let pad = " ".repeat(indent);
        let kind_str = match zone.kind {
            TextZoneKind::Page => "page",
            TextZoneKind::Column => "column",
            TextZoneKind::Region => "region",
            TextZoneKind::Paragraph => "para",
            TextZoneKind::Line => "line",
            TextZoneKind::Word => "word",
            TextZoneKind::Character => "char",
        };
        let x2 = zone.x + zone.width;
        let y2 = zone.y + zone.height;

        if zone.children.is_empty() {
            // Leaf zone: include text (strip trailing whitespace like djvused)
            let text = layer.zone_text(zone);
            let trimmed = text.trim_end();
            let escaped = djvused_escape(trimmed);
            out.push_str(&format!(
                "{}({} {} {} {} {} \"{}\")",
                pad, kind_str, zone.x, zone.y, x2, y2, escaped
            ));
        } else {
            out.push_str(&format!(
                "{}({} {} {} {} {}",
                pad, kind_str, zone.x, zone.y, x2, y2
            ));
            for child in &zone.children {
                out.push('\n');
                out.push_str(&format_zone(layer, child, indent + 1));
            }
            out.push(')');
        }
        out
    }

    /// Escape text like djvused: non-printable and non-ASCII bytes as 3-digit octal.
    fn djvused_escape(text: &str) -> String {
        let mut out = String::new();
        for b in text.bytes() {
            match b {
                b'\\' => out.push_str("\\\\"),
                b'"' => out.push_str("\\\""),
                0x20..=0x7e => out.push(b as char),
                _ => out.push_str(&format!("\\{:03o}", b)),
            }
        }
        out
    }

    /// Formats the whole zone tree of `layer`, or an empty string when the
    /// layer has no zones.
    fn format_text_layer(layer: &TextLayer) -> String {
        match &layer.root {
            Some(root) => format_zone(layer, root, 0),
            None => String::new(),
        }
    }

    /// Compares the formatted zone tree of `layer` with a golden djvused dump,
    /// ignoring leading and trailing whitespace.
    fn assert_text_matches_golden(layer: &TextLayer, golden_file: &str, label: &str) {
        let golden = std::fs::read_to_string(text_golden_path().join(golden_file)).unwrap();
        let actual = format_text_layer(layer);
        assert_eq!(actual.trim(), golden.trim(), "{} text mismatch", label);
    }

    #[test]
    fn text_layer_none_for_no_text() {
        // boy_jb2 has no text layer
        let data = read_asset("boy_jb2.djvu");
        let doc = Document::parse(&data).unwrap();
        let tl = doc.page(0).unwrap().text_layer().unwrap();
        assert!(tl.is_none());
    }

    #[test]
    fn text_layer_carte_p1() {
        let data = read_asset("carte.djvu");
        let doc = Document::parse(&data).unwrap();
        let tl = doc.page(0).unwrap().text_layer().unwrap().unwrap();

        // Verify text is non-empty
        assert!(!tl.text.is_empty(), "carte text should not be empty");

        // Verify root zone is PAGE type
        let root = tl.root.as_ref().unwrap();
        assert_eq!(root.kind, TextZoneKind::Page);

        // Compare against golden djvused output
        assert_text_matches_golden(&tl, "carte_p1.txt", "carte p1");
    }

    #[test]
    fn text_layer_djvu3spec_p1() {
        let data = read_asset("DjVu3Spec_bundled.djvu");
        let doc = Document::parse(&data).unwrap();
        let tl = doc.page(0).unwrap().text_layer().unwrap().unwrap();

        assert!(!tl.text.is_empty());

        let root = tl.root.as_ref().unwrap();
        assert_eq!(root.kind, TextZoneKind::Page);
        // DjVu3Spec has full hierarchy: page → column → region → para → line → word
        assert!(!root.children.is_empty());

        assert_text_matches_golden(&tl, "djvu3spec_p1.txt", "djvu3spec p1");
    }

    #[test]
    fn text_layer_colorbook_p1() {
        let data = read_asset("colorbook.djvu");
        let doc = Document::parse(&data).unwrap();
        let tl = doc.page(0).unwrap().text_layer().unwrap().unwrap();

        assert!(!tl.text.is_empty());

        assert_text_matches_golden(&tl, "colorbook_p1.txt", "colorbook p1");
    }

    #[test]
    fn text_layer_czech_p6_utf8() {
        // Czech text with non-ASCII characters
        let data = read_asset("czech.djvu");
        let doc = Document::parse(&data).unwrap();
        let tl = doc.page(5).unwrap().text_layer().unwrap().unwrap();

        assert!(!tl.text.is_empty());

        assert_text_matches_golden(&tl, "czech_p6.txt", "czech p6");
    }

    #[test]
    fn text_layer_zone_text_access() {
        // Test the zone_text helper
        let data = read_asset("DjVu3Spec_bundled.djvu");
        let doc = Document::parse(&data).unwrap();
        let tl = doc.page(0).unwrap().text_layer().unwrap().unwrap();

        // Find the first word zone
        fn find_first_word(zone: &TextZone) -> Option<&TextZone> {
            if zone.kind == TextZoneKind::Word {
                return Some(zone);
            }
            for child in &zone.children {
                if let Some(w) = find_first_word(child) {
                    return Some(w);
                }
            }
            None
        }

        let root = tl.root.as_ref().unwrap();
        let word = find_first_word(root).expect("should have at least one word");
        let text = tl.zone_text(word);
        assert!(!text.is_empty(), "first word text should not be empty");
    }

    #[test]
    fn text_layer_all_pages_djvu3spec() {
        // All 71 pages should parse without error
        let data = read_asset("DjVu3Spec_bundled.djvu");
        let doc = Document::parse(&data).unwrap();
        for i in 0..doc.page_count() {
            let result = doc.page(i).unwrap().text_layer();
            assert!(result.is_ok(), "text_layer failed for djvu3spec page {}", i);
        }
    }

    // --- Thumbnail tests ---

    #[test]
    fn thumbnail_carte() {
        let data = read_asset("carte.djvu");
        let doc = Document::parse(&data).unwrap();
        let thumb = doc
            .thumbnail(0)
            .unwrap()
            .expect("carte should have a thumbnail");
        // Thumbnail should be much smaller than the page (4200x2556)
        assert!(
            thumb.width > 0 && thumb.width < 500,
            "thumb width: {}",
            thumb.width
        );
        assert!(
            thumb.height > 0 && thumb.height < 500,
            "thumb height: {}",
            thumb.height
        );
        assert_eq!(
            thumb.data.len(),
            thumb.width as usize * thumb.height as usize * 4
        );
    }

    #[test]
    fn thumbnail_djvu3spec_all_pages() {
        let data = read_asset("DjVu3Spec_bundled.djvu");
        let doc = Document::parse(&data).unwrap();
        let mut count = 0;
        for i in 0..doc.page_count() {
            if let Some(thumb) = doc.thumbnail(i).unwrap() {
                assert!(
                    thumb.width > 0 && thumb.height > 0,
                    "page {} thumb empty",
                    i
                );
                assert_eq!(
                    thumb.data.len(),
                    thumb.width as usize * thumb.height as usize * 4,
                    "page {} thumb data mismatch",
                    i
                );
                count += 1;
            }
        }
        assert_eq!(count, 71, "expected 71 thumbnails, got {}", count);
    }

    #[test]
    fn thumbnail_none_for_single_page() {
        let data = read_asset("boy_jb2.djvu");
        let doc = Document::parse(&data).unwrap();
        assert!(doc.thumbnail(0).unwrap().is_none());
    }

    #[test]
    fn thumbnail_none_for_no_thum() {
        let data = read_asset("colorbook.djvu");
        let doc = Document::parse(&data).unwrap();
        assert!(doc.thumbnail(0).unwrap().is_none());
    }

    // ── Progressive decode ─────────────────────────────────────────────

    #[test]
    fn progressive_bg_returns_frames_per_chunk() {
        // carte.djvu is a color page with multiple BG44 chunks.
        let data = read_asset("carte.djvu");
        let doc = Document::parse(&data).unwrap();
        let page = doc.page(0).unwrap();
        let chunk_count = page.bg44_chunk_count();
        assert!(
            chunk_count > 1,
            "need multi-chunk file for progressive test"
        );

        let frames = page.decode_background_progressive().unwrap().unwrap();
        assert_eq!(frames.len(), chunk_count, "one frame per BG44 chunk");

        // Each frame should have the same dimensions.
        let (w, h) = (frames[0].width, frames[0].height);
        for (i, f) in frames.iter().enumerate() {
            assert_eq!((f.width, f.height), (w, h), "frame {i} size mismatch");
        }
    }

    #[test]
    fn progressive_last_frame_matches_full_decode() {
        let data = read_asset("carte.djvu");
        let doc = Document::parse(&data).unwrap();
        let page = doc.page(0).unwrap();

        let full = page.decode_background().unwrap().unwrap();
        let frames = page.decode_background_progressive().unwrap().unwrap();
        let last = frames.last().unwrap();

        assert_eq!(full.width, last.width);
        assert_eq!(full.height, last.height);
        assert_eq!(
            full.data, last.data,
            "last progressive frame must match full decode"
        );
    }

    #[test]
    fn progressive_single_chunk_returns_one_frame() {
        // boy.djvu has a background but likely only 1 BG44 chunk.
        let data = read_asset("boy.djvu");
        let doc = Document::parse(&data).unwrap();
        let page = doc.page(0).unwrap();
        if page.bg44_chunk_count() <= 1 {
            let frames = page.decode_background_progressive().unwrap().unwrap();
            assert_eq!(frames.len(), 1);
        }
    }

    #[test]
    fn coarse_decode_returns_blurry_frame() {
        let data = read_asset("carte.djvu");
        let doc = Document::parse(&data).unwrap();
        let page = doc.page(0).unwrap();
        assert!(page.bg44_chunk_count() > 1);

        let coarse = page.decode_background_coarse().unwrap().unwrap();
        let full = page.decode_background().unwrap().unwrap();

        // Same dimensions, different pixel data (coarse is blurrier).
        assert_eq!(coarse.width, full.width);
        assert_eq!(coarse.height, full.height);
        assert_ne!(coarse.data, full.data, "coarse should differ from full");
    }

    #[test]
    fn coarse_decode_single_chunk_returns_none() {
        let data = read_asset("boy.djvu");
        let doc = Document::parse(&data).unwrap();
        let page = doc.page(0).unwrap();
        if page.bg44_chunk_count() <= 1 {
            assert!(page.decode_background_coarse().unwrap().is_none());
        }
    }

    #[test]
    fn progressive_no_bg_returns_none() {
        // boy_jb2.djvu has no BG44 chunks.
        let data = read_asset("boy_jb2.djvu");
        let doc = Document::parse(&data).unwrap();
        let page = doc.page(0).unwrap();
        assert_eq!(page.bg44_chunk_count(), 0);
        assert!(page.decode_background_progressive().unwrap().is_none());
    }
}
djvu_rs/document.rs

djvu_rs/
document.rs