Skip to main content

djvu_rs/
djvm.rs

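//! DJVM document merge, split, and indirect-index operations.
//!
//! Provides [`merge`] to combine multiple DjVu documents into a single
//! bundled DJVM, [`split`] to extract page ranges from a document, and
//! [`create_indirect`] to build an index for pages stored as separate files.
//!
//! [`merge`]: crate::djvm::merge
//! [`split`]: crate::djvm::split
//! [`create_indirect`]: crate::djvm::create_indirect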
8
9#[cfg(not(feature = "std"))]
10use alloc::{format, string::String, vec, vec::Vec};
11
12use crate::djvu_document::DjVuDocument;
13use crate::error::IffError;
14use crate::iff;
15
16/// Error type for merge/split operations.
17#[derive(Debug, thiserror::Error)]
18pub enum DjvmError {
19    /// IFF container parse error.
20    #[error("IFF parse error: {0}")]
21    Iff(#[from] IffError),
22
23    /// Document model error.
24    #[error("document error: {0}")]
25    Doc(#[from] crate::djvu_document::DocError),
26
27    /// No pages to merge.
28    #[error("no pages to merge")]
29    EmptyMerge,
30
31    /// Page range is out of bounds.
32    #[error("page range {start}..{end} is out of bounds (document has {count} pages)")]
33    PageRangeOutOfBounds {
34        start: usize,
35        end: usize,
36        count: usize,
37    },
38}
39
40/// Merge multiple DjVu documents (raw bytes) into a single bundled DJVM.
41///
42/// Each input document contributes all its pages to the output.
43/// Shared dictionaries (DJVI components) are included and INCL
44/// references are preserved within each source document's pages.
45pub fn merge(documents: &[&[u8]]) -> Result<Vec<u8>, DjvmError> {
46    if documents.is_empty() {
47        return Err(DjvmError::EmptyMerge);
48    }
49
50    let mut components: Vec<Vec<u8>> = Vec::new();
51    let mut component_ids: Vec<String> = Vec::new();
52    let mut component_flags: Vec<u8> = Vec::new();
53
54    for (doc_idx, &doc_data) in documents.iter().enumerate() {
55        let form = iff::parse_form(doc_data)?;
56
57        if &form.form_type == b"DJVU" {
58            // Single-page document — the whole file is one page
59            components.push(doc_data.to_vec());
60            component_ids.push(format!("p{:04}.djvu", components.len()));
61            component_flags.push(1); // page
62        } else if &form.form_type == b"DJVM" {
63            // Multi-page bundled document — extract each FORM child
64            for chunk in &form.chunks {
65                if &chunk.id == b"FORM" && chunk.data.len() >= 4 {
66                    let child_form_type = &chunk.data[..4];
67
68                    // Wrap the chunk data back into a full FORM with AT&T header
69                    let mut form_bytes = Vec::with_capacity(4 + 4 + 4 + chunk.data.len());
70                    form_bytes.extend_from_slice(b"AT&T");
71                    form_bytes.extend_from_slice(b"FORM");
72                    let form_len = chunk.data.len() as u32;
73                    form_bytes.extend_from_slice(&form_len.to_be_bytes());
74                    form_bytes.extend_from_slice(chunk.data);
75
76                    components.push(form_bytes);
77                    component_ids.push(format!("d{}p{:04}.djvu", doc_idx, components.len()));
78
79                    let flag = if child_form_type == b"DJVI" { 0 } else { 1 }; // 0 = shared, 1 = page
80                    component_flags.push(flag);
81                }
82            }
83        }
84    }
85
86    if components.is_empty() {
87        return Err(DjvmError::EmptyMerge);
88    }
89
90    build_djvm(&components, &component_ids, &component_flags)
91}
92
93/// Split a document, extracting pages in the given range (0-based, exclusive end).
94///
95/// Returns raw DjVu bytes for a new document containing only the requested pages.
96pub fn split(doc_data: &[u8], start: usize, end: usize) -> Result<Vec<u8>, DjvmError> {
97    let doc = DjVuDocument::parse(doc_data)?;
98    let count = doc.page_count();
99
100    if start >= count || end > count || start >= end {
101        return Err(DjvmError::PageRangeOutOfBounds { start, end, count });
102    }
103
104    let form = iff::parse_form(doc_data)?;
105
106    // Single-page document: just return the whole thing
107    if &form.form_type == b"DJVU" && start == 0 && end == 1 {
108        return Ok(doc_data.to_vec());
109    }
110
111    // For a single page extraction from a multi-page document
112    if end - start == 1 && &form.form_type == b"DJVM" {
113        let mut page_idx = 0;
114        for chunk in &form.chunks {
115            if &chunk.id == b"FORM" && chunk.data.len() >= 4 && &chunk.data[..4] == b"DJVU" {
116                if page_idx == start {
117                    let mut result = Vec::with_capacity(4 + 4 + 4 + chunk.data.len());
118                    result.extend_from_slice(b"AT&T");
119                    result.extend_from_slice(b"FORM");
120                    let len = chunk.data.len() as u32;
121                    result.extend_from_slice(&len.to_be_bytes());
122                    result.extend_from_slice(chunk.data);
123                    return Ok(result);
124                }
125                page_idx += 1;
126            }
127        }
128    }
129
130    // Multiple pages: build a new DJVM bundle with the requested range
131    let mut components: Vec<Vec<u8>> = Vec::new();
132    let mut component_ids: Vec<String> = Vec::new();
133    let mut component_flags: Vec<u8> = Vec::new();
134
135    // First pass: collect shared components (DJVI) that might be needed
136    for chunk in &form.chunks {
137        if &chunk.id == b"FORM" && chunk.data.len() >= 4 && &chunk.data[..4] == b"DJVI" {
138            let mut form_bytes = Vec::with_capacity(4 + 4 + 4 + chunk.data.len());
139            form_bytes.extend_from_slice(b"AT&T");
140            form_bytes.extend_from_slice(b"FORM");
141            let len = chunk.data.len() as u32;
142            form_bytes.extend_from_slice(&len.to_be_bytes());
143            form_bytes.extend_from_slice(chunk.data);
144            components.push(form_bytes);
145            component_ids.push(format!("shared{}.djvi", components.len()));
146            component_flags.push(0); // shared
147        }
148    }
149
150    // Second pass: collect pages in the requested range
151    let mut page_idx = 0;
152    for chunk in &form.chunks {
153        if &chunk.id == b"FORM" && chunk.data.len() >= 4 && &chunk.data[..4] == b"DJVU" {
154            if page_idx >= start && page_idx < end {
155                let mut form_bytes = Vec::with_capacity(4 + 4 + 4 + chunk.data.len());
156                form_bytes.extend_from_slice(b"AT&T");
157                form_bytes.extend_from_slice(b"FORM");
158                let len = chunk.data.len() as u32;
159                form_bytes.extend_from_slice(&len.to_be_bytes());
160                form_bytes.extend_from_slice(chunk.data);
161                components.push(form_bytes);
162                component_ids.push(format!("p{:04}.djvu", page_idx + 1));
163                component_flags.push(1); // page
164            }
165            page_idx += 1;
166        }
167    }
168
169    build_djvm(&components, &component_ids, &component_flags)
170}
171
172/// Build a bundled DJVM file from components.
173fn build_djvm(components: &[Vec<u8>], ids: &[String], flags: &[u8]) -> Result<Vec<u8>, DjvmError> {
174    let n = components.len();
175
176    // Build DIRM chunk
177    let dirm_data = build_dirm(n, flags, ids);
178
179    // Calculate total FORM body size
180    let mut body_size: usize = 4; // "DJVM"
181    body_size += 8 + dirm_data.len(); // DIRM chunk header + data
182    if !dirm_data.len().is_multiple_of(2) {
183        body_size += 1; // IFF padding
184    }
185    for comp in components {
186        // Each component includes AT&T prefix — strip it for embedding
187        let comp_data = if comp.len() >= 4 && &comp[..4] == b"AT&T" {
188            &comp[4..]
189        } else {
190            comp.as_slice()
191        };
192        body_size += comp_data.len();
193        if !comp_data.len().is_multiple_of(2) {
194            body_size += 1; // IFF padding
195        }
196    }
197
198    let mut output = Vec::with_capacity(4 + 4 + 4 + body_size);
199
200    // AT&T magic
201    output.extend_from_slice(b"AT&T");
202    // FORM header
203    output.extend_from_slice(b"FORM");
204    output.extend_from_slice(&(body_size as u32).to_be_bytes());
205    // DJVM type
206    output.extend_from_slice(b"DJVM");
207
208    // DIRM chunk
209    output.extend_from_slice(b"DIRM");
210    output.extend_from_slice(&(dirm_data.len() as u32).to_be_bytes());
211    output.extend_from_slice(&dirm_data);
212    if !dirm_data.len().is_multiple_of(2) {
213        output.push(0); // IFF padding
214    }
215
216    // Component FORM chunks
217    for comp in components {
218        let comp_data = if comp.len() >= 4 && &comp[..4] == b"AT&T" {
219            &comp[4..]
220        } else {
221            comp.as_slice()
222        };
223        output.extend_from_slice(comp_data);
224        if !comp_data.len().is_multiple_of(2) {
225            output.push(0); // IFF padding
226        }
227    }
228
229    Ok(output)
230}
231
232/// Create an indirect (non-bundled) DJVM index file that references pages as
233/// separate files.
234///
235/// The returned bytes are a valid `FORM:DJVM` with a DIRM directory chunk whose
236/// `is_bundled` flag is **not** set.  Each entry in `page_names` becomes one
237/// `Page` component; there are no embedded `FORM:DJVU` sub-forms — the component
238/// data lives in separate files that must be passed to a resolver when parsing.
239///
240/// Shared-dictionary (DJVI) components are not supported by this helper; use
241/// [`merge`] to build a bundled document that includes them.
242///
243/// # Errors
244///
245/// Returns [`DjvmError::EmptyMerge`] if `page_names` is empty.
246pub fn create_indirect(page_names: &[&str]) -> Result<Vec<u8>, DjvmError> {
247    if page_names.is_empty() {
248        return Err(DjvmError::EmptyMerge);
249    }
250
251    let count = page_names.len();
252    let ids: Vec<String> = page_names.iter().map(|s| s.to_string()).collect();
253    // All entries are pages (flag = 1)
254    let flags: Vec<u8> = vec![1u8; count];
255
256    let dirm_data = build_dirm_indirect(count, &flags, &ids);
257
258    let mut body_size: usize = 4; // "DJVM"
259    body_size += 8 + dirm_data.len(); // DIRM chunk header + data
260    if !dirm_data.len().is_multiple_of(2) {
261        body_size += 1;
262    }
263
264    let mut output = Vec::with_capacity(4 + 4 + 4 + body_size);
265    output.extend_from_slice(b"AT&T");
266    output.extend_from_slice(b"FORM");
267    output.extend_from_slice(&(body_size as u32).to_be_bytes());
268    output.extend_from_slice(b"DJVM");
269    output.extend_from_slice(b"DIRM");
270    output.extend_from_slice(&(dirm_data.len() as u32).to_be_bytes());
271    output.extend_from_slice(&dirm_data);
272    if !dirm_data.len().is_multiple_of(2) {
273        output.push(0);
274    }
275
276    Ok(output)
277}
278
279/// Build an indirect (non-bundled) DIRM chunk.
280///
281/// Unlike the bundled variant, there is no per-component offset table.
282fn build_dirm_indirect(count: usize, flags: &[u8], ids: &[String]) -> Vec<u8> {
283    let mut data = Vec::new();
284
285    // Flags byte: 0x00 = indirect (not bundled)
286    data.push(0x00);
287
288    // Component count (16-bit big-endian)
289    data.push((count >> 8) as u8);
290    data.push(count as u8);
291
292    // No offset table for indirect documents.
293
294    let mut meta = Vec::new();
295    for _ in 0..count {
296        meta.extend_from_slice(&[0, 0, 0]); // sizes (unused for indirect)
297    }
298    for &f in flags {
299        meta.push(f);
300    }
301    for id in ids {
302        meta.extend_from_slice(id.as_bytes());
303        meta.push(0);
304    }
305    for id in ids {
306        meta.extend_from_slice(id.as_bytes());
307        meta.push(0);
308    }
309    meta.extend(core::iter::repeat_n(0u8, count)); // empty titles
310
311    let compressed = crate::bzz_encode::bzz_encode(&meta);
312    data.extend_from_slice(&compressed);
313
314    data
315}
316
317/// Build the DIRM chunk data.
318///
319/// Format:
320/// - 1 byte: flags (0x80 = bundled)
321/// - 2 bytes: component count (big-endian)
322/// - 4 bytes x n: component offsets (big-endian, computed from component sizes)
323/// - BZZ-compressed metadata: sizes(3b×N), flags(1b×N), IDs, names, titles
324///
325/// The BZZ-compressed section is built by reusing the existing reference
326/// BZZ stream from the first input document when possible. For fresh
327/// construction, we build the raw metadata and use a minimal BZZ wrapper.
328fn build_dirm(count: usize, flags: &[u8], ids: &[String]) -> Vec<u8> {
329    let mut data = Vec::new();
330
331    // Flags byte: 0x80 = bundled format
332    data.push(0x80);
333
334    // Component count (16-bit big-endian)
335    data.push((count >> 8) as u8);
336    data.push(count as u8);
337
338    // Placeholder for offsets (4 bytes each) — filled in below
339    let _offsets_start = data.len();
340    for _ in 0..count {
341        data.extend_from_slice(&[0, 0, 0, 0]);
342    }
343
344    // Build the raw metadata that would normally be BZZ-compressed.
345    // Layout: sizes(3b × N) + flags(1b × N) + IDs(null-term) + names(null-term) + titles(null-term)
346    let mut meta = Vec::new();
347
348    // Component sizes — 3 bytes each, set to 0 (readers use FORM boundaries)
349    for _ in 0..count {
350        meta.extend_from_slice(&[0, 0, 0]);
351    }
352    // Component flags (1 byte each)
353    for &f in flags {
354        meta.push(f);
355    }
356    // Component IDs (null-terminated)
357    for id in ids {
358        meta.extend_from_slice(id.as_bytes());
359        meta.push(0);
360    }
361    // Names (null-terminated, same as IDs)
362    for id in ids {
363        meta.extend_from_slice(id.as_bytes());
364        meta.push(0);
365    }
366    // Titles (empty, null-terminated)
367    meta.extend(core::iter::repeat_n(0u8, count));
368
369    // Encode the metadata using BZZ. We use a trivial BZZ stream:
370    // the raw metadata is small enough that we can encode it directly
371    // using the BZZ block format with a passthrough identity encoding.
372    let compressed = crate::bzz_encode::bzz_encode(&meta);
373    data.extend_from_slice(&compressed);
374
375    data
376}
377
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute path of a file under `tests/fixtures`.
    fn fixture_path(name: &str) -> std::path::PathBuf {
        std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("tests/fixtures")
            .join(name)
    }

    /// Removes the wrapped directory when dropped, so a failing assertion
    /// does not leave test files behind.
    struct TempDirGuard(std::path::PathBuf);

    impl Drop for TempDirGuard {
        fn drop(&mut self) {
            // Best effort: a leftover directory must not fail the test.
            let _ = std::fs::remove_dir_all(&self.0);
        }
    }

    #[test]
    fn merge_empty_returns_error() {
        let result = merge(&[]);
        assert!(matches!(result, Err(DjvmError::EmptyMerge)));
    }

    #[test]
    fn split_single_page_from_multipage() {
        let path = fixture_path("DjVu3Spec_bundled.djvu");
        if !path.exists() {
            // Skip if fixture not available
            return;
        }
        let data = std::fs::read(&path).expect("read fixture");
        let doc = DjVuDocument::parse(&data).expect("parse");
        let count = doc.page_count();
        assert!(count > 1, "need multipage fixture");

        // Split out page 0
        let page0 = split(&data, 0, 1).expect("split page 0");
        // Verify the result is parseable
        let form = iff::parse_form(&page0).expect("parse split page");
        assert_eq!(&form.form_type, b"DJVU");
    }

    #[test]
    fn merge_two_single_page_files() {
        let path = fixture_path("irish.djvu");
        if !path.exists() {
            return;
        }
        let irish = std::fs::read(&path).expect("read fixture");
        let data = merge(&[&irish, &irish]).expect("merge");
        // Verify the result has the right FORM type
        let form = iff::parse_form(&data).expect("parse merged");
        assert_eq!(&form.form_type, b"DJVM");

        // The directory must be bundled and list both inputs.
        let dirm = form.chunks.iter().find(|c| &c.id == b"DIRM").expect("DIRM");
        assert_eq!(dirm.data[0] & 0x80, 0x80, "merged DIRM must be bundled");
        let nfiles = u16::from_be_bytes([dirm.data[1], dirm.data[2]]) as usize;
        assert_eq!(nfiles, 2);
    }

    #[test]
    fn split_out_of_bounds() {
        let path = fixture_path("irish.djvu");
        if !path.exists() {
            return;
        }
        let data = std::fs::read(&path).expect("read fixture");
        let result = split(&data, 0, 5);
        assert!(matches!(
            result,
            Err(DjvmError::PageRangeOutOfBounds {
                start: 0,
                end: 5,
                ..
            })
        ));
    }

    #[test]
    fn create_indirect_empty_returns_error() {
        let result = create_indirect(&[]);
        assert!(matches!(result, Err(DjvmError::EmptyMerge)));
    }

    #[test]
    fn create_indirect_parses_with_resolver() {
        // Build an indirect DJVM that references "chicken.djvu"
        let indirect_bytes = create_indirect(&["chicken.djvu"]).expect("create_indirect");

        // Verify it parses as FORM:DJVM
        let form = iff::parse_form(&indirect_bytes).expect("parse form");
        assert_eq!(&form.form_type, b"DJVM");

        // Verify DIRM chunk has is_bundled = 0
        let dirm = form.chunks.iter().find(|c| &c.id == b"DIRM").expect("DIRM");
        assert_eq!(
            dirm.data[0] & 0x80,
            0,
            "indirect DIRM must not have bundled bit set"
        );

        // Parse with a resolver that supplies chicken.djvu
        let chicken_path = fixture_path("chicken.djvu");
        if !chicken_path.exists() {
            return;
        }
        let chicken_data = std::fs::read(&chicken_path).expect("read chicken.djvu");
        let doc = DjVuDocument::parse_with_resolver(
            &indirect_bytes,
            Some(
                move |name: &str| -> Result<Vec<u8>, crate::djvu_document::DocError> {
                    if name == "chicken.djvu" {
                        Ok(chicken_data.clone())
                    } else {
                        Err(crate::djvu_document::DocError::IndirectResolve(
                            name.to_string(),
                        ))
                    }
                },
            ),
        )
        .expect("parse indirect with resolver");

        assert_eq!(doc.page_count(), 1);
        let page = doc.page(0).unwrap();
        assert_eq!(page.width(), 181);
        assert_eq!(page.height(), 240);
    }

    #[test]
    fn create_indirect_multipage() {
        // 3-page indirect document
        let indirect_bytes =
            create_indirect(&["page1.djvu", "page2.djvu", "page3.djvu"]).expect("create_indirect");
        let form = iff::parse_form(&indirect_bytes).expect("parse");
        assert_eq!(&form.form_type, b"DJVM");

        // Component count = 3 in DIRM
        let dirm = form.chunks.iter().find(|c| &c.id == b"DIRM").expect("DIRM");
        let nfiles = u16::from_be_bytes([dirm.data[1], dirm.data[2]]) as usize;
        assert_eq!(nfiles, 3);
    }

    #[test]
    fn parse_from_dir_indirect() {
        // Write an indirect DJVM index and chicken.djvu to a temp directory,
        // then open it via parse_from_dir.
        let chicken_path = fixture_path("chicken.djvu");
        if !chicken_path.exists() {
            return;
        }
        // Per-process directory so concurrent test runs do not share files.
        let tmp = std::env::temp_dir().join(format!("djvu_indirect_test_{}", std::process::id()));
        std::fs::create_dir_all(&tmp).unwrap();
        // Declared before the document so it is dropped (and cleans up) last.
        let _cleanup = TempDirGuard(tmp.clone());

        // Copy chicken.djvu as the component
        let component_name = "p0001.djvu";
        std::fs::copy(&chicken_path, tmp.join(component_name)).unwrap();

        // Build indirect index
        let index_bytes = create_indirect(&[component_name]).expect("create_indirect");
        let index_path = tmp.join("index.djvu");
        std::fs::write(&index_path, &index_bytes).unwrap();

        // Open via parse_from_dir
        let index_data = std::fs::read(&index_path).unwrap();
        let doc = DjVuDocument::parse_from_dir(&index_data, &tmp).expect("parse_from_dir");
        assert_eq!(doc.page_count(), 1);
        assert_eq!(doc.page(0).unwrap().width(), 181);
    }
}