hayro_jbig2/
lib.rs

1/*!
2A memory-safe, pure-Rust JBIG2 decoder.
3
4`hayro-jbig2` decodes JBIG2 images as specified in ITU-T T.88 (also known as
5ISO/IEC 14492). JBIG2 is a bi-level image compression standard commonly used
6in PDF documents for compressing scanned text documents.
7
8The crate is `no_std` compatible but requires an allocator to be available.
9
10# Safety
11This crate forbids unsafe code via a crate-level attribute.
12*/
13
14#![cfg_attr(not(feature = "std"), no_std)]
15#![forbid(unsafe_code)]
16#![allow(missing_docs)]
17
18extern crate alloc;
19
20use alloc::vec::Vec;
21
22mod arithmetic_decoder;
23mod bitmap;
24mod dictionary;
25mod error;
26mod file;
27mod gray_scale;
28mod huffman_table;
29mod integer_decoder;
30mod lazy;
31mod page_info;
32mod reader;
33mod region;
34mod segment;
35
36use error::bail;
37pub use error::{
38    DecodeError, FormatError, HuffmanError, ParseError, RegionError, Result, SegmentError,
39    SymbolError, TemplateError,
40};
41
42use crate::file::parse_segments_sequential;
43use bitmap::DecodedRegion;
44use dictionary::pattern::{PatternDictionary, decode_pattern_dictionary};
45use dictionary::symbol::{SymbolDictionary, decode_symbol_dictionary};
46use file::parse_file;
47use huffman_table::{HuffmanTable, StandardHuffmanTables};
48use page_info::{PageInformation, parse_page_information};
49use reader::Reader;
50use region::generic::decode_generic_region;
51use region::generic_refinement::decode_generic_refinement_region;
52use region::halftone::decode_halftone_region;
53use region::text::decode_text_region;
54use segment::SegmentType;
55
/// A decoded JBIG2 image.
///
/// This is the value returned by [`decode`] and [`decode_embedded`]; its
/// fields are taken from the page bitmap after all segments were processed.
#[derive(Debug, Clone)]
pub struct Image {
    /// The width of the image in pixels.
    pub width: u32,
    /// The height of the image in pixels.
    pub height: u32,
    /// The raw pixel data, one bool per pixel, row-major order.
    /// `true` means black, `false` means white.
    // NOTE(review): presumably `data.len() == width * height`; confirm against
    // `DecodedRegion::new`, which is not visible from this file.
    pub data: Vec<bool>,
}
67
68/// Decode a JBIG2 file from the given data.
69///
70/// The file is expected to use the sequential or random-access organization,
71/// as defined in Annex D.1 and D.2.
72pub fn decode(data: &[u8]) -> Result<Image> {
73    let file = parse_file(data)?;
74    decode_with_segments(&file.segments)
75}
76
77/// Decode an embedded JBIG2 image. with the given global segments.
78///
79/// The file is expected to use the embedded organization defined in
80/// Annex D.3.
81pub fn decode_embedded(data: &[u8], globals: Option<&[u8]>) -> Result<Image> {
82    let mut segments = Vec::new();
83    if let Some(globals_data) = globals {
84        let mut reader = Reader::new(globals_data);
85        parse_segments_sequential(&mut reader, &mut segments)?;
86    };
87
88    let mut reader = Reader::new(data);
89    parse_segments_sequential(&mut reader, &mut segments)?;
90
91    segments.sort_by_key(|seg| seg.header.segment_number);
92
93    decode_with_segments(&segments)
94}
95
96fn decode_with_segments(segments: &[segment::Segment<'_>]) -> Result<Image> {
97    // Pre-scan for stripe height from EndOfStripe segments.
98    let height_from_stripes = segments
99        .iter()
100        .filter(|seg| seg.header.segment_type == SegmentType::EndOfStripe)
101        .filter_map(|seg| u32::from_be_bytes(seg.data.try_into().ok()?).checked_add(1))
102        .max();
103
104    // Find and parse page information segment first.
105    let mut ctx = if let Some(page_info) = segments
106        .iter()
107        .find(|s| s.header.segment_type == SegmentType::PageInformation)
108    {
109        let mut reader = Reader::new(page_info.data);
110        get_ctx(&mut reader, height_from_stripes)?
111    } else {
112        bail!(FormatError::MissingPageInfo);
113    };
114
115    // Process all segments.
116    for seg in segments {
117        let mut reader = Reader::new(seg.data);
118
119        match seg.header.segment_type {
120            SegmentType::PageInformation => {
121                // Already processed above, skip.
122            }
123            SegmentType::ImmediateGenericRegion | SegmentType::ImmediateLosslessGenericRegion => {
124                let had_unknown_length = seg.header.data_length.is_none();
125                let region = decode_generic_region(&mut reader, had_unknown_length)?;
126                ctx.page_bitmap.combine(&region);
127            }
128            SegmentType::IntermediateGenericRegion => {
129                // Intermediate segments cannot have unknown length.
130                let region = decode_generic_region(&mut reader, false)?;
131                ctx.store_region(seg.header.segment_number, region);
132            }
133            SegmentType::PatternDictionary => {
134                let dictionary = decode_pattern_dictionary(&mut reader)?;
135                ctx.store_pattern_dictionary(seg.header.segment_number, dictionary);
136            }
137            SegmentType::SymbolDictionary => {
138                // "1) Concatenate all the input symbol dictionaries to form SDINSYMS."
139                // (6.5.5, step 1)
140                // Collect references to avoid cloning; symbols are only cloned if re-exported.
141                let input_symbols: Vec<&DecodedRegion> = seg
142                    .header
143                    .referred_to_segments
144                    .iter()
145                    .filter_map(|&num| ctx.get_symbol_dictionary(num))
146                    .flat_map(|dict| dict.exported_symbols.iter())
147                    .collect();
148
149                // Collect Huffman tables from referred table segments.
150                let referred_tables: Vec<HuffmanTable> = seg
151                    .header
152                    .referred_to_segments
153                    .iter()
154                    .filter_map(|&num| ctx.get_huffman_table(num))
155                    .cloned()
156                    .collect();
157
158                let dictionary = decode_symbol_dictionary(
159                    &mut reader,
160                    &input_symbols,
161                    &referred_tables,
162                    &ctx.standard_tables,
163                )?;
164                ctx.store_symbol_dictionary(seg.header.segment_number, dictionary);
165            }
166            SegmentType::ImmediateTextRegion | SegmentType::ImmediateLosslessTextRegion => {
167                // Collect symbols from referred symbol dictionaries (SBSYMS).
168                let symbols: Vec<&DecodedRegion> = seg
169                    .header
170                    .referred_to_segments
171                    .iter()
172                    .filter_map(|&num| ctx.get_symbol_dictionary(num))
173                    .flat_map(|dict| dict.exported_symbols.iter())
174                    .collect();
175
176                // Collect Huffman tables from referred table segments.
177                // "These user-supplied Huffman decoding tables may be supplied either
178                // as a Tables segment..." (7.4.3.1.6)
179                let referred_tables: Vec<HuffmanTable> = seg
180                    .header
181                    .referred_to_segments
182                    .iter()
183                    .filter_map(|&num| ctx.get_huffman_table(num))
184                    .cloned()
185                    .collect();
186
187                let region = decode_text_region(
188                    &mut reader,
189                    &symbols,
190                    &referred_tables,
191                    &ctx.standard_tables,
192                )?;
193                ctx.page_bitmap.combine(&region);
194            }
195            SegmentType::IntermediateTextRegion => {
196                // Collect symbols from referred symbol dictionaries (SBSYMS).
197                let symbols: Vec<&DecodedRegion> = seg
198                    .header
199                    .referred_to_segments
200                    .iter()
201                    .filter_map(|&num| ctx.get_symbol_dictionary(num))
202                    .flat_map(|dict| dict.exported_symbols.iter())
203                    .collect();
204
205                // Collect Huffman tables from referred table segments.
206                let referred_tables: Vec<HuffmanTable> = seg
207                    .header
208                    .referred_to_segments
209                    .iter()
210                    .filter_map(|&num| ctx.get_huffman_table(num))
211                    .cloned()
212                    .collect();
213
214                let region = decode_text_region(
215                    &mut reader,
216                    &symbols,
217                    &referred_tables,
218                    &ctx.standard_tables,
219                )?;
220                ctx.store_region(seg.header.segment_number, region);
221            }
222            SegmentType::ImmediateHalftoneRegion | SegmentType::ImmediateLosslessHalftoneRegion => {
223                let pattern_dict = seg
224                    .header
225                    .referred_to_segments
226                    .first()
227                    .and_then(|&num| ctx.get_pattern_dictionary(num))
228                    .ok_or(SegmentError::MissingPatternDictionary)?;
229
230                let region = decode_halftone_region(&mut reader, pattern_dict)?;
231                ctx.page_bitmap.combine(&region);
232            }
233            SegmentType::IntermediateHalftoneRegion => {
234                let pattern_dict = seg
235                    .header
236                    .referred_to_segments
237                    .first()
238                    .and_then(|&num| ctx.get_pattern_dictionary(num))
239                    .ok_or(SegmentError::MissingPatternDictionary)?;
240
241                let region = decode_halftone_region(&mut reader, pattern_dict)?;
242                ctx.store_region(seg.header.segment_number, region);
243            }
244            SegmentType::IntermediateGenericRefinementRegion => {
245                // Same logic as immediate refinement, but store result instead of combining.
246                let reference = seg
247                    .header
248                    .referred_to_segments
249                    .first()
250                    .and_then(|&num| ctx.get_referred_segment(num))
251                    .unwrap_or(&ctx.page_bitmap);
252
253                let region = decode_generic_refinement_region(&mut reader, reference)?;
254                ctx.store_region(seg.header.segment_number, region);
255            }
256            SegmentType::ImmediateGenericRefinementRegion
257            | SegmentType::ImmediateLosslessGenericRefinementRegion => {
258                // "3) Determine the buffer associated with the region segment that
259                // this segment refers to." (7.4.7.5)
260                //
261                // "2) If there are no referred-to segments, then use the page
262                // bitmap as the reference buffer." (7.4.7.5)
263                let reference = seg
264                    .header
265                    .referred_to_segments
266                    .first()
267                    .and_then(|&num| ctx.get_referred_segment(num))
268                    .unwrap_or(&ctx.page_bitmap);
269
270                let region = decode_generic_refinement_region(&mut reader, reference)?;
271                ctx.page_bitmap.combine(&region);
272            }
273            SegmentType::Tables => {
274                // "Tables – see 7.4.13." (type 53)
275                // "This segment contains data which defines one or more user-supplied
276                // Huffman coding tables." (7.4.13)
277                let table = HuffmanTable::read_custom(&mut reader)?;
278                ctx.store_huffman_table(seg.header.segment_number, table);
279            }
280            SegmentType::EndOfPage | SegmentType::EndOfFile => {
281                break;
282            }
283            // Other segment types not yet implemented.
284            _ => {}
285        }
286    }
287
288    Ok(Image {
289        width: ctx.page_bitmap.width,
290        height: ctx.page_bitmap.height,
291        data: ctx.page_bitmap.data,
292    })
293}
294
/// Decoding context for a JBIG2 page.
///
/// This holds the page information and the page bitmap that regions are
/// decoded into, plus every intermediate result (regions, dictionaries and
/// Huffman tables) that later segments may refer to by segment number.
///
/// The lookup methods on this type binary-search the `(segment_number, _)`
/// lists by segment number.
pub(crate) struct DecodeContext {
    /// The parsed page information.
    // Not read anywhere in this file after construction, hence the leading
    // underscore.
    pub(crate) _page_info: PageInformation,
    /// The page bitmap that regions are combined into.
    pub(crate) page_bitmap: DecodedRegion,
    /// Decoded intermediate regions, stored as (`segment_number`, region) pairs.
    pub(crate) referred_segments: Vec<(u32, DecodedRegion)>,
    /// Decoded pattern dictionaries, stored as (`segment_number`, dictionary) pairs.
    pub(crate) pattern_dictionaries: Vec<(u32, PatternDictionary)>,
    /// Decoded symbol dictionaries, stored as (`segment_number`, dictionary) pairs.
    pub(crate) symbol_dictionaries: Vec<(u32, SymbolDictionary)>,
    /// Decoded Huffman tables from table segments, stored as (`segment_number`, table) pairs.
    /// "Tables – see 7.4.13." (type 53)
    pub(crate) huffman_tables: Vec<(u32, HuffmanTable)>,
    /// Standard Huffman tables (`TABLE_A` through `TABLE_O`).
    pub(crate) standard_tables: StandardHuffmanTables,
}
316
317impl DecodeContext {
318    /// Store a decoded region for later reference.
319    fn store_region(&mut self, segment_number: u32, region: DecodedRegion) {
320        self.referred_segments.push((segment_number, region));
321    }
322
323    /// Look up a referred segment by number.
324    fn get_referred_segment(&self, segment_number: u32) -> Option<&DecodedRegion> {
325        self.referred_segments
326            .binary_search_by_key(&segment_number, |(num, _)| *num)
327            .ok()
328            .map(|idx| &self.referred_segments[idx].1)
329    }
330
331    /// Store a decoded pattern dictionary for later reference.
332    fn store_pattern_dictionary(&mut self, segment_number: u32, dictionary: PatternDictionary) {
333        self.pattern_dictionaries.push((segment_number, dictionary));
334    }
335
336    /// Look up a pattern dictionary by segment number.
337    fn get_pattern_dictionary(&self, segment_number: u32) -> Option<&PatternDictionary> {
338        self.pattern_dictionaries
339            .binary_search_by_key(&segment_number, |(num, _)| *num)
340            .ok()
341            .map(|idx| &self.pattern_dictionaries[idx].1)
342    }
343
344    /// Store a decoded symbol dictionary for later reference.
345    fn store_symbol_dictionary(&mut self, segment_number: u32, dictionary: SymbolDictionary) {
346        self.symbol_dictionaries.push((segment_number, dictionary));
347    }
348
349    /// Look up a symbol dictionary by segment number.
350    fn get_symbol_dictionary(&self, segment_number: u32) -> Option<&SymbolDictionary> {
351        self.symbol_dictionaries
352            .binary_search_by_key(&segment_number, |(num, _)| *num)
353            .ok()
354            .map(|idx| &self.symbol_dictionaries[idx].1)
355    }
356
357    /// Store a decoded Huffman table for later reference.
358    fn store_huffman_table(&mut self, segment_number: u32, table: HuffmanTable) {
359        self.huffman_tables.push((segment_number, table));
360    }
361
362    /// Look up a Huffman table by segment number.
363    fn get_huffman_table(&self, segment_number: u32) -> Option<&HuffmanTable> {
364        self.huffman_tables
365            .binary_search_by_key(&segment_number, |(num, _)| *num)
366            .ok()
367            .map(|idx| &self.huffman_tables[idx].1)
368    }
369}
370
371/// Create a decode context from page information segment data.
372///
373/// This parses the page information and creates the initial page bitmap
374/// with the default pixel value.
375pub(crate) fn get_ctx(
376    reader: &mut Reader<'_>,
377    height_from_stripes: Option<u32>,
378) -> Result<DecodeContext> {
379    let page_info = parse_page_information(reader)?;
380
381    // "A page's bitmap height may be declared in its page information segment
382    // to be unknown (by specifying a height of 0xFFFFFFFF). In this case, the
383    // page must be striped." (7.4.8.2)
384    let height = if page_info.height == 0xFFFF_FFFF {
385        height_from_stripes.ok_or(FormatError::UnknownPageHeight)?
386    } else {
387        page_info.height
388    };
389
390    // "Bit 2: Page default pixel value. This bit contains the initial value
391    // for every pixel in the page, before any region segments are decoded
392    // or drawn." (7.4.8.5)
393    let mut page_bitmap = DecodedRegion::new(page_info.width, height);
394    if page_info.flags.default_pixel != 0 {
395        // Fill with true (black) if default pixel is 1.
396        for pixel in &mut page_bitmap.data {
397            *pixel = true;
398        }
399    }
400
401    Ok(DecodeContext {
402        _page_info: page_info,
403        page_bitmap,
404        referred_segments: Vec::new(),
405        pattern_dictionaries: Vec::new(),
406        symbol_dictionaries: Vec::new(),
407        huffman_tables: Vec::new(),
408        standard_tables: StandardHuffmanTables::new(),
409    })
410}