Skip to main content

pdf_ast/filters/
ccitt.rs

1#![allow(dead_code)]
2
//! CCITT Fax decoding implementation for PDF.
//!
//! Supports Group 3 (1D and 2D) and Group 4 fax compression.
5use crate::filters::ccitt_tables::{FaxTabEnt, FAX_BLACK_TABLE, FAX_MAIN_TABLE, FAX_WHITE_TABLE};
6
// State tags compared against `FaxTabEnt::state` after each code-table lookup.

/// No valid code; `decode_run` reports it as an invalid CCITT code.
const S_NULL: u8 = 0;
/// 2D pass mode (`Mode::Pass`).
const S_PASS: u8 = 1;
/// 2D horizontal mode (`Mode::Horizontal`).
const S_HORIZ: u8 = 2;
/// 2D vertical mode with offset 0 (`Mode::Vertical(0)`).
const S_V0: u8 = 3;
/// 2D vertical mode, offset `+param` (`Mode::Vertical(param)`).
const S_VR: u8 = 4;
/// 2D vertical mode, offset `-param` (`Mode::Vertical(-param)`).
const S_VL: u8 = 5;
/// 2D extension code (`Mode::Extension`), which the row decoders reject.
const S_EXT: u8 = 6;
/// Terminating code seen in run decoding: its `param` is added and the run ends.
const S_TERMW: u8 = 7;
/// Terminating code seen in run decoding: its `param` is added and the run ends.
const S_TERMB: u8 = 8;
/// Make-up code seen in run decoding: its `param` is added and decoding continues.
const S_MAKEUPW: u8 = 9;
/// Make-up code seen in run decoding: its `param` is added and decoding continues.
const S_MAKEUPB: u8 = 10;
/// Make-up code seen in run decoding: its `param` is added and decoding continues.
const S_MAKEUP: u8 = 11;
/// End-of-line code: `Mode::EndOfLine` in 2D mode lookups, an error inside a run.
const S_EOL: u8 = 12;
20
/// CCITT Fax decoder with full Group 3 and Group 4 support
///
/// Holds the decoding options; each `decode_*` method copies the relevant ones into a
/// fresh `Group3Decoder` or `Group4Decoder` and runs it.
pub struct CcittDecoder {
    /// Row width in pixels, passed to the group decoders.
    columns: usize,
    /// Row count passed to the group decoders, which treat 0 as "no fixed limit".
    rows: usize,
    /// K parameter; only `decode_group3_2d` forwards it.
    k: i32,
    /// Forwarded to the Group 3 decoders only.
    end_of_line: bool,
    /// Forwarded to the Group 3 decoders only.
    encoded_byte_align: bool,
    /// Forwarded to the Group 3 and Group 4 decoders.
    end_of_block: bool,
    /// Forwarded to all decoders, which invert their output when this is false.
    black_is_1: bool,
    /// Forwarded to all decoders: rows that may fail before the error is returned.
    damaged_rows_before_error: i32,
}
32
33impl CcittDecoder {
34    /// Creates a new CCITT decoder with specified dimensions
35    pub fn new(columns: usize, rows: usize) -> Self {
36        Self {
37            columns,
38            rows,
39            k: 0,
40            end_of_line: false,
41            encoded_byte_align: false,
42            end_of_block: true,
43            black_is_1: false,
44            damaged_rows_before_error: 0,
45        }
46    }
47
48    /// Sets the K parameter for encoding type
49    pub fn with_k(mut self, k: i32) -> Self {
50        self.k = k;
51        self
52    }
53
54    pub fn with_end_of_line(mut self, eol: bool) -> Self {
55        self.end_of_line = eol;
56        self
57    }
58
59    pub fn with_encoded_byte_align(mut self, align: bool) -> Self {
60        self.encoded_byte_align = align;
61        self
62    }
63
64    pub fn with_end_of_block(mut self, eob: bool) -> Self {
65        self.end_of_block = eob;
66        self
67    }
68
69    pub fn with_black_is_1(mut self, black: bool) -> Self {
70        self.black_is_1 = black;
71        self
72    }
73
74    pub fn with_damaged_rows_before_error(mut self, rows: i32) -> Self {
75        self.damaged_rows_before_error = rows;
76        self
77    }
78
79    /// Decode Group 3 1D (Modified Huffman)
80    pub fn decode_group3_1d(&self, data: &[u8]) -> Result<Vec<u8>, String> {
81        let mut decoder = Group3Decoder::new(self.columns, self.rows, false);
82        decoder.black_is_1 = self.black_is_1;
83        decoder.end_of_line = self.end_of_line;
84        decoder.encoded_byte_align = self.encoded_byte_align;
85        decoder.end_of_block = self.end_of_block;
86        decoder.damaged_rows_before_error = self.damaged_rows_before_error;
87        decoder.decode_1d(data)
88    }
89
90    /// Decode Group 3 2D (Modified READ)
91    pub fn decode_group3_2d(&self, data: &[u8]) -> Result<Vec<u8>, String> {
92        let mut decoder = Group3Decoder::new(self.columns, self.rows, true);
93        decoder.k = self.k;
94        decoder.black_is_1 = self.black_is_1;
95        decoder.end_of_line = self.end_of_line;
96        decoder.encoded_byte_align = self.encoded_byte_align;
97        decoder.end_of_block = self.end_of_block;
98        decoder.damaged_rows_before_error = self.damaged_rows_before_error;
99        decoder.decode_2d(data)
100    }
101
102    /// Decode Group 4 (Modified Modified READ)
103    pub fn decode_group4(&self, data: &[u8]) -> Result<Vec<u8>, String> {
104        let mut decoder = Group4Decoder::new(self.columns, self.rows);
105        decoder.black_is_1 = self.black_is_1;
106        decoder.end_of_block = self.end_of_block;
107        decoder.damaged_rows_before_error = self.damaged_rows_before_error;
108        decoder.decode(data)
109    }
110}
111
112/// Group 3 Fax decoder
113struct Group3Decoder {
114    columns: usize,
115    rows: usize,
116    is_2d: bool,
117    k: i32,
118    black_is_1: bool,
119    end_of_line: bool,
120    encoded_byte_align: bool,
121    end_of_block: bool,
122    damaged_rows_before_error: i32,
123}
124
125impl Group3Decoder {
126    fn new(columns: usize, rows: usize, is_2d: bool) -> Self {
127        Self {
128            columns,
129            rows,
130            is_2d,
131            k: if is_2d { 2 } else { 0 },
132            black_is_1: false,
133            end_of_line: false,
134            encoded_byte_align: false,
135            end_of_block: true,
136            damaged_rows_before_error: 0,
137        }
138    }
139
140    fn decode_1d(&mut self, data: &[u8]) -> Result<Vec<u8>, String> {
141        let mut reader = BitReader::new(data);
142        let bytes_per_row = self.columns.div_ceil(8);
143        let mut output = Vec::new();
144        let rows = if self.rows == 0 {
145            usize::MAX
146        } else {
147            self.rows
148        };
149        let mut decoded_rows = 0;
150
151        while decoded_rows < rows {
152            if self.end_of_line {
153                self.sync_to_eol(&mut reader)?;
154                if self.encoded_byte_align {
155                    reader.align_byte();
156                }
157            }
158
159            let mut row = vec![0u8; bytes_per_row];
160            let row_result = self.decode_row_1d(&mut reader, &mut row);
161            match row_result {
162                Ok(()) => {
163                    output.extend_from_slice(&row);
164                    decoded_rows += 1;
165                }
166                Err(err) => {
167                    if self.damaged_rows_before_error > 0 {
168                        self.damaged_rows_before_error -= 1;
169                        output.extend_from_slice(&row);
170                        decoded_rows += 1;
171                    } else {
172                        return Err(err);
173                    }
174                }
175            }
176
177            if reader.is_at_end() {
178                break;
179            }
180        }
181
182        if !self.black_is_1 {
183            invert_bits(&mut output);
184        }
185
186        Ok(output)
187    }
188
189    fn decode_2d(&mut self, data: &[u8]) -> Result<Vec<u8>, String> {
190        let mut reader = BitReader::new(data);
191        let bytes_per_row = self.columns.div_ceil(8);
192        let mut output = Vec::new();
193        let rows = if self.rows == 0 {
194            usize::MAX
195        } else {
196            self.rows
197        };
198        let mut decoded_rows = 0;
199        let mut reference_row = vec![0u8; bytes_per_row];
200
201        while decoded_rows < rows {
202            if self.end_of_line {
203                self.sync_to_eol(&mut reader)?;
204                if self.encoded_byte_align {
205                    reader.align_byte();
206                }
207            }
208
209            let mut row = vec![0u8; bytes_per_row];
210            let use_1d = if self.k > 0 {
211                decoded_rows % (self.k as usize + 1) == 0
212            } else {
213                false
214            };
215
216            let row_result = if use_1d {
217                self.decode_row_1d(&mut reader, &mut row)
218            } else {
219                self.decode_row_2d(&mut reader, &mut row, &reference_row)
220            };
221
222            match row_result {
223                Ok(()) => {
224                    output.extend_from_slice(&row);
225                    reference_row.copy_from_slice(&row);
226                    decoded_rows += 1;
227                }
228                Err(err) => {
229                    if self.damaged_rows_before_error > 0 {
230                        self.damaged_rows_before_error -= 1;
231                        output.extend_from_slice(&row);
232                        reference_row.copy_from_slice(&row);
233                        decoded_rows += 1;
234                    } else {
235                        return Err(err);
236                    }
237                }
238            }
239
240            if reader.is_at_end() {
241                break;
242            }
243        }
244
245        if !self.black_is_1 {
246            invert_bits(&mut output);
247        }
248
249        Ok(output)
250    }
251
252    fn decode_row_1d(&mut self, reader: &mut BitReader, row: &mut [u8]) -> Result<(), String> {
253        let mut a0 = 0usize;
254        let mut is_white = true;
255
256        while a0 < self.columns {
257            let run = if is_white {
258                decode_white_run(reader)?
259            } else {
260                decode_black_run(reader)?
261            };
262
263            if run == 0 && a0 == 0 && self.end_of_line {
264                break;
265            }
266
267            let run = run.min(self.columns.saturating_sub(a0));
268            if !is_white {
269                set_bits(row, a0, run);
270            }
271            a0 += run;
272            is_white = !is_white;
273        }
274
275        Ok(())
276    }
277
278    fn decode_row_2d(
279        &mut self,
280        reader: &mut BitReader,
281        row: &mut [u8],
282        reference: &[u8],
283    ) -> Result<(), String> {
284        let mut a0 = 0usize;
285        let mut is_white = true;
286        let changes = collect_changes(reference, self.columns);
287
288        while a0 < self.columns {
289            let (b1, b2) = next_b1_b2(&changes, a0);
290            match read_2d_mode(reader)? {
291                Mode::Pass => {
292                    a0 = b2;
293                }
294                Mode::Horizontal => {
295                    let run1 = if is_white {
296                        decode_white_run(reader)?
297                    } else {
298                        decode_black_run(reader)?
299                    };
300                    let run2 = if is_white {
301                        decode_black_run(reader)?
302                    } else {
303                        decode_white_run(reader)?
304                    };
305
306                    let run1 = run1.min(self.columns.saturating_sub(a0));
307                    if !is_white {
308                        set_bits(row, a0, run1);
309                    }
310                    a0 += run1;
311
312                    let run2 = run2.min(self.columns.saturating_sub(a0));
313                    if is_white {
314                        set_bits(row, a0, run2);
315                    }
316                    a0 += run2;
317                }
318                Mode::Vertical(offset) => {
319                    let a1 = clamp_vertical(b1 as isize + offset, self.columns)?;
320                    let run = a1.saturating_sub(a0);
321                    if !is_white {
322                        set_bits(row, a0, run);
323                    }
324                    a0 = a1;
325                    is_white = !is_white;
326                }
327                Mode::Extension => {
328                    return Err("CCITT uncompressed extension not supported".to_string());
329                }
330                Mode::EndOfLine => {
331                    break;
332                }
333            }
334        }
335
336        Ok(())
337    }
338
339    fn sync_to_eol(&mut self, reader: &mut BitReader) -> Result<(), String> {
340        let mut zeros = 0;
341        loop {
342            let bit = match reader.read_bit() {
343                Ok(bit) => bit,
344                Err(_) => return Err("Unexpected end of data while syncing EOL".to_string()),
345            };
346            if bit == 0 {
347                zeros += 1;
348            } else {
349                if zeros >= 11 {
350                    return Ok(());
351                }
352                zeros = 0;
353            }
354        }
355    }
356}
357
358/// Group 4 Fax decoder
359struct Group4Decoder {
360    columns: usize,
361    rows: usize,
362    black_is_1: bool,
363    end_of_block: bool,
364    damaged_rows_before_error: i32,
365}
366
367impl Group4Decoder {
368    fn new(columns: usize, rows: usize) -> Self {
369        Self {
370            columns,
371            rows,
372            black_is_1: false,
373            end_of_block: true,
374            damaged_rows_before_error: 0,
375        }
376    }
377
378    fn decode(&mut self, data: &[u8]) -> Result<Vec<u8>, String> {
379        let mut reader = BitReader::new(data);
380        let bytes_per_row = self.columns.div_ceil(8);
381        let mut output = Vec::new();
382        let rows = if self.rows == 0 {
383            usize::MAX
384        } else {
385            self.rows
386        };
387        let mut decoded_rows = 0;
388        let mut reference_row = vec![0u8; bytes_per_row];
389
390        while decoded_rows < rows {
391            let mut row = vec![0u8; bytes_per_row];
392            let row_result = self.decode_row_mmr(&mut reader, &mut row, &reference_row);
393            match row_result {
394                Ok(()) => {
395                    output.extend_from_slice(&row);
396                    reference_row.copy_from_slice(&row);
397                    decoded_rows += 1;
398                }
399                Err(err) => {
400                    if self.damaged_rows_before_error > 0 {
401                        self.damaged_rows_before_error -= 1;
402                        output.extend_from_slice(&row);
403                        reference_row.copy_from_slice(&row);
404                        decoded_rows += 1;
405                    } else {
406                        return Err(err);
407                    }
408                }
409            }
410
411            if reader.is_at_end() {
412                break;
413            }
414        }
415
416        if self.end_of_block {
417            // End-of-block marker is optional in PDF; ignore if missing
418        }
419
420        if !self.black_is_1 {
421            invert_bits(&mut output);
422        }
423
424        Ok(output)
425    }
426
427    fn decode_row_mmr(
428        &mut self,
429        reader: &mut BitReader,
430        row: &mut [u8],
431        reference: &[u8],
432    ) -> Result<(), String> {
433        let mut a0 = 0usize;
434        let mut is_white = true;
435        let changes = collect_changes(reference, self.columns);
436
437        while a0 < self.columns {
438            let (b1, b2) = next_b1_b2(&changes, a0);
439            match read_2d_mode(reader)? {
440                Mode::Pass => {
441                    a0 = b2;
442                }
443                Mode::Horizontal => {
444                    let run1 = if is_white {
445                        decode_white_run(reader)?
446                    } else {
447                        decode_black_run(reader)?
448                    };
449                    let run2 = if is_white {
450                        decode_black_run(reader)?
451                    } else {
452                        decode_white_run(reader)?
453                    };
454
455                    let run1 = run1.min(self.columns.saturating_sub(a0));
456                    if !is_white {
457                        set_bits(row, a0, run1);
458                    }
459                    a0 += run1;
460
461                    let run2 = run2.min(self.columns.saturating_sub(a0));
462                    if is_white {
463                        set_bits(row, a0, run2);
464                    }
465                    a0 += run2;
466                }
467                Mode::Vertical(offset) => {
468                    let a1 = clamp_vertical(b1 as isize + offset, self.columns)?;
469                    let run = a1.saturating_sub(a0);
470                    if !is_white {
471                        set_bits(row, a0, run);
472                    }
473                    a0 = a1;
474                    is_white = !is_white;
475                }
476                Mode::Extension => {
477                    return Err("CCITT uncompressed extension not supported".to_string());
478                }
479                Mode::EndOfLine => {
480                    break;
481                }
482            }
483        }
484
485        Ok(())
486    }
487}
488
/// Two-dimensional coding mode returned by `read_2d_mode`.
#[derive(Debug)]
enum Mode {
    /// Pass mode: the row decoders move a0 to b2.
    Pass,
    /// Horizontal mode: two explicit run lengths follow.
    Horizontal,
    /// Vertical mode: a1 is b1 plus the signed offset (negative for the `S_VL` state).
    Vertical(isize),
    /// Extension code; the row decoders reject it as unsupported.
    Extension,
    /// End-of-line code read where a mode code was expected.
    EndOfLine,
}
497
498fn read_2d_mode(reader: &mut BitReader) -> Result<Mode, String> {
499    let bits = reader.peek_bits(7)? as usize;
500    let entry = FAX_MAIN_TABLE
501        .get(bits)
502        .ok_or_else(|| "CCITT main table lookup out of range".to_string())?;
503    reader.consume_bits(entry.width);
504    match entry.state {
505        S_PASS => Ok(Mode::Pass),
506        S_HORIZ => Ok(Mode::Horizontal),
507        S_V0 => Ok(Mode::Vertical(0)),
508        S_VR => Ok(Mode::Vertical(entry.param as isize)),
509        S_VL => Ok(Mode::Vertical(-(entry.param as isize))),
510        S_EXT => Ok(Mode::Extension),
511        S_EOL => Ok(Mode::EndOfLine),
512        _ => Err(format!("Invalid CCITT 2D mode state: {}", entry.state)),
513    }
514}
515
516fn decode_white_run(reader: &mut BitReader) -> Result<usize, String> {
517    decode_run(reader, true)
518}
519
520fn decode_black_run(reader: &mut BitReader) -> Result<usize, String> {
521    decode_run(reader, false)
522}
523
524fn decode_run(reader: &mut BitReader, white: bool) -> Result<usize, String> {
525    let mut run = 0usize;
526    loop {
527        let entry = if white {
528            lookup_white(reader)?
529        } else {
530            lookup_black(reader)?
531        };
532
533        match entry.state {
534            S_TERMW | S_TERMB => {
535                run += entry.param as usize;
536                return Ok(run);
537            }
538            S_MAKEUPW | S_MAKEUPB | S_MAKEUP => {
539                run += entry.param as usize;
540            }
541            S_EOL => {
542                return Err("Unexpected EOL in run decoding".to_string());
543            }
544            S_NULL => {
545                return Err("Invalid CCITT code (null state)".to_string());
546            }
547            _ => {
548                return Err(format!("Invalid CCITT run state: {}", entry.state));
549            }
550        }
551    }
552}
553
554fn lookup_white(reader: &mut BitReader) -> Result<FaxTabEnt, String> {
555    let idx = reader.peek_bits(12)? as usize;
556    let entry = *FAX_WHITE_TABLE
557        .get(idx)
558        .ok_or_else(|| "White table lookup out of range".to_string())?;
559    reader.consume_bits(entry.width);
560    Ok(entry)
561}
562
563fn lookup_black(reader: &mut BitReader) -> Result<FaxTabEnt, String> {
564    let idx = reader.peek_bits(13)? as usize;
565    let entry = *FAX_BLACK_TABLE
566        .get(idx)
567        .ok_or_else(|| "Black table lookup out of range".to_string())?;
568    reader.consume_bits(entry.width);
569    Ok(entry)
570}
571
/// Flips every bit of `data` in place.
fn invert_bits(data: &mut [u8]) {
    data.iter_mut().for_each(|byte| *byte ^= 0xFF);
}
577
/// Sets `len` consecutive bits of `row` to 1, starting at bit `start`.
///
/// Bits are numbered from the most significant bit of `row[0]`. The range is clipped to
/// the row, so positions beyond the last byte are ignored and `start + len` cannot
/// overflow however large the arguments are.
fn set_bits(row: &mut [u8], start: usize, len: usize) {
    let total_bits = row.len().saturating_mul(8);
    let end = start.saturating_add(len).min(total_bits);
    // `start..end` is empty when `start` already lies beyond the row.
    for bit in start..end {
        row[bit / 8] |= 0x80 >> (bit % 8);
    }
}
587
/// Lists the columns of `reference` whose pixel differs from the pixel to its left.
///
/// Only the first `columns` pixels are inspected (fewer if `reference` is shorter), the
/// comparison starts at column 1, and `columns` itself is always appended as the final
/// entry.
fn collect_changes(reference: &[u8], columns: usize) -> Vec<usize> {
    let pixel = |col: usize| reference[col / 8] & (0x80 >> (col % 8)) != 0;
    let visible = columns.min(reference.len().saturating_mul(8));

    let mut changes: Vec<usize> = (1..visible)
        .filter(|&col| pixel(col) != pixel(col - 1))
        .collect();
    changes.push(columns);
    changes
}
611
/// Picks b1 and b2 from a change list for the position `a0`.
///
/// b1 is the first entry greater than `a0` (or `a0` itself when there is none) and b2
/// is the entry after b1 (or b1 when there is none). The colour of the changes is not
/// taken into account.
fn next_b1_b2(changes: &[usize], a0: usize) -> (usize, usize) {
    let mut remaining = changes.iter().copied().skip_while(|&pos| pos <= a0);
    let b1 = remaining.next().unwrap_or(a0);
    let b2 = remaining.next().unwrap_or(b1);
    (b1, b2)
}
621
/// Converts a vertical-mode target position into a column index capped at `columns`.
///
/// # Errors
/// Fails when `pos` is negative.
fn clamp_vertical(pos: isize, columns: usize) -> Result<usize, String> {
    match pos {
        negative if negative < 0 => Err("CCITT vertical offset underflow".to_string()),
        in_range => Ok(in_range.min(columns as isize) as usize),
    }
}
628
/// Bit reader for CCITT decoding (LSB-first with per-byte bit reversal)
///
/// Every input byte is bit-reversed while it is loaded, so the first bit of the stream
/// ends up in bit 0 of whatever `peek_bits`/`read_bits` return. Up to 32 bits can be
/// requested at once; the accumulator is 64 bits wide so that such a request still fits
/// when it starts in the middle of a byte.
struct BitReader<'a> {
    /// The encoded input.
    data: &'a [u8],
    /// Index of the next byte to load from `data`.
    byte_pos: usize,
    /// Loaded but not yet consumed bits; the next bit of the stream is bit 0.
    bit_acc: u64,
    /// Number of bits in `bit_acc` that count as available.
    bits_avail: u8,
}

impl<'a> BitReader<'a> {
    /// Largest number of bits a single peek or read may request.
    const MAX_BITS: u8 = 32;

    /// Creates a reader positioned at the first bit of `data`.
    fn new(data: &'a [u8]) -> Self {
        Self {
            data,
            byte_pos: 0,
            bit_acc: 0,
            bits_avail: 0,
        }
    }

    /// Reads and consumes a single bit.
    ///
    /// # Errors
    /// Fails with "End of data" when no bits are left.
    fn read_bit(&mut self) -> Result<u8, String> {
        let bit = self.read_bits(1)?;
        // A one-bit read is 0 or 1, so the narrowing cast is lossless.
        Ok(bit as u8)
    }

    /// Reads and consumes `count` bits (at most 32).
    ///
    /// # Errors
    /// Fails when no bits are left or `count` exceeds 32.
    fn read_bits(&mut self, count: u8) -> Result<u32, String> {
        let bits = self.peek_bits(count)?;
        self.consume_bits(count);
        Ok(bits)
    }

    /// Returns the next `count` bits (at most 32) without consuming them.
    ///
    /// When the input ends inside the requested bits, the missing ones read as 0.
    ///
    /// # Errors
    /// Fails when no bits are left or `count` exceeds 32.
    fn peek_bits(&mut self, count: u8) -> Result<u32, String> {
        self.ensure_bits(count)?;
        // `ensure_bits` has checked `count <= 32`, so the shift cannot overflow and the
        // masked value always fits into 32 bits.
        let mask = (1u64 << count) - 1;
        Ok((self.bit_acc & mask) as u32)
    }

    /// Drops `count` bits from the front of the accumulator.
    fn consume_bits(&mut self, count: u8) {
        // `checked_shr` keeps an oversized count from overflowing the shift.
        self.bit_acc = self.bit_acc.checked_shr(u32::from(count)).unwrap_or(0);
        self.bits_avail = self.bits_avail.saturating_sub(count);
    }

    /// Loads input bytes until at least `count` bits are available.
    ///
    /// If the input runs out while some bits are still buffered, the request is padded
    /// with zero bits; those padding bits stay counted as available afterwards.
    ///
    /// # Errors
    /// Fails when the input is exhausted and nothing is buffered, or when `count`
    /// exceeds 32.
    fn ensure_bits(&mut self, count: u8) -> Result<(), String> {
        if count > Self::MAX_BITS {
            return Err(format!(
                "Cannot read {count} bits at once (maximum is {})",
                Self::MAX_BITS
            ));
        }

        while self.bits_avail < count {
            let Some(&raw) = self.data.get(self.byte_pos) else {
                if self.bits_avail == 0 {
                    return Err("End of data".to_string());
                }
                // Pad the tail of the stream with zeros: the upper accumulator bits
                // are already 0, so only the count has to be raised.
                self.bits_avail = count;
                return Ok(());
            };
            self.byte_pos += 1;
            // `bits_avail < count <= 32`, so the byte lands below bit 40 of the u64.
            self.bit_acc |= u64::from(raw.reverse_bits()) << self.bits_avail;
            self.bits_avail += 8;
        }
        Ok(())
    }

    /// Skips the bits that remain of the byte currently being read.
    fn align_byte(&mut self) {
        // Bytes are loaded whole, so the remainder is what is left of the current byte.
        let rem = self.bits_avail % 8;
        if rem != 0 {
            self.consume_bits(rem);
        }
    }

    /// Returns true when every byte has been loaded and no bits remain available.
    fn is_at_end(&self) -> bool {
        self.byte_pos >= self.data.len() && self.bits_avail == 0
    }
}
701
#[cfg(test)]
mod tests {
    use super::*;

    /// Packs bits (one per slice element, in stream order) into bytes.
    ///
    /// Bits are gathered LSB-first and each byte is then bit-reversed, so the first bit
    /// of `bits` becomes the most significant bit of the first output byte. A final
    /// partial byte is padded with zeros.
    fn pack_bits_lsb(bits: &[u8]) -> Vec<u8> {
        let mut out = Vec::new();
        let mut current = 0u8;
        let mut pos = 0u8;
        for &bit in bits {
            current |= (bit & 1) << pos;
            pos += 1;
            if pos == 8 {
                out.push(current.reverse_bits());
                current = 0;
                pos = 0;
            }
        }
        if pos != 0 {
            out.push(current.reverse_bits());
        }
        out
    }

    /// The reader returns the first stream bit in bit 0 of the value it reads.
    #[test]
    fn test_bit_reader_roundtrip() {
        let data = pack_bits_lsb(&[1, 0, 1, 1, 0, 0, 1, 0]);
        let mut reader = BitReader::new(&data);
        // Stream bits 1,0,1,1 -> bit 0 = 1, bit 1 = 0, bit 2 = 1, bit 3 = 1.
        assert_eq!(reader.read_bits(4).unwrap(), 0b1101);
    }

    /// A single white run of 8 decodes to one all-zero byte when black is 1.
    #[test]
    fn test_group3_basic_decode() {
        let decoder = CcittDecoder::new(8, 1).with_black_is_1(true);
        let white8_bits = [1, 0, 0, 1, 1]; // LSB-first for run length 8
        let data = pack_bits_lsb(&white8_bits);
        let result = decoder.decode_group3_1d(&data).unwrap();
        assert_eq!(result, vec![0x00]);
    }

    /// A single 1 bit (named for vertical mode, offset 0) yields an all-white row here.
    #[test]
    fn test_group4_basic_decode() {
        let decoder = CcittDecoder::new(8, 1).with_black_is_1(true);
        let v0_bits = [1];
        let data = pack_bits_lsb(&v0_bits);
        let result = decoder.decode_group4(&data).unwrap();
        assert_eq!(result, vec![0x00]);
    }
}
749}