Skip to main content

pdf_ast/types/
stream.rs

1use crate::types::{PdfDictionary, PdfName, PdfValue};
2use serde::{Deserialize, Serialize};
3use std::fmt;
4
5#[derive(Debug, Clone, PartialEq)]
6pub struct PdfStream {
7    pub dict: PdfDictionary,
8    pub data: StreamData,
9}
10
11#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
12pub enum StreamData {
13    Raw(Vec<u8>),
14    Decoded(Vec<u8>),
15    Lazy(StreamReference),
16}
17
18impl StreamData {
19    pub fn len(&self) -> usize {
20        match self {
21            StreamData::Raw(data) | StreamData::Decoded(data) => data.len(),
22            StreamData::Lazy(reference) => reference.length,
23        }
24    }
25
26    pub fn is_empty(&self) -> bool {
27        self.len() == 0
28    }
29
30    pub fn hash(&self) -> String {
31        use std::collections::hash_map::DefaultHasher;
32        use std::hash::{Hash, Hasher};
33
34        let mut hasher = DefaultHasher::new();
35        match self {
36            StreamData::Raw(data) | StreamData::Decoded(data) => {
37                data.hash(&mut hasher);
38            }
39            StreamData::Lazy(reference) => {
40                reference.offset.hash(&mut hasher);
41                reference.length.hash(&mut hasher);
42            }
43        }
44        format!("{:x}", hasher.finish())
45    }
46
47    pub fn truncate(&mut self, len: usize) {
48        match self {
49            StreamData::Raw(data) | StreamData::Decoded(data) => {
50                data.truncate(len);
51            }
52            StreamData::Lazy(_) => {
53                // Cannot truncate lazy streams
54            }
55        }
56    }
57
58    pub fn as_bytes(&self) -> Option<&[u8]> {
59        match self {
60            StreamData::Raw(data) | StreamData::Decoded(data) => Some(data),
61            StreamData::Lazy(_) => None,
62        }
63    }
64}
65
66impl std::ops::Index<usize> for StreamData {
67    type Output = u8;
68
69    fn index(&self, index: usize) -> &Self::Output {
70        match self {
71            StreamData::Raw(data) | StreamData::Decoded(data) => &data[index],
72            StreamData::Lazy(_) => panic!("Cannot index into lazy stream data"),
73        }
74    }
75}
76
77#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
78pub struct StreamReference {
79    pub offset: u64,
80    pub length: usize,
81    pub filters: Vec<StreamFilter>,
82}
83
84#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
85pub enum StreamFilter {
86    ASCIIHexDecode,
87    ASCII85Decode,
88    LZWDecode(LZWDecodeParams),
89    FlateDecode(FlateDecodeParams),
90    RunLengthDecode,
91    CCITTFaxDecode(CCITTFaxDecodeParams),
92    JBIG2Decode,
93    DCTDecode,
94    JPXDecode,
95    Crypt(CryptFilter),
96}
97
98#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
99pub struct LZWDecodeParams {
100    pub predictor: Option<i32>,
101    pub colors: Option<i32>,
102    pub bits_per_component: Option<i32>,
103    pub columns: Option<i32>,
104    pub early_change: Option<bool>,
105}
106
107#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
108pub struct FlateDecodeParams {
109    pub predictor: Option<i32>,
110    pub colors: Option<i32>,
111    pub bits_per_component: Option<i32>,
112    pub columns: Option<i32>,
113}
114
115#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
116pub struct CCITTFaxDecodeParams {
117    pub k: Option<i32>,
118    pub end_of_line: Option<bool>,
119    pub encoded_byte_align: Option<bool>,
120    pub columns: Option<i32>,
121    pub rows: Option<i32>,
122    pub end_of_block: Option<bool>,
123    pub black_is_1: Option<bool>,
124    pub damaged_rows_before_error: Option<i32>,
125}
126
127#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
128pub struct CryptFilter {
129    pub name: PdfName,
130}
131
132#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
133pub enum CryptFilterParams {
134    /// Identity filter - no encryption
135    Identity,
136    /// V2 standard security handler (RC4)
137    V2 { name: String },
138    /// AESV2 - AES-128 encryption
139    AESV2 { name: String },
140    /// AESV3 - AES-256 encryption
141    AESV3 { name: String },
142}
143
144impl PdfStream {
145    pub fn new(dict: PdfDictionary, data: Vec<u8>) -> Self {
146        PdfStream {
147            dict,
148            data: StreamData::Raw(data),
149        }
150    }
151
152    pub fn new_lazy(dict: PdfDictionary, reference: StreamReference) -> Self {
153        PdfStream {
154            dict,
155            data: StreamData::Lazy(reference),
156        }
157    }
158
159    pub fn raw_data(&self) -> Option<&[u8]> {
160        match &self.data {
161            StreamData::Raw(data) => Some(data),
162            _ => None,
163        }
164    }
165
166    pub fn decode(&self) -> Result<Vec<u8>, String> {
167        match &self.data {
168            StreamData::Raw(data) | StreamData::Decoded(data) => {
169                let filters = self.get_filters_with_params();
170                if filters.is_empty() {
171                    Ok(data.clone())
172                } else {
173                    crate::filters::decode_stream(data, &filters).map_err(|e| e.to_string())
174                }
175            }
176            StreamData::Lazy(_) => Err("Lazy stream decoding not implemented".to_string()),
177        }
178    }
179
180    pub fn decode_with_limits(
181        &self,
182        max_output_bytes: usize,
183        max_ratio: usize,
184    ) -> Result<Vec<u8>, String> {
185        match &self.data {
186            StreamData::Raw(data) | StreamData::Decoded(data) => {
187                let filters = self.get_filters_with_params();
188                if filters.is_empty() {
189                    Ok(data.clone())
190                } else {
191                    crate::filters::decode_stream_with_limits(
192                        data,
193                        &filters,
194                        max_output_bytes,
195                        max_ratio,
196                    )
197                    .map_err(|e| e.to_string())
198                }
199            }
200            StreamData::Lazy(_) => Err("Lazy stream decoding not implemented".to_string()),
201        }
202    }
203
204    pub fn decoded_data(&self) -> Option<&[u8]> {
205        match &self.data {
206            StreamData::Decoded(data) => Some(data),
207            _ => None,
208        }
209    }
210
211    pub fn is_lazy(&self) -> bool {
212        matches!(self.data, StreamData::Lazy(_))
213    }
214
215    pub fn length(&self) -> Option<usize> {
216        match &self.data {
217            StreamData::Raw(data) | StreamData::Decoded(data) => Some(data.len()),
218            StreamData::Lazy(reference) => Some(reference.length),
219        }
220    }
221
222    pub fn get_filters(&self) -> Vec<StreamFilter> {
223        self.get_filters_with_params()
224    }
225
226    pub fn get_filters_with_params(&self) -> Vec<StreamFilter> {
227        let mut filters = Vec::new();
228
229        let filter_names: Vec<&PdfName> = match self.dict.get("Filter") {
230            Some(PdfValue::Name(name)) => vec![name],
231            Some(PdfValue::Array(array)) => array.iter().filter_map(|v| v.as_name()).collect(),
232            _ => Vec::new(),
233        };
234
235        if filter_names.is_empty() {
236            return filters;
237        }
238
239        let mut decode_params = match self.dict.get("DecodeParms") {
240            Some(PdfValue::Dictionary(dict)) => vec![Some(dict)],
241            Some(PdfValue::Array(array)) => array.iter().map(|v| v.as_dict()).collect(),
242            Some(PdfValue::Null) => vec![None],
243            _ => Vec::new(),
244        };
245
246        if decode_params.len() < filter_names.len() {
247            decode_params.resize(filter_names.len(), None);
248        }
249
250        for (i, name) in filter_names.iter().enumerate() {
251            let params = decode_params.get(i).copied().unwrap_or(None);
252            if let Some(filter) = Self::filter_from_name_with_params(name, params) {
253                filters.push(filter);
254            }
255        }
256
257        filters
258    }
259
260    fn filter_from_name_with_params(
261        name: &PdfName,
262        params: Option<&PdfDictionary>,
263    ) -> Option<StreamFilter> {
264        match name.without_slash() {
265            "ASCIIHexDecode" | "AHx" => Some(StreamFilter::ASCIIHexDecode),
266            "ASCII85Decode" | "A85" => Some(StreamFilter::ASCII85Decode),
267            "LZWDecode" | "LZW" => {
268                let mut parsed = LZWDecodeParams::default();
269                if let Some(params) = params {
270                    parsed = parse_lzw_params(params);
271                }
272                Some(StreamFilter::LZWDecode(parsed))
273            }
274            "FlateDecode" | "Fl" => {
275                let mut parsed = FlateDecodeParams::default();
276                if let Some(params) = params {
277                    parsed = parse_flate_params(params);
278                }
279                Some(StreamFilter::FlateDecode(parsed))
280            }
281            "RunLengthDecode" | "RL" => Some(StreamFilter::RunLengthDecode),
282            "CCITTFaxDecode" | "CCF" => {
283                let mut parsed = CCITTFaxDecodeParams::default();
284                if let Some(params) = params {
285                    parsed = parse_ccitt_params(params);
286                }
287                Some(StreamFilter::CCITTFaxDecode(parsed))
288            }
289            "JBIG2Decode" => Some(StreamFilter::JBIG2Decode),
290            "DCTDecode" | "DCT" => Some(StreamFilter::DCTDecode),
291            "JPXDecode" => Some(StreamFilter::JPXDecode),
292            "Crypt" => {
293                let crypt_name = params
294                    .and_then(|p| p.get("Name"))
295                    .and_then(|v| v.as_name())
296                    .cloned()
297                    .unwrap_or_else(|| PdfName::new("Identity"));
298                Some(StreamFilter::Crypt(CryptFilter { name: crypt_name }))
299            }
300            _ => None,
301        }
302    }
303}
304
305fn parse_flate_params(params: &PdfDictionary) -> FlateDecodeParams {
306    FlateDecodeParams {
307        predictor: params
308            .get("Predictor")
309            .and_then(|v| v.as_integer())
310            .map(|v| v as i32),
311        colors: params
312            .get("Colors")
313            .and_then(|v| v.as_integer())
314            .map(|v| v as i32),
315        bits_per_component: params
316            .get("BitsPerComponent")
317            .and_then(|v| v.as_integer())
318            .map(|v| v as i32),
319        columns: params
320            .get("Columns")
321            .and_then(|v| v.as_integer())
322            .map(|v| v as i32),
323    }
324}
325
326fn parse_lzw_params(params: &PdfDictionary) -> LZWDecodeParams {
327    LZWDecodeParams {
328        predictor: params
329            .get("Predictor")
330            .and_then(|v| v.as_integer())
331            .map(|v| v as i32),
332        colors: params
333            .get("Colors")
334            .and_then(|v| v.as_integer())
335            .map(|v| v as i32),
336        bits_per_component: params
337            .get("BitsPerComponent")
338            .and_then(|v| v.as_integer())
339            .map(|v| v as i32),
340        columns: params
341            .get("Columns")
342            .and_then(|v| v.as_integer())
343            .map(|v| v as i32),
344        early_change: params.get("EarlyChange").and_then(bool_from_value),
345    }
346}
347
348fn parse_ccitt_params(params: &PdfDictionary) -> CCITTFaxDecodeParams {
349    CCITTFaxDecodeParams {
350        k: params
351            .get("K")
352            .and_then(|v| v.as_integer())
353            .map(|v| v as i32),
354        end_of_line: params.get("EndOfLine").and_then(bool_from_value),
355        encoded_byte_align: params.get("EncodedByteAlign").and_then(bool_from_value),
356        columns: params
357            .get("Columns")
358            .and_then(|v| v.as_integer())
359            .map(|v| v as i32),
360        rows: params
361            .get("Rows")
362            .and_then(|v| v.as_integer())
363            .map(|v| v as i32),
364        end_of_block: params.get("EndOfBlock").and_then(bool_from_value),
365        black_is_1: params.get("BlackIs1").and_then(bool_from_value),
366        damaged_rows_before_error: params
367            .get("DamagedRowsBeforeError")
368            .and_then(|v| v.as_integer())
369            .map(|v| v as i32),
370    }
371}
372
373fn bool_from_value(value: &PdfValue) -> Option<bool> {
374    match value {
375        PdfValue::Boolean(b) => Some(*b),
376        PdfValue::Integer(i) => Some(*i != 0),
377        PdfValue::Real(r) => Some(*r != 0.0),
378        _ => None,
379    }
380}
381
382impl StreamFilter {
383    pub fn from_name(name: &PdfName) -> Option<Self> {
384        match name.without_slash() {
385            "ASCIIHexDecode" | "AHx" => Some(StreamFilter::ASCIIHexDecode),
386            "ASCII85Decode" | "A85" => Some(StreamFilter::ASCII85Decode),
387            "LZWDecode" | "LZW" => Some(StreamFilter::LZWDecode(LZWDecodeParams::default())),
388            "FlateDecode" | "Fl" => Some(StreamFilter::FlateDecode(FlateDecodeParams::default())),
389            "RunLengthDecode" | "RL" => Some(StreamFilter::RunLengthDecode),
390            "CCITTFaxDecode" | "CCF" => {
391                Some(StreamFilter::CCITTFaxDecode(CCITTFaxDecodeParams::default()))
392            }
393            "JBIG2Decode" => Some(StreamFilter::JBIG2Decode),
394            "DCTDecode" | "DCT" => Some(StreamFilter::DCTDecode),
395            "JPXDecode" => Some(StreamFilter::JPXDecode),
396            _ => None,
397        }
398    }
399
400    pub fn name(&self) -> &'static str {
401        match self {
402            StreamFilter::ASCIIHexDecode => "ASCIIHexDecode",
403            StreamFilter::ASCII85Decode => "ASCII85Decode",
404            StreamFilter::LZWDecode(_) => "LZWDecode",
405            StreamFilter::FlateDecode(_) => "FlateDecode",
406            StreamFilter::RunLengthDecode => "RunLengthDecode",
407            StreamFilter::CCITTFaxDecode(_) => "CCITTFaxDecode",
408            StreamFilter::JBIG2Decode => "JBIG2Decode",
409            StreamFilter::DCTDecode => "DCTDecode",
410            StreamFilter::JPXDecode => "JPXDecode",
411            StreamFilter::Crypt(_) => "Crypt",
412        }
413    }
414}
415
416impl fmt::Display for PdfStream {
417    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
418        write!(f, "{} stream[", self.dict)?;
419        match &self.data {
420            StreamData::Raw(data) => write!(f, "{} bytes raw", data.len())?,
421            StreamData::Decoded(data) => write!(f, "{} bytes decoded", data.len())?,
422            StreamData::Lazy(reference) => write!(f, "{} bytes lazy", reference.length)?,
423        }
424        write!(f, "]endstream")
425    }
426}