pdf_extract/
lib.rs

1extern crate lopdf;
2
3use adobe_cmap_parser::{ByteMapping, CodeRange, CIDRange};
4use encoding_rs::UTF_16BE;
5use lopdf::content::Content;
6pub use lopdf::*;
7use euclid::*;
8use lopdf::encryption::DecryptionError;
9use std::fmt::{Debug, Formatter};
10extern crate encoding_rs;
11extern crate euclid;
12extern crate adobe_cmap_parser;
13extern crate type1_encoding_parser;
14extern crate unicode_normalization;
15use euclid::vec2;
16use unicode_normalization::UnicodeNormalization;
17use std::fmt;
18use std::str;
19use std::fs::File;
20use std::slice::Iter;
21use std::collections::HashMap;
22use std::collections::hash_map::Entry;
23use std::rc::Rc;
24use std::marker::PhantomData;
25use std::result::Result;
26use log::{warn, error, debug};
27mod core_fonts;
28mod glyphnames;
29mod zapfglyphnames;
30mod encodings;
31
/// Marker type used as the unit tag on both sides of [`Transform`].
pub struct Space;
/// `euclid` 2D transform over `f64` whose source and destination units are both [`Space`].
pub type Transform = Transform2D<f64, Space, Space>;
34
/// Error type wrapping the formatting, I/O and PDF-library errors this crate can surface.
#[derive(Debug)]
pub enum OutputError
{
    /// A `std::fmt::Error` raised while formatting.
    FormatError(std::fmt::Error),
    /// A `std::io::Error` raised by an I/O operation.
    IoError(std::io::Error),
    /// An error reported by `lopdf`.
    PdfError(lopdf::Error)
}
42
43impl std::fmt::Display for OutputError
44{
45    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
46        match self {
47            OutputError::FormatError(e) => write!(f, "Formating error: {}", e),
48            OutputError::IoError(e) => write!(f, "IO error: {}", e),
49            OutputError::PdfError(e) => write!(f, "PDF error: {}", e)
50        }
51    }
52}
53
// Marks `OutputError` as a standard error type; all trait methods keep their default bodies.
impl std::error::Error for OutputError{}
55
56impl From<std::fmt::Error> for OutputError {
57    fn from(e: std::fmt::Error) -> Self {
58        OutputError::FormatError(e)
59    }
60}
61
62impl From<std::io::Error> for OutputError {
63    fn from(e: std::io::Error) -> Self {
64        OutputError::IoError(e)
65    }
66}
67
68impl From<lopdf::Error> for OutputError {
69    fn from(e: lopdf::Error) -> Self {
70        OutputError::PdfError(e)
71    }
72}
73
// Debug-logging stand-in. The active arm takes a comma-separated list of expressions and
// binds each one to `_`, so nothing is printed. The commented-out arm below would instead
// forward the tokens to `println!`.
macro_rules! dlog {
    ($($e:expr),*) => { {$(let _ = $e;)*} }
    //($($t:tt)*) => { println!($($t)*) }
}
78
79fn get_info(doc: &Document) -> Option<&Dictionary> {
80    match doc.trailer.get(b"Info") {
81        Ok(&Object::Reference(ref id)) => {
82            match doc.get_object(*id) {
83                Ok(&Object::Dictionary(ref info)) => { return Some(info); }
84                _ => {}
85            }
86        }
87        _ => {}
88    }
89    None
90}
91
92fn get_catalog(doc: &Document) -> &Dictionary {
93    match doc.trailer.get(b"Root").unwrap() {
94        &Object::Reference(ref id) => {
95            match doc.get_object(*id) {
96                Ok(&Object::Dictionary(ref catalog)) => { return catalog; }
97                _ => {}
98            }
99        }
100        _ => {}
101    }
102    panic!();
103}
104
105fn get_pages(doc: &Document) -> &Dictionary {
106    let catalog = get_catalog(doc);
107    match catalog.get(b"Pages").unwrap() {
108        &Object::Reference(ref id) => {
109            match doc.get_object(*id) {
110                Ok(&Object::Dictionary(ref pages)) => { return pages; }
111                other => {dlog!("pages: {:?}", other)}
112            }
113        }
114        other => { dlog!("pages: {:?}", other)}
115    }
116    dlog!("catalog {:?}", catalog);
117    panic!();
118}
119
// Lookup table with 256 entries, indexed by byte value, giving the UTF-16 code unit used
// for that byte when decoding with PDFDocEncoding (see `pdf_to_utf8`).
// NOTE(review): the 0x0000 entries at indices 0x7f, 0x9f and 0xad presumably mark codes
// that PDFDocEncoding leaves undefined -- confirm against the PDF specification.
#[allow(non_upper_case_globals)]
const PDFDocEncoding: &'static [u16] = &[
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
    0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011,
    0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x02d8, 0x02c7, 0x02c6,
    0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, 0x0020, 0x0021, 0x0022, 0x0023,
    0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
    0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035,
    0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e,
    0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059,
    0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062,
    0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b,
    0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074,
    0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d,
    0x007e, 0x0000, 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192,
    0x2044, 0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, 0x0178,
    0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000, 0x20ac, 0x00a1,
    0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa,
    0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
    0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc,
    0x00bd, 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5,
    0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce,
    0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00e0,
    0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9,
    0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x00f0, 0x00f1, 0x00f2,
    0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb,
    0x00fc, 0x00fd, 0x00fe, 0x00ff];
151
152fn pdf_to_utf8(s: &[u8]) -> String {
153    if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
154        return UTF_16BE.decode_without_bom_handling_and_without_replacement(&s[2..]).unwrap().to_string()
155    } else {
156        let r : Vec<u8> = s.iter().map(|x| *x).flat_map(|x| {
157               let k = PDFDocEncoding[x as usize];
158               vec![(k>>8) as u8, k as u8].into_iter()}).collect();
159        return UTF_16BE.decode_without_bom_handling_and_without_replacement(&r).unwrap().to_string()
160    }
161}
162
/// Decodes a PDF byte string to a `String`.
///
/// * If `s` starts with the UTF-16BE byte order mark (`FE FF`), the bytes after the mark
///   are decoded as big-endian UTF-16. A string that is *only* the mark decodes to `""`.
/// * Otherwise each byte is looked up in `encoding` (byte value -> UTF-16 code unit) and
///   the resulting code units are decoded as UTF-16.
///
/// Malformed input does not panic: unpaired surrogates, a dangling final byte in the
/// UTF-16 form, and bytes with no entry in `encoding` each become U+FFFD.
fn to_utf8(encoding: &[u16], s: &[u8]) -> String {
    const REPLACEMENT_UNIT: u16 = 0xfffd;

    if s.starts_with(&[0xfe, 0xff]) {
        let pairs = s[2..].chunks_exact(2);
        // An odd number of payload bytes means the last code unit was cut short.
        let truncated = !pairs.remainder().is_empty();
        let units: Vec<u16> = pairs
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        let mut text = String::from_utf16_lossy(&units);
        if truncated {
            text.push('\u{fffd}');
        }
        text
    } else {
        let units: Vec<u16> = s
            .iter()
            .map(|&byte| encoding.get(usize::from(byte)).cloned().unwrap_or(REPLACEMENT_UNIT))
            .collect();
        String::from_utf16_lossy(&units)
    }
}
173
174
175fn maybe_deref<'a>(doc: &'a Document, o: &'a Object) -> &'a Object {
176    match o {
177        &Object::Reference(r) => doc.get_object(r).expect("missing object reference"),
178        _ => o
179    }
180}
181
182fn maybe_get_obj<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Object> {
183    dict.get(key).map(|o| maybe_deref(doc, o)).ok()
184}
185
// An intermediate trait that can be used to chain conversions that may have failed.
// `obj` is the (possibly missing) value that was looked up and `key` is the dictionary key
// it was looked up under; each implementation decides how a missing value is handled.
trait FromOptObj<'a> {
    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self;
}
190
// Conditionally convert an object to `Self`; returns `None` if the conversion failed.
// `doc` is passed along so implementations can resolve indirect references.
trait FromObj<'a> where Self: std::marker::Sized {
    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self>;
}
195
196impl<'a, T: FromObj<'a>> FromOptObj<'a> for Option<T> {
197    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, _key: &[u8]) -> Self {
198        obj.and_then(|x| T::from_obj(doc,x))
199    }
200}
201
202impl<'a, T: FromObj<'a>> FromOptObj<'a> for T {
203    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self {
204        T::from_obj(doc, obj.expect(&String::from_utf8_lossy(key))).expect("wrong type")
205    }
206}
207
208// we follow the same conventions as pdfium for when to support indirect objects:
209// on arrays, streams and dicts
210impl<'a, T: FromObj<'a>> FromObj<'a> for Vec<T> {
211    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
212        maybe_deref(doc, obj).as_array().map(|x| x.iter()
213            .map(|x| T::from_obj(doc, x).expect("wrong type"))
214            .collect()).ok()
215    }
216}
217
218// XXX: These will panic if we don't have the right number of items
219// we don't want to do that
220impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 4] {
221    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
222        maybe_deref(doc, obj).as_array().map(|x| {
223            let mut all = x.iter()
224                .map(|x| T::from_obj(doc, x).expect("wrong type"));
225            [all.next().unwrap(), all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
226        }).ok()
227    }
228}
229
230impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 3] {
231    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
232        maybe_deref(doc, obj).as_array().map(|x| {
233            let mut all = x.iter()
234                .map(|x| T::from_obj(doc, x).expect("wrong type"));
235            [all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
236        }).ok()
237    }
238}
239
240impl<'a> FromObj<'a> for f64 {
241    fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
242        match obj {
243            &Object::Integer(i) => Some(i as f64),
244            &Object::Real(f) => Some(f.into()),
245            _ => None
246        }
247    }
248}
249
250impl<'a> FromObj<'a> for i64 {
251    fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
252        match obj {
253            &Object::Integer(i) => Some(i),
254            _ => None
255        }
256    }
257}
258
259impl<'a> FromObj<'a> for &'a Dictionary {
260    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
261        maybe_deref(doc, obj).as_dict().ok()
262    }
263}
264
265impl<'a> FromObj<'a> for &'a Stream {
266    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Stream> {
267        maybe_deref(doc, obj).as_stream().ok()
268    }
269}
270
271impl<'a> FromObj<'a> for &'a Object {
272    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
273        Some(maybe_deref(doc, obj))
274    }
275}
276
277fn get<'a, T: FromOptObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> T {
278    T::from_opt_obj(doc, dict.get(key).ok(), key)
279}
280
281fn maybe_get<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
282    maybe_get_obj(doc, dict, key).and_then(|o| T::from_obj(doc, o))
283}
284
285fn get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> String {
286    pdf_to_utf8(dict.get(key).map(|o| maybe_deref(doc, o)).unwrap_or_else(|_| panic!("deref")).as_name().expect("name"))
287}
288
289#[allow(dead_code)]
290fn maybe_get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<String> {
291    maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok()).map(|n| pdf_to_utf8(n))
292}
293
294fn maybe_get_name<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a [u8]> {
295    maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok())
296}
297
298fn maybe_get_array<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Vec<Object>> {
299    maybe_get_obj(doc, dict, key).and_then(|n| n.as_array().ok())
300}
301
/// A simple font together with the lookup tables that `PdfSimpleFont::new` builds for it.
#[derive(Clone)]
struct PdfSimpleFont<'a> {
    /// The font dictionary this value was built from.
    font: &'a Dictionary,
    /// The document used to resolve references found in `font`.
    doc: &'a Document,
    /// Table indexed by character code giving a UTF-16 code unit, when one could be built.
    encoding: Option<Vec<u16>>,
    /// Character code -> text map, when one could be built.
    unicode_map: Option<HashMap<u32, String>>,
    /// Glyph width per character code.
    widths: HashMap<CharCode, f64>, // should probably just use i32 here
    /// Value read from the `MissingWidth` entry by `new`; 0 when absent.
    missing_width: f64,
}
311
/// A Type 3 font together with the lookup tables that `PdfType3Font::new` builds for it.
#[derive(Clone)]
struct PdfType3Font<'a> {
    /// The font dictionary this value was built from.
    font: &'a Dictionary,
    /// The document used to resolve references found in `font`.
    doc: &'a Document,
    /// Table indexed by character code giving a UTF-16 code unit, when one could be built.
    encoding: Option<Vec<u16>>,
    /// Character code -> text map, when one could be built.
    unicode_map: Option<HashMap<CharCode, String>>,
    /// Glyph width per character code.
    widths: HashMap<CharCode, f64>, // should probably just use i32 here
}
320
321
322fn make_font<'a>(doc: &'a Document, font: &'a Dictionary) -> Rc<dyn PdfFont + 'a> {
323    let subtype = get_name_string(doc, font, b"Subtype");
324    dlog!("MakeFont({})", subtype);
325    if subtype == "Type0" {
326        Rc::new(PdfCIDFont::new(doc, font))
327    } else if subtype == "Type3" {
328        Rc::new(PdfType3Font::new(doc, font))
329    } else {
330        Rc::new(PdfSimpleFont::new(doc, font))
331    }
332}
333
/// Returns `true` when `name` is exactly one of the 14 core font names listed below.
/// The comparison is case-sensitive.
fn is_core_font(name: &str) -> bool {
    const CORE_FONTS: [&str; 14] = [
        "Courier-Bold",
        "Courier-BoldOblique",
        "Courier-Oblique",
        "Courier",
        "Helvetica-Bold",
        "Helvetica-BoldOblique",
        "Helvetica-Oblique",
        "Helvetica",
        "Symbol",
        "Times-Bold",
        "Times-BoldItalic",
        "Times-Italic",
        "Times-Roman",
        "ZapfDingbats",
    ];
    CORE_FONTS.iter().any(|core| *core == name)
}
353
354fn encoding_to_unicode_table(name: &[u8]) -> Vec<u16> {
355    let encoding = match &name[..] {
356        b"MacRomanEncoding" => encodings::MAC_ROMAN_ENCODING,
357        b"MacExpertEncoding" => encodings::MAC_EXPERT_ENCODING,
358        b"WinAnsiEncoding" => encodings::WIN_ANSI_ENCODING,
359        _ => panic!("unexpected encoding {:?}", pdf_to_utf8(name))
360    };
361    let encoding_table = encoding.iter()
362        .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
363        .collect();
364    encoding_table
365}
366
/* "Glyphs in the font are selected by single-byte character codes obtained from a string that
    is shown by the text-showing operators. Logically, these codes index into a table of 256
    glyphs; the mapping from codes to glyphs is called the font’s encoding. Each font program
    has a built-in encoding. Under some circumstances, the encoding can be altered by means
    described in Section 5.5.5, “Character Encoding.”"
    -- PDF Reference, Section 5.5, "Simple Fonts"
*/
373impl<'a> PdfSimpleFont<'a> {
374    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfSimpleFont<'a> {
375        let base_name = get_name_string(doc, font, b"BaseFont");
376        let subtype = get_name_string(doc, font, b"Subtype");
377
378        let encoding: Option<&Object> = get(doc, font, b"Encoding");
379        dlog!("base_name {} {} enc:{:?} {:?}", base_name, subtype, encoding, font);
380        let descriptor: Option<&Dictionary> = get(doc, font, b"FontDescriptor");
381        let mut type1_encoding = None;
382        let mut unicode_map = None;
383        if let Some(descriptor) = descriptor {
384            dlog!("descriptor {:?}", descriptor);
385            if subtype == "Type1" {
386                let file = maybe_get_obj(doc, descriptor, b"FontFile");
387                match file {
388                    Some(&Object::Stream(ref s)) => {
389                        let s = get_contents(s);
390                        //dlog!("font contents {:?}", pdf_to_utf8(&s));
391                        type1_encoding = Some(type1_encoding_parser::get_encoding_map(&s).expect("encoding"));
392                    }
393                    _ => { dlog!("font file {:?}", file) }
394                }
395            } else if subtype == "TrueType" {
396                let file = maybe_get_obj(doc, descriptor, b"FontFile2");
397                match file {
398                    Some(&Object::Stream(ref s)) => {
399                        let _s = get_contents(s);
400                        //File::create(format!("/tmp/{}", base_name)).unwrap().write_all(&s);
401                    }
402                    _ => { dlog!("font file {:?}", file) }
403                }
404            }
405
406            let font_file3 = get::<Option<&Object>>(doc, descriptor, b"FontFile3");
407            match font_file3 {
408                Some(&Object::Stream(ref s)) => {
409                    let subtype = get_name_string(doc, &s.dict, b"Subtype");
410                    dlog!("font file {}, {:?}", subtype, s);
411                    let s = get_contents(s);
412                    if subtype == "Type1C" {
413                        let table = cff_parser::Table::parse(&s).unwrap();
414                        let charset = table.charset.get_table();
415                        let encoding = table.encoding.get_table();
416                        let mut mapping = HashMap::new();
417                        for i in 0..encoding.len().min(charset.len()) {
418                            let cid = encoding[i];
419                            let sid = charset[i];
420                            let name = cff_parser::string_by_id(&table, sid).unwrap();
421                            let unicode = glyphnames::name_to_unicode(&name).or_else(|| {
422                                zapfglyphnames::zapfdigbats_names_to_unicode(name)
423                            });
424                            if let Some(unicode) = unicode {
425                                let str = String::from_utf16(&[unicode]).unwrap();
426                                mapping.insert(cid as u32, str);
427                            }
428                        }
429                        unicode_map = Some(mapping);
430                        //
431                        //File::create(format!("/tmp/{}", base_name)).unwrap().write_all(&s);
432                    }
433
434                    //
435                    //File::create(format!("/tmp/{}", base_name)).unwrap().write_all(&s);
436                }
437                None => {}
438                _ => { dlog!("unexpected") }
439            }
440
441            let charset = maybe_get_obj(doc, descriptor, b"CharSet");
442            let _charset = match charset {
443                Some(&Object::String(ref s, _)) => { Some(pdf_to_utf8(&s)) }
444                _ => { None }
445            };
446            //dlog!("charset {:?}", charset);
447        }
448
449        let mut unicode_map = match unicode_map {
450            Some(mut unicode_map) => {
451                unicode_map.extend(get_unicode_map(doc, font).unwrap_or(HashMap::new()));
452                Some(unicode_map)
453            }
454            None => {
455                get_unicode_map(doc, font)
456            }
457        };
458
459
460        let mut encoding_table = None;
461        match encoding {
462            Some(&Object::Name(ref encoding_name)) => {
463                dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
464                encoding_table = Some(encoding_to_unicode_table(encoding_name));
465            }
466            Some(&Object::Dictionary(ref encoding)) => {
467                //dlog!("Encoding {:?}", encoding);
468                let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
469                    dlog!("BaseEncoding {:?}", base_encoding);
470                    encoding_to_unicode_table(base_encoding)
471                } else {
472                    Vec::from(PDFDocEncoding)
473                };
474                let differences = maybe_get_array(doc, encoding, b"Differences");
475                if let Some(differences) = differences {
476                    dlog!("Differences");
477                    let mut code = 0;
478                    for o in differences {
479                        let o = maybe_deref(doc, o);
480                        match o {
481                            &Object::Integer(i) => { code = i; },
482                            &Object::Name(ref n) => {
483                                let name = pdf_to_utf8(&n);
484                                // XXX: names of Type1 fonts can map to arbitrary strings instead of real
485                                // unicode names, so we should probably handle this differently
486                                let unicode = glyphnames::name_to_unicode(&name);
487                                if let Some(unicode) = unicode{
488                                    table[code as usize] = unicode;
489                                    if let Some(ref mut unicode_map) = unicode_map {
490                                        let be = [unicode];
491                                        match unicode_map.entry(code as u32) {
492                                            // If there's a unicode table entry missing use one based on the name
493                                            Entry::Vacant(v) => { v.insert(String::from_utf16(&be).unwrap()); }
494                                            Entry::Occupied(e) => {
495                                                if e.get() != &String::from_utf16(&be).unwrap() {
496                                                    let normal_match  = e.get().nfkc().eq(String::from_utf16(&be).unwrap().nfkc());
497                                                    if !normal_match {
498                                                        warn!("Unicode mismatch {} {} {:?} {:?} {:?}", normal_match, name, e.get(), String::from_utf16(&be), be);
499                                                    }
500                                                }
501                                            }
502                                        }
503                                    }
504                                } else {
505                                    match unicode_map {
506                                        Some(ref mut unicode_map) if base_name.contains("FontAwesome") => {
507                                            // the fontawesome tex package will use glyph names that don't have a corresponding unicode
508                                            // code point, so we'll use an empty string instead. See issue #76
509                                            match unicode_map.entry(code as u32) {
510                                                Entry::Vacant(v) => { v.insert("".to_owned()); }
511                                                Entry::Occupied(e) => {
512                                                    panic!("unexpected entry in unicode map")
513                                                }
514                                            }
515                                        }
516                                        _ => {
517                                            warn!("unknown glyph name '{}' for font {}", name, base_name);
518                                        }
519                                    }
520                                }
521                                dlog!("{} = {} ({:?})", code, name, unicode);
522                                if let Some(ref mut unicode_map) = unicode_map {
523                                    // The unicode map might not have the code in it, but the code might
524                                    // not be used so we don't want to panic here.
525                                    // An example of this is the 'suppress' character in the TeX Latin Modern font.
526                                    // This shows up in https://arxiv.org/pdf/2405.01295v1.pdf
527                                    dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
528                                }
529                                code += 1;
530                            }
531                            _ => { panic!("wrong type {:?}", o); }
532                        }
533                    }
534                }
535                // "Type" is optional
536                let name = encoding.get(b"Type").and_then(|x| x.as_name()).and_then(|x| Ok(pdf_to_utf8(x)));
537                dlog!("name: {}", name);
538
539                encoding_table = Some(table);
540            }
541            None => {
542                if let Some(type1_encoding) = type1_encoding {
543                    let mut table = Vec::from(PDFDocEncoding);
544                    dlog!("type1encoding");
545                    for (code, name) in type1_encoding {
546                        let unicode = glyphnames::name_to_unicode(&pdf_to_utf8(&name));
547                        if let Some(unicode) = unicode {
548                            table[code as usize] = unicode;
549                        } else {
550                            dlog!("unknown character {}", pdf_to_utf8(&name));
551                        }
552                    }
553                    encoding_table = Some(table)
554                } else if subtype == "TrueType" {
555                    encoding_table = Some(encodings::WIN_ANSI_ENCODING.iter()
556                        .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
557                        .collect());
558                }
559            }
560            _ => { panic!() }
561        }
562
563        let mut width_map = HashMap::new();
564        /* "Ordinarily, a font dictionary that refers to one of the standard fonts
565            should omit the FirstChar, LastChar, Widths, and FontDescriptor entries.
566            However, it is permissible to override a standard font by including these
567            entries and embedding the font program in the PDF file."
568
569            Note: some PDFs include a descriptor but still don't include these entries */
570
571        // If we have widths prefer them over the core font widths. Needed for https://dkp.de/wp-content/uploads/parteitage/Sozialismusvorstellungen-der-DKP.pdf
572        if let (Some(first_char), Some(last_char), Some(widths)) = (maybe_get::<i64>(doc, font, b"FirstChar"), maybe_get::<i64>(doc, font, b"LastChar"), maybe_get::<Vec<f64>>(doc, font, b"Widths")) {
573            // Some PDF's don't have these like fips-197.pdf
574            let mut i: i64 = 0;
575            dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
576
577            for w in widths {
578                width_map.insert((first_char + i) as CharCode, w);
579                i += 1;
580            }
581            assert_eq!(first_char + i - 1, last_char);
582        } else {
583            let name = if is_core_font(&base_name) {
584                &base_name
585            } else {
586                warn!("no widths and not core font {:?}", base_name);
587
588                // This situation is handled differently by different readers
589                // but basically we try to substitute the best font that we can.
590
591                // Poppler/Xpdf:
592                // this is technically an error -- the Widths entry is required
593                // for all but the Base-14 fonts -- but certain PDF generators
594                // apparently don't include widths for Arial and TimesNewRoman
595
596                // Pdfium: CFX_FontMapper::FindSubstFont
597
598                // mupdf: pdf_load_substitute_font
599
600                // We can try to do a better job guessing at a font by looking at the flags
601                // or the basename but for now we'll just use Helvetica
602                "Helvetica"
603            };
604            for font_metrics in core_fonts::metrics().iter() {
605                if font_metrics.0 == base_name {
606                    if let Some(ref encoding) = encoding_table {
607                        dlog!("has encoding");
608                        for w in font_metrics.2 {
609                            let c = glyphnames::name_to_unicode(w.2).unwrap();
610                            for i in 0..encoding.len() {
611                                if encoding[i] == c {
612                                    width_map.insert(i as CharCode, w.1 as f64);
613                                }
614                            }
615                        }
616                    } else {
617                        // Instead of using the encoding from the core font we'll just look up all
618                        // of the character names. We should probably verify that this produces the
619                        // same result.
620
621                        let mut table = vec![0; 256];
622                        for w in font_metrics.2 {
623                            dlog!("{} {}", w.0, w.2);
624                            // -1 is "not encoded"
625                            if w.0 != -1 {
626                                table[w.0 as usize] = if base_name == "ZapfDingbats" {
627                                    zapfglyphnames::zapfdigbats_names_to_unicode(w.2).unwrap_or_else(|| panic!("bad name {:?}", w))
628                                } else {
629                                    glyphnames::name_to_unicode(w.2).unwrap()
630                                }
631                            }
632                        }
633
634                        let encoding = &table[..];
635                        for w in font_metrics.2 {
636                            width_map.insert(w.0 as CharCode, w.1 as f64);
637                            // -1 is "not encoded"
638                        }
639                        encoding_table = Some(encoding.to_vec());
640                    }
641                    /* "Ordinarily, a font dictionary that refers to one of the standard fonts
642                        should omit the FirstChar, LastChar, Widths, and FontDescriptor entries.
643                        However, it is permissible to override a standard font by including these
644                        entries and embedding the font program in the PDF file."
645
646                        Note: some PDFs include a descriptor but still don't include these entries */
647                    // assert!(maybe_get_obj(doc, font, b"FirstChar").is_none());
648                    // assert!(maybe_get_obj(doc, font, b"LastChar").is_none());
649                    // assert!(maybe_get_obj(doc, font, b"Widths").is_none());
650                }
651            }
652        }
653
654        let missing_width = get::<Option<f64>>(doc, font, b"MissingWidth").unwrap_or(0.);
655        PdfSimpleFont {doc, font, widths: width_map, encoding: encoding_table, missing_width, unicode_map}
656    }
657
658    #[allow(dead_code)]
659    fn get_type(&self) -> String {
660        get_name_string(self.doc, self.font, b"Type")
661    }
662    #[allow(dead_code)]
663    fn get_basefont(&self) -> String {
664        get_name_string(self.doc, self.font, b"BaseFont")
665    }
666    #[allow(dead_code)]
667    fn get_subtype(&self) -> String {
668        get_name_string(self.doc, self.font, b"Subtype")
669    }
670    #[allow(dead_code)]
671    fn get_widths(&self) -> Option<&Vec<Object>> {
672        maybe_get_obj(self.doc, self.font, b"Widths").map(|widths| widths.as_array().expect("Widths should be an array"))
673    }
674    /* For type1: This entry is obsolescent and its use is no longer recommended. (See
675     * implementation note 42 in Appendix H.) */
676    #[allow(dead_code)]
677    fn get_name(&self) -> Option<String> {
678        maybe_get_name_string(self.doc, self.font, b"Name")
679    }
680
681    #[allow(dead_code)]
682    fn get_descriptor(&self) -> Option<PdfFontDescriptor> {
683        maybe_get_obj(self.doc, self.font, b"FontDescriptor").and_then(|desc| desc.as_dict().ok()).map(|desc| PdfFontDescriptor{desc: desc, doc: self.doc})
684    }
685}
686
687
688
689impl<'a> PdfType3Font<'a> {
690    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfType3Font<'a> {
691
692        let unicode_map = get_unicode_map(doc, font);
693        let encoding: Option<&Object> = get(doc, font, b"Encoding");
694
695        let encoding_table;
696        match encoding {
697            Some(&Object::Name(ref encoding_name)) => {
698                dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
699                encoding_table = Some(encoding_to_unicode_table(encoding_name));
700            }
701            Some(&Object::Dictionary(ref encoding)) => {
702                //dlog!("Encoding {:?}", encoding);
703                let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
704                    dlog!("BaseEncoding {:?}", base_encoding);
705                    encoding_to_unicode_table(base_encoding)
706                } else {
707                    Vec::from(PDFDocEncoding)
708                };
709                let differences = maybe_get_array(doc, encoding, b"Differences");
710                if let Some(differences) = differences {
711                    dlog!("Differences");
712                    let mut code = 0;
713                    for o in differences {
714                        match o {
715                            &Object::Integer(i) => { code = i; },
716                            &Object::Name(ref n) => {
717                                let name = pdf_to_utf8(&n);
718                                // XXX: names of Type1 fonts can map to arbitrary strings instead of real
719                                // unicode names, so we should probably handle this differently
720                                let unicode = glyphnames::name_to_unicode(&name);
721                                if let Some(unicode) = unicode{
722                                    table[code as usize] = unicode;
723                                }
724                                dlog!("{} = {} ({:?})", code, name, unicode);
725                                if let Some(ref unicode_map) = unicode_map {
726                                    dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
727                                }
728                                code += 1;
729                            }
730                            _ => { panic!("wrong type"); }
731                        }
732                    }
733                }
734                let name_encoded = encoding.get(b"Type");
735                if let Ok(Object::Name(name)) = name_encoded {
736                    dlog!("name: {}", pdf_to_utf8(name));
737                } else {
738                    dlog!("name not found");
739                }
740
741                encoding_table = Some(table);
742            }
743            _ => { panic!() }
744        }
745
746        let first_char: i64 = get(doc, font, b"FirstChar");
747        let last_char: i64 = get(doc, font, b"LastChar");
748        let widths: Vec<f64> = get(doc, font, b"Widths");
749
750        let mut width_map = HashMap::new();
751
752        let mut i = 0;
753        dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
754
755        for w in widths {
756            width_map.insert((first_char + i) as CharCode, w);
757            i += 1;
758        }
759        assert_eq!(first_char + i - 1, last_char);
760        PdfType3Font {doc, font, widths: width_map, encoding: encoding_table, unicode_map}
761    }
762}
763
/// Character code as produced by `PdfFont::next_char`; also the key type of
/// the per-font width tables.
type CharCode = u32;
765
/// Iterator over the character codes of a byte string, using a font to decide
/// how many bytes make up each code.
struct PdfFontIter<'a>
{
    /// Remaining raw bytes of the string being walked.
    i: Iter<'a, u8>,
    /// Font whose `next_char` splits the bytes into codes.
    font: &'a dyn PdfFont,
}
771
/// Yields `(code, length)` pairs by handing the byte iterator to the font's
/// `next_char` until it returns `None`.
impl<'a> Iterator for PdfFontIter<'a> {
    type Item = (CharCode, u8);
    fn next(&mut self) -> Option<(CharCode, u8)> {
        self.font.next_char(&mut self.i)
    }
}
778
/// Operations every font kind in this file provides for text extraction.
trait PdfFont : Debug {
    /// Width recorded for character code `id`. `show_text` divides this value
    /// by 1000 before using it.
    fn get_width(&self, id: CharCode) -> f64;
    /// Pulls the next character code off `iter`. The second tuple element is
    /// the number of bytes the code occupied (`show_text` uses it to recognise
    /// a single-byte space). Returns `None` when no further code can be read.
    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)>;
    /// Text produced by a single character code.
    fn decode_char(&self, char: CharCode) -> String;

        /*fn char_codes<'a>(&'a self, chars: &'a [u8]) -> PdfFontIter {
            let p = self;
            PdfFontIter{i: chars.iter(), font: p as &PdfFont}
        }*/

}
790
791impl<'a> dyn PdfFont + 'a {
792    fn char_codes(&'a self, chars: &'a [u8]) -> PdfFontIter {
793        PdfFontIter{i: chars.iter(), font: self}
794    }
795    fn decode(&self, chars: &[u8]) -> String {
796        let strings = self.char_codes(chars).map(|x| self.decode_char(x.0)).collect::<Vec<_>>();
797        strings.join("")
798    }
799
800}
801
802
803impl<'a> PdfFont for PdfSimpleFont<'a> {
804    fn get_width(&self, id: CharCode) -> f64 {
805        let width = self.widths.get(&id);
806        if let Some(width) = width {
807            return *width;
808        } else {
809            let mut widths = self.widths.iter().collect::<Vec<_>>();
810            widths.sort_by_key(|x| x.0);
811            dlog!("missing width for {} len(widths) = {}, {:?} falling back to missing_width {:?}", id, self.widths.len(), widths, self.font);
812            return self.missing_width;
813        }
814    }
815    /*fn decode(&self, chars: &[u8]) -> String {
816        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
817        to_utf8(encoding, chars)
818    }*/
819
820    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
821        iter.next().map(|x| (*x as CharCode, 1))
822    }
823    fn decode_char(&self, char: CharCode) -> String {
824        let slice = [char as u8];
825        if let Some(ref unicode_map) = self.unicode_map {
826            let s = unicode_map.get(&char);
827            let s = match s {
828                None => {
829                    debug!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
830                    // some pdf's like http://arxiv.org/pdf/2312.00064v1 are missing entries in their unicode map but do have
831                    // entries in the encoding.
832                    let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
833                    let s = to_utf8(encoding, &slice);
834                    debug!("falling back to encoding {} -> {:?}", char, s);
835                    s
836                }
837                Some(s) => { s.clone() }
838            };
839            return s
840        }
841        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
842        //dlog!("char_code {:?} {:?}", char, self.encoding);
843        let s = to_utf8(encoding, &slice);
844        s
845    }
846}
847
848
849
/// Debug output is that of the underlying font dictionary.
impl<'a> fmt::Debug for PdfSimpleFont<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.font.fmt(f)
    }
}
855
856impl<'a> PdfFont for PdfType3Font<'a> {
857    fn get_width(&self, id: CharCode) -> f64 {
858        let width = self.widths.get(&id);
859        if let Some(width) = width {
860            return *width;
861        } else {
862            panic!("missing width for {} {:?}", id, self.font);
863        }
864    }
865    /*fn decode(&self, chars: &[u8]) -> String {
866        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
867        to_utf8(encoding, chars)
868    }*/
869
870    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
871        iter.next().map(|x| (*x as CharCode, 1))
872    }
873    fn decode_char(&self, char: CharCode) -> String {
874        let slice = [char as u8];
875        if let Some(ref unicode_map) = self.unicode_map {
876            let s = unicode_map.get(&char);
877            let s = match s {
878                None => {
879                    debug!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
880                    // some pdf's like http://arxiv.org/pdf/2312.00577v1 are missing entries in their unicode map but do have
881                    // entries in the encoding.
882                    let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
883                    let s = to_utf8(encoding, &slice);
884                    debug!("falling back to encoding {} -> {:?}", char, s);
885                    s
886                }
887                Some(s) => { s.clone() }
888            };
889            return s
890        }
891        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
892        //dlog!("char_code {:?} {:?}", char, self.encoding);
893        let s = to_utf8(encoding, &slice);
894        s
895    }
896}
897
898
899
/// Debug output is that of the underlying font dictionary.
impl<'a> fmt::Debug for PdfType3Font<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.font.fmt(f)
    }
}
905
/// State for a composite (Type 0 / CID-keyed) font, built by `PdfCIDFont::new`.
struct PdfCIDFont<'a> {
    /// The font dictionary this wrapper was built from.
    font: &'a Dictionary,
    #[allow(dead_code)]
    doc: &'a Document,
    /// Codespace ranges and code -> CID ranges taken from the font's `Encoding`.
    #[allow(dead_code)]
    encoding: ByteMapping,
    /// Optional mapping from character code to text, from `ToUnicode`.
    to_unicode: Option<HashMap<u32, String>>,
    widths: HashMap<CharCode, f64>, // should probably just use i32 here
    default_width: Option<f64>, // only used for CID fonts and we should probably break out the different font types
}
916
917fn get_unicode_map<'a>(doc: &'a Document, font: &'a Dictionary) -> Option<HashMap<u32, String>> {
918    let to_unicode = maybe_get_obj(doc, font, b"ToUnicode");
919    dlog!("ToUnicode: {:?}", to_unicode);
920    let mut unicode_map = None;
921    match to_unicode {
922        Some(&Object::Stream(ref stream)) => {
923            let contents = get_contents(stream);
924            dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
925
926            let cmap = adobe_cmap_parser::get_unicode_map(&contents).unwrap();
927            let mut unicode = HashMap::new();
928            // "It must use the beginbfchar, endbfchar, beginbfrange, and endbfrange operators to
929            // define the mapping from character codes to Unicode character sequences expressed in
930            // UTF-16BE encoding."
931            for (&k, v) in cmap.iter() {
932                let mut be: Vec<u16> = Vec::new();
933                let mut i = 0;
934                assert!(v.len() % 2 == 0);
935                while i < v.len() {
936                    be.push(((v[i] as u16) << 8) | v[i+1] as u16);
937                    i += 2;
938                }
939                match &be[..] {
940                    [0xd800 ..= 0xdfff] => {
941                        // this range is not specified as not being encoded
942                        // we ignore them so we don't an error from from_utt16
943                        continue;
944                    }
945                    _ => {}
946                }
947                let s = String::from_utf16(&be).unwrap();
948
949                unicode.insert(k, s);
950            }
951            unicode_map = Some(unicode);
952
953            dlog!("map: {:?}", unicode_map);
954        }
955        None => { }
956        Some(&Object::Name(ref name)) => {
957            let name = pdf_to_utf8(name);
958            if name != "Identity-H" {
959                todo!("unsupported ToUnicode name: {:?}", name);
960            }
961        }
962        _ => { panic!("unsupported cmap {:?}", to_unicode)}
963    }
964    unicode_map
965}
966
967
968impl<'a> PdfCIDFont<'a> {
969    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfCIDFont<'a> {
970        let base_name = get_name_string(doc, font, b"BaseFont");
971        let descendants = maybe_get_array(doc, font, b"DescendantFonts").expect("Descendant fonts required");
972        let ciddict = maybe_deref(doc, &descendants[0]).as_dict().expect("should be CID dict");
973        let encoding = maybe_get_obj(doc, font, b"Encoding").expect("Encoding required in type0 fonts");
974        dlog!("base_name {} {:?}", base_name, font);
975
976        let encoding = match encoding {
977            &Object::Name(ref name) => {
978                let name = pdf_to_utf8(name);
979                dlog!("encoding {:?}", name);
980                if name == "Identity-H" || name == "Identity-V" {
981                    ByteMapping { codespace: vec![CodeRange{width: 2, start: 0, end: 0xffff }], cid: vec![CIDRange{ src_code_lo: 0, src_code_hi: 0xffff, dst_CID_lo: 0 }]}
982                } else {
983                    panic!("unsupported encoding {}", name);
984                }
985            }
986            &Object::Stream(ref stream) => {
987                let contents = get_contents(stream);
988                dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
989                adobe_cmap_parser::get_byte_mapping(&contents).unwrap()
990            }
991            _ => { panic!("unsupported encoding {:?}", encoding)}
992        };
993
994        // Sometimes a Type0 font might refer to the same underlying data as regular font. In this case we may be able to extract some encoding
995        // data.
996        // We should also look inside the truetype data to see if there's a cmap table. It will help us convert as well.
997        // This won't work if the cmap has been subsetted. A better approach might be to hash glyph contents and use that against
998        // a global library of glyph hashes
999        let unicode_map = get_unicode_map(doc, font);
1000
1001        dlog!("descendents {:?} {:?}", descendants, ciddict);
1002
1003        let font_dict = maybe_get_obj(doc, ciddict, b"FontDescriptor").expect("required");
1004        dlog!("{:?}", font_dict);
1005        let _f = font_dict.as_dict().expect("must be dict");
1006        let default_width = get::<Option<i64>>(doc, ciddict, b"DW").unwrap_or(1000);
1007        let w: Option<Vec<&Object>> = get(doc, ciddict, b"W");
1008        dlog!("widths {:?}", w);
1009        let mut widths = HashMap::new();
1010        let mut i = 0;
1011        if let Some(w) = w {
1012            while i < w.len() {
1013                if let &Object::Array(ref wa) = w[i+1] {
1014                    let cid = w[i].as_i64().expect("id should be num");
1015                    let mut j = 0;
1016                    dlog!("wa: {:?} -> {:?}", cid, wa);
1017                    for w in wa {
1018                        widths.insert((cid + j) as CharCode, as_num(w) );
1019                        j += 1;
1020                    }
1021                    i += 2;
1022                } else {
1023                    let c_first = w[i].as_i64().expect("first should be num");
1024                    let c_last = w[i].as_i64().expect("last should be num");
1025                    let c_width = as_num(&w[i]);
1026                    for id in c_first..c_last {
1027                        widths.insert(id as CharCode, c_width);
1028                    }
1029                    i += 3;
1030                }
1031            }
1032        }
1033        PdfCIDFont{doc, font, widths, to_unicode: unicode_map, encoding, default_width: Some(default_width as f64) }
1034    }
1035}
1036
1037impl<'a> PdfFont for PdfCIDFont<'a> {
1038    fn get_width(&self, id: CharCode) -> f64 {
1039        let width = self.widths.get(&id);
1040        if let Some(width) = width {
1041            dlog!("GetWidth {} -> {}", id, *width);
1042            return *width;
1043        } else {
1044            dlog!("missing width for {} falling back to default_width", id);
1045            return self.default_width.unwrap();
1046        }
1047    }/*
1048    fn decode(&self, chars: &[u8]) -> String {
1049        self.char_codes(chars);
1050
1051        //let utf16 = Vec::new();
1052
1053        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
1054        to_utf8(encoding, chars)
1055    }*/
1056
1057    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
1058        let mut c = *iter.next()? as u32;
1059        let mut code = None;
1060        'outer: for width in 1..=4 {
1061            for range in &self.encoding.codespace {
1062                if c as u32 >= range.start && c as u32 <= range.end && range.width == width {
1063                    code = Some((c as u32, width));
1064                    break 'outer;
1065                }
1066            }
1067            let next = *iter.next()?;
1068            c = ((c as u32) << 8) | next as u32;
1069        }
1070        let code = code?;
1071        for range in &self.encoding.cid {
1072            if code.0 >= range.src_code_lo && code.0 <= range.src_code_hi {
1073                return Some((code.0 + range.dst_CID_lo, code.1 as u8));
1074            }
1075        }
1076        None
1077    }
1078    fn decode_char(&self, char: CharCode) -> String {
1079        let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
1080        if let Some(s) = s {
1081            s.clone()
1082        } else {
1083            dlog!("Unknown character {:?} in {:?} {:?}", char, self.font, self.to_unicode);
1084            "".to_string()
1085        }
1086    }
1087}
1088
/// Debug output is that of the underlying font dictionary.
impl<'a> fmt::Debug for PdfCIDFont<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.font.fmt(f)
    }
}
1094
1095
1096
/// Borrowed view of a font descriptor dictionary together with the document
/// it belongs to.
#[derive(Copy, Clone)]
struct PdfFontDescriptor<'a> {
    /// The descriptor dictionary itself.
    desc: &'a Dictionary,
    /// Document handed to the lookup helpers when reading entries of `desc`.
    doc: &'a Document
}
1102
impl<'a> PdfFontDescriptor<'a> {
    /// Looks up the descriptor's `FontFile` entry via `maybe_get_obj`.
    #[allow(dead_code)]
    fn get_file(&self) -> Option<&'a Object> {
        maybe_get_obj(self.doc, self.desc, b"FontFile")
    }
}
1109
/// Debug output is that of the descriptor dictionary.
impl<'a> fmt::Debug for PdfFontDescriptor<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.desc.fmt(f)
    }
}
1115
/// Data of a sampled (FunctionType 0) function as collected by `Function::new`.
#[derive(Clone, Debug)]
struct Type0Func {
    /// The function dictionary's `Domain` array.
    domain: Vec<f64>,
    /// The function dictionary's `Range` array.
    range: Vec<f64>,
    /// Stream contents as returned by `get_contents`.
    contents: Vec<u8>,
    /// The function dictionary's `Size` array.
    size: Vec<i64>,
    /// The function dictionary's `BitsPerSample` value.
    bits_per_sample: i64,
    /// `Encode` array; `Function::new` defaults it to `[0, size - 1]` per `Size` entry.
    encode: Vec<f64>,
    /// `Decode` array; `Function::new` defaults it to a copy of `range`.
    decode: Vec<f64>,
}
1126
/// Linearly maps `x` from the interval `[x_min, x_max]` onto `[y_min, y_max]`.
///
/// Computes `y_min + (x - x_min) * (y_max - y_min) / (x_max - x_min)`.
/// When the input interval is degenerate (`x_max == x_min`) the interpolation
/// is discarded and `y_min` is returned, matching pdfium.
#[allow(dead_code)]
fn interpolate(x: f64, x_min: f64, x_max: f64, y_min: f64, y_max: f64) -> f64 {
    let divisor = x_max - x_min;
    if divisor != 0. {
        y_min + (x - x_min) * ((y_max - y_min) / divisor)
    } else {
        // A zero-width input interval gives no slope to interpolate along;
        // arbitrarily choose y_min to match pdfium.
        y_min
    }
}
1138
impl Type0Func {
    /// Placeholder for evaluating the sampled function: it only derives the
    /// input and output counts and never writes to `_output`.
    #[allow(dead_code)]
    fn eval(&self, _input: &[f64], _output: &mut [f64]) {
        // `domain` and `range` are halved to get the dimension counts,
        // presumably because they hold a (min, max) pair per dimension. TODO confirm.
        let _n_inputs = self.domain.len() / 2;
        let _n_ouputs = self.range.len() / 2;

    }
}
1147
/// Data of an exponential interpolation (FunctionType 2) function as
/// collected by `Function::new`.
#[derive(Clone, Debug)]
struct Type2Func {
    /// Optional `C0` array from the function dictionary.
    c0: Option<Vec<f64>>,
    /// Optional `C1` array from the function dictionary.
    c1: Option<Vec<f64>>,
    /// The `N` entry from the function dictionary.
    n: f64,
}
1154
/// A PDF function object, tagged by its `FunctionType`.
#[derive(Clone, Debug)]
enum Function {
    /// Sampled function (FunctionType 0).
    Type0(Type0Func),
    /// Exponential interpolation function (FunctionType 2).
    Type2(Type2Func),
    /// Stitching function (FunctionType 3); no data is retained.
    #[allow(dead_code)]
    Type3,
    /// PostScript calculator function (FunctionType 4), kept as its raw stream contents.
    #[allow(dead_code)]
    Type4(Vec<u8>)
}
1164
1165impl Function {
1166    fn new(doc: &Document, obj: &Object) -> Function {
1167        let dict = match obj {
1168            &Object::Dictionary(ref dict) => dict,
1169            &Object::Stream(ref stream) => &stream.dict,
1170            _ => panic!()
1171        };
1172        let function_type: i64 = get(doc, dict, b"FunctionType");
1173        let f = match function_type {
1174            0 => {
1175                // Sampled function
1176                let stream = match obj {
1177                    &Object::Stream(ref stream) => stream,
1178                    _ => panic!()
1179                };
1180                let range: Vec<f64> = get(doc, dict, b"Range");
1181                let domain: Vec<f64> = get(doc, dict, b"Domain");
1182                let contents = get_contents(stream);
1183                let size: Vec<i64> = get(doc, dict, b"Size");
1184                let bits_per_sample = get(doc, dict, b"BitsPerSample");
1185                // We ignore 'Order' like pdfium, poppler and pdf.js
1186
1187                let encode = get::<Option<Vec<f64>>>(doc, dict, b"Encode");
1188                // maybe there's some better way to write this.
1189                let encode = encode.unwrap_or_else(|| {
1190                    let mut default = Vec::new();
1191                    for i in &size {
1192                        default.extend([0., (i - 1) as f64].iter());
1193                    }
1194                    default
1195                });
1196                let decode = get::<Option<Vec<f64>>>(doc, dict, b"Decode").unwrap_or_else(|| range.clone());
1197
1198                Function::Type0(Type0Func { domain, range, size, contents, bits_per_sample, encode, decode })
1199            }
1200            2 => {
1201                // Exponential interpolation function
1202                let c0 = get::<Option<Vec<f64>>>(doc, dict, b"C0");
1203                let c1 = get::<Option<Vec<f64>>>(doc, dict, b"C1");
1204                let n = get::<f64>(doc, dict, b"N");
1205                Function::Type2(Type2Func { c0, c1, n})
1206            }
1207            3 => {
1208                // Stitching function
1209                Function::Type3
1210            }
1211            4 => {
1212                // PostScript calculator function
1213                let contents = match obj {
1214                    &Object::Stream(ref stream) => {
1215                        let contents = get_contents(stream);
1216                        warn!("unhandled type-4 function");
1217                        warn!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
1218                        contents
1219                    }
1220                    _ => { panic!("type 4 functions should be streams") }
1221                };
1222                Function::Type4(contents)
1223            }
1224            _ => { panic!("unhandled function type {}", function_type) }
1225        };
1226        f
1227    }
1228}
1229
1230fn as_num(o: &Object) -> f64 {
1231    match o {
1232        &Object::Integer(i) => { i as f64 }
1233        &Object::Real(f) => { f.into() }
1234        _ => { panic!("not a number") }
1235    }
1236}
1237
/// Text-related parameters of the graphics state, as consumed by `show_text`.
#[derive(Clone)]
struct TextState<'a>
{
    /// Currently selected font, if any; `show_text` panics when it is `None`.
    font: Option<Rc<dyn PdfFont + 'a>>,
    /// Multiplied with each glyph width when advancing the text matrix.
    font_size: f64,
    /// Extra advance added after every character.
    character_spacing: f64,
    /// Extra advance added after a single-byte character code 32.
    word_spacing: f64,
    /// Horizontal scale factor applied to the advance and to the text space matrix.
    horizontal_scaling: f64,
    leading: f64,
    /// Vertical offset placed in the text space matrix.
    rise: f64,
    /// Text matrix; `show_text` translates it after every character.
    tm: Transform,
}
1250
1251// XXX: We'd ideally implement this without having to copy the uncompressed data
1252fn get_contents(contents: &Stream) -> Vec<u8> {
1253    if contents.filters().is_ok() {
1254        contents.decompressed_content().unwrap_or_else(|_|contents.content.clone())
1255    } else {
1256        contents.content.clone()
1257    }
1258}
1259
/// Graphics state tracked while interpreting a content stream.
#[derive(Clone)]
struct GraphicsState<'a>
{
    /// Current transformation matrix; combined with the text matrix in `show_text`.
    ctm: Transform,
    /// Text state (font, spacing, text matrix, ...).
    ts: TextState<'a>,
    /// Soft mask dictionary set by `apply_state`; `None` after an `SMask` of `/None`.
    smask: Option<Dictionary>,
    fill_colorspace: ColorSpace,
    fill_color: Vec<f64>,
    stroke_colorspace: ColorSpace,
    stroke_color: Vec<f64>,
    line_width: f64,
}
1272
1273fn show_text(gs: &mut GraphicsState, s: &[u8],
1274             _tlm: &Transform,
1275             _flip_ctm: &Transform,
1276             output: &mut dyn OutputDev) -> Result<(), OutputError> {
1277    let ts = &mut gs.ts;
1278    let font = ts.font.as_ref().unwrap();
1279    //let encoding = font.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
1280    dlog!("{:?}", font.decode(s));
1281    dlog!("{:?}", font.decode(s).as_bytes());
1282    dlog!("{:?}", s);
1283    output.begin_word()?;
1284
1285    for (c, length) in font.char_codes(s) {
1286        // 5.3.3 Text Space Details
1287        let tsm = Transform2D::row_major(ts.horizontal_scaling,
1288                                                 0.,
1289                                                 0.,
1290                                                 1.0,
1291                                                 0.,
1292                                                 ts.rise);
1293        // Trm = Tsm × Tm × CTM
1294        let trm = tsm.post_transform(&ts.tm.post_transform(&gs.ctm));
1295        //dlog!("ctm: {:?} tm {:?}", gs.ctm, tm);
1296        //dlog!("current pos: {:?}", position);
1297        // 5.9 Extraction of Text Content
1298
1299
1300        //dlog!("w: {}", font.widths[&(*c as i64)]);
1301        let w0 = font.get_width(c) / 1000.;
1302
1303        let mut spacing = ts.character_spacing;
1304        // "Word spacing is applied to every occurrence of the single-byte character code 32 in a
1305        //  string when using a simple font or a composite font that defines code 32 as a
1306        //  single-byte code. It does not apply to occurrences of the byte value 32 in
1307        //  multiple-byte codes."
1308        let is_space = c == 32 && length == 1;
1309        if is_space { spacing += ts.word_spacing }
1310
1311        output.output_character(&trm, w0, spacing, ts.font_size, &font.decode_char(c))?;
1312        let tj = 0.;
1313        let ty = 0.;
1314        let tx = ts.horizontal_scaling * ((w0 - tj/1000.)* ts.font_size + spacing);
1315        dlog!("horizontal {} adjust {} {} {} {}", ts.horizontal_scaling, tx, w0, ts.font_size, spacing);
1316        // dlog!("w0: {}, tx: {}", w0, tx);
1317        ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1318        let _trm = ts.tm.pre_transform(&gs.ctm);
1319        //dlog!("post pos: {:?}", trm);
1320
1321    }
1322    output.end_word()?;
1323    Ok(())
1324}
1325
/// A page rectangle given by four coordinates.
// NOTE(review): the field names suggest lower-left (llx, lly) and upper-right
// (urx, ury) corners; confirm against the code that fills this in.
#[derive(Debug, Clone, Copy)]
pub struct MediaBox {
    pub llx: f64,
    pub lly: f64,
    pub urx: f64,
    pub ury: f64
}
1333
1334fn apply_state(doc: &Document, gs: &mut GraphicsState, state: &Dictionary) {
1335    for (k, v) in state.iter() {
1336        let k : &[u8] = k.as_ref();
1337        match k {
1338            b"SMask" => { match maybe_deref(doc, v)  {
1339                &Object::Name(ref name) => {
1340                    if name == b"None" {
1341                        gs.smask = None;
1342                    } else {
1343                        panic!("unexpected smask name")
1344                    }
1345                }
1346                &Object::Dictionary(ref dict) => {
1347                    gs.smask = Some(dict.clone());
1348                }
1349                _ => { panic!("unexpected smask type {:?}", v) }
1350            }}
1351            b"Type" => { match v {
1352                &Object::Name(ref name) => {
1353                    assert_eq!(name, b"ExtGState")
1354                }
1355                _ => { panic!("unexpected type") }
1356            }}
1357            _ => {  dlog!("unapplied state: {:?} {:?}", k, v); }
1358        }
1359    }
1360
1361}
1362
/// One path construction operation.
#[derive(Debug)]
pub enum PathOp {
    /// Carries the point that becomes the current point.
    MoveTo(f64, f64),
    /// Carries the end point of the line.
    LineTo(f64, f64),
    // XXX: is it worth distinguishing the different kinds of curve ops?
    /// The last two values are the curve's end point (see `Path::current_point`).
    CurveTo(f64, f64, f64, f64, f64, f64),
    // NOTE(review): presumably x, y, width, height; confirm against the code that builds it.
    Rect(f64, f64, f64, f64),
    Close,
}
1372
/// A path as the ordered list of operations that built it.
#[derive(Debug)]
pub struct Path {
    /// Operations in the order they were appended.
    pub ops: Vec<PathOp>
}
1377
1378impl Path {
1379    fn new() -> Path {
1380        Path { ops: Vec::new() }
1381    }
1382    fn current_point(&self) -> (f64, f64) {
1383        match self.ops.last().unwrap() {
1384            &PathOp::MoveTo(x, y) => { (x, y) }
1385            &PathOp::LineTo(x, y) => { (x, y) }
1386            &PathOp::CurveTo(_, _, _, _, x, y) => { (x, y) }
1387            _ => { panic!() }
1388        }
1389    }
1390}
1391
/// Parameters of a CalGray colour space dictionary.
#[derive(Clone, Debug)]
pub struct CalGray {
    /// The dictionary's `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point entry.
    black_point: Option<[f64; 3]>,
    /// Optional `Gamma` entry.
    gamma: Option<f64>,
}
1398
/// Parameters of a CalRGB colour space dictionary.
#[derive(Clone, Debug)]
pub struct CalRGB {
    /// The dictionary's `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point entry.
    black_point: Option<[f64; 3]>,
    /// Optional `Gamma` entry, one value per component.
    gamma: Option<[f64; 3]>,
    /// Optional `Matrix` entry.
    matrix: Option<Vec<f64>>
}
1406
/// Parameters of a Lab colour space dictionary.
#[derive(Clone, Debug)]
pub struct Lab {
    /// The dictionary's `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point entry.
    black_point: Option<[f64; 3]>,
    /// Optional `Range` entry.
    range: Option<[f64; 4]>,
}
1413
/// Colour spaces accepted as the alternate space of a `Separation`.
#[derive(Clone, Debug)]
pub enum AlternateColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    /// Contents of the ICC profile stream, as returned by `get_contents`.
    ICCBased(Vec<u8>)
}
1424
/// A Separation colour space: a named colourant with an alternate space and
/// the function that converts tints into it.
#[derive(Clone)]
pub struct Separation {
    /// Colourant name (second element of the colour space array).
    name: String,
    /// Alternate colour space (third element of the colour space array).
    alternate_space: AlternateColorSpace,
    /// Tint transform function (fourth element of the colour space array).
    tint_transform: Box<Function>,
}
1431
/// Colour spaces that `make_colorspace` can produce.
#[derive(Clone)]
pub enum ColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    DeviceN,
    Pattern,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    Separation(Separation),
    /// Contents of the ICC profile stream, as returned by `get_contents`.
    ICCBased(Vec<u8>)
}
1445
1446fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary) -> ColorSpace {
1447    match name {
1448        b"DeviceGray" => ColorSpace::DeviceGray,
1449        b"DeviceRGB" => ColorSpace::DeviceRGB,
1450        b"DeviceCMYK" => ColorSpace::DeviceCMYK,
1451        b"Pattern" => ColorSpace::Pattern,
1452        _ => {
1453            let colorspaces: &Dictionary = get(&doc, resources, b"ColorSpace");
1454            let cs: &Object = maybe_get_obj(doc, colorspaces, &name[..]).unwrap_or_else(|| panic!("missing colorspace {:?}", &name[..]));
1455            if let Ok(cs) = cs.as_array() {
1456                let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1457                match cs_name.as_ref() {
1458                    "Separation" => {
1459                        let name = pdf_to_utf8(cs[1].as_name().expect("second arg must be a name"));
1460                        let alternate_space = match &maybe_deref(doc, &cs[2]) {
1461                            Object::Name(name) => {
1462                                match &name[..] {
1463                                    b"DeviceGray" => AlternateColorSpace::DeviceGray,
1464                                    b"DeviceRGB" => AlternateColorSpace::DeviceRGB,
1465                                    b"DeviceCMYK" => AlternateColorSpace::DeviceCMYK,
1466                                    _ => panic!("unexpected color space name")
1467                                }
1468                            }
1469                            Object::Array(cs) => {
1470                                let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1471                                match cs_name.as_ref() {
1472                                    "ICCBased" => {
1473                                        let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1474                                        dlog!("ICCBased {:?}", stream);
1475                                        // XXX: we're going to be continually decompressing everytime this object is referenced
1476                                        AlternateColorSpace::ICCBased(get_contents(stream))
1477                                    }
1478                                    "CalGray" => {
1479                                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1480                                        AlternateColorSpace::CalGray(CalGray {
1481                                            white_point: get(&doc, dict, b"WhitePoint"),
1482                                            black_point: get(&doc, dict, b"BackPoint"),
1483                                            gamma: get(&doc, dict, b"Gamma"),
1484                                        })
1485                                    }
1486                                    "CalRGB" => {
1487                                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1488                                        AlternateColorSpace::CalRGB(CalRGB {
1489                                            white_point: get(&doc, dict, b"WhitePoint"),
1490                                            black_point: get(&doc, dict, b"BackPoint"),
1491                                            gamma: get(&doc, dict, b"Gamma"),
1492                                            matrix: get(&doc, dict, b"Matrix"),
1493                                        })
1494                                    }
1495                                    "Lab" => {
1496                                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1497                                        AlternateColorSpace::Lab(Lab {
1498                                            white_point: get(&doc, dict, b"WhitePoint"),
1499                                            black_point: get(&doc, dict, b"BackPoint"),
1500                                            range: get(&doc, dict, b"Range"),
1501                                        })
1502                                    }
1503                                    _ => panic!("Unexpected color space name")
1504                                }
1505                            }
1506                            _ => panic!("Alternate space should be name or array {:?}", cs[2])
1507                        };
1508                        let tint_transform = Box::new(Function::new(doc, maybe_deref(doc, &cs[3])));
1509
1510                        dlog!("{:?} {:?} {:?}", name, alternate_space, tint_transform);
1511                        ColorSpace::Separation(Separation{ name, alternate_space, tint_transform})
1512                    }
1513                    "ICCBased" => {
1514                        let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1515                        dlog!("ICCBased {:?}", stream);
1516                        // XXX: we're going to be continually decompressing everytime this object is referenced
1517                        ColorSpace::ICCBased(get_contents(stream))
1518                    }
1519                    "CalGray" => {
1520                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1521                        ColorSpace::CalGray(CalGray {
1522                            white_point: get(&doc, dict, b"WhitePoint"),
1523                            black_point: get(&doc, dict, b"BackPoint"),
1524                            gamma: get(&doc, dict, b"Gamma"),
1525                        })
1526                    }
1527                    "CalRGB" => {
1528                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1529                        ColorSpace::CalRGB(CalRGB {
1530                            white_point: get(&doc, dict, b"WhitePoint"),
1531                            black_point: get(&doc, dict, b"BackPoint"),
1532                            gamma: get(&doc, dict, b"Gamma"),
1533                            matrix: get(&doc, dict, b"Matrix"),
1534                        })
1535                    }
1536                    "Lab" => {
1537                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1538                        ColorSpace::Lab(Lab {
1539                            white_point: get(&doc, dict, b"WhitePoint"),
1540                            black_point: get(&doc, dict, b"BackPoint"),
1541                            range: get(&doc, dict, b"Range"),
1542                        })
1543                    }
1544                    "Pattern" => {
1545                        ColorSpace::Pattern
1546                    },
1547                    "DeviceGray" => ColorSpace::DeviceGray,
1548                    "DeviceRGB" => ColorSpace::DeviceRGB,
1549                    "DeviceCMYK" => ColorSpace::DeviceCMYK,
1550                    "DeviceN" => ColorSpace::DeviceN,
1551                    _ => {
1552                        panic!("color_space {:?} {:?} {:?}", name, cs_name, cs)
1553                    }
1554                }
1555            } else if let Ok(cs) = cs.as_name() {
1556                match pdf_to_utf8(cs).as_ref() {
1557                    "DeviceRGB" => ColorSpace::DeviceRGB,
1558                    "DeviceGray" => ColorSpace::DeviceGray,
1559                    _ => panic!()
1560                }
1561            } else {
1562                panic!();
1563            }
1564        }
1565    }
1566}
1567
1568struct Processor<'a> {
1569    _none: PhantomData<&'a ()>
1570}
1571
1572impl<'a> Processor<'a> {
1573    fn new() -> Processor<'a> {
1574        Processor { _none: PhantomData }
1575    }
1576
1577    fn process_stream(&mut self, doc: &'a Document, content: Vec<u8>, resources: &'a Dictionary, media_box: &MediaBox, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
1578        let content = Content::decode(&content).unwrap();
1579        let mut font_table = HashMap::new();
1580        let mut gs: GraphicsState = GraphicsState {
1581            ts: TextState {
1582                font: None,
1583                font_size: std::f64::NAN,
1584                character_spacing: 0.,
1585                word_spacing: 0.,
1586                horizontal_scaling: 100. / 100.,
1587                leading: 0.,
1588                rise: 0.,
1589                tm: Transform2D::identity(),
1590            },
1591            fill_color: Vec::new(),
1592            fill_colorspace: ColorSpace::DeviceGray,
1593            stroke_color: Vec::new(),
1594            stroke_colorspace: ColorSpace::DeviceGray,
1595            line_width: 1.,
1596            ctm: Transform2D::identity(),
1597            smask: None
1598        };
1599        //let mut ts = &mut gs.ts;
1600        let mut gs_stack = Vec::new();
1601        let mut mc_stack = Vec::new();
1602        // XXX: replace tlm with a point for text start
1603        let mut tlm = Transform2D::identity();
1604        let mut path = Path::new();
1605        let flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1606        dlog!("MediaBox {:?}", media_box);
1607        for operation in &content.operations {
1608            //dlog!("op: {:?}", operation);
1609
1610            match operation.operator.as_ref() {
1611                "BT" => {
1612                    tlm = Transform2D::identity();
1613                    gs.ts.tm = tlm;
1614                }
1615                "ET" => {
1616                    tlm = Transform2D::identity();
1617                    gs.ts.tm = tlm;
1618                }
1619                "cm" => {
1620                    assert!(operation.operands.len() == 6);
1621                    let m = Transform2D::row_major(as_num(&operation.operands[0]),
1622                                                   as_num(&operation.operands[1]),
1623                                                   as_num(&operation.operands[2]),
1624                                                   as_num(&operation.operands[3]),
1625                                                   as_num(&operation.operands[4]),
1626                                                   as_num(&operation.operands[5]));
1627                    gs.ctm = gs.ctm.pre_transform(&m);
1628                    dlog!("matrix {:?}", gs.ctm);
1629                }
1630                "CS" => {
1631                    let name = operation.operands[0].as_name().unwrap();
1632                    gs.stroke_colorspace = make_colorspace(doc, name, resources);
1633                }
1634                "cs" => {
1635                    let name = operation.operands[0].as_name().unwrap();
1636                    gs.fill_colorspace = make_colorspace(doc, name, resources);
1637                }
1638                "SC" | "SCN" => {
1639                    gs.stroke_color = match gs.stroke_colorspace {
1640                        ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1641                        _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1642                    };
1643                }
1644                "sc" | "scn" => {
1645                    gs.fill_color = match gs.fill_colorspace {
1646                        ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1647                        _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1648                    };
1649                }
1650                "G" | "g" | "RG" | "rg" | "K" | "k" => {
1651                    dlog!("unhandled color operation {:?}", operation);
1652                }
1653                "TJ" => {
1654                    match operation.operands[0] {
1655                        Object::Array(ref array) => {
1656                            for e in array {
1657                                match e {
1658                                    &Object::String(ref s, _) => {
1659                                        show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1660                                    }
1661                                    &Object::Integer(i) => {
1662                                        let ts = &mut gs.ts;
1663                                        let w0 = 0.;
1664                                        let tj = i as f64;
1665                                        let ty = 0.;
1666                                        let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1667                                        ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1668                                        dlog!("adjust text by: {} {:?}", i, ts.tm);
1669                                    }
1670                                    &Object::Real(i) => {
1671                                        let ts = &mut gs.ts;
1672                                        let w0 = 0.;
1673                                        let tj = i as f64;
1674                                        let ty = 0.;
1675                                        let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1676                                        ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1677                                        dlog!("adjust text by: {} {:?}", i, ts.tm);
1678                                    }
1679                                    _ => { dlog!("kind of {:?}", e); }
1680                                }
1681                            }
1682                        }
1683                        _ => {}
1684                    }
1685                }
1686                "Tj" => {
1687                    match operation.operands[0] {
1688                        Object::String(ref s, _) => {
1689                            show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1690                        }
1691                        _ => { panic!("unexpected Tj operand {:?}", operation) }
1692                    }
1693                }
1694                "Tc" => {
1695                    gs.ts.character_spacing = as_num(&operation.operands[0]);
1696                }
1697                "Tw" => {
1698                    gs.ts.word_spacing = as_num(&operation.operands[0]);
1699                }
1700                "Tz" => {
1701                    gs.ts.horizontal_scaling = as_num(&operation.operands[0]) / 100.;
1702                }
1703                "TL" => {
1704                    gs.ts.leading = as_num(&operation.operands[0]);
1705                }
1706                "Tf" => {
1707                    let fonts: &Dictionary = get(&doc, resources, b"Font");
1708                    let name = operation.operands[0].as_name().unwrap();
1709                    let font = font_table.entry(name.to_owned()).or_insert_with(|| make_font(doc, get::<&Dictionary>(doc, fonts, name))).clone();
1710                    {
1711                        /*let file = font.get_descriptor().and_then(|desc| desc.get_file());
1712                    if let Some(file) = file {
1713                        let file_contents = filter_data(file.as_stream().unwrap());
1714                        let mut cursor = Cursor::new(&file_contents[..]);
1715                        //let f = Font::read(&mut cursor);
1716                        //dlog!("font file: {:?}", f);
1717                    }*/
1718                    }
1719                    gs.ts.font = Some(font);
1720
1721                    gs.ts.font_size = as_num(&operation.operands[1]);
1722                    dlog!("font {} size: {} {:?}", pdf_to_utf8(name), gs.ts.font_size, operation);
1723                }
1724                "Ts" => {
1725                    gs.ts.rise = as_num(&operation.operands[0]);
1726                }
1727                "Tm" => {
1728                    assert!(operation.operands.len() == 6);
1729                    tlm = Transform2D::row_major(as_num(&operation.operands[0]),
1730                                                 as_num(&operation.operands[1]),
1731                                                 as_num(&operation.operands[2]),
1732                                                 as_num(&operation.operands[3]),
1733                                                 as_num(&operation.operands[4]),
1734                                                 as_num(&operation.operands[5]));
1735                    gs.ts.tm = tlm;
1736                    dlog!("Tm: matrix {:?}", gs.ts.tm);
1737                    output.end_line()?;
1738                }
1739                "Td" => {
1740                    /* Move to the start of the next line, offset from the start of the current line by (tx , ty ).
1741                   tx and ty are numbers expressed in unscaled text space units.
1742                   More precisely, this operator performs the following assignments:
1743                 */
1744                    assert!(operation.operands.len() == 2);
1745                    let tx = as_num(&operation.operands[0]);
1746                    let ty = as_num(&operation.operands[1]);
1747                    dlog!("translation: {} {}", tx, ty);
1748
1749                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1750                    gs.ts.tm = tlm;
1751                    dlog!("Td matrix {:?}", gs.ts.tm);
1752                    output.end_line()?;
1753                }
1754
1755                "TD" => {
1756                    /* Move to the start of the next line, offset from the start of the current line by (tx , ty ).
1757                   As a side effect, this operator sets the leading parameter in the text state.
1758                 */
1759                    assert!(operation.operands.len() == 2);
1760                    let tx = as_num(&operation.operands[0]);
1761                    let ty = as_num(&operation.operands[1]);
1762                    dlog!("translation: {} {}", tx, ty);
1763                    gs.ts.leading = -ty;
1764
1765                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1766                    gs.ts.tm = tlm;
1767                    dlog!("TD matrix {:?}", gs.ts.tm);
1768                    output.end_line()?;
1769                }
1770
1771                "T*" => {
1772                    let tx = 0.0;
1773                    let ty = -gs.ts.leading;
1774
1775                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1776                    gs.ts.tm = tlm;
1777                    dlog!("T* matrix {:?}", gs.ts.tm);
1778                    output.end_line()?;
1779                }
1780                "q" => { gs_stack.push(gs.clone()); }
1781                "Q" => {
1782                    let s = gs_stack.pop();
1783                    if let Some(s) = s {
1784                        gs = s;
1785                    } else {
1786                        warn!("No state to pop");
1787                    }
1788                }
1789                "gs" => {
1790                    let ext_gstate: &Dictionary = get(doc, resources, b"ExtGState");
1791                    let name = operation.operands[0].as_name().unwrap();
1792                    let state: &Dictionary = get(doc, ext_gstate, name);
1793                    apply_state(doc, &mut gs, state);
1794                }
1795                "i" => { dlog!("unhandled graphics state flattness operator {:?}", operation); }
1796                "w" => { gs.line_width = as_num(&operation.operands[0]); }
1797                "J" | "j" | "M" | "d" | "ri"  => { dlog!("unknown graphics state operator {:?}", operation); }
1798                "m" => { path.ops.push(PathOp::MoveTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1799                "l" => { path.ops.push(PathOp::LineTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1800                "c" => {
1801                    path.ops.push(PathOp::CurveTo(
1802                        as_num(&operation.operands[0]),
1803                        as_num(&operation.operands[1]),
1804                        as_num(&operation.operands[2]),
1805                        as_num(&operation.operands[3]),
1806                        as_num(&operation.operands[4]),
1807                        as_num(&operation.operands[5])))
1808                }
1809                "v" => {
1810                    let (x, y) = path.current_point();
1811                    path.ops.push(PathOp::CurveTo(
1812                        x,
1813                        y,
1814                        as_num(&operation.operands[0]),
1815                        as_num(&operation.operands[1]),
1816                        as_num(&operation.operands[2]),
1817                        as_num(&operation.operands[3])))
1818                }
1819                "y" => {
1820                    path.ops.push(PathOp::CurveTo(
1821                        as_num(&operation.operands[0]),
1822                        as_num(&operation.operands[1]),
1823                        as_num(&operation.operands[2]),
1824                        as_num(&operation.operands[3]),
1825                        as_num(&operation.operands[2]),
1826                        as_num(&operation.operands[3])))
1827                }
1828                "h" => { path.ops.push(PathOp::Close) }
1829                "re" => {
1830                    path.ops.push(PathOp::Rect(as_num(&operation.operands[0]),
1831                                               as_num(&operation.operands[1]),
1832                                               as_num(&operation.operands[2]),
1833                                               as_num(&operation.operands[3])))
1834                }
1835                "s" | "f*" | "B" | "B*" | "b" => {
1836                    dlog!("unhandled path op {:?}", operation);
1837                }
1838                "S" => {
1839                    output.stroke(&gs.ctm, &gs.stroke_colorspace, &gs.stroke_color, &path)?;
1840                    path.ops.clear();
1841                }
1842                "F" | "f" => {
1843                    output.fill(&gs.ctm, &gs.fill_colorspace, &gs.fill_color, &path)?;
1844                    path.ops.clear();
1845                }
1846                "W" | "w*" => { dlog!("unhandled clipping operation {:?}", operation); }
1847                "n" => {
1848                    dlog!("discard {:?}", path);
1849                    path.ops.clear();
1850                }
1851                "BMC" | "BDC" => {
1852                    mc_stack.push(operation);
1853                }
1854                "EMC" => {
1855                    mc_stack.pop();
1856                }
1857                "Do" => {
1858                    // `Do` process an entire subdocument, so we do a recursive call to `process_stream`
1859                    // with the subdocument content and resources
1860                    let xobject: &Dictionary = get(&doc, resources, b"XObject");
1861                    let name = operation.operands[0].as_name().unwrap();
1862                    let xf: &Stream = get(&doc, xobject, name);
1863                    let resources = maybe_get_obj(&doc, &xf.dict, b"Resources").and_then(|n| n.as_dict().ok()).unwrap_or(resources);
1864                    let contents = get_contents(xf);
1865                    self.process_stream(&doc, contents, resources, &media_box, output, page_num)?;
1866                }
1867                _ => { dlog!("unknown operation {:?}", operation); }
1868
1869            }
1870        }
1871        Ok(())
1872    }
1873}
1874
1875
1876pub trait OutputDev {
1877    fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>)-> Result<(), OutputError>;
1878    fn end_page(&mut self)-> Result<(), OutputError>;
1879    fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>;
1880    fn begin_word(&mut self)-> Result<(), OutputError>;
1881    fn end_word(&mut self)-> Result<(), OutputError>;
1882    fn end_line(&mut self)-> Result<(), OutputError>;
1883    fn stroke(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
1884    fn fill(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
1885}
1886
1887
1888pub struct HTMLOutput<'a>  {
1889    file: &'a mut dyn std::io::Write,
1890    flip_ctm: Transform,
1891    last_ctm: Transform,
1892    buf_ctm: Transform,
1893    buf_font_size: f64,
1894    buf: String
1895}
1896
/// Replaces spaces that HTML would collapse with `&nbsp;`.
///
/// A space is kept as a plain space only when it directly follows a non-space
/// character and is directly followed by a non-space character; leading,
/// trailing and repeated spaces all become `&nbsp;`.
fn insert_nbsp(input: &str) -> String {
    let mut out = String::new();
    let mut after_word = false;
    let mut rest = input.chars().peekable();
    while let Some(ch) = rest.next() {
        if ch != ' ' {
            out.push(ch);
            after_word = true;
            continue;
        }
        // Look ahead: only a following non-space character counts as a word.
        let before_word = rest.peek().map_or(false, |&next| next != ' ');
        out.push_str(if after_word && before_word { " " } else { "&nbsp;" });
        after_word = false;
    }
    out
}
1916
1917impl<'a> HTMLOutput<'a> {
1918    pub fn new(file: &mut dyn std::io::Write) -> HTMLOutput {
1919        HTMLOutput {
1920            file,
1921            flip_ctm: Transform2D::identity(),
1922            last_ctm: Transform2D::identity(),
1923            buf_ctm: Transform2D::identity(),
1924            buf: String::new(),
1925            buf_font_size: 0.,
1926        }
1927    }
1928    fn flush_string(&mut self) -> Result<(), OutputError>{
1929        if self.buf.len() != 0 {
1930
1931            let position = self.buf_ctm.post_transform(&self.flip_ctm);
1932            let transformed_font_size_vec = self.buf_ctm.transform_vector(vec2(self.buf_font_size, self.buf_font_size));
1933            // get the length of one sized of the square with the same area with a rectangle of size (x, y)
1934            let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1935            let (x, y) = (position.m31, position.m32);
1936            warn!("flush {} {:?}", self.buf, (x,y));
1937
1938            write!(self.file, "<div style='position: absolute; left: {}px; top: {}px; font-size: {}px'>{}</div>\n",
1939                   x, y, transformed_font_size, insert_nbsp(&self.buf))?;
1940        }
1941        Ok(())
1942    }
1943}
1944
1945type ArtBox = (f64, f64, f64, f64);
1946
1947impl<'a> OutputDev for HTMLOutput<'a> {
1948    fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
1949        write!(self.file, "<meta charset='utf-8' /> ")?;
1950        write!(self.file, "<!-- page {} -->", page_num)?;
1951        write!(self.file, "<div id='page{}' style='position: relative; height: {}px; width: {}px; border: 1px black solid'>", page_num, media_box.ury - media_box.lly, media_box.urx - media_box.llx)?;
1952        self.flip_ctm = Transform::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1953        Ok(())
1954    }
1955    fn end_page(&mut self) -> Result<(), OutputError> {
1956        self.flush_string()?;
1957        self.buf = String::new();
1958        self.last_ctm = Transform::identity();
1959        write!(self.file, "</div>")?;
1960        Ok(())
1961    }
1962    fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>{
1963        if trm.approx_eq(&self.last_ctm) {
1964            let position = trm.post_transform(&self.flip_ctm);
1965            let (x, y) = (position.m31, position.m32);
1966
1967            warn!("accum {} {:?}", char, (x,y));
1968            self.buf += char;
1969        } else {
1970            warn!("flush {} {:?} {:?} {} {} {}", char, trm, self.last_ctm, width, font_size, spacing);
1971            self.flush_string()?;
1972            self.buf = char.to_owned();
1973            self.buf_font_size = font_size;
1974            self.buf_ctm = *trm;
1975        }
1976        let position = trm.post_transform(&self.flip_ctm);
1977        let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
1978        // get the length of one sized of the square with the same area with a rectangle of size (x, y)
1979        let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1980        let (x, y) = (position.m31, position.m32);
1981        write!(self.file, "<div style='position: absolute; color: red; left: {}px; top: {}px; font-size: {}px'>{}</div>",
1982               x, y, transformed_font_size, char)?;
1983        self.last_ctm = trm.pre_transform(&Transform2D::create_translation(width * font_size + spacing, 0.));
1984
1985        Ok(())
1986    }
1987    fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
1988    fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
1989    fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
1990}
1991
/// Output device that renders filled paths as SVG.
pub struct SVGOutput<'a> {
    /// Destination the generated SVG markup is written to.
    file: &'a mut dyn std::io::Write,
}

impl<'a> SVGOutput<'a> {
    /// Creates an SVG output device that writes its markup to `file`.
    pub fn new(file: &mut dyn std::io::Write) -> SVGOutput {
        SVGOutput { file: file }
    }
}
2000
2001impl<'a> OutputDev for SVGOutput<'a> {
2002    fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>) -> Result<(), OutputError> {
2003        let ver = 1.1;
2004        write!(self.file, "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n")?;
2005        if ver == 1.1 {
2006            write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">"#)?;
2007        } else {
2008            write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">"#)?;
2009        }
2010        if let Some(art_box) = art_box {
2011            let width = art_box.2 - art_box.0;
2012            let height = art_box.3 - art_box.1;
2013            let y = media_box.ury - art_box.1 - height;
2014            write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, art_box.0, y, width, height)?;
2015        } else {
2016            let width = media_box.urx - media_box.llx;
2017            let height = media_box.ury - media_box.lly;
2018            write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, media_box.llx, media_box.lly, width, height)?;
2019        }
2020        write!(self.file, "\n")?;
2021        type Mat = Transform;
2022
2023        let ctm = Mat::create_scale(1., -1.).post_translate(vec2(0., media_box.ury));
2024        write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>\n",
2025               ctm.m11,
2026               ctm.m12,
2027               ctm.m21,
2028               ctm.m22,
2029               ctm.m31,
2030               ctm.m32,
2031        )?;
2032        Ok(())
2033    }
2034    fn end_page(&mut self) -> Result<(), OutputError>{
2035        write!(self.file, "</g>\n")?;
2036        write!(self.file, "</svg>")?;
2037        Ok(())
2038    }
2039    fn output_character(&mut self, _trm: &Transform, _width: f64, _spacing: f64, _font_size: f64, _char: &str) -> Result<(), OutputError>{
2040        Ok(())
2041    }
2042    fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
2043    fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2044    fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
2045    fn fill(&mut self, ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], path: &Path) -> Result<(), OutputError>{
2046        write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>",
2047               ctm.m11,
2048               ctm.m12,
2049               ctm.m21,
2050               ctm.m22,
2051               ctm.m31,
2052               ctm.m32,
2053        )?;
2054
2055        /*if path.ops.len() == 1 {
2056            if let PathOp::Rect(x, y, width, height) = path.ops[0] {
2057                write!(self.file, "<rect x={} y={} width={} height={} />\n", x, y, width, height);
2058                write!(self.file, "</g>");
2059                return;
2060            }
2061        }*/
2062        let mut d = Vec::new();
2063        for op in &path.ops {
2064            match op {
2065                &PathOp::MoveTo(x, y) => { d.push(format!("M{} {}", x, y))}
2066                &PathOp::LineTo(x, y) => { d.push(format!("L{} {}", x, y))},
2067                &PathOp::CurveTo(x1, y1, x2, y2, x, y) => { d.push(format!("C{} {} {} {} {} {}", x1, y1, x2, y2, x, y))},
2068                &PathOp::Close => { d.push(format!("Z"))},
2069                &PathOp::Rect(x, y, width, height) => {
2070                    d.push(format!("M{} {}", x, y));
2071                    d.push(format!("L{} {}", x + width, y));
2072                    d.push(format!("L{} {}", x + width, y + height));
2073                    d.push(format!("L{} {}", x, y + height));
2074                    d.push(format!("Z"));
2075                }
2076
2077            }
2078        }
2079        write!(self.file, "<path d='{}' />", d.join(" "))?;
2080        write!(self.file, "</g>")?;
2081        write!(self.file, "\n")?;
2082        Ok(())
2083    }
2084}
2085
2086/*
2087File doesn't implement std::fmt::Write so we have
2088to do some gymnastics to accept a File or String
2089See https://github.com/rust-lang/rust/issues/51305
2090*/
2091
/// Turns an output target into a writer implementing [`std::fmt::Write`].
pub trait ConvertToFmt {
    /// Writer type produced by [`ConvertToFmt::convert`].
    type Writer: std::fmt::Write;

    /// Consumes the target and returns the writer for it.
    fn convert(self) -> Self::Writer;
}

/// A `String` already implements `fmt::Write`, so it is handed back as is.
impl<'a> ConvertToFmt for &'a mut String {
    type Writer = &'a mut String;

    fn convert(self) -> &'a mut String {
        self
    }
}

/// Wraps an `io::Write` value so it can be used through `fmt::Write`.
pub struct WriteAdapter<W> {
    /// The wrapped byte-oriented writer.
    f: W,
}

impl<W> std::fmt::Write for WriteAdapter<W>
where
    W: std::io::Write,
{
    /// Writes the UTF-8 bytes of `s`; any I/O failure is reported as
    /// `fmt::Error`, dropping the underlying cause.
    fn write_str(&mut self, s: &str) -> fmt::Result {
        match self.f.write_all(s.as_bytes()) {
            Ok(()) => Ok(()),
            Err(_) => Err(fmt::Error),
        }
    }
}

/// Any `io::Write` trait object is adapted through [`WriteAdapter`].
impl<'a> ConvertToFmt for &'a mut dyn std::io::Write {
    type Writer = WriteAdapter<Self>;

    fn convert(self) -> WriteAdapter<Self> {
        let f = self;
        WriteAdapter { f }
    }
}

/// A `File` is adapted through [`WriteAdapter`].
impl<'a> ConvertToFmt for &'a mut File {
    type Writer = WriteAdapter<Self>;

    fn convert(self) -> WriteAdapter<Self> {
        let f = self;
        WriteAdapter { f }
    }
}
2127
2128pub struct PlainTextOutput<W: ConvertToFmt>   {
2129    writer: W::Writer,
2130    last_end: f64,
2131    last_y: f64,
2132    first_char: bool,
2133    flip_ctm: Transform,
2134}
2135
2136impl<W: ConvertToFmt> PlainTextOutput<W> {
2137    pub fn new(writer: W) -> PlainTextOutput<W> {
2138        PlainTextOutput{
2139            writer: writer.convert(),
2140            last_end: 100000.,
2141            first_char: false,
2142            last_y: 0.,
2143            flip_ctm: Transform2D::identity(),
2144        }
2145    }
2146}
2147
2148/* There are some structural hints that PDFs can use to signal word and line endings:
2149 * however relying on these is not likely to be sufficient. */
2150impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
2151    fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
2152        self.flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
2153        Ok(())
2154    }
2155    fn end_page(&mut self) -> Result<(), OutputError> {
2156        Ok(())
2157    }
2158    fn output_character(&mut self, trm: &Transform, width: f64, _spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError> {
2159        let position = trm.post_transform(&self.flip_ctm);
2160        let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
2161        // get the length of one sized of the square with the same area with a rectangle of size (x, y)
2162        let transformed_font_size = (transformed_font_size_vec.x*transformed_font_size_vec.y).sqrt();
2163        let (x, y) = (position.m31, position.m32);
2164        use std::fmt::Write;
2165        //dlog!("last_end: {} x: {}, width: {}", self.last_end, x, width);
2166        if self.first_char {
2167            if (y - self.last_y).abs() > transformed_font_size * 1.5 {
2168                write!(self.writer, "\n")?;
2169            }
2170
2171            // we've moved to the left and down
2172            if x < self.last_end && (y - self.last_y).abs() > transformed_font_size * 0.5 {
2173                write!(self.writer, "\n")?;
2174            }
2175
2176            if x > self.last_end + transformed_font_size * 0.1 {
2177                dlog!("width: {}, space: {}, thresh: {}", width, x - self.last_end, transformed_font_size * 0.1);
2178                write!(self.writer, " ")?;
2179            }
2180        }
2181        //let norm = unicode_normalization::UnicodeNormalization::nfkc(char);
2182        write!(self.writer, "{}", char)?;
2183        self.first_char = false;
2184        self.last_y = y;
2185        self.last_end = x + width * transformed_font_size;
2186        Ok(())
2187    }
2188    fn begin_word(&mut self) -> Result<(), OutputError> {
2189        self.first_char = true;
2190        Ok(())
2191    }
2192    fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2193    fn end_line(&mut self) -> Result<(), OutputError>{
2194        //write!(self.file, "\n");
2195        Ok(())
2196    }
2197}
2198
2199
2200pub fn print_metadata(doc: &Document) {
2201    dlog!("Version: {}", doc.version);
2202    if let Some(ref info) = get_info(&doc) {
2203        for (k, v) in *info {
2204            match v {
2205                &Object::String(ref s, StringFormat::Literal) => { dlog!("{}: {}", pdf_to_utf8(k), pdf_to_utf8(s)); }
2206                _ => {}
2207            }
2208        }
2209    }
2210    dlog!("Page count: {}", get::<i64>(&doc, &get_pages(&doc), b"Count"));
2211    dlog!("Pages: {:?}", get_pages(&doc));
2212    dlog!("Type: {:?}", get_pages(&doc).get(b"Type").and_then(|x| x.as_name()).unwrap());
2213}
2214
2215/// Extract the text from a pdf at `path` and return a `String` with the results
2216pub fn extract_text<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<String, OutputError> {
2217    let mut s = String::new();
2218    {
2219        let mut output = PlainTextOutput::new(&mut s);
2220        let mut doc = Document::load(path)?;
2221        maybe_decrypt(&mut doc)?;
2222        output_doc(&doc, &mut output)?;
2223    }
2224    Ok(s)
2225}
2226
2227fn maybe_decrypt(doc: &mut Document) -> Result<(), OutputError> {
2228    if ! doc.is_encrypted() {
2229        return Ok(());
2230    }
2231
2232    if let Err(e) = doc.decrypt("") {
2233        if let Error::Decryption(DecryptionError::IncorrectPassword) = e {
2234            error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted")
2235        }
2236
2237        return Err(OutputError::PdfError(e));
2238    }
2239
2240    Ok(())
2241}
2242
2243pub fn extract_text_encrypted<P: std::convert::AsRef<std::path::Path>>(
2244    path: P,
2245    password: &str,
2246) -> Result<String, OutputError> {
2247    let mut s = String::new();
2248    {
2249        let mut output = PlainTextOutput::new(&mut s);
2250        let mut doc = Document::load(path)?;
2251        output_doc_encrypted(&mut doc, &mut output, password)?;
2252    }
2253    Ok(s)
2254}
2255
2256pub fn extract_text_from_mem(buffer: &[u8]) -> Result<String, OutputError> {
2257    let mut s = String::new();
2258    {
2259        let mut output = PlainTextOutput::new(&mut s);
2260        let mut doc = Document::load_mem(buffer)?;
2261        maybe_decrypt(&mut doc)?;
2262        output_doc(&doc, &mut output)?;
2263    }
2264    Ok(s)
2265}
2266
2267pub fn extract_text_from_mem_encrypted(
2268    buffer: &[u8],
2269    password: &str,
2270) -> Result<String, OutputError> {
2271    let mut s = String::new();
2272    {
2273        let mut output = PlainTextOutput::new(&mut s);
2274        let mut doc = Document::load_mem(buffer)?;
2275        output_doc_encrypted(&mut doc, &mut output, password)?;
2276    }
2277    Ok(s)
2278}
2279
2280
2281fn extract_text_by_page(doc: &Document, page_num: u32) -> Result<String, OutputError> {
2282    let mut s = String::new();
2283    {
2284        let mut output = PlainTextOutput::new(&mut s);
2285        output_doc_page(doc, &mut output, page_num)?;
2286    }
2287    Ok(s)
2288}
2289
2290/// Extract the text from a pdf at `path` and return a `Vec<String>` with the results separately by page
2291
2292pub fn extract_text_by_pages<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<Vec<String>, OutputError> {
2293    let mut v = Vec::new();
2294    {
2295        let mut doc = Document::load(path)?;
2296        maybe_decrypt(&mut doc)?;
2297        let mut page_num = 1;
2298        while let Ok(content) = extract_text_by_page(&doc, page_num) {
2299            v.push(content);
2300            page_num += 1;
2301        }
2302    }
2303    Ok(v)
2304}
2305
2306pub fn extract_text_by_pages_encrypted<P: std::convert::AsRef<std::path::Path>>(path: P, password: &str) -> Result<Vec<String>, OutputError> {
2307    let mut v = Vec::new();
2308    {
2309        let mut doc = Document::load(path)?;
2310        doc.decrypt(password)?;
2311        let mut page_num = 1;
2312        while let Ok(content) = extract_text_by_page(&mut doc, page_num) {
2313            v.push(content);
2314            page_num += 1;
2315        }
2316    }
2317    Ok(v)
2318}
2319
2320pub fn extract_text_from_mem_by_pages(buffer: &[u8]) -> Result<Vec<String>, OutputError> {
2321    let mut v = Vec::new();
2322    {
2323        let mut doc = Document::load_mem(buffer)?;
2324        maybe_decrypt(&mut doc)?;
2325        let mut page_num = 1;
2326        while let Ok(content) = extract_text_by_page(&doc, page_num) {
2327            v.push(content);
2328            page_num += 1;
2329        }
2330    }
2331    Ok(v)
2332}
2333
2334pub fn extract_text_from_mem_by_pages_encrypted(buffer: &[u8], password: &str) -> Result<Vec<String>, OutputError> {
2335    let mut v = Vec::new();
2336    {
2337        let mut doc = Document::load_mem(buffer)?;
2338        doc.decrypt(password)?;
2339        let mut page_num = 1;
2340        while let Ok(content) = extract_text_by_page(&doc, page_num) {
2341            v.push(content);
2342            page_num += 1;
2343        }
2344    }
2345    Ok(v)
2346}
2347
2348
2349fn get_inherited<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
2350    let o: Option<T> = get(doc, dict, key);
2351    if let Some(o) = o {
2352        Some(o)
2353    } else {
2354        let parent = dict.get(b"Parent")
2355            .and_then(|parent| parent.as_reference())
2356            .and_then(|id| doc.get_dictionary(id)).ok()?;
2357        get_inherited(doc, parent, key)
2358    }
2359}
2360
2361pub fn output_doc_encrypted(
2362    doc: &mut Document,
2363    output: &mut dyn OutputDev,
2364    password: &str,
2365) -> Result<(), OutputError> {
2366    doc.decrypt(password)?;
2367    output_doc(doc, output)
2368}
2369
2370/// Parse a given document and output it to `output`
2371pub fn output_doc(doc: &Document, output: &mut dyn OutputDev) -> Result<(), OutputError> {
2372    if doc.is_encrypted() {
2373        error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2374    }
2375    let empty_resources = Dictionary::new();
2376    let pages = doc.get_pages();
2377    let mut p = Processor::new();
2378    for dict in pages {
2379        let page_num = dict.0;
2380        let object_id = dict.1;
2381        output_doc_inner(page_num, object_id, doc, &mut p, output, &empty_resources)?;
2382    }
2383    Ok(())
2384}
2385
2386pub fn output_doc_page(doc: &Document, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
2387    if doc.is_encrypted() {
2388        error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2389    }
2390    let empty_resources = Dictionary::new();
2391    let pages = doc.get_pages();
2392    let object_id = pages.get(&page_num).ok_or(lopdf::Error::PageNumberNotFound(page_num))?;
2393    let mut p = Processor::new();
2394    output_doc_inner(page_num, *object_id, doc, &mut p, output, &empty_resources)?;
2395    Ok(())
2396}
2397
2398fn output_doc_inner<'a>(page_num: u32, object_id: ObjectId, doc: &'a Document, p: & mut Processor<'a>, output: &mut dyn OutputDev, empty_resources: &'a Dictionary) -> Result<(), OutputError> {
2399    let page_dict = doc.get_object(object_id).unwrap().as_dict().unwrap();
2400    dlog!("page {} {:?}", page_num, page_dict);
2401    // XXX: Some pdfs lack a Resources directory
2402    let resources = get_inherited(doc, page_dict, b"Resources").unwrap_or(empty_resources);
2403    dlog!("resources {:?}", resources);
2404    // pdfium searches up the page tree for MediaBoxes as needed
2405    let media_box: Vec<f64> = get_inherited(doc, page_dict, b"MediaBox").expect("MediaBox");
2406    let media_box = MediaBox { llx: media_box[0], lly: media_box[1], urx: media_box[2], ury: media_box[3] };
2407    let art_box = get::<Option<Vec<f64>>>(&doc, page_dict, b"ArtBox")
2408        .map(|x| (x[0], x[1], x[2], x[3]));
2409    output.begin_page(page_num, &media_box, art_box)?;
2410    p.process_stream(&doc, doc.get_page_content(object_id).unwrap(), resources, &media_box, output, page_num)?;
2411    output.end_page()?;
2412    Ok(())
2413}