pdf_extract/
lib.rs

1extern crate lopdf;
2
3use adobe_cmap_parser::{ByteMapping, CodeRange, CIDRange};
4use encoding_rs::UTF_16BE;
5use lopdf::content::Content;
6pub use lopdf::*;
7use euclid::*;
8use lopdf::encryption::DecryptionError;
9use std::fmt::{Debug, Formatter};
10extern crate encoding_rs;
11extern crate euclid;
12extern crate adobe_cmap_parser;
13extern crate type1_encoding_parser;
14extern crate unicode_normalization;
15use euclid::vec2;
16use unicode_normalization::UnicodeNormalization;
17use std::fmt;
18use std::str;
19use std::fs::File;
20use std::slice::Iter;
21use std::collections::HashMap;
22use std::collections::hash_map::Entry;
23use std::rc::Rc;
24use std::marker::PhantomData;
25use std::result::Result;
26mod core_fonts;
27mod glyphnames;
28mod zapfglyphnames;
29mod encodings;
30
/// Zero-sized marker type used as the unit tag of [`Transform`].
pub struct Space;
/// 2D affine transform over `f64`, with [`Space`] as both source and destination unit.
pub type Transform = Transform2D<f64, Space, Space>;
33
/// Error type of this crate; each variant wraps one underlying error source.
#[derive(Debug)]
pub enum OutputError
{
    /// A `std::fmt` formatting operation failed.
    FormatError(std::fmt::Error),
    /// An I/O operation failed.
    IoError(std::io::Error),
    /// `lopdf` reported an error.
    PdfError(lopdf::Error)
}
41
42impl std::fmt::Display for OutputError
43{
44    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
45        match self {
46            OutputError::FormatError(e) => write!(f, "Formating error: {}", e),
47            OutputError::IoError(e) => write!(f, "IO error: {}", e),
48            OutputError::PdfError(e) => write!(f, "PDF error: {}", e)
49        }
50    }
51}
52
// Marker impl: relies entirely on the default methods of `std::error::Error`.
impl std::error::Error for OutputError{}
54
55impl From<std::fmt::Error> for OutputError {
56    fn from(e: std::fmt::Error) -> Self {
57        OutputError::FormatError(e)
58    }
59}
60
61impl From<std::io::Error> for OutputError {
62    fn from(e: std::io::Error) -> Self {
63        OutputError::IoError(e)
64    }
65}
66
67impl From<lopdf::Error> for OutputError {
68    fn from(e: lopdf::Error) -> Self {
69        OutputError::PdfError(e)
70    }
71}
72
// Debug-logging stand-in. The active arm evaluates every argument expression
// and discards the result (`let _ = ...`), so nothing is printed while the
// arguments still count as "used". The commented-out arm below forwards the
// tokens to `println!` instead; swap the two arms to get actual output.
macro_rules! dlog {
    ($($e:expr),*) => { {$(let _ = $e;)*} }
    //($($t:tt)*) => { println!($($t)*) }
}
77
78fn get_info(doc: &Document) -> Option<&Dictionary> {
79    match doc.trailer.get(b"Info") {
80        Ok(&Object::Reference(ref id)) => {
81            match doc.get_object(*id) {
82                Ok(&Object::Dictionary(ref info)) => { return Some(info); }
83                _ => {}
84            }
85        }
86        _ => {}
87    }
88    None
89}
90
91fn get_catalog(doc: &Document) -> &Dictionary {
92    match doc.trailer.get(b"Root").unwrap() {
93        &Object::Reference(ref id) => {
94            match doc.get_object(*id) {
95                Ok(&Object::Dictionary(ref catalog)) => { return catalog; }
96                _ => {}
97            }
98        }
99        _ => {}
100    }
101    panic!();
102}
103
104fn get_pages(doc: &Document) -> &Dictionary {
105    let catalog = get_catalog(doc);
106    match catalog.get(b"Pages").unwrap() {
107        &Object::Reference(ref id) => {
108            match doc.get_object(*id) {
109                Ok(&Object::Dictionary(ref pages)) => { return pages; }
110                other => {dlog!("pages: {:?}", other)}
111            }
112        }
113        other => { dlog!("pages: {:?}", other)}
114    }
115    dlog!("catalog {:?}", catalog);
116    panic!();
117}
118
// Lookup table with 256 entries, indexed by a single byte; each entry is the
// UTF-16 code unit that byte stands for. Used by `pdf_to_utf8` for strings
// without a UTF-16BE byte-order mark, and as the starting table for font
// encodings that name no base encoding.
// Codes 0x7f, 0x9f and 0xad are stored as 0x0000 here.
// The name keeps the PDF spec's spelling, hence the lint allowance.
#[allow(non_upper_case_globals)]
const PDFDocEncoding: &'static [u16] = &[
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
    0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011,
    0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x02d8, 0x02c7, 0x02c6,
    0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, 0x0020, 0x0021, 0x0022, 0x0023,
    0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
    0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035,
    0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e,
    0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059,
    0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062,
    0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b,
    0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074,
    0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d,
    0x007e, 0x0000, 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192,
    0x2044, 0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, 0x0178,
    0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000, 0x20ac, 0x00a1,
    0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa,
    0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
    0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc,
    0x00bd, 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5,
    0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce,
    0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00e0,
    0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9,
    0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x00f0, 0x00f1, 0x00f2,
    0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb,
    0x00fc, 0x00fd, 0x00fe, 0x00ff];
150
151fn pdf_to_utf8(s: &[u8]) -> String {
152    if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
153        return UTF_16BE.decode_without_bom_handling_and_without_replacement(&s[2..]).unwrap().to_string()
154    } else {
155        let r : Vec<u8> = s.iter().map(|x| *x).flat_map(|x| {
156               let k = PDFDocEncoding[x as usize];
157               vec![(k>>8) as u8, k as u8].into_iter()}).collect();
158        return UTF_16BE.decode_without_bom_handling_and_without_replacement(&r).unwrap().to_string()
159    }
160}
161
162fn to_utf8(encoding: &[u16], s: &[u8]) -> String {
163    if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
164        return UTF_16BE.decode_without_bom_handling_and_without_replacement(&s[2..]).unwrap().to_string()
165    } else {
166        let r : Vec<u8> = s.iter().map(|x| *x).flat_map(|x| {
167            let k = encoding[x as usize];
168            vec![(k>>8) as u8, k as u8].into_iter()}).collect();
169        return UTF_16BE.decode_without_bom_handling_and_without_replacement(&r).unwrap().to_string()
170    }
171}
172
173
174fn maybe_deref<'a>(doc: &'a Document, o: &'a Object) -> &'a Object {
175    match o {
176        &Object::Reference(r) => doc.get_object(r).expect("missing object reference"),
177        _ => o
178    }
179}
180
181fn maybe_get_obj<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Object> {
182    dict.get(key).map(|o| maybe_deref(doc, o)).ok()
183}
184
// An intermediate trait that can be used to chain conversions that may have failed.
// `obj` is the (possibly missing) dictionary entry and `key` is the name it was
// looked up under; the blanket impl for plain `T` uses `key` as its panic message
// when the entry is missing, while the `Option<T>` impl ignores it.
trait FromOptObj<'a> {
    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self;
}
189
// Conditionally convert an object to `Self`; returns `None` if the conversion failed.
// `doc` is passed along so implementations can resolve indirect references.
trait FromObj<'a> where Self: std::marker::Sized {
    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self>;
}
194
195impl<'a, T: FromObj<'a>> FromOptObj<'a> for Option<T> {
196    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, _key: &[u8]) -> Self {
197        obj.and_then(|x| T::from_obj(doc,x))
198    }
199}
200
201impl<'a, T: FromObj<'a>> FromOptObj<'a> for T {
202    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self {
203        T::from_obj(doc, obj.expect(&String::from_utf8_lossy(key))).expect("wrong type")
204    }
205}
206
207// we follow the same conventions as pdfium for when to support indirect objects:
208// on arrays, streams and dicts
209impl<'a, T: FromObj<'a>> FromObj<'a> for Vec<T> {
210    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
211        maybe_deref(doc, obj).as_array().map(|x| x.iter()
212            .map(|x| T::from_obj(doc, x).expect("wrong type"))
213            .collect()).ok()
214    }
215}
216
217// XXX: These will panic if we don't have the right number of items
218// we don't want to do that
219impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 4] {
220    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
221        maybe_deref(doc, obj).as_array().map(|x| {
222            let mut all = x.iter()
223                .map(|x| T::from_obj(doc, x).expect("wrong type"));
224            [all.next().unwrap(), all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
225        }).ok()
226    }
227}
228
229impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 3] {
230    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
231        maybe_deref(doc, obj).as_array().map(|x| {
232            let mut all = x.iter()
233                .map(|x| T::from_obj(doc, x).expect("wrong type"));
234            [all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
235        }).ok()
236    }
237}
238
239impl<'a> FromObj<'a> for f64 {
240    fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
241        match obj {
242            &Object::Integer(i) => Some(i as f64),
243            &Object::Real(f) => Some(f.into()),
244            _ => None
245        }
246    }
247}
248
249impl<'a> FromObj<'a> for i64 {
250    fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
251        match obj {
252            &Object::Integer(i) => Some(i),
253            _ => None
254        }
255    }
256}
257
258impl<'a> FromObj<'a> for &'a Dictionary {
259    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
260        maybe_deref(doc, obj).as_dict().ok()
261    }
262}
263
264impl<'a> FromObj<'a> for &'a Stream {
265    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Stream> {
266        maybe_deref(doc, obj).as_stream().ok()
267    }
268}
269
270impl<'a> FromObj<'a> for &'a Object {
271    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
272        Some(maybe_deref(doc, obj))
273    }
274}
275
276fn get<'a, T: FromOptObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> T {
277    T::from_opt_obj(doc, dict.get(key).ok(), key)
278}
279
280fn maybe_get<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
281    maybe_get_obj(doc, dict, key).and_then(|o| T::from_obj(doc, o))
282}
283
284fn get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> String {
285    pdf_to_utf8(dict.get(key).map(|o| maybe_deref(doc, o)).unwrap_or_else(|_| panic!("deref")).as_name().expect("name"))
286}
287
288#[allow(dead_code)]
289fn maybe_get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<String> {
290    maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok()).map(|n| pdf_to_utf8(n))
291}
292
293fn maybe_get_name<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a [u8]> {
294    maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok())
295}
296
297fn maybe_get_array<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Vec<Object>> {
298    maybe_get_obj(doc, dict, key).and_then(|n| n.as_array().ok())
299}
300
/// A simple font (any font that `make_font` does not treat as Type0 or Type3).
#[derive(Clone)]
struct PdfSimpleFont<'a> {
    /// The font dictionary this font was built from.
    font: &'a Dictionary,
    /// The document that owns `font`; used to resolve indirect references.
    doc: &'a Document,
    /// Table indexed by character code giving a UTF-16 code unit, when one could be determined.
    encoding: Option<Vec<u16>>,
    /// Character code -> text mapping, when one is available.
    unicode_map: Option<HashMap<u32, String>>,
    /// Glyph widths keyed by character code.
    widths: HashMap<CharCode, f64>, // should probably just use i32 here
    /// Value of the font dictionary's `MissingWidth` entry, or 0 when absent.
    missing_width: f64,
}
310
/// A Type3 font (created by `make_font` when the font's `Subtype` is `Type3`).
#[derive(Clone)]
struct PdfType3Font<'a> {
    /// The font dictionary this font was built from.
    font: &'a Dictionary,
    /// The document that owns `font`.
    doc: &'a Document,
    /// Table indexed by character code giving a UTF-16 code unit, when one could be determined.
    encoding: Option<Vec<u16>>,
    /// Character code -> text mapping, when one is available.
    unicode_map: Option<HashMap<u32, String>>,
    /// Glyph widths keyed by character code.
    widths: HashMap<CharCode, f64>, // should probably just use i32 here
}
319
320
321fn make_font<'a>(doc: &'a Document, font: &'a Dictionary) -> Rc<dyn PdfFont + 'a> {
322    let subtype = get_name_string(doc, font, b"Subtype");
323    dlog!("MakeFont({})", subtype);
324    if subtype == "Type0" {
325        Rc::new(PdfCIDFont::new(doc, font))
326    } else if subtype == "Type3" {
327        Rc::new(PdfType3Font::new(doc, font))
328    } else {
329        Rc::new(PdfSimpleFont::new(doc, font))
330    }
331}
332
/// Returns `true` when `name` is exactly one of the fourteen font names listed
/// below. The comparison is case-sensitive.
fn is_core_font(name: &str) -> bool {
    const CORE_FONT_NAMES: [&str; 14] = [
        "Courier-Bold",
        "Courier-BoldOblique",
        "Courier-Oblique",
        "Courier",
        "Helvetica-Bold",
        "Helvetica-BoldOblique",
        "Helvetica-Oblique",
        "Helvetica",
        "Symbol",
        "Times-Bold",
        "Times-BoldItalic",
        "Times-Italic",
        "Times-Roman",
        "ZapfDingbats",
    ];
    CORE_FONT_NAMES.contains(&name)
}
352
353fn encoding_to_unicode_table(name: &[u8]) -> Vec<u16> {
354    let encoding = match &name[..] {
355        b"MacRomanEncoding" => encodings::MAC_ROMAN_ENCODING,
356        b"MacExpertEncoding" => encodings::MAC_EXPERT_ENCODING,
357        b"WinAnsiEncoding" => encodings::WIN_ANSI_ENCODING,
358        _ => panic!("unexpected encoding {:?}", pdf_to_utf8(name))
359    };
360    let encoding_table = encoding.iter()
361        .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
362        .collect();
363    encoding_table
364}
365
366/* "Glyphs in the font are selected by single-byte character codes obtained from a string that
367    is shown by the text-showing operators. Logically, these codes index into a table of 256
368    glyphs; the mapping from codes to glyphs is called the font’s encoding. Each font program
369    has a built-in encoding. Under some circumstances, the encoding can be altered by means
370    described in Section 5.5.5, “Character Encoding.”
371*/
372impl<'a> PdfSimpleFont<'a> {
373    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfSimpleFont<'a> {
374        let base_name = get_name_string(doc, font, b"BaseFont");
375        let subtype = get_name_string(doc, font, b"Subtype");
376
377        let encoding: Option<&Object> = get(doc, font, b"Encoding");
378        dlog!("base_name {} {} enc:{:?} {:?}", base_name, subtype, encoding, font);
379        let descriptor: Option<&Dictionary> = get(doc, font, b"FontDescriptor");
380        let mut type1_encoding = None;
381        let mut unicode_map = None;
382        if let Some(descriptor) = descriptor {
383            dlog!("descriptor {:?}", descriptor);
384            if subtype == "Type1" {
385                let file = maybe_get_obj(doc, descriptor, b"FontFile");
386                match file {
387                    Some(&Object::Stream(ref s)) => {
388                        let s = get_contents(s);
389                        //dlog!("font contents {:?}", pdf_to_utf8(&s));
390                        type1_encoding = Some(type1_encoding_parser::get_encoding_map(&s).expect("encoding"));
391                    }
392                    _ => { dlog!("font file {:?}", file) }
393                }
394            } else if subtype == "TrueType" {
395                let file = maybe_get_obj(doc, descriptor, b"FontFile2");
396                match file {
397                    Some(&Object::Stream(ref s)) => {
398                        let _s = get_contents(s);
399                        //File::create(format!("/tmp/{}", base_name)).unwrap().write_all(&s);
400                    }
401                    _ => { dlog!("font file {:?}", file) }
402                }
403            }
404
405            let font_file3 = get::<Option<&Object>>(doc, descriptor, b"FontFile3");
406            match font_file3 {
407                Some(&Object::Stream(ref s)) => {
408                    let subtype = get_name_string(doc, &s.dict, b"Subtype");
409                    dlog!("font file {}, {:?}", subtype, s);
410                    let s = get_contents(s);
411                    if subtype == "Type1C" {
412                        let table = cff_parser::Table::parse(&s).unwrap();
413                        let charset = table.charset.get_table();
414                        let encoding = table.encoding.get_table();
415                        let mut mapping = HashMap::new();
416                        for i in 0..encoding.len().min(charset.len()) {
417                            let cid = encoding[i];
418                            let sid = charset[i];
419                            let name = cff_parser::string_by_id(&table, sid).unwrap();
420                            let unicode = glyphnames::name_to_unicode(&name).or_else(|| {
421                                zapfglyphnames::zapfdigbats_names_to_unicode(name)
422                            });
423                            if let Some(unicode) = unicode {
424                                let str = String::from_utf16(&[unicode]).unwrap();
425                                mapping.insert(cid as u32, str);
426                            }
427                        }
428                        unicode_map = Some(mapping);
429                        //
430                        //File::create(format!("/tmp/{}", base_name)).unwrap().write_all(&s);
431                    }
432
433                    //
434                    //File::create(format!("/tmp/{}", base_name)).unwrap().write_all(&s);
435                }
436                None => {}
437                _ => { dlog!("unexpected") }
438            }
439
440            let charset = maybe_get_obj(doc, descriptor, b"CharSet");
441            let _charset = match charset {
442                Some(&Object::String(ref s, _)) => { Some(pdf_to_utf8(&s)) }
443                _ => { None }
444            };
445            //dlog!("charset {:?}", charset);
446        }
447
448        let mut unicode_map = match unicode_map {
449            Some(mut unicode_map) => {
450                unicode_map.extend(get_unicode_map(doc, font).unwrap_or(HashMap::new()));
451                Some(unicode_map)
452            }
453            None => {
454                get_unicode_map(doc, font)
455            }
456        };
457
458
459        let mut encoding_table = None;
460        match encoding {
461            Some(&Object::Name(ref encoding_name)) => {
462                dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
463                encoding_table = Some(encoding_to_unicode_table(encoding_name));
464            }
465            Some(&Object::Dictionary(ref encoding)) => {
466                //dlog!("Encoding {:?}", encoding);
467                let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
468                    dlog!("BaseEncoding {:?}", base_encoding);
469                    encoding_to_unicode_table(base_encoding)
470                } else {
471                    Vec::from(PDFDocEncoding)
472                };
473                let differences = maybe_get_array(doc, encoding, b"Differences");
474                if let Some(differences) = differences {
475                    dlog!("Differences");
476                    let mut code = 0;
477                    for o in differences {
478                        let o = maybe_deref(doc, o);
479                        match o {
480                            &Object::Integer(i) => { code = i; },
481                            &Object::Name(ref n) => {
482                                let name = pdf_to_utf8(&n);
483                                // XXX: names of Type1 fonts can map to arbitrary strings instead of real
484                                // unicode names, so we should probably handle this differently
485                                let unicode = glyphnames::name_to_unicode(&name);
486                                if let Some(unicode) = unicode{
487                                    table[code as usize] = unicode;
488                                    if let Some(ref mut unicode_map) = unicode_map {
489                                        let be = [unicode];
490                                        match unicode_map.entry(code as u32) {
491                                            // If there's a unicode table entry missing use one based on the name
492                                            Entry::Vacant(v) => { v.insert(String::from_utf16(&be).unwrap()); }
493                                            Entry::Occupied(e) => {
494                                                if e.get() != &String::from_utf16(&be).unwrap() {
495                                                    let normal_match  = e.get().nfkc().eq(String::from_utf16(&be).unwrap().nfkc());
496                                                    println!("Unicode mismatch {} {} {:?} {:?} {:?}", normal_match, name, e.get(), String::from_utf16(&be), be);
497                                                }
498                                            }
499                                        }
500                                    }
501                                } else {
502                                    match unicode_map {
503                                        Some(ref mut unicode_map) if base_name.contains("FontAwesome") => {
504                                            // the fontawesome tex package will use glyph names that don't have a corresponding unicode
505                                            // code point, so we'll use an empty string instead. See issue #76
506                                            match unicode_map.entry(code as u32) {
507                                                Entry::Vacant(v) => { v.insert("".to_owned()); }
508                                                Entry::Occupied(e) => {
509                                                    panic!("unexpected entry in unicode map")
510                                                }
511                                            }
512                                        }
513                                        _ => {
514                                            println!("unknown glyph name '{}' for font {}", name, base_name);
515                                        }
516                                    }
517                                }
518                                dlog!("{} = {} ({:?})", code, name, unicode);
519                                if let Some(ref mut unicode_map) = unicode_map {
520                                    // The unicode map might not have the code in it, but the code might
521                                    // not be used so we don't want to panic here.
522                                    // An example of this is the 'suppress' character in the TeX Latin Modern font.
523                                    // This shows up in https://arxiv.org/pdf/2405.01295v1.pdf
524                                    dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
525                                }
526                                code += 1;
527                            }
528                            _ => { panic!("wrong type {:?}", o); }
529                        }
530                    }
531                }
532                // "Type" is optional
533                let name = encoding.get(b"Type").and_then(|x| x.as_name()).and_then(|x| Ok(pdf_to_utf8(x)));
534                dlog!("name: {}", name);
535
536                encoding_table = Some(table);
537            }
538            None => {
539                if let Some(type1_encoding) = type1_encoding {
540                    let mut table = Vec::from(PDFDocEncoding);
541                    dlog!("type1encoding");
542                    for (code, name) in type1_encoding {
543                        let unicode = glyphnames::name_to_unicode(&pdf_to_utf8(&name));
544                        if let Some(unicode) = unicode {
545                            table[code as usize] = unicode;
546                        } else {
547                            dlog!("unknown character {}", pdf_to_utf8(&name));
548                        }
549                    }
550                    encoding_table = Some(table)
551                } else if subtype == "TrueType" {
552                    encoding_table = Some(encodings::WIN_ANSI_ENCODING.iter()
553                        .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
554                        .collect());
555                }
556            }
557            _ => { panic!() }
558        }
559
560        let mut width_map = HashMap::new();
561        /* "Ordinarily, a font dictionary that refers to one of the standard fonts
562            should omit the FirstChar, LastChar, Widths, and FontDescriptor entries.
563            However, it is permissible to override a standard font by including these
564            entries and embedding the font program in the PDF file."
565
566            Note: some PDFs include a descriptor but still don't include these entries */
567
568        // If we have widths prefer them over the core font widths. Needed for https://dkp.de/wp-content/uploads/parteitage/Sozialismusvorstellungen-der-DKP.pdf
569        if let (Some(first_char), Some(last_char), Some(widths)) = (maybe_get::<i64>(doc, font, b"FirstChar"), maybe_get::<i64>(doc, font, b"LastChar"), maybe_get::<Vec<f64>>(doc, font, b"Widths")) {
570            // Some PDF's don't have these like fips-197.pdf
571            let mut i: i64 = 0;
572            dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
573
574            for w in widths {
575                width_map.insert((first_char + i) as CharCode, w);
576                i += 1;
577            }
578            assert_eq!(first_char + i - 1, last_char);
579        } else {
580            let name = if is_core_font(&base_name) {
581                &base_name
582            } else {
583                println!("no widths and not core font {:?}", base_name);
584
585                // This situation is handled differently by different readers
586                // but basically we try to substitute the best font that we can.
587
588                // Poppler/Xpdf:
589                // this is technically an error -- the Widths entry is required
590                // for all but the Base-14 fonts -- but certain PDF generators
591                // apparently don't include widths for Arial and TimesNewRoman
592
593                // Pdfium: CFX_FontMapper::FindSubstFont
594
595                // mupdf: pdf_load_substitute_font
596
597                // We can try to do a better job guessing at a font by looking at the flags
598                // or the basename but for now we'll just use Helvetica
599                "Helvetica"
600            };
601            for font_metrics in core_fonts::metrics().iter() {
602                if font_metrics.0 == base_name {
603                    if let Some(ref encoding) = encoding_table {
604                        dlog!("has encoding");
605                        for w in font_metrics.2 {
606                            let c = glyphnames::name_to_unicode(w.2).unwrap();
607                            for i in 0..encoding.len() {
608                                if encoding[i] == c {
609                                    width_map.insert(i as CharCode, w.1 as f64);
610                                }
611                            }
612                        }
613                    } else {
614                        // Instead of using the encoding from the core font we'll just look up all
615                        // of the character names. We should probably verify that this produces the
616                        // same result.
617
618                        let mut table = vec![0; 256];
619                        for w in font_metrics.2 {
620                            dlog!("{} {}", w.0, w.2);
621                            // -1 is "not encoded"
622                            if w.0 != -1 {
623                                table[w.0 as usize] = if base_name == "ZapfDingbats" {
624                                    zapfglyphnames::zapfdigbats_names_to_unicode(w.2).unwrap_or_else(|| panic!("bad name {:?}", w))
625                                } else {
626                                    glyphnames::name_to_unicode(w.2).unwrap()
627                                }
628                            }
629                        }
630
631                        let encoding = &table[..];
632                        for w in font_metrics.2 {
633                            width_map.insert(w.0 as CharCode, w.1 as f64);
634                            // -1 is "not encoded"
635                        }
636                        encoding_table = Some(encoding.to_vec());
637                    }
638                    /* "Ordinarily, a font dictionary that refers to one of the standard fonts
639                        should omit the FirstChar, LastChar, Widths, and FontDescriptor entries.
640                        However, it is permissible to override a standard font by including these
641                        entries and embedding the font program in the PDF file."
642
643                        Note: some PDFs include a descriptor but still don't include these entries */
644                    // assert!(maybe_get_obj(doc, font, b"FirstChar").is_none());
645                    // assert!(maybe_get_obj(doc, font, b"LastChar").is_none());
646                    // assert!(maybe_get_obj(doc, font, b"Widths").is_none());
647                }
648            }
649        }
650
651        let missing_width = get::<Option<f64>>(doc, font, b"MissingWidth").unwrap_or(0.);
652        PdfSimpleFont {doc, font, widths: width_map, encoding: encoding_table, missing_width, unicode_map}
653    }
654
655    #[allow(dead_code)]
656    fn get_type(&self) -> String {
657        get_name_string(self.doc, self.font, b"Type")
658    }
659    #[allow(dead_code)]
660    fn get_basefont(&self) -> String {
661        get_name_string(self.doc, self.font, b"BaseFont")
662    }
663    #[allow(dead_code)]
664    fn get_subtype(&self) -> String {
665        get_name_string(self.doc, self.font, b"Subtype")
666    }
667    #[allow(dead_code)]
668    fn get_widths(&self) -> Option<&Vec<Object>> {
669        maybe_get_obj(self.doc, self.font, b"Widths").map(|widths| widths.as_array().expect("Widths should be an array"))
670    }
671    /* For type1: This entry is obsolescent and its use is no longer recommended. (See
672     * implementation note 42 in Appendix H.) */
673    #[allow(dead_code)]
674    fn get_name(&self) -> Option<String> {
675        maybe_get_name_string(self.doc, self.font, b"Name")
676    }
677
678    #[allow(dead_code)]
679    fn get_descriptor(&self) -> Option<PdfFontDescriptor> {
680        maybe_get_obj(self.doc, self.font, b"FontDescriptor").and_then(|desc| desc.as_dict().ok()).map(|desc| PdfFontDescriptor{desc: desc, doc: self.doc})
681    }
682}
683
684
685
686impl<'a> PdfType3Font<'a> {
687    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfType3Font<'a> {
688
689        let unicode_map = get_unicode_map(doc, font);
690        let encoding: Option<&Object> = get(doc, font, b"Encoding");
691
692        let encoding_table;
693        match encoding {
694            Some(&Object::Name(ref encoding_name)) => {
695                dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
696                encoding_table = Some(encoding_to_unicode_table(encoding_name));
697            }
698            Some(&Object::Dictionary(ref encoding)) => {
699                //dlog!("Encoding {:?}", encoding);
700                let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
701                    dlog!("BaseEncoding {:?}", base_encoding);
702                    encoding_to_unicode_table(base_encoding)
703                } else {
704                    Vec::from(PDFDocEncoding)
705                };
706                let differences = maybe_get_array(doc, encoding, b"Differences");
707                if let Some(differences) = differences {
708                    dlog!("Differences");
709                    let mut code = 0;
710                    for o in differences {
711                        match o {
712                            &Object::Integer(i) => { code = i; },
713                            &Object::Name(ref n) => {
714                                let name = pdf_to_utf8(&n);
715                                // XXX: names of Type1 fonts can map to arbitrary strings instead of real
716                                // unicode names, so we should probably handle this differently
717                                let unicode = glyphnames::name_to_unicode(&name);
718                                if let Some(unicode) = unicode{
719                                    table[code as usize] = unicode;
720                                }
721                                dlog!("{} = {} ({:?})", code, name, unicode);
722                                if let Some(ref unicode_map) = unicode_map {
723                                    dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
724                                }
725                                code += 1;
726                            }
727                            _ => { panic!("wrong type"); }
728                        }
729                    }
730                }
731                let name_encoded = encoding.get(b"Type");
732                if let Ok(Object::Name(name)) = name_encoded {
733                    dlog!("name: {}", pdf_to_utf8(name));
734                } else {
735                    dlog!("name not found");
736                }
737
738                encoding_table = Some(table);
739            }
740            _ => { panic!() }
741        }
742
743        let first_char: i64 = get(doc, font, b"FirstChar");
744        let last_char: i64 = get(doc, font, b"LastChar");
745        let widths: Vec<f64> = get(doc, font, b"Widths");
746
747        let mut width_map = HashMap::new();
748
749        let mut i = 0;
750        dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
751
752        for w in widths {
753            width_map.insert((first_char + i) as CharCode, w);
754            i += 1;
755        }
756        assert_eq!(first_char + i - 1, last_char);
757        PdfType3Font {doc, font, widths: width_map, encoding: encoding_table, unicode_map}
758    }
759}
760
/// Character code as produced by `PdfFont::next_char`; it is the key used for width lookups
/// and for decoding to text.
type CharCode = u32;
762
/// Iterator over the character codes of a byte string, split according to a particular font.
struct PdfFontIter<'a>
{
    // Remaining bytes of the string being decoded.
    i: Iter<'a, u8>,
    // Font whose `next_char` decides how many bytes make up each code.
    font: &'a dyn PdfFont,
}
768
769impl<'a> Iterator for PdfFontIter<'a> {
770    type Item = (CharCode, u8);
771    fn next(&mut self) -> Option<(CharCode, u8)> {
772        self.font.next_char(&mut self.i)
773    }
774}
775
/// Operations every supported font kind provides for text extraction.
trait PdfFont : Debug {
    /// Width of the glyph for character code `id`. `show_text` divides the value by 1000
    /// before using it.
    fn get_width(&self, id: CharCode) -> f64;
    /// Consumes the bytes of the next character from `iter`, returning its code together with
    /// the code's length in bytes, or `None` when no further character can be read.
    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)>;
    /// Text that the character code `char` decodes to.
    fn decode_char(&self, char: CharCode) -> String;

        /*fn char_codes<'a>(&'a self, chars: &'a [u8]) -> PdfFontIter {
            let p = self;
            PdfFontIter{i: chars.iter(), font: p as &PdfFont}
        }*/

}
787
788impl<'a> dyn PdfFont + 'a {
789    fn char_codes(&'a self, chars: &'a [u8]) -> PdfFontIter {
790        PdfFontIter{i: chars.iter(), font: self}
791    }
792    fn decode(&self, chars: &[u8]) -> String {
793        let strings = self.char_codes(chars).map(|x| self.decode_char(x.0)).collect::<Vec<_>>();
794        strings.join("")
795    }
796
797}
798
799
800impl<'a> PdfFont for PdfSimpleFont<'a> {
801    fn get_width(&self, id: CharCode) -> f64 {
802        let width = self.widths.get(&id);
803        if let Some(width) = width {
804            return *width;
805        } else {
806            let mut widths = self.widths.iter().collect::<Vec<_>>();
807            widths.sort_by_key(|x| x.0);
808            dlog!("missing width for {} len(widths) = {}, {:?} falling back to missing_width {:?}", id, self.widths.len(), widths, self.font);
809            return self.missing_width;
810        }
811    }
812    /*fn decode(&self, chars: &[u8]) -> String {
813        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
814        to_utf8(encoding, chars)
815    }*/
816
817    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
818        iter.next().map(|x| (*x as CharCode, 1))
819    }
820    fn decode_char(&self, char: CharCode) -> String {
821        let slice = [char as u8];
822        if let Some(ref unicode_map) = self.unicode_map {
823            let s = unicode_map.get(&char);
824            let s = match s {
825                None => {
826                    println!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
827                    // some pdf's like http://arxiv.org/pdf/2312.00064v1 are missing entries in their unicode map but do have
828                    // entries in the encoding.
829                    let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
830                    let s = to_utf8(encoding, &slice);
831                    println!("falling back to encoding {} -> {:?}", char, s);
832                    s
833                }
834                Some(s) => { s.clone() }
835            };
836            return s
837        }
838        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
839        //dlog!("char_code {:?} {:?}", char, self.encoding);
840        let s = to_utf8(encoding, &slice);
841        s
842    }
843}
844
845
846
847impl<'a> fmt::Debug for PdfSimpleFont<'a> {
848    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
849        self.font.fmt(f)
850    }
851}
852
853impl<'a> PdfFont for PdfType3Font<'a> {
854    fn get_width(&self, id: CharCode) -> f64 {
855        let width = self.widths.get(&id);
856        if let Some(width) = width {
857            return *width;
858        } else {
859            panic!("missing width for {} {:?}", id, self.font);
860        }
861    }
862    /*fn decode(&self, chars: &[u8]) -> String {
863        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
864        to_utf8(encoding, chars)
865    }*/
866
867    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
868        iter.next().map(|x| (*x as CharCode, 1))
869    }
870    fn decode_char(&self, char: CharCode) -> String {
871        let slice = [char as u8];
872        if let Some(ref unicode_map) = self.unicode_map {
873            let s = unicode_map.get(&char);
874            let s = match s {
875                None => {
876                    println!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
877                    // some pdf's like http://arxiv.org/pdf/2312.00577v1 are missing entries in their unicode map but do have
878                    // entries in the encoding.
879                    let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
880                    let s = to_utf8(encoding, &slice);
881                    println!("falling back to encoding {} -> {:?}", char, s);
882                    s
883                }
884                Some(s) => { s.clone() }
885            };
886            return s
887        }
888        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
889        //dlog!("char_code {:?} {:?}", char, self.encoding);
890        let s = to_utf8(encoding, &slice);
891        s
892    }
893}
894
895
896
897impl<'a> fmt::Debug for PdfType3Font<'a> {
898    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
899        self.font.fmt(f)
900    }
901}
902
/// Composite font, built by `PdfCIDFont::new` from a font dictionary that has
/// `DescendantFonts` and `Encoding` entries.
struct PdfCIDFont<'a> {
    // The font dictionary passed to `new`.
    font: &'a Dictionary,
    #[allow(dead_code)]
    doc: &'a Document,
    // Byte mapping derived from the `Encoding` entry (an Identity name or a CMap stream).
    #[allow(dead_code)]
    encoding: ByteMapping,
    // Parsed `ToUnicode` map, when the font has one.
    to_unicode: Option<HashMap<u32, String>>,
    // Widths taken from the descendant font's `W` array.
    widths: HashMap<CharCode, f64>, // should probably just use i32 here
    // Taken from the descendant font's `DW` entry; `new` uses 1000 when it is absent.
    default_width: Option<f64>, // only used for CID fonts and we should probably break out the different font types
}
913
914fn get_unicode_map<'a>(doc: &'a Document, font: &'a Dictionary) -> Option<HashMap<u32, String>> {
915    let to_unicode = maybe_get_obj(doc, font, b"ToUnicode");
916    dlog!("ToUnicode: {:?}", to_unicode);
917    let mut unicode_map = None;
918    match to_unicode {
919        Some(&Object::Stream(ref stream)) => {
920            let contents = get_contents(stream);
921            dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
922
923            let cmap = adobe_cmap_parser::get_unicode_map(&contents).unwrap();
924            let mut unicode = HashMap::new();
925            // "It must use the beginbfchar, endbfchar, beginbfrange, and endbfrange operators to
926            // define the mapping from character codes to Unicode character sequences expressed in
927            // UTF-16BE encoding."
928            for (&k, v) in cmap.iter() {
929                let mut be: Vec<u16> = Vec::new();
930                let mut i = 0;
931                assert!(v.len() % 2 == 0);
932                while i < v.len() {
933                    be.push(((v[i] as u16) << 8) | v[i+1] as u16);
934                    i += 2;
935                }
936                match &be[..] {
937                    [0xd800 ..= 0xdfff] => {
938                        // this range is not specified as not being encoded
939                        // we ignore them so we don't an error from from_utt16
940                        continue;
941                    }
942                    _ => {}
943                }
944                let s = String::from_utf16(&be).unwrap();
945
946                unicode.insert(k, s);
947            }
948            unicode_map = Some(unicode);
949
950            dlog!("map: {:?}", unicode_map);
951        }
952        None => { }
953        Some(&Object::Name(ref name)) => {
954            let name = pdf_to_utf8(name);
955            if name != "Identity-H" {
956                todo!("unsupported ToUnicode name: {:?}", name);
957            }
958        }
959        _ => { panic!("unsupported cmap {:?}", to_unicode)}
960    }
961    unicode_map
962}
963
964
965impl<'a> PdfCIDFont<'a> {
966    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfCIDFont<'a> {
967        let base_name = get_name_string(doc, font, b"BaseFont");
968        let descendants = maybe_get_array(doc, font, b"DescendantFonts").expect("Descendant fonts required");
969        let ciddict = maybe_deref(doc, &descendants[0]).as_dict().expect("should be CID dict");
970        let encoding = maybe_get_obj(doc, font, b"Encoding").expect("Encoding required in type0 fonts");
971        dlog!("base_name {} {:?}", base_name, font);
972
973        let encoding = match encoding {
974            &Object::Name(ref name) => {
975                let name = pdf_to_utf8(name);
976                dlog!("encoding {:?}", name);
977                if name == "Identity-H" || name == "Identity-V" {
978                    ByteMapping { codespace: vec![CodeRange{width: 2, start: 0, end: 0xffff }], cid: vec![CIDRange{ src_code_lo: 0, src_code_hi: 0xffff, dst_CID_lo: 0 }]}
979                } else {
980                    panic!("unsupported encoding {}", name);
981                }
982            }
983            &Object::Stream(ref stream) => {
984                let contents = get_contents(stream);
985                dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
986                adobe_cmap_parser::get_byte_mapping(&contents).unwrap()
987            }
988            _ => { panic!("unsupported encoding {:?}", encoding)}
989        };
990
991        // Sometimes a Type0 font might refer to the same underlying data as regular font. In this case we may be able to extract some encoding
992        // data.
993        // We should also look inside the truetype data to see if there's a cmap table. It will help us convert as well.
994        // This won't work if the cmap has been subsetted. A better approach might be to hash glyph contents and use that against
995        // a global library of glyph hashes
996        let unicode_map = get_unicode_map(doc, font);
997
998        dlog!("descendents {:?} {:?}", descendants, ciddict);
999
1000        let font_dict = maybe_get_obj(doc, ciddict, b"FontDescriptor").expect("required");
1001        dlog!("{:?}", font_dict);
1002        let _f = font_dict.as_dict().expect("must be dict");
1003        let default_width = get::<Option<i64>>(doc, ciddict, b"DW").unwrap_or(1000);
1004        let w: Option<Vec<&Object>> = get(doc, ciddict, b"W");
1005        dlog!("widths {:?}", w);
1006        let mut widths = HashMap::new();
1007        let mut i = 0;
1008        if let Some(w) = w {
1009            while i < w.len() {
1010                if let &Object::Array(ref wa) = w[i+1] {
1011                    let cid = w[i].as_i64().expect("id should be num");
1012                    let mut j = 0;
1013                    dlog!("wa: {:?} -> {:?}", cid, wa);
1014                    for w in wa {
1015                        widths.insert((cid + j) as CharCode, as_num(w) );
1016                        j += 1;
1017                    }
1018                    i += 2;
1019                } else {
1020                    let c_first = w[i].as_i64().expect("first should be num");
1021                    let c_last = w[i].as_i64().expect("last should be num");
1022                    let c_width = as_num(&w[i]);
1023                    for id in c_first..c_last {
1024                        widths.insert(id as CharCode, c_width);
1025                    }
1026                    i += 3;
1027                }
1028            }
1029        }
1030        PdfCIDFont{doc, font, widths, to_unicode: unicode_map, encoding, default_width: Some(default_width as f64) }
1031    }
1032}
1033
1034impl<'a> PdfFont for PdfCIDFont<'a> {
1035    fn get_width(&self, id: CharCode) -> f64 {
1036        let width = self.widths.get(&id);
1037        if let Some(width) = width {
1038            dlog!("GetWidth {} -> {}", id, *width);
1039            return *width;
1040        } else {
1041            dlog!("missing width for {} falling back to default_width", id);
1042            return self.default_width.unwrap();
1043        }
1044    }/*
1045    fn decode(&self, chars: &[u8]) -> String {
1046        self.char_codes(chars);
1047
1048        //let utf16 = Vec::new();
1049
1050        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
1051        to_utf8(encoding, chars)
1052    }*/
1053
1054    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
1055        let mut c = *iter.next()? as u32;
1056        let mut code = None;
1057        'outer: for width in 1..=4 {
1058            for range in &self.encoding.codespace {
1059                if c as u32 >= range.start && c as u32 <= range.end && range.width == width {
1060                    code = Some((c as u32, width));
1061                    break 'outer;
1062                }
1063            }
1064            let next = *iter.next()?;
1065            c = ((c as u32) << 8) | next as u32;
1066        }
1067        let code = code?;
1068        for range in &self.encoding.cid {
1069            if code.0 >= range.src_code_lo && code.0 <= range.src_code_hi {
1070                return Some((code.0 + range.dst_CID_lo, code.1 as u8));
1071            }
1072        }
1073        None
1074    }
1075    fn decode_char(&self, char: CharCode) -> String {
1076        let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
1077        if let Some(s) = s {
1078            s.clone()
1079        } else {
1080            dlog!("Unknown character {:?} in {:?} {:?}", char, self.font, self.to_unicode);
1081            "".to_string()
1082        }
1083    }
1084}
1085
1086impl<'a> fmt::Debug for PdfCIDFont<'a> {
1087    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1088        self.font.fmt(f)
1089    }
1090}
1091
1092
1093
/// Borrowed view of a font's `FontDescriptor` dictionary together with the document that
/// owns it.
#[derive(Copy, Clone)]
struct PdfFontDescriptor<'a> {
    // The descriptor dictionary itself.
    desc: &'a Dictionary,
    // Document passed to lookups on `desc`.
    doc: &'a Document
}
1099
impl<'a> PdfFontDescriptor<'a> {
    /// Returns the descriptor's `FontFile` entry as looked up by `maybe_get_obj`, or `None`
    /// when that lookup yields nothing.
    #[allow(dead_code)]
    fn get_file(&self) -> Option<&'a Object> {
        maybe_get_obj(self.doc, self.desc, b"FontFile")
    }
}
1106
1107impl<'a> fmt::Debug for PdfFontDescriptor<'a> {
1108    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1109        self.desc.fmt(f)
1110    }
1111}
1112
/// Sampled (type 0) function, as read by `Function::new` from the function's stream.
#[derive(Clone, Debug)]
struct Type0Func {
    // `Domain` entry.
    domain: Vec<f64>,
    // `Range` entry.
    range: Vec<f64>,
    // Stream data as returned by `get_contents`.
    contents: Vec<u8>,
    // `Size` entry.
    size: Vec<i64>,
    // `BitsPerSample` entry.
    bits_per_sample: i64,
    // `Encode` entry; when absent `Function::new` fills in `[0, size - 1]` per `Size` element.
    encode: Vec<f64>,
    // `Decode` entry; when absent `Function::new` uses a copy of `range`.
    decode: Vec<f64>,
}
1123
/// Linearly maps `x` from the interval `[x_min, x_max]` onto `[y_min, y_max]`:
///
/// `y_min + (x - x_min) * (y_max - y_min) / (x_max - x_min)`
///
/// `x` is not clamped, so values outside the source interval extrapolate. When the source
/// interval is degenerate (`x_max == x_min`) the result is `y_min`.
#[allow(dead_code)]
fn interpolate(x: f64, x_min: f64, x_max: f64, y_min: f64, y_max: f64) -> f64 {
    let divisor = x_max - x_min;
    if divisor != 0. {
        y_min + (x - x_min) * ((y_max - y_min) / divisor)
    } else {
        // The source interval has zero width, which means we want to discard the
        // interpolation and arbitrarily choose y_min to match pdfium
        y_min
    }
}
1135
1136impl Type0Func {
1137    #[allow(dead_code)]
1138    fn eval(&self, _input: &[f64], _output: &mut [f64]) {
1139        let _n_inputs = self.domain.len() / 2;
1140        let _n_ouputs = self.range.len() / 2;
1141
1142    }
1143}
1144
/// Exponential interpolation (type 2) function, as read by `Function::new`.
#[derive(Clone, Debug)]
struct Type2Func {
    // Optional `C0` entry.
    c0: Option<Vec<f64>>,
    // Optional `C1` entry.
    c1: Option<Vec<f64>>,
    // `N` entry.
    n: f64,
}
1151
/// A PDF function object, distinguished by its `FunctionType` entry (see `Function::new`).
#[derive(Clone, Debug)]
enum Function {
    /// `FunctionType` 0: sampled function.
    Type0(Type0Func),
    /// `FunctionType` 2: exponential interpolation function.
    Type2(Type2Func),
    /// `FunctionType` 3: stitching function; none of its parameters are kept.
    #[allow(dead_code)]
    Type3,
    /// `FunctionType` 4: PostScript calculator function; holds the stream contents.
    #[allow(dead_code)]
    Type4(Vec<u8>)
}
1161
1162impl Function {
1163    fn new(doc: &Document, obj: &Object) -> Function {
1164        let dict = match obj {
1165            &Object::Dictionary(ref dict) => dict,
1166            &Object::Stream(ref stream) => &stream.dict,
1167            _ => panic!()
1168        };
1169        let function_type: i64 = get(doc, dict, b"FunctionType");
1170        let f = match function_type {
1171            0 => {
1172                // Sampled function
1173                let stream = match obj {
1174                    &Object::Stream(ref stream) => stream,
1175                    _ => panic!()
1176                };
1177                let range: Vec<f64> = get(doc, dict, b"Range");
1178                let domain: Vec<f64> = get(doc, dict, b"Domain");
1179                let contents = get_contents(stream);
1180                let size: Vec<i64> = get(doc, dict, b"Size");
1181                let bits_per_sample = get(doc, dict, b"BitsPerSample");
1182                // We ignore 'Order' like pdfium, poppler and pdf.js
1183
1184                let encode = get::<Option<Vec<f64>>>(doc, dict, b"Encode");
1185                // maybe there's some better way to write this.
1186                let encode = encode.unwrap_or_else(|| {
1187                    let mut default = Vec::new();
1188                    for i in &size {
1189                        default.extend([0., (i - 1) as f64].iter());
1190                    }
1191                    default
1192                });
1193                let decode = get::<Option<Vec<f64>>>(doc, dict, b"Decode").unwrap_or_else(|| range.clone());
1194
1195                Function::Type0(Type0Func { domain, range, size, contents, bits_per_sample, encode, decode })
1196            }
1197            2 => {
1198                // Exponential interpolation function
1199                let c0 = get::<Option<Vec<f64>>>(doc, dict, b"C0");
1200                let c1 = get::<Option<Vec<f64>>>(doc, dict, b"C1");
1201                let n = get::<f64>(doc, dict, b"N");
1202                Function::Type2(Type2Func { c0, c1, n})
1203            }
1204            3 => {
1205                // Stitching function
1206                Function::Type3
1207            }
1208            4 => {
1209                // PostScript calculator function
1210                let contents = match obj {
1211                    &Object::Stream(ref stream) => {
1212                        let contents = get_contents(stream);
1213                        println!("unhandled type-4 function");
1214                        println!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
1215                        contents
1216                    }
1217                    _ => { panic!("type 4 functions should be streams") }
1218                };
1219                Function::Type4(contents)
1220            }
1221            _ => { panic!("unhandled function type {}", function_type) }
1222        };
1223        f
1224    }
1225}
1226
1227fn as_num(o: &Object) -> f64 {
1228    match o {
1229        &Object::Integer(i) => { i as f64 }
1230        &Object::Real(f) => { f.into() }
1231        _ => { panic!("not a number") }
1232    }
1233}
1234
/// Text-related part of the graphics state, read and advanced by `show_text`.
#[derive(Clone)]
struct TextState<'a>
{
    // Currently selected font; `show_text` unwraps it.
    font: Option<Rc<dyn PdfFont + 'a>>,
    // Multiplies the glyph width when advancing and is passed to the output device.
    font_size: f64,
    // Added to the advance of every character.
    character_spacing: f64,
    // Added on top of `character_spacing` for the single-byte code 32.
    word_spacing: f64,
    // Horizontal scale factor of the text-space matrix and of each advance.
    horizontal_scaling: f64,
    leading: f64,
    // Vertical offset used in the text-space matrix.
    rise: f64,
    // Text matrix; `show_text` translates it after every character.
    tm: Transform,
}
1247
1248// XXX: We'd ideally implement this without having to copy the uncompressed data
1249fn get_contents(contents: &Stream) -> Vec<u8> {
1250    if contents.filter().is_ok() {
1251        contents.decompressed_content().unwrap_or_else(|_|contents.content.clone())
1252    } else {
1253        contents.content.clone()
1254    }
1255}
1256
/// Graphics state tracked while processing content.
#[derive(Clone)]
struct GraphicsState<'a>
{
    // Current transformation matrix; combined with the text matrix in `show_text`.
    ctm: Transform,
    // Text state.
    ts: TextState<'a>,
    // Soft mask dictionary; `apply_state` sets it from an `SMask` entry (`None` for the
    // name `None`).
    smask: Option<Dictionary>,
    fill_colorspace: ColorSpace,
    fill_color: Vec<f64>,
    stroke_colorspace: ColorSpace,
    stroke_color: Vec<f64>,
    line_width: f64,
}
1269
1270fn show_text(gs: &mut GraphicsState, s: &[u8],
1271             _tlm: &Transform,
1272             _flip_ctm: &Transform,
1273             output: &mut dyn OutputDev) -> Result<(), OutputError> {
1274    let ts = &mut gs.ts;
1275    let font = ts.font.as_ref().unwrap();
1276    //let encoding = font.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
1277    dlog!("{:?}", font.decode(s));
1278    dlog!("{:?}", font.decode(s).as_bytes());
1279    dlog!("{:?}", s);
1280    output.begin_word()?;
1281
1282    for (c, length) in font.char_codes(s) {
1283        // 5.3.3 Text Space Details
1284        let tsm = Transform2D::row_major(ts.horizontal_scaling,
1285                                                 0.,
1286                                                 0.,
1287                                                 1.0,
1288                                                 0.,
1289                                                 ts.rise);
1290        // Trm = Tsm × Tm × CTM
1291        let trm = tsm.post_transform(&ts.tm.post_transform(&gs.ctm));
1292        //dlog!("ctm: {:?} tm {:?}", gs.ctm, tm);
1293        //dlog!("current pos: {:?}", position);
1294        // 5.9 Extraction of Text Content
1295
1296
1297        //dlog!("w: {}", font.widths[&(*c as i64)]);
1298        let w0 = font.get_width(c) / 1000.;
1299
1300        let mut spacing = ts.character_spacing;
1301        // "Word spacing is applied to every occurrence of the single-byte character code 32 in a
1302        //  string when using a simple font or a composite font that defines code 32 as a
1303        //  single-byte code. It does not apply to occurrences of the byte value 32 in
1304        //  multiple-byte codes."
1305        let is_space = c == 32 && length == 1;
1306        if is_space { spacing += ts.word_spacing }
1307
1308        output.output_character(&trm, w0, spacing, ts.font_size, &font.decode_char(c))?;
1309        let tj = 0.;
1310        let ty = 0.;
1311        let tx = ts.horizontal_scaling * ((w0 - tj/1000.)* ts.font_size + spacing);
1312        dlog!("horizontal {} adjust {} {} {} {}", ts.horizontal_scaling, tx, w0, ts.font_size, spacing);
1313        // dlog!("w0: {}, tx: {}", w0, tx);
1314        ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1315        let _trm = ts.tm.pre_transform(&gs.ctm);
1316        //dlog!("post pos: {:?}", trm);
1317
1318    }
1319    output.end_word()?;
1320    Ok(())
1321}
1322
/// Page media box given as four coordinates.
// NOTE(review): field names suggest lower-left (`llx`, `lly`) and upper-right (`urx`, `ury`)
// corners — confirm against the code that fills this in.
#[derive(Debug, Clone, Copy)]
pub struct MediaBox {
    pub llx: f64,
    pub lly: f64,
    pub urx: f64,
    pub ury: f64
}
1330
1331fn apply_state(doc: &Document, gs: &mut GraphicsState, state: &Dictionary) {
1332    for (k, v) in state.iter() {
1333        let k : &[u8] = k.as_ref();
1334        match k {
1335            b"SMask" => { match maybe_deref(doc, v)  {
1336                &Object::Name(ref name) => {
1337                    if name == b"None" {
1338                        gs.smask = None;
1339                    } else {
1340                        panic!("unexpected smask name")
1341                    }
1342                }
1343                &Object::Dictionary(ref dict) => {
1344                    gs.smask = Some(dict.clone());
1345                }
1346                _ => { panic!("unexpected smask type {:?}", v) }
1347            }}
1348            b"Type" => { match v {
1349                &Object::Name(ref name) => {
1350                    assert_eq!(name, b"ExtGState")
1351                }
1352                _ => { panic!("unexpected type") }
1353            }}
1354            _ => {  dlog!("unapplied state: {:?} {:?}", k, v); }
1355        }
1356    }
1357
1358}
1359
/// A single path construction operation.
#[derive(Debug)]
pub enum PathOp {
    /// Move to the point `(x, y)`.
    MoveTo(f64, f64),
    /// Line to the point `(x, y)`.
    LineTo(f64, f64),
    // XXX: is it worth distinguishing the different kinds of curve ops?
    /// Curve; the last two values are the end point (see `Path::current_point`).
    CurveTo(f64, f64, f64, f64, f64, f64),
    // NOTE(review): presumably x, y, width, height as in the PDF `re` operator — confirm.
    Rect(f64, f64, f64, f64),
    Close,
}
1369
/// A path, stored as the ordered list of its construction operations.
#[derive(Debug)]
pub struct Path {
    pub ops: Vec<PathOp>
}
1374
1375impl Path {
1376    fn new() -> Path {
1377        Path { ops: Vec::new() }
1378    }
1379    fn current_point(&self) -> (f64, f64) {
1380        match self.ops.last().unwrap() {
1381            &PathOp::MoveTo(x, y) => { (x, y) }
1382            &PathOp::LineTo(x, y) => { (x, y) }
1383            &PathOp::CurveTo(_, _, _, _, x, y) => { (x, y) }
1384            _ => { panic!() }
1385        }
1386    }
1387}
1388
/// Parameters of a `CalGray` colour space, read from its dictionary by `make_colorspace`.
#[derive(Clone, Debug)]
pub struct CalGray {
    // `WhitePoint` entry.
    white_point: [f64; 3],
    // NOTE(review): `make_colorspace` reads this from the key `BackPoint`; the PDF key is
    // presumably `BlackPoint` — confirm.
    black_point: Option<[f64; 3]>,
    // `Gamma` entry.
    gamma: Option<f64>,
}
1395
/// Parameters of a `CalRGB` colour space, read from its dictionary by `make_colorspace`.
#[derive(Clone, Debug)]
pub struct CalRGB {
    // `WhitePoint` entry.
    white_point: [f64; 3],
    // NOTE(review): `make_colorspace` reads this from the key `BackPoint`; the PDF key is
    // presumably `BlackPoint` — confirm.
    black_point: Option<[f64; 3]>,
    // `Gamma` entry, one value per component.
    gamma: Option<[f64; 3]>,
    // `Matrix` entry.
    matrix: Option<Vec<f64>>
}
1403
/// Parameters of a `Lab` colour space, read from its dictionary by `make_colorspace`.
#[derive(Clone, Debug)]
pub struct Lab {
    // `WhitePoint` entry.
    white_point: [f64; 3],
    // NOTE(review): `make_colorspace` reads this from the key `BackPoint`; the PDF key is
    // presumably `BlackPoint` — confirm.
    black_point: Option<[f64; 3]>,
    // `Range` entry.
    range: Option<[f64; 4]>,
}
1410
/// Colour spaces that `make_colorspace` accepts as the alternate space of a `Separation`
/// colour space.
#[derive(Clone, Debug)]
pub enum AlternateColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    /// Holds the contents of the ICC profile stream.
    ICCBased(Vec<u8>)
}
1421
/// A `Separation` colour space, built by `make_colorspace` from the elements of its
/// defining array.
#[derive(Clone)]
pub struct Separation {
    // Second array element: the name.
    name: String,
    // Third array element: a device colour space name or a colour space array.
    alternate_space: AlternateColorSpace,
    // Fourth array element, parsed with `Function::new`.
    tint_transform: Box<Function>,
}
1428
/// Colour spaces produced by `make_colorspace`.
#[derive(Clone)]
pub enum ColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    DeviceN,
    Pattern,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    Separation(Separation),
    /// Holds the contents of the ICC profile stream.
    ICCBased(Vec<u8>)
}
1442
1443fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary) -> ColorSpace {
1444    match name {
1445        b"DeviceGray" => ColorSpace::DeviceGray,
1446        b"DeviceRGB" => ColorSpace::DeviceRGB,
1447        b"DeviceCMYK" => ColorSpace::DeviceCMYK,
1448        b"Pattern" => ColorSpace::Pattern,
1449        _ => {
1450            let colorspaces: &Dictionary = get(&doc, resources, b"ColorSpace");
1451            let cs: &Object = maybe_get_obj(doc, colorspaces, &name[..]).unwrap_or_else(|| panic!("missing colorspace {:?}", &name[..]));
1452            if let Ok(cs) = cs.as_array() {
1453                let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1454                match cs_name.as_ref() {
1455                    "Separation" => {
1456                        let name = pdf_to_utf8(cs[1].as_name().expect("second arg must be a name"));
1457                        let alternate_space = match &maybe_deref(doc, &cs[2]) {
1458                            Object::Name(name) => {
1459                                match &name[..] {
1460                                    b"DeviceGray" => AlternateColorSpace::DeviceGray,
1461                                    b"DeviceRGB" => AlternateColorSpace::DeviceRGB,
1462                                    b"DeviceCMYK" => AlternateColorSpace::DeviceCMYK,
1463                                    _ => panic!("unexpected color space name")
1464                                }
1465                            }
1466                            Object::Array(cs) => {
1467                                let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1468                                match cs_name.as_ref() {
1469                                    "ICCBased" => {
1470                                        let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1471                                        dlog!("ICCBased {:?}", stream);
1472                                        // XXX: we're going to be continually decompressing everytime this object is referenced
1473                                        AlternateColorSpace::ICCBased(get_contents(stream))
1474                                    }
1475                                    "CalGray" => {
1476                                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1477                                        AlternateColorSpace::CalGray(CalGray {
1478                                            white_point: get(&doc, dict, b"WhitePoint"),
1479                                            black_point: get(&doc, dict, b"BackPoint"),
1480                                            gamma: get(&doc, dict, b"Gamma"),
1481                                        })
1482                                    }
1483                                    "CalRGB" => {
1484                                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1485                                        AlternateColorSpace::CalRGB(CalRGB {
1486                                            white_point: get(&doc, dict, b"WhitePoint"),
1487                                            black_point: get(&doc, dict, b"BackPoint"),
1488                                            gamma: get(&doc, dict, b"Gamma"),
1489                                            matrix: get(&doc, dict, b"Matrix"),
1490                                        })
1491                                    }
1492                                    "Lab" => {
1493                                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1494                                        AlternateColorSpace::Lab(Lab {
1495                                            white_point: get(&doc, dict, b"WhitePoint"),
1496                                            black_point: get(&doc, dict, b"BackPoint"),
1497                                            range: get(&doc, dict, b"Range"),
1498                                        })
1499                                    }
1500                                    _ => panic!("Unexpected color space name")
1501                                }
1502                            }
1503                            _ => panic!("Alternate space should be name or array {:?}", cs[2])
1504                        };
1505                        let tint_transform = Box::new(Function::new(doc, maybe_deref(doc, &cs[3])));
1506
1507                        dlog!("{:?} {:?} {:?}", name, alternate_space, tint_transform);
1508                        ColorSpace::Separation(Separation{ name, alternate_space, tint_transform})
1509                    }
1510                    "ICCBased" => {
1511                        let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1512                        dlog!("ICCBased {:?}", stream);
1513                        // XXX: we're going to be continually decompressing everytime this object is referenced
1514                        ColorSpace::ICCBased(get_contents(stream))
1515                    }
1516                    "CalGray" => {
1517                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1518                        ColorSpace::CalGray(CalGray {
1519                            white_point: get(&doc, dict, b"WhitePoint"),
1520                            black_point: get(&doc, dict, b"BackPoint"),
1521                            gamma: get(&doc, dict, b"Gamma"),
1522                        })
1523                    }
1524                    "CalRGB" => {
1525                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1526                        ColorSpace::CalRGB(CalRGB {
1527                            white_point: get(&doc, dict, b"WhitePoint"),
1528                            black_point: get(&doc, dict, b"BackPoint"),
1529                            gamma: get(&doc, dict, b"Gamma"),
1530                            matrix: get(&doc, dict, b"Matrix"),
1531                        })
1532                    }
1533                    "Lab" => {
1534                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1535                        ColorSpace::Lab(Lab {
1536                            white_point: get(&doc, dict, b"WhitePoint"),
1537                            black_point: get(&doc, dict, b"BackPoint"),
1538                            range: get(&doc, dict, b"Range"),
1539                        })
1540                    }
1541                    "Pattern" => {
1542                        ColorSpace::Pattern
1543                    },
1544                    "DeviceGray" => ColorSpace::DeviceGray,
1545                    "DeviceRGB" => ColorSpace::DeviceRGB,
1546                    "DeviceCMYK" => ColorSpace::DeviceCMYK,
1547                    "DeviceN" => ColorSpace::DeviceN,
1548                    _ => {
1549                        panic!("color_space {:?} {:?} {:?}", name, cs_name, cs)
1550                    }
1551                }
1552            } else if let Ok(cs) = cs.as_name() {
1553                match pdf_to_utf8(cs).as_ref() {
1554                    "DeviceRGB" => ColorSpace::DeviceRGB,
1555                    "DeviceGray" => ColorSpace::DeviceGray,
1556                    _ => panic!()
1557                }
1558            } else {
1559                panic!();
1560            }
1561        }
1562    }
1563}
1564
/// Content-stream interpreter.
///
/// The struct stores no data of its own; the `PhantomData` marker only ties
/// the type to the lifetime `'a` of the document being processed.
struct Processor<'a> {
    // Zero-sized marker carrying the `'a` lifetime.
    _none: PhantomData<&'a ()>,
}
1568
1569impl<'a> Processor<'a> {
1570    fn new() -> Processor<'a> {
1571        Processor { _none: PhantomData }
1572    }
1573
1574    fn process_stream(&mut self, doc: &'a Document, content: Vec<u8>, resources: &'a Dictionary, media_box: &MediaBox, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
1575        let content = Content::decode(&content).unwrap();
1576        let mut font_table = HashMap::new();
1577        let mut gs: GraphicsState = GraphicsState {
1578            ts: TextState {
1579                font: None,
1580                font_size: std::f64::NAN,
1581                character_spacing: 0.,
1582                word_spacing: 0.,
1583                horizontal_scaling: 100. / 100.,
1584                leading: 0.,
1585                rise: 0.,
1586                tm: Transform2D::identity(),
1587            },
1588            fill_color: Vec::new(),
1589            fill_colorspace: ColorSpace::DeviceGray,
1590            stroke_color: Vec::new(),
1591            stroke_colorspace: ColorSpace::DeviceGray,
1592            line_width: 1.,
1593            ctm: Transform2D::identity(),
1594            smask: None
1595        };
1596        //let mut ts = &mut gs.ts;
1597        let mut gs_stack = Vec::new();
1598        let mut mc_stack = Vec::new();
1599        // XXX: replace tlm with a point for text start
1600        let mut tlm = Transform2D::identity();
1601        let mut path = Path::new();
1602        let flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1603        dlog!("MediaBox {:?}", media_box);
1604        for operation in &content.operations {
1605            //dlog!("op: {:?}", operation);
1606
1607            match operation.operator.as_ref() {
1608                "BT" => {
1609                    tlm = Transform2D::identity();
1610                    gs.ts.tm = tlm;
1611                }
1612                "ET" => {
1613                    tlm = Transform2D::identity();
1614                    gs.ts.tm = tlm;
1615                }
1616                "cm" => {
1617                    assert!(operation.operands.len() == 6);
1618                    let m = Transform2D::row_major(as_num(&operation.operands[0]),
1619                                                   as_num(&operation.operands[1]),
1620                                                   as_num(&operation.operands[2]),
1621                                                   as_num(&operation.operands[3]),
1622                                                   as_num(&operation.operands[4]),
1623                                                   as_num(&operation.operands[5]));
1624                    gs.ctm = gs.ctm.pre_transform(&m);
1625                    dlog!("matrix {:?}", gs.ctm);
1626                }
1627                "CS" => {
1628                    let name = operation.operands[0].as_name().unwrap();
1629                    gs.stroke_colorspace = make_colorspace(doc, name, resources);
1630                }
1631                "cs" => {
1632                    let name = operation.operands[0].as_name().unwrap();
1633                    gs.fill_colorspace = make_colorspace(doc, name, resources);
1634                }
1635                "SC" | "SCN" => {
1636                    gs.stroke_color = match gs.stroke_colorspace {
1637                        ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1638                        _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1639                    };
1640                }
1641                "sc" | "scn" => {
1642                    gs.fill_color = match gs.fill_colorspace {
1643                        ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1644                        _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1645                    };
1646                }
1647                "G" | "g" | "RG" | "rg" | "K" | "k" => {
1648                    dlog!("unhandled color operation {:?}", operation);
1649                }
1650                "TJ" => {
1651                    match operation.operands[0] {
1652                        Object::Array(ref array) => {
1653                            for e in array {
1654                                match e {
1655                                    &Object::String(ref s, _) => {
1656                                        show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1657                                    }
1658                                    &Object::Integer(i) => {
1659                                        let ts = &mut gs.ts;
1660                                        let w0 = 0.;
1661                                        let tj = i as f64;
1662                                        let ty = 0.;
1663                                        let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1664                                        ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1665                                        dlog!("adjust text by: {} {:?}", i, ts.tm);
1666                                    }
1667                                    &Object::Real(i) => {
1668                                        let ts = &mut gs.ts;
1669                                        let w0 = 0.;
1670                                        let tj = i as f64;
1671                                        let ty = 0.;
1672                                        let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1673                                        ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1674                                        dlog!("adjust text by: {} {:?}", i, ts.tm);
1675                                    }
1676                                    _ => { dlog!("kind of {:?}", e); }
1677                                }
1678                            }
1679                        }
1680                        _ => {}
1681                    }
1682                }
1683                "Tj" => {
1684                    match operation.operands[0] {
1685                        Object::String(ref s, _) => {
1686                            show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1687                        }
1688                        _ => { panic!("unexpected Tj operand {:?}", operation) }
1689                    }
1690                }
1691                "Tc" => {
1692                    gs.ts.character_spacing = as_num(&operation.operands[0]);
1693                }
1694                "Tw" => {
1695                    gs.ts.word_spacing = as_num(&operation.operands[0]);
1696                }
1697                "Tz" => {
1698                    gs.ts.horizontal_scaling = as_num(&operation.operands[0]) / 100.;
1699                }
1700                "TL" => {
1701                    gs.ts.leading = as_num(&operation.operands[0]);
1702                }
1703                "Tf" => {
1704                    let fonts: &Dictionary = get(&doc, resources, b"Font");
1705                    let name = operation.operands[0].as_name().unwrap();
1706                    let font = font_table.entry(name.to_owned()).or_insert_with(|| make_font(doc, get::<&Dictionary>(doc, fonts, name))).clone();
1707                    {
1708                        /*let file = font.get_descriptor().and_then(|desc| desc.get_file());
1709                    if let Some(file) = file {
1710                        let file_contents = filter_data(file.as_stream().unwrap());
1711                        let mut cursor = Cursor::new(&file_contents[..]);
1712                        //let f = Font::read(&mut cursor);
1713                        //dlog!("font file: {:?}", f);
1714                    }*/
1715                    }
1716                    gs.ts.font = Some(font);
1717
1718                    gs.ts.font_size = as_num(&operation.operands[1]);
1719                    dlog!("font {} size: {} {:?}", pdf_to_utf8(name), gs.ts.font_size, operation);
1720                }
1721                "Ts" => {
1722                    gs.ts.rise = as_num(&operation.operands[0]);
1723                }
1724                "Tm" => {
1725                    assert!(operation.operands.len() == 6);
1726                    tlm = Transform2D::row_major(as_num(&operation.operands[0]),
1727                                                 as_num(&operation.operands[1]),
1728                                                 as_num(&operation.operands[2]),
1729                                                 as_num(&operation.operands[3]),
1730                                                 as_num(&operation.operands[4]),
1731                                                 as_num(&operation.operands[5]));
1732                    gs.ts.tm = tlm;
1733                    dlog!("Tm: matrix {:?}", gs.ts.tm);
1734                    output.end_line()?;
1735                }
1736                "Td" => {
1737                    /* Move to the start of the next line, offset from the start of the current line by (tx , ty ).
1738                   tx and ty are numbers expressed in unscaled text space units.
1739                   More precisely, this operator performs the following assignments:
1740                 */
1741                    assert!(operation.operands.len() == 2);
1742                    let tx = as_num(&operation.operands[0]);
1743                    let ty = as_num(&operation.operands[1]);
1744                    dlog!("translation: {} {}", tx, ty);
1745
1746                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1747                    gs.ts.tm = tlm;
1748                    dlog!("Td matrix {:?}", gs.ts.tm);
1749                    output.end_line()?;
1750                }
1751
1752                "TD" => {
1753                    /* Move to the start of the next line, offset from the start of the current line by (tx , ty ).
1754                   As a side effect, this operator sets the leading parameter in the text state.
1755                 */
1756                    assert!(operation.operands.len() == 2);
1757                    let tx = as_num(&operation.operands[0]);
1758                    let ty = as_num(&operation.operands[1]);
1759                    dlog!("translation: {} {}", tx, ty);
1760                    gs.ts.leading = -ty;
1761
1762                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1763                    gs.ts.tm = tlm;
1764                    dlog!("TD matrix {:?}", gs.ts.tm);
1765                    output.end_line()?;
1766                }
1767
1768                "T*" => {
1769                    let tx = 0.0;
1770                    let ty = -gs.ts.leading;
1771
1772                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1773                    gs.ts.tm = tlm;
1774                    dlog!("T* matrix {:?}", gs.ts.tm);
1775                    output.end_line()?;
1776                }
1777                "q" => { gs_stack.push(gs.clone()); }
1778                "Q" => {
1779                    let s = gs_stack.pop();
1780                    if let Some(s) = s {
1781                        gs = s;
1782                    } else {
1783                        println!("No state to pop");
1784                    }
1785                }
1786                "gs" => {
1787                    let ext_gstate: &Dictionary = get(doc, resources, b"ExtGState");
1788                    let name = operation.operands[0].as_name().unwrap();
1789                    let state: &Dictionary = get(doc, ext_gstate, name);
1790                    apply_state(doc, &mut gs, state);
1791                }
1792                "i" => { dlog!("unhandled graphics state flattness operator {:?}", operation); }
1793                "w" => { gs.line_width = as_num(&operation.operands[0]); }
1794                "J" | "j" | "M" | "d" | "ri"  => { dlog!("unknown graphics state operator {:?}", operation); }
1795                "m" => { path.ops.push(PathOp::MoveTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1796                "l" => { path.ops.push(PathOp::LineTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1797                "c" => {
1798                    path.ops.push(PathOp::CurveTo(
1799                        as_num(&operation.operands[0]),
1800                        as_num(&operation.operands[1]),
1801                        as_num(&operation.operands[2]),
1802                        as_num(&operation.operands[3]),
1803                        as_num(&operation.operands[4]),
1804                        as_num(&operation.operands[5])))
1805                }
1806                "v" => {
1807                    let (x, y) = path.current_point();
1808                    path.ops.push(PathOp::CurveTo(
1809                        x,
1810                        y,
1811                        as_num(&operation.operands[0]),
1812                        as_num(&operation.operands[1]),
1813                        as_num(&operation.operands[2]),
1814                        as_num(&operation.operands[3])))
1815                }
1816                "y" => {
1817                    path.ops.push(PathOp::CurveTo(
1818                        as_num(&operation.operands[0]),
1819                        as_num(&operation.operands[1]),
1820                        as_num(&operation.operands[2]),
1821                        as_num(&operation.operands[3]),
1822                        as_num(&operation.operands[2]),
1823                        as_num(&operation.operands[3])))
1824                }
1825                "h" => { path.ops.push(PathOp::Close) }
1826                "re" => {
1827                    path.ops.push(PathOp::Rect(as_num(&operation.operands[0]),
1828                                               as_num(&operation.operands[1]),
1829                                               as_num(&operation.operands[2]),
1830                                               as_num(&operation.operands[3])))
1831                }
1832                "s" | "f*" | "B" | "B*" | "b" => {
1833                    dlog!("unhandled path op {:?}", operation);
1834                }
1835                "S" => {
1836                    output.stroke(&gs.ctm, &gs.stroke_colorspace, &gs.stroke_color, &path)?;
1837                    path.ops.clear();
1838                }
1839                "F" | "f" => {
1840                    output.fill(&gs.ctm, &gs.fill_colorspace, &gs.fill_color, &path)?;
1841                    path.ops.clear();
1842                }
1843                "W" | "w*" => { dlog!("unhandled clipping operation {:?}", operation); }
1844                "n" => {
1845                    dlog!("discard {:?}", path);
1846                    path.ops.clear();
1847                }
1848                "BMC" | "BDC" => {
1849                    mc_stack.push(operation);
1850                }
1851                "EMC" => {
1852                    mc_stack.pop();
1853                }
1854                "Do" => {
1855                    // `Do` process an entire subdocument, so we do a recursive call to `process_stream`
1856                    // with the subdocument content and resources
1857                    let xobject: &Dictionary = get(&doc, resources, b"XObject");
1858                    let name = operation.operands[0].as_name().unwrap();
1859                    let xf: &Stream = get(&doc, xobject, name);
1860                    let resources = maybe_get_obj(&doc, &xf.dict, b"Resources").and_then(|n| n.as_dict().ok()).unwrap_or(resources);
1861                    let contents = get_contents(xf);
1862                    self.process_stream(&doc, contents, resources, &media_box, output, page_num)?;
1863                }
1864                _ => { dlog!("unknown operation {:?}", operation); }
1865
1866            }
1867        }
1868        Ok(())
1869    }
1870}
1871
1872
1873pub trait OutputDev {
1874    fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>)-> Result<(), OutputError>;
1875    fn end_page(&mut self)-> Result<(), OutputError>;
1876    fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>;
1877    fn begin_word(&mut self)-> Result<(), OutputError>;
1878    fn end_word(&mut self)-> Result<(), OutputError>;
1879    fn end_line(&mut self)-> Result<(), OutputError>;
1880    fn stroke(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
1881    fn fill(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
1882}
1883
1884
1885pub struct HTMLOutput<'a>  {
1886    file: &'a mut dyn std::io::Write,
1887    flip_ctm: Transform,
1888    last_ctm: Transform,
1889    buf_ctm: Transform,
1890    buf_font_size: f64,
1891    buf: String
1892}
1893
/// Replaces spaces that HTML would collapse with `&nbsp;`.
///
/// A space stays a plain space only when it directly follows a non-space
/// character and is directly followed by another non-space character;
/// leading, trailing and repeated spaces all become `&nbsp;`.
fn insert_nbsp(input: &str) -> String {
    let mut out = String::new();
    // True while the previous character was a non-space.
    let mut after_word = false;
    let mut rest = input.chars().peekable();
    while let Some(ch) = rest.next() {
        if ch != ' ' {
            out.push(ch);
            after_word = true;
            continue;
        }
        let before_word = rest.peek().map_or(false, |&next| next != ' ');
        out.push_str(if after_word && before_word { " " } else { "&nbsp;" });
        after_word = false;
    }
    out
}
1913
1914impl<'a> HTMLOutput<'a> {
1915    pub fn new(file: &mut dyn std::io::Write) -> HTMLOutput {
1916        HTMLOutput {
1917            file,
1918            flip_ctm: Transform2D::identity(),
1919            last_ctm: Transform2D::identity(),
1920            buf_ctm: Transform2D::identity(),
1921            buf: String::new(),
1922            buf_font_size: 0.,
1923        }
1924    }
1925    fn flush_string(&mut self) -> Result<(), OutputError>{
1926        if self.buf.len() != 0 {
1927
1928            let position = self.buf_ctm.post_transform(&self.flip_ctm);
1929            let transformed_font_size_vec = self.buf_ctm.transform_vector(vec2(self.buf_font_size, self.buf_font_size));
1930            // get the length of one sized of the square with the same area with a rectangle of size (x, y)
1931            let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1932            let (x, y) = (position.m31, position.m32);
1933            println!("flush {} {:?}", self.buf, (x,y));
1934
1935            write!(self.file, "<div style='position: absolute; left: {}px; top: {}px; font-size: {}px'>{}</div>\n",
1936                   x, y, transformed_font_size, insert_nbsp(&self.buf))?;
1937        }
1938        Ok(())
1939    }
1940}
1941
/// Art box passed to `OutputDev::begin_page` as a 4-tuple of coordinates.
type ArtBox = (f64, f64, f64, f64);
1943
1944impl<'a> OutputDev for HTMLOutput<'a> {
1945    fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
1946        write!(self.file, "<meta charset='utf-8' /> ")?;
1947        write!(self.file, "<!-- page {} -->", page_num)?;
1948        write!(self.file, "<div id='page{}' style='position: relative; height: {}px; width: {}px; border: 1px black solid'>", page_num, media_box.ury - media_box.lly, media_box.urx - media_box.llx)?;
1949        self.flip_ctm = Transform::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1950        Ok(())
1951    }
1952    fn end_page(&mut self) -> Result<(), OutputError> {
1953        self.flush_string()?;
1954        self.buf = String::new();
1955        self.last_ctm = Transform::identity();
1956        write!(self.file, "</div>")?;
1957        Ok(())
1958    }
1959    fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>{
1960        if trm.approx_eq(&self.last_ctm) {
1961            let position = trm.post_transform(&self.flip_ctm);
1962            let (x, y) = (position.m31, position.m32);
1963
1964            println!("accum {} {:?}", char, (x,y));
1965            self.buf += char;
1966        } else {
1967            println!("flush {} {:?} {:?} {} {} {}", char, trm, self.last_ctm, width, font_size, spacing);
1968            self.flush_string()?;
1969            self.buf = char.to_owned();
1970            self.buf_font_size = font_size;
1971            self.buf_ctm = *trm;
1972        }
1973        let position = trm.post_transform(&self.flip_ctm);
1974        let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
1975        // get the length of one sized of the square with the same area with a rectangle of size (x, y)
1976        let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1977        let (x, y) = (position.m31, position.m32);
1978        write!(self.file, "<div style='position: absolute; color: red; left: {}px; top: {}px; font-size: {}px'>{}</div>",
1979               x, y, transformed_font_size, char)?;
1980        self.last_ctm = trm.pre_transform(&Transform2D::create_translation(width * font_size + spacing, 0.));
1981
1982        Ok(())
1983    }
1984    fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
1985    fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
1986    fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
1987}
1988
/// Output device that renders filled paths as an SVG document.
pub struct SVGOutput<'a> {
    /// Destination of the generated SVG markup.
    file: &'a mut dyn std::io::Write,
}
1992impl<'a> SVGOutput<'a> {
1993    pub fn new(file: &mut dyn std::io::Write) -> SVGOutput {
1994        SVGOutput{file}
1995    }
1996}
1997
1998impl<'a> OutputDev for SVGOutput<'a> {
1999    fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>) -> Result<(), OutputError> {
2000        let ver = 1.1;
2001        write!(self.file, "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n")?;
2002        if ver == 1.1 {
2003            write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">"#)?;
2004        } else {
2005            write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">"#)?;
2006        }
2007        if let Some(art_box) = art_box {
2008            let width = art_box.2 - art_box.0;
2009            let height = art_box.3 - art_box.1;
2010            let y = media_box.ury - art_box.1 - height;
2011            write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, art_box.0, y, width, height)?;
2012        } else {
2013            let width = media_box.urx - media_box.llx;
2014            let height = media_box.ury - media_box.lly;
2015            write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, media_box.llx, media_box.lly, width, height)?;
2016        }
2017        write!(self.file, "\n")?;
2018        type Mat = Transform;
2019
2020        let ctm = Mat::create_scale(1., -1.).post_translate(vec2(0., media_box.ury));
2021        write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>\n",
2022               ctm.m11,
2023               ctm.m12,
2024               ctm.m21,
2025               ctm.m22,
2026               ctm.m31,
2027               ctm.m32,
2028        )?;
2029        Ok(())
2030    }
2031    fn end_page(&mut self) -> Result<(), OutputError>{
2032        write!(self.file, "</g>\n")?;
2033        write!(self.file, "</svg>")?;
2034        Ok(())
2035    }
2036    fn output_character(&mut self, _trm: &Transform, _width: f64, _spacing: f64, _font_size: f64, _char: &str) -> Result<(), OutputError>{
2037        Ok(())
2038    }
2039    fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
2040    fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2041    fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
2042    fn fill(&mut self, ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], path: &Path) -> Result<(), OutputError>{
2043        write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>",
2044               ctm.m11,
2045               ctm.m12,
2046               ctm.m21,
2047               ctm.m22,
2048               ctm.m31,
2049               ctm.m32,
2050        )?;
2051
2052        /*if path.ops.len() == 1 {
2053            if let PathOp::Rect(x, y, width, height) = path.ops[0] {
2054                write!(self.file, "<rect x={} y={} width={} height={} />\n", x, y, width, height);
2055                write!(self.file, "</g>");
2056                return;
2057            }
2058        }*/
2059        let mut d = Vec::new();
2060        for op in &path.ops {
2061            match op {
2062                &PathOp::MoveTo(x, y) => { d.push(format!("M{} {}", x, y))}
2063                &PathOp::LineTo(x, y) => { d.push(format!("L{} {}", x, y))},
2064                &PathOp::CurveTo(x1, y1, x2, y2, x, y) => { d.push(format!("C{} {} {} {} {} {}", x1, y1, x2, y2, x, y))},
2065                &PathOp::Close => { d.push(format!("Z"))},
2066                &PathOp::Rect(x, y, width, height) => {
2067                    d.push(format!("M{} {}", x, y));
2068                    d.push(format!("L{} {}", x + width, y));
2069                    d.push(format!("L{} {}", x + width, y + height));
2070                    d.push(format!("L{} {}", x, y + height));
2071                    d.push(format!("Z"));
2072                }
2073
2074            }
2075        }
2076        write!(self.file, "<path d='{}' />", d.join(" "))?;
2077        write!(self.file, "</g>")?;
2078        write!(self.file, "\n")?;
2079        Ok(())
2080    }
2081}
2082
2083/*
2084File doesn't implement std::fmt::Write so we have
2085to do some gymnastics to accept a File or String
2086See https://github.com/rust-lang/rust/issues/51305
2087*/
2088
/// Conversion of a text destination into something implementing `std::fmt::Write`.
pub trait ConvertToFmt {
    /// The formatter-style writer produced by `convert`.
    type Writer: fmt::Write;

    /// Consumes the destination and returns its writer.
    fn convert(self) -> Self::Writer;
}
2093
2094impl<'a> ConvertToFmt for &'a mut String {
2095    type Writer = &'a mut String;
2096    fn convert(self) -> Self::Writer {
2097        self
2098    }
2099}
2100
/// Wrapper that lets a byte-oriented writer be used where `std::fmt::Write`
/// is required.
pub struct WriteAdapter<W> {
    /// The wrapped writer.
    f: W,
}
2104
2105impl<W: std::io::Write> std::fmt::Write for WriteAdapter<W> {
2106    fn write_str(&mut self, s: &str) -> Result<(), std::fmt::Error> {
2107        self.f.write_all(s.as_bytes()).map_err(|_| fmt::Error)
2108    }
2109}
2110
2111impl<'a> ConvertToFmt for &'a mut dyn std::io::Write {
2112    type Writer = WriteAdapter<Self>;
2113    fn convert(self) -> Self::Writer {
2114        WriteAdapter { f: self }
2115    }
2116}
2117
2118impl<'a> ConvertToFmt for &'a mut File {
2119    type Writer = WriteAdapter<Self>;
2120    fn convert(self) -> Self::Writer {
2121        WriteAdapter { f: self }
2122    }
2123}
2124
2125pub struct PlainTextOutput<W: ConvertToFmt>   {
2126    writer: W::Writer,
2127    last_end: f64,
2128    last_y: f64,
2129    first_char: bool,
2130    flip_ctm: Transform,
2131}
2132
2133impl<W: ConvertToFmt> PlainTextOutput<W> {
2134    pub fn new(writer: W) -> PlainTextOutput<W> {
2135        PlainTextOutput{
2136            writer: writer.convert(),
2137            last_end: 100000.,
2138            first_char: false,
2139            last_y: 0.,
2140            flip_ctm: Transform2D::identity(),
2141        }
2142    }
2143}
2144
2145/* There are some structural hints that PDFs can use to signal word and line endings:
2146 * however relying on these is not likely to be sufficient. */
2147impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
2148    fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
2149        self.flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
2150        Ok(())
2151    }
2152    fn end_page(&mut self) -> Result<(), OutputError> {
2153        Ok(())
2154    }
2155    fn output_character(&mut self, trm: &Transform, width: f64, _spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError> {
2156        let position = trm.post_transform(&self.flip_ctm);
2157        let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
2158        // get the length of one sized of the square with the same area with a rectangle of size (x, y)
2159        let transformed_font_size = (transformed_font_size_vec.x*transformed_font_size_vec.y).sqrt();
2160        let (x, y) = (position.m31, position.m32);
2161        use std::fmt::Write;
2162        //dlog!("last_end: {} x: {}, width: {}", self.last_end, x, width);
2163        if self.first_char {
2164            if (y - self.last_y).abs() > transformed_font_size * 1.5 {
2165                write!(self.writer, "\n")?;
2166            }
2167
2168            // we've moved to the left and down
2169            if x < self.last_end && (y - self.last_y).abs() > transformed_font_size * 0.5 {
2170                write!(self.writer, "\n")?;
2171            }
2172
2173            if x > self.last_end + transformed_font_size * 0.1 {
2174                dlog!("width: {}, space: {}, thresh: {}", width, x - self.last_end, transformed_font_size * 0.1);
2175                write!(self.writer, " ")?;
2176            }
2177        }
2178        //let norm = unicode_normalization::UnicodeNormalization::nfkc(char);
2179        write!(self.writer, "{}", char)?;
2180        self.first_char = false;
2181        self.last_y = y;
2182        self.last_end = x + width * transformed_font_size;
2183        Ok(())
2184    }
2185    fn begin_word(&mut self) -> Result<(), OutputError> {
2186        self.first_char = true;
2187        Ok(())
2188    }
2189    fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2190    fn end_line(&mut self) -> Result<(), OutputError>{
2191        //write!(self.file, "\n");
2192        Ok(())
2193    }
2194}
2195
2196
2197pub fn print_metadata(doc: &Document) {
2198    dlog!("Version: {}", doc.version);
2199    if let Some(ref info) = get_info(&doc) {
2200        for (k, v) in *info {
2201            match v {
2202                &Object::String(ref s, StringFormat::Literal) => { dlog!("{}: {}", pdf_to_utf8(k), pdf_to_utf8(s)); }
2203                _ => {}
2204            }
2205        }
2206    }
2207    dlog!("Page count: {}", get::<i64>(&doc, &get_pages(&doc), b"Count"));
2208    dlog!("Pages: {:?}", get_pages(&doc));
2209    dlog!("Type: {:?}", get_pages(&doc).get(b"Type").and_then(|x| x.as_name()).unwrap());
2210}
2211
2212/// Extract the text from a pdf at `path` and return a `String` with the results
2213pub fn extract_text<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<String, OutputError> {
2214    let mut s = String::new();
2215    {
2216        let mut output = PlainTextOutput::new(&mut s);
2217        let mut doc = Document::load(path)?;
2218        maybe_decrypt(&mut doc)?;
2219        output_doc(&doc, &mut output)?;
2220    }
2221    Ok(s)
2222}
2223
2224fn maybe_decrypt(doc: &mut Document) -> Result<(), OutputError> {
2225    if ! doc.is_encrypted() {
2226        return Ok(());
2227    }
2228
2229    if let Err(e) = doc.decrypt("") {
2230        if let Error::Decryption(DecryptionError::IncorrectPassword) = e {
2231            eprintln!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted")
2232        }
2233
2234        return Err(OutputError::PdfError(e));
2235    }
2236
2237    Ok(())
2238}
2239
2240pub fn extract_text_encrypted<P: std::convert::AsRef<std::path::Path>, PW: AsRef<[u8]>>(
2241    path: P,
2242    password: PW,
2243) -> Result<String, OutputError> {
2244    let mut s = String::new();
2245    {
2246        let mut output = PlainTextOutput::new(&mut s);
2247        let mut doc = Document::load(path)?;
2248        output_doc_encrypted(&mut doc, &mut output, password)?;
2249    }
2250    Ok(s)
2251}
2252
2253pub fn extract_text_from_mem(buffer: &[u8]) -> Result<String, OutputError> {
2254    let mut s = String::new();
2255    {
2256        let mut output = PlainTextOutput::new(&mut s);
2257        let mut doc = Document::load_mem(buffer)?;
2258        maybe_decrypt(&mut doc)?;
2259        output_doc(&doc, &mut output)?;
2260    }
2261    Ok(s)
2262}
2263
2264pub fn extract_text_from_mem_encrypted<PW: AsRef<[u8]>>(
2265    buffer: &[u8],
2266    password: PW,
2267) -> Result<String, OutputError> {
2268    let mut s = String::new();
2269    {
2270        let mut output = PlainTextOutput::new(&mut s);
2271        let mut doc = Document::load_mem(buffer)?;
2272        output_doc_encrypted(&mut doc, &mut output, password)?;
2273    }
2274    Ok(s)
2275}
2276
2277
2278fn extract_text_by_page(doc: &Document, page_num: u32) -> Result<String, OutputError> {
2279    let mut s = String::new();
2280    {
2281        let mut output = PlainTextOutput::new(&mut s);
2282        output_doc_page(doc, &mut output, page_num)?;
2283    }
2284    Ok(s)
2285}
2286
2287/// Extract the text from a pdf at `path` and return a `Vec<String>` with the results separately by page
2288
2289pub fn extract_text_by_pages<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<Vec<String>, OutputError> {
2290    let mut v = Vec::new();
2291    {
2292        let mut doc = Document::load(path)?;
2293        maybe_decrypt(&mut doc)?;
2294        let mut page_num = 1;
2295        while let Ok(content) = extract_text_by_page(&doc, page_num) {
2296            v.push(content);
2297            page_num += 1;
2298        }
2299    }
2300    Ok(v)
2301}
2302
2303pub fn extract_text_by_pages_encrypted<P: std::convert::AsRef<std::path::Path>, PW: AsRef<[u8]>>(path: P, password: PW) -> Result<Vec<String>, OutputError> {
2304    let mut v = Vec::new();
2305    {
2306        let mut doc = Document::load(path)?;
2307        doc.decrypt(password)?;
2308        let mut page_num = 1;
2309        while let Ok(content) = extract_text_by_page(&mut doc, page_num) {
2310            v.push(content);
2311            page_num += 1;
2312        }
2313    }
2314    Ok(v)
2315}
2316
2317pub fn extract_text_from_mem_by_pages(buffer: &[u8]) -> Result<Vec<String>, OutputError> {
2318    let mut v = Vec::new();
2319    {
2320        let mut doc = Document::load_mem(buffer)?;
2321        maybe_decrypt(&mut doc)?;
2322        let mut page_num = 1;
2323        while let Ok(content) = extract_text_by_page(&doc, page_num) {
2324            v.push(content);
2325            page_num += 1;
2326        }
2327    }
2328    Ok(v)
2329}
2330
2331pub fn extract_text_from_mem_by_pages_encrypted<PW: AsRef<[u8]>>(buffer: &[u8], password: PW) -> Result<Vec<String>, OutputError> {
2332    let mut v = Vec::new();
2333    {
2334        let mut doc = Document::load_mem(buffer)?;
2335        doc.decrypt(password)?;
2336        let mut page_num = 1;
2337        while let Ok(content) = extract_text_by_page(&doc, page_num) {
2338            v.push(content);
2339            page_num += 1;
2340        }
2341    }
2342    Ok(v)
2343}
2344
2345
2346fn get_inherited<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
2347    let o: Option<T> = get(doc, dict, key);
2348    if let Some(o) = o {
2349        Some(o)
2350    } else {
2351        let parent = dict.get(b"Parent")
2352            .and_then(|parent| parent.as_reference())
2353            .and_then(|id| doc.get_dictionary(id)).ok()?;
2354        get_inherited(doc, parent, key)
2355    }
2356}
2357
2358pub fn output_doc_encrypted<PW: AsRef<[u8]>>(
2359    doc: &mut Document,
2360    output: &mut dyn OutputDev,
2361    password: PW,
2362) -> Result<(), OutputError> {
2363    doc.decrypt(password)?;
2364    output_doc(doc, output)
2365}
2366
2367/// Parse a given document and output it to `output`
2368pub fn output_doc(doc: &Document, output: &mut dyn OutputDev) -> Result<(), OutputError> {
2369    if doc.is_encrypted() {
2370        eprintln!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2371    }
2372    let empty_resources = Dictionary::new();
2373    let pages = doc.get_pages();
2374    let mut p = Processor::new();
2375    for dict in pages {
2376        let page_num = dict.0;
2377        let object_id = dict.1;
2378        output_doc_inner(page_num, object_id, doc, &mut p, output, &empty_resources)?;
2379    }
2380    Ok(())
2381}
2382
2383pub fn output_doc_page(doc: &Document, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
2384    if doc.is_encrypted() {
2385        eprintln!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2386    }
2387    let empty_resources = Dictionary::new();
2388    let pages = doc.get_pages();
2389    let object_id = pages.get(&page_num).ok_or(lopdf::Error::PageNumberNotFound(page_num))?;
2390    let mut p = Processor::new();
2391    output_doc_inner(page_num, *object_id, doc, &mut p, output, &empty_resources)?;
2392    Ok(())
2393}
2394
2395fn output_doc_inner<'a>(page_num: u32, object_id: ObjectId, doc: &'a Document, p: & mut Processor<'a>, output: &mut dyn OutputDev, empty_resources: &'a Dictionary) -> Result<(), OutputError> {
2396    let page_dict = doc.get_object(object_id).unwrap().as_dict().unwrap();
2397    dlog!("page {} {:?}", page_num, page_dict);
2398    // XXX: Some pdfs lack a Resources directory
2399    let resources = get_inherited(doc, page_dict, b"Resources").unwrap_or(empty_resources);
2400    dlog!("resources {:?}", resources);
2401    // pdfium searches up the page tree for MediaBoxes as needed
2402    let media_box: Vec<f64> = get_inherited(doc, page_dict, b"MediaBox").expect("MediaBox");
2403    let media_box = MediaBox { llx: media_box[0], lly: media_box[1], urx: media_box[2], ury: media_box[3] };
2404    let art_box = get::<Option<Vec<f64>>>(&doc, page_dict, b"ArtBox")
2405        .map(|x| (x[0], x[1], x[2], x[3]));
2406    output.begin_page(page_num, &media_box, art_box)?;
2407    p.process_stream(&doc, doc.get_page_content(object_id).unwrap(), resources, &media_box, output, page_num)?;
2408    output.end_page()?;
2409    Ok(())
2410}