Skip to main content

pdf_extract/
lib.rs

1extern crate lopdf;
2
3use adobe_cmap_parser::{ByteMapping, CodeRange, CIDRange};
4use encoding_rs::UTF_16BE;
5use lopdf::content::Content;
6pub use lopdf::*;
7use euclid::*;
8use lopdf::encryption::DecryptionError;
9use std::fmt::{Debug, Formatter};
10extern crate encoding_rs;
11extern crate euclid;
12extern crate adobe_cmap_parser;
13extern crate type1_encoding_parser;
14extern crate unicode_normalization;
15use euclid::vec2;
16use unicode_normalization::UnicodeNormalization;
17use std::fmt;
18use std::str;
19use std::fs::File;
20use std::slice::Iter;
21use std::collections::HashMap;
22use std::collections::hash_map::Entry;
23use std::rc::Rc;
24use std::marker::PhantomData;
25use std::result::Result;
26use log::{warn, error, debug};
27mod core_fonts;
28mod glyphnames;
29mod zapfglyphnames;
30mod encodings;
31
32pub struct Space;
33pub type Transform = Transform2D<f64, Space, Space>;
34
35#[derive(Debug)]
36pub enum OutputError
37{
38    FormatError(std::fmt::Error),
39    IoError(std::io::Error),
40    PdfError(lopdf::Error)
41}
42
43impl std::fmt::Display for OutputError
44{
45    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
46        match self {
47            OutputError::FormatError(e) => write!(f, "Formating error: {}", e),
48            OutputError::IoError(e) => write!(f, "IO error: {}", e),
49            OutputError::PdfError(e) => write!(f, "PDF error: {}", e)
50        }
51    }
52}
53
54impl std::error::Error for OutputError{}
55
56impl From<std::fmt::Error> for OutputError {
57    fn from(e: std::fmt::Error) -> Self {
58        OutputError::FormatError(e)
59    }
60}
61
62impl From<std::io::Error> for OutputError {
63    fn from(e: std::io::Error) -> Self {
64        OutputError::IoError(e)
65    }
66}
67
68impl From<lopdf::Error> for OutputError {
69    fn from(e: lopdf::Error) -> Self {
70        OutputError::PdfError(e)
71    }
72}
73
// Debug-logging hook. The active rule accepts a comma-separated list of expressions and expands
// to an empty block, so the arguments are neither evaluated nor printed. Swap in the
// commented-out rule below to print them with `println!` instead.
macro_rules! dlog {
    ($($arg:expr),*) => {{}};
    //($($t:tt)*) => { println!($($t)*) }
}
78
79fn get_info(doc: &Document) -> Option<&Dictionary> {
80    match doc.trailer.get(b"Info") {
81        Ok(&Object::Reference(ref id)) => {
82            match doc.get_object(*id) {
83                Ok(&Object::Dictionary(ref info)) => { return Some(info); }
84                _ => {}
85            }
86        }
87        _ => {}
88    }
89    None
90}
91
92fn get_catalog(doc: &Document) -> &Dictionary {
93    match doc.trailer.get(b"Root").unwrap() {
94        &Object::Reference(ref id) => {
95            match doc.get_object(*id) {
96                Ok(&Object::Dictionary(ref catalog)) => { return catalog; }
97                _ => {}
98            }
99        }
100        _ => {}
101    }
102    panic!();
103}
104
105fn get_pages(doc: &Document) -> &Dictionary {
106    let catalog = get_catalog(doc);
107    match catalog.get(b"Pages").unwrap() {
108        &Object::Reference(ref id) => {
109            match doc.get_object(*id) {
110                Ok(&Object::Dictionary(ref pages)) => { return pages; }
111                other => {dlog!("pages: {:?}", other)}
112            }
113        }
114        other => { dlog!("pages: {:?}", other)}
115    }
116    dlog!("catalog {:?}", catalog);
117    panic!();
118}
119
/// Lookup table from a single byte (the index) to a UTF-16 code unit; it has 256 entries.
///
/// Rows are grouped eight entries at a time; the trailing comment gives the index of the first
/// entry in the row. Positions 0x7f, 0x9f and 0xad hold 0x0000 (presumably because the encoding
/// leaves those codes undefined -- verify against the PDF reference).
#[allow(non_upper_case_globals)]
const PDFDocEncoding: &[u16] = &[
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 0x00
    0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, // 0x08
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 0x10
    0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, // 0x18
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 0x20
    0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, // 0x28
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 0x30
    0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, // 0x38
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 0x40
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, // 0x48
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 0x50
    0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, // 0x58
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 0x60
    0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, // 0x68
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 0x70
    0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000, // 0x78
    0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, // 0x80
    0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018, // 0x88
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, // 0x90
    0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000, // 0x98
    0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // 0xa0
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af, // 0xa8
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // 0xb0
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // 0xb8
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // 0xc0
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // 0xc8
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // 0xd0
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, // 0xd8
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // 0xe0
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // 0xe8
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // 0xf0
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, // 0xf8
];
151
152fn pdf_to_utf8(s: &[u8]) -> String {
153    if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
154        return UTF_16BE.decode_without_bom_handling_and_without_replacement(&s[2..]).unwrap().to_string()
155    } else {
156        let r : Vec<u8> = s.iter().map(|x| *x).flat_map(|x| {
157               let k = PDFDocEncoding[x as usize];
158               vec![(k>>8) as u8, k as u8].into_iter()}).collect();
159        return UTF_16BE.decode_without_bom_handling_and_without_replacement(&r).unwrap().to_string()
160    }
161}
162
/// Decodes the PDF byte string `s` into a `String`.
///
/// * If `s` starts with the UTF-16BE byte order mark (`FE FF`), the bytes after the mark are
///   decoded as big-endian UTF-16. A string consisting of the mark alone decodes to `""`.
/// * Otherwise every byte is looked up in `encoding` (indexed by the byte value) to obtain a
///   UTF-16 code unit, and the resulting sequence of units is decoded.
///
/// Decoding is lossy: an unpaired surrogate, or a dangling odd byte at the end of the UTF-16
/// form, becomes U+FFFD instead of aborting the whole extraction.
///
/// # Panics
///
/// Panics if `s` contains a byte with no entry in `encoding`, which can only happen when
/// `encoding` is shorter than 256 entries.
fn to_utf8(encoding: &[u16], s: &[u8]) -> String {
    const UTF16_BE_BOM: [u8; 2] = [0xfe, 0xff];

    if s.starts_with(&UTF16_BE_BOM) {
        let pairs = s[UTF16_BE_BOM.len()..].chunks_exact(2);
        // Remember whether a lone byte is left over before the iterator is consumed.
        let has_dangling_byte = !pairs.remainder().is_empty();
        let units: Vec<u16> = pairs
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        let mut text = String::from_utf16_lossy(&units);
        if has_dangling_byte {
            text.push(std::char::REPLACEMENT_CHARACTER);
        }
        text
    } else {
        let units: Vec<u16> = s.iter().map(|&byte| encoding[usize::from(byte)]).collect();
        String::from_utf16_lossy(&units)
    }
}
173
174
175fn maybe_deref<'a>(doc: &'a Document, o: &'a Object) -> &'a Object {
176    match o {
177        &Object::Reference(r) => doc.get_object(r).expect("missing object reference"),
178        _ => o
179    }
180}
181
182fn maybe_get_obj<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Object> {
183    dict.get(key).map(|o| maybe_deref(doc, o)).ok()
184}
185
186// an intermediate trait that can be used to chain conversions that may have failed
187trait FromOptObj<'a> {
188    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self;
189}
190
191// conditionally convert to Self returns None if the conversion failed
192trait FromObj<'a> where Self: std::marker::Sized {
193    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self>;
194}
195
196impl<'a, T: FromObj<'a>> FromOptObj<'a> for Option<T> {
197    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, _key: &[u8]) -> Self {
198        obj.and_then(|x| T::from_obj(doc,x))
199    }
200}
201
202impl<'a, T: FromObj<'a>> FromOptObj<'a> for T {
203    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self {
204        T::from_obj(doc, obj.expect(&String::from_utf8_lossy(key))).expect("wrong type")
205    }
206}
207
208// we follow the same conventions as pdfium for when to support indirect objects:
209// on arrays, streams and dicts
210impl<'a, T: FromObj<'a>> FromObj<'a> for Vec<T> {
211    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
212        maybe_deref(doc, obj).as_array().map(|x| x.iter()
213            .map(|x| T::from_obj(doc, x).expect("wrong type"))
214            .collect()).ok()
215    }
216}
217
218// XXX: These will panic if we don't have the right number of items
219// we don't want to do that
220impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 4] {
221    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
222        maybe_deref(doc, obj).as_array().map(|x| {
223            let mut all = x.iter()
224                .map(|x| T::from_obj(doc, x).expect("wrong type"));
225            [all.next().unwrap(), all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
226        }).ok()
227    }
228}
229
230impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 3] {
231    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
232        maybe_deref(doc, obj).as_array().map(|x| {
233            let mut all = x.iter()
234                .map(|x| T::from_obj(doc, x).expect("wrong type"));
235            [all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
236        }).ok()
237    }
238}
239
240impl<'a> FromObj<'a> for f64 {
241    fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
242        match obj {
243            &Object::Integer(i) => Some(i as f64),
244            &Object::Real(f) => Some(f.into()),
245            _ => None
246        }
247    }
248}
249
250impl<'a> FromObj<'a> for i64 {
251    fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
252        match obj {
253            &Object::Integer(i) => Some(i),
254            _ => None
255        }
256    }
257}
258
259impl<'a> FromObj<'a> for &'a Dictionary {
260    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
261        maybe_deref(doc, obj).as_dict().ok()
262    }
263}
264
265impl<'a> FromObj<'a> for &'a Stream {
266    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Stream> {
267        maybe_deref(doc, obj).as_stream().ok()
268    }
269}
270
271impl<'a> FromObj<'a> for &'a Object {
272    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
273        Some(maybe_deref(doc, obj))
274    }
275}
276
277fn get<'a, T: FromOptObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> T {
278    T::from_opt_obj(doc, dict.get(key).ok(), key)
279}
280
281fn maybe_get<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
282    maybe_get_obj(doc, dict, key).and_then(|o| T::from_obj(doc, o))
283}
284
285fn get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> String {
286    pdf_to_utf8(dict.get(key).map(|o| maybe_deref(doc, o)).unwrap_or_else(|_| panic!("deref")).as_name().expect("name"))
287}
288
289#[allow(dead_code)]
290fn maybe_get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<String> {
291    maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok()).map(|n| pdf_to_utf8(n))
292}
293
294fn maybe_get_name<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a [u8]> {
295    maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok())
296}
297
298fn maybe_get_array<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Vec<Object>> {
299    maybe_get_obj(doc, dict, key).and_then(|n| n.as_array().ok())
300}
301
302#[derive(Clone)]
303struct PdfSimpleFont<'a> {
304    font: &'a Dictionary,
305    doc: &'a Document,
306    encoding: Option<Vec<u16>>,
307    unicode_map: Option<HashMap<u32, String>>,
308    widths: HashMap<CharCode, f64>, // should probably just use i32 here
309    missing_width: f64,
310}
311
312#[derive(Clone)]
313struct PdfType3Font<'a> {
314    font: &'a Dictionary,
315    doc: &'a Document,
316    encoding: Option<Vec<u16>>,
317    unicode_map: Option<HashMap<CharCode, String>>,
318    widths: HashMap<CharCode, f64>, // should probably just use i32 here
319}
320
321
322fn make_font<'a>(doc: &'a Document, font: &'a Dictionary) -> Rc<dyn PdfFont + 'a> {
323    let subtype = get_name_string(doc, font, b"Subtype");
324    dlog!("MakeFont({})", subtype);
325    if subtype == "Type0" {
326        Rc::new(PdfCIDFont::new(doc, font))
327    } else if subtype == "Type3" {
328        Rc::new(PdfType3Font::new(doc, font))
329    } else {
330        Rc::new(PdfSimpleFont::new(doc, font))
331    }
332}
333
/// Reports whether `name` is exactly one of the fourteen font names listed below.
/// The comparison is case-sensitive.
fn is_core_font(name: &str) -> bool {
    const CORE_FONT_NAMES: [&str; 14] = [
        "Courier-Bold",
        "Courier-BoldOblique",
        "Courier-Oblique",
        "Courier",
        "Helvetica-Bold",
        "Helvetica-BoldOblique",
        "Helvetica-Oblique",
        "Helvetica",
        "Symbol",
        "Times-Bold",
        "Times-BoldItalic",
        "Times-Italic",
        "Times-Roman",
        "ZapfDingbats",
    ];
    CORE_FONT_NAMES.contains(&name)
}
353
354fn encoding_to_unicode_table(name: &[u8]) -> Vec<u16> {
355    let encoding = match &name[..] {
356        b"MacRomanEncoding" => encodings::MAC_ROMAN_ENCODING,
357        b"MacExpertEncoding" => encodings::MAC_EXPERT_ENCODING,
358        b"WinAnsiEncoding" => encodings::WIN_ANSI_ENCODING,
359        _ => panic!("unexpected encoding {:?}", pdf_to_utf8(name))
360    };
361    let encoding_table = encoding.iter()
362        .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
363        .collect();
364    encoding_table
365}
366
367/* "Glyphs in the font are selected by single-byte character codes obtained from a string that
368    is shown by the text-showing operators. Logically, these codes index into a table of 256
369    glyphs; the mapping from codes to glyphs is called the font’s encoding. Each font program
370    has a built-in encoding. Under some circumstances, the encoding can be altered by means
371    described in Section 5.5.5, “Character Encoding.”
372*/
373impl<'a> PdfSimpleFont<'a> {
374    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfSimpleFont<'a> {
375        let base_name = get_name_string(doc, font, b"BaseFont");
376        let subtype = get_name_string(doc, font, b"Subtype");
377
378        let encoding: Option<&Object> = get(doc, font, b"Encoding");
379        dlog!("base_name {} {} enc:{:?} {:?}", base_name, subtype, encoding, font);
380        let descriptor: Option<&Dictionary> = get(doc, font, b"FontDescriptor");
381        let mut type1_encoding = None;
382        let mut unicode_map = None;
383        if let Some(descriptor) = descriptor {
384            dlog!("descriptor {:?}", descriptor);
385            if subtype == "Type1" {
386                let file = maybe_get_obj(doc, descriptor, b"FontFile");
387                match file {
388                    Some(&Object::Stream(ref s)) => {
389                        let s = get_contents(s);
390                        //dlog!("font contents {:?}", pdf_to_utf8(&s));
391                        type1_encoding = Some(type1_encoding_parser::get_encoding_map(&s).expect("encoding"));
392                    }
393                    _ => { dlog!("font file {:?}", file) }
394                }
395            } else if subtype == "TrueType" {
396                let file = maybe_get_obj(doc, descriptor, b"FontFile2");
397                match file {
398                    Some(&Object::Stream(ref s)) => {
399                        let _s = get_contents(s);
400                        //File::create(format!("/tmp/{}", base_name)).unwrap().write_all(&s);
401                    }
402                    _ => { dlog!("font file {:?}", file) }
403                }
404            }
405
406            let font_file3 = get::<Option<&Object>>(doc, descriptor, b"FontFile3");
407            match font_file3 {
408                Some(&Object::Stream(ref s)) => {
409                    let subtype = get_name_string(doc, &s.dict, b"Subtype");
410                    dlog!("font file {}, {:?}", subtype, s);
411                    let s = get_contents(s);
412                    if subtype == "Type1C" {
413                        let table = cff_parser::Table::parse(&s).unwrap();
414                        //use std::io::Write;
415                        //File::create(format!("/tmp/{}", base_name)).unwrap().write_all(&s);
416                        
417                        let encoding = table.encoding.get_code_to_sid_table(&table.charset);
418
419                        let mapping: HashMap<u32, String> = encoding.into_iter().filter_map(|(cid, sid)| {
420                            let name = cff_parser::string_by_id(&table, sid).unwrap();
421                            if name == ".notdef" {
422                                return None;
423                            }
424                            let unicode = glyphnames::name_to_unicode(&name).or_else(|| {
425                                zapfglyphnames::zapfdigbats_names_to_unicode(name)
426                            });
427                            if unicode.is_none() {
428                                warn!("Couldn't find unicode for {}", name);
429                                return None;
430                            }
431                            let str = String::from_utf16(&[unicode.unwrap()]).unwrap();
432                            Some((cid as u32, str))
433                        }).collect();
434                        unicode_map = Some(mapping);
435                    }
436
437                    //
438                    //File::create(format!("/tmp/{}", base_name)).unwrap().write_all(&s);
439                }
440                None => {}
441                _ => { dlog!("unexpected") }
442            }
443
444            let charset = maybe_get_obj(doc, descriptor, b"CharSet");
445            let _charset = match charset {
446                Some(&Object::String(ref s, _)) => { Some(pdf_to_utf8(&s)) }
447                _ => { None }
448            };
449            //dlog!("charset {:?}", charset);
450        }
451
452        let mut unicode_map = match unicode_map {
453            Some(mut unicode_map) => {
454                unicode_map.extend(get_unicode_map(doc, font).unwrap_or(HashMap::new()));
455                Some(unicode_map)
456            }
457            None => {
458                get_unicode_map(doc, font)
459            }
460        };
461
462
463        let mut encoding_table = None;
464        match encoding {
465            Some(&Object::Name(ref encoding_name)) => {
466                dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
467                encoding_table = Some(encoding_to_unicode_table(encoding_name));
468            }
469            Some(&Object::Dictionary(ref encoding)) => {
470                //dlog!("Encoding {:?}", encoding);
471                let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
472                    dlog!("BaseEncoding {:?}", base_encoding);
473                    encoding_to_unicode_table(base_encoding)
474                } else {
475                    Vec::from(PDFDocEncoding)
476                };
477                let differences = maybe_get_array(doc, encoding, b"Differences");
478                if let Some(differences) = differences {
479                    dlog!("Differences");
480                    let mut code = 0;
481                    for o in differences {
482                        let o = maybe_deref(doc, o);
483                        match o {
484                            &Object::Integer(i) => { code = i; },
485                            &Object::Name(ref n) => {
486                                let name = pdf_to_utf8(&n);
487                                // XXX: names of Type1 fonts can map to arbitrary strings instead of real
488                                // unicode names, so we should probably handle this differently
489                                let unicode = glyphnames::name_to_unicode(&name);
490                                if let Some(unicode) = unicode{
491                                    table[code as usize] = unicode;
492                                    if let Some(ref mut unicode_map) = unicode_map {
493                                        let be = [unicode];
494                                        match unicode_map.entry(code as u32) {
495                                            // If there's a unicode table entry missing use one based on the name
496                                            Entry::Vacant(v) => { v.insert(String::from_utf16(&be).unwrap()); }
497                                            Entry::Occupied(e) => {
498                                                if e.get() != &String::from_utf16(&be).unwrap() {
499                                                    let normal_match  = e.get().nfkc().eq(String::from_utf16(&be).unwrap().nfkc());
500                                                    if !normal_match {
501                                                        warn!("Unicode mismatch {} {} {:?} {:?} {:?}", normal_match, name, e.get(), String::from_utf16(&be), be);
502                                                    }
503                                                }
504                                            }
505                                        }
506                                    }
507                                } else {
508                                    match unicode_map {
509                                        Some(ref mut unicode_map) if base_name.contains("FontAwesome") => {
510                                            // the fontawesome tex package will use glyph names that don't have a corresponding unicode
511                                            // code point, so we'll use an empty string instead. See issue #76
512                                            match unicode_map.entry(code as u32) {
513                                                Entry::Vacant(v) => { v.insert("".to_owned()); }
514                                                Entry::Occupied(e) => {
515                                                    panic!("unexpected entry in unicode map")
516                                                }
517                                            }
518                                        }
519                                        _ => {
520                                            warn!("unknown glyph name '{}' for font {}", name, base_name);
521                                        }
522                                    }
523                                }
524                                dlog!("{} = {} ({:?})", code, name, unicode);
525                                if let Some(ref mut unicode_map) = unicode_map {
526                                    // The unicode map might not have the code in it, but the code might
527                                    // not be used so we don't want to panic here.
528                                    // An example of this is the 'suppress' character in the TeX Latin Modern font.
529                                    // This shows up in https://arxiv.org/pdf/2405.01295v1.pdf
530                                    dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
531                                }
532                                code += 1;
533                            }
534                            _ => { panic!("wrong type {:?}", o); }
535                        }
536                    }
537                }
538                // "Type" is optional
539                let name = encoding.get(b"Type").and_then(|x| x.as_name()).and_then(|x| Ok(pdf_to_utf8(x)));
540                dlog!("name: {}", name);
541
542                encoding_table = Some(table);
543            }
544            None => {
545                if let Some(type1_encoding) = type1_encoding {
546                    let mut table = Vec::from(PDFDocEncoding);
547                    dlog!("type1encoding");
548                    for (code, name) in type1_encoding {
549                        let unicode = glyphnames::name_to_unicode(&pdf_to_utf8(&name));
550                        if let Some(unicode) = unicode {
551                            table[code as usize] = unicode;
552                        } else {
553                            dlog!("unknown character {}", pdf_to_utf8(&name));
554                        }
555                    }
556                    encoding_table = Some(table)
557                } else if subtype == "TrueType" {
558                    encoding_table = Some(encodings::WIN_ANSI_ENCODING.iter()
559                        .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
560                        .collect());
561                }
562            }
563            _ => { panic!() }
564        }
565
566        let mut width_map = HashMap::new();
567        /* "Ordinarily, a font dictionary that refers to one of the standard fonts
568            should omit the FirstChar, LastChar, Widths, and FontDescriptor entries.
569            However, it is permissible to override a standard font by including these
570            entries and embedding the font program in the PDF file."
571
572            Note: some PDFs include a descriptor but still don't include these entries */
573
574        // If we have widths prefer them over the core font widths. Needed for https://dkp.de/wp-content/uploads/parteitage/Sozialismusvorstellungen-der-DKP.pdf
575        if let (Some(first_char), Some(last_char), Some(widths)) = (maybe_get::<i64>(doc, font, b"FirstChar"), maybe_get::<i64>(doc, font, b"LastChar"), maybe_get::<Vec<f64>>(doc, font, b"Widths")) {
576            // Some PDF's don't have these like fips-197.pdf
577            let mut i: i64 = 0;
578            dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
579
580            for w in widths {
581                width_map.insert((first_char + i) as CharCode, w);
582                i += 1;
583            }
584            assert_eq!(first_char + i - 1, last_char);
585        } else {
586            let name = if is_core_font(&base_name) {
587                &base_name
588            } else {
589                warn!("no widths and not core font {:?}", base_name);
590
591                // This situation is handled differently by different readers
592                // but basically we try to substitute the best font that we can.
593
594                // Poppler/Xpdf:
595                // this is technically an error -- the Widths entry is required
596                // for all but the Base-14 fonts -- but certain PDF generators
597                // apparently don't include widths for Arial and TimesNewRoman
598
599                // Pdfium: CFX_FontMapper::FindSubstFont
600
601                // mupdf: pdf_load_substitute_font
602
603                // We can try to do a better job guessing at a font by looking at the flags
604                // or the basename but for now we'll just use Helvetica
605                "Helvetica"
606            };
607            for font_metrics in core_fonts::metrics().iter() {
608                if font_metrics.0 == base_name {
609                    if let Some(ref encoding) = encoding_table {
610                        dlog!("has encoding");
611                        for w in font_metrics.2 {
612                            let c = glyphnames::name_to_unicode(w.2).unwrap();
613                            for i in 0..encoding.len() {
614                                if encoding[i] == c {
615                                    width_map.insert(i as CharCode, w.1 as f64);
616                                }
617                            }
618                        }
619                    } else {
620                        // Instead of using the encoding from the core font we'll just look up all
621                        // of the character names. We should probably verify that this produces the
622                        // same result.
623
624                        let mut table = vec![0; 256];
625                        for w in font_metrics.2 {
626                            dlog!("{} {}", w.0, w.2);
627                            // -1 is "not encoded"
628                            if w.0 != -1 {
629                                table[w.0 as usize] = if base_name == "ZapfDingbats" {
630                                    zapfglyphnames::zapfdigbats_names_to_unicode(w.2).unwrap_or_else(|| panic!("bad name {:?}", w))
631                                } else {
632                                    glyphnames::name_to_unicode(w.2).unwrap()
633                                }
634                            }
635                        }
636
637                        let encoding = &table[..];
638                        for w in font_metrics.2 {
639                            width_map.insert(w.0 as CharCode, w.1 as f64);
640                            // -1 is "not encoded"
641                        }
642                        encoding_table = Some(encoding.to_vec());
643                    }
644                    /* "Ordinarily, a font dictionary that refers to one of the standard fonts
645                        should omit the FirstChar, LastChar, Widths, and FontDescriptor entries.
646                        However, it is permissible to override a standard font by including these
647                        entries and embedding the font program in the PDF file."
648
649                        Note: some PDFs include a descriptor but still don't include these entries */
650                    // assert!(maybe_get_obj(doc, font, b"FirstChar").is_none());
651                    // assert!(maybe_get_obj(doc, font, b"LastChar").is_none());
652                    // assert!(maybe_get_obj(doc, font, b"Widths").is_none());
653                }
654            }
655        }
656
657        let missing_width = get::<Option<f64>>(doc, font, b"MissingWidth").unwrap_or(0.);
658        PdfSimpleFont {doc, font, widths: width_map, encoding: encoding_table, missing_width, unicode_map}
659    }
660
661    #[allow(dead_code)]
662    fn get_type(&self) -> String {
663        get_name_string(self.doc, self.font, b"Type")
664    }
665    #[allow(dead_code)]
666    fn get_basefont(&self) -> String {
667        get_name_string(self.doc, self.font, b"BaseFont")
668    }
669    #[allow(dead_code)]
670    fn get_subtype(&self) -> String {
671        get_name_string(self.doc, self.font, b"Subtype")
672    }
673    #[allow(dead_code)]
674    fn get_widths(&self) -> Option<&Vec<Object>> {
675        maybe_get_obj(self.doc, self.font, b"Widths").map(|widths| widths.as_array().expect("Widths should be an array"))
676    }
677    /* For type1: This entry is obsolescent and its use is no longer recommended. (See
678     * implementation note 42 in Appendix H.) */
679    #[allow(dead_code)]
680    fn get_name(&self) -> Option<String> {
681        maybe_get_name_string(self.doc, self.font, b"Name")
682    }
683
684    #[allow(dead_code)]
685    fn get_descriptor(&self) -> Option<PdfFontDescriptor> {
686        maybe_get_obj(self.doc, self.font, b"FontDescriptor").and_then(|desc| desc.as_dict().ok()).map(|desc| PdfFontDescriptor{desc: desc, doc: self.doc})
687    }
688}
689
690
691
692impl<'a> PdfType3Font<'a> {
693    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfType3Font<'a> {
694
695        let unicode_map = get_unicode_map(doc, font);
696        let encoding: Option<&Object> = get(doc, font, b"Encoding");
697
698        let encoding_table;
699        match encoding {
700            Some(&Object::Name(ref encoding_name)) => {
701                dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
702                encoding_table = Some(encoding_to_unicode_table(encoding_name));
703            }
704            Some(&Object::Dictionary(ref encoding)) => {
705                //dlog!("Encoding {:?}", encoding);
706                let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
707                    dlog!("BaseEncoding {:?}", base_encoding);
708                    encoding_to_unicode_table(base_encoding)
709                } else {
710                    Vec::from(PDFDocEncoding)
711                };
712                let differences = maybe_get_array(doc, encoding, b"Differences");
713                if let Some(differences) = differences {
714                    dlog!("Differences");
715                    let mut code = 0;
716                    for o in differences {
717                        match o {
718                            &Object::Integer(i) => { code = i; },
719                            &Object::Name(ref n) => {
720                                let name = pdf_to_utf8(&n);
721                                // XXX: names of Type1 fonts can map to arbitrary strings instead of real
722                                // unicode names, so we should probably handle this differently
723                                let unicode = glyphnames::name_to_unicode(&name);
724                                if let Some(unicode) = unicode{
725                                    table[code as usize] = unicode;
726                                }
727                                dlog!("{} = {} ({:?})", code, name, unicode);
728                                if let Some(ref unicode_map) = unicode_map {
729                                    dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
730                                }
731                                code += 1;
732                            }
733                            _ => { panic!("wrong type"); }
734                        }
735                    }
736                }
737                let name_encoded = encoding.get(b"Type");
738                if let Ok(Object::Name(name)) = name_encoded {
739                    dlog!("name: {}", pdf_to_utf8(name));
740                } else {
741                    dlog!("name not found");
742                }
743
744                encoding_table = Some(table);
745            }
746            _ => { panic!() }
747        }
748
749        let first_char: i64 = get(doc, font, b"FirstChar");
750        let last_char: i64 = get(doc, font, b"LastChar");
751        let widths: Vec<f64> = get(doc, font, b"Widths");
752
753        let mut width_map = HashMap::new();
754
755        let mut i = 0;
756        dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
757
758        for w in widths {
759            width_map.insert((first_char + i) as CharCode, w);
760            i += 1;
761        }
762        assert_eq!(first_char + i - 1, last_char);
763        PdfType3Font {doc, font, widths: width_map, encoding: encoding_table, unicode_map}
764    }
765}
766
/// Character code produced by a font's `next_char` decoder.
type CharCode = u32;

/// Iterator over the character codes of a byte string, as decoded by `font`.
struct PdfFontIter<'a> {
    i: Iter<'a, u8>,
    font: &'a dyn PdfFont,
}

impl<'a> Iterator for PdfFontIter<'a> {
    type Item = (CharCode, u8);

    /// Delegates to `PdfFont::next_char` on the remaining bytes.
    fn next(&mut self) -> Option<Self::Item> {
        self.font.next_char(&mut self.i)
    }
}

/// Operations every font kind in this file provides.
trait PdfFont: Debug {
    /// Width for character code `id` (`show_text` divides the result by 1000).
    fn get_width(&self, id: CharCode) -> f64;

    /// Consumes bytes from `iter` and returns the next character code together with a
    /// `u8` that `show_text` treats as the code's length in bytes. `None` ends iteration.
    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)>;

    /// Text produced for a single character code.
    fn decode_char(&self, char: CharCode) -> String;
}

impl<'a> dyn PdfFont + 'a {
    /// Iterates over the character codes found in `chars`.
    fn char_codes(&'a self, chars: &'a [u8]) -> PdfFontIter<'a> {
        PdfFontIter { font: self, i: chars.iter() }
    }

    /// Decodes every character code in `chars` and concatenates the results.
    fn decode(&self, chars: &[u8]) -> String {
        self.char_codes(chars)
            .map(|(code, _)| self.decode_char(code))
            .collect()
    }
}
804
805
806impl<'a> PdfFont for PdfSimpleFont<'a> {
807    fn get_width(&self, id: CharCode) -> f64 {
808        let width = self.widths.get(&id);
809        if let Some(width) = width {
810            return *width;
811        } else {
812            let mut widths = self.widths.iter().collect::<Vec<_>>();
813            widths.sort_by_key(|x| x.0);
814            dlog!("missing width for {} len(widths) = {}, {:?} falling back to missing_width {:?}", id, self.widths.len(), widths, self.font);
815            return self.missing_width;
816        }
817    }
818    /*fn decode(&self, chars: &[u8]) -> String {
819        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
820        to_utf8(encoding, chars)
821    }*/
822
823    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
824        iter.next().map(|x| (*x as CharCode, 1))
825    }
826    fn decode_char(&self, char: CharCode) -> String {
827        let slice = [char as u8];
828        if let Some(ref unicode_map) = self.unicode_map {
829            let s = unicode_map.get(&char);
830            let s = match s {
831                None => {
832                    debug!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
833                    // some pdf's like http://arxiv.org/pdf/2312.00064v1 are missing entries in their unicode map but do have
834                    // entries in the encoding.
835                    let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
836                    let s = to_utf8(encoding, &slice);
837                    debug!("falling back to encoding {} -> {:?}", char, s);
838                    s
839                }
840                Some(s) => { s.clone() }
841            };
842            return s
843        }
844        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
845        //dlog!("char_code {:?} {:?}", char, self.encoding);
846        let s = to_utf8(encoding, &slice);
847        s
848    }
849}
850
851
852
853impl<'a> fmt::Debug for PdfSimpleFont<'a> {
854    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
855        self.font.fmt(f)
856    }
857}
858
859impl<'a> PdfFont for PdfType3Font<'a> {
860    fn get_width(&self, id: CharCode) -> f64 {
861        let width = self.widths.get(&id);
862        if let Some(width) = width {
863            return *width;
864        } else {
865            panic!("missing width for {} {:?}", id, self.font);
866        }
867    }
868    /*fn decode(&self, chars: &[u8]) -> String {
869        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
870        to_utf8(encoding, chars)
871    }*/
872
873    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
874        iter.next().map(|x| (*x as CharCode, 1))
875    }
876    fn decode_char(&self, char: CharCode) -> String {
877        let slice = [char as u8];
878        if let Some(ref unicode_map) = self.unicode_map {
879            let s = unicode_map.get(&char);
880            let s = match s {
881                None => {
882                    debug!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
883                    // some pdf's like http://arxiv.org/pdf/2312.00577v1 are missing entries in their unicode map but do have
884                    // entries in the encoding.
885                    let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
886                    let s = to_utf8(encoding, &slice);
887                    debug!("falling back to encoding {} -> {:?}", char, s);
888                    s
889                }
890                Some(s) => { s.clone() }
891            };
892            return s
893        }
894        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
895        //dlog!("char_code {:?} {:?}", char, self.encoding);
896        let s = to_utf8(encoding, &slice);
897        s
898    }
899}
900
901
902
903impl<'a> fmt::Debug for PdfType3Font<'a> {
904    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
905        self.font.fmt(f)
906    }
907}
908
/// A composite font built by `PdfCIDFont::new` from a font dictionary with `DescendantFonts`.
struct PdfCIDFont<'a> {
    /// The font dictionary; also what the `Debug` impl prints.
    font: &'a Dictionary,
    #[allow(dead_code)]
    doc: &'a Document,
    /// Codespace ranges and code-to-CID ranges that `next_char` uses to split a byte
    /// string into character codes.
    #[allow(dead_code)]
    encoding: ByteMapping,
    /// Map from character code to text, consulted by `decode_char`.
    to_unicode: Option<HashMap<u32, String>>,
    /// Per-code widths; codes without an entry fall back to `default_width`.
    widths: HashMap<CharCode, f64>, // should probably just use i32 here
    default_width: Option<f64>, // only used for CID fonts and we should probably break out the different font types
}
919
920fn get_unicode_map<'a>(doc: &'a Document, font: &'a Dictionary) -> Option<HashMap<u32, String>> {
921    let to_unicode = maybe_get_obj(doc, font, b"ToUnicode");
922    dlog!("ToUnicode: {:?}", to_unicode);
923    let mut unicode_map = None;
924    match to_unicode {
925        Some(&Object::Stream(ref stream)) => {
926            let contents = get_contents(stream);
927            dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
928
929            let cmap = adobe_cmap_parser::get_unicode_map(&contents).unwrap();
930            let mut unicode = HashMap::new();
931            // "It must use the beginbfchar, endbfchar, beginbfrange, and endbfrange operators to
932            // define the mapping from character codes to Unicode character sequences expressed in
933            // UTF-16BE encoding."
934            for (&k, v) in cmap.iter() {
935                let mut be: Vec<u16> = Vec::new();
936                let mut i = 0;
937                assert!(v.len() % 2 == 0);
938                while i < v.len() {
939                    be.push(((v[i] as u16) << 8) | v[i+1] as u16);
940                    i += 2;
941                }
942                match &be[..] {
943                    [0xd800 ..= 0xdfff] => {
944                        // this range is not specified as not being encoded
945                        // we ignore them so we don't an error from from_utt16
946                        continue;
947                    }
948                    _ => {}
949                }
950                let s = String::from_utf16(&be).unwrap();
951
952                unicode.insert(k, s);
953            }
954            unicode_map = Some(unicode);
955
956            dlog!("map: {:?}", unicode_map);
957        }
958        None => { }
959        Some(&Object::Name(ref name)) => {
960            let name = pdf_to_utf8(name);
961            if name != "Identity-H" {
962                todo!("unsupported ToUnicode name: {:?}", name);
963            }
964        }
965        _ => { panic!("unsupported cmap {:?}", to_unicode)}
966    }
967    unicode_map
968}
969
970
971impl<'a> PdfCIDFont<'a> {
972    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfCIDFont<'a> {
973        let base_name = get_name_string(doc, font, b"BaseFont");
974        let descendants = maybe_get_array(doc, font, b"DescendantFonts").expect("Descendant fonts required");
975        let ciddict = maybe_deref(doc, &descendants[0]).as_dict().expect("should be CID dict");
976        let encoding = maybe_get_obj(doc, font, b"Encoding").expect("Encoding required in type0 fonts");
977        dlog!("base_name {} {:?}", base_name, font);
978
979        let encoding = match encoding {
980            &Object::Name(ref name) => {
981                let name = pdf_to_utf8(name);
982                dlog!("encoding {:?}", name);
983                if name == "Identity-H" || name == "Identity-V" {
984                    ByteMapping { codespace: vec![CodeRange{width: 2, start: 0, end: 0xffff }], cid: vec![CIDRange{ src_code_lo: 0, src_code_hi: 0xffff, dst_CID_lo: 0 }]}
985                } else {
986                    panic!("unsupported encoding {}", name);
987                }
988            }
989            &Object::Stream(ref stream) => {
990                let contents = get_contents(stream);
991                dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
992                adobe_cmap_parser::get_byte_mapping(&contents).unwrap()
993            }
994            _ => { panic!("unsupported encoding {:?}", encoding)}
995        };
996
997        // Sometimes a Type0 font might refer to the same underlying data as regular font. In this case we may be able to extract some encoding
998        // data.
999        // We should also look inside the truetype data to see if there's a cmap table. It will help us convert as well.
1000        // This won't work if the cmap has been subsetted. A better approach might be to hash glyph contents and use that against
1001        // a global library of glyph hashes
1002        let unicode_map = get_unicode_map(doc, font);
1003
1004        dlog!("descendents {:?} {:?}", descendants, ciddict);
1005
1006        let font_dict = maybe_get_obj(doc, ciddict, b"FontDescriptor").expect("required");
1007        dlog!("{:?}", font_dict);
1008        let _f = font_dict.as_dict().expect("must be dict");
1009        let default_width = get::<Option<i64>>(doc, ciddict, b"DW").unwrap_or(1000);
1010        let w: Option<Vec<&Object>> = get(doc, ciddict, b"W");
1011        dlog!("widths {:?}", w);
1012        let mut widths = HashMap::new();
1013        let mut i = 0;
1014        if let Some(w) = w {
1015            while i < w.len() {
1016                if let &Object::Array(ref wa) = w[i+1] {
1017                    let cid = w[i].as_i64().expect("id should be num");
1018                    let mut j = 0;
1019                    dlog!("wa: {:?} -> {:?}", cid, wa);
1020                    for w in wa {
1021                        widths.insert((cid + j) as CharCode, as_num(w) );
1022                        j += 1;
1023                    }
1024                    i += 2;
1025                } else {
1026                    let c_first = w[i].as_i64().expect("first should be num");
1027                    let c_last = w[i].as_i64().expect("last should be num");
1028                    let c_width = as_num(&w[i]);
1029                    for id in c_first..c_last {
1030                        widths.insert(id as CharCode, c_width);
1031                    }
1032                    i += 3;
1033                }
1034            }
1035        }
1036        PdfCIDFont{doc, font, widths, to_unicode: unicode_map, encoding, default_width: Some(default_width as f64) }
1037    }
1038}
1039
1040impl<'a> PdfFont for PdfCIDFont<'a> {
1041    fn get_width(&self, id: CharCode) -> f64 {
1042        let width = self.widths.get(&id);
1043        if let Some(width) = width {
1044            dlog!("GetWidth {} -> {}", id, *width);
1045            return *width;
1046        } else {
1047            dlog!("missing width for {} falling back to default_width", id);
1048            return self.default_width.unwrap();
1049        }
1050    }/*
1051    fn decode(&self, chars: &[u8]) -> String {
1052        self.char_codes(chars);
1053
1054        //let utf16 = Vec::new();
1055
1056        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
1057        to_utf8(encoding, chars)
1058    }*/
1059
1060    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
1061        let mut c = *iter.next()? as u32;
1062        let mut code = None;
1063        'outer: for width in 1..=4 {
1064            for range in &self.encoding.codespace {
1065                if c as u32 >= range.start && c as u32 <= range.end && range.width == width {
1066                    code = Some((c as u32, width));
1067                    break 'outer;
1068                }
1069            }
1070            let next = *iter.next()?;
1071            c = ((c as u32) << 8) | next as u32;
1072        }
1073        let code = code?;
1074        for range in &self.encoding.cid {
1075            if code.0 >= range.src_code_lo && code.0 <= range.src_code_hi {
1076                return Some((code.0 + range.dst_CID_lo, code.1 as u8));
1077            }
1078        }
1079        None
1080    }
1081    fn decode_char(&self, char: CharCode) -> String {
1082        let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
1083        if let Some(s) = s {
1084            s.clone()
1085        } else {
1086            dlog!("Unknown character {:?} in {:?} {:?}", char, self.font, self.to_unicode);
1087            "".to_string()
1088        }
1089    }
1090}
1091
1092impl<'a> fmt::Debug for PdfCIDFont<'a> {
1093    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1094        self.font.fmt(f)
1095    }
1096}
1097
1098
1099
/// Borrowed view of a font descriptor dictionary together with its owning document.
#[derive(Copy, Clone)]
struct PdfFontDescriptor<'a> {
    /// The descriptor dictionary itself.
    desc: &'a Dictionary,
    /// Document passed to the lookup helpers when reading entries of `desc`.
    doc: &'a Document
}
1105
impl<'a> PdfFontDescriptor<'a> {
    /// Returns the descriptor's `FontFile` entry, if `maybe_get_obj` finds one.
    #[allow(dead_code)]
    fn get_file(&self) -> Option<&'a Object> {
        maybe_get_obj(self.doc, self.desc, b"FontFile")
    }
}
1112
1113impl<'a> fmt::Debug for PdfFontDescriptor<'a> {
1114    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1115        self.desc.fmt(f)
1116    }
1117}
1118
/// A sampled (FunctionType 0) function as read by `Function::new`.
#[derive(Clone, Debug)]
struct Type0Func {
    /// The `Domain` entry; `eval` treats it as pairs (two values per input).
    domain: Vec<f64>,
    /// The `Range` entry; `eval` treats it as pairs (two values per output).
    range: Vec<f64>,
    /// Contents of the function's stream, as returned by `get_contents`.
    contents: Vec<u8>,
    /// The `Size` entry.
    size: Vec<i64>,
    /// The `BitsPerSample` entry.
    bits_per_sample: i64,
    /// The `Encode` entry; defaults to `[0, size - 1]` for each element of `size`.
    encode: Vec<f64>,
    /// The `Decode` entry; defaults to a copy of `range`.
    decode: Vec<f64>,
}
1129
/// Linearly maps `x` from the interval `[x_min, x_max]` onto `[y_min, y_max]`:
///
/// `y = y_min + (x - x_min) * (y_max - y_min) / (x_max - x_min)`
///
/// When the input interval is degenerate (`x_max == x_min`) the result is `y_min`.
#[allow(dead_code)]
fn interpolate(x: f64, x_min: f64, x_max: f64, y_min: f64, y_max: f64) -> f64 {
    // The slope is taken over the width of the input interval, not over the distance
    // of `x` from `x_min`; the latter would always yield `y_max`.
    let divisor = x_max - x_min;
    if divisor != 0. {
        y_min + (x - x_min) * ((y_max - y_min) / divisor)
    } else {
        // (x_max - x_min) is 0 which means we want to discard the interpolation
        // and arbitrarily choose y_min to match pdfium
        y_min
    }
}
1141
1142impl Type0Func {
1143    #[allow(dead_code)]
1144    fn eval(&self, _input: &[f64], _output: &mut [f64]) {
1145        let _n_inputs = self.domain.len() / 2;
1146        let _n_ouputs = self.range.len() / 2;
1147
1148    }
1149}
1150
/// An exponential interpolation (FunctionType 2) function as read by `Function::new`.
#[derive(Clone, Debug)]
struct Type2Func {
    /// The optional `C0` entry.
    c0: Option<Vec<f64>>,
    /// The optional `C1` entry.
    c1: Option<Vec<f64>>,
    /// The `N` entry.
    n: f64,
}
1157
/// A PDF function object, distinguished by its `FunctionType` (see `Function::new`).
#[derive(Clone, Debug)]
enum Function {
    /// FunctionType 0: sampled function.
    Type0(Type0Func),
    /// FunctionType 2: exponential interpolation function.
    Type2(Type2Func),
    /// FunctionType 3: stitching function; no parameters are retained.
    #[allow(dead_code)]
    Type3,
    /// FunctionType 4: PostScript calculator function; holds the stream contents.
    #[allow(dead_code)]
    Type4(Vec<u8>)
}
1167
1168impl Function {
1169    fn new(doc: &Document, obj: &Object) -> Function {
1170        let dict = match obj {
1171            &Object::Dictionary(ref dict) => dict,
1172            &Object::Stream(ref stream) => &stream.dict,
1173            _ => panic!()
1174        };
1175        let function_type: i64 = get(doc, dict, b"FunctionType");
1176        let f = match function_type {
1177            0 => {
1178                // Sampled function
1179                let stream = match obj {
1180                    &Object::Stream(ref stream) => stream,
1181                    _ => panic!()
1182                };
1183                let range: Vec<f64> = get(doc, dict, b"Range");
1184                let domain: Vec<f64> = get(doc, dict, b"Domain");
1185                let contents = get_contents(stream);
1186                let size: Vec<i64> = get(doc, dict, b"Size");
1187                let bits_per_sample = get(doc, dict, b"BitsPerSample");
1188                // We ignore 'Order' like pdfium, poppler and pdf.js
1189
1190                let encode = get::<Option<Vec<f64>>>(doc, dict, b"Encode");
1191                // maybe there's some better way to write this.
1192                let encode = encode.unwrap_or_else(|| {
1193                    let mut default = Vec::new();
1194                    for i in &size {
1195                        default.extend([0., (i - 1) as f64].iter());
1196                    }
1197                    default
1198                });
1199                let decode = get::<Option<Vec<f64>>>(doc, dict, b"Decode").unwrap_or_else(|| range.clone());
1200
1201                Function::Type0(Type0Func { domain, range, size, contents, bits_per_sample, encode, decode })
1202            }
1203            2 => {
1204                // Exponential interpolation function
1205                let c0 = get::<Option<Vec<f64>>>(doc, dict, b"C0");
1206                let c1 = get::<Option<Vec<f64>>>(doc, dict, b"C1");
1207                let n = get::<f64>(doc, dict, b"N");
1208                Function::Type2(Type2Func { c0, c1, n})
1209            }
1210            3 => {
1211                // Stitching function
1212                Function::Type3
1213            }
1214            4 => {
1215                // PostScript calculator function
1216                let contents = match obj {
1217                    &Object::Stream(ref stream) => {
1218                        let contents = get_contents(stream);
1219                        warn!("unhandled type-4 function");
1220                        warn!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
1221                        contents
1222                    }
1223                    _ => { panic!("type 4 functions should be streams") }
1224                };
1225                Function::Type4(contents)
1226            }
1227            _ => { panic!("unhandled function type {}", function_type) }
1228        };
1229        f
1230    }
1231}
1232
1233fn as_num(o: &Object) -> f64 {
1234    match o {
1235        &Object::Integer(i) => { i as f64 }
1236        &Object::Real(f) => { f.into() }
1237        _ => { panic!("not a number") }
1238    }
1239}
1240
/// Text-related parameters of the graphics state, read and updated by `show_text`.
#[derive(Clone)]
struct TextState<'a>
{
    /// Currently selected font; `show_text` unwraps it, so it must be set before text is shown.
    font: Option<Rc<dyn PdfFont + 'a>>,
    /// Font size; multiplies the glyph width when advancing the text matrix.
    font_size: f64,
    /// Extra spacing added after every character.
    character_spacing: f64,
    /// Extra spacing added after a single-byte character code 32.
    word_spacing: f64,
    /// Horizontal scale factor applied to glyph placement and to the advance.
    horizontal_scaling: f64,
    leading: f64,
    /// Vertical offset used as the y translation of the text-space matrix.
    rise: f64,
    /// Text matrix; `show_text` translates it after each character.
    tm: Transform,
}
1253
1254// XXX: We'd ideally implement this without having to copy the uncompressed data
1255fn get_contents(contents: &Stream) -> Vec<u8> {
1256    if contents.filters().is_ok() {
1257        contents.decompressed_content().unwrap_or_else(|_|contents.content.clone())
1258    } else {
1259        contents.content.clone()
1260    }
1261}
1262
/// Graphics state tracked while a content stream is processed.
#[derive(Clone)]
struct GraphicsState<'a>
{
    /// Current transformation matrix; `show_text` combines it with the text matrix.
    ctm: Transform,
    /// Text state (font, spacing, text matrix, ...).
    ts: TextState<'a>,
    /// Soft mask dictionary; `apply_state` clears it for the name `None` and otherwise
    /// stores a copy of the `SMask` dictionary.
    smask: Option<Dictionary>,
    fill_colorspace: ColorSpace,
    fill_color: Vec<f64>,
    stroke_colorspace: ColorSpace,
    stroke_color: Vec<f64>,
    line_width: f64,
}
1275
1276fn show_text(gs: &mut GraphicsState, s: &[u8],
1277             _tlm: &Transform,
1278             _flip_ctm: &Transform,
1279             output: &mut dyn OutputDev) -> Result<(), OutputError> {
1280    let ts = &mut gs.ts;
1281    let font = ts.font.as_ref().unwrap();
1282    //let encoding = font.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
1283    dlog!("{:?}", font.decode(s));
1284    dlog!("{:?}", font.decode(s).as_bytes());
1285    dlog!("{:?}", s);
1286    output.begin_word()?;
1287
1288    for (c, length) in font.char_codes(s) {
1289        // 5.3.3 Text Space Details
1290        let tsm = Transform2D::row_major(ts.horizontal_scaling,
1291                                                 0.,
1292                                                 0.,
1293                                                 1.0,
1294                                                 0.,
1295                                                 ts.rise);
1296        // Trm = Tsm × Tm × CTM
1297        let trm = tsm.post_transform(&ts.tm.post_transform(&gs.ctm));
1298        //dlog!("ctm: {:?} tm {:?}", gs.ctm, tm);
1299        //dlog!("current pos: {:?}", position);
1300        // 5.9 Extraction of Text Content
1301
1302
1303        //dlog!("w: {}", font.widths[&(*c as i64)]);
1304        let w0 = font.get_width(c) / 1000.;
1305
1306        let mut spacing = ts.character_spacing;
1307        // "Word spacing is applied to every occurrence of the single-byte character code 32 in a
1308        //  string when using a simple font or a composite font that defines code 32 as a
1309        //  single-byte code. It does not apply to occurrences of the byte value 32 in
1310        //  multiple-byte codes."
1311        let is_space = c == 32 && length == 1;
1312        if is_space { spacing += ts.word_spacing }
1313
1314        output.output_character(&trm, w0, spacing, ts.font_size, &font.decode_char(c))?;
1315        let tj = 0.;
1316        let ty = 0.;
1317        let tx = ts.horizontal_scaling * ((w0 - tj/1000.)* ts.font_size + spacing);
1318        dlog!("horizontal {} adjust {} {} {} {}", ts.horizontal_scaling, tx, w0, ts.font_size, spacing);
1319        // dlog!("w0: {}, tx: {}", w0, tx);
1320        ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1321        let _trm = ts.tm.pre_transform(&gs.ctm);
1322        //dlog!("post pos: {:?}", trm);
1323
1324    }
1325    output.end_word()?;
1326    Ok(())
1327}
1328
/// A page media box given by four coordinates.
// NOTE(review): the field names suggest lower-left (`llx`, `lly`) and upper-right
// (`urx`, `ury`) corners — confirm against the code that fills this in.
#[derive(Debug, Clone, Copy)]
pub struct MediaBox {
    pub llx: f64,
    pub lly: f64,
    pub urx: f64,
    pub ury: f64
}
1336
1337fn apply_state(doc: &Document, gs: &mut GraphicsState, state: &Dictionary) {
1338    for (k, v) in state.iter() {
1339        let k : &[u8] = k.as_ref();
1340        match k {
1341            b"SMask" => { match maybe_deref(doc, v)  {
1342                &Object::Name(ref name) => {
1343                    if name == b"None" {
1344                        gs.smask = None;
1345                    } else {
1346                        panic!("unexpected smask name")
1347                    }
1348                }
1349                &Object::Dictionary(ref dict) => {
1350                    gs.smask = Some(dict.clone());
1351                }
1352                _ => { panic!("unexpected smask type {:?}", v) }
1353            }}
1354            b"Type" => { match v {
1355                &Object::Name(ref name) => {
1356                    assert_eq!(name, b"ExtGState")
1357                }
1358                _ => { panic!("unexpected type") }
1359            }}
1360            _ => {  dlog!("unapplied state: {:?} {:?}", k, v); }
1361        }
1362    }
1363
1364}
1365
/// A single path construction operation.
#[derive(Debug)]
pub enum PathOp {
    /// Start a new subpath at `(x, y)`.
    MoveTo(f64, f64),
    /// Straight segment ending at `(x, y)`.
    LineTo(f64, f64),
    // XXX: is it worth distinguishing the different kinds of curve ops?
    /// Curve segment; the last two values are its end point.
    CurveTo(f64, f64, f64, f64, f64, f64),
    /// Rectangle described by four numbers.
    Rect(f64, f64, f64, f64),
    /// Close the current subpath.
    Close,
}

/// An ordered list of path operations.
#[derive(Debug)]
pub struct Path {
    pub ops: Vec<PathOp>,
}

impl Path {
    /// Creates an empty path.
    fn new() -> Path {
        Path { ops: vec![] }
    }

    /// Returns the end point of the most recent operation.
    ///
    /// # Panics
    /// Panics if the path is empty or if the last operation is `Rect` or `Close`.
    fn current_point(&self) -> (f64, f64) {
        match *self.ops.last().unwrap() {
            PathOp::MoveTo(x, y)
            | PathOp::LineTo(x, y)
            | PathOp::CurveTo(_, _, _, _, x, y) => (x, y),
            _ => panic!(),
        }
    }
}
1394
/// Parameters of a `CalGray` color space, read from its dictionary by `make_colorspace`.
#[derive(Clone, Debug)]
pub struct CalGray {
    /// The `WhitePoint` entry.
    white_point: [f64; 3],
    // NOTE(review): `make_colorspace` reads this from the key `BackPoint`; the PDF key
    // is presumably `BlackPoint` — confirm.
    black_point: Option<[f64; 3]>,
    /// The `Gamma` entry.
    gamma: Option<f64>,
}
1401
/// Parameters of a `CalRGB` color space, read from its dictionary by `make_colorspace`.
#[derive(Clone, Debug)]
pub struct CalRGB {
    /// The `WhitePoint` entry.
    white_point: [f64; 3],
    // NOTE(review): `make_colorspace` reads this from the key `BackPoint`; the PDF key
    // is presumably `BlackPoint` — confirm.
    black_point: Option<[f64; 3]>,
    /// The `Gamma` entry.
    gamma: Option<[f64; 3]>,
    /// The `Matrix` entry.
    matrix: Option<Vec<f64>>
}
1409
/// Parameters of a `Lab` color space, read from its dictionary by `make_colorspace`.
#[derive(Clone, Debug)]
pub struct Lab {
    /// The `WhitePoint` entry.
    white_point: [f64; 3],
    // NOTE(review): `make_colorspace` reads this from the key `BackPoint`; the PDF key
    // is presumably `BlackPoint` — confirm.
    black_point: Option<[f64; 3]>,
    /// The `Range` entry.
    range: Option<[f64; 4]>,
}
1416
/// The alternate color space of a `Separation` color space.
#[derive(Clone, Debug)]
pub enum AlternateColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    /// Holds the contents of the ICC profile stream.
    ICCBased(Vec<u8>)
}
1427
/// A `Separation` color space, built by `make_colorspace` from a four-element array.
#[derive(Clone)]
pub struct Separation {
    /// Colorant name (second array element).
    name: String,
    /// Alternate color space (third array element).
    alternate_space: AlternateColorSpace,
    /// Function parsed from the fourth array element.
    tint_transform: Box<Function>,
}
1434
/// Color spaces that can be selected for filling and stroking.
#[derive(Clone)]
pub enum ColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    DeviceN,
    Pattern,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    Separation(Separation),
    /// Holds the contents of the ICC profile stream.
    ICCBased(Vec<u8>)
}
1448
1449fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary) -> ColorSpace {
1450    match name {
1451        b"DeviceGray" => ColorSpace::DeviceGray,
1452        b"DeviceRGB" => ColorSpace::DeviceRGB,
1453        b"DeviceCMYK" => ColorSpace::DeviceCMYK,
1454        b"Pattern" => ColorSpace::Pattern,
1455        _ => {
1456            let colorspaces: &Dictionary = get(&doc, resources, b"ColorSpace");
1457            let cs: &Object = maybe_get_obj(doc, colorspaces, &name[..]).unwrap_or_else(|| panic!("missing colorspace {:?}", &name[..]));
1458            if let Ok(cs) = cs.as_array() {
1459                let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1460                match cs_name.as_ref() {
1461                    "Separation" => {
1462                        let name = pdf_to_utf8(cs[1].as_name().expect("second arg must be a name"));
1463                        let alternate_space = match &maybe_deref(doc, &cs[2]) {
1464                            Object::Name(name) => {
1465                                match &name[..] {
1466                                    b"DeviceGray" => AlternateColorSpace::DeviceGray,
1467                                    b"DeviceRGB" => AlternateColorSpace::DeviceRGB,
1468                                    b"DeviceCMYK" => AlternateColorSpace::DeviceCMYK,
1469                                    _ => panic!("unexpected color space name")
1470                                }
1471                            }
1472                            Object::Array(cs) => {
1473                                let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1474                                match cs_name.as_ref() {
1475                                    "ICCBased" => {
1476                                        let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1477                                        dlog!("ICCBased {:?}", stream);
1478                                        // XXX: we're going to be continually decompressing everytime this object is referenced
1479                                        AlternateColorSpace::ICCBased(get_contents(stream))
1480                                    }
1481                                    "CalGray" => {
1482                                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1483                                        AlternateColorSpace::CalGray(CalGray {
1484                                            white_point: get(&doc, dict, b"WhitePoint"),
1485                                            black_point: get(&doc, dict, b"BackPoint"),
1486                                            gamma: get(&doc, dict, b"Gamma"),
1487                                        })
1488                                    }
1489                                    "CalRGB" => {
1490                                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1491                                        AlternateColorSpace::CalRGB(CalRGB {
1492                                            white_point: get(&doc, dict, b"WhitePoint"),
1493                                            black_point: get(&doc, dict, b"BackPoint"),
1494                                            gamma: get(&doc, dict, b"Gamma"),
1495                                            matrix: get(&doc, dict, b"Matrix"),
1496                                        })
1497                                    }
1498                                    "Lab" => {
1499                                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1500                                        AlternateColorSpace::Lab(Lab {
1501                                            white_point: get(&doc, dict, b"WhitePoint"),
1502                                            black_point: get(&doc, dict, b"BackPoint"),
1503                                            range: get(&doc, dict, b"Range"),
1504                                        })
1505                                    }
1506                                    _ => panic!("Unexpected color space name")
1507                                }
1508                            }
1509                            _ => panic!("Alternate space should be name or array {:?}", cs[2])
1510                        };
1511                        let tint_transform = Box::new(Function::new(doc, maybe_deref(doc, &cs[3])));
1512
1513                        dlog!("{:?} {:?} {:?}", name, alternate_space, tint_transform);
1514                        ColorSpace::Separation(Separation{ name, alternate_space, tint_transform})
1515                    }
1516                    "ICCBased" => {
1517                        let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1518                        dlog!("ICCBased {:?}", stream);
1519                        // XXX: we're going to be continually decompressing everytime this object is referenced
1520                        ColorSpace::ICCBased(get_contents(stream))
1521                    }
1522                    "CalGray" => {
1523                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1524                        ColorSpace::CalGray(CalGray {
1525                            white_point: get(&doc, dict, b"WhitePoint"),
1526                            black_point: get(&doc, dict, b"BackPoint"),
1527                            gamma: get(&doc, dict, b"Gamma"),
1528                        })
1529                    }
1530                    "CalRGB" => {
1531                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1532                        ColorSpace::CalRGB(CalRGB {
1533                            white_point: get(&doc, dict, b"WhitePoint"),
1534                            black_point: get(&doc, dict, b"BackPoint"),
1535                            gamma: get(&doc, dict, b"Gamma"),
1536                            matrix: get(&doc, dict, b"Matrix"),
1537                        })
1538                    }
1539                    "Lab" => {
1540                        let dict = cs[1].as_dict().expect("second arg must be a dict");
1541                        ColorSpace::Lab(Lab {
1542                            white_point: get(&doc, dict, b"WhitePoint"),
1543                            black_point: get(&doc, dict, b"BackPoint"),
1544                            range: get(&doc, dict, b"Range"),
1545                        })
1546                    }
1547                    "Pattern" => {
1548                        ColorSpace::Pattern
1549                    },
1550                    "DeviceGray" => ColorSpace::DeviceGray,
1551                    "DeviceRGB" => ColorSpace::DeviceRGB,
1552                    "DeviceCMYK" => ColorSpace::DeviceCMYK,
1553                    "DeviceN" => ColorSpace::DeviceN,
1554                    _ => {
1555                        panic!("color_space {:?} {:?} {:?}", name, cs_name, cs)
1556                    }
1557                }
1558            } else if let Ok(cs) = cs.as_name() {
1559                match pdf_to_utf8(cs).as_ref() {
1560                    "DeviceRGB" => ColorSpace::DeviceRGB,
1561                    "DeviceGray" => ColorSpace::DeviceGray,
1562                    _ => panic!()
1563                }
1564            } else {
1565                panic!();
1566            }
1567        }
1568    }
1569}
1570
/// Interprets PDF content streams and forwards the results to an `OutputDev`.
///
/// The only state kept between `process_stream` calls is the font cache.
struct Processor<'a> {
    /// Fonts already built by `make_font`, keyed by the resource name passed to
    /// the `Tf` operator.
    font_table: HashMap<Vec<u8>, Rc<dyn PdfFont + 'a>>,
    /// Zero-sized marker mentioning `'a`.
    /// NOTE(review): `font_table` already uses `'a`, so this looks redundant — confirm before removing.
    _none: PhantomData<&'a ()>,
}
1575
1576impl<'a> Processor<'a> {
1577    fn new() -> Processor<'a> {
1578        Processor { font_table: HashMap::new(), _none: PhantomData }
1579    }
1580
1581    fn process_stream(&mut self, doc: &'a Document, content: Vec<u8>, resources: &'a Dictionary, media_box: &MediaBox, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
1582        let content = Content::decode(&content).unwrap();
1583        let mut gs: GraphicsState = GraphicsState {
1584            ts: TextState {
1585                font: None,
1586                font_size: std::f64::NAN,
1587                character_spacing: 0.,
1588                word_spacing: 0.,
1589                horizontal_scaling: 100. / 100.,
1590                leading: 0.,
1591                rise: 0.,
1592                tm: Transform2D::identity(),
1593            },
1594            fill_color: Vec::new(),
1595            fill_colorspace: ColorSpace::DeviceGray,
1596            stroke_color: Vec::new(),
1597            stroke_colorspace: ColorSpace::DeviceGray,
1598            line_width: 1.,
1599            ctm: Transform2D::identity(),
1600            smask: None
1601        };
1602        //let mut ts = &mut gs.ts;
1603        let mut gs_stack = Vec::new();
1604        let mut mc_stack = Vec::new();
1605        // XXX: replace tlm with a point for text start
1606        let mut tlm = Transform2D::identity();
1607        let mut path = Path::new();
1608        let flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1609        dlog!("MediaBox {:?}", media_box);
1610        for operation in &content.operations {
1611            //dlog!("op: {:?}", operation);
1612
1613            match operation.operator.as_ref() {
1614                "BT" => {
1615                    tlm = Transform2D::identity();
1616                    gs.ts.tm = tlm;
1617                }
1618                "ET" => {
1619                    tlm = Transform2D::identity();
1620                    gs.ts.tm = tlm;
1621                }
1622                "cm" => {
1623                    assert!(operation.operands.len() == 6);
1624                    let m = Transform2D::row_major(as_num(&operation.operands[0]),
1625                                                   as_num(&operation.operands[1]),
1626                                                   as_num(&operation.operands[2]),
1627                                                   as_num(&operation.operands[3]),
1628                                                   as_num(&operation.operands[4]),
1629                                                   as_num(&operation.operands[5]));
1630                    gs.ctm = gs.ctm.pre_transform(&m);
1631                    dlog!("matrix {:?}", gs.ctm);
1632                }
1633                "CS" => {
1634                    let name = operation.operands[0].as_name().unwrap();
1635                    gs.stroke_colorspace = make_colorspace(doc, name, resources);
1636                }
1637                "cs" => {
1638                    let name = operation.operands[0].as_name().unwrap();
1639                    gs.fill_colorspace = make_colorspace(doc, name, resources);
1640                }
1641                "SC" | "SCN" => {
1642                    gs.stroke_color = match gs.stroke_colorspace {
1643                        ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1644                        _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1645                    };
1646                }
1647                "sc" | "scn" => {
1648                    gs.fill_color = match gs.fill_colorspace {
1649                        ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1650                        _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1651                    };
1652                }
1653                "G" | "g" | "RG" | "rg" | "K" | "k" => {
1654                    dlog!("unhandled color operation {:?}", operation);
1655                }
1656                "TJ" => {
1657                    match operation.operands[0] {
1658                        Object::Array(ref array) => {
1659                            for e in array {
1660                                match e {
1661                                    &Object::String(ref s, _) => {
1662                                        show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1663                                    }
1664                                    &Object::Integer(i) => {
1665                                        let ts = &mut gs.ts;
1666                                        let w0 = 0.;
1667                                        let tj = i as f64;
1668                                        let ty = 0.;
1669                                        let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1670                                        ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1671                                        dlog!("adjust text by: {} {:?}", i, ts.tm);
1672                                    }
1673                                    &Object::Real(i) => {
1674                                        let ts = &mut gs.ts;
1675                                        let w0 = 0.;
1676                                        let tj = i as f64;
1677                                        let ty = 0.;
1678                                        let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1679                                        ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1680                                        dlog!("adjust text by: {} {:?}", i, ts.tm);
1681                                    }
1682                                    _ => { dlog!("kind of {:?}", e); }
1683                                }
1684                            }
1685                        }
1686                        _ => {}
1687                    }
1688                }
1689                "Tj" => {
1690                    match operation.operands[0] {
1691                        Object::String(ref s, _) => {
1692                            show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1693                        }
1694                        _ => { panic!("unexpected Tj operand {:?}", operation) }
1695                    }
1696                }
1697                "Tc" => {
1698                    gs.ts.character_spacing = as_num(&operation.operands[0]);
1699                }
1700                "Tw" => {
1701                    gs.ts.word_spacing = as_num(&operation.operands[0]);
1702                }
1703                "Tz" => {
1704                    gs.ts.horizontal_scaling = as_num(&operation.operands[0]) / 100.;
1705                }
1706                "TL" => {
1707                    gs.ts.leading = as_num(&operation.operands[0]);
1708                }
1709                "Tf" => {
1710                    let fonts: &Dictionary = get(&doc, resources, b"Font");
1711                    let name = operation.operands[0].as_name().unwrap();
1712                    let font = self.font_table.entry(name.to_owned()).or_insert_with(|| make_font(doc, get::<&Dictionary>(doc, fonts, name))).clone();
1713                    {
1714                        /*let file = font.get_descriptor().and_then(|desc| desc.get_file());
1715                    if let Some(file) = file {
1716                        let file_contents = filter_data(file.as_stream().unwrap());
1717                        let mut cursor = Cursor::new(&file_contents[..]);
1718                        //let f = Font::read(&mut cursor);
1719                        //dlog!("font file: {:?}", f);
1720                    }*/
1721                    }
1722                    gs.ts.font = Some(font);
1723
1724                    gs.ts.font_size = as_num(&operation.operands[1]);
1725                    dlog!("font {} size: {} {:?}", pdf_to_utf8(name), gs.ts.font_size, operation);
1726                }
1727                "Ts" => {
1728                    gs.ts.rise = as_num(&operation.operands[0]);
1729                }
1730                "Tm" => {
1731                    assert!(operation.operands.len() == 6);
1732                    tlm = Transform2D::row_major(as_num(&operation.operands[0]),
1733                                                 as_num(&operation.operands[1]),
1734                                                 as_num(&operation.operands[2]),
1735                                                 as_num(&operation.operands[3]),
1736                                                 as_num(&operation.operands[4]),
1737                                                 as_num(&operation.operands[5]));
1738                    gs.ts.tm = tlm;
1739                    dlog!("Tm: matrix {:?}", gs.ts.tm);
1740                    output.end_line()?;
1741                }
1742                "Td" => {
1743                    /* Move to the start of the next line, offset from the start of the current line by (tx , ty ).
1744                   tx and ty are numbers expressed in unscaled text space units.
1745                   More precisely, this operator performs the following assignments:
1746                 */
1747                    assert!(operation.operands.len() == 2);
1748                    let tx = as_num(&operation.operands[0]);
1749                    let ty = as_num(&operation.operands[1]);
1750                    dlog!("translation: {} {}", tx, ty);
1751
1752                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1753                    gs.ts.tm = tlm;
1754                    dlog!("Td matrix {:?}", gs.ts.tm);
1755                    output.end_line()?;
1756                }
1757
1758                "TD" => {
1759                    /* Move to the start of the next line, offset from the start of the current line by (tx , ty ).
1760                   As a side effect, this operator sets the leading parameter in the text state.
1761                 */
1762                    assert!(operation.operands.len() == 2);
1763                    let tx = as_num(&operation.operands[0]);
1764                    let ty = as_num(&operation.operands[1]);
1765                    dlog!("translation: {} {}", tx, ty);
1766                    gs.ts.leading = -ty;
1767
1768                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1769                    gs.ts.tm = tlm;
1770                    dlog!("TD matrix {:?}", gs.ts.tm);
1771                    output.end_line()?;
1772                }
1773
1774                "T*" => {
1775                    let tx = 0.0;
1776                    let ty = -gs.ts.leading;
1777
1778                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1779                    gs.ts.tm = tlm;
1780                    dlog!("T* matrix {:?}", gs.ts.tm);
1781                    output.end_line()?;
1782                }
1783                "q" => { gs_stack.push(gs.clone()); }
1784                "Q" => {
1785                    let s = gs_stack.pop();
1786                    if let Some(s) = s {
1787                        gs = s;
1788                    } else {
1789                        warn!("No state to pop");
1790                    }
1791                }
1792                "gs" => {
1793                    let ext_gstate: &Dictionary = get(doc, resources, b"ExtGState");
1794                    let name = operation.operands[0].as_name().unwrap();
1795                    let state: &Dictionary = get(doc, ext_gstate, name);
1796                    apply_state(doc, &mut gs, state);
1797                }
1798                "i" => { dlog!("unhandled graphics state flattness operator {:?}", operation); }
1799                "w" => { gs.line_width = as_num(&operation.operands[0]); }
1800                "J" | "j" | "M" | "d" | "ri"  => { dlog!("unknown graphics state operator {:?}", operation); }
1801                "m" => { path.ops.push(PathOp::MoveTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1802                "l" => { path.ops.push(PathOp::LineTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1803                "c" => {
1804                    path.ops.push(PathOp::CurveTo(
1805                        as_num(&operation.operands[0]),
1806                        as_num(&operation.operands[1]),
1807                        as_num(&operation.operands[2]),
1808                        as_num(&operation.operands[3]),
1809                        as_num(&operation.operands[4]),
1810                        as_num(&operation.operands[5])))
1811                }
1812                "v" => {
1813                    let (x, y) = path.current_point();
1814                    path.ops.push(PathOp::CurveTo(
1815                        x,
1816                        y,
1817                        as_num(&operation.operands[0]),
1818                        as_num(&operation.operands[1]),
1819                        as_num(&operation.operands[2]),
1820                        as_num(&operation.operands[3])))
1821                }
1822                "y" => {
1823                    path.ops.push(PathOp::CurveTo(
1824                        as_num(&operation.operands[0]),
1825                        as_num(&operation.operands[1]),
1826                        as_num(&operation.operands[2]),
1827                        as_num(&operation.operands[3]),
1828                        as_num(&operation.operands[2]),
1829                        as_num(&operation.operands[3])))
1830                }
1831                "h" => { path.ops.push(PathOp::Close) }
1832                "re" => {
1833                    path.ops.push(PathOp::Rect(as_num(&operation.operands[0]),
1834                                               as_num(&operation.operands[1]),
1835                                               as_num(&operation.operands[2]),
1836                                               as_num(&operation.operands[3])))
1837                }
1838                "s" | "f*" | "B" | "B*" | "b" => {
1839                    dlog!("unhandled path op {:?}", operation);
1840                }
1841                "S" => {
1842                    output.stroke(&gs.ctm, &gs.stroke_colorspace, &gs.stroke_color, &path)?;
1843                    path.ops.clear();
1844                }
1845                "F" | "f" => {
1846                    output.fill(&gs.ctm, &gs.fill_colorspace, &gs.fill_color, &path)?;
1847                    path.ops.clear();
1848                }
1849                "W" | "w*" => { dlog!("unhandled clipping operation {:?}", operation); }
1850                "n" => {
1851                    dlog!("discard {:?}", path);
1852                    path.ops.clear();
1853                }
1854                "BMC" | "BDC" => {
1855                    mc_stack.push(operation);
1856                }
1857                "EMC" => {
1858                    mc_stack.pop();
1859                }
1860                "Do" => {
1861                    // `Do` process an entire subdocument, so we do a recursive call to `process_stream`
1862                    // with the subdocument content and resources
1863                    let xobject: &Dictionary = get(&doc, resources, b"XObject");
1864                    let name = operation.operands[0].as_name().unwrap();
1865                    let xf: &Stream = get(&doc, xobject, name);
1866                    let resources = maybe_get_obj(&doc, &xf.dict, b"Resources").and_then(|n| n.as_dict().ok()).unwrap_or(resources);
1867                    let contents = get_contents(xf);
1868                    self.process_stream(&doc, contents, resources, &media_box, output, page_num)?;
1869                }
1870                _ => { dlog!("unknown operation {:?}", operation); }
1871
1872            }
1873        }
1874        Ok(())
1875    }
1876}
1877
1878
/// Sink for the events produced while a page's content stream is interpreted.
///
/// `stroke` and `fill` have default implementations that do nothing, so an
/// implementor only interested in text can leave them out.
pub trait OutputDev {
    /// Starts a page. `art_box` is an optional rectangle given as four numbers.
    fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>)-> Result<(), OutputError>;
    /// Ends the page started by the last `begin_page`.
    fn end_page(&mut self)-> Result<(), OutputError>;
    /// Receives one piece of text (`char`) together with its transform.
    /// NOTE(review): `width` appears to be relative to `font_size` (the HTML
    /// backend advances by `width * font_size + spacing`) — confirm against the caller.
    fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>;
    fn begin_word(&mut self)-> Result<(), OutputError>;
    fn end_word(&mut self)-> Result<(), OutputError>;
    /// Called by the content-stream interpreter after `Tm`, `Td`, `TD` and `T*`.
    fn end_line(&mut self)-> Result<(), OutputError>;
    /// Strokes `_path`; called for the `S` operator. Does nothing by default.
    fn stroke(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
    /// Fills `_path`; called for the `f` / `F` operators. Does nothing by default.
    fn fill(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
}
1889
1890
/// `OutputDev` that writes absolutely positioned `<div>` elements to `file`.
pub struct HTMLOutput<'a>  {
    /// Destination of the generated markup.
    file: &'a mut dyn std::io::Write,
    /// Vertical flip applied to positions; set from the media box in `begin_page`.
    flip_ctm: Transform,
    /// Transform at which the next character is expected if it continues the
    /// current run.
    last_ctm: Transform,
    /// Transform of the first character of the buffered run.
    buf_ctm: Transform,
    /// Font size of the first character of the buffered run.
    buf_font_size: f64,
    /// Text accumulated for the current run, written out by `flush_string`.
    buf: String
}
1899
/// Replaces every space that HTML would collapse with `&nbsp;`.
///
/// A space is kept as a plain space only when it sits directly between two
/// non-space characters. Leading spaces, trailing spaces and every space that
/// is part of a run of spaces become `&nbsp;`.
fn insert_nbsp(input: &str) -> String {
    let chars: Vec<char> = input.chars().collect();
    // True when `idx` is in range and holds something other than a space.
    let is_word_char = |idx: usize| chars.get(idx).map_or(false, |ch| *ch != ' ');

    let mut out = String::new();
    for (idx, &ch) in chars.iter().enumerate() {
        if ch != ' ' {
            out.push(ch);
            continue;
        }
        let after_word = idx > 0 && is_word_char(idx - 1);
        let before_word = is_word_char(idx + 1);
        if after_word && before_word {
            out.push(' ');
        } else {
            out.push_str("&nbsp;");
        }
    }
    out
}
1919
1920impl<'a> HTMLOutput<'a> {
1921    pub fn new(file: &mut dyn std::io::Write) -> HTMLOutput {
1922        HTMLOutput {
1923            file,
1924            flip_ctm: Transform2D::identity(),
1925            last_ctm: Transform2D::identity(),
1926            buf_ctm: Transform2D::identity(),
1927            buf: String::new(),
1928            buf_font_size: 0.,
1929        }
1930    }
1931    fn flush_string(&mut self) -> Result<(), OutputError>{
1932        if self.buf.len() != 0 {
1933
1934            let position = self.buf_ctm.post_transform(&self.flip_ctm);
1935            let transformed_font_size_vec = self.buf_ctm.transform_vector(vec2(self.buf_font_size, self.buf_font_size));
1936            // get the length of one sized of the square with the same area with a rectangle of size (x, y)
1937            let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1938            let (x, y) = (position.m31, position.m32);
1939            warn!("flush {} {:?}", self.buf, (x,y));
1940
1941            write!(self.file, "<div style='position: absolute; left: {}px; top: {}px; font-size: {}px'>{}</div>\n",
1942                   x, y, transformed_font_size, insert_nbsp(&self.buf))?;
1943        }
1944        Ok(())
1945    }
1946}
1947
/// Art box rectangle as four numbers. The SVG backend computes its width as
/// `.2 - .0` and its height as `.3 - .1`.
type ArtBox = (f64, f64, f64, f64);
1949
1950impl<'a> OutputDev for HTMLOutput<'a> {
1951    fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
1952        write!(self.file, "<meta charset='utf-8' /> ")?;
1953        write!(self.file, "<!-- page {} -->", page_num)?;
1954        write!(self.file, "<div id='page{}' style='position: relative; height: {}px; width: {}px; border: 1px black solid'>", page_num, media_box.ury - media_box.lly, media_box.urx - media_box.llx)?;
1955        self.flip_ctm = Transform::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1956        Ok(())
1957    }
1958    fn end_page(&mut self) -> Result<(), OutputError> {
1959        self.flush_string()?;
1960        self.buf = String::new();
1961        self.last_ctm = Transform::identity();
1962        write!(self.file, "</div>")?;
1963        Ok(())
1964    }
1965    fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>{
1966        if trm.approx_eq(&self.last_ctm) {
1967            let position = trm.post_transform(&self.flip_ctm);
1968            let (x, y) = (position.m31, position.m32);
1969
1970            warn!("accum {} {:?}", char, (x,y));
1971            self.buf += char;
1972        } else {
1973            warn!("flush {} {:?} {:?} {} {} {}", char, trm, self.last_ctm, width, font_size, spacing);
1974            self.flush_string()?;
1975            self.buf = char.to_owned();
1976            self.buf_font_size = font_size;
1977            self.buf_ctm = *trm;
1978        }
1979        let position = trm.post_transform(&self.flip_ctm);
1980        let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
1981        // get the length of one sized of the square with the same area with a rectangle of size (x, y)
1982        let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1983        let (x, y) = (position.m31, position.m32);
1984        write!(self.file, "<div style='position: absolute; color: red; left: {}px; top: {}px; font-size: {}px'>{}</div>",
1985               x, y, transformed_font_size, char)?;
1986        self.last_ctm = trm.pre_transform(&Transform2D::create_translation(width * font_size + spacing, 0.));
1987
1988        Ok(())
1989    }
1990    fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
1991    fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
1992    fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
1993}
1994
/// `OutputDev` that renders filled paths as an SVG document.
pub struct SVGOutput<'a> {
    /// Destination of the generated SVG markup.
    file: &'a mut dyn std::io::Write,
}

impl<'a> SVGOutput<'a> {
    /// Creates an SVG backend that writes to `file`.
    pub fn new(file: &mut dyn std::io::Write) -> SVGOutput {
        SVGOutput { file }
    }
}
2003
2004impl<'a> OutputDev for SVGOutput<'a> {
2005    fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>) -> Result<(), OutputError> {
2006        let ver = 1.1;
2007        write!(self.file, "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n")?;
2008        if ver == 1.1 {
2009            write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">"#)?;
2010        } else {
2011            write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">"#)?;
2012        }
2013        if let Some(art_box) = art_box {
2014            let width = art_box.2 - art_box.0;
2015            let height = art_box.3 - art_box.1;
2016            let y = media_box.ury - art_box.1 - height;
2017            write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, art_box.0, y, width, height)?;
2018        } else {
2019            let width = media_box.urx - media_box.llx;
2020            let height = media_box.ury - media_box.lly;
2021            write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, media_box.llx, media_box.lly, width, height)?;
2022        }
2023        write!(self.file, "\n")?;
2024        type Mat = Transform;
2025
2026        let ctm = Mat::create_scale(1., -1.).post_translate(vec2(0., media_box.ury));
2027        write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>\n",
2028               ctm.m11,
2029               ctm.m12,
2030               ctm.m21,
2031               ctm.m22,
2032               ctm.m31,
2033               ctm.m32,
2034        )?;
2035        Ok(())
2036    }
2037    fn end_page(&mut self) -> Result<(), OutputError>{
2038        write!(self.file, "</g>\n")?;
2039        write!(self.file, "</svg>")?;
2040        Ok(())
2041    }
2042    fn output_character(&mut self, _trm: &Transform, _width: f64, _spacing: f64, _font_size: f64, _char: &str) -> Result<(), OutputError>{
2043        Ok(())
2044    }
2045    fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
2046    fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2047    fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
2048    fn fill(&mut self, ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], path: &Path) -> Result<(), OutputError>{
2049        write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>",
2050               ctm.m11,
2051               ctm.m12,
2052               ctm.m21,
2053               ctm.m22,
2054               ctm.m31,
2055               ctm.m32,
2056        )?;
2057
2058        /*if path.ops.len() == 1 {
2059            if let PathOp::Rect(x, y, width, height) = path.ops[0] {
2060                write!(self.file, "<rect x={} y={} width={} height={} />\n", x, y, width, height);
2061                write!(self.file, "</g>");
2062                return;
2063            }
2064        }*/
2065        let mut d = Vec::new();
2066        for op in &path.ops {
2067            match op {
2068                &PathOp::MoveTo(x, y) => { d.push(format!("M{} {}", x, y))}
2069                &PathOp::LineTo(x, y) => { d.push(format!("L{} {}", x, y))},
2070                &PathOp::CurveTo(x1, y1, x2, y2, x, y) => { d.push(format!("C{} {} {} {} {} {}", x1, y1, x2, y2, x, y))},
2071                &PathOp::Close => { d.push(format!("Z"))},
2072                &PathOp::Rect(x, y, width, height) => {
2073                    d.push(format!("M{} {}", x, y));
2074                    d.push(format!("L{} {}", x + width, y));
2075                    d.push(format!("L{} {}", x + width, y + height));
2076                    d.push(format!("L{} {}", x, y + height));
2077                    d.push(format!("Z"));
2078                }
2079
2080            }
2081        }
2082        write!(self.file, "<path d='{}' />", d.join(" "))?;
2083        write!(self.file, "</g>")?;
2084        write!(self.file, "\n")?;
2085        Ok(())
2086    }
2087}
2088
2089/*
2090File doesn't implement std::fmt::Write so we have
2091to do some gymnastics to accept a File or String
2092See https://github.com/rust-lang/rust/issues/51305
2093*/
2094
/// Conversion of a text destination into something implementing
/// `std::fmt::Write`.
pub trait ConvertToFmt {
    /// The `std::fmt::Write` implementation produced by `convert`.
    type Writer: std::fmt::Write;
    /// Consumes `self` and returns the writer.
    fn convert(self) -> Self::Writer;
}
2099
/// A `&mut String` is used as the writer directly, without a wrapper.
impl<'a> ConvertToFmt for &'a mut String {
    type Writer = &'a mut String;
    fn convert(self) -> Self::Writer {
        self
    }
}
2106
/// Wraps a `std::io::Write` so that it can be used as a `std::fmt::Write`.
pub struct WriteAdapter<W> {
    /// The wrapped byte writer.
    f: W,
}

impl<W: std::io::Write> std::fmt::Write for WriteAdapter<W> {
    /// Writes the UTF-8 bytes of `s` to the wrapped writer.
    ///
    /// Any I/O failure is reported as `fmt::Error`, which carries no details.
    fn write_str(&mut self, s: &str) -> Result<(), std::fmt::Error> {
        match self.f.write_all(s.as_bytes()) {
            Ok(()) => Ok(()),
            Err(_) => Err(fmt::Error),
        }
    }
}
2116
/// Any `std::io::Write` trait object is wrapped in a `WriteAdapter`.
impl<'a> ConvertToFmt for &'a mut dyn std::io::Write {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
2123
/// A `File` is wrapped in a `WriteAdapter`, since `File` does not implement
/// `std::fmt::Write` itself.
impl<'a> ConvertToFmt for &'a mut File {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
2130
/// `OutputDev` that writes the extracted text, inserting spaces and newlines
/// based on character positions.
pub struct PlainTextOutput<W: ConvertToFmt>   {
    /// Destination of the extracted text.
    writer: W::Writer,
    /// Horizontal position at which the previously written character ended.
    last_end: f64,
    /// Vertical position of the previously written character.
    last_y: f64,
    /// When set, the next character is checked for a preceding newline or
    /// space; cleared after every character.
    first_char: bool,
    /// Vertical flip applied to positions; set from the media box in `begin_page`.
    flip_ctm: Transform,
}
2138
2139impl<W: ConvertToFmt> PlainTextOutput<W> {
2140    pub fn new(writer: W) -> PlainTextOutput<W> {
2141        PlainTextOutput{
2142            writer: writer.convert(),
2143            last_end: 100000.,
2144            first_char: false,
2145            last_y: 0.,
2146            flip_ctm: Transform2D::identity(),
2147        }
2148    }
2149}
2150
2151/* There are some structural hints that PDFs can use to signal word and line endings:
2152 * however relying on these is not likely to be sufficient. */
2153impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
2154    fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
2155        self.flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
2156        Ok(())
2157    }
2158    fn end_page(&mut self) -> Result<(), OutputError> {
2159        Ok(())
2160    }
2161    fn output_character(&mut self, trm: &Transform, width: f64, _spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError> {
2162        let position = trm.post_transform(&self.flip_ctm);
2163        let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
2164        // get the length of one sized of the square with the same area with a rectangle of size (x, y)
2165        let transformed_font_size = (transformed_font_size_vec.x*transformed_font_size_vec.y).sqrt();
2166        let (x, y) = (position.m31, position.m32);
2167        use std::fmt::Write;
2168        //dlog!("last_end: {} x: {}, width: {}", self.last_end, x, width);
2169        if self.first_char {
2170            if (y - self.last_y).abs() > transformed_font_size * 1.5 {
2171                write!(self.writer, "\n")?;
2172            }
2173
2174            // we've moved to the left and down
2175            if x < self.last_end && (y - self.last_y).abs() > transformed_font_size * 0.5 {
2176                write!(self.writer, "\n")?;
2177            }
2178
2179            if x > self.last_end + transformed_font_size * 0.1 {
2180                dlog!("width: {}, space: {}, thresh: {}", width, x - self.last_end, transformed_font_size * 0.1);
2181                write!(self.writer, " ")?;
2182            }
2183        }
2184        //let norm = unicode_normalization::UnicodeNormalization::nfkc(char);
2185        write!(self.writer, "{}", char)?;
2186        self.first_char = false;
2187        self.last_y = y;
2188        self.last_end = x + width * transformed_font_size;
2189        Ok(())
2190    }
2191    fn begin_word(&mut self) -> Result<(), OutputError> {
2192        self.first_char = true;
2193        Ok(())
2194    }
2195    fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2196    fn end_line(&mut self) -> Result<(), OutputError>{
2197        //write!(self.file, "\n");
2198        Ok(())
2199    }
2200}
2201
2202
2203pub fn print_metadata(doc: &Document) {
2204    dlog!("Version: {}", doc.version);
2205    if let Some(ref info) = get_info(&doc) {
2206        for (k, v) in *info {
2207            match v {
2208                &Object::String(ref s, StringFormat::Literal) => { dlog!("{}: {}", pdf_to_utf8(k), pdf_to_utf8(s)); }
2209                _ => {}
2210            }
2211        }
2212    }
2213    dlog!("Page count: {}", get::<i64>(&doc, &get_pages(&doc), b"Count"));
2214    dlog!("Pages: {:?}", get_pages(&doc));
2215    dlog!("Type: {:?}", get_pages(&doc).get(b"Type").and_then(|x| x.as_name()).unwrap());
2216}
2217
2218/// Extract the text from a pdf at `path` and return a `String` with the results
2219pub fn extract_text<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<String, OutputError> {
2220    let mut s = String::new();
2221    {
2222        let mut output = PlainTextOutput::new(&mut s);
2223        let mut doc = Document::load(path)?;
2224        maybe_decrypt(&mut doc)?;
2225        output_doc(&doc, &mut output)?;
2226    }
2227    Ok(s)
2228}
2229
2230fn maybe_decrypt(doc: &mut Document) -> Result<(), OutputError> {
2231    if ! doc.is_encrypted() {
2232        return Ok(());
2233    }
2234
2235    if let Err(e) = doc.decrypt("") {
2236        if let Error::Decryption(DecryptionError::IncorrectPassword) = e {
2237            error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted")
2238        }
2239
2240        return Err(OutputError::PdfError(e));
2241    }
2242
2243    Ok(())
2244}
2245
2246pub fn extract_text_encrypted<P: std::convert::AsRef<std::path::Path>>(
2247    path: P,
2248    password: &str,
2249) -> Result<String, OutputError> {
2250    let mut s = String::new();
2251    {
2252        let mut output = PlainTextOutput::new(&mut s);
2253        let mut doc = Document::load(path)?;
2254        output_doc_encrypted(&mut doc, &mut output, password)?;
2255    }
2256    Ok(s)
2257}
2258
2259pub fn extract_text_from_mem(buffer: &[u8]) -> Result<String, OutputError> {
2260    let mut s = String::new();
2261    {
2262        let mut output = PlainTextOutput::new(&mut s);
2263        let mut doc = Document::load_mem(buffer)?;
2264        maybe_decrypt(&mut doc)?;
2265        output_doc(&doc, &mut output)?;
2266    }
2267    Ok(s)
2268}
2269
2270pub fn extract_text_from_mem_encrypted(
2271    buffer: &[u8],
2272    password: &str,
2273) -> Result<String, OutputError> {
2274    let mut s = String::new();
2275    {
2276        let mut output = PlainTextOutput::new(&mut s);
2277        let mut doc = Document::load_mem(buffer)?;
2278        output_doc_encrypted(&mut doc, &mut output, password)?;
2279    }
2280    Ok(s)
2281}
2282
2283
2284fn extract_text_by_page(doc: &Document, page_num: u32) -> Result<String, OutputError> {
2285    let mut s = String::new();
2286    {
2287        let mut output = PlainTextOutput::new(&mut s);
2288        output_doc_page(doc, &mut output, page_num)?;
2289    }
2290    Ok(s)
2291}
2292
2293/// Extract the text from a pdf at `path` and return a `Vec<String>` with the results separately by page
2294
2295pub fn extract_text_by_pages<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<Vec<String>, OutputError> {
2296    let mut v = Vec::new();
2297    {
2298        let mut doc = Document::load(path)?;
2299        maybe_decrypt(&mut doc)?;
2300        let mut page_num = 1;
2301        while let Ok(content) = extract_text_by_page(&doc, page_num) {
2302            v.push(content);
2303            page_num += 1;
2304        }
2305    }
2306    Ok(v)
2307}
2308
2309pub fn extract_text_by_pages_encrypted<P: std::convert::AsRef<std::path::Path>>(path: P, password: &str) -> Result<Vec<String>, OutputError> {
2310    let mut v = Vec::new();
2311    {
2312        let mut doc = Document::load(path)?;
2313        doc.decrypt(password)?;
2314        let mut page_num = 1;
2315        while let Ok(content) = extract_text_by_page(&mut doc, page_num) {
2316            v.push(content);
2317            page_num += 1;
2318        }
2319    }
2320    Ok(v)
2321}
2322
2323pub fn extract_text_from_mem_by_pages(buffer: &[u8]) -> Result<Vec<String>, OutputError> {
2324    let mut v = Vec::new();
2325    {
2326        let mut doc = Document::load_mem(buffer)?;
2327        maybe_decrypt(&mut doc)?;
2328        let mut page_num = 1;
2329        while let Ok(content) = extract_text_by_page(&doc, page_num) {
2330            v.push(content);
2331            page_num += 1;
2332        }
2333    }
2334    Ok(v)
2335}
2336
2337pub fn extract_text_from_mem_by_pages_encrypted(buffer: &[u8], password: &str) -> Result<Vec<String>, OutputError> {
2338    let mut v = Vec::new();
2339    {
2340        let mut doc = Document::load_mem(buffer)?;
2341        doc.decrypt(password)?;
2342        let mut page_num = 1;
2343        while let Ok(content) = extract_text_by_page(&doc, page_num) {
2344            v.push(content);
2345            page_num += 1;
2346        }
2347    }
2348    Ok(v)
2349}
2350
2351
2352fn get_inherited<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
2353    let o: Option<T> = get(doc, dict, key);
2354    if let Some(o) = o {
2355        Some(o)
2356    } else {
2357        let parent = dict.get(b"Parent")
2358            .and_then(|parent| parent.as_reference())
2359            .and_then(|id| doc.get_dictionary(id)).ok()?;
2360        get_inherited(doc, parent, key)
2361    }
2362}
2363
2364pub fn output_doc_encrypted(
2365    doc: &mut Document,
2366    output: &mut dyn OutputDev,
2367    password: &str,
2368) -> Result<(), OutputError> {
2369    doc.decrypt(password)?;
2370    output_doc(doc, output)
2371}
2372
2373/// Parse a given document and output it to `output`
2374pub fn output_doc(doc: &Document, output: &mut dyn OutputDev) -> Result<(), OutputError> {
2375    if doc.is_encrypted() {
2376        error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2377    }
2378    let empty_resources = Dictionary::new();
2379    let pages = doc.get_pages();
2380    let mut p = Processor::new();
2381    for dict in pages {
2382        let page_num = dict.0;
2383        let object_id = dict.1;
2384        output_doc_inner(page_num, object_id, doc, &mut p, output, &empty_resources)?;
2385    }
2386    Ok(())
2387}
2388
2389pub fn output_doc_page(doc: &Document, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
2390    if doc.is_encrypted() {
2391        error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2392    }
2393    let empty_resources = Dictionary::new();
2394    let pages = doc.get_pages();
2395    let object_id = pages.get(&page_num).ok_or(lopdf::Error::PageNumberNotFound(page_num))?;
2396    let mut p = Processor::new();
2397    output_doc_inner(page_num, *object_id, doc, &mut p, output, &empty_resources)?;
2398    Ok(())
2399}
2400
2401fn output_doc_inner<'a>(page_num: u32, object_id: ObjectId, doc: &'a Document, p: & mut Processor<'a>, output: &mut dyn OutputDev, empty_resources: &'a Dictionary) -> Result<(), OutputError> {
2402    let page_dict = doc.get_object(object_id).unwrap().as_dict().unwrap();
2403    dlog!("page {} {:?}", page_num, page_dict);
2404    // XXX: Some pdfs lack a Resources directory
2405    let resources = get_inherited(doc, page_dict, b"Resources").unwrap_or(empty_resources);
2406    dlog!("resources {:?}", resources);
2407    // pdfium searches up the page tree for MediaBoxes as needed
2408    let media_box: Vec<f64> = get_inherited(doc, page_dict, b"MediaBox").expect("MediaBox");
2409    let media_box = MediaBox { llx: media_box[0], lly: media_box[1], urx: media_box[2], ury: media_box[3] };
2410    let art_box = get::<Option<Vec<f64>>>(&doc, page_dict, b"ArtBox")
2411        .map(|x| (x[0], x[1], x[2], x[3]));
2412    output.begin_page(page_num, &media_box, art_box)?;
2413    p.process_stream(&doc, doc.get_page_content(object_id).unwrap(), resources, &media_box, output, page_num)?;
2414    output.end_page()?;
2415    Ok(())
2416}