pdf_extract_temporary_mitigation_panic/
lib.rs

1use euclid::*;
2use lopdf::content::Content;
3pub use lopdf::*;
4use std::fmt::Debug;
5extern crate adobe_cmap_parser;
6extern crate encoding;
7extern crate euclid;
8extern crate type1_encoding_parser;
9extern crate unicode_normalization;
10use encoding::all::UTF_16BE;
11use encoding::{DecoderTrap, Encoding};
12use error::{OutputError, Res};
13use euclid::vec2;
14use std::collections::hash_map::Entry;
15use std::collections::HashMap;
16use std::f32::consts::E;
17use std::fmt;
18use std::fs::File;
19use std::marker::PhantomData;
20use std::rc::Rc;
21use std::result::Result;
22use std::slice::Iter;
23use std::str;
24mod core_fonts;
25mod encodings;
26mod error;
27mod glyphnames;
28mod zapfglyphnames;
29
/// Zero-sized tag type; `Transform` uses it as both unit parameters of its `Transform2D`.
pub struct Space;
31pub type Transform = Transform2D<f64, Space, Space>;
32
// Debug-logging hook used throughout this file.
//
// With the active arm nothing is printed, but every argument expression is still evaluated
// (each one is bound to `_`), so arguments must not panic or have unwanted side effects.
macro_rules! dlog {
    // Throw away all debug logging
    ($($arg:expr),*) => {{
        $( let _ = $arg; )*
    }};
    // Enable debug logging
    // ($($t:tt)*) => { println!($($t)*) }
}
39
40fn get_info(doc: &Document) -> Option<&Dictionary> {
41    if let Ok(&Object::Reference(ref id)) = doc.trailer.get(b"Info") {
42        if let Ok(&Object::Dictionary(ref info)) = doc.get_object(*id) {
43            return Some(info);
44        }
45    }
46    None
47}
48
49fn get_catalog(doc: &Document) -> Res<&Dictionary> {
50    if let &Object::Reference(ref id) = doc.trailer.get(b"Root")? {
51        if let Ok(&Object::Dictionary(ref catalog)) = doc.get_object(*id) {
52            return Ok(catalog);
53        }
54    }
55    Err("No catalog / Root found".into())
56}
57
58fn get_pages(doc: &Document) -> Res<&Dictionary> {
59    let catalog = get_catalog(doc)?;
60    match catalog.get(b"Pages")? {
61        &Object::Reference(ref id) => match doc.get_object(*id) {
62            Ok(&Object::Dictionary(ref pages)) => {
63                return Ok(pages);
64            }
65            other => {
66                dlog!("pages: {:?}", other)
67            }
68        },
69        other => {
70            dlog!("pages: {:?}", other)
71        }
72    }
73    dlog!("catalog {:?}", catalog);
74    Err("No pages found".into())
75}
76
/// Lookup table from a single byte to a UTF-16 code unit, indexed by the byte value.
///
/// `pdf_to_utf8` uses it for strings that do not start with a UTF-16BE byte-order mark.
/// The `0x0000` entries at 0x7f, 0x9f and 0xad presumably mark codes the encoding leaves
/// undefined.
#[allow(non_upper_case_globals)]
const PDFDocEncoding: &[u16] = &[
    // 0x00
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    // 0x10
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
    0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
    // 0x20
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
    // 0x30
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
    // 0x40
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
    // 0x50
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
    // 0x60
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
    // 0x70
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
    // 0x80
    0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
    0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
    // 0x90
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
    0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
    // 0xa0
    0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
    // 0xb0
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
    // 0xc0
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
    // 0xd0
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
    // 0xe0
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
    // 0xf0
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
];
102
103fn pdf_to_utf8(s: &[u8]) -> Res<String> {
104    if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
105        return UTF_16BE
106            .decode(&s[2..], DecoderTrap::Strict)
107            .map_err(|_| "pdf decode err".into());
108    } else {
109        let r: Vec<u8> = s
110            .iter()
111            .copied()
112            .flat_map(|x| {
113                let k = PDFDocEncoding[x as usize];
114                vec![(k >> 8) as u8, k as u8].into_iter()
115            })
116            .collect();
117        return UTF_16BE
118            .decode(&r, DecoderTrap::Strict)
119            .map_err(|_| "pdf decode err".into());
120    }
121}
122
123fn to_utf8(encoding: &[u16], s: &[u8]) -> Res<String> {
124    if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
125        return UTF_16BE
126            .decode(&s[2..], DecoderTrap::Strict)
127            .map_err(|_| "utf_16BE decode err".into());
128    } else {
129        let r: Vec<u8> = s
130            .iter()
131            .copied()
132            .flat_map(|x| {
133                let k = encoding[x as usize];
134                vec![(k >> 8) as u8, k as u8].into_iter()
135            })
136            .collect();
137        return UTF_16BE
138            .decode(&r, DecoderTrap::Strict)
139            .map_err(|_| "utf_16BE decode err".into());
140    }
141}
142
143fn maybe_deref<'a>(doc: &'a Document, o: &'a Object) -> &'a Object {
144    match o {
145        &Object::Reference(r) => doc.get_object(r).expect("missing object reference"),
146        _ => o,
147    }
148}
149
150fn maybe_get_obj<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Object> {
151    dict.get(key).map(|o| maybe_deref(doc, o)).ok()
152}
153
154// an intermediate trait that can be used to chain conversions that may have failed
155trait FromOptObj<'a> {
156    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self;
157}
158
159// conditionally convert to Self returns None if the conversion failed
160trait FromObj<'a>
161where
162    Self: std::marker::Sized,
163{
164    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self>;
165}
166
167impl<'a, T: FromObj<'a>> FromOptObj<'a> for Option<T> {
168    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, _key: &[u8]) -> Self {
169        obj.and_then(|x| T::from_obj(doc, x))
170    }
171}
172
173impl<'a, T: FromObj<'a>> FromOptObj<'a> for T {
174    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self {
175        T::from_obj(
176            doc,
177            obj.unwrap_or_else(|| panic!("{}", String::from_utf8_lossy(key).to_string())),
178        )
179        .expect("wrong type")
180    }
181}
182
183// we follow the same conventions as pdfium for when to support indirect objects:
184// on arrays, streams and dicts
185impl<'a, T: FromObj<'a>> FromObj<'a> for Vec<T> {
186    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
187        maybe_deref(doc, obj)
188            .as_array()
189            .map(|x| {
190                x.iter()
191                    .map(|x| T::from_obj(doc, x).expect("wrong type"))
192                    .collect()
193            })
194            .ok()
195    }
196}
197
198// XXX: These will panic if we don't have the right number of items
199// we don't want to do that
200impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 4] {
201    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
202        maybe_deref(doc, obj)
203            .as_array()
204            .map(|x| {
205                let mut all = x.iter().map(|x| T::from_obj(doc, x).expect("wrong type"));
206                [
207                    all.next().unwrap(),
208                    all.next().unwrap(),
209                    all.next().unwrap(),
210                    all.next().unwrap(),
211                ]
212            })
213            .ok()
214    }
215}
216
217impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 3] {
218    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
219        maybe_deref(doc, obj)
220            .as_array()
221            .map(|x| {
222                let mut all = x.iter().map(|x| T::from_obj(doc, x).expect("wrong type"));
223                [
224                    all.next().unwrap(),
225                    all.next().unwrap(),
226                    all.next().unwrap(),
227                ]
228            })
229            .ok()
230    }
231}
232
233impl<'a> FromObj<'a> for f64 {
234    fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
235        match *obj {
236            Object::Integer(i) => Some(i as f64),
237            Object::Real(f) => Some(f as f64),
238            _ => None,
239        }
240    }
241}
242
243impl<'a> FromObj<'a> for i64 {
244    fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
245        match obj {
246            &Object::Integer(i) => Some(i),
247            _ => None,
248        }
249    }
250}
251
252impl<'a> FromObj<'a> for &'a Dictionary {
253    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
254        maybe_deref(doc, obj).as_dict().ok()
255    }
256}
257
258impl<'a> FromObj<'a> for &'a Stream {
259    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Stream> {
260        maybe_deref(doc, obj).as_stream().ok()
261    }
262}
263
264impl<'a> FromObj<'a> for &'a Object {
265    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
266        Some(maybe_deref(doc, obj))
267    }
268}
269
270fn get<'a, T: FromOptObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> T {
271    T::from_opt_obj(doc, dict.get(key).ok(), key)
272}
273
274fn get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Res<String> {
275    pdf_to_utf8(dict.get(key).map(|o| maybe_deref(doc, o))?.as_name()?)
276}
277
278#[allow(dead_code)]
279fn maybe_get_name_string<'a>(
280    doc: &'a Document,
281    dict: &'a Dictionary,
282    key: &[u8],
283) -> Option<String> {
284    let ob = maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok());
285    if let Some(ob) = ob {
286        pdf_to_utf8(ob).ok()
287    } else {
288        None
289    }
290}
291
292fn maybe_get_name<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a [u8]> {
293    maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok())
294}
295
296fn maybe_get_array<'a>(
297    doc: &'a Document,
298    dict: &'a Dictionary,
299    key: &[u8],
300) -> Option<&'a Vec<Object>> {
301    maybe_get_obj(doc, dict, key).and_then(|n| n.as_array().ok())
302}
303
304#[derive(Clone)]
305struct PdfSimpleFont<'a> {
306    font: &'a Dictionary,
307    doc: &'a Document,
308    encoding: Option<Vec<u16>>,
309    unicode_map: Option<HashMap<u32, String>>,
310    widths: HashMap<CharCode, f64>, // should probably just use i32 here
311    default_width: Option<f64>, // only used for CID fonts and we should probably brake out the different font types
312}
313
314#[derive(Clone)]
315struct PdfType3Font<'a> {
316    font: &'a Dictionary,
317    doc: &'a Document,
318    encoding: Option<Vec<u16>>,
319    unicode_map: Option<HashMap<u32, String>>,
320    widths: HashMap<CharCode, f64>, // should probably just use i32 here
321}
322
323fn make_font<'a>(doc: &'a Document, font: &'a Dictionary) -> Res<Rc<dyn PdfFont + 'a>> {
324    let subtype = get_name_string(doc, font, b"Subtype")?;
325    dlog!("MakeFont({})", subtype);
326    if subtype == "Type0" {
327        Ok(Rc::new(PdfCIDFont::new(doc, font)?))
328    } else if subtype == "Type3" {
329        Ok(Rc::new(PdfType3Font::new(doc, font)?))
330    } else {
331        Ok(Rc::new(PdfSimpleFont::new(doc, font)?))
332    }
333}
334
/// Returns `true` when `name` is exactly one of the fourteen font names listed below.
/// The comparison is case-sensitive.
fn is_core_font(name: &str) -> bool {
    const CORE_FONTS: [&str; 14] = [
        "Courier-Bold",
        "Courier-BoldOblique",
        "Courier-Oblique",
        "Courier",
        "Helvetica-Bold",
        "Helvetica-BoldOblique",
        "Helvetica-Oblique",
        "Helvetica",
        "Symbol",
        "Times-Bold",
        "Times-BoldItalic",
        "Times-Italic",
        "Times-Roman",
        "ZapfDingbats",
    ];
    CORE_FONTS.contains(&name)
}
354
355fn encoding_to_unicode_table(name: &[u8]) -> Res<Vec<u16>> {
356    let encoding = match name {
357        b"MacRomanEncoding" => encodings::MAC_ROMAN_ENCODING,
358        b"MacExpertEncoding" => encodings::MAC_EXPERT_ENCODING,
359        b"WinAnsiEncoding" => encodings::WIN_ANSI_ENCODING,
360        _ => return Err(format!("unexpected encoding {:?}", pdf_to_utf8(name)).into()),
361    };
362    let encoding_table = encoding
363        .iter()
364        .map(|x| {
365            if let &Some(x) = x {
366                glyphnames::name_to_unicode(x).unwrap()
367            } else {
368                0
369            }
370        })
371        .collect();
372    Ok(encoding_table)
373}
374
375/* "Glyphs in the font are selected by single-byte character codes obtained from a string that
376    is shown by the text-showing operators. Logically, these codes index into a table of 256
377    glyphs; the mapping from codes to glyphs is called the font’s encoding. Each font program
378    has a built-in encoding. Under some circumstances, the encoding can be altered by means
379    described in Section 5.5.5, “Character Encoding.”
380*/
381impl<'a> PdfSimpleFont<'a> {
382    fn new(doc: &'a Document, font: &'a Dictionary) -> Res<PdfSimpleFont<'a>> {
383        let base_name = get_name_string(doc, font, b"BaseFont")?;
384        let subtype = get_name_string(doc, font, b"Subtype")?;
385
386        let encoding: Option<&Object> = get(doc, font, b"Encoding");
387        dlog!(
388            "base_name {} {} enc:{:?} {:?}",
389            base_name,
390            subtype,
391            encoding,
392            font
393        );
394        let descriptor: Option<&Dictionary> = get(doc, font, b"FontDescriptor");
395        let mut type1_encoding = None;
396        if let Some(descriptor) = descriptor {
397            dlog!("descriptor {:?}", descriptor);
398            if subtype == "Type1" {
399                let file = maybe_get_obj(doc, descriptor, b"FontFile");
400                match file {
401                    Some(&Object::Stream(ref s)) => {
402                        let s = get_contents(s);
403                        //dlog!("font contents {:?}", pdf_to_utf8(&s));
404                        type1_encoding =
405                            Some(type1_encoding_parser::get_encoding_map(&s).expect("encoding"));
406                    }
407                    _ => {
408                        dlog!("font file {:?}", file)
409                    }
410                }
411            } else if subtype == "TrueType" {
412                let file = maybe_get_obj(doc, descriptor, b"FontFile2");
413                match file {
414                    Some(&Object::Stream(ref s)) => {
415                        let _s = get_contents(s);
416                        //File::create(format!("/tmp/{}", base_name)).unwrap().write_all(&s);
417                    }
418                    _ => {
419                        dlog!("font file {:?}", file)
420                    }
421                }
422            }
423
424            let font_file3 = get::<Option<&Object>>(doc, descriptor, b"FontFile3");
425            match font_file3 {
426                Some(&Object::Stream(ref s)) => {
427                    dlog!("font file {:?}", s);
428                }
429                None => {}
430                _ => {
431                    dlog!("unexpected")
432                }
433            }
434
435            let charset = maybe_get_obj(doc, descriptor, b"CharSet");
436            let _charset = match charset {
437                Some(&Object::String(ref s, _)) => Some(pdf_to_utf8(s)),
438                _ => None,
439            };
440            //dlog!("charset {:?}", charset);
441        }
442
443        let mut unicode_map = get_unicode_map(doc, font)?;
444
445        let mut encoding_table = None;
446        match encoding {
447            Some(&Object::Name(ref encoding_name)) => {
448                dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
449                encoding_table = Some(encoding_to_unicode_table(encoding_name)?);
450            }
451            Some(&Object::Dictionary(ref encoding)) => {
452                //dlog!("Encoding {:?}", encoding);
453                let mut table =
454                    if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
455                        dlog!("BaseEncoding {:?}", base_encoding);
456                        encoding_to_unicode_table(base_encoding)?
457                    } else {
458                        Vec::from(PDFDocEncoding)
459                    };
460                let differences = maybe_get_array(doc, encoding, b"Differences");
461                if let Some(differences) = differences {
462                    dlog!("Differences");
463                    let mut code = 0;
464                    for o in differences {
465                        let o = maybe_deref(doc, o);
466                        match *o {
467                            Object::Integer(i) => {
468                                code = i;
469                            }
470                            Object::Name(ref n) => {
471                                let name = pdf_to_utf8(n)?;
472                                // XXX: names of Type1 fonts can map to arbitrary strings instead of real
473                                // unicode names, so we should probably handle this differently
474                                let unicode = glyphnames::name_to_unicode(&name);
475                                if let Some(unicode) = unicode {
476                                    table[code as usize] = unicode;
477                                    if let Some(ref mut unicode_map) = unicode_map {
478                                        let be = [unicode];
479                                        match unicode_map.entry(code as u32) {
480                                            // If there's a unicode table entry missing use one based on the name
481                                            Entry::Vacant(v) => {
482                                                v.insert(
483                                                    String::from_utf16(&be)
484                                                        .map_err(|_| "utf16 err")?,
485                                                );
486                                            }
487                                            Entry::Occupied(e) => {
488                                                if e.get()
489                                                    != &String::from_utf16(&be)
490                                                        .map_err(|_| "utf16 err")?
491                                                {
492                                                    println!("Unicode mismatch");
493                                                }
494                                            }
495                                        }
496                                    }
497                                }
498                                dlog!("{} = {} ({:?})", code, name, unicode);
499                                if let Some(ref mut unicode_map) = unicode_map {
500                                    dlog!("{} {}", code, unicode_map[&(code as u32)]);
501                                }
502                                code += 1;
503                            }
504                            _ => {
505                                return Err(format!("wrong type {:?}", o).into());
506                            }
507                        }
508                    }
509                }
510                let name = pdf_to_utf8(encoding.get(b"Type")?.as_name()?);
511                dlog!("name: {}", name?);
512
513                encoding_table = Some(table);
514            }
515            None => {
516                if let Some(type1_encoding) = type1_encoding {
517                    let mut table = Vec::from(PDFDocEncoding);
518                    dlog!("type1encoding");
519                    for (code, name) in type1_encoding {
520                        let unicode = glyphnames::name_to_unicode(&pdf_to_utf8(&name)?);
521                        if let Some(unicode) = unicode {
522                            table[code as usize] = unicode;
523                        } else {
524                            dlog!("unknown character {}", pdf_to_utf8(&name)?);
525                        }
526                    }
527                    encoding_table = Some(table)
528                } else if subtype == "TrueType" {
529                    encoding_table = Some(
530                        encodings::WIN_ANSI_ENCODING
531                            .iter()
532                            .map(|x| {
533                                x.and_then(glyphnames::name_to_unicode).unwrap_or_else(|| {
534                                    dlog!("unknown character {:?}", x);
535                                    0
536                                })
537                            })
538                            .collect(),
539                    );
540                }
541            }
542            _ => return Err("unexpected encoding type".into()),
543        }
544
545        let mut width_map = HashMap::new();
546        if is_core_font(&base_name) {
547            for font_metrics in core_fonts::metrics().iter() {
548                if font_metrics.0 == base_name {
549                    if let Some(ref encoding) = encoding_table {
550                        dlog!("has encoding");
551                        for w in font_metrics.2 {
552                            let c = glyphnames::name_to_unicode(w.2).unwrap();
553                            for i in 0..encoding.len() {
554                                if encoding[i] == c {
555                                    width_map.insert(i as CharCode, w.1);
556                                }
557                            }
558                        }
559                    } else {
560                        // Instead of using the encoding from the core font we'll just look up all
561                        // of the character names. We should probably verify that this produces the
562                        // same result.
563
564                        let mut table = vec![0; 256];
565                        for w in font_metrics.2 {
566                            dlog!("{} {}", w.0, w.2);
567                            // -1 is "not encoded"
568                            if w.0 != -1 {
569                                table[w.0 as usize] = if base_name == "ZapfDingbats" {
570                                    zapfglyphnames::zapfdigbats_names_to_unicode(w.2)
571                                        .ok_or(format!("Bad zapfdigbads name: {:?}", w))?
572                                } else {
573                                    glyphnames::name_to_unicode(w.2).unwrap()
574                                }
575                            }
576                        }
577
578                        let encoding = &table[..];
579                        for w in font_metrics.2 {
580                            width_map.insert(w.0 as CharCode, w.1);
581                            // -1 is "not encoded"
582                        }
583                        encoding_table = Some(encoding.to_vec());
584                    }
585                    /* "Ordinarily, a font dictionary that refers to one of the standard fonts
586                    should omit the FirstChar, LastChar, Widths, and FontDescriptor entries.
587                    However, it is permissible to override a standard font by including these
588                    entries and embedding the font program in the PDF file."
589
590                    Note: some PDFs include a descriptor but still don't include these entries */
591                    // assert!(maybe_get_obj(doc, font, "FirstChar").is_none());
592                    // assert!(maybe_get_obj(doc, font, "LastChar").is_none());
593                    // assert!(maybe_get_obj(doc, font, "Widths").is_none());
594                }
595            }
596        } else {
597            // Some PDF's don't have these like fips-197.pdf
598            let first_char: i64 = get(doc, font, b"FirstChar");
599            let last_char: i64 = get(doc, font, b"LastChar");
600            let widths: Vec<f64> = get(doc, font, b"Widths");
601            let mut i = 0;
602            dlog!(
603                "first_char {:?}, last_char: {:?}, widths: {} {:?}",
604                first_char,
605                last_char,
606                widths.len(),
607                widths
608            );
609
610            for w in widths {
611                width_map.insert((first_char + i) as CharCode, w);
612                i += 1;
613            }
614            assert_eq!(first_char + i - 1, last_char);
615        }
616
617        Ok(PdfSimpleFont {
618            doc,
619            font,
620            widths: width_map,
621            encoding: encoding_table,
622            default_width: None,
623            unicode_map,
624        })
625    }
626
627    #[allow(dead_code)]
628    fn get_type(&self) -> Res<String> {
629        get_name_string(self.doc, self.font, b"Type")
630    }
631    #[allow(dead_code)]
632    fn get_basefont(&self) -> Res<String> {
633        get_name_string(self.doc, self.font, b"BaseFont")
634    }
635    #[allow(dead_code)]
636    fn get_subtype(&self) -> Res<String> {
637        get_name_string(self.doc, self.font, b"Subtype")
638    }
639    #[allow(dead_code)]
640    fn get_widths(&self) -> Option<&Vec<Object>> {
641        maybe_get_obj(self.doc, self.font, b"Widths")
642            .map(|widths| widths.as_array().expect("Widths should be an array"))
643    }
644    /* For type1: This entry is obsolescent and its use is no longer recommended. (See
645     * implementation note 42 in Appendix H.) */
646    #[allow(dead_code)]
647    fn get_name(&self) -> Option<String> {
648        maybe_get_name_string(self.doc, self.font, b"Name")
649    }
650
651    #[allow(dead_code)]
652    fn get_descriptor(&self) -> Option<PdfFontDescriptor> {
653        maybe_get_obj(self.doc, self.font, b"FontDescriptor")
654            .and_then(|desc| desc.as_dict().ok())
655            .map(|desc| PdfFontDescriptor {
656                desc,
657                doc: self.doc,
658            })
659    }
660}
661
662impl<'a> PdfType3Font<'a> {
663    fn new(doc: &'a Document, font: &'a Dictionary) -> Res<PdfType3Font<'a>> {
664        let unicode_map = get_unicode_map(doc, font)?;
665        let encoding: Option<&Object> = get(doc, font, b"Encoding");
666
667        let encoding_table;
668        match encoding {
669            Some(&Object::Name(ref encoding_name)) => {
670                dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
671                encoding_table = Some(encoding_to_unicode_table(encoding_name)?);
672            }
673            Some(&Object::Dictionary(ref encoding)) => {
674                //dlog!("Encoding {:?}", encoding);
675                let mut table =
676                    if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
677                        dlog!("BaseEncoding {:?}", base_encoding);
678                        encoding_to_unicode_table(base_encoding)?
679                    } else {
680                        Vec::from(PDFDocEncoding)
681                    };
682                let differences = maybe_get_array(doc, encoding, b"Differences");
683                if let Some(differences) = differences {
684                    dlog!("Differences");
685                    let mut code = 0;
686                    for o in differences {
687                        match *o {
688                            Object::Integer(i) => {
689                                code = i;
690                            }
691                            Object::Name(ref n) => {
692                                let name = pdf_to_utf8(n)?;
693                                // XXX: names of Type1 fonts can map to arbitrary strings instead of real
694                                // unicode names, so we should probably handle this differently
695                                let unicode = glyphnames::name_to_unicode(&name);
696                                if let Some(unicode) = unicode {
697                                    table[code as usize] = unicode;
698                                }
699                                dlog!("{} = {} ({:?})", code, name, unicode);
700                                if let Some(ref unicode_map) = unicode_map {
701                                    dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
702                                }
703                                code += 1;
704                            }
705                            _ => {
706                                return Err("wrong type".into());
707                            }
708                        }
709                    }
710                }
711                let name_encoded = encoding.get(b"Type");
712                if let Ok(Object::Name(name)) = name_encoded {
713                    dlog!("name: {}", pdf_to_utf8(name)?);
714                } else {
715                    dlog!("name not found");
716                }
717
718                encoding_table = Some(table);
719            }
720            _ => {
721                return Err("Wrong encoding".into());
722            }
723        }
724
725        let first_char: i64 = get(doc, font, b"FirstChar");
726        let last_char: i64 = get(doc, font, b"LastChar");
727        let widths: Vec<f64> = get(doc, font, b"Widths");
728
729        let mut width_map = HashMap::new();
730
731        let mut i = 0;
732        dlog!(
733            "first_char {:?}, last_char: {:?}, widths: {} {:?}",
734            first_char,
735            last_char,
736            widths.len(),
737            widths
738        );
739
740        for w in widths {
741            width_map.insert((first_char + i) as CharCode, w);
742            i += 1;
743        }
744        assert_eq!(first_char + i - 1, last_char);
745        Ok(PdfType3Font {
746            doc,
747            font,
748            widths: width_map,
749            encoding: encoding_table,
750            unicode_map,
751        })
752    }
753}
754
/// Character code as read from a PDF string; used as the key of the width maps.
type CharCode = u32;
756
757struct PdfFontIter<'a> {
758    i: Iter<'a, u8>,
759    font: &'a dyn PdfFont,
760}
761
762impl<'a> Iterator for PdfFontIter<'a> {
763    type Item = (CharCode, u8);
764    fn next(&mut self) -> Option<(CharCode, u8)> {
765        self.font.next_char(&mut self.i)
766    }
767}
768
769trait PdfFont: Debug {
770    fn get_width(&self, id: CharCode) -> Res<f64>;
771    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)>;
772    fn decode_char(&self, char: CharCode) -> Res<String>;
773
774    /*fn char_codes<'a>(&'a self, chars: &'a [u8]) -> PdfFontIter {
775        let p = self;
776        PdfFontIter{i: chars.iter(), font: p as &PdfFont}
777    }*/
778}
779
780impl<'a> dyn PdfFont + 'a {
781    fn char_codes(&'a self, chars: &'a [u8]) -> PdfFontIter {
782        PdfFontIter {
783            i: chars.iter(),
784            font: self,
785        }
786    }
787    fn decode(&self, chars: &[u8]) -> String {
788        let strings = self
789            .char_codes(chars)
790            // TODO: handle error, don't unwrap
791            .map(|x| self.decode_char(x.0).unwrap())
792            .collect::<Vec<_>>();
793        strings.join("")
794    }
795}
796
797impl<'a> PdfFont for PdfSimpleFont<'a> {
798    fn get_width(&self, id: CharCode) -> Res<f64> {
799        let width = self.widths.get(&id);
800        if let Some(width) = width {
801            Ok(*width)
802        } else {
803            dlog!(
804                "missing width for {} falling back to default_width {:?}",
805                id,
806                self.font
807            );
808            self.default_width
809                .ok_or_else(|| "missing default width".into())
810        }
811    }
812    /*fn decode(&self, chars: &[u8]) -> String {
813        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
814        to_utf8(encoding, chars)
815    }*/
816
817    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
818        iter.next().map(|x| (*x as CharCode, 1))
819    }
820    fn decode_char(&self, char: CharCode) -> Res<String> {
821        let slice = [char as u8];
822        if let Some(ref unicode_map) = self.unicode_map {
823            let s = unicode_map.get(&char);
824            let s = match s {
825                None => Err(format!("missing char {:?} in map {:?}", char, unicode_map).into()),
826                Some(s) => Ok(s.clone()),
827            };
828            return s;
829        }
830        let encoding = self
831            .encoding
832            .as_ref()
833            .map(|x| &x[..])
834            .unwrap_or(PDFDocEncoding);
835        //dlog!("char_code {:?} {:?}", char, self.encoding);
836
837        to_utf8(encoding, &slice)
838    }
839}
840
841impl<'a> fmt::Debug for PdfSimpleFont<'a> {
842    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
843        self.font.fmt(f)
844    }
845}
846
847impl<'a> PdfFont for PdfType3Font<'a> {
848    fn get_width(&self, id: CharCode) -> Res<f64> {
849        let width = self.widths.get(&id);
850        if let Some(width) = width {
851            Ok(*width)
852        } else {
853            Err(format!("missing width for {} {:?}", id, self.font).into())
854        }
855    }
856    /*fn decode(&self, chars: &[u8]) -> String {
857        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
858        to_utf8(encoding, chars)
859    }*/
860
861    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
862        iter.next().map(|x| (*x as CharCode, 1))
863    }
864    fn decode_char(&self, char: CharCode) -> Res<String> {
865        let slice = [char as u8];
866        if let Some(ref unicode_map) = self.unicode_map {
867            let s = unicode_map.get(&char);
868            let s = match s {
869                None => {
870                    return Err(format!("missing char {:?} in map {:?}", char, unicode_map).into())
871                }
872                Some(s) => s.clone(),
873            };
874            return Ok(s);
875        }
876        let encoding = self
877            .encoding
878            .as_ref()
879            .map(|x| &x[..])
880            .unwrap_or(PDFDocEncoding);
881        //dlog!("char_code {:?} {:?}", char, self.encoding);
882
883        to_utf8(encoding, &slice)
884    }
885}
886
887impl<'a> fmt::Debug for PdfType3Font<'a> {
888    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
889        self.font.fmt(f)
890    }
891}
892
/// A composite font as built by `PdfCIDFont::new`: character codes are two
/// bytes wide and widths are keyed by those codes.
struct PdfCIDFont<'a> {
    /// Font dictionary this font was built from; also printed by the `Debug` impl.
    font: &'a Dictionary,
    #[allow(dead_code)]
    doc: &'a Document,
    /// Currently always `None` when constructed through `new`.
    #[allow(dead_code)]
    encoding: Option<Vec<u16>>,
    /// Character code to text mapping taken from the font's `ToUnicode` entry, if any.
    to_unicode: Option<HashMap<u32, String>>,
    /// Explicit per-code widths parsed from the descendant font's `W` array.
    widths: HashMap<CharCode, f64>, // should probably just use i32 here
    /// Width used for codes that have no entry in `widths`.
    default_width: Option<f64>, // only used for CID fonts and we should probably break out the different font types
}
903
904fn get_unicode_map<'a>(
905    doc: &'a Document,
906    font: &'a Dictionary,
907) -> Res<Option<HashMap<u32, String>>> {
908    let to_unicode = maybe_get_obj(doc, font, b"ToUnicode");
909    dlog!("ToUnicode: {:?}", to_unicode);
910    let mut unicode_map = None;
911    match to_unicode {
912        Some(&Object::Stream(ref stream)) => {
913            let contents = get_contents(stream);
914            dlog!(
915                "Stream: {}",
916                String::from_utf8(contents.clone()).unwrap_or_else(|_| "utf8 error".into())
917            );
918
919            let cmap = adobe_cmap_parser::get_unicode_map(&contents)?;
920            let mut unicode = HashMap::new();
921            // "It must use the beginbfchar, endbfchar, beginbfrange, and endbfrange operators to
922            // define the mapping from character codes to Unicode character sequences expressed in
923            // UTF-16BE encoding."
924            for (&k, v) in cmap.iter() {
925                let mut be: Vec<u16> = Vec::new();
926                let mut i = 0;
927                assert!(v.len() % 2 == 0);
928                while i < v.len() {
929                    be.push(((v[i] as u16) << 8) | v[i + 1] as u16);
930                    i += 2;
931                }
932                if let [0xd800..=0xdfff] = &be[..] {
933                    // this range is not specified as not being encoded
934                    // we ignore them so we don't an error from from_utt16
935                    continue;
936                }
937                let s = String::from_utf16(&be).unwrap_or_else(|_| "utf8 error".into());
938
939                unicode.insert(k, s);
940            }
941            unicode_map = Some(unicode);
942
943            dlog!("map: {:?}", unicode_map);
944        }
945        None => {}
946        Some(&Object::Name(ref name)) => {
947            let name = pdf_to_utf8(name)?;
948            if name != "Identity-H" {
949                return Err(format!("unsupported cmap {:?}", name).into());
950            }
951        }
952        _ => return Err(format!("unsupported cmap {:?}", to_unicode).into()),
953    }
954    Ok(unicode_map)
955}
956
957impl<'a> PdfCIDFont<'a> {
958    fn new(doc: &'a Document, font: &'a Dictionary) -> Res<PdfCIDFont<'a>> {
959        let base_name = get_name_string(doc, font, b"BaseFont")?;
960        let descendants =
961            maybe_get_array(doc, font, b"DescendantFonts").expect("Descendant fonts required");
962        let ciddict = maybe_deref(doc, &descendants[0])
963            .as_dict()
964            .expect("should be CID dict");
965        let encoding =
966            maybe_get_obj(doc, font, b"Encoding").expect("Encoding required in type0 fonts");
967        dlog!("base_name {} {:?}", base_name, font);
968
969        match *encoding {
970            Object::Name(ref name) => {
971                let name = pdf_to_utf8(name)?;
972                dlog!("encoding {:?}", name);
973                if name != "Identity-H" {
974                    return Err(format!("unsupported encoding {:?}", name).into());
975                }
976            }
977            Object::Stream(ref stream) => {
978                let contents = get_contents(stream);
979                dlog!(
980                    "Stream: {}",
981                    String::from_utf8(contents).map_err(|_| "bad utf8")?
982                );
983            }
984            _ => return Err("Unsupported formatting".into()),
985        }
986
987        // Sometimes a Type0 font might refer to the same underlying data as regular font. In this case we may be able to extract some encoding
988        // data.
989        // We should also look inside the truetype data to see if there's a cmap table. It will help us convert as well.
990        // This won't work if the cmap has been subsetted. A better approach might be to hash glyph contents and use that against
991        // a global library of glyph hashes
992        let unicode_map = get_unicode_map(doc, font)?;
993
994        dlog!("descendents {:?} {:?}", descendants, ciddict);
995
996        let font_dict =
997            maybe_get_obj(doc, ciddict, b"FontDescriptor").ok_or("No FontDescriptor")?;
998        dlog!("{:?}", font_dict);
999        let _f = font_dict.as_dict().expect("must be dict");
1000        let default_width = get::<Option<i64>>(doc, ciddict, b"DW").unwrap_or(1000);
1001        let w: Option<Vec<&Object>> = get(doc, ciddict, b"W");
1002        dlog!("widths {:?}", w);
1003        let mut widths = HashMap::new();
1004        let mut i = 0;
1005        if let Some(w) = w {
1006            while i < w.len() {
1007                if let &Object::Array(ref wa) = w[i + 1] {
1008                    let cid = w[i].as_i64().expect("id should be num");
1009                    let mut j = 0;
1010                    dlog!("wa: {:?} -> {:?}", cid, wa);
1011                    for w in wa {
1012                        widths.insert((cid + j) as CharCode, as_num(w)?);
1013                        j += 1;
1014                    }
1015                    i += 2;
1016                } else {
1017                    let c_first = w[i].as_i64().map_err(|_e| "first should be num")?;
1018                    let c_last = w[i].as_i64().map_err(|_e| "last should be num")?;
1019                    let c_width = as_num(w[i])?;
1020                    for id in c_first..c_last {
1021                        widths.insert(id as CharCode, c_width);
1022                    }
1023                    i += 3;
1024                }
1025            }
1026        }
1027        Ok(PdfCIDFont {
1028            doc,
1029            font,
1030            widths,
1031            to_unicode: unicode_map,
1032            encoding: None,
1033            default_width: Some(default_width as f64),
1034        })
1035    }
1036}
1037
1038impl<'a> PdfFont for PdfCIDFont<'a> {
1039    fn get_width(&self, id: CharCode) -> Res<f64> {
1040        let width = self.widths.get(&id);
1041        if let Some(width) = width {
1042            dlog!("GetWidth {} -> {}", id, *width);
1043            Ok(*width)
1044        } else {
1045            dlog!("missing width for {} falling back to default_width", id);
1046            self.default_width.ok_or("No Default Width".into())
1047        }
1048    } /*
1049      fn decode(&self, chars: &[u8]) -> String {
1050          self.char_codes(chars);
1051
1052          //let utf16 = Vec::new();
1053
1054          let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
1055          to_utf8(encoding, chars)
1056      }*/
1057
1058    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
1059        let p = iter.next();
1060        if let Some(&c) = p {
1061            let next = *iter.next()?;
1062            Some((((c as u32) << 8) | next as u32, 2))
1063        } else {
1064            None
1065        }
1066    }
1067    fn decode_char(&self, char: CharCode) -> Res<String> {
1068        let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
1069        let o = if let Some(s) = s {
1070            s.clone()
1071        } else {
1072            dlog!(
1073                "Unknown character {:?} in {:?} {:?}",
1074                char,
1075                self.font,
1076                self.to_unicode
1077            );
1078            "".to_string()
1079        };
1080        Ok(o)
1081    }
1082}
1083
1084impl<'a> fmt::Debug for PdfCIDFont<'a> {
1085    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1086        self.font.fmt(f)
1087    }
1088}
1089
1090#[derive(Copy, Clone)]
1091struct PdfFontDescriptor<'a> {
1092    desc: &'a Dictionary,
1093    doc: &'a Document,
1094}
1095
1096impl<'a> PdfFontDescriptor<'a> {
1097    #[allow(dead_code)]
1098    fn get_file(&self) -> Option<&'a Object> {
1099        maybe_get_obj(self.doc, self.desc, b"FontFile")
1100    }
1101}
1102
1103impl<'a> fmt::Debug for PdfFontDescriptor<'a> {
1104    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1105        self.desc.fmt(f)
1106    }
1107}
1108
/// Data of a PDF function with `FunctionType` 0, as read by `Function::new`.
#[derive(Clone, Debug)]
struct Type0Func {
    /// Value of the `Domain` entry.
    domain: Vec<f64>,
    /// Value of the `Range` entry.
    range: Vec<f64>,
    /// Contents of the function's stream, as returned by `get_contents`.
    contents: Vec<u8>,
    /// Value of the `Size` entry.
    size: Vec<i64>,
    /// Value of the `BitsPerSample` entry.
    bits_per_sample: i64,
    /// Value of the `Encode` entry; defaults to a `[0, size - 1]` pair per `size` element.
    encode: Vec<f64>,
    /// Value of the `Decode` entry; defaults to a copy of `range`.
    decode: Vec<f64>,
}
1119
/// Linearly maps `x` from the interval `[x_min, x_max]` onto `[y_min, y_max]`.
///
/// `x` is not clamped, so values outside the input interval extrapolate.
/// When the input interval is degenerate (`x_max == x_min`) the result is
/// `y_min`, matching pdfium.
#[allow(dead_code)]
fn interpolate(x: f64, x_min: f64, x_max: f64, y_min: f64, y_max: f64) -> f64 {
    let divisor = x_max - x_min;
    if divisor != 0. {
        y_min + (x - x_min) * (y_max - y_min) / divisor
    } else {
        // A zero-width input interval makes the slope undefined, so the
        // interpolation term is discarded.
        y_min
    }
}
1131
1132impl Type0Func {
1133    #[allow(dead_code)]
1134    fn eval(&self, _input: &[f64], _output: &mut [f64]) {
1135        let _n_inputs = self.domain.len() / 2;
1136        let _n_ouputs = self.range.len() / 2;
1137    }
1138}
1139
/// Data of a PDF function with `FunctionType` 2, as read by `Function::new`.
#[derive(Clone, Debug)]
struct Type2Func {
    /// Value of the optional `C0` entry.
    c0: Option<Vec<f64>>,
    /// Value of the optional `C1` entry.
    c1: Option<Vec<f64>>,
    /// Value of the `N` entry.
    n: f64,
}
1146
/// A PDF function object. `Function::new` only ever produces `Type0` and
/// `Type2`; the remaining variants are placeholders.
#[derive(Clone, Debug)]
enum Function {
    /// Function with `FunctionType` 0.
    Type0(Type0Func),
    /// Function with `FunctionType` 2.
    Type2(Type2Func),
    /// Not constructed by `Function::new`.
    #[allow(dead_code)]
    Type3,
    /// Not constructed by `Function::new`.
    #[allow(dead_code)]
    Type4,
}
1156
1157impl Function {
1158    fn new(doc: &Document, obj: &Object) -> Res<Function> {
1159        let dict = match *obj {
1160            Object::Dictionary(ref dict) => dict,
1161            Object::Stream(ref stream) => &stream.dict,
1162            _ => return Err("Function should be a dictionary or stream".into()),
1163        };
1164        let function_type: i64 = get(doc, dict, b"FunctionType");
1165        let f = match function_type {
1166            0 => {
1167                let stream = match obj {
1168                    &Object::Stream(ref stream) => stream,
1169                    _ => return Err("No stream".into()),
1170                };
1171                let range: Vec<f64> = get(doc, dict, b"Range");
1172                let domain: Vec<f64> = get(doc, dict, b"Domain");
1173                let contents = get_contents(stream);
1174                let size: Vec<i64> = get(doc, dict, b"Size");
1175                let bits_per_sample = get(doc, dict, b"BitsPerSample");
1176                // We ignore 'Order' like pdfium, poppler and pdf.js
1177
1178                let encode = get::<Option<Vec<f64>>>(doc, dict, b"Encode");
1179                // maybe there's some better way to write this.
1180                let encode = encode.unwrap_or_else(|| {
1181                    let mut default = Vec::new();
1182                    for i in &size {
1183                        default.extend([0., (i - 1) as f64].iter());
1184                    }
1185                    default
1186                });
1187                let decode =
1188                    get::<Option<Vec<f64>>>(doc, dict, b"Decode").unwrap_or_else(|| range.clone());
1189
1190                Function::Type0(Type0Func {
1191                    domain,
1192                    range,
1193                    size,
1194                    contents,
1195                    bits_per_sample,
1196                    encode,
1197                    decode,
1198                })
1199            }
1200            2 => {
1201                let c0 = get::<Option<Vec<f64>>>(doc, dict, b"C0");
1202                let c1 = get::<Option<Vec<f64>>>(doc, dict, b"C1");
1203                let n = get::<f64>(doc, dict, b"N");
1204                Function::Type2(Type2Func { c0, c1, n })
1205            }
1206            _ => return Err(format!("unhandled function type {}", function_type).into()),
1207        };
1208        Ok(f)
1209    }
1210}
1211
1212fn as_num(o: &Object) -> Res<f64> {
1213    match *o {
1214        Object::Integer(i) => Ok(i as f64),
1215        Object::Real(f) => Ok(f as f64),
1216        _ => Err("not a number".into()),
1217    }
1218}
1219
/// Text-related part of the graphics state.
#[derive(Clone)]
struct TextState<'a> {
    /// Currently selected font; `show_text` fails when this is `None`.
    font: Option<Rc<dyn PdfFont + 'a>>,
    /// Font size used to scale glyph widths in `show_text`.
    font_size: f64,
    /// Spacing added after every character (set by the `Tc` operator).
    character_spacing: f64,
    /// Extra spacing added after a single-byte code 32 (set by the `Tw` operator).
    word_spacing: f64,
    /// Horizontal scale factor; the `Tz` operand divided by 100.
    horizontal_scaling: f64,
    /// Leading (set by the `TL` operator).
    leading: f64,
    /// Text rise; used as the vertical offset of the glyph matrix in `show_text`.
    rise: f64,
    /// Text matrix; advanced by `show_text` after every character.
    tm: Transform,
}
1231
1232// XXX: We'd ideally implement this without having to copy the uncompressed data
1233fn get_contents(contents: &Stream) -> Vec<u8> {
1234    if contents.filter().is_ok() {
1235        contents
1236            .decompressed_content()
1237            .unwrap_or_else(|_| contents.content.clone())
1238    } else {
1239        contents.content.clone()
1240    }
1241}
1242
/// Graphics state tracked while a content stream is processed.
#[derive(Clone)]
struct GraphicsState<'a> {
    /// Current transformation matrix; updated by the `cm` operator.
    ctm: Transform,
    /// Text state.
    ts: TextState<'a>,
    /// Current soft mask; reset to `None` by `apply_state` for an `SMask` of `None`.
    smask: Option<&'a Dictionary>,
    /// Fill colour space; set by the `cs` operator.
    fill_colorspace: ColorSpace,
    /// Fill colour components; set by the `sc` / `scn` operators.
    fill_color: Vec<f64>,
    /// Stroke colour space; set by the `CS` operator.
    stroke_colorspace: ColorSpace,
    /// Stroke colour components; set by the `SC` / `SCN` operators.
    stroke_color: Vec<f64>,
    /// Line width.
    line_width: f64,
}
1254
1255fn show_text(
1256    gs: &mut GraphicsState,
1257    s: &[u8],
1258    _tlm: &Transform,
1259    _flip_ctm: &Transform,
1260    output: &mut dyn OutputDev,
1261) -> Res<()> {
1262    let ts = &mut gs.ts;
1263    // let font = ts.font.as_ref().unwrap();
1264    let font = match ts.font.as_ref() {
1265        None => {
1266            return Err(OutputError::Other("font".to_string()));
1267        }
1268        Some(f) => {f}
1269    };
1270    //let encoding = font.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
1271    dlog!("{:?}", font.decode(s));
1272    dlog!("{:?}", font.decode(s).as_bytes());
1273    dlog!("{:?}", s);
1274    output.begin_word()?;
1275
1276    for (c, length) in font.char_codes(s) {
1277        let tsm = Transform2D::row_major(ts.horizontal_scaling, 0., 0., 1.0, 0., ts.rise);
1278        let trm = ts.tm.pre_transform(&gs.ctm);
1279        let trm = trm.post_transform(&tsm);
1280        //dlog!("ctm: {:?} tm {:?}", gs.ctm, tm);
1281        //dlog!("current pos: {:?}", position);
1282        // 5.9 Extraction of Text Content
1283
1284        //dlog!("w: {}", font.widths[&(*c as i64)]);
1285        let w0 = font.get_width(c)? / 1000.;
1286
1287        let mut spacing = ts.character_spacing;
1288        // "Word spacing is applied to every occurrence of the single-byte character code 32 in a
1289        //  string when using a simple font or a composite font that defines code 32 as a
1290        //  single-byte code. It does not apply to occurrences of the byte value 32 in
1291        //  multiple-byte codes."
1292        let is_space = c == 32 && length == 1;
1293        if is_space {
1294            spacing += ts.word_spacing
1295        }
1296
1297        output.output_character(&trm, w0, spacing, ts.font_size, &font.decode_char(c)?)?;
1298        let tj = 0.;
1299        let ty = 0.;
1300        let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size + spacing);
1301        dlog!(
1302            "horizontal {} adjust {} {} {} {}",
1303            ts.horizontal_scaling,
1304            tx,
1305            w0,
1306            ts.font_size,
1307            spacing
1308        );
1309        // dlog!("w0: {}, tx: {}", w0, tx);
1310        ts.tm = ts
1311            .tm
1312            .pre_transform(&Transform2D::create_translation(tx, ty));
1313        let _trm = ts.tm.pre_transform(&gs.ctm);
1314        //dlog!("post pos: {:?}", trm);
1315    }
1316    output.end_word()?;
1317    Ok(())
1318}
1319
/// Page media box. `process_stream` uses `ury - lly` as the vertical offset
/// of its flipping transform.
// NOTE(review): the field names suggest lower-left (`llx`, `lly`) and
// upper-right (`urx`, `ury`) corner coordinates; confirm against the code
// that constructs this type.
#[derive(Debug, Clone, Copy)]
pub struct MediaBox {
    pub llx: f64,
    pub lly: f64,
    pub urx: f64,
    pub ury: f64,
}
1327
1328fn apply_state(gs: &mut GraphicsState, state: &Dictionary) -> Res<()> {
1329    for (k, v) in state.iter() {
1330        let k: &[u8] = k.as_ref();
1331        match k {
1332            b"SMask" => match v {
1333                &Object::Name(ref name) => {
1334                    if name == b"None" {
1335                        gs.smask = None;
1336                    } else {
1337                        return Err("unexpected smask name".into());
1338                    }
1339                }
1340                _ => return Err(format!("unexpected smask type {:?}", v).into()),
1341            },
1342            b"Type" => match v {
1343                &Object::Name(ref name) => {
1344                    assert_eq!(name, b"ExtGState")
1345                }
1346                _ => return Err("unexpected type".into()),
1347            },
1348            _ => {
1349                dlog!("unapplied state: {:?} {:?}", k, v);
1350            }
1351        }
1352    }
1353    Ok(())
1354}
1355
/// One element of a `Path`.
#[derive(Debug)]
pub enum PathOp {
    /// `Path::current_point` treats the two values as the current point.
    MoveTo(f64, f64),
    /// `Path::current_point` treats the two values as the current point.
    LineTo(f64, f64),
    // XXX: is it worth distinguishing the different kinds of curve ops?
    /// `Path::current_point` treats the last two values as the current point.
    CurveTo(f64, f64, f64, f64, f64, f64),
    /// `Path::current_point` does not support paths ending in this operation.
    Rect(f64, f64, f64, f64),
    /// `Path::current_point` does not support paths ending in this operation.
    Close,
}
1365
1366#[derive(Debug)]
1367pub struct Path {
1368    pub ops: Vec<PathOp>,
1369}
1370
1371impl Path {
1372    fn new() -> Path {
1373        Path { ops: Vec::new() }
1374    }
1375    fn current_point(&self) -> Res<(f64, f64)> {
1376        let v = match *self.ops.last().ok_or("empty path")? {
1377            PathOp::MoveTo(x, y) => (x, y),
1378            PathOp::LineTo(x, y) => (x, y),
1379            PathOp::CurveTo(_, _, _, _, x, y) => (x, y),
1380            _ => return Err("Unimplemented: current_point for path with no current point".into()),
1381        };
1382        Ok(v)
1383    }
1384}
1385
/// Parameters of a `CalGray` colour space, read from its dictionary by `make_colorspace`.
#[derive(Clone)]
pub struct CalGray {
    /// Value of the `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point.
    black_point: Option<[f64; 3]>,
    /// Value of the optional `Gamma` entry.
    gamma: Option<f64>,
}
1392
/// Parameters of a `CalRGB` colour space, read from its dictionary by `make_colorspace`.
#[derive(Clone)]
pub struct CalRGB {
    /// Value of the `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point.
    black_point: Option<[f64; 3]>,
    /// Value of the optional `Gamma` entry.
    gamma: Option<[f64; 3]>,
    /// Value of the optional `Matrix` entry.
    matrix: Option<Vec<f64>>,
}
1400
/// Parameters of a `Lab` colour space, read from its dictionary by `make_colorspace`.
#[derive(Clone)]
pub struct Lab {
    /// Value of the `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point.
    black_point: Option<[f64; 3]>,
    /// Value of the optional `Range` entry.
    range: Option<[f64; 4]>,
}
1407
/// Parameters of a `Separation` colour space, taken by `make_colorspace` from
/// the elements following the family name in the colour space array.
#[derive(Clone)]
pub struct Separation {
    /// Second array element, a name.
    name: String,
    /// Third array element, a name.
    alternate_space: String,
    /// Function built from the fourth array element.
    tint_transform: Box<Function>,
}
1414
/// Colour spaces that `make_colorspace` can produce.
#[derive(Clone)]
pub enum ColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    Pattern,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    Separation(Separation),
    /// Carries the contents of the colour space's stream as returned by `get_contents`.
    ICCBased(Vec<u8>),
}
1427
1428fn make_colorspace<'a>(
1429    doc: &'a Document,
1430    name: &[u8],
1431    resources: &'a Dictionary,
1432) -> Res<ColorSpace> {
1433    let space = match name {
1434        b"DeviceGray" => ColorSpace::DeviceGray,
1435        b"DeviceRGB" => ColorSpace::DeviceRGB,
1436        b"DeviceCMYK" => ColorSpace::DeviceCMYK,
1437        b"Pattern" => ColorSpace::Pattern,
1438        _ => {
1439            let colorspaces: &Dictionary = get(doc, resources, b"ColorSpace");
1440            let cs = maybe_get_array(doc, colorspaces, name)
1441                .ok_or(format!("missing colorspace {:?}", name))?;
1442            let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"))?;
1443            match cs_name.as_ref() {
1444                "Separation" => {
1445                    let name = pdf_to_utf8(cs[1].as_name().expect("second arg must be a name"))?;
1446                    let alternate_space =
1447                        pdf_to_utf8(cs[2].as_name().expect("second arg must be a name"))?;
1448                    let tint_transform = Box::new(Function::new(doc, maybe_deref(doc, &cs[3]))?);
1449
1450                    dlog!("{:?} {:?} {:?}", name, alternate_space, tint_transform);
1451                    ColorSpace::Separation(Separation {
1452                        name,
1453                        alternate_space,
1454                        tint_transform,
1455                    })
1456                }
1457                "ICCBased" => {
1458                    let stream = maybe_deref(doc, &cs[1]).as_stream()?;
1459                    dlog!("ICCBased {:?}", stream);
1460                    // XXX: we're going to be continually decompressing everytime this object is referenced
1461                    ColorSpace::ICCBased(get_contents(stream))
1462                }
1463                "CalGray" => {
1464                    let dict = cs[1].as_dict().expect("second arg must be a dict");
1465                    ColorSpace::CalGray(CalGray {
1466                        white_point: get(doc, dict, b"WhitePoint"),
1467                        black_point: get(doc, dict, b"BackPoint"),
1468                        gamma: get(doc, dict, b"Gamma"),
1469                    })
1470                }
1471                "CalRGB" => {
1472                    let dict = cs[1].as_dict().expect("second arg must be a dict");
1473                    ColorSpace::CalRGB(CalRGB {
1474                        white_point: get(doc, dict, b"WhitePoint"),
1475                        black_point: get(doc, dict, b"BackPoint"),
1476                        gamma: get(doc, dict, b"Gamma"),
1477                        matrix: get(doc, dict, b"Matrix"),
1478                    })
1479                }
1480                "Lab" => {
1481                    let dict = cs[1].as_dict().expect("second arg must be a dict");
1482                    ColorSpace::Lab(Lab {
1483                        white_point: get(doc, dict, b"WhitePoint"),
1484                        black_point: get(doc, dict, b"BackPoint"),
1485                        range: get(doc, dict, b"Range"),
1486                    })
1487                }
1488                "Pattern" => ColorSpace::Pattern,
1489                _ => {
1490                    return Err(format!("color_space {:?} {:?} {:?}", name, cs_name, cs).into());
1491                }
1492            }
1493        }
1494    };
1495    Ok(space)
1496}
1497
/// Holder for the content stream processing routines. It stores no data;
/// the only field is a marker for the `'a` lifetime parameter.
struct Processor<'a> {
    _none: PhantomData<&'a ()>,
}
1501
1502impl<'a> Processor<'a> {
1503    fn process_stream(
1504        doc: &'a Document,
1505        content: Vec<u8>,
1506        resources: &'a Dictionary,
1507        media_box: &MediaBox,
1508        output: &mut dyn OutputDev,
1509    ) -> Res<()> {
1510        let content = Content::decode(&content)?;
1511        let mut font_table = HashMap::new();
1512        let mut gs: GraphicsState = GraphicsState {
1513            ts: TextState {
1514                font: None,
1515                font_size: std::f64::NAN,
1516                character_spacing: 0.,
1517                word_spacing: 0.,
1518                horizontal_scaling: 1.,
1519                leading: 0.,
1520                rise: 0.,
1521                tm: Transform2D::identity(),
1522            },
1523            fill_color: Vec::new(),
1524            fill_colorspace: ColorSpace::DeviceGray,
1525            stroke_color: Vec::new(),
1526            stroke_colorspace: ColorSpace::DeviceGray,
1527            line_width: 1.,
1528            ctm: Transform2D::identity(),
1529            smask: None,
1530        };
1531        //let mut ts = &mut gs.ts;
1532        let mut gs_stack = Vec::new();
1533        let mut mc_stack = Vec::new();
1534        // XXX: replace tlm with a point for text start
1535        let mut tlm = Transform2D::identity();
1536        let mut path = Path::new();
1537        let flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1538        dlog!("MediaBox {:?}", media_box);
1539        for operation in &content.operations {
1540            //dlog!("op: {:?}", operation);
1541
1542            match operation.operator.as_ref() {
1543                "BT" => {
1544                    tlm = Transform2D::identity();
1545                    gs.ts.tm = tlm;
1546                }
1547                "ET" => {
1548                    tlm = Transform2D::identity();
1549                    gs.ts.tm = tlm;
1550                }
1551                "cm" => {
1552                    assert!(operation.operands.len() == 6);
1553                    let m = Transform2D::row_major(
1554                        as_num(&operation.operands[0])?,
1555                        as_num(&operation.operands[1])?,
1556                        as_num(&operation.operands[2])?,
1557                        as_num(&operation.operands[3])?,
1558                        as_num(&operation.operands[4])?,
1559                        as_num(&operation.operands[5])?,
1560                    );
1561                    gs.ctm = gs.ctm.pre_transform(&m);
1562                    dlog!("matrix {:?}", gs.ctm);
1563                }
1564                "CS" => {
1565                    let name = operation.operands[0].as_name()?;
1566                    gs.stroke_colorspace = make_colorspace(doc, name, resources)?;
1567                }
1568                "cs" => {
1569                    let name = operation.operands[0].as_name()?;
1570                    gs.fill_colorspace = make_colorspace(doc, name, resources)?;
1571                }
1572                "SC" | "SCN" => {
1573                    gs.stroke_color = match gs.stroke_colorspace {
1574                        ColorSpace::Pattern => {
1575                            dlog!("unhandled pattern color");
1576                            Vec::new()
1577                        }
1578                        _ => operation
1579                            .operands
1580                            .iter()
1581                            .map(|n| match as_num(n) {
1582                                Ok(n) => n,
1583                                Err(_) => {
1584                                    dlog!("unhandled color {:?}", n);
1585                                    0.
1586                                }
1587                            })
1588                            .collect(),
1589                    };
1590                }
1591                "sc" | "scn" => {
1592                    gs.fill_color = match gs.fill_colorspace {
1593                        ColorSpace::Pattern => {
1594                            dlog!("unhandled pattern color");
1595                            Vec::new()
1596                        }
1597                        _ => operation
1598                            .operands
1599                            .iter()
1600                            .map(|n| match as_num(n) {
1601                                Ok(n) => n,
1602                                Err(_) => {
1603                                    dlog!("unhandled color {:?}", n);
1604                                    0.
1605                                }
1606                            })
1607                            .collect(),
1608                    };
1609                }
1610                "G" | "g" | "RG" | "rg" | "K" | "k" => {
1611                    dlog!("unhandled color operation {:?}", operation);
1612                }
1613                "TJ" => {
1614                    if let Object::Array(ref array) = operation.operands[0] {
1615                        for e in array {
1616                            match *e {
1617                                Object::String(ref s, _) => {
1618                                    show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1619                                }
1620                                Object::Integer(i) => {
1621                                    let ts = &mut gs.ts;
1622                                    let w0 = 0.;
1623                                    let tj = i as f64;
1624                                    let ty = 0.;
1625                                    let tx =
1626                                        ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1627                                    ts.tm = ts
1628                                        .tm
1629                                        .pre_transform(&Transform2D::create_translation(tx, ty));
1630                                    dlog!("adjust text by: {} {:?}", i, ts.tm);
1631                                }
1632                                Object::Real(i) => {
1633                                    let ts = &mut gs.ts;
1634                                    let w0 = 0.;
1635                                    let tj = i as f64;
1636                                    let ty = 0.;
1637                                    let tx =
1638                                        ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1639                                    ts.tm = ts
1640                                        .tm
1641                                        .pre_transform(&Transform2D::create_translation(tx, ty));
1642                                    dlog!("adjust text by: {} {:?}", i, ts.tm);
1643                                }
1644                                _ => {
1645                                    dlog!("kind of {:?}", e);
1646                                }
1647                            }
1648                        }
1649                    }
1650                }
1651                "Tj" => match operation.operands[0] {
1652                    Object::String(ref s, _) => {
1653                        show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1654                    }
1655                    _ => {
1656                        return Err(format!("unexpected Tj operand {:?}", operation).into());
1657                    }
1658                },
1659                "Tc" => {
1660                    gs.ts.character_spacing = as_num(&operation.operands[0])?;
1661                }
1662                "Tw" => {
1663                    gs.ts.word_spacing = as_num(&operation.operands[0])?;
1664                }
1665                "Tz" => {
1666                    gs.ts.horizontal_scaling = as_num(&operation.operands[0])? / 100.;
1667                }
1668                "TL" => {
1669                    gs.ts.leading = as_num(&operation.operands[0])?;
1670                }
1671                "Tf" => {
1672                    let fonts: &Dictionary = get(doc, resources, b"Font");
1673                    let name = operation.operands[0].as_name()?;
1674                    let font = font_table
1675                        .entry(name.to_owned())
1676                        .or_insert_with(|| {
1677                            let dict = get::<&Dictionary>(doc, fonts, name);
1678                            make_font(doc, dict).ok()
1679                        })
1680                        .clone();
1681                    {
1682                        /*let file = font.get_descriptor().and_then(|desc| desc.get_file());
1683                        if let Some(file) = file {
1684                            let file_contents = filter_data(file.as_stream().unwrap());
1685                            let mut cursor = Cursor::new(&file_contents[..]);
1686                            //let f = Font::read(&mut cursor);
1687                            //dlog!("font file: {:?}", f);
1688                        }*/
1689                    }
1690                    gs.ts.font = font;
1691
1692                    gs.ts.font_size = as_num(&operation.operands[1])?;
1693                    dlog!(
1694                        "font {} size: {} {:?}",
1695                        pdf_to_utf8(name)?,
1696                        gs.ts.font_size,
1697                        operation
1698                    );
1699                }
1700                "Ts" => {
1701                    gs.ts.rise = as_num(&operation.operands[0])?;
1702                }
1703                "Tm" => {
1704                    assert!(operation.operands.len() == 6);
1705                    tlm = Transform2D::row_major(
1706                        as_num(&operation.operands[0])?,
1707                        as_num(&operation.operands[1])?,
1708                        as_num(&operation.operands[2])?,
1709                        as_num(&operation.operands[3])?,
1710                        as_num(&operation.operands[4])?,
1711                        as_num(&operation.operands[5])?,
1712                    );
1713                    gs.ts.tm = tlm;
1714                    dlog!("Tm: matrix {:?}", gs.ts.tm);
1715                    output.end_line()?;
1716                }
1717                "Td" => {
1718                    /* Move to the start of the next line, offset from the start of the current line by (tx , ty ).
1719                      tx and ty are numbers expressed in unscaled text space units.
1720                      More precisely, this operator performs the following assignments:
1721                    */
1722                    assert!(operation.operands.len() == 2);
1723                    let tx = as_num(&operation.operands[0])?;
1724                    let ty = as_num(&operation.operands[1])?;
1725                    dlog!("translation: {} {}", tx, ty);
1726
1727                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1728                    gs.ts.tm = tlm;
1729                    dlog!("Td matrix {:?}", gs.ts.tm);
1730                    output.end_line()?;
1731                }
1732
1733                "TD" => {
1734                    /* Move to the start of the next line, offset from the start of the current line by (tx , ty ).
1735                      As a side effect, this operator sets the leading parameter in the text state.
1736                    */
1737                    assert!(operation.operands.len() == 2);
1738                    let tx = as_num(&operation.operands[0])?;
1739                    let ty = as_num(&operation.operands[1])?;
1740                    dlog!("translation: {} {}", tx, ty);
1741                    gs.ts.leading = -ty;
1742
1743                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1744                    gs.ts.tm = tlm;
1745                    dlog!("TD matrix {:?}", gs.ts.tm);
1746                    output.end_line()?;
1747                }
1748
1749                "T*" => {
1750                    let tx = 0.0;
1751                    let ty = -gs.ts.leading;
1752
1753                    tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1754                    gs.ts.tm = tlm;
1755                    dlog!("T* matrix {:?}", gs.ts.tm);
1756                    output.end_line()?;
1757                }
1758                "q" => {
1759                    gs_stack.push(gs.clone());
1760                }
1761                "Q" => {
1762                    let s = gs_stack.pop();
1763                    if let Some(s) = s {
1764                        gs = s;
1765                    } else {
1766                        dlog!("No state to pop");
1767                    }
1768                }
1769                "gs" => {
1770                    let ext_gstate: &Dictionary = get(doc, resources, b"ExtGState");
1771                    let name = operation.operands[0].as_name()?;
1772                    let state: &Dictionary = get(doc, ext_gstate, name);
1773                    match apply_state(&mut gs, state) {
1774                        Ok(_) => {}
1775                        Err(e) => {
1776                            dlog!("Error while applying graphics state: {}", e)
1777                        }
1778                    }
1779                }
1780                "i" => {
1781                    dlog!(
1782                        "unhandled graphics state flattness operator {:?}",
1783                        operation
1784                    );
1785                }
1786                "w" => {
1787                    gs.line_width = as_num(&operation.operands[0])?;
1788                }
1789                "J" | "j" | "M" | "d" | "ri" => {
1790                    dlog!("unknown graphics state operator {:?}", operation);
1791                }
1792                "m" => path.ops.push(PathOp::MoveTo(
1793                    as_num(&operation.operands[0])?,
1794                    as_num(&operation.operands[1])?,
1795                )),
1796                "l" => path.ops.push(PathOp::LineTo(
1797                    as_num(&operation.operands[0])?,
1798                    as_num(&operation.operands[1])?,
1799                )),
1800                "c" => path.ops.push(PathOp::CurveTo(
1801                    as_num(&operation.operands[0])?,
1802                    as_num(&operation.operands[1])?,
1803                    as_num(&operation.operands[2])?,
1804                    as_num(&operation.operands[3])?,
1805                    as_num(&operation.operands[4])?,
1806                    as_num(&operation.operands[5])?,
1807                )),
1808                "v" => {
1809                    let (x, y) = path.current_point()?;
1810                    path.ops.push(PathOp::CurveTo(
1811                        x,
1812                        y,
1813                        as_num(&operation.operands[0])?,
1814                        as_num(&operation.operands[1])?,
1815                        as_num(&operation.operands[2])?,
1816                        as_num(&operation.operands[3])?,
1817                    ))
1818                }
1819                "y" => path.ops.push(PathOp::CurveTo(
1820                    as_num(&operation.operands[0])?,
1821                    as_num(&operation.operands[1])?,
1822                    as_num(&operation.operands[2])?,
1823                    as_num(&operation.operands[3])?,
1824                    as_num(&operation.operands[2])?,
1825                    as_num(&operation.operands[3])?,
1826                )),
1827                "h" => path.ops.push(PathOp::Close),
1828                "re" => path.ops.push(PathOp::Rect(
1829                    as_num(&operation.operands[0])?,
1830                    as_num(&operation.operands[1])?,
1831                    as_num(&operation.operands[2])?,
1832                    as_num(&operation.operands[3])?,
1833                )),
1834                "s" | "f*" | "B" | "B*" | "b" => {
1835                    dlog!("unhandled path op {:?}", operation);
1836                }
1837                "S" => {
1838                    output.stroke(&gs.ctm, &gs.stroke_colorspace, &gs.stroke_color, &path)?;
1839                    path.ops.clear();
1840                }
1841                "F" | "f" => {
1842                    output.fill(&gs.ctm, &gs.fill_colorspace, &gs.fill_color, &path)?;
1843                    path.ops.clear();
1844                }
1845                "W" | "w*" => {
1846                    dlog!("unhandled clipping operation {:?}", operation);
1847                }
1848                "n" => {
1849                    dlog!("discard {:?}", path);
1850                    path.ops.clear();
1851                }
1852                "BMC" | "BDC" => {
1853                    mc_stack.push(operation);
1854                }
1855                "EMC" => {
1856                    mc_stack.pop();
1857                }
1858                "Do" => {
1859                    // `Do` process an entire subdocument, so we do a recursive call to `process_stream`
1860                    // with the subdocument content and resources
1861                    let xobject: &Dictionary = get(doc, resources, b"XObject");
1862                    let name = operation.operands[0].as_name()?;
1863                    let xf: &Stream = get(doc, xobject, name);
1864                    let resources = maybe_get_obj(doc, &xf.dict, b"Resources")
1865                        .and_then(|n| n.as_dict().ok())
1866                        .unwrap_or(resources);
1867                    let contents = get_contents(xf);
1868                    Processor::process_stream(doc, contents, resources, media_box, output)?;
1869                }
1870                _ => {
1871                    dlog!("unknown operation {:?}", operation);
1872                }
1873            }
1874        }
1875        Ok(())
1876    }
1877}
1878
1879pub trait OutputDev {
1880    fn begin_page(
1881        &mut self,
1882        page_num: u32,
1883        media_box: &MediaBox,
1884        art_box: Option<(f64, f64, f64, f64)>,
1885    ) -> Res<()>;
1886    fn end_page(&mut self) -> Res<()>;
1887    fn output_character(
1888        &mut self,
1889        trm: &Transform,
1890        width: f64,
1891        spacing: f64,
1892        font_size: f64,
1893        char: &str,
1894    ) -> Res<()>;
1895    fn begin_word(&mut self) -> Res<()>;
1896    fn end_word(&mut self) -> Res<()>;
1897    fn end_line(&mut self) -> Res<()>;
1898    fn stroke(
1899        &mut self,
1900        _ctm: &Transform,
1901        _colorspace: &ColorSpace,
1902        _color: &[f64],
1903        _path: &Path,
1904    ) -> Res<()> {
1905        Ok(())
1906    }
1907    fn fill(
1908        &mut self,
1909        _ctm: &Transform,
1910        _colorspace: &ColorSpace,
1911        _color: &[f64],
1912        _path: &Path,
1913    ) -> Res<()> {
1914        Ok(())
1915    }
1916}
1917
1918pub struct HTMLOutput<'a> {
1919    file: &'a mut dyn std::io::Write,
1920    flip_ctm: Transform,
1921    last_ctm: Transform,
1922    buf_ctm: Transform,
1923    buf_font_size: f64,
1924    buf: String,
1925}
1926
/// Replaces the spaces an HTML renderer would collapse with `&nbsp;`.
///
/// A space is kept as a plain `' '` only when it sits directly between two
/// non-space characters; leading, trailing and consecutive spaces all become
/// `&nbsp;`. Every other character is copied through unchanged.
fn insert_nbsp(input: &str) -> String {
    let mut out = String::new();
    let mut prev_is_word = false;
    let mut rest = input.chars().peekable();
    while let Some(ch) = rest.next() {
        if ch != ' ' {
            out.push(ch);
            prev_is_word = true;
            continue;
        }
        // A lone space between two word characters survives HTML whitespace
        // collapsing; anything else has to be made non-breaking.
        let next_is_word = rest.peek().map_or(false, |&next| next != ' ');
        out.push_str(if prev_is_word && next_is_word {
            " "
        } else {
            "&nbsp;"
        });
        prev_is_word = false;
    }
    out
}
1946
1947impl<'a> HTMLOutput<'a> {
1948    pub fn new(file: &mut dyn std::io::Write) -> HTMLOutput {
1949        HTMLOutput {
1950            file,
1951            flip_ctm: Transform2D::identity(),
1952            last_ctm: Transform2D::identity(),
1953            buf_ctm: Transform2D::identity(),
1954            buf: String::new(),
1955            buf_font_size: 0.,
1956        }
1957    }
1958    fn flush_string(&mut self) -> Res<()> {
1959        if !self.buf.is_empty() {
1960            let position = self.buf_ctm.post_transform(&self.flip_ctm);
1961            let transformed_font_size_vec = self
1962                .buf_ctm
1963                .transform_vector(vec2(self.buf_font_size, self.buf_font_size));
1964            // get the length of one sized of the square with the same area with a rectangle of size (x, y)
1965            let transformed_font_size =
1966                (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1967            let (x, y) = (position.m31, position.m32);
1968            println!("flush {} {:?}", self.buf, (x, y));
1969
1970            writeln!(
1971                self.file,
1972                "<div style='position: absolute; left: {}px; top: {}px; font-size: {}px'>{}</div>",
1973                x,
1974                y,
1975                transformed_font_size,
1976                insert_nbsp(&self.buf)
1977            )?;
1978        }
1979        Ok(())
1980    }
1981}
1982
1983type ArtBox = (f64, f64, f64, f64);
1984
1985impl<'a> OutputDev for HTMLOutput<'a> {
1986    fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Res<()> {
1987        write!(self.file, "<meta charset='utf-8' /> ")?;
1988        write!(self.file, "<!-- page {} -->", page_num)?;
1989        write!(self.file, "<div id='page{}' style='position: relative; height: {}px; width: {}px; border: 1px black solid'>", page_num, media_box.ury - media_box.lly, media_box.urx - media_box.llx)?;
1990        self.flip_ctm = Transform::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1991        Ok(())
1992    }
1993    fn end_page(&mut self) -> Res<()> {
1994        self.flush_string()?;
1995        self.buf = String::new();
1996        self.last_ctm = Transform::identity();
1997        write!(self.file, "</div>")?;
1998        Ok(())
1999    }
2000    fn output_character(
2001        &mut self,
2002        trm: &Transform,
2003        width: f64,
2004        spacing: f64,
2005        font_size: f64,
2006        char: &str,
2007    ) -> Res<()> {
2008        if trm.approx_eq(&self.last_ctm) {
2009            let position = trm.post_transform(&self.flip_ctm);
2010            let (x, y) = (position.m31, position.m32);
2011
2012            println!("accum {} {:?}", char, (x, y));
2013            self.buf += char;
2014        } else {
2015            println!(
2016                "flush {} {:?} {:?} {} {} {}",
2017                char, trm, self.last_ctm, width, font_size, spacing
2018            );
2019            self.flush_string()?;
2020            self.buf = char.to_owned();
2021            self.buf_font_size = font_size;
2022            self.buf_ctm = *trm;
2023        }
2024        let position = trm.post_transform(&self.flip_ctm);
2025        let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
2026        // get the length of one sized of the square with the same area with a rectangle of size (x, y)
2027        let transformed_font_size =
2028            (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
2029        let (x, y) = (position.m31, position.m32);
2030        write!(self.file, "<div style='position: absolute; color: red; left: {}px; top: {}px; font-size: {}px'>{}</div>",
2031               x, y, transformed_font_size, char)?;
2032        self.last_ctm = trm.pre_transform(&Transform2D::create_translation(
2033            width * font_size + spacing,
2034            0.,
2035        ));
2036
2037        Ok(())
2038    }
2039    fn begin_word(&mut self) -> Res<()> {
2040        Ok(())
2041    }
2042    fn end_word(&mut self) -> Res<()> {
2043        Ok(())
2044    }
2045    fn end_line(&mut self) -> Res<()> {
2046        Ok(())
2047    }
2048}
2049
/// Output device that writes the filled paths of a page as an SVG document.
pub struct SVGOutput<'a> {
    /// Destination of the generated SVG markup.
    file: &'a mut dyn std::io::Write,
}

impl<'a> SVGOutput<'a> {
    /// Creates an SVG output device that writes to `file`.
    pub fn new(file: &mut dyn std::io::Write) -> SVGOutput {
        let output = SVGOutput { file };
        output
    }
}
2058
2059impl<'a> OutputDev for SVGOutput<'a> {
2060    fn begin_page(
2061        &mut self,
2062        _page_num: u32,
2063        media_box: &MediaBox,
2064        art_box: Option<(f64, f64, f64, f64)>,
2065    ) -> Res<()> {
2066        let ver = 1.1;
2067        writeln!(self.file, "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>")?;
2068        if ver == 1.1 {
2069            write!(
2070                self.file,
2071                r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">"#
2072            )?;
2073        } else {
2074            write!(
2075                self.file,
2076                r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">"#
2077            )?;
2078        }
2079        if let Some(art_box) = art_box {
2080            let width = art_box.2 - art_box.0;
2081            let height = art_box.3 - art_box.1;
2082            let y = media_box.ury - art_box.1 - height;
2083            write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, art_box.0, y, width, height)?;
2084        } else {
2085            let width = media_box.urx - media_box.llx;
2086            let height = media_box.ury - media_box.lly;
2087            write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, media_box.llx, media_box.lly, width, height)?;
2088        }
2089        writeln!(self.file)?;
2090        type Mat = Transform;
2091
2092        let ctm = Mat::create_scale(1., -1.).post_translate(vec2(0., media_box.ury));
2093        writeln!(
2094            self.file,
2095            "<g transform='matrix({}, {}, {}, {}, {}, {})'>",
2096            ctm.m11, ctm.m12, ctm.m21, ctm.m22, ctm.m31, ctm.m32,
2097        )?;
2098        Ok(())
2099    }
2100    fn end_page(&mut self) -> Res<()> {
2101        writeln!(self.file, "</g>")?;
2102        write!(self.file, "</svg>")?;
2103        Ok(())
2104    }
2105    fn output_character(
2106        &mut self,
2107        _trm: &Transform,
2108        _width: f64,
2109        _spacing: f64,
2110        _font_size: f64,
2111        _char: &str,
2112    ) -> Res<()> {
2113        Ok(())
2114    }
2115    fn begin_word(&mut self) -> Res<()> {
2116        Ok(())
2117    }
2118    fn end_word(&mut self) -> Res<()> {
2119        Ok(())
2120    }
2121    fn end_line(&mut self) -> Res<()> {
2122        Ok(())
2123    }
2124    fn fill(
2125        &mut self,
2126        ctm: &Transform,
2127        _colorspace: &ColorSpace,
2128        _color: &[f64],
2129        path: &Path,
2130    ) -> Res<()> {
2131        write!(
2132            self.file,
2133            "<g transform='matrix({}, {}, {}, {}, {}, {})'>",
2134            ctm.m11, ctm.m12, ctm.m21, ctm.m22, ctm.m31, ctm.m32,
2135        )?;
2136
2137        /*if path.ops.len() == 1 {
2138            if let PathOp::Rect(x, y, width, height) = path.ops[0] {
2139                write!(self.file, "<rect x={} y={} width={} height={} />\n", x, y, width, height);
2140                write!(self.file, "</g>");
2141                return;
2142            }
2143        }*/
2144        let mut d = Vec::new();
2145        for op in &path.ops {
2146            match *op {
2147                PathOp::MoveTo(x, y) => d.push(format!("M{} {}", x, y)),
2148                PathOp::LineTo(x, y) => d.push(format!("L{} {}", x, y)),
2149                PathOp::CurveTo(x1, y1, x2, y2, x, y) => {
2150                    d.push(format!("C{} {} {} {} {} {}", x1, y1, x2, y2, x, y))
2151                }
2152                PathOp::Close => d.push("Z".to_string()),
2153                PathOp::Rect(x, y, width, height) => {
2154                    d.push(format!("M{} {}", x, y));
2155                    d.push(format!("L{} {}", x + width, y));
2156                    d.push(format!("L{} {}", x + width, y + height));
2157                    d.push(format!("L{} {}", x, y + height));
2158                    d.push("Z".to_string());
2159                }
2160            }
2161        }
2162        write!(self.file, "<path d='{}' />", d.join(" "))?;
2163        write!(self.file, "</g>")?;
2164        writeln!(self.file)?;
2165        Ok(())
2166    }
2167}
2168
2169/*
2170File doesn't implement std::fmt::Write so we have
2171to do some gymnastics to accept a File or String
2172See https://github.com/rust-lang/rust/issues/51305
2173*/
2174
/// Conversion of a text sink into something that implements `std::fmt::Write`.
pub trait ConvertToFmt {
    /// The `std::fmt::Write` implementation produced by [`convert`](Self::convert).
    type Writer: std::fmt::Write;
    /// Consumes `self` and returns the writer.
    fn convert(self) -> Self::Writer;
}

/// A `String` already implements `std::fmt::Write`, so it is used directly.
impl<'a> ConvertToFmt for &'a mut String {
    type Writer = &'a mut String;
    fn convert(self) -> Self::Writer {
        self
    }
}

/// Adapts a `std::io::Write` sink to the `std::fmt::Write` interface.
pub struct WriteAdapter<W> {
    /// The wrapped byte sink.
    f: W,
}

impl<W: std::io::Write> std::fmt::Write for WriteAdapter<W> {
    /// Writes the UTF-8 bytes of `s` to the wrapped sink; any I/O error is
    /// reported as `fmt::Error`.
    fn write_str(&mut self, s: &str) -> fmt::Result {
        match self.f.write_all(s.as_bytes()) {
            Ok(()) => Ok(()),
            Err(_) => Err(fmt::Error),
        }
    }
}

/// Any `std::io::Write` trait object is wrapped in a [`WriteAdapter`].
impl<'a> ConvertToFmt for &'a mut dyn std::io::Write {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        let f = self;
        WriteAdapter { f }
    }
}

/// A `File` is wrapped in a [`WriteAdapter`].
impl<'a> ConvertToFmt for &'a mut File {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        let f = self;
        WriteAdapter { f }
    }
}
2210
2211pub struct PlainTextOutput<W: ConvertToFmt> {
2212    writer: W::Writer,
2213    last_end: f64,
2214    last_y: f64,
2215    first_char: bool,
2216    flip_ctm: Transform,
2217}
2218
2219impl<W: ConvertToFmt> PlainTextOutput<W> {
2220    pub fn new(writer: W) -> PlainTextOutput<W> {
2221        PlainTextOutput {
2222            writer: writer.convert(),
2223            last_end: 100000.,
2224            first_char: false,
2225            last_y: 0.,
2226            flip_ctm: Transform2D::identity(),
2227        }
2228    }
2229}
2230
2231/* There are some structural hints that PDFs can use to signal word and line endings:
2232 * however relying on these is not likely to be sufficient. */
2233impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
2234    fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Res<()> {
2235        self.flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
2236        Ok(())
2237    }
2238    fn end_page(&mut self) -> Res<()> {
2239        Ok(())
2240    }
2241    fn output_character(
2242        &mut self,
2243        trm: &Transform,
2244        width: f64,
2245        _spacing: f64,
2246        font_size: f64,
2247        char: &str,
2248    ) -> Res<()> {
2249        let position = trm.post_transform(&self.flip_ctm);
2250        let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
2251        // get the length of one sized of the square with the same area with a rectangle of size (x, y)
2252        let transformed_font_size =
2253            (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
2254        let (x, y) = (position.m31, position.m32);
2255        use std::fmt::Write;
2256        //dlog!("last_end: {} x: {}, width: {}", self.last_end, x, width);
2257        if self.first_char {
2258            if (y - self.last_y).abs() > transformed_font_size * 1.5 {
2259                writeln!(self.writer)?;
2260            }
2261
2262            // we've moved to the left and down
2263            if x < self.last_end && (y - self.last_y).abs() > transformed_font_size * 0.5 {
2264                writeln!(self.writer)?;
2265            }
2266
2267            if x > self.last_end + transformed_font_size * 0.1 {
2268                dlog!(
2269                    "width: {}, space: {}, thresh: {}",
2270                    width,
2271                    x - self.last_end,
2272                    transformed_font_size * 0.1
2273                );
2274                write!(self.writer, " ")?;
2275            }
2276        }
2277        //let norm = unicode_normalization::UnicodeNormalization::nfkc(char);
2278        write!(self.writer, "{}", char)?;
2279        self.first_char = false;
2280        self.last_y = y;
2281        self.last_end = x + width * transformed_font_size;
2282        Ok(())
2283    }
2284    fn begin_word(&mut self) -> Res<()> {
2285        self.first_char = true;
2286        Ok(())
2287    }
2288    fn end_word(&mut self) -> Res<()> {
2289        Ok(())
2290    }
2291    fn end_line(&mut self) -> Res<()> {
2292        //write!(self.file, "\n");
2293        Ok(())
2294    }
2295}
2296
2297pub fn print_metadata(doc: &Document) -> Res<()> {
2298    dlog!("Version: {}", doc.version);
2299    if let Some(info) = get_info(doc) {
2300        for (k, v) in info {
2301            if let &Object::String(ref s, StringFormat::Literal) = v {
2302                dlog!("{}: {}", pdf_to_utf8(k)?, pdf_to_utf8(s)?);
2303            }
2304        }
2305    }
2306    dlog!("Page count: {}", get::<i64>(doc, get_pages(doc)?, b"Count"));
2307    dlog!("Pages: {:?}", get_pages(doc));
2308    dlog!(
2309        "Type: {:?}",
2310        get_pages(doc)?.get(b"Type").and_then(|x| x.as_name())?
2311    );
2312    Ok(())
2313}
2314
2315/// Extract the text from a pdf at `path` and return a `String` with the results
2316pub fn extract_text<P: std::convert::AsRef<std::path::Path>>(
2317    path: P,
2318) -> Result<String, OutputError> {
2319    let mut s = String::new();
2320    {
2321        let mut output = PlainTextOutput::new(&mut s);
2322        let doc = Document::load(path)?;
2323        output_doc(&doc, &mut output)?;
2324    }
2325    Ok(s)
2326}
2327
2328/// Reads a pdf from a byte array and returns its text content
2329pub fn extract_text_from_mem(buffer: &[u8]) -> Result<String, OutputError> {
2330    let mut s = String::new();
2331    {
2332        let mut output = PlainTextOutput::new(&mut s);
2333        let doc = Document::load_mem(buffer)?;
2334        output_doc(&doc, &mut output)?;
2335    }
2336    Ok(s)
2337}
2338
2339fn get_inherited<'a, T: FromObj<'a>>(
2340    doc: &'a Document,
2341    dict: &'a Dictionary,
2342    key: &[u8],
2343) -> Option<T> {
2344    let o: Option<T> = get(doc, dict, key);
2345    if let Some(o) = o {
2346        Some(o)
2347    } else {
2348        let parent = dict
2349            .get(b"Parent")
2350            .and_then(|parent| parent.as_reference())
2351            .and_then(|id| doc.get_dictionary(id))
2352            .ok()?;
2353        get_inherited(doc, parent, key)
2354    }
2355}
2356/// Parse a given document and output it to `output`
2357pub fn output_doc(doc: &Document, output: &mut dyn OutputDev) -> Res<()> {
2358    let empty_resources = &Dictionary::new();
2359
2360    let pages = doc.get_pages();
2361    for dict in pages {
2362        let page_num = dict.0;
2363        let page_dict = doc.get_object(dict.1)?.as_dict()?;
2364        dlog!("page {} {:?}", page_num, page_dict);
2365        // XXX: Some pdfs lack a Resources directory
2366        let resources = get_inherited(doc, page_dict, b"Resources").unwrap_or(empty_resources);
2367        dlog!("resources {:?}", resources);
2368
2369        // pdfium searches up the page tree for MediaBoxes as needed
2370        let media_box: Vec<f64> = get_inherited(doc, page_dict, b"MediaBox").expect("MediaBox");
2371        let media_box = MediaBox {
2372            llx: media_box[0],
2373            lly: media_box[1],
2374            urx: media_box[2],
2375            ury: media_box[3],
2376        };
2377
2378        let art_box =
2379            get::<Option<Vec<f64>>>(doc, page_dict, b"ArtBox").map(|x| (x[0], x[1], x[2], x[3]));
2380
2381        output.begin_page(page_num, &media_box, art_box)?;
2382
2383        Processor::process_stream(
2384            doc,
2385            doc.get_page_content(dict.1)?,
2386            resources,
2387            &media_box,
2388            output,
2389        )?;
2390
2391        output.end_page()?;
2392    }
2393    Ok(())
2394}