1use euclid::*;
2use lopdf::content::Content;
3pub use lopdf::*;
4use std::fmt::Debug;
5extern crate adobe_cmap_parser;
6extern crate encoding;
7extern crate euclid;
8extern crate type1_encoding_parser;
9extern crate unicode_normalization;
10use encoding::all::UTF_16BE;
11use encoding::{DecoderTrap, Encoding};
12use error::{OutputError, Res};
13use euclid::vec2;
14use std::collections::hash_map::Entry;
15use std::collections::HashMap;
16use std::f32::consts::E;
17use std::fmt;
18use std::fs::File;
19use std::marker::PhantomData;
20use std::rc::Rc;
21use std::result::Result;
22use std::slice::Iter;
23use std::str;
24mod core_fonts;
25mod encodings;
26mod error;
27mod glyphnames;
28mod zapfglyphnames;
29
/// Marker unit type used as both the source and destination space of [`Transform`].
pub struct Space;
/// `euclid` 2D transform over `f64` coordinates, mapping [`Space`] to [`Space`].
pub type Transform = Transform2D<f64, Space, Space>;
32
// Debug-logging stub. Nothing is printed, but every argument expression is still
// evaluated (bound with `let _ = ...`), so arguments that index, unwrap or use `?`
// keep their side effects at the call site.
macro_rules! dlog {
    ($($e:expr),*) => { {$(let _ = $e;)*} }
}
39
40fn get_info(doc: &Document) -> Option<&Dictionary> {
41 if let Ok(&Object::Reference(ref id)) = doc.trailer.get(b"Info") {
42 if let Ok(&Object::Dictionary(ref info)) = doc.get_object(*id) {
43 return Some(info);
44 }
45 }
46 None
47}
48
49fn get_catalog(doc: &Document) -> Res<&Dictionary> {
50 if let &Object::Reference(ref id) = doc.trailer.get(b"Root")? {
51 if let Ok(&Object::Dictionary(ref catalog)) = doc.get_object(*id) {
52 return Ok(catalog);
53 }
54 }
55 Err("No catalog / Root found".into())
56}
57
58fn get_pages(doc: &Document) -> Res<&Dictionary> {
59 let catalog = get_catalog(doc)?;
60 match catalog.get(b"Pages")? {
61 &Object::Reference(ref id) => match doc.get_object(*id) {
62 Ok(&Object::Dictionary(ref pages)) => {
63 return Ok(pages);
64 }
65 other => {
66 dlog!("pages: {:?}", other)
67 }
68 },
69 other => {
70 dlog!("pages: {:?}", other)
71 }
72 }
73 dlog!("catalog {:?}", catalog);
74 Err("No pages found".into())
75}
76
/// Lookup table from a single byte (the index, 0-255) to a UTF-16 code unit.
///
/// `pdf_to_utf8` uses it for strings that do not start with a UTF-16BE byte-order
/// mark, and the font code uses it as the fallback / base encoding table.
/// NOTE(review): the `0x0000` entries at 0x7f, 0x9f and 0xad presumably mark codes
/// that PDFDocEncoding leaves undefined - confirm against the PDF specification.
#[allow(non_upper_case_globals)]
const PDFDocEncoding: &[u16] = &[
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b,
    0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
    0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, 0x0020, 0x0021, 0x0022, 0x0023,
    0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b,
    0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053,
    0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b,
    0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000, 0x2022, 0x2020, 0x2021, 0x2026,
    0x2014, 0x2013, 0x0192, 0x2044, 0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, 0x0178, 0x017d, 0x0131, 0x0142,
    0x0153, 0x0161, 0x017e, 0x0000, 0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
    0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb,
    0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00e0, 0x00e1, 0x00e2, 0x00e3,
    0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb,
    0x00fc, 0x00fd, 0x00fe, 0x00ff,
];
102
103fn pdf_to_utf8(s: &[u8]) -> Res<String> {
104 if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
105 return UTF_16BE
106 .decode(&s[2..], DecoderTrap::Strict)
107 .map_err(|_| "pdf decode err".into());
108 } else {
109 let r: Vec<u8> = s
110 .iter()
111 .copied()
112 .flat_map(|x| {
113 let k = PDFDocEncoding[x as usize];
114 vec![(k >> 8) as u8, k as u8].into_iter()
115 })
116 .collect();
117 return UTF_16BE
118 .decode(&r, DecoderTrap::Strict)
119 .map_err(|_| "pdf decode err".into());
120 }
121}
122
123fn to_utf8(encoding: &[u16], s: &[u8]) -> Res<String> {
124 if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
125 return UTF_16BE
126 .decode(&s[2..], DecoderTrap::Strict)
127 .map_err(|_| "utf_16BE decode err".into());
128 } else {
129 let r: Vec<u8> = s
130 .iter()
131 .copied()
132 .flat_map(|x| {
133 let k = encoding[x as usize];
134 vec![(k >> 8) as u8, k as u8].into_iter()
135 })
136 .collect();
137 return UTF_16BE
138 .decode(&r, DecoderTrap::Strict)
139 .map_err(|_| "utf_16BE decode err".into());
140 }
141}
142
143fn maybe_deref<'a>(doc: &'a Document, o: &'a Object) -> &'a Object {
144 match o {
145 &Object::Reference(r) => doc.get_object(r).expect("missing object reference"),
146 _ => o,
147 }
148}
149
150fn maybe_get_obj<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Object> {
151 dict.get(key).map(|o| maybe_deref(doc, o)).ok()
152}
153
/// Conversion from a possibly-missing dictionary entry.
///
/// Implemented for `Option<T>` (a missing entry becomes `None`) and for bare `T`
/// (a missing entry panics with the key name); see the impls below the trait.
trait FromOptObj<'a> {
    /// Converts `obj`, the value found under `key` (if any), into `Self`.
    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self;
}
158
/// Conversion from a PDF object that is known to be present.
trait FromObj<'a>
where
    Self: std::marker::Sized,
{
    /// Converts `obj` into `Self`, returning `None` when the object has the wrong type.
    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self>;
}
166
167impl<'a, T: FromObj<'a>> FromOptObj<'a> for Option<T> {
168 fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, _key: &[u8]) -> Self {
169 obj.and_then(|x| T::from_obj(doc, x))
170 }
171}
172
173impl<'a, T: FromObj<'a>> FromOptObj<'a> for T {
174 fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self {
175 T::from_obj(
176 doc,
177 obj.unwrap_or_else(|| panic!("{}", String::from_utf8_lossy(key).to_string())),
178 )
179 .expect("wrong type")
180 }
181}
182
183impl<'a, T: FromObj<'a>> FromObj<'a> for Vec<T> {
186 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
187 maybe_deref(doc, obj)
188 .as_array()
189 .map(|x| {
190 x.iter()
191 .map(|x| T::from_obj(doc, x).expect("wrong type"))
192 .collect()
193 })
194 .ok()
195 }
196}
197
198impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 4] {
201 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
202 maybe_deref(doc, obj)
203 .as_array()
204 .map(|x| {
205 let mut all = x.iter().map(|x| T::from_obj(doc, x).expect("wrong type"));
206 [
207 all.next().unwrap(),
208 all.next().unwrap(),
209 all.next().unwrap(),
210 all.next().unwrap(),
211 ]
212 })
213 .ok()
214 }
215}
216
217impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 3] {
218 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
219 maybe_deref(doc, obj)
220 .as_array()
221 .map(|x| {
222 let mut all = x.iter().map(|x| T::from_obj(doc, x).expect("wrong type"));
223 [
224 all.next().unwrap(),
225 all.next().unwrap(),
226 all.next().unwrap(),
227 ]
228 })
229 .ok()
230 }
231}
232
233impl<'a> FromObj<'a> for f64 {
234 fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
235 match *obj {
236 Object::Integer(i) => Some(i as f64),
237 Object::Real(f) => Some(f as f64),
238 _ => None,
239 }
240 }
241}
242
243impl<'a> FromObj<'a> for i64 {
244 fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
245 match obj {
246 &Object::Integer(i) => Some(i),
247 _ => None,
248 }
249 }
250}
251
252impl<'a> FromObj<'a> for &'a Dictionary {
253 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
254 maybe_deref(doc, obj).as_dict().ok()
255 }
256}
257
258impl<'a> FromObj<'a> for &'a Stream {
259 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Stream> {
260 maybe_deref(doc, obj).as_stream().ok()
261 }
262}
263
264impl<'a> FromObj<'a> for &'a Object {
265 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
266 Some(maybe_deref(doc, obj))
267 }
268}
269
270fn get<'a, T: FromOptObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> T {
271 T::from_opt_obj(doc, dict.get(key).ok(), key)
272}
273
274fn get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Res<String> {
275 pdf_to_utf8(dict.get(key).map(|o| maybe_deref(doc, o))?.as_name()?)
276}
277
278#[allow(dead_code)]
279fn maybe_get_name_string<'a>(
280 doc: &'a Document,
281 dict: &'a Dictionary,
282 key: &[u8],
283) -> Option<String> {
284 let ob = maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok());
285 if let Some(ob) = ob {
286 pdf_to_utf8(ob).ok()
287 } else {
288 None
289 }
290}
291
292fn maybe_get_name<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a [u8]> {
293 maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok())
294}
295
296fn maybe_get_array<'a>(
297 doc: &'a Document,
298 dict: &'a Dictionary,
299 key: &[u8],
300) -> Option<&'a Vec<Object>> {
301 maybe_get_obj(doc, dict, key).and_then(|n| n.as_array().ok())
302}
303
/// A font handled by the single-byte code path (`make_font` uses it for every
/// subtype other than `Type0` and `Type3`).
#[derive(Clone)]
struct PdfSimpleFont<'a> {
    /// The font dictionary this value was built from.
    font: &'a Dictionary,
    /// The owning document, used to resolve indirect references.
    doc: &'a Document,
    /// Table from char code to UTF-16 code unit; when `None`, `decode_char` falls
    /// back to `PDFDocEncoding`.
    encoding: Option<Vec<u16>>,
    /// Char code to text map built by `get_unicode_map`; when present, `decode_char`
    /// consults it exclusively.
    unicode_map: Option<HashMap<u32, String>>,
    /// Glyph widths keyed by char code.
    widths: HashMap<CharCode, f64>,
    /// Fallback returned by `get_width` for unknown codes (`new` always stores `None`).
    default_width: Option<f64>,
}
313
/// A font whose `Subtype` is `Type3` (see `make_font`).
#[derive(Clone)]
struct PdfType3Font<'a> {
    /// The font dictionary this value was built from.
    font: &'a Dictionary,
    /// The owning document, used to resolve indirect references.
    doc: &'a Document,
    /// Table from char code to UTF-16 code unit; when `None`, `decode_char` falls
    /// back to `PDFDocEncoding`.
    encoding: Option<Vec<u16>>,
    /// Char code to text map built by `get_unicode_map`; when present, `decode_char`
    /// consults it exclusively.
    unicode_map: Option<HashMap<u32, String>>,
    /// Glyph widths keyed by char code, filled from `FirstChar` / `Widths`.
    widths: HashMap<CharCode, f64>,
}
322
323fn make_font<'a>(doc: &'a Document, font: &'a Dictionary) -> Res<Rc<dyn PdfFont + 'a>> {
324 let subtype = get_name_string(doc, font, b"Subtype")?;
325 dlog!("MakeFont({})", subtype);
326 if subtype == "Type0" {
327 Ok(Rc::new(PdfCIDFont::new(doc, font)?))
328 } else if subtype == "Type3" {
329 Ok(Rc::new(PdfType3Font::new(doc, font)?))
330 } else {
331 Ok(Rc::new(PdfSimpleFont::new(doc, font)?))
332 }
333}
334
/// Reports whether `name` is exactly (case-sensitively) one of the fourteen
/// font names listed below.
fn is_core_font(name: &str) -> bool {
    const CORE_FONTS: [&str; 14] = [
        "Courier-Bold",
        "Courier-BoldOblique",
        "Courier-Oblique",
        "Courier",
        "Helvetica-Bold",
        "Helvetica-BoldOblique",
        "Helvetica-Oblique",
        "Helvetica",
        "Symbol",
        "Times-Bold",
        "Times-BoldItalic",
        "Times-Italic",
        "Times-Roman",
        "ZapfDingbats",
    ];
    CORE_FONTS.iter().any(|&candidate| candidate == name)
}
354
355fn encoding_to_unicode_table(name: &[u8]) -> Res<Vec<u16>> {
356 let encoding = match name {
357 b"MacRomanEncoding" => encodings::MAC_ROMAN_ENCODING,
358 b"MacExpertEncoding" => encodings::MAC_EXPERT_ENCODING,
359 b"WinAnsiEncoding" => encodings::WIN_ANSI_ENCODING,
360 _ => return Err(format!("unexpected encoding {:?}", pdf_to_utf8(name)).into()),
361 };
362 let encoding_table = encoding
363 .iter()
364 .map(|x| {
365 if let &Some(x) = x {
366 glyphnames::name_to_unicode(x).unwrap()
367 } else {
368 0
369 }
370 })
371 .collect();
372 Ok(encoding_table)
373}
374
375impl<'a> PdfSimpleFont<'a> {
382 fn new(doc: &'a Document, font: &'a Dictionary) -> Res<PdfSimpleFont<'a>> {
383 let base_name = get_name_string(doc, font, b"BaseFont")?;
384 let subtype = get_name_string(doc, font, b"Subtype")?;
385
386 let encoding: Option<&Object> = get(doc, font, b"Encoding");
387 dlog!(
388 "base_name {} {} enc:{:?} {:?}",
389 base_name,
390 subtype,
391 encoding,
392 font
393 );
394 let descriptor: Option<&Dictionary> = get(doc, font, b"FontDescriptor");
395 let mut type1_encoding = None;
396 if let Some(descriptor) = descriptor {
397 dlog!("descriptor {:?}", descriptor);
398 if subtype == "Type1" {
399 let file = maybe_get_obj(doc, descriptor, b"FontFile");
400 match file {
401 Some(&Object::Stream(ref s)) => {
402 let s = get_contents(s);
403 type1_encoding =
405 Some(type1_encoding_parser::get_encoding_map(&s).expect("encoding"));
406 }
407 _ => {
408 dlog!("font file {:?}", file)
409 }
410 }
411 } else if subtype == "TrueType" {
412 let file = maybe_get_obj(doc, descriptor, b"FontFile2");
413 match file {
414 Some(&Object::Stream(ref s)) => {
415 let _s = get_contents(s);
416 }
418 _ => {
419 dlog!("font file {:?}", file)
420 }
421 }
422 }
423
424 let font_file3 = get::<Option<&Object>>(doc, descriptor, b"FontFile3");
425 match font_file3 {
426 Some(&Object::Stream(ref s)) => {
427 dlog!("font file {:?}", s);
428 }
429 None => {}
430 _ => {
431 dlog!("unexpected")
432 }
433 }
434
435 let charset = maybe_get_obj(doc, descriptor, b"CharSet");
436 let _charset = match charset {
437 Some(&Object::String(ref s, _)) => Some(pdf_to_utf8(s)),
438 _ => None,
439 };
440 }
442
443 let mut unicode_map = get_unicode_map(doc, font)?;
444
445 let mut encoding_table = None;
446 match encoding {
447 Some(&Object::Name(ref encoding_name)) => {
448 dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
449 encoding_table = Some(encoding_to_unicode_table(encoding_name)?);
450 }
451 Some(&Object::Dictionary(ref encoding)) => {
452 let mut table =
454 if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
455 dlog!("BaseEncoding {:?}", base_encoding);
456 encoding_to_unicode_table(base_encoding)?
457 } else {
458 Vec::from(PDFDocEncoding)
459 };
460 let differences = maybe_get_array(doc, encoding, b"Differences");
461 if let Some(differences) = differences {
462 dlog!("Differences");
463 let mut code = 0;
464 for o in differences {
465 let o = maybe_deref(doc, o);
466 match *o {
467 Object::Integer(i) => {
468 code = i;
469 }
470 Object::Name(ref n) => {
471 let name = pdf_to_utf8(n)?;
472 let unicode = glyphnames::name_to_unicode(&name);
475 if let Some(unicode) = unicode {
476 table[code as usize] = unicode;
477 if let Some(ref mut unicode_map) = unicode_map {
478 let be = [unicode];
479 match unicode_map.entry(code as u32) {
480 Entry::Vacant(v) => {
482 v.insert(
483 String::from_utf16(&be)
484 .map_err(|_| "utf16 err")?,
485 );
486 }
487 Entry::Occupied(e) => {
488 if e.get()
489 != &String::from_utf16(&be)
490 .map_err(|_| "utf16 err")?
491 {
492 println!("Unicode mismatch");
493 }
494 }
495 }
496 }
497 }
498 dlog!("{} = {} ({:?})", code, name, unicode);
499 if let Some(ref mut unicode_map) = unicode_map {
500 dlog!("{} {}", code, unicode_map[&(code as u32)]);
501 }
502 code += 1;
503 }
504 _ => {
505 return Err(format!("wrong type {:?}", o).into());
506 }
507 }
508 }
509 }
510 let name = pdf_to_utf8(encoding.get(b"Type")?.as_name()?);
511 dlog!("name: {}", name?);
512
513 encoding_table = Some(table);
514 }
515 None => {
516 if let Some(type1_encoding) = type1_encoding {
517 let mut table = Vec::from(PDFDocEncoding);
518 dlog!("type1encoding");
519 for (code, name) in type1_encoding {
520 let unicode = glyphnames::name_to_unicode(&pdf_to_utf8(&name)?);
521 if let Some(unicode) = unicode {
522 table[code as usize] = unicode;
523 } else {
524 dlog!("unknown character {}", pdf_to_utf8(&name)?);
525 }
526 }
527 encoding_table = Some(table)
528 } else if subtype == "TrueType" {
529 encoding_table = Some(
530 encodings::WIN_ANSI_ENCODING
531 .iter()
532 .map(|x| {
533 x.and_then(glyphnames::name_to_unicode).unwrap_or_else(|| {
534 dlog!("unknown character {:?}", x);
535 0
536 })
537 })
538 .collect(),
539 );
540 }
541 }
542 _ => return Err("unexpected encoding type".into()),
543 }
544
545 let mut width_map = HashMap::new();
546 if is_core_font(&base_name) {
547 for font_metrics in core_fonts::metrics().iter() {
548 if font_metrics.0 == base_name {
549 if let Some(ref encoding) = encoding_table {
550 dlog!("has encoding");
551 for w in font_metrics.2 {
552 let c = glyphnames::name_to_unicode(w.2).unwrap();
553 for i in 0..encoding.len() {
554 if encoding[i] == c {
555 width_map.insert(i as CharCode, w.1);
556 }
557 }
558 }
559 } else {
560 let mut table = vec![0; 256];
565 for w in font_metrics.2 {
566 dlog!("{} {}", w.0, w.2);
567 if w.0 != -1 {
569 table[w.0 as usize] = if base_name == "ZapfDingbats" {
570 zapfglyphnames::zapfdigbats_names_to_unicode(w.2)
571 .ok_or(format!("Bad zapfdigbads name: {:?}", w))?
572 } else {
573 glyphnames::name_to_unicode(w.2).unwrap()
574 }
575 }
576 }
577
578 let encoding = &table[..];
579 for w in font_metrics.2 {
580 width_map.insert(w.0 as CharCode, w.1);
581 }
583 encoding_table = Some(encoding.to_vec());
584 }
585 }
595 }
596 } else {
597 let first_char: i64 = get(doc, font, b"FirstChar");
599 let last_char: i64 = get(doc, font, b"LastChar");
600 let widths: Vec<f64> = get(doc, font, b"Widths");
601 let mut i = 0;
602 dlog!(
603 "first_char {:?}, last_char: {:?}, widths: {} {:?}",
604 first_char,
605 last_char,
606 widths.len(),
607 widths
608 );
609
610 for w in widths {
611 width_map.insert((first_char + i) as CharCode, w);
612 i += 1;
613 }
614 assert_eq!(first_char + i - 1, last_char);
615 }
616
617 Ok(PdfSimpleFont {
618 doc,
619 font,
620 widths: width_map,
621 encoding: encoding_table,
622 default_width: None,
623 unicode_map,
624 })
625 }
626
627 #[allow(dead_code)]
628 fn get_type(&self) -> Res<String> {
629 get_name_string(self.doc, self.font, b"Type")
630 }
631 #[allow(dead_code)]
632 fn get_basefont(&self) -> Res<String> {
633 get_name_string(self.doc, self.font, b"BaseFont")
634 }
635 #[allow(dead_code)]
636 fn get_subtype(&self) -> Res<String> {
637 get_name_string(self.doc, self.font, b"Subtype")
638 }
639 #[allow(dead_code)]
640 fn get_widths(&self) -> Option<&Vec<Object>> {
641 maybe_get_obj(self.doc, self.font, b"Widths")
642 .map(|widths| widths.as_array().expect("Widths should be an array"))
643 }
644 #[allow(dead_code)]
647 fn get_name(&self) -> Option<String> {
648 maybe_get_name_string(self.doc, self.font, b"Name")
649 }
650
651 #[allow(dead_code)]
652 fn get_descriptor(&self) -> Option<PdfFontDescriptor> {
653 maybe_get_obj(self.doc, self.font, b"FontDescriptor")
654 .and_then(|desc| desc.as_dict().ok())
655 .map(|desc| PdfFontDescriptor {
656 desc,
657 doc: self.doc,
658 })
659 }
660}
661
662impl<'a> PdfType3Font<'a> {
663 fn new(doc: &'a Document, font: &'a Dictionary) -> Res<PdfType3Font<'a>> {
664 let unicode_map = get_unicode_map(doc, font)?;
665 let encoding: Option<&Object> = get(doc, font, b"Encoding");
666
667 let encoding_table;
668 match encoding {
669 Some(&Object::Name(ref encoding_name)) => {
670 dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
671 encoding_table = Some(encoding_to_unicode_table(encoding_name)?);
672 }
673 Some(&Object::Dictionary(ref encoding)) => {
674 let mut table =
676 if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
677 dlog!("BaseEncoding {:?}", base_encoding);
678 encoding_to_unicode_table(base_encoding)?
679 } else {
680 Vec::from(PDFDocEncoding)
681 };
682 let differences = maybe_get_array(doc, encoding, b"Differences");
683 if let Some(differences) = differences {
684 dlog!("Differences");
685 let mut code = 0;
686 for o in differences {
687 match *o {
688 Object::Integer(i) => {
689 code = i;
690 }
691 Object::Name(ref n) => {
692 let name = pdf_to_utf8(n)?;
693 let unicode = glyphnames::name_to_unicode(&name);
696 if let Some(unicode) = unicode {
697 table[code as usize] = unicode;
698 }
699 dlog!("{} = {} ({:?})", code, name, unicode);
700 if let Some(ref unicode_map) = unicode_map {
701 dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
702 }
703 code += 1;
704 }
705 _ => {
706 return Err("wrong type".into());
707 }
708 }
709 }
710 }
711 let name_encoded = encoding.get(b"Type");
712 if let Ok(Object::Name(name)) = name_encoded {
713 dlog!("name: {}", pdf_to_utf8(name)?);
714 } else {
715 dlog!("name not found");
716 }
717
718 encoding_table = Some(table);
719 }
720 _ => {
721 return Err("Wrong encoding".into());
722 }
723 }
724
725 let first_char: i64 = get(doc, font, b"FirstChar");
726 let last_char: i64 = get(doc, font, b"LastChar");
727 let widths: Vec<f64> = get(doc, font, b"Widths");
728
729 let mut width_map = HashMap::new();
730
731 let mut i = 0;
732 dlog!(
733 "first_char {:?}, last_char: {:?}, widths: {} {:?}",
734 first_char,
735 last_char,
736 widths.len(),
737 widths
738 );
739
740 for w in widths {
741 width_map.insert((first_char + i) as CharCode, w);
742 i += 1;
743 }
744 assert_eq!(first_char + i - 1, last_char);
745 Ok(PdfType3Font {
746 doc,
747 font,
748 widths: width_map,
749 encoding: encoding_table,
750 unicode_map,
751 })
752 }
753}
754
/// A character code read from a content-stream string (one byte for simple and
/// Type3 fonts, two bytes for CID fonts - see the `next_char` implementations).
type CharCode = u32;
756
/// Iterator over the character codes of a byte string, as split by a font.
struct PdfFontIter<'a> {
    /// Remaining undecoded bytes.
    i: Iter<'a, u8>,
    /// Font whose `next_char` decides how many bytes form one code.
    font: &'a dyn PdfFont,
}
761
762impl<'a> Iterator for PdfFontIter<'a> {
763 type Item = (CharCode, u8);
764 fn next(&mut self) -> Option<(CharCode, u8)> {
765 self.font.next_char(&mut self.i)
766 }
767}
768
/// Operations the text-showing code needs from any font kind.
trait PdfFont: Debug {
    /// Width of the glyph for char code `id`; implementations return an error when
    /// no width (and no default) is known.
    fn get_width(&self, id: CharCode) -> Res<f64>;
    /// Consumes the bytes of the next character code from `iter`, returning the code
    /// and how many bytes it occupied, or `None` at the end of input.
    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)>;
    /// Text for a single character code.
    fn decode_char(&self, char: CharCode) -> Res<String>;

}
779
780impl<'a> dyn PdfFont + 'a {
781 fn char_codes(&'a self, chars: &'a [u8]) -> PdfFontIter {
782 PdfFontIter {
783 i: chars.iter(),
784 font: self,
785 }
786 }
787 fn decode(&self, chars: &[u8]) -> String {
788 let strings = self
789 .char_codes(chars)
790 .map(|x| self.decode_char(x.0).unwrap())
792 .collect::<Vec<_>>();
793 strings.join("")
794 }
795}
796
797impl<'a> PdfFont for PdfSimpleFont<'a> {
798 fn get_width(&self, id: CharCode) -> Res<f64> {
799 let width = self.widths.get(&id);
800 if let Some(width) = width {
801 Ok(*width)
802 } else {
803 dlog!(
804 "missing width for {} falling back to default_width {:?}",
805 id,
806 self.font
807 );
808 self.default_width
809 .ok_or_else(|| "missing default width".into())
810 }
811 }
812 fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
818 iter.next().map(|x| (*x as CharCode, 1))
819 }
820 fn decode_char(&self, char: CharCode) -> Res<String> {
821 let slice = [char as u8];
822 if let Some(ref unicode_map) = self.unicode_map {
823 let s = unicode_map.get(&char);
824 let s = match s {
825 None => Err(format!("missing char {:?} in map {:?}", char, unicode_map).into()),
826 Some(s) => Ok(s.clone()),
827 };
828 return s;
829 }
830 let encoding = self
831 .encoding
832 .as_ref()
833 .map(|x| &x[..])
834 .unwrap_or(PDFDocEncoding);
835 to_utf8(encoding, &slice)
838 }
839}
840
841impl<'a> fmt::Debug for PdfSimpleFont<'a> {
842 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
843 self.font.fmt(f)
844 }
845}
846
847impl<'a> PdfFont for PdfType3Font<'a> {
848 fn get_width(&self, id: CharCode) -> Res<f64> {
849 let width = self.widths.get(&id);
850 if let Some(width) = width {
851 Ok(*width)
852 } else {
853 Err(format!("missing width for {} {:?}", id, self.font).into())
854 }
855 }
856 fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
862 iter.next().map(|x| (*x as CharCode, 1))
863 }
864 fn decode_char(&self, char: CharCode) -> Res<String> {
865 let slice = [char as u8];
866 if let Some(ref unicode_map) = self.unicode_map {
867 let s = unicode_map.get(&char);
868 let s = match s {
869 None => {
870 return Err(format!("missing char {:?} in map {:?}", char, unicode_map).into())
871 }
872 Some(s) => s.clone(),
873 };
874 return Ok(s);
875 }
876 let encoding = self
877 .encoding
878 .as_ref()
879 .map(|x| &x[..])
880 .unwrap_or(PDFDocEncoding);
881 to_utf8(encoding, &slice)
884 }
885}
886
887impl<'a> fmt::Debug for PdfType3Font<'a> {
888 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
889 self.font.fmt(f)
890 }
891}
892
/// A composite font (`Subtype` `Type0`, see `make_font`) with two-byte character codes.
struct PdfCIDFont<'a> {
    /// The Type0 font dictionary this value was built from.
    font: &'a Dictionary,
    /// The owning document.
    #[allow(dead_code)]
    doc: &'a Document,
    /// Unused; `new` always stores `None`.
    #[allow(dead_code)]
    encoding: Option<Vec<u16>>,
    /// Char code to text map built by `get_unicode_map`, if the font has one.
    to_unicode: Option<HashMap<u32, String>>,
    /// Glyph widths keyed by char code, read from the descendant font's `W` array.
    widths: HashMap<CharCode, f64>,
    /// Fallback width from the descendant font's `DW` entry (1000 when absent).
    default_width: Option<f64>,
}
903
904fn get_unicode_map<'a>(
905 doc: &'a Document,
906 font: &'a Dictionary,
907) -> Res<Option<HashMap<u32, String>>> {
908 let to_unicode = maybe_get_obj(doc, font, b"ToUnicode");
909 dlog!("ToUnicode: {:?}", to_unicode);
910 let mut unicode_map = None;
911 match to_unicode {
912 Some(&Object::Stream(ref stream)) => {
913 let contents = get_contents(stream);
914 dlog!(
915 "Stream: {}",
916 String::from_utf8(contents.clone()).unwrap_or_else(|_| "utf8 error".into())
917 );
918
919 let cmap = adobe_cmap_parser::get_unicode_map(&contents)?;
920 let mut unicode = HashMap::new();
921 for (&k, v) in cmap.iter() {
925 let mut be: Vec<u16> = Vec::new();
926 let mut i = 0;
927 assert!(v.len() % 2 == 0);
928 while i < v.len() {
929 be.push(((v[i] as u16) << 8) | v[i + 1] as u16);
930 i += 2;
931 }
932 if let [0xd800..=0xdfff] = &be[..] {
933 continue;
936 }
937 let s = String::from_utf16(&be).unwrap_or_else(|_| "utf8 error".into());
938
939 unicode.insert(k, s);
940 }
941 unicode_map = Some(unicode);
942
943 dlog!("map: {:?}", unicode_map);
944 }
945 None => {}
946 Some(&Object::Name(ref name)) => {
947 let name = pdf_to_utf8(name)?;
948 if name != "Identity-H" {
949 return Err(format!("unsupported cmap {:?}", name).into());
950 }
951 }
952 _ => return Err(format!("unsupported cmap {:?}", to_unicode).into()),
953 }
954 Ok(unicode_map)
955}
956
957impl<'a> PdfCIDFont<'a> {
958 fn new(doc: &'a Document, font: &'a Dictionary) -> Res<PdfCIDFont<'a>> {
959 let base_name = get_name_string(doc, font, b"BaseFont")?;
960 let descendants =
961 maybe_get_array(doc, font, b"DescendantFonts").expect("Descendant fonts required");
962 let ciddict = maybe_deref(doc, &descendants[0])
963 .as_dict()
964 .expect("should be CID dict");
965 let encoding =
966 maybe_get_obj(doc, font, b"Encoding").expect("Encoding required in type0 fonts");
967 dlog!("base_name {} {:?}", base_name, font);
968
969 match *encoding {
970 Object::Name(ref name) => {
971 let name = pdf_to_utf8(name)?;
972 dlog!("encoding {:?}", name);
973 if name != "Identity-H" {
974 return Err(format!("unsupported encoding {:?}", name).into());
975 }
976 }
977 Object::Stream(ref stream) => {
978 let contents = get_contents(stream);
979 dlog!(
980 "Stream: {}",
981 String::from_utf8(contents).map_err(|_| "bad utf8")?
982 );
983 }
984 _ => return Err("Unsupported formatting".into()),
985 }
986
987 let unicode_map = get_unicode_map(doc, font)?;
993
994 dlog!("descendents {:?} {:?}", descendants, ciddict);
995
996 let font_dict =
997 maybe_get_obj(doc, ciddict, b"FontDescriptor").ok_or("No FontDescriptor")?;
998 dlog!("{:?}", font_dict);
999 let _f = font_dict.as_dict().expect("must be dict");
1000 let default_width = get::<Option<i64>>(doc, ciddict, b"DW").unwrap_or(1000);
1001 let w: Option<Vec<&Object>> = get(doc, ciddict, b"W");
1002 dlog!("widths {:?}", w);
1003 let mut widths = HashMap::new();
1004 let mut i = 0;
1005 if let Some(w) = w {
1006 while i < w.len() {
1007 if let &Object::Array(ref wa) = w[i + 1] {
1008 let cid = w[i].as_i64().expect("id should be num");
1009 let mut j = 0;
1010 dlog!("wa: {:?} -> {:?}", cid, wa);
1011 for w in wa {
1012 widths.insert((cid + j) as CharCode, as_num(w)?);
1013 j += 1;
1014 }
1015 i += 2;
1016 } else {
1017 let c_first = w[i].as_i64().map_err(|_e| "first should be num")?;
1018 let c_last = w[i].as_i64().map_err(|_e| "last should be num")?;
1019 let c_width = as_num(w[i])?;
1020 for id in c_first..c_last {
1021 widths.insert(id as CharCode, c_width);
1022 }
1023 i += 3;
1024 }
1025 }
1026 }
1027 Ok(PdfCIDFont {
1028 doc,
1029 font,
1030 widths,
1031 to_unicode: unicode_map,
1032 encoding: None,
1033 default_width: Some(default_width as f64),
1034 })
1035 }
1036}
1037
1038impl<'a> PdfFont for PdfCIDFont<'a> {
1039 fn get_width(&self, id: CharCode) -> Res<f64> {
1040 let width = self.widths.get(&id);
1041 if let Some(width) = width {
1042 dlog!("GetWidth {} -> {}", id, *width);
1043 Ok(*width)
1044 } else {
1045 dlog!("missing width for {} falling back to default_width", id);
1046 self.default_width.ok_or("No Default Width".into())
1047 }
1048 } fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
1059 let p = iter.next();
1060 if let Some(&c) = p {
1061 let next = *iter.next()?;
1062 Some((((c as u32) << 8) | next as u32, 2))
1063 } else {
1064 None
1065 }
1066 }
1067 fn decode_char(&self, char: CharCode) -> Res<String> {
1068 let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
1069 let o = if let Some(s) = s {
1070 s.clone()
1071 } else {
1072 dlog!(
1073 "Unknown character {:?} in {:?} {:?}",
1074 char,
1075 self.font,
1076 self.to_unicode
1077 );
1078 "".to_string()
1079 };
1080 Ok(o)
1081 }
1082}
1083
1084impl<'a> fmt::Debug for PdfCIDFont<'a> {
1085 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1086 self.font.fmt(f)
1087 }
1088}
1089
/// Lightweight view of a font descriptor dictionary together with its document.
#[derive(Copy, Clone)]
struct PdfFontDescriptor<'a> {
    /// The `FontDescriptor` dictionary.
    desc: &'a Dictionary,
    /// The owning document, used to resolve indirect references.
    doc: &'a Document,
}
1095
1096impl<'a> PdfFontDescriptor<'a> {
1097 #[allow(dead_code)]
1098 fn get_file(&self) -> Option<&'a Object> {
1099 maybe_get_obj(self.doc, self.desc, b"FontFile")
1100 }
1101}
1102
1103impl<'a> fmt::Debug for PdfFontDescriptor<'a> {
1104 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1105 self.desc.fmt(f)
1106 }
1107}
1108
/// Parsed form of a function with `FunctionType` 0, as read by `Function::new`.
#[derive(Clone, Debug)]
struct Type0Func {
    /// The `Domain` array.
    domain: Vec<f64>,
    /// The `Range` array.
    range: Vec<f64>,
    /// The stream data, as returned by `get_contents`.
    contents: Vec<u8>,
    /// The `Size` array.
    size: Vec<i64>,
    /// The `BitsPerSample` entry.
    bits_per_sample: i64,
    /// The `Encode` array; defaults to `[0, size - 1]` for each entry of `size`.
    encode: Vec<f64>,
    /// The `Decode` array; defaults to a copy of `range`.
    decode: Vec<f64>,
}
1119
/// Linearly maps `x` from the interval `[x_min, x_max]` onto `[y_min, y_max]`:
///
/// `y = y_min + (x - x_min) * (y_max - y_min) / (x_max - x_min)`
///
/// The previous implementation divided by `x - x_min` instead of `x_max - x_min`,
/// so the `(x - x_min)` factors cancelled and the result was always `y_max`.
///
/// When the input interval is degenerate (`x_max == x_min`) the interpolation is
/// undefined and `y_min` is returned.
#[allow(dead_code)]
fn interpolate(x: f64, x_min: f64, x_max: f64, y_min: f64, y_max: f64) -> f64 {
    let divisor = x_max - x_min;
    if divisor != 0. {
        y_min + (x - x_min) * ((y_max - y_min) / divisor)
    } else {
        y_min
    }
}
1131
impl Type0Func {
    /// Placeholder: evaluation is not implemented. The body only derives the
    /// input / output counts (half the lengths of `domain` / `range`) and writes
    /// nothing to `_output`.
    #[allow(dead_code)]
    fn eval(&self, _input: &[f64], _output: &mut [f64]) {
        let _n_inputs = self.domain.len() / 2;
        let _n_ouputs = self.range.len() / 2;
    }
}
1139
/// Parsed form of a function with `FunctionType` 2, as read by `Function::new`.
#[derive(Clone, Debug)]
struct Type2Func {
    /// The optional `C0` array.
    c0: Option<Vec<f64>>,
    /// The optional `C1` array.
    c1: Option<Vec<f64>>,
    /// The required `N` entry.
    n: f64,
}
1146
/// A PDF function object. `Function::new` only ever constructs `Type0` and `Type2`;
/// the remaining variants are declared but never built in the code shown here.
#[derive(Clone, Debug)]
enum Function {
    Type0(Type0Func),
    Type2(Type2Func),
    #[allow(dead_code)]
    Type3,
    #[allow(dead_code)]
    Type4,
}
1156
1157impl Function {
1158 fn new(doc: &Document, obj: &Object) -> Res<Function> {
1159 let dict = match *obj {
1160 Object::Dictionary(ref dict) => dict,
1161 Object::Stream(ref stream) => &stream.dict,
1162 _ => return Err("Function should be a dictionary or stream".into()),
1163 };
1164 let function_type: i64 = get(doc, dict, b"FunctionType");
1165 let f = match function_type {
1166 0 => {
1167 let stream = match obj {
1168 &Object::Stream(ref stream) => stream,
1169 _ => return Err("No stream".into()),
1170 };
1171 let range: Vec<f64> = get(doc, dict, b"Range");
1172 let domain: Vec<f64> = get(doc, dict, b"Domain");
1173 let contents = get_contents(stream);
1174 let size: Vec<i64> = get(doc, dict, b"Size");
1175 let bits_per_sample = get(doc, dict, b"BitsPerSample");
1176 let encode = get::<Option<Vec<f64>>>(doc, dict, b"Encode");
1179 let encode = encode.unwrap_or_else(|| {
1181 let mut default = Vec::new();
1182 for i in &size {
1183 default.extend([0., (i - 1) as f64].iter());
1184 }
1185 default
1186 });
1187 let decode =
1188 get::<Option<Vec<f64>>>(doc, dict, b"Decode").unwrap_or_else(|| range.clone());
1189
1190 Function::Type0(Type0Func {
1191 domain,
1192 range,
1193 size,
1194 contents,
1195 bits_per_sample,
1196 encode,
1197 decode,
1198 })
1199 }
1200 2 => {
1201 let c0 = get::<Option<Vec<f64>>>(doc, dict, b"C0");
1202 let c1 = get::<Option<Vec<f64>>>(doc, dict, b"C1");
1203 let n = get::<f64>(doc, dict, b"N");
1204 Function::Type2(Type2Func { c0, c1, n })
1205 }
1206 _ => return Err(format!("unhandled function type {}", function_type).into()),
1207 };
1208 Ok(f)
1209 }
1210}
1211
1212fn as_num(o: &Object) -> Res<f64> {
1213 match *o {
1214 Object::Integer(i) => Ok(i as f64),
1215 Object::Real(f) => Ok(f as f64),
1216 _ => Err("not a number".into()),
1217 }
1218}
1219
/// Text-related part of the graphics state.
#[derive(Clone)]
struct TextState<'a> {
    /// Currently selected font, if any; `show_text` fails without one.
    font: Option<Rc<dyn PdfFont + 'a>>,
    /// Font size, used to scale glyph advances.
    font_size: f64,
    /// Extra advance added after every character.
    character_spacing: f64,
    /// Extra advance added after a single-byte space (code 32).
    word_spacing: f64,
    /// Horizontal scale factor applied to glyph advances.
    horizontal_scaling: f64,
    /// NOTE(review): presumably the text leading; not used in the code shown here.
    leading: f64,
    /// Vertical offset placed in the translation part of the text-space matrix.
    rise: f64,
    /// Text matrix; `show_text` advances it after each glyph.
    tm: Transform,
}
1231
1232fn get_contents(contents: &Stream) -> Vec<u8> {
1234 if contents.filter().is_ok() {
1235 contents
1236 .decompressed_content()
1237 .unwrap_or_else(|_| contents.content.clone())
1238 } else {
1239 contents.content.clone()
1240 }
1241}
1242
/// Graphics state tracked while interpreting a content stream.
///
/// It is cloned onto a stack by the `q` operator and restored by `Q`.
#[derive(Clone)]
struct GraphicsState<'a> {
    /// Current transformation matrix, updated by `cm`.
    ctm: Transform,
    /// Text state (font, spacing, text matrix, ...).
    ts: TextState<'a>,
    /// Soft mask; reset to `None` by an ExtGState whose `SMask` is `/None`.
    smask: Option<&'a Dictionary>,
    /// Color space for fills, set by `cs`.
    fill_colorspace: ColorSpace,
    /// Fill color components, set by `sc` / `scn`.
    fill_color: Vec<f64>,
    /// Color space for strokes, set by `CS`.
    stroke_colorspace: ColorSpace,
    /// Stroke color components, set by `SC` / `SCN`.
    stroke_color: Vec<f64>,
    /// Line width, set by `w`.
    line_width: f64,
}
1254
1255fn show_text(
1256 gs: &mut GraphicsState,
1257 s: &[u8],
1258 _tlm: &Transform,
1259 _flip_ctm: &Transform,
1260 output: &mut dyn OutputDev,
1261) -> Res<()> {
1262 let ts = &mut gs.ts;
1263 let font = match ts.font.as_ref() {
1265 None => {
1266 return Err(OutputError::Other("font".to_string()));
1267 }
1268 Some(f) => {f}
1269 };
1270 dlog!("{:?}", font.decode(s));
1272 dlog!("{:?}", font.decode(s).as_bytes());
1273 dlog!("{:?}", s);
1274 output.begin_word()?;
1275
1276 for (c, length) in font.char_codes(s) {
1277 let tsm = Transform2D::row_major(ts.horizontal_scaling, 0., 0., 1.0, 0., ts.rise);
1278 let trm = ts.tm.pre_transform(&gs.ctm);
1279 let trm = trm.post_transform(&tsm);
1280 let w0 = font.get_width(c)? / 1000.;
1286
1287 let mut spacing = ts.character_spacing;
1288 let is_space = c == 32 && length == 1;
1293 if is_space {
1294 spacing += ts.word_spacing
1295 }
1296
1297 output.output_character(&trm, w0, spacing, ts.font_size, &font.decode_char(c)?)?;
1298 let tj = 0.;
1299 let ty = 0.;
1300 let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size + spacing);
1301 dlog!(
1302 "horizontal {} adjust {} {} {} {}",
1303 ts.horizontal_scaling,
1304 tx,
1305 w0,
1306 ts.font_size,
1307 spacing
1308 );
1309 ts.tm = ts
1311 .tm
1312 .pre_transform(&Transform2D::create_translation(tx, ty));
1313 let _trm = ts.tm.pre_transform(&gs.ctm);
1314 }
1316 output.end_word()?;
1317 Ok(())
1318}
1319
/// Page rectangle, filled from the four numbers of a page's `MediaBox`
/// array in order (see `output_doc`).
#[derive(Debug, Clone, Copy)]
pub struct MediaBox {
    /// Lower-left x (first array entry).
    pub llx: f64,
    /// Lower-left y (second array entry).
    pub lly: f64,
    /// Upper-right x (third array entry).
    pub urx: f64,
    /// Upper-right y (fourth array entry).
    pub ury: f64,
}
1327
1328fn apply_state(gs: &mut GraphicsState, state: &Dictionary) -> Res<()> {
1329 for (k, v) in state.iter() {
1330 let k: &[u8] = k.as_ref();
1331 match k {
1332 b"SMask" => match v {
1333 &Object::Name(ref name) => {
1334 if name == b"None" {
1335 gs.smask = None;
1336 } else {
1337 return Err("unexpected smask name".into());
1338 }
1339 }
1340 _ => return Err(format!("unexpected smask type {:?}", v).into()),
1341 },
1342 b"Type" => match v {
1343 &Object::Name(ref name) => {
1344 assert_eq!(name, b"ExtGState")
1345 }
1346 _ => return Err("unexpected type".into()),
1347 },
1348 _ => {
1349 dlog!("unapplied state: {:?} {:?}", k, v);
1350 }
1351 }
1352 }
1353 Ok(())
1354}
1355
/// One segment of a path under construction, as recorded from the path
/// construction operators of a content stream.
#[derive(Debug)]
pub enum PathOp {
    /// `m`: start a new subpath at (x, y).
    MoveTo(f64, f64),
    /// `l`: straight line to (x, y).
    LineTo(f64, f64),
    /// `c` / `v` / `y`: cubic curve (x1, y1, x2, y2, x, y) with two control
    /// points followed by the end point.
    CurveTo(f64, f64, f64, f64, f64, f64),
    /// `re`: rectangle given as (x, y, width, height).
    Rect(f64, f64, f64, f64),
    /// `h`: close the current subpath.
    Close,
}
1365
/// A path as an ordered list of construction operations.
#[derive(Debug)]
pub struct Path {
    /// Operations in the order they appeared in the content stream.
    pub ops: Vec<PathOp>,
}
1370
1371impl Path {
1372 fn new() -> Path {
1373 Path { ops: Vec::new() }
1374 }
1375 fn current_point(&self) -> Res<(f64, f64)> {
1376 let v = match *self.ops.last().ok_or("empty path")? {
1377 PathOp::MoveTo(x, y) => (x, y),
1378 PathOp::LineTo(x, y) => (x, y),
1379 PathOp::CurveTo(_, _, _, _, x, y) => (x, y),
1380 _ => return Err("Unimplemented: current_point for path with no current point".into()),
1381 };
1382 Ok(v)
1383 }
1384}
1385
/// Parameters of a `CalGray` color space, read from its dictionary in
/// `make_colorspace`.
#[derive(Clone)]
pub struct CalGray {
    /// Value of the `WhitePoint` entry (three numbers).
    white_point: [f64; 3],
    /// Optional black point (three numbers).
    black_point: Option<[f64; 3]>,
    /// Optional value of the `Gamma` entry.
    gamma: Option<f64>,
}
1392
/// Parameters of a `CalRGB` color space, read from its dictionary in
/// `make_colorspace`.
#[derive(Clone)]
pub struct CalRGB {
    /// Value of the `WhitePoint` entry (three numbers).
    white_point: [f64; 3],
    /// Optional black point (three numbers).
    black_point: Option<[f64; 3]>,
    /// Optional value of the `Gamma` entry (three numbers).
    gamma: Option<[f64; 3]>,
    /// Optional value of the `Matrix` entry.
    matrix: Option<Vec<f64>>,
}
1400
/// Parameters of a `Lab` color space, read from its dictionary in
/// `make_colorspace`.
#[derive(Clone)]
pub struct Lab {
    /// Value of the `WhitePoint` entry (three numbers).
    white_point: [f64; 3],
    /// Optional black point (three numbers).
    black_point: Option<[f64; 3]>,
    /// Optional value of the `Range` entry (four numbers).
    range: Option<[f64; 4]>,
}
1407
/// A `Separation` color space, built from the elements of its color space
/// array in `make_colorspace`.
#[derive(Clone)]
pub struct Separation {
    /// Name taken from the second array element.
    name: String,
    /// Name of the alternate color space (third array element).
    alternate_space: String,
    /// Tint transform function (fourth array element).
    tint_transform: Box<Function>,
}
1414
/// Color spaces recognised by `make_colorspace`.
#[derive(Clone)]
pub enum ColorSpace {
    /// The `DeviceGray` color space.
    DeviceGray,
    /// The `DeviceRGB` color space.
    DeviceRGB,
    /// The `DeviceCMYK` color space.
    DeviceCMYK,
    /// The `Pattern` color space; colors set in it are not interpreted.
    Pattern,
    /// A `CalRGB` color space with its parameters.
    CalRGB(CalRGB),
    /// A `CalGray` color space with its parameters.
    CalGray(CalGray),
    /// A `Lab` color space with its parameters.
    Lab(Lab),
    /// A `Separation` color space.
    Separation(Separation),
    /// An `ICCBased` color space; holds the bytes of the profile stream.
    ICCBased(Vec<u8>),
}
1427
1428fn make_colorspace<'a>(
1429 doc: &'a Document,
1430 name: &[u8],
1431 resources: &'a Dictionary,
1432) -> Res<ColorSpace> {
1433 let space = match name {
1434 b"DeviceGray" => ColorSpace::DeviceGray,
1435 b"DeviceRGB" => ColorSpace::DeviceRGB,
1436 b"DeviceCMYK" => ColorSpace::DeviceCMYK,
1437 b"Pattern" => ColorSpace::Pattern,
1438 _ => {
1439 let colorspaces: &Dictionary = get(doc, resources, b"ColorSpace");
1440 let cs = maybe_get_array(doc, colorspaces, name)
1441 .ok_or(format!("missing colorspace {:?}", name))?;
1442 let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"))?;
1443 match cs_name.as_ref() {
1444 "Separation" => {
1445 let name = pdf_to_utf8(cs[1].as_name().expect("second arg must be a name"))?;
1446 let alternate_space =
1447 pdf_to_utf8(cs[2].as_name().expect("second arg must be a name"))?;
1448 let tint_transform = Box::new(Function::new(doc, maybe_deref(doc, &cs[3]))?);
1449
1450 dlog!("{:?} {:?} {:?}", name, alternate_space, tint_transform);
1451 ColorSpace::Separation(Separation {
1452 name,
1453 alternate_space,
1454 tint_transform,
1455 })
1456 }
1457 "ICCBased" => {
1458 let stream = maybe_deref(doc, &cs[1]).as_stream()?;
1459 dlog!("ICCBased {:?}", stream);
1460 ColorSpace::ICCBased(get_contents(stream))
1462 }
1463 "CalGray" => {
1464 let dict = cs[1].as_dict().expect("second arg must be a dict");
1465 ColorSpace::CalGray(CalGray {
1466 white_point: get(doc, dict, b"WhitePoint"),
1467 black_point: get(doc, dict, b"BackPoint"),
1468 gamma: get(doc, dict, b"Gamma"),
1469 })
1470 }
1471 "CalRGB" => {
1472 let dict = cs[1].as_dict().expect("second arg must be a dict");
1473 ColorSpace::CalRGB(CalRGB {
1474 white_point: get(doc, dict, b"WhitePoint"),
1475 black_point: get(doc, dict, b"BackPoint"),
1476 gamma: get(doc, dict, b"Gamma"),
1477 matrix: get(doc, dict, b"Matrix"),
1478 })
1479 }
1480 "Lab" => {
1481 let dict = cs[1].as_dict().expect("second arg must be a dict");
1482 ColorSpace::Lab(Lab {
1483 white_point: get(doc, dict, b"WhitePoint"),
1484 black_point: get(doc, dict, b"BackPoint"),
1485 range: get(doc, dict, b"Range"),
1486 })
1487 }
1488 "Pattern" => ColorSpace::Pattern,
1489 _ => {
1490 return Err(format!("color_space {:?} {:?} {:?}", name, cs_name, cs).into());
1491 }
1492 }
1493 }
1494 };
1495 Ok(space)
1496}
1497
/// Namespace for the content-stream interpreter; it carries no data, only
/// the lifetime `'a` shared by the document and resources it processes.
struct Processor<'a> {
    _none: PhantomData<&'a ()>,
}
1501
1502impl<'a> Processor<'a> {
1503 fn process_stream(
1504 doc: &'a Document,
1505 content: Vec<u8>,
1506 resources: &'a Dictionary,
1507 media_box: &MediaBox,
1508 output: &mut dyn OutputDev,
1509 ) -> Res<()> {
1510 let content = Content::decode(&content)?;
1511 let mut font_table = HashMap::new();
1512 let mut gs: GraphicsState = GraphicsState {
1513 ts: TextState {
1514 font: None,
1515 font_size: std::f64::NAN,
1516 character_spacing: 0.,
1517 word_spacing: 0.,
1518 horizontal_scaling: 1.,
1519 leading: 0.,
1520 rise: 0.,
1521 tm: Transform2D::identity(),
1522 },
1523 fill_color: Vec::new(),
1524 fill_colorspace: ColorSpace::DeviceGray,
1525 stroke_color: Vec::new(),
1526 stroke_colorspace: ColorSpace::DeviceGray,
1527 line_width: 1.,
1528 ctm: Transform2D::identity(),
1529 smask: None,
1530 };
1531 let mut gs_stack = Vec::new();
1533 let mut mc_stack = Vec::new();
1534 let mut tlm = Transform2D::identity();
1536 let mut path = Path::new();
1537 let flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1538 dlog!("MediaBox {:?}", media_box);
1539 for operation in &content.operations {
1540 match operation.operator.as_ref() {
1543 "BT" => {
1544 tlm = Transform2D::identity();
1545 gs.ts.tm = tlm;
1546 }
1547 "ET" => {
1548 tlm = Transform2D::identity();
1549 gs.ts.tm = tlm;
1550 }
1551 "cm" => {
1552 assert!(operation.operands.len() == 6);
1553 let m = Transform2D::row_major(
1554 as_num(&operation.operands[0])?,
1555 as_num(&operation.operands[1])?,
1556 as_num(&operation.operands[2])?,
1557 as_num(&operation.operands[3])?,
1558 as_num(&operation.operands[4])?,
1559 as_num(&operation.operands[5])?,
1560 );
1561 gs.ctm = gs.ctm.pre_transform(&m);
1562 dlog!("matrix {:?}", gs.ctm);
1563 }
1564 "CS" => {
1565 let name = operation.operands[0].as_name()?;
1566 gs.stroke_colorspace = make_colorspace(doc, name, resources)?;
1567 }
1568 "cs" => {
1569 let name = operation.operands[0].as_name()?;
1570 gs.fill_colorspace = make_colorspace(doc, name, resources)?;
1571 }
1572 "SC" | "SCN" => {
1573 gs.stroke_color = match gs.stroke_colorspace {
1574 ColorSpace::Pattern => {
1575 dlog!("unhandled pattern color");
1576 Vec::new()
1577 }
1578 _ => operation
1579 .operands
1580 .iter()
1581 .map(|n| match as_num(n) {
1582 Ok(n) => n,
1583 Err(_) => {
1584 dlog!("unhandled color {:?}", n);
1585 0.
1586 }
1587 })
1588 .collect(),
1589 };
1590 }
1591 "sc" | "scn" => {
1592 gs.fill_color = match gs.fill_colorspace {
1593 ColorSpace::Pattern => {
1594 dlog!("unhandled pattern color");
1595 Vec::new()
1596 }
1597 _ => operation
1598 .operands
1599 .iter()
1600 .map(|n| match as_num(n) {
1601 Ok(n) => n,
1602 Err(_) => {
1603 dlog!("unhandled color {:?}", n);
1604 0.
1605 }
1606 })
1607 .collect(),
1608 };
1609 }
1610 "G" | "g" | "RG" | "rg" | "K" | "k" => {
1611 dlog!("unhandled color operation {:?}", operation);
1612 }
1613 "TJ" => {
1614 if let Object::Array(ref array) = operation.operands[0] {
1615 for e in array {
1616 match *e {
1617 Object::String(ref s, _) => {
1618 show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1619 }
1620 Object::Integer(i) => {
1621 let ts = &mut gs.ts;
1622 let w0 = 0.;
1623 let tj = i as f64;
1624 let ty = 0.;
1625 let tx =
1626 ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1627 ts.tm = ts
1628 .tm
1629 .pre_transform(&Transform2D::create_translation(tx, ty));
1630 dlog!("adjust text by: {} {:?}", i, ts.tm);
1631 }
1632 Object::Real(i) => {
1633 let ts = &mut gs.ts;
1634 let w0 = 0.;
1635 let tj = i as f64;
1636 let ty = 0.;
1637 let tx =
1638 ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1639 ts.tm = ts
1640 .tm
1641 .pre_transform(&Transform2D::create_translation(tx, ty));
1642 dlog!("adjust text by: {} {:?}", i, ts.tm);
1643 }
1644 _ => {
1645 dlog!("kind of {:?}", e);
1646 }
1647 }
1648 }
1649 }
1650 }
1651 "Tj" => match operation.operands[0] {
1652 Object::String(ref s, _) => {
1653 show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1654 }
1655 _ => {
1656 return Err(format!("unexpected Tj operand {:?}", operation).into());
1657 }
1658 },
1659 "Tc" => {
1660 gs.ts.character_spacing = as_num(&operation.operands[0])?;
1661 }
1662 "Tw" => {
1663 gs.ts.word_spacing = as_num(&operation.operands[0])?;
1664 }
1665 "Tz" => {
1666 gs.ts.horizontal_scaling = as_num(&operation.operands[0])? / 100.;
1667 }
1668 "TL" => {
1669 gs.ts.leading = as_num(&operation.operands[0])?;
1670 }
1671 "Tf" => {
1672 let fonts: &Dictionary = get(doc, resources, b"Font");
1673 let name = operation.operands[0].as_name()?;
1674 let font = font_table
1675 .entry(name.to_owned())
1676 .or_insert_with(|| {
1677 let dict = get::<&Dictionary>(doc, fonts, name);
1678 make_font(doc, dict).ok()
1679 })
1680 .clone();
1681 {
1682 }
1690 gs.ts.font = font;
1691
1692 gs.ts.font_size = as_num(&operation.operands[1])?;
1693 dlog!(
1694 "font {} size: {} {:?}",
1695 pdf_to_utf8(name)?,
1696 gs.ts.font_size,
1697 operation
1698 );
1699 }
1700 "Ts" => {
1701 gs.ts.rise = as_num(&operation.operands[0])?;
1702 }
1703 "Tm" => {
1704 assert!(operation.operands.len() == 6);
1705 tlm = Transform2D::row_major(
1706 as_num(&operation.operands[0])?,
1707 as_num(&operation.operands[1])?,
1708 as_num(&operation.operands[2])?,
1709 as_num(&operation.operands[3])?,
1710 as_num(&operation.operands[4])?,
1711 as_num(&operation.operands[5])?,
1712 );
1713 gs.ts.tm = tlm;
1714 dlog!("Tm: matrix {:?}", gs.ts.tm);
1715 output.end_line()?;
1716 }
1717 "Td" => {
1718 assert!(operation.operands.len() == 2);
1723 let tx = as_num(&operation.operands[0])?;
1724 let ty = as_num(&operation.operands[1])?;
1725 dlog!("translation: {} {}", tx, ty);
1726
1727 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1728 gs.ts.tm = tlm;
1729 dlog!("Td matrix {:?}", gs.ts.tm);
1730 output.end_line()?;
1731 }
1732
1733 "TD" => {
1734 assert!(operation.operands.len() == 2);
1738 let tx = as_num(&operation.operands[0])?;
1739 let ty = as_num(&operation.operands[1])?;
1740 dlog!("translation: {} {}", tx, ty);
1741 gs.ts.leading = -ty;
1742
1743 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1744 gs.ts.tm = tlm;
1745 dlog!("TD matrix {:?}", gs.ts.tm);
1746 output.end_line()?;
1747 }
1748
1749 "T*" => {
1750 let tx = 0.0;
1751 let ty = -gs.ts.leading;
1752
1753 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1754 gs.ts.tm = tlm;
1755 dlog!("T* matrix {:?}", gs.ts.tm);
1756 output.end_line()?;
1757 }
1758 "q" => {
1759 gs_stack.push(gs.clone());
1760 }
1761 "Q" => {
1762 let s = gs_stack.pop();
1763 if let Some(s) = s {
1764 gs = s;
1765 } else {
1766 dlog!("No state to pop");
1767 }
1768 }
1769 "gs" => {
1770 let ext_gstate: &Dictionary = get(doc, resources, b"ExtGState");
1771 let name = operation.operands[0].as_name()?;
1772 let state: &Dictionary = get(doc, ext_gstate, name);
1773 match apply_state(&mut gs, state) {
1774 Ok(_) => {}
1775 Err(e) => {
1776 dlog!("Error while applying graphics state: {}", e)
1777 }
1778 }
1779 }
1780 "i" => {
1781 dlog!(
1782 "unhandled graphics state flattness operator {:?}",
1783 operation
1784 );
1785 }
1786 "w" => {
1787 gs.line_width = as_num(&operation.operands[0])?;
1788 }
1789 "J" | "j" | "M" | "d" | "ri" => {
1790 dlog!("unknown graphics state operator {:?}", operation);
1791 }
1792 "m" => path.ops.push(PathOp::MoveTo(
1793 as_num(&operation.operands[0])?,
1794 as_num(&operation.operands[1])?,
1795 )),
1796 "l" => path.ops.push(PathOp::LineTo(
1797 as_num(&operation.operands[0])?,
1798 as_num(&operation.operands[1])?,
1799 )),
1800 "c" => path.ops.push(PathOp::CurveTo(
1801 as_num(&operation.operands[0])?,
1802 as_num(&operation.operands[1])?,
1803 as_num(&operation.operands[2])?,
1804 as_num(&operation.operands[3])?,
1805 as_num(&operation.operands[4])?,
1806 as_num(&operation.operands[5])?,
1807 )),
1808 "v" => {
1809 let (x, y) = path.current_point()?;
1810 path.ops.push(PathOp::CurveTo(
1811 x,
1812 y,
1813 as_num(&operation.operands[0])?,
1814 as_num(&operation.operands[1])?,
1815 as_num(&operation.operands[2])?,
1816 as_num(&operation.operands[3])?,
1817 ))
1818 }
1819 "y" => path.ops.push(PathOp::CurveTo(
1820 as_num(&operation.operands[0])?,
1821 as_num(&operation.operands[1])?,
1822 as_num(&operation.operands[2])?,
1823 as_num(&operation.operands[3])?,
1824 as_num(&operation.operands[2])?,
1825 as_num(&operation.operands[3])?,
1826 )),
1827 "h" => path.ops.push(PathOp::Close),
1828 "re" => path.ops.push(PathOp::Rect(
1829 as_num(&operation.operands[0])?,
1830 as_num(&operation.operands[1])?,
1831 as_num(&operation.operands[2])?,
1832 as_num(&operation.operands[3])?,
1833 )),
1834 "s" | "f*" | "B" | "B*" | "b" => {
1835 dlog!("unhandled path op {:?}", operation);
1836 }
1837 "S" => {
1838 output.stroke(&gs.ctm, &gs.stroke_colorspace, &gs.stroke_color, &path)?;
1839 path.ops.clear();
1840 }
1841 "F" | "f" => {
1842 output.fill(&gs.ctm, &gs.fill_colorspace, &gs.fill_color, &path)?;
1843 path.ops.clear();
1844 }
1845 "W" | "w*" => {
1846 dlog!("unhandled clipping operation {:?}", operation);
1847 }
1848 "n" => {
1849 dlog!("discard {:?}", path);
1850 path.ops.clear();
1851 }
1852 "BMC" | "BDC" => {
1853 mc_stack.push(operation);
1854 }
1855 "EMC" => {
1856 mc_stack.pop();
1857 }
1858 "Do" => {
1859 let xobject: &Dictionary = get(doc, resources, b"XObject");
1862 let name = operation.operands[0].as_name()?;
1863 let xf: &Stream = get(doc, xobject, name);
1864 let resources = maybe_get_obj(doc, &xf.dict, b"Resources")
1865 .and_then(|n| n.as_dict().ok())
1866 .unwrap_or(resources);
1867 let contents = get_contents(xf);
1868 Processor::process_stream(doc, contents, resources, media_box, output)?;
1869 }
1870 _ => {
1871 dlog!("unknown operation {:?}", operation);
1872 }
1873 }
1874 }
1875 Ok(())
1876 }
1877}
1878
/// Receiver for the text and path events produced while processing a
/// document's pages.
pub trait OutputDev {
    /// Called once before the content of a page is processed, with the page
    /// number, its media box and, when present, its art box.
    fn begin_page(
        &mut self,
        page_num: u32,
        media_box: &MediaBox,
        art_box: Option<(f64, f64, f64, f64)>,
    ) -> Res<()>;
    /// Called once after the content of a page has been processed.
    fn end_page(&mut self) -> Res<()>;
    /// Called for every character shown.
    ///
    /// `trm` is the rendering matrix at the character, `width` the glyph
    /// width divided by 1000, `spacing` the character (plus word) spacing,
    /// `font_size` the current font size and `char` the decoded text.
    fn output_character(
        &mut self,
        trm: &Transform,
        width: f64,
        spacing: f64,
        font_size: f64,
        char: &str,
    ) -> Res<()>;
    /// Called before the characters of one shown string.
    fn begin_word(&mut self) -> Res<()>;
    /// Called after the characters of one shown string.
    fn end_word(&mut self) -> Res<()>;
    /// Called when a text positioning operator (`Tm`, `Td`, `TD`, `T*`) moves
    /// the text position.
    fn end_line(&mut self) -> Res<()>;
    /// Called for the `S` operator; does nothing by default.
    fn stroke(
        &mut self,
        _ctm: &Transform,
        _colorspace: &ColorSpace,
        _color: &[f64],
        _path: &Path,
    ) -> Res<()> {
        Ok(())
    }
    /// Called for the `F` / `f` operators; does nothing by default.
    fn fill(
        &mut self,
        _ctm: &Transform,
        _colorspace: &ColorSpace,
        _color: &[f64],
        _path: &Path,
    ) -> Res<()> {
        Ok(())
    }
}
1917
/// Output device that writes absolutely positioned HTML `<div>` elements.
pub struct HTMLOutput<'a> {
    /// Destination of the generated HTML.
    file: &'a mut dyn std::io::Write,
    /// Vertical flip for the current page, set in `begin_page`.
    flip_ctm: Transform,
    /// Where the next character is expected if it continues the buffered run.
    last_ctm: Transform,
    /// Rendering matrix of the first character in `buf`.
    buf_ctm: Transform,
    /// Font size of the first character in `buf`.
    buf_font_size: f64,
    /// Characters accumulated for the current run.
    buf: String,
}
1926
/// Rewrites spaces so that HTML rendering does not collapse them.
///
/// A space that sits between two non-space characters is kept as a plain
/// space, so the browser can still wrap lines there. Every other space
/// (leading, trailing, or part of a run of spaces) becomes the `&nbsp;`
/// entity. All other characters are copied unchanged.
fn insert_nbsp(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    // True while the previous character was a non-space character.
    let mut word_end = false;
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        if c != ' ' {
            word_end = true;
            result.push(c);
            continue;
        }
        let followed_by_word = chars.peek().map_or(false, |&next| next != ' ');
        if word_end && followed_by_word {
            result.push(' ');
        } else {
            result.push_str("&nbsp;");
        }
        word_end = false;
    }
    result
}
1946
1947impl<'a> HTMLOutput<'a> {
1948 pub fn new(file: &mut dyn std::io::Write) -> HTMLOutput {
1949 HTMLOutput {
1950 file,
1951 flip_ctm: Transform2D::identity(),
1952 last_ctm: Transform2D::identity(),
1953 buf_ctm: Transform2D::identity(),
1954 buf: String::new(),
1955 buf_font_size: 0.,
1956 }
1957 }
1958 fn flush_string(&mut self) -> Res<()> {
1959 if !self.buf.is_empty() {
1960 let position = self.buf_ctm.post_transform(&self.flip_ctm);
1961 let transformed_font_size_vec = self
1962 .buf_ctm
1963 .transform_vector(vec2(self.buf_font_size, self.buf_font_size));
1964 let transformed_font_size =
1966 (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1967 let (x, y) = (position.m31, position.m32);
1968 println!("flush {} {:?}", self.buf, (x, y));
1969
1970 writeln!(
1971 self.file,
1972 "<div style='position: absolute; left: {}px; top: {}px; font-size: {}px'>{}</div>",
1973 x,
1974 y,
1975 transformed_font_size,
1976 insert_nbsp(&self.buf)
1977 )?;
1978 }
1979 Ok(())
1980 }
1981}
1982
/// The four numbers of a page's `ArtBox` entry, in array order.
type ArtBox = (f64, f64, f64, f64);
1984
1985impl<'a> OutputDev for HTMLOutput<'a> {
1986 fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Res<()> {
1987 write!(self.file, "<meta charset='utf-8' /> ")?;
1988 write!(self.file, "<!-- page {} -->", page_num)?;
1989 write!(self.file, "<div id='page{}' style='position: relative; height: {}px; width: {}px; border: 1px black solid'>", page_num, media_box.ury - media_box.lly, media_box.urx - media_box.llx)?;
1990 self.flip_ctm = Transform::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1991 Ok(())
1992 }
1993 fn end_page(&mut self) -> Res<()> {
1994 self.flush_string()?;
1995 self.buf = String::new();
1996 self.last_ctm = Transform::identity();
1997 write!(self.file, "</div>")?;
1998 Ok(())
1999 }
2000 fn output_character(
2001 &mut self,
2002 trm: &Transform,
2003 width: f64,
2004 spacing: f64,
2005 font_size: f64,
2006 char: &str,
2007 ) -> Res<()> {
2008 if trm.approx_eq(&self.last_ctm) {
2009 let position = trm.post_transform(&self.flip_ctm);
2010 let (x, y) = (position.m31, position.m32);
2011
2012 println!("accum {} {:?}", char, (x, y));
2013 self.buf += char;
2014 } else {
2015 println!(
2016 "flush {} {:?} {:?} {} {} {}",
2017 char, trm, self.last_ctm, width, font_size, spacing
2018 );
2019 self.flush_string()?;
2020 self.buf = char.to_owned();
2021 self.buf_font_size = font_size;
2022 self.buf_ctm = *trm;
2023 }
2024 let position = trm.post_transform(&self.flip_ctm);
2025 let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
2026 let transformed_font_size =
2028 (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
2029 let (x, y) = (position.m31, position.m32);
2030 write!(self.file, "<div style='position: absolute; color: red; left: {}px; top: {}px; font-size: {}px'>{}</div>",
2031 x, y, transformed_font_size, char)?;
2032 self.last_ctm = trm.pre_transform(&Transform2D::create_translation(
2033 width * font_size + spacing,
2034 0.,
2035 ));
2036
2037 Ok(())
2038 }
2039 fn begin_word(&mut self) -> Res<()> {
2040 Ok(())
2041 }
2042 fn end_word(&mut self) -> Res<()> {
2043 Ok(())
2044 }
2045 fn end_line(&mut self) -> Res<()> {
2046 Ok(())
2047 }
2048}
2049
/// Output device that writes filled paths as an SVG document.
pub struct SVGOutput<'a> {
    /// Destination of the generated SVG.
    file: &'a mut dyn std::io::Write,
}
impl<'a> SVGOutput<'a> {
    /// Creates an SVG output device writing to `file`.
    pub fn new(file: &mut dyn std::io::Write) -> SVGOutput {
        SVGOutput { file }
    }
}
2058
2059impl<'a> OutputDev for SVGOutput<'a> {
2060 fn begin_page(
2061 &mut self,
2062 _page_num: u32,
2063 media_box: &MediaBox,
2064 art_box: Option<(f64, f64, f64, f64)>,
2065 ) -> Res<()> {
2066 let ver = 1.1;
2067 writeln!(self.file, "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>")?;
2068 if ver == 1.1 {
2069 write!(
2070 self.file,
2071 r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">"#
2072 )?;
2073 } else {
2074 write!(
2075 self.file,
2076 r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">"#
2077 )?;
2078 }
2079 if let Some(art_box) = art_box {
2080 let width = art_box.2 - art_box.0;
2081 let height = art_box.3 - art_box.1;
2082 let y = media_box.ury - art_box.1 - height;
2083 write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, art_box.0, y, width, height)?;
2084 } else {
2085 let width = media_box.urx - media_box.llx;
2086 let height = media_box.ury - media_box.lly;
2087 write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, media_box.llx, media_box.lly, width, height)?;
2088 }
2089 writeln!(self.file)?;
2090 type Mat = Transform;
2091
2092 let ctm = Mat::create_scale(1., -1.).post_translate(vec2(0., media_box.ury));
2093 writeln!(
2094 self.file,
2095 "<g transform='matrix({}, {}, {}, {}, {}, {})'>",
2096 ctm.m11, ctm.m12, ctm.m21, ctm.m22, ctm.m31, ctm.m32,
2097 )?;
2098 Ok(())
2099 }
2100 fn end_page(&mut self) -> Res<()> {
2101 writeln!(self.file, "</g>")?;
2102 write!(self.file, "</svg>")?;
2103 Ok(())
2104 }
2105 fn output_character(
2106 &mut self,
2107 _trm: &Transform,
2108 _width: f64,
2109 _spacing: f64,
2110 _font_size: f64,
2111 _char: &str,
2112 ) -> Res<()> {
2113 Ok(())
2114 }
2115 fn begin_word(&mut self) -> Res<()> {
2116 Ok(())
2117 }
2118 fn end_word(&mut self) -> Res<()> {
2119 Ok(())
2120 }
2121 fn end_line(&mut self) -> Res<()> {
2122 Ok(())
2123 }
2124 fn fill(
2125 &mut self,
2126 ctm: &Transform,
2127 _colorspace: &ColorSpace,
2128 _color: &[f64],
2129 path: &Path,
2130 ) -> Res<()> {
2131 write!(
2132 self.file,
2133 "<g transform='matrix({}, {}, {}, {}, {}, {})'>",
2134 ctm.m11, ctm.m12, ctm.m21, ctm.m22, ctm.m31, ctm.m32,
2135 )?;
2136
2137 let mut d = Vec::new();
2145 for op in &path.ops {
2146 match *op {
2147 PathOp::MoveTo(x, y) => d.push(format!("M{} {}", x, y)),
2148 PathOp::LineTo(x, y) => d.push(format!("L{} {}", x, y)),
2149 PathOp::CurveTo(x1, y1, x2, y2, x, y) => {
2150 d.push(format!("C{} {} {} {} {} {}", x1, y1, x2, y2, x, y))
2151 }
2152 PathOp::Close => d.push("Z".to_string()),
2153 PathOp::Rect(x, y, width, height) => {
2154 d.push(format!("M{} {}", x, y));
2155 d.push(format!("L{} {}", x + width, y));
2156 d.push(format!("L{} {}", x + width, y + height));
2157 d.push(format!("L{} {}", x, y + height));
2158 d.push("Z".to_string());
2159 }
2160 }
2161 }
2162 write!(self.file, "<path d='{}' />", d.join(" "))?;
2163 write!(self.file, "</g>")?;
2164 writeln!(self.file)?;
2165 Ok(())
2166 }
2167}
2168
/// Conversion of a destination value into something that implements
/// `std::fmt::Write`, so `PlainTextOutput` can write to it.
pub trait ConvertToFmt {
    /// The `std::fmt::Write` implementation produced by `convert`.
    type Writer: std::fmt::Write;
    /// Consumes `self` and returns the writer.
    fn convert(self) -> Self::Writer;
}
2179
// A `&mut String` already implements `std::fmt::Write`, so it is returned as is.
impl<'a> ConvertToFmt for &'a mut String {
    type Writer = &'a mut String;
    fn convert(self) -> Self::Writer {
        self
    }
}
2186
/// Wraps a `std::io::Write` value so it can be used where a
/// `std::fmt::Write` is required.
pub struct WriteAdapter<W> {
    f: W,
}

impl<W: std::io::Write> std::fmt::Write for WriteAdapter<W> {
    /// Writes the UTF-8 bytes of `s` to the wrapped writer. Any I/O error is
    /// reported as `fmt::Error`, so its details are lost.
    fn write_str(&mut self, s: &str) -> Result<(), std::fmt::Error> {
        match self.f.write_all(s.as_bytes()) {
            Ok(()) => Ok(()),
            Err(_) => Err(fmt::Error),
        }
    }
}
2196
// Any `std::io::Write` trait object is wrapped in a `WriteAdapter`.
impl<'a> ConvertToFmt for &'a mut dyn std::io::Write {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
2203
// A mutable reference to a `File` is wrapped in a `WriteAdapter`.
impl<'a> ConvertToFmt for &'a mut File {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
2210
/// Output device that writes the extracted characters as plain text,
/// inserting spaces and newlines based on character positions.
pub struct PlainTextOutput<W: ConvertToFmt> {
    /// Destination of the text.
    writer: W::Writer,
    /// x position where the previously written character ended.
    last_end: f64,
    /// y position of the previously written character.
    last_y: f64,
    /// True until the first character after `begin_word` has been written.
    first_char: bool,
    /// Vertical flip for the current page, set in `begin_page`.
    flip_ctm: Transform,
}
2218
2219impl<W: ConvertToFmt> PlainTextOutput<W> {
2220 pub fn new(writer: W) -> PlainTextOutput<W> {
2221 PlainTextOutput {
2222 writer: writer.convert(),
2223 last_end: 100000.,
2224 first_char: false,
2225 last_y: 0.,
2226 flip_ctm: Transform2D::identity(),
2227 }
2228 }
2229}
2230
2231impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
2234 fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Res<()> {
2235 self.flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
2236 Ok(())
2237 }
2238 fn end_page(&mut self) -> Res<()> {
2239 Ok(())
2240 }
2241 fn output_character(
2242 &mut self,
2243 trm: &Transform,
2244 width: f64,
2245 _spacing: f64,
2246 font_size: f64,
2247 char: &str,
2248 ) -> Res<()> {
2249 let position = trm.post_transform(&self.flip_ctm);
2250 let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
2251 let transformed_font_size =
2253 (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
2254 let (x, y) = (position.m31, position.m32);
2255 use std::fmt::Write;
2256 if self.first_char {
2258 if (y - self.last_y).abs() > transformed_font_size * 1.5 {
2259 writeln!(self.writer)?;
2260 }
2261
2262 if x < self.last_end && (y - self.last_y).abs() > transformed_font_size * 0.5 {
2264 writeln!(self.writer)?;
2265 }
2266
2267 if x > self.last_end + transformed_font_size * 0.1 {
2268 dlog!(
2269 "width: {}, space: {}, thresh: {}",
2270 width,
2271 x - self.last_end,
2272 transformed_font_size * 0.1
2273 );
2274 write!(self.writer, " ")?;
2275 }
2276 }
2277 write!(self.writer, "{}", char)?;
2279 self.first_char = false;
2280 self.last_y = y;
2281 self.last_end = x + width * transformed_font_size;
2282 Ok(())
2283 }
2284 fn begin_word(&mut self) -> Res<()> {
2285 self.first_char = true;
2286 Ok(())
2287 }
2288 fn end_word(&mut self) -> Res<()> {
2289 Ok(())
2290 }
2291 fn end_line(&mut self) -> Res<()> {
2292 Ok(())
2294 }
2295}
2296
2297pub fn print_metadata(doc: &Document) -> Res<()> {
2298 dlog!("Version: {}", doc.version);
2299 if let Some(info) = get_info(doc) {
2300 for (k, v) in info {
2301 if let &Object::String(ref s, StringFormat::Literal) = v {
2302 dlog!("{}: {}", pdf_to_utf8(k)?, pdf_to_utf8(s)?);
2303 }
2304 }
2305 }
2306 dlog!("Page count: {}", get::<i64>(doc, get_pages(doc)?, b"Count"));
2307 dlog!("Pages: {:?}", get_pages(doc));
2308 dlog!(
2309 "Type: {:?}",
2310 get_pages(doc)?.get(b"Type").and_then(|x| x.as_name())?
2311 );
2312 Ok(())
2313}
2314
2315pub fn extract_text<P: std::convert::AsRef<std::path::Path>>(
2317 path: P,
2318) -> Result<String, OutputError> {
2319 let mut s = String::new();
2320 {
2321 let mut output = PlainTextOutput::new(&mut s);
2322 let doc = Document::load(path)?;
2323 output_doc(&doc, &mut output)?;
2324 }
2325 Ok(s)
2326}
2327
2328pub fn extract_text_from_mem(buffer: &[u8]) -> Result<String, OutputError> {
2330 let mut s = String::new();
2331 {
2332 let mut output = PlainTextOutput::new(&mut s);
2333 let doc = Document::load_mem(buffer)?;
2334 output_doc(&doc, &mut output)?;
2335 }
2336 Ok(s)
2337}
2338
2339fn get_inherited<'a, T: FromObj<'a>>(
2340 doc: &'a Document,
2341 dict: &'a Dictionary,
2342 key: &[u8],
2343) -> Option<T> {
2344 let o: Option<T> = get(doc, dict, key);
2345 if let Some(o) = o {
2346 Some(o)
2347 } else {
2348 let parent = dict
2349 .get(b"Parent")
2350 .and_then(|parent| parent.as_reference())
2351 .and_then(|id| doc.get_dictionary(id))
2352 .ok()?;
2353 get_inherited(doc, parent, key)
2354 }
2355}
2356pub fn output_doc(doc: &Document, output: &mut dyn OutputDev) -> Res<()> {
2358 let empty_resources = &Dictionary::new();
2359
2360 let pages = doc.get_pages();
2361 for dict in pages {
2362 let page_num = dict.0;
2363 let page_dict = doc.get_object(dict.1)?.as_dict()?;
2364 dlog!("page {} {:?}", page_num, page_dict);
2365 let resources = get_inherited(doc, page_dict, b"Resources").unwrap_or(empty_resources);
2367 dlog!("resources {:?}", resources);
2368
2369 let media_box: Vec<f64> = get_inherited(doc, page_dict, b"MediaBox").expect("MediaBox");
2371 let media_box = MediaBox {
2372 llx: media_box[0],
2373 lly: media_box[1],
2374 urx: media_box[2],
2375 ury: media_box[3],
2376 };
2377
2378 let art_box =
2379 get::<Option<Vec<f64>>>(doc, page_dict, b"ArtBox").map(|x| (x[0], x[1], x[2], x[3]));
2380
2381 output.begin_page(page_num, &media_box, art_box)?;
2382
2383 Processor::process_stream(
2384 doc,
2385 doc.get_page_content(dict.1)?,
2386 resources,
2387 &media_box,
2388 output,
2389 )?;
2390
2391 output.end_page()?;
2392 }
2393 Ok(())
2394}