1extern crate lopdf;
2
3use adobe_cmap_parser::{ByteMapping, CodeRange, CIDRange};
4use encoding_rs::UTF_16BE;
5use lopdf::content::Content;
6pub use lopdf::*;
7use euclid::*;
8use lopdf::encryption::DecryptionError;
9use std::fmt::{Debug, Formatter};
10extern crate encoding_rs;
11extern crate euclid;
12extern crate adobe_cmap_parser;
13extern crate type1_encoding_parser;
14extern crate unicode_normalization;
15use euclid::vec2;
16use unicode_normalization::UnicodeNormalization;
17use std::fmt;
18use std::str;
19use std::fs::File;
20use std::slice::Iter;
21use std::collections::HashMap;
22use std::collections::hash_map::Entry;
23use std::rc::Rc;
24use std::marker::PhantomData;
25use std::result::Result;
26mod core_fonts;
27mod glyphnames;
28mod zapfglyphnames;
29mod encodings;
30
/// Unit marker type used as both the source and destination unit of [`Transform`].
pub struct Space;
/// 2D transform over `f64` coordinates whose source and destination units are both [`Space`].
pub type Transform = Transform2D<f64, Space, Space>;
33
#[derive(Debug)]
/// Error type wrapping the three underlying failure kinds this crate converts from
/// (see the `From` impls below the type).
pub enum OutputError
{
    /// A `std::fmt::Error` raised while formatting.
    FormatError(std::fmt::Error),
    /// A `std::io::Error` raised by an I/O operation.
    IoError(std::io::Error),
    /// An error reported by `lopdf`.
    PdfError(lopdf::Error)
}
41
42impl std::fmt::Display for OutputError
43{
44 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
45 match self {
46 OutputError::FormatError(e) => write!(f, "Formating error: {}", e),
47 OutputError::IoError(e) => write!(f, "IO error: {}", e),
48 OutputError::PdfError(e) => write!(f, "PDF error: {}", e)
49 }
50 }
51}
52
// Marks `OutputError` as a standard error type; all trait methods keep their defaults.
impl std::error::Error for OutputError{}
54
55impl From<std::fmt::Error> for OutputError {
56 fn from(e: std::fmt::Error) -> Self {
57 OutputError::FormatError(e)
58 }
59}
60
61impl From<std::io::Error> for OutputError {
62 fn from(e: std::io::Error) -> Self {
63 OutputError::IoError(e)
64 }
65}
66
67impl From<lopdf::Error> for OutputError {
68 fn from(e: lopdf::Error) -> Self {
69 OutputError::PdfError(e)
70 }
71}
72
// Debug-logging stand-in: takes a comma-separated list of expressions and binds
// each one to `_`, printing nothing.
// NOTE: every argument expression is still evaluated, so an argument that can
// panic (e.g. an `.unwrap()`) will panic even though nothing is logged.
macro_rules! dlog {
    ($($e:expr),*) => { {$(let _ = $e;)*} }
}
77
78fn get_info(doc: &Document) -> Option<&Dictionary> {
79 match doc.trailer.get(b"Info") {
80 Ok(&Object::Reference(ref id)) => {
81 match doc.get_object(*id) {
82 Ok(&Object::Dictionary(ref info)) => { return Some(info); }
83 _ => {}
84 }
85 }
86 _ => {}
87 }
88 None
89}
90
91fn get_catalog(doc: &Document) -> &Dictionary {
92 match doc.trailer.get(b"Root").unwrap() {
93 &Object::Reference(ref id) => {
94 match doc.get_object(*id) {
95 Ok(&Object::Dictionary(ref catalog)) => { return catalog; }
96 _ => {}
97 }
98 }
99 _ => {}
100 }
101 panic!();
102}
103
104fn get_pages(doc: &Document) -> &Dictionary {
105 let catalog = get_catalog(doc);
106 match catalog.get(b"Pages").unwrap() {
107 &Object::Reference(ref id) => {
108 match doc.get_object(*id) {
109 Ok(&Object::Dictionary(ref pages)) => { return pages; }
110 other => {dlog!("pages: {:?}", other)}
111 }
112 }
113 other => { dlog!("pages: {:?}", other)}
114 }
115 dlog!("catalog {:?}", catalog);
116 panic!();
117}
118
/// 256-entry lookup table from a single byte to a 16-bit code unit, indexed by
/// byte value in `pdf_to_utf8` for strings that do not start with a UTF-16BE BOM.
///
/// Bytes `0x20..=0x7e` and `0xa1..=0xff` (except `0xad`) map to the same numeric
/// value; bytes `0x18..=0x1f` and `0x80..=0xa0` map to other code points; and
/// positions `0x7f`, `0x9f` and `0xad` hold `0x0000`.
// NOTE(review): presumably this is the PDFDocEncoding table from the PDF
// specification, with 0x0000 marking undefined codes — confirm against the spec.
#[allow(non_upper_case_globals)]
const PDFDocEncoding: &'static [u16] = &[
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
    0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011,
    0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x02d8, 0x02c7, 0x02c6,
    0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, 0x0020, 0x0021, 0x0022, 0x0023,
    0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
    0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035,
    0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e,
    0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059,
    0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062,
    0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b,
    0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074,
    0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d,
    0x007e, 0x0000, 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192,
    0x2044, 0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, 0x0178,
    0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000, 0x20ac, 0x00a1,
    0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa,
    0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
    0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc,
    0x00bd, 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5,
    0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce,
    0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00e0,
    0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9,
    0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x00f0, 0x00f1, 0x00f2,
    0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb,
    0x00fc, 0x00fd, 0x00fe, 0x00ff];
150
151fn pdf_to_utf8(s: &[u8]) -> String {
152 if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
153 return UTF_16BE.decode_without_bom_handling_and_without_replacement(&s[2..]).unwrap().to_string()
154 } else {
155 let r : Vec<u8> = s.iter().map(|x| *x).flat_map(|x| {
156 let k = PDFDocEncoding[x as usize];
157 vec![(k>>8) as u8, k as u8].into_iter()}).collect();
158 return UTF_16BE.decode_without_bom_handling_and_without_replacement(&r).unwrap().to_string()
159 }
160}
161
/// Decodes a byte string into a Rust `String` using a caller-supplied
/// single-byte encoding table.
///
/// * Input that starts with the UTF-16BE byte-order mark (`FE FF`) is decoded
///   as UTF-16BE with the mark removed. A string consisting of the mark alone
///   decodes to the empty string (it was previously mis-decoded byte-by-byte).
/// * Any other input is decoded byte-by-byte: each byte indexes `encoding` to
///   obtain a UTF-16 code unit.
///
/// Malformed UTF-16 (an unpaired surrogate or a trailing odd byte) is replaced
/// with U+FFFD instead of panicking.
///
/// # Panics
/// Panics if a byte of non-BOM input is not a valid index into `encoding`
/// (i.e. the table has fewer entries than the byte value requires).
fn to_utf8(encoding: &[u16], s: &[u8]) -> String {
    if s.starts_with(&[0xfe, 0xff]) {
        let pairs = s[2..].chunks_exact(2);
        // A leftover byte cannot form a code unit; flag it so it is not silently dropped.
        let dangling_byte = !pairs.remainder().is_empty();
        let units: Vec<u16> = pairs
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        let mut text = String::from_utf16_lossy(&units);
        if dangling_byte {
            text.push('\u{FFFD}');
        }
        text
    } else {
        let units: Vec<u16> = s
            .iter()
            .map(|&byte| encoding[usize::from(byte)])
            .collect();
        String::from_utf16_lossy(&units)
    }
}
172
173
174fn maybe_deref<'a>(doc: &'a Document, o: &'a Object) -> &'a Object {
175 match o {
176 &Object::Reference(r) => doc.get_object(r).expect("missing object reference"),
177 _ => o
178 }
179}
180
181fn maybe_get_obj<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Object> {
182 dict.get(key).map(|o| maybe_deref(doc, o)).ok()
183}
184
/// Conversion from a possibly-missing dictionary entry.
///
/// `obj` is the looked-up entry (or `None` if the key was absent) and `key` is
/// the key that was looked up. The impls in this file use `key` only to build
/// a panic message when a required entry is missing.
trait FromOptObj<'a> {
    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self;
}
189
190trait FromObj<'a> where Self: std::marker::Sized {
192 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self>;
193}
194
195impl<'a, T: FromObj<'a>> FromOptObj<'a> for Option<T> {
196 fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, _key: &[u8]) -> Self {
197 obj.and_then(|x| T::from_obj(doc,x))
198 }
199}
200
201impl<'a, T: FromObj<'a>> FromOptObj<'a> for T {
202 fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self {
203 T::from_obj(doc, obj.expect(&String::from_utf8_lossy(key))).expect("wrong type")
204 }
205}
206
207impl<'a, T: FromObj<'a>> FromObj<'a> for Vec<T> {
210 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
211 maybe_deref(doc, obj).as_array().map(|x| x.iter()
212 .map(|x| T::from_obj(doc, x).expect("wrong type"))
213 .collect()).ok()
214 }
215}
216
217impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 4] {
220 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
221 maybe_deref(doc, obj).as_array().map(|x| {
222 let mut all = x.iter()
223 .map(|x| T::from_obj(doc, x).expect("wrong type"));
224 [all.next().unwrap(), all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
225 }).ok()
226 }
227}
228
229impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 3] {
230 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
231 maybe_deref(doc, obj).as_array().map(|x| {
232 let mut all = x.iter()
233 .map(|x| T::from_obj(doc, x).expect("wrong type"));
234 [all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
235 }).ok()
236 }
237}
238
239impl<'a> FromObj<'a> for f64 {
240 fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
241 match obj {
242 &Object::Integer(i) => Some(i as f64),
243 &Object::Real(f) => Some(f.into()),
244 _ => None
245 }
246 }
247}
248
249impl<'a> FromObj<'a> for i64 {
250 fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
251 match obj {
252 &Object::Integer(i) => Some(i),
253 _ => None
254 }
255 }
256}
257
258impl<'a> FromObj<'a> for &'a Dictionary {
259 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
260 maybe_deref(doc, obj).as_dict().ok()
261 }
262}
263
264impl<'a> FromObj<'a> for &'a Stream {
265 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Stream> {
266 maybe_deref(doc, obj).as_stream().ok()
267 }
268}
269
270impl<'a> FromObj<'a> for &'a Object {
271 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
272 Some(maybe_deref(doc, obj))
273 }
274}
275
276fn get<'a, T: FromOptObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> T {
277 T::from_opt_obj(doc, dict.get(key).ok(), key)
278}
279
280fn maybe_get<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
281 maybe_get_obj(doc, dict, key).and_then(|o| T::from_obj(doc, o))
282}
283
284fn get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> String {
285 pdf_to_utf8(dict.get(key).map(|o| maybe_deref(doc, o)).unwrap_or_else(|_| panic!("deref")).as_name().expect("name"))
286}
287
288#[allow(dead_code)]
289fn maybe_get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<String> {
290 maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok()).map(|n| pdf_to_utf8(n))
291}
292
293fn maybe_get_name<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a [u8]> {
294 maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok())
295}
296
297fn maybe_get_array<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Vec<Object>> {
298 maybe_get_obj(doc, dict, key).and_then(|n| n.as_array().ok())
299}
300
/// Font built by `make_font` for every subtype other than `Type0` and `Type3`.
#[derive(Clone)]
struct PdfSimpleFont<'a> {
    /// The font dictionary this value was built from.
    font: &'a Dictionary,
    /// Document used to resolve indirect references.
    doc: &'a Document,
    /// Table from byte code to UTF-16 code unit, when one could be determined.
    encoding: Option<Vec<u16>>,
    /// Map from character code to decoded text, when available.
    unicode_map: Option<HashMap<u32, String>>,
    /// Glyph width per character code.
    widths: HashMap<CharCode, f64>,
    /// Width returned by `get_width` for codes absent from `widths`.
    missing_width: f64,
}
310
/// Font built by `make_font` when the font dictionary's subtype is `Type3`.
#[derive(Clone)]
struct PdfType3Font<'a> {
    /// The font dictionary this value was built from.
    font: &'a Dictionary,
    /// Document used to resolve indirect references.
    doc: &'a Document,
    /// Table from byte code to UTF-16 code unit.
    encoding: Option<Vec<u16>>,
    /// Map from character code to decoded text, when a `ToUnicode` map exists.
    unicode_map: Option<HashMap<u32, String>>,
    /// Glyph width per character code.
    widths: HashMap<CharCode, f64>,
}
319
320
321fn make_font<'a>(doc: &'a Document, font: &'a Dictionary) -> Rc<dyn PdfFont + 'a> {
322 let subtype = get_name_string(doc, font, b"Subtype");
323 dlog!("MakeFont({})", subtype);
324 if subtype == "Type0" {
325 Rc::new(PdfCIDFont::new(doc, font))
326 } else if subtype == "Type3" {
327 Rc::new(PdfType3Font::new(doc, font))
328 } else {
329 Rc::new(PdfSimpleFont::new(doc, font))
330 }
331}
332
/// Reports whether `name` exactly matches (case-sensitively) one of the
/// fourteen font names listed below.
fn is_core_font(name: &str) -> bool {
    const CORE_FONT_NAMES: [&str; 14] = [
        "Courier-Bold",
        "Courier-BoldOblique",
        "Courier-Oblique",
        "Courier",
        "Helvetica-Bold",
        "Helvetica-BoldOblique",
        "Helvetica-Oblique",
        "Helvetica",
        "Symbol",
        "Times-Bold",
        "Times-BoldItalic",
        "Times-Italic",
        "Times-Roman",
        "ZapfDingbats",
    ];
    CORE_FONT_NAMES.iter().any(|&candidate| candidate == name)
}
352
353fn encoding_to_unicode_table(name: &[u8]) -> Vec<u16> {
354 let encoding = match &name[..] {
355 b"MacRomanEncoding" => encodings::MAC_ROMAN_ENCODING,
356 b"MacExpertEncoding" => encodings::MAC_EXPERT_ENCODING,
357 b"WinAnsiEncoding" => encodings::WIN_ANSI_ENCODING,
358 _ => panic!("unexpected encoding {:?}", pdf_to_utf8(name))
359 };
360 let encoding_table = encoding.iter()
361 .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
362 .collect();
363 encoding_table
364}
365
366impl<'a> PdfSimpleFont<'a> {
373 fn new(doc: &'a Document, font: &'a Dictionary) -> PdfSimpleFont<'a> {
374 let base_name = get_name_string(doc, font, b"BaseFont");
375 let subtype = get_name_string(doc, font, b"Subtype");
376
377 let encoding: Option<&Object> = get(doc, font, b"Encoding");
378 dlog!("base_name {} {} enc:{:?} {:?}", base_name, subtype, encoding, font);
379 let descriptor: Option<&Dictionary> = get(doc, font, b"FontDescriptor");
380 let mut type1_encoding = None;
381 let mut unicode_map = None;
382 if let Some(descriptor) = descriptor {
383 dlog!("descriptor {:?}", descriptor);
384 if subtype == "Type1" {
385 let file = maybe_get_obj(doc, descriptor, b"FontFile");
386 match file {
387 Some(&Object::Stream(ref s)) => {
388 let s = get_contents(s);
389 type1_encoding = Some(type1_encoding_parser::get_encoding_map(&s).expect("encoding"));
391 }
392 _ => { dlog!("font file {:?}", file) }
393 }
394 } else if subtype == "TrueType" {
395 let file = maybe_get_obj(doc, descriptor, b"FontFile2");
396 match file {
397 Some(&Object::Stream(ref s)) => {
398 let _s = get_contents(s);
399 }
401 _ => { dlog!("font file {:?}", file) }
402 }
403 }
404
405 let font_file3 = get::<Option<&Object>>(doc, descriptor, b"FontFile3");
406 match font_file3 {
407 Some(&Object::Stream(ref s)) => {
408 let subtype = get_name_string(doc, &s.dict, b"Subtype");
409 dlog!("font file {}, {:?}", subtype, s);
410 let s = get_contents(s);
411 if subtype == "Type1C" {
412 let table = cff_parser::Table::parse(&s).unwrap();
413 let charset = table.charset.get_table();
414 let encoding = table.encoding.get_table();
415 let mut mapping = HashMap::new();
416 for i in 0..encoding.len().min(charset.len()) {
417 let cid = encoding[i];
418 let sid = charset[i];
419 let name = cff_parser::string_by_id(&table, sid).unwrap();
420 let unicode = glyphnames::name_to_unicode(&name).or_else(|| {
421 zapfglyphnames::zapfdigbats_names_to_unicode(name)
422 });
423 if let Some(unicode) = unicode {
424 let str = String::from_utf16(&[unicode]).unwrap();
425 mapping.insert(cid as u32, str);
426 }
427 }
428 unicode_map = Some(mapping);
429 }
432
433 }
436 None => {}
437 _ => { dlog!("unexpected") }
438 }
439
440 let charset = maybe_get_obj(doc, descriptor, b"CharSet");
441 let _charset = match charset {
442 Some(&Object::String(ref s, _)) => { Some(pdf_to_utf8(&s)) }
443 _ => { None }
444 };
445 }
447
448 let mut unicode_map = match unicode_map {
449 Some(mut unicode_map) => {
450 unicode_map.extend(get_unicode_map(doc, font).unwrap_or(HashMap::new()));
451 Some(unicode_map)
452 }
453 None => {
454 get_unicode_map(doc, font)
455 }
456 };
457
458
459 let mut encoding_table = None;
460 match encoding {
461 Some(&Object::Name(ref encoding_name)) => {
462 dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
463 encoding_table = Some(encoding_to_unicode_table(encoding_name));
464 }
465 Some(&Object::Dictionary(ref encoding)) => {
466 let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
468 dlog!("BaseEncoding {:?}", base_encoding);
469 encoding_to_unicode_table(base_encoding)
470 } else {
471 Vec::from(PDFDocEncoding)
472 };
473 let differences = maybe_get_array(doc, encoding, b"Differences");
474 if let Some(differences) = differences {
475 dlog!("Differences");
476 let mut code = 0;
477 for o in differences {
478 let o = maybe_deref(doc, o);
479 match o {
480 &Object::Integer(i) => { code = i; },
481 &Object::Name(ref n) => {
482 let name = pdf_to_utf8(&n);
483 let unicode = glyphnames::name_to_unicode(&name);
486 if let Some(unicode) = unicode{
487 table[code as usize] = unicode;
488 if let Some(ref mut unicode_map) = unicode_map {
489 let be = [unicode];
490 match unicode_map.entry(code as u32) {
491 Entry::Vacant(v) => { v.insert(String::from_utf16(&be).unwrap()); }
493 Entry::Occupied(e) => {
494 if e.get() != &String::from_utf16(&be).unwrap() {
495 let normal_match = e.get().nfkc().eq(String::from_utf16(&be).unwrap().nfkc());
496 println!("Unicode mismatch {} {} {:?} {:?} {:?}", normal_match, name, e.get(), String::from_utf16(&be), be);
497 }
498 }
499 }
500 }
501 } else {
502 match unicode_map {
503 Some(ref mut unicode_map) if base_name.contains("FontAwesome") => {
504 match unicode_map.entry(code as u32) {
507 Entry::Vacant(v) => { v.insert("".to_owned()); }
508 Entry::Occupied(e) => {
509 panic!("unexpected entry in unicode map")
510 }
511 }
512 }
513 _ => {
514 println!("unknown glyph name '{}' for font {}", name, base_name);
515 }
516 }
517 }
518 dlog!("{} = {} ({:?})", code, name, unicode);
519 if let Some(ref mut unicode_map) = unicode_map {
520 dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
525 }
526 code += 1;
527 }
528 _ => { panic!("wrong type {:?}", o); }
529 }
530 }
531 }
532 let name = encoding.get(b"Type").and_then(|x| x.as_name()).and_then(|x| Ok(pdf_to_utf8(x)));
534 dlog!("name: {}", name);
535
536 encoding_table = Some(table);
537 }
538 None => {
539 if let Some(type1_encoding) = type1_encoding {
540 let mut table = Vec::from(PDFDocEncoding);
541 dlog!("type1encoding");
542 for (code, name) in type1_encoding {
543 let unicode = glyphnames::name_to_unicode(&pdf_to_utf8(&name));
544 if let Some(unicode) = unicode {
545 table[code as usize] = unicode;
546 } else {
547 dlog!("unknown character {}", pdf_to_utf8(&name));
548 }
549 }
550 encoding_table = Some(table)
551 } else if subtype == "TrueType" {
552 encoding_table = Some(encodings::WIN_ANSI_ENCODING.iter()
553 .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
554 .collect());
555 }
556 }
557 _ => { panic!() }
558 }
559
560 let mut width_map = HashMap::new();
561 if let (Some(first_char), Some(last_char), Some(widths)) = (maybe_get::<i64>(doc, font, b"FirstChar"), maybe_get::<i64>(doc, font, b"LastChar"), maybe_get::<Vec<f64>>(doc, font, b"Widths")) {
570 let mut i: i64 = 0;
572 dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
573
574 for w in widths {
575 width_map.insert((first_char + i) as CharCode, w);
576 i += 1;
577 }
578 assert_eq!(first_char + i - 1, last_char);
579 } else {
580 let name = if is_core_font(&base_name) {
581 &base_name
582 } else {
583 println!("no widths and not core font {:?}", base_name);
584
585 "Helvetica"
600 };
601 for font_metrics in core_fonts::metrics().iter() {
602 if font_metrics.0 == base_name {
603 if let Some(ref encoding) = encoding_table {
604 dlog!("has encoding");
605 for w in font_metrics.2 {
606 let c = glyphnames::name_to_unicode(w.2).unwrap();
607 for i in 0..encoding.len() {
608 if encoding[i] == c {
609 width_map.insert(i as CharCode, w.1 as f64);
610 }
611 }
612 }
613 } else {
614 let mut table = vec![0; 256];
619 for w in font_metrics.2 {
620 dlog!("{} {}", w.0, w.2);
621 if w.0 != -1 {
623 table[w.0 as usize] = if base_name == "ZapfDingbats" {
624 zapfglyphnames::zapfdigbats_names_to_unicode(w.2).unwrap_or_else(|| panic!("bad name {:?}", w))
625 } else {
626 glyphnames::name_to_unicode(w.2).unwrap()
627 }
628 }
629 }
630
631 let encoding = &table[..];
632 for w in font_metrics.2 {
633 width_map.insert(w.0 as CharCode, w.1 as f64);
634 }
636 encoding_table = Some(encoding.to_vec());
637 }
638 }
648 }
649 }
650
651 let missing_width = get::<Option<f64>>(doc, font, b"MissingWidth").unwrap_or(0.);
652 PdfSimpleFont {doc, font, widths: width_map, encoding: encoding_table, missing_width, unicode_map}
653 }
654
655 #[allow(dead_code)]
656 fn get_type(&self) -> String {
657 get_name_string(self.doc, self.font, b"Type")
658 }
659 #[allow(dead_code)]
660 fn get_basefont(&self) -> String {
661 get_name_string(self.doc, self.font, b"BaseFont")
662 }
663 #[allow(dead_code)]
664 fn get_subtype(&self) -> String {
665 get_name_string(self.doc, self.font, b"Subtype")
666 }
667 #[allow(dead_code)]
668 fn get_widths(&self) -> Option<&Vec<Object>> {
669 maybe_get_obj(self.doc, self.font, b"Widths").map(|widths| widths.as_array().expect("Widths should be an array"))
670 }
671 #[allow(dead_code)]
674 fn get_name(&self) -> Option<String> {
675 maybe_get_name_string(self.doc, self.font, b"Name")
676 }
677
678 #[allow(dead_code)]
679 fn get_descriptor(&self) -> Option<PdfFontDescriptor> {
680 maybe_get_obj(self.doc, self.font, b"FontDescriptor").and_then(|desc| desc.as_dict().ok()).map(|desc| PdfFontDescriptor{desc: desc, doc: self.doc})
681 }
682}
683
684
685
686impl<'a> PdfType3Font<'a> {
687 fn new(doc: &'a Document, font: &'a Dictionary) -> PdfType3Font<'a> {
688
689 let unicode_map = get_unicode_map(doc, font);
690 let encoding: Option<&Object> = get(doc, font, b"Encoding");
691
692 let encoding_table;
693 match encoding {
694 Some(&Object::Name(ref encoding_name)) => {
695 dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
696 encoding_table = Some(encoding_to_unicode_table(encoding_name));
697 }
698 Some(&Object::Dictionary(ref encoding)) => {
699 let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
701 dlog!("BaseEncoding {:?}", base_encoding);
702 encoding_to_unicode_table(base_encoding)
703 } else {
704 Vec::from(PDFDocEncoding)
705 };
706 let differences = maybe_get_array(doc, encoding, b"Differences");
707 if let Some(differences) = differences {
708 dlog!("Differences");
709 let mut code = 0;
710 for o in differences {
711 match o {
712 &Object::Integer(i) => { code = i; },
713 &Object::Name(ref n) => {
714 let name = pdf_to_utf8(&n);
715 let unicode = glyphnames::name_to_unicode(&name);
718 if let Some(unicode) = unicode{
719 table[code as usize] = unicode;
720 }
721 dlog!("{} = {} ({:?})", code, name, unicode);
722 if let Some(ref unicode_map) = unicode_map {
723 dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
724 }
725 code += 1;
726 }
727 _ => { panic!("wrong type"); }
728 }
729 }
730 }
731 let name_encoded = encoding.get(b"Type");
732 if let Ok(Object::Name(name)) = name_encoded {
733 dlog!("name: {}", pdf_to_utf8(name));
734 } else {
735 dlog!("name not found");
736 }
737
738 encoding_table = Some(table);
739 }
740 _ => { panic!() }
741 }
742
743 let first_char: i64 = get(doc, font, b"FirstChar");
744 let last_char: i64 = get(doc, font, b"LastChar");
745 let widths: Vec<f64> = get(doc, font, b"Widths");
746
747 let mut width_map = HashMap::new();
748
749 let mut i = 0;
750 dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
751
752 for w in widths {
753 width_map.insert((first_char + i) as CharCode, w);
754 i += 1;
755 }
756 assert_eq!(first_char + i - 1, last_char);
757 PdfType3Font {doc, font, widths: width_map, encoding: encoding_table, unicode_map}
758 }
759}
760
/// Character code as produced by `PdfFont::next_char` and used as the key of
/// the per-font width maps.
type CharCode = u32;
762
/// Iterator over the character codes of a byte string, as split by a font.
struct PdfFontIter<'a>
{
    /// Remaining bytes of the string being decoded.
    i: Iter<'a, u8>,
    /// Font whose `next_char` extracts each code from `i`.
    font: &'a dyn PdfFont,
}
768
769impl<'a> Iterator for PdfFontIter<'a> {
770 type Item = (CharCode, u8);
771 fn next(&mut self) -> Option<(CharCode, u8)> {
772 self.font.next_char(&mut self.i)
773 }
774}
775
/// Operations every font kind provides for text extraction.
trait PdfFont : Debug {
    /// Width of the glyph for character code `id`.
    fn get_width(&self, id: CharCode) -> f64;
    /// Reads the next character code from `iter`; the simple and Type 3 fonts
    /// report `1` in the second tuple field, the CID font reports the matched
    /// code width. `None` ends iteration.
    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)>;
    /// Text for a single character code.
    fn decode_char(&self, char: CharCode) -> String;

}
787
788impl<'a> dyn PdfFont + 'a {
789 fn char_codes(&'a self, chars: &'a [u8]) -> PdfFontIter {
790 PdfFontIter{i: chars.iter(), font: self}
791 }
792 fn decode(&self, chars: &[u8]) -> String {
793 let strings = self.char_codes(chars).map(|x| self.decode_char(x.0)).collect::<Vec<_>>();
794 strings.join("")
795 }
796
797}
798
799
800impl<'a> PdfFont for PdfSimpleFont<'a> {
801 fn get_width(&self, id: CharCode) -> f64 {
802 let width = self.widths.get(&id);
803 if let Some(width) = width {
804 return *width;
805 } else {
806 let mut widths = self.widths.iter().collect::<Vec<_>>();
807 widths.sort_by_key(|x| x.0);
808 dlog!("missing width for {} len(widths) = {}, {:?} falling back to missing_width {:?}", id, self.widths.len(), widths, self.font);
809 return self.missing_width;
810 }
811 }
812 fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
818 iter.next().map(|x| (*x as CharCode, 1))
819 }
820 fn decode_char(&self, char: CharCode) -> String {
821 let slice = [char as u8];
822 if let Some(ref unicode_map) = self.unicode_map {
823 let s = unicode_map.get(&char);
824 let s = match s {
825 None => {
826 println!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
827 let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
830 let s = to_utf8(encoding, &slice);
831 println!("falling back to encoding {} -> {:?}", char, s);
832 s
833 }
834 Some(s) => { s.clone() }
835 };
836 return s
837 }
838 let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
839 let s = to_utf8(encoding, &slice);
841 s
842 }
843}
844
845
846
847impl<'a> fmt::Debug for PdfSimpleFont<'a> {
848 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
849 self.font.fmt(f)
850 }
851}
852
853impl<'a> PdfFont for PdfType3Font<'a> {
854 fn get_width(&self, id: CharCode) -> f64 {
855 let width = self.widths.get(&id);
856 if let Some(width) = width {
857 return *width;
858 } else {
859 panic!("missing width for {} {:?}", id, self.font);
860 }
861 }
862 fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
868 iter.next().map(|x| (*x as CharCode, 1))
869 }
870 fn decode_char(&self, char: CharCode) -> String {
871 let slice = [char as u8];
872 if let Some(ref unicode_map) = self.unicode_map {
873 let s = unicode_map.get(&char);
874 let s = match s {
875 None => {
876 println!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
877 let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
880 let s = to_utf8(encoding, &slice);
881 println!("falling back to encoding {} -> {:?}", char, s);
882 s
883 }
884 Some(s) => { s.clone() }
885 };
886 return s
887 }
888 let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
889 let s = to_utf8(encoding, &slice);
891 s
892 }
893}
894
895
896
897impl<'a> fmt::Debug for PdfType3Font<'a> {
898 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
899 self.font.fmt(f)
900 }
901}
902
/// Font built by `make_font` when the font dictionary's subtype is `Type0`.
struct PdfCIDFont<'a> {
    /// The Type0 font dictionary this value was built from.
    font: &'a Dictionary,
    #[allow(dead_code)]
    /// Document used to resolve indirect references.
    doc: &'a Document,
    #[allow(dead_code)]
    /// Codespace and CID ranges used by `next_char` to split bytes into codes.
    encoding: ByteMapping,
    /// Map from code to text read from `ToUnicode`, if present.
    to_unicode: Option<HashMap<u32, String>>,
    /// Explicit glyph widths read from the descendant font's `W` array.
    widths: HashMap<CharCode, f64>,
    /// Width used for codes absent from `widths` (from `DW`, else 1000).
    default_width: Option<f64>,
}
913
914fn get_unicode_map<'a>(doc: &'a Document, font: &'a Dictionary) -> Option<HashMap<u32, String>> {
915 let to_unicode = maybe_get_obj(doc, font, b"ToUnicode");
916 dlog!("ToUnicode: {:?}", to_unicode);
917 let mut unicode_map = None;
918 match to_unicode {
919 Some(&Object::Stream(ref stream)) => {
920 let contents = get_contents(stream);
921 dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
922
923 let cmap = adobe_cmap_parser::get_unicode_map(&contents).unwrap();
924 let mut unicode = HashMap::new();
925 for (&k, v) in cmap.iter() {
929 let mut be: Vec<u16> = Vec::new();
930 let mut i = 0;
931 assert!(v.len() % 2 == 0);
932 while i < v.len() {
933 be.push(((v[i] as u16) << 8) | v[i+1] as u16);
934 i += 2;
935 }
936 match &be[..] {
937 [0xd800 ..= 0xdfff] => {
938 continue;
941 }
942 _ => {}
943 }
944 let s = String::from_utf16(&be).unwrap();
945
946 unicode.insert(k, s);
947 }
948 unicode_map = Some(unicode);
949
950 dlog!("map: {:?}", unicode_map);
951 }
952 None => { }
953 Some(&Object::Name(ref name)) => {
954 let name = pdf_to_utf8(name);
955 if name != "Identity-H" {
956 todo!("unsupported ToUnicode name: {:?}", name);
957 }
958 }
959 _ => { panic!("unsupported cmap {:?}", to_unicode)}
960 }
961 unicode_map
962}
963
964
965impl<'a> PdfCIDFont<'a> {
966 fn new(doc: &'a Document, font: &'a Dictionary) -> PdfCIDFont<'a> {
967 let base_name = get_name_string(doc, font, b"BaseFont");
968 let descendants = maybe_get_array(doc, font, b"DescendantFonts").expect("Descendant fonts required");
969 let ciddict = maybe_deref(doc, &descendants[0]).as_dict().expect("should be CID dict");
970 let encoding = maybe_get_obj(doc, font, b"Encoding").expect("Encoding required in type0 fonts");
971 dlog!("base_name {} {:?}", base_name, font);
972
973 let encoding = match encoding {
974 &Object::Name(ref name) => {
975 let name = pdf_to_utf8(name);
976 dlog!("encoding {:?}", name);
977 if name == "Identity-H" || name == "Identity-V" {
978 ByteMapping { codespace: vec![CodeRange{width: 2, start: 0, end: 0xffff }], cid: vec![CIDRange{ src_code_lo: 0, src_code_hi: 0xffff, dst_CID_lo: 0 }]}
979 } else {
980 panic!("unsupported encoding {}", name);
981 }
982 }
983 &Object::Stream(ref stream) => {
984 let contents = get_contents(stream);
985 dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
986 adobe_cmap_parser::get_byte_mapping(&contents).unwrap()
987 }
988 _ => { panic!("unsupported encoding {:?}", encoding)}
989 };
990
991 let unicode_map = get_unicode_map(doc, font);
997
998 dlog!("descendents {:?} {:?}", descendants, ciddict);
999
1000 let font_dict = maybe_get_obj(doc, ciddict, b"FontDescriptor").expect("required");
1001 dlog!("{:?}", font_dict);
1002 let _f = font_dict.as_dict().expect("must be dict");
1003 let default_width = get::<Option<i64>>(doc, ciddict, b"DW").unwrap_or(1000);
1004 let w: Option<Vec<&Object>> = get(doc, ciddict, b"W");
1005 dlog!("widths {:?}", w);
1006 let mut widths = HashMap::new();
1007 let mut i = 0;
1008 if let Some(w) = w {
1009 while i < w.len() {
1010 if let &Object::Array(ref wa) = w[i+1] {
1011 let cid = w[i].as_i64().expect("id should be num");
1012 let mut j = 0;
1013 dlog!("wa: {:?} -> {:?}", cid, wa);
1014 for w in wa {
1015 widths.insert((cid + j) as CharCode, as_num(w) );
1016 j += 1;
1017 }
1018 i += 2;
1019 } else {
1020 let c_first = w[i].as_i64().expect("first should be num");
1021 let c_last = w[i].as_i64().expect("last should be num");
1022 let c_width = as_num(&w[i]);
1023 for id in c_first..c_last {
1024 widths.insert(id as CharCode, c_width);
1025 }
1026 i += 3;
1027 }
1028 }
1029 }
1030 PdfCIDFont{doc, font, widths, to_unicode: unicode_map, encoding, default_width: Some(default_width as f64) }
1031 }
1032}
1033
1034impl<'a> PdfFont for PdfCIDFont<'a> {
1035 fn get_width(&self, id: CharCode) -> f64 {
1036 let width = self.widths.get(&id);
1037 if let Some(width) = width {
1038 dlog!("GetWidth {} -> {}", id, *width);
1039 return *width;
1040 } else {
1041 dlog!("missing width for {} falling back to default_width", id);
1042 return self.default_width.unwrap();
1043 }
1044 }fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
1055 let mut c = *iter.next()? as u32;
1056 let mut code = None;
1057 'outer: for width in 1..=4 {
1058 for range in &self.encoding.codespace {
1059 if c as u32 >= range.start && c as u32 <= range.end && range.width == width {
1060 code = Some((c as u32, width));
1061 break 'outer;
1062 }
1063 }
1064 let next = *iter.next()?;
1065 c = ((c as u32) << 8) | next as u32;
1066 }
1067 let code = code?;
1068 for range in &self.encoding.cid {
1069 if code.0 >= range.src_code_lo && code.0 <= range.src_code_hi {
1070 return Some((code.0 + range.dst_CID_lo, code.1 as u8));
1071 }
1072 }
1073 None
1074 }
1075 fn decode_char(&self, char: CharCode) -> String {
1076 let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
1077 if let Some(s) = s {
1078 s.clone()
1079 } else {
1080 dlog!("Unknown character {:?} in {:?} {:?}", char, self.font, self.to_unicode);
1081 "".to_string()
1082 }
1083 }
1084}
1085
1086impl<'a> fmt::Debug for PdfCIDFont<'a> {
1087 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1088 self.font.fmt(f)
1089 }
1090}
1091
1092
1093
/// Lightweight view of a font descriptor dictionary together with the
/// document needed to resolve references inside it.
#[derive(Copy, Clone)]
struct PdfFontDescriptor<'a> {
    /// The descriptor dictionary.
    desc: &'a Dictionary,
    /// Document used to resolve indirect references.
    doc: &'a Document
}
1099
1100impl<'a> PdfFontDescriptor<'a> {
1101 #[allow(dead_code)]
1102 fn get_file(&self) -> Option<&'a Object> {
1103 maybe_get_obj(self.doc, self.desc, b"FontFile")
1104 }
1105}
1106
1107impl<'a> fmt::Debug for PdfFontDescriptor<'a> {
1108 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1109 self.desc.fmt(f)
1110 }
1111}
1112
/// Data of a function whose `FunctionType` is 0, as collected by
/// `Function::new` from the function's stream and dictionary.
#[derive(Clone, Debug)]
struct Type0Func {
    /// Value of the `Domain` entry.
    domain: Vec<f64>,
    /// Value of the `Range` entry.
    range: Vec<f64>,
    /// Stream contents as returned by `get_contents`.
    contents: Vec<u8>,
    /// Value of the `Size` entry.
    size: Vec<i64>,
    /// Value of the `BitsPerSample` entry.
    bits_per_sample: i64,
    /// Value of the `Encode` entry, or `[0, size - 1]` per `size` element
    /// when the entry is absent.
    encode: Vec<f64>,
    /// Value of the `Decode` entry, or a copy of `range` when absent.
    decode: Vec<f64>,
}
1123
/// Linearly maps `x` from the interval `[x_min, x_max]` onto
/// `[y_min, y_max]`.
///
/// When the input interval is degenerate (`x_max == x_min`) there is nothing
/// to interpolate over and `y_min` is returned.
#[allow(dead_code)]
fn interpolate(x: f64, x_min: f64, x_max: f64, y_min: f64, y_max: f64) -> f64 {
    // The slope is taken over the whole input interval; dividing by
    // `x - x_min` instead would cancel the numerator and always give `y_max`.
    let divisor = x_max - x_min;
    if divisor != 0. {
        y_min + (x - x_min) * ((y_max - y_min) / divisor)
    } else {
        y_min
    }
}
1135
1136impl Type0Func {
1137 #[allow(dead_code)]
1138 fn eval(&self, _input: &[f64], _output: &mut [f64]) {
1139 let _n_inputs = self.domain.len() / 2;
1140 let _n_ouputs = self.range.len() / 2;
1141
1142 }
1143}
1144
/// Data of a function whose `FunctionType` is 2, as collected by
/// `Function::new`.
#[derive(Clone, Debug)]
struct Type2Func {
    /// Optional `C0` entry.
    c0: Option<Vec<f64>>,
    /// Optional `C1` entry.
    c1: Option<Vec<f64>>,
    /// Value of the `N` entry.
    n: f64,
}
1151
/// A function object, tagged by its `FunctionType` value (see
/// `Function::new`).
#[derive(Clone, Debug)]
enum Function {
    /// `FunctionType` 0, with the data read from its stream and dictionary.
    Type0(Type0Func),
    /// `FunctionType` 2, with its `C0`/`C1`/`N` entries.
    Type2(Type2Func),
    /// `FunctionType` 3; no data is retained.
    #[allow(dead_code)]
    Type3,
    /// `FunctionType` 4; holds the stream contents, which are not interpreted.
    #[allow(dead_code)]
    Type4(Vec<u8>)
}
1161
1162impl Function {
1163 fn new(doc: &Document, obj: &Object) -> Function {
1164 let dict = match obj {
1165 &Object::Dictionary(ref dict) => dict,
1166 &Object::Stream(ref stream) => &stream.dict,
1167 _ => panic!()
1168 };
1169 let function_type: i64 = get(doc, dict, b"FunctionType");
1170 let f = match function_type {
1171 0 => {
1172 let stream = match obj {
1174 &Object::Stream(ref stream) => stream,
1175 _ => panic!()
1176 };
1177 let range: Vec<f64> = get(doc, dict, b"Range");
1178 let domain: Vec<f64> = get(doc, dict, b"Domain");
1179 let contents = get_contents(stream);
1180 let size: Vec<i64> = get(doc, dict, b"Size");
1181 let bits_per_sample = get(doc, dict, b"BitsPerSample");
1182 let encode = get::<Option<Vec<f64>>>(doc, dict, b"Encode");
1185 let encode = encode.unwrap_or_else(|| {
1187 let mut default = Vec::new();
1188 for i in &size {
1189 default.extend([0., (i - 1) as f64].iter());
1190 }
1191 default
1192 });
1193 let decode = get::<Option<Vec<f64>>>(doc, dict, b"Decode").unwrap_or_else(|| range.clone());
1194
1195 Function::Type0(Type0Func { domain, range, size, contents, bits_per_sample, encode, decode })
1196 }
1197 2 => {
1198 let c0 = get::<Option<Vec<f64>>>(doc, dict, b"C0");
1200 let c1 = get::<Option<Vec<f64>>>(doc, dict, b"C1");
1201 let n = get::<f64>(doc, dict, b"N");
1202 Function::Type2(Type2Func { c0, c1, n})
1203 }
1204 3 => {
1205 Function::Type3
1207 }
1208 4 => {
1209 let contents = match obj {
1211 &Object::Stream(ref stream) => {
1212 let contents = get_contents(stream);
1213 println!("unhandled type-4 function");
1214 println!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
1215 contents
1216 }
1217 _ => { panic!("type 4 functions should be streams") }
1218 };
1219 Function::Type4(contents)
1220 }
1221 _ => { panic!("unhandled function type {}", function_type) }
1222 };
1223 f
1224 }
1225}
1226
1227fn as_num(o: &Object) -> f64 {
1228 match o {
1229 &Object::Integer(i) => { i as f64 }
1230 &Object::Real(f) => { f.into() }
1231 _ => { panic!("not a number") }
1232 }
1233}
1234
/// Text-related part of the graphics state, updated by the text operators
/// handled in `Processor::process_stream`.
#[derive(Clone)]
struct TextState<'a>
{
    /// Font selected by `Tf`; `None` until the first `Tf`.
    font: Option<Rc<dyn PdfFont + 'a>>,
    /// Second operand of `Tf`.
    font_size: f64,
    /// Operand of `Tc`.
    character_spacing: f64,
    /// Operand of `Tw`.
    word_spacing: f64,
    /// Operand of `Tz` divided by 100.
    horizontal_scaling: f64,
    /// Operand of `TL`, or the negated vertical operand of `TD`.
    leading: f64,
    /// Operand of `Ts`.
    rise: f64,
    /// Text matrix; advanced as glyphs are shown and reset by the text
    /// positioning operators.
    tm: Transform,
}
1247
1248fn get_contents(contents: &Stream) -> Vec<u8> {
1250 if contents.filter().is_ok() {
1251 contents.decompressed_content().unwrap_or_else(|_|contents.content.clone())
1252 } else {
1253 contents.content.clone()
1254 }
1255}
1256
/// Graphics state tracked while a content stream is processed; saved and
/// restored as a whole by the `q` / `Q` operators.
#[derive(Clone)]
struct GraphicsState<'a>
{
    /// Current transformation matrix, updated by `cm`.
    ctm: Transform,
    /// Text state.
    ts: TextState<'a>,
    /// Soft mask dictionary set through an `SMask` entry in `apply_state`;
    /// `None` when that entry is the name `None`.
    smask: Option<Dictionary>,
    /// Colour space selected by `cs`.
    fill_colorspace: ColorSpace,
    /// Components set by `sc` / `scn`.
    fill_color: Vec<f64>,
    /// Colour space selected by `CS`.
    stroke_colorspace: ColorSpace,
    /// Components set by `SC` / `SCN`.
    stroke_color: Vec<f64>,
    /// Operand of `w`.
    line_width: f64,
}
1269
1270fn show_text(gs: &mut GraphicsState, s: &[u8],
1271 _tlm: &Transform,
1272 _flip_ctm: &Transform,
1273 output: &mut dyn OutputDev) -> Result<(), OutputError> {
1274 let ts = &mut gs.ts;
1275 let font = ts.font.as_ref().unwrap();
1276 dlog!("{:?}", font.decode(s));
1278 dlog!("{:?}", font.decode(s).as_bytes());
1279 dlog!("{:?}", s);
1280 output.begin_word()?;
1281
1282 for (c, length) in font.char_codes(s) {
1283 let tsm = Transform2D::row_major(ts.horizontal_scaling,
1285 0.,
1286 0.,
1287 1.0,
1288 0.,
1289 ts.rise);
1290 let trm = tsm.post_transform(&ts.tm.post_transform(&gs.ctm));
1292 let w0 = font.get_width(c) / 1000.;
1299
1300 let mut spacing = ts.character_spacing;
1301 let is_space = c == 32 && length == 1;
1306 if is_space { spacing += ts.word_spacing }
1307
1308 output.output_character(&trm, w0, spacing, ts.font_size, &font.decode_char(c))?;
1309 let tj = 0.;
1310 let ty = 0.;
1311 let tx = ts.horizontal_scaling * ((w0 - tj/1000.)* ts.font_size + spacing);
1312 dlog!("horizontal {} adjust {} {} {} {}", ts.horizontal_scaling, tx, w0, ts.font_size, spacing);
1313 ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1315 let _trm = ts.tm.pre_transform(&gs.ctm);
1316 }
1319 output.end_word()?;
1320 Ok(())
1321}
1322
/// Page rectangle given by two corner points. Elsewhere in this file
/// `ury - lly` is used as the page height and `urx - llx` as its width.
// NOTE(review): presumably ll = lower-left and ur = upper-right — confirm.
#[derive(Debug, Clone, Copy)]
pub struct MediaBox {
    pub llx: f64,
    pub lly: f64,
    pub urx: f64,
    pub ury: f64
}
1330
1331fn apply_state(doc: &Document, gs: &mut GraphicsState, state: &Dictionary) {
1332 for (k, v) in state.iter() {
1333 let k : &[u8] = k.as_ref();
1334 match k {
1335 b"SMask" => { match maybe_deref(doc, v) {
1336 &Object::Name(ref name) => {
1337 if name == b"None" {
1338 gs.smask = None;
1339 } else {
1340 panic!("unexpected smask name")
1341 }
1342 }
1343 &Object::Dictionary(ref dict) => {
1344 gs.smask = Some(dict.clone());
1345 }
1346 _ => { panic!("unexpected smask type {:?}", v) }
1347 }}
1348 b"Type" => { match v {
1349 &Object::Name(ref name) => {
1350 assert_eq!(name, b"ExtGState")
1351 }
1352 _ => { panic!("unexpected type") }
1353 }}
1354 _ => { dlog!("unapplied state: {:?} {:?}", k, v); }
1355 }
1356 }
1357
1358}
1359
/// A single path construction step.
#[derive(Debug)]
pub enum PathOp {
    /// Start a new subpath at `(x, y)`.
    MoveTo(f64, f64),
    /// Straight line to `(x, y)`.
    LineTo(f64, f64),
    /// Cubic curve: two control points followed by the end point.
    CurveTo(f64, f64, f64, f64, f64, f64),
    /// Rectangle given as `(x, y, width, height)`.
    Rect(f64, f64, f64, f64),
    /// Close the current subpath.
    Close,
}

/// An ordered list of path construction steps.
#[derive(Debug)]
pub struct Path {
    pub ops: Vec<PathOp>
}

impl Path {
    /// Creates an empty path.
    fn new() -> Path {
        Path { ops: Vec::new() }
    }

    /// Returns the current point, i.e. where the next segment would start.
    ///
    /// * After `MoveTo`, `LineTo` and `CurveTo` it is the end point of that op.
    /// * After `Rect` it is the rectangle's `(x, y)` corner, because a
    ///   rectangle is a closed subpath that starts and ends there.
    /// * After `Close` it is the starting point of the most recent subpath.
    ///
    /// # Panics
    /// Panics when the path is empty, or when it is closed without any
    /// preceding `MoveTo` or `Rect`.
    fn current_point(&self) -> (f64, f64) {
        match *self.ops.last().expect("path has no current point") {
            PathOp::MoveTo(x, y) | PathOp::LineTo(x, y) => (x, y),
            PathOp::CurveTo(_, _, _, _, x, y) => (x, y),
            PathOp::Rect(x, y, _, _) => (x, y),
            PathOp::Close => self
                .ops
                .iter()
                .rev()
                .find_map(|op| match *op {
                    PathOp::MoveTo(x, y) | PathOp::Rect(x, y, _, _) => Some((x, y)),
                    _ => None,
                })
                .expect("closed path has no starting point"),
        }
    }
}
1388
/// Parameters of a `CalGray` colour space, filled in by `make_colorspace`
/// from the colour space's dictionary.
#[derive(Clone, Debug)]
pub struct CalGray {
    /// Value of the `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point.
    black_point: Option<[f64; 3]>,
    /// Optional `Gamma` entry.
    gamma: Option<f64>,
}
1395
/// Parameters of a `CalRGB` colour space, filled in by `make_colorspace`
/// from the colour space's dictionary.
#[derive(Clone, Debug)]
pub struct CalRGB {
    /// Value of the `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point.
    black_point: Option<[f64; 3]>,
    /// Optional `Gamma` entry (three components).
    gamma: Option<[f64; 3]>,
    /// Optional `Matrix` entry.
    matrix: Option<Vec<f64>>
}
1403
/// Parameters of a `Lab` colour space, filled in by `make_colorspace` from
/// the colour space's dictionary.
#[derive(Clone, Debug)]
pub struct Lab {
    /// Value of the `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point.
    black_point: Option<[f64; 3]>,
    /// Optional `Range` entry (four values).
    range: Option<[f64; 4]>,
}
1410
/// Colour spaces that `make_colorspace` accepts as the alternate space of a
/// `Separation` colour space.
#[derive(Clone, Debug)]
pub enum AlternateColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    /// Holds the contents of the ICC profile stream.
    ICCBased(Vec<u8>)
}
1421
/// A `Separation` colour space as assembled by `make_colorspace` from the
/// four elements of the colour space array.
#[derive(Clone)]
pub struct Separation {
    /// Second array element (a name), converted with `pdf_to_utf8`.
    name: String,
    /// Third array element: the alternate colour space.
    alternate_space: AlternateColorSpace,
    /// Fourth array element, parsed with `Function::new`.
    tint_transform: Box<Function>,
}
1428
/// Colour spaces produced by `make_colorspace` and stored in the graphics
/// state for filling and stroking.
#[derive(Clone)]
pub enum ColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    /// Recognised by name only; no parameters are kept.
    DeviceN,
    /// Recognised by name only; no parameters are kept.
    Pattern,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    Separation(Separation),
    /// Holds the contents of the ICC profile stream.
    ICCBased(Vec<u8>)
}
1442
1443fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary) -> ColorSpace {
1444 match name {
1445 b"DeviceGray" => ColorSpace::DeviceGray,
1446 b"DeviceRGB" => ColorSpace::DeviceRGB,
1447 b"DeviceCMYK" => ColorSpace::DeviceCMYK,
1448 b"Pattern" => ColorSpace::Pattern,
1449 _ => {
1450 let colorspaces: &Dictionary = get(&doc, resources, b"ColorSpace");
1451 let cs: &Object = maybe_get_obj(doc, colorspaces, &name[..]).unwrap_or_else(|| panic!("missing colorspace {:?}", &name[..]));
1452 if let Ok(cs) = cs.as_array() {
1453 let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1454 match cs_name.as_ref() {
1455 "Separation" => {
1456 let name = pdf_to_utf8(cs[1].as_name().expect("second arg must be a name"));
1457 let alternate_space = match &maybe_deref(doc, &cs[2]) {
1458 Object::Name(name) => {
1459 match &name[..] {
1460 b"DeviceGray" => AlternateColorSpace::DeviceGray,
1461 b"DeviceRGB" => AlternateColorSpace::DeviceRGB,
1462 b"DeviceCMYK" => AlternateColorSpace::DeviceCMYK,
1463 _ => panic!("unexpected color space name")
1464 }
1465 }
1466 Object::Array(cs) => {
1467 let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1468 match cs_name.as_ref() {
1469 "ICCBased" => {
1470 let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1471 dlog!("ICCBased {:?}", stream);
1472 AlternateColorSpace::ICCBased(get_contents(stream))
1474 }
1475 "CalGray" => {
1476 let dict = cs[1].as_dict().expect("second arg must be a dict");
1477 AlternateColorSpace::CalGray(CalGray {
1478 white_point: get(&doc, dict, b"WhitePoint"),
1479 black_point: get(&doc, dict, b"BackPoint"),
1480 gamma: get(&doc, dict, b"Gamma"),
1481 })
1482 }
1483 "CalRGB" => {
1484 let dict = cs[1].as_dict().expect("second arg must be a dict");
1485 AlternateColorSpace::CalRGB(CalRGB {
1486 white_point: get(&doc, dict, b"WhitePoint"),
1487 black_point: get(&doc, dict, b"BackPoint"),
1488 gamma: get(&doc, dict, b"Gamma"),
1489 matrix: get(&doc, dict, b"Matrix"),
1490 })
1491 }
1492 "Lab" => {
1493 let dict = cs[1].as_dict().expect("second arg must be a dict");
1494 AlternateColorSpace::Lab(Lab {
1495 white_point: get(&doc, dict, b"WhitePoint"),
1496 black_point: get(&doc, dict, b"BackPoint"),
1497 range: get(&doc, dict, b"Range"),
1498 })
1499 }
1500 _ => panic!("Unexpected color space name")
1501 }
1502 }
1503 _ => panic!("Alternate space should be name or array {:?}", cs[2])
1504 };
1505 let tint_transform = Box::new(Function::new(doc, maybe_deref(doc, &cs[3])));
1506
1507 dlog!("{:?} {:?} {:?}", name, alternate_space, tint_transform);
1508 ColorSpace::Separation(Separation{ name, alternate_space, tint_transform})
1509 }
1510 "ICCBased" => {
1511 let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1512 dlog!("ICCBased {:?}", stream);
1513 ColorSpace::ICCBased(get_contents(stream))
1515 }
1516 "CalGray" => {
1517 let dict = cs[1].as_dict().expect("second arg must be a dict");
1518 ColorSpace::CalGray(CalGray {
1519 white_point: get(&doc, dict, b"WhitePoint"),
1520 black_point: get(&doc, dict, b"BackPoint"),
1521 gamma: get(&doc, dict, b"Gamma"),
1522 })
1523 }
1524 "CalRGB" => {
1525 let dict = cs[1].as_dict().expect("second arg must be a dict");
1526 ColorSpace::CalRGB(CalRGB {
1527 white_point: get(&doc, dict, b"WhitePoint"),
1528 black_point: get(&doc, dict, b"BackPoint"),
1529 gamma: get(&doc, dict, b"Gamma"),
1530 matrix: get(&doc, dict, b"Matrix"),
1531 })
1532 }
1533 "Lab" => {
1534 let dict = cs[1].as_dict().expect("second arg must be a dict");
1535 ColorSpace::Lab(Lab {
1536 white_point: get(&doc, dict, b"WhitePoint"),
1537 black_point: get(&doc, dict, b"BackPoint"),
1538 range: get(&doc, dict, b"Range"),
1539 })
1540 }
1541 "Pattern" => {
1542 ColorSpace::Pattern
1543 },
1544 "DeviceGray" => ColorSpace::DeviceGray,
1545 "DeviceRGB" => ColorSpace::DeviceRGB,
1546 "DeviceCMYK" => ColorSpace::DeviceCMYK,
1547 "DeviceN" => ColorSpace::DeviceN,
1548 _ => {
1549 panic!("color_space {:?} {:?} {:?}", name, cs_name, cs)
1550 }
1551 }
1552 } else if let Ok(cs) = cs.as_name() {
1553 match pdf_to_utf8(cs).as_ref() {
1554 "DeviceRGB" => ColorSpace::DeviceRGB,
1555 "DeviceGray" => ColorSpace::DeviceGray,
1556 _ => panic!()
1557 }
1558 } else {
1559 panic!();
1560 }
1561 }
1562 }
1563}
1564
/// Content stream interpreter. It holds no data of its own; the
/// `PhantomData` field only ties the type to the lifetime `'a`.
struct Processor<'a> {
    _none: PhantomData<&'a ()>
}
1568
1569impl<'a> Processor<'a> {
1570 fn new() -> Processor<'a> {
1571 Processor { _none: PhantomData }
1572 }
1573
1574 fn process_stream(&mut self, doc: &'a Document, content: Vec<u8>, resources: &'a Dictionary, media_box: &MediaBox, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
1575 let content = Content::decode(&content).unwrap();
1576 let mut font_table = HashMap::new();
1577 let mut gs: GraphicsState = GraphicsState {
1578 ts: TextState {
1579 font: None,
1580 font_size: std::f64::NAN,
1581 character_spacing: 0.,
1582 word_spacing: 0.,
1583 horizontal_scaling: 100. / 100.,
1584 leading: 0.,
1585 rise: 0.,
1586 tm: Transform2D::identity(),
1587 },
1588 fill_color: Vec::new(),
1589 fill_colorspace: ColorSpace::DeviceGray,
1590 stroke_color: Vec::new(),
1591 stroke_colorspace: ColorSpace::DeviceGray,
1592 line_width: 1.,
1593 ctm: Transform2D::identity(),
1594 smask: None
1595 };
1596 let mut gs_stack = Vec::new();
1598 let mut mc_stack = Vec::new();
1599 let mut tlm = Transform2D::identity();
1601 let mut path = Path::new();
1602 let flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1603 dlog!("MediaBox {:?}", media_box);
1604 for operation in &content.operations {
1605 match operation.operator.as_ref() {
1608 "BT" => {
1609 tlm = Transform2D::identity();
1610 gs.ts.tm = tlm;
1611 }
1612 "ET" => {
1613 tlm = Transform2D::identity();
1614 gs.ts.tm = tlm;
1615 }
1616 "cm" => {
1617 assert!(operation.operands.len() == 6);
1618 let m = Transform2D::row_major(as_num(&operation.operands[0]),
1619 as_num(&operation.operands[1]),
1620 as_num(&operation.operands[2]),
1621 as_num(&operation.operands[3]),
1622 as_num(&operation.operands[4]),
1623 as_num(&operation.operands[5]));
1624 gs.ctm = gs.ctm.pre_transform(&m);
1625 dlog!("matrix {:?}", gs.ctm);
1626 }
1627 "CS" => {
1628 let name = operation.operands[0].as_name().unwrap();
1629 gs.stroke_colorspace = make_colorspace(doc, name, resources);
1630 }
1631 "cs" => {
1632 let name = operation.operands[0].as_name().unwrap();
1633 gs.fill_colorspace = make_colorspace(doc, name, resources);
1634 }
1635 "SC" | "SCN" => {
1636 gs.stroke_color = match gs.stroke_colorspace {
1637 ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1638 _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1639 };
1640 }
1641 "sc" | "scn" => {
1642 gs.fill_color = match gs.fill_colorspace {
1643 ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1644 _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1645 };
1646 }
1647 "G" | "g" | "RG" | "rg" | "K" | "k" => {
1648 dlog!("unhandled color operation {:?}", operation);
1649 }
1650 "TJ" => {
1651 match operation.operands[0] {
1652 Object::Array(ref array) => {
1653 for e in array {
1654 match e {
1655 &Object::String(ref s, _) => {
1656 show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1657 }
1658 &Object::Integer(i) => {
1659 let ts = &mut gs.ts;
1660 let w0 = 0.;
1661 let tj = i as f64;
1662 let ty = 0.;
1663 let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1664 ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1665 dlog!("adjust text by: {} {:?}", i, ts.tm);
1666 }
1667 &Object::Real(i) => {
1668 let ts = &mut gs.ts;
1669 let w0 = 0.;
1670 let tj = i as f64;
1671 let ty = 0.;
1672 let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1673 ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1674 dlog!("adjust text by: {} {:?}", i, ts.tm);
1675 }
1676 _ => { dlog!("kind of {:?}", e); }
1677 }
1678 }
1679 }
1680 _ => {}
1681 }
1682 }
1683 "Tj" => {
1684 match operation.operands[0] {
1685 Object::String(ref s, _) => {
1686 show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1687 }
1688 _ => { panic!("unexpected Tj operand {:?}", operation) }
1689 }
1690 }
1691 "Tc" => {
1692 gs.ts.character_spacing = as_num(&operation.operands[0]);
1693 }
1694 "Tw" => {
1695 gs.ts.word_spacing = as_num(&operation.operands[0]);
1696 }
1697 "Tz" => {
1698 gs.ts.horizontal_scaling = as_num(&operation.operands[0]) / 100.;
1699 }
1700 "TL" => {
1701 gs.ts.leading = as_num(&operation.operands[0]);
1702 }
1703 "Tf" => {
1704 let fonts: &Dictionary = get(&doc, resources, b"Font");
1705 let name = operation.operands[0].as_name().unwrap();
1706 let font = font_table.entry(name.to_owned()).or_insert_with(|| make_font(doc, get::<&Dictionary>(doc, fonts, name))).clone();
1707 {
1708 }
1716 gs.ts.font = Some(font);
1717
1718 gs.ts.font_size = as_num(&operation.operands[1]);
1719 dlog!("font {} size: {} {:?}", pdf_to_utf8(name), gs.ts.font_size, operation);
1720 }
1721 "Ts" => {
1722 gs.ts.rise = as_num(&operation.operands[0]);
1723 }
1724 "Tm" => {
1725 assert!(operation.operands.len() == 6);
1726 tlm = Transform2D::row_major(as_num(&operation.operands[0]),
1727 as_num(&operation.operands[1]),
1728 as_num(&operation.operands[2]),
1729 as_num(&operation.operands[3]),
1730 as_num(&operation.operands[4]),
1731 as_num(&operation.operands[5]));
1732 gs.ts.tm = tlm;
1733 dlog!("Tm: matrix {:?}", gs.ts.tm);
1734 output.end_line()?;
1735 }
1736 "Td" => {
1737 assert!(operation.operands.len() == 2);
1742 let tx = as_num(&operation.operands[0]);
1743 let ty = as_num(&operation.operands[1]);
1744 dlog!("translation: {} {}", tx, ty);
1745
1746 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1747 gs.ts.tm = tlm;
1748 dlog!("Td matrix {:?}", gs.ts.tm);
1749 output.end_line()?;
1750 }
1751
1752 "TD" => {
1753 assert!(operation.operands.len() == 2);
1757 let tx = as_num(&operation.operands[0]);
1758 let ty = as_num(&operation.operands[1]);
1759 dlog!("translation: {} {}", tx, ty);
1760 gs.ts.leading = -ty;
1761
1762 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1763 gs.ts.tm = tlm;
1764 dlog!("TD matrix {:?}", gs.ts.tm);
1765 output.end_line()?;
1766 }
1767
1768 "T*" => {
1769 let tx = 0.0;
1770 let ty = -gs.ts.leading;
1771
1772 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1773 gs.ts.tm = tlm;
1774 dlog!("T* matrix {:?}", gs.ts.tm);
1775 output.end_line()?;
1776 }
1777 "q" => { gs_stack.push(gs.clone()); }
1778 "Q" => {
1779 let s = gs_stack.pop();
1780 if let Some(s) = s {
1781 gs = s;
1782 } else {
1783 println!("No state to pop");
1784 }
1785 }
1786 "gs" => {
1787 let ext_gstate: &Dictionary = get(doc, resources, b"ExtGState");
1788 let name = operation.operands[0].as_name().unwrap();
1789 let state: &Dictionary = get(doc, ext_gstate, name);
1790 apply_state(doc, &mut gs, state);
1791 }
1792 "i" => { dlog!("unhandled graphics state flattness operator {:?}", operation); }
1793 "w" => { gs.line_width = as_num(&operation.operands[0]); }
1794 "J" | "j" | "M" | "d" | "ri" => { dlog!("unknown graphics state operator {:?}", operation); }
1795 "m" => { path.ops.push(PathOp::MoveTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1796 "l" => { path.ops.push(PathOp::LineTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1797 "c" => {
1798 path.ops.push(PathOp::CurveTo(
1799 as_num(&operation.operands[0]),
1800 as_num(&operation.operands[1]),
1801 as_num(&operation.operands[2]),
1802 as_num(&operation.operands[3]),
1803 as_num(&operation.operands[4]),
1804 as_num(&operation.operands[5])))
1805 }
1806 "v" => {
1807 let (x, y) = path.current_point();
1808 path.ops.push(PathOp::CurveTo(
1809 x,
1810 y,
1811 as_num(&operation.operands[0]),
1812 as_num(&operation.operands[1]),
1813 as_num(&operation.operands[2]),
1814 as_num(&operation.operands[3])))
1815 }
1816 "y" => {
1817 path.ops.push(PathOp::CurveTo(
1818 as_num(&operation.operands[0]),
1819 as_num(&operation.operands[1]),
1820 as_num(&operation.operands[2]),
1821 as_num(&operation.operands[3]),
1822 as_num(&operation.operands[2]),
1823 as_num(&operation.operands[3])))
1824 }
1825 "h" => { path.ops.push(PathOp::Close) }
1826 "re" => {
1827 path.ops.push(PathOp::Rect(as_num(&operation.operands[0]),
1828 as_num(&operation.operands[1]),
1829 as_num(&operation.operands[2]),
1830 as_num(&operation.operands[3])))
1831 }
1832 "s" | "f*" | "B" | "B*" | "b" => {
1833 dlog!("unhandled path op {:?}", operation);
1834 }
1835 "S" => {
1836 output.stroke(&gs.ctm, &gs.stroke_colorspace, &gs.stroke_color, &path)?;
1837 path.ops.clear();
1838 }
1839 "F" | "f" => {
1840 output.fill(&gs.ctm, &gs.fill_colorspace, &gs.fill_color, &path)?;
1841 path.ops.clear();
1842 }
1843 "W" | "w*" => { dlog!("unhandled clipping operation {:?}", operation); }
1844 "n" => {
1845 dlog!("discard {:?}", path);
1846 path.ops.clear();
1847 }
1848 "BMC" | "BDC" => {
1849 mc_stack.push(operation);
1850 }
1851 "EMC" => {
1852 mc_stack.pop();
1853 }
1854 "Do" => {
1855 let xobject: &Dictionary = get(&doc, resources, b"XObject");
1858 let name = operation.operands[0].as_name().unwrap();
1859 let xf: &Stream = get(&doc, xobject, name);
1860 let resources = maybe_get_obj(&doc, &xf.dict, b"Resources").and_then(|n| n.as_dict().ok()).unwrap_or(resources);
1861 let contents = get_contents(xf);
1862 self.process_stream(&doc, contents, resources, &media_box, output, page_num)?;
1863 }
1864 _ => { dlog!("unknown operation {:?}", operation); }
1865
1866 }
1867 }
1868 Ok(())
1869 }
1870}
1871
1872
/// Sink for the events produced while a document's pages are interpreted.
pub trait OutputDev {
    /// Starts a page with the given number, media box and optional art box.
    fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>)-> Result<(), OutputError>;
    /// Ends the page started by `begin_page`.
    fn end_page(&mut self)-> Result<(), OutputError>;
    /// Reports one shown character. As called from `show_text`, `trm` is the
    /// rendering matrix, `width` the glyph width divided by 1000, `spacing`
    /// the character spacing (plus word spacing for a single-byte space),
    /// and `char` the decoded text of the character.
    fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>;
    /// Called by `show_text` before the characters of a shown string.
    fn begin_word(&mut self)-> Result<(), OutputError>;
    /// Called by `show_text` after the characters of a shown string.
    fn end_word(&mut self)-> Result<(), OutputError>;
    /// Called after the text positioning operators `Tm`, `Td`, `TD` and `T*`.
    fn end_line(&mut self)-> Result<(), OutputError>;
    /// Called for the `S` operator; does nothing by default.
    fn stroke(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
    /// Called for the `F` / `f` operators; does nothing by default.
    fn fill(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
}
1883
1884
/// `OutputDev` that writes absolutely positioned HTML `<div>` elements.
pub struct HTMLOutput<'a> {
    /// Destination of the generated HTML.
    file: &'a mut dyn std::io::Write,
    /// Vertical flip set up in `begin_page` from the media box height.
    flip_ctm: Transform,
    /// Matrix at which the next character is expected if it directly
    /// continues the previous one.
    last_ctm: Transform,
    /// Matrix of the first character of the buffered run.
    buf_ctm: Transform,
    /// Font size of the first character of the buffered run.
    buf_font_size: f64,
    /// Text of the run collected so far.
    buf: String
}
1893
/// Replaces the spaces that HTML would collapse with `&nbsp;` entities.
///
/// A space is kept as a plain space only when it directly follows a
/// non-space character and is directly followed by a non-space character.
/// Leading spaces, trailing spaces and every space in a run of several
/// spaces become `&nbsp;`.
fn insert_nbsp(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    let mut word_end = false;
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        if c == ' ' {
            let followed_by_text = chars.peek().map_or(false, |next| *next != ' ');
            if word_end && followed_by_text {
                result.push(' ');
            } else {
                result.push_str("&nbsp;");
            }
            word_end = false;
        } else {
            word_end = true;
            result.push(c);
        }
    }
    result
}
1913
1914impl<'a> HTMLOutput<'a> {
1915 pub fn new(file: &mut dyn std::io::Write) -> HTMLOutput {
1916 HTMLOutput {
1917 file,
1918 flip_ctm: Transform2D::identity(),
1919 last_ctm: Transform2D::identity(),
1920 buf_ctm: Transform2D::identity(),
1921 buf: String::new(),
1922 buf_font_size: 0.,
1923 }
1924 }
1925 fn flush_string(&mut self) -> Result<(), OutputError>{
1926 if self.buf.len() != 0 {
1927
1928 let position = self.buf_ctm.post_transform(&self.flip_ctm);
1929 let transformed_font_size_vec = self.buf_ctm.transform_vector(vec2(self.buf_font_size, self.buf_font_size));
1930 let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1932 let (x, y) = (position.m31, position.m32);
1933 println!("flush {} {:?}", self.buf, (x,y));
1934
1935 write!(self.file, "<div style='position: absolute; left: {}px; top: {}px; font-size: {}px'>{}</div>\n",
1936 x, y, transformed_font_size, insert_nbsp(&self.buf))?;
1937 }
1938 Ok(())
1939 }
1940}
1941
/// Art box as a tuple of four coordinates.
// NOTE(review): the SVG output computes width as `.2 - .0` and height as
// `.3 - .1` from a tuple of this shape, so the order is presumably
// (llx, lly, urx, ury) — confirm.
type ArtBox = (f64, f64, f64, f64);
1943
1944impl<'a> OutputDev for HTMLOutput<'a> {
1945 fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
1946 write!(self.file, "<meta charset='utf-8' /> ")?;
1947 write!(self.file, "<!-- page {} -->", page_num)?;
1948 write!(self.file, "<div id='page{}' style='position: relative; height: {}px; width: {}px; border: 1px black solid'>", page_num, media_box.ury - media_box.lly, media_box.urx - media_box.llx)?;
1949 self.flip_ctm = Transform::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1950 Ok(())
1951 }
1952 fn end_page(&mut self) -> Result<(), OutputError> {
1953 self.flush_string()?;
1954 self.buf = String::new();
1955 self.last_ctm = Transform::identity();
1956 write!(self.file, "</div>")?;
1957 Ok(())
1958 }
1959 fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>{
1960 if trm.approx_eq(&self.last_ctm) {
1961 let position = trm.post_transform(&self.flip_ctm);
1962 let (x, y) = (position.m31, position.m32);
1963
1964 println!("accum {} {:?}", char, (x,y));
1965 self.buf += char;
1966 } else {
1967 println!("flush {} {:?} {:?} {} {} {}", char, trm, self.last_ctm, width, font_size, spacing);
1968 self.flush_string()?;
1969 self.buf = char.to_owned();
1970 self.buf_font_size = font_size;
1971 self.buf_ctm = *trm;
1972 }
1973 let position = trm.post_transform(&self.flip_ctm);
1974 let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
1975 let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1977 let (x, y) = (position.m31, position.m32);
1978 write!(self.file, "<div style='position: absolute; color: red; left: {}px; top: {}px; font-size: {}px'>{}</div>",
1979 x, y, transformed_font_size, char)?;
1980 self.last_ctm = trm.pre_transform(&Transform2D::create_translation(width * font_size + spacing, 0.));
1981
1982 Ok(())
1983 }
1984 fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
1985 fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
1986 fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
1987}
1988
/// `OutputDev` that writes filled paths as an SVG document; characters are
/// ignored.
pub struct SVGOutput<'a> {
    /// Destination of the generated SVG.
    file: &'a mut dyn std::io::Write
}
impl<'a> SVGOutput<'a> {
    /// Creates an output that writes to `file`.
    pub fn new(file: &mut dyn std::io::Write) -> SVGOutput {
        SVGOutput{file}
    }
}
1997
1998impl<'a> OutputDev for SVGOutput<'a> {
1999 fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>) -> Result<(), OutputError> {
2000 let ver = 1.1;
2001 write!(self.file, "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n")?;
2002 if ver == 1.1 {
2003 write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">"#)?;
2004 } else {
2005 write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">"#)?;
2006 }
2007 if let Some(art_box) = art_box {
2008 let width = art_box.2 - art_box.0;
2009 let height = art_box.3 - art_box.1;
2010 let y = media_box.ury - art_box.1 - height;
2011 write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, art_box.0, y, width, height)?;
2012 } else {
2013 let width = media_box.urx - media_box.llx;
2014 let height = media_box.ury - media_box.lly;
2015 write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, media_box.llx, media_box.lly, width, height)?;
2016 }
2017 write!(self.file, "\n")?;
2018 type Mat = Transform;
2019
2020 let ctm = Mat::create_scale(1., -1.).post_translate(vec2(0., media_box.ury));
2021 write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>\n",
2022 ctm.m11,
2023 ctm.m12,
2024 ctm.m21,
2025 ctm.m22,
2026 ctm.m31,
2027 ctm.m32,
2028 )?;
2029 Ok(())
2030 }
2031 fn end_page(&mut self) -> Result<(), OutputError>{
2032 write!(self.file, "</g>\n")?;
2033 write!(self.file, "</svg>")?;
2034 Ok(())
2035 }
2036 fn output_character(&mut self, _trm: &Transform, _width: f64, _spacing: f64, _font_size: f64, _char: &str) -> Result<(), OutputError>{
2037 Ok(())
2038 }
2039 fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
2040 fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2041 fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
2042 fn fill(&mut self, ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], path: &Path) -> Result<(), OutputError>{
2043 write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>",
2044 ctm.m11,
2045 ctm.m12,
2046 ctm.m21,
2047 ctm.m22,
2048 ctm.m31,
2049 ctm.m32,
2050 )?;
2051
2052 let mut d = Vec::new();
2060 for op in &path.ops {
2061 match op {
2062 &PathOp::MoveTo(x, y) => { d.push(format!("M{} {}", x, y))}
2063 &PathOp::LineTo(x, y) => { d.push(format!("L{} {}", x, y))},
2064 &PathOp::CurveTo(x1, y1, x2, y2, x, y) => { d.push(format!("C{} {} {} {} {} {}", x1, y1, x2, y2, x, y))},
2065 &PathOp::Close => { d.push(format!("Z"))},
2066 &PathOp::Rect(x, y, width, height) => {
2067 d.push(format!("M{} {}", x, y));
2068 d.push(format!("L{} {}", x + width, y));
2069 d.push(format!("L{} {}", x + width, y + height));
2070 d.push(format!("L{} {}", x, y + height));
2071 d.push(format!("Z"));
2072 }
2073
2074 }
2075 }
2076 write!(self.file, "<path d='{}' />", d.join(" "))?;
2077 write!(self.file, "</g>")?;
2078 write!(self.file, "\n")?;
2079 Ok(())
2080 }
2081}
2082
/// Conversion of a value into something that implements `std::fmt::Write`.
/// `PlainTextOutput` uses it to accept different kinds of writers.
pub trait ConvertToFmt {
    /// The `std::fmt::Write` implementation produced by `convert`.
    type Writer: std::fmt::Write;
    /// Consumes `self` and returns the writer.
    fn convert(self) -> Self::Writer;
}
2093
/// A `&mut String` is used as the writer unchanged.
impl<'a> ConvertToFmt for &'a mut String {
    type Writer = &'a mut String;
    fn convert(self) -> Self::Writer {
        self
    }
}
2100
/// Wraps a value `f` so that, when `f` implements `std::io::Write`, the
/// wrapper can be used as a `std::fmt::Write`.
pub struct WriteAdapter<W> {
    f: W,
}

impl<W: std::io::Write> std::fmt::Write for WriteAdapter<W> {
    /// Writes all bytes of `s` to the wrapped writer. Any I/O error is
    /// reported as `fmt::Error`; its details are dropped.
    fn write_str(&mut self, s: &str) -> Result<(), std::fmt::Error> {
        match self.f.write_all(s.as_bytes()) {
            Ok(()) => Ok(()),
            Err(_) => Err(fmt::Error),
        }
    }
}
2110
/// A `&mut dyn std::io::Write` is wrapped in a `WriteAdapter`.
impl<'a> ConvertToFmt for &'a mut dyn std::io::Write {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
2117
/// A `&mut File` is wrapped in a `WriteAdapter`.
impl<'a> ConvertToFmt for &'a mut File {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
2124
/// `OutputDev` that writes the characters of a document as plain text,
/// inserting spaces and newlines based on character positions.
pub struct PlainTextOutput<W: ConvertToFmt> {
    /// Destination of the text.
    writer: W::Writer,
    /// Horizontal position at which the previous character ended.
    last_end: f64,
    /// Vertical position of the previous character.
    last_y: f64,
    /// Set by `begin_word`, cleared after the next character is written.
    first_char: bool,
    /// Vertical flip set up in `begin_page` from the media box height.
    flip_ctm: Transform,
}
2132
2133impl<W: ConvertToFmt> PlainTextOutput<W> {
2134 pub fn new(writer: W) -> PlainTextOutput<W> {
2135 PlainTextOutput{
2136 writer: writer.convert(),
2137 last_end: 100000.,
2138 first_char: false,
2139 last_y: 0.,
2140 flip_ctm: Transform2D::identity(),
2141 }
2142 }
2143}
2144
2145impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
2148 fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
2149 self.flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
2150 Ok(())
2151 }
2152 fn end_page(&mut self) -> Result<(), OutputError> {
2153 Ok(())
2154 }
2155 fn output_character(&mut self, trm: &Transform, width: f64, _spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError> {
2156 let position = trm.post_transform(&self.flip_ctm);
2157 let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
2158 let transformed_font_size = (transformed_font_size_vec.x*transformed_font_size_vec.y).sqrt();
2160 let (x, y) = (position.m31, position.m32);
2161 use std::fmt::Write;
2162 if self.first_char {
2164 if (y - self.last_y).abs() > transformed_font_size * 1.5 {
2165 write!(self.writer, "\n")?;
2166 }
2167
2168 if x < self.last_end && (y - self.last_y).abs() > transformed_font_size * 0.5 {
2170 write!(self.writer, "\n")?;
2171 }
2172
2173 if x > self.last_end + transformed_font_size * 0.1 {
2174 dlog!("width: {}, space: {}, thresh: {}", width, x - self.last_end, transformed_font_size * 0.1);
2175 write!(self.writer, " ")?;
2176 }
2177 }
2178 write!(self.writer, "{}", char)?;
2180 self.first_char = false;
2181 self.last_y = y;
2182 self.last_end = x + width * transformed_font_size;
2183 Ok(())
2184 }
2185 fn begin_word(&mut self) -> Result<(), OutputError> {
2186 self.first_char = true;
2187 Ok(())
2188 }
2189 fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2190 fn end_line(&mut self) -> Result<(), OutputError>{
2191 Ok(())
2193 }
2194}
2195
2196
2197pub fn print_metadata(doc: &Document) {
2198 dlog!("Version: {}", doc.version);
2199 if let Some(ref info) = get_info(&doc) {
2200 for (k, v) in *info {
2201 match v {
2202 &Object::String(ref s, StringFormat::Literal) => { dlog!("{}: {}", pdf_to_utf8(k), pdf_to_utf8(s)); }
2203 _ => {}
2204 }
2205 }
2206 }
2207 dlog!("Page count: {}", get::<i64>(&doc, &get_pages(&doc), b"Count"));
2208 dlog!("Pages: {:?}", get_pages(&doc));
2209 dlog!("Type: {:?}", get_pages(&doc).get(b"Type").and_then(|x| x.as_name()).unwrap());
2210}
2211
2212pub fn extract_text<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<String, OutputError> {
2214 let mut s = String::new();
2215 {
2216 let mut output = PlainTextOutput::new(&mut s);
2217 let mut doc = Document::load(path)?;
2218 maybe_decrypt(&mut doc)?;
2219 output_doc(&doc, &mut output)?;
2220 }
2221 Ok(s)
2222}
2223
2224fn maybe_decrypt(doc: &mut Document) -> Result<(), OutputError> {
2225 if ! doc.is_encrypted() {
2226 return Ok(());
2227 }
2228
2229 if let Err(e) = doc.decrypt("") {
2230 if let Error::Decryption(DecryptionError::IncorrectPassword) = e {
2231 eprintln!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted")
2232 }
2233
2234 return Err(OutputError::PdfError(e));
2235 }
2236
2237 Ok(())
2238}
2239
2240pub fn extract_text_encrypted<P: std::convert::AsRef<std::path::Path>, PW: AsRef<[u8]>>(
2241 path: P,
2242 password: PW,
2243) -> Result<String, OutputError> {
2244 let mut s = String::new();
2245 {
2246 let mut output = PlainTextOutput::new(&mut s);
2247 let mut doc = Document::load(path)?;
2248 output_doc_encrypted(&mut doc, &mut output, password)?;
2249 }
2250 Ok(s)
2251}
2252
2253pub fn extract_text_from_mem(buffer: &[u8]) -> Result<String, OutputError> {
2254 let mut s = String::new();
2255 {
2256 let mut output = PlainTextOutput::new(&mut s);
2257 let mut doc = Document::load_mem(buffer)?;
2258 maybe_decrypt(&mut doc)?;
2259 output_doc(&doc, &mut output)?;
2260 }
2261 Ok(s)
2262}
2263
2264pub fn extract_text_from_mem_encrypted<PW: AsRef<[u8]>>(
2265 buffer: &[u8],
2266 password: PW,
2267) -> Result<String, OutputError> {
2268 let mut s = String::new();
2269 {
2270 let mut output = PlainTextOutput::new(&mut s);
2271 let mut doc = Document::load_mem(buffer)?;
2272 output_doc_encrypted(&mut doc, &mut output, password)?;
2273 }
2274 Ok(s)
2275}
2276
2277
2278fn extract_text_by_page(doc: &Document, page_num: u32) -> Result<String, OutputError> {
2279 let mut s = String::new();
2280 {
2281 let mut output = PlainTextOutput::new(&mut s);
2282 output_doc_page(doc, &mut output, page_num)?;
2283 }
2284 Ok(s)
2285}
2286
2287pub fn extract_text_by_pages<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<Vec<String>, OutputError> {
2290 let mut v = Vec::new();
2291 {
2292 let mut doc = Document::load(path)?;
2293 maybe_decrypt(&mut doc)?;
2294 let mut page_num = 1;
2295 while let Ok(content) = extract_text_by_page(&doc, page_num) {
2296 v.push(content);
2297 page_num += 1;
2298 }
2299 }
2300 Ok(v)
2301}
2302
2303pub fn extract_text_by_pages_encrypted<P: std::convert::AsRef<std::path::Path>, PW: AsRef<[u8]>>(path: P, password: PW) -> Result<Vec<String>, OutputError> {
2304 let mut v = Vec::new();
2305 {
2306 let mut doc = Document::load(path)?;
2307 doc.decrypt(password)?;
2308 let mut page_num = 1;
2309 while let Ok(content) = extract_text_by_page(&mut doc, page_num) {
2310 v.push(content);
2311 page_num += 1;
2312 }
2313 }
2314 Ok(v)
2315}
2316
2317pub fn extract_text_from_mem_by_pages(buffer: &[u8]) -> Result<Vec<String>, OutputError> {
2318 let mut v = Vec::new();
2319 {
2320 let mut doc = Document::load_mem(buffer)?;
2321 maybe_decrypt(&mut doc)?;
2322 let mut page_num = 1;
2323 while let Ok(content) = extract_text_by_page(&doc, page_num) {
2324 v.push(content);
2325 page_num += 1;
2326 }
2327 }
2328 Ok(v)
2329}
2330
2331pub fn extract_text_from_mem_by_pages_encrypted<PW: AsRef<[u8]>>(buffer: &[u8], password: PW) -> Result<Vec<String>, OutputError> {
2332 let mut v = Vec::new();
2333 {
2334 let mut doc = Document::load_mem(buffer)?;
2335 doc.decrypt(password)?;
2336 let mut page_num = 1;
2337 while let Ok(content) = extract_text_by_page(&doc, page_num) {
2338 v.push(content);
2339 page_num += 1;
2340 }
2341 }
2342 Ok(v)
2343}
2344
2345
2346fn get_inherited<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
2347 let o: Option<T> = get(doc, dict, key);
2348 if let Some(o) = o {
2349 Some(o)
2350 } else {
2351 let parent = dict.get(b"Parent")
2352 .and_then(|parent| parent.as_reference())
2353 .and_then(|id| doc.get_dictionary(id)).ok()?;
2354 get_inherited(doc, parent, key)
2355 }
2356}
2357
2358pub fn output_doc_encrypted<PW: AsRef<[u8]>>(
2359 doc: &mut Document,
2360 output: &mut dyn OutputDev,
2361 password: PW,
2362) -> Result<(), OutputError> {
2363 doc.decrypt(password)?;
2364 output_doc(doc, output)
2365}
2366
2367pub fn output_doc(doc: &Document, output: &mut dyn OutputDev) -> Result<(), OutputError> {
2369 if doc.is_encrypted() {
2370 eprintln!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2371 }
2372 let empty_resources = Dictionary::new();
2373 let pages = doc.get_pages();
2374 let mut p = Processor::new();
2375 for dict in pages {
2376 let page_num = dict.0;
2377 let object_id = dict.1;
2378 output_doc_inner(page_num, object_id, doc, &mut p, output, &empty_resources)?;
2379 }
2380 Ok(())
2381}
2382
2383pub fn output_doc_page(doc: &Document, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
2384 if doc.is_encrypted() {
2385 eprintln!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2386 }
2387 let empty_resources = Dictionary::new();
2388 let pages = doc.get_pages();
2389 let object_id = pages.get(&page_num).ok_or(lopdf::Error::PageNumberNotFound(page_num))?;
2390 let mut p = Processor::new();
2391 output_doc_inner(page_num, *object_id, doc, &mut p, output, &empty_resources)?;
2392 Ok(())
2393}
2394
2395fn output_doc_inner<'a>(page_num: u32, object_id: ObjectId, doc: &'a Document, p: & mut Processor<'a>, output: &mut dyn OutputDev, empty_resources: &'a Dictionary) -> Result<(), OutputError> {
2396 let page_dict = doc.get_object(object_id).unwrap().as_dict().unwrap();
2397 dlog!("page {} {:?}", page_num, page_dict);
2398 let resources = get_inherited(doc, page_dict, b"Resources").unwrap_or(empty_resources);
2400 dlog!("resources {:?}", resources);
2401 let media_box: Vec<f64> = get_inherited(doc, page_dict, b"MediaBox").expect("MediaBox");
2403 let media_box = MediaBox { llx: media_box[0], lly: media_box[1], urx: media_box[2], ury: media_box[3] };
2404 let art_box = get::<Option<Vec<f64>>>(&doc, page_dict, b"ArtBox")
2405 .map(|x| (x[0], x[1], x[2], x[3]));
2406 output.begin_page(page_num, &media_box, art_box)?;
2407 p.process_stream(&doc, doc.get_page_content(object_id).unwrap(), resources, &media_box, output, page_num)?;
2408 output.end_page()?;
2409 Ok(())
2410}