1extern crate lopdf;
2
3use adobe_cmap_parser::{ByteMapping, CodeRange, CIDRange};
4use encoding_rs::UTF_16BE;
5use lopdf::content::Content;
6pub use lopdf::*;
7use euclid::*;
8use lopdf::encryption::DecryptionError;
9use std::fmt::{Debug, Formatter};
10extern crate encoding_rs;
11extern crate euclid;
12extern crate adobe_cmap_parser;
13extern crate type1_encoding_parser;
14extern crate unicode_normalization;
15use euclid::vec2;
16use unicode_normalization::UnicodeNormalization;
17use std::fmt;
18use std::str;
19use std::fs::File;
20use std::slice::Iter;
21use std::collections::HashMap;
22use std::collections::hash_map::Entry;
23use std::rc::Rc;
24use std::marker::PhantomData;
25use std::result::Result;
26use log::{warn, error, debug};
27mod core_fonts;
28mod glyphnames;
29mod zapfglyphnames;
30mod encodings;
31
32pub struct Space;
33pub type Transform = Transform2D<f64, Space, Space>;
34
35#[derive(Debug)]
36pub enum OutputError
37{
38 FormatError(std::fmt::Error),
39 IoError(std::io::Error),
40 PdfError(lopdf::Error)
41}
42
43impl std::fmt::Display for OutputError
44{
45 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
46 match self {
47 OutputError::FormatError(e) => write!(f, "Formating error: {}", e),
48 OutputError::IoError(e) => write!(f, "IO error: {}", e),
49 OutputError::PdfError(e) => write!(f, "PDF error: {}", e)
50 }
51 }
52}
53
54impl std::error::Error for OutputError{}
55
56impl From<std::fmt::Error> for OutputError {
57 fn from(e: std::fmt::Error) -> Self {
58 OutputError::FormatError(e)
59 }
60}
61
62impl From<std::io::Error> for OutputError {
63 fn from(e: std::io::Error) -> Self {
64 OutputError::IoError(e)
65 }
66}
67
68impl From<lopdf::Error> for OutputError {
69 fn from(e: lopdf::Error) -> Self {
70 OutputError::PdfError(e)
71 }
72}
73
/// Debug-log placeholder: evaluates each argument expression, discards the
/// value, and prints nothing. Arguments are bound to `_`, so they are not moved.
macro_rules! dlog {
    ($($arg:expr),*) => {{
        $(
            let _ = $arg;
        )*
    }};
}
78
79fn get_info(doc: &Document) -> Option<&Dictionary> {
80 match doc.trailer.get(b"Info") {
81 Ok(&Object::Reference(ref id)) => {
82 match doc.get_object(*id) {
83 Ok(&Object::Dictionary(ref info)) => { return Some(info); }
84 _ => {}
85 }
86 }
87 _ => {}
88 }
89 None
90}
91
92fn get_catalog(doc: &Document) -> &Dictionary {
93 match doc.trailer.get(b"Root").unwrap() {
94 &Object::Reference(ref id) => {
95 match doc.get_object(*id) {
96 Ok(&Object::Dictionary(ref catalog)) => { return catalog; }
97 _ => {}
98 }
99 }
100 _ => {}
101 }
102 panic!();
103}
104
105fn get_pages(doc: &Document) -> &Dictionary {
106 let catalog = get_catalog(doc);
107 match catalog.get(b"Pages").unwrap() {
108 &Object::Reference(ref id) => {
109 match doc.get_object(*id) {
110 Ok(&Object::Dictionary(ref pages)) => { return pages; }
111 other => {dlog!("pages: {:?}", other)}
112 }
113 }
114 other => { dlog!("pages: {:?}", other)}
115 }
116 dlog!("catalog {:?}", catalog);
117 panic!();
118}
119
/// Byte-to-UTF-16 lookup table with 256 entries, indexed by the byte value.
///
/// `pdf_to_utf8` uses it for strings that do not start with a UTF-16BE
/// byte-order mark. Positions 0x7f, 0x9f and 0xad hold 0x0000.
/// Laid out eight entries per row, so each row starts at a multiple of 8.
#[allow(non_upper_case_globals)]
const PDFDocEncoding: &[u16] = &[
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 0x00
    0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, // 0x08
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 0x10
    0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, // 0x18
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 0x20
    0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, // 0x28
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 0x30
    0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, // 0x38
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 0x40
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, // 0x48
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 0x50
    0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, // 0x58
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 0x60
    0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, // 0x68
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 0x70
    0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000, // 0x78
    0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, // 0x80
    0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018, // 0x88
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, // 0x90
    0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000, // 0x98
    0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // 0xa0
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af, // 0xa8
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // 0xb0
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // 0xb8
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // 0xc0
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // 0xc8
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // 0xd0
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, // 0xd8
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // 0xe0
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // 0xe8
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // 0xf0
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, // 0xf8
];
151
152fn pdf_to_utf8(s: &[u8]) -> String {
153 if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
154 return UTF_16BE.decode_without_bom_handling_and_without_replacement(&s[2..]).unwrap().to_string()
155 } else {
156 let r : Vec<u8> = s.iter().map(|x| *x).flat_map(|x| {
157 let k = PDFDocEncoding[x as usize];
158 vec![(k>>8) as u8, k as u8].into_iter()}).collect();
159 return UTF_16BE.decode_without_bom_handling_and_without_replacement(&r).unwrap().to_string()
160 }
161}
162
/// Decodes `s` into a Rust `String` using the given byte-to-UTF-16 table.
///
/// Input that starts with the UTF-16BE byte-order mark (`FE FF`) is decoded as
/// UTF-16BE and `encoding` is ignored; otherwise each byte is looked up in
/// `encoding`, which is indexed by byte value.
///
/// A string consisting only of the byte-order mark decodes to the empty
/// string. Malformed UTF-16 and bytes that fall outside `encoding` yield
/// U+FFFD instead of panicking.
fn to_utf8(encoding: &[u16], s: &[u8]) -> String {
    if s.starts_with(&[0xfe, 0xff]) {
        let pairs = s[2..].chunks_exact(2);
        // A leftover single byte cannot form a code unit.
        let dangling = !pairs.remainder().is_empty();
        let units: Vec<u16> = pairs.map(|p| u16::from_be_bytes([p[0], p[1]])).collect();
        let mut text = String::from_utf16_lossy(&units);
        if dangling {
            text.push('\u{fffd}');
        }
        text
    } else {
        let units: Vec<u16> = s
            .iter()
            .map(|&b| encoding.get(usize::from(b)).copied().unwrap_or(0xfffd))
            .collect();
        String::from_utf16_lossy(&units)
    }
}
173
174
175fn maybe_deref<'a>(doc: &'a Document, o: &'a Object) -> &'a Object {
176 match o {
177 &Object::Reference(r) => doc.get_object(r).expect("missing object reference"),
178 _ => o
179 }
180}
181
182fn maybe_get_obj<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Object> {
183 dict.get(key).map(|o| maybe_deref(doc, o)).ok()
184}
185
186trait FromOptObj<'a> {
188 fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self;
189}
190
191trait FromObj<'a> where Self: std::marker::Sized {
193 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self>;
194}
195
196impl<'a, T: FromObj<'a>> FromOptObj<'a> for Option<T> {
197 fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, _key: &[u8]) -> Self {
198 obj.and_then(|x| T::from_obj(doc,x))
199 }
200}
201
202impl<'a, T: FromObj<'a>> FromOptObj<'a> for T {
203 fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self {
204 T::from_obj(doc, obj.expect(&String::from_utf8_lossy(key))).expect("wrong type")
205 }
206}
207
208impl<'a, T: FromObj<'a>> FromObj<'a> for Vec<T> {
211 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
212 maybe_deref(doc, obj).as_array().map(|x| x.iter()
213 .map(|x| T::from_obj(doc, x).expect("wrong type"))
214 .collect()).ok()
215 }
216}
217
218impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 4] {
221 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
222 maybe_deref(doc, obj).as_array().map(|x| {
223 let mut all = x.iter()
224 .map(|x| T::from_obj(doc, x).expect("wrong type"));
225 [all.next().unwrap(), all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
226 }).ok()
227 }
228}
229
230impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 3] {
231 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
232 maybe_deref(doc, obj).as_array().map(|x| {
233 let mut all = x.iter()
234 .map(|x| T::from_obj(doc, x).expect("wrong type"));
235 [all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
236 }).ok()
237 }
238}
239
240impl<'a> FromObj<'a> for f64 {
241 fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
242 match obj {
243 &Object::Integer(i) => Some(i as f64),
244 &Object::Real(f) => Some(f.into()),
245 _ => None
246 }
247 }
248}
249
250impl<'a> FromObj<'a> for i64 {
251 fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
252 match obj {
253 &Object::Integer(i) => Some(i),
254 _ => None
255 }
256 }
257}
258
259impl<'a> FromObj<'a> for &'a Dictionary {
260 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
261 maybe_deref(doc, obj).as_dict().ok()
262 }
263}
264
265impl<'a> FromObj<'a> for &'a Stream {
266 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Stream> {
267 maybe_deref(doc, obj).as_stream().ok()
268 }
269}
270
271impl<'a> FromObj<'a> for &'a Object {
272 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
273 Some(maybe_deref(doc, obj))
274 }
275}
276
277fn get<'a, T: FromOptObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> T {
278 T::from_opt_obj(doc, dict.get(key).ok(), key)
279}
280
281fn maybe_get<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
282 maybe_get_obj(doc, dict, key).and_then(|o| T::from_obj(doc, o))
283}
284
285fn get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> String {
286 pdf_to_utf8(dict.get(key).map(|o| maybe_deref(doc, o)).unwrap_or_else(|_| panic!("deref")).as_name().expect("name"))
287}
288
289#[allow(dead_code)]
290fn maybe_get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<String> {
291 maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok()).map(|n| pdf_to_utf8(n))
292}
293
294fn maybe_get_name<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a [u8]> {
295 maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok())
296}
297
298fn maybe_get_array<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Vec<Object>> {
299 maybe_get_obj(doc, dict, key).and_then(|n| n.as_array().ok())
300}
301
302#[derive(Clone)]
303struct PdfSimpleFont<'a> {
304 font: &'a Dictionary,
305 doc: &'a Document,
306 encoding: Option<Vec<u16>>,
307 unicode_map: Option<HashMap<u32, String>>,
308 widths: HashMap<CharCode, f64>, missing_width: f64,
310}
311
312#[derive(Clone)]
313struct PdfType3Font<'a> {
314 font: &'a Dictionary,
315 doc: &'a Document,
316 encoding: Option<Vec<u16>>,
317 unicode_map: Option<HashMap<CharCode, String>>,
318 widths: HashMap<CharCode, f64>, }
320
321
322fn make_font<'a>(doc: &'a Document, font: &'a Dictionary) -> Rc<dyn PdfFont + 'a> {
323 let subtype = get_name_string(doc, font, b"Subtype");
324 dlog!("MakeFont({})", subtype);
325 if subtype == "Type0" {
326 Rc::new(PdfCIDFont::new(doc, font))
327 } else if subtype == "Type3" {
328 Rc::new(PdfType3Font::new(doc, font))
329 } else {
330 Rc::new(PdfSimpleFont::new(doc, font))
331 }
332}
333
/// Reports whether `name` is exactly one of the 14 core font names listed below.
/// The comparison is case-sensitive.
fn is_core_font(name: &str) -> bool {
    const CORE_FONTS: [&str; 14] = [
        "Courier-Bold",
        "Courier-BoldOblique",
        "Courier-Oblique",
        "Courier",
        "Helvetica-Bold",
        "Helvetica-BoldOblique",
        "Helvetica-Oblique",
        "Helvetica",
        "Symbol",
        "Times-Bold",
        "Times-BoldItalic",
        "Times-Italic",
        "Times-Roman",
        "ZapfDingbats",
    ];
    CORE_FONTS.contains(&name)
}
353
354fn encoding_to_unicode_table(name: &[u8]) -> Vec<u16> {
355 let encoding = match &name[..] {
356 b"MacRomanEncoding" => encodings::MAC_ROMAN_ENCODING,
357 b"MacExpertEncoding" => encodings::MAC_EXPERT_ENCODING,
358 b"WinAnsiEncoding" => encodings::WIN_ANSI_ENCODING,
359 _ => panic!("unexpected encoding {:?}", pdf_to_utf8(name))
360 };
361 let encoding_table = encoding.iter()
362 .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
363 .collect();
364 encoding_table
365}
366
367impl<'a> PdfSimpleFont<'a> {
374 fn new(doc: &'a Document, font: &'a Dictionary) -> PdfSimpleFont<'a> {
375 let base_name = get_name_string(doc, font, b"BaseFont");
376 let subtype = get_name_string(doc, font, b"Subtype");
377
378 let encoding: Option<&Object> = get(doc, font, b"Encoding");
379 dlog!("base_name {} {} enc:{:?} {:?}", base_name, subtype, encoding, font);
380 let descriptor: Option<&Dictionary> = get(doc, font, b"FontDescriptor");
381 let mut type1_encoding = None;
382 let mut unicode_map = None;
383 if let Some(descriptor) = descriptor {
384 dlog!("descriptor {:?}", descriptor);
385 if subtype == "Type1" {
386 let file = maybe_get_obj(doc, descriptor, b"FontFile");
387 match file {
388 Some(&Object::Stream(ref s)) => {
389 let s = get_contents(s);
390 type1_encoding = Some(type1_encoding_parser::get_encoding_map(&s).expect("encoding"));
392 }
393 _ => { dlog!("font file {:?}", file) }
394 }
395 } else if subtype == "TrueType" {
396 let file = maybe_get_obj(doc, descriptor, b"FontFile2");
397 match file {
398 Some(&Object::Stream(ref s)) => {
399 let _s = get_contents(s);
400 }
402 _ => { dlog!("font file {:?}", file) }
403 }
404 }
405
406 let font_file3 = get::<Option<&Object>>(doc, descriptor, b"FontFile3");
407 match font_file3 {
408 Some(&Object::Stream(ref s)) => {
409 let subtype = get_name_string(doc, &s.dict, b"Subtype");
410 dlog!("font file {}, {:?}", subtype, s);
411 let s = get_contents(s);
412 if subtype == "Type1C" {
413 let table = cff_parser::Table::parse(&s).unwrap();
414 let charset = table.charset.get_table();
415 let encoding = table.encoding.get_table();
416 let mut mapping = HashMap::new();
417 for i in 0..encoding.len().min(charset.len()) {
418 let cid = encoding[i];
419 let sid = charset[i];
420 let name = cff_parser::string_by_id(&table, sid).unwrap();
421 let unicode = glyphnames::name_to_unicode(&name).or_else(|| {
422 zapfglyphnames::zapfdigbats_names_to_unicode(name)
423 });
424 if let Some(unicode) = unicode {
425 let str = String::from_utf16(&[unicode]).unwrap();
426 mapping.insert(cid as u32, str);
427 }
428 }
429 unicode_map = Some(mapping);
430 }
433
434 }
437 None => {}
438 _ => { dlog!("unexpected") }
439 }
440
441 let charset = maybe_get_obj(doc, descriptor, b"CharSet");
442 let _charset = match charset {
443 Some(&Object::String(ref s, _)) => { Some(pdf_to_utf8(&s)) }
444 _ => { None }
445 };
446 }
448
449 let mut unicode_map = match unicode_map {
450 Some(mut unicode_map) => {
451 unicode_map.extend(get_unicode_map(doc, font).unwrap_or(HashMap::new()));
452 Some(unicode_map)
453 }
454 None => {
455 get_unicode_map(doc, font)
456 }
457 };
458
459
460 let mut encoding_table = None;
461 match encoding {
462 Some(&Object::Name(ref encoding_name)) => {
463 dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
464 encoding_table = Some(encoding_to_unicode_table(encoding_name));
465 }
466 Some(&Object::Dictionary(ref encoding)) => {
467 let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
469 dlog!("BaseEncoding {:?}", base_encoding);
470 encoding_to_unicode_table(base_encoding)
471 } else {
472 Vec::from(PDFDocEncoding)
473 };
474 let differences = maybe_get_array(doc, encoding, b"Differences");
475 if let Some(differences) = differences {
476 dlog!("Differences");
477 let mut code = 0;
478 for o in differences {
479 let o = maybe_deref(doc, o);
480 match o {
481 &Object::Integer(i) => { code = i; },
482 &Object::Name(ref n) => {
483 let name = pdf_to_utf8(&n);
484 let unicode = glyphnames::name_to_unicode(&name);
487 if let Some(unicode) = unicode{
488 table[code as usize] = unicode;
489 if let Some(ref mut unicode_map) = unicode_map {
490 let be = [unicode];
491 match unicode_map.entry(code as u32) {
492 Entry::Vacant(v) => { v.insert(String::from_utf16(&be).unwrap()); }
494 Entry::Occupied(e) => {
495 if e.get() != &String::from_utf16(&be).unwrap() {
496 let normal_match = e.get().nfkc().eq(String::from_utf16(&be).unwrap().nfkc());
497 if !normal_match {
498 warn!("Unicode mismatch {} {} {:?} {:?} {:?}", normal_match, name, e.get(), String::from_utf16(&be), be);
499 }
500 }
501 }
502 }
503 }
504 } else {
505 match unicode_map {
506 Some(ref mut unicode_map) if base_name.contains("FontAwesome") => {
507 match unicode_map.entry(code as u32) {
510 Entry::Vacant(v) => { v.insert("".to_owned()); }
511 Entry::Occupied(e) => {
512 panic!("unexpected entry in unicode map")
513 }
514 }
515 }
516 _ => {
517 warn!("unknown glyph name '{}' for font {}", name, base_name);
518 }
519 }
520 }
521 dlog!("{} = {} ({:?})", code, name, unicode);
522 if let Some(ref mut unicode_map) = unicode_map {
523 dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
528 }
529 code += 1;
530 }
531 _ => { panic!("wrong type {:?}", o); }
532 }
533 }
534 }
535 let name = encoding.get(b"Type").and_then(|x| x.as_name()).and_then(|x| Ok(pdf_to_utf8(x)));
537 dlog!("name: {}", name);
538
539 encoding_table = Some(table);
540 }
541 None => {
542 if let Some(type1_encoding) = type1_encoding {
543 let mut table = Vec::from(PDFDocEncoding);
544 dlog!("type1encoding");
545 for (code, name) in type1_encoding {
546 let unicode = glyphnames::name_to_unicode(&pdf_to_utf8(&name));
547 if let Some(unicode) = unicode {
548 table[code as usize] = unicode;
549 } else {
550 dlog!("unknown character {}", pdf_to_utf8(&name));
551 }
552 }
553 encoding_table = Some(table)
554 } else if subtype == "TrueType" {
555 encoding_table = Some(encodings::WIN_ANSI_ENCODING.iter()
556 .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
557 .collect());
558 }
559 }
560 _ => { panic!() }
561 }
562
563 let mut width_map = HashMap::new();
564 if let (Some(first_char), Some(last_char), Some(widths)) = (maybe_get::<i64>(doc, font, b"FirstChar"), maybe_get::<i64>(doc, font, b"LastChar"), maybe_get::<Vec<f64>>(doc, font, b"Widths")) {
573 let mut i: i64 = 0;
575 dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
576
577 for w in widths {
578 width_map.insert((first_char + i) as CharCode, w);
579 i += 1;
580 }
581 assert_eq!(first_char + i - 1, last_char);
582 } else {
583 let name = if is_core_font(&base_name) {
584 &base_name
585 } else {
586 warn!("no widths and not core font {:?}", base_name);
587
588 "Helvetica"
603 };
604 for font_metrics in core_fonts::metrics().iter() {
605 if font_metrics.0 == base_name {
606 if let Some(ref encoding) = encoding_table {
607 dlog!("has encoding");
608 for w in font_metrics.2 {
609 let c = glyphnames::name_to_unicode(w.2).unwrap();
610 for i in 0..encoding.len() {
611 if encoding[i] == c {
612 width_map.insert(i as CharCode, w.1 as f64);
613 }
614 }
615 }
616 } else {
617 let mut table = vec![0; 256];
622 for w in font_metrics.2 {
623 dlog!("{} {}", w.0, w.2);
624 if w.0 != -1 {
626 table[w.0 as usize] = if base_name == "ZapfDingbats" {
627 zapfglyphnames::zapfdigbats_names_to_unicode(w.2).unwrap_or_else(|| panic!("bad name {:?}", w))
628 } else {
629 glyphnames::name_to_unicode(w.2).unwrap()
630 }
631 }
632 }
633
634 let encoding = &table[..];
635 for w in font_metrics.2 {
636 width_map.insert(w.0 as CharCode, w.1 as f64);
637 }
639 encoding_table = Some(encoding.to_vec());
640 }
641 }
651 }
652 }
653
654 let missing_width = get::<Option<f64>>(doc, font, b"MissingWidth").unwrap_or(0.);
655 PdfSimpleFont {doc, font, widths: width_map, encoding: encoding_table, missing_width, unicode_map}
656 }
657
658 #[allow(dead_code)]
659 fn get_type(&self) -> String {
660 get_name_string(self.doc, self.font, b"Type")
661 }
662 #[allow(dead_code)]
663 fn get_basefont(&self) -> String {
664 get_name_string(self.doc, self.font, b"BaseFont")
665 }
666 #[allow(dead_code)]
667 fn get_subtype(&self) -> String {
668 get_name_string(self.doc, self.font, b"Subtype")
669 }
670 #[allow(dead_code)]
671 fn get_widths(&self) -> Option<&Vec<Object>> {
672 maybe_get_obj(self.doc, self.font, b"Widths").map(|widths| widths.as_array().expect("Widths should be an array"))
673 }
674 #[allow(dead_code)]
677 fn get_name(&self) -> Option<String> {
678 maybe_get_name_string(self.doc, self.font, b"Name")
679 }
680
681 #[allow(dead_code)]
682 fn get_descriptor(&self) -> Option<PdfFontDescriptor> {
683 maybe_get_obj(self.doc, self.font, b"FontDescriptor").and_then(|desc| desc.as_dict().ok()).map(|desc| PdfFontDescriptor{desc: desc, doc: self.doc})
684 }
685}
686
687
688
689impl<'a> PdfType3Font<'a> {
690 fn new(doc: &'a Document, font: &'a Dictionary) -> PdfType3Font<'a> {
691
692 let unicode_map = get_unicode_map(doc, font);
693 let encoding: Option<&Object> = get(doc, font, b"Encoding");
694
695 let encoding_table;
696 match encoding {
697 Some(&Object::Name(ref encoding_name)) => {
698 dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
699 encoding_table = Some(encoding_to_unicode_table(encoding_name));
700 }
701 Some(&Object::Dictionary(ref encoding)) => {
702 let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
704 dlog!("BaseEncoding {:?}", base_encoding);
705 encoding_to_unicode_table(base_encoding)
706 } else {
707 Vec::from(PDFDocEncoding)
708 };
709 let differences = maybe_get_array(doc, encoding, b"Differences");
710 if let Some(differences) = differences {
711 dlog!("Differences");
712 let mut code = 0;
713 for o in differences {
714 match o {
715 &Object::Integer(i) => { code = i; },
716 &Object::Name(ref n) => {
717 let name = pdf_to_utf8(&n);
718 let unicode = glyphnames::name_to_unicode(&name);
721 if let Some(unicode) = unicode{
722 table[code as usize] = unicode;
723 }
724 dlog!("{} = {} ({:?})", code, name, unicode);
725 if let Some(ref unicode_map) = unicode_map {
726 dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
727 }
728 code += 1;
729 }
730 _ => { panic!("wrong type"); }
731 }
732 }
733 }
734 let name_encoded = encoding.get(b"Type");
735 if let Ok(Object::Name(name)) = name_encoded {
736 dlog!("name: {}", pdf_to_utf8(name));
737 } else {
738 dlog!("name not found");
739 }
740
741 encoding_table = Some(table);
742 }
743 _ => { panic!() }
744 }
745
746 let first_char: i64 = get(doc, font, b"FirstChar");
747 let last_char: i64 = get(doc, font, b"LastChar");
748 let widths: Vec<f64> = get(doc, font, b"Widths");
749
750 let mut width_map = HashMap::new();
751
752 let mut i = 0;
753 dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
754
755 for w in widths {
756 width_map.insert((first_char + i) as CharCode, w);
757 i += 1;
758 }
759 assert_eq!(first_char + i - 1, last_char);
760 PdfType3Font {doc, font, widths: width_map, encoding: encoding_table, unicode_map}
761 }
762}
763
/// Character code as produced by `PdfFont::next_char` and consumed by
/// `PdfFont::get_width` / `PdfFont::decode_char`.
type CharCode = u32;
765
766struct PdfFontIter<'a>
767{
768 i: Iter<'a, u8>,
769 font: &'a dyn PdfFont,
770}
771
772impl<'a> Iterator for PdfFontIter<'a> {
773 type Item = (CharCode, u8);
774 fn next(&mut self) -> Option<(CharCode, u8)> {
775 self.font.next_char(&mut self.i)
776 }
777}
778
779trait PdfFont : Debug {
780 fn get_width(&self, id: CharCode) -> f64;
781 fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)>;
782 fn decode_char(&self, char: CharCode) -> String;
783
784 }
790
791impl<'a> dyn PdfFont + 'a {
792 fn char_codes(&'a self, chars: &'a [u8]) -> PdfFontIter {
793 PdfFontIter{i: chars.iter(), font: self}
794 }
795 fn decode(&self, chars: &[u8]) -> String {
796 let strings = self.char_codes(chars).map(|x| self.decode_char(x.0)).collect::<Vec<_>>();
797 strings.join("")
798 }
799
800}
801
802
803impl<'a> PdfFont for PdfSimpleFont<'a> {
804 fn get_width(&self, id: CharCode) -> f64 {
805 let width = self.widths.get(&id);
806 if let Some(width) = width {
807 return *width;
808 } else {
809 let mut widths = self.widths.iter().collect::<Vec<_>>();
810 widths.sort_by_key(|x| x.0);
811 dlog!("missing width for {} len(widths) = {}, {:?} falling back to missing_width {:?}", id, self.widths.len(), widths, self.font);
812 return self.missing_width;
813 }
814 }
815 fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
821 iter.next().map(|x| (*x as CharCode, 1))
822 }
823 fn decode_char(&self, char: CharCode) -> String {
824 let slice = [char as u8];
825 if let Some(ref unicode_map) = self.unicode_map {
826 let s = unicode_map.get(&char);
827 let s = match s {
828 None => {
829 debug!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
830 let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
833 let s = to_utf8(encoding, &slice);
834 debug!("falling back to encoding {} -> {:?}", char, s);
835 s
836 }
837 Some(s) => { s.clone() }
838 };
839 return s
840 }
841 let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
842 let s = to_utf8(encoding, &slice);
844 s
845 }
846}
847
848
849
850impl<'a> fmt::Debug for PdfSimpleFont<'a> {
851 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
852 self.font.fmt(f)
853 }
854}
855
856impl<'a> PdfFont for PdfType3Font<'a> {
857 fn get_width(&self, id: CharCode) -> f64 {
858 let width = self.widths.get(&id);
859 if let Some(width) = width {
860 return *width;
861 } else {
862 panic!("missing width for {} {:?}", id, self.font);
863 }
864 }
865 fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
871 iter.next().map(|x| (*x as CharCode, 1))
872 }
873 fn decode_char(&self, char: CharCode) -> String {
874 let slice = [char as u8];
875 if let Some(ref unicode_map) = self.unicode_map {
876 let s = unicode_map.get(&char);
877 let s = match s {
878 None => {
879 debug!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
880 let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
883 let s = to_utf8(encoding, &slice);
884 debug!("falling back to encoding {} -> {:?}", char, s);
885 s
886 }
887 Some(s) => { s.clone() }
888 };
889 return s
890 }
891 let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
892 let s = to_utf8(encoding, &slice);
894 s
895 }
896}
897
898
899
900impl<'a> fmt::Debug for PdfType3Font<'a> {
901 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
902 self.font.fmt(f)
903 }
904}
905
906struct PdfCIDFont<'a> {
907 font: &'a Dictionary,
908 #[allow(dead_code)]
909 doc: &'a Document,
910 #[allow(dead_code)]
911 encoding: ByteMapping,
912 to_unicode: Option<HashMap<u32, String>>,
913 widths: HashMap<CharCode, f64>, default_width: Option<f64>, }
916
917fn get_unicode_map<'a>(doc: &'a Document, font: &'a Dictionary) -> Option<HashMap<u32, String>> {
918 let to_unicode = maybe_get_obj(doc, font, b"ToUnicode");
919 dlog!("ToUnicode: {:?}", to_unicode);
920 let mut unicode_map = None;
921 match to_unicode {
922 Some(&Object::Stream(ref stream)) => {
923 let contents = get_contents(stream);
924 dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
925
926 let cmap = adobe_cmap_parser::get_unicode_map(&contents).unwrap();
927 let mut unicode = HashMap::new();
928 for (&k, v) in cmap.iter() {
932 let mut be: Vec<u16> = Vec::new();
933 let mut i = 0;
934 assert!(v.len() % 2 == 0);
935 while i < v.len() {
936 be.push(((v[i] as u16) << 8) | v[i+1] as u16);
937 i += 2;
938 }
939 match &be[..] {
940 [0xd800 ..= 0xdfff] => {
941 continue;
944 }
945 _ => {}
946 }
947 let s = String::from_utf16(&be).unwrap();
948
949 unicode.insert(k, s);
950 }
951 unicode_map = Some(unicode);
952
953 dlog!("map: {:?}", unicode_map);
954 }
955 None => { }
956 Some(&Object::Name(ref name)) => {
957 let name = pdf_to_utf8(name);
958 if name != "Identity-H" {
959 todo!("unsupported ToUnicode name: {:?}", name);
960 }
961 }
962 _ => { panic!("unsupported cmap {:?}", to_unicode)}
963 }
964 unicode_map
965}
966
967
968impl<'a> PdfCIDFont<'a> {
969 fn new(doc: &'a Document, font: &'a Dictionary) -> PdfCIDFont<'a> {
970 let base_name = get_name_string(doc, font, b"BaseFont");
971 let descendants = maybe_get_array(doc, font, b"DescendantFonts").expect("Descendant fonts required");
972 let ciddict = maybe_deref(doc, &descendants[0]).as_dict().expect("should be CID dict");
973 let encoding = maybe_get_obj(doc, font, b"Encoding").expect("Encoding required in type0 fonts");
974 dlog!("base_name {} {:?}", base_name, font);
975
976 let encoding = match encoding {
977 &Object::Name(ref name) => {
978 let name = pdf_to_utf8(name);
979 dlog!("encoding {:?}", name);
980 if name == "Identity-H" || name == "Identity-V" {
981 ByteMapping { codespace: vec![CodeRange{width: 2, start: 0, end: 0xffff }], cid: vec![CIDRange{ src_code_lo: 0, src_code_hi: 0xffff, dst_CID_lo: 0 }]}
982 } else {
983 panic!("unsupported encoding {}", name);
984 }
985 }
986 &Object::Stream(ref stream) => {
987 let contents = get_contents(stream);
988 dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
989 adobe_cmap_parser::get_byte_mapping(&contents).unwrap()
990 }
991 _ => { panic!("unsupported encoding {:?}", encoding)}
992 };
993
994 let unicode_map = get_unicode_map(doc, font);
1000
1001 dlog!("descendents {:?} {:?}", descendants, ciddict);
1002
1003 let font_dict = maybe_get_obj(doc, ciddict, b"FontDescriptor").expect("required");
1004 dlog!("{:?}", font_dict);
1005 let _f = font_dict.as_dict().expect("must be dict");
1006 let default_width = get::<Option<i64>>(doc, ciddict, b"DW").unwrap_or(1000);
1007 let w: Option<Vec<&Object>> = get(doc, ciddict, b"W");
1008 dlog!("widths {:?}", w);
1009 let mut widths = HashMap::new();
1010 let mut i = 0;
1011 if let Some(w) = w {
1012 while i < w.len() {
1013 if let &Object::Array(ref wa) = w[i+1] {
1014 let cid = w[i].as_i64().expect("id should be num");
1015 let mut j = 0;
1016 dlog!("wa: {:?} -> {:?}", cid, wa);
1017 for w in wa {
1018 widths.insert((cid + j) as CharCode, as_num(w) );
1019 j += 1;
1020 }
1021 i += 2;
1022 } else {
1023 let c_first = w[i].as_i64().expect("first should be num");
1024 let c_last = w[i].as_i64().expect("last should be num");
1025 let c_width = as_num(&w[i]);
1026 for id in c_first..c_last {
1027 widths.insert(id as CharCode, c_width);
1028 }
1029 i += 3;
1030 }
1031 }
1032 }
1033 PdfCIDFont{doc, font, widths, to_unicode: unicode_map, encoding, default_width: Some(default_width as f64) }
1034 }
1035}
1036
1037impl<'a> PdfFont for PdfCIDFont<'a> {
1038 fn get_width(&self, id: CharCode) -> f64 {
1039 let width = self.widths.get(&id);
1040 if let Some(width) = width {
1041 dlog!("GetWidth {} -> {}", id, *width);
1042 return *width;
1043 } else {
1044 dlog!("missing width for {} falling back to default_width", id);
1045 return self.default_width.unwrap();
1046 }
1047 }fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
1058 let mut c = *iter.next()? as u32;
1059 let mut code = None;
1060 'outer: for width in 1..=4 {
1061 for range in &self.encoding.codespace {
1062 if c as u32 >= range.start && c as u32 <= range.end && range.width == width {
1063 code = Some((c as u32, width));
1064 break 'outer;
1065 }
1066 }
1067 let next = *iter.next()?;
1068 c = ((c as u32) << 8) | next as u32;
1069 }
1070 let code = code?;
1071 for range in &self.encoding.cid {
1072 if code.0 >= range.src_code_lo && code.0 <= range.src_code_hi {
1073 return Some((code.0 + range.dst_CID_lo, code.1 as u8));
1074 }
1075 }
1076 None
1077 }
1078 fn decode_char(&self, char: CharCode) -> String {
1079 let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
1080 if let Some(s) = s {
1081 s.clone()
1082 } else {
1083 dlog!("Unknown character {:?} in {:?} {:?}", char, self.font, self.to_unicode);
1084 "".to_string()
1085 }
1086 }
1087}
1088
1089impl<'a> fmt::Debug for PdfCIDFont<'a> {
1090 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1091 self.font.fmt(f)
1092 }
1093}
1094
1095
1096
1097#[derive(Copy, Clone)]
1098struct PdfFontDescriptor<'a> {
1099 desc: &'a Dictionary,
1100 doc: &'a Document
1101}
1102
1103impl<'a> PdfFontDescriptor<'a> {
1104 #[allow(dead_code)]
1105 fn get_file(&self) -> Option<&'a Object> {
1106 maybe_get_obj(self.doc, self.desc, b"FontFile")
1107 }
1108}
1109
1110impl<'a> fmt::Debug for PdfFontDescriptor<'a> {
1111 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1112 self.desc.fmt(f)
1113 }
1114}
1115
/// Data of a `FunctionType` 0 function as collected by `Function::new`.
#[derive(Clone, Debug)]
struct Type0Func {
    /// Value of the `Domain` entry.
    domain: Vec<f64>,
    /// Value of the `Range` entry.
    range: Vec<f64>,
    /// Stream bytes as returned by `get_contents`.
    contents: Vec<u8>,
    /// Value of the `Size` entry.
    size: Vec<i64>,
    /// Value of the `BitsPerSample` entry.
    bits_per_sample: i64,
    /// Value of the `Encode` entry; when absent `Function::new` substitutes
    /// the pair `0, size - 1` for every element of `size`.
    encode: Vec<f64>,
    /// Value of the `Decode` entry; when absent a copy of `range` is used.
    decode: Vec<f64>,
}
1126
/// Linearly maps `x` from the interval `[x_min, x_max]` onto
/// `[y_min, y_max]`:
///
/// `y_min + (x - x_min) * (y_max - y_min) / (x_max - x_min)`
///
/// When the input interval is degenerate (`x_max == x_min`) there is nothing
/// to interpolate over and `y_min` is returned instead of dividing by zero.
#[allow(dead_code)]
fn interpolate(x: f64, x_min: f64, x_max: f64, y_min: f64, y_max: f64) -> f64 {
    // The slope is defined by the width of the *input interval*; dividing by
    // `x - x_min` would cancel the offset and always yield `y_max`.
    let span = x_max - x_min;
    if span != 0. {
        y_min + (x - x_min) * ((y_max - y_min) / span)
    } else {
        y_min
    }
}
1138
1139impl Type0Func {
1140 #[allow(dead_code)]
1141 fn eval(&self, _input: &[f64], _output: &mut [f64]) {
1142 let _n_inputs = self.domain.len() / 2;
1143 let _n_ouputs = self.range.len() / 2;
1144
1145 }
1146}
1147
/// Data of a `FunctionType` 2 function as collected by `Function::new`.
#[derive(Clone, Debug)]
struct Type2Func {
    /// Value of the optional `C0` entry.
    c0: Option<Vec<f64>>,
    /// Value of the optional `C1` entry.
    c1: Option<Vec<f64>>,
    /// Value of the `N` entry.
    n: f64,
}
1154
/// A PDF function object, discriminated by its `FunctionType` entry
/// (see `Function::new`).
#[derive(Clone, Debug)]
enum Function {
    /// `FunctionType` 0.
    Type0(Type0Func),
    /// `FunctionType` 2.
    Type2(Type2Func),
    /// `FunctionType` 3; no data is retained for it.
    #[allow(dead_code)]
    Type3,
    /// `FunctionType` 4; holds the stream contents unparsed.
    #[allow(dead_code)]
    Type4(Vec<u8>)
}
1164
1165impl Function {
1166 fn new(doc: &Document, obj: &Object) -> Function {
1167 let dict = match obj {
1168 &Object::Dictionary(ref dict) => dict,
1169 &Object::Stream(ref stream) => &stream.dict,
1170 _ => panic!()
1171 };
1172 let function_type: i64 = get(doc, dict, b"FunctionType");
1173 let f = match function_type {
1174 0 => {
1175 let stream = match obj {
1177 &Object::Stream(ref stream) => stream,
1178 _ => panic!()
1179 };
1180 let range: Vec<f64> = get(doc, dict, b"Range");
1181 let domain: Vec<f64> = get(doc, dict, b"Domain");
1182 let contents = get_contents(stream);
1183 let size: Vec<i64> = get(doc, dict, b"Size");
1184 let bits_per_sample = get(doc, dict, b"BitsPerSample");
1185 let encode = get::<Option<Vec<f64>>>(doc, dict, b"Encode");
1188 let encode = encode.unwrap_or_else(|| {
1190 let mut default = Vec::new();
1191 for i in &size {
1192 default.extend([0., (i - 1) as f64].iter());
1193 }
1194 default
1195 });
1196 let decode = get::<Option<Vec<f64>>>(doc, dict, b"Decode").unwrap_or_else(|| range.clone());
1197
1198 Function::Type0(Type0Func { domain, range, size, contents, bits_per_sample, encode, decode })
1199 }
1200 2 => {
1201 let c0 = get::<Option<Vec<f64>>>(doc, dict, b"C0");
1203 let c1 = get::<Option<Vec<f64>>>(doc, dict, b"C1");
1204 let n = get::<f64>(doc, dict, b"N");
1205 Function::Type2(Type2Func { c0, c1, n})
1206 }
1207 3 => {
1208 Function::Type3
1210 }
1211 4 => {
1212 let contents = match obj {
1214 &Object::Stream(ref stream) => {
1215 let contents = get_contents(stream);
1216 warn!("unhandled type-4 function");
1217 warn!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
1218 contents
1219 }
1220 _ => { panic!("type 4 functions should be streams") }
1221 };
1222 Function::Type4(contents)
1223 }
1224 _ => { panic!("unhandled function type {}", function_type) }
1225 };
1226 f
1227 }
1228}
1229
1230fn as_num(o: &Object) -> f64 {
1231 match o {
1232 &Object::Integer(i) => { i as f64 }
1233 &Object::Real(f) => { f.into() }
1234 _ => { panic!("not a number") }
1235 }
1236}
1237
/// Text-related part of the graphics state tracked while a content stream
/// is processed.
#[derive(Clone)]
struct TextState<'a>
{
    /// Font selected by the `Tf` operator; `None` until one has been set.
    font: Option<Rc<dyn PdfFont + 'a>>,
    /// Size operand of `Tf`; `process_stream` initialises it to NaN.
    font_size: f64,
    /// Set by the `Tc` operator.
    character_spacing: f64,
    /// Set by the `Tw` operator; `show_text` adds it to the spacing of a
    /// single-byte code 32.
    word_spacing: f64,
    /// Operand of the `Tz` operator divided by 100.
    horizontal_scaling: f64,
    /// Set by the `TL` operator (and to `-ty` by `TD`); `T*` translates the
    /// line matrix vertically by `-leading`.
    leading: f64,
    /// Set by the `Ts` operator.
    rise: f64,
    /// Current text matrix: reset by `BT`/`ET`, replaced by `Tm`, `Td`, `TD`
    /// and `T*`, and advanced by `show_text` and by `TJ` adjustments.
    tm: Transform,
}
1250
1251fn get_contents(contents: &Stream) -> Vec<u8> {
1253 if contents.filters().is_ok() {
1254 contents.decompressed_content().unwrap_or_else(|_|contents.content.clone())
1255 } else {
1256 contents.content.clone()
1257 }
1258}
1259
/// Graphics state tracked by `Processor::process_stream`; a copy is pushed
/// by the `q` operator and restored by `Q`.
#[derive(Clone)]
struct GraphicsState<'a>
{
    /// Current transformation matrix, updated by the `cm` operator.
    ctm: Transform,
    /// Text state (font, spacing, text matrix, ...).
    ts: TextState<'a>,
    /// Soft-mask dictionary taken from an `ExtGState` `SMask` entry; `None`
    /// when that entry is the name `None`.
    smask: Option<Dictionary>,
    /// Set by the `cs` operator.
    fill_colorspace: ColorSpace,
    /// Operands of the last `sc`/`scn`; left empty while the fill colour
    /// space is `Pattern`.
    fill_color: Vec<f64>,
    /// Set by the `CS` operator.
    stroke_colorspace: ColorSpace,
    /// Operands of the last `SC`/`SCN`; left empty while the stroke colour
    /// space is `Pattern`.
    stroke_color: Vec<f64>,
    /// Set by the `w` operator.
    line_width: f64,
}
1272
1273fn show_text(gs: &mut GraphicsState, s: &[u8],
1274 _tlm: &Transform,
1275 _flip_ctm: &Transform,
1276 output: &mut dyn OutputDev) -> Result<(), OutputError> {
1277 let ts = &mut gs.ts;
1278 let font = ts.font.as_ref().unwrap();
1279 dlog!("{:?}", font.decode(s));
1281 dlog!("{:?}", font.decode(s).as_bytes());
1282 dlog!("{:?}", s);
1283 output.begin_word()?;
1284
1285 for (c, length) in font.char_codes(s) {
1286 let tsm = Transform2D::row_major(ts.horizontal_scaling,
1288 0.,
1289 0.,
1290 1.0,
1291 0.,
1292 ts.rise);
1293 let trm = tsm.post_transform(&ts.tm.post_transform(&gs.ctm));
1295 let w0 = font.get_width(c) / 1000.;
1302
1303 let mut spacing = ts.character_spacing;
1304 let is_space = c == 32 && length == 1;
1309 if is_space { spacing += ts.word_spacing }
1310
1311 output.output_character(&trm, w0, spacing, ts.font_size, &font.decode_char(c))?;
1312 let tj = 0.;
1313 let ty = 0.;
1314 let tx = ts.horizontal_scaling * ((w0 - tj/1000.)* ts.font_size + spacing);
1315 dlog!("horizontal {} adjust {} {} {} {}", ts.horizontal_scaling, tx, w0, ts.font_size, spacing);
1316 ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1318 let _trm = ts.tm.pre_transform(&gs.ctm);
1319 }
1322 output.end_word()?;
1323 Ok(())
1324}
1325
/// Page rectangle described by four coordinates. The output devices derive
/// the page width as `urx - llx` and the page height as `ury - lly`.
#[derive(Debug, Clone, Copy)]
pub struct MediaBox {
    /// Smaller x coordinate (presumably the lower-left corner's x).
    pub llx: f64,
    /// Smaller y coordinate (presumably the lower-left corner's y).
    pub lly: f64,
    /// Larger x coordinate (presumably the upper-right corner's x).
    pub urx: f64,
    /// Larger y coordinate (presumably the upper-right corner's y).
    pub ury: f64
}
1333
1334fn apply_state(doc: &Document, gs: &mut GraphicsState, state: &Dictionary) {
1335 for (k, v) in state.iter() {
1336 let k : &[u8] = k.as_ref();
1337 match k {
1338 b"SMask" => { match maybe_deref(doc, v) {
1339 &Object::Name(ref name) => {
1340 if name == b"None" {
1341 gs.smask = None;
1342 } else {
1343 panic!("unexpected smask name")
1344 }
1345 }
1346 &Object::Dictionary(ref dict) => {
1347 gs.smask = Some(dict.clone());
1348 }
1349 _ => { panic!("unexpected smask type {:?}", v) }
1350 }}
1351 b"Type" => { match v {
1352 &Object::Name(ref name) => {
1353 assert_eq!(name, b"ExtGState")
1354 }
1355 _ => { panic!("unexpected type") }
1356 }}
1357 _ => { dlog!("unapplied state: {:?} {:?}", k, v); }
1358 }
1359 }
1360
1361}
1362
/// One path-construction step, in the coordinates given by the content
/// stream.
#[derive(Debug)]
pub enum PathOp {
    /// Start a new subpath at `(x, y)`.
    MoveTo(f64, f64),
    /// Straight line to `(x, y)`.
    LineTo(f64, f64),
    /// Cubic Bézier curve: two control points followed by the end point.
    CurveTo(f64, f64, f64, f64, f64, f64),
    /// Rectangle given as `x, y, width, height`.
    Rect(f64, f64, f64, f64),
    /// Close the current subpath.
    Close,
}

/// A sequence of path-construction steps.
#[derive(Debug)]
pub struct Path {
    pub ops: Vec<PathOp>
}

impl Path {
    /// Creates an empty path.
    fn new() -> Path {
        Path { ops: Vec::new() }
    }

    /// Returns the current point, i.e. where the last operation left the pen.
    ///
    /// * after a move, line or curve: that operation's end point;
    /// * after a rectangle: its `(x, y)` corner, since a rectangle is a
    ///   closed subpath that starts there;
    /// * after a close: the starting point of the subpath being closed.
    ///
    /// # Panics
    ///
    /// Panics when the path is empty, or when a `Close` is not preceded by
    /// anything that starts a subpath; in both cases no current point exists.
    fn current_point(&self) -> (f64, f64) {
        let mut ops = self.ops.iter().rev();
        match *ops.next().expect("current_point called on an empty path") {
            PathOp::MoveTo(x, y)
            | PathOp::LineTo(x, y)
            | PathOp::CurveTo(_, _, _, _, x, y) => (x, y),
            PathOp::Rect(x, y, _, _) => (x, y),
            // Walk backwards to whatever opened the subpath being closed.
            PathOp::Close => ops
                .find_map(|op| match *op {
                    PathOp::MoveTo(x, y) | PathOp::Rect(x, y, _, _) => Some((x, y)),
                    _ => None,
                })
                .expect("closed subpath has no starting point"),
        }
    }
}
1391
/// Parameters of a `CalGray` colour space as read by `make_colorspace`.
#[derive(Clone, Debug)]
pub struct CalGray {
    /// Value of the `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point.
    black_point: Option<[f64; 3]>,
    /// Value of the optional `Gamma` entry.
    gamma: Option<f64>,
}
1398
/// Parameters of a `CalRGB` colour space as read by `make_colorspace`.
#[derive(Clone, Debug)]
pub struct CalRGB {
    /// Value of the `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point.
    black_point: Option<[f64; 3]>,
    /// Value of the optional `Gamma` entry (one value per component).
    gamma: Option<[f64; 3]>,
    /// Value of the optional `Matrix` entry.
    matrix: Option<Vec<f64>>
}
1406
/// Parameters of a `Lab` colour space as read by `make_colorspace`.
#[derive(Clone, Debug)]
pub struct Lab {
    /// Value of the `WhitePoint` entry.
    white_point: [f64; 3],
    /// Optional black point.
    black_point: Option<[f64; 3]>,
    /// Value of the optional `Range` entry.
    range: Option<[f64; 4]>,
}
1413
/// Alternate colour space of a `Separation` colour space, as parsed by
/// `make_colorspace` from the third element of the colour space array.
#[derive(Clone, Debug)]
pub enum AlternateColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    /// Contents of the ICC profile stream.
    ICCBased(Vec<u8>)
}
1424
/// A `Separation` colour space as parsed by `make_colorspace` from a
/// four-element colour space array.
#[derive(Clone)]
pub struct Separation {
    /// Second array element: the colorant name.
    name: String,
    /// Third array element: the alternate colour space.
    alternate_space: AlternateColorSpace,
    /// Fourth array element, parsed with `Function::new`.
    tint_transform: Box<Function>,
}
1431
/// Colour space selected by the `cs` / `CS` operators, as resolved by
/// `make_colorspace`.
#[derive(Clone)]
pub enum ColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    /// Recognised by name only; no parameters are retained.
    DeviceN,
    /// Recognised by name only; no parameters are retained.
    Pattern,
    CalRGB(CalRGB),
    CalGray(CalGray),
    Lab(Lab),
    Separation(Separation),
    /// Contents of the ICC profile stream.
    ICCBased(Vec<u8>)
}
1445
1446fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary) -> ColorSpace {
1447 match name {
1448 b"DeviceGray" => ColorSpace::DeviceGray,
1449 b"DeviceRGB" => ColorSpace::DeviceRGB,
1450 b"DeviceCMYK" => ColorSpace::DeviceCMYK,
1451 b"Pattern" => ColorSpace::Pattern,
1452 _ => {
1453 let colorspaces: &Dictionary = get(&doc, resources, b"ColorSpace");
1454 let cs: &Object = maybe_get_obj(doc, colorspaces, &name[..]).unwrap_or_else(|| panic!("missing colorspace {:?}", &name[..]));
1455 if let Ok(cs) = cs.as_array() {
1456 let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1457 match cs_name.as_ref() {
1458 "Separation" => {
1459 let name = pdf_to_utf8(cs[1].as_name().expect("second arg must be a name"));
1460 let alternate_space = match &maybe_deref(doc, &cs[2]) {
1461 Object::Name(name) => {
1462 match &name[..] {
1463 b"DeviceGray" => AlternateColorSpace::DeviceGray,
1464 b"DeviceRGB" => AlternateColorSpace::DeviceRGB,
1465 b"DeviceCMYK" => AlternateColorSpace::DeviceCMYK,
1466 _ => panic!("unexpected color space name")
1467 }
1468 }
1469 Object::Array(cs) => {
1470 let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1471 match cs_name.as_ref() {
1472 "ICCBased" => {
1473 let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1474 dlog!("ICCBased {:?}", stream);
1475 AlternateColorSpace::ICCBased(get_contents(stream))
1477 }
1478 "CalGray" => {
1479 let dict = cs[1].as_dict().expect("second arg must be a dict");
1480 AlternateColorSpace::CalGray(CalGray {
1481 white_point: get(&doc, dict, b"WhitePoint"),
1482 black_point: get(&doc, dict, b"BackPoint"),
1483 gamma: get(&doc, dict, b"Gamma"),
1484 })
1485 }
1486 "CalRGB" => {
1487 let dict = cs[1].as_dict().expect("second arg must be a dict");
1488 AlternateColorSpace::CalRGB(CalRGB {
1489 white_point: get(&doc, dict, b"WhitePoint"),
1490 black_point: get(&doc, dict, b"BackPoint"),
1491 gamma: get(&doc, dict, b"Gamma"),
1492 matrix: get(&doc, dict, b"Matrix"),
1493 })
1494 }
1495 "Lab" => {
1496 let dict = cs[1].as_dict().expect("second arg must be a dict");
1497 AlternateColorSpace::Lab(Lab {
1498 white_point: get(&doc, dict, b"WhitePoint"),
1499 black_point: get(&doc, dict, b"BackPoint"),
1500 range: get(&doc, dict, b"Range"),
1501 })
1502 }
1503 _ => panic!("Unexpected color space name")
1504 }
1505 }
1506 _ => panic!("Alternate space should be name or array {:?}", cs[2])
1507 };
1508 let tint_transform = Box::new(Function::new(doc, maybe_deref(doc, &cs[3])));
1509
1510 dlog!("{:?} {:?} {:?}", name, alternate_space, tint_transform);
1511 ColorSpace::Separation(Separation{ name, alternate_space, tint_transform})
1512 }
1513 "ICCBased" => {
1514 let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1515 dlog!("ICCBased {:?}", stream);
1516 ColorSpace::ICCBased(get_contents(stream))
1518 }
1519 "CalGray" => {
1520 let dict = cs[1].as_dict().expect("second arg must be a dict");
1521 ColorSpace::CalGray(CalGray {
1522 white_point: get(&doc, dict, b"WhitePoint"),
1523 black_point: get(&doc, dict, b"BackPoint"),
1524 gamma: get(&doc, dict, b"Gamma"),
1525 })
1526 }
1527 "CalRGB" => {
1528 let dict = cs[1].as_dict().expect("second arg must be a dict");
1529 ColorSpace::CalRGB(CalRGB {
1530 white_point: get(&doc, dict, b"WhitePoint"),
1531 black_point: get(&doc, dict, b"BackPoint"),
1532 gamma: get(&doc, dict, b"Gamma"),
1533 matrix: get(&doc, dict, b"Matrix"),
1534 })
1535 }
1536 "Lab" => {
1537 let dict = cs[1].as_dict().expect("second arg must be a dict");
1538 ColorSpace::Lab(Lab {
1539 white_point: get(&doc, dict, b"WhitePoint"),
1540 black_point: get(&doc, dict, b"BackPoint"),
1541 range: get(&doc, dict, b"Range"),
1542 })
1543 }
1544 "Pattern" => {
1545 ColorSpace::Pattern
1546 },
1547 "DeviceGray" => ColorSpace::DeviceGray,
1548 "DeviceRGB" => ColorSpace::DeviceRGB,
1549 "DeviceCMYK" => ColorSpace::DeviceCMYK,
1550 "DeviceN" => ColorSpace::DeviceN,
1551 _ => {
1552 panic!("color_space {:?} {:?} {:?}", name, cs_name, cs)
1553 }
1554 }
1555 } else if let Ok(cs) = cs.as_name() {
1556 match pdf_to_utf8(cs).as_ref() {
1557 "DeviceRGB" => ColorSpace::DeviceRGB,
1558 "DeviceGray" => ColorSpace::DeviceGray,
1559 _ => panic!()
1560 }
1561 } else {
1562 panic!();
1563 }
1564 }
1565 }
1566}
1567
/// Content-stream interpreter. It stores no data; the `PhantomData` field
/// only carries the `'a` lifetime used by `process_stream`.
struct Processor<'a> {
    _none: PhantomData<&'a ()>
}
1571
1572impl<'a> Processor<'a> {
1573 fn new() -> Processor<'a> {
1574 Processor { _none: PhantomData }
1575 }
1576
1577 fn process_stream(&mut self, doc: &'a Document, content: Vec<u8>, resources: &'a Dictionary, media_box: &MediaBox, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
1578 let content = Content::decode(&content).unwrap();
1579 let mut font_table = HashMap::new();
1580 let mut gs: GraphicsState = GraphicsState {
1581 ts: TextState {
1582 font: None,
1583 font_size: std::f64::NAN,
1584 character_spacing: 0.,
1585 word_spacing: 0.,
1586 horizontal_scaling: 100. / 100.,
1587 leading: 0.,
1588 rise: 0.,
1589 tm: Transform2D::identity(),
1590 },
1591 fill_color: Vec::new(),
1592 fill_colorspace: ColorSpace::DeviceGray,
1593 stroke_color: Vec::new(),
1594 stroke_colorspace: ColorSpace::DeviceGray,
1595 line_width: 1.,
1596 ctm: Transform2D::identity(),
1597 smask: None
1598 };
1599 let mut gs_stack = Vec::new();
1601 let mut mc_stack = Vec::new();
1602 let mut tlm = Transform2D::identity();
1604 let mut path = Path::new();
1605 let flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1606 dlog!("MediaBox {:?}", media_box);
1607 for operation in &content.operations {
1608 match operation.operator.as_ref() {
1611 "BT" => {
1612 tlm = Transform2D::identity();
1613 gs.ts.tm = tlm;
1614 }
1615 "ET" => {
1616 tlm = Transform2D::identity();
1617 gs.ts.tm = tlm;
1618 }
1619 "cm" => {
1620 assert!(operation.operands.len() == 6);
1621 let m = Transform2D::row_major(as_num(&operation.operands[0]),
1622 as_num(&operation.operands[1]),
1623 as_num(&operation.operands[2]),
1624 as_num(&operation.operands[3]),
1625 as_num(&operation.operands[4]),
1626 as_num(&operation.operands[5]));
1627 gs.ctm = gs.ctm.pre_transform(&m);
1628 dlog!("matrix {:?}", gs.ctm);
1629 }
1630 "CS" => {
1631 let name = operation.operands[0].as_name().unwrap();
1632 gs.stroke_colorspace = make_colorspace(doc, name, resources);
1633 }
1634 "cs" => {
1635 let name = operation.operands[0].as_name().unwrap();
1636 gs.fill_colorspace = make_colorspace(doc, name, resources);
1637 }
1638 "SC" | "SCN" => {
1639 gs.stroke_color = match gs.stroke_colorspace {
1640 ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1641 _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1642 };
1643 }
1644 "sc" | "scn" => {
1645 gs.fill_color = match gs.fill_colorspace {
1646 ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1647 _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1648 };
1649 }
1650 "G" | "g" | "RG" | "rg" | "K" | "k" => {
1651 dlog!("unhandled color operation {:?}", operation);
1652 }
1653 "TJ" => {
1654 match operation.operands[0] {
1655 Object::Array(ref array) => {
1656 for e in array {
1657 match e {
1658 &Object::String(ref s, _) => {
1659 show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1660 }
1661 &Object::Integer(i) => {
1662 let ts = &mut gs.ts;
1663 let w0 = 0.;
1664 let tj = i as f64;
1665 let ty = 0.;
1666 let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1667 ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1668 dlog!("adjust text by: {} {:?}", i, ts.tm);
1669 }
1670 &Object::Real(i) => {
1671 let ts = &mut gs.ts;
1672 let w0 = 0.;
1673 let tj = i as f64;
1674 let ty = 0.;
1675 let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1676 ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1677 dlog!("adjust text by: {} {:?}", i, ts.tm);
1678 }
1679 _ => { dlog!("kind of {:?}", e); }
1680 }
1681 }
1682 }
1683 _ => {}
1684 }
1685 }
1686 "Tj" => {
1687 match operation.operands[0] {
1688 Object::String(ref s, _) => {
1689 show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1690 }
1691 _ => { panic!("unexpected Tj operand {:?}", operation) }
1692 }
1693 }
1694 "Tc" => {
1695 gs.ts.character_spacing = as_num(&operation.operands[0]);
1696 }
1697 "Tw" => {
1698 gs.ts.word_spacing = as_num(&operation.operands[0]);
1699 }
1700 "Tz" => {
1701 gs.ts.horizontal_scaling = as_num(&operation.operands[0]) / 100.;
1702 }
1703 "TL" => {
1704 gs.ts.leading = as_num(&operation.operands[0]);
1705 }
1706 "Tf" => {
1707 let fonts: &Dictionary = get(&doc, resources, b"Font");
1708 let name = operation.operands[0].as_name().unwrap();
1709 let font = font_table.entry(name.to_owned()).or_insert_with(|| make_font(doc, get::<&Dictionary>(doc, fonts, name))).clone();
1710 {
1711 }
1719 gs.ts.font = Some(font);
1720
1721 gs.ts.font_size = as_num(&operation.operands[1]);
1722 dlog!("font {} size: {} {:?}", pdf_to_utf8(name), gs.ts.font_size, operation);
1723 }
1724 "Ts" => {
1725 gs.ts.rise = as_num(&operation.operands[0]);
1726 }
1727 "Tm" => {
1728 assert!(operation.operands.len() == 6);
1729 tlm = Transform2D::row_major(as_num(&operation.operands[0]),
1730 as_num(&operation.operands[1]),
1731 as_num(&operation.operands[2]),
1732 as_num(&operation.operands[3]),
1733 as_num(&operation.operands[4]),
1734 as_num(&operation.operands[5]));
1735 gs.ts.tm = tlm;
1736 dlog!("Tm: matrix {:?}", gs.ts.tm);
1737 output.end_line()?;
1738 }
1739 "Td" => {
1740 assert!(operation.operands.len() == 2);
1745 let tx = as_num(&operation.operands[0]);
1746 let ty = as_num(&operation.operands[1]);
1747 dlog!("translation: {} {}", tx, ty);
1748
1749 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1750 gs.ts.tm = tlm;
1751 dlog!("Td matrix {:?}", gs.ts.tm);
1752 output.end_line()?;
1753 }
1754
1755 "TD" => {
1756 assert!(operation.operands.len() == 2);
1760 let tx = as_num(&operation.operands[0]);
1761 let ty = as_num(&operation.operands[1]);
1762 dlog!("translation: {} {}", tx, ty);
1763 gs.ts.leading = -ty;
1764
1765 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1766 gs.ts.tm = tlm;
1767 dlog!("TD matrix {:?}", gs.ts.tm);
1768 output.end_line()?;
1769 }
1770
1771 "T*" => {
1772 let tx = 0.0;
1773 let ty = -gs.ts.leading;
1774
1775 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1776 gs.ts.tm = tlm;
1777 dlog!("T* matrix {:?}", gs.ts.tm);
1778 output.end_line()?;
1779 }
1780 "q" => { gs_stack.push(gs.clone()); }
1781 "Q" => {
1782 let s = gs_stack.pop();
1783 if let Some(s) = s {
1784 gs = s;
1785 } else {
1786 warn!("No state to pop");
1787 }
1788 }
1789 "gs" => {
1790 let ext_gstate: &Dictionary = get(doc, resources, b"ExtGState");
1791 let name = operation.operands[0].as_name().unwrap();
1792 let state: &Dictionary = get(doc, ext_gstate, name);
1793 apply_state(doc, &mut gs, state);
1794 }
1795 "i" => { dlog!("unhandled graphics state flattness operator {:?}", operation); }
1796 "w" => { gs.line_width = as_num(&operation.operands[0]); }
1797 "J" | "j" | "M" | "d" | "ri" => { dlog!("unknown graphics state operator {:?}", operation); }
1798 "m" => { path.ops.push(PathOp::MoveTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1799 "l" => { path.ops.push(PathOp::LineTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1800 "c" => {
1801 path.ops.push(PathOp::CurveTo(
1802 as_num(&operation.operands[0]),
1803 as_num(&operation.operands[1]),
1804 as_num(&operation.operands[2]),
1805 as_num(&operation.operands[3]),
1806 as_num(&operation.operands[4]),
1807 as_num(&operation.operands[5])))
1808 }
1809 "v" => {
1810 let (x, y) = path.current_point();
1811 path.ops.push(PathOp::CurveTo(
1812 x,
1813 y,
1814 as_num(&operation.operands[0]),
1815 as_num(&operation.operands[1]),
1816 as_num(&operation.operands[2]),
1817 as_num(&operation.operands[3])))
1818 }
1819 "y" => {
1820 path.ops.push(PathOp::CurveTo(
1821 as_num(&operation.operands[0]),
1822 as_num(&operation.operands[1]),
1823 as_num(&operation.operands[2]),
1824 as_num(&operation.operands[3]),
1825 as_num(&operation.operands[2]),
1826 as_num(&operation.operands[3])))
1827 }
1828 "h" => { path.ops.push(PathOp::Close) }
1829 "re" => {
1830 path.ops.push(PathOp::Rect(as_num(&operation.operands[0]),
1831 as_num(&operation.operands[1]),
1832 as_num(&operation.operands[2]),
1833 as_num(&operation.operands[3])))
1834 }
1835 "s" | "f*" | "B" | "B*" | "b" => {
1836 dlog!("unhandled path op {:?}", operation);
1837 }
1838 "S" => {
1839 output.stroke(&gs.ctm, &gs.stroke_colorspace, &gs.stroke_color, &path)?;
1840 path.ops.clear();
1841 }
1842 "F" | "f" => {
1843 output.fill(&gs.ctm, &gs.fill_colorspace, &gs.fill_color, &path)?;
1844 path.ops.clear();
1845 }
1846 "W" | "w*" => { dlog!("unhandled clipping operation {:?}", operation); }
1847 "n" => {
1848 dlog!("discard {:?}", path);
1849 path.ops.clear();
1850 }
1851 "BMC" | "BDC" => {
1852 mc_stack.push(operation);
1853 }
1854 "EMC" => {
1855 mc_stack.pop();
1856 }
1857 "Do" => {
1858 let xobject: &Dictionary = get(&doc, resources, b"XObject");
1861 let name = operation.operands[0].as_name().unwrap();
1862 let xf: &Stream = get(&doc, xobject, name);
1863 let resources = maybe_get_obj(&doc, &xf.dict, b"Resources").and_then(|n| n.as_dict().ok()).unwrap_or(resources);
1864 let contents = get_contents(xf);
1865 self.process_stream(&doc, contents, resources, &media_box, output, page_num)?;
1866 }
1867 _ => { dlog!("unknown operation {:?}", operation); }
1868
1869 }
1870 }
1871 Ok(())
1872 }
1873}
1874
1875
/// Sink for the events produced while a document is processed.
pub trait OutputDev {
    /// Starts a page. (Callers are outside this part of the file; the
    /// implementations here use `media_box` for the page size and `art_box`,
    /// when present, for the SVG view box.)
    fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>)-> Result<(), OutputError>;
    /// Ends the page started by `begin_page`.
    fn end_page(&mut self)-> Result<(), OutputError>;
    /// Reports one decoded character. `show_text` passes the text rendering
    /// matrix, the glyph width divided by 1000, the character (plus word)
    /// spacing, the font size and the decoded text of the character.
    fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>;
    /// Called by `show_text` before the characters of a string are reported.
    fn begin_word(&mut self)-> Result<(), OutputError>;
    /// Called by `show_text` after the characters of a string were reported.
    fn end_word(&mut self)-> Result<(), OutputError>;
    /// Called after the text positioning operators `Tm`, `Td`, `TD` and `T*`.
    fn end_line(&mut self)-> Result<(), OutputError>;
    /// Called for the `S` operator; the default implementation does nothing.
    fn stroke(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
    /// Called for the `F` / `f` operators; the default implementation does nothing.
    fn fill(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
}
1886
1887
/// Output device that writes absolutely positioned HTML `div` elements.
pub struct HTMLOutput<'a> {
    /// Destination of the generated HTML.
    file: &'a mut dyn std::io::Write,
    /// Set by `begin_page` to a y-flip by the page height.
    flip_ctm: Transform,
    /// Matrix at which the next character must arrive to be appended to
    /// `buf`: the previous character's matrix translated by its advance.
    last_ctm: Transform,
    /// Matrix of the first character currently held in `buf`.
    buf_ctm: Transform,
    /// Font size of the first character currently held in `buf`.
    buf_font_size: f64,
    /// Text accumulated since the last flush.
    buf: String
}
1896
/// Replaces the spaces that HTML would collapse with `&nbsp;` entities.
///
/// A space is kept as a plain space only when it directly follows a
/// non-space character *and* is directly followed by a non-space character,
/// i.e. when it is a single space between two words. Leading spaces,
/// trailing spaces and every space in a run of several spaces become
/// `&nbsp;`. All other characters are copied unchanged.
fn insert_nbsp(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    let mut word_end = false;
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        if c != ' ' {
            word_end = true;
            result.push(c);
            continue;
        }
        let followed_by_text = chars.peek().map_or(false, |&next| next != ' ');
        if word_end && followed_by_text {
            result.push(' ');
        } else {
            // Spelled out as an entity so the no-break space is visible in
            // the source and cannot be confused with a plain space.
            result.push_str("&nbsp;");
        }
        word_end = false;
    }
    result
}
1916
1917impl<'a> HTMLOutput<'a> {
1918 pub fn new(file: &mut dyn std::io::Write) -> HTMLOutput {
1919 HTMLOutput {
1920 file,
1921 flip_ctm: Transform2D::identity(),
1922 last_ctm: Transform2D::identity(),
1923 buf_ctm: Transform2D::identity(),
1924 buf: String::new(),
1925 buf_font_size: 0.,
1926 }
1927 }
1928 fn flush_string(&mut self) -> Result<(), OutputError>{
1929 if self.buf.len() != 0 {
1930
1931 let position = self.buf_ctm.post_transform(&self.flip_ctm);
1932 let transformed_font_size_vec = self.buf_ctm.transform_vector(vec2(self.buf_font_size, self.buf_font_size));
1933 let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1935 let (x, y) = (position.m31, position.m32);
1936 warn!("flush {} {:?}", self.buf, (x,y));
1937
1938 write!(self.file, "<div style='position: absolute; left: {}px; top: {}px; font-size: {}px'>{}</div>\n",
1939 x, y, transformed_font_size, insert_nbsp(&self.buf))?;
1940 }
1941 Ok(())
1942 }
1943}
1944
/// Four coordinates of a box; `SVGOutput::begin_page` treats a tuple of this
/// shape as `(.0, .1)` = one corner and `(.2, .3)` = the opposite corner
/// (width is `.2 - .0`, height is `.3 - .1`).
type ArtBox = (f64, f64, f64, f64);
1946
1947impl<'a> OutputDev for HTMLOutput<'a> {
1948 fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
1949 write!(self.file, "<meta charset='utf-8' /> ")?;
1950 write!(self.file, "<!-- page {} -->", page_num)?;
1951 write!(self.file, "<div id='page{}' style='position: relative; height: {}px; width: {}px; border: 1px black solid'>", page_num, media_box.ury - media_box.lly, media_box.urx - media_box.llx)?;
1952 self.flip_ctm = Transform::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1953 Ok(())
1954 }
1955 fn end_page(&mut self) -> Result<(), OutputError> {
1956 self.flush_string()?;
1957 self.buf = String::new();
1958 self.last_ctm = Transform::identity();
1959 write!(self.file, "</div>")?;
1960 Ok(())
1961 }
1962 fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>{
1963 if trm.approx_eq(&self.last_ctm) {
1964 let position = trm.post_transform(&self.flip_ctm);
1965 let (x, y) = (position.m31, position.m32);
1966
1967 warn!("accum {} {:?}", char, (x,y));
1968 self.buf += char;
1969 } else {
1970 warn!("flush {} {:?} {:?} {} {} {}", char, trm, self.last_ctm, width, font_size, spacing);
1971 self.flush_string()?;
1972 self.buf = char.to_owned();
1973 self.buf_font_size = font_size;
1974 self.buf_ctm = *trm;
1975 }
1976 let position = trm.post_transform(&self.flip_ctm);
1977 let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
1978 let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1980 let (x, y) = (position.m31, position.m32);
1981 write!(self.file, "<div style='position: absolute; color: red; left: {}px; top: {}px; font-size: {}px'>{}</div>",
1982 x, y, transformed_font_size, char)?;
1983 self.last_ctm = trm.pre_transform(&Transform2D::create_translation(width * font_size + spacing, 0.));
1984
1985 Ok(())
1986 }
1987 fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
1988 fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
1989 fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
1990}
1991
/// Output device that writes filled paths as an SVG document; text is
/// ignored by its `OutputDev` implementation.
pub struct SVGOutput<'a> {
    /// Destination of the generated SVG.
    file: &'a mut dyn std::io::Write
}
impl<'a> SVGOutput<'a> {
    /// Creates an SVG output device writing to `file`.
    pub fn new(file: &mut dyn std::io::Write) -> SVGOutput {
        SVGOutput{file}
    }
}
2000
2001impl<'a> OutputDev for SVGOutput<'a> {
2002 fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>) -> Result<(), OutputError> {
2003 let ver = 1.1;
2004 write!(self.file, "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n")?;
2005 if ver == 1.1 {
2006 write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">"#)?;
2007 } else {
2008 write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">"#)?;
2009 }
2010 if let Some(art_box) = art_box {
2011 let width = art_box.2 - art_box.0;
2012 let height = art_box.3 - art_box.1;
2013 let y = media_box.ury - art_box.1 - height;
2014 write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, art_box.0, y, width, height)?;
2015 } else {
2016 let width = media_box.urx - media_box.llx;
2017 let height = media_box.ury - media_box.lly;
2018 write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, media_box.llx, media_box.lly, width, height)?;
2019 }
2020 write!(self.file, "\n")?;
2021 type Mat = Transform;
2022
2023 let ctm = Mat::create_scale(1., -1.).post_translate(vec2(0., media_box.ury));
2024 write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>\n",
2025 ctm.m11,
2026 ctm.m12,
2027 ctm.m21,
2028 ctm.m22,
2029 ctm.m31,
2030 ctm.m32,
2031 )?;
2032 Ok(())
2033 }
2034 fn end_page(&mut self) -> Result<(), OutputError>{
2035 write!(self.file, "</g>\n")?;
2036 write!(self.file, "</svg>")?;
2037 Ok(())
2038 }
2039 fn output_character(&mut self, _trm: &Transform, _width: f64, _spacing: f64, _font_size: f64, _char: &str) -> Result<(), OutputError>{
2040 Ok(())
2041 }
2042 fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
2043 fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2044 fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
2045 fn fill(&mut self, ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], path: &Path) -> Result<(), OutputError>{
2046 write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>",
2047 ctm.m11,
2048 ctm.m12,
2049 ctm.m21,
2050 ctm.m22,
2051 ctm.m31,
2052 ctm.m32,
2053 )?;
2054
2055 let mut d = Vec::new();
2063 for op in &path.ops {
2064 match op {
2065 &PathOp::MoveTo(x, y) => { d.push(format!("M{} {}", x, y))}
2066 &PathOp::LineTo(x, y) => { d.push(format!("L{} {}", x, y))},
2067 &PathOp::CurveTo(x1, y1, x2, y2, x, y) => { d.push(format!("C{} {} {} {} {} {}", x1, y1, x2, y2, x, y))},
2068 &PathOp::Close => { d.push(format!("Z"))},
2069 &PathOp::Rect(x, y, width, height) => {
2070 d.push(format!("M{} {}", x, y));
2071 d.push(format!("L{} {}", x + width, y));
2072 d.push(format!("L{} {}", x + width, y + height));
2073 d.push(format!("L{} {}", x, y + height));
2074 d.push(format!("Z"));
2075 }
2076
2077 }
2078 }
2079 write!(self.file, "<path d='{}' />", d.join(" "))?;
2080 write!(self.file, "</g>")?;
2081 write!(self.file, "\n")?;
2082 Ok(())
2083 }
2084}
2085
/// Conversion of a value into something that implements `std::fmt::Write`.
pub trait ConvertToFmt {
    /// The `std::fmt::Write` implementation produced by `convert`.
    type Writer: std::fmt::Write;
    /// Consumes `self` and returns the writer.
    fn convert(self) -> Self::Writer;
}
2096
/// A mutable `String` reference is returned unchanged and used as the writer.
impl<'a> ConvertToFmt for &'a mut String {
    type Writer = &'a mut String;
    fn convert(self) -> Self::Writer {
        self
    }
}
2103
/// Wraps a `std::io::Write` so it can be used where a `std::fmt::Write` is
/// expected.
pub struct WriteAdapter<W> {
    f: W,
}

impl<W: std::io::Write> std::fmt::Write for WriteAdapter<W> {
    /// Forwards the UTF-8 bytes of `s` to the wrapped writer; any I/O error
    /// is reported as the (detail-free) `std::fmt::Error`.
    fn write_str(&mut self, s: &str) -> Result<(), std::fmt::Error> {
        match self.f.write_all(s.as_bytes()) {
            Ok(()) => Ok(()),
            Err(_) => Err(std::fmt::Error),
        }
    }
}
2113
/// An I/O writer trait object is wrapped in a `WriteAdapter`.
impl<'a> ConvertToFmt for &'a mut dyn std::io::Write {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
2120
/// A mutable `File` reference is wrapped in a `WriteAdapter`.
impl<'a> ConvertToFmt for &'a mut File {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
2127
/// Output device that writes the extracted characters as plain text,
/// inserting spaces and line breaks based on character positions.
pub struct PlainTextOutput<W: ConvertToFmt> {
    /// Destination of the text.
    writer: W::Writer,
    /// x position at which the previously written character ended.
    last_end: f64,
    /// y position of the previously written character.
    last_y: f64,
    /// Set by `begin_word` and cleared after the next character is written.
    first_char: bool,
    /// Set by `begin_page` to a y-flip by the page height.
    flip_ctm: Transform,
}
2135
2136impl<W: ConvertToFmt> PlainTextOutput<W> {
2137 pub fn new(writer: W) -> PlainTextOutput<W> {
2138 PlainTextOutput{
2139 writer: writer.convert(),
2140 last_end: 100000.,
2141 first_char: false,
2142 last_y: 0.,
2143 flip_ctm: Transform2D::identity(),
2144 }
2145 }
2146}
2147
2148impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
2151 fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
2152 self.flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
2153 Ok(())
2154 }
2155 fn end_page(&mut self) -> Result<(), OutputError> {
2156 Ok(())
2157 }
2158 fn output_character(&mut self, trm: &Transform, width: f64, _spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError> {
2159 let position = trm.post_transform(&self.flip_ctm);
2160 let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
2161 let transformed_font_size = (transformed_font_size_vec.x*transformed_font_size_vec.y).sqrt();
2163 let (x, y) = (position.m31, position.m32);
2164 use std::fmt::Write;
2165 if self.first_char {
2167 if (y - self.last_y).abs() > transformed_font_size * 1.5 {
2168 write!(self.writer, "\n")?;
2169 }
2170
2171 if x < self.last_end && (y - self.last_y).abs() > transformed_font_size * 0.5 {
2173 write!(self.writer, "\n")?;
2174 }
2175
2176 if x > self.last_end + transformed_font_size * 0.1 {
2177 dlog!("width: {}, space: {}, thresh: {}", width, x - self.last_end, transformed_font_size * 0.1);
2178 write!(self.writer, " ")?;
2179 }
2180 }
2181 write!(self.writer, "{}", char)?;
2183 self.first_char = false;
2184 self.last_y = y;
2185 self.last_end = x + width * transformed_font_size;
2186 Ok(())
2187 }
2188 fn begin_word(&mut self) -> Result<(), OutputError> {
2189 self.first_char = true;
2190 Ok(())
2191 }
2192 fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2193 fn end_line(&mut self) -> Result<(), OutputError>{
2194 Ok(())
2196 }
2197}
2198
2199
2200pub fn print_metadata(doc: &Document) {
2201 dlog!("Version: {}", doc.version);
2202 if let Some(ref info) = get_info(&doc) {
2203 for (k, v) in *info {
2204 match v {
2205 &Object::String(ref s, StringFormat::Literal) => { dlog!("{}: {}", pdf_to_utf8(k), pdf_to_utf8(s)); }
2206 _ => {}
2207 }
2208 }
2209 }
2210 dlog!("Page count: {}", get::<i64>(&doc, &get_pages(&doc), b"Count"));
2211 dlog!("Pages: {:?}", get_pages(&doc));
2212 dlog!("Type: {:?}", get_pages(&doc).get(b"Type").and_then(|x| x.as_name()).unwrap());
2213}
2214
2215pub fn extract_text<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<String, OutputError> {
2217 let mut s = String::new();
2218 {
2219 let mut output = PlainTextOutput::new(&mut s);
2220 let mut doc = Document::load(path)?;
2221 maybe_decrypt(&mut doc)?;
2222 output_doc(&doc, &mut output)?;
2223 }
2224 Ok(s)
2225}
2226
2227fn maybe_decrypt(doc: &mut Document) -> Result<(), OutputError> {
2228 if ! doc.is_encrypted() {
2229 return Ok(());
2230 }
2231
2232 if let Err(e) = doc.decrypt("") {
2233 if let Error::Decryption(DecryptionError::IncorrectPassword) = e {
2234 error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted")
2235 }
2236
2237 return Err(OutputError::PdfError(e));
2238 }
2239
2240 Ok(())
2241}
2242
2243pub fn extract_text_encrypted<P: std::convert::AsRef<std::path::Path>>(
2244 path: P,
2245 password: &str,
2246) -> Result<String, OutputError> {
2247 let mut s = String::new();
2248 {
2249 let mut output = PlainTextOutput::new(&mut s);
2250 let mut doc = Document::load(path)?;
2251 output_doc_encrypted(&mut doc, &mut output, password)?;
2252 }
2253 Ok(s)
2254}
2255
2256pub fn extract_text_from_mem(buffer: &[u8]) -> Result<String, OutputError> {
2257 let mut s = String::new();
2258 {
2259 let mut output = PlainTextOutput::new(&mut s);
2260 let mut doc = Document::load_mem(buffer)?;
2261 maybe_decrypt(&mut doc)?;
2262 output_doc(&doc, &mut output)?;
2263 }
2264 Ok(s)
2265}
2266
2267pub fn extract_text_from_mem_encrypted(
2268 buffer: &[u8],
2269 password: &str,
2270) -> Result<String, OutputError> {
2271 let mut s = String::new();
2272 {
2273 let mut output = PlainTextOutput::new(&mut s);
2274 let mut doc = Document::load_mem(buffer)?;
2275 output_doc_encrypted(&mut doc, &mut output, password)?;
2276 }
2277 Ok(s)
2278}
2279
2280
2281fn extract_text_by_page(doc: &Document, page_num: u32) -> Result<String, OutputError> {
2282 let mut s = String::new();
2283 {
2284 let mut output = PlainTextOutput::new(&mut s);
2285 output_doc_page(doc, &mut output, page_num)?;
2286 }
2287 Ok(s)
2288}
2289
2290pub fn extract_text_by_pages<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<Vec<String>, OutputError> {
2293 let mut v = Vec::new();
2294 {
2295 let mut doc = Document::load(path)?;
2296 maybe_decrypt(&mut doc)?;
2297 let mut page_num = 1;
2298 while let Ok(content) = extract_text_by_page(&doc, page_num) {
2299 v.push(content);
2300 page_num += 1;
2301 }
2302 }
2303 Ok(v)
2304}
2305
2306pub fn extract_text_by_pages_encrypted<P: std::convert::AsRef<std::path::Path>>(path: P, password: &str) -> Result<Vec<String>, OutputError> {
2307 let mut v = Vec::new();
2308 {
2309 let mut doc = Document::load(path)?;
2310 doc.decrypt(password)?;
2311 let mut page_num = 1;
2312 while let Ok(content) = extract_text_by_page(&mut doc, page_num) {
2313 v.push(content);
2314 page_num += 1;
2315 }
2316 }
2317 Ok(v)
2318}
2319
2320pub fn extract_text_from_mem_by_pages(buffer: &[u8]) -> Result<Vec<String>, OutputError> {
2321 let mut v = Vec::new();
2322 {
2323 let mut doc = Document::load_mem(buffer)?;
2324 maybe_decrypt(&mut doc)?;
2325 let mut page_num = 1;
2326 while let Ok(content) = extract_text_by_page(&doc, page_num) {
2327 v.push(content);
2328 page_num += 1;
2329 }
2330 }
2331 Ok(v)
2332}
2333
2334pub fn extract_text_from_mem_by_pages_encrypted(buffer: &[u8], password: &str) -> Result<Vec<String>, OutputError> {
2335 let mut v = Vec::new();
2336 {
2337 let mut doc = Document::load_mem(buffer)?;
2338 doc.decrypt(password)?;
2339 let mut page_num = 1;
2340 while let Ok(content) = extract_text_by_page(&doc, page_num) {
2341 v.push(content);
2342 page_num += 1;
2343 }
2344 }
2345 Ok(v)
2346}
2347
2348
2349fn get_inherited<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
2350 let o: Option<T> = get(doc, dict, key);
2351 if let Some(o) = o {
2352 Some(o)
2353 } else {
2354 let parent = dict.get(b"Parent")
2355 .and_then(|parent| parent.as_reference())
2356 .and_then(|id| doc.get_dictionary(id)).ok()?;
2357 get_inherited(doc, parent, key)
2358 }
2359}
2360
2361pub fn output_doc_encrypted(
2362 doc: &mut Document,
2363 output: &mut dyn OutputDev,
2364 password: &str,
2365) -> Result<(), OutputError> {
2366 doc.decrypt(password)?;
2367 output_doc(doc, output)
2368}
2369
2370pub fn output_doc(doc: &Document, output: &mut dyn OutputDev) -> Result<(), OutputError> {
2372 if doc.is_encrypted() {
2373 error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2374 }
2375 let empty_resources = Dictionary::new();
2376 let pages = doc.get_pages();
2377 let mut p = Processor::new();
2378 for dict in pages {
2379 let page_num = dict.0;
2380 let object_id = dict.1;
2381 output_doc_inner(page_num, object_id, doc, &mut p, output, &empty_resources)?;
2382 }
2383 Ok(())
2384}
2385
2386pub fn output_doc_page(doc: &Document, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
2387 if doc.is_encrypted() {
2388 error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2389 }
2390 let empty_resources = Dictionary::new();
2391 let pages = doc.get_pages();
2392 let object_id = pages.get(&page_num).ok_or(lopdf::Error::PageNumberNotFound(page_num))?;
2393 let mut p = Processor::new();
2394 output_doc_inner(page_num, *object_id, doc, &mut p, output, &empty_resources)?;
2395 Ok(())
2396}
2397
2398fn output_doc_inner<'a>(page_num: u32, object_id: ObjectId, doc: &'a Document, p: & mut Processor<'a>, output: &mut dyn OutputDev, empty_resources: &'a Dictionary) -> Result<(), OutputError> {
2399 let page_dict = doc.get_object(object_id).unwrap().as_dict().unwrap();
2400 dlog!("page {} {:?}", page_num, page_dict);
2401 let resources = get_inherited(doc, page_dict, b"Resources").unwrap_or(empty_resources);
2403 dlog!("resources {:?}", resources);
2404 let media_box: Vec<f64> = get_inherited(doc, page_dict, b"MediaBox").expect("MediaBox");
2406 let media_box = MediaBox { llx: media_box[0], lly: media_box[1], urx: media_box[2], ury: media_box[3] };
2407 let art_box = get::<Option<Vec<f64>>>(&doc, page_dict, b"ArtBox")
2408 .map(|x| (x[0], x[1], x[2], x[3]));
2409 output.begin_page(page_num, &media_box, art_box)?;
2410 p.process_stream(&doc, doc.get_page_content(object_id).unwrap(), resources, &media_box, output, page_num)?;
2411 output.end_page()?;
2412 Ok(())
2413}