1extern crate lopdf;
2
3use adobe_cmap_parser::{ByteMapping, CodeRange, CIDRange};
4use encoding_rs::UTF_16BE;
5use lopdf::content::Content;
6pub use lopdf::*;
7use euclid::*;
8use lopdf::encryption::DecryptionError;
9use std::fmt::{Debug, Formatter};
10extern crate encoding_rs;
11extern crate euclid;
12extern crate adobe_cmap_parser;
13extern crate type1_encoding_parser;
14extern crate unicode_normalization;
15use euclid::vec2;
16use unicode_normalization::UnicodeNormalization;
17use std::fmt;
18use std::str;
19use std::fs::File;
20use std::slice::Iter;
21use std::collections::HashMap;
22use std::collections::hash_map::Entry;
23use std::rc::Rc;
24use std::marker::PhantomData;
25use std::result::Result;
26use log::{warn, error, debug};
27mod core_fonts;
28mod glyphnames;
29mod zapfglyphnames;
30mod encodings;
31
32pub struct Space;
33pub type Transform = Transform2D<f64, Space, Space>;
34
35#[derive(Debug)]
36pub enum OutputError
37{
38 FormatError(std::fmt::Error),
39 IoError(std::io::Error),
40 PdfError(lopdf::Error)
41}
42
43impl std::fmt::Display for OutputError
44{
45 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
46 match self {
47 OutputError::FormatError(e) => write!(f, "Formating error: {}", e),
48 OutputError::IoError(e) => write!(f, "IO error: {}", e),
49 OutputError::PdfError(e) => write!(f, "PDF error: {}", e)
50 }
51 }
52}
53
54impl std::error::Error for OutputError{}
55
56impl From<std::fmt::Error> for OutputError {
57 fn from(e: std::fmt::Error) -> Self {
58 OutputError::FormatError(e)
59 }
60}
61
62impl From<std::io::Error> for OutputError {
63 fn from(e: std::io::Error) -> Self {
64 OutputError::IoError(e)
65 }
66}
67
68impl From<lopdf::Error> for OutputError {
69 fn from(e: lopdf::Error) -> Self {
70 OutputError::PdfError(e)
71 }
72}
73
/// Disabled debug-logging stub: accepts comma-separated expressions (typically
/// a format string plus arguments) and expands to an empty block, so none of
/// the arguments are ever evaluated.
macro_rules! dlog {
    ($($arg:expr),*) => {{}};
}
78
79fn get_info(doc: &Document) -> Option<&Dictionary> {
80 match doc.trailer.get(b"Info") {
81 Ok(&Object::Reference(ref id)) => {
82 match doc.get_object(*id) {
83 Ok(&Object::Dictionary(ref info)) => { return Some(info); }
84 _ => {}
85 }
86 }
87 _ => {}
88 }
89 None
90}
91
92fn get_catalog(doc: &Document) -> &Dictionary {
93 match doc.trailer.get(b"Root").unwrap() {
94 &Object::Reference(ref id) => {
95 match doc.get_object(*id) {
96 Ok(&Object::Dictionary(ref catalog)) => { return catalog; }
97 _ => {}
98 }
99 }
100 _ => {}
101 }
102 panic!();
103}
104
105fn get_pages(doc: &Document) -> &Dictionary {
106 let catalog = get_catalog(doc);
107 match catalog.get(b"Pages").unwrap() {
108 &Object::Reference(ref id) => {
109 match doc.get_object(*id) {
110 Ok(&Object::Dictionary(ref pages)) => { return pages; }
111 other => {dlog!("pages: {:?}", other)}
112 }
113 }
114 other => { dlog!("pages: {:?}", other)}
115 }
116 dlog!("catalog {:?}", catalog);
117 panic!();
118}
119
/// Byte -> UTF-16 code unit table used for PDFDocEncoding text, indexed by the
/// byte value (256 entries, eight per row below). Entries holding `0x0000`
/// at 0x7f, 0x9f and 0xad are the slots this table leaves unmapped.
#[allow(non_upper_case_globals)]
const PDFDocEncoding: &[u16] = &[
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 0x00
    0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, // 0x08
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 0x10
    0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, // 0x18
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 0x20
    0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, // 0x28
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 0x30
    0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, // 0x38
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 0x40
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, // 0x48
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 0x50
    0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, // 0x58
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 0x60
    0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, // 0x68
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 0x70
    0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000, // 0x78
    0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, // 0x80
    0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018, // 0x88
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, // 0x90
    0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000, // 0x98
    0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // 0xa0
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af, // 0xa8
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // 0xb0
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // 0xb8
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // 0xc0
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // 0xc8
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // 0xd0
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, // 0xd8
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // 0xe0
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // 0xe8
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // 0xf0
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, // 0xf8
];
151
152fn pdf_to_utf8(s: &[u8]) -> String {
153 if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
154 return UTF_16BE.decode_without_bom_handling_and_without_replacement(&s[2..]).unwrap().to_string()
155 } else {
156 let r : Vec<u8> = s.iter().map(|x| *x).flat_map(|x| {
157 let k = PDFDocEncoding[x as usize];
158 vec![(k>>8) as u8, k as u8].into_iter()}).collect();
159 return UTF_16BE.decode_without_bom_handling_and_without_replacement(&r).unwrap().to_string()
160 }
161}
162
163fn to_utf8(encoding: &[u16], s: &[u8]) -> String {
164 if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
165 return UTF_16BE.decode_without_bom_handling_and_without_replacement(&s[2..]).unwrap().to_string()
166 } else {
167 let r : Vec<u8> = s.iter().map(|x| *x).flat_map(|x| {
168 let k = encoding[x as usize];
169 vec![(k>>8) as u8, k as u8].into_iter()}).collect();
170 return UTF_16BE.decode_without_bom_handling_and_without_replacement(&r).unwrap().to_string()
171 }
172}
173
174
175fn maybe_deref<'a>(doc: &'a Document, o: &'a Object) -> &'a Object {
176 match o {
177 &Object::Reference(r) => doc.get_object(r).expect("missing object reference"),
178 _ => o
179 }
180}
181
182fn maybe_get_obj<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Object> {
183 dict.get(key).map(|o| maybe_deref(doc, o)).ok()
184}
185
186trait FromOptObj<'a> {
188 fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self;
189}
190
191trait FromObj<'a> where Self: std::marker::Sized {
193 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self>;
194}
195
196impl<'a, T: FromObj<'a>> FromOptObj<'a> for Option<T> {
197 fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, _key: &[u8]) -> Self {
198 obj.and_then(|x| T::from_obj(doc,x))
199 }
200}
201
202impl<'a, T: FromObj<'a>> FromOptObj<'a> for T {
203 fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self {
204 T::from_obj(doc, obj.expect(&String::from_utf8_lossy(key))).expect("wrong type")
205 }
206}
207
208impl<'a, T: FromObj<'a>> FromObj<'a> for Vec<T> {
211 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
212 maybe_deref(doc, obj).as_array().map(|x| x.iter()
213 .map(|x| T::from_obj(doc, x).expect("wrong type"))
214 .collect()).ok()
215 }
216}
217
218impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 4] {
221 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
222 maybe_deref(doc, obj).as_array().map(|x| {
223 let mut all = x.iter()
224 .map(|x| T::from_obj(doc, x).expect("wrong type"));
225 [all.next().unwrap(), all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
226 }).ok()
227 }
228}
229
230impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 3] {
231 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
232 maybe_deref(doc, obj).as_array().map(|x| {
233 let mut all = x.iter()
234 .map(|x| T::from_obj(doc, x).expect("wrong type"));
235 [all.next().unwrap(), all.next().unwrap(), all.next().unwrap()]
236 }).ok()
237 }
238}
239
240impl<'a> FromObj<'a> for f64 {
241 fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
242 match obj {
243 &Object::Integer(i) => Some(i as f64),
244 &Object::Real(f) => Some(f.into()),
245 _ => None
246 }
247 }
248}
249
250impl<'a> FromObj<'a> for i64 {
251 fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
252 match obj {
253 &Object::Integer(i) => Some(i),
254 _ => None
255 }
256 }
257}
258
259impl<'a> FromObj<'a> for &'a Dictionary {
260 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
261 maybe_deref(doc, obj).as_dict().ok()
262 }
263}
264
265impl<'a> FromObj<'a> for &'a Stream {
266 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Stream> {
267 maybe_deref(doc, obj).as_stream().ok()
268 }
269}
270
271impl<'a> FromObj<'a> for &'a Object {
272 fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
273 Some(maybe_deref(doc, obj))
274 }
275}
276
277fn get<'a, T: FromOptObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> T {
278 T::from_opt_obj(doc, dict.get(key).ok(), key)
279}
280
281fn maybe_get<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
282 maybe_get_obj(doc, dict, key).and_then(|o| T::from_obj(doc, o))
283}
284
285fn get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> String {
286 pdf_to_utf8(dict.get(key).map(|o| maybe_deref(doc, o)).unwrap_or_else(|_| panic!("deref")).as_name().expect("name"))
287}
288
289#[allow(dead_code)]
290fn maybe_get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<String> {
291 maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok()).map(|n| pdf_to_utf8(n))
292}
293
294fn maybe_get_name<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a [u8]> {
295 maybe_get_obj(doc, dict, key).and_then(|n| n.as_name().ok())
296}
297
298fn maybe_get_array<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Vec<Object>> {
299 maybe_get_obj(doc, dict, key).and_then(|n| n.as_array().ok())
300}
301
302#[derive(Clone)]
303struct PdfSimpleFont<'a> {
304 font: &'a Dictionary,
305 doc: &'a Document,
306 encoding: Option<Vec<u16>>,
307 unicode_map: Option<HashMap<u32, String>>,
308 widths: HashMap<CharCode, f64>, missing_width: f64,
310}
311
312#[derive(Clone)]
313struct PdfType3Font<'a> {
314 font: &'a Dictionary,
315 doc: &'a Document,
316 encoding: Option<Vec<u16>>,
317 unicode_map: Option<HashMap<CharCode, String>>,
318 widths: HashMap<CharCode, f64>, }
320
321
322fn make_font<'a>(doc: &'a Document, font: &'a Dictionary) -> Rc<dyn PdfFont + 'a> {
323 let subtype = get_name_string(doc, font, b"Subtype");
324 dlog!("MakeFont({})", subtype);
325 if subtype == "Type0" {
326 Rc::new(PdfCIDFont::new(doc, font))
327 } else if subtype == "Type3" {
328 Rc::new(PdfType3Font::new(doc, font))
329 } else {
330 Rc::new(PdfSimpleFont::new(doc, font))
331 }
332}
333
/// Reports whether `name` is exactly one of the fourteen font names listed
/// below (the comparison is case-sensitive).
fn is_core_font(name: &str) -> bool {
    const CORE_FONTS: [&str; 14] = [
        "Courier-Bold",
        "Courier-BoldOblique",
        "Courier-Oblique",
        "Courier",
        "Helvetica-Bold",
        "Helvetica-BoldOblique",
        "Helvetica-Oblique",
        "Helvetica",
        "Symbol",
        "Times-Bold",
        "Times-BoldItalic",
        "Times-Italic",
        "Times-Roman",
        "ZapfDingbats",
    ];
    CORE_FONTS.contains(&name)
}
353
354fn encoding_to_unicode_table(name: &[u8]) -> Vec<u16> {
355 let encoding = match &name[..] {
356 b"MacRomanEncoding" => encodings::MAC_ROMAN_ENCODING,
357 b"MacExpertEncoding" => encodings::MAC_EXPERT_ENCODING,
358 b"WinAnsiEncoding" => encodings::WIN_ANSI_ENCODING,
359 _ => panic!("unexpected encoding {:?}", pdf_to_utf8(name))
360 };
361 let encoding_table = encoding.iter()
362 .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
363 .collect();
364 encoding_table
365}
366
367impl<'a> PdfSimpleFont<'a> {
374 fn new(doc: &'a Document, font: &'a Dictionary) -> PdfSimpleFont<'a> {
375 let base_name = get_name_string(doc, font, b"BaseFont");
376 let subtype = get_name_string(doc, font, b"Subtype");
377
378 let encoding: Option<&Object> = get(doc, font, b"Encoding");
379 dlog!("base_name {} {} enc:{:?} {:?}", base_name, subtype, encoding, font);
380 let descriptor: Option<&Dictionary> = get(doc, font, b"FontDescriptor");
381 let mut type1_encoding = None;
382 let mut unicode_map = None;
383 if let Some(descriptor) = descriptor {
384 dlog!("descriptor {:?}", descriptor);
385 if subtype == "Type1" {
386 let file = maybe_get_obj(doc, descriptor, b"FontFile");
387 match file {
388 Some(&Object::Stream(ref s)) => {
389 let s = get_contents(s);
390 type1_encoding = Some(type1_encoding_parser::get_encoding_map(&s).expect("encoding"));
392 }
393 _ => { dlog!("font file {:?}", file) }
394 }
395 } else if subtype == "TrueType" {
396 let file = maybe_get_obj(doc, descriptor, b"FontFile2");
397 match file {
398 Some(&Object::Stream(ref s)) => {
399 let _s = get_contents(s);
400 }
402 _ => { dlog!("font file {:?}", file) }
403 }
404 }
405
406 let font_file3 = get::<Option<&Object>>(doc, descriptor, b"FontFile3");
407 match font_file3 {
408 Some(&Object::Stream(ref s)) => {
409 let subtype = get_name_string(doc, &s.dict, b"Subtype");
410 dlog!("font file {}, {:?}", subtype, s);
411 let s = get_contents(s);
412 if subtype == "Type1C" {
413 let table = cff_parser::Table::parse(&s).unwrap();
414 let encoding = table.encoding.get_code_to_sid_table(&table.charset);
418
419 let mapping: HashMap<u32, String> = encoding.into_iter().filter_map(|(cid, sid)| {
420 let name = cff_parser::string_by_id(&table, sid).unwrap();
421 if name == ".notdef" {
422 return None;
423 }
424 let unicode = glyphnames::name_to_unicode(&name).or_else(|| {
425 zapfglyphnames::zapfdigbats_names_to_unicode(name)
426 });
427 if unicode.is_none() {
428 warn!("Couldn't find unicode for {}", name);
429 return None;
430 }
431 let str = String::from_utf16(&[unicode.unwrap()]).unwrap();
432 Some((cid as u32, str))
433 }).collect();
434 unicode_map = Some(mapping);
435 }
436
437 }
440 None => {}
441 _ => { dlog!("unexpected") }
442 }
443
444 let charset = maybe_get_obj(doc, descriptor, b"CharSet");
445 let _charset = match charset {
446 Some(&Object::String(ref s, _)) => { Some(pdf_to_utf8(&s)) }
447 _ => { None }
448 };
449 }
451
452 let mut unicode_map = match unicode_map {
453 Some(mut unicode_map) => {
454 unicode_map.extend(get_unicode_map(doc, font).unwrap_or(HashMap::new()));
455 Some(unicode_map)
456 }
457 None => {
458 get_unicode_map(doc, font)
459 }
460 };
461
462
463 let mut encoding_table = None;
464 match encoding {
465 Some(&Object::Name(ref encoding_name)) => {
466 dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
467 encoding_table = Some(encoding_to_unicode_table(encoding_name));
468 }
469 Some(&Object::Dictionary(ref encoding)) => {
470 let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
472 dlog!("BaseEncoding {:?}", base_encoding);
473 encoding_to_unicode_table(base_encoding)
474 } else {
475 Vec::from(PDFDocEncoding)
476 };
477 let differences = maybe_get_array(doc, encoding, b"Differences");
478 if let Some(differences) = differences {
479 dlog!("Differences");
480 let mut code = 0;
481 for o in differences {
482 let o = maybe_deref(doc, o);
483 match o {
484 &Object::Integer(i) => { code = i; },
485 &Object::Name(ref n) => {
486 let name = pdf_to_utf8(&n);
487 let unicode = glyphnames::name_to_unicode(&name);
490 if let Some(unicode) = unicode{
491 table[code as usize] = unicode;
492 if let Some(ref mut unicode_map) = unicode_map {
493 let be = [unicode];
494 match unicode_map.entry(code as u32) {
495 Entry::Vacant(v) => { v.insert(String::from_utf16(&be).unwrap()); }
497 Entry::Occupied(e) => {
498 if e.get() != &String::from_utf16(&be).unwrap() {
499 let normal_match = e.get().nfkc().eq(String::from_utf16(&be).unwrap().nfkc());
500 if !normal_match {
501 warn!("Unicode mismatch {} {} {:?} {:?} {:?}", normal_match, name, e.get(), String::from_utf16(&be), be);
502 }
503 }
504 }
505 }
506 }
507 } else {
508 match unicode_map {
509 Some(ref mut unicode_map) if base_name.contains("FontAwesome") => {
510 match unicode_map.entry(code as u32) {
513 Entry::Vacant(v) => { v.insert("".to_owned()); }
514 Entry::Occupied(e) => {
515 panic!("unexpected entry in unicode map")
516 }
517 }
518 }
519 _ => {
520 warn!("unknown glyph name '{}' for font {}", name, base_name);
521 }
522 }
523 }
524 dlog!("{} = {} ({:?})", code, name, unicode);
525 if let Some(ref mut unicode_map) = unicode_map {
526 dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
531 }
532 code += 1;
533 }
534 _ => { panic!("wrong type {:?}", o); }
535 }
536 }
537 }
538 let name = encoding.get(b"Type").and_then(|x| x.as_name()).and_then(|x| Ok(pdf_to_utf8(x)));
540 dlog!("name: {}", name);
541
542 encoding_table = Some(table);
543 }
544 None => {
545 if let Some(type1_encoding) = type1_encoding {
546 let mut table = Vec::from(PDFDocEncoding);
547 dlog!("type1encoding");
548 for (code, name) in type1_encoding {
549 let unicode = glyphnames::name_to_unicode(&pdf_to_utf8(&name));
550 if let Some(unicode) = unicode {
551 table[code as usize] = unicode;
552 } else {
553 dlog!("unknown character {}", pdf_to_utf8(&name));
554 }
555 }
556 encoding_table = Some(table)
557 } else if subtype == "TrueType" {
558 encoding_table = Some(encodings::WIN_ANSI_ENCODING.iter()
559 .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
560 .collect());
561 }
562 }
563 _ => { panic!() }
564 }
565
566 let mut width_map = HashMap::new();
567 if let (Some(first_char), Some(last_char), Some(widths)) = (maybe_get::<i64>(doc, font, b"FirstChar"), maybe_get::<i64>(doc, font, b"LastChar"), maybe_get::<Vec<f64>>(doc, font, b"Widths")) {
576 let mut i: i64 = 0;
578 dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
579
580 for w in widths {
581 width_map.insert((first_char + i) as CharCode, w);
582 i += 1;
583 }
584 assert_eq!(first_char + i - 1, last_char);
585 } else {
586 let name = if is_core_font(&base_name) {
587 &base_name
588 } else {
589 warn!("no widths and not core font {:?}", base_name);
590
591 "Helvetica"
606 };
607 for font_metrics in core_fonts::metrics().iter() {
608 if font_metrics.0 == base_name {
609 if let Some(ref encoding) = encoding_table {
610 dlog!("has encoding");
611 for w in font_metrics.2 {
612 let c = glyphnames::name_to_unicode(w.2).unwrap();
613 for i in 0..encoding.len() {
614 if encoding[i] == c {
615 width_map.insert(i as CharCode, w.1 as f64);
616 }
617 }
618 }
619 } else {
620 let mut table = vec![0; 256];
625 for w in font_metrics.2 {
626 dlog!("{} {}", w.0, w.2);
627 if w.0 != -1 {
629 table[w.0 as usize] = if base_name == "ZapfDingbats" {
630 zapfglyphnames::zapfdigbats_names_to_unicode(w.2).unwrap_or_else(|| panic!("bad name {:?}", w))
631 } else {
632 glyphnames::name_to_unicode(w.2).unwrap()
633 }
634 }
635 }
636
637 let encoding = &table[..];
638 for w in font_metrics.2 {
639 width_map.insert(w.0 as CharCode, w.1 as f64);
640 }
642 encoding_table = Some(encoding.to_vec());
643 }
644 }
654 }
655 }
656
657 let missing_width = get::<Option<f64>>(doc, font, b"MissingWidth").unwrap_or(0.);
658 PdfSimpleFont {doc, font, widths: width_map, encoding: encoding_table, missing_width, unicode_map}
659 }
660
661 #[allow(dead_code)]
662 fn get_type(&self) -> String {
663 get_name_string(self.doc, self.font, b"Type")
664 }
665 #[allow(dead_code)]
666 fn get_basefont(&self) -> String {
667 get_name_string(self.doc, self.font, b"BaseFont")
668 }
669 #[allow(dead_code)]
670 fn get_subtype(&self) -> String {
671 get_name_string(self.doc, self.font, b"Subtype")
672 }
673 #[allow(dead_code)]
674 fn get_widths(&self) -> Option<&Vec<Object>> {
675 maybe_get_obj(self.doc, self.font, b"Widths").map(|widths| widths.as_array().expect("Widths should be an array"))
676 }
677 #[allow(dead_code)]
680 fn get_name(&self) -> Option<String> {
681 maybe_get_name_string(self.doc, self.font, b"Name")
682 }
683
684 #[allow(dead_code)]
685 fn get_descriptor(&self) -> Option<PdfFontDescriptor> {
686 maybe_get_obj(self.doc, self.font, b"FontDescriptor").and_then(|desc| desc.as_dict().ok()).map(|desc| PdfFontDescriptor{desc: desc, doc: self.doc})
687 }
688}
689
690
691
692impl<'a> PdfType3Font<'a> {
693 fn new(doc: &'a Document, font: &'a Dictionary) -> PdfType3Font<'a> {
694
695 let unicode_map = get_unicode_map(doc, font);
696 let encoding: Option<&Object> = get(doc, font, b"Encoding");
697
698 let encoding_table;
699 match encoding {
700 Some(&Object::Name(ref encoding_name)) => {
701 dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
702 encoding_table = Some(encoding_to_unicode_table(encoding_name));
703 }
704 Some(&Object::Dictionary(ref encoding)) => {
705 let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
707 dlog!("BaseEncoding {:?}", base_encoding);
708 encoding_to_unicode_table(base_encoding)
709 } else {
710 Vec::from(PDFDocEncoding)
711 };
712 let differences = maybe_get_array(doc, encoding, b"Differences");
713 if let Some(differences) = differences {
714 dlog!("Differences");
715 let mut code = 0;
716 for o in differences {
717 match o {
718 &Object::Integer(i) => { code = i; },
719 &Object::Name(ref n) => {
720 let name = pdf_to_utf8(&n);
721 let unicode = glyphnames::name_to_unicode(&name);
724 if let Some(unicode) = unicode{
725 table[code as usize] = unicode;
726 }
727 dlog!("{} = {} ({:?})", code, name, unicode);
728 if let Some(ref unicode_map) = unicode_map {
729 dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
730 }
731 code += 1;
732 }
733 _ => { panic!("wrong type"); }
734 }
735 }
736 }
737 let name_encoded = encoding.get(b"Type");
738 if let Ok(Object::Name(name)) = name_encoded {
739 dlog!("name: {}", pdf_to_utf8(name));
740 } else {
741 dlog!("name not found");
742 }
743
744 encoding_table = Some(table);
745 }
746 _ => { panic!() }
747 }
748
749 let first_char: i64 = get(doc, font, b"FirstChar");
750 let last_char: i64 = get(doc, font, b"LastChar");
751 let widths: Vec<f64> = get(doc, font, b"Widths");
752
753 let mut width_map = HashMap::new();
754
755 let mut i = 0;
756 dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
757
758 for w in widths {
759 width_map.insert((first_char + i) as CharCode, w);
760 i += 1;
761 }
762 assert_eq!(first_char + i - 1, last_char);
763 PdfType3Font {doc, font, widths: width_map, encoding: encoding_table, unicode_map}
764 }
765}
766
/// Numeric character code produced by a font's `next_char` and consumed by
/// `get_width` / `decode_char`.
type CharCode = u32;
768
769struct PdfFontIter<'a>
770{
771 i: Iter<'a, u8>,
772 font: &'a dyn PdfFont,
773}
774
775impl<'a> Iterator for PdfFontIter<'a> {
776 type Item = (CharCode, u8);
777 fn next(&mut self) -> Option<(CharCode, u8)> {
778 self.font.next_char(&mut self.i)
779 }
780}
781
782trait PdfFont : Debug {
783 fn get_width(&self, id: CharCode) -> f64;
784 fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)>;
785 fn decode_char(&self, char: CharCode) -> String;
786
787 }
793
794impl<'a> dyn PdfFont + 'a {
795 fn char_codes(&'a self, chars: &'a [u8]) -> PdfFontIter {
796 PdfFontIter{i: chars.iter(), font: self}
797 }
798 fn decode(&self, chars: &[u8]) -> String {
799 let strings = self.char_codes(chars).map(|x| self.decode_char(x.0)).collect::<Vec<_>>();
800 strings.join("")
801 }
802
803}
804
805
806impl<'a> PdfFont for PdfSimpleFont<'a> {
807 fn get_width(&self, id: CharCode) -> f64 {
808 let width = self.widths.get(&id);
809 if let Some(width) = width {
810 return *width;
811 } else {
812 let mut widths = self.widths.iter().collect::<Vec<_>>();
813 widths.sort_by_key(|x| x.0);
814 dlog!("missing width for {} len(widths) = {}, {:?} falling back to missing_width {:?}", id, self.widths.len(), widths, self.font);
815 return self.missing_width;
816 }
817 }
818 fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
824 iter.next().map(|x| (*x as CharCode, 1))
825 }
826 fn decode_char(&self, char: CharCode) -> String {
827 let slice = [char as u8];
828 if let Some(ref unicode_map) = self.unicode_map {
829 let s = unicode_map.get(&char);
830 let s = match s {
831 None => {
832 debug!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
833 let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
836 let s = to_utf8(encoding, &slice);
837 debug!("falling back to encoding {} -> {:?}", char, s);
838 s
839 }
840 Some(s) => { s.clone() }
841 };
842 return s
843 }
844 let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
845 let s = to_utf8(encoding, &slice);
847 s
848 }
849}
850
851
852
853impl<'a> fmt::Debug for PdfSimpleFont<'a> {
854 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
855 self.font.fmt(f)
856 }
857}
858
859impl<'a> PdfFont for PdfType3Font<'a> {
860 fn get_width(&self, id: CharCode) -> f64 {
861 let width = self.widths.get(&id);
862 if let Some(width) = width {
863 return *width;
864 } else {
865 panic!("missing width for {} {:?}", id, self.font);
866 }
867 }
868 fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
874 iter.next().map(|x| (*x as CharCode, 1))
875 }
876 fn decode_char(&self, char: CharCode) -> String {
877 let slice = [char as u8];
878 if let Some(ref unicode_map) = self.unicode_map {
879 let s = unicode_map.get(&char);
880 let s = match s {
881 None => {
882 debug!("missing char {:?} in unicode map {:?} for {:?}", char, unicode_map, self.font);
883 let encoding = self.encoding.as_ref().map(|x| &x[..]).expect("missing unicode map and encoding");
886 let s = to_utf8(encoding, &slice);
887 debug!("falling back to encoding {} -> {:?}", char, s);
888 s
889 }
890 Some(s) => { s.clone() }
891 };
892 return s
893 }
894 let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
895 let s = to_utf8(encoding, &slice);
897 s
898 }
899}
900
901
902
903impl<'a> fmt::Debug for PdfType3Font<'a> {
904 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
905 self.font.fmt(f)
906 }
907}
908
909struct PdfCIDFont<'a> {
910 font: &'a Dictionary,
911 #[allow(dead_code)]
912 doc: &'a Document,
913 #[allow(dead_code)]
914 encoding: ByteMapping,
915 to_unicode: Option<HashMap<u32, String>>,
916 widths: HashMap<CharCode, f64>, default_width: Option<f64>, }
919
920fn get_unicode_map<'a>(doc: &'a Document, font: &'a Dictionary) -> Option<HashMap<u32, String>> {
921 let to_unicode = maybe_get_obj(doc, font, b"ToUnicode");
922 dlog!("ToUnicode: {:?}", to_unicode);
923 let mut unicode_map = None;
924 match to_unicode {
925 Some(&Object::Stream(ref stream)) => {
926 let contents = get_contents(stream);
927 dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
928
929 let cmap = adobe_cmap_parser::get_unicode_map(&contents).unwrap();
930 let mut unicode = HashMap::new();
931 for (&k, v) in cmap.iter() {
935 let mut be: Vec<u16> = Vec::new();
936 let mut i = 0;
937 assert!(v.len() % 2 == 0);
938 while i < v.len() {
939 be.push(((v[i] as u16) << 8) | v[i+1] as u16);
940 i += 2;
941 }
942 match &be[..] {
943 [0xd800 ..= 0xdfff] => {
944 continue;
947 }
948 _ => {}
949 }
950 let s = String::from_utf16(&be).unwrap();
951
952 unicode.insert(k, s);
953 }
954 unicode_map = Some(unicode);
955
956 dlog!("map: {:?}", unicode_map);
957 }
958 None => { }
959 Some(&Object::Name(ref name)) => {
960 let name = pdf_to_utf8(name);
961 if name != "Identity-H" {
962 todo!("unsupported ToUnicode name: {:?}", name);
963 }
964 }
965 _ => { panic!("unsupported cmap {:?}", to_unicode)}
966 }
967 unicode_map
968}
969
970
971impl<'a> PdfCIDFont<'a> {
972 fn new(doc: &'a Document, font: &'a Dictionary) -> PdfCIDFont<'a> {
973 let base_name = get_name_string(doc, font, b"BaseFont");
974 let descendants = maybe_get_array(doc, font, b"DescendantFonts").expect("Descendant fonts required");
975 let ciddict = maybe_deref(doc, &descendants[0]).as_dict().expect("should be CID dict");
976 let encoding = maybe_get_obj(doc, font, b"Encoding").expect("Encoding required in type0 fonts");
977 dlog!("base_name {} {:?}", base_name, font);
978
979 let encoding = match encoding {
980 &Object::Name(ref name) => {
981 let name = pdf_to_utf8(name);
982 dlog!("encoding {:?}", name);
983 if name == "Identity-H" || name == "Identity-V" {
984 ByteMapping { codespace: vec![CodeRange{width: 2, start: 0, end: 0xffff }], cid: vec![CIDRange{ src_code_lo: 0, src_code_hi: 0xffff, dst_CID_lo: 0 }]}
985 } else {
986 panic!("unsupported encoding {}", name);
987 }
988 }
989 &Object::Stream(ref stream) => {
990 let contents = get_contents(stream);
991 dlog!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
992 adobe_cmap_parser::get_byte_mapping(&contents).unwrap()
993 }
994 _ => { panic!("unsupported encoding {:?}", encoding)}
995 };
996
997 let unicode_map = get_unicode_map(doc, font);
1003
1004 dlog!("descendents {:?} {:?}", descendants, ciddict);
1005
1006 let font_dict = maybe_get_obj(doc, ciddict, b"FontDescriptor").expect("required");
1007 dlog!("{:?}", font_dict);
1008 let _f = font_dict.as_dict().expect("must be dict");
1009 let default_width = get::<Option<i64>>(doc, ciddict, b"DW").unwrap_or(1000);
1010 let w: Option<Vec<&Object>> = get(doc, ciddict, b"W");
1011 dlog!("widths {:?}", w);
1012 let mut widths = HashMap::new();
1013 let mut i = 0;
1014 if let Some(w) = w {
1015 while i < w.len() {
1016 if let &Object::Array(ref wa) = w[i+1] {
1017 let cid = w[i].as_i64().expect("id should be num");
1018 let mut j = 0;
1019 dlog!("wa: {:?} -> {:?}", cid, wa);
1020 for w in wa {
1021 widths.insert((cid + j) as CharCode, as_num(w) );
1022 j += 1;
1023 }
1024 i += 2;
1025 } else {
1026 let c_first = w[i].as_i64().expect("first should be num");
1027 let c_last = w[i].as_i64().expect("last should be num");
1028 let c_width = as_num(&w[i]);
1029 for id in c_first..c_last {
1030 widths.insert(id as CharCode, c_width);
1031 }
1032 i += 3;
1033 }
1034 }
1035 }
1036 PdfCIDFont{doc, font, widths, to_unicode: unicode_map, encoding, default_width: Some(default_width as f64) }
1037 }
1038}
1039
1040impl<'a> PdfFont for PdfCIDFont<'a> {
1041 fn get_width(&self, id: CharCode) -> f64 {
1042 let width = self.widths.get(&id);
1043 if let Some(width) = width {
1044 dlog!("GetWidth {} -> {}", id, *width);
1045 return *width;
1046 } else {
1047 dlog!("missing width for {} falling back to default_width", id);
1048 return self.default_width.unwrap();
1049 }
1050 }fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
1061 let mut c = *iter.next()? as u32;
1062 let mut code = None;
1063 'outer: for width in 1..=4 {
1064 for range in &self.encoding.codespace {
1065 if c as u32 >= range.start && c as u32 <= range.end && range.width == width {
1066 code = Some((c as u32, width));
1067 break 'outer;
1068 }
1069 }
1070 let next = *iter.next()?;
1071 c = ((c as u32) << 8) | next as u32;
1072 }
1073 let code = code?;
1074 for range in &self.encoding.cid {
1075 if code.0 >= range.src_code_lo && code.0 <= range.src_code_hi {
1076 return Some((code.0 + range.dst_CID_lo, code.1 as u8));
1077 }
1078 }
1079 None
1080 }
1081 fn decode_char(&self, char: CharCode) -> String {
1082 let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
1083 if let Some(s) = s {
1084 s.clone()
1085 } else {
1086 dlog!("Unknown character {:?} in {:?} {:?}", char, self.font, self.to_unicode);
1087 "".to_string()
1088 }
1089 }
1090}
1091
1092impl<'a> fmt::Debug for PdfCIDFont<'a> {
1093 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1094 self.font.fmt(f)
1095 }
1096}
1097
1098
1099
1100#[derive(Copy, Clone)]
1101struct PdfFontDescriptor<'a> {
1102 desc: &'a Dictionary,
1103 doc: &'a Document
1104}
1105
1106impl<'a> PdfFontDescriptor<'a> {
1107 #[allow(dead_code)]
1108 fn get_file(&self) -> Option<&'a Object> {
1109 maybe_get_obj(self.doc, self.desc, b"FontFile")
1110 }
1111}
1112
1113impl<'a> fmt::Debug for PdfFontDescriptor<'a> {
1114 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1115 self.desc.fmt(f)
1116 }
1117}
1118
/// Parameters of a type 0 function, as collected by `Function::new`.
#[derive(Clone, Debug)]
struct Type0Func {
    /// Value of the `Domain` entry.
    domain: Vec<f64>,
    /// Value of the `Range` entry.
    range: Vec<f64>,
    /// Stream contents as returned by `get_contents`.
    contents: Vec<u8>,
    /// Value of the `Size` entry.
    size: Vec<i64>,
    /// Value of the `BitsPerSample` entry.
    bits_per_sample: i64,
    /// Value of the `Encode` entry; `Function::new` defaults it to the pairs
    /// `0, size - 1` for every element of `size`.
    encode: Vec<f64>,
    /// Value of the `Decode` entry; `Function::new` defaults it to `range`.
    decode: Vec<f64>,
}
1129
/// Linearly maps `x` from the interval `[x_min, x_max]` onto `[y_min, y_max]`:
/// `y_min + (x - x_min) * (y_max - y_min) / (x_max - x_min)`.
///
/// When the input interval is degenerate (`x_max == x_min`) the interpolation
/// is undefined and `y_min` is returned.
#[allow(dead_code)]
fn interpolate(x: f64, x_min: f64, x_max: f64, y_min: f64, y_max: f64) -> f64 {
    // The slope is determined by the width of the input interval, not by the
    // distance of `x` from its lower bound.
    let divisor = x_max - x_min;
    if divisor != 0. {
        y_min + (x - x_min) * ((y_max - y_min) / divisor)
    } else {
        y_min
    }
}
1141
1142impl Type0Func {
1143 #[allow(dead_code)]
1144 fn eval(&self, _input: &[f64], _output: &mut [f64]) {
1145 let _n_inputs = self.domain.len() / 2;
1146 let _n_ouputs = self.range.len() / 2;
1147
1148 }
1149}
1150
/// Parameters of a type 2 function, as collected by `Function::new`.
#[derive(Clone, Debug)]
struct Type2Func {
    /// Value of the optional `C0` entry.
    c0: Option<Vec<f64>>,
    /// Value of the optional `C1` entry.
    c1: Option<Vec<f64>>,
    /// Value of the `N` entry.
    n: f64,
}
1157
/// A PDF function object, discriminated by its `FunctionType` entry.
#[derive(Clone, Debug)]
enum Function {
    /// `FunctionType` 0, with its parsed parameters and sample data.
    Type0(Type0Func),
    /// `FunctionType` 2, with its parsed parameters.
    Type2(Type2Func),
    /// `FunctionType` 3; no parameters are retained.
    #[allow(dead_code)]
    Type3,
    /// `FunctionType` 4; holds the stream contents, which are not interpreted.
    #[allow(dead_code)]
    Type4(Vec<u8>)
}
1167
1168impl Function {
1169 fn new(doc: &Document, obj: &Object) -> Function {
1170 let dict = match obj {
1171 &Object::Dictionary(ref dict) => dict,
1172 &Object::Stream(ref stream) => &stream.dict,
1173 _ => panic!()
1174 };
1175 let function_type: i64 = get(doc, dict, b"FunctionType");
1176 let f = match function_type {
1177 0 => {
1178 let stream = match obj {
1180 &Object::Stream(ref stream) => stream,
1181 _ => panic!()
1182 };
1183 let range: Vec<f64> = get(doc, dict, b"Range");
1184 let domain: Vec<f64> = get(doc, dict, b"Domain");
1185 let contents = get_contents(stream);
1186 let size: Vec<i64> = get(doc, dict, b"Size");
1187 let bits_per_sample = get(doc, dict, b"BitsPerSample");
1188 let encode = get::<Option<Vec<f64>>>(doc, dict, b"Encode");
1191 let encode = encode.unwrap_or_else(|| {
1193 let mut default = Vec::new();
1194 for i in &size {
1195 default.extend([0., (i - 1) as f64].iter());
1196 }
1197 default
1198 });
1199 let decode = get::<Option<Vec<f64>>>(doc, dict, b"Decode").unwrap_or_else(|| range.clone());
1200
1201 Function::Type0(Type0Func { domain, range, size, contents, bits_per_sample, encode, decode })
1202 }
1203 2 => {
1204 let c0 = get::<Option<Vec<f64>>>(doc, dict, b"C0");
1206 let c1 = get::<Option<Vec<f64>>>(doc, dict, b"C1");
1207 let n = get::<f64>(doc, dict, b"N");
1208 Function::Type2(Type2Func { c0, c1, n})
1209 }
1210 3 => {
1211 Function::Type3
1213 }
1214 4 => {
1215 let contents = match obj {
1217 &Object::Stream(ref stream) => {
1218 let contents = get_contents(stream);
1219 warn!("unhandled type-4 function");
1220 warn!("Stream: {}", String::from_utf8(contents.clone()).unwrap());
1221 contents
1222 }
1223 _ => { panic!("type 4 functions should be streams") }
1224 };
1225 Function::Type4(contents)
1226 }
1227 _ => { panic!("unhandled function type {}", function_type) }
1228 };
1229 f
1230 }
1231}
1232
1233fn as_num(o: &Object) -> f64 {
1234 match o {
1235 &Object::Integer(i) => { i as f64 }
1236 &Object::Real(f) => { f.into() }
1237 _ => { panic!("not a number") }
1238 }
1239}
1240
/// Text-related part of the graphics state, updated by the text operators
/// handled in `Processor::process_stream`.
#[derive(Clone)]
struct TextState<'a>
{
    /// Font selected by `Tf`; `None` until a font has been selected.
    font: Option<Rc<dyn PdfFont + 'a>>,
    /// Font size given to `Tf`.
    font_size: f64,
    /// Character spacing set by `Tc`.
    character_spacing: f64,
    /// Word spacing set by `Tw`.
    word_spacing: f64,
    /// Horizontal scaling set by `Tz`, stored as a factor (operand / 100).
    horizontal_scaling: f64,
    /// Leading set by `TL` (and by `TD`, as the negated vertical offset).
    leading: f64,
    /// Text rise set by `Ts`.
    rise: f64,
    /// Current text matrix.
    tm: Transform,
}
1253
1254fn get_contents(contents: &Stream) -> Vec<u8> {
1256 if contents.filters().is_ok() {
1257 contents.decompressed_content().unwrap_or_else(|_|contents.content.clone())
1258 } else {
1259 contents.content.clone()
1260 }
1261}
1262
/// Graphics state tracked while interpreting a content stream; saved and
/// restored as a whole by the `q` / `Q` operators.
#[derive(Clone)]
struct GraphicsState<'a>
{
    /// Current transformation matrix, updated by `cm`.
    ctm: Transform,
    /// Text state.
    ts: TextState<'a>,
    /// Soft mask dictionary set through an `ExtGState` `SMask` entry, if any.
    smask: Option<Dictionary>,
    /// Colour space selected by `cs`.
    fill_colorspace: ColorSpace,
    /// Colour components set by `sc` / `scn`.
    fill_color: Vec<f64>,
    /// Colour space selected by `CS`.
    stroke_colorspace: ColorSpace,
    /// Colour components set by `SC` / `SCN`.
    stroke_color: Vec<f64>,
    /// Line width set by `w`.
    line_width: f64,
}
1275
1276fn show_text(gs: &mut GraphicsState, s: &[u8],
1277 _tlm: &Transform,
1278 _flip_ctm: &Transform,
1279 output: &mut dyn OutputDev) -> Result<(), OutputError> {
1280 let ts = &mut gs.ts;
1281 let font = ts.font.as_ref().unwrap();
1282 dlog!("{:?}", font.decode(s));
1284 dlog!("{:?}", font.decode(s).as_bytes());
1285 dlog!("{:?}", s);
1286 output.begin_word()?;
1287
1288 for (c, length) in font.char_codes(s) {
1289 let tsm = Transform2D::row_major(ts.horizontal_scaling,
1291 0.,
1292 0.,
1293 1.0,
1294 0.,
1295 ts.rise);
1296 let trm = tsm.post_transform(&ts.tm.post_transform(&gs.ctm));
1298 let w0 = font.get_width(c) / 1000.;
1305
1306 let mut spacing = ts.character_spacing;
1307 let is_space = c == 32 && length == 1;
1312 if is_space { spacing += ts.word_spacing }
1313
1314 output.output_character(&trm, w0, spacing, ts.font_size, &font.decode_char(c))?;
1315 let tj = 0.;
1316 let ty = 0.;
1317 let tx = ts.horizontal_scaling * ((w0 - tj/1000.)* ts.font_size + spacing);
1318 dlog!("horizontal {} adjust {} {} {} {}", ts.horizontal_scaling, tx, w0, ts.font_size, spacing);
1319 ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1321 let _trm = ts.tm.pre_transform(&gs.ctm);
1322 }
1325 output.end_word()?;
1326 Ok(())
1327}
1328
/// Page rectangle given by two corners; the output devices compute the page
/// width as `urx - llx` and the page height as `ury - lly`.
#[derive(Debug, Clone, Copy)]
pub struct MediaBox {
    /// x coordinate of the first corner (presumably lower-left — confirm against the caller).
    pub llx: f64,
    /// y coordinate of the first corner.
    pub lly: f64,
    /// x coordinate of the opposite corner (presumably upper-right).
    pub urx: f64,
    /// y coordinate of the opposite corner.
    pub ury: f64
}
1336
1337fn apply_state(doc: &Document, gs: &mut GraphicsState, state: &Dictionary) {
1338 for (k, v) in state.iter() {
1339 let k : &[u8] = k.as_ref();
1340 match k {
1341 b"SMask" => { match maybe_deref(doc, v) {
1342 &Object::Name(ref name) => {
1343 if name == b"None" {
1344 gs.smask = None;
1345 } else {
1346 panic!("unexpected smask name")
1347 }
1348 }
1349 &Object::Dictionary(ref dict) => {
1350 gs.smask = Some(dict.clone());
1351 }
1352 _ => { panic!("unexpected smask type {:?}", v) }
1353 }}
1354 b"Type" => { match v {
1355 &Object::Name(ref name) => {
1356 assert_eq!(name, b"ExtGState")
1357 }
1358 _ => { panic!("unexpected type") }
1359 }}
1360 _ => { dlog!("unapplied state: {:?} {:?}", k, v); }
1361 }
1362 }
1363
1364}
1365
/// A single path construction step.
#[derive(Debug)]
pub enum PathOp {
    /// Start a new subpath at `(x, y)`.
    MoveTo(f64, f64),
    /// Straight line from the current point to `(x, y)`.
    LineTo(f64, f64),
    /// Cubic Bézier curve `(x1, y1, x2, y2, x, y)`: two control points followed
    /// by the end point.
    CurveTo(f64, f64, f64, f64, f64, f64),
    /// Rectangle `(x, y, width, height)`.
    Rect(f64, f64, f64, f64),
    /// Close the current subpath.
    Close,
}

/// A path under construction: the sequence of operations appended so far.
#[derive(Debug)]
pub struct Path {
    pub ops: Vec<PathOp>
}

impl Path {
    /// Creates an empty path.
    fn new() -> Path {
        Path { ops: Vec::new() }
    }

    /// Returns the current point of the path.
    ///
    /// After `MoveTo`, `LineTo` and `CurveTo` this is the end point of that
    /// operation. A `Rect` is a complete closed subpath starting at its
    /// `(x, y)` corner, and `Close` returns to the start of the subpath, so in
    /// both cases the current point is the starting point of the most recent
    /// subpath.
    ///
    /// # Panics
    ///
    /// Panics when the path has no current point, i.e. it is empty or contains
    /// no operation that establishes one.
    fn current_point(&self) -> (f64, f64) {
        // Once a `Close` has been seen (walking backwards) we are looking for
        // the start of the subpath rather than the end of the last segment.
        let mut closed = false;
        for op in self.ops.iter().rev() {
            match *op {
                PathOp::MoveTo(x, y) | PathOp::Rect(x, y, _, _) => return (x, y),
                PathOp::LineTo(x, y) | PathOp::CurveTo(_, _, _, _, x, y) if !closed => return (x, y),
                PathOp::LineTo(..) | PathOp::CurveTo(..) => {}
                PathOp::Close => closed = true,
            }
        }
        panic!("path has no current point")
    }
}
1394
/// Parameters of a `CalGray` colour space, read from its dictionary by
/// `make_colorspace`.
#[derive(Clone, Debug)]
pub struct CalGray {
    /// Required white point (three components).
    white_point: [f64; 3],
    /// Optional black point (three components).
    black_point: Option<[f64; 3]>,
    /// Optional gamma value.
    gamma: Option<f64>,
}
1401
/// Parameters of a `CalRGB` colour space, read from its dictionary by
/// `make_colorspace`.
#[derive(Clone, Debug)]
pub struct CalRGB {
    /// Required white point (three components).
    white_point: [f64; 3],
    /// Optional black point (three components).
    black_point: Option<[f64; 3]>,
    /// Optional gamma, one value per component.
    gamma: Option<[f64; 3]>,
    /// Optional `Matrix` entry.
    matrix: Option<Vec<f64>>
}
1409
/// Parameters of a `Lab` colour space, read from its dictionary by
/// `make_colorspace`.
#[derive(Clone, Debug)]
pub struct Lab {
    /// Required white point (three components).
    white_point: [f64; 3],
    /// Optional black point (three components).
    black_point: Option<[f64; 3]>,
    /// Optional `Range` entry (four values).
    range: Option<[f64; 4]>,
}
1416
/// Alternate colour space of a `Separation` colour space, as parsed by
/// `make_colorspace`.
#[derive(Clone, Debug)]
pub enum AlternateColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    /// `CalRGB` with its parameters.
    CalRGB(CalRGB),
    /// `CalGray` with its parameters.
    CalGray(CalGray),
    /// `Lab` with its parameters.
    Lab(Lab),
    /// `ICCBased`; holds the contents of the profile stream.
    ICCBased(Vec<u8>)
}
1427
/// A `Separation` colour space, built by `make_colorspace` from the array
/// `[/Separation name alternateSpace tintTransform]`.
#[derive(Clone)]
pub struct Separation {
    /// Second array element, converted with `pdf_to_utf8`.
    name: String,
    /// Third array element: the alternate colour space.
    alternate_space: AlternateColorSpace,
    /// Fourth array element: the tint transform function.
    tint_transform: Box<Function>,
}
1434
/// Colour space of the current fill or stroke colour, as produced by
/// `make_colorspace`.
#[derive(Clone)]
pub enum ColorSpace {
    DeviceGray,
    DeviceRGB,
    DeviceCMYK,
    /// `DeviceN`; no parameters are retained.
    DeviceN,
    /// `Pattern`; no parameters are retained.
    Pattern,
    /// `CalRGB` with its parameters.
    CalRGB(CalRGB),
    /// `CalGray` with its parameters.
    CalGray(CalGray),
    /// `Lab` with its parameters.
    Lab(Lab),
    /// `Separation` with its name, alternate space and tint transform.
    Separation(Separation),
    /// `ICCBased`; holds the contents of the profile stream.
    ICCBased(Vec<u8>)
}
1448
1449fn make_colorspace<'a>(doc: &'a Document, name: &[u8], resources: &'a Dictionary) -> ColorSpace {
1450 match name {
1451 b"DeviceGray" => ColorSpace::DeviceGray,
1452 b"DeviceRGB" => ColorSpace::DeviceRGB,
1453 b"DeviceCMYK" => ColorSpace::DeviceCMYK,
1454 b"Pattern" => ColorSpace::Pattern,
1455 _ => {
1456 let colorspaces: &Dictionary = get(&doc, resources, b"ColorSpace");
1457 let cs: &Object = maybe_get_obj(doc, colorspaces, &name[..]).unwrap_or_else(|| panic!("missing colorspace {:?}", &name[..]));
1458 if let Ok(cs) = cs.as_array() {
1459 let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1460 match cs_name.as_ref() {
1461 "Separation" => {
1462 let name = pdf_to_utf8(cs[1].as_name().expect("second arg must be a name"));
1463 let alternate_space = match &maybe_deref(doc, &cs[2]) {
1464 Object::Name(name) => {
1465 match &name[..] {
1466 b"DeviceGray" => AlternateColorSpace::DeviceGray,
1467 b"DeviceRGB" => AlternateColorSpace::DeviceRGB,
1468 b"DeviceCMYK" => AlternateColorSpace::DeviceCMYK,
1469 _ => panic!("unexpected color space name")
1470 }
1471 }
1472 Object::Array(cs) => {
1473 let cs_name = pdf_to_utf8(cs[0].as_name().expect("first arg must be a name"));
1474 match cs_name.as_ref() {
1475 "ICCBased" => {
1476 let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1477 dlog!("ICCBased {:?}", stream);
1478 AlternateColorSpace::ICCBased(get_contents(stream))
1480 }
1481 "CalGray" => {
1482 let dict = cs[1].as_dict().expect("second arg must be a dict");
1483 AlternateColorSpace::CalGray(CalGray {
1484 white_point: get(&doc, dict, b"WhitePoint"),
1485 black_point: get(&doc, dict, b"BackPoint"),
1486 gamma: get(&doc, dict, b"Gamma"),
1487 })
1488 }
1489 "CalRGB" => {
1490 let dict = cs[1].as_dict().expect("second arg must be a dict");
1491 AlternateColorSpace::CalRGB(CalRGB {
1492 white_point: get(&doc, dict, b"WhitePoint"),
1493 black_point: get(&doc, dict, b"BackPoint"),
1494 gamma: get(&doc, dict, b"Gamma"),
1495 matrix: get(&doc, dict, b"Matrix"),
1496 })
1497 }
1498 "Lab" => {
1499 let dict = cs[1].as_dict().expect("second arg must be a dict");
1500 AlternateColorSpace::Lab(Lab {
1501 white_point: get(&doc, dict, b"WhitePoint"),
1502 black_point: get(&doc, dict, b"BackPoint"),
1503 range: get(&doc, dict, b"Range"),
1504 })
1505 }
1506 _ => panic!("Unexpected color space name")
1507 }
1508 }
1509 _ => panic!("Alternate space should be name or array {:?}", cs[2])
1510 };
1511 let tint_transform = Box::new(Function::new(doc, maybe_deref(doc, &cs[3])));
1512
1513 dlog!("{:?} {:?} {:?}", name, alternate_space, tint_transform);
1514 ColorSpace::Separation(Separation{ name, alternate_space, tint_transform})
1515 }
1516 "ICCBased" => {
1517 let stream = maybe_deref(doc, &cs[1]).as_stream().unwrap();
1518 dlog!("ICCBased {:?}", stream);
1519 ColorSpace::ICCBased(get_contents(stream))
1521 }
1522 "CalGray" => {
1523 let dict = cs[1].as_dict().expect("second arg must be a dict");
1524 ColorSpace::CalGray(CalGray {
1525 white_point: get(&doc, dict, b"WhitePoint"),
1526 black_point: get(&doc, dict, b"BackPoint"),
1527 gamma: get(&doc, dict, b"Gamma"),
1528 })
1529 }
1530 "CalRGB" => {
1531 let dict = cs[1].as_dict().expect("second arg must be a dict");
1532 ColorSpace::CalRGB(CalRGB {
1533 white_point: get(&doc, dict, b"WhitePoint"),
1534 black_point: get(&doc, dict, b"BackPoint"),
1535 gamma: get(&doc, dict, b"Gamma"),
1536 matrix: get(&doc, dict, b"Matrix"),
1537 })
1538 }
1539 "Lab" => {
1540 let dict = cs[1].as_dict().expect("second arg must be a dict");
1541 ColorSpace::Lab(Lab {
1542 white_point: get(&doc, dict, b"WhitePoint"),
1543 black_point: get(&doc, dict, b"BackPoint"),
1544 range: get(&doc, dict, b"Range"),
1545 })
1546 }
1547 "Pattern" => {
1548 ColorSpace::Pattern
1549 },
1550 "DeviceGray" => ColorSpace::DeviceGray,
1551 "DeviceRGB" => ColorSpace::DeviceRGB,
1552 "DeviceCMYK" => ColorSpace::DeviceCMYK,
1553 "DeviceN" => ColorSpace::DeviceN,
1554 _ => {
1555 panic!("color_space {:?} {:?} {:?}", name, cs_name, cs)
1556 }
1557 }
1558 } else if let Ok(cs) = cs.as_name() {
1559 match pdf_to_utf8(cs).as_ref() {
1560 "DeviceRGB" => ColorSpace::DeviceRGB,
1561 "DeviceGray" => ColorSpace::DeviceGray,
1562 _ => panic!()
1563 }
1564 } else {
1565 panic!();
1566 }
1567 }
1568 }
1569}
1570
/// Content stream interpreter state that persists across `process_stream` calls.
struct Processor<'a> {
    /// Fonts created so far, keyed by the name operand of the `Tf` operator.
    font_table: HashMap<Vec<u8>, Rc<dyn PdfFont + 'a>>,
    /// Marker for the `'a` lifetime; carries no data.
    _none: PhantomData<&'a ()>,
}
1575
1576impl<'a> Processor<'a> {
1577 fn new() -> Processor<'a> {
1578 Processor { font_table: HashMap::new(), _none: PhantomData }
1579 }
1580
1581 fn process_stream(&mut self, doc: &'a Document, content: Vec<u8>, resources: &'a Dictionary, media_box: &MediaBox, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
1582 let content = Content::decode(&content).unwrap();
1583 let mut gs: GraphicsState = GraphicsState {
1584 ts: TextState {
1585 font: None,
1586 font_size: std::f64::NAN,
1587 character_spacing: 0.,
1588 word_spacing: 0.,
1589 horizontal_scaling: 100. / 100.,
1590 leading: 0.,
1591 rise: 0.,
1592 tm: Transform2D::identity(),
1593 },
1594 fill_color: Vec::new(),
1595 fill_colorspace: ColorSpace::DeviceGray,
1596 stroke_color: Vec::new(),
1597 stroke_colorspace: ColorSpace::DeviceGray,
1598 line_width: 1.,
1599 ctm: Transform2D::identity(),
1600 smask: None
1601 };
1602 let mut gs_stack = Vec::new();
1604 let mut mc_stack = Vec::new();
1605 let mut tlm = Transform2D::identity();
1607 let mut path = Path::new();
1608 let flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1609 dlog!("MediaBox {:?}", media_box);
1610 for operation in &content.operations {
1611 match operation.operator.as_ref() {
1614 "BT" => {
1615 tlm = Transform2D::identity();
1616 gs.ts.tm = tlm;
1617 }
1618 "ET" => {
1619 tlm = Transform2D::identity();
1620 gs.ts.tm = tlm;
1621 }
1622 "cm" => {
1623 assert!(operation.operands.len() == 6);
1624 let m = Transform2D::row_major(as_num(&operation.operands[0]),
1625 as_num(&operation.operands[1]),
1626 as_num(&operation.operands[2]),
1627 as_num(&operation.operands[3]),
1628 as_num(&operation.operands[4]),
1629 as_num(&operation.operands[5]));
1630 gs.ctm = gs.ctm.pre_transform(&m);
1631 dlog!("matrix {:?}", gs.ctm);
1632 }
1633 "CS" => {
1634 let name = operation.operands[0].as_name().unwrap();
1635 gs.stroke_colorspace = make_colorspace(doc, name, resources);
1636 }
1637 "cs" => {
1638 let name = operation.operands[0].as_name().unwrap();
1639 gs.fill_colorspace = make_colorspace(doc, name, resources);
1640 }
1641 "SC" | "SCN" => {
1642 gs.stroke_color = match gs.stroke_colorspace {
1643 ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1644 _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1645 };
1646 }
1647 "sc" | "scn" => {
1648 gs.fill_color = match gs.fill_colorspace {
1649 ColorSpace::Pattern => { dlog!("unhandled pattern color"); Vec::new() }
1650 _ => { operation.operands.iter().map(|x| as_num(x)).collect() }
1651 };
1652 }
1653 "G" | "g" | "RG" | "rg" | "K" | "k" => {
1654 dlog!("unhandled color operation {:?}", operation);
1655 }
1656 "TJ" => {
1657 match operation.operands[0] {
1658 Object::Array(ref array) => {
1659 for e in array {
1660 match e {
1661 &Object::String(ref s, _) => {
1662 show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1663 }
1664 &Object::Integer(i) => {
1665 let ts = &mut gs.ts;
1666 let w0 = 0.;
1667 let tj = i as f64;
1668 let ty = 0.;
1669 let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1670 ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1671 dlog!("adjust text by: {} {:?}", i, ts.tm);
1672 }
1673 &Object::Real(i) => {
1674 let ts = &mut gs.ts;
1675 let w0 = 0.;
1676 let tj = i as f64;
1677 let ty = 0.;
1678 let tx = ts.horizontal_scaling * ((w0 - tj / 1000.) * ts.font_size);
1679 ts.tm = ts.tm.pre_transform(&Transform2D::create_translation(tx, ty));
1680 dlog!("adjust text by: {} {:?}", i, ts.tm);
1681 }
1682 _ => { dlog!("kind of {:?}", e); }
1683 }
1684 }
1685 }
1686 _ => {}
1687 }
1688 }
1689 "Tj" => {
1690 match operation.operands[0] {
1691 Object::String(ref s, _) => {
1692 show_text(&mut gs, s, &tlm, &flip_ctm, output)?;
1693 }
1694 _ => { panic!("unexpected Tj operand {:?}", operation) }
1695 }
1696 }
1697 "Tc" => {
1698 gs.ts.character_spacing = as_num(&operation.operands[0]);
1699 }
1700 "Tw" => {
1701 gs.ts.word_spacing = as_num(&operation.operands[0]);
1702 }
1703 "Tz" => {
1704 gs.ts.horizontal_scaling = as_num(&operation.operands[0]) / 100.;
1705 }
1706 "TL" => {
1707 gs.ts.leading = as_num(&operation.operands[0]);
1708 }
1709 "Tf" => {
1710 let fonts: &Dictionary = get(&doc, resources, b"Font");
1711 let name = operation.operands[0].as_name().unwrap();
1712 let font = self.font_table.entry(name.to_owned()).or_insert_with(|| make_font(doc, get::<&Dictionary>(doc, fonts, name))).clone();
1713 {
1714 }
1722 gs.ts.font = Some(font);
1723
1724 gs.ts.font_size = as_num(&operation.operands[1]);
1725 dlog!("font {} size: {} {:?}", pdf_to_utf8(name), gs.ts.font_size, operation);
1726 }
1727 "Ts" => {
1728 gs.ts.rise = as_num(&operation.operands[0]);
1729 }
1730 "Tm" => {
1731 assert!(operation.operands.len() == 6);
1732 tlm = Transform2D::row_major(as_num(&operation.operands[0]),
1733 as_num(&operation.operands[1]),
1734 as_num(&operation.operands[2]),
1735 as_num(&operation.operands[3]),
1736 as_num(&operation.operands[4]),
1737 as_num(&operation.operands[5]));
1738 gs.ts.tm = tlm;
1739 dlog!("Tm: matrix {:?}", gs.ts.tm);
1740 output.end_line()?;
1741 }
1742 "Td" => {
1743 assert!(operation.operands.len() == 2);
1748 let tx = as_num(&operation.operands[0]);
1749 let ty = as_num(&operation.operands[1]);
1750 dlog!("translation: {} {}", tx, ty);
1751
1752 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1753 gs.ts.tm = tlm;
1754 dlog!("Td matrix {:?}", gs.ts.tm);
1755 output.end_line()?;
1756 }
1757
1758 "TD" => {
1759 assert!(operation.operands.len() == 2);
1763 let tx = as_num(&operation.operands[0]);
1764 let ty = as_num(&operation.operands[1]);
1765 dlog!("translation: {} {}", tx, ty);
1766 gs.ts.leading = -ty;
1767
1768 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1769 gs.ts.tm = tlm;
1770 dlog!("TD matrix {:?}", gs.ts.tm);
1771 output.end_line()?;
1772 }
1773
1774 "T*" => {
1775 let tx = 0.0;
1776 let ty = -gs.ts.leading;
1777
1778 tlm = tlm.pre_transform(&Transform2D::create_translation(tx, ty));
1779 gs.ts.tm = tlm;
1780 dlog!("T* matrix {:?}", gs.ts.tm);
1781 output.end_line()?;
1782 }
1783 "q" => { gs_stack.push(gs.clone()); }
1784 "Q" => {
1785 let s = gs_stack.pop();
1786 if let Some(s) = s {
1787 gs = s;
1788 } else {
1789 warn!("No state to pop");
1790 }
1791 }
1792 "gs" => {
1793 let ext_gstate: &Dictionary = get(doc, resources, b"ExtGState");
1794 let name = operation.operands[0].as_name().unwrap();
1795 let state: &Dictionary = get(doc, ext_gstate, name);
1796 apply_state(doc, &mut gs, state);
1797 }
1798 "i" => { dlog!("unhandled graphics state flattness operator {:?}", operation); }
1799 "w" => { gs.line_width = as_num(&operation.operands[0]); }
1800 "J" | "j" | "M" | "d" | "ri" => { dlog!("unknown graphics state operator {:?}", operation); }
1801 "m" => { path.ops.push(PathOp::MoveTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1802 "l" => { path.ops.push(PathOp::LineTo(as_num(&operation.operands[0]), as_num(&operation.operands[1]))) }
1803 "c" => {
1804 path.ops.push(PathOp::CurveTo(
1805 as_num(&operation.operands[0]),
1806 as_num(&operation.operands[1]),
1807 as_num(&operation.operands[2]),
1808 as_num(&operation.operands[3]),
1809 as_num(&operation.operands[4]),
1810 as_num(&operation.operands[5])))
1811 }
1812 "v" => {
1813 let (x, y) = path.current_point();
1814 path.ops.push(PathOp::CurveTo(
1815 x,
1816 y,
1817 as_num(&operation.operands[0]),
1818 as_num(&operation.operands[1]),
1819 as_num(&operation.operands[2]),
1820 as_num(&operation.operands[3])))
1821 }
1822 "y" => {
1823 path.ops.push(PathOp::CurveTo(
1824 as_num(&operation.operands[0]),
1825 as_num(&operation.operands[1]),
1826 as_num(&operation.operands[2]),
1827 as_num(&operation.operands[3]),
1828 as_num(&operation.operands[2]),
1829 as_num(&operation.operands[3])))
1830 }
1831 "h" => { path.ops.push(PathOp::Close) }
1832 "re" => {
1833 path.ops.push(PathOp::Rect(as_num(&operation.operands[0]),
1834 as_num(&operation.operands[1]),
1835 as_num(&operation.operands[2]),
1836 as_num(&operation.operands[3])))
1837 }
1838 "s" | "f*" | "B" | "B*" | "b" => {
1839 dlog!("unhandled path op {:?}", operation);
1840 }
1841 "S" => {
1842 output.stroke(&gs.ctm, &gs.stroke_colorspace, &gs.stroke_color, &path)?;
1843 path.ops.clear();
1844 }
1845 "F" | "f" => {
1846 output.fill(&gs.ctm, &gs.fill_colorspace, &gs.fill_color, &path)?;
1847 path.ops.clear();
1848 }
1849 "W" | "w*" => { dlog!("unhandled clipping operation {:?}", operation); }
1850 "n" => {
1851 dlog!("discard {:?}", path);
1852 path.ops.clear();
1853 }
1854 "BMC" | "BDC" => {
1855 mc_stack.push(operation);
1856 }
1857 "EMC" => {
1858 mc_stack.pop();
1859 }
1860 "Do" => {
1861 let xobject: &Dictionary = get(&doc, resources, b"XObject");
1864 let name = operation.operands[0].as_name().unwrap();
1865 let xf: &Stream = get(&doc, xobject, name);
1866 let resources = maybe_get_obj(&doc, &xf.dict, b"Resources").and_then(|n| n.as_dict().ok()).unwrap_or(resources);
1867 let contents = get_contents(xf);
1868 self.process_stream(&doc, contents, resources, &media_box, output, page_num)?;
1869 }
1870 _ => { dlog!("unknown operation {:?}", operation); }
1871
1872 }
1873 }
1874 Ok(())
1875 }
1876}
1877
1878
/// Sink for the events produced while a document is interpreted.
pub trait OutputDev {
    /// Called at the start of a page with its number, media box and optional art box.
    fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>)-> Result<(), OutputError>;
    /// Called at the end of a page.
    fn end_page(&mut self)-> Result<(), OutputError>;
    /// Called for every shown character. `trm` is the text rendering matrix,
    /// `width` the glyph width divided by 1000, `spacing` the character spacing
    /// (plus the word spacing for a space), `font_size` the current font size
    /// and `char` the decoded text of the character.
    fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>;
    /// Called before the characters of a shown string are output.
    fn begin_word(&mut self)-> Result<(), OutputError>;
    /// Called after the characters of a shown string have been output.
    fn end_word(&mut self)-> Result<(), OutputError>;
    /// Called after the text-positioning operators `Tm`, `Td`, `TD` and `T*`.
    fn end_line(&mut self)-> Result<(), OutputError>;
    /// Called when the current path is stroked; does nothing by default.
    fn stroke(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
    /// Called when the current path is filled; does nothing by default.
    fn fill(&mut self, _ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], _path: &Path)-> Result<(), OutputError> {Ok(())}
}
1889
1890
/// Output device that writes absolutely positioned HTML `<div>` elements.
pub struct HTMLOutput<'a> {
    /// Destination of the generated HTML.
    file: &'a mut dyn std::io::Write,
    /// Flip transform set up in `begin_page` from the media box height.
    flip_ctm: Transform,
    /// Matrix at which the next character is expected if it continues the buffered run.
    last_ctm: Transform,
    /// Matrix of the first character of the buffered run.
    buf_ctm: Transform,
    /// Font size of the buffered run.
    buf_font_size: f64,
    /// Text accumulated for the current run.
    buf: String
}
1899
/// Prepares `input` for HTML output by replacing the spaces that HTML would
/// otherwise collapse with `&nbsp;` entities.
///
/// A space is kept as a plain space only when it directly follows a non-space
/// character and is directly followed by a non-space character. Leading
/// spaces, trailing spaces and every space in a run of several spaces become
/// `&nbsp;`. All other characters are copied unchanged.
fn insert_nbsp(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    // True while the previously copied character was a non-space.
    let mut word_end = false;
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        if c == ' ' {
            let followed_by_word = chars.peek().map_or(false, |&next| next != ' ');
            if word_end && followed_by_word {
                result.push(' ');
            } else {
                result.push_str("&nbsp;");
            }
            word_end = false;
        } else {
            word_end = true;
            result.push(c);
        }
    }
    result
}
1919
1920impl<'a> HTMLOutput<'a> {
1921 pub fn new(file: &mut dyn std::io::Write) -> HTMLOutput {
1922 HTMLOutput {
1923 file,
1924 flip_ctm: Transform2D::identity(),
1925 last_ctm: Transform2D::identity(),
1926 buf_ctm: Transform2D::identity(),
1927 buf: String::new(),
1928 buf_font_size: 0.,
1929 }
1930 }
1931 fn flush_string(&mut self) -> Result<(), OutputError>{
1932 if self.buf.len() != 0 {
1933
1934 let position = self.buf_ctm.post_transform(&self.flip_ctm);
1935 let transformed_font_size_vec = self.buf_ctm.transform_vector(vec2(self.buf_font_size, self.buf_font_size));
1936 let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1938 let (x, y) = (position.m31, position.m32);
1939 warn!("flush {} {:?}", self.buf, (x,y));
1940
1941 write!(self.file, "<div style='position: absolute; left: {}px; top: {}px; font-size: {}px'>{}</div>\n",
1942 x, y, transformed_font_size, insert_nbsp(&self.buf))?;
1943 }
1944 Ok(())
1945 }
1946}
1947
/// Art box of a page as four coordinates; `SVGOutput::begin_page` computes the
/// width as `.2 - .0` and the height as `.3 - .1`.
type ArtBox = (f64, f64, f64, f64);
1949
1950impl<'a> OutputDev for HTMLOutput<'a> {
1951 fn begin_page(&mut self, page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
1952 write!(self.file, "<meta charset='utf-8' /> ")?;
1953 write!(self.file, "<!-- page {} -->", page_num)?;
1954 write!(self.file, "<div id='page{}' style='position: relative; height: {}px; width: {}px; border: 1px black solid'>", page_num, media_box.ury - media_box.lly, media_box.urx - media_box.llx)?;
1955 self.flip_ctm = Transform::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
1956 Ok(())
1957 }
1958 fn end_page(&mut self) -> Result<(), OutputError> {
1959 self.flush_string()?;
1960 self.buf = String::new();
1961 self.last_ctm = Transform::identity();
1962 write!(self.file, "</div>")?;
1963 Ok(())
1964 }
1965 fn output_character(&mut self, trm: &Transform, width: f64, spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError>{
1966 if trm.approx_eq(&self.last_ctm) {
1967 let position = trm.post_transform(&self.flip_ctm);
1968 let (x, y) = (position.m31, position.m32);
1969
1970 warn!("accum {} {:?}", char, (x,y));
1971 self.buf += char;
1972 } else {
1973 warn!("flush {} {:?} {:?} {} {} {}", char, trm, self.last_ctm, width, font_size, spacing);
1974 self.flush_string()?;
1975 self.buf = char.to_owned();
1976 self.buf_font_size = font_size;
1977 self.buf_ctm = *trm;
1978 }
1979 let position = trm.post_transform(&self.flip_ctm);
1980 let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
1981 let transformed_font_size = (transformed_font_size_vec.x * transformed_font_size_vec.y).sqrt();
1983 let (x, y) = (position.m31, position.m32);
1984 write!(self.file, "<div style='position: absolute; color: red; left: {}px; top: {}px; font-size: {}px'>{}</div>",
1985 x, y, transformed_font_size, char)?;
1986 self.last_ctm = trm.pre_transform(&Transform2D::create_translation(width * font_size + spacing, 0.));
1987
1988 Ok(())
1989 }
1990 fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
1991 fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
1992 fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
1993}
1994
/// Output device that renders filled paths as an SVG document.
pub struct SVGOutput<'a> {
    /// Destination of the generated SVG.
    file: &'a mut dyn std::io::Write
}
impl<'a> SVGOutput<'a> {
    /// Creates an SVG output device writing to `file`.
    pub fn new(file: &mut dyn std::io::Write) -> SVGOutput {
        SVGOutput{file}
    }
}
2003
2004impl<'a> OutputDev for SVGOutput<'a> {
2005 fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, art_box: Option<(f64, f64, f64, f64)>) -> Result<(), OutputError> {
2006 let ver = 1.1;
2007 write!(self.file, "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n")?;
2008 if ver == 1.1 {
2009 write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">"#)?;
2010 } else {
2011 write!(self.file, r#"<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">"#)?;
2012 }
2013 if let Some(art_box) = art_box {
2014 let width = art_box.2 - art_box.0;
2015 let height = art_box.3 - art_box.1;
2016 let y = media_box.ury - art_box.1 - height;
2017 write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, art_box.0, y, width, height)?;
2018 } else {
2019 let width = media_box.urx - media_box.llx;
2020 let height = media_box.ury - media_box.lly;
2021 write!(self.file, "<svg width=\"{}\" height=\"{}\" xmlns=\"http://www.w3.org/2000/svg\" version=\"{}\" viewBox='{} {} {} {}'>", width, height, ver, media_box.llx, media_box.lly, width, height)?;
2022 }
2023 write!(self.file, "\n")?;
2024 type Mat = Transform;
2025
2026 let ctm = Mat::create_scale(1., -1.).post_translate(vec2(0., media_box.ury));
2027 write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>\n",
2028 ctm.m11,
2029 ctm.m12,
2030 ctm.m21,
2031 ctm.m22,
2032 ctm.m31,
2033 ctm.m32,
2034 )?;
2035 Ok(())
2036 }
2037 fn end_page(&mut self) -> Result<(), OutputError>{
2038 write!(self.file, "</g>\n")?;
2039 write!(self.file, "</svg>")?;
2040 Ok(())
2041 }
2042 fn output_character(&mut self, _trm: &Transform, _width: f64, _spacing: f64, _font_size: f64, _char: &str) -> Result<(), OutputError>{
2043 Ok(())
2044 }
2045 fn begin_word(&mut self) -> Result<(), OutputError> {Ok(())}
2046 fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2047 fn end_line(&mut self) -> Result<(), OutputError> {Ok(())}
2048 fn fill(&mut self, ctm: &Transform, _colorspace: &ColorSpace, _color: &[f64], path: &Path) -> Result<(), OutputError>{
2049 write!(self.file, "<g transform='matrix({}, {}, {}, {}, {}, {})'>",
2050 ctm.m11,
2051 ctm.m12,
2052 ctm.m21,
2053 ctm.m22,
2054 ctm.m31,
2055 ctm.m32,
2056 )?;
2057
2058 let mut d = Vec::new();
2066 for op in &path.ops {
2067 match op {
2068 &PathOp::MoveTo(x, y) => { d.push(format!("M{} {}", x, y))}
2069 &PathOp::LineTo(x, y) => { d.push(format!("L{} {}", x, y))},
2070 &PathOp::CurveTo(x1, y1, x2, y2, x, y) => { d.push(format!("C{} {} {} {} {} {}", x1, y1, x2, y2, x, y))},
2071 &PathOp::Close => { d.push(format!("Z"))},
2072 &PathOp::Rect(x, y, width, height) => {
2073 d.push(format!("M{} {}", x, y));
2074 d.push(format!("L{} {}", x + width, y));
2075 d.push(format!("L{} {}", x + width, y + height));
2076 d.push(format!("L{} {}", x, y + height));
2077 d.push(format!("Z"));
2078 }
2079
2080 }
2081 }
2082 write!(self.file, "<path d='{}' />", d.join(" "))?;
2083 write!(self.file, "</g>")?;
2084 write!(self.file, "\n")?;
2085 Ok(())
2086 }
2087}
2088
/// Conversion of a writer-like value into something that implements
/// `std::fmt::Write`.
pub trait ConvertToFmt {
    /// The resulting `std::fmt::Write` implementation.
    type Writer: std::fmt::Write;
    /// Consumes `self` and returns the writer.
    fn convert(self) -> Self::Writer;
}
2099
/// A mutable `String` reference is used as the writer directly.
impl<'a> ConvertToFmt for &'a mut String {
    type Writer = &'a mut String;
    fn convert(self) -> Self::Writer {
        self
    }
}
2106
/// Wraps a value so that, when it implements `std::io::Write`, it can be used
/// through the `std::fmt::Write` interface.
pub struct WriteAdapter<W> {
    /// The wrapped writer.
    f: W,
}

/// Forwards formatted text to the wrapped byte writer.
impl<W: std::io::Write> std::fmt::Write for WriteAdapter<W> {
    /// Writes all bytes of `s`; any I/O error is reported as `fmt::Error`,
    /// dropping the error details.
    fn write_str(&mut self, s: &str) -> Result<(), std::fmt::Error> {
        match self.f.write_all(s.as_bytes()) {
            Ok(()) => Ok(()),
            Err(_) => Err(fmt::Error),
        }
    }
}
2116
/// An `std::io::Write` trait object is wrapped in a `WriteAdapter`.
impl<'a> ConvertToFmt for &'a mut dyn std::io::Write {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
2123
/// A mutable `File` reference is wrapped in a `WriteAdapter`.
impl<'a> ConvertToFmt for &'a mut File {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
2130
/// Output device that writes the extracted text as plain text, inserting
/// spaces and line breaks based on character positions.
pub struct PlainTextOutput<W: ConvertToFmt> {
    /// Destination of the text.
    writer: W::Writer,
    /// x position just past the previously written character.
    last_end: f64,
    /// y position of the previously written character.
    last_y: f64,
    /// Set by `begin_word` and cleared once a character has been written.
    first_char: bool,
    /// Flip transform set up in `begin_page` from the media box height.
    flip_ctm: Transform,
}
2138
2139impl<W: ConvertToFmt> PlainTextOutput<W> {
2140 pub fn new(writer: W) -> PlainTextOutput<W> {
2141 PlainTextOutput{
2142 writer: writer.convert(),
2143 last_end: 100000.,
2144 first_char: false,
2145 last_y: 0.,
2146 flip_ctm: Transform2D::identity(),
2147 }
2148 }
2149}
2150
2151impl<W: ConvertToFmt> OutputDev for PlainTextOutput<W> {
2154 fn begin_page(&mut self, _page_num: u32, media_box: &MediaBox, _: Option<ArtBox>) -> Result<(), OutputError> {
2155 self.flip_ctm = Transform2D::row_major(1., 0., 0., -1., 0., media_box.ury - media_box.lly);
2156 Ok(())
2157 }
2158 fn end_page(&mut self) -> Result<(), OutputError> {
2159 Ok(())
2160 }
2161 fn output_character(&mut self, trm: &Transform, width: f64, _spacing: f64, font_size: f64, char: &str) -> Result<(), OutputError> {
2162 let position = trm.post_transform(&self.flip_ctm);
2163 let transformed_font_size_vec = trm.transform_vector(vec2(font_size, font_size));
2164 let transformed_font_size = (transformed_font_size_vec.x*transformed_font_size_vec.y).sqrt();
2166 let (x, y) = (position.m31, position.m32);
2167 use std::fmt::Write;
2168 if self.first_char {
2170 if (y - self.last_y).abs() > transformed_font_size * 1.5 {
2171 write!(self.writer, "\n")?;
2172 }
2173
2174 if x < self.last_end && (y - self.last_y).abs() > transformed_font_size * 0.5 {
2176 write!(self.writer, "\n")?;
2177 }
2178
2179 if x > self.last_end + transformed_font_size * 0.1 {
2180 dlog!("width: {}, space: {}, thresh: {}", width, x - self.last_end, transformed_font_size * 0.1);
2181 write!(self.writer, " ")?;
2182 }
2183 }
2184 write!(self.writer, "{}", char)?;
2186 self.first_char = false;
2187 self.last_y = y;
2188 self.last_end = x + width * transformed_font_size;
2189 Ok(())
2190 }
2191 fn begin_word(&mut self) -> Result<(), OutputError> {
2192 self.first_char = true;
2193 Ok(())
2194 }
2195 fn end_word(&mut self) -> Result<(), OutputError> {Ok(())}
2196 fn end_line(&mut self) -> Result<(), OutputError>{
2197 Ok(())
2199 }
2200}
2201
2202
2203pub fn print_metadata(doc: &Document) {
2204 dlog!("Version: {}", doc.version);
2205 if let Some(ref info) = get_info(&doc) {
2206 for (k, v) in *info {
2207 match v {
2208 &Object::String(ref s, StringFormat::Literal) => { dlog!("{}: {}", pdf_to_utf8(k), pdf_to_utf8(s)); }
2209 _ => {}
2210 }
2211 }
2212 }
2213 dlog!("Page count: {}", get::<i64>(&doc, &get_pages(&doc), b"Count"));
2214 dlog!("Pages: {:?}", get_pages(&doc));
2215 dlog!("Type: {:?}", get_pages(&doc).get(b"Type").and_then(|x| x.as_name()).unwrap());
2216}
2217
2218pub fn extract_text<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<String, OutputError> {
2220 let mut s = String::new();
2221 {
2222 let mut output = PlainTextOutput::new(&mut s);
2223 let mut doc = Document::load(path)?;
2224 maybe_decrypt(&mut doc)?;
2225 output_doc(&doc, &mut output)?;
2226 }
2227 Ok(s)
2228}
2229
2230fn maybe_decrypt(doc: &mut Document) -> Result<(), OutputError> {
2231 if ! doc.is_encrypted() {
2232 return Ok(());
2233 }
2234
2235 if let Err(e) = doc.decrypt("") {
2236 if let Error::Decryption(DecryptionError::IncorrectPassword) = e {
2237 error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted")
2238 }
2239
2240 return Err(OutputError::PdfError(e));
2241 }
2242
2243 Ok(())
2244}
2245
2246pub fn extract_text_encrypted<P: std::convert::AsRef<std::path::Path>>(
2247 path: P,
2248 password: &str,
2249) -> Result<String, OutputError> {
2250 let mut s = String::new();
2251 {
2252 let mut output = PlainTextOutput::new(&mut s);
2253 let mut doc = Document::load(path)?;
2254 output_doc_encrypted(&mut doc, &mut output, password)?;
2255 }
2256 Ok(s)
2257}
2258
2259pub fn extract_text_from_mem(buffer: &[u8]) -> Result<String, OutputError> {
2260 let mut s = String::new();
2261 {
2262 let mut output = PlainTextOutput::new(&mut s);
2263 let mut doc = Document::load_mem(buffer)?;
2264 maybe_decrypt(&mut doc)?;
2265 output_doc(&doc, &mut output)?;
2266 }
2267 Ok(s)
2268}
2269
2270pub fn extract_text_from_mem_encrypted(
2271 buffer: &[u8],
2272 password: &str,
2273) -> Result<String, OutputError> {
2274 let mut s = String::new();
2275 {
2276 let mut output = PlainTextOutput::new(&mut s);
2277 let mut doc = Document::load_mem(buffer)?;
2278 output_doc_encrypted(&mut doc, &mut output, password)?;
2279 }
2280 Ok(s)
2281}
2282
2283
2284fn extract_text_by_page(doc: &Document, page_num: u32) -> Result<String, OutputError> {
2285 let mut s = String::new();
2286 {
2287 let mut output = PlainTextOutput::new(&mut s);
2288 output_doc_page(doc, &mut output, page_num)?;
2289 }
2290 Ok(s)
2291}
2292
2293pub fn extract_text_by_pages<P: std::convert::AsRef<std::path::Path>>(path: P) -> Result<Vec<String>, OutputError> {
2296 let mut v = Vec::new();
2297 {
2298 let mut doc = Document::load(path)?;
2299 maybe_decrypt(&mut doc)?;
2300 let mut page_num = 1;
2301 while let Ok(content) = extract_text_by_page(&doc, page_num) {
2302 v.push(content);
2303 page_num += 1;
2304 }
2305 }
2306 Ok(v)
2307}
2308
2309pub fn extract_text_by_pages_encrypted<P: std::convert::AsRef<std::path::Path>>(path: P, password: &str) -> Result<Vec<String>, OutputError> {
2310 let mut v = Vec::new();
2311 {
2312 let mut doc = Document::load(path)?;
2313 doc.decrypt(password)?;
2314 let mut page_num = 1;
2315 while let Ok(content) = extract_text_by_page(&mut doc, page_num) {
2316 v.push(content);
2317 page_num += 1;
2318 }
2319 }
2320 Ok(v)
2321}
2322
2323pub fn extract_text_from_mem_by_pages(buffer: &[u8]) -> Result<Vec<String>, OutputError> {
2324 let mut v = Vec::new();
2325 {
2326 let mut doc = Document::load_mem(buffer)?;
2327 maybe_decrypt(&mut doc)?;
2328 let mut page_num = 1;
2329 while let Ok(content) = extract_text_by_page(&doc, page_num) {
2330 v.push(content);
2331 page_num += 1;
2332 }
2333 }
2334 Ok(v)
2335}
2336
2337pub fn extract_text_from_mem_by_pages_encrypted(buffer: &[u8], password: &str) -> Result<Vec<String>, OutputError> {
2338 let mut v = Vec::new();
2339 {
2340 let mut doc = Document::load_mem(buffer)?;
2341 doc.decrypt(password)?;
2342 let mut page_num = 1;
2343 while let Ok(content) = extract_text_by_page(&doc, page_num) {
2344 v.push(content);
2345 page_num += 1;
2346 }
2347 }
2348 Ok(v)
2349}
2350
2351
2352fn get_inherited<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
2353 let o: Option<T> = get(doc, dict, key);
2354 if let Some(o) = o {
2355 Some(o)
2356 } else {
2357 let parent = dict.get(b"Parent")
2358 .and_then(|parent| parent.as_reference())
2359 .and_then(|id| doc.get_dictionary(id)).ok()?;
2360 get_inherited(doc, parent, key)
2361 }
2362}
2363
2364pub fn output_doc_encrypted(
2365 doc: &mut Document,
2366 output: &mut dyn OutputDev,
2367 password: &str,
2368) -> Result<(), OutputError> {
2369 doc.decrypt(password)?;
2370 output_doc(doc, output)
2371}
2372
2373pub fn output_doc(doc: &Document, output: &mut dyn OutputDev) -> Result<(), OutputError> {
2375 if doc.is_encrypted() {
2376 error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2377 }
2378 let empty_resources = Dictionary::new();
2379 let pages = doc.get_pages();
2380 let mut p = Processor::new();
2381 for dict in pages {
2382 let page_num = dict.0;
2383 let object_id = dict.1;
2384 output_doc_inner(page_num, object_id, doc, &mut p, output, &empty_resources)?;
2385 }
2386 Ok(())
2387}
2388
2389pub fn output_doc_page(doc: &Document, output: &mut dyn OutputDev, page_num: u32) -> Result<(), OutputError> {
2390 if doc.is_encrypted() {
2391 error!("Encrypted documents must be decrypted with a password using {{extract_text|extract_text_from_mem|output_doc}}_encrypted");
2392 }
2393 let empty_resources = Dictionary::new();
2394 let pages = doc.get_pages();
2395 let object_id = pages.get(&page_num).ok_or(lopdf::Error::PageNumberNotFound(page_num))?;
2396 let mut p = Processor::new();
2397 output_doc_inner(page_num, *object_id, doc, &mut p, output, &empty_resources)?;
2398 Ok(())
2399}
2400
2401fn output_doc_inner<'a>(page_num: u32, object_id: ObjectId, doc: &'a Document, p: & mut Processor<'a>, output: &mut dyn OutputDev, empty_resources: &'a Dictionary) -> Result<(), OutputError> {
2402 let page_dict = doc.get_object(object_id).unwrap().as_dict().unwrap();
2403 dlog!("page {} {:?}", page_num, page_dict);
2404 let resources = get_inherited(doc, page_dict, b"Resources").unwrap_or(empty_resources);
2406 dlog!("resources {:?}", resources);
2407 let media_box: Vec<f64> = get_inherited(doc, page_dict, b"MediaBox").expect("MediaBox");
2409 let media_box = MediaBox { llx: media_box[0], lly: media_box[1], urx: media_box[2], ury: media_box[3] };
2410 let art_box = get::<Option<Vec<f64>>>(&doc, page_dict, b"ArtBox")
2411 .map(|x| (x[0], x[1], x[2], x[3]));
2412 output.begin_page(page_num, &media_box, art_box)?;
2413 p.process_stream(&doc, doc.get_page_content(object_id).unwrap(), resources, &media_box, output, page_num)?;
2414 output.end_page()?;
2415 Ok(())
2416}