extern crate lopdf;
use lopdf::content::Content;
use lopdf::*;
use euclid::*;
use std::fmt::{Debug, Formatter};
extern crate encoding;
extern crate euclid;
extern crate type1_encoding_parser;
extern crate unicode_normalization;
use encoding::{Encoding, DecoderTrap};
use encoding::all::UTF_16BE;
use std::fmt;
use std::str;
use std::fs::File;
use std::slice::Iter;
use std::collections::HashMap;
use std::collections::hash_map::Entry;
use std::rc::Rc;
use std::marker::PhantomData;
use std::result::Result;
use json::{array, JsonValue};
mod core_fonts;
mod glyphnames;
mod zapfglyphnames;
mod encodings;
/// Loads the PDF at `path`, extracts the text objects of its first page and
/// returns them after passing them through [`filter`].
///
/// Only the first page yielded by `Document::get_pages` is processed; any
/// further pages are ignored. A document without pages yields an empty array.
///
/// # Panics
///
/// Panics if the document cannot be loaded, if the page object is missing or
/// is not a dictionary, or if the page content cannot be read.
pub fn output_doc(path: String) -> JsonValue {
    let doc = Document::load(path).unwrap();
    // Fallback used when neither the page nor its ancestors carry /Resources.
    let empty_resources = &Dictionary::new();
    let mut p = Processor::new();
    let first_page = doc.get_pages().into_iter().next();
    match first_page {
        Some((_page_number, page_id)) => {
            let page_dict = doc.get_object(page_id).unwrap().as_dict().unwrap();
            let resources = get_inherited(&doc, page_dict, b"Resources").unwrap_or(empty_resources);
            let content = doc.get_page_content(page_id).unwrap();
            let data = p.process_stream(&doc, content, resources);
            filter(data)
        }
        None => array![],
    }
}
/// Unit marker type used as both the source and destination unit of [`Transform`].
pub struct Space;
/// A 2D transform over `f64` whose source and destination units are both [`Space`].
pub type Transform = Transform2D<f64, Space, Space>;
/// Error type that wraps the three failure sources this module converts from
/// (see the `From` implementations below the type).
#[derive(Debug)]
pub enum OutputError
{
/// A `std::fmt` formatting failure.
FormatError(std::fmt::Error),
/// A `std::io` failure.
IoError(std::io::Error),
/// An error reported by `lopdf`.
PdfError(lopdf::Error),
}
impl std::fmt::Display for OutputError
{
fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
OutputError::FormatError(e) => write!(f, "Formating error: {}", e),
OutputError::IoError(e) => write!(f, "IO error: {}", e),
OutputError::PdfError(e) => write!(f, "PDF error: {}", e)
}
}
}
impl std::error::Error for OutputError {}
impl From<std::fmt::Error> for OutputError {
fn from(e: std::fmt::Error) -> Self {
OutputError::FormatError(e)
}
}
impl From<std::io::Error> for OutputError {
fn from(e: std::io::Error) -> Self {
OutputError::IoError(e)
}
}
impl From<lopdf::Error> for OutputError {
fn from(e: lopdf::Error) -> Self {
OutputError::PdfError(e)
}
}
// Debug-logging placeholder: takes a comma-separated list of expressions and
// expands to a block that binds each one to `_`. Nothing is printed.
//
// NOTE: the argument expressions are still evaluated, so an argument that can
// panic (an `unwrap()`, a `map[&key]` index, ...) will still panic at the
// call site even though its value is discarded.
macro_rules! dlog {
($($e:expr),*) => { {$(let _ = $e;)*} }
}
// Lookup table with 256 entries, indexed by a byte value and yielding a
// 16-bit code unit. `pdf_to_utf8` splits each entry into two big-endian bytes
// and decodes the result as UTF-16BE; the font code also uses this table as
// the default when a font supplies no encoding of its own.
// Several positions hold 0x0000 — presumably the byte values the encoding
// leaves undefined; TODO confirm against the PDF specification.
#[allow(non_upper_case_globals)]
const PDFDocEncoding: &'static [u16] = &[
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011,
0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x02d8, 0x02c7, 0x02c6,
0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, 0x0020, 0x0021, 0x0022, 0x0023,
0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035,
0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e,
0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059,
0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062,
0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b,
0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074,
0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d,
0x007e, 0x0000, 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192,
0x2044, 0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, 0x0178,
0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000, 0x20ac, 0x00a1,
0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa,
0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc,
0x00bd, 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5,
0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce,
0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00e0,
0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9,
0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x00f0, 0x00f1, 0x00f2,
0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb,
0x00fc, 0x00fd, 0x00fe, 0x00ff];
fn pdf_to_utf8(s: &[u8]) -> String {
if s.len() > 2 && s[0] == 0xfe && s[1] == 0xff {
return UTF_16BE.decode(&s[2..], DecoderTrap::Strict).unwrap();
} else {
let r: Vec<u8> = s.iter().map(|x| *x).flat_map(|x| {
let k = PDFDocEncoding[x as usize];
vec![(k >> 8) as u8, k as u8].into_iter()
}).collect();
return UTF_16BE.decode(&r, DecoderTrap::Strict).unwrap();
}
}
/// Converts `s` to a Rust `String` using the byte-to-code-unit table `encoding`.
///
/// Input that starts with the byte-order mark `FE FF` is decoded as UTF-16BE
/// (the mark itself is dropped) and `encoding` is not consulted. Any other
/// input is mapped byte-by-byte through `encoding`, whose entries are treated
/// as UTF-16 code units.
///
/// Differences from the previous implementation:
/// * a string consisting of only the byte-order mark now decodes to the empty
///   string instead of being pushed through the table (the old check was
///   `len() > 2`);
/// * malformed UTF-16 (unpaired surrogates, a trailing odd byte) is replaced
///   with U+FFFD instead of panicking.
///
/// # Panics
///
/// Panics if a byte of `s` is not a valid index into `encoding`.
fn to_utf8(encoding: &[u16], s: &[u8]) -> String {
    if s.len() >= 2 && s[0] == 0xfe && s[1] == 0xff {
        let pairs = s[2..].chunks_exact(2);
        // A leftover byte cannot form a code unit; mark it as a decoding error.
        let has_dangling_byte = !pairs.remainder().is_empty();
        let units: Vec<u16> = pairs.map(|p| u16::from_be_bytes([p[0], p[1]])).collect();
        let mut text = String::from_utf16_lossy(&units);
        if has_dangling_byte {
            text.push('\u{fffd}');
        }
        text
    } else {
        let units: Vec<u16> = s.iter().map(|&b| encoding[b as usize]).collect();
        String::from_utf16_lossy(&units)
    }
}
/// Resolves one level of indirection: if `o` is a reference, returns the
/// object it points to in `doc`; otherwise returns `o` unchanged.
///
/// # Panics
///
/// Panics with "missing object reference" if the referenced object does not
/// exist in `doc`.
fn maybe_deref<'a>(doc: &'a Document, o: &'a Object) -> &'a Object {
    if let Object::Reference(id) = o {
        doc.get_object(*id).expect("missing object reference")
    } else {
        o
    }
}
/// Looks up `key` in `dict` and resolves the entry through `maybe_deref`.
/// Returns `None` when the key is absent.
fn maybe_get_obj<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Object> {
    let entry = dict.get(key).ok();
    entry.map(|object| maybe_deref(doc, object))
}
/// Conversion from a dictionary entry that may be absent.
///
/// `obj` is the looked-up entry (if any) and `key` is the dictionary key it
/// was looked up under; implementations in this file use `key` only for the
/// panic message when a required entry is missing.
trait FromOptObj<'a> {
fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self;
}
/// Conversion from a PDF object that is present.
///
/// Returns `None` when the object does not have the shape the implementing
/// type expects.
trait FromObj<'a> where Self: std::marker::Sized {
fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self>;
}
/// Optional lookup: an absent entry, or one of the wrong shape, yields `None`.
impl<'a, T: FromObj<'a>> FromOptObj<'a> for Option<T> {
    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, _key: &[u8]) -> Self {
        match obj {
            Some(object) => T::from_obj(doc, object),
            None => None,
        }
    }
}
/// Required lookup.
///
/// # Panics
///
/// Panics with the (lossily decoded) key as the message when the entry is
/// absent, and with "wrong type" when the entry cannot be converted to `T`.
impl<'a, T: FromObj<'a>> FromOptObj<'a> for T {
    fn from_opt_obj(doc: &'a Document, obj: Option<&'a Object>, key: &[u8]) -> Self {
        let key_text = String::from_utf8_lossy(key);
        let present = obj.expect(&key_text);
        T::from_obj(doc, present).expect("wrong type")
    }
}
/// Converts a (possibly indirect) PDF array into a `Vec<T>`.
///
/// Returns `None` when the object is not an array.
///
/// # Panics
///
/// Panics with "wrong type" if any element cannot be converted to `T`.
impl<'a, T: FromObj<'a>> FromObj<'a> for Vec<T> {
    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
        let items = maybe_deref(doc, obj).as_array().ok()?;
        let converted = items
            .iter()
            .map(|item| T::from_obj(doc, item).expect("wrong type"))
            .collect();
        Some(converted)
    }
}
/// Converts the first four elements of a (possibly indirect) PDF array.
///
/// Returns `None` when the object is not an array.
///
/// # Panics
///
/// Panics if the array has fewer than four elements, or with "wrong type" if
/// one of the first four cannot be converted to `T`.
impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 4] {
    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
        let items = maybe_deref(doc, obj).as_array().ok()?;
        let mut converted = items
            .iter()
            .map(|item| T::from_obj(doc, item).expect("wrong type"));
        // Array elements are evaluated left to right, so items are pulled in order.
        let mut take = || converted.next().unwrap();
        Some([take(), take(), take(), take()])
    }
}
/// Converts the first three elements of a (possibly indirect) PDF array.
///
/// Returns `None` when the object is not an array.
///
/// # Panics
///
/// Panics if the array has fewer than three elements, or with "wrong type" if
/// one of the first three cannot be converted to `T`.
impl<'a, T: FromObj<'a>> FromObj<'a> for [T; 3] {
    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<Self> {
        let items = maybe_deref(doc, obj).as_array().ok()?;
        let mut converted = items
            .iter()
            .map(|item| T::from_obj(doc, item).expect("wrong type"));
        // Array elements are evaluated left to right, so items are pulled in order.
        let mut take = || converted.next().unwrap();
        Some([take(), take(), take()])
    }
}
/// Accepts both integer and real PDF numbers; anything else is `None`.
/// The object is inspected directly, without following references.
impl<'a> FromObj<'a> for f64 {
    fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
        match *obj {
            Object::Integer(int) => Some(int as f64),
            Object::Real(real) => Some(real as f64),
            _ => None,
        }
    }
}

/// Accepts only integer PDF numbers; anything else is `None`.
/// The object is inspected directly, without following references.
impl<'a> FromObj<'a> for i64 {
    fn from_obj(_doc: &Document, obj: &Object) -> Option<Self> {
        if let Object::Integer(int) = *obj {
            Some(int)
        } else {
            None
        }
    }
}
/// Borrows the dictionary behind a (possibly indirect) object, or `None` if
/// the resolved object is not a dictionary.
impl<'a> FromObj<'a> for &'a Dictionary {
    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
        let resolved = maybe_deref(doc, obj);
        resolved.as_dict().ok()
    }
}

/// Borrows the stream behind a (possibly indirect) object, or `None` if the
/// resolved object is not a stream.
impl<'a> FromObj<'a> for &'a Stream {
    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Stream> {
        let resolved = maybe_deref(doc, obj);
        resolved.as_stream().ok()
    }
}

/// Resolves a (possibly indirect) object; this conversion always succeeds.
impl<'a> FromObj<'a> for &'a Object {
    fn from_obj(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
        let resolved = maybe_deref(doc, obj);
        Some(resolved)
    }
}
/// Typed dictionary lookup: fetches `key` from `dict` and converts the entry
/// (or its absence) to `T` via `FromOptObj`.
///
/// Whether a missing or mistyped entry panics or yields `None` depends on the
/// `FromOptObj` implementation selected by `T`.
fn get<'a, T: FromOptObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> T {
    let entry = dict.get(key).ok();
    T::from_opt_obj(doc, entry, key)
}
/// Fetches the name stored under `key` and converts it with `pdf_to_utf8`.
///
/// # Panics
///
/// Panics with "deref" if `key` is absent and with "name" if the (resolved)
/// entry is not a name object.
fn get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> String {
    let entry = match dict.get(key) {
        Ok(object) => object,
        Err(_) => panic!("deref"),
    };
    let name = maybe_deref(doc, entry).as_name().expect("name");
    pdf_to_utf8(name)
}
/// Like `get_name_string`, but returns `None` when `key` is absent or the
/// entry is not a name.
#[allow(dead_code)]
fn maybe_get_name_string<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<String> {
    let name = maybe_get_obj(doc, dict, key)?.as_name().ok()?;
    Some(pdf_to_utf8(name))
}

/// Returns the raw bytes of the name stored under `key`, or `None` when the
/// key is absent or the entry is not a name.
fn maybe_get_name<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a [u8]> {
    let entry = maybe_get_obj(doc, dict, key)?;
    entry.as_name().ok()
}

/// Returns the array stored under `key`, or `None` when the key is absent or
/// the entry is not an array.
fn maybe_get_array<'a>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<&'a Vec<Object>> {
    let entry = maybe_get_obj(doc, dict, key)?;
    entry.as_array().ok()
}
/// Font handled by the fallback branch of `make_font` (any subtype other than
/// Type0 and Type3); character codes are single bytes.
#[derive(Clone)]
struct PdfSimpleFont<'a> {
// The font dictionary this value was built from.
font: &'a Dictionary,
// The document that owns `font`; used to resolve further lookups.
doc: &'a Document,
// Code -> UTF-16 code unit table, when one could be determined.
encoding: Option<Vec<u16>>,
// Code -> text mapping produced by `get_unicode_map`, if any.
unicode_map: Option<HashMap<u32, String>>,
// Glyph widths keyed by character code.
widths: HashMap<CharCode, f64>,
// Width used by `get_width` for codes missing from `widths`.
default_width: Option<f64>, }
/// Font created by `make_font` for the Type3 subtype; character codes are
/// single bytes.
#[derive(Clone)]
struct PdfType3Font<'a> {
// The font dictionary this value was built from.
font: &'a Dictionary,
// The document that owns `font`.
doc: &'a Document,
// Code -> UTF-16 code unit table built from the font's /Encoding entry.
encoding: Option<Vec<u16>>,
// Code -> text mapping produced by `get_unicode_map`, if any.
unicode_map: Option<HashMap<u32, String>>,
// Glyph widths keyed by character code.
widths: HashMap<CharCode, f64>, }
/// Builds the font implementation that matches the dictionary's /Subtype:
/// `PdfCIDFont` for "Type0", `PdfType3Font` for "Type3" and `PdfSimpleFont`
/// for everything else.
fn make_font<'a>(doc: &'a Document, font: &'a Dictionary) -> Rc<dyn PdfFont + 'a> {
    let subtype = get_name_string(doc, font, b"Subtype");
    dlog!("MakeFont({})", subtype);
    match subtype.as_str() {
        "Type0" => Rc::new(PdfCIDFont::new(doc, font)),
        "Type3" => Rc::new(PdfType3Font::new(doc, font)),
        _ => Rc::new(PdfSimpleFont::new(doc, font)),
    }
}
/// Reports whether `name` is one of the fourteen font names this module
/// treats as core fonts. The comparison is exact and case-sensitive.
fn is_core_font(name: &str) -> bool {
    const CORE_FONT_NAMES: [&str; 14] = [
        "Courier-Bold",
        "Courier-BoldOblique",
        "Courier-Oblique",
        "Courier",
        "Helvetica-Bold",
        "Helvetica-BoldOblique",
        "Helvetica-Oblique",
        "Helvetica",
        "Symbol",
        "Times-Bold",
        "Times-BoldItalic",
        "Times-Italic",
        "Times-Roman",
        "ZapfDingbats",
    ];
    CORE_FONT_NAMES.iter().any(|candidate| *candidate == name)
}
/// Builds a code -> UTF-16 code unit table for one of the named encodings
/// "MacRomanEncoding", "MacExpertEncoding" or "WinAnsiEncoding".
///
/// Positions without a glyph name in the source table map to 0.
///
/// # Panics
///
/// Panics on any other encoding name, and if a glyph name in the selected
/// table is unknown to `glyphnames::name_to_unicode`.
fn encoding_to_unicode_table(name: &[u8]) -> Vec<u16> {
    let glyph_names = match name {
        b"MacRomanEncoding" => encodings::MAC_ROMAN_ENCODING,
        b"MacExpertEncoding" => encodings::MAC_EXPERT_ENCODING,
        b"WinAnsiEncoding" => encodings::WIN_ANSI_ENCODING,
        _ => panic!("unexpected encoding {:?}", pdf_to_utf8(name)),
    };
    glyph_names
        .iter()
        .map(|entry| match *entry {
            Some(glyph) => glyphnames::name_to_unicode(glyph).unwrap(),
            None => 0,
        })
        .collect()
}
impl<'a> PdfSimpleFont<'a> {
    /// Builds a simple font from its font dictionary.
    ///
    /// The encoding table is taken, in order of preference, from:
    /// 1. a named /Encoding;
    /// 2. an /Encoding dictionary (its /BaseEncoding, or `PDFDocEncoding`
    ///    when absent, patched with /Differences);
    /// 3. the encoding embedded in a Type1 /FontFile;
    /// 4. WinAnsi for TrueType fonts;
    /// 5. the built-in metrics of a core font.
    ///
    /// Widths come from the core-font metrics when /BaseFont names a core
    /// font, otherwise from /FirstChar, /LastChar and /Widths.
    ///
    /// Changes from the previous implementation:
    /// * a /Differences entry whose code is missing from the ToUnicode map no
    ///   longer panics (a debug-log argument used to index the map);
    /// * an /Encoding dictionary without /Type no longer panics;
    /// * /Differences and Type1 codes outside the table are skipped instead
    ///   of panicking on an out-of-bounds write.
    ///
    /// # Panics
    ///
    /// Still panics on missing required entries (/BaseFont, /Subtype, and for
    /// non-core fonts /FirstChar, /LastChar, /Widths), on unsupported encoding
    /// names, on unexpected object types, and when /Widths does not span
    /// exactly /FirstChar..=/LastChar.
    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfSimpleFont<'a> {
        let base_name = get_name_string(doc, font, b"BaseFont");
        let subtype = get_name_string(doc, font, b"Subtype");
        let encoding: Option<&Object> = get(doc, font, b"Encoding");
        dlog!("base_name {} {} enc:{:?} {:?}", base_name, subtype, encoding, font);
        let descriptor: Option<&Dictionary> = get(doc, font, b"FontDescriptor");
        let mut type1_encoding = None;
        if let Some(descriptor) = descriptor {
            dlog!("descriptor {:?}", descriptor);
            if subtype == "Type1" {
                let file = maybe_get_obj(doc, descriptor, b"FontFile");
                match file {
                    Some(&Object::Stream(ref s)) => {
                        let s = get_contents(s);
                        type1_encoding = Some(type1_encoding_parser::get_encoding_map(&s).expect("encoding"));
                    }
                    _ => { dlog!("font file {:?}", file) }
                }
            } else if subtype == "TrueType" {
                let file = maybe_get_obj(doc, descriptor, b"FontFile2");
                match file {
                    Some(&Object::Stream(ref s)) => {
                        let _s = get_contents(s);
                    }
                    _ => { dlog!("font file {:?}", file) }
                }
            }
            let font_file3 = get::<Option<&Object>>(doc, descriptor, b"FontFile3");
            match font_file3 {
                Some(&Object::Stream(ref s)) => {
                    dlog!("font file {:?}", s);
                }
                None => {}
                _ => { dlog!("unexpected") }
            }
            let charset = maybe_get_obj(doc, descriptor, b"CharSet");
            let _charset = match charset {
                Some(&Object::String(ref s, _)) => { Some(pdf_to_utf8(&s)) }
                _ => { None }
            };
        }

        let mut unicode_map = get_unicode_map(doc, font);
        let mut encoding_table = None;
        match encoding {
            Some(&Object::Name(ref encoding_name)) => {
                dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
                encoding_table = Some(encoding_to_unicode_table(encoding_name));
            }
            Some(&Object::Dictionary(ref encoding)) => {
                let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
                    dlog!("BaseEncoding {:?}", base_encoding);
                    encoding_to_unicode_table(base_encoding)
                } else {
                    Vec::from(PDFDocEncoding)
                };
                let differences = maybe_get_array(doc, encoding, b"Differences");
                if let Some(differences) = differences {
                    dlog!("Differences");
                    // An integer sets the current code; each following name is
                    // assigned to the current code, which then advances by one.
                    let mut code = 0;
                    for o in differences {
                        let o = maybe_deref(doc, o);
                        match o {
                            &Object::Integer(i) => { code = i; }
                            &Object::Name(ref n) => {
                                let name = pdf_to_utf8(&n);
                                let unicode = glyphnames::name_to_unicode(&name);
                                if let Some(unicode) = unicode {
                                    // Codes outside the table (including negative
                                    // ones, which wrap to huge indices) are skipped.
                                    if let Some(slot) = table.get_mut(code as usize) {
                                        *slot = unicode;
                                        if let Some(ref mut unicode_map) = unicode_map {
                                            let be = [unicode];
                                            let decoded = String::from_utf16(&be).unwrap();
                                            match unicode_map.entry(code as u32) {
                                                Entry::Vacant(v) => { v.insert(decoded); }
                                                Entry::Occupied(e) => {
                                                    if e.get() != &decoded {
                                                        println!("Unicode mismatch");
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                                dlog!("{} = {} ({:?})", code, name, unicode);
                                if let Some(ref unicode_map) = unicode_map {
                                    // `dlog!` evaluates its arguments, so this must be
                                    // a non-panicking lookup: the map need not contain
                                    // every code named in /Differences.
                                    dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
                                }
                                code += 1;
                            }
                            _ => { panic!("wrong type {:?}", o); }
                        }
                    }
                }
                // /Type is only inspected for logging, so it is looked up
                // without unwrapping and a missing entry is tolerated.
                dlog!("name: {:?}", encoding.get(b"Type").ok());
                encoding_table = Some(table);
            }
            None => {
                if let Some(type1_encoding) = type1_encoding {
                    let mut table = Vec::from(PDFDocEncoding);
                    dlog!("type1encoding");
                    for (code, name) in type1_encoding {
                        let unicode = glyphnames::name_to_unicode(&pdf_to_utf8(&name));
                        if let Some(unicode) = unicode {
                            if let Some(slot) = table.get_mut(code as usize) {
                                *slot = unicode;
                            }
                        } else {
                            dlog!("unknown character {}", pdf_to_utf8(&name));
                        }
                    }
                    encoding_table = Some(table)
                } else if subtype == "TrueType" {
                    encoding_table = Some(encodings::WIN_ANSI_ENCODING.iter()
                        .map(|x| if let &Some(x) = x { glyphnames::name_to_unicode(x).unwrap() } else { 0 })
                        .collect());
                }
            }
            _ => { panic!() }
        }

        let mut width_map = HashMap::new();
        if is_core_font(&base_name) {
            for font_metrics in core_fonts::metrics().iter() {
                if font_metrics.0 == base_name {
                    if let Some(ref encoding) = encoding_table {
                        dlog!("has encoding");
                        // Assign each metric's width to every code whose table
                        // entry equals the glyph's code unit.
                        for w in font_metrics.2 {
                            let c = glyphnames::name_to_unicode(w.2).unwrap();
                            for i in 0..encoding.len() {
                                if encoding[i] == c {
                                    width_map.insert(i as CharCode, w.1 as f64);
                                }
                            }
                        }
                    } else {
                        // No encoding so far: derive one from the codes listed
                        // in the metrics themselves.
                        let mut table = vec![0; 256];
                        for w in font_metrics.2 {
                            dlog!("{} {}", w.0, w.2);
                            if w.0 != -1 {
                                table[w.0 as usize] = if base_name == "ZapfDingbats" {
                                    zapfglyphnames::zapfdigbats_names_to_unicode(w.2).unwrap_or_else(|| panic!("bad name {:?}", w))
                                } else {
                                    glyphnames::name_to_unicode(w.2).unwrap()
                                }
                            }
                        }
                        // NOTE(review): entries with code -1 are also inserted here,
                        // under the key `-1 as CharCode`; kept unchanged.
                        for w in font_metrics.2 {
                            width_map.insert(w.0 as CharCode, w.1 as f64);
                        }
                        encoding_table = Some(table);
                    }
                }
            }
        } else {
            let first_char: i64 = get(doc, font, b"FirstChar");
            let last_char: i64 = get(doc, font, b"LastChar");
            let widths: Vec<f64> = get(doc, font, b"Widths");
            let mut i = 0;
            dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
            for w in widths {
                width_map.insert((first_char + i) as CharCode, w);
                i += 1;
            }
            assert_eq!(first_char + i - 1, last_char);
        }

        PdfSimpleFont { doc, font, widths: width_map, encoding: encoding_table, default_width: None, unicode_map }
    }

    /// Returns the font dictionary's /Type name. Panics if it is missing.
    #[allow(dead_code)]
    fn get_type(&self) -> String {
        get_name_string(self.doc, self.font, b"Type")
    }

    /// Returns the font dictionary's /BaseFont name. Panics if it is missing.
    #[allow(dead_code)]
    fn get_basefont(&self) -> String {
        get_name_string(self.doc, self.font, b"BaseFont")
    }

    /// Returns the font dictionary's /Subtype name. Panics if it is missing.
    #[allow(dead_code)]
    fn get_subtype(&self) -> String {
        get_name_string(self.doc, self.font, b"Subtype")
    }

    /// Returns the raw /Widths array, if present. Panics if the entry exists
    /// but is not an array.
    #[allow(dead_code)]
    fn get_widths(&self) -> Option<&Vec<Object>> {
        maybe_get_obj(self.doc, self.font, b"Widths")
            .map(|widths| widths.as_array().expect("Widths should be an array"))
    }

    /// Returns the font dictionary's /Name entry, if it is present and a name.
    #[allow(dead_code)]
    fn get_name(&self) -> Option<String> {
        maybe_get_name_string(self.doc, self.font, b"Name")
    }

    /// Returns a wrapper around /FontDescriptor, if it is present and a dictionary.
    #[allow(dead_code)]
    fn get_descriptor(&self) -> Option<PdfFontDescriptor> {
        maybe_get_obj(self.doc, self.font, b"FontDescriptor")
            .and_then(|desc| desc.as_dict().ok())
            .map(|desc| PdfFontDescriptor { desc, doc: self.doc })
    }
}
impl<'a> PdfType3Font<'a> {
fn new(doc: &'a Document, font: &'a Dictionary) -> PdfType3Font<'a> {
let unicode_map = get_unicode_map(doc, font);
let encoding: Option<&Object> = get(doc, font, b"Encoding");
let encoding_table;
match encoding {
Some(&Object::Name(ref encoding_name)) => {
dlog!("encoding {:?}", pdf_to_utf8(encoding_name));
encoding_table = Some(encoding_to_unicode_table(encoding_name));
}
Some(&Object::Dictionary(ref encoding)) => {
let mut table = if let Some(base_encoding) = maybe_get_name(doc, encoding, b"BaseEncoding") {
dlog!("BaseEncoding {:?}", base_encoding);
encoding_to_unicode_table(base_encoding)
} else {
Vec::from(PDFDocEncoding)
};
let differences = maybe_get_array(doc, encoding, b"Differences");
if let Some(differences) = differences {
dlog!("Differences");
let mut code = 0;
for o in differences {
match o {
&Object::Integer(i) => { code = i; }
&Object::Name(ref n) => {
let name = pdf_to_utf8(&n);
let unicode = glyphnames::name_to_unicode(&name);
if let Some(unicode) = unicode {
table[code as usize] = unicode;
}
dlog!("{} = {} ({:?})", code, name, unicode);
if let Some(ref unicode_map) = unicode_map {
dlog!("{} {:?}", code, unicode_map.get(&(code as u32)));
}
code += 1;
}
_ => { panic!("wrong type"); }
}
}
}
let name_encoded = encoding.get(b"Type");
if let Ok(Object::Name(name)) = name_encoded {
dlog!("name: {}", pdf_to_utf8(name));
} else {
dlog!("name not found");
}
encoding_table = Some(table);
}
_ => { panic!() }
}
let first_char: i64 = get(doc, font, b"FirstChar");
let last_char: i64 = get(doc, font, b"LastChar");
let widths: Vec<f64> = get(doc, font, b"Widths");
let mut width_map = HashMap::new();
let mut i = 0;
dlog!("first_char {:?}, last_char: {:?}, widths: {} {:?}", first_char, last_char, widths.len(), widths);
for w in widths {
width_map.insert((first_char + i) as CharCode, w);
i += 1;
}
assert_eq!(first_char + i - 1, last_char);
PdfType3Font { doc, font, widths: width_map, encoding: encoding_table, unicode_map }
}
}
/// Character code as produced by `PdfFont::next_char`.
type CharCode = u32;
/// Iterator over the character codes of a byte string, as split by a
/// particular font.
struct PdfFontIter<'a>
{
// Remaining bytes of the string being decoded.
i: Iter<'a, u8>,
// Font that decides how many bytes make up each code.
font: &'a dyn PdfFont,
}
/// Yields `(code, byte_count)` pairs by delegating to the font's `next_char`.
impl<'a> Iterator for PdfFontIter<'a> {
type Item = (CharCode, u8);
fn next(&mut self) -> Option<(CharCode, u8)> {
self.font.next_char(&mut self.i)
}
}
/// Operations every font implementation in this file provides.
trait PdfFont: Debug {
// Width recorded for the character code `id`.
fn get_width(&self, id: CharCode) -> f64;
// Consumes the bytes of the next character code from `iter` and returns the
// code together with the number of bytes consumed; `None` at end of input.
fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)>;
// Text for a single character code.
fn decode_char(&self, char: CharCode) -> String;
}
impl<'a> dyn PdfFont + 'a {
    /// Returns an iterator that splits `chars` into this font's character codes.
    fn char_codes(&'a self, chars: &'a [u8]) -> PdfFontIter {
        PdfFontIter { i: chars.iter(), font: self }
    }

    /// Decodes a whole byte string by decoding each character code in turn
    /// and concatenating the results.
    fn decode(&self, chars: &[u8]) -> String {
        let mut text = String::new();
        for (code, _byte_count) in self.char_codes(chars) {
            text.push_str(&self.decode_char(code));
        }
        text
    }
}
impl<'a> PdfFont for PdfSimpleFont<'a> {
    /// Returns the recorded width for `id`, falling back to `default_width`.
    ///
    /// # Panics
    ///
    /// Panics if `id` has no recorded width and no default width is set.
    fn get_width(&self, id: CharCode) -> f64 {
        match self.widths.get(&id) {
            Some(width) => *width,
            None => {
                dlog!("missing width for {} falling back to default_width {:?}", id, self.font);
                self.default_width.unwrap()
            }
        }
    }

    /// Simple fonts use one byte per character code.
    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
        iter.next().map(|x| (*x as CharCode, 1))
    }

    /// Decodes one character code.
    ///
    /// The ToUnicode map is consulted first. If there is no map, or the map
    /// has no entry for `char`, the encoding table (or `PDFDocEncoding` when
    /// the font has none) is used. Previously a map without an entry for
    /// `char` caused a panic even when the encoding table could decode it.
    fn decode_char(&self, char: CharCode) -> String {
        if let Some(mapped) = self.unicode_map.as_ref().and_then(|map| map.get(&char)) {
            return mapped.clone();
        }
        dlog!("char {:?} not in unicode map, using encoding table", char);
        let slice = [char as u8];
        let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
        to_utf8(encoding, &slice)
    }
}
/// Debug output is that of the underlying font dictionary.
impl<'a> fmt::Debug for PdfSimpleFont<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Debug::fmt(self.font, f)
    }
}
impl<'a> PdfFont for PdfType3Font<'a> {
fn get_width(&self, id: CharCode) -> f64 {
let width = self.widths.get(&id);
if let Some(width) = width {
return *width;
} else {
panic!("missing width for {} {:?}", id, self.font);
}
}
fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
iter.next().map(|x| (*x as CharCode, 1))
}
fn decode_char(&self, char: CharCode) -> String {
let slice = [char as u8];
if let Some(ref unicode_map) = self.unicode_map {
let s = unicode_map.get(&char);
let s = match s {
None => { panic!("missing char {:?} in map {:?}", char, unicode_map) }
Some(s) => { s.clone() }
};
return s;
}
let encoding = self.encoding.as_ref().map(|x| &x[..]).unwrap_or(&PDFDocEncoding);
let s = to_utf8(encoding, &slice);
s
}
}
/// Debug output is that of the underlying font dictionary.
impl<'a> fmt::Debug for PdfType3Font<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Debug::fmt(self.font, f)
    }
}
/// Font created by `make_font` for the Type0 subtype; character codes are two
/// bytes wide (see its `next_char`).
struct PdfCIDFont<'a> {
// The Type0 font dictionary this value was built from.
font: &'a Dictionary,
#[allow(dead_code)]
// The document that owns `font`.
doc: &'a Document,
#[allow(dead_code)]
// Always `None` as constructed by `PdfCIDFont::new`.
encoding: Option<Vec<u16>>,
// Code -> text mapping produced by `get_unicode_map`, if any.
to_unicode: Option<HashMap<u32, String>>,
// Widths parsed from the descendant font's /W array.
widths: HashMap<CharCode, f64>,
// The descendant font's /DW value, or 1000 when absent.
default_width: Option<f64>, }
/// Reads the font's /ToUnicode entry and returns a code -> text map.
///
/// * A stream is parsed as a CMap; each target value is interpreted as
///   UTF-16BE. Targets that are not valid UTF-16 (for example a lone
///   surrogate) are left out of the map.
/// * A missing entry, or the name "Identity-H", yields `None`.
///
/// Changes from the previous implementation:
/// * the debug-log argument no longer unwraps `String::from_utf8`, which
///   panicked on streams that are not valid UTF-8 (`dlog!` evaluates its
///   arguments);
/// * invalid multi-unit UTF-16 targets are skipped like lone surrogates
///   already were, instead of panicking.
///
/// # Panics
///
/// Panics if the CMap cannot be parsed, if a target has an odd number of
/// bytes, if /ToUnicode is a name other than "Identity-H", or if it is any
/// other kind of object.
fn get_unicode_map<'a>(doc: &'a Document, font: &'a Dictionary) -> Option<HashMap<u32, String>> {
    let to_unicode = maybe_get_obj(doc, font, b"ToUnicode");
    dlog!("ToUnicode: {:?}", to_unicode);
    let mut unicode_map = None;
    match to_unicode {
        Some(&Object::Stream(ref stream)) => {
            let contents = get_contents(stream);
            dlog!("Stream: {}", String::from_utf8_lossy(&contents));
            let cmap = adobe_cmap_parser::get_unicode_map(&contents).unwrap();
            let mut unicode = HashMap::new();
            for (&k, v) in cmap.iter() {
                assert!(v.len() % 2 == 0);
                // Reassemble big-endian byte pairs into UTF-16 code units.
                let mut be: Vec<u16> = Vec::with_capacity(v.len() / 2);
                let mut i = 0;
                while i < v.len() {
                    be.push(((v[i] as u16) << 8) | v[i + 1] as u16);
                    i += 2;
                }
                // Unpaired surrogates cannot be represented in a `String`;
                // such targets are dropped rather than aborting the whole map.
                if let Ok(s) = String::from_utf16(&be) {
                    unicode.insert(k, s);
                }
            }
            unicode_map = Some(unicode);
            dlog!("map: {:?}", unicode_map);
        }
        None => {}
        Some(&Object::Name(ref name)) => {
            let name = pdf_to_utf8(name);
            assert!(name == "Identity-H");
        }
        _ => { panic!("unsupported cmap {:?}", to_unicode) }
    }
    unicode_map
}
impl<'a> PdfCIDFont<'a> {
    /// Builds a Type0 (composite) font from its font dictionary.
    ///
    /// Widths are read from the first descendant font: /DW gives the default
    /// width (1000 when absent) and /W lists individual widths in two forms:
    ///
    /// * `c [w1 w2 ... wn]` — consecutive widths starting at CID `c`;
    /// * `c_first c_last w` — width `w` for every CID in `c_first..=c_last`.
    ///
    /// Changes from the previous implementation:
    /// * the range form read all three values from the same array slot and
    ///   excluded `c_last`, so it never produced correct widths; it now reads
    ///   `c_first`, `c_last` and `w` from consecutive slots and includes
    ///   `c_last`;
    /// * a /W array that ends in the middle of an entry stops parsing instead
    ///   of panicking on an out-of-bounds index;
    /// * the debug-log argument for a stream /Encoding no longer unwraps
    ///   `String::from_utf8` (`dlog!` evaluates its arguments).
    ///
    /// # Panics
    ///
    /// Panics if /BaseFont, /DescendantFonts, /Encoding or the descendant's
    /// /FontDescriptor is missing or of the wrong type, if a named /Encoding
    /// is not "Identity-H", or if /W holds a non-numeric value where a number
    /// is expected.
    fn new(doc: &'a Document, font: &'a Dictionary) -> PdfCIDFont<'a> {
        let base_name = get_name_string(doc, font, b"BaseFont");
        let descendants = maybe_get_array(doc, font, b"DescendantFonts").expect("Descendant fonts required");
        let ciddict = maybe_deref(doc, &descendants[0]).as_dict().expect("should be CID dict");
        let encoding = maybe_get_obj(doc, font, b"Encoding").expect("Encoding required in type0 fonts");
        dlog!("base_name {} {:?}", base_name, font);

        match encoding {
            &Object::Name(ref name) => {
                let name = pdf_to_utf8(name);
                dlog!("encoding {:?}", name);
                assert!(name == "Identity-H");
            }
            &Object::Stream(ref stream) => {
                let contents = get_contents(stream);
                dlog!("Stream: {}", String::from_utf8_lossy(&contents));
            }
            _ => { panic!("unsupported encoding {:?}", encoding) }
        }

        let unicode_map = get_unicode_map(doc, font);
        dlog!("descendents {:?} {:?}", descendants, ciddict);

        let font_dict = maybe_get_obj(doc, ciddict, b"FontDescriptor").expect("required");
        dlog!("{:?}", font_dict);
        let _f = font_dict.as_dict().expect("must be dict");
        let default_width = get::<Option<i64>>(doc, ciddict, b"DW").unwrap_or(1000);
        let w: Option<Vec<&Object>> = get(doc, ciddict, b"W");
        dlog!("widths {:?}", w);

        let mut widths = HashMap::new();
        if let Some(w) = w {
            let mut i = 0;
            while i < w.len() {
                // Every entry needs at least one value after the starting CID.
                let second = match w.get(i + 1) {
                    Some(object) => *object,
                    None => break,
                };
                if let &Object::Array(ref wa) = second {
                    // Form `c [w1 w2 ... wn]`.
                    let cid = w[i].as_i64().expect("id should be num");
                    let mut j = 0;
                    dlog!("wa: {:?} -> {:?}", cid, wa);
                    for width in wa {
                        widths.insert((cid + j) as CharCode, as_num(width));
                        j += 1;
                    }
                    i += 2;
                } else {
                    // Form `c_first c_last w`.
                    let third = match w.get(i + 2) {
                        Some(object) => *object,
                        None => break,
                    };
                    let c_first = w[i].as_i64().expect("first should be num");
                    let c_last = second.as_i64().expect("last should be num");
                    let c_width = as_num(third);
                    for id in c_first..=c_last {
                        widths.insert(id as CharCode, c_width);
                    }
                    i += 3;
                }
            }
        }

        PdfCIDFont { doc, font, widths, to_unicode: unicode_map, encoding: None, default_width: Some(default_width as f64) }
    }
}
impl<'a> PdfFont for PdfCIDFont<'a> {
    /// Returns the recorded width for `id`, falling back to the default width.
    fn get_width(&self, id: CharCode) -> f64 {
        match self.widths.get(&id) {
            Some(width) => {
                dlog!("GetWidth {} -> {}", id, *width);
                *width
            }
            None => {
                dlog!("missing width for {} falling back to default_width", id);
                self.default_width.unwrap()
            }
        }
    }

    /// Reads the next two-byte, big-endian character code.
    ///
    /// Returns `None` at end of input. A single trailing byte (odd-length
    /// string) also ends iteration; previously it caused a panic.
    fn next_char(&self, iter: &mut Iter<u8>) -> Option<(CharCode, u8)> {
        let high = *iter.next()?;
        let low = *iter.next()?;
        Some((((high as u32) << 8) | low as u32, 2))
    }

    /// Looks `char` up in the ToUnicode map; codes without an entry (or fonts
    /// without a map) decode to the empty string.
    fn decode_char(&self, char: CharCode) -> String {
        let s = self.to_unicode.as_ref().and_then(|x| x.get(&char));
        if let Some(s) = s {
            s.clone()
        } else {
            dlog!("Unknown character {:?} in {:?} {:?}", char, self.font, self.to_unicode);
            "".to_string()
        }
    }
}
/// Debug output is that of the underlying font dictionary.
impl<'a> fmt::Debug for PdfCIDFont<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Debug::fmt(self.font, f)
    }
}
/// Thin wrapper pairing a font descriptor dictionary with its document.
#[derive(Copy, Clone)]
struct PdfFontDescriptor<'a> {
    desc: &'a Dictionary,
    doc: &'a Document,
}

impl<'a> PdfFontDescriptor<'a> {
    /// Returns the descriptor's /FontFile entry (resolved), if present.
    #[allow(dead_code)]
    fn get_file(&self) -> Option<&'a Object> {
        let PdfFontDescriptor { desc, doc } = *self;
        maybe_get_obj(doc, desc, b"FontFile")
    }
}

/// Debug output is that of the underlying descriptor dictionary.
impl<'a> fmt::Debug for PdfFontDescriptor<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Debug::fmt(self.desc, f)
    }
}
/// Data for a PDF function of type 0. Only `domain` and `range` are read by
/// code in this file (`Type0Func::eval`); the underscore-prefixed fields are
/// stored but not used here.
#[derive(Clone, Debug)]
struct Type0Func {
// `eval` derives the input count as `domain.len() / 2`.
domain: Vec<f64>,
// `eval` derives the output count as `range.len() / 2`.
range: Vec<f64>,
_contents: Vec<u8>,
_size: Vec<i64>,
_bits_per_sample: i64,
_encode: Vec<f64>,
_decode: Vec<f64>,
}
/// Linearly maps `x` from the interval `[x_min, x_max]` onto `[y_min, y_max]`:
///
/// `y = y_min + (x - x_min) * (y_max - y_min) / (x_max - x_min)`
///
/// Returns `y_min` when the input interval is empty (`x_max == x_min`).
///
/// The previous implementation divided by `x - x_min` instead of
/// `x_max - x_min` and ignored `x_max`, so for any `x != x_min` the terms
/// cancelled and the result was always `y_max`.
#[allow(dead_code)]
fn interpolate(x: f64, x_min: f64, x_max: f64, y_min: f64, y_max: f64) -> f64 {
    let span = x_max - x_min;
    if span != 0. {
        y_min + (x - x_min) * ((y_max - y_min) / span)
    } else {
        y_min
    }
}
impl Type0Func {
// Stub: computes the number of inputs and outputs (half the length of
// `domain` and `range` respectively) and discards them. `_input` is not read
// and `_output` is not written.
#[allow(dead_code)]
fn eval(&self, _input: &[f64], _output: &mut [f64]) {
let _n_inputs = self.domain.len() / 2;
let _n_ouputs = self.range.len() / 2;
}
}
/// Kinds of PDF function represented in this file. Only `Type0` carries data;
/// every variant is marked `dead_code`, i.e. none is constructed here.
#[derive(Clone, Debug)]
enum Function {
#[allow(dead_code)]
Type0(Type0Func),
#[allow(dead_code)]
Type3,
#[allow(dead_code)]
Type4,
}
/// Converts an integer or real PDF object to `f64`.
///
/// # Panics
///
/// Panics with "not a number" for any other object kind.
fn as_num(o: &Object) -> f64 {
    match *o {
        Object::Integer(int) => int as f64,
        Object::Real(real) => real as f64,
        _ => panic!("not a number"),
    }
}
/// Text-related state updated by the `Tf` operator in `process_stream`.
#[derive(Clone)]
struct TextState<'a>
{
// Currently selected font; `None` until the first `Tf` operator.
font: Option<Rc<dyn PdfFont + 'a>>,
// Size operand of the last `Tf` operator; initialised to NaN.
font_size: f64
}
/// Returns the stream's bytes, decompressed when possible.
///
/// If the stream reports no filter, or decompression fails, the raw content
/// is returned unchanged.
fn get_contents(contents: &Stream) -> Vec<u8> {
    if contents.filter().is_err() {
        return contents.content.clone();
    }
    match contents.decompressed_content() {
        Ok(bytes) => bytes,
        Err(_) => contents.content.clone(),
    }
}
/// State carried while interpreting a content stream; currently it only
/// holds the text state.
#[derive(Clone)]
struct GraphicsState<'a>
{
ts: TextState<'a>,
}
/// Decodes the string operand `s` with the currently selected font.
///
/// # Panics
///
/// Panics if no font has been selected yet.
fn show_text_list(gs: &mut GraphicsState, s: &[u8]) -> String {
    let current_font = gs.ts.font.as_ref().unwrap();
    current_font.decode(s)
}
/// Rectangle described by two corner points.
/// NOTE(review): the field names suggest lower-left (`ll*`) and upper-right
/// (`ur*`) corners — confirm against callers; nothing in this file reads them.
#[derive(Debug, Clone, Copy)]
pub struct MediaBox {
pub llx: f64,
pub lly: f64,
pub urx: f64,
pub ury: f64,
}
/// Path construction operations. The meaning of the numeric payloads is not
/// established in this file — presumably coordinates (and, for `Rect`,
/// position plus size); TODO confirm against the code that produces them.
#[derive(Debug)]
pub enum PathOp {
MoveTo(f64, f64),
LineTo(f64, f64),
CurveTo(f64, f64, f64, f64, f64, f64),
Rect(f64, f64, f64, f64),
Close,
}
/// Colour spaces this module can name. `ICCBased` carries a byte buffer —
/// presumably the ICC profile data; TODO confirm against the code that builds it.
#[derive(Clone)]
pub enum ColorSpace {
DeviceGray,
DeviceRGB,
DeviceCMYK,
Pattern,
ICCBased(Vec<u8>),
}
/// Content-stream interpreter. It holds no data of its own; the `PhantomData`
/// field only ties the type to the lifetime `'a` used by `process_stream`.
struct Processor<'a> {
_none: PhantomData<&'a ()>,
}
impl<'a> Processor<'a> {
    /// Creates a processor.
    fn new() -> Processor<'a> {
        Processor { _none: PhantomData }
    }

    /// Interprets a page content stream and collects the shown text.
    ///
    /// Returns a JSON array with one string per `BT` ... `ET` text object:
    /// `BT` resets the current slot to an empty string, `Tj` and the string
    /// elements of `TJ` append decoded text to it, and `ET` advances to the
    /// next slot. `Tf` selects the font (looked up in the /Font resources and
    /// cached by name) and font size. All other operators are ignored.
    ///
    /// # Panics
    ///
    /// Panics if the content cannot be decoded, if `TJ`/`Tj`/`Tf` lack their
    /// operands, if `Tj` has a non-string operand, if text is shown before a
    /// font is selected, or if the font resources are missing.
    fn process_stream(&mut self, doc: &'a Document, content: Vec<u8>, resources: &'a Dictionary) -> JsonValue {
        /// Appends `text` to the entry at `index`, via the entry's display form.
        fn append_text(list: &mut JsonValue, index: usize, text: String) {
            let combined = format!("{}{}", list[index], text);
            list[index] = combined.into();
        }

        let content = Content::decode(&content).unwrap();
        let mut font_table = HashMap::new();
        let mut gs: GraphicsState = GraphicsState {
            ts: TextState {
                font: None,
                font_size: std::f64::NAN,
            },
        };
        let mut list = array![];
        let mut index = 0;

        for operation in &content.operations {
            match operation.operator.as_ref() {
                "BT" => {
                    list[index] = String::new().into();
                }
                "ET" => {
                    index += 1;
                }
                "TJ" => {
                    if let Object::Array(ref array) = operation.operands[0] {
                        for e in array {
                            if let Object::String(ref s, _) = *e {
                                let text = show_text_list(&mut gs, s);
                                append_text(&mut list, index, text);
                            } else {
                                dlog!("kind of {:?}", e);
                            }
                        }
                    }
                }
                "Tj" => {
                    if let Object::String(ref s, _) = operation.operands[0] {
                        let text = show_text_list(&mut gs, s);
                        append_text(&mut list, index, text);
                    } else {
                        panic!("unexpected Tj operand {:?}", operation)
                    }
                }
                "Tf" => {
                    let fonts: &Dictionary = get(doc, resources, b"Font");
                    let name = operation.operands[0].as_name().unwrap();
                    // Build each font once per stream and share it afterwards.
                    let font = font_table
                        .entry(name.to_owned())
                        .or_insert_with(|| make_font(doc, get::<&Dictionary>(doc, fonts, name)))
                        .clone();
                    gs.ts.font = Some(font);
                    gs.ts.font_size = as_num(&operation.operands[1]);
                    dlog!("font {} size: {} {:?}", pdf_to_utf8(name), gs.ts.font_size, operation);
                }
                _ => {}
            }
        }
        list
    }
}
/// Conversion of an output target into something implementing `std::fmt::Write`.
pub trait ConvertToFmt {
    /// The `fmt::Write` implementation produced by [`ConvertToFmt::convert`].
    type Writer: std::fmt::Write;
    /// Consumes the target and returns its `fmt::Write` view.
    fn convert(self) -> Self::Writer;
}

/// A `String` already implements `fmt::Write`, so it is passed through as is.
impl<'a> ConvertToFmt for &'a mut String {
    type Writer = &'a mut String;
    fn convert(self) -> Self::Writer {
        self
    }
}

/// Adapts an `io::Write` target to `fmt::Write`.
pub struct WriteAdapter<W> {
    f: W,
}

impl<W: std::io::Write> std::fmt::Write for WriteAdapter<W> {
    /// Writes all of `s`; any I/O error is reported as `fmt::Error`.
    fn write_str(&mut self, s: &str) -> Result<(), std::fmt::Error> {
        match self.f.write_all(s.as_bytes()) {
            Ok(()) => Ok(()),
            Err(_) => Err(fmt::Error),
        }
    }
}

/// Any `io::Write` trait object is wrapped in a [`WriteAdapter`].
impl<'a> ConvertToFmt for &'a mut dyn std::io::Write {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}

/// A file is wrapped in a [`WriteAdapter`].
impl<'a> ConvertToFmt for &'a mut File {
    type Writer = WriteAdapter<Self>;
    fn convert(self) -> Self::Writer {
        WriteAdapter { f: self }
    }
}
/// Looks `key` up in `dict` and, when it is absent there, in the dictionary
/// referenced by /Parent, repeating up the chain.
///
/// Returns `None` once a dictionary has neither the key nor a /Parent entry
/// that is a reference to a dictionary.
fn get_inherited<'a, T: FromObj<'a>>(doc: &'a Document, dict: &'a Dictionary, key: &[u8]) -> Option<T> {
    match get::<Option<T>>(doc, dict, key) {
        Some(value) => Some(value),
        None => {
            let parent_id = dict.get(b"Parent").and_then(|parent| parent.as_reference());
            let parent = parent_id.and_then(|id| doc.get_dictionary(id)).ok()?;
            get_inherited(doc, parent, key)
        }
    }
}
/// Selects entries from `data` by the byte length and content of their
/// string form and returns them as a new JSON array.
///
/// For each member, in order:
/// * entries shorter than 14 bytes, or containing "mm", are skipped;
/// * a 20-byte entry is stored at index 0 of the result;
/// * an 18-byte entry is appended only if the most recently appended entry
///   was a "公司"/"店" match that has not been followed by one yet;
/// * any other entry is appended when it contains "公司" or "店", and marks
///   that the next 18-byte entry should be appended too.
pub fn filter(data: JsonValue) -> JsonValue {
    let mut row = array![];
    let mut com = false;
    for item in data.members() {
        let text = item.to_string();
        let length = text.len();
        if length < 14 || text.contains("mm") {
            continue;
        }
        if length == 18 {
            if com {
                row.push(text).unwrap();
                com = false;
            }
        } else if length == 20 {
            row[0] = text.into();
        } else if text.contains("公司") || text.contains("店") {
            com = true;
            row.push(text).unwrap();
        }
    }
    row
}