1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
use pom::{Input, DataInput};
use std::cmp;
use std::io::{Result, Read, Error, ErrorKind};
use std::path::Path;
use std::fs::File;
use super::{Document, Object, ObjectId};
use super::parser;
use xref::XrefEntry;
use object_stream::ObjectStream;
impl Document {
    /// Load a PDF document from the file at `path`.
    ///
    /// The read buffer is pre-sized from the file's metadata length before
    /// the contents are read and parsed.
    #[inline]
    pub fn load<P: AsRef<Path>>(path: P) -> Result<Document> {
        let source = File::open(path)?;
        let expected_len = source.metadata()?.len() as usize;
        Self::load_internal(source, Vec::with_capacity(expected_len))
    }

    /// Load a PDF document from any `Read` source.
    ///
    /// The source is read to its end into a buffer that starts out empty.
    #[inline]
    pub fn load_from<R: Read>(source: R) -> Result<Document> {
        Self::load_internal(source, Vec::<u8>::new())
    }

    /// Read `source` to its end into `bytes`, then run a `Reader` over the
    /// collected data and return the document it populated.
    fn load_internal<R: Read>(mut source: R, mut bytes: Vec<u8>) -> Result<Document> {
        source.read_to_end(&mut bytes)?;

        let mut pdf_reader = Reader {
            buffer: bytes,
            document: Document::new(),
        };
        pdf_reader.read()?;

        Ok(pdf_reader.document)
    }
}
/// Parsing state used while loading a PDF: the raw file bytes plus the
/// `Document` that is filled in from them.
pub struct Reader {
// Complete contents of the source, read into memory before parsing starts.
buffer: Vec<u8>,
// Document under construction; `read` stores the version, trailer,
// reference table and objects into it.
document: Document,
}
impl Reader {
/// Parse the whole buffer into `self.document`.
///
/// In order: parse the header to obtain the version, locate and parse the
/// newest cross-reference section with its trailer, follow the trailer's
/// `Prev` chain (and an `XRefStm` stream, if present) merging older sections
/// into the table, then load every object whose table entry is a normal one.
/// Streams whose dictionary type is `ObjStm` are additionally registered in
/// `self.document.streams` under their object number.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidData` when the header, a cross-reference
/// section or a referenced object cannot be parsed, or when a
/// cross-reference offset read from the file lies outside the buffer.
fn read(&mut self) -> Result<()> {
    // Offsets stored in the file are untrusted: accept only values that are
    // non-negative and not past the end of the data before seeking to them.
    fn checked_offset(raw: i64, len: usize) -> Option<usize> {
        if raw >= 0 && (raw as u64) <= (len as u64) {
            Some(raw as usize)
        } else {
            None
        }
    }

    let mut input = DataInput::new(&self.buffer);
    let version = parser::header().parse(&mut input)
        .map_err(|_|Error::new(ErrorKind::InvalidData, "Not a valid PDF file (header)."))?;

    let xref_start = Self::get_xref_start(&self.buffer, &mut input)?;
    if xref_start > self.buffer.len() {
        return Err(Error::new(ErrorKind::InvalidData, "Not a valid PDF file (xref_and_trailer)."));
    }
    input.jump_to(xref_start);
    let (mut xref, mut trailer) = parser::xref_and_trailer(&self).parse(&mut input)
        .map_err(|_|Error::new(ErrorKind::InvalidData, "Not a valid PDF file (xref_and_trailer)."))?;

    // Remember every section offset already parsed so that a `Prev` chain
    // pointing back at itself terminates instead of looping forever.
    let mut visited = ::std::collections::HashSet::new();
    visited.insert(xref_start);

    let mut prev_xref_start = trailer.remove("Prev");
    while let Some(prev) = prev_xref_start.and_then(|offset|offset.as_i64()) {
        let prev_offset = checked_offset(prev, self.buffer.len())
            .ok_or_else(|| Error::new(ErrorKind::InvalidData, "Not a valid PDF file (prev xref_and_trailer)."))?;
        if !visited.insert(prev_offset) {
            break;
        }
        input.jump_to(prev_offset);
        let (prev_xref, mut prev_trailer) = parser::xref_and_trailer(&self).parse(&mut input)
            .map_err(|_|Error::new(ErrorKind::InvalidData, "Not a valid PDF file (prev xref_and_trailer)."))?;
        xref.extend(prev_xref);

        // NOTE(review): `XRefStm` is taken from the newest `trailer`, not from
        // `prev_trailer`, so it can only be found on the first iteration.
        // Kept as in the original; confirm this is the intended lookup.
        let prev_xref_stream_start = trailer.remove("XRefStm");
        if let Some(prev) = prev_xref_stream_start.and_then(|offset|offset.as_i64()) {
            let stream_offset = checked_offset(prev, self.buffer.len())
                .ok_or_else(|| Error::new(ErrorKind::InvalidData, "Not a valid PDF file (prev xref_and_trailer)."))?;
            input.jump_to(stream_offset);
            let (prev_xref, _) = parser::xref_and_trailer(&self).parse(&mut input)
                .map_err(|_|Error::new(ErrorKind::InvalidData, "Not a valid PDF file (prev xref_and_trailer)."))?;
            xref.extend(prev_xref);
        }

        prev_xref_start = prev_trailer.remove("Prev");
    }

    self.document.version = version;
    // A declared size of 0 must not underflow when deriving the highest id.
    self.document.max_id = xref.size.saturating_sub(1);
    self.document.trailer = trailer;
    self.document.reference_table = xref;

    for entry in self.document.reference_table.entries.values().filter(|entry|entry.is_normal()) {
        if let XrefEntry::Normal{offset, ..} = *entry {
            let (object_id, mut object) = self.read_object(offset as usize)?;
            if let Object::Stream(ref mut stream) = object {
                if stream.dict.type_is(b"ObjStm") {
                    self.document.streams.insert(object_id.0, ObjectStream::new(stream));
                }
            }
            self.document.objects.insert(object_id, object);
        }
    }

    Ok(())
}
/// Look up the byte offset recorded for object `id`.
///
/// Yields `Some(offset)` only when the reference table holds a normal entry
/// for the object number `id.0` whose generation equals `id.1`; every other
/// case (no entry, another entry kind, generation mismatch) yields `None`.
fn get_offset(&self, id: ObjectId) -> Option<u32> {
    match self.document.reference_table.get(id.0) {
        Some(&XrefEntry::Normal{offset, generation}) if id.1 == generation => Some(offset),
        _ => None,
    }
}
/// Parse and return the object identified by `id`.
///
/// Returns `None` when no offset is known for `id` or when the data at that
/// offset fails to parse; the parse error itself is discarded.
pub fn get_object(&self, id: ObjectId) -> Option<Object> {
    self.get_offset(id)
        .and_then(|position| self.read_object(position as usize).ok())
        .map(|(_, object)| object)
}
/// Parse the indirect object that starts at byte `offset` of the buffer.
///
/// Returns the object's id together with the parsed object.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidData` when `offset` lies beyond the end of the
/// buffer or when the data at `offset` is not a valid indirect object.
fn read_object(&self, offset: usize) -> Result<(ObjectId, Object)> {
    // The offset comes from the file's cross-reference data, so validate it
    // before seeking rather than relying on how the input treats an
    // out-of-range position.
    if offset > self.buffer.len() {
        return Err(Error::new(
            ErrorKind::InvalidData,
            format!("Not a valid PDF file (read object at {}).\noffset is past the end of the data", offset),
        ));
    }

    let mut input = DataInput::new(&self.buffer);
    input.jump_to(offset);
    parser::indirect_object(self).parse(&mut input)
        .map_err(|err|Error::new(ErrorKind::InvalidData, format!("Not a valid PDF file (read object at {}).\n{:?}", offset, err)))
}
/// Find the cross-reference start offset announced at the end of the file.
///
/// Looks for the last `%%EOF` marker within the final 512 bytes of `buffer`,
/// then for the `startxref` keyword beginning at most 25 bytes before that
/// marker, and finally parses the offset through `input`.
///
/// The last marker is used (not the first) so that when several revisions
/// end inside the scanned tail, the newest `startxref` is the one returned.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidData` when the marker, the keyword or a
/// parsable offset cannot be found.
fn get_xref_start(buffer: &[u8], input: &mut Input<u8>) -> Result<usize> {
    const EOF_MARKER: &'static [u8] = b"%%EOF";
    // Only the tail of the file is scanned for the end-of-file marker.
    const TAIL_LEN: usize = 512;
    // How far before the marker the `startxref` search begins; presumably
    // sized to cover the keyword, the offset digits and line breaks.
    const STARTXREF_LOOKBACK: usize = 25;

    let tail_start = buffer.len() - cmp::min(buffer.len(), TAIL_LEN);
    buffer[tail_start..]
        .windows(EOF_MARKER.len())
        .rposition(|window| window == EOF_MARKER)
        .map(|relative| tail_start + relative)
        // `saturating_sub` keeps the search start at 0 when the marker sits
        // in the first few bytes instead of underflowing.
        .and_then(|eof_pos| Self::search_substring(buffer, b"startxref", eof_pos.saturating_sub(STARTXREF_LOOKBACK)))
        .and_then(|xref_pos| {
            input.jump_to(xref_pos);
            match parser::xref_start().parse(input) {
                Ok(startxref) => Some(startxref as usize),
                _ => None,
            }
        })
        .ok_or_else(|| Error::new(ErrorKind::InvalidData, "Not a valid PDF file (xref_start)."))
}
/// Return the index of the first occurrence of `pattern` in `buffer` that
/// starts at or after `start_pos`.
///
/// Returns `None` when there is no such occurrence, when `pattern` is empty,
/// or when `start_pos` is not a valid index into `buffer`.
fn search_substring(buffer: &[u8], pattern: &[u8], start_pos: usize) -> Option<usize> {
    // `windows(0)` would panic and slicing past the end would too; both
    // situations simply mean "not found".
    if pattern.is_empty() || start_pos >= buffer.len() {
        return None;
    }

    buffer[start_pos..]
        .windows(pattern.len())
        .position(|candidate| candidate == pattern)
        .map(|relative| start_pos + relative)
}
}
// Smoke test: load the bundled example file, check the parsed header
// version, and write the document back out.
#[test]
fn load_document() {
    let source_path = "assets/example.pdf";
    let mut document = Document::load(source_path).unwrap();
    assert_eq!(document.version, "1.5");

    let output_path = "test_2_load.pdf";
    document.save(output_path).unwrap();
}