1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
use pom::{Input, DataInput};
use std::cmp;
use std::io::{Result, Read, Error, ErrorKind};
use std::path::Path;
use std::fs::File;

use super::{Document, Object, ObjectId};
use super::parser;
use xref::XrefEntry;
use object_stream::ObjectStream;

impl Document {

	/// Open the file at `path` and parse it as a PDF document.
	///
	/// The read buffer is pre-sized from the file's metadata so the whole file
	/// can be pulled into memory without repeated reallocation.
	#[inline]
	pub fn load<P: AsRef<Path>>(path: P) -> Result<Document> {
		let file = File::open(path)?;
		let size_hint = file.metadata()?.len() as usize;
		Self::load_internal(file, Vec::with_capacity(size_hint))
	}

	/// Parse a PDF document from any `Read` source.
	///
	/// The source length is unknown up front, so the buffer starts empty and
	/// grows as the source is drained.
	#[inline]
	pub fn load_from<R: Read>(source: R) -> Result<Document> {
		Self::load_internal(source, Vec::<u8>::new())
	}

	/// Drain `source` into `buffer`, then run a `Reader` over the collected bytes.
	///
	/// Any I/O error from reading, or parse error from `Reader::read`, is
	/// returned unchanged.
	fn load_internal<R: Read>(mut source: R, mut buffer: Vec<u8>) -> Result<Document> {
		source.read_to_end(&mut buffer)?;

		let mut reader = Reader {
			buffer: buffer,
			document: Document::new(),
		};

		// Hand the populated document back only when parsing succeeded.
		match reader.read() {
			Ok(()) => Ok(reader.document),
			Err(err) => Err(err),
		}
	}
}

/// Parsing state used while loading a PDF: the raw bytes plus the document being built.
pub struct Reader {
	/// Complete contents of the source, read into memory before parsing starts.
	buffer: Vec<u8>,
	/// Document filled in by `read` and returned to the caller once loading succeeds.
	document: Document,
}

impl Reader {
	/// Read the whole document from `self.buffer` into `self.document`.
	///
	/// Parses the header, jumps to the cross-reference section named by
	/// `startxref`, merges every older section reachable through `Prev`, and then
	/// loads each object that the merged table lists as `XrefEntry::Normal`.
	/// Streams whose type is `ObjStm` are additionally registered in
	/// `self.document.streams`.
	///
	/// # Errors
	///
	/// Returns an `ErrorKind::InvalidData` error when the header, the `startxref`
	/// marker, any cross-reference section or any referenced object fails to parse.
	fn read(&mut self) -> Result<()> {
		let mut input = DataInput::new(&self.buffer);
		// The document structure can be expressed in PEG as:
		//   document <- header indirect_object* xref trailer xref_start
		let version = parser::header().parse(&mut input)
			.map_err(|_|Error::new(ErrorKind::InvalidData, "Not a valid PDF file (header)."))?;

		let xref_start = Self::get_xref_start(&self.buffer, &mut input)?;
		input.jump_to(xref_start);

		let (mut xref, mut trailer) = parser::xref_and_trailer(&self).parse(&mut input)
			.map_err(|_|Error::new(ErrorKind::InvalidData, "Not a valid PDF file (xref_and_trailer)."))?;

		// Read previous Xrefs of linearized or incremental updated document.
		// Offsets already followed are remembered so that a malformed file whose
		// `Prev` entries form a cycle cannot keep this loop running forever.
		let mut visited_offsets = Vec::new();
		let mut prev_xref_start = trailer.remove("Prev");
		while let Some(prev) = prev_xref_start.and_then(|offset|offset.as_i64()) {
			if visited_offsets.contains(&prev) {
				break;
			}
			visited_offsets.push(prev);

			input.jump_to(prev as usize);
			let (prev_xref, mut prev_trailer) = parser::xref_and_trailer(&self).parse(&mut input)
				.map_err(|_|Error::new(ErrorKind::InvalidData, "Not a valid PDF file (prev xref_and_trailer)."))?;
			xref.extend(prev_xref);

			// Read xref stream in hybrid-reference file
			// NOTE(review): this consults the newest trailer, not `prev_trailer`, so the
			// lookup can only succeed on the first iteration — confirm this is intended.
			let prev_xref_stream_start = trailer.remove("XRefStm");
			if let Some(prev) = prev_xref_stream_start.and_then(|offset|offset.as_i64()) {
				input.jump_to(prev as usize);
				let (prev_xref, _) = parser::xref_and_trailer(&self).parse(&mut input)
					.map_err(|_|Error::new(ErrorKind::InvalidData, "Not a valid PDF file (prev xref_and_trailer)."))?;
				xref.extend(prev_xref);
			}

			prev_xref_start = prev_trailer.remove("Prev");
		}

		self.document.version = version;
		// A declared size of zero must not underflow the highest object id.
		self.document.max_id = xref.size.saturating_sub(1);
		self.document.trailer = trailer;
		self.document.reference_table = xref;

		for entry in self.document.reference_table.entries.values().filter(|entry|entry.is_normal()) {
			if let XrefEntry::Normal{offset, ..} = *entry {
				let (object_id, mut object) = self.read_object(offset as usize)?;

				// Object streams are also indexed by object number in `streams`.
				if let Object::Stream(ref mut stream) = object {
					if stream.dict.type_is(b"ObjStm") {
						self.document.streams.insert(object_id.0, ObjectStream::new(stream));
					}
				}

				self.document.objects.insert(object_id, object);
			}
		}

		Ok(())
	}

	/// Look up the byte offset recorded for `id` in the reference table.
	///
	/// Returns `None` when the table has no entry for the object number, when the
	/// entry is not `XrefEntry::Normal`, or when its generation differs from `id.1`.
	fn get_offset(&self, id: ObjectId) -> Option<u32> {
		match self.document.reference_table.get(id.0) {
			Some(&XrefEntry::Normal{offset, generation}) if id.1 == generation => Some(offset),
			_ => None,
		}
	}

	/// Parse and return the object identified by `id`.
	///
	/// Yields `None` both when no offset is known for `id` and when the bytes at
	/// that offset fail to parse; the parse error itself is discarded.
	pub fn get_object(&self, id: ObjectId) -> Option<Object> {
		self.get_offset(id)
			.and_then(|offset| self.read_object(offset as usize).ok())
			.map(|(_, object)| object)
	}

	/// Parse the indirect object that starts at byte `offset` of the buffer.
	///
	/// A fresh input cursor is created for every call, so reads are independent of
	/// each other. Parse failures are reported as `ErrorKind::InvalidData`, with
	/// the offset and the parser's debug output in the message.
	fn read_object(&self, offset: usize) -> Result<(ObjectId, Object)> {
		let mut cursor = DataInput::new(&self.buffer);
		cursor.jump_to(offset);

		match parser::indirect_object(self).parse(&mut cursor) {
			Ok(parsed) => Ok(parsed),
			Err(err) => {
				let message = format!("Not a valid PDF file (read object at {}).\n{:?}", offset, err);
				Err(Error::new(ErrorKind::InvalidData, message))
			},
		}
	}

	fn get_xref_start(buffer: &[u8], input: &mut Input<u8>) -> Result<usize> {
		let seek_pos = buffer.len() - cmp::min(buffer.len(), 512);
		Self::search_substring(buffer, b"%%EOF", seek_pos)
			.and_then(|eof_pos| Self::search_substring(buffer, b"startxref", eof_pos - 25))
			.and_then(|xref_pos| {
				input.jump_to(xref_pos);
				match parser::xref_start().parse(input) {
					Ok(startxref) => Some(startxref as usize),
					_ => None,
				}
			})
			.ok_or(Error::new(ErrorKind::InvalidData, "Not a valid PDF file (xref_start)."))
	}

	/// Return the index of the first occurrence of `pattern` in `buffer` that
	/// begins at or after `start_pos`.
	///
	/// Returns `None` when there is no such occurrence, when `pattern` is empty,
	/// or when `start_pos` lies beyond the end of `buffer`.
	fn search_substring(buffer: &[u8], pattern: &[u8], start_pos: usize) -> Option<usize> {
		// `windows(0)` panics and an out-of-range slice start panics, so both
		// cases are answered up front.
		if pattern.is_empty() || start_pos > buffer.len() {
			return None;
		}

		buffer[start_pos..]
			.windows(pattern.len())
			.position(|candidate| candidate == pattern)
			.map(|relative| start_pos + relative)
	}
}

// Smoke test: loads `assets/example.pdf`, checks the version parsed from its
// header, then writes the document back out to `test_2_load.pdf`.
// Both paths are relative, so they resolve against the directory the test runs in.
#[test]
fn load_document() {

	let mut doc = Document::load("assets/example.pdf").unwrap();
	assert_eq!(doc.version, "1.5");
	doc.save("test_2_load.pdf").unwrap();
}