use std::cmp::Ordering;
use std::collections::BTreeMap;
use crate::units::Length;
use crate::{LoError, Result};
/// Escapes the characters that are special inside a PDF literal string.
///
/// A backslash is inserted in front of every backslash, `(` and `)`;
/// all other characters are copied through unchanged.
fn pdf_escape(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        if matches!(ch, '\\' | '(' | ')') {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
/// Builds a one-page PDF that shows `lines` as Helvetica 12pt text.
///
/// The text starts 50pt from the left edge and 50pt below the top edge of a
/// page whose size is `page_width` x `page_height`, with a 14pt leading
/// between consecutive lines. Returns the serialized PDF bytes.
pub fn write_text_pdf(lines: &[String], page_width: Length, page_height: Length) -> Vec<u8> {
    let width_pt = page_width.as_pt();
    let height_pt = page_height.as_pt();

    // One `Tj` per line; `T*` between lines moves to the next line.
    let shown: Vec<String> = lines
        .iter()
        .map(|line| format!("({}) Tj\n", pdf_escape(line)))
        .collect();
    let content = format!(
        "BT\n/F1 12 Tf\n14 TL\n50 {:.2} Td\n{}ET\n",
        height_pt - 50.0,
        shown.join("T*\n")
    );

    let catalog = "<< /Type /Catalog /Pages 2 0 R >>".to_string();
    let page_tree = "<< /Type /Pages /Kids [3 0 R] /Count 1 >>".to_string();
    let page = format!(
        "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 {width_pt:.2} {height_pt:.2}] /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>"
    );
    let stream = format!("<< /Length {} >>\nstream\n{}endstream", content.len(), content);
    let font = "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>".to_string();

    // Object numbers are positional: catalog=1, pages=2, page=3, contents=4, font=5.
    pdf_from_objects(&[catalog, page_tree, page, stream, font])
}
/// Wraps pre-serialized object bodies into a complete PDF 1.4 file.
///
/// Object `i` (0-based) is written as indirect object `i + 1 0`, followed by
/// a classic cross-reference table and a trailer whose `/Root` is object 1.
pub fn pdf_from_objects(objects: &[String]) -> Vec<u8> {
    let entry_count = objects.len() + 1;

    // Header plus the customary binary comment line.
    let mut pdf: Vec<u8> = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n".to_vec();

    // The xref section is built alongside the bodies so that each entry
    // records the byte offset at which its object starts.
    let mut xref = format!("xref\n0 {entry_count}\n0000000000 65535 f \n");
    for (number, body) in (1usize..).zip(objects) {
        xref.push_str(&format!("{:010} 00000 n \n", pdf.len()));
        pdf.extend_from_slice(format!("{number} 0 obj\n{body}\nendobj\n").as_bytes());
    }

    let xref_pos = pdf.len();
    pdf.extend_from_slice(xref.as_bytes());
    pdf.extend_from_slice(
        format!(
            "trailer\n<< /Size {entry_count} /Root 1 0 R >>\nstartxref\n{xref_pos}\n%%EOF\n"
        )
        .as_bytes(),
    );
    pdf
}
/// Identifier of an indirect PDF object: an object number paired with a
/// generation number (serialized elsewhere in this file as `<object> <generation> R`).
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct PdfObjectId {
/// Object number.
pub object: u32,
/// Generation number.
pub generation: u16,
}
impl PdfObjectId {
/// Builds an identifier from its two components; usable in `const` contexts.
pub const fn new(object: u32, generation: u16) -> Self {
Self { object, generation }
}
}
/// In-memory representation of a PDF object value.
#[derive(Clone, Debug, PartialEq)]
pub enum PdfValue {
/// The `null` object.
Null,
/// A boolean, serialized as `true` / `false`.
Bool(bool),
/// A numeric value; integers and reals share this variant.
Number(f64),
/// A name, stored without the leading `/`.
Name(String),
/// A string object, kept as raw bytes.
String(Vec<u8>),
/// An array of values.
Array(Vec<PdfValue>),
/// A dictionary keyed by name (without the leading `/`).
Dict(BTreeMap<String, PdfValue>),
/// A stream: a dictionary plus its data bytes.
Stream(PdfStream),
/// An indirect reference to another object.
Ref(PdfObjectId),
}
impl PdfValue {
fn as_dict(&self) -> Option<&BTreeMap<String, PdfValue>> {
match self {
Self::Dict(dict) => Some(dict),
Self::Stream(stream) => Some(&stream.dict),
_ => None,
}
}
fn as_name(&self) -> Option<&str> {
match self {
Self::Name(name) => Some(name.as_str()),
_ => None,
}
}
fn as_number(&self) -> Option<f64> {
match self {
Self::Number(value) => Some(*value),
_ => None,
}
}
fn as_ref(&self) -> Option<PdfObjectId> {
match self {
Self::Ref(id) => Some(*id),
_ => None,
}
}
}
/// A PDF stream object: its dictionary and its data bytes.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct PdfStream {
/// Stream dictionary (the serializer in this file overwrites `Length` on output).
pub dict: BTreeMap<String, PdfValue>,
/// Stream data bytes as stored (before any filter decoding).
pub data: Vec<u8>,
}
/// Incrementally assembles a PDF file from `PdfValue` objects.
#[derive(Clone, Debug, Default)]
pub struct PdfBuilder {
// Serialized bodies; index `i` holds indirect object `i + 1`.
objects: Vec<Vec<u8>>,
// Object used as the trailer `/Root`; required by `finish`.
root: Option<PdfObjectId>,
// Optional object used as the trailer `/Info`.
info: Option<PdfObjectId>,
}
impl PdfBuilder {
    /// Creates an empty builder with no objects, root or info.
    pub fn new() -> Self {
        Self {
            objects: Vec::new(),
            root: None,
            info: None,
        }
    }

    /// Serializes `value`, appends it as the next indirect object and
    /// returns its id (object numbers start at 1, generation is always 0).
    pub fn add_object(&mut self, value: PdfValue) -> PdfObjectId {
        self.objects.push(serialize_pdf_value(&value));
        PdfObjectId::new(self.objects.len() as u32, 0)
    }

    /// Selects the object referenced by the trailer's `/Root` entry.
    pub fn set_root(&mut self, id: PdfObjectId) {
        self.root = Some(id);
    }

    /// Selects the object referenced by the trailer's `/Info` entry.
    pub fn set_info(&mut self, id: PdfObjectId) {
        self.info = Some(id);
    }

    /// Writes the header, all objects, the xref table and the trailer.
    ///
    /// # Errors
    /// Returns `LoError::InvalidInput` when no root object was set.
    pub fn finish(self) -> Result<Vec<u8>> {
        let Some(root) = self.root else {
            return Err(LoError::InvalidInput(
                "pdf builder missing catalog root".to_string(),
            ));
        };
        let entry_count = self.objects.len() + 1;

        let mut out: Vec<u8> = b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n".to_vec();
        let mut offsets = Vec::with_capacity(self.objects.len());
        for (number, body) in (1usize..).zip(&self.objects) {
            offsets.push(out.len());
            out.extend_from_slice(format!("{number} 0 obj\n").as_bytes());
            out.extend_from_slice(body);
            // `endobj` must start on its own line.
            if out.last() != Some(&b'\n') {
                out.push(b'\n');
            }
            out.extend_from_slice(b"endobj\n");
        }

        let xref_pos = out.len();
        out.extend_from_slice(format!("xref\n0 {entry_count}\n").as_bytes());
        out.extend_from_slice(b"0000000000 65535 f \n");
        for offset in &offsets {
            out.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
        }

        let mut trailer = BTreeMap::from([
            ("Size".to_string(), PdfValue::Number(entry_count as f64)),
            ("Root".to_string(), PdfValue::Ref(root)),
        ]);
        if let Some(info) = self.info {
            trailer.insert("Info".to_string(), PdfValue::Ref(info));
        }
        out.extend_from_slice(b"trailer\n");
        out.extend_from_slice(&serialize_pdf_value(&PdfValue::Dict(trailer)));
        out.extend_from_slice(format!("\nstartxref\n{xref_pos}\n%%EOF\n").as_bytes());
        Ok(out)
    }
}
/// Serializes a `PdfValue` to its PDF syntax.
///
/// Numbers without a fractional part are written as integers. Streams get
/// their `Length` entry set to the data length, and a newline is inserted
/// before `endstream` when the data does not already end with one.
fn serialize_pdf_value(value: &PdfValue) -> Vec<u8> {
    match value {
        PdfValue::Null => b"null".to_vec(),
        PdfValue::Bool(true) => b"true".to_vec(),
        PdfValue::Bool(false) => b"false".to_vec(),
        PdfValue::Number(number) if number.fract() == 0.0 => {
            format!("{number:.0}").into_bytes()
        }
        PdfValue::Number(number) => number.to_string().into_bytes(),
        PdfValue::Name(name) => format!("/{}", escape_pdf_name(name)).into_bytes(),
        PdfValue::String(bytes) => serialize_pdf_string(bytes),
        PdfValue::Array(items) => {
            let parts: Vec<Vec<u8>> = items.iter().map(serialize_pdf_value).collect();
            let mut out = vec![b'['];
            out.extend(parts.join(&b' '));
            out.push(b']');
            out
        }
        PdfValue::Dict(dict) => serialize_pdf_dict(dict),
        PdfValue::Stream(stream) => {
            // Always emit the real data length, whatever the caller stored.
            let mut dict = stream.dict.clone();
            dict.insert(
                "Length".to_string(),
                PdfValue::Number(stream.data.len() as f64),
            );
            let mut out = serialize_pdf_dict(&dict);
            out.extend_from_slice(b"\nstream\n");
            out.extend_from_slice(&stream.data);
            if stream.data.last() != Some(&b'\n') {
                out.push(b'\n');
            }
            out.extend_from_slice(b"endstream");
            out
        }
        PdfValue::Ref(id) => format!("{} {} R", id.object, id.generation).into_bytes(),
    }
}
/// Serializes a dictionary as `<< /Key value ... >>`, in the map's key order.
fn serialize_pdf_dict(dict: &BTreeMap<String, PdfValue>) -> Vec<u8> {
    let mut out = b"<<".to_vec();
    for (key, value) in dict {
        out.extend_from_slice(b" /");
        out.extend_from_slice(escape_pdf_name(key).as_bytes());
        out.push(b' ');
        out.extend(serialize_pdf_value(value));
    }
    out.extend_from_slice(b" >>");
    out
}
/// Escapes a name (without its leading `/`) for output as a PDF name object.
///
/// Delimiters, `#`, and every byte outside the printable range `!`..=`~`
/// are written as `#XX` (two uppercase hex digits); all other bytes are
/// copied as-is. Escaping works on the UTF-8 bytes of `name`, so non-ASCII
/// text round-trips byte-for-byte instead of being re-encoded, and control
/// bytes such as NUL or form feed can no longer terminate the name early.
fn escape_pdf_name(name: &str) -> String {
    let mut out = String::with_capacity(name.len());
    for &byte in name.as_bytes() {
        let is_special = matches!(
            byte,
            b'#' | b'/' | b'%' | b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}'
        );
        if is_special || !(0x21..=0x7E).contains(&byte) {
            out.push_str(&format!("#{byte:02X}"));
        } else {
            // Printable ASCII: the byte is its own character.
            out.push(char::from(byte));
        }
    }
    out
}
/// Serializes string bytes as a PDF string object.
///
/// Strings made only of printable ASCII (0x20..=0x7E) and containing no
/// `(`, `)` or `\` are written as a literal string `(...)`; anything else
/// is written as an uppercase hex string `<...>`.
fn serialize_pdf_string(bytes: &[u8]) -> Vec<u8> {
    let needs_hex = bytes
        .iter()
        .any(|&b| !(0x20..=0x7E).contains(&b) || matches!(b, b'(' | b')' | b'\\'));

    if needs_hex {
        let mut hex = Vec::with_capacity(bytes.len() * 2 + 2);
        hex.push(b'<');
        for b in bytes {
            hex.extend_from_slice(format!("{b:02X}").as_bytes());
        }
        hex.push(b'>');
        return hex;
    }

    // No byte needs escaping here: the special characters were routed to
    // the hex form above.
    let mut literal = Vec::with_capacity(bytes.len() + 2);
    literal.push(b'(');
    literal.extend_from_slice(bytes);
    literal.push(b')');
    literal
}
/// A run of text placed on a page.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct PdfTextSpan {
    /// Horizontal start position of the span.
    pub x: f32,
    /// Vertical (baseline) position of the span.
    pub y: f32,
    /// Horizontal position where the span ends.
    pub end_x: f32,
    /// Font size in effect for the span.
    pub font_size: f32,
    /// Decoded text of the span.
    pub text: String,
}

/// The text spans extracted from one page, plus the page size.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct PdfTextPage {
    /// Page width.
    pub width: f32,
    /// Page height.
    pub height: f32,
    /// Spans in extraction order.
    pub spans: Vec<PdfTextSpan>,
}

impl PdfTextPage {
    /// Reconstructs reading-order text from the positioned spans.
    ///
    /// Spans are ordered top-to-bottom (descending `y`), then left-to-right.
    /// Spans whose baseline is within `0.35 * max(font_size, 8)` of the first
    /// span of the current line are grouped into that line; lines are joined
    /// with `\n`. Within a line a single space is inserted when the gap to
    /// the previous span exceeds 18% of the larger of the two font sizes.
    ///
    /// Sorting uses `f32::total_cmp`, so NaN coordinates (possible with
    /// malformed input) still give a total order, which `sort_by` requires.
    pub fn plain_text(&self) -> String {
        if self.spans.is_empty() {
            return String::new();
        }

        let mut spans = self.spans.clone();
        spans.sort_by(|a, b| b.y.total_cmp(&a.y).then_with(|| a.x.total_cmp(&b.x)));

        // Group spans into lines by baseline proximity.
        let mut lines: Vec<Vec<PdfTextSpan>> = Vec::new();
        for span in spans {
            let tolerance = span.font_size.max(8.0) * 0.35;
            if let Some(line) = lines.last_mut() {
                let baseline = line.first().map(|first| first.y).unwrap_or(span.y);
                if (baseline - span.y).abs() <= tolerance {
                    line.push(span);
                    continue;
                }
            }
            lines.push(vec![span]);
        }

        let mut out = String::new();
        for (line_index, line) in lines.iter_mut().enumerate() {
            if line_index > 0 {
                out.push('\n');
            }
            // Baselines inside a line may differ slightly, so re-sort by x.
            line.sort_by(|a, b| a.x.total_cmp(&b.x));
            let mut prev_end: Option<f32> = None;
            let mut prev_font: f32 = 10.0;
            for span in line.iter() {
                if let Some(end_x) = prev_end {
                    let gap = span.x - end_x;
                    if gap > prev_font.max(span.font_size) * 0.18
                        && !out.ends_with(' ')
                        && !out.ends_with('\n')
                    {
                        out.push(' ');
                    }
                }
                out.push_str(span.text.trim_matches('\0'));
                prev_end = Some(span.end_x.max(span.x));
                prev_font = span.font_size.max(1.0);
            }
        }
        out
    }
}
#[derive(Clone, Debug, Default, PartialEq)]
pub struct ParsedPdf {
pub pages: Vec<PdfTextPage>,
pub metadata: BTreeMap<String, String>,
}
impl ParsedPdf {
pub fn extract_text(&self) -> String {
let mut out = String::new();
for (index, page) in self.pages.iter().enumerate() {
if index > 0 {
out.push('\u{000C}');
out.push('\n');
}
out.push_str(&page.plain_text());
}
out
}
pub fn page_texts(&self) -> Vec<String> {
self.pages.iter().map(PdfTextPage::plain_text).collect()
}
}
/// Parses a PDF byte buffer into page text and metadata.
///
/// # Errors
/// Propagates any error from object parsing, page-tree collection or
/// per-page text extraction.
pub fn parse_pdf(bytes: &[u8]) -> Result<ParsedPdf> {
    let file = PdfFile::parse(bytes)?;
    let metadata = file.extract_metadata();
    let pages = file
        .collect_pages()?
        .iter()
        .map(|node| file.extract_page_text(node))
        .collect::<Result<Vec<_>>>()?;
    Ok(ParsedPdf { pages, metadata })
}
/// Parses `bytes` and returns the text of the whole document
/// (see `ParsedPdf::extract_text`).
pub fn extract_text_from_pdf(bytes: &[u8]) -> Result<String> {
    parse_pdf(bytes).map(|parsed| parsed.extract_text())
}
/// Parses `bytes` and returns one text string per page
/// (see `ParsedPdf::page_texts`).
pub fn extract_pages_from_pdf(bytes: &[u8]) -> Result<Vec<String>> {
    parse_pdf(bytes).map(|parsed| parsed.page_texts())
}
/// Parsed object store of a PDF file.
#[derive(Clone, Debug, Default)]
struct PdfFile {
// Every indirect object found in the file, keyed by its id.
objects: BTreeMap<PdfObjectId, PdfValue>,
// Trailer dictionary; empty when none could be parsed.
trailer: BTreeMap<String, PdfValue>,
}
impl PdfFile {
/// Scans `bytes` for indirect objects, parses the trailer and expands
/// object streams.
///
/// The scan tries every byte offset; when an object is recognised the scan
/// resumes after its `endobj`. A later object with the same id replaces an
/// earlier one.
fn parse(bytes: &[u8]) -> Result<Self> {
    let mut objects = BTreeMap::new();
    let mut cursor = 0usize;
    while cursor < bytes.len() {
        if let Some((id, value, end)) = try_parse_indirect_object(bytes, cursor, &objects)? {
            objects.insert(id, value);
            cursor = end;
        } else {
            cursor += 1;
        }
    }

    let mut file = Self {
        objects,
        // A missing or unparsable trailer is tolerated.
        trailer: parse_trailer(bytes).unwrap_or_default(),
    };
    file.expand_object_streams()?;
    Ok(file)
}
/// Follows indirect references starting at `value` and returns a clone of
/// the first non-reference value reached.
///
/// Returns `None` when a referenced object is missing or when the chain is
/// still unresolved after 32 steps.
fn resolve(&self, value: &PdfValue) -> Option<PdfValue> {
    let mut current = value.clone();
    for _ in 0..32 {
        if let PdfValue::Ref(id) = current {
            current = self.objects.get(&id)?.clone();
        } else {
            return Some(current);
        }
    }
    None
}
fn resolve_dict(&self, value: &PdfValue) -> Option<BTreeMap<String, PdfValue>> {
match self.resolve(value)? {
PdfValue::Dict(dict) => Some(dict),
PdfValue::Stream(stream) => Some(stream.dict),
_ => None,
}
}
/// Resolves `value` and returns it only if it is a stream.
fn resolve_stream(&self, value: &PdfValue) -> Option<PdfStream> {
    if let PdfValue::Stream(stream) = self.resolve(value)? {
        Some(stream)
    } else {
        None
    }
}
/// Resolves `value` and returns it only if it is a number.
fn resolve_number(&self, value: &PdfValue) -> Option<f64> {
    self.resolve(value)
        .and_then(|resolved| resolved.as_number())
}
/// Collects the standard text entries of the trailer's `Info` dictionary.
///
/// Only keys that are present, decodable as text and non-blank after
/// trimming are included. Returns an empty map when there is no `Info`.
fn extract_metadata(&self) -> BTreeMap<String, String> {
    const INFO_KEYS: [&str; 6] = [
        "Title", "Author", "Subject", "Keywords", "Creator", "Producer",
    ];

    let mut meta = BTreeMap::new();
    let info_dict = self
        .trailer
        .get("Info")
        .and_then(|info| self.resolve_dict(info));
    let Some(dict) = info_dict else {
        return meta;
    };

    for key in INFO_KEYS {
        let Some(raw) = dict.get(key) else { continue };
        // Fall back to the unresolved value when the reference is dangling.
        let resolved = self.resolve(raw).unwrap_or_else(|| raw.clone());
        match decode_pdf_text_object(&resolved) {
            Some(text) if !text.trim().is_empty() => {
                meta.insert(key.to_string(), text);
            }
            _ => {}
        }
    }
    meta
}
/// Finds the document catalog.
///
/// Prefers the trailer's `Root` reference; otherwise returns the first
/// object (in id order) whose dictionary has `Type` equal to `Catalog`.
fn catalog_id(&self) -> Option<PdfObjectId> {
    let from_trailer = self.trailer.get("Root").and_then(PdfValue::as_ref);
    from_trailer.or_else(|| {
        self.objects
            .iter()
            .find(|(_, value)| {
                value
                    .as_dict()
                    .and_then(|dict| dict.get("Type"))
                    .and_then(PdfValue::as_name)
                    == Some("Catalog")
            })
            .map(|(id, _)| *id)
    })
}
/// Returns the document's pages.
///
/// First walks the page tree reachable from the catalog's `Pages` entry.
/// If that yields nothing, falls back to every object whose `Type` is
/// `Page`, ordered by object id, using a 612x792 media box when the page
/// has none and empty resources when none resolve.
fn collect_pages(&self) -> Result<Vec<PageNode>> {
    let pages_root = self
        .catalog_id()
        .and_then(|id| self.objects.get(&id))
        .and_then(PdfValue::as_dict)
        .and_then(|catalog| catalog.get("Pages"))
        .and_then(PdfValue::as_ref);
    if let Some(root) = pages_root {
        let mut out = Vec::new();
        self.walk_pages(root, &InheritedPageState::default(), &mut out)?;
        if !out.is_empty() {
            return Ok(out);
        }
    }

    let mut fallback: Vec<PageNode> = self
        .objects
        .iter()
        .filter_map(|(id, value)| {
            let dict = value.as_dict()?;
            if dict.get("Type").and_then(PdfValue::as_name) != Some("Page") {
                return None;
            }
            let media_box = dict
                .get("MediaBox")
                .and_then(|value| parse_media_box(self, value))
                .unwrap_or((0.0, 0.0, 612.0, 792.0));
            let resources = dict
                .get("Resources")
                .and_then(|value| self.resolve_dict(value))
                .unwrap_or_default();
            Some(PageNode {
                id: *id,
                dict: dict.clone(),
                resources,
                media_box,
            })
        })
        .collect();
    fallback.sort_by_key(|page| page.id);
    Ok(fallback)
}
/// Walks the page tree rooted at `node_id`, appending every leaf page to
/// `out` in document order.
///
/// `Resources` and `MediaBox` set on a node override what was inherited
/// from its ancestors; leaves without a media box get 612x792.
///
/// # Errors
/// Returns `LoError::Parse` when a visited node is missing or is not a
/// dictionary.
fn walk_pages(
    &self,
    node_id: PdfObjectId,
    inherited: &InheritedPageState,
    out: &mut Vec<PageNode>,
) -> Result<()> {
    let mut ancestors = Vec::new();
    self.walk_pages_guarded(node_id, inherited, &mut ancestors, out)
}

/// Recursive worker for `walk_pages`. `ancestors` holds the ids of the
/// nodes on the path from the root to the current node, so that a `Kids`
/// entry pointing back at an ancestor cannot cause unbounded recursion.
fn walk_pages_guarded(
    &self,
    node_id: PdfObjectId,
    inherited: &InheritedPageState,
    ancestors: &mut Vec<PdfObjectId>,
    out: &mut Vec<PageNode>,
) -> Result<()> {
    let dict = self
        .objects
        .get(&node_id)
        .and_then(PdfValue::as_dict)
        .cloned()
        .ok_or_else(|| LoError::Parse(format!("missing page tree node {} {}", node_id.object, node_id.generation)))?;
    let ty = dict.get("Type").and_then(PdfValue::as_name).unwrap_or("");

    let mut next = inherited.clone();
    if let Some(resources) = dict.get("Resources").and_then(|value| self.resolve_dict(value)) {
        next.resources = Some(resources);
    }
    if let Some(media_box) = dict.get("MediaBox").and_then(|value| parse_media_box(self, value)) {
        next.media_box = Some(media_box);
    }

    // Intermediate node: recurse into the kids and emit nothing for itself.
    if ty == "Pages" || dict.contains_key("Kids") {
        if let Some(PdfValue::Array(kids)) = dict.get("Kids") {
            ancestors.push(node_id);
            for kid in kids {
                let Some(id) = kid.as_ref() else { continue };
                // A kid that is one of its own ancestors forms a cycle; skip it.
                if ancestors.contains(&id) {
                    continue;
                }
                self.walk_pages_guarded(id, &next, ancestors, out)?;
            }
            ancestors.pop();
        }
        return Ok(());
    }

    let media_box = next.media_box.unwrap_or((0.0, 0.0, 612.0, 792.0));
    let resources = next.resources.clone().unwrap_or_default();
    out.push(PageNode {
        id: node_id,
        dict,
        resources,
        media_box,
    });
    Ok(())
}
/// Extracts the text spans of one page.
///
/// Every content stream of the page is decoded and interpreted with an
/// identity CTM; spans whose text is blank after trimming are dropped.
/// The page `width` / `height` are the extent of the media box rectangle
/// (absolute difference of its corners), so a box whose origin is not at
/// zero still reports its true size.
fn extract_page_text(&self, page: &PageNode) -> Result<PdfTextPage> {
    let resources = self.build_resources(&page.resources)?;
    let mut spans = Vec::new();
    for stream in self.page_content_streams(&page.dict)? {
        let data = self.decode_stream(&stream)?;
        self.extract_content_stream(&data, &resources, Matrix::identity(), &mut spans)?;
    }
    spans.retain(|span| !span.text.trim().is_empty());

    let (x0, y0, x1, y1) = page.media_box;
    Ok(PdfTextPage {
        width: (x1 - x0).abs(),
        height: (y1 - y0).abs(),
        spans,
    })
}
/// Returns the content streams of a page dictionary, in order.
///
/// `Contents` may be a single stream or an array; array items that do not
/// resolve to a stream are skipped. A missing or unresolvable `Contents`
/// yields an empty list.
///
/// # Errors
/// Returns `LoError::Parse` when `Contents` resolves to any other kind of
/// value.
fn page_content_streams(&self, page: &BTreeMap<String, PdfValue>) -> Result<Vec<PdfStream>> {
    let Some(contents) = page.get("Contents") else {
        return Ok(Vec::new());
    };
    let streams = match self.resolve(contents) {
        None => Vec::new(),
        Some(PdfValue::Stream(stream)) => vec![stream],
        Some(PdfValue::Array(items)) => items
            .iter()
            .filter_map(|item| self.resolve_stream(item))
            .collect(),
        Some(other) => {
            return Err(LoError::Parse(format!(
                "page Contents is not a stream/array: {other:?}"
            )))
        }
    };
    Ok(streams)
}
/// Applies the stream's `Filter` chain to its data, in the order the
/// filter names are listed, and returns the decoded bytes.
///
/// # Errors
/// Returns `LoError::Unsupported` for an unknown filter name and propagates
/// any error from the individual decoders.
fn decode_stream(&self, stream: &PdfStream) -> Result<Vec<u8>> {
    collect_filter_names(stream.dict.get("Filter"))
        .into_iter()
        .try_fold(stream.data.clone(), |data, filter| -> Result<Vec<u8>> {
            Ok(match filter.as_str() {
                "FlateDecode" | "Fl" => decode_flate_stream(&data)?,
                "ASCIIHexDecode" | "AHx" => decode_ascii_hex(&data)?,
                "ASCII85Decode" | "A85" => decode_ascii85(&data)?,
                "RunLengthDecode" | "RL" => decode_run_length(&data)?,
                "LZWDecode" | "LZW" => decode_lzw(&data)?,
                other => {
                    return Err(LoError::Unsupported(format!(
                        "pdf filter not supported: {other}"
                    )))
                }
            })
        })
}
/// Unpacks every object stream (`Type` = `ObjStm`) into `self.objects`.
///
/// The decoded stream starts with `N` pairs of `object-number offset`
/// occupying the first `First` bytes; each offset is relative to `First`.
/// Objects already present in the store are kept (the embedded copy does
/// not replace them). Entries that cannot be used — an offset outside the
/// decoded data, an object number that does not fit in `u32`, or a value
/// that fails to parse — are skipped.
///
/// # Errors
/// Propagates a failure to decode an object stream's filters.
fn expand_object_streams(&mut self) -> Result<()> {
    let object_streams: Vec<PdfStream> = self
        .objects
        .values()
        .filter_map(|value| match value {
            PdfValue::Stream(stream)
                if stream.dict.get("Type").and_then(PdfValue::as_name) == Some("ObjStm") =>
            {
                Some(stream.clone())
            }
            _ => None,
        })
        .collect();

    for stream in object_streams {
        let decoded = self.decode_stream(&stream)?;
        let n = stream
            .dict
            .get("N")
            .and_then(PdfValue::as_number)
            .unwrap_or(0.0) as usize;
        let first = stream
            .dict
            .get("First")
            .and_then(PdfValue::as_number)
            .unwrap_or(0.0) as usize;
        if first > decoded.len() {
            continue;
        }

        let header = std::str::from_utf8(&decoded[..first]).unwrap_or("");
        let mut header_numbers = header
            .split_whitespace()
            .filter_map(|part| part.parse::<usize>().ok());
        let mut entries = Vec::new();
        for _ in 0..n {
            let Some(obj_num) = header_numbers.next() else { break };
            let Some(offset) = header_numbers.next() else { break };
            entries.push((obj_num, offset));
        }

        for (obj_num, offset) in entries {
            // Reject numbers that would otherwise wrap into a different id.
            let Ok(obj_num) = u32::try_from(obj_num) else { continue };
            // Checked: `offset` comes straight from untrusted input.
            let Some(start) = first
                .checked_add(offset)
                .filter(|start| *start < decoded.len())
            else {
                continue;
            };
            let mut parser = Parser::new(&decoded[start..]);
            if let Ok(value) = parser.parse_value() {
                self.objects
                    .entry(PdfObjectId::new(obj_num, 0))
                    .or_insert(value);
            }
        }
    }
    Ok(())
}
/// Builds the lookup tables needed to interpret a content stream from a
/// resource dictionary.
///
/// Each `Font` entry that resolves gets a `FontDecoder`. Each `XObject`
/// entry that resolves to a stream with `Subtype` = `Form` is stored with
/// its decoded data, its own resource dictionary (empty if absent) and its
/// `Matrix` (identity if absent or malformed); other XObjects are ignored.
///
/// # Errors
/// Propagates failures from font-decoder construction and stream decoding.
fn build_resources(&self, dict: &BTreeMap<String, PdfValue>) -> Result<PdfResources> {
    let mut resources = PdfResources::default();

    let fonts = dict
        .get("Font")
        .and_then(|value| self.resolve_dict(value))
        .unwrap_or_default();
    for (name, value) in fonts {
        if let Some(font_value) = self.resolve(&value) {
            let decoder = FontDecoder::from_pdf_value(self, &font_value)?;
            resources.fonts.insert(name, decoder);
        }
    }

    let xobjects = dict
        .get("XObject")
        .and_then(|value| self.resolve_dict(value))
        .unwrap_or_default();
    for (name, value) in xobjects {
        let Some(stream) = self.resolve_stream(&value) else { continue };
        if stream.dict.get("Subtype").and_then(PdfValue::as_name) != Some("Form") {
            continue;
        }
        let matrix = stream
            .dict
            .get("Matrix")
            .and_then(|value| parse_matrix(self, value))
            .unwrap_or_else(Matrix::identity);
        let form_resources = stream
            .dict
            .get("Resources")
            .and_then(|value| self.resolve_dict(value))
            .unwrap_or_default();
        let data = self.decode_stream(&stream)?;
        resources.xobjects.insert(
            name,
            PdfXObject {
                data,
                resources: form_resources,
                matrix,
            },
        );
    }
    Ok(resources)
}
/// Interprets a decoded content stream and appends the text it shows to
/// `spans`.
///
/// `initial_ctm` is the transformation matrix in effect when the stream
/// starts. Form XObjects invoked with `Do` are interpreted recursively with
/// their own resources; nesting is limited so that a form which (directly
/// or indirectly) invokes itself cannot recurse without bound.
///
/// # Errors
/// Propagates tokenizer errors and resource-building errors for nested
/// forms.
fn extract_content_stream(
    &self,
    data: &[u8],
    resources: &PdfResources,
    initial_ctm: Matrix,
    spans: &mut Vec<PdfTextSpan>,
) -> Result<()> {
    self.extract_content_stream_nested(data, resources, initial_ctm, spans, 0)
}

/// Worker for `extract_content_stream`; `depth` counts how many form
/// XObjects enclose the stream being interpreted.
fn extract_content_stream_nested(
    &self,
    data: &[u8],
    resources: &PdfResources,
    initial_ctm: Matrix,
    spans: &mut Vec<PdfTextSpan>,
    depth: usize,
) -> Result<()> {
    // Forms nested deeper than this are ignored rather than interpreted.
    const MAX_FORM_DEPTH: usize = 32;

    let mut parser = ContentParser::new(data);
    let mut operands: Vec<ContentToken> = Vec::new();
    let mut graphics_stack: Vec<Matrix> = vec![initial_ctm];
    let mut text = TextState::default();

    while let Some(token) = parser.next_token()? {
        // Operands accumulate until the operator that consumes them arrives.
        let op = match token {
            ContentToken::Operator(op) => op,
            other => {
                operands.push(other);
                continue;
            }
        };

        match op.as_str() {
            // Graphics state save / restore (only the CTM is tracked).
            "q" => {
                let top = graphics_stack
                    .last()
                    .copied()
                    .unwrap_or_else(Matrix::identity);
                graphics_stack.push(top);
            }
            "Q" => {
                // Never pop the initial entry.
                if graphics_stack.len() > 1 {
                    graphics_stack.pop();
                }
            }
            "cm" => {
                if let Some(matrix) = take_six_numbers(&mut operands) {
                    let current = graphics_stack
                        .last()
                        .copied()
                        .unwrap_or_else(Matrix::identity);
                    if let Some(last) = graphics_stack.last_mut() {
                        *last = current.multiply(&Matrix::new(
                            matrix[0], matrix[1], matrix[2], matrix[3], matrix[4], matrix[5],
                        ));
                    }
                }
            }
            // NOTE(review): this resets the whole text state (font, leading,
            // spacing) at every `BT`, not just the text matrices — confirm
            // that is intended.
            "BT" => text = TextState::default(),
            "ET" => {}
            "Tf" => {
                if operands.len() >= 2 {
                    let size = operands.pop().and_then(|t| t.as_number()).unwrap_or(12.0);
                    let name = operands.pop().and_then(|t| t.into_name()).unwrap_or_default();
                    text.font = name;
                    text.font_size = size.max(1.0);
                }
            }
            "Tm" => {
                if let Some(values) = take_six_numbers(&mut operands) {
                    let matrix = Matrix::new(
                        values[0], values[1], values[2], values[3], values[4], values[5],
                    );
                    text.text_matrix = matrix;
                    text.line_matrix = matrix;
                }
            }
            "Td" => {
                if let Some([tx, ty]) = take_two_numbers(&mut operands) {
                    text.translate(tx, ty);
                }
            }
            "TD" => {
                // Like `Td`, but also sets the leading to `-ty`.
                if let Some([tx, ty]) = take_two_numbers(&mut operands) {
                    text.leading = -ty;
                    text.translate(tx, ty);
                }
            }
            "T*" => text.translate(0.0, -text.leading),
            "Tc" => {
                if let Some(value) = operands.pop().and_then(|t| t.as_number()) {
                    text.char_spacing = value;
                }
            }
            "Tw" => {
                if let Some(value) = operands.pop().and_then(|t| t.as_number()) {
                    text.word_spacing = value;
                }
            }
            "TL" => {
                if let Some(value) = operands.pop().and_then(|t| t.as_number()) {
                    text.leading = value;
                }
            }
            "Tz" => {
                // Operand is a percentage.
                if let Some(value) = operands.pop().and_then(|t| t.as_number()) {
                    text.horizontal_scaling = value / 100.0;
                }
            }
            "Ts" => {
                if let Some(value) = operands.pop().and_then(|t| t.as_number()) {
                    text.rise = value;
                }
            }
            "Tj" => {
                if let Some(bytes) = operands.pop().and_then(|t| t.into_bytes()) {
                    show_pdf_text(resources, &graphics_stack, &mut text, &bytes, spans);
                }
            }
            "TJ" => {
                if let Some(items) = operands.pop().and_then(|t| t.into_array()) {
                    for item in items {
                        match item {
                            ContentToken::String(bytes) => {
                                show_pdf_text(resources, &graphics_stack, &mut text, &bytes, spans);
                            }
                            ContentToken::Number(adjust) => {
                                // Adjustments are in thousandths of the font
                                // size; positive values move backwards.
                                let shift = -(adjust / 1000.0)
                                    * text.font_size
                                    * text.horizontal_scaling;
                                text.text_matrix = text.text_matrix.translate(shift, 0.0);
                            }
                            _ => {}
                        }
                    }
                }
            }
            "'" => {
                // Move to the next line, then show the string.
                text.translate(0.0, -text.leading);
                if let Some(bytes) = operands.pop().and_then(|t| t.into_bytes()) {
                    show_pdf_text(resources, &graphics_stack, &mut text, &bytes, spans);
                }
            }
            "\"" => {
                // Operands are popped in reverse: string, char spacing, word spacing.
                if operands.len() >= 3 {
                    let string = operands.pop().and_then(|t| t.into_bytes()).unwrap_or_default();
                    text.char_spacing = operands
                        .pop()
                        .and_then(|t| t.as_number())
                        .unwrap_or(text.char_spacing);
                    text.word_spacing = operands
                        .pop()
                        .and_then(|t| t.as_number())
                        .unwrap_or(text.word_spacing);
                    text.translate(0.0, -text.leading);
                    show_pdf_text(resources, &graphics_stack, &mut text, &string, spans);
                }
            }
            "Do" => {
                if let Some(name) = operands.pop().and_then(|t| t.into_name()) {
                    if let Some(xobj) = resources.xobjects.get(&name) {
                        // Guard against self-referencing / deeply nested forms.
                        if depth < MAX_FORM_DEPTH {
                            let nested = self.build_resources(&xobj.resources)?;
                            let ctm = graphics_stack
                                .last()
                                .copied()
                                .unwrap_or_else(Matrix::identity)
                                .multiply(&xobj.matrix);
                            self.extract_content_stream_nested(
                                &xobj.data,
                                &nested,
                                ctm,
                                spans,
                                depth + 1,
                            )?;
                        }
                    }
                }
            }
            "BI" => parser.skip_inline_image(),
            _ => {}
        }
        // Every operator consumes (or discards) the pending operands.
        operands.clear();
    }
    Ok(())
}
}
/// A leaf page together with the attributes resolved for it.
#[derive(Clone, Debug)]
struct PageNode {
// Object id of the page.
id: PdfObjectId,
// The page's own dictionary.
dict: BTreeMap<String, PdfValue>,
// Resource dictionary in effect for the page (own or inherited; may be empty).
resources: BTreeMap<String, PdfValue>,
// Media box as the four numbers of the `MediaBox` array, in array order.
media_box: (f32, f32, f32, f32),
}
/// Attributes passed down the page tree from ancestor nodes to their kids.
#[derive(Clone, Debug, Default)]
struct InheritedPageState {
// Nearest ancestor `Resources` dictionary, if any.
resources: Option<BTreeMap<String, PdfValue>>,
// Nearest ancestor `MediaBox`, if any.
media_box: Option<(f32, f32, f32, f32)>,
}
/// Reads a four-number rectangle array, resolving references for the array
/// and for each element. Returns `None` unless `value` resolves to an array
/// of exactly four numbers.
fn parse_media_box(file: &PdfFile, value: &PdfValue) -> Option<(f32, f32, f32, f32)> {
    match file.resolve(value)? {
        PdfValue::Array(items) => {
            let [a, b, c, d] = items.as_slice() else {
                return None;
            };
            Some((
                file.resolve_number(a)? as f32,
                file.resolve_number(b)? as f32,
                file.resolve_number(c)? as f32,
                file.resolve_number(d)? as f32,
            ))
        }
        _ => None,
    }
}
/// Reads a six-number matrix array, resolving references for the array and
/// for each element. Returns `None` unless `value` resolves to an array of
/// exactly six numbers.
fn parse_matrix(file: &PdfFile, value: &PdfValue) -> Option<Matrix> {
    let PdfValue::Array(items) = file.resolve(value)? else {
        return None;
    };
    if items.len() != 6 {
        return None;
    }
    let mut n = [0.0f32; 6];
    for (slot, item) in n.iter_mut().zip(&items) {
        *slot = file.resolve_number(item)? as f32;
    }
    Some(Matrix::new(n[0], n[1], n[2], n[3], n[4], n[5]))
}
/// Converts a value to display text: strings are decoded, names are
/// returned as-is and numbers are formatted. Other kinds give `None`.
fn decode_pdf_text_object(value: &PdfValue) -> Option<String> {
    if let PdfValue::String(bytes) = value {
        return decode_pdf_string_bytes(bytes);
    }
    match value {
        PdfValue::Name(name) => Some(name.to_owned()),
        PdfValue::Number(n) => Some(format!("{n}")),
        _ => None,
    }
}
/// Tries to parse an indirect object (`<num> <gen> obj ... endobj`) that
/// starts exactly at byte offset `start`.
///
/// Returns `Ok(None)` when the bytes at `start` do not begin an object
/// header, and `Ok(Some((id, value, end)))` otherwise, where `end` is the
/// offset just past the `endobj` keyword. `objects` holds the objects parsed
/// so far and is used only to look up an indirect stream `Length`.
///
/// # Errors
/// Once the `obj` keyword has been recognised, any failure to parse the
/// value, read the stream data or find `endobj` is returned as an error.
fn try_parse_indirect_object(
bytes: &[u8],
start: usize,
objects: &BTreeMap<PdfObjectId, PdfValue>,
) -> Result<Option<(PdfObjectId, PdfValue, usize)>> {
// A header can only start at the beginning of the input or after whitespace.
if start > 0 && !is_pdf_space(bytes[start - 1]) {
return Ok(None);
}
let mut header = Parser::new_at(bytes, start);
let Some(object) = header.parse_unsigned_number().ok() else {
return Ok(None);
};
header.skip_ws_and_comments();
let Some(generation) = header.parse_unsigned_number().ok() else {
return Ok(None);
};
header.skip_ws_and_comments();
if !header.consume_keyword(b"obj") {
return Ok(None);
}
header.skip_ws_and_comments();
let mut value = header.parse_value()?;
header.skip_ws_and_comments();
// A dictionary followed by `stream` is the dictionary of a stream object.
if matches!(value, PdfValue::Dict(_)) && header.consume_keyword(b"stream") {
let data_start = skip_stream_eol(bytes, header.pos);
// `Length` may be a direct number or a reference to an already-parsed
// number; when it is unavailable the data is delimited by `endstream`.
let length = match &value {
PdfValue::Dict(dict) => dict
.get("Length")
.and_then(|value| resolve_length_hint(value, objects)),
_ => None,
};
let (data, after_stream) = read_stream_bytes(bytes, data_start, length)?;
let dict = match value {
PdfValue::Dict(dict) => dict,
// Guarded by the `matches!` check above.
_ => unreachable!(),
};
value = PdfValue::Stream(PdfStream { dict, data });
header.pos = after_stream;
header.skip_ws_and_comments();
}
let endobj = find_token(bytes, header.pos, b"endobj")
.ok_or_else(|| LoError::Parse("pdf indirect object missing endobj".to_string()))?;
Ok(Some((
// NOTE(review): `as u16` truncates generation numbers above 65535.
PdfObjectId::new(object, generation as u16),
value,
endobj + "endobj".len(),
)))
}
/// Returns the stream length given by `value`, which may be a direct number
/// or a reference to a number already present in `objects`. Negative values
/// are clamped to zero; anything else gives `None`.
fn resolve_length_hint(
    value: &PdfValue,
    objects: &BTreeMap<PdfObjectId, PdfValue>,
) -> Option<usize> {
    let number = match value {
        PdfValue::Number(direct) => *direct,
        PdfValue::Ref(id) => match objects.get(id)? {
            PdfValue::Number(indirect) => *indirect,
            _ => return None,
        },
        _ => return None,
    };
    Some(number.max(0.0) as usize)
}
/// Advances `pos` past one end-of-line marker — an optional `\r` followed
/// by an optional `\n` — and returns the new position.
fn skip_stream_eol(bytes: &[u8], mut pos: usize) -> usize {
    for eol in [b'\r', b'\n'] {
        if bytes.get(pos).copied() == Some(eol) {
            pos += 1;
        }
    }
    pos
}
fn read_stream_bytes(bytes: &[u8], data_start: usize, length: Option<usize>) -> Result<(Vec<u8>, usize)> {
if let Some(length) = length {
let end = data_start.saturating_add(length);
if end > bytes.len() {
return Err(LoError::Parse("pdf stream length is out of bounds".to_string()));
}
let mut cursor = end;
if bytes.get(cursor) == Some(&b'\r') {
cursor += 1;
}
if bytes.get(cursor) == Some(&b'\n') {
cursor += 1;
}
let endstream = find_token(bytes, cursor, b"endstream")
.ok_or_else(|| LoError::Parse("pdf stream missing endstream".to_string()))?;
return Ok((bytes[data_start..end].to_vec(), endstream + "endstream".len()));
}
let endstream = find_token(bytes, data_start, b"endstream")
.ok_or_else(|| LoError::Parse("pdf stream missing endstream".to_string()))?;
let mut data_end = endstream;
while data_end > data_start && matches!(bytes[data_end - 1], b'\r' | b'\n') {
data_end -= 1;
}
Ok((bytes[data_start..data_end].to_vec(), endstream + "endstream".len()))
}
/// Parses the dictionary that follows the last `trailer` keyword in the
/// file. Returns `None` when there is no such keyword or when what follows
/// is not a dictionary.
fn parse_trailer(bytes: &[u8]) -> Option<BTreeMap<String, PdfValue>> {
    let keyword_at = rfind_bytes(bytes, b"trailer")?;
    let mut parser = Parser::new_at(bytes, keyword_at + "trailer".len());
    parser.skip_ws_and_comments();
    match parser.parse_value() {
        Ok(PdfValue::Dict(dict)) => Some(dict),
        _ => None,
    }
}
/// Finds the first occurrence of `token` at or after `start` that stands on
/// its own: it must be preceded by the start of input, whitespace or a
/// delimiter, and followed by the end of input, whitespace or a delimiter.
fn find_token(bytes: &[u8], start: usize, token: &[u8]) -> Option<usize> {
    let is_boundary = |byte: u8| is_pdf_space(byte) || is_pdf_delim(byte);
    // Last offset at which the token could still fit.
    let last_start = bytes.len().checked_sub(token.len())?;
    (start..=last_start).find(|&index| {
        let end = index + token.len();
        bytes[index..end] == *token
            && (index == 0 || is_boundary(bytes[index - 1]))
            && (end == bytes.len() || is_boundary(bytes[end]))
    })
}
/// Returns the start offset of the last occurrence of `needle` in
/// `haystack`, or `None` when it does not occur.
///
/// An empty needle matches at the end of the haystack, mirroring
/// `str::rfind("")`; this also avoids calling `windows(0)`, which panics.
fn rfind_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(haystack.len());
    }
    haystack
        .windows(needle.len())
        .rposition(|window| window == needle)
}
/// Resources available while interpreting a content stream.
#[derive(Clone, Debug, Default)]
struct PdfResources {
// Font decoders keyed by the resource name of the font.
fonts: BTreeMap<String, FontDecoder>,
// Form XObjects keyed by resource name.
xobjects: BTreeMap<String, PdfXObject>,
}
/// A form XObject prepared for interpretation.
#[derive(Clone, Debug)]
struct PdfXObject {
// Decoded content stream of the form.
data: Vec<u8>,
// The form's own resource dictionary (empty when it has none).
resources: BTreeMap<String, PdfValue>,
// The form's `Matrix` entry (identity when absent).
matrix: Matrix,
}
/// Converts the bytes of a shown string into text for one font.
#[derive(Clone, Debug)]
struct FontDecoder {
    /// Mapping parsed from the font's `ToUnicode` stream, when it has one.
    cmap: Option<ToUnicodeMap>,
    /// Single-byte encoding used when the map is absent or yields nothing.
    encoding: SimpleEncoding,
}

impl Default for FontDecoder {
    /// A decoder with no `ToUnicode` map that assumes WinAnsi encoding.
    fn default() -> Self {
        Self {
            cmap: None,
            encoding: SimpleEncoding::WinAnsi,
        }
    }
}

impl FontDecoder {
    /// Builds a decoder from a resolved font object.
    ///
    /// A value that is not a dictionary gives the default decoder. A
    /// `ToUnicode` entry that does not resolve to a stream is ignored.
    ///
    /// # Errors
    /// Propagates failures to decode or parse the `ToUnicode` stream.
    fn from_pdf_value(file: &PdfFile, value: &PdfValue) -> Result<Self> {
        let Some(dict) = value.as_dict() else {
            return Ok(Self::default());
        };
        let encoding = parse_simple_encoding(file, dict);
        let to_unicode = dict
            .get("ToUnicode")
            .and_then(|entry| file.resolve_stream(entry));
        let cmap = match to_unicode {
            Some(stream) => Some(ToUnicodeMap::parse(&file.decode_stream(&stream)?)?),
            None => None,
        };
        Ok(Self { cmap, encoding })
    }

    /// Decodes `bytes`, preferring the `ToUnicode` map and falling back to
    /// the simple encoding when there is no map or it produces empty text.
    fn decode(&self, bytes: &[u8]) -> String {
        self.cmap
            .as_ref()
            .map(|cmap| cmap.decode(bytes))
            .filter(|text| !text.is_empty())
            .unwrap_or_else(|| decode_simple_encoding(bytes, self.encoding))
    }
}
/// Single-byte encodings the decoder distinguishes between.
#[derive(Clone, Copy, Debug)]
enum SimpleEncoding {
/// Chosen for encodings whose name mentions neither `winansi` nor `standard`.
Ascii,
/// Chosen for WinAnsi / Standard encodings and when no `Encoding` is given.
WinAnsi,
}
/// Picks the simple encoding for a font dictionary.
///
/// Without an `Encoding` entry the result is `WinAnsi`. A name (or, for an
/// encoding dictionary, its `BaseEncoding` name) containing `winansi` or
/// `standard`, case-insensitively, also gives `WinAnsi`. Everything else —
/// including an encoding dictionary without a `BaseEncoding` name — gives
/// `Ascii`.
fn parse_simple_encoding(file: &PdfFile, dict: &BTreeMap<String, PdfValue>) -> SimpleEncoding {
    let classify = |name: &str| {
        let lower = name.to_ascii_lowercase();
        if lower.contains("winansi") || lower.contains("standard") {
            SimpleEncoding::WinAnsi
        } else {
            SimpleEncoding::Ascii
        }
    };

    let Some(encoding) = dict.get("Encoding") else {
        return SimpleEncoding::WinAnsi;
    };
    match file.resolve(encoding) {
        Some(PdfValue::Name(name)) => classify(&name),
        Some(PdfValue::Dict(inner)) => inner
            .get("BaseEncoding")
            .and_then(PdfValue::as_name)
            .map_or(SimpleEncoding::Ascii, classify),
        _ => SimpleEncoding::Ascii,
    }
}
/// Character-code to text mapping parsed from a `ToUnicode` CMap.
#[derive(Clone, Debug, Default)]
struct ToUnicodeMap {
// Codespace ranges as (code length in bytes, first code, last code).
code_ranges: Vec<(usize, u32, u32)>,
// Text for each mapped character code.
direct: BTreeMap<u32, String>,
}
impl ToUnicodeMap {
    /// Parses the text of a `ToUnicode` CMap.
    ///
    /// The parser is line based: a line ending in `begincodespacerange`,
    /// `beginbfchar` or `beginbfrange`, prefixed by an entry count, is
    /// followed by that many entry lines. Malformed entries are skipped.
    /// When no codespace range is found, one-byte (`00..FF`) and two-byte
    /// (`0000..FFFF`) ranges are assumed.
    ///
    /// Because the input is untrusted, the declared entry count is clamped
    /// to the number of lines that actually remain, and a sequential
    /// `bfrange` is expanded to at most `MAX_BFRANGE_SPAN` codes.
    fn parse(data: &[u8]) -> Result<Self> {
        // Upper bound on the number of codes one sequential bfrange may add.
        const MAX_BFRANGE_SPAN: u32 = 0x1_0000;

        let text = String::from_utf8_lossy(data);
        let lines: Vec<&str> = text.lines().collect();
        let mut out = Self::default();
        let mut index = 0usize;

        while index < lines.len() {
            let line = lines[index].trim();
            let available = lines.len() - index - 1;
            // Entry count declared before `keyword`, limited to what is left.
            let declared = |keyword: &str| -> Option<usize> {
                let count = line.strip_suffix(keyword)?.trim().parse::<usize>().ok()?;
                Some(count.min(available))
            };

            if let Some(count) = declared("begincodespacerange") {
                for entry in &lines[index + 1..index + 1 + count] {
                    if let Some((start, end)) = parse_two_hex_strings(entry) {
                        out.code_ranges.push((
                            start.len(),
                            hex_bytes_to_u32(&start),
                            hex_bytes_to_u32(&end),
                        ));
                    }
                }
                index += count + 1;
                continue;
            }

            if let Some(count) = declared("beginbfchar") {
                for entry in &lines[index + 1..index + 1 + count] {
                    if let Some((src, dst)) = parse_two_hex_strings(entry) {
                        out.direct
                            .insert(hex_bytes_to_u32(&src), decode_utf16be_fallback(&dst));
                    }
                }
                index += count + 1;
                continue;
            }

            if let Some(count) = declared("beginbfrange") {
                for entry in &lines[index + 1..index + 1 + count] {
                    let Some((start, end, dst)) = parse_bfrange_entry(entry.trim()) else {
                        continue;
                    };
                    let start_code = hex_bytes_to_u32(&start);
                    let end_code = hex_bytes_to_u32(&end);
                    match dst {
                        BfRangeDest::Sequential(base) => {
                            // Consecutive codes map to consecutive destinations.
                            let last =
                                end_code.min(start_code.saturating_add(MAX_BFRANGE_SPAN - 1));
                            let mut current = base;
                            for code in start_code..=last {
                                out.direct.insert(code, decode_utf16be_fallback(&current));
                                increment_utf16be_bytes(&mut current);
                            }
                        }
                        BfRangeDest::Explicit(items) => {
                            for (offset, item) in items.into_iter().enumerate() {
                                // Stop instead of wrapping past the last code.
                                let Some(code) = u32::try_from(offset)
                                    .ok()
                                    .and_then(|offset| start_code.checked_add(offset))
                                else {
                                    break;
                                };
                                out.direct.insert(code, decode_utf16be_fallback(&item));
                            }
                        }
                    }
                }
                index += count + 1;
                continue;
            }

            index += 1;
        }

        if out.code_ranges.is_empty() {
            out.code_ranges.push((1, 0x00, 0xFF));
            out.code_ranges.push((2, 0x0000, 0xFFFF));
        }
        out.code_ranges.sort_by_key(|entry| entry.0);
        Ok(out)
    }

    /// Decodes a shown string using the map.
    ///
    /// At each position the longest code length is tried first; a code is
    /// accepted when it lies in a codespace range of that length and has a
    /// mapping. A position with no accepted code is decoded as a single
    /// WinAnsi byte.
    fn decode(&self, bytes: &[u8]) -> String {
        let mut out = String::new();
        let mut pos = 0usize;
        let max_len = self
            .code_ranges
            .iter()
            .map(|entry| entry.0)
            .max()
            .unwrap_or(1);

        while pos < bytes.len() {
            let hit = (1..=max_len)
                .rev()
                .filter(|len| pos + len <= bytes.len())
                .find_map(|len| {
                    let code = hex_bytes_to_u32(&bytes[pos..pos + len]);
                    let in_codespace = self.code_ranges.iter().any(|&(range_len, start, end)| {
                        range_len == len && (start..=end).contains(&code)
                    });
                    if !in_codespace {
                        return None;
                    }
                    self.direct.get(&code).map(|mapped| (len, mapped))
                });

            match hit {
                Some((len, mapped)) => {
                    out.push_str(mapped);
                    pos += len;
                }
                None => {
                    out.push_str(&decode_simple_encoding(
                        &bytes[pos..pos + 1],
                        SimpleEncoding::WinAnsi,
                    ));
                    pos += 1;
                }
            }
        }
        out
    }
}
/// Destination part of a `bfrange` entry.
enum BfRangeDest {
/// A single starting destination; later codes use incremented copies of it.
Sequential(Vec<u8>),
/// A bracketed list giving one destination per code.
Explicit(Vec<Vec<u8>>),
}
/// Parses one `bfrange` entry line: `<start> <end> <dest>` or
/// `<start> <end> [<dest> <dest> ...]`.
///
/// Returns the start code bytes, the end code bytes and the destination,
/// or `None` when the line does not have that shape or a hex string is
/// invalid.
fn parse_bfrange_entry(line: &str) -> Option<(Vec<u8>, Vec<u8>, BfRangeDest)> {
    let entry = line.trim();
    let (start, end) = parse_two_hex_strings(entry)?;

    // Skip past the first two hex strings to reach the destination part.
    let after_first = &entry[entry.find('>')? + 1..];
    let from_second = &after_first[after_first.find('<')?..];
    let tail = from_second[from_second.find('>')? + 1..].trim();

    let dest = if tail.starts_with('[') {
        let mut items = Vec::new();
        let mut remaining = tail;
        while let Some(open) = remaining.find('<') {
            let candidate = &remaining[open..];
            let close = candidate.find('>')?;
            items.push(parse_hex_bytes(&candidate[..=close])?);
            remaining = &candidate[close + 1..];
        }
        BfRangeDest::Explicit(items)
    } else {
        let candidate = &tail[tail.find('<')?..];
        let close = candidate.find('>')?;
        BfRangeDest::Sequential(parse_hex_bytes(&candidate[..=close])?)
    };
    Some((start, end, dest))
}
fn parse_two_hex_strings(line: &str) -> Option<(Vec<u8>, Vec<u8>)> {
let mut cursor = line;
let first_start = cursor.find('<')?;
cursor = &cursor[first_start..];
let first_end = cursor.find('>')?;
let first = parse_hex_bytes(&cursor[..=first_end])?;
cursor = &cursor[first_end + 1..];
let second_start = cursor.find('<')?;
cursor = &cursor[second_start..];
let second_end = cursor.find('>')?;
let second = parse_hex_bytes(&cursor[..=second_end])?;
Some((first, second))
}
/// Converts a `<...>`-delimited hex token into bytes.
///
/// The token must start with `<` and contain `>` after trimming. ASCII
/// whitespace between digits is ignored and an odd digit count is padded
/// with a trailing `0`. Any non-hex character yields `None`.
fn parse_hex_bytes(token: &str) -> Option<Vec<u8>> {
    let token = token.trim();
    if !(token.starts_with('<') && token.contains('>')) {
        return None;
    }
    let inner = token.trim_start_matches('<').trim_end_matches('>');
    let mut digits: Vec<u8> = inner
        .bytes()
        .filter(|byte| !byte.is_ascii_whitespace())
        .collect();
    if digits.len() % 2 != 0 {
        digits.push(b'0');
    }
    digits
        .chunks_exact(2)
        .map(|pair| {
            let hi = (pair[0] as char).to_digit(16)? as u8;
            let lo = (pair[1] as char).to_digit(16)? as u8;
            Some((hi << 4) | lo)
        })
        .collect()
}
/// Advances a big-endian destination string to the next consecutive value.
///
/// The buffer is treated as one big-endian integer and incremented by one,
/// with carry propagating towards the front and an all-`0xFF` buffer wrapping
/// to all zeros. For even-length buffers this matches incrementing the last
/// UTF-16BE code unit with carry into the preceding units.
///
/// Working byte-wise also makes odd-length (malformed) destinations safe:
/// pairing bytes into 16-bit units would index one past the end of a
/// 3-byte buffer. A single-byte destination now has its only byte
/// incremented, so consecutive codes map to consecutive values.
fn increment_utf16be_bytes(bytes: &mut Vec<u8>) {
    for byte in bytes.iter_mut().rev() {
        if *byte < u8::MAX {
            *byte += 1;
            return;
        }
        // Carry: this byte wraps and the next more significant one is bumped.
        *byte = 0;
    }
}
/// Decodes bytes as UTF-16BE, falling back to one `char` per byte.
///
/// Leading `FE FF` byte-order marks are stripped (repeatedly). If the rest
/// is non-empty, even-length and valid UTF-16 it is returned as such;
/// otherwise each byte is mapped to the code point of the same value.
fn decode_utf16be_fallback(bytes: &[u8]) -> String {
    let mut payload = bytes;
    while let [0xFE, 0xFF, rest @ ..] = payload {
        payload = rest;
    }
    if !payload.is_empty() && payload.len() % 2 == 0 {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        if let Ok(text) = String::from_utf16(&units) {
            return text;
        }
    }
    payload.iter().copied().map(char::from).collect()
}
/// Folds bytes into a big-endian `u32`.
///
/// Only the last four bytes survive when more are supplied, because earlier
/// bits are shifted out of the 32-bit accumulator.
fn hex_bytes_to_u32(bytes: &[u8]) -> u32 {
    bytes
        .iter()
        .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte))
}
/// Decodes a PDF string's bytes into text, always returning `Some`.
///
/// * empty input gives an empty string;
/// * an `FE FF` prefix selects UTF-16BE (via `decode_utf16be_fallback`);
/// * an `FF FE` prefix selects UTF-16LE, ignoring a trailing odd byte;
/// * everything else, including invalid UTF-16LE, is decoded as WinAnsi.
fn decode_pdf_string_bytes(bytes: &[u8]) -> Option<String> {
    match bytes {
        [] => Some(String::new()),
        [0xFE, 0xFF, ..] => Some(decode_utf16be_fallback(bytes)),
        [0xFF, 0xFE, body @ ..] => {
            let units: Vec<u16> = body
                .chunks_exact(2)
                .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                .collect();
            match String::from_utf16(&units) {
                Ok(text) => Some(text),
                // Invalid UTF-16LE: decode the whole input (BOM included) as WinAnsi.
                Err(_) => Some(decode_simple_encoding(bytes, SimpleEncoding::WinAnsi)),
            }
        }
        _ => Some(decode_simple_encoding(bytes, SimpleEncoding::WinAnsi)),
    }
}
fn decode_simple_encoding(bytes: &[u8], encoding: SimpleEncoding) -> String {
match encoding {
SimpleEncoding::Ascii => bytes.iter().map(|&byte| char::from(byte)).collect(),
SimpleEncoding::WinAnsi => bytes.iter().map(|&byte| decode_win_ansi(byte)).collect(),
}
}
/// Maps one WinAnsi byte to a character.
///
/// Bytes `0x80..=0x9F` are looked up in a table; positions without a
/// dedicated glyph (`0x81`, `0x8D`, `0x8F`, `0x90`, `0x9D`) and every byte
/// outside that block map to the code point equal to the byte value.
fn decode_win_ansi(byte: u8) -> char {
    // Indexed by `byte - 0x80`.
    const HIGH_BLOCK: [char; 32] = [
        '€', '\u{81}', '‚', 'ƒ', '„', '…', '†', '‡',
        'ˆ', '‰', 'Š', '‹', 'Œ', '\u{8D}', 'Ž', '\u{8F}',
        '\u{90}', '‘', '’', '“', '”', '•', '–', '—',
        '˜', '™', 'š', '›', 'œ', '\u{9D}', 'ž', 'Ÿ',
    ];
    match byte {
        0x80..=0x9F => HIGH_BLOCK[usize::from(byte - 0x80)],
        _ => char::from(byte),
    }
}
/// 2-D affine transform held as six coefficients.
///
/// A point `(x, y)` maps to `(a*x + c*y + e, b*x + d*y + f)`.
#[derive(Clone, Copy, Debug)]
struct Matrix {
    a: f32,
    b: f32,
    c: f32,
    d: f32,
    e: f32,
    f: f32,
}

impl Matrix {
    /// The transform that leaves every point unchanged.
    fn identity() -> Self {
        Self {
            a: 1.0,
            b: 0.0,
            c: 0.0,
            d: 1.0,
            e: 0.0,
            f: 0.0,
        }
    }

    /// Builds a matrix from its six coefficients.
    fn new(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32) -> Self {
        Self { a, b, c, d, e, f }
    }

    /// Composes two transforms: the result applies `other` first, then `self`.
    fn multiply(&self, other: &Self) -> Self {
        let Self { a, b, c, d, e, f } = *self;
        let rhs = *other;
        Self::new(
            a * rhs.a + c * rhs.b,
            b * rhs.a + d * rhs.b,
            a * rhs.c + c * rhs.d,
            b * rhs.c + d * rhs.d,
            a * rhs.e + c * rhs.f + e,
            b * rhs.e + d * rhs.f + f,
        )
    }

    /// Returns this transform preceded by a translation of `(tx, ty)`.
    fn translate(&self, tx: f32, ty: f32) -> Self {
        let shift = Self {
            a: 1.0,
            b: 0.0,
            c: 0.0,
            d: 1.0,
            e: tx,
            f: ty,
        };
        self.multiply(&shift)
    }

    /// Applies the transform to the point `(x, y)`.
    fn transform_point(&self, x: f32, y: f32) -> (f32, f32) {
        let Self { a, b, c, d, e, f } = *self;
        let mapped_x = a * x + c * y + e;
        let mapped_y = b * x + d * y + f;
        (mapped_x, mapped_y)
    }
}
/// Text-related state tracked while walking a content stream.
#[derive(Clone, Debug)]
struct TextState {
    /// Key used to look the current font up in the page resources.
    font: String,
    /// Font size fed into the advance estimate and recorded on each span.
    font_size: f32,
    /// Line leading (presumably the `TL` value; its use is outside this excerpt).
    leading: f32,
    /// Extra advance added after every character.
    char_spacing: f32,
    /// Extra advance added for whitespace characters.
    word_spacing: f32,
    /// Multiplier applied to the estimated advance.
    horizontal_scaling: f32,
    /// Vertical offset applied to the span origin.
    rise: f32,
    /// Transform for the next glyph position.
    text_matrix: Matrix,
    /// Transform for the start of the current line.
    line_matrix: Matrix,
}

impl Default for TextState {
    /// Starts with no font, size and leading of 12, no extra spacing and
    /// identity matrices.
    fn default() -> Self {
        let identity = Matrix::identity();
        Self {
            font: String::new(),
            font_size: 12.0,
            leading: 12.0,
            char_spacing: 0.0,
            word_spacing: 0.0,
            horizontal_scaling: 1.0,
            rise: 0.0,
            text_matrix: identity,
            line_matrix: identity,
        }
    }
}

impl TextState {
    /// Moves the line origin by `(tx, ty)` and resets the text matrix to it.
    fn translate(&mut self, tx: f32, ty: f32) {
        let moved = self.line_matrix.translate(tx, ty);
        self.line_matrix = moved;
        self.text_matrix = moved;
    }
}
/// Decodes a shown string, records it as a span and advances the text matrix.
///
/// The bytes are decoded with the font named by `text.font` (a default font
/// when the name is unknown) and NUL characters are dropped. Strings that are
/// empty or whitespace-only after that are ignored entirely: no span is
/// pushed and the text matrix is left untouched.
fn show_pdf_text(
    resources: &PdfResources,
    graphics_stack: &[Matrix],
    text: &mut TextState,
    bytes: &[u8],
    spans: &mut Vec<PdfTextSpan>,
) {
    let decoded = resources
        .fonts
        .get(&text.font)
        .cloned()
        .unwrap_or_default()
        .decode(bytes);
    let display = decoded.replace('\u{0}', "");
    if display.trim().is_empty() {
        return;
    }

    // Span origin: the text matrix is applied first, then the current CTM.
    let ctm = graphics_stack
        .last()
        .map_or_else(Matrix::identity, |top| *top);
    let (x, y) = ctm
        .multiply(&text.text_matrix)
        .transform_point(0.0, text.rise);

    let advance = estimate_text_advance(
        &display,
        text.font_size,
        text.char_spacing,
        text.word_spacing,
        text.horizontal_scaling,
    );
    let span = PdfTextSpan {
        x,
        y,
        end_x: x + advance,
        font_size: text.font_size,
        text: display,
    };
    spans.push(span);
    text.text_matrix = text.text_matrix.translate(advance, 0.0);
}
/// Estimates the horizontal advance of `text` without real font metrics.
///
/// Each character contributes a fraction of `font_size` chosen by its class
/// (ASCII punctuation, whitespace, ASCII digit, ASCII uppercase, other) plus
/// `char_spacing`; whitespace also adds `word_spacing`. The total is
/// multiplied by `horizontal_scaling`, clamped to at least `0.01`.
fn estimate_text_advance(
    text: &str,
    font_size: f32,
    char_spacing: f32,
    word_spacing: f32,
    horizontal_scaling: f32,
) -> f32 {
    let unscaled = text.chars().fold(0.0_f32, |acc, ch| {
        let glyph = match ch {
            c if c.is_ascii_punctuation() => font_size * 0.35,
            c if c.is_whitespace() => font_size * 0.33 + word_spacing,
            c if c.is_ascii_digit() => font_size * 0.55,
            c if c.is_ascii_uppercase() => font_size * 0.62,
            _ => font_size * 0.52,
        };
        acc + glyph + char_spacing
    });
    unscaled * horizontal_scaling.max(0.01)
}
/// One lexical element produced by `ContentParser`.
#[derive(Clone, Debug, PartialEq)]
enum ContentToken {
    Number(f32),
    Name(String),
    String(Vec<u8>),
    Array(Vec<ContentToken>),
    Operator(String),
}

impl ContentToken {
    /// Consumes the token, yielding its value if it is a `Number`.
    fn as_number(self) -> Option<f32> {
        if let Self::Number(value) = self {
            Some(value)
        } else {
            None
        }
    }

    /// Consumes the token, yielding its text if it is a `Name`.
    fn into_name(self) -> Option<String> {
        if let Self::Name(name) = self {
            Some(name)
        } else {
            None
        }
    }

    /// Consumes the token, yielding its bytes if it is a `String`.
    fn into_bytes(self) -> Option<Vec<u8>> {
        if let Self::String(bytes) = self {
            Some(bytes)
        } else {
            None
        }
    }

    /// Consumes the token, yielding its elements if it is an `Array`.
    fn into_array(self) -> Option<Vec<ContentToken>> {
        if let Self::Array(items) = self {
            Some(items)
        } else {
            None
        }
    }
}
/// Pops the last six operands as numbers, returned in their original order.
///
/// Returns `None` without touching the stack when fewer than six operands
/// are present. If a non-number is met, `None` is returned and the operands
/// popped up to and including it stay removed.
fn take_six_numbers(operands: &mut Vec<ContentToken>) -> Option<[f32; 6]> {
    if operands.len() < 6 {
        return None;
    }
    let mut values = [0.0_f32; 6];
    // Fill back-to-front because the stack yields the last operand first.
    for slot in values.iter_mut().rev() {
        *slot = operands.pop()?.as_number()?;
    }
    Some(values)
}
/// Pops the last two operands as numbers, returned in their original order.
///
/// Returns `None` without touching the stack when fewer than two operands
/// are present. If a non-number is met, `None` is returned and the operands
/// popped up to and including it stay removed.
fn take_two_numbers(operands: &mut Vec<ContentToken>) -> Option<[f32; 2]> {
    if operands.len() < 2 {
        return None;
    }
    let mut pair = [0.0_f32; 2];
    // Fill back-to-front because the stack yields the last operand first.
    for slot in pair.iter_mut().rev() {
        *slot = operands.pop()?.as_number()?;
    }
    Some(pair)
}
/// Tokenizer for the operand/operator syntax of a PDF content stream.
struct ContentParser<'a> {
    /// Bytes being tokenized.
    data: &'a [u8],
    /// Offset of the next unread byte in `data`.
    pos: usize,
}

impl<'a> ContentParser<'a> {
    /// Creates a tokenizer positioned at the start of `data`.
    fn new(data: &'a [u8]) -> Self {
        Self { data, pos: 0 }
    }

    /// Reads the next token, or `Ok(None)` once only whitespace and comments remain.
    ///
    /// Every successful call consumes at least one byte, so callers can loop
    /// on this method without risking a stall on malformed input.
    ///
    /// # Errors
    /// Propagates parse errors from names, strings, arrays and numbers.
    fn next_token(&mut self) -> Result<Option<ContentToken>> {
        self.skip_ws_and_comments();
        let Some(byte) = self.peek() else {
            return Ok(None);
        };
        let token = match byte {
            b'/' => ContentToken::Name(self.parse_name()?),
            b'(' => ContentToken::String(self.parse_literal_string()?),
            // `<<` opens an inline dictionary, not a hex string.
            b'<' if self.peek_n(1) != Some(b'<') => ContentToken::String(self.parse_hex_string()?),
            b'[' => ContentToken::Array(self.parse_array()?),
            b'+' | b'-' | b'.' | b'0'..=b'9' => ContentToken::Number(self.parse_number()? as f32),
            _ => ContentToken::Operator(self.parse_operator()),
        };
        Ok(Some(token))
    }

    /// Skips past the next `EI` byte pair, or to the end of the data if none exists.
    fn skip_inline_image(&mut self) {
        if let Some(index) = self.data[self.pos..].windows(2).position(|window| window == b"EI") {
            self.pos += index + 2;
        } else {
            self.pos = self.data.len();
        }
    }

    /// Parses a `[...]` array of tokens.
    ///
    /// # Errors
    /// Fails if the data ends before the closing `]` or an element is malformed.
    fn parse_array(&mut self) -> Result<Vec<ContentToken>> {
        self.expect(b'[')?;
        let mut items = Vec::new();
        loop {
            self.skip_ws_and_comments();
            if self.peek() == Some(b']') {
                self.pos += 1;
                break;
            }
            let Some(token) = self.next_token()? else {
                return Err(LoError::Parse("unterminated pdf content array".to_string()));
            };
            items.push(token);
        }
        Ok(items)
    }

    /// Parses a `/Name`, returning the text after the slash (no `#xx` unescaping).
    fn parse_name(&mut self) -> Result<String> {
        self.expect(b'/')?;
        let start = self.pos;
        while let Some(byte) = self.peek() {
            if is_pdf_space(byte) || is_pdf_delim(byte) {
                break;
            }
            self.pos += 1;
        }
        Ok(String::from_utf8_lossy(&self.data[start..self.pos]).to_string())
    }

    /// Parses an optionally signed decimal number.
    ///
    /// # Errors
    /// Fails when the consumed characters do not form a valid `f64`
    /// (for example a lone sign or more than one `.`).
    fn parse_number(&mut self) -> Result<f64> {
        let start = self.pos;
        if matches!(self.peek(), Some(b'+') | Some(b'-')) {
            self.pos += 1;
        }
        while let Some(byte) = self.peek() {
            if byte.is_ascii_digit() || byte == b'.' {
                self.pos += 1;
            } else {
                break;
            }
        }
        let text = std::str::from_utf8(&self.data[start..self.pos])
            .map_err(|err| LoError::Parse(format!("invalid pdf content number: {err}")))?;
        text.parse::<f64>()
            .map_err(|err| LoError::Parse(format!("invalid pdf content number: {err}")))
    }

    /// Reads an operator keyword up to the next whitespace or delimiter.
    ///
    /// When the cursor sits directly on a delimiter that no other token rule
    /// claims (`<<`, `>>`, a lone `>`, `]` or `)`), that delimiter is consumed
    /// and returned as the operator text. Returning an empty operator without
    /// advancing would make `next_token` yield the same token forever.
    fn parse_operator(&mut self) -> String {
        let start = self.pos;
        while let Some(byte) = self.peek() {
            if is_pdf_space(byte) || matches!(byte, b'[' | b']' | b'<' | b'>' | b'(' | b')' | b'/') {
                break;
            }
            self.pos += 1;
        }
        if self.pos == start {
            if let Some(first) = self.next_byte() {
                // Keep `<<` and `>>` together as one token.
                if matches!(first, b'<' | b'>') && self.peek() == Some(first) {
                    self.pos += 1;
                }
            }
        }
        String::from_utf8_lossy(&self.data[start..self.pos]).to_string()
    }

    /// Parses a `<...>` hex string into bytes.
    ///
    /// # Errors
    /// Fails if the closing `>` is missing or the content is not valid hex.
    fn parse_hex_string(&mut self) -> Result<Vec<u8>> {
        self.expect(b'<')?;
        let start = self.pos;
        while let Some(byte) = self.peek() {
            if byte == b'>' {
                break;
            }
            self.pos += 1;
        }
        let end = self.pos;
        self.expect(b'>')?;
        // Hand the delimiters to `parse_hex_bytes` too; it requires them.
        parse_hex_bytes(std::str::from_utf8(&self.data[start - 1..=end]).unwrap_or(""))
            .ok_or_else(|| LoError::Parse("invalid pdf content hex string".to_string()))
    }

    /// Parses a `(...)` literal string, resolving escapes and balanced parentheses.
    ///
    /// # Errors
    /// Fails if the data ends before the string is closed.
    fn parse_literal_string(&mut self) -> Result<Vec<u8>> {
        self.expect(b'(')?;
        let mut depth = 1usize;
        let mut out = Vec::new();
        while let Some(byte) = self.next_byte() {
            match byte {
                b'(' => {
                    depth += 1;
                    out.push(byte);
                }
                b')' => {
                    depth -= 1;
                    if depth == 0 {
                        return Ok(out);
                    }
                    out.push(byte);
                }
                b'\\' => {
                    let Some(escaped) = self.next_byte() else { break };
                    match escaped {
                        b'n' => out.push(b'\n'),
                        b'r' => out.push(b'\r'),
                        b't' => out.push(b'\t'),
                        b'b' => out.push(0x08),
                        b'f' => out.push(0x0C),
                        b'(' | b')' | b'\\' => out.push(escaped),
                        // Backslash + end-of-line is a line continuation.
                        b'\r' => {
                            if self.peek() == Some(b'\n') {
                                self.pos += 1;
                            }
                        }
                        b'\n' => {}
                        // Octal escape: one to three digits, truncated to a byte.
                        b'0'..=b'7' => {
                            let mut value = (escaped - b'0') as u16;
                            for _ in 0..2 {
                                match self.peek() {
                                    Some(next @ b'0'..=b'7') => {
                                        self.pos += 1;
                                        value = (value << 3) | (next - b'0') as u16;
                                    }
                                    _ => break,
                                }
                            }
                            out.push(value as u8);
                        }
                        other => out.push(other),
                    }
                }
                _ => out.push(byte),
            }
        }
        Err(LoError::Parse("unterminated pdf content string".to_string()))
    }

    /// Advances past whitespace and `%` comments (which run to end of line).
    fn skip_ws_and_comments(&mut self) {
        loop {
            while let Some(byte) = self.peek() {
                if is_pdf_space(byte) {
                    self.pos += 1;
                } else {
                    break;
                }
            }
            if self.peek() == Some(b'%') {
                while let Some(byte) = self.next_byte() {
                    if byte == b'\n' || byte == b'\r' {
                        break;
                    }
                }
                continue;
            }
            break;
        }
    }

    /// Consumes one byte and checks that it equals `byte`.
    fn expect(&mut self, byte: u8) -> Result<()> {
        match self.next_byte() {
            Some(value) if value == byte => Ok(()),
            other => Err(LoError::Parse(format!(
                "expected byte {byte:?}, got {other:?}"
            ))),
        }
    }

    /// Returns the byte at the cursor without consuming it.
    fn peek(&self) -> Option<u8> {
        self.data.get(self.pos).copied()
    }

    /// Returns the byte `offset` positions past the cursor without consuming it.
    fn peek_n(&self, offset: usize) -> Option<u8> {
        self.data.get(self.pos + offset).copied()
    }

    /// Consumes and returns the byte at the cursor.
    fn next_byte(&mut self) -> Option<u8> {
        let byte = self.peek()?;
        self.pos += 1;
        Some(byte)
    }
}
/// Recursive-descent parser for PDF objects (names, strings, numbers,
/// references, arrays and dictionaries).
struct Parser<'a> {
    /// Bytes being parsed.
    data: &'a [u8],
    /// Offset of the next unread byte in `data`.
    pos: usize,
}

impl<'a> Parser<'a> {
    /// Creates a parser positioned at the start of `data`.
    fn new(data: &'a [u8]) -> Self {
        Self { data, pos: 0 }
    }

    /// Creates a parser positioned at byte offset `pos` of `data`.
    fn new_at(data: &'a [u8], pos: usize) -> Self {
        Self { data, pos }
    }

    /// Parses the next PDF value after skipping whitespace and comments.
    ///
    /// # Errors
    /// Fails on an unexpected byte, end of input, or a malformed nested value.
    fn parse_value(&mut self) -> Result<PdfValue> {
        self.skip_ws_and_comments();
        match self.peek() {
            Some(b'/') => Ok(PdfValue::Name(self.parse_name()?)),
            Some(b'(') => Ok(PdfValue::String(self.parse_literal_string()?)),
            Some(b'<') if self.peek_n(1) == Some(b'<') => Ok(PdfValue::Dict(self.parse_dict()?)),
            Some(b'<') => Ok(PdfValue::String(self.parse_hex_string()?)),
            Some(b'[') => Ok(PdfValue::Array(self.parse_array()?)),
            Some(b't') if self.consume_keyword(b"true") => Ok(PdfValue::Bool(true)),
            Some(b'f') if self.consume_keyword(b"false") => Ok(PdfValue::Bool(false)),
            Some(b'n') if self.consume_keyword(b"null") => Ok(PdfValue::Null),
            Some(b'+') | Some(b'-') | Some(b'.') | Some(b'0'..=b'9') => self.parse_number_or_ref(),
            other => Err(LoError::Parse(format!("unexpected pdf token at {:?}", other))),
        }
    }

    /// Parses a run of ASCII digits as a `u32`.
    ///
    /// # Errors
    /// Fails if no digit is present or the value does not fit in `u32`.
    fn parse_unsigned_number(&mut self) -> Result<u32> {
        let start = self.pos;
        while let Some(byte) = self.peek() {
            if byte.is_ascii_digit() {
                self.pos += 1;
            } else {
                break;
            }
        }
        if self.pos == start {
            return Err(LoError::Parse("expected unsigned integer".to_string()));
        }
        std::str::from_utf8(&self.data[start..self.pos])
            .map_err(|err| LoError::Parse(format!("invalid pdf integer: {err}")))?
            .parse::<u32>()
            .map_err(|err| LoError::Parse(format!("invalid pdf integer: {err}")))
    }

    /// Parses either an indirect reference (`N G R`) or a plain number.
    ///
    /// A reference is recognised only when both integers fit the identifier
    /// types (`u32` object number, `u16` generation). Out-of-range or negative
    /// values are not wrapped into a different object id; the first number is
    /// returned on its own and the cursor is left right after it.
    ///
    /// # Errors
    /// Fails when the first token is not a valid number.
    fn parse_number_or_ref(&mut self) -> Result<PdfValue> {
        let first_start = self.pos;
        let first = self.parse_number_token()?;
        let after_first = self.pos;
        if let Some(first_int) = first.integer {
            self.skip_ws_and_comments();
            if let Ok(second) = self.parse_number_token() {
                if let Some(second_int) = second.integer {
                    let ids_in_range = (0..=i64::from(u32::MAX)).contains(&first_int)
                        && (0..=i64::from(u16::MAX)).contains(&second_int);
                    if ids_in_range {
                        self.skip_ws_and_comments();
                        if self.consume_keyword(b"R") {
                            // Lossless: both values were range-checked above.
                            return Ok(PdfValue::Ref(PdfObjectId::new(first_int as u32, second_int as u16)));
                        }
                    }
                }
            }
        }
        // Not a reference: rewind to just after the first number.
        self.pos = after_first;
        let text = std::str::from_utf8(&self.data[first_start..after_first])
            .map_err(|err| LoError::Parse(format!("invalid pdf number: {err}")))?;
        let number = text
            .parse::<f64>()
            .map_err(|err| LoError::Parse(format!("invalid pdf number: {err}")))?;
        Ok(PdfValue::Number(number))
    }

    /// Consumes a signed number with at most one `.` and reports whether it
    /// is an integer that fits in `i64`.
    ///
    /// # Errors
    /// Fails if no character was consumed.
    fn parse_number_token(&mut self) -> Result<NumberToken> {
        let start = self.pos;
        if matches!(self.peek(), Some(b'+') | Some(b'-')) {
            self.pos += 1;
        }
        let mut has_dot = false;
        while let Some(byte) = self.peek() {
            if byte.is_ascii_digit() {
                self.pos += 1;
            } else if byte == b'.' && !has_dot {
                has_dot = true;
                self.pos += 1;
            } else {
                break;
            }
        }
        if self.pos == start {
            return Err(LoError::Parse("expected pdf number".to_string()));
        }
        let text = std::str::from_utf8(&self.data[start..self.pos])
            .map_err(|err| LoError::Parse(format!("invalid pdf number: {err}")))?
            .to_string();
        let integer = if !text.contains('.') {
            text.parse::<i64>().ok()
        } else {
            None
        };
        Ok(NumberToken { integer })
    }

    /// Parses a `/Name`, decoding `#xx` hex escapes; each byte becomes the
    /// `char` with the same code point.
    ///
    /// # Errors
    /// Fails on a missing `/` or a truncated/invalid hex escape.
    fn parse_name(&mut self) -> Result<String> {
        self.expect(b'/')?;
        let mut out = String::new();
        while let Some(byte) = self.peek() {
            if is_pdf_space(byte) || is_pdf_delim(byte) {
                break;
            }
            if byte == b'#' {
                self.pos += 1;
                let a = self.next_byte().ok_or_else(|| LoError::Parse("truncated pdf name hex escape".to_string()))?;
                let b = self.next_byte().ok_or_else(|| LoError::Parse("truncated pdf name hex escape".to_string()))?;
                let hi = (a as char)
                    .to_digit(16)
                    .ok_or_else(|| LoError::Parse("invalid pdf name hex escape".to_string()))? as u8;
                let lo = (b as char)
                    .to_digit(16)
                    .ok_or_else(|| LoError::Parse("invalid pdf name hex escape".to_string()))? as u8;
                out.push(((hi << 4) | lo) as char);
                continue;
            }
            self.pos += 1;
            out.push(byte as char);
        }
        Ok(out)
    }

    /// Parses a `[...]` array of values.
    ///
    /// # Errors
    /// Fails if an element is malformed or the input ends before `]`.
    fn parse_array(&mut self) -> Result<Vec<PdfValue>> {
        self.expect(b'[')?;
        let mut items = Vec::new();
        loop {
            self.skip_ws_and_comments();
            if self.peek() == Some(b']') {
                self.pos += 1;
                break;
            }
            items.push(self.parse_value()?);
        }
        Ok(items)
    }

    /// Parses a `<< /Key value ... >>` dictionary; later duplicate keys win.
    ///
    /// # Errors
    /// Fails if a key is not a name, a value is malformed, or `>>` is missing.
    fn parse_dict(&mut self) -> Result<BTreeMap<String, PdfValue>> {
        self.expect(b'<')?;
        self.expect(b'<')?;
        let mut dict = BTreeMap::new();
        loop {
            self.skip_ws_and_comments();
            if self.peek() == Some(b'>') && self.peek_n(1) == Some(b'>') {
                self.pos += 2;
                break;
            }
            let key = self.parse_name()?;
            let value = self.parse_value()?;
            dict.insert(key, value);
        }
        Ok(dict)
    }

    /// Parses a `(...)` literal string by delegating to `ContentParser`.
    fn parse_literal_string(&mut self) -> Result<Vec<u8>> {
        let mut content = ContentParser::new(&self.data[self.pos..]);
        let value = content.parse_literal_string()?;
        self.pos += content.pos;
        Ok(value)
    }

    /// Parses a `<...>` hex string by delegating to `ContentParser`.
    fn parse_hex_string(&mut self) -> Result<Vec<u8>> {
        let mut content = ContentParser::new(&self.data[self.pos..]);
        let value = content.parse_hex_string()?;
        self.pos += content.pos;
        Ok(value)
    }

    /// Advances past whitespace and `%` comments (which run to end of line).
    fn skip_ws_and_comments(&mut self) {
        loop {
            while let Some(byte) = self.peek() {
                if is_pdf_space(byte) {
                    self.pos += 1;
                } else {
                    break;
                }
            }
            if self.peek() == Some(b'%') {
                while let Some(byte) = self.next_byte() {
                    if byte == b'\n' || byte == b'\r' {
                        break;
                    }
                }
                continue;
            }
            break;
        }
    }

    /// Consumes `keyword` if the input continues with exactly those bytes.
    fn consume_keyword(&mut self, keyword: &[u8]) -> bool {
        if self.data.get(self.pos..self.pos + keyword.len()) == Some(keyword) {
            self.pos += keyword.len();
            true
        } else {
            false
        }
    }

    /// Consumes one byte and checks that it equals `byte`.
    fn expect(&mut self, byte: u8) -> Result<()> {
        match self.next_byte() {
            Some(value) if value == byte => Ok(()),
            other => Err(LoError::Parse(format!(
                "expected byte {byte:?}, got {other:?}"
            ))),
        }
    }

    /// Returns the byte at the cursor without consuming it.
    fn peek(&self) -> Option<u8> {
        self.data.get(self.pos).copied()
    }

    /// Returns the byte `offset` positions past the cursor without consuming it.
    fn peek_n(&self, offset: usize) -> Option<u8> {
        self.data.get(self.pos + offset).copied()
    }

    /// Consumes and returns the byte at the cursor.
    fn next_byte(&mut self) -> Option<u8> {
        let byte = self.peek()?;
        self.pos += 1;
        Some(byte)
    }
}
/// Result of `Parser::parse_number_token`.
struct NumberToken {
/// The parsed value when the token had no `.` and fit in `i64`; `None` otherwise.
integer: Option<i64>,
}
/// Reports whether `byte` is one of the six PDF whitespace bytes:
/// NUL, tab, line feed, form feed, carriage return or space.
fn is_pdf_space(byte: u8) -> bool {
    b"\0\t\n\x0C\r ".contains(&byte)
}
/// Reports whether `byte` is one of the PDF delimiter bytes `( ) < > [ ] { } / %`.
fn is_pdf_delim(byte: u8) -> bool {
    b"()<>[]{}/%".contains(&byte)
}
/// Normalises a filter entry into a list of filter names.
///
/// A single name yields a one-element list; an array yields its name
/// elements in order (non-name elements are skipped); anything else,
/// including a missing value, yields an empty list.
fn collect_filter_names(value: Option<&PdfValue>) -> Vec<String> {
    let Some(value) = value else {
        return Vec::new();
    };
    match value {
        PdfValue::Name(name) => vec![name.clone()],
        PdfValue::Array(items) => {
            let mut names = Vec::new();
            for item in items {
                if let Some(name) = item.as_name() {
                    names.push(name.to_string());
                }
            }
            names
        }
        _ => Vec::new(),
    }
}
/// Decodes ASCIIHex data.
///
/// Input is read up to the first `>`; PDF whitespace is ignored and an odd
/// number of digits is padded with a trailing `0`.
///
/// # Errors
/// Fails on any character that is not a hex digit.
fn decode_ascii_hex(data: &[u8]) -> Result<Vec<u8>> {
    let mut digits: Vec<u8> = data
        .iter()
        .copied()
        .take_while(|&byte| byte != b'>')
        .filter(|&byte| !is_pdf_space(byte))
        .collect();
    if digits.len() % 2 != 0 {
        digits.push(b'0');
    }
    let nibble = |raw: u8| -> Result<u8> {
        (raw as char)
            .to_digit(16)
            .map(|value| value as u8)
            .ok_or_else(|| LoError::Parse("invalid ASCIIHex digit".to_string()))
    };
    digits
        .chunks_exact(2)
        .map(|pair| -> Result<u8> { Ok((nibble(pair[0])? << 4) | nibble(pair[1])?) })
        .collect()
}
/// Decodes ASCII85 data.
///
/// Decoding stops at the first `~`. `z` between groups expands to four zero
/// bytes, PDF whitespace is skipped, and a final partial group of `n` digits
/// is padded with the highest digit and contributes `n - 1` bytes. Group
/// values that exceed 32 bits saturate instead of failing.
///
/// # Errors
/// Fails on any byte outside `!`..=`u` that is not handled above
/// (including a `z` in the middle of a group).
fn decode_ascii85(data: &[u8]) -> Result<Vec<u8>> {
    /// Combines five base-85 digits into one 32-bit value (saturating).
    fn pack(digits: &[u32; 5]) -> u32 {
        digits
            .iter()
            .fold(0u32, |acc, &digit| acc.saturating_mul(85).saturating_add(digit))
    }

    let mut decoded = Vec::new();
    let mut digits = [0u32; 5];
    let mut filled = 0usize;
    for &byte in data {
        if byte == b'~' {
            break;
        }
        if byte == b'z' && filled == 0 {
            decoded.extend_from_slice(&[0u8; 4]);
            continue;
        }
        if is_pdf_space(byte) {
            continue;
        }
        if !(b'!'..=b'u').contains(&byte) {
            return Err(LoError::Parse(format!("invalid ASCII85 byte: {byte}")));
        }
        digits[filled] = u32::from(byte - b'!');
        filled += 1;
        if filled == digits.len() {
            decoded.extend_from_slice(&pack(&digits).to_be_bytes());
            filled = 0;
        }
    }
    if filled > 0 {
        // Pad the unfinished group with the largest digit, then keep only
        // the bytes that the supplied digits fully determine.
        digits[filled..].fill(84);
        let tail = pack(&digits).to_be_bytes();
        decoded.extend_from_slice(&tail[..filled - 1]);
    }
    Ok(decoded)
}
/// Decodes RunLength data.
///
/// A control byte `0..=127` copies the next `control + 1` bytes literally,
/// `129..=255` repeats the next byte `257 - control` times, and `128` ends
/// the stream. Running out of input between runs also ends decoding.
///
/// # Errors
/// Fails when a literal run or a repeat byte is cut off by the end of input.
fn decode_run_length(data: &[u8]) -> Result<Vec<u8>> {
    let mut decoded = Vec::new();
    let mut cursor = 0usize;
    while let Some(&control) = data.get(cursor) {
        cursor += 1;
        if control == 128 {
            break;
        }
        if control < 128 {
            let run = usize::from(control) + 1;
            let literal = data
                .get(cursor..cursor + run)
                .ok_or_else(|| LoError::Parse("truncated RunLength stream".to_string()))?;
            decoded.extend_from_slice(literal);
            cursor += run;
        } else {
            let repeats = 257 - usize::from(control);
            let value = *data
                .get(cursor)
                .ok_or_else(|| LoError::Parse("truncated RunLength repeat".to_string()))?;
            cursor += 1;
            decoded.resize(decoded.len() + repeats, value);
        }
    }
    Ok(decoded)
}
/// Inflates Flate data that is either zlib-wrapped or raw deflate.
///
/// A zlib wrapper is assumed when the first byte's low nibble is 8 and the
/// big-endian 16-bit header is a multiple of 31. In that case the 2-byte
/// header (plus 4 more bytes when the preset-dictionary flag is set) and the
/// 4-byte trailer are stripped; the trailer is not verified. Anything else
/// is handed to the raw inflater unchanged.
///
/// # Errors
/// Fails on a zlib wrapper too short to hold header and trailer, or on any
/// inflate error.
fn decode_flate_stream(data: &[u8]) -> Result<Vec<u8>> {
    let looks_like_zlib = match data {
        [cmf, flg, ..] => cmf & 0x0F == 8 && u16::from_be_bytes([*cmf, *flg]) % 31 == 0,
        _ => false,
    };
    if !looks_like_zlib {
        return inflate_raw_deflate(data, 0);
    }
    if data.len() < 6 {
        return Err(LoError::Parse("truncated zlib stream".to_string()));
    }
    let has_preset_dict = data[1] & 0x20 != 0;
    let body_start = if has_preset_dict { 6 } else { 2 };
    let body_end = data.len() - 4;
    if body_start > body_end {
        return Err(LoError::Parse("invalid zlib stream".to_string()));
    }
    inflate_raw_deflate(&data[body_start..body_end], 0)
}
/// Decodes LZW data using the PDF default of `EarlyChange = 1`.
///
/// See [`decode_lzw_with_early_change`] for the details and for decoding
/// streams that declare `EarlyChange 0`.
///
/// # Errors
/// Fails on a code that refers to a table entry that does not exist.
fn decode_lzw(data: &[u8]) -> Result<Vec<u8>> {
    decode_lzw_with_early_change(data, true)
}

/// Decodes MSB-first, variable-width (9 to 12 bit) LZW data.
///
/// Code 256 clears the table and code 257 ends the data; running out of
/// bits also ends decoding. With `early_change` set the code width grows one
/// code earlier than strictly necessary (when the table holds 511, 1023 or
/// 2047 entries); without it the width grows as late as possible (512, 1024,
/// 2048 entries).
///
/// The table never exceeds 4096 entries: codes are at most 12 bits wide, so
/// further entries could not be addressed, and the cap keeps memory bounded
/// for streams that never emit a clear code.
///
/// # Errors
/// Fails on a code that refers to a table entry that does not exist.
fn decode_lzw_with_early_change(data: &[u8], early_change: bool) -> Result<Vec<u8>> {
    const CLEAR: u16 = 256;
    const EOD: u16 = 257;
    const FIRST_FREE: usize = 258;
    const MIN_CODE_SIZE: u8 = 9;
    const MAX_CODE_SIZE: u8 = 12;
    const TABLE_LIMIT: usize = 1 << MAX_CODE_SIZE;

    let mut reader = MsbBitReader::new(data);
    // Entries 0..=255 are the single bytes; 256 and 257 are placeholders for
    // the control codes so that table index == code.
    let mut dict: Vec<Vec<u8>> = (0..=u8::MAX).map(|byte| vec![byte]).collect();
    dict.resize(FIRST_FREE, Vec::new());
    let lead = usize::from(early_change);
    let mut code_size = MIN_CODE_SIZE;
    let mut prev: Option<Vec<u8>> = None;
    let mut out = Vec::new();

    while let Some(code) = reader.read_bits(code_size)? {
        let code = code as u16;
        if code == CLEAR {
            dict.truncate(FIRST_FREE);
            code_size = MIN_CODE_SIZE;
            prev = None;
            continue;
        }
        if code == EOD {
            break;
        }

        let entry = if let Some(existing) = dict.get(usize::from(code)) {
            existing.clone()
        } else if usize::from(code) == dict.len() {
            // The code being defined right now: previous string + its own first byte.
            let mut generated = prev.clone().ok_or_else(|| {
                LoError::Parse("invalid LZW back-reference".to_string())
            })?;
            let first = *generated.first().ok_or_else(|| {
                LoError::Parse("empty LZW previous entry".to_string())
            })?;
            generated.push(first);
            generated
        } else {
            return Err(LoError::Parse(format!("invalid LZW code {code}")));
        };
        out.extend_from_slice(&entry);

        if let Some(mut grown) = prev.take() {
            if dict.len() < TABLE_LIMIT {
                if let Some(&first) = entry.first() {
                    grown.push(first);
                    dict.push(grown);
                    if code_size < MAX_CODE_SIZE && dict.len() + lead == 1usize << code_size {
                        code_size += 1;
                    }
                }
            }
        }
        prev = Some(entry);
    }
    Ok(out)
}
/// Bit reader that consumes each byte from its most significant bit down.
struct MsbBitReader<'a> {
    /// Bytes being read.
    data: &'a [u8],
    /// Index of the next unread bit, counted from the start of `data`.
    bit_pos: usize,
}

impl<'a> MsbBitReader<'a> {
    /// Creates a reader positioned at the first bit of `data`.
    fn new(data: &'a [u8]) -> Self {
        Self { data, bit_pos: 0 }
    }

    /// Reads `count` bits into the low bits of the result, first bit highest.
    ///
    /// Returns `Ok(None)` without consuming anything when fewer than `count`
    /// bits remain, and `Ok(Some(0))` for `count == 0`.
    fn read_bits(&mut self, count: u8) -> Result<Option<u32>> {
        let wanted = usize::from(count);
        if wanted == 0 {
            return Ok(Some(0));
        }
        let end = self.bit_pos + wanted;
        if end > self.data.len() * 8 {
            return Ok(None);
        }
        let data = self.data;
        let value = (self.bit_pos..end).fold(0u32, |acc, bit| {
            let shift = 7 - (bit % 8);
            (acc << 1) | u32::from((data[bit / 8] >> shift) & 1)
        });
        self.bit_pos = end;
        Ok(Some(value))
    }
}
/// Bit reader that consumes each byte from its least significant bit up.
struct BitReader<'a> {
    /// Bytes being read.
    data: &'a [u8],
    /// Index of the next unread bit, counted from the start of `data`.
    bit_pos: usize,
}

impl<'a> BitReader<'a> {
    /// Creates a reader positioned at the first bit of `data`.
    fn new(data: &'a [u8]) -> Self {
        Self { data, bit_pos: 0 }
    }

    /// Reads `count` bits; the first bit read becomes bit 0 of the result.
    ///
    /// # Errors
    /// Fails when the data ends first; bits read before that stay consumed.
    fn read_bits(&mut self, count: u8) -> Result<u32> {
        let mut value = 0u32;
        for shift in 0..count {
            let byte = match self.data.get(self.bit_pos / 8) {
                Some(&byte) => byte,
                None => {
                    return Err(LoError::Parse(
                        "unexpected end of deflate stream".to_string(),
                    ))
                }
            };
            let bit = u32::from((byte >> (self.bit_pos % 8)) & 1);
            value |= bit << shift;
            self.bit_pos += 1;
        }
        Ok(value)
    }

    /// Advances to the next byte boundary (no-op when already aligned).
    fn align_byte(&mut self) {
        let partial = self.bit_pos % 8;
        if partial != 0 {
            self.bit_pos += 8 - partial;
        }
    }

    /// Aligns to a byte boundary and returns the next `len` bytes.
    ///
    /// # Errors
    /// Fails when fewer than `len` bytes remain; the cursor stays at the
    /// aligned position in that case.
    fn read_aligned_bytes(&mut self, len: usize) -> Result<&'a [u8]> {
        self.align_byte();
        let first = self.bit_pos / 8;
        let bytes = match self.data.get(first..first + len) {
            Some(bytes) => bytes,
            None => {
                return Err(LoError::Parse(
                    "unexpected end of aligned deflate block".to_string(),
                ))
            }
        };
        self.bit_pos += len * 8;
        Ok(bytes)
    }
}
/// Canonical Huffman decoding table for deflate.
#[derive(Clone, Debug)]
struct Huffman {
    /// Longest code length present in the table, in bits.
    max_len: u8,
    /// Maps `(code length, code bits in read order)` to the decoded symbol.
    table: BTreeMap<(u8, u16), u16>,
}

impl Huffman {
    /// Builds the table from per-symbol code lengths (0 = symbol unused).
    ///
    /// Codes are assigned canonically (shorter codes first, then by symbol
    /// index) and stored bit-reversed so they can be matched against bits in
    /// the order the reader delivers them.
    ///
    /// # Errors
    /// Fails when every length is zero, when a length exceeds 15 bits, or
    /// when the lengths are over-subscribed (more codes of some length than
    /// a prefix code can hold). The last check also guarantees that the
    /// 16-bit code arithmetic below cannot overflow.
    fn from_code_lengths(lengths: &[u8]) -> Result<Self> {
        const MAX_BITS: u8 = 15;

        let max_len = lengths.iter().copied().max().unwrap_or(0);
        if max_len == 0 {
            return Err(LoError::Parse("empty huffman table".to_string()));
        }
        if max_len > MAX_BITS {
            return Err(LoError::Parse(
                "huffman code length exceeds 15 bits".to_string(),
            ));
        }

        let mut bl_count = vec![0u32; max_len as usize + 1];
        for &len in lengths {
            if len > 0 {
                bl_count[len as usize] += 1;
            }
        }

        // Walk the code tree level by level: each level doubles the number of
        // free slots, and the codes of that length use some of them up.
        let mut available = 1u32;
        for &used in &bl_count[1..] {
            available = (available << 1).checked_sub(used).ok_or_else(|| {
                LoError::Parse("over-subscribed huffman code lengths".to_string())
            })?;
        }

        let mut next_code = vec![0u16; max_len as usize + 1];
        let mut code = 0u32;
        for bits in 1..=max_len as usize {
            code = (code + bl_count[bits - 1]) << 1;
            // At most 2^15 after the over-subscription check, so this fits.
            next_code[bits] = code as u16;
        }

        let mut table = BTreeMap::new();
        for (symbol, &len) in lengths.iter().enumerate() {
            if len == 0 {
                continue;
            }
            let canonical = next_code[len as usize];
            next_code[len as usize] += 1;
            let reversed = reverse_bits(canonical, len);
            table.insert((len, reversed), symbol as u16);
        }
        Ok(Self { max_len, table })
    }

    /// Reads bits one at a time until they match a code in the table.
    ///
    /// # Errors
    /// Fails when the input ends or no code matches within `max_len` bits.
    fn decode_symbol(&self, bits: &mut BitReader<'_>) -> Result<u16> {
        let mut code = 0u16;
        for len in 1..=self.max_len {
            let bit = bits.read_bits(1)? as u16;
            code |= bit << (len - 1);
            if let Some(symbol) = self.table.get(&(len, code)) {
                return Ok(*symbol);
            }
        }
        Err(LoError::Parse("invalid huffman code".to_string()))
    }
}
/// Reverses the order of the lowest `len` bits of `code`.
///
/// Bits above position `len` in the input are ignored.
fn reverse_bits(code: u16, len: u8) -> u16 {
    let (reversed, _) = (0..len).fold((0u16, code), |(acc, rest), _| {
        ((acc << 1) | (rest & 1), rest >> 1)
    });
    reversed
}
/// Inflates a raw deflate stream (no zlib or gzip wrapper).
///
/// `expected_len` is only a capacity hint for the output buffer (at least
/// 256 bytes are reserved). Blocks are processed until one carries the
/// final-block flag.
///
/// # Errors
/// Fails on truncated input, a reserved block type, or any malformed block.
fn inflate_raw_deflate(data: &[u8], expected_len: usize) -> Result<Vec<u8>> {
    let mut bits = BitReader::new(data);
    let mut inflated = Vec::with_capacity(expected_len.max(256));
    let mut last_block = false;
    while !last_block {
        last_block = bits.read_bits(1)? != 0;
        match bits.read_bits(2)? as u8 {
            0 => read_stored_block(&mut bits, &mut inflated)?,
            1 => {
                let literals = fixed_literal_huffman()?;
                let distances = fixed_distance_huffman()?;
                read_huffman_block(&mut bits, &literals, &distances, &mut inflated)?;
            }
            2 => {
                let (literals, distances) = read_dynamic_huffman_tables(&mut bits)?;
                read_huffman_block(&mut bits, &literals, &distances, &mut inflated)?;
            }
            3 => return Err(LoError::Parse("reserved deflate block type".to_string())),
            // A two-bit field cannot hold any other value.
            _ => unreachable!(),
        }
    }
    Ok(inflated)
}
/// Copies one stored (uncompressed) deflate block into `out`.
///
/// After aligning to a byte boundary the block carries a little-endian
/// 16-bit length, its one's complement, and then that many raw bytes.
///
/// # Errors
/// Fails when the two length fields disagree or the data is truncated.
fn read_stored_block(reader: &mut BitReader<'_>, out: &mut Vec<u8>) -> Result<()> {
    reader.align_byte();
    let header = reader.read_aligned_bytes(4)?;
    let (len_bytes, check_bytes) = header.split_at(2);
    let len = u16::from_le_bytes([len_bytes[0], len_bytes[1]]);
    let complement = u16::from_le_bytes([check_bytes[0], check_bytes[1]]);
    // A value XOR its one's complement has every bit set.
    if len ^ complement != u16::MAX {
        return Err(LoError::Parse(
            "stored deflate block length checksum mismatch".to_string(),
        ));
    }
    out.extend_from_slice(reader.read_aligned_bytes(usize::from(len))?);
    Ok(())
}
/// Reads the header of a dynamic-Huffman block and builds its two tables.
///
/// Returns `(literal/length table, distance table)`. When every distance
/// length is zero a one-symbol distance table is substituted. Repeat codes
/// that run past the declared number of lengths are tolerated; the surplus
/// is ignored.
///
/// # Errors
/// Fails on truncated input, an invalid code-length symbol, a repeat code
/// with nothing to repeat, or lengths from which no table can be built.
fn read_dynamic_huffman_tables(reader: &mut BitReader<'_>) -> Result<(Huffman, Huffman)> {
    // Order in which the code-length code lengths are transmitted.
    const ORDER: [usize; 19] = [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15];

    let literal_count = reader.read_bits(5)? as usize + 257;
    let distance_count = reader.read_bits(5)? as usize + 1;
    let transmitted = reader.read_bits(4)? as usize + 4;

    let mut meta_lengths = [0u8; 19];
    for &slot in ORDER.iter().take(transmitted) {
        meta_lengths[slot] = reader.read_bits(3)? as u8;
    }
    let meta = Huffman::from_code_lengths(&meta_lengths)?;

    let wanted = literal_count + distance_count;
    let mut lengths: Vec<u8> = Vec::with_capacity(wanted);
    while lengths.len() < wanted {
        let symbol = meta.decode_symbol(reader)?;
        let (value, repeat) = match symbol {
            0..=15 => (symbol as u8, 1),
            16 => {
                let repeat = reader.read_bits(2)? as usize + 3;
                let previous = *lengths.last().ok_or_else(|| {
                    LoError::Parse("repeat code without previous code length".to_string())
                })?;
                (previous, repeat)
            }
            17 => (0, reader.read_bits(3)? as usize + 3),
            18 => (0, reader.read_bits(7)? as usize + 11),
            other => {
                return Err(LoError::Parse(format!(
                    "invalid code length symbol {other}"
                )))
            }
        };
        lengths.resize(lengths.len() + repeat, value);
    }

    let (literal_lengths, rest) = lengths.split_at(literal_count);
    let distance_lengths = &rest[..distance_count];
    let litlen = Huffman::from_code_lengths(literal_lengths)?;
    let dist = if distance_lengths.iter().any(|&len| len != 0) {
        Huffman::from_code_lengths(distance_lengths)?
    } else {
        Huffman::from_code_lengths(&[1])?
    };
    Ok((litlen, dist))
}
/// Decodes one Huffman-coded block, appending its output to `out`.
///
/// Symbols below 256 are literal bytes, 256 ends the block, and 257..=285
/// start a back-reference that copies `length` bytes from `distance` bytes
/// back (overlapping copies repeat the most recent bytes).
///
/// # Errors
/// Fails on truncated input, an invalid symbol, or a distance that is zero
/// or reaches before the start of the output.
fn read_huffman_block(
    reader: &mut BitReader<'_>,
    litlen: &Huffman,
    dist: &Huffman,
    out: &mut Vec<u8>,
) -> Result<()> {
    loop {
        let symbol = litlen.decode_symbol(reader)?;
        if symbol < 256 {
            out.push(symbol as u8);
            continue;
        }
        if symbol == 256 {
            return Ok(());
        }
        if symbol > 285 {
            return Err(LoError::Parse(format!(
                "invalid deflate literal/length symbol {symbol}"
            )));
        }

        let length = decode_length(reader, symbol)?;
        let distance_symbol = dist.decode_symbol(reader)?;
        let distance = decode_distance(reader, distance_symbol)?;
        if distance == 0 || distance > out.len() {
            return Err(LoError::Parse(
                "invalid deflate back-reference distance".to_string(),
            ));
        }
        // Copy byte by byte so the source may overlap the bytes being written.
        for _ in 0..length {
            let byte = out[out.len() - distance];
            out.push(byte);
        }
    }
}
/// Builds the literal/length table used by fixed-Huffman deflate blocks.
///
/// Code lengths: symbols 0..=143 use 8 bits, 144..=255 use 9, 256..=279
/// use 7 and 280..=287 use 8.
fn fixed_literal_huffman() -> Result<Huffman> {
    let lengths: Vec<u8> = (0..288usize)
        .map(|symbol| match symbol {
            0..=143 => 8,
            144..=255 => 9,
            256..=279 => 7,
            _ => 8,
        })
        .collect();
    Huffman::from_code_lengths(&lengths)
}
/// Builds the distance table used by fixed-Huffman deflate blocks:
/// 32 symbols, each with a 5-bit code.
fn fixed_distance_huffman() -> Result<Huffman> {
    const SYMBOLS: usize = 32;
    const BITS: u8 = 5;
    let lengths = [BITS; SYMBOLS];
    Huffman::from_code_lengths(&lengths)
}
/// Converts a length symbol (257..=285) into a match length, reading the
/// symbol's extra bits from `reader` when it has any.
///
/// # Errors
/// Fails for a symbol outside 257..=285 or when the extra bits are missing.
fn decode_length(reader: &mut BitReader<'_>, symbol: u16) -> Result<usize> {
    // (base length, number of extra bits), indexed by `symbol - 257`.
    const TABLE: [(usize, u8); 29] = [
        (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0),
        (11, 1), (13, 1), (15, 1), (17, 1),
        (19, 2), (23, 2), (27, 2), (31, 2),
        (35, 3), (43, 3), (51, 3), (59, 3),
        (67, 4), (83, 4), (99, 4), (115, 4),
        (131, 5), (163, 5), (195, 5), (227, 5),
        (258, 0),
    ];

    let entry = symbol
        .checked_sub(257)
        .and_then(|index| TABLE.get(usize::from(index)));
    let Some(&(base, extra_bits)) = entry else {
        return Err(LoError::Parse(format!(
            "invalid deflate length symbol {symbol}"
        )));
    };
    if extra_bits == 0 {
        return Ok(base);
    }
    let extra = reader.read_bits(extra_bits)? as usize;
    Ok(base + extra)
}
/// Converts a distance symbol (0..=29) into a back-reference distance,
/// reading the symbol's extra bits from `reader` when it has any.
///
/// # Errors
/// Fails for a symbol above 29 or when the extra bits are missing.
fn decode_distance(reader: &mut BitReader<'_>, symbol: u16) -> Result<usize> {
    // (base distance, number of extra bits), indexed by symbol.
    const TABLE: [(usize, u8); 30] = [
        (1, 0), (2, 0), (3, 0), (4, 0),
        (5, 1), (7, 1),
        (9, 2), (13, 2),
        (17, 3), (25, 3),
        (33, 4), (49, 4),
        (65, 5), (97, 5),
        (129, 6), (193, 6),
        (257, 7), (385, 7),
        (513, 8), (769, 8),
        (1025, 9), (1537, 9),
        (2049, 10), (3073, 10),
        (4097, 11), (6145, 11),
        (8193, 12), (12289, 12),
        (16385, 13), (24577, 13),
    ];

    let Some(&(base, extra_bits)) = TABLE.get(usize::from(symbol)) else {
        return Err(LoError::Parse(format!(
            "invalid deflate distance symbol {symbol}"
        )));
    };
    if extra_bits == 0 {
        return Ok(base);
    }
    let extra = reader.read_bits(extra_bits)? as usize;
    Ok(base + extra)
}
// Unit tests for the PDF writer, the text extractor round trip and two stream filters.
#[cfg(test)]
mod tests {
use super::*;
// The generated file starts with the version header, ends with the EOF marker
// and contains a cross-reference table.
#[test]
fn writes_pdf_header_and_eof() {
let pdf = write_text_pdf(
&["Hello, world".to_string(), "Second line".to_string()],
Length::pt(595.0),
Length::pt(842.0),
);
assert!(pdf.starts_with(b"%PDF-1.4"));
assert!(pdf.ends_with(b"%%EOF\n"));
assert!(pdf.windows(4).any(|w| w == b"xref"));
}
// Parentheses and backslashes in the text are backslash-escaped in the output.
#[test]
fn escapes_pdf_text_special_chars() {
let pdf = write_text_pdf(&["a (b) \\c".to_string()], Length::pt(100.0), Length::pt(100.0));
let s = String::from_utf8_lossy(&pdf);
assert!(s.contains("a \\(b\\) \\\\c"));
}
// Round trip: text written by `write_text_pdf` is found again by the extractor.
#[test]
fn parses_simple_text_pdf() {
let pdf = write_text_pdf(&["Hello PDF".to_string()], Length::pt(200.0), Length::pt(200.0));
let extracted = extract_text_from_pdf(&pdf).unwrap();
assert!(extracted.contains("Hello PDF"));
}
// ASCIIHex decoding stops at the `>` terminator.
#[test]
fn decodes_ascii_hex() {
assert_eq!(decode_ascii_hex(b"48656C6C6F>").unwrap(), b"Hello");
}
// A literal run of three bytes, a repeat run of four (257 - 253) and the
// end-of-data marker 128.
#[test]
fn decodes_run_length() {
let data = [2u8, b'a', b'b', b'c', 253, b'Z', 128];
assert_eq!(decode_run_length(&data).unwrap(), b"abcZZZZ");
}
}