use crate::object::{Object, ObjectRef};
use std::collections::{HashMap, HashSet};
/// Caller-supplied options for producing a linearized document.
#[derive(Debug, Clone)]
pub struct LinearizationConfig {
    /// Page selected as the "first page"; the default is `0`.
    pub first_page: usize,
    /// Flag named for hint-table inclusion; the default is `true`.
    /// NOTE(review): nothing visible in this file reads it yet.
    pub include_hints: bool,
}

impl Default for LinearizationConfig {
    /// Builds the default configuration: first page `0`, hints enabled.
    fn default() -> Self {
        let first_page = 0;
        let include_hints = true;
        LinearizationConfig {
            first_page,
            include_hints,
        }
    }
}
/// Values that end up in the linearization parameter dictionary.
///
/// [`LinearizationParams::to_object`] maps every field to one dictionary key;
/// the key is noted on each field below.
#[derive(Debug, Clone)]
pub struct LinearizationParams {
    /// Stored under `Linearized` as a real number.
    pub version: f32,
    /// Stored under `L`.
    pub file_length: u64,
    /// Stored under `H` as a two-element integer array.
    pub hint_stream: [u64; 2],
    /// Stored under `O`.
    pub first_page_object: u32,
    /// Stored under `E`.
    pub end_of_first_page: u64,
    /// Stored under `N`.
    pub num_pages: u32,
    /// Stored under `T`.
    pub main_xref_offset: u64,
    /// Stored under `P`, but only when it is non-zero.
    pub first_page_num: u32,
}

impl LinearizationParams {
    /// Creates parameters for a document with `num_pages` pages.
    ///
    /// The version starts at `1.0`; every offset, length and object number
    /// starts at zero and is expected to be filled in by the caller.
    pub fn new(num_pages: u32) -> Self {
        LinearizationParams {
            num_pages,
            version: 1.0,
            file_length: 0,
            hint_stream: [0; 2],
            first_page_object: 0,
            end_of_first_page: 0,
            main_xref_offset: 0,
            first_page_num: 0,
        }
    }

    /// Converts the parameters into an [`Object::Dictionary`].
    ///
    /// The keys `Linearized`, `L`, `H`, `O`, `E`, `N` and `T` are always
    /// present. `P` is emitted only when `first_page_num` is non-zero.
    pub fn to_object(&self) -> Object {
        // Every integer entry goes through the same cast; u64 values above
        // `i64::MAX` wrap, exactly as a plain `as i64` does.
        let integer = |value: u64| Object::Integer(value as i64);

        let hint_range = self
            .hint_stream
            .iter()
            .map(|&component| integer(component))
            .collect();

        let mut entries: HashMap<String, Object> = vec![
            ("Linearized", Object::Real(f64::from(self.version))),
            ("L", integer(self.file_length)),
            ("H", Object::Array(hint_range)),
            ("O", integer(u64::from(self.first_page_object))),
            ("E", integer(self.end_of_first_page)),
            ("N", integer(u64::from(self.num_pages))),
            ("T", integer(self.main_xref_offset)),
        ]
        .into_iter()
        .map(|(key, value)| (key.to_string(), value))
        .collect();

        // A first page number of zero is left out of the dictionary.
        if self.first_page_num != 0 {
            entries.insert(
                "P".to_string(),
                integer(u64::from(self.first_page_num)),
            );
        }

        Object::Dictionary(entries)
    }
}
/// One per-page record of the page offset hint table.
///
/// `HintTables::write_page_offset_table` bit-packs the fields in declaration
/// order, using the bit widths stored in `PageOffsetHeader`.
/// NOTE(review): the `*_delta` fields are presumably offsets from the matching
/// `min_*` header value — confirm against the code that fills them in.
#[derive(Debug, Clone, Default)]
pub struct PageOffsetEntry {
    /// Written with `bits_object_count` bits.
    pub num_objects_delta: u32,
    /// Written with `bits_page_length` bits.
    pub page_length_delta: u32,
    /// Written with `bits_shared_object_id` bits. The writer does not check
    /// it against `shared_object_ids.len()`.
    pub num_shared_objects: u32,
    /// Each element is written with `bits_shared_object_id` bits.
    pub shared_object_ids: Vec<u32>,
    /// Each element is written with `bits_shared_numerator` bits, after all ids.
    pub shared_object_numerators: Vec<u32>,
    /// Written with `bits_content_offset` bits.
    pub content_stream_offset_delta: u32,
    /// Written with `bits_content_length` bits.
    pub content_stream_length_delta: u32,
}
/// Header of the page offset hint table.
///
/// `HintTables::write_page_offset_table` serializes the fields big-endian in
/// declaration order. Every `bits_*` field is widened to 16 bits on output and
/// also supplies the bit width for the matching `PageOffsetEntry` field.
#[derive(Debug, Clone, Default)]
pub struct PageOffsetHeader {
    /// Serialized as 4 bytes.
    pub min_object_num: u32,
    /// Serialized as 8 bytes.
    pub first_page_location: u64,
    /// Bit width of `PageOffsetEntry::page_length_delta`.
    pub bits_page_length: u8,
    /// Serialized as 4 bytes.
    pub min_page_length: u32,
    /// Bit width of `PageOffsetEntry::num_objects_delta`.
    pub bits_object_count: u8,
    /// Serialized as 4 bytes.
    pub min_object_count: u32,
    /// Bit width of `PageOffsetEntry::content_stream_offset_delta`.
    pub bits_content_offset: u8,
    /// Serialized as 4 bytes.
    pub min_content_offset: u32,
    /// Bit width of `PageOffsetEntry::content_stream_length_delta`.
    pub bits_content_length: u8,
    /// Serialized as 4 bytes.
    pub min_content_length: u32,
    /// Bit width of each shared object id and also of
    /// `PageOffsetEntry::num_shared_objects`.
    pub bits_shared_object_id: u8,
    /// Bit width of each shared object numerator.
    pub bits_shared_numerator: u8,
    /// Serialized as 2 bytes via an `as u16` cast, so values above 65535 are
    /// truncated on output.
    pub shared_denominator: u32,
}
/// One record of the shared object hint table.
///
/// `HintTables::write_shared_object_table` bit-packs the fields in declaration
/// order, using the bit widths stored in `SharedObjectHeader`.
#[derive(Debug, Clone, Default)]
pub struct SharedObjectEntry {
    /// Written with `bits_object_length` bits.
    pub object_length_delta: u32,
    /// Written as a single bit: 1 when `true`, 0 when `false`.
    pub in_first_page: bool,
    /// Written with `bits_object_num` bits.
    pub object_num_delta: u32,
    /// Written with `bits_object_num` bits.
    pub num_objects: u32,
}
/// Header of the shared object hint table.
///
/// `HintTables::write_shared_object_table` serializes the fields big-endian in
/// declaration order. The `bits_*` fields are widened to 16 bits on output.
#[derive(Debug, Clone, Default)]
pub struct SharedObjectHeader {
    /// Serialized as 4 bytes.
    pub first_object_num: u32,
    /// Serialized as 8 bytes.
    pub first_object_location: u64,
    /// Serialized as 4 bytes. The writer does not cross-check it against the
    /// number of entries actually stored.
    pub num_first_page_entries: u32,
    /// Serialized as 4 bytes. The writer does not cross-check it against the
    /// number of entries actually stored.
    pub num_remaining_entries: u32,
    /// Bit width of `SharedObjectEntry::object_length_delta`.
    pub bits_object_length: u8,
    /// Serialized as 4 bytes.
    pub min_object_length: u32,
    /// Bit width of `SharedObjectEntry::object_num_delta` and
    /// `SharedObjectEntry::num_objects`.
    pub bits_object_num: u8,
}
/// The page offset and shared object hint tables, kept together so they can
/// be serialized into one byte stream.
#[derive(Debug, Clone, Default)]
pub struct HintTables {
    /// Header of the page offset table.
    pub page_offset_header: PageOffsetHeader,
    /// One entry per page, written after the page offset header.
    pub page_offset_entries: Vec<PageOffsetEntry>,
    /// Header of the shared object table.
    pub shared_object_header: SharedObjectHeader,
    /// Entries written after the shared object header.
    pub shared_object_entries: Vec<SharedObjectEntry>,
}

impl HintTables {
    /// Creates empty tables (all headers zeroed, no entries).
    pub fn new() -> Self {
        HintTables::default()
    }

    /// Serializes the page offset table followed by the shared object table.
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut out = Vec::new();
        self.write_page_offset_table(&mut out);
        self.write_shared_object_table(&mut out);
        out
    }

    /// Appends the page offset table to `data`: a big-endian header, then the
    /// bit-packed entries padded to a whole number of bytes.
    fn write_page_offset_table(&self, data: &mut Vec<u8>) {
        let hdr = &self.page_offset_header;

        Self::write_u32(data, hdr.min_object_num);
        Self::write_u64(data, hdr.first_page_location);
        Self::write_u16(data, u16::from(hdr.bits_page_length));
        Self::write_u32(data, hdr.min_page_length);
        Self::write_u16(data, u16::from(hdr.bits_object_count));
        Self::write_u32(data, hdr.min_object_count);
        Self::write_u16(data, u16::from(hdr.bits_content_offset));
        Self::write_u32(data, hdr.min_content_offset);
        Self::write_u16(data, u16::from(hdr.bits_content_length));
        Self::write_u32(data, hdr.min_content_length);
        Self::write_u16(data, u16::from(hdr.bits_shared_object_id));
        Self::write_u16(data, u16::from(hdr.bits_shared_numerator));
        // Only the low 16 bits of the denominator are stored.
        Self::write_u16(data, hdr.shared_denominator as u16);

        let mut packed = BitWriter::new();
        for page in &self.page_offset_entries {
            // Each (value, width) pair is emitted in exactly this order.
            let leading = [
                (page.num_objects_delta, hdr.bits_object_count),
                (page.page_length_delta, hdr.bits_page_length),
                // The count shares the bit width used for the ids themselves.
                (page.num_shared_objects, hdr.bits_shared_object_id),
            ];
            let ids = page
                .shared_object_ids
                .iter()
                .map(|&id| (id, hdr.bits_shared_object_id));
            let numerators = page
                .shared_object_numerators
                .iter()
                .map(|&numerator| (numerator, hdr.bits_shared_numerator));
            let trailing = [
                (page.content_stream_offset_delta, hdr.bits_content_offset),
                (page.content_stream_length_delta, hdr.bits_content_length),
            ];

            let fields = leading
                .iter()
                .copied()
                .chain(ids)
                .chain(numerators)
                .chain(trailing.iter().copied());
            for (value, width) in fields {
                packed.write_bits(u64::from(value), width);
            }
        }
        data.extend_from_slice(&packed.finish());
    }

    /// Appends the shared object table to `data`: a big-endian header, then
    /// the bit-packed entries padded to a whole number of bytes.
    fn write_shared_object_table(&self, data: &mut Vec<u8>) {
        let hdr = &self.shared_object_header;

        Self::write_u32(data, hdr.first_object_num);
        Self::write_u64(data, hdr.first_object_location);
        Self::write_u32(data, hdr.num_first_page_entries);
        Self::write_u32(data, hdr.num_remaining_entries);
        Self::write_u16(data, u16::from(hdr.bits_object_length));
        Self::write_u32(data, hdr.min_object_length);
        Self::write_u16(data, u16::from(hdr.bits_object_num));

        let mut packed = BitWriter::new();
        for shared in &self.shared_object_entries {
            packed.write_bits(u64::from(shared.object_length_delta), hdr.bits_object_length);
            // The first-page flag occupies exactly one bit.
            packed.write_bits(u64::from(shared.in_first_page), 1);
            packed.write_bits(u64::from(shared.object_num_delta), hdr.bits_object_num);
            packed.write_bits(u64::from(shared.num_objects), hdr.bits_object_num);
        }
        data.extend_from_slice(&packed.finish());
    }

    /// Appends `value` as 2 big-endian bytes.
    fn write_u16(data: &mut Vec<u8>, value: u16) {
        data.extend_from_slice(&value.to_be_bytes());
    }

    /// Appends `value` as 4 big-endian bytes.
    fn write_u32(data: &mut Vec<u8>, value: u32) {
        data.extend_from_slice(&value.to_be_bytes());
    }

    /// Appends `value` as 8 big-endian bytes.
    fn write_u64(data: &mut Vec<u8>, value: u64) {
        data.extend_from_slice(&value.to_be_bytes());
    }
}
/// Accumulates values of arbitrary bit width into a byte buffer, most
/// significant bit first.
struct BitWriter {
    /// Bytes that have been completely filled.
    buffer: Vec<u8>,
    /// Bits collected for the byte currently being filled (right-aligned).
    current_byte: u8,
    /// Number of bits already stored in `current_byte` (always `0..8`).
    bit_position: u8,
}

impl BitWriter {
    /// Creates an empty writer.
    fn new() -> Self {
        Self {
            buffer: Vec::new(),
            current_byte: 0,
            bit_position: 0,
        }
    }

    /// Appends the low `num_bits` bits of `value`, most significant first.
    ///
    /// A width of zero writes nothing. Bits of `value` above `num_bits` are
    /// dropped. Widths larger than 64 are accepted: the positions beyond the
    /// 64 bits of `value` are written as zero padding instead of triggering a
    /// shift overflow.
    fn write_bits(&mut self, value: u64, num_bits: u8) {
        for shift in (0..num_bits).rev() {
            // `checked_shr` returns `None` once the shift reaches 64, which is
            // exactly the range where `value` has no bits left to contribute.
            let bit = (value.checked_shr(u32::from(shift)).unwrap_or(0) & 1) as u8;
            self.current_byte = (self.current_byte << 1) | bit;
            self.bit_position += 1;
            if self.bit_position == 8 {
                self.buffer.push(self.current_byte);
                self.current_byte = 0;
                self.bit_position = 0;
            }
        }
    }

    /// Consumes the writer and returns the packed bytes.
    ///
    /// A partially filled final byte is left-aligned and padded with zero
    /// bits on the right.
    fn finish(mut self) -> Vec<u8> {
        if self.bit_position > 0 {
            self.current_byte <<= 8 - self.bit_position;
            self.buffer.push(self.current_byte);
        }
        self.buffer
    }
}
/// Description of one object handed to `LinearizationAnalyzer::add_object`.
#[derive(Debug, Clone)]
pub struct ObjectInfo {
    /// Reference to the object; the analyzer keys its sets on `obj_ref.id`.
    pub obj_ref: ObjectRef,
    /// NOTE(review): presumably the object's byte offset in the file; not read
    /// anywhere in this file.
    pub offset: u64,
    /// NOTE(review): presumably the object's byte length; not read anywhere in
    /// this file.
    pub length: u64,
    /// Page indices that reference this object. The analyzer uses them as
    /// indices into its per-page sets and compares them to its first page.
    pub referenced_by_pages: HashSet<usize>,
    /// Not read anywhere in this file.
    pub is_content_stream: bool,
    /// Not read anywhere in this file.
    pub is_page_object: bool,
}
#[derive(Debug)]
pub struct LinearizationAnalyzer {
objects: Vec<ObjectInfo>,
#[allow(dead_code)]
num_pages: usize,
first_page: usize,
first_page_objects: HashSet<u32>,
shared_objects: HashSet<u32>,
page_objects: Vec<HashSet<u32>>,
}
impl LinearizationAnalyzer {
pub fn new(num_pages: usize, first_page: usize) -> Self {
Self {
objects: Vec::new(),
num_pages,
first_page,
first_page_objects: HashSet::new(),
shared_objects: HashSet::new(),
page_objects: vec![HashSet::new(); num_pages],
}
}
pub fn add_object(&mut self, info: ObjectInfo) {
self.objects.push(info);
}
pub fn analyze(&mut self) {
let mut reference_counts: HashMap<u32, usize> = HashMap::new();
for obj in &self.objects {
for &page in &obj.referenced_by_pages {
*reference_counts.entry(obj.obj_ref.id).or_default() += 1;
if page < self.page_objects.len() {
self.page_objects[page].insert(obj.obj_ref.id);
}
}
}
for obj in &self.objects {
let id = obj.obj_ref.id;
if obj.referenced_by_pages.contains(&self.first_page) {
self.first_page_objects.insert(id);
}
let ref_count = reference_counts.get(&id).copied().unwrap_or(0);
if ref_count > 1 {
self.shared_objects.insert(id);
}
}
}
pub fn get_first_page_objects(&self) -> Vec<u32> {
let mut objects: Vec<_> = self.first_page_objects.iter().copied().collect();
objects.sort();
objects
}
pub fn get_shared_objects(&self) -> Vec<u32> {
let mut objects: Vec<_> = self.shared_objects.iter().copied().collect();
objects.sort();
objects
}
pub fn get_page_specific_objects(&self, page: usize) -> Vec<u32> {
if page >= self.page_objects.len() || page == self.first_page {
return Vec::new();
}
let mut objects: Vec<_> = self.page_objects[page]
.iter()
.filter(|id| !self.first_page_objects.contains(id) && !self.shared_objects.contains(id))
.copied()
.collect();
objects.sort();
objects
}
}
/// Collects linearization parameters and hint tables and turns them into
/// their output forms.
pub struct LinearizedPdfBuilder {
    #[allow(dead_code)]
    config: LinearizationConfig,
    /// Parameter values exposed through the setters below.
    params: LinearizationParams,
    /// Hint tables, editable through `hint_tables_mut`.
    hint_tables: HintTables,
}

impl LinearizedPdfBuilder {
    /// Creates a builder for a document with `num_pages` pages.
    ///
    /// Parameters start from `LinearizationParams::new(num_pages)` and the
    /// hint tables start empty.
    pub fn new(num_pages: u32, config: LinearizationConfig) -> Self {
        LinearizedPdfBuilder {
            config,
            params: LinearizationParams::new(num_pages),
            hint_tables: HintTables::new(),
        }
    }

    /// Sets `params.first_page_object`.
    pub fn set_first_page_object(&mut self, obj_num: u32) {
        let params = &mut self.params;
        params.first_page_object = obj_num;
    }

    /// Sets `params.first_page_num`.
    pub fn set_first_page_num(&mut self, page_num: u32) {
        let params = &mut self.params;
        params.first_page_num = page_num;
    }

    /// Sets `params.file_length`.
    pub fn set_file_length(&mut self, length: u64) {
        let params = &mut self.params;
        params.file_length = length;
    }

    /// Stores `offset` and `length`, in that order, in `params.hint_stream`.
    pub fn set_hint_stream_info(&mut self, offset: u64, length: u64) {
        let params = &mut self.params;
        params.hint_stream[0] = offset;
        params.hint_stream[1] = length;
    }

    /// Sets `params.end_of_first_page`.
    pub fn set_end_of_first_page(&mut self, offset: u64) {
        let params = &mut self.params;
        params.end_of_first_page = offset;
    }

    /// Sets `params.main_xref_offset`.
    pub fn set_main_xref_offset(&mut self, offset: u64) {
        let params = &mut self.params;
        params.main_xref_offset = offset;
    }

    /// Read-only access to the current parameters.
    pub fn params(&self) -> &LinearizationParams {
        let Self { params, .. } = self;
        params
    }

    /// Mutable access to the hint tables so callers can fill them in.
    pub fn hint_tables_mut(&mut self) -> &mut HintTables {
        let Self { hint_tables, .. } = self;
        hint_tables
    }

    /// Returns the parameters as produced by `LinearizationParams::to_object`.
    pub fn build_params_object(&self) -> Object {
        let params = self.params();
        params.to_object()
    }

    /// Returns the hint tables as produced by `HintTables::to_bytes`.
    pub fn build_hint_stream(&self) -> Vec<u8> {
        let tables = &self.hint_tables;
        tables.to_bytes()
    }
}
/// Returns the number of bits required to represent `value`.
///
/// Zero needs zero bits; otherwise the result is the position of the highest
/// set bit, counted from one (so `1 -> 1`, `255 -> 8`, `256 -> 9`).
pub fn bits_needed(value: u32) -> u8 {
    // `leading_zeros` is 32 for zero, so the subtraction already yields 0
    // there and no special case is required.
    let significant = 32 - value.leading_zeros();
    significant as u8
}
/// Computes the parameters for storing `values` as offsets from their minimum.
///
/// Returns `(min, bits)`, where `min` is the smallest value and `bits` is the
/// width needed for the largest `value - min`. An empty slice yields `(0, 0)`.
pub fn calculate_delta_encoding(values: &[u32]) -> (u32, u8) {
    let mut remaining = values.iter().copied();
    let first = match remaining.next() {
        Some(value) => value,
        None => return (0, 0),
    };

    // One pass tracks both extremes; the widest delta is their difference.
    let (lowest, highest) = remaining.fold((first, first), |(low, high), value| {
        (low.min(value), high.max(value))
    });

    (lowest, bits_needed(highest - lowest))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `new` records the page count and starts at version 1.0.
    #[test]
    fn test_linearization_params() {
        let params = LinearizationParams::new(5);
        assert_eq!(params.num_pages, 5);
        assert_eq!(params.version, 1.0);
    }

    /// The dictionary carries every mandatory key and omits `P` when the
    /// first page number is zero.
    #[test]
    fn test_linearization_params_to_object() {
        let mut params = LinearizationParams::new(10);
        params.file_length = 50000;
        params.hint_stream = [1024, 512];
        params.first_page_object = 4;
        params.end_of_first_page = 5000;
        params.main_xref_offset = 45000;

        let dict = match params.to_object() {
            Object::Dictionary(dict) => dict,
            _ => panic!("Expected dictionary"),
        };
        for key in &["Linearized", "L", "H", "O", "E", "N", "T"] {
            assert!(dict.contains_key(*key));
        }
        assert!(!dict.contains_key("P"));
    }

    /// Bit widths around powers of two.
    #[test]
    fn test_bits_needed() {
        let cases: [(u32, u8); 7] = [
            (0, 0),
            (1, 1),
            (2, 2),
            (3, 2),
            (4, 3),
            (255, 8),
            (256, 9),
        ];
        for &(value, expected) in &cases {
            assert_eq!(bits_needed(value), expected);
        }
    }

    /// The largest delta is 25 - 10 = 15, which needs 4 bits.
    #[test]
    fn test_delta_encoding() {
        let values = [10, 15, 20, 25];
        let (min, bits) = calculate_delta_encoding(&values);
        assert_eq!(min, 10);
        assert_eq!(bits, 4);
    }

    /// Fields of 3, 4 and 1 bits fill exactly one byte, MSB first.
    #[test]
    fn test_bit_writer() {
        let mut writer = BitWriter::new();
        let fields: [(u64, u8); 3] = [(0b101, 3), (0b1100, 4), (0b1, 1)];
        for &(value, width) in &fields {
            writer.write_bits(value, width);
        }
        assert_eq!(writer.finish(), vec![0b10111001]);
    }

    /// An object on page 0 is a first-page object; an object on pages 0 and 1
    /// is shared.
    #[test]
    fn test_linearization_analyzer() {
        let mut analyzer = LinearizationAnalyzer::new(3, 0);

        let first_page_refs: HashSet<usize> = [0usize].iter().copied().collect();
        analyzer.add_object(ObjectInfo {
            obj_ref: ObjectRef::new(1, 0),
            offset: 100,
            length: 50,
            referenced_by_pages: first_page_refs.clone(),
            is_content_stream: false,
            is_page_object: true,
        });

        let shared_refs: HashSet<usize> = [0usize, 1].iter().copied().collect();
        analyzer.add_object(ObjectInfo {
            obj_ref: ObjectRef::new(2, 0),
            offset: 200,
            length: 100,
            referenced_by_pages: shared_refs,
            is_content_stream: false,
            is_page_object: false,
        });

        analyzer.analyze();

        assert!(analyzer.first_page_objects.contains(&1));
        assert!(analyzer.shared_objects.contains(&2));
    }

    /// Setter calls are visible through `params()`.
    #[test]
    fn test_linearized_builder() {
        let mut builder = LinearizedPdfBuilder::new(5, LinearizationConfig::default());
        builder.set_first_page_object(4);
        builder.set_file_length(50000);
        builder.set_hint_stream_info(1024, 512);
        builder.set_end_of_first_page(5000);
        builder.set_main_xref_offset(45000);

        let params = builder.params();
        assert_eq!(params.first_page_object, 4);
        assert_eq!(params.file_length, 50000);
    }

    /// Even with no entries, the two headers produce a non-empty stream.
    #[test]
    fn test_hint_tables_serialization() {
        let mut tables = HintTables::new();
        {
            let header = &mut tables.page_offset_header;
            header.min_object_num = 1;
            header.bits_page_length = 8;
            header.min_page_length = 1000;
        }
        let bytes = tables.to_bytes();
        assert!(!bytes.is_empty());
    }
}