use anyhow::{anyhow, Result};
use std::fs;
use serde::{Serialize, Deserialize};
/// Loads every file in `input_files`, gathers their page content streams in
/// order and writes them out as a single PDF at `output_file`.
///
/// # Errors
///
/// Fails when `input_files` is empty, when a file cannot be loaded, when no
/// input yields any page stream, or when the merged file cannot be written.
/// An individual input without page streams only triggers a warning on stderr.
pub fn merge_pdfs(input_files: &[&str], output_file: &str) -> Result<()> {
    if input_files.is_empty() {
        return Err(anyhow!("No input files provided for merge"));
    }

    let mut merged: Vec<Vec<u8>> = Vec::new();
    for source in input_files {
        let document = crate::pdf::PdfDocument::load_from_file(source)?;
        let mut pages = extract_page_streams(&document);
        if pages.is_empty() {
            eprintln!("[merge] Warning: no page streams found in {}", source);
        }
        merged.append(&mut pages);
    }

    if merged.is_empty() {
        return Err(anyhow!("No page content found in any input file"));
    }

    let portrait = crate::pdf_generator::PageLayout::portrait();
    assemble_merged_pdf(output_file, &merged, "Helvetica", &portrait)?;

    let page_count = merged.len();
    let file_count = input_files.len();
    println!(
        "[merge] Combined {} pages from {} files into {}",
        page_count, file_count, output_file
    );
    Ok(())
}
/// Merges already-loaded documents, in slice order, into one PDF written to
/// `output_file`.
///
/// # Errors
///
/// Fails when `documents` is empty, when none of the documents yields a page
/// stream, or when the output cannot be written. A document without page
/// streams only triggers a warning on stderr.
pub fn merge_pdfs_sequential(
    documents: &[crate::pdf::PdfDocument],
    output_file: &str,
) -> Result<()> {
    if documents.is_empty() {
        return Err(anyhow!("No documents provided for merge"));
    }

    let combined: Vec<Vec<u8>> = documents
        .iter()
        .flat_map(|document| {
            let pages = extract_page_streams(document);
            if pages.is_empty() {
                eprintln!("[merge] Warning: no page streams found in document");
            }
            pages
        })
        .collect();

    if combined.is_empty() {
        return Err(anyhow!("No page content found in any document"));
    }

    let portrait = crate::pdf_generator::PageLayout::portrait();
    assemble_merged_pdf(output_file, &combined, "Helvetica", &portrait)?;

    println!(
        "[merge] Combined {} pages from {} documents into {}",
        combined.len(),
        documents.len(),
        output_file
    );
    Ok(())
}
/// Copies pages `start..=end` (1-indexed, inclusive) of `input_file` into a
/// new PDF at `output_file`.
///
/// An `end` beyond the last page is clamped to the page count.
///
/// # Errors
///
/// Fails when the range is malformed (`start` or `end` is zero, or
/// `start > end`), when the input cannot be loaded, when it has no pages,
/// when `start` lies past the last page, or when the output cannot be written.
pub fn split_pdf(input_file: &str, output_file: &str, start: usize, end: usize) -> Result<()> {
    let range_is_valid = start >= 1 && end >= 1 && start <= end;
    if !range_is_valid {
        return Err(anyhow!(
            "Invalid page range: start={} end={} (1-indexed, inclusive)",
            start,
            end
        ));
    }

    let document = crate::pdf::PdfDocument::load_from_file(input_file)?;
    let pages = extract_page_streams(&document);
    let page_count = pages.len();
    if pages.is_empty() {
        return Err(anyhow!("No pages found in {}", input_file));
    }
    if start > page_count {
        return Err(anyhow!(
            "Start page {} exceeds total pages {}",
            start,
            page_count
        ));
    }

    // `start <= page_count` and `start <= end`, so `last_page >= start`.
    let last_page = std::cmp::min(end, page_count);
    let chosen: Vec<Vec<u8>> = pages
        .iter()
        .skip(start - 1)
        .take(last_page + 1 - start)
        .cloned()
        .collect();

    let portrait = crate::pdf_generator::PageLayout::portrait();
    assemble_merged_pdf(output_file, &chosen, "Helvetica", &portrait)?;

    println!(
        "[split] Extracted pages {}-{} ({} pages) from {} into {}",
        start,
        last_page,
        chosen.len(),
        input_file,
        output_file
    );
    Ok(())
}
/// Document-level metadata that is serialized into a PDF Info dictionary.
///
/// Every standard field is optional; a field left as `None` is not written.
#[derive(Debug, Clone, Default)]
pub struct PdfMetadata {
/// Written as the `/Title` entry.
pub title: Option<String>,
/// Written as the `/Author` entry.
pub author: Option<String>,
/// Written as the `/Subject` entry.
pub subject: Option<String>,
/// Written as the `/Keywords` entry.
pub keywords: Option<String>,
/// Written as the `/Creator` entry.
pub creator: Option<String>,
/// Additional key/value pairs, each written as its own Info entry.
pub custom_fields: std::collections::HashMap<String, String>,
}
impl PdfMetadata {
    /// Creates metadata with every standard field unset and no custom fields.
    pub fn new() -> Self {
        Self::default()
    }

    /// Inserts a custom field, replacing any previous value stored under `key`.
    pub fn add_custom_field(&mut self, key: String, value: String) {
        self.custom_fields.insert(key, value);
    }

    /// Returns the value stored under `key`, if any.
    pub fn get_custom_field(&self, key: &str) -> Option<&String> {
        self.custom_fields.get(key)
    }

    /// Removes the custom field `key` and returns its previous value, if any.
    pub fn remove_custom_field(&mut self, key: &str) -> Option<String> {
        self.custom_fields.remove(key)
    }

    /// Encodes `raw` so it can follow the `/` of a PDF name object.
    ///
    /// Printable ASCII bytes that are neither delimiters nor `#` are copied
    /// through; every other byte (whitespace, delimiters, non-ASCII UTF-8
    /// bytes) is written as a `#XX` hexadecimal escape.
    fn encode_pdf_name(raw: &str) -> String {
        let mut encoded = String::with_capacity(raw.len());
        for byte in raw.bytes() {
            let is_delimiter = matches!(
                byte,
                b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%' | b'#'
            );
            if matches!(byte, 0x21..=0x7E) && !is_delimiter {
                encoded.push(byte as char);
            } else {
                encoded.push_str(&format!("#{:02X}", byte));
            }
        }
        encoded
    }

    /// Serializes the metadata as the text of a PDF Info dictionary.
    ///
    /// Standard fields come first in a fixed order, followed by `/Producer`,
    /// followed by the custom fields sorted by key so that the same metadata
    /// always produces the same bytes. Custom keys are encoded as PDF names;
    /// all values are escaped as literal strings.
    fn to_info_dict(&self) -> String {
        let mut entries: Vec<String> = Vec::new();

        let standard_fields = [
            ("Title", &self.title),
            ("Author", &self.author),
            ("Subject", &self.subject),
            ("Keywords", &self.keywords),
            ("Creator", &self.creator),
        ];
        for (name, value) in standard_fields {
            if let Some(text) = value {
                entries.push(format!("/{} ({})", name, escape_pdf_meta(text)));
            }
        }
        entries.push("/Producer (pdf-cli)".to_string());

        // HashMap iteration order is unspecified; sort for reproducible output.
        let mut custom: Vec<(&String, &String)> = self.custom_fields.iter().collect();
        custom.sort_unstable_by(|left, right| left.0.cmp(right.0));
        for (key, value) in custom {
            entries.push(format!(
                "/{} ({})",
                Self::encode_pdf_name(key),
                escape_pdf_meta(value)
            ));
        }

        format!("<<\n{}\n>>\n", entries.join("\n"))
    }
}
/// Reads a Markdown file, parses it into elements and renders them to
/// `output_file` with the given metadata attached.
///
/// `font_size` is forwarded as the base font size and `orientation` selects
/// the page layout.
///
/// # Errors
///
/// Fails when the Markdown file cannot be read or when PDF assembly fails.
pub fn create_pdf_with_metadata(
    markdown_file: &str,
    output_file: &str,
    font: &str,
    font_size: f32,
    orientation: crate::pdf_generator::PageOrientation,
    metadata: &PdfMetadata,
) -> Result<()> {
    let markdown = fs::read_to_string(markdown_file)?;
    let parsed = crate::elements::parse_markdown(&markdown);
    create_pdf_elements_with_metadata(
        output_file,
        &parsed,
        font,
        font_size,
        crate::pdf_generator::PageLayout::from_orientation(orientation),
        metadata,
    )
}
/// Renders already-parsed elements into page streams and assembles them,
/// together with `metadata`, into the PDF at `filename`.
///
/// Page numbers are always requested from the page-stream builder.
///
/// # Errors
///
/// Propagates any failure from PDF assembly.
pub fn create_pdf_elements_with_metadata(
    filename: &str,
    elements: &[crate::elements::Element],
    font: &str,
    base_font_size: f32,
    layout: crate::pdf_generator::PageLayout,
    metadata: &PdfMetadata,
) -> Result<()> {
    let rendered_pages = build_page_streams(elements, base_font_size, true, layout);
    assemble_pdf_with_metadata(filename, &rendered_pages, font, &layout, metadata)
}
fn extract_page_streams(doc: &crate::pdf::PdfDocument) -> Vec<Vec<u8>> {
let mut streams = Vec::new();
let mut page_ids: Vec<u32> = doc
.objects
.iter()
.filter_map(|(id, obj)| {
if let crate::pdf::PdfObject::Dictionary(dict) = obj
&& let Some(crate::pdf::PdfValue::Object(crate::pdf::PdfObject::String(kind))) = dict.get("Type")
&& kind == "/Page" {
return Some(*id);
}
None
})
.collect();
page_ids.sort_unstable();
for page_id in page_ids {
if let Some(crate::pdf::PdfObject::Dictionary(dict)) = doc.objects.get(&page_id)
&& let Some(crate::pdf::PdfValue::Object(crate::pdf::PdfObject::String(contents_id_raw))) = dict.get("Contents")
&& let Ok(contents_id) = contents_id_raw.parse::<u32>()
&& let Some(crate::pdf::PdfObject::Stream { data, .. }) = doc.objects.get(&contents_id) {
streams.push(decompress_if_needed(data));
}
}
if !streams.is_empty() {
return streams;
}
let mut sorted_ids: Vec<&u32> = doc.objects.keys().collect();
sorted_ids.sort();
for id in sorted_ids {
if let crate::pdf::PdfObject::Stream { data, .. } = &doc.objects[id] {
let decompressed = decompress_if_needed(data);
let content = String::from_utf8_lossy(&decompressed);
if content.contains("Tj") || content.contains("TJ") || content.contains("BT") {
streams.push(decompressed);
}
}
}
streams
}
/// Returns `data` inflated when it starts with a plausible zlib header,
/// otherwise an unchanged copy.
///
/// The header test requires at least three bytes, a first byte of `0x78`,
/// and the first two bytes read as a big-endian number divisible by 31.
/// If decompression fails, the raw bytes are returned as-is.
fn decompress_if_needed(data: &[u8]) -> Vec<u8> {
    let looks_like_zlib = match data {
        [0x78, flags, _, ..] => (0x78u16 * 256 + u16::from(*flags)) % 31 == 0,
        _ => false,
    };
    if !looks_like_zlib {
        return data.to_vec();
    }
    crate::compression::decompress_deflate(data).unwrap_or_else(|_| data.to_vec())
}
/// Renders `elements` into per-page content streams.
///
/// The elements are rendered to a scratch PDF through
/// `create_pdf_from_elements_with_layout` (always with the Helvetica font),
/// which is then loaded again so its page streams can be extracted. The
/// scratch file lives in the platform temporary directory, carries the
/// process id and a nanosecond timestamp in its name, and is removed on
/// every path, including failures.
///
/// `_show_page_numbers` is currently not consulted.
///
/// Returns an empty vector when rendering or re-loading fails.
fn build_page_streams(
    elements: &[crate::elements::Element],
    base_font_size: f32,
    _show_page_numbers: bool,
    _layout: crate::pdf_generator::PageLayout,
) -> Vec<Vec<u8>> {
    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos();
    // Use the platform temp dir rather than a hard-coded "/tmp", and add the
    // pid so concurrent processes cannot collide on the timestamp alone.
    let tmp = std::env::temp_dir()
        .join(format!(
            "pdf_cli_build_{}_{}.pdf",
            std::process::id(),
            nanos
        ))
        .to_string_lossy()
        .into_owned();

    let rendered = crate::pdf_generator::create_pdf_from_elements_with_layout(
        &tmp,
        elements,
        "Helvetica",
        base_font_size,
        _layout,
    );

    let streams = if rendered.is_ok() {
        match crate::pdf::PdfDocument::load_from_file(&tmp) {
            Ok(doc) => extract_page_streams(&doc),
            Err(_) => Vec::new(),
        }
    } else {
        Vec::new()
    };

    // Best-effort cleanup; a failed render may still have created the file.
    let _ = fs::remove_file(&tmp);
    streams
}
/// Writes `page_streams` to `filename` as a PDF carrying default (empty)
/// metadata.
///
/// # Errors
///
/// Propagates any failure from `assemble_pdf_with_metadata`.
fn assemble_merged_pdf(
    filename: &str,
    page_streams: &[Vec<u8>],
    font: &str,
    layout: &crate::pdf_generator::PageLayout,
) -> Result<()> {
    assemble_pdf_with_metadata(
        filename,
        page_streams,
        font,
        layout,
        &PdfMetadata::default(),
    )
}
fn assemble_pdf_with_metadata(
filename: &str,
page_streams: &[Vec<u8>],
font: &str,
layout: &crate::pdf_generator::PageLayout,
metadata: &PdfMetadata,
) -> Result<()> {
let mut generator = crate::pdf_generator::PdfGenerator::new();
let mut page_ids = Vec::new();
let has_metadata = metadata.title.is_some()
|| metadata.author.is_some()
|| metadata.subject.is_some()
|| metadata.keywords.is_some()
|| metadata.creator.is_some();
let pages_obj_id = (page_streams.len() as u32) * 3 + 1;
for page_stream in page_streams {
let content_id = generator.add_stream_object(
format!("<< /Length {} >>\n", page_stream.len()),
page_stream.clone(),
);
let font_id = content_id + 2;
let page_dict = format!(
"<< /Type /Page\n\
/Parent {} 0 R\n\
/MediaBox [0 0 {} {}]\n\
/Contents {} 0 R\n\
/Resources << /Font << /F1 {} 0 R >> >>\n\
>>\n",
pages_obj_id, layout.width, layout.height, content_id, font_id
);
let page_id = generator.add_object(page_dict);
page_ids.push(page_id);
let font_dict = format!(
"<< /Type /Font\n/Subtype /Type1\n/BaseFont /{}\n>>\n",
font
);
generator.add_object(font_dict);
}
let kids: Vec<String> = page_ids.iter().map(|id| format!("{} 0 R", id)).collect();
let pages_dict = format!(
"<< /Type /Pages\n\
/Kids [{}]\n\
/Count {}\n\
>>\n",
kids.join(" "),
page_ids.len()
);
let actual_pages_id = generator.add_object(pages_dict);
assert_eq!(actual_pages_id, pages_obj_id);
let info_id = if has_metadata {
Some(generator.add_object(metadata.to_info_dict()))
} else {
let default_meta = PdfMetadata::default();
Some(generator.add_object(default_meta.to_info_dict()))
};
let catalog_dict = format!(
"<< /Type /Catalog\n\
/Pages {} 0 R\n\
>>\n",
actual_pages_id
);
generator.add_object(catalog_dict);
let pdf_data = if let Some(info) = info_id {
generate_with_info(&generator, info)
} else {
generator.generate()
};
let mut file = std::fs::File::create(filename)?;
std::io::Write::write_all(&mut file, &pdf_data)?;
Ok(())
}
fn generate_with_info(generator: &crate::pdf_generator::PdfGenerator, info_id: u32) -> Vec<u8> {
let mut pdf = Vec::new();
pdf.extend_from_slice(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n");
let mut offsets = Vec::new();
let mut current_offset = pdf.len() as u32;
for obj in &generator.objects {
offsets.push(current_offset);
let obj_header = format!("{} {} obj\n", obj.id, obj.generation);
pdf.extend_from_slice(obj_header.as_bytes());
pdf.extend_from_slice(obj.content.as_bytes());
if obj.is_stream
&& let Some(data) = &obj.stream_data {
pdf.extend_from_slice(b"stream\n");
pdf.extend_from_slice(data);
pdf.extend_from_slice(b"\nendstream\n");
}
pdf.extend_from_slice(b"endobj\n");
current_offset = pdf.len() as u32;
}
let xref_offset = pdf.len() as u32;
pdf.extend_from_slice(format!("xref\n0 {}\n", generator.objects.len() + 1).as_bytes());
pdf.extend_from_slice(b"0000000000 65535 f \n");
for offset in offsets {
pdf.extend_from_slice(format!("{:010} 00000 n \n", offset).as_bytes());
}
pdf.extend_from_slice(b"trailer\n");
pdf.extend_from_slice(b"<<\n");
pdf.extend_from_slice(format!("/Size {}\n", generator.objects.len() + 1).as_bytes());
if !generator.objects.is_empty() {
pdf.extend_from_slice(format!("/Root {} 0 R\n", generator.objects.len()).as_bytes());
}
pdf.extend_from_slice(format!("/Info {} 0 R\n", info_id).as_bytes());
pdf.extend_from_slice(b">>\n");
pdf.extend_from_slice(b"startxref\n");
pdf.extend_from_slice(format!("{}\n", xref_offset).as_bytes());
pdf.extend_from_slice(b"%%EOF\n");
pdf
}
/// Rewrites `input_file` to `output_file` with every page carrying the given
/// `/Rotate` value.
///
/// # Errors
///
/// Fails when `rotation` is not one of 0, 90, 180 or 270, when the input
/// cannot be loaded or has no pages, or when the output cannot be written.
pub fn rotate_pdf(input_file: &str, output_file: &str, rotation: u32) -> Result<()> {
    if !matches!(rotation, 0 | 90 | 180 | 270) {
        return Err(anyhow!(
            "Invalid rotation: {}. Must be 0, 90, 180, or 270.",
            rotation
        ));
    }

    let document = crate::pdf::PdfDocument::load_from_file(input_file)?;
    let pages = extract_page_streams(&document);
    if pages.is_empty() {
        return Err(anyhow!("No pages found in {}", input_file));
    }

    let portrait = crate::pdf_generator::PageLayout::portrait();
    assemble_rotated_pdf(output_file, &pages, "Helvetica", &portrait, rotation)?;

    let page_count = pages.len();
    println!(
        "[rotate] Rotated {} pages by {}° in {}",
        page_count, rotation, output_file
    );
    Ok(())
}
/// Builds a PDF from raw page content streams, stamping every page
/// dictionary with `/Rotate <rotation>`, and writes it to `filename`.
///
/// Per page, three objects are added in order: content stream, page
/// dictionary, Type1 font dictionary for `font`. The page tree and the
/// catalog follow. No Info dictionary is written.
///
/// # Errors
///
/// Returns an error when the output file cannot be created or written.
///
/// # Panics
///
/// Panics when the page tree does not receive the predicted object id.
fn assemble_rotated_pdf(
filename: &str,
page_streams: &[Vec<u8>],
font: &str,
layout: &crate::pdf_generator::PageLayout,
rotation: u32,
) -> Result<()> {
let mut generator = crate::pdf_generator::PdfGenerator::new();
let mut page_ids = Vec::new();
// Predicted id of the page tree: three objects per page come first.
// Assumes the generator allocates sequential ids starting at 1 (checked by the assert below).
let pages_obj_id = (page_streams.len() as u32) * 3 + 1;
for page_stream in page_streams {
let content_id = generator.add_stream_object(
format!("<< /Length {} >>\n", page_stream.len()),
page_stream.clone(),
);
// The font object is added two calls after the content stream (page dict sits in between).
let font_id = content_id + 2;
let page_dict = format!(
"<< /Type /Page\n\
/Parent {} 0 R\n\
/MediaBox [0 0 {} {}]\n\
/Rotate {}\n\
/Contents {} 0 R\n\
/Resources << /Font << /F1 {} 0 R >> >>\n\
>>\n",
pages_obj_id, layout.width, layout.height, rotation, content_id, font_id
);
let page_id = generator.add_object(page_dict);
page_ids.push(page_id);
let font_dict = format!(
"<< /Type /Font\n/Subtype /Type1\n/BaseFont /{}\n>>\n",
font
);
generator.add_object(font_dict);
}
// Page tree listing every page object as a kid.
let kids: Vec<String> = page_ids.iter().map(|id| format!("{} 0 R", id)).collect();
let pages_dict = format!(
"<< /Type /Pages\n/Kids [{}]\n/Count {}\n>>\n",
kids.join(" "),
page_ids.len()
);
let actual_pages_id = generator.add_object(pages_dict);
// Every page's /Parent was written with the predicted id; it must match.
assert_eq!(actual_pages_id, pages_obj_id);
let catalog_dict = format!(
"<< /Type /Catalog\n/Pages {} 0 R\n>>\n",
actual_pages_id
);
generator.add_object(catalog_dict);
let pdf_data = generator.generate();
let mut file = std::fs::File::create(filename)?;
std::io::Write::write_all(&mut file, &pdf_data)?;
Ok(())
}
/// Scans every dictionary object in `doc` for the standard Info keys and
/// returns whatever string values were found.
///
/// Each dictionary is rendered to text and searched for `/Title`, `/Author`,
/// `/Subject`, `/Keywords` and `/Creator`. When several dictionaries carry
/// the same key, the one visited last wins. Custom fields are not populated.
///
/// # Errors
///
/// This implementation never returns an error.
pub fn extract_metadata_from_pdf(doc: &crate::pdf::PdfDocument) -> Result<PdfMetadata> {
    let mut found = PdfMetadata::new();

    let dictionaries = doc.objects.values().filter_map(|obj| match obj {
        crate::pdf::PdfObject::Dictionary(entries) => Some(entries),
        _ => None,
    });

    for entries in dictionaries {
        let rendered = dict_to_string(entries);
        // A successful lookup implies the key is present, so no separate
        // `contains` pre-check is needed.
        let lookup = |field: &str| extract_pdf_string_field(&rendered, field);

        if let Some(value) = lookup("/Title") {
            found.title = Some(value);
        }
        if let Some(value) = lookup("/Author") {
            found.author = Some(value);
        }
        if let Some(value) = lookup("/Subject") {
            found.subject = Some(value);
        }
        if let Some(value) = lookup("/Keywords") {
            found.keywords = Some(value);
        }
        if let Some(value) = lookup("/Creator") {
            found.creator = Some(value);
        }
    }

    Ok(found)
}
/// Renders a dictionary as space-separated `/<key> <value>` pairs, without
/// the surrounding `<<` / `>>`. Pairs appear in the map's iteration order.
fn dict_to_string(dict: &std::collections::HashMap<String, crate::pdf::PdfValue>) -> String {
    dict.iter()
        .map(|(name, entry)| format!("/{} {}", name, value_to_string(entry)))
        .collect::<Vec<String>>()
        .join(" ")
}
fn value_to_string(value: &crate::pdf::PdfValue) -> String {
match value {
crate::pdf::PdfValue::Object(obj) => object_to_string(obj),
crate::pdf::PdfValue::Reference(id, generation) => format!("{} {} R", id, generation),
}
}
fn object_to_string(obj: &crate::pdf::PdfObject) -> String {
match obj {
crate::pdf::PdfObject::Dictionary(dict) => {
let entries: Vec<String> = dict.iter()
.map(|(k, v)| format!("/{} {}", k, value_to_string(v)))
.collect();
format!("<< {} >>", entries.join(" "))
}
crate::pdf::PdfObject::Stream { dictionary: _, data: _ } => {
"<< stream >>".to_string()
}
crate::pdf::PdfObject::Array(arr) => {
let elems: Vec<String> = arr.iter().map(value_to_string).collect();
format!("[{}]", elems.join(" "))
}
crate::pdf::PdfObject::String(s) => format!("({})", escape_pdf_meta(s)),
crate::pdf::PdfObject::Number(n) => n.to_string(),
crate::pdf::PdfObject::Boolean(b) => {
if *b { "true" } else { "false" }.to_string()
}
crate::pdf::PdfObject::Null => "null".to_string(),
crate::pdf::PdfObject::Reference(id, generation) => format!("{} {} R", id, generation),
crate::pdf::PdfObject::Name(n) => format!("/{}", n),
}
}
/// Extracts the literal-string value of `field` from dictionary text.
///
/// `field` is the key including its leading slash (for example `"/Title"`).
/// The key must be followed by a space and then, after optional whitespace,
/// by a parenthesized literal string. Nested parentheses are balanced and
/// backslash escapes are skipped while looking for the closing parenthesis.
/// The raw contents are then passed through `unescape_pdf_string`.
///
/// Returns `None` when the key is absent or its value is not a literal
/// string (for example an indirect reference). An unterminated string yields
/// everything up to the end of `content`.
fn extract_pdf_string_field(content: &str, field: &str) -> Option<String> {
    let marker = format!("{} ", field);
    let after_field = &content[content.find(&marker)? + marker.len()..];

    // Only accept a string that is this key's own value; do not wander off
    // to a parenthesis belonging to some later entry.
    let body = after_field.trim_start().strip_prefix('(')?;

    // Scan bytes, not chars: every delimiter is ASCII, so the resulting index
    // is always a valid UTF-8 boundary even when the value is not ASCII.
    let bytes = body.as_bytes();
    let mut depth = 1usize;
    let mut end = bytes.len();
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            b'\\' => {
                // Skip the escaped byte so `\(` and `\)` do not affect nesting.
                i += 2;
                continue;
            }
            b'(' => depth += 1,
            b')' => {
                depth -= 1;
                if depth == 0 {
                    end = i;
                    break;
                }
            }
            _ => {}
        }
        i += 1;
    }

    Some(unescape_pdf_string(&body[..end]))
}
/// Decodes the backslash escapes of a PDF literal string.
///
/// Handles `\n`, `\r`, `\t`, `\b`, `\f`, `\(`, `\)`, `\\`, octal escapes of
/// one to three digits, and line continuations (a backslash directly
/// followed by an end-of-line, which contributes nothing to the result).
/// An octal value above 255 keeps only its low eight bits instead of being
/// dropped. A backslash before any other character yields that character,
/// and a lone trailing backslash is discarded.
///
/// Each decoded octal byte is mapped to the Unicode code point of the same
/// number.
fn unescape_pdf_string(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }
        let next = match chars.next() {
            Some(next) => next,
            None => break,
        };
        match next {
            'n' => result.push('\n'),
            'r' => result.push('\r'),
            't' => result.push('\t'),
            'b' => result.push('\x08'),
            'f' => result.push('\x0c'),
            // Line continuation: swallow the end-of-line marker (CR, LF or CRLF).
            '\n' => {}
            '\r' => {
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
            }
            '0'..='7' => {
                let mut code = next.to_digit(8).unwrap_or(0);
                // Up to two more octal digits may follow the first one.
                for _ in 0..2 {
                    match chars.peek().and_then(|digit| digit.to_digit(8)) {
                        Some(digit) => {
                            code = code * 8 + digit;
                            chars.next();
                        }
                        None => break,
                    }
                }
                // High-order overflow is ignored rather than losing the byte.
                result.push((code & 0xFF) as u8 as char);
            }
            other => result.push(other),
        }
    }
    result
}
pub fn merge_metadata(base: &PdfMetadata, new_metadata: &PdfMetadata) -> PdfMetadata {
let mut merged = base.clone();
if new_metadata.title.is_some() {
merged.title = new_metadata.title.clone();
}
if new_metadata.author.is_some() {
merged.author = new_metadata.author.clone();
}
if new_metadata.subject.is_some() {
merged.subject = new_metadata.subject.clone();
}
if new_metadata.keywords.is_some() {
merged.keywords = new_metadata.keywords.clone();
}
if new_metadata.creator.is_some() {
merged.creator = new_metadata.creator.clone();
}
for (key, value) in &new_metadata.custom_fields {
merged.custom_fields.insert(key.clone(), value.clone());
}
merged
}
/// A text (note) annotation, written as an `/Annot` of subtype `/Text`.
///
/// The annotation rectangle is `[x, y, x + width, y + height]`.
#[derive(Debug, Clone)]
pub struct TextAnnotation {
/// Left edge of the annotation rectangle.
pub x: f32,
/// Bottom edge of the annotation rectangle.
pub y: f32,
/// Added to `x` to obtain the right edge.
pub width: f32,
/// Added to `y` to obtain the top edge.
pub height: f32,
/// Written as the annotation's `/Contents` string.
pub content: String,
/// Written as the annotation's `/T` string.
pub title: String,
}
/// A link annotation, written as an `/Annot` of subtype `/Link` with a
/// `/URI` action.
///
/// The annotation rectangle is `[x, y, x + width, y + height]`.
#[derive(Debug, Clone)]
pub struct LinkAnnotation {
/// Left edge of the annotation rectangle.
pub x: f32,
/// Bottom edge of the annotation rectangle.
pub y: f32,
/// Added to `x` to obtain the right edge.
pub width: f32,
/// Added to `y` to obtain the top edge.
pub height: f32,
/// Target written as the action's `/URI` string.
pub url: String,
}
/// A highlight annotation, written as an `/Annot` of subtype `/Highlight`.
///
/// The rectangle `[x, y, x + width, y + height]` is used for `/Rect`, and
/// its four corners are also written as `/QuadPoints`.
#[derive(Debug, Clone)]
pub struct HighlightAnnotation {
/// Left edge of the highlighted rectangle.
pub x: f32,
/// Bottom edge of the highlighted rectangle.
pub y: f32,
/// Added to `x` to obtain the right edge.
pub width: f32,
/// Added to `y` to obtain the top edge.
pub height: f32,
/// First component of the `/C` color array (presumably red in 0.0..=1.0 — confirm with callers).
pub color_r: f32,
/// Second component of the `/C` color array.
pub color_g: f32,
/// Third component of the `/C` color array.
pub color_b: f32,
}
/// Renders Markdown `text` to a portrait PDF at `output_file` and attaches
/// text, link and highlight annotations to its first page.
///
/// Annotation objects are added to the generator first (text, then links,
/// then highlights), followed by three objects per page (content stream,
/// page dictionary, Helvetica font), the page tree and the catalog.
///
/// # Errors
///
/// Returns an error when no page content is generated from `text` or when
/// the output file cannot be created or written.
///
/// # Panics
///
/// Panics when the page tree does not receive the predicted object id.
pub fn create_pdf_with_all_annotations(
output_file: &str,
text: &str,
annotations: &[TextAnnotation],
links: &[LinkAnnotation],
highlights: &[HighlightAnnotation],
) -> Result<()> {
let elements = crate::elements::parse_markdown(text);
let layout = crate::pdf_generator::PageLayout::portrait();
// Base font size 12.0, page numbers requested.
let page_streams = build_page_streams(&elements, 12.0, true, layout);
if page_streams.is_empty() {
return Err(anyhow!("No page content generated"));
}
let mut generator = crate::pdf_generator::PdfGenerator::new();
// Object ids of every annotation, in the order they are added.
let mut annot_ids: Vec<u32> = Vec::new();
for annot in annotations {
let annot_dict = format!(
"<< /Type /Annot\n/Subtype /Text\n/Rect [{} {} {} {}]\n/Contents ({})\n/T ({})\n/Open false\n>>\n",
annot.x, annot.y, annot.x + annot.width, annot.y + annot.height,
escape_pdf_meta(&annot.content), escape_pdf_meta(&annot.title),
);
annot_ids.push(generator.add_object(annot_dict));
}
for link in links {
let link_dict = format!(
"<< /Type /Annot\n/Subtype /Link\n/Rect [{} {} {} {}]\n/Border [0 0 0]\n/A << /Type /Action\n/S /URI\n/URI ({}) >>\n>>\n",
link.x, link.y, link.x + link.width, link.y + link.height,
escape_pdf_meta(&link.url),
);
annot_ids.push(generator.add_object(link_dict));
}
for hl in highlights {
// QuadPoints lists the corners as: top-left, top-right, bottom-left, bottom-right.
let hl_dict = format!(
"<< /Type /Annot\n/Subtype /Highlight\n/Rect [{} {} {} {}]\n/C [{} {} {}]\n/QuadPoints [{} {} {} {} {} {} {} {}]\n>>\n",
hl.x, hl.y, hl.x + hl.width, hl.y + hl.height,
hl.color_r, hl.color_g, hl.color_b,
hl.x, hl.y + hl.height, hl.x + hl.width, hl.y + hl.height,
hl.x, hl.y, hl.x + hl.width, hl.y,
);
annot_ids.push(generator.add_object(hl_dict));
}
// Predicted id of the page tree: all annotations, then three objects per page.
// Assumes sequential ids starting at 1 (checked by the assert below).
let annot_offset = annot_ids.len() as u32;
let pages_obj_id = annot_offset + (page_streams.len() as u32) * 3 + 1;
let mut page_ids = Vec::new();
for (i, page_stream) in page_streams.iter().enumerate() {
let content_id = generator.add_stream_object(
format!("<< /Length {} >>\n", page_stream.len()),
page_stream.clone(),
);
// The font object is added two calls after the content stream.
let font_id = content_id + 2;
// Only the first page carries the annotations.
let annots_str = if i == 0 && !annot_ids.is_empty() {
let refs: Vec<String> = annot_ids.iter().map(|id| format!("{} 0 R", id)).collect();
format!("/Annots [{}]\n", refs.join(" "))
} else {
String::new()
};
let page_dict = format!(
"<< /Type /Page\n/Parent {} 0 R\n/MediaBox [0 0 {} {}]\n/Contents {} 0 R\n{}/Resources << /Font << /F1 {} 0 R >> >>\n>>\n",
pages_obj_id, layout.width, layout.height, content_id, annots_str, font_id
);
let page_id = generator.add_object(page_dict);
page_ids.push(page_id);
generator.add_object("<< /Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n>>\n".to_string());
}
let kids: Vec<String> = page_ids.iter().map(|id| format!("{} 0 R", id)).collect();
let pages_dict = format!("<< /Type /Pages\n/Kids [{}]\n/Count {}\n>>\n", kids.join(" "), page_ids.len());
let actual_pages_id = generator.add_object(pages_dict);
// Every page's /Parent was written with the predicted id; it must match.
assert_eq!(actual_pages_id, pages_obj_id);
generator.add_object(format!("<< /Type /Catalog\n/Pages {} 0 R\n>>\n", actual_pages_id));
let pdf_data = generator.generate();
let mut file = std::fs::File::create(output_file)?;
std::io::Write::write_all(&mut file, &pdf_data)?;
println!(
"[annotate] Created {} with {} text, {} link, {} highlight annotations",
output_file, annotations.len(), links.len(), highlights.len()
);
Ok(())
}
/// Renders Markdown `text` to a portrait PDF at `output_file` and attaches
/// text and link annotations to its first page.
///
/// Annotation objects are added to the generator first (text, then links),
/// followed by three objects per page (content stream, page dictionary,
/// Helvetica font), the page tree and the catalog.
///
/// # Errors
///
/// Returns an error when no page content is generated from `text` or when
/// the output file cannot be created or written.
///
/// # Panics
///
/// Panics when the page tree does not receive the predicted object id.
pub fn create_pdf_with_annotations(
output_file: &str,
text: &str,
annotations: &[TextAnnotation],
links: &[LinkAnnotation],
) -> Result<()> {
let elements = crate::elements::parse_markdown(text);
let layout = crate::pdf_generator::PageLayout::portrait();
// Base font size 12.0, page numbers requested.
let page_streams = build_page_streams(&elements, 12.0, true, layout);
if page_streams.is_empty() {
return Err(anyhow!("No page content generated"));
}
let mut generator = crate::pdf_generator::PdfGenerator::new();
// Object ids of every annotation, in the order they are added.
let mut annot_ids: Vec<u32> = Vec::new();
for annot in annotations {
// Rect is [x, y, x + width, y + height]; content and title are escaped as PDF strings.
let annot_dict = format!(
"<< /Type /Annot\n\
/Subtype /Text\n\
/Rect [{} {} {} {}]\n\
/Contents ({})\n\
/T ({})\n\
/Open false\n\
>>\n",
annot.x,
annot.y,
annot.x + annot.width,
annot.y + annot.height,
escape_pdf_meta(&annot.content),
escape_pdf_meta(&annot.title),
);
annot_ids.push(generator.add_object(annot_dict));
}
for link in links {
let link_dict = format!(
"<< /Type /Annot\n\
/Subtype /Link\n\
/Rect [{} {} {} {}]\n\
/Border [0 0 0]\n\
/A << /Type /Action\n/S /URI\n/URI ({}) >>\n\
>>\n",
link.x,
link.y,
link.x + link.width,
link.y + link.height,
escape_pdf_meta(&link.url),
);
annot_ids.push(generator.add_object(link_dict));
}
// Predicted id of the page tree: all annotations, then three objects per page.
// Assumes sequential ids starting at 1 (checked by the assert below).
let annot_offset = annot_ids.len() as u32;
let pages_obj_id = annot_offset + (page_streams.len() as u32) * 3 + 1;
let mut page_ids = Vec::new();
for (i, page_stream) in page_streams.iter().enumerate() {
let content_id = generator.add_stream_object(
format!("<< /Length {} >>\n", page_stream.len()),
page_stream.clone(),
);
// The font object is added two calls after the content stream.
let font_id = content_id + 2;
// Only the first page carries the annotations.
let annots_str = if i == 0 && !annot_ids.is_empty() {
let refs: Vec<String> = annot_ids.iter().map(|id| format!("{} 0 R", id)).collect();
format!("/Annots [{}]\n", refs.join(" "))
} else {
String::new()
};
let page_dict = format!(
"<< /Type /Page\n\
/Parent {} 0 R\n\
/MediaBox [0 0 {} {}]\n\
/Contents {} 0 R\n\
{}\
/Resources << /Font << /F1 {} 0 R >> >>\n\
>>\n",
pages_obj_id, layout.width, layout.height, content_id, annots_str, font_id
);
let page_id = generator.add_object(page_dict);
page_ids.push(page_id);
let font_dict = "<< /Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n>>\n".to_string();
generator.add_object(font_dict);
}
let kids: Vec<String> = page_ids.iter().map(|id| format!("{} 0 R", id)).collect();
let pages_dict = format!(
"<< /Type /Pages\n/Kids [{}]\n/Count {}\n>>\n",
kids.join(" "),
page_ids.len()
);
let actual_pages_id = generator.add_object(pages_dict);
// Every page's /Parent was written with the predicted id; it must match.
assert_eq!(actual_pages_id, pages_obj_id);
let catalog_dict = format!(
"<< /Type /Catalog\n/Pages {} 0 R\n>>\n",
actual_pages_id
);
generator.add_object(catalog_dict);
let pdf_data = generator.generate();
let mut file = std::fs::File::create(output_file)?;
std::io::Write::write_all(&mut file, &pdf_data)?;
println!(
"[annotate] Created {} with {} text annotations, {} link annotations",
output_file,
annotations.len(),
links.len()
);
Ok(())
}
/// Creates a single-page (612 x 792) PDF at `output_file` that draws the
/// given images.
///
/// Each tuple in `images` is `(path, x, y, width, height)`. Images are
/// registered as XObjects named `Im1`, `Im2`, ... in slice order and painted
/// with a `cm` matrix that scales the unit square to `width` x `height` and
/// translates it to `(x, y)`.
///
/// # Errors
///
/// Fails when `images` is empty, when an image cannot be loaded or embedded,
/// when the page tree does not receive the object id the page's `/Parent`
/// was pointed at, or when the output file cannot be written.
pub fn create_pdf_with_images(
    output_file: &str,
    images: &[(String, f32, f32, f32, f32)],
) -> Result<()> {
    if images.is_empty() {
        return Err(anyhow!("No images provided"));
    }

    let mut generator = crate::pdf_generator::PdfGenerator::new();

    let mut image_refs: Vec<(u32, String)> = Vec::with_capacity(images.len());
    for (i, (path, _, _, _, _)) in images.iter().enumerate() {
        let info = crate::image::load_image(path)?;
        let image_id = crate::image::create_image_object(&mut generator, info)?;
        image_refs.push((image_id, format!("Im{}", i + 1)));
    }

    // One q/cm/Do/Q group per image keeps each transform isolated.
    let mut content = Vec::new();
    for ((_, x, y, w, h), (_, name)) in images.iter().zip(&image_refs) {
        content.extend_from_slice(
            format!("q\n{} 0 0 {} {} {} cm\n/{} Do\nQ\n", w, h, x, y, name).as_bytes(),
        );
    }
    let content_id = generator.add_stream_object(
        format!("<< /Length {} >>\n", content.len()),
        content,
    );

    // The page is added next and the page tree right after it, so the tree
    // is expected two ids past the content stream. This relies on the
    // generator allocating ids sequentially and is verified below.
    let pages_obj_id = content_id + 2;

    let xobj_dict = image_refs
        .iter()
        .map(|(id, name)| format!("/{} {} 0 R", name, id))
        .collect::<Vec<String>>()
        .join(" ");
    let page_dict = format!(
        "<< /Type /Page\n\
         /Parent {} 0 R\n\
         /MediaBox [0 0 612 792]\n\
         /Contents {} 0 R\n\
         /Resources << /XObject << {} >> >>\n\
         >>\n",
        pages_obj_id, content_id, xobj_dict
    );
    let page_id = generator.add_object(page_dict);

    let pages_dict = format!(
        "<< /Type /Pages\n/Kids [{} 0 R]\n/Count 1\n>>\n",
        page_id
    );
    let pages_id = generator.add_object(pages_dict);
    if pages_id != pages_obj_id {
        return Err(anyhow!(
            "Page tree object id mismatch: expected {}, got {}",
            pages_obj_id,
            pages_id
        ));
    }

    let catalog = format!("<< /Type /Catalog\n/Pages {} 0 R\n>>\n", pages_id);
    generator.add_object(catalog);

    let pdf_data = generator.generate();
    fs::write(output_file, &pdf_data)?;
    println!(
        "[images] Created {} with {} images",
        output_file,
        images.len()
    );
    Ok(())
}
pub fn watermark_pdf(
input_file: &str,
output_file: &str,
watermark_text: &str,
font_size: f32,
opacity: f32,
) -> Result<()> {
let doc = crate::pdf::PdfDocument::load_from_file(input_file)?;
let all_streams = extract_page_streams(&doc);
if all_streams.is_empty() {
return Err(anyhow!("No pages found in {}", input_file));
}
let layout = crate::pdf_generator::PageLayout::portrait();
let watermark_stream = build_watermark_stream(watermark_text, font_size, opacity, &layout);
let watermarked: Vec<Vec<u8>> = all_streams
.iter()
.map(|stream| {
let mut combined = stream.clone();
combined.extend_from_slice(&watermark_stream);
combined
})
.collect();
assemble_merged_pdf(output_file, &watermarked, "Helvetica", &layout)?;
println!(
"[watermark] Added watermark '{}' to {} pages in {}",
watermark_text,
watermarked.len(),
output_file
);
Ok(())
}
/// Produces the content-stream operators that draw `text` rotated by 45
/// degrees near the middle of the page.
///
/// The text is set in font resource `/F1` at `font_size`, anchored at
/// `(width / 2 - 100, height / 2 - 50)`. `opacity` is emitted as all three
/// operands of the `rg` fill-color operator; no transparency state is set.
/// The whole sequence is wrapped in `q` / `Q`.
fn build_watermark_stream(text: &str, font_size: f32, opacity: f32, layout: &crate::pdf_generator::PageLayout) -> Vec<u8> {
    let safe_text = escape_pdf_meta(text);

    // cos(45 deg) == sin(45 deg) == 1 / sqrt(2)
    let diagonal: f32 = std::f32::consts::FRAC_1_SQRT_2;
    let anchor_x = layout.width / 2.0 - 100.0;
    let anchor_y = layout.height / 2.0 - 50.0;

    let operators = format!(
        "q\n{} {} {} rg\nBT\n/F1 {} Tf\n{} {} {} {} {} {} Tm\n({}) Tj\nET\nQ\n",
        opacity,
        opacity,
        opacity,
        font_size,
        diagonal,
        diagonal,
        -diagonal,
        diagonal,
        anchor_x,
        anchor_y,
        safe_text
    );
    operators.into_bytes()
}
/// Kind of interactive form field.
///
/// Serialized with lowercase variant names (`text`, `checkbox`, `radio`,
/// `dropdown`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum FormFieldType {
/// Written with field type `/Tx`.
Text,
/// Written with field type `/Btn`.
Checkbox,
/// Written with field type `/Btn`.
Radio,
/// Written with field type `/Ch`.
Dropdown,
}
/// Description of one form field to place on a page.
///
/// The widget rectangle is `[x, y, x + width, y + height]`. `required`
/// controls whether the required bit is set in the field's `/Ff` flags.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FormField {
/// Written as the field's `/T` string.
pub name: String,
/// Selects the `/FT` entry and the type-specific entries.
pub field_type: FormFieldType,
/// Left edge of the widget rectangle.
pub x: f32,
/// Bottom edge of the widget rectangle.
pub y: f32,
/// Added to `x` to obtain the right edge.
pub width: f32,
/// Added to `y` to obtain the top edge.
pub height: f32,
/// When set, written as the field's `/V` string.
pub default_value: Option<String>,
/// Choices written as `/Opt` for radio and dropdown fields when non-empty.
pub options: Vec<String>, pub required: bool,
}
/// Renders Markdown `text` to a portrait PDF at `output_file` and adds the
/// given form fields as widgets on its first page.
///
/// Objects are added in this order: one widget per form field, the AcroForm
/// dictionary listing them, three objects per page (content stream, page
/// dictionary, Helvetica font), the page tree, and the catalog referencing
/// both the page tree and the AcroForm.
///
/// # Errors
///
/// Returns an error when no page content is generated from `text` or when
/// the output file cannot be created or written.
pub fn create_pdf_with_form_fields(
output_file: &str,
text: &str,
form_fields: &[FormField],
) -> Result<()> {
let elements = crate::elements::parse_markdown(text);
let layout = crate::pdf_generator::PageLayout::portrait();
// Base font size 12.0, page numbers requested.
let page_streams = build_page_streams(&elements, 12.0, true, layout);
if page_streams.is_empty() {
return Err(anyhow!("No page content generated"));
}
let mut generator = crate::pdf_generator::PdfGenerator::new();
// Object ids of the field widgets, in input order.
let mut field_ids: Vec<u32> = Vec::new();
for field in form_fields {
let field_dict = create_form_field_dict(field);
field_ids.push(generator.add_object(field_dict));
}
let kids_refs: Vec<String> = field_ids.iter().map(|id| format!("{} 0 R", id)).collect();
let acroform_dict = format!(
"<< /Fields [{}]\n>>\n",
kids_refs.join(" ")
);
let acroform_id = generator.add_object(acroform_dict);
// Predicted id of the page tree: fields, the AcroForm object, then three objects per page.
// NOTE(review): assumes sequential ids starting at 1; nothing below compares this
// prediction with `actual_pages_id`.
let field_offset = field_ids.len() as u32;
let pages_obj_id = field_offset + 1 + (page_streams.len() as u32) * 3 + 1;
let mut page_ids = Vec::new();
for (i, page_stream) in page_streams.iter().enumerate() {
let content_id = generator.add_stream_object(
format!("<< /Length {} >>\n", page_stream.len()),
page_stream.clone(),
);
// The font object is added two calls after the content stream.
let font_id = content_id + 2;
// Only the first page lists the field widgets as annotations.
let annots_str = if i == 0 && !field_ids.is_empty() {
let refs: Vec<String> = field_ids.iter().map(|id| format!("{} 0 R", id)).collect();
format!("/Annots [{}]\n", refs.join(" "))
} else {
String::new()
};
let page_dict = format!(
"<< /Type /Page\n\
/Parent {} 0 R\n\
/MediaBox [0 0 {} {}]\n\
/Contents {} 0 R\n\
{}\
/Resources << /Font << /F1 {} 0 R >> >>\n\
>>\n",
pages_obj_id, layout.width, layout.height, content_id, annots_str, font_id
);
let page_id = generator.add_object(page_dict);
page_ids.push(page_id);
generator.add_object("<< /Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n>>\n".to_string());
}
let kids: Vec<String> = page_ids.iter().map(|id| format!("{} 0 R", id)).collect();
let pages_dict = format!("<< /Type /Pages\n/Kids [{}]\n/Count {}\n>>\n", kids.join(" "), page_ids.len());
let actual_pages_id = generator.add_object(pages_dict);
let catalog_dict = format!(
"<< /Type /Catalog\n/Pages {} 0 R\n/AcroForm {} 0 R\n>>\n",
actual_pages_id, acroform_id
);
generator.add_object(catalog_dict);
let pdf_data = generator.generate();
let mut file = std::fs::File::create(output_file)?;
std::io::Write::write_all(&mut file, &pdf_data)?;
println!(
"[form] Created {} with {} form fields",
output_file,
form_fields.len()
);
Ok(())
}
/// Builds the widget-annotation dictionary text for one form field.
///
/// Every field gets `/Type /Annot`, `/Subtype /Widget`, a `/Rect` of
/// `[x, y, x + width, y + height]`, its `/FT` and its `/T` name. A default
/// value, when present, is written as a `/V` string. Type-specific entries:
///
/// * text: `/Ff` and an appearance stub;
/// * checkbox: `/V /Off` (only when no default value was written, so `/V`
///   never appears twice), `/Ff` and an appearance stub;
/// * radio: `/Opt` when options exist, `/V /Off` under the same rule, `/Ff`;
/// * dropdown: `/Opt` when options exist, and `/Ff` with the combo bit set.
///
/// `/Ff` carries the required bit (value 2) when `field.required` is true.
fn create_form_field_dict(field: &FormField) -> String {
    /// Field flag bit 2: the field must have a value on submit.
    const FF_REQUIRED: u32 = 1 << 1;
    /// Field flag bit 18: the choice field is a combo box.
    const FF_COMBO: u32 = 1 << 17;
    const APPEARANCE_STUB: &str = "/AP << /N << /Type /Appearance\n/Length 0 >> >>\n";

    let required_flag = if field.required { FF_REQUIRED } else { 0 };
    let has_default = field.default_value.is_some();

    let mut dict = format!(
        "<< /Type /Annot\n/Subtype /Widget\n\
         /Rect [{} {} {} {}]\n\
         /FT {}\n\
         /T ({})\n",
        field.x,
        field.y,
        field.x + field.width,
        field.y + field.height,
        field_type_to_pdf(&field.field_type),
        escape_pdf_meta(&field.name)
    );
    if let Some(ref value) = field.default_value {
        dict.push_str(&format!("/V ({})\n", escape_pdf_meta(value)));
    }

    // Shared by radio and dropdown fields.
    let push_options = |dict: &mut String| {
        if !field.options.is_empty() {
            let opts: Vec<String> = field
                .options
                .iter()
                .map(|o| format!("({})", escape_pdf_meta(o)))
                .collect();
            dict.push_str(&format!("/Opt [{}]\n", opts.join(" ")));
        }
    };
    // A dictionary must not repeat a key, so the `/Off` state is only written
    // when no explicit default value was emitted above.
    let push_off_state = |dict: &mut String| {
        if !has_default {
            dict.push_str("/V /Off\n");
        }
    };

    match field.field_type {
        FormFieldType::Text => {
            dict.push_str(&format!("/Ff {}\n", required_flag));
            dict.push_str(APPEARANCE_STUB);
        }
        FormFieldType::Checkbox => {
            push_off_state(&mut dict);
            dict.push_str(&format!("/Ff {}\n", required_flag));
            dict.push_str(APPEARANCE_STUB);
        }
        FormFieldType::Radio => {
            push_options(&mut dict);
            push_off_state(&mut dict);
            dict.push_str(&format!("/Ff {}\n", required_flag));
        }
        FormFieldType::Dropdown => {
            push_options(&mut dict);
            dict.push_str(&format!("/Ff {}\n", required_flag | FF_COMBO));
        }
    }

    dict.push_str(">>\n");
    dict
}
/// Maps a form field type to the PDF name written into the `/FT` entry.
///
/// Checkboxes and radio buttons share `/Btn`.
fn field_type_to_pdf(field_type: &FormFieldType) -> String {
    let pdf_name = match field_type {
        FormFieldType::Text => "/Tx",
        FormFieldType::Checkbox | FormFieldType::Radio => "/Btn",
        FormFieldType::Dropdown => "/Ch",
    };
    String::from(pdf_name)
}
/// A form field found in an existing PDF by `detect_form_fields`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DetectedFormField {
/// Field name read from the widget's `/T` entry.
pub name: String,
/// One of `"text"`, `"checkbox"`, `"radio"`, `"dropdown"` or `"unknown"`,
/// derived from the `/FT` entry (a `/Btn` with `/Opt` is reported as radio).
pub field_type: String,
/// Content of the `/V` entry, if the widget has one.
pub value: Option<String>,
/// Parenthesised strings found in the `/Opt` array; empty when there is none.
pub options: Vec<String>,
/// `true` when bit value 2 is set in the numeric `/Ff` entry.
pub required: bool,
}
/// Scans a PDF file for widget annotations and reports them as form fields.
///
/// The file is read as lossy UTF-8 text and every `N 0 obj ... endobj` span is
/// inspected. Objects whose body contains both `/Type /Annot` and
/// `/Subtype /Widget` and that carry a non-empty `/T` name are returned.
///
/// # Errors
/// Returns an error if the file cannot be read.
pub fn detect_form_fields(input_file: &str) -> Result<Vec<DetectedFormField>> {
    let raw = fs::read(input_file)?;
    let text = String::from_utf8_lossy(&raw);

    let object_pattern = regex::Regex::new(r"(?s)(\d+)\s+0\s+obj(.*?)endobj").unwrap();
    let option_pattern = regex::Regex::new(r"\(([^)]*)\)").unwrap();

    let mut detected = Vec::new();
    for object in object_pattern.captures_iter(&text) {
        let whole = &object[0];
        let body = &object[2];

        let is_widget = body.contains("/Type /Annot") && body.contains("/Subtype /Widget");
        if !is_widget {
            continue;
        }

        // Widgets without a usable name are not reported.
        let raw_name = extract_pdf_dict_value(whole, "/T").unwrap_or_default();
        let name = raw_name.trim_matches(|c: char| c == '(' || c == ')');
        if name.is_empty() {
            continue;
        }

        let opt_raw = extract_pdf_dict_value(whole, "/Opt");
        let pdf_type = extract_pdf_dict_value(whole, "/FT").unwrap_or_default();
        let kind = match pdf_type.trim_start_matches('/') {
            "Tx" => "text",
            // Buttons are told apart by the presence of an options array.
            "Btn" if opt_raw.is_some() => "radio",
            "Btn" => "checkbox",
            "Ch" => "dropdown",
            _ => "unknown",
        };

        let value = match extract_pdf_dict_value(whole, "/V") {
            Some(v) => {
                let unwrapped = if v.starts_with('(') && v.ends_with(')') {
                    v[1..v.len() - 1].to_string()
                } else if v.starts_with('<') && v.ends_with('>') {
                    crate::pdf::decode_pdf_hex_string(&v[1..v.len() - 1])
                } else {
                    v.to_string()
                };
                Some(unwrapped)
            }
            None => None,
        };

        let options: Vec<String> = match opt_raw.as_deref() {
            Some(list) => option_pattern
                .captures_iter(list)
                .map(|c| c[1].to_string())
                .collect(),
            None => Vec::new(),
        };

        let required = match extract_pdf_dict_value(whole, "/Ff").and_then(|f| f.parse::<u32>().ok()) {
            Some(flags) => flags & 2 != 0,
            None => false,
        };

        detected.push(DetectedFormField {
            name: name.to_string(),
            field_type: kind.to_string(),
            value,
            options,
            required,
        });
    }

    Ok(detected)
}
pub fn fill_form_fields(
input_file: &str,
output_file: &str,
field_values: &std::collections::HashMap<String, String>,
) -> Result<()> {
let pdf_bytes = fs::read(input_file)?;
let content = String::from_utf8_lossy(&pdf_bytes);
if field_values.is_empty() {
fs::write(output_file, &pdf_bytes)?;
return Ok(());
}
let obj_re = regex::Regex::new(r"(?s)(\d+)\s+0\s+obj(.*?)endobj").unwrap();
let v_re = regex::Regex::new(r"/V\s*\([^)]*\)").unwrap();
let mut updated_bytes = pdf_bytes.clone();
let mut offset_delta: isize = 0;
for caps in obj_re.captures_iter(&content) {
let dict_text = &caps[0];
let obj_body = &caps[2];
let full_match_start = caps.get(0).unwrap().start();
if !obj_body.contains("/Type /Annot") || !obj_body.contains("/Subtype /Widget") {
continue;
}
let name = extract_pdf_dict_value(dict_text, "/T")
.unwrap_or_default()
.trim_matches(|c| c == '(' || c == ')')
.to_string();
if name.is_empty() || !field_values.contains_key(&name) {
continue;
}
let new_value = &field_values[&name];
let escaped_value = escape_pdf_meta(new_value);
let adjusted_start = ((full_match_start as isize) + offset_delta) as usize;
let adjusted_end = adjusted_start + dict_text.len();
if adjusted_end > updated_bytes.len() {
continue;
}
let local_dict = String::from_utf8_lossy(&updated_bytes[adjusted_start..adjusted_end]);
let updated_dict = if local_dict.contains("/V ") {
let new_v = format!("/V ({})", escaped_value);
v_re.replace(&local_dict, &new_v).to_string()
} else {
local_dict.replace(">>", &format!("/V ({})\n>>", escaped_value))
};
if updated_dict != *local_dict {
let old_len = local_dict.len();
let new_len = updated_dict.len();
updated_bytes.splice(adjusted_start..adjusted_end, updated_dict.bytes());
offset_delta += (new_len as isize) - (old_len as isize);
}
}
fs::write(output_file, &updated_bytes)?;
println!("[fill] Updated {} field(s) in {}", field_values.len(), output_file);
Ok(())
}
/// Appends an image-drawing sequence to every page stream of `input_file` and
/// writes the result to `output_file`.
///
/// The appended operators place `/Im1` with the matrix
/// `width 0 0 height x y`. When `opacity` is below 1.0 an `rg` operator with
/// the opacity value in all three components is emitted first.
///
/// NOTE(review): the image object is registered in a generator created here,
/// while `assemble_pdf_with_image_overlay` builds the output with its own
/// fresh generator and only receives the numeric `image_id`. Confirm that the
/// image data actually reaches the output file.
///
/// # Errors
/// Fails if the document or image cannot be loaded, if the document has no
/// page streams, or if the output cannot be written.
pub fn overlay_image_on_pdf(
    input_file: &str,
    output_file: &str,
    image_path: &str,
    x: f32,
    y: f32,
    width: f32,
    height: f32,
    opacity: f32,
) -> Result<()> {
    let document = crate::pdf::PdfDocument::load_from_file(input_file)?;
    let page_streams = extract_page_streams(&document);
    if page_streams.is_empty() {
        return Err(anyhow!("No pages found in {}", input_file));
    }

    let image_info = crate::image::load_image(image_path)?;
    let mut generator = crate::pdf_generator::PdfGenerator::new();
    let image_id = crate::image::create_image_object(&mut generator, image_info)?;

    // Operator sequence appended verbatim to each page.
    let mut ops = String::new();
    if opacity < 1.0 {
        ops.push_str(&format!("{} {} {} rg\n", opacity, opacity, opacity));
    }
    ops.push_str("q\n");
    ops.push_str(&format!("{} 0 0 {} {} {} cm\n", width, height, x, y));
    ops.push_str("/Im1 Do\n");
    ops.push_str("Q\n");
    let overlay_content = ops.into_bytes();

    let mut overlayed: Vec<Vec<u8>> = Vec::with_capacity(page_streams.len());
    for stream in page_streams.iter() {
        let mut page = Vec::with_capacity(stream.len() + overlay_content.len());
        page.extend_from_slice(stream);
        page.extend_from_slice(&overlay_content);
        overlayed.push(page);
    }

    let layout = crate::pdf_generator::PageLayout::portrait();
    assemble_pdf_with_image_overlay(output_file, &overlayed, "Helvetica", &layout, image_id)?;

    println!(
        "[overlay] Added image overlay '{}' to {} pages in {}",
        image_path,
        overlayed.len(),
        output_file
    );
    Ok(())
}
/// Writes a PDF made of the given page content streams, each page referencing
/// font `/F1` and image XObject `/Im1` (object `image_id`).
///
/// Per page three objects are added in order: content stream, page dictionary
/// and font dictionary. The page dictionaries refer to their parent by an id
/// predicted up front (`3 * pages + 2`).
///
/// # Panics
/// Panics if the id the generator hands out for the `/Pages` object differs
/// from the predicted one.
///
/// # Errors
/// Returns an error if the output file cannot be created or written.
fn assemble_pdf_with_image_overlay(
    filename: &str,
    page_streams: &[Vec<u8>],
    font: &str,
    layout: &crate::pdf_generator::PageLayout,
    image_id: u32,
) -> Result<()> {
    let mut generator = crate::pdf_generator::PdfGenerator::new();
    let expected_pages_id = (page_streams.len() as u32) * 3 + 2;

    let mut kids: Vec<String> = Vec::with_capacity(page_streams.len());
    for stream in page_streams {
        let content_id = generator.add_stream_object(
            format!("<< /Length {} >>\n", stream.len()),
            stream.clone(),
        );
        // The font dictionary is added right after the page dictionary.
        let font_id = content_id + 2;

        let page_id = generator.add_object(format!(
            "<< /Type /Page\n\
             /Parent {} 0 R\n\
             /MediaBox [0 0 {} {}]\n\
             /Contents {} 0 R\n\
             /Resources << /Font << /F1 {} 0 R >> /XObject << /Im1 {} 0 R >> >>\n\
             >>\n",
            expected_pages_id, layout.width, layout.height, content_id, font_id, image_id
        ));
        kids.push(format!("{} 0 R", page_id));

        generator.add_object(format!(
            "<< /Type /Font\n/Subtype /Type1\n/BaseFont /{}\n>>\n",
            font
        ));
    }

    let pages_id = generator.add_object(format!(
        "<< /Type /Pages\n/Kids [{}]\n/Count {}\n>>\n",
        kids.join(" "),
        kids.len()
    ));
    assert_eq!(pages_id, expected_pages_id);

    generator.add_object(format!(
        "<< /Type /Catalog\n/Pages {} 0 R\n>>\n",
        pages_id
    ));

    let pdf_data = generator.generate();
    let mut out_file = std::fs::File::create(filename)?;
    std::io::Write::write_all(&mut out_file, &pdf_data)?;
    Ok(())
}
/// Kind of watermark: text or image.
///
/// NOTE(review): nothing in the visible part of this file reads this enum;
/// `watermark_pdf_advanced` takes a `WatermarkContent` instead. Confirm it is
/// still used elsewhere.
#[derive(Debug, Clone, Copy)]
pub enum WatermarkType {
/// A text watermark.
Text,
/// An image watermark.
Image,
}
/// What `watermark_pdf_advanced` stamps onto each page.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum WatermarkContent {
    /// The text to draw.
    Text(String),
    /// Path of the image file to load and draw.
    Image(String),
}
/// Appends a text or image watermark to every page stream of `input_file` and
/// writes the rebuilt document to `output_file`.
///
/// Text watermarks are drawn at size 48. The output is assembled with a
/// portrait layout and the Helvetica font.
///
/// NOTE(review): the image variant emits `/Im1 Do`, but the pages are built by
/// `assemble_merged_pdf`, which is not visible here — confirm it registers an
/// `/Im1` XObject resource.
///
/// # Errors
/// Fails if the document or image cannot be loaded, if the document has no
/// page streams, or if assembling the output fails.
pub fn watermark_pdf_advanced(
    input_file: &str,
    output_file: &str,
    content: WatermarkContent,
    opacity: f32,
    position: WatermarkPosition,
) -> Result<()> {
    let document = crate::pdf::PdfDocument::load_from_file(input_file)?;
    let page_streams = extract_page_streams(&document);
    if page_streams.is_empty() {
        return Err(anyhow!("No pages found in {}", input_file));
    }

    let layout = crate::pdf_generator::PageLayout::portrait();
    let stamp = match content {
        WatermarkContent::Image(image_path) => {
            let image_info = crate::image::load_image(&image_path)?;
            build_image_watermark_stream(&image_info, opacity, &layout, position)?
        }
        WatermarkContent::Text(text) => {
            build_text_watermark_stream(&text, 48.0, opacity, &layout, position)
        }
    };

    // Each output page is the original stream followed by the stamp.
    let mut watermarked: Vec<Vec<u8>> = Vec::with_capacity(page_streams.len());
    for stream in page_streams.iter() {
        let mut page = stream.clone();
        page.extend_from_slice(&stamp);
        watermarked.push(page);
    }

    assemble_merged_pdf(output_file, &watermarked, "Helvetica", &layout)?;
    println!(
        "[watermark] Added watermark to {} pages in {}",
        watermarked.len(),
        output_file
    );
    Ok(())
}
/// Where a watermark is anchored on the page.
///
/// Corner positions are inset from the page edges by 72 units for text and by
/// 36 units for images.
#[derive(Debug, Clone, Copy)]
pub enum WatermarkPosition {
/// Text starts at the page midpoint; images are centred on the page.
Center,
/// Near the top-left corner.
TopLeft,
/// Near the top-right corner.
TopRight,
/// Near the bottom-left corner.
BottomLeft,
/// Near the bottom-right corner.
BottomRight,
/// Text is rotated by 45 degrees and starts left of and below the page
/// midpoint; images are placed exactly as for `Center`.
Diagonal, }
/// Produces the content-stream operators that draw `text` as a watermark.
///
/// The sequence is wrapped in `q`/`Q`, sets the fill colour with `rg` using
/// `opacity` for all three components, selects font `/F1` at `font_size`, and
/// positions the text with `Td`, or with a rotation matrix via `Tm` for
/// [`WatermarkPosition::Diagonal`].
fn build_text_watermark_stream(
    text: &str,
    font_size: f32,
    opacity: f32,
    layout: &crate::pdf_generator::PageLayout,
    position: WatermarkPosition,
) -> Vec<u8> {
    // (x, y, rotation in degrees)
    let (x, y, rotation) = match position {
        WatermarkPosition::Center => (layout.width / 2.0, layout.height / 2.0, 0.0),
        WatermarkPosition::TopLeft => (72.0, layout.height - 72.0, 0.0),
        WatermarkPosition::TopRight => (layout.width - 72.0, layout.height - 72.0, 0.0),
        WatermarkPosition::BottomLeft => (72.0, 72.0, 0.0),
        WatermarkPosition::BottomRight => (layout.width - 72.0, 72.0, 0.0),
        WatermarkPosition::Diagonal => {
            (layout.width / 2.0 - 100.0, layout.height / 2.0 - 50.0, 45.0)
        }
    };

    let placement = if rotation != 0.0 {
        let radians = rotation * std::f32::consts::PI / 180.0;
        let cos = radians.cos();
        let sin = radians.sin();
        format!("{} {} {} {} {} {} Tm\n", cos, sin, -sin, cos, x, y)
    } else {
        format!("{} {} Td\n", x, y)
    };

    let operators = [
        String::from("q\n"),
        format!("{} {} {} rg\n", opacity, opacity, opacity),
        String::from("BT\n"),
        format!("/F1 {} Tf\n", font_size),
        placement,
        format!("({}) Tj\n", escape_pdf_meta(text)),
        String::from("ET\n"),
        String::from("Q\n"),
    ];
    operators.concat().into_bytes()
}
/// Produces the content-stream operators that draw image `/Im1` as a
/// watermark.
///
/// The image is scaled with `crate::image::scale_to_fit` into at most half the
/// page width and height, then placed according to `position`. `Diagonal` is
/// placed the same way as `Center`. When `opacity` is below 1.0 an `rg`
/// operator with the opacity value is emitted inside the outer `q`/`Q` pair.
///
/// This function currently always returns `Ok`.
fn build_image_watermark_stream(
    image_info: &crate::image::ImageInfo,
    opacity: f32,
    layout: &crate::pdf_generator::PageLayout,
    position: WatermarkPosition,
) -> Result<Vec<u8>> {
    let (img_width, img_height) = crate::image::scale_to_fit(
        image_info.width,
        image_info.height,
        layout.width * 0.5,
        layout.height * 0.5,
    );

    let (x, y) = match position {
        WatermarkPosition::Center | WatermarkPosition::Diagonal => (
            (layout.width - img_width) / 2.0,
            (layout.height - img_height) / 2.0,
        ),
        WatermarkPosition::TopLeft => (36.0, layout.height - img_height - 36.0),
        WatermarkPosition::TopRight => (
            layout.width - img_width - 36.0,
            layout.height - img_height - 36.0,
        ),
        WatermarkPosition::BottomLeft => (36.0, 36.0),
        WatermarkPosition::BottomRight => (layout.width - img_width - 36.0, 36.0),
    };

    let mut ops = String::from("q\n");
    if opacity < 1.0 {
        ops.push_str(&format!("{} {} {} rg\n", opacity, opacity, opacity));
    }
    ops.push_str("q\n");
    ops.push_str(&format!("{} 0 0 {} {} {} cm\n", img_width, img_height, x, y));
    ops.push_str("/Im1 Do\n");
    ops.push_str("Q\n");
    ops.push_str("Q\n");

    Ok(ops.into_bytes())
}
/// Rebuilds `input_file` with its pages in the order given by `page_order`
/// and writes the result to `output_file`.
///
/// `page_order` holds 1-based page numbers; a page may be listed more than
/// once or left out.
///
/// # Errors
/// Fails if `page_order` is empty, if the document cannot be loaded or has no
/// page streams, if any page number is 0 or larger than the page count, or if
/// assembling the output fails.
pub fn reorder_pages(input_file: &str, output_file: &str, page_order: &[usize]) -> Result<()> {
    if page_order.is_empty() {
        return Err(anyhow!("Page order list is empty"));
    }

    let document = crate::pdf::PdfDocument::load_from_file(input_file)?;
    let source_pages = extract_page_streams(&document);
    let total = source_pages.len();
    if total == 0 {
        return Err(anyhow!("No pages found in {}", input_file));
    }

    // Reject the first out-of-range entry before copying anything.
    if let Some(&bad) = page_order.iter().find(|&&p| p == 0 || p > total) {
        return Err(anyhow!(
            "Invalid page number {} (document has {} pages)",
            bad,
            total
        ));
    }

    let mut reordered: Vec<Vec<u8>> = Vec::with_capacity(page_order.len());
    for &page_number in page_order {
        reordered.push(source_pages[page_number - 1].clone());
    }

    let layout = crate::pdf_generator::PageLayout::portrait();
    assemble_merged_pdf(output_file, &reordered, "Helvetica", &layout)?;
    println!(
        "[reorder] Reordered {} pages from {} into {}",
        reordered.len(),
        input_file,
        output_file
    );
    Ok(())
}
/// Writes a copy of `input_file` to `output_file` with an `/Encrypt` entry
/// added to the last trailer dictionary and a `% PDF PROTECTED` comment line
/// prepended to the file.
///
/// If `security.is_protected()` is false the input is copied unchanged. The
/// file is handled as raw bytes, so PDFs containing binary data are accepted
/// and no original byte is dropped.
///
/// This function only adds the dictionary entry and the comment; it does not
/// transform any stream or string data itself.
///
/// NOTE(review): the comment line is placed before the original first byte
/// (as before), which shifts every existing byte offset by its length —
/// confirm downstream readers tolerate that.
///
/// # Errors
/// Fails if the input cannot be read, if it contains no `trailer` keyword, if
/// no `<<` follows the last `trailer`, or if the output cannot be written.
pub fn protect_pdf(input_file: &str, output_file: &str, security: &crate::security::PdfSecurity) -> Result<()> {
    let content = fs::read(input_file)?;
    let trailer_pos = content
        .windows(7)
        .rposition(|w| w == b"trailer")
        .ok_or_else(|| anyhow!("No trailer found in PDF"))?;

    if !security.is_protected() {
        fs::write(output_file, &content)?;
        return Ok(());
    }

    // Opening `<<` of the trailer dictionary; the new entry goes right after it.
    let dict_open = content[trailer_pos..]
        .windows(2)
        .position(|w| w == b"<<")
        .map(|rel| trailer_pos + rel)
        .ok_or_else(|| anyhow!("No trailer dictionary found in PDF"))?;

    let encryption_dict = security.create_encryption_dict();
    let protection_notice = format!(
        "% PDF PROTECTED: Algorithm={}, Permissions={:08X}\n",
        security.encryption_algorithm.name(),
        security.permissions.to_pdf_flags()
    );
    let encrypt_entry = format!("<<\n/Encrypt <<{}>>", encryption_dict);

    // notice + everything before the trailer dict + patched opener + the rest.
    let mut protected_content: Vec<u8> =
        Vec::with_capacity(content.len() + protection_notice.len() + encrypt_entry.len());
    protected_content.extend_from_slice(protection_notice.as_bytes());
    protected_content.extend_from_slice(&content[..dict_open]);
    protected_content.extend_from_slice(encrypt_entry.as_bytes());
    protected_content.extend_from_slice(&content[dict_open + 2..]);

    fs::write(output_file, &protected_content)?;
    println!(
        "[protect] Applied protection to {} (algorithm: {})",
        output_file,
        security.encryption_algorithm.name()
    );
    Ok(())
}
/// Escapes a string for use inside a PDF literal string `( ... )`.
///
/// A backslash is put in front of every backslash, `(` and `)`; all other
/// characters are copied unchanged.
pub(crate) fn escape_pdf_meta(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        if matches!(ch, '\\' | '(' | ')') {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
use sha2::{Digest, Sha256};
/// Appends a signature dictionary, a signature field and a replacement
/// catalog to `input_file` as an incremental update and writes the result to
/// `output_file`.
///
/// The `/Contents` entry is filled with the hex SHA-256 digest of the two byte
/// ranges listed in `/ByteRange` (everything except the `<...>` contents
/// string), right-padded with `0` to 8192 hex digits. This is a digest only,
/// not a PKCS#7 structure, despite the `/SubFilter` written.
///
/// The file is processed as raw bytes. The new cross-reference section is
/// linked to the previous one via `/Prev`, and `startxref` points at the new
/// `xref` keyword. New object numbers start at the previous trailer's `/Size`
/// (or 999 if that cannot be read).
///
/// NOTE(review): the replacement catalog carries only `/Pages` and
/// `/AcroForm`; other entries of the original catalog are not copied, and the
/// widget's `/P` is hard-coded to `1 0 R`.
///
/// # Errors
/// Fails if the input cannot be read, if no `%%EOF` / `startxref` pair with a
/// numeric offset is found, or if the output cannot be written.
pub fn sign_pdf(input_file: &str, output_file: &str, signature: &crate::security::DigitalSignature) -> Result<()> {
    /// First position of `needle` in `haystack`.
    fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
        haystack.windows(needle.len()).position(|w| w == needle)
    }

    /// Last position of `needle` in `haystack`.
    fn rfind_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
        haystack.windows(needle.len()).rposition(|w| w == needle)
    }

    /// Parses the `N G R` reference following the first `key` in `text`.
    fn indirect_ref_after(text: &str, key: &str) -> Option<(u32, u32)> {
        let (_, rest) = text.split_once(key)?;
        let mut tokens = rest.split_whitespace();
        let number: u32 = tokens.next()?.parse().ok()?;
        let generation: u32 = tokens.next()?.parse().ok()?;
        if tokens.next()?.starts_with('R') {
            Some((number, generation))
        } else {
            None
        }
    }

    /// Parses the unsigned integer following the first `key` in `text`.
    fn integer_after(text: &str, key: &str) -> Option<u32> {
        let (_, rest) = text.split_once(key)?;
        let digits: String = rest
            .trim_start()
            .chars()
            .take_while(|c| c.is_ascii_digit())
            .collect();
        digits.parse().ok()
    }

    /// Bytes between `N G obj` and `endobj` of the last definition of an object.
    fn latest_object_body(pdf: &[u8], number: u32, generation: u32) -> Option<&[u8]> {
        let header = format!("{} {} obj", number, generation);
        let header = header.as_bytes();
        let mut search_end = pdf.len();
        while let Some(pos) = rfind_bytes(&pdf[..search_end], header) {
            // Skip hits such as "12 0 obj" when looking for "2 0 obj".
            if pos == 0 || !pdf[pos - 1].is_ascii_digit() {
                let body = &pdf[pos + header.len()..];
                return find_bytes(body, b"endobj").map(|end| &body[..end]);
            }
            search_end = pos + header.len() - 1;
        }
        None
    }

    const CONTENTS_HEX_LEN: usize = 8192;
    // Wide enough for "0 a b c" with three 20-digit offsets.
    const BYTE_RANGE_WIDTH: usize = 64;
    const FALLBACK_FIRST_OBJECT: u32 = 999;

    let pdf_bytes = fs::read(input_file)?;

    // Locate the previous cross-reference offset.
    let last_eof = rfind_bytes(&pdf_bytes, b"%%EOF")
        .ok_or_else(|| anyhow!("No %%EOF marker found in {}", input_file))?;
    let startxref_pos = rfind_bytes(&pdf_bytes[..last_eof], b"startxref")
        .ok_or_else(|| anyhow!("No startxref found in {}", input_file))?;
    let prev_xref_offset: usize = String::from_utf8_lossy(&pdf_bytes[startxref_pos + 9..last_eof])
        .trim()
        .parse()
        .map_err(|_| anyhow!("Malformed startxref offset in {}", input_file))?;

    // The trailer dictionary sits between the last `trailer` keyword and
    // `startxref`. Files without that keyword fall back to the defaults.
    let trailer_text = rfind_bytes(&pdf_bytes[..startxref_pos], b"trailer")
        .map(|pos| String::from_utf8_lossy(&pdf_bytes[pos..startxref_pos]).into_owned())
        .unwrap_or_default();

    // The new catalog needs the page-tree reference of the current catalog.
    let pages_ref = indirect_ref_after(&trailer_text, "/Root")
        .and_then(|(number, generation)| latest_object_body(&pdf_bytes, number, generation))
        .and_then(|body| indirect_ref_after(&String::from_utf8_lossy(body), "/Pages"))
        .map(|(number, generation)| format!("{} {} R", number, generation))
        .unwrap_or_else(|| "1 0 R".to_string());

    let sig_obj_num = integer_after(&trailer_text, "/Size").unwrap_or(FALLBACK_FIRST_OBJECT);
    let field_obj_num = sig_obj_num + 1;
    let new_catalog_num = sig_obj_num + 2;

    // Signature dictionary with fixed-width placeholders, so that filling
    // them in later never moves any byte.
    let mut sig_dict = format!(
        "<< /Type /Sig\n\
         /Filter /Adobe.PPKLite\n\
         /SubFilter /adbe.pkcs7.detached\n\
         /Contents <{}>\n\
         /ByteRange [{}]\n",
        "0".repeat(CONTENTS_HEX_LEN),
        " ".repeat(BYTE_RANGE_WIDTH)
    );
    if let Some(date) = &signature.date {
        sig_dict.push_str(&format!(" /M (D:{})\n", escape_pdf_meta(date)));
    }
    sig_dict.push_str(&format!(" /Name ({})\n", escape_pdf_meta(&signature.signer_name)));
    if let Some(reason) = &signature.reason {
        sig_dict.push_str(&format!(" /Reason ({})\n", escape_pdf_meta(reason)));
    }
    if let Some(location) = &signature.location {
        sig_dict.push_str(&format!(" /Location ({})\n", escape_pdf_meta(location)));
    }
    if let Some(contact) = &signature.contact_info {
        sig_dict.push_str(&format!(" /ContactInfo ({})\n", escape_pdf_meta(contact)));
    }
    sig_dict.push_str(">>");

    let sig_dict_obj = format!("{} 0 obj\n{}\nendobj\n", sig_obj_num, sig_dict);
    let field_dict = format!(
        "{} 0 obj\n<< /Type /Annot\n\
         /Subtype /Widget\n\
         /FT /Sig\n\
         /T (Signature1)\n\
         /V {} 0 R\n\
         /P 1 0 R\n\
         /Rect [0 0 0 0]\n\
         /F 132\n\
         >>\nendobj\n",
        field_obj_num, sig_obj_num
    );
    let new_catalog = format!(
        "{} 0 obj\n<< /Type /Catalog\n\
         /Pages {}\n\
         /AcroForm << /Fields [{} 0 R] /SigFlags 3 >>\n\
         >>\nendobj\n",
        new_catalog_num, pages_ref, field_obj_num
    );

    // Append the update, recording where each piece really starts.
    let mut output = pdf_bytes;
    if !(output.ends_with(b"\n") || output.ends_with(b"\r")) {
        // Keep the first new object off the `%%EOF` comment line.
        output.push(b'\n');
    }
    let sig_offset = output.len();
    output.extend_from_slice(sig_dict_obj.as_bytes());
    let field_offset = output.len();
    output.extend_from_slice(field_dict.as_bytes());
    let catalog_offset = output.len();
    output.extend_from_slice(new_catalog.as_bytes());
    let new_xref_offset = output.len();

    let xref = format!(
        "xref\n\
         0 1\n\
         0000000000 65535 f \n\
         {} 3\n\
         {:010} 00000 n \n\
         {:010} 00000 n \n\
         {:010} 00000 n \n",
        sig_obj_num, sig_offset, field_offset, catalog_offset
    );
    output.extend_from_slice(xref.as_bytes());
    let trailer = format!(
        "trailer\n<< /Size {} /Root {} 0 R /Prev {} >>\nstartxref\n{}\n%%EOF\n",
        new_catalog_num + 1,
        new_catalog_num,
        prev_xref_offset,
        new_xref_offset
    );
    output.extend_from_slice(trailer.as_bytes());

    // Absolute positions of the two placeholders inside the appended object.
    let contents_key = "/Contents <";
    let contents_rel = sig_dict_obj
        .find(contents_key)
        .expect("signature dictionary always contains /Contents");
    let value_start = sig_offset + contents_rel + contents_key.len() - 1; // the '<'
    let value_end = value_start + CONTENTS_HEX_LEN + 2; // one past the '>'

    let byte_range_key = "/ByteRange [";
    let byte_range_rel = sig_dict_obj
        .find(byte_range_key)
        .expect("signature dictionary always contains /ByteRange");
    let byte_range_start = sig_offset + byte_range_rel + byte_range_key.len();

    // Fill in ByteRange first: it is part of the hashed bytes.
    let byte_range_text = format!(
        "{:<width$}",
        format!("0 {} {} {}", value_start, value_end, output.len() - value_end),
        width = BYTE_RANGE_WIDTH
    );
    output[byte_range_start..byte_range_start + BYTE_RANGE_WIDTH]
        .copy_from_slice(byte_range_text.as_bytes());

    let mut hasher = Sha256::new();
    hasher.update(&output[..value_start]);
    hasher.update(&output[value_end..]);
    let hash_hex: String = hasher
        .finalize()
        .iter()
        .map(|b| format!("{:02x}", b))
        .collect();
    let padded_hash = format!("{:0<width$}", hash_hex, width = CONTENTS_HEX_LEN);
    output[value_start + 1..value_end - 1].copy_from_slice(padded_hash.as_bytes());

    fs::write(output_file, &output)?;
    println!(
        "[sign] Signed {} -> {} (signer: {}, hash: {})",
        input_file, output_file, signature.signer_name, &hash_hex[..16]
    );
    Ok(())
}
/// Lists the signature dictionaries found in a PDF file.
///
/// The file is read as lossy UTF-8 text and every `N 0 obj << ... >> endobj`
/// object containing `/Type /Sig` (or `/Type/Sig`) is reported with its
/// `/Name`, `/Reason`, `/Location`, `/M` and `/ByteRange` entries.
///
/// No digest or certificate is checked: `valid` is always `false` in the
/// returned entries.
///
/// # Errors
/// Returns an error if the file cannot be read.
pub fn verify_pdf_signature(input_file: &str) -> Result<Vec<SignatureInfo>> {
    let raw = fs::read(input_file)?;
    let text = String::from_utf8_lossy(&raw);
    let object_pattern =
        regex::Regex::new(r"(?s)(\d+)\s+0\s+obj\s+<<(.+?)>>\s+endobj").unwrap();

    let signatures = object_pattern
        .captures_iter(&text)
        .filter_map(|caps| {
            let dict = caps.get(2)?.as_str();
            let is_signature = dict.contains("/Type /Sig") || dict.contains("/Type/Sig");
            if !is_signature {
                return None;
            }
            Some(SignatureInfo {
                signer_name: extract_pdf_dict_value(dict, "/Name").unwrap_or_default(),
                reason: extract_pdf_dict_value(dict, "/Reason"),
                location: extract_pdf_dict_value(dict, "/Location"),
                date: extract_pdf_dict_value(dict, "/M"),
                byte_range: extract_pdf_dict_value(dict, "/ByteRange"),
                valid: false,
            })
        })
        .collect();

    Ok(signatures)
}
/// Returns the value that follows `key` in the textual PDF dictionary `dict`.
///
/// The first occurrence of `key` that is followed by whitespace, `(`, `<`,
/// `[` or the end of the text is used. The value is returned according to its
/// syntax:
///
/// * literal string `( ... )` — the content between the outer parentheses,
///   still in escaped form. Backslash escapes and balanced nested parentheses
///   are honoured when looking for the closing parenthesis;
/// * hex string `< ... >` — the content between the angle brackets;
/// * array `[ ... ]` — the text up to and including the first `]`;
/// * name `/Name` — the name without the leading slash;
/// * anything else — the token up to the next whitespace, `/` or `>`.
///
/// Returns `None` when the key is absent or a string/array is unterminated.
fn extract_pdf_dict_value(dict: &str, key: &str) -> Option<String> {
    let pos = dict
        .match_indices(key)
        .find(|(i, _)| {
            // Reject longer keys that merely start with `key` (e.g. `/Tx` for `/T`).
            let end = i + key.len();
            end == dict.len()
                || dict[end..]
                    .starts_with(|c: char| c.is_whitespace() || c == '(' || c == '<' || c == '[')
        })
        .map(|(i, _)| i)?;
    let after = dict[pos + key.len()..].trim_start();

    if let Some(body) = after.strip_prefix('(') {
        // Walk the string so that `\)` and nested `( )` pairs do not end it early.
        let mut depth = 0usize;
        let mut escaped = false;
        for (idx, ch) in body.char_indices() {
            if escaped {
                escaped = false;
                continue;
            }
            match ch {
                '\\' => escaped = true,
                '(' => depth += 1,
                ')' if depth == 0 => return Some(body[..idx].to_string()),
                ')' => depth -= 1,
                _ => {}
            }
        }
        // Unbalanced input: fall back to the first `)`, as older output did.
        let end = body.find(')')?;
        Some(body[..end].to_string())
    } else if after.starts_with('<') && !after.starts_with("<<") {
        let end = after.find('>')?;
        Some(after[1..end].to_string())
    } else if after.starts_with('[') {
        let end = after.find(']')?;
        Some(after[..=end].to_string())
    } else if let Some(name_after) = after.strip_prefix('/') {
        let end = name_after
            .find(|c: char| c.is_whitespace() || c == '/' || c == '>' || c == '[')
            .unwrap_or(name_after.len());
        Some(name_after[..end].to_string())
    } else {
        let end = after
            .find(|c: char| c.is_whitespace() || c == '/' || c == '>')
            .unwrap_or(after.len());
        Some(after[..end].to_string())
    }
}
/// A piece of text shown by one `Tj` operator, together with the position
/// recorded for it while scanning a content stream.
#[derive(Debug, Clone, PartialEq)]
struct TextFragment {
/// The shown text, trimmed.
text: String,
/// Horizontal coordinate taken from the most recent `Td`/`TD`/`Tm`.
x: f32,
/// Vertical coordinate taken from the most recent `Td`/`TD`/`Tm`.
y: f32,
}
/// Extracts table-like text from a PDF and returns each detected table as one
/// CSV string.
///
/// Every stream object of the document is scanned line by line for `Tj`
/// text-showing operators (literal and hex strings). Each fragment is tagged
/// with the operands of the most recent `Td`/`TD`/`Tm` seen in the same
/// stream. Fragments are grouped into rows by `y`, rows are split into cells
/// by gaps in `x`, and every run of at least two consecutive rows with two or
/// more cells becomes a table.
///
/// # Errors
/// Returns an error if the document cannot be loaded.
///
/// # Panics
/// The sorts use `partial_cmp(..).unwrap()` and would panic if a coordinate
/// were NaN.
pub fn extract_tables_from_pdf(input_file: &str) -> Result<Vec<String>> {
use crate::pdf::{PdfDocument, PdfObject};
let doc = PdfDocument::load_from_file(input_file)?;
let mut all_fragments: Vec<TextFragment> = Vec::new();
// Literal-string `Tj` (allows escapes and one level of nested parentheses).
let tj_re = regex::Regex::new(r"\(((?:[^()\\]|\\.|(?:\([^()]*\)))*)\)\s*Tj").unwrap();
// Hex-string `Tj`.
let tj_hex_re = regex::Regex::new(r"<([0-9a-fA-F\s]+)>\s*Tj").unwrap();
// `x y Td` / `x y TD`.
let td_re = regex::Regex::new(r"([\d.\-]+)\s+([\d.\-]+)\s+T[dD]").unwrap();
// `a b c d e f Tm`; only the last two operands (e, f) are captured.
let tm_re = regex::Regex::new(r"[\d.\-]+\s+[\d.\-]+\s+[\d.\-]+\s+[\d.\-]+\s+([\d.\-]+)\s+([\d.\-]+)\s+Tm").unwrap();
for obj in doc.objects.values() {
if let PdfObject::Stream { data, .. } = obj {
// Streams that fail to inflate are scanned as they are.
let processed_data = crate::compression::decompress_deflate(data).unwrap_or_else(|_| data.to_vec());
let content = String::from_utf8_lossy(&processed_data);
// Position state is per stream and starts at the origin.
let mut current_x: f32 = 0.0;
let mut current_y: f32 = 0.0;
for line in content.lines() {
let line = line.trim();
// NOTE(review): the `Td`/`TD` operands are stored as-is, i.e. treated as an
// absolute position rather than an offset from the previous one — confirm
// this is what the scanned content expects.
if let Some(caps) = td_re.captures(line)
&& let (Ok(x), Ok(y)) = (caps[1].parse::<f32>(), caps[2].parse::<f32>()) {
current_x = x;
current_y = y;
}
if let Some(caps) = tm_re.captures(line)
&& let (Ok(x), Ok(y)) = (caps[1].parse::<f32>(), caps[2].parse::<f32>()) {
current_x = x;
current_y = y;
}
for caps in tj_re.captures_iter(line) {
let extracted = &caps[1];
let unescaped = crate::pdf::unescape_pdf_string(extracted);
// Whitespace-only fragments are dropped.
if !unescaped.trim().is_empty() {
all_fragments.push(TextFragment {
text: unescaped.trim().to_string(),
x: current_x,
y: current_y,
});
}
}
for caps in tj_hex_re.captures_iter(line) {
let hex_str = caps[1].replace(char::is_whitespace, "");
let decoded = crate::pdf::decode_pdf_hex_string(&hex_str);
if !decoded.trim().is_empty() {
all_fragments.push(TextFragment {
text: decoded.trim().to_string(),
x: current_x,
y: current_y,
});
}
}
}
}
}
if all_fragments.is_empty() {
return Ok(Vec::new());
}
// Highest `y` first.
all_fragments.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap());
// Pass 1: a fragment joins the current row while its `y` is within the
// tolerance of the row's FIRST fragment; each finished row is sorted by `x`.
let y_tolerance = 3.0; let mut rows: Vec<Vec<TextFragment>> = Vec::new();
let mut current_row: Vec<TextFragment> = Vec::new();
let mut current_y = all_fragments[0].y;
for frag in all_fragments {
let frag_y = frag.y;
if (frag_y - current_y).abs() <= y_tolerance {
current_row.push(frag);
} else {
if !current_row.is_empty() {
current_row.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap());
rows.push(current_row);
}
current_row = vec![frag];
current_y = frag_y;
}
}
if !current_row.is_empty() {
current_row.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap());
rows.push(current_row);
}
// Pass 2: merge a row into the previous one when their average `y` values
// are within the same tolerance.
let mut merged_rows: Vec<Vec<TextFragment>> = Vec::new();
for row in rows {
if let Some(last) = merged_rows.last_mut() {
let last_y = last.iter().map(|f| f.y).sum::<f32>() / last.len() as f32;
let row_y = row.iter().map(|f| f.y).sum::<f32>() / row.len() as f32;
if (last_y - row_y).abs() <= y_tolerance {
last.extend(row);
last.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap());
continue;
}
}
merged_rows.push(row);
}
// Pass 3: consecutive rows with at least two cells form a table; a row with
// fewer cells ends the current run, and runs shorter than two rows are dropped.
let mut tables: Vec<Vec<Vec<String>>> = Vec::new();
let mut current_table: Vec<Vec<String>> = Vec::new();
let x_tolerance = 8.0;
for row in &merged_rows {
let cells = group_row_into_cells(row, x_tolerance);
if cells.len() >= 2 {
current_table.push(cells);
} else if !current_table.is_empty() {
if current_table.len() >= 2 {
tables.push(current_table);
}
current_table = Vec::new();
}
}
if !current_table.is_empty() && current_table.len() >= 2 {
tables.push(current_table);
}
// Render each table as CSV, one line per row.
let mut csv_outputs = Vec::new();
for table in tables {
let mut csv = String::new();
for row in table {
let escaped: Vec<String> = row.iter().map(|cell| escape_csv_field(cell)).collect();
csv.push_str(&escaped.join(","));
csv.push('\n');
}
csv_outputs.push(csv);
}
Ok(csv_outputs)
}
/// Splits one row of text fragments into cell strings.
///
/// Fragments are taken in the given order. A new cell starts whenever the
/// horizontal distance to the PREVIOUS fragment exceeds `x_tolerance`;
/// otherwise the fragment's text is appended to the current cell, separated
/// by a single space. An empty row yields no cells.
fn group_row_into_cells(row: &[TextFragment], x_tolerance: f32) -> Vec<String> {
    let mut cells: Vec<String> = Vec::new();
    let mut previous_x: Option<f32> = None;

    for fragment in row {
        let gap_exceeded = match previous_x {
            Some(px) => (fragment.x - px).abs() > x_tolerance,
            None => false,
        };
        match cells.last_mut() {
            Some(cell) if !gap_exceeded => {
                cell.push(' ');
                cell.push_str(&fragment.text);
            }
            _ => cells.push(fragment.text.clone()),
        }
        previous_x = Some(fragment.x);
    }

    cells
}
/// Formats one value as a CSV field.
///
/// Values containing a comma, a double quote, `\n` or `\r` are wrapped in
/// double quotes with embedded quotes doubled; other values are returned
/// unchanged.
fn escape_csv_field(field: &str) -> String {
    let needs_quoting = field
        .chars()
        .any(|c| matches!(c, ',' | '"' | '\n' | '\r'));
    if !needs_quoting {
        return field.to_owned();
    }

    let mut quoted = String::with_capacity(field.len() + 2);
    quoted.push('"');
    for c in field.chars() {
        if c == '"' {
            quoted.push('"');
        }
        quoted.push(c);
    }
    quoted.push('"');
    quoted
}
/// A piece of text shown by one `Tj` operator, with the position and font
/// state recorded for it while scanning a content stream.
#[derive(Debug, Clone, PartialEq)]
struct StyledTextFragment {
/// The shown text, trimmed.
text: String,
/// Horizontal coordinate taken from the most recent `Td`/`TD`/`Tm`.
x: f32,
/// Vertical coordinate taken from the most recent `Td`/`TD`/`Tm`.
y: f32,
/// Size from the most recent `Tf`, multiplied by the scale derived from the
/// most recent `Tm`.
font_size: f32,
/// Font resource name from the most recent `Tf` (without the leading slash).
font_name: String,
}
/// A line of text classified as a heading by `detect_document_structure`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DetectedHeading {
/// 1 for lines at least 2.0x the body font size, 2 for at least 1.5x,
/// 3 for the remaining headings.
pub level: u8,
/// The heading text, trimmed.
pub text: String,
/// Page number hint; `detect_document_structure` always sets `None`.
pub page_hint: Option<u32>,
}
/// The text between two headings, as grouped by `detect_document_structure`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DetectedSection {
/// Text of the heading that opened the section; `None` for content that
/// precedes the first heading.
pub title: Option<String>,
/// Level of the opening heading; 0 for content that precedes the first heading.
pub level: u8,
/// Non-heading lines of the section, trimmed, in reading order.
pub content_lines: Vec<String>,
/// Table flag; `detect_document_structure` always sets `false`.
pub has_table: bool,
}
/// Result of `detect_document_structure`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DocumentStructure {
/// All detected headings, in reading order.
pub headings: Vec<DetectedHeading>,
/// Sections delimited by the headings.
pub sections: Vec<DetectedSection>,
/// Rough estimate: the span of fragment `y` values divided by 800, rounded
/// up, and at least 1.
pub estimated_page_count: u32,
/// The most frequent rounded font size among all fragments (12.0 when the
/// document has no text).
pub body_font_size: f32,
}
/// Derives headings and sections from the text of a PDF, using font size and
/// font name as the only signals.
///
/// Every stream object is scanned line by line for `Tf`, `Td`/`TD`, `Tm` and
/// `Tj` operators. Text fragments are grouped into lines by `y`; the most
/// frequent rounded font size is taken as the body size; a line is a heading
/// when its average font size is at least 1.5x the body size, or when it uses
/// a bold-looking font, has at most 10 words and is at least 1.1x the body
/// size.
///
/// A document without text yields an empty structure with one estimated page
/// and a body size of 12.0.
///
/// # Errors
/// Returns an error if the document cannot be loaded.
///
/// # Panics
/// The sorts use `partial_cmp(..).unwrap()` and would panic if a coordinate
/// were NaN.
pub fn detect_document_structure(input_file: &str) -> Result<DocumentStructure> {
use crate::pdf::{PdfDocument, PdfObject};
let doc = PdfDocument::load_from_file(input_file)?;
let mut all_fragments: Vec<StyledTextFragment> = Vec::new();
// Literal-string and hex-string `Tj`.
let tj_re = regex::Regex::new(r"\(((?:[^()\\]|\\.|(?:\([^()]*\)))*)\)\s*Tj").unwrap();
let tj_hex_re = regex::Regex::new(r"<([0-9a-fA-F\s]+)>\s*Tj").unwrap();
// `x y Td` / `x y TD`.
let td_re = regex::Regex::new(r"([\d.\-]+)\s+([\d.\-]+)\s+T[dD]").unwrap();
// `a b c d e f Tm`, all six operands captured.
let tm_re = regex::Regex::new(r"([\d.\-]+)\s+([\d.\-]+)\s+([\d.\-]+)\s+([\d.\-]+)\s+([\d.\-]+)\s+([\d.\-]+)\s+Tm").unwrap();
// `/Font size Tf`.
let tf_re = regex::Regex::new(r"/(\S+)\s+([\d.\-]+)\s+Tf").unwrap();
for obj in doc.objects.values() {
if let PdfObject::Stream { data, .. } = obj {
// Streams that fail to inflate are scanned as they are.
let processed_data = crate::compression::decompress_deflate(data).unwrap_or_else(|_| data.to_vec());
let content = String::from_utf8_lossy(&processed_data);
// Text state is per stream; it is not reset between text objects.
let mut current_x: f32 = 0.0;
let mut current_y: f32 = 0.0;
let mut current_font_size: f32 = 12.0;
let mut current_font_name: String = String::new();
let mut tm_scale: f32 = 1.0;
for line in content.lines() {
let line = line.trim();
if let Some(caps) = tf_re.captures(line)
&& let Ok(size) = caps[2].parse::<f32>() {
current_font_name = caps[1].to_string();
current_font_size = size;
}
if let Some(caps) = td_re.captures(line)
&& let (Ok(x), Ok(y)) = (caps[1].parse::<f32>(), caps[2].parse::<f32>()) {
current_x = x;
current_y = y;
}
if let Some(caps) = tm_re.captures(line)
&& let (Ok(a), Ok(_d), Ok(x), Ok(y)) = (caps[1].parse::<f32>(), caps[4].parse::<f32>(), caps[5].parse::<f32>(), caps[6].parse::<f32>()) {
current_x = x;
current_y = y;
// Scale is |a|, unless |d| is larger than 0.01, in which case |d| wins.
tm_scale = a.abs();
if let Ok(d) = caps[4].parse::<f32>()
&& d.abs() > 0.01 {
tm_scale = d.abs();
}
}
for caps in tj_re.captures_iter(line) {
let extracted = &caps[1];
let unescaped = crate::pdf::unescape_pdf_string(extracted);
// Whitespace-only fragments are dropped.
if !unescaped.trim().is_empty() {
all_fragments.push(StyledTextFragment {
text: unescaped.trim().to_string(),
x: current_x,
y: current_y,
font_size: current_font_size * tm_scale,
font_name: current_font_name.clone(),
});
}
}
for caps in tj_hex_re.captures_iter(line) {
let hex_str = caps[1].replace(char::is_whitespace, "");
let decoded = crate::pdf::decode_pdf_hex_string(&hex_str);
if !decoded.trim().is_empty() {
all_fragments.push(StyledTextFragment {
text: decoded.trim().to_string(),
x: current_x,
y: current_y,
font_size: current_font_size * tm_scale,
font_name: current_font_name.clone(),
});
}
}
}
}
}
if all_fragments.is_empty() {
return Ok(DocumentStructure {
headings: Vec::new(),
sections: Vec::new(),
estimated_page_count: 1,
body_font_size: 12.0,
});
}
// Highest `y` first.
all_fragments.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap());
// Pass 1: a fragment joins the current line while its `y` is within 3.0 of
// the line's FIRST fragment; each finished line is sorted by `x`.
let y_tolerance = 3.0;
let mut lines: Vec<Vec<StyledTextFragment>> = Vec::new();
let mut current_line: Vec<StyledTextFragment> = Vec::new();
let mut current_y = all_fragments[0].y;
for frag in &all_fragments {
let frag_y = frag.y;
if (frag_y - current_y).abs() <= y_tolerance {
current_line.push(frag.clone());
} else {
if !current_line.is_empty() {
current_line.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap());
lines.push(current_line);
}
current_line = vec![frag.clone()];
current_y = frag_y;
}
}
if !current_line.is_empty() {
current_line.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap());
lines.push(current_line);
}
// Pass 2: merge a line into the previous one when their average `y` values
// differ by at most 1.5 (tighter than the pass-1 tolerance).
let mut merged_lines: Vec<Vec<StyledTextFragment>> = Vec::new();
for line in lines {
if let Some(last) = merged_lines.last_mut() {
let last_y = last.iter().map(|f| f.y).sum::<f32>() / last.len() as f32;
let this_y = line.iter().map(|f| f.y).sum::<f32>() / line.len() as f32;
if (this_y - last_y).abs() <= 1.5 {
last.extend(line);
last.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap());
continue;
}
}
merged_lines.push(line);
}
// Body font size = most frequent rounded size (rounded sizes below 1 count as 1).
let mut size_counts: std::collections::HashMap<u32, usize> = std::collections::HashMap::new();
for line in &merged_lines {
for frag in line {
let size_key = (frag.font_size.round() as u32).max(1);
*size_counts.entry(size_key).or_insert(0) += 1;
}
}
let body_font_size = size_counts
.into_iter()
.max_by_key(|(_, count)| *count)
.map(|(size, _)| size as f32)
.unwrap_or(12.0);
let mut headings: Vec<DetectedHeading> = Vec::new();
let mut sections: Vec<DetectedSection> = Vec::new();
// State of the section currently being collected; level 0 / no title means
// content that precedes the first heading.
let mut current_section_lines: Vec<String> = Vec::new();
let mut current_section_level: u8 = 0;
let mut current_section_title: Option<String> = None;
for line in &merged_lines {
let line_text: String = line.iter().map(|f| &f.text as &str).collect::<Vec<_>>().join(" ");
if line_text.trim().is_empty() {
continue;
}
let avg_font_size = line.iter().map(|f| f.font_size).sum::<f32>() / line.len().max(1) as f32;
// "Bold" is inferred from the font resource name only.
let is_bold = line.iter().any(|f| {
let name = f.font_name.to_lowercase();
name.contains("bold") || name.contains("heavy") || name.contains("black")
});
let word_count = line_text.split_whitespace().count();
let is_heading = if avg_font_size >= body_font_size * 2.0 {
true
} else if avg_font_size >= body_font_size * 1.5 {
true
} else if is_bold && word_count <= 10 && avg_font_size >= body_font_size * 1.1 {
true
} else {
false
};
if is_heading {
// A heading closes the section collected so far (if there is one).
if !current_section_lines.is_empty() || current_section_title.is_some() {
sections.push(DetectedSection {
title: current_section_title.clone(),
level: current_section_level,
content_lines: current_section_lines.clone(),
has_table: false, });
}
let level = if avg_font_size >= body_font_size * 2.0 {
1
} else if avg_font_size >= body_font_size * 1.5 {
2
} else {
3
};
headings.push(DetectedHeading {
level,
text: line_text.trim().to_string(),
page_hint: None,
});
current_section_title = Some(line_text.trim().to_string());
current_section_level = level;
current_section_lines = Vec::new();
} else {
current_section_lines.push(line_text.trim().to_string());
}
}
// Flush the last open section.
if !current_section_lines.is_empty() || current_section_title.is_some() {
sections.push(DetectedSection {
title: current_section_title,
level: current_section_level,
content_lines: current_section_lines,
has_table: false,
});
}
// Page estimate from the vertical span of all fragments, 800 units per page.
let y_min = all_fragments.iter().map(|f| f.y).fold(f32::INFINITY, f32::min);
let y_max = all_fragments.iter().map(|f| f.y).fold(f32::NEG_INFINITY, f32::max);
let estimated_pages = ((y_max - y_min) / 800.0).ceil().max(1.0) as u32;
Ok(DocumentStructure {
headings,
sections,
estimated_page_count: estimated_pages,
body_font_size,
})
}
/// One signature dictionary reported by `verify_pdf_signature`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SignatureInfo {
/// Value of `/Name`; empty when the entry is missing.
pub signer_name: String,
/// Value of `/Reason`, if present.
pub reason: Option<String>,
/// Value of `/Location`, if present.
pub location: Option<String>,
/// Value of `/M`, if present.
pub date: Option<String>,
/// Raw text of the `/ByteRange` array including its brackets, if present.
pub byte_range: Option<String>,
/// Verification result; `verify_pdf_signature` always sets `false`.
pub valid: bool,
}
/// Writes every image stream of a PDF into `output_dir` and returns the paths
/// of the files created.
///
/// A stream counts as an image when its `Subtype` entry is the string
/// `/Image` or `Image`. Streams filtered with `DCTDecode` are written as
/// `.jpg` (the bytes `FF D8` are prepended when the data does not already
/// start with them); all other images go through `decompress_if_needed` and
/// are written as `.bin`. Files are named `image_<index>.<object id>.<ext>`.
///
/// # Errors
/// Fails if the output directory cannot be created, the document cannot be
/// loaded, or a file cannot be written.
pub fn extract_images_from_pdf(input_path: &str, output_dir: &str) -> Result<Vec<String>> {
    use crate::pdf::{PdfDocument, PdfObject, PdfValue};
    use std::path::Path;

    fs::create_dir_all(output_dir)?;
    let doc = PdfDocument::load_from_file(input_path)?;

    let mut extracted: Vec<String> = Vec::new();
    for (obj_id, obj) in &doc.objects {
        let PdfObject::Stream { dictionary, data } = obj else {
            continue;
        };

        let subtype = match dictionary.get("Subtype") {
            Some(PdfValue::Object(PdfObject::String(s))) => Some(s.as_str()),
            _ => None,
        };
        if !matches!(subtype, Some("/Image") | Some("Image")) {
            continue;
        }

        let filter = match dictionary.get("Filter") {
            Some(PdfValue::Object(PdfObject::String(s))) => Some(s.as_str()),
            _ => None,
        };

        let (ext, raw_data) = if matches!(filter, Some("/DCTDecode") | Some("DCTDecode")) {
            let jpeg_data = if data.starts_with(&[0xFF, 0xD8]) {
                data.clone()
            } else {
                // Data lacking the two leading JPEG marker bytes gets them added.
                let mut prefixed = Vec::with_capacity(data.len() + 2);
                prefixed.extend_from_slice(&[0xFF, 0xD8]);
                prefixed.extend_from_slice(data);
                prefixed
            };
            ("jpg", jpeg_data)
        } else {
            ("bin", decompress_if_needed(data))
        };

        // The running index equals the number of images written so far.
        let filename = format!("image_{:03}.{}.{}", extracted.len(), obj_id, ext);
        let out_path = Path::new(output_dir).join(&filename);
        fs::write(&out_path, &raw_data)?;
        extracted.push(out_path.to_string_lossy().to_string());
    }

    Ok(extracted)
}
/// Builds a PDF whose catalog references a `/Collection` dictionary, embeds
/// the given files into it, and writes the result to `output_file`.
///
/// * `files` — `(path, description)` pairs. Each path is read from disk and
///   embedded under its final path component; if that component is missing or
///   is not valid UTF-8, the full path string is used as the name instead.
/// * `title` — when present, stored (escaped with `escape_pdf_meta`) as a
///   `/Title` entry on the catalog dictionary.
///
/// NOTE(review): the description half of each pair is currently not written
/// anywhere, although the collection schema declares a Description column.
///
/// # Errors
///
/// Fails if an input file cannot be read (`Cannot read <path>: <cause>`), if
/// embedding a file fails, or if `output_file` cannot be written.
pub fn create_portfolio_pdf(
    output_file: &str,
    files: &[(String, String)],
    title: Option<&str>,
) -> Result<()> {
    use crate::pdf::{PdfDocument, PdfObject, PdfValue};
    use std::collections::HashMap;

    // Wraps a piece of raw PDF syntax in the value type stored in dictionaries.
    let raw = |text: String| PdfValue::Object(PdfObject::String(text));

    // Start from an empty catalog registered as object 1.
    let mut doc = PdfDocument::new();
    let catalog_id = 1;
    doc.objects.insert(catalog_id, PdfObject::Dictionary(HashMap::new()));
    doc.catalog = catalog_id;

    // Embed every input file under its bare file name.
    for (path, _desc) in files {
        let data = std::fs::read(path)
            .map_err(|e| anyhow::anyhow!("Cannot read {}: {}", path, e))?;
        let filename = std::path::Path::new(path)
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or(path);
        doc.embed_file(filename, &data)?;
    }

    // Collection dictionary: view mode, column schema and sort order.
    let schema = format!(
        "<< {} >>",
        [
            "/Name << /Type /F /O << /D [ (Name) ] >> >>",
            "/Description << /Type /Desc /O << /D [ (Description) ] >> >>",
        ]
        .join(" ")
    );
    let mut collection_dict = HashMap::new();
    collection_dict.insert("Type".to_string(), raw("/Collection".to_string()));
    collection_dict.insert("View".to_string(), raw("/D".to_string()));
    collection_dict.insert("Schema".to_string(), raw(schema));
    collection_dict.insert("Sort".to_string(), raw("<< /S /Name /A true >>".to_string()));

    // Allocate the collection after embedding so its id follows every object
    // that `embed_file` added.
    let collection_id = doc.objects.keys().copied().max().unwrap_or(0) + 1;
    doc.objects.insert(collection_id, PdfObject::Dictionary(collection_dict));

    if let Some(PdfObject::Dictionary(catalog_dict)) = doc.objects.get_mut(&doc.catalog) {
        catalog_dict.insert(
            "Collection".to_string(),
            raw(format!("{} 0 R", collection_id)),
        );
        if let Some(t) = title {
            catalog_dict.insert(
                "Title".to_string(),
                raw(format!("({})", escape_pdf_meta(t))),
            );
        }
    }

    std::fs::write(output_file, doc.to_bytes())?;
    Ok(())
}
// Unit tests for the PDF manipulation helpers defined in this file.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pdf_metadata_info_dict() {
        let meta = PdfMetadata {
            title: Some("Test Title".into()),
            author: Some("Test Author".into()),
            subject: None,
            keywords: None,
            creator: None,
            custom_fields: std::collections::HashMap::new(),
        };
        let dict = meta.to_info_dict();
        assert!(dict.contains("/Title (Test Title)"));
        assert!(dict.contains("/Author (Test Author)"));
        assert!(dict.contains("/Producer (pdf-cli)"));
        // Unset fields must not be emitted at all.
        assert!(!dict.contains("/Subject"));
    }

    #[test]
    fn test_pdf_metadata_escape() {
        assert_eq!(escape_pdf_meta("hello (world)"), "hello \\(world\\)");
        assert_eq!(escape_pdf_meta("back\\slash"), "back\\\\slash");
    }

    #[test]
    fn test_pdf_metadata_default() {
        let meta = PdfMetadata::new();
        assert!(meta.title.is_none());
        assert!(meta.author.is_none());
        let dict = meta.to_info_dict();
        assert!(dict.contains("/Producer (pdf-cli)"));
    }

    #[test]
    fn test_split_invalid_range() {
        // A zero start page is rejected.
        let result = split_pdf("nonexistent.pdf", "out.pdf", 0, 5);
        assert!(result.is_err());
        // start > end is rejected.
        let result = split_pdf("nonexistent.pdf", "out.pdf", 5, 3);
        assert!(result.is_err());
    }

    #[test]
    fn test_merge_empty_input() {
        let result = merge_pdfs(&[], "out.pdf");
        assert!(result.is_err());
    }

    #[test]
    fn test_rotate_invalid_angle() {
        let result = rotate_pdf("nonexistent.pdf", "out.pdf", 45);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid rotation"));
    }

    #[test]
    fn test_rotate_valid_angles() {
        for angle in [0, 90, 180, 270] {
            // The input file does not exist, so the call still fails, but it
            // must not fail with the angle-validation message.
            let result = rotate_pdf("nonexistent.pdf", "out.pdf", angle);
            assert!(result.is_err());
            assert!(!result.unwrap_err().to_string().contains("Invalid rotation"));
        }
    }

    #[test]
    fn test_create_pdf_with_images_empty() {
        let result = create_pdf_with_images("out.pdf", &[]);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("No images"));
    }

    #[test]
    fn test_text_annotation_struct() {
        let annot = TextAnnotation {
            x: 100.0,
            y: 700.0,
            width: 200.0,
            height: 20.0,
            content: "A note".into(),
            title: "Author".into(),
        };
        assert_eq!(annot.content, "A note");
        assert_eq!(annot.x, 100.0);
    }

    #[test]
    fn test_link_annotation_struct() {
        let link = LinkAnnotation {
            x: 72.0,
            y: 500.0,
            width: 100.0,
            height: 15.0,
            url: "https://example.com".into(),
        };
        assert_eq!(link.url, "https://example.com");
    }

    #[test]
    fn test_reorder_empty() {
        let result = reorder_pages("nonexistent.pdf", "out.pdf", &[]);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("empty"));
    }

    #[test]
    fn test_build_watermark_stream() {
        let layout = crate::pdf_generator::PageLayout::portrait();
        let stream = build_watermark_stream("DRAFT", 48.0, 0.3, &layout);
        let content = String::from_utf8_lossy(&stream);
        assert!(content.contains("(DRAFT) Tj"));
        // 0.7071 ~= sin/cos of 45 degrees; presumably the rotation matrix
        // entries of the diagonal text.
        assert!(content.contains("0.7071"));
        // Graphics state must be saved and restored around the watermark.
        assert!(content.contains("q\n"));
        assert!(content.contains("Q\n"));
    }

    #[test]
    fn test_highlight_annotation_struct() {
        let hl = HighlightAnnotation {
            x: 72.0,
            y: 700.0,
            width: 200.0,
            height: 12.0,
            color_r: 1.0,
            color_g: 1.0,
            color_b: 0.0,
        };
        assert_eq!(hl.color_r, 1.0);
        assert_eq!(hl.color_g, 1.0);
        assert_eq!(hl.color_b, 0.0);
    }

    #[test]
    fn test_color_constructors() {
        let black = crate::pdf_generator::Color::black();
        assert_eq!(black.r, 0.0);
        assert_eq!(black.g, 0.0);
        assert_eq!(black.b, 0.0);
        let red = crate::pdf_generator::Color::red();
        assert_eq!(red.r, 1.0);
        let custom = crate::pdf_generator::Color::rgb(0.2, 0.4, 0.6);
        assert_eq!(custom.r, 0.2);
        assert_eq!(custom.g, 0.4);
        assert_eq!(custom.b, 0.6);
    }

    #[test]
    fn test_custom_metadata_fields() {
        let mut metadata = PdfMetadata::new();
        metadata.add_custom_field("CustomField1".to_string(), "Value1".to_string());
        metadata.add_custom_field("CustomField2".to_string(), "Value2".to_string());
        assert_eq!(metadata.get_custom_field("CustomField1"), Some(&"Value1".to_string()));
        assert_eq!(metadata.get_custom_field("CustomField2"), Some(&"Value2".to_string()));
        assert_eq!(metadata.get_custom_field("NonExistent"), None);
        let removed = metadata.remove_custom_field("CustomField1");
        assert_eq!(removed, Some("Value1".to_string()));
        assert_eq!(metadata.get_custom_field("CustomField1"), None);
        // Remaining custom fields still show up in the info dictionary.
        let dict = metadata.to_info_dict();
        assert!(dict.contains("/CustomField2 (Value2)"));
    }

    #[test]
    fn test_metadata_info_dict_with_custom_fields() {
        let mut metadata = PdfMetadata {
            title: Some("Test Title".to_string()),
            author: Some("Test Author".to_string()),
            creator: Some("Test Creator".to_string()),
            ..Default::default()
        };
        metadata.add_custom_field("Version".to_string(), "1.0".to_string());
        metadata.add_custom_field("Company".to_string(), "ACME Corp".to_string());
        let dict = metadata.to_info_dict();
        assert!(dict.contains("/Title (Test Title)"));
        assert!(dict.contains("/Author (Test Author)"));
        assert!(dict.contains("/Creator (Test Creator)"));
        assert!(dict.contains("/Version (1.0)"));
        assert!(dict.contains("/Company (ACME Corp)"));
        assert!(dict.contains("/Producer (pdf-cli)"));
    }

    #[test]
    fn test_merge_metadata() {
        let mut base = PdfMetadata {
            title: Some("Base Title".to_string()),
            author: Some("Base Author".to_string()),
            ..Default::default()
        };
        base.add_custom_field("BaseField".to_string(), "BaseValue".to_string());
        let mut new_meta = PdfMetadata {
            title: Some("New Title".to_string()),
            subject: Some("New Subject".to_string()),
            ..Default::default()
        };
        new_meta.add_custom_field("NewField".to_string(), "NewValue".to_string());
        let merged = merge_metadata(&base, &new_meta);
        // A value set on the new metadata wins over the base value.
        assert_eq!(merged.title, Some("New Title".to_string()));
        // A value missing from the new metadata is kept from the base.
        assert_eq!(merged.author, Some("Base Author".to_string()));
        assert_eq!(merged.subject, Some("New Subject".to_string()));
        // Custom fields from both sides are present.
        assert_eq!(merged.get_custom_field("BaseField"), Some(&"BaseValue".to_string()));
        assert_eq!(merged.get_custom_field("NewField"), Some(&"NewValue".to_string()));
    }

    #[test]
    fn test_unescape_pdf_string() {
        assert_eq!(unescape_pdf_string("hello"), "hello");
        assert_eq!(unescape_pdf_string(r"hello\(world\)"), "hello(world)");
        assert_eq!(unescape_pdf_string(r"line1\nline2"), "line1\nline2");
        assert_eq!(unescape_pdf_string(r"tab\there"), "tab\there");
        // Octal escapes: \050 is '(' and \051 is ')'.
        assert_eq!(unescape_pdf_string(r"\050"), "(");
        assert_eq!(unescape_pdf_string(r"\051"), ")");
    }

    #[test]
    fn test_extract_pdf_string_field() {
        let content = r"<< /Title (Test Title) /Author (Test \(Author\) ) /Subject None >>";
        assert_eq!(extract_pdf_string_field(content, "/Title"), Some("Test Title".to_string()));
        assert_eq!(extract_pdf_string_field(content, "/Author"), Some("Test (Author) ".to_string()));
        // `/Subject` is present but is not a parenthesised string.
        assert_eq!(extract_pdf_string_field(content, "/Subject"), None);
        assert_eq!(extract_pdf_string_field(content, "/NonExistent"), None);
    }

    #[test]
    fn test_form_field_struct() {
        let field = FormField {
            name: "firstName".to_string(),
            field_type: FormFieldType::Text,
            x: 100.0,
            y: 700.0,
            width: 200.0,
            height: 20.0,
            default_value: Some("John".to_string()),
            options: vec![],
            required: true,
        };
        assert_eq!(field.name, "firstName");
        assert_eq!(field.field_type, FormFieldType::Text);
        assert!(field.required);
        assert_eq!(field.default_value, Some("John".to_string()));
    }

    #[test]
    fn test_field_type_to_pdf() {
        assert_eq!(field_type_to_pdf(&FormFieldType::Text), "/Tx");
        assert_eq!(field_type_to_pdf(&FormFieldType::Checkbox), "/Btn");
        assert_eq!(field_type_to_pdf(&FormFieldType::Radio), "/Btn");
        assert_eq!(field_type_to_pdf(&FormFieldType::Dropdown), "/Ch");
    }

    #[test]
    fn test_create_form_field_dict_text() {
        let field = FormField {
            name: "username".to_string(),
            field_type: FormFieldType::Text,
            x: 50.0,
            y: 600.0,
            width: 150.0,
            height: 18.0,
            default_value: Some("default".to_string()),
            options: vec![],
            required: false,
        };
        let dict = create_form_field_dict(&field);
        assert!(dict.contains("/Type /Annot"));
        assert!(dict.contains("/Subtype /Widget"));
        assert!(dict.contains("/T (username)"));
        assert!(dict.contains("/FT /Tx"));
        assert!(dict.contains("/V (default)"));
        // Rect is [x y x+width y+height].
        assert!(dict.contains("/Rect [50 600 200 618]"));
    }

    #[test]
    fn test_create_form_field_dict_checkbox() {
        let field = FormField {
            name: "agree".to_string(),
            field_type: FormFieldType::Checkbox,
            x: 50.0,
            y: 550.0,
            width: 15.0,
            height: 15.0,
            default_value: None,
            options: vec![],
            required: true,
        };
        let dict = create_form_field_dict(&field);
        assert!(dict.contains("/FT /Btn"));
        assert!(dict.contains("/T (agree)"));
        // Field flag value 2 is the "Required" bit.
        assert!(dict.contains("/Ff 2"));
        // With no default value the checkbox starts unchecked.
        assert!(dict.contains("/V /Off"));
    }

    #[test]
    fn test_create_form_field_dict_dropdown() {
        let field = FormField {
            name: "country".to_string(),
            field_type: FormFieldType::Dropdown,
            x: 50.0,
            y: 500.0,
            width: 100.0,
            height: 20.0,
            default_value: Some("USA".to_string()),
            options: vec!["USA".to_string(), "Canada".to_string(), "Mexico".to_string()],
            required: false,
        };
        let dict = create_form_field_dict(&field);
        assert!(dict.contains("/FT /Ch"));
        assert!(dict.contains("/T (country)"));
        assert!(dict.contains("/V (USA)"));
        assert!(dict.contains("(USA)"));
        assert!(dict.contains("(Canada)"));
        assert!(dict.contains("(Mexico)"));
        // 131072 = 1 << 17, the "Combo" flag for choice fields.
        assert!(dict.contains("/Ff 131072"));
    }

    #[test]
    fn test_build_text_watermark_positions() {
        let layout = crate::pdf_generator::PageLayout::portrait();
        let center_stream =
            build_text_watermark_stream("TEST", 24.0, 0.5, &layout, WatermarkPosition::Center);
        assert!(String::from_utf8_lossy(&center_stream).contains("(TEST) Tj"));
        let diagonal_stream =
            build_text_watermark_stream("DRAFT", 48.0, 0.3, &layout, WatermarkPosition::Diagonal);
        let content = String::from_utf8_lossy(&diagonal_stream);
        assert!(content.contains("(DRAFT) Tj"));
        // 0.707 ~= sin/cos of 45 degrees, expected for the diagonal placement.
        assert!(content.contains("0.707"));
    }

    #[test]
    fn test_watermark_position_variants() {
        let layout = crate::pdf_generator::PageLayout::portrait();
        for position in [
            WatermarkPosition::Center,
            WatermarkPosition::TopLeft,
            WatermarkPosition::TopRight,
            WatermarkPosition::BottomLeft,
            WatermarkPosition::BottomRight,
            WatermarkPosition::Diagonal,
        ] {
            let stream = build_text_watermark_stream("TEST", 24.0, 0.5, &layout, position);
            assert!(!stream.is_empty());
        }
    }

    #[test]
    fn test_image_watermark_stream() {
        let layout = crate::pdf_generator::PageLayout::portrait();
        let image_info = crate::image::ImageInfo {
            format: crate::image::ImageFormat::Jpeg,
            width: 800,
            height: 600,
            data: vec![],
            bits_per_component: 8,
            color_components: 3,
            alt_text: None,
        };
        let result =
            build_image_watermark_stream(&image_info, 0.5, &layout, WatermarkPosition::Center);
        assert!(result.is_ok());
        let stream = result.unwrap();
        let content = String::from_utf8_lossy(&stream);
        assert!(content.contains("/Im1 Do"));
        assert!(content.contains("q\n"));
        assert!(content.contains("Q\n"));
    }
}
// Property-based tests (plus one filesystem round-trip test) for the metadata
// helpers and the portfolio builder.
#[cfg(test)]
mod proptest_tests {
    use proptest::prelude::*;
    use super::*;

    proptest! {
        // Merging the same `new_meta` a second time must not change the
        // resulting title or author.
        #[test]
        fn merge_metadata_idempotent(base_title in ".*", base_author in ".*",
                                     new_title in ".*", new_author in ".*") {
            let mut base = PdfMetadata::new();
            base.title = Some(base_title);
            base.author = Some(base_author);
            let mut new_meta = PdfMetadata::new();
            new_meta.title = Some(new_title);
            new_meta.author = Some(new_author);
            let merged1 = merge_metadata(&base, &new_meta);
            let merged2 = merge_metadata(&merged1, &new_meta);
            // `prop_assert_eq!` reports a test-case failure instead of
            // panicking; compare by reference so nothing is moved out.
            prop_assert_eq!(&merged1.title, &merged2.title);
            prop_assert_eq!(&merged1.author, &merged2.author);
        }
    }

    proptest! {
        // A custom field can be read back after insertion and is gone after
        // removal, for arbitrary values.
        #[test]
        fn custom_fields_preserved(key in "[a-zA-Z0-9_]{1,20}", value in ".*") {
            let mut metadata = PdfMetadata::new();
            metadata.add_custom_field(key.clone(), value.clone());
            prop_assert_eq!(metadata.get_custom_field(&key), Some(&value));
            let removed = metadata.remove_custom_field(&key);
            prop_assert_eq!(removed, Some(value));
            prop_assert_eq!(metadata.get_custom_field(&key), None);
        }
    }

    proptest! {
        // Every parenthesis or backslash in the input must appear in its
        // escaped form somewhere in the output.
        #[test]
        fn escape_pdf_meta_roundtrip(s in ".*") {
            let escaped = escape_pdf_meta(&s);
            for c in s.chars() {
                let expected = match c {
                    '(' => r"\(",
                    ')' => r"\)",
                    '\\' => r"\\",
                    _ => continue,
                };
                // An explicit message is given so the macro does not build a
                // format string out of the stringified condition.
                prop_assert!(
                    escaped.contains(expected),
                    "expected escape sequence {:?} in {:?}",
                    expected,
                    escaped
                );
            }
        }
    }

    #[test]
    fn test_create_portfolio_pdf() {
        // Suffix the fixture names with the process id so concurrent test
        // runs sharing one temp directory do not clobber each other's files.
        let pid = std::process::id();
        let tmp_dir = std::env::temp_dir();
        let file1 = tmp_dir.join(format!("portfolio_test1_{}.txt", pid));
        let file2 = tmp_dir.join(format!("portfolio_test2_{}.csv", pid));
        let output = tmp_dir.join(format!("test_portfolio_{}.pdf", pid));
        std::fs::write(&file1, b"Hello from file 1").unwrap();
        std::fs::write(&file2, b"a,b,c\n1,2,3").unwrap();
        let files = vec![
            (file1.to_string_lossy().to_string(), "Text file".to_string()),
            (file2.to_string_lossy().to_string(), "CSV data".to_string()),
        ];
        create_portfolio_pdf(
            &output.to_string_lossy(),
            &files,
            Some("Test Portfolio"),
        ).unwrap();
        assert!(output.exists(), "Portfolio PDF should be created");
        let bytes = std::fs::read(&output).unwrap();
        let content = String::from_utf8_lossy(&bytes);
        assert!(content.contains("/Collection"), "Should contain /Collection");
        assert!(content.contains("/EmbeddedFile"), "Should contain /EmbeddedFile");
        assert!(content.contains("/Filespec"), "Should contain /Filespec");
        assert!(content.contains("Test Portfolio"), "Should contain portfolio title");
        assert!(content.starts_with("%PDF-"), "Should be a valid PDF");
        // Best-effort cleanup; failures here are not test failures.
        let _ = std::fs::remove_file(&file1);
        let _ = std::fs::remove_file(&file2);
        let _ = std::fs::remove_file(&output);
    }
}