use crate::error::{PdfError, Result};
use std::collections::HashMap;
/// Heuristic summary of a PDF file produced by [`parse_pdf`].
///
/// All fields come from a lossy text scan of the raw bytes, not a full
/// object-model parse, so every value is best-effort.
#[derive(Debug, Clone)]
pub struct ParsedPdf {
    // Version taken from the `%PDF-` header line (e.g. "1.4").
    pub version: String,
    // Entries found in the document catalog object, if one was located.
    pub catalog: Option<HashMap<String, String>>,
    // Root `/Pages` node information, if found.
    pub page_tree: Option<PageTree>,
    // Sorted, deduplicated names of recognized standard fonts.
    pub fonts: Vec<String>,
    // True if RGB usage was detected (colour-space name, operators, or heuristics).
    pub uses_device_rgb: bool,
    // True if CMYK usage was detected.
    pub uses_device_cmyk: bool,
    // True if grayscale usage was detected.
    pub uses_device_gray: bool,
    // One entry per content line that contains graphics-state operators.
    pub graphics_states: Vec<GraphicsState>,
    // Text shown by `Tj`/`TJ` operators between `BT`/`ET` pairs.
    pub text_objects: Vec<TextObject>,
    // Annotations discovered via `/Type /Annot` dictionaries.
    pub annotations: Vec<Annotation>,
    // True when both "xref" and "%%EOF" markers are present in the file.
    pub xref_valid: bool,
    // Number of " obj" occurrences — an approximate indirect-object count.
    pub object_count: usize,
}
/// Information about the root `/Pages` node of the page tree.
#[derive(Debug, Clone)]
pub struct PageTree {
    // Always "Pages" for the node found by the scanner.
    pub root_type: String,
    // Value of the `/Count` entry, or 0 if missing or unparsable.
    pub page_count: usize,
    // Raw whitespace-split tokens of each `/Kids` array found.
    pub kids_arrays: Vec<Vec<String>>,
}
/// Graphics-state values captured from one content-stream line.
#[derive(Debug, Clone)]
pub struct GraphicsState {
    // Operand of the `w` (line width) operator, if present and numeric.
    pub line_width: Option<f64>,
    // Operand of the `J` (line cap) operator, if present and numeric.
    pub line_cap: Option<i32>,
    // Operand of the `j` (line join) operator, when extracted.
    pub line_join: Option<i32>,
    // Operands preceding an `rg` (fill colour) operator, when extracted.
    pub fill_color: Option<String>,
    // Operands preceding an `RG` (stroke colour) operator, when extracted.
    pub stroke_color: Option<String>,
}
/// A piece of text shown inside a `BT`/`ET` text object.
#[derive(Debug, Clone)]
pub struct TextObject {
    // Font resource name from the most recent `Tf` operator (e.g. "/F1").
    pub font: Option<String>,
    // Font size operand from the most recent `Tf` operator.
    pub font_size: Option<f64>,
    // Literal string operand of the `Tj`/`TJ` operator.
    pub text_content: String,
}
/// An annotation dictionary discovered in the document.
#[derive(Debug, Clone)]
pub struct Annotation {
    // Value of the `/Subtype` entry (e.g. "Link", "Text").
    pub subtype: String,
    // `/Rect` values [llx, lly, urx, ury], if present and all numeric.
    pub rect: Option<[f64; 4]>,
    // `/Contents` string, if present; hex strings are prefixed with "hex:".
    pub contents: Option<String>,
}
/// Heuristically parse `pdf_bytes` into a [`ParsedPdf`] summary.
///
/// The bytes are interpreted as lossy UTF-8 text and scanned with simple
/// pattern matching, so compressed streams are largely opaque to it.
///
/// # Errors
///
/// Returns a parse error when the `%PDF-` header line is missing.
pub fn parse_pdf(pdf_bytes: &[u8]) -> Result<ParsedPdf> {
    // Binary stream bytes degrade to U+FFFD, but the textual structure survives.
    let text = String::from_utf8_lossy(pdf_bytes);
    let version = extract_version(&text)?;
    Ok(ParsedPdf {
        version,
        catalog: extract_catalog(&text),
        page_tree: extract_page_tree(&text),
        fonts: extract_fonts(&text),
        uses_device_rgb: detect_rgb_usage(&text),
        uses_device_cmyk: detect_cmyk_usage(&text),
        uses_device_gray: detect_gray_usage(&text),
        graphics_states: extract_graphics_states(&text),
        text_objects: extract_text_objects(&text),
        annotations: extract_annotations(&text),
        xref_valid: validate_xref(&text),
        object_count: count_objects(&text),
    })
}
/// Read the PDF version from the `%PDF-` header on the first line.
///
/// # Errors
///
/// Returns [`PdfError::ParseError`] when the first line does not start
/// with `%PDF-`.
fn extract_version(pdf_text: &str) -> Result<String> {
    pdf_text
        .lines()
        .next()
        .and_then(|header| header.strip_prefix("%PDF-"))
        .map(str::to_string)
        .ok_or_else(|| PdfError::ParseError("No valid PDF header found".to_string()))
}
/// Locate the document catalog object and pull out a few well-known entries.
///
/// Scans for a `/Type /Catalog` marker (several spacing variants), backs up
/// to the nearest preceding `" obj"` keyword, and reads entries up to the
/// following `endobj`. Returns `None` if no catalog marker is found.
fn extract_catalog(pdf_text: &str) -> Option<HashMap<String, String>> {
    // Spacing variants of the catalog type marker. The previous list held
    // the literal "/Type /Catalog" twice; the duplicate only re-scanned the
    // text for no effect and has been removed.
    let catalog_patterns = ["/Type /Catalog", "/Type/Catalog", "/Type Catalog"];
    for pattern in &catalog_patterns {
        if let Some(pattern_pos) = pdf_text.find(pattern) {
            let before_pattern = &pdf_text[..pattern_pos];
            // Back up to the " obj" keyword that opens the enclosing object.
            if let Some(obj_start) = before_pattern.rfind(" obj") {
                let from_obj = &pdf_text[obj_start..];
                if let Some(end) = from_obj.find("endobj") {
                    let catalog_content = &from_obj[..end];
                    let mut catalog = HashMap::new();
                    // Record the type only if the marker actually falls inside
                    // this object's obj..endobj span.
                    for type_pattern in &catalog_patterns {
                        if catalog_content.contains(type_pattern) {
                            catalog.insert("Type".to_string(), "Catalog".to_string());
                            break;
                        }
                    }
                    if let Some(version_match) = extract_dict_entry(catalog_content, "Version") {
                        catalog.insert("Version".to_string(), version_match);
                    }
                    if let Some(pages_match) = extract_dict_entry(catalog_content, "Pages") {
                        catalog.insert("Pages".to_string(), pages_match);
                    }
                    return Some(catalog);
                }
            }
        }
    }
    None
}
/// Locate the root `/Type /Pages` object and read its `/Count` and `/Kids`.
///
/// Returns `None` when no `/Pages` marker is found or the object has no
/// terminating `endobj`.
fn extract_page_tree(pdf_text: &str) -> Option<PageTree> {
    // Spacing variants of the pages type marker. The previous list held the
    // literal "/Type /Pages" twice; the redundant duplicate has been removed.
    let pages_patterns = ["/Type /Pages", "/Type/Pages", "/Type Pages"];
    for pattern in &pages_patterns {
        if let Some(pages_start) = pdf_text.find(pattern) {
            let pages_section = &pdf_text[pages_start..];
            if let Some(end) = pages_section.find("endobj") {
                let pages_content = &pages_section[..end];
                // /Count value; extract_dict_entry already yields a single
                // token, so the extra split is a defensive no-op.
                let page_count = extract_dict_entry(pages_content, "Count")
                    .and_then(|s| {
                        let cleaned = s.split_whitespace().next().unwrap_or("0");
                        cleaned.parse::<usize>().ok()
                    })
                    .unwrap_or(0);
                let mut kids_arrays = Vec::new();
                if let Some(kids_match) = extract_array_entry(pages_content, "Kids") {
                    kids_arrays.push(kids_match);
                }
                return Some(PageTree {
                    root_type: "Pages".to_string(),
                    page_count,
                    kids_arrays,
                });
            }
        }
    }
    None
}
/// Collect the names of recognized standard fonts referenced in the file.
///
/// Only lines containing `/Type /Font` or `/BaseFont` are inspected, and
/// only a fixed set of base-14 family substrings is recognized. Any font
/// whose name contains "Times" is canonicalized to "Times-Roman". The
/// result is sorted and deduplicated.
fn extract_fonts(pdf_text: &str) -> Vec<String> {
    // (substring to look for, canonical name to report)
    const KNOWN_FONTS: [(&str, &str); 5] = [
        ("Helvetica", "Helvetica"),
        ("Times", "Times-Roman"),
        ("Courier", "Courier"),
        ("Symbol", "Symbol"),
        ("ZapfDingbats", "ZapfDingbats"),
    ];
    let mut fonts: Vec<String> = Vec::new();
    for line in pdf_text.lines() {
        if !(line.contains("/Type /Font") || line.contains("/BaseFont")) {
            continue;
        }
        for (needle, canonical) in KNOWN_FONTS {
            if line.contains(needle) {
                fonts.push(canonical.to_string());
            }
        }
    }
    fonts.sort();
    fonts.dedup();
    fonts
}
/// Capture graphics-state operators from content-stream lines.
///
/// One [`GraphicsState`] is produced per line that contains any of the
/// operators ` w`, ` J`, ` j`, ` rg`, ` RG`. Previously the trigger
/// condition checked for `j`, `rg` and `RG` but never filled the
/// corresponding fields; all five are now extracted. The operand token(s)
/// immediately preceding each operator are used; for a repeated operator
/// on one line, the last occurrence wins.
fn extract_graphics_states(pdf_text: &str) -> Vec<GraphicsState> {
    let mut states = Vec::new();
    for line in pdf_text.lines() {
        let looks_relevant = line.contains(" w")
            || line.contains(" J")
            || line.contains(" j")
            || line.contains(" rg")
            || line.contains(" RG");
        if !looks_relevant {
            continue;
        }
        let tokens: Vec<&str> = line.split_whitespace().collect();
        let mut state = GraphicsState {
            line_width: None,
            line_cap: None,
            line_join: None,
            fill_color: None,
            stroke_color: None,
        };
        for (i, tok) in tokens.iter().enumerate() {
            match *tok {
                // Single-operand operators: take the preceding token.
                "w" if i >= 1 => state.line_width = tokens[i - 1].parse().ok(),
                "J" if i >= 1 => state.line_cap = tokens[i - 1].parse().ok(),
                "j" if i >= 1 => state.line_join = tokens[i - 1].parse().ok(),
                // Colour operators take three operands: "r g b rg" / "r g b RG".
                "rg" if i >= 3 => state.fill_color = Some(tokens[i - 3..i].join(" ")),
                "RG" if i >= 3 => state.stroke_color = Some(tokens[i - 3..i].join(" ")),
                _ => {}
            }
        }
        states.push(state);
    }
    states
}
/// Collect text shown between `BT`/`ET` pairs.
///
/// Tracks the most recent `Tf` (font + size) inside the current text
/// object and records one [`TextObject`] for each `Tj`/`TJ` line whose
/// literal string can be extracted.
fn extract_text_objects(pdf_text: &str) -> Vec<TextObject> {
    let mut objects = Vec::new();
    let mut active = false;
    let mut font: Option<String> = None;
    let mut size: Option<f64> = None;
    for line in pdf_text.lines() {
        if line.contains("BT") {
            // Entering a text object resets the font state.
            active = true;
            font = None;
            size = None;
            continue;
        }
        if line.contains("ET") {
            active = false;
            continue;
        }
        if !active {
            continue;
        }
        if line.contains(" Tf") {
            // Expected shape: "/F1 12 Tf" — font name, size, operator.
            let tokens: Vec<&str> = line.split_whitespace().collect();
            if tokens.len() >= 3 {
                font = Some(tokens[0].to_string());
                size = tokens[1].parse().ok();
            }
        }
        if line.contains(" Tj") || line.contains(" TJ") {
            if let Some(text_content) = extract_text_content(line) {
                objects.push(TextObject {
                    font: font.clone(),
                    font_size: size,
                    text_content,
                });
            }
        }
    }
    objects
}
/// Find annotation dictionaries (`/Type /Annot`) and read their
/// `/Subtype`, `/Rect` and `/Contents` entries.
fn extract_annotations(pdf_text: &str) -> Vec<Annotation> {
    let mut annotations = Vec::new();
    if !pdf_text.contains("/Type /Annot") {
        return annotations;
    }
    // Roughly segment the file on object boundaries.
    for section in pdf_text.split(" obj") {
        if !(section.contains("/Type /Annot") && section.contains("/Subtype")) {
            continue;
        }
        if let Some(subtype) = extract_dict_entry(section, "Subtype") {
            let rect = extract_array_entry(section, "Rect").and_then(|arr| parse_rect_array(&arr));
            annotations.push(Annotation {
                subtype,
                rect,
                contents: extract_string_entry(section, "Contents"),
            });
        }
    }
    annotations
}
/// Cheap structural sanity check: the file mentions both an `xref` table
/// and the `%%EOF` end-of-file marker somewhere.
fn validate_xref(pdf_text: &str) -> bool {
    ["xref", "%%EOF"].iter().all(|marker| pdf_text.contains(marker))
}
/// Approximate the number of indirect objects by counting " obj" keywords.
/// ("endobj" is not counted: it lacks the leading space before "obj".)
fn count_objects(pdf_text: &str) -> usize {
    pdf_text.match_indices(" obj").count()
}
/// Read the value following `/key` in dictionary-like text.
///
/// Indirect references are returned whole ("2 0 R"); any other value is
/// the first whitespace-separated token with a leading '/' stripped.
/// Returns `None` when the key is absent or has no value tokens.
fn extract_dict_entry(content: &str, key: &str) -> Option<String> {
    let needle = format!("/{}", key);
    let start = content.find(&needle)?;
    let mut tokens = content[start + needle.len()..].split_whitespace();
    let first = tokens.next()?;
    let (second, third) = (tokens.next(), tokens.next());
    // Indirect reference pattern: "<object> <generation> R".
    if let (Some(generation), Some("R")) = (second, third) {
        return Some(format!("{} {} R", first, generation));
    }
    Some(first.trim_start_matches('/').to_string())
}
/// Convert four string tokens into a rectangle `[llx, lly, urx, ury]`.
///
/// Returns `None` unless the slice holds exactly four tokens that all
/// parse as `f64`.
fn parse_rect_array(arr: &[String]) -> Option<[f64; 4]> {
    if arr.len() != 4 {
        return None;
    }
    let mut rect = [0.0f64; 4];
    for (slot, raw) in rect.iter_mut().zip(arr) {
        // Any non-numeric token invalidates the whole rectangle.
        *slot = raw.parse().ok()?;
    }
    Some(rect)
}
/// Read a PDF string value following `/key`.
///
/// Literal strings `(…)` return their inner text (escape sequences are
/// not interpreted); hex strings `<…>` return the digits prefixed with
/// "hex:". A `<<` (dictionary opener) is not treated as a string.
fn extract_string_entry(content: &str, key: &str) -> Option<String> {
    let needle = format!("/{}", key);
    let start = content.find(&needle)?;
    let value = content[start + needle.len()..].trim_start();
    if let Some(rest) = value.strip_prefix('(') {
        // Up to the first ')': nested or escaped parens are not handled.
        return rest.find(')').map(|end| rest[..end].to_string());
    }
    if value.starts_with('<') && !value.starts_with("<<") {
        let rest = &value[1..];
        return rest.find('>').map(|end| format!("hex:{}", &rest[..end]));
    }
    None
}
/// Read the array value following `/key`, split into whitespace tokens.
///
/// Previously the match required exactly one space between the key and
/// `[` ("/Kids ["), which missed the common compact form "/Kids[…]" and
/// multi-space/newline layouts. Now any amount of whitespace between the
/// key and the opening bracket is accepted; every occurrence of the key
/// is tried until one is followed by an array. This accepts a strict
/// superset of the inputs the old version did.
fn extract_array_entry(content: &str, key: &str) -> Option<Vec<String>> {
    let needle = format!("/{}", key);
    for (pos, _) in content.match_indices(&needle) {
        let after = content[pos + needle.len()..].trim_start();
        if let Some(rest) = after.strip_prefix('[') {
            if let Some(end) = rest.find(']') {
                let elements = rest[..end]
                    .split_whitespace()
                    .map(str::to_string)
                    .collect();
                return Some(elements);
            }
        }
    }
    None
}
/// Return the token immediately preceding the first occurrence of
/// `operator` on the line (postfix operand convention: "2 w" -> "2").
/// Returns `None` if the operator is absent or is the first token.
fn extract_graphics_operator(line: &str, operator: &str) -> Option<String> {
    let tokens: Vec<&str> = line.split_whitespace().collect();
    tokens
        .windows(2)
        .find(|pair| pair[1] == operator)
        .map(|pair| pair[0].to_string())
}
/// Extract the text inside the first `(…)` literal string on the line.
///
/// The closing parenthesis is now searched for AFTER the opening one.
/// Previously `line.find(')')` scanned from the start of the line, so a
/// stray ')' occurring before the '(' (e.g. "a) (Hello) Tj") made the
/// function return `None` even though a well-formed literal followed.
/// Escape sequences and nested parentheses are still not handled.
fn extract_text_content(line: &str) -> Option<String> {
    let start = line.find('(')?;
    let rest = &line[start + 1..];
    let end = rest.find(')')?;
    Some(rest[..end].to_string())
}
/// Detect RGB colour usage via several escalating heuristics:
/// 1. an explicit `/DeviceRGB` name,
/// 2. an `rg`/`RG` operator preceded by three numeric operands,
/// 3. a `/ColorSpace` entry mentioning "RGB",
/// 4. fallback: presence of `/Contents` and `/Length`.
///
/// NOTE(review): heuristic 4 reports RGB for ANY document with a content
/// stream, regardless of its colours — presumably a workaround for
/// compressed streams where operators are not visible as text. Confirm
/// whether this over-broad fallback is intentional.
fn detect_rgb_usage(pdf_text: &str) -> bool {
    if pdf_text.contains("/DeviceRGB") {
        return true;
    }
    // Scan each line for "r g b rg" / "r g b RG" operator patterns.
    let has_rgb_operator = pdf_text.lines().any(|line| {
        let words: Vec<&str> = line.split_whitespace().collect();
        words.windows(4).any(|w| {
            (w[3] == "rg" || w[3] == "RG") && w[..3].iter().all(|tok| tok.parse::<f64>().is_ok())
        })
    });
    if has_rgb_operator {
        return true;
    }
    if pdf_text.contains("/ColorSpace") && pdf_text.contains("RGB") {
        return true;
    }
    // Over-broad fallback — see NOTE(review) above.
    if pdf_text.contains("/Contents") && pdf_text.contains("/Length") {
        return true;
    }
    false
}
/// Detect CMYK colour usage: an explicit `/DeviceCMYK` name, or a `k`/`K`
/// operator preceded by four numeric operands on one line.
fn detect_cmyk_usage(pdf_text: &str) -> bool {
    if pdf_text.contains("/DeviceCMYK") {
        return true;
    }
    // Scan each line for "c m y k k" / "c m y k K" operator patterns.
    pdf_text.lines().any(|line| {
        let words: Vec<&str> = line.split_whitespace().collect();
        words.windows(5).any(|w| {
            (w[4] == "k" || w[4] == "K") && w[..4].iter().all(|tok| tok.parse::<f64>().is_ok())
        })
    })
}
/// Detect grayscale usage: an explicit `/DeviceGray` name, a `g`/`G`
/// operator preceded by one numeric operand, or a `/ColorSpace` entry
/// mentioning "Gray".
fn detect_gray_usage(pdf_text: &str) -> bool {
    if pdf_text.contains("/DeviceGray") {
        return true;
    }
    // Scan each line for "v g" / "v G" operator patterns.
    let has_gray_operator = pdf_text.lines().any(|line| {
        let words: Vec<&str> = line.split_whitespace().collect();
        words
            .windows(2)
            .any(|w| (w[1] == "g" || w[1] == "G") && w[0].parse::<f64>().is_ok())
    });
    if has_gray_operator {
        return true;
    }
    pdf_text.contains("/ColorSpace") && pdf_text.contains("Gray")
}
#[cfg(test)]
mod tests {
    use super::*;

    // The version is whatever follows "%PDF-" on the first line.
    #[test]
    fn test_extract_version() {
        let pdf_content = "%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n%%EOF";
        let result = extract_version(pdf_content).unwrap();
        assert_eq!(result, "1.4");
    }

    // The catalog scanner should record the object type and the /Pages
    // indirect reference as whole "N G R" strings.
    #[test]
    fn test_extract_catalog() {
        let pdf_content = "1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj";
        let catalog = extract_catalog(pdf_content).unwrap();
        assert_eq!(catalog.get("Type"), Some(&"Catalog".to_string()));
        assert_eq!(catalog.get("Pages"), Some(&"2 0 R".to_string()));
    }

    // Both /Type /Font and bare /BaseFont lines should be recognized;
    // "Times-Roman" is the canonical name for any Times variant.
    #[test]
    fn test_extract_fonts() {
        let pdf_content =
            "<<\n/Type /Font\n/BaseFont /Helvetica\n>>\n<<\n/BaseFont /Times-Roman\n>>";
        let fonts = extract_fonts(pdf_content);
        assert!(fonts.contains(&"Helvetica".to_string()));
        assert!(fonts.contains(&"Times-Roman".to_string()));
    }

    // An uncompressed "1 0 0 rg" operator plus /DeviceRGB should flag RGB
    // (but not CMYK) through the full parse_pdf pipeline.
    #[test]
    fn test_color_space_detection() {
        let pdf_content = "%PDF-1.4\nstream\n1 0 0 rg\n/DeviceRGB cs\nendstream\n%%EOF";
        let parsed = parse_pdf(pdf_content.as_bytes()).unwrap();
        assert!(parsed.uses_device_rgb);
        assert!(!parsed.uses_device_cmyk);
    }

    // End-to-end: generate a document with this crate's builder API, then
    // round-trip the produced bytes through the parser and check colour
    // detection on real output.
    #[test]
    fn test_improved_color_detection() {
        use crate::{Color, Document, Font, Page};
        let mut doc = Document::new();
        doc.set_title("Color Detection Test");
        let mut page = Page::a4();
        page.text()
            .set_font(Font::Helvetica, 12.0)
            .at(50.0, 700.0)
            .write("RGB Color Test")
            .unwrap();
        page.graphics()
            .set_fill_color(Color::rgb(1.0, 0.0, 0.0)) .rectangle(50.0, 650.0, 100.0, 30.0)
            .fill();
        doc.add_page(page);
        let pdf_bytes = doc.to_bytes().unwrap();
        let parsed = parse_pdf(&pdf_bytes).unwrap();
        assert!(parsed.uses_device_rgb, "Should detect RGB color usage");
        assert!(!parsed.uses_device_cmyk, "Should not detect CMYK");
    }
}