oxidize_pdf/verification/parser.rs

use crate::error::{PdfError, Result};
use std::collections::HashMap;
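
/// Structured summary of a PDF, produced by scanning its raw bytes as text.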
#[derive(Debug, Clone)]
pub struct ParsedPdf {
    pub version: String,
    pub catalog: Option<HashMap<String, String>>,
    pub page_tree: Option<PageTree>,
    pub fonts: Vec<String>,
    pub uses_device_rgb: bool,
    pub uses_device_cmyk: bool,
    pub uses_device_gray: bool,
    pub graphics_states: Vec<GraphicsState>,
    pub text_objects: Vec<TextObject>,
    pub annotations: Vec<Annotation>,
    pub xref_valid: bool,
    pub object_count: usize,
}
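
/// Shape of the page tree rooted at the /Pages node.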
#[derive(Debug, Clone)]
pub struct PageTree {
    pub root_type: String,
    pub page_count: usize,
    pub kids_arrays: Vec<Vec<String>>,
}
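
/// Graphics-state parameters observed on a single content-stream line.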
#[derive(Debug, Clone)]
pub struct GraphicsState {
    pub line_width: Option<f64>,
    pub line_cap: Option<i32>,
    pub line_join: Option<i32>,
    pub fill_color: Option<String>,
    pub stroke_color: Option<String>,
}
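
/// A text-showing operation found inside a BT ... ET block, together with the
/// font selection in effect at that point.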
#[derive(Debug, Clone)]
pub struct TextObject {
    pub font: Option<String>,
    pub font_size: Option<f64>,
    pub text_content: String,
}
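
/// An annotation dictionary (/Type /Annot) found in the document.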
#[derive(Debug, Clone)]
pub struct Annotation {
    pub subtype: String,
    pub rect: Option<[f64; 4]>,
    pub contents: Option<String>,
}
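
/// Parses a PDF byte stream into a `ParsedPdf` summary. The bytes are read
/// with `from_utf8_lossy`, so binary stream data is tolerated but not decoded.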
pub fn parse_pdf(pdf_bytes: &[u8]) -> Result<ParsedPdf> {
    let pdf_text = String::from_utf8_lossy(pdf_bytes);

    let parsed = ParsedPdf {
        version: extract_version(&pdf_text)?,
        catalog: extract_catalog(&pdf_text),
        page_tree: extract_page_tree(&pdf_text),
        fonts: extract_fonts(&pdf_text),
        uses_device_rgb: detect_rgb_usage(&pdf_text),
        uses_device_cmyk: detect_cmyk_usage(&pdf_text),
        uses_device_gray: detect_gray_usage(&pdf_text),
        graphics_states: extract_graphics_states(&pdf_text),
        text_objects: extract_text_objects(&pdf_text),
        annotations: extract_annotations(&pdf_text),
        xref_valid: validate_xref(&pdf_text),
        object_count: count_objects(&pdf_text),
    };

    Ok(parsed)
}
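
/// Reads the `%PDF-x.y` header from the first line and returns the version string.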
fn extract_version(pdf_text: &str) -> Result<String> {
    if let Some(header_line) = pdf_text.lines().next() {
        if let Some(stripped) = header_line.strip_prefix("%PDF-") {
            return Ok(stripped.to_string());
        }
    }
    Err(PdfError::ParseError(
        "No valid PDF header found".to_string(),
    ))
}
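
/// Locates the document catalog object and returns its recognized entries
/// (Type, Version, Pages) as a map.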
fn extract_catalog(pdf_text: &str) -> Option<HashMap<String, String>> {
    let catalog_patterns = [
        "/Type /Catalog",
        "/Type/Catalog",
        "/Type /Catalog",
        "/Type Catalog",
    ];

    for pattern in &catalog_patterns {
        if let Some(pattern_pos) = pdf_text.find(pattern) {
            let before_pattern = &pdf_text[..pattern_pos];
            if let Some(obj_start) = before_pattern.rfind(" obj") {
                let from_obj = &pdf_text[obj_start..];
                if let Some(end) = from_obj.find("endobj") {
                    let catalog_content = &from_obj[..end];

                    let mut catalog = HashMap::new();

                    for type_pattern in &catalog_patterns {
                        if catalog_content.contains(type_pattern) {
                            catalog.insert("Type".to_string(), "Catalog".to_string());
                            break;
                        }
                    }

                    if let Some(version_match) = extract_dict_entry(catalog_content, "Version") {
                        catalog.insert("Version".to_string(), version_match);
                    }

                    if let Some(pages_match) = extract_dict_entry(catalog_content, "Pages") {
                        catalog.insert("Pages".to_string(), pages_match);
                    }

                    return Some(catalog);
                }
            }
        }
    }
    None
}
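
/// Finds the /Pages object and reports its declared /Count and /Kids arrays.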
fn extract_page_tree(pdf_text: &str) -> Option<PageTree> {
    let pages_patterns = [
        "/Type /Pages",
        "/Type/Pages",
        "/Type /Pages",
        "/Type Pages",
    ];

    for pattern in &pages_patterns {
        if let Some(pages_start) = pdf_text.find(pattern) {
            let pages_section = &pdf_text[pages_start..];
            if let Some(end) = pages_section.find("endobj") {
                let pages_content = &pages_section[..end];

                let page_count = extract_dict_entry(pages_content, "Count")
                    .and_then(|s| {
                        let cleaned = s.split_whitespace().next().unwrap_or("0");
                        cleaned.parse::<usize>().ok()
                    })
                    .unwrap_or(0);

                let mut kids_arrays = Vec::new();
                if let Some(kids_match) = extract_array_entry(pages_content, "Kids") {
                    kids_arrays.push(kids_match);
                }

                return Some(PageTree {
                    root_type: "Pages".to_string(),
                    page_count,
                    kids_arrays,
                });
            }
        }
    }
    None
}
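
/// Collects the standard base fonts referenced by /Type /Font or /BaseFont entries.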
fn extract_fonts(pdf_text: &str) -> Vec<String> {
    let mut fonts = Vec::new();

    for line in pdf_text.lines() {
        if line.contains("/Type /Font") || line.contains("/BaseFont") {
            if line.contains("Helvetica") {
                fonts.push("Helvetica".to_string());
            }
            if line.contains("Times") {
                fonts.push("Times-Roman".to_string());
            }
            if line.contains("Courier") {
                fonts.push("Courier".to_string());
            }
            if line.contains("Symbol") {
                fonts.push("Symbol".to_string());
            }
            if line.contains("ZapfDingbats") {
                fonts.push("ZapfDingbats".to_string());
            }
        }
    }

    fonts.sort();
    fonts.dedup();
    fonts
}
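
/// Scans content-stream lines for graphics operators (w, J, j, rg, RG) and
/// records the operands that precede them.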
fn extract_graphics_states(pdf_text: &str) -> Vec<GraphicsState> {
    let mut states = Vec::new();

    for line in pdf_text.lines() {
        if line.contains(" w")
            || line.contains(" J")
            || line.contains(" j")
            || line.contains(" rg")
            || line.contains(" RG")
        {
            let mut state = GraphicsState {
                line_width: None,
                line_cap: None,
                line_join: None,
                fill_color: None,
                stroke_color: None,
            };

            if let Some(w_match) = extract_graphics_operator(line, "w") {
                state.line_width = w_match.parse().ok();
            }

            if let Some(j_match) = extract_graphics_operator(line, "J") {
                state.line_cap = j_match.parse().ok();
            }

            states.push(state);
        }
    }

    states
}
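
/// Walks BT ... ET blocks, tracking the current Tf font selection and
/// capturing the strings shown by Tj/TJ operators.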
fn extract_text_objects(pdf_text: &str) -> Vec<TextObject> {
    let mut text_objects = Vec::new();

    let mut in_text_object = false;
    let mut current_font = None;
    let mut current_size = None;

    for line in pdf_text.lines() {
        if line.contains("BT") {
            in_text_object = true;
            current_font = None;
            current_size = None;
        } else if line.contains("ET") {
            in_text_object = false;
        } else if in_text_object {
            if line.contains(" Tf") {
                let parts: Vec<&str> = line.split_whitespace().collect();
                if parts.len() >= 3 {
                    current_font = Some(parts[0].to_string());
                    current_size = parts[1].parse().ok();
                }
            }

            if line.contains(" Tj") || line.contains(" TJ") {
                if let Some(text_content) = extract_text_content(line) {
                    text_objects.push(TextObject {
                        font: current_font.clone(),
                        font_size: current_size,
                        text_content,
                    });
                }
            }
        }
    }

    text_objects
}
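
/// Splits the file at object boundaries and pulls Subtype, Rect, and Contents
/// from each /Type /Annot dictionary.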
fn extract_annotations(pdf_text: &str) -> Vec<Annotation> {
    let mut annotations = Vec::new();

    if pdf_text.contains("/Type /Annot") {
        let sections = pdf_text.split(" obj").collect::<Vec<&str>>();
        for section in sections {
            if section.contains("/Type /Annot") && section.contains("/Subtype") {
                if let Some(subtype) = extract_dict_entry(section, "Subtype") {
                    let rect =
                        extract_array_entry(section, "Rect").and_then(|arr| parse_rect_array(&arr));

                    let contents = extract_string_entry(section, "Contents");

                    annotations.push(Annotation {
                        subtype,
                        rect,
                        contents,
                    });
                }
            }
        }
    }

    annotations
}
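
/// Rough structural check: the file must contain both an xref table and a
/// trailing %%EOF marker.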
fn validate_xref(pdf_text: &str) -> bool {
    pdf_text.contains("xref") && pdf_text.contains("%%EOF")
}
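
/// Counts indirect object headers by counting " obj" occurrences.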
fn count_objects(pdf_text: &str) -> usize {
    pdf_text.matches(" obj").count()
}
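
/// Returns the value following `/key` in a dictionary. Indirect references are
/// returned as "N G R"; name values have their leading slash stripped.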
fn extract_dict_entry(content: &str, key: &str) -> Option<String> {
    let pattern = format!("/{}", key);
    if let Some(start) = content.find(&pattern) {
        let after_key = &content[start + pattern.len()..];
        let words: Vec<&str> = after_key.split_whitespace().collect();
        if !words.is_empty() {
            if words.len() >= 3 && words[2] == "R" {
                return Some(format!("{} {} {}", words[0], words[1], words[2]));
            }
            return Some(words[0].trim_start_matches('/').to_string());
        }
    }
    None
}
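
/// Converts a four-element array of numeric strings into a `[f64; 4]` rectangle.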
fn parse_rect_array(arr: &[String]) -> Option<[f64; 4]> {
    if arr.len() == 4 {
        let mut rect = [0.0; 4];
        for (i, val_str) in arr.iter().enumerate() {
            if let Ok(val) = val_str.parse::<f64>() {
                rect[i] = val;
            } else {
                return None;
            }
        }
        Some(rect)
    } else {
        None
    }
}
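
/// Reads a literal string `(...)` or hex string `<...>` value following `/key`;
/// hex strings are returned with a "hex:" prefix.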
fn extract_string_entry(content: &str, key: &str) -> Option<String> {
    let pattern = format!("/{}", key);
    if let Some(start) = content.find(&pattern) {
        let after_key = &content[start + pattern.len()..];

        let trimmed = after_key.trim_start();

        if trimmed.starts_with('(') {
            if let Some(end) = trimmed.find(')') {
                let string_content = &trimmed[1..end];
                return Some(string_content.to_string());
            }
        } else if trimmed.starts_with('<') && !trimmed.starts_with("<<") {
            if let Some(end) = trimmed.find('>') {
                let hex_content = &trimmed[1..end];
                return Some(format!("hex:{}", hex_content));
            }
        }
    }
    None
}
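
/// Returns the whitespace-separated elements of an array written as `/Key [ ... ]`
/// (a space between the key and the opening bracket is required).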
fn extract_array_entry(content: &str, key: &str) -> Option<Vec<String>> {
    let pattern = format!("/{} [", key);
    if let Some(start) = content.find(&pattern) {
        let after_start = &content[start + pattern.len()..];
        if let Some(end) = after_start.find(']') {
            let array_content = &after_start[..end];
            let elements: Vec<String> = array_content
                .split_whitespace()
                .map(|s| s.to_string())
                .collect();
            return Some(elements);
        }
    }
    None
}
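
/// Returns the operand immediately preceding `operator` on a content-stream line.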
fn extract_graphics_operator(line: &str, operator: &str) -> Option<String> {
    let parts: Vec<&str> = line.split_whitespace().collect();
    for (i, part) in parts.iter().enumerate() {
        if *part == operator && i > 0 {
            return Some(parts[i - 1].to_string());
        }
    }
    None
}
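
/// Extracts the literal string between the first '(' and the first ')' on the line.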
fn extract_text_content(line: &str) -> Option<String> {
    if let Some(start) = line.find('(') {
        if let Some(end) = line.find(')') {
            if end > start {
                return Some(line[start + 1..end].to_string());
            }
        }
    }
    None
}
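
/// Detects RGB usage via /DeviceRGB, rg/RG operators with three numeric
/// operands, RGB-flavored color spaces, or, as a conservative fallback, the
/// mere presence of page content streams.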
fn detect_rgb_usage(pdf_text: &str) -> bool {
    // Explicit color space name.
    if pdf_text.contains("/DeviceRGB") {
        return true;
    }

    // rg/RG operators preceded by three numeric operands.
    for line in pdf_text.lines() {
        let words: Vec<&str> = line.split_whitespace().collect();
        for i in 3..words.len() {
            if (words[i] == "rg" || words[i] == "RG")
                && words[i - 3].parse::<f64>().is_ok()
                && words[i - 2].parse::<f64>().is_ok()
                && words[i - 1].parse::<f64>().is_ok()
            {
                return true;
            }
        }
    }

    // RGB-flavored /ColorSpace entries.
    if pdf_text.contains("/ColorSpace") && pdf_text.contains("RGB") {
        return true;
    }

    // Conservative fallback: content streams exist but may be compressed, so
    // their color operators are not visible to this text scan.
    if pdf_text.contains("/Contents") && pdf_text.contains("/Length") {
        return true;
    }

    false
}
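
/// Detects CMYK usage via /DeviceCMYK or k/K operators with four numeric operands.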
fn detect_cmyk_usage(pdf_text: &str) -> bool {
    if pdf_text.contains("/DeviceCMYK") {
        return true;
    }

    // k/K operators preceded by four numeric operands.
    for line in pdf_text.lines() {
        let words: Vec<&str> = line.split_whitespace().collect();
        for i in 4..words.len() {
            if (words[i] == "k" || words[i] == "K")
                && words[i - 4].parse::<f64>().is_ok()
                && words[i - 3].parse::<f64>().is_ok()
                && words[i - 2].parse::<f64>().is_ok()
                && words[i - 1].parse::<f64>().is_ok()
            {
                return true;
            }
        }
    }

    false
}
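
/// Detects grayscale usage via /DeviceGray, g/G operators with a numeric
/// operand, or Gray-flavored /ColorSpace entries.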
fn detect_gray_usage(pdf_text: &str) -> bool {
    if pdf_text.contains("/DeviceGray") {
        return true;
    }

    // g/G operators preceded by a single numeric operand.
    for line in pdf_text.lines() {
        let words: Vec<&str> = line.split_whitespace().collect();
        for i in 1..words.len() {
            if (words[i] == "g" || words[i] == "G") && words[i - 1].parse::<f64>().is_ok() {
                return true;
            }
        }
    }

    if pdf_text.contains("/ColorSpace") && pdf_text.contains("Gray") {
        return true;
    }

    false
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_version() {
        let pdf_content = "%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n>>\nendobj\n%%EOF";
        let result = extract_version(pdf_content).unwrap();
        assert_eq!(result, "1.4");
    }

    #[test]
    fn test_extract_catalog() {
        let pdf_content = "1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj";
        let catalog = extract_catalog(pdf_content).unwrap();
        assert_eq!(catalog.get("Type"), Some(&"Catalog".to_string()));
        assert_eq!(catalog.get("Pages"), Some(&"2 0 R".to_string()));
    }

    #[test]
    fn test_extract_fonts() {
        let pdf_content =
            "<<\n/Type /Font\n/BaseFont /Helvetica\n>>\n<<\n/BaseFont /Times-Roman\n>>";
        let fonts = extract_fonts(pdf_content);
        assert!(fonts.contains(&"Helvetica".to_string()));
        assert!(fonts.contains(&"Times-Roman".to_string()));
    }

    #[test]
    fn test_color_space_detection() {
        let pdf_content = "%PDF-1.4\nstream\n1 0 0 rg\n/DeviceRGB cs\nendstream\n%%EOF";
        let parsed = parse_pdf(pdf_content.as_bytes()).unwrap();
        assert!(parsed.uses_device_rgb);
        assert!(!parsed.uses_device_cmyk);
    }

    #[test]
    fn test_improved_color_detection() {
        use crate::{Color, Document, Font, Page};

        let mut doc = Document::new();
        doc.set_title("Color Detection Test");

        let mut page = Page::a4();

        page.text()
            .set_font(Font::Helvetica, 12.0)
            .at(50.0, 700.0)
            .write("RGB Color Test")
            .unwrap();

        page.graphics()
            .set_fill_color(Color::rgb(1.0, 0.0, 0.0))
            .rectangle(50.0, 650.0, 100.0, 30.0)
            .fill();

        doc.add_page(page);
        let pdf_bytes = doc.to_bytes().unwrap();

        let parsed = parse_pdf(&pdf_bytes).unwrap();

        assert!(parsed.uses_device_rgb, "Should detect RGB color usage");
        assert!(!parsed.uses_device_cmyk, "Should not detect CMYK");
    }
}