1use crate::ast::NodeId;
2use crate::types::ObjectId;
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5use std::time::{SystemTime, UNIX_EPOCH};
6
/// Filter passed to `ProvenanceTracker::query` to select nodes by their
/// recorded provenance.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProvenanceQuery {
    /// Nodes whose `revision_chain` is non-empty.
    Incremental,
    /// Nodes with at least one recorded recovery operation.
    Recovered,
    /// Nodes with a recovery operation of type `StructureReconstruction`
    /// or `ObjectRepair`.
    Inferred,
    /// Nodes whose `source.container_stream` equals the given object id.
    FromObjectStream(ObjectId),
    /// Nodes whose `revision_chain` contains an entry with this
    /// `revision_number`.
    Revision(u32),
}
15
/// Everything recorded about where a single node came from and how it was
/// processed.
///
/// `ProvenanceTracker::record_object_parsed` creates one of these per node,
/// with every list empty and `visibility_state` set to `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProvenanceInfo {
    /// Location and size of the object in the input file.
    pub source: SourceInfo,

    /// Parse-time metadata, including issues and recovery operations.
    pub parsing: ParsingInfo,

    /// Transformations applied to the node, in the order they were recorded.
    pub transformations: Vec<TransformationInfo>,

    /// Visibility state associated with the node.
    /// No code visible in this file assigns a value other than `None`.
    pub visibility_state: Option<VisibilityState>,

    /// Cross-reference entries associated with the node.
    pub xref_chain: Vec<ProvenanceXRefEntry>,

    /// Revisions associated with the node; consulted by the `Incremental`
    /// and `Revision` queries.
    pub revision_chain: Vec<RevisionInfo>,

    /// Compliance, security and quality metadata.
    pub validation: ValidationMetadata,
}
39
/// Where an object was found in the input file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceInfo {
    /// Offset of the object; filled from the `offset` argument of
    /// `ProvenanceTracker::record_object_parsed`.
    pub file_offset: u64,

    /// Size of the object; filled from the `size` argument of
    /// `ProvenanceTracker::record_object_parsed`.
    pub object_size: u64,

    /// PDF version string; `record_object_parsed` copies it from the
    /// document metadata.
    pub pdf_version: Option<String>,

    /// Linearization hint value, if any.
    /// NOTE(review): meaning of the number is not established here.
    pub linearized_hint: Option<u64>,

    /// Id of the containing stream, if any; compared by the
    /// `FromObjectStream` query.
    pub container_stream: Option<ObjectId>,

    /// Index within the containing stream, if any.
    pub stream_index: Option<u32>,
}
60
/// Metadata captured while parsing a single object.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsingInfo {
    /// Whole seconds since the UNIX epoch at the time the object was recorded.
    pub parse_timestamp: u64,

    /// Parser version string, copied from the document metadata.
    pub parser_version: String,

    /// Parse mode label; `record_object_parsed` always stores `"tolerant"`.
    pub parse_mode: String,

    /// Recovery operations appended via
    /// `ProvenanceTracker::record_recovery_operation`.
    pub recovery_operations: Vec<RecoveryOperation>,

    /// Issues appended via `ProvenanceTracker::record_parse_issue`.
    pub parse_issues: Vec<ParseIssue>,

    /// Timing and resource counters for this object.
    pub performance_metrics: PerformanceMetrics,
}
81
/// One transformation step applied to an object's data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TransformationInfo {
    /// Which kind of transformation was performed, with its settings.
    pub transformation_type: TransformationType,

    /// Free-form key/value parameters; the recording helpers in this file
    /// leave the map empty.
    pub parameters: HashMap<String, String>,

    /// Outcome of the transformation.
    pub status: TransformationStatus,

    /// Size before the transformation, when known.
    pub original_size: Option<u64>,

    /// Size after the transformation, when known.
    pub final_size: Option<u64>,

    /// Whole seconds since the UNIX epoch when the step was recorded.
    pub timestamp: u64,
}
102
/// Kinds of transformation that can be recorded, each with its own settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TransformationType {
    /// Data was decrypted.
    Decryption {
        /// Name of the algorithm used.
        algorithm: String,
        /// Key length.
        /// NOTE(review): unit (bits vs bytes) is not established here.
        key_length: u32,
        /// Name of the crypt filter, if one was active.
        crypt_filter: Option<String>,
    },

    /// Data was decompressed.
    Decompression {
        /// Name of the filter used.
        filter: String,
        /// Predictor setting, if any.
        predictor: Option<u32>,
        /// Columns setting, if any.
        columns: Option<u32>,
    },

    /// An ASCII encoding was decoded; `encoding` names it.
    AsciiDecoding { encoding: String },

    /// Image data was processed.
    ImageProcessing {
        /// Name of the color space.
        color_space: String,
        /// Bits per color component.
        bits_per_component: u32,
        /// Image width.
        width: u32,
        /// Image height.
        height: u32,
    },

    /// A content stream was processed.
    ContentProcessing {
        /// Number of operators parsed.
        operators_parsed: u32,
        /// Graphics state depth.
        /// NOTE(review): whether this is the maximum or final depth is not
        /// established here.
        graphics_state_depth: u32,
    },
}
136
/// Outcome of a recorded transformation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TransformationStatus {
    /// The transformation completed; the recording helpers in this file
    /// always use this variant.
    Success,
    /// The transformation completed only in part; `reason` explains why.
    Partial { reason: String },
    /// The transformation failed; `error` carries the message.
    Failed { error: String },
    /// The transformation was not attempted; `reason` explains why.
    Skipped { reason: String },
}
144
/// Visibility settings under which a node was evaluated.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VisibilityState {
    /// Identifiers of the active optional content groups.
    pub active_ocgs: Vec<String>,

    /// Name of the optional content configuration, if any.
    pub ocg_config: Option<String>,

    /// Usage context (view, print, export or custom).
    pub context: VisibilityContext,

    /// Zoom level, if relevant.
    /// NOTE(review): scale (factor vs percent) is not established here.
    pub zoom_level: Option<f64>,

    /// Rotation, if relevant.
    /// NOTE(review): presumably degrees — confirm.
    pub rotation: Option<u32>,
}
162
/// Usage context for which visibility is evaluated.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum VisibilityContext {
    /// On-screen viewing.
    View,
    /// Printing.
    Print,
    /// Export.
    Export,
    /// Any other context, identified by a caller-supplied label.
    Custom(String),
}
170
/// A cross-reference table entry kept as part of a node's provenance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProvenanceXRefEntry {
    /// Object the entry refers to.
    pub object_id: ObjectId,

    /// Offset stored in the entry.
    pub offset: u64,

    /// Whether the entry is free, in use, or compressed.
    pub entry_type: XRefEntryType,

    /// Generation number stored in the entry.
    pub generation: u16,

    /// Next free object number, if any.
    /// NOTE(review): presumably only meaningful for `Free` entries — confirm.
    pub next_free: Option<u32>,
}
188
/// Kind of cross-reference entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum XRefEntryType {
    /// The entry is marked free.
    Free,
    /// The entry is marked in use.
    InUse,
    /// The object is stored inside another stream: `stream_id` identifies
    /// that stream and `index` the position within it.
    Compressed { stream_id: u32, index: u32 },
}
195
/// Description of one document revision in a node's history.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RevisionInfo {
    /// Revision number; matched by `ProvenanceQuery::Revision`.
    pub revision_number: u32,

    /// Offset of this revision's cross-reference data.
    pub xref_offset: u64,

    /// Trailer size value for this revision.
    /// NOTE(review): could be a byte length or the trailer's entry count —
    /// confirm against the producer.
    pub trailer_size: u64,

    /// Offset of the previous revision's cross-reference data, if any.
    pub prev_offset: Option<u64>,

    /// Objects changed in this revision.
    pub changed_objects: Vec<ObjectId>,

    /// Timestamp of the update, if known.
    /// NOTE(review): string format is not established here.
    pub update_timestamp: Option<String>,

    /// Signature covering this revision, if any.
    pub signature_coverage: Option<SignatureCoverage>,
}
219
/// Summary of a signature that covers part of the file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SignatureCoverage {
    /// Name of the signature field.
    pub field_name: String,

    /// Byte range values for the signature.
    /// NOTE(review): presumably offset/length pairs — confirm.
    pub byte_range: Vec<u64>,

    /// Whether the signature was judged valid.
    pub is_valid: bool,

    /// Signer identity, if known.
    pub signer: Option<String>,
}
234
/// A repair the parser performed to keep going on damaged input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryOperation {
    /// Kind of repair; inspected by `ProvenanceQuery::Inferred`.
    pub operation_type: RecoveryType,

    /// Human-readable description of what was done.
    pub description: String,

    /// Confidence in the repair.
    /// NOTE(review): value range is not established here (presumably 0..=1).
    pub confidence: f64,

    /// Descriptions of alternative repairs.
    pub alternatives: Vec<String>,
}
249
/// Kinds of recovery operation.
///
/// `ObjectRepair` and `StructureReconstruction` are the two variants that make
/// a node match `ProvenanceQuery::Inferred`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecoveryType {
    /// The cross-reference data was repaired.
    XRefRepair,
    /// The trailer was reconstructed.
    TrailerReconstruction,
    /// An object was repaired.
    ObjectRepair,
    /// A stream length was corrected.
    StreamLengthFix,
    /// A fallback encoding was used.
    EncodingFallback,
    /// Document structure was reconstructed.
    StructureReconstruction,
}
259
/// A problem noticed while parsing, attached to a node via
/// `ProvenanceTracker::record_parse_issue`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParseIssue {
    /// How serious the issue is.
    pub severity: IssueSeverity,

    /// Area the issue belongs to.
    pub category: IssueCategory,

    /// Human-readable description.
    pub message: String,

    /// Where in the file the issue was found, if known.
    pub location: Option<FileLocation>,

    /// Suggested remedy, if any.
    pub suggestion: Option<String>,
}
277
/// Severity of a `ParseIssue`.
///
/// No ordering traits are derived, so severities cannot be compared with
/// `<`/`>`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueSeverity {
    /// Informational.
    Info,
    /// Warning.
    Warning,
    /// Error.
    Error,
    /// Critical.
    Critical,
}
285
/// Area a `ParseIssue` relates to.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueCategory {
    /// Syntax.
    Syntax,
    /// Document structure.
    Structure,
    /// Security.
    Security,
    /// Compatibility.
    Compatibility,
    /// Performance.
    Performance,
    /// Accessibility.
    Accessibility,
}
295
/// A span in the input file, optionally with line/column information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileLocation {
    /// Start offset of the span.
    pub offset: u64,
    /// Length of the span.
    pub length: u64,
    /// Line number, if known.
    /// NOTE(review): 0- vs 1-based is not established here.
    pub line: Option<u32>,
    /// Column number, if known.
    /// NOTE(review): 0- vs 1-based is not established here.
    pub column: Option<u32>,
}
303
/// Timing and resource counters attached to a parsed object.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceMetrics {
    /// Microseconds. `record_object_parsed` stores the time elapsed since the
    /// tracker's `start_time`, not a per-object duration.
    pub parse_time_us: u64,

    /// Memory attributed to the object. `record_object_parsed` stores the
    /// object's `size` argument here.
    pub memory_allocated: u64,

    /// Number of children processed; `record_object_parsed` stores 0.
    pub children_processed: u32,

    /// Maximum recursion depth; `record_object_parsed` stores 0.
    pub max_recursion_depth: u32,

    /// Cache hit ratio, if measured.
    /// NOTE(review): value range is not established here.
    pub cache_hit_ratio: Option<f64>,
}
321
/// Compliance, security and quality findings for a node.
///
/// `record_object_parsed` creates this with every optional value `None`, a
/// `Low` risk level and empty lists.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidationMetadata {
    /// PDF/A compliance label, if determined.
    pub pdfa_compliance: Option<String>,

    /// PDF/X compliance label, if determined.
    pub pdfx_compliance: Option<String>,

    /// Accessibility score, if computed.
    /// NOTE(review): scale is not established here.
    pub accessibility_score: Option<f64>,

    /// Security findings.
    pub security_assessment: SecurityAssessment,

    /// Quality scores.
    pub quality_metrics: QualityMetrics,
}
339
/// Security findings for a node.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SecurityAssessment {
    /// Overall risk level; `record_object_parsed` starts at `Low`.
    pub risk_level: RiskLevel,

    /// Individual threats identified.
    pub threats: Vec<ThreatInfo>,

    /// Assessment of the encryption in use, if any.
    pub encryption_strength: Option<EncryptionStrength>,

    /// Validation results, one per signature examined.
    pub signature_validity: Vec<SignatureValidation>,
}
354
/// Overall risk rating used by `SecurityAssessment`.
///
/// No ordering traits are derived, so levels cannot be compared with `<`/`>`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RiskLevel {
    /// Low risk; the initial value assigned by `record_object_parsed`.
    Low,
    /// Medium risk.
    Medium,
    /// High risk.
    High,
    /// Critical risk.
    Critical,
}
362
/// A single identified threat.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThreatInfo {
    /// Free-form label for the kind of threat.
    pub threat_type: String,
    /// Human-readable description.
    pub description: String,
    /// Whether the threat has been mitigated.
    pub mitigated: bool,
    /// Confidence in the finding.
    /// NOTE(review): value range is not established here (presumably 0..=1).
    pub confidence: f64,
}
370
/// Assessment of the encryption protecting the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EncryptionStrength {
    /// Name of the algorithm.
    pub algorithm: String,
    /// Key length.
    /// NOTE(review): unit (bits vs bytes) is not established here.
    pub key_length: u32,
    /// Whether the encryption is considered strong.
    pub is_strong: bool,
    /// Descriptions of known weaknesses.
    pub vulnerabilities: Vec<String>,
}
378
/// Validation result for one signature field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SignatureValidation {
    /// Name of the signature field.
    pub field_name: String,
    /// Whether the signature itself validated.
    pub is_valid: bool,
    /// Whether the certificate chain validated.
    pub certificate_chain_valid: bool,
    /// Whether the timestamp validated; `None` when not evaluated.
    pub timestamp_valid: Option<bool>,
    /// Revocation check outcome; `None` when not evaluated.
    pub revocation_status: Option<RevocationStatus>,
}
387
/// Outcome of a certificate revocation check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RevocationStatus {
    /// The certificate is not revoked.
    Valid,
    /// The certificate is revoked.
    Revoked,
    /// Revocation state could not be determined.
    Unknown,
    /// The check itself failed; the payload carries the message.
    Error(String),
}
395
/// Optional quality scores for a node.
///
/// `record_object_parsed` creates this with every score `None`.
/// NOTE(review): the scale of the scores is not established in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetrics {
    /// Text quality score, if computed.
    pub text_quality: Option<f64>,

    /// Font completeness score, if computed.
    pub font_completeness: Option<f64>,

    /// Color consistency score, if computed.
    pub color_consistency: Option<f64>,

    /// Structure completeness score, if computed.
    pub structure_completeness: Option<f64>,

    /// Image quality score, if computed.
    pub image_quality: Option<f64>,
}
413
/// Collects per-node provenance while a document is parsed, together with
/// document-level metadata.
pub struct ProvenanceTracker {
    /// Provenance recorded so far, keyed by node.
    node_provenance: HashMap<NodeId, ProvenanceInfo>,

    /// Mutable state describing the parse in progress.
    current_context: ParsingContext,

    /// Document-level metadata; the parse duration is filled in by `finalize`.
    document_metadata: DocumentProvenance,
}
425
/// Mutable state of the parse in progress, owned by `ProvenanceTracker`.
#[derive(Debug, Clone)]
pub struct ParsingContext {
    /// Current offset; starts at 0.
    pub current_offset: u64,
    /// Current revision number; starts at 0.
    pub current_revision: u32,
    /// Active optional content groups; replaced wholesale by
    /// `ProvenanceTracker::set_visibility_context`.
    pub active_ocgs: Vec<String>,
    /// Decryption parameters in effect, if any; `record_decryption` reads the
    /// crypt filter name from here.
    pub decryption_state: Option<DecryptionState>,
    /// Running counters and the clock start for the parse.
    pub performance_tracker: PerformanceTracker,
}
434
/// Decryption parameters in effect while parsing.
///
/// `Debug` is implemented by hand rather than derived so that the raw `key`
/// bytes never reach logs, panic messages, or the `Debug` output of any type
/// that embeds this struct; only the key length is shown.
#[derive(Clone)]
pub struct DecryptionState {
    /// Name of the encryption algorithm.
    pub algorithm: String,
    /// Raw key material. Deliberately omitted from `Debug` output.
    pub key: Vec<u8>,
    /// Name of the crypt filter, if any.
    pub crypt_filter: Option<String>,
    /// Number of objects decrypted so far.
    pub objects_decrypted: u32,
}

impl std::fmt::Debug for DecryptionState {
    /// Formats every field as the derived implementation would, except `key`,
    /// which is replaced by a placeholder that reveals only its length.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DecryptionState")
            .field("algorithm", &self.algorithm)
            .field(
                "key",
                &format_args!("<redacted; {} bytes>", self.key.len()),
            )
            .field("crypt_filter", &self.crypt_filter)
            .field("objects_decrypted", &self.objects_decrypted)
            .finish()
    }
}
442
/// Running counters for the parse in progress.
#[derive(Debug, Clone)]
pub struct PerformanceTracker {
    /// Wall-clock time captured when the tracker was created; elapsed times
    /// are measured from here.
    pub start_time: SystemTime,
    /// Incremented once per `record_object_parsed` call.
    pub objects_parsed: u32,
    /// Sum of the `size` arguments passed to `record_object_parsed`.
    pub bytes_processed: u64,
    /// Peak memory figure; starts at 0 and is not updated by any code visible
    /// in this file.
    pub memory_peak: u64,
}
450
/// Document-level provenance metadata.
///
/// `ProvenanceTracker::new` fills in `parse_start_time` and `parser_version`
/// and leaves everything else zeroed, empty or `false`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentProvenance {
    /// Size of the input file.
    pub file_size: u64,
    /// SHA-256 hash of the input file.
    /// NOTE(review): presumably hex-encoded — confirm against the producer.
    pub file_hash_sha256: String,
    /// Whole seconds since the UNIX epoch when the tracker was created.
    pub parse_start_time: u64,
    /// Milliseconds elapsed since the tracker's start; written by
    /// `ProvenanceTracker::finalize`.
    pub parse_duration_ms: u64,
    /// Parser version; `new` uses the crate's `CARGO_PKG_VERSION`.
    pub parser_version: String,
    /// Number of revisions in the document.
    pub total_revisions: u32,
    /// Whether the document is linearized.
    pub linearized: bool,
    /// Whether the document is encrypted.
    pub encrypted: bool,
    /// Whether the document is signed.
    pub signed: bool,
    /// PDF version string; copied into each node's `SourceInfo`.
    pub pdf_version: String,
}
464
465impl ProvenanceTracker {
466 pub fn new() -> Self {
467 let start_time = SystemTime::now();
468
469 Self {
470 node_provenance: HashMap::new(),
471 current_context: ParsingContext {
472 current_offset: 0,
473 current_revision: 0,
474 active_ocgs: Vec::new(),
475 decryption_state: None,
476 performance_tracker: PerformanceTracker {
477 start_time,
478 objects_parsed: 0,
479 bytes_processed: 0,
480 memory_peak: 0,
481 },
482 },
483 document_metadata: DocumentProvenance {
484 file_size: 0,
485 file_hash_sha256: String::new(),
486 parse_start_time: start_time.duration_since(UNIX_EPOCH).unwrap().as_secs(),
487 parse_duration_ms: 0,
488 parser_version: env!("CARGO_PKG_VERSION").to_string(),
489 total_revisions: 0,
490 linearized: false,
491 encrypted: false,
492 signed: false,
493 pdf_version: String::new(),
494 },
495 }
496 }
497
498 pub fn record_object_parsed(
499 &mut self,
500 node_id: NodeId,
501 _object_id: ObjectId,
502 offset: u64,
503 size: u64,
504 ) {
505 let parse_time = self
506 .current_context
507 .performance_tracker
508 .start_time
509 .elapsed()
510 .unwrap_or_default()
511 .as_micros() as u64;
512
513 let source_info = SourceInfo {
514 file_offset: offset,
515 object_size: size,
516 pdf_version: Some(self.document_metadata.pdf_version.clone()),
517 linearized_hint: None,
518 container_stream: None,
519 stream_index: None,
520 };
521
522 let parsing_info = ParsingInfo {
523 parse_timestamp: SystemTime::now()
524 .duration_since(UNIX_EPOCH)
525 .unwrap_or_default()
526 .as_secs(),
527 parser_version: self.document_metadata.parser_version.clone(),
528 parse_mode: "tolerant".to_string(),
529 recovery_operations: Vec::new(),
530 parse_issues: Vec::new(),
531 performance_metrics: PerformanceMetrics {
532 parse_time_us: parse_time,
533 memory_allocated: size,
534 children_processed: 0,
535 max_recursion_depth: 0,
536 cache_hit_ratio: None,
537 },
538 };
539
540 let provenance = ProvenanceInfo {
541 source: source_info,
542 parsing: parsing_info,
543 transformations: Vec::new(),
544 visibility_state: None,
545 xref_chain: Vec::new(),
546 revision_chain: Vec::new(),
547 validation: ValidationMetadata {
548 pdfa_compliance: None,
549 pdfx_compliance: None,
550 accessibility_score: None,
551 security_assessment: SecurityAssessment {
552 risk_level: RiskLevel::Low,
553 threats: Vec::new(),
554 encryption_strength: None,
555 signature_validity: Vec::new(),
556 },
557 quality_metrics: QualityMetrics {
558 text_quality: None,
559 font_completeness: None,
560 color_consistency: None,
561 structure_completeness: None,
562 image_quality: None,
563 },
564 },
565 };
566
567 self.node_provenance.insert(node_id, provenance);
568 self.current_context.performance_tracker.objects_parsed += 1;
569 self.current_context.performance_tracker.bytes_processed += size;
570 }
571
572 pub fn record_transformation(&mut self, node_id: NodeId, transformation: TransformationInfo) {
573 if let Some(provenance) = self.node_provenance.get_mut(&node_id) {
574 provenance.transformations.push(transformation);
575 }
576 }
577
578 pub fn record_decryption(&mut self, node_id: NodeId, algorithm: String, key_length: u32) {
579 let transformation = TransformationInfo {
580 transformation_type: TransformationType::Decryption {
581 algorithm: algorithm.clone(),
582 key_length,
583 crypt_filter: self
584 .current_context
585 .decryption_state
586 .as_ref()
587 .and_then(|s| s.crypt_filter.clone()),
588 },
589 parameters: HashMap::new(),
590 status: TransformationStatus::Success,
591 original_size: None,
592 final_size: None,
593 timestamp: SystemTime::now()
594 .duration_since(UNIX_EPOCH)
595 .unwrap_or_default()
596 .as_secs(),
597 };
598
599 self.record_transformation(node_id, transformation);
600 }
601
602 pub fn record_decompression(
603 &mut self,
604 node_id: NodeId,
605 filter: String,
606 original_size: u64,
607 final_size: u64,
608 ) {
609 let transformation = TransformationInfo {
610 transformation_type: TransformationType::Decompression {
611 filter,
612 predictor: None,
613 columns: None,
614 },
615 parameters: HashMap::new(),
616 status: TransformationStatus::Success,
617 original_size: Some(original_size),
618 final_size: Some(final_size),
619 timestamp: SystemTime::now()
620 .duration_since(UNIX_EPOCH)
621 .unwrap_or_default()
622 .as_secs(),
623 };
624
625 self.record_transformation(node_id, transformation);
626 }
627
628 pub fn set_visibility_context(
629 &mut self,
630 active_ocgs: Vec<String>,
631 _context: VisibilityContext,
632 ) {
633 self.current_context.active_ocgs = active_ocgs;
634 }
635
636 pub fn record_parse_issue(&mut self, node_id: NodeId, issue: ParseIssue) {
637 if let Some(provenance) = self.node_provenance.get_mut(&node_id) {
638 provenance.parsing.parse_issues.push(issue);
639 }
640 }
641
642 pub fn record_recovery_operation(&mut self, node_id: NodeId, operation: RecoveryOperation) {
643 if let Some(provenance) = self.node_provenance.get_mut(&node_id) {
644 provenance.parsing.recovery_operations.push(operation);
645 }
646 }
647
648 pub fn get_provenance(&self, node_id: NodeId) -> Option<&ProvenanceInfo> {
649 self.node_provenance.get(&node_id)
650 }
651
652 pub fn get_all_provenance(&self) -> &HashMap<NodeId, ProvenanceInfo> {
653 &self.node_provenance
654 }
655
656 pub fn query(&self, query: ProvenanceQuery) -> Vec<NodeId> {
657 self.node_provenance
658 .iter()
659 .filter_map(|(node_id, provenance)| {
660 if provenance_matches_query(provenance, query) {
661 Some(*node_id)
662 } else {
663 None
664 }
665 })
666 .collect()
667 }
668
669 pub fn get_document_metadata(&self) -> &DocumentProvenance {
670 &self.document_metadata
671 }
672
673 pub fn set_document_metadata(&mut self, metadata: DocumentProvenance) {
674 self.document_metadata = metadata;
675 }
676
677 pub fn finalize(&mut self) {
678 let duration = self
679 .current_context
680 .performance_tracker
681 .start_time
682 .elapsed()
683 .unwrap_or_default();
684 self.document_metadata.parse_duration_ms = duration.as_millis() as u64;
685 }
686}
687
688fn provenance_matches_query(provenance: &ProvenanceInfo, query: ProvenanceQuery) -> bool {
689 match query {
690 ProvenanceQuery::Incremental => !provenance.revision_chain.is_empty(),
691 ProvenanceQuery::Recovered => !provenance.parsing.recovery_operations.is_empty(),
692 ProvenanceQuery::Inferred => provenance.parsing.recovery_operations.iter().any(|op| {
693 matches!(
694 op.operation_type,
695 RecoveryType::StructureReconstruction | RecoveryType::ObjectRepair
696 )
697 }),
698 ProvenanceQuery::FromObjectStream(object_id) => {
699 provenance.source.container_stream == Some(object_id)
700 }
701 ProvenanceQuery::Revision(revision) => provenance
702 .revision_chain
703 .iter()
704 .any(|entry| entry.revision_number == revision),
705 }
706}
707
708impl Default for ProvenanceTracker {
709 fn default() -> Self {
710 Self::new()
711 }
712}