Skip to main content

pdf_ast/ast/
provenance.rs

1use crate::ast::NodeId;
2use crate::types::ObjectId;
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5use std::time::{SystemTime, UNIX_EPOCH};
6
/// Filter accepted by `ProvenanceTracker::query` to select nodes by their
/// recorded provenance.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProvenanceQuery {
    /// Nodes whose `revision_chain` is non-empty.
    Incremental,
    /// Nodes with at least one recorded recovery operation.
    Recovered,
    /// Nodes with a recovery operation of type `StructureReconstruction` or
    /// `ObjectRepair`.
    Inferred,
    /// Nodes whose `source.container_stream` equals the given object id.
    FromObjectStream(ObjectId),
    /// Nodes whose `revision_chain` contains an entry with this revision number.
    Revision(u32),
}
15
/// Complete provenance record for one parsed node.
///
/// `ProvenanceTracker` keeps one of these per `NodeId`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProvenanceInfo {
    /// Where the object was found in the source file.
    pub source: SourceInfo,

    /// How and when the object was parsed, including issues and recovery steps.
    pub parsing: ParsingInfo,

    /// Transformations (e.g. decryption, decompression) applied to the object.
    pub transformations: Vec<TransformationInfo>,

    /// OCG visibility state when parsed, if one was captured.
    pub visibility_state: Option<VisibilityState>,

    /// Cross-reference chain
    pub xref_chain: Vec<ProvenanceXRefEntry>,

    /// Revision chain from Prev entries
    pub revision_chain: Vec<RevisionInfo>,

    /// Validation results
    pub validation: ValidationMetadata,
}
39
/// Location of an object within the source file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceInfo {
    /// File offset where this object starts
    pub file_offset: u64,

    /// Size in bytes of the object
    pub object_size: u64,

    /// PDF version when this object was created.
    ///
    /// `ProvenanceTracker` copies the document-level `pdf_version` here, so
    /// this may be `Some("")` when the document version was never set.
    pub pdf_version: Option<String>,

    /// Linearization hint table reference
    pub linearized_hint: Option<u64>,

    /// Object stream container if compressed
    pub container_stream: Option<ObjectId>,

    /// Index within object stream
    pub stream_index: Option<u32>,
}
60
/// Metadata about the parse of a single object.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsingInfo {
    /// Timestamp when parsed; `ProvenanceTracker` stores seconds since the
    /// UNIX epoch.
    pub parse_timestamp: u64,

    /// Parser version used
    pub parser_version: String,

    /// Parse mode (strict, tolerant, etc.)
    pub parse_mode: String,

    /// Recovery operations applied
    pub recovery_operations: Vec<RecoveryOperation>,

    /// Parse warnings/issues
    pub parse_issues: Vec<ParseIssue>,

    /// Performance metrics
    pub performance_metrics: PerformanceMetrics,
}
81
/// One transformation step applied to an object's data, with its outcome.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TransformationInfo {
    /// Type of transformation
    pub transformation_type: TransformationType,

    /// Parameters used, as free-form key/value strings.
    pub parameters: HashMap<String, String>,

    /// Success/failure status
    pub status: TransformationStatus,

    /// Original size before transformation
    pub original_size: Option<u64>,

    /// Final size after transformation
    pub final_size: Option<u64>,

    /// Transformation timestamp; `ProvenanceTracker` stores seconds since the
    /// UNIX epoch.
    pub timestamp: u64,
}
102
/// Kind of transformation recorded in a `TransformationInfo`, together with
/// the kind-specific details.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TransformationType {
    /// Decryption applied
    Decryption {
        /// Name of the decryption algorithm.
        algorithm: String,
        /// Key length (presumably in bits — confirm against callers).
        key_length: u32,
        /// Crypt filter name, if one was in effect.
        crypt_filter: Option<String>,
    },

    /// Decompression applied
    Decompression {
        /// Name of the stream filter that was decoded.
        filter: String,
        /// Predictor parameter, if any.
        predictor: Option<u32>,
        /// Columns parameter, if any.
        columns: Option<u32>,
    },

    /// ASCII decoding
    AsciiDecoding { encoding: String },

    /// Image processing
    ImageProcessing {
        color_space: String,
        bits_per_component: u32,
        width: u32,
        height: u32,
    },

    /// Content stream processing
    ContentProcessing {
        /// Number of operators parsed from the content stream.
        operators_parsed: u32,
        /// Graphics state depth (presumably the maximum nesting reached — confirm).
        graphics_state_depth: u32,
    },
}
136
/// Outcome of a transformation step.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TransformationStatus {
    /// The transformation completed.
    Success,
    /// The transformation completed only in part; `reason` explains why.
    Partial { reason: String },
    /// The transformation failed; `error` carries the failure message.
    Failed { error: String },
    /// The transformation was not attempted; `reason` explains why.
    Skipped { reason: String },
}
144
/// Optional-content (OCG) visibility settings in effect when an object was parsed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VisibilityState {
    /// OCG states active during parsing
    pub active_ocgs: Vec<String>,

    /// OCG configuration used
    pub ocg_config: Option<String>,

    /// Print/view context
    pub context: VisibilityContext,

    /// Zoom level if applicable
    pub zoom_level: Option<f64>,

    /// Page rotation applied (presumably degrees — confirm).
    pub rotation: Option<u32>,
}
162
/// Usage context for which visibility is evaluated (see `VisibilityState::context`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum VisibilityContext {
    /// On-screen viewing.
    View,
    /// Printing.
    Print,
    /// Export.
    Export,
    /// Any other context, identified by a caller-supplied string.
    Custom(String),
}
170
/// One cross-reference entry kept in a node's `xref_chain`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProvenanceXRefEntry {
    /// Object ID
    pub object_id: ObjectId,

    /// File offset
    pub offset: u64,

    /// Entry type (free, in-use, compressed)
    pub entry_type: XRefEntryType,

    /// Generation number
    pub generation: u16,

    /// Next free object (for free entries)
    pub next_free: Option<u32>,
}
188
/// Kind of a cross-reference entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum XRefEntryType {
    /// Free entry.
    Free,
    /// In-use entry.
    InUse,
    /// Compressed entry; presumably `stream_id` is the containing object
    /// stream and `index` the position within it — confirm against the parser.
    Compressed { stream_id: u32, index: u32 },
}
195
/// Description of one document revision in a node's `revision_chain`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RevisionInfo {
    /// Revision number (0 is original)
    pub revision_number: u32,

    /// Byte offset of xref table for this revision
    pub xref_offset: u64,

    /// Size value for this revision's trailer.
    ///
    /// NOTE(review): this file does not show whether this is the trailer's
    /// byte size or the value of its `/Size` entry — confirm against the parser.
    pub trailer_size: u64,

    /// Previous revision offset
    pub prev_offset: Option<u64>,

    /// Objects changed in this revision
    pub changed_objects: Vec<ObjectId>,

    /// Incremental update timestamp (string format not specified here).
    pub update_timestamp: Option<String>,

    /// Digital signature covering this revision
    pub signature_coverage: Option<SignatureCoverage>,
}
219
/// Digital signature associated with a revision.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SignatureCoverage {
    /// Signature field name
    pub field_name: String,

    /// Byte range covered by signature (presumably offset/length pairs as in
    /// a PDF `/ByteRange` array — confirm).
    pub byte_range: Vec<u64>,

    /// Signature validation status
    pub is_valid: bool,

    /// Signer certificate info
    pub signer: Option<String>,
}
234
/// A recovery step the parser took to make sense of damaged input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecoveryOperation {
    /// Type of recovery performed
    pub operation_type: RecoveryType,

    /// Description of what was recovered
    pub description: String,

    /// Confidence level (0.0 to 1.0); the range is not enforced by this type.
    pub confidence: f64,

    /// Alternative interpretations considered
    pub alternatives: Vec<String>,
}
249
/// Category of a `RecoveryOperation`.
///
/// `ObjectRepair` and `StructureReconstruction` are the two categories that
/// `ProvenanceQuery::Inferred` matches on.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecoveryType {
    XRefRepair,
    TrailerReconstruction,
    ObjectRepair,
    StreamLengthFix,
    EncodingFallback,
    StructureReconstruction,
}
259
/// A warning or problem noticed while parsing an object.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParseIssue {
    /// Issue severity
    pub severity: IssueSeverity,

    /// Issue category
    pub category: IssueCategory,

    /// Description
    pub message: String,

    /// File location, when known.
    pub location: Option<FileLocation>,

    /// Suggested fix
    pub suggestion: Option<String>,
}
277
/// Severity attached to a `ParseIssue`.
///
/// The variants appear to be listed from least to most severe, but no
/// ordering trait is derived, so values cannot be compared with `<`/`>`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueSeverity {
    Info,
    Warning,
    Error,
    Critical,
}
285
/// Broad area a `ParseIssue` belongs to.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueCategory {
    Syntax,
    Structure,
    Security,
    Compatibility,
    Performance,
    Accessibility,
}
295
/// A span in the source file, optionally with a line/column position.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileLocation {
    /// Offset of the span in the file (presumably in bytes — confirm).
    pub offset: u64,
    /// Length of the span (presumably in bytes — confirm).
    pub length: u64,
    /// Line number, when available (0- or 1-based is not specified here).
    pub line: Option<u32>,
    /// Column number, when available (0- or 1-based is not specified here).
    pub column: Option<u32>,
}
303
/// Per-object performance figures stored in `ParsingInfo`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceMetrics {
    /// Time spent parsing this object (microseconds).
    ///
    /// NOTE(review): `ProvenanceTracker::record_object_parsed` fills this with
    /// the time elapsed since the tracker was created, not a per-object time.
    pub parse_time_us: u64,

    /// Memory allocated for this object.
    ///
    /// `ProvenanceTracker::record_object_parsed` stores the object's size here.
    pub memory_allocated: u64,

    /// Number of child objects processed
    pub children_processed: u32,

    /// Recursion depth reached
    pub max_recursion_depth: u32,

    /// Cache hit ratio if applicable
    pub cache_hit_ratio: Option<f64>,
}
321
/// Validation results attached to a node's provenance.
///
/// `ProvenanceTracker::record_object_parsed` starts every node with all
/// optional fields set to `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidationMetadata {
    /// PDF/A compliance level
    pub pdfa_compliance: Option<String>,

    /// PDF/X compliance level
    pub pdfx_compliance: Option<String>,

    /// Accessibility compliance score (scale not specified here).
    pub accessibility_score: Option<f64>,

    /// Security assessment
    pub security_assessment: SecurityAssessment,

    /// Quality metrics
    pub quality_metrics: QualityMetrics,
}
339
/// Security findings for a node.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SecurityAssessment {
    /// Risk level (Low, Medium, High, Critical)
    pub risk_level: RiskLevel,

    /// Threats detected
    pub threats: Vec<ThreatInfo>,

    /// Encryption strength if applicable
    pub encryption_strength: Option<EncryptionStrength>,

    /// Digital signature validity, one entry per validated signature.
    pub signature_validity: Vec<SignatureValidation>,
}
354
/// Overall risk rating used by `SecurityAssessment`.
///
/// `ProvenanceTracker::record_object_parsed` initialises new records to `Low`.
/// No ordering trait is derived, so values cannot be compared with `<`/`>`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RiskLevel {
    Low,
    Medium,
    High,
    Critical,
}
362
/// A single threat listed in a `SecurityAssessment`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThreatInfo {
    /// Free-form identifier of the kind of threat.
    pub threat_type: String,
    /// Human-readable description of the threat.
    pub description: String,
    /// Whether the threat has been mitigated.
    pub mitigated: bool,
    /// Confidence in the detection (presumably 0.0 to 1.0 — confirm).
    pub confidence: f64,
}
370
/// Assessment of the encryption protecting a document or object.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EncryptionStrength {
    /// Name of the encryption algorithm.
    pub algorithm: String,
    /// Key length (presumably in bits — confirm).
    pub key_length: u32,
    /// Whether the encryption is considered strong.
    pub is_strong: bool,
    /// Known weaknesses, as free-form strings.
    pub vulnerabilities: Vec<String>,
}
378
/// Validation result for one digital signature.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SignatureValidation {
    /// Name of the signature field.
    pub field_name: String,
    /// Whether the signature itself validated.
    pub is_valid: bool,
    /// Whether the signer's certificate chain validated.
    pub certificate_chain_valid: bool,
    /// Timestamp validity; `None` presumably means no timestamp was checked.
    pub timestamp_valid: Option<bool>,
    /// Revocation status; `None` presumably means no revocation check was made.
    pub revocation_status: Option<RevocationStatus>,
}
387
/// Result of a certificate revocation check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RevocationStatus {
    /// The certificate is not revoked.
    Valid,
    /// The certificate has been revoked.
    Revoked,
    /// The status could not be determined.
    Unknown,
    /// The check itself failed; the string carries the error message.
    Error(String),
}
395
/// Optional quality scores for a node; each is `None` until assessed.
///
/// Only `text_quality` documents its scale; the others presumably share the
/// same 0.0 to 1.0 range — confirm with the code that fills them in.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetrics {
    /// Text extraction quality (0.0 to 1.0)
    pub text_quality: Option<f64>,

    /// Font embedding completeness
    pub font_completeness: Option<f64>,

    /// Color consistency
    pub color_consistency: Option<f64>,

    /// Structure completeness for tagged PDFs
    pub structure_completeness: Option<f64>,

    /// Image quality assessment
    pub image_quality: Option<f64>,
}
413
/// Provenance tracker that collects metadata during parsing.
///
/// Holds one `ProvenanceInfo` per node plus document-wide metadata. All
/// fields are private; access goes through the methods on this type.
pub struct ProvenanceTracker {
    /// Provenance data per node
    node_provenance: HashMap<NodeId, ProvenanceInfo>,

    /// Current parsing context (active OCGs, decryption state, counters).
    current_context: ParsingContext,

    /// Global document metadata
    document_metadata: DocumentProvenance,
}
425
/// Mutable state describing where the parser currently is.
#[derive(Debug, Clone)]
pub struct ParsingContext {
    /// Current file offset; initialised to 0 by `ProvenanceTracker::new`.
    pub current_offset: u64,
    /// Current revision number; initialised to 0 by `ProvenanceTracker::new`.
    pub current_revision: u32,
    /// Active OCGs, replaced wholesale by `ProvenanceTracker::set_visibility_context`.
    pub active_ocgs: Vec<String>,
    /// Decryption state, if any; its crypt filter is copied into recorded
    /// decryption transformations.
    pub decryption_state: Option<DecryptionState>,
    /// Running counters and the start time of the parse.
    pub performance_tracker: PerformanceTracker,
}
434
/// Decryption settings in effect while parsing.
///
/// `Debug` is implemented by hand rather than derived so that the raw `key`
/// bytes never end up in logs or panic messages (this type is embedded in
/// `ParsingContext`, which is itself `Debug`). Only the key length is shown.
#[derive(Clone)]
pub struct DecryptionState {
    /// Name of the decryption algorithm in use.
    pub algorithm: String,
    /// Raw key material. Deliberately redacted from `Debug` output.
    pub key: Vec<u8>,
    /// Crypt filter name, if any.
    pub crypt_filter: Option<String>,
    /// Count of objects decrypted so far (not updated by the code visible in
    /// this file).
    pub objects_decrypted: u32,
}

impl std::fmt::Debug for DecryptionState {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DecryptionState")
            .field("algorithm", &self.algorithm)
            // Never print key bytes; the length is enough for diagnostics.
            .field(
                "key",
                &format_args!("<redacted; {} bytes>", self.key.len()),
            )
            .field("crypt_filter", &self.crypt_filter)
            .field("objects_decrypted", &self.objects_decrypted)
            .finish()
    }
}
442
/// Running totals for the whole parse, kept inside `ParsingContext`.
#[derive(Debug, Clone)]
pub struct PerformanceTracker {
    /// Wall-clock time at which `ProvenanceTracker::new` was called.
    pub start_time: SystemTime,
    /// Number of objects recorded via `ProvenanceTracker::record_object_parsed`.
    pub objects_parsed: u32,
    /// Sum of the sizes passed to `ProvenanceTracker::record_object_parsed`.
    pub bytes_processed: u64,
    /// Peak memory figure; initialised to 0 and not updated by the code
    /// visible in this file.
    pub memory_peak: u64,
}
450
/// Document-wide provenance metadata held by `ProvenanceTracker`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentProvenance {
    /// Size of the source file (presumably in bytes — confirm).
    pub file_size: u64,
    /// SHA-256 hash of the file; the string encoding is not specified here.
    pub file_hash_sha256: String,
    /// When parsing started, in seconds since the UNIX epoch.
    pub parse_start_time: u64,
    /// Total parse duration in milliseconds, set by `ProvenanceTracker::finalize`.
    pub parse_duration_ms: u64,
    /// Parser version; `ProvenanceTracker::new` uses `CARGO_PKG_VERSION`.
    pub parser_version: String,
    /// Number of revisions in the document.
    pub total_revisions: u32,
    /// Whether the document is linearized.
    pub linearized: bool,
    /// Whether the document is encrypted.
    pub encrypted: bool,
    /// Whether the document is digitally signed.
    pub signed: bool,
    /// PDF version string; empty until set by the caller.
    pub pdf_version: String,
}
464
465impl ProvenanceTracker {
466    pub fn new() -> Self {
467        let start_time = SystemTime::now();
468
469        Self {
470            node_provenance: HashMap::new(),
471            current_context: ParsingContext {
472                current_offset: 0,
473                current_revision: 0,
474                active_ocgs: Vec::new(),
475                decryption_state: None,
476                performance_tracker: PerformanceTracker {
477                    start_time,
478                    objects_parsed: 0,
479                    bytes_processed: 0,
480                    memory_peak: 0,
481                },
482            },
483            document_metadata: DocumentProvenance {
484                file_size: 0,
485                file_hash_sha256: String::new(),
486                parse_start_time: start_time.duration_since(UNIX_EPOCH).unwrap().as_secs(),
487                parse_duration_ms: 0,
488                parser_version: env!("CARGO_PKG_VERSION").to_string(),
489                total_revisions: 0,
490                linearized: false,
491                encrypted: false,
492                signed: false,
493                pdf_version: String::new(),
494            },
495        }
496    }
497
498    pub fn record_object_parsed(
499        &mut self,
500        node_id: NodeId,
501        _object_id: ObjectId,
502        offset: u64,
503        size: u64,
504    ) {
505        let parse_time = self
506            .current_context
507            .performance_tracker
508            .start_time
509            .elapsed()
510            .unwrap_or_default()
511            .as_micros() as u64;
512
513        let source_info = SourceInfo {
514            file_offset: offset,
515            object_size: size,
516            pdf_version: Some(self.document_metadata.pdf_version.clone()),
517            linearized_hint: None,
518            container_stream: None,
519            stream_index: None,
520        };
521
522        let parsing_info = ParsingInfo {
523            parse_timestamp: SystemTime::now()
524                .duration_since(UNIX_EPOCH)
525                .unwrap_or_default()
526                .as_secs(),
527            parser_version: self.document_metadata.parser_version.clone(),
528            parse_mode: "tolerant".to_string(),
529            recovery_operations: Vec::new(),
530            parse_issues: Vec::new(),
531            performance_metrics: PerformanceMetrics {
532                parse_time_us: parse_time,
533                memory_allocated: size,
534                children_processed: 0,
535                max_recursion_depth: 0,
536                cache_hit_ratio: None,
537            },
538        };
539
540        let provenance = ProvenanceInfo {
541            source: source_info,
542            parsing: parsing_info,
543            transformations: Vec::new(),
544            visibility_state: None,
545            xref_chain: Vec::new(),
546            revision_chain: Vec::new(),
547            validation: ValidationMetadata {
548                pdfa_compliance: None,
549                pdfx_compliance: None,
550                accessibility_score: None,
551                security_assessment: SecurityAssessment {
552                    risk_level: RiskLevel::Low,
553                    threats: Vec::new(),
554                    encryption_strength: None,
555                    signature_validity: Vec::new(),
556                },
557                quality_metrics: QualityMetrics {
558                    text_quality: None,
559                    font_completeness: None,
560                    color_consistency: None,
561                    structure_completeness: None,
562                    image_quality: None,
563                },
564            },
565        };
566
567        self.node_provenance.insert(node_id, provenance);
568        self.current_context.performance_tracker.objects_parsed += 1;
569        self.current_context.performance_tracker.bytes_processed += size;
570    }
571
572    pub fn record_transformation(&mut self, node_id: NodeId, transformation: TransformationInfo) {
573        if let Some(provenance) = self.node_provenance.get_mut(&node_id) {
574            provenance.transformations.push(transformation);
575        }
576    }
577
578    pub fn record_decryption(&mut self, node_id: NodeId, algorithm: String, key_length: u32) {
579        let transformation = TransformationInfo {
580            transformation_type: TransformationType::Decryption {
581                algorithm: algorithm.clone(),
582                key_length,
583                crypt_filter: self
584                    .current_context
585                    .decryption_state
586                    .as_ref()
587                    .and_then(|s| s.crypt_filter.clone()),
588            },
589            parameters: HashMap::new(),
590            status: TransformationStatus::Success,
591            original_size: None,
592            final_size: None,
593            timestamp: SystemTime::now()
594                .duration_since(UNIX_EPOCH)
595                .unwrap_or_default()
596                .as_secs(),
597        };
598
599        self.record_transformation(node_id, transformation);
600    }
601
602    pub fn record_decompression(
603        &mut self,
604        node_id: NodeId,
605        filter: String,
606        original_size: u64,
607        final_size: u64,
608    ) {
609        let transformation = TransformationInfo {
610            transformation_type: TransformationType::Decompression {
611                filter,
612                predictor: None,
613                columns: None,
614            },
615            parameters: HashMap::new(),
616            status: TransformationStatus::Success,
617            original_size: Some(original_size),
618            final_size: Some(final_size),
619            timestamp: SystemTime::now()
620                .duration_since(UNIX_EPOCH)
621                .unwrap_or_default()
622                .as_secs(),
623        };
624
625        self.record_transformation(node_id, transformation);
626    }
627
628    pub fn set_visibility_context(
629        &mut self,
630        active_ocgs: Vec<String>,
631        _context: VisibilityContext,
632    ) {
633        self.current_context.active_ocgs = active_ocgs;
634    }
635
636    pub fn record_parse_issue(&mut self, node_id: NodeId, issue: ParseIssue) {
637        if let Some(provenance) = self.node_provenance.get_mut(&node_id) {
638            provenance.parsing.parse_issues.push(issue);
639        }
640    }
641
642    pub fn record_recovery_operation(&mut self, node_id: NodeId, operation: RecoveryOperation) {
643        if let Some(provenance) = self.node_provenance.get_mut(&node_id) {
644            provenance.parsing.recovery_operations.push(operation);
645        }
646    }
647
648    pub fn get_provenance(&self, node_id: NodeId) -> Option<&ProvenanceInfo> {
649        self.node_provenance.get(&node_id)
650    }
651
652    pub fn get_all_provenance(&self) -> &HashMap<NodeId, ProvenanceInfo> {
653        &self.node_provenance
654    }
655
656    pub fn query(&self, query: ProvenanceQuery) -> Vec<NodeId> {
657        self.node_provenance
658            .iter()
659            .filter_map(|(node_id, provenance)| {
660                if provenance_matches_query(provenance, query) {
661                    Some(*node_id)
662                } else {
663                    None
664                }
665            })
666            .collect()
667    }
668
669    pub fn get_document_metadata(&self) -> &DocumentProvenance {
670        &self.document_metadata
671    }
672
673    pub fn set_document_metadata(&mut self, metadata: DocumentProvenance) {
674        self.document_metadata = metadata;
675    }
676
677    pub fn finalize(&mut self) {
678        let duration = self
679            .current_context
680            .performance_tracker
681            .start_time
682            .elapsed()
683            .unwrap_or_default();
684        self.document_metadata.parse_duration_ms = duration.as_millis() as u64;
685    }
686}
687
688fn provenance_matches_query(provenance: &ProvenanceInfo, query: ProvenanceQuery) -> bool {
689    match query {
690        ProvenanceQuery::Incremental => !provenance.revision_chain.is_empty(),
691        ProvenanceQuery::Recovered => !provenance.parsing.recovery_operations.is_empty(),
692        ProvenanceQuery::Inferred => provenance.parsing.recovery_operations.iter().any(|op| {
693            matches!(
694                op.operation_type,
695                RecoveryType::StructureReconstruction | RecoveryType::ObjectRepair
696            )
697        }),
698        ProvenanceQuery::FromObjectStream(object_id) => {
699            provenance.source.container_stream == Some(object_id)
700        }
701        ProvenanceQuery::Revision(revision) => provenance
702            .revision_chain
703            .iter()
704            .any(|entry| entry.revision_number == revision),
705    }
706}
707
708impl Default for ProvenanceTracker {
709    fn default() -> Self {
710        Self::new()
711    }
712}