1use std::{
4 collections::{BTreeMap, BTreeSet, HashMap, HashSet},
5 io::{Read, Seek, SeekFrom},
6 num::NonZeroU64,
7 path::{Path, PathBuf},
8 sync::Arc,
9 time::Instant,
10};
11
12use crate::{
13 Assertion, BoundedText, BuiltinProfileRepository, ENGINE_VERSION, ErrorArgument, FeatureObject,
14 FeatureReport, FeatureValue, Identifier, IndirectObject, InputKind, InputSummary, ModelValue,
15 ObjectKey, ObjectLocation, ObjectTypeName, ParsedDocument, Parser, PdfName, PdfvError,
16 PolicyOperator, PolicyReport, PolicyRule, PolicyRuleResult, PolicySet, PolicyValue,
17 ProfileReport, ProfileRepository, PropertyName, ResourceLimits, Result, Rule, RuleEvaluator,
18 RuleId, RuleOutcome, TaskDuration, UnsupportedRule, ValidationError, ValidationOptions,
19 ValidationReport, ValidationStatus,
20 profile::DefaultRuleEvaluator,
21 xmp::{FlavourDetector, parse_document_xmp},
22};
23
// Per-family tables of "direct" property names. The entries look like PDF
// dictionary keys (NOTE(review): confirm against the code that reads these
// tables, which is not part of this excerpt). Several tables are reused
// unchanged as family property schemas by the `*_PROPERTIES` aliases below.

/// Direct property names for catalog objects.
const CATALOG_DIRECT_PROPERTIES: &[&str] = &["Type", "Metadata", "Pages", "OutputIntents"];
/// Direct property names for metadata objects.
const METADATA_DIRECT_PROPERTIES: &[&str] = &["Type", "Subtype", "Filter", "Length"];
/// Direct property names for page objects.
const PAGE_DIRECT_PROPERTIES: &[&str] = &["Type", "Parent", "Contents", "Resources", "Annots"];
/// Direct property names for font objects.
const FONT_DIRECT_PROPERTIES: &[&str] = &[
    "Type",
    "Subtype",
    "BaseFont",
    "FontDescriptor",
    "FirstChar",
    "LastChar",
    "Widths",
    "Encoding",
    "ToUnicode",
    "CIDToGIDMap",
];
/// Direct property names for annotation objects.
const ANNOTATION_DIRECT_PROPERTIES: &[&str] = &[
    "Type", "Subtype", "F", "C", "IC", "AP", "FT", "CA", "A", "AA",
];
/// Direct property names for output intent objects.
const OUTPUT_INTENT_DIRECT_PROPERTIES: &[&str] = &[
    "Type",
    "S",
    "DestOutputProfile",
    "OutputConditionIdentifier",
    "Info",
];
/// Direct property names for stream objects.
const STREAM_DIRECT_PROPERTIES: &[&str] = &[
    "Type",
    "Subtype",
    "Filter",
    "DecodeParms",
    "F",
    "FFilter",
    "FDecodeParms",
];

/// Direct property names for resource objects.
const RESOURCE_DIRECT_PROPERTIES: &[&str] = &[
    "Font",
    "XObject",
    "ColorSpace",
    "ExtGState",
    "Pattern",
    "Shading",
    "Properties",
    "ProcSet",
];
/// Direct property names for interactive form (AcroForm) objects.
const ACRO_FORM_DIRECT_PROPERTIES: &[&str] = &[
    "Fields",
    "NeedAppearances",
    "SigFlags",
    "DR",
    "DA",
    "Q",
    "XFA",
];
/// Direct property names for structure objects.
const STRUCTURE_DIRECT_PROPERTIES: &[&str] = &[
    "Type",
    "K",
    "ParentTree",
    "ParentTreeNextKey",
    "RoleMap",
    "ClassMap",
    "IDTree",
];
/// Direct property names for optional content objects.
const OPTIONAL_CONTENT_DIRECT_PROPERTIES: &[&str] = &["OCGs", "D", "Configs"];
/// Direct property names for names objects.
const NAMES_DIRECT_PROPERTIES: &[&str] = &[
    "Dests",
    "AP",
    "JavaScript",
    "Pages",
    "Templates",
    "IDS",
    "URLS",
    "EmbeddedFiles",
    "AlternatePresentations",
    "Renditions",
];
/// Direct property names for outline objects.
const OUTLINES_DIRECT_PROPERTIES: &[&str] = &["Type", "First", "Last", "Count"];
/// Direct property names for destination objects.
const DESTINATION_DIRECT_PROPERTIES: &[&str] = &["D", "Dest", "A"];
/// Direct property names for action objects.
const ACTION_DIRECT_PROPERTIES: &[&str] = &["Type", "S", "D", "URI", "Next", "NewWindow"];
/// Direct property names for form field objects.
const FORM_FIELD_DIRECT_PROPERTIES: &[&str] = &[
    "FT", "T", "TU", "TM", "Ff", "V", "DV", "Kids", "Parent", "AA",
];
/// Direct property names for image objects.
const IMAGE_DIRECT_PROPERTIES: &[&str] = &[
    "Type",
    "Subtype",
    "Width",
    "Height",
    "ColorSpace",
    "BitsPerComponent",
    "Filter",
    "DecodeParms",
    "SMask",
    "Mask",
    "Intent",
];
/// Direct property names for XObject objects.
const XOBJECT_DIRECT_PROPERTIES: &[&str] = &[
    "Type",
    "Subtype",
    "BBox",
    "Matrix",
    "Resources",
    "Group",
    "Filter",
    "DecodeParms",
];
/// Direct property names for CMap objects.
const CMAP_DIRECT_PROPERTIES: &[&str] = &["Type", "Subtype", "CMapName", "CIDSystemInfo"];
/// Direct property names for color space objects.
const COLOR_SPACE_DIRECT_PROPERTIES: &[&str] =
    &["Type", "N", "Alternate", "Range", "Metadata", "Filter"];
/// Direct property names for extended graphics state objects.
const EXT_GSTATE_DIRECT_PROPERTIES: &[&str] =
    &["Type", "BM", "CA", "ca", "SMask", "AIS", "OP", "op", "OPM"];
/// Direct property names for signature objects.
const SIGNATURE_DIRECT_PROPERTIES: &[&str] = &[
    "Type",
    "Filter",
    "SubFilter",
    "ByteRange",
    "Contents",
    "Reference",
    "M",
];
/// Direct property names for security objects.
const SECURITY_DIRECT_PROPERTIES: &[&str] = &["Filter", "SubFilter", "V", "R", "Length", "P"];
144
/// Flat list of direct property names, kept in byte-wise (ASCII) sorted order
/// with upper-case names ahead of lower-case ones.
///
/// NOTE(review): this list is not an exact union of the per-family
/// `*_DIRECT_PROPERTIES` tables. `CMapName`, `ExtGState`, `NewWindow`,
/// `OutputIntents` and `R` occur in those tables but not here, while `Annot`
/// occurs here only. Confirm whether that is intentional; the consumer of this
/// list is outside this excerpt.
const DIRECT_PROPERTY_NAMES: &[&str] = &[
    "A",
    "AA",
    "AIS",
    "AP",
    "Alternate",
    "AlternatePresentations",
    "Annot",
    "Annots",
    "BBox",
    "BM",
    "BaseFont",
    "BitsPerComponent",
    "ByteRange",
    "C",
    "CA",
    "CIDSystemInfo",
    "CIDToGIDMap",
    "ClassMap",
    "ColorSpace",
    "Configs",
    "Contents",
    "Count",
    "D",
    "DA",
    "DR",
    "DV",
    "DecodeParms",
    "Dest",
    "DestOutputProfile",
    "Dests",
    "EmbeddedFiles",
    "Encoding",
    "F",
    "FDecodeParms",
    "FFilter",
    "FT",
    "Ff",
    "Fields",
    "Filter",
    "First",
    "FirstChar",
    "Font",
    "FontDescriptor",
    "Group",
    "Height",
    "IC",
    "IDS",
    "IDTree",
    "Info",
    "Intent",
    "JavaScript",
    "K",
    "Kids",
    "Last",
    "LastChar",
    "Length",
    "M",
    "Mask",
    "Matrix",
    "Metadata",
    "N",
    "NeedAppearances",
    "Next",
    "OCGs",
    "OP",
    "OPM",
    "OutputConditionIdentifier",
    "P",
    "Pages",
    "Parent",
    "ParentTree",
    "ParentTreeNextKey",
    "Pattern",
    "ProcSet",
    "Properties",
    "Q",
    "Range",
    "Reference",
    "Renditions",
    "Resources",
    "RoleMap",
    "S",
    "SMask",
    "Shading",
    "SigFlags",
    "SubFilter",
    "Subtype",
    "T",
    "TM",
    "TU",
    "Templates",
    "ToUnicode",
    "Type",
    "URI",
    "URLS",
    "V",
    "Width",
    "Widths",
    "XFA",
    "XObject",
    "ca",
    "op",
];
249
// Property-name schemas per model family. `ModelRegistry::default_registry`
// passes these tables to `family(...)` together with a family name; where that
// pairing is visible in this file the family name is given in the comment.
// Tables defined as plain aliases reuse the matching `*_DIRECT_PROPERTIES`
// table unchanged.

/// Property names shared by generic objects.
const OBJECT_PROPERTIES: &[&str] = &["Type", "Subtype"];
/// Property schema registered for the `document` family.
const DOCUMENT_PROPERTIES: &[&str] = &[
    "headerOffset",
    "postEOFDataSize",
    "header",
    "encrypted",
    "isEncrypted",
    "hasCatalog",
    "containsXRefStream",
    "nrIndirects",
    "containsPDFUAIdentification",
    "containsPDFAIdentification",
    "part",
    "partPrefix",
    "rev",
    "revPrefix",
];
/// Property schema registered for the `catalog` family.
const CATALOG_PROPERTIES: &[&str] = &[
    "hasMetadata",
    "hasAcroForm",
    "hasStructTreeRoot",
    "hasOCProperties",
    "hasLang",
    "hasOutlines",
    "hasNames",
    "hasDests",
    "language",
    "permissions",
    "containsStructTreeRoot",
    "containsOCProperties",
    "containsAcroForm",
    "Marked",
    "Type",
    "Metadata",
    "Pages",
    "OutputIntents",
    "AcroForm",
    "StructTreeRoot",
    "OCProperties",
    "Lang",
    "Perms",
    "Outlines",
    "Names",
    "Dests",
];
/// Property schema registered for the `metadata` family.
const METADATA_PROPERTIES: &[&str] = &[
    "present",
    "catalogMetadata",
    "containsPDFAIdentification",
    "containsPDFUAIdentification",
    "part",
    "partPrefix",
    "conformance",
    "conformancePrefix",
    "rev",
    "revPrefix",
    "amdPrefix",
    "corrPrefix",
    "declarations",
    "Type",
    "Subtype",
    "Filter",
    "Length",
];
/// Property schema registered for the `page` family.
const PAGE_PROPERTIES: &[&str] = &[
    "hasContents",
    "hasResources",
    "annotationCount",
    "Type",
    "Parent",
    "Contents",
    "Resources",
    "Annots",
];
/// Property schema registered for the `pageTree` family.
const PAGE_TREE_PROPERTIES: &[&str] = &["Type", "Kids", "Count", "Parent", "Resources"];
/// Property schema registered for the `resource` family.
const RESOURCE_PROPERTIES: &[&str] = RESOURCE_DIRECT_PROPERTIES;
/// Property schema registered for the `names` family.
const NAMES_PROPERTIES: &[&str] = NAMES_DIRECT_PROPERTIES;
/// Property schema registered for the `outline` family.
const OUTLINE_PROPERTIES: &[&str] = OUTLINES_DIRECT_PROPERTIES;
/// Property schema registered for the `destination` family.
const DESTINATION_PROPERTIES: &[&str] = DESTINATION_DIRECT_PROPERTIES;
/// Property schema registered for the `acroForm` family.
const ACRO_FORM_PROPERTIES: &[&str] = ACRO_FORM_DIRECT_PROPERTIES;
/// Property schema registered for the `optionalContentProperties` family.
const OPTIONAL_CONTENT_PROPERTIES: &[&str] = OPTIONAL_CONTENT_DIRECT_PROPERTIES;
/// Property schema registered for the `permissions` family.
const PERMISSIONS_PROPERTIES: &[&str] = &["DocMDP", "UR", "UR3"];
/// Property schema registered for the `font` family.
const FONT_PROPERTIES: &[&str] = &[
    "embedded",
    "hasSubtype",
    "Type",
    "Subtype",
    "BaseFont",
    "FontDescriptor",
    "FirstChar",
    "LastChar",
    "Widths",
    "Encoding",
    "ToUnicode",
    "CIDToGIDMap",
];
/// Property schema registered for the `cMap` family.
const CMAP_PROPERTIES: &[&str] = CMAP_DIRECT_PROPERTIES;
/// Property schema registered for the `image` family.
const IMAGE_PROPERTIES: &[&str] = IMAGE_DIRECT_PROPERTIES;
/// Property schema registered for the `xObject` family.
const XOBJECT_PROPERTIES: &[&str] = XOBJECT_DIRECT_PROPERTIES;
/// Property schema registered for the `contentStream` family.
const CONTENT_STREAM_PROPERTIES: &[&str] = &[
    "lengthMatches",
    "declaredLength",
    "discoveredLength",
    "operatorCount",
    "markedContentCount",
    "Type",
    "Subtype",
    "Filter",
    "DecodeParms",
    "F",
    "FFilter",
    "FDecodeParms",
];
/// Property schema registered for the `undefinedOperator` family.
const UNDEFINED_OPERATOR_PROPERTIES: &[&str] = &["name"];
/// Property names for annotation objects.
const ANNOTATION_PROPERTIES: &[&str] = &[
    "hasSubtype",
    "Type",
    "Subtype",
    "F",
    "C",
    "IC",
    "AP",
    "FT",
    "CA",
    "A",
    "AA",
];
/// Property names for action objects.
const ACTION_PROPERTIES: &[&str] = ACTION_DIRECT_PROPERTIES;
/// Property names for form field objects.
const FORM_FIELD_PROPERTIES: &[&str] = FORM_FIELD_DIRECT_PROPERTIES;
/// Property names for color space objects.
const COLOR_SPACE_PROPERTIES: &[&str] = COLOR_SPACE_DIRECT_PROPERTIES;
/// Property names for extended graphics state objects.
const EXT_GSTATE_PROPERTIES: &[&str] = EXT_GSTATE_DIRECT_PROPERTIES;
/// Property names for structure objects.
const STRUCTURE_PROPERTIES: &[&str] = STRUCTURE_DIRECT_PROPERTIES;
/// Property names for structure element objects.
const STRUCTURE_ELEMENT_PROPERTIES: &[&str] = &[
    "Type",
    "S",
    "P",
    "K",
    "Pg",
    "Alt",
    "ActualText",
    "Lang",
    "A",
    "C",
    "ID",
    "containsParent",
    "containsRef",
    "parentStandardType",
    "parentStandardTypeNamespaceURL",
    "parentType",
    "parentNamespaceURL",
    "structParentStandardType",
    "structParentType",
    "firstChildStandardTypeNamespaceURL",
    "kidsStandardTypes",
    "hasContentItems",
    "containsLabels",
    "ListNumbering",
    "NoteType",
    "orphanRefs",
    "ghostRefs",
    "isArtifact",
    "isTaggedContent",
    "parentsTags",
    "isNotMappedToStandardType",
    "circularMappingExist",
    "roleMapToSameNamespaceTag",
    "remappedStandardType",
    "hasIntersection",
    "numberOfColumnWithWrongRowSpan",
    "numberOfRowWithWrongColumnSpan",
    "wrongColumnSpan",
    "differentTargetAnnotObjectKey",
];
/// Property names for signature objects.
const SIGNATURE_PROPERTIES: &[&str] = SIGNATURE_DIRECT_PROPERTIES;
/// Property names for security objects.
const SECURITY_PROPERTIES: &[&str] = SECURITY_DIRECT_PROPERTIES;
/// Property names for output intent objects.
const OUTPUT_INTENT_PROPERTIES: &[&str] = &[
    "hasDestOutputProfile",
    "Type",
    "S",
    "DestOutputProfile",
    "OutputConditionIdentifier",
    "Info",
];
/// Property names for stream objects; also the schema registered for the
/// `embeddedFontFile` family.
const STREAM_PROPERTIES: &[&str] = &[
    "lengthMatches",
    "declaredLength",
    "discoveredLength",
    "streamKeywordCRLFCompliant",
    "endstreamKeywordEOLCompliant",
    "Type",
    "Subtype",
    "Filter",
    "DecodeParms",
    "F",
    "FFilter",
    "FDecodeParms",
];
447
/// Property names whose string values are copied into feature reports as-is.
///
/// `safe_feature_value` replaces a string value of any property *not* listed
/// here with `FeatureValue::RedactedString`, which carries only the byte
/// length of the original string.
const SAFE_FEATURE_STRING_PROPERTIES: &[&str] = &[
    "BaseFont",
    "CIDToGIDMap",
    "CMapName",
    "Encoding",
    "FT",
    "Filter",
    "S",
    "Subtype",
    "Type",
    "conformance",
    "conformancePrefix",
    "header",
    "partPrefix",
    "revPrefix",
];
464
// Link tables handed to `family(...)` in `ModelRegistry::default_registry`.
// NOTE(review): each pair is presumably (link name, target family name), in
// the same order as the arguments of `LinkSpec::new(name, target)` — confirm
// against `family`, which is outside this excerpt.

/// Link table for families that are registered without links.
const EMPTY_LINK_NAMES: &[(&str, &str)] = &[];
/// Link table registered for the `document` family.
const DOCUMENT_LINKS: &[(&str, &str)] = &[("catalog", "catalog"), ("streams", "stream")];
/// Link table registered for the `catalog` family.
const CATALOG_LINKS: &[(&str, &str)] = &[
    ("metadata", "metadata"),
    ("pages", "page"),
    ("outputIntents", "outputIntent"),
    ("acroForm", "acroForm"),
    ("structureTreeRoot", "structureTreeRoot"),
    ("optionalContentProperties", "optionalContentProperties"),
    ("names", "names"),
    ("outlines", "outline"),
    ("destinations", "destination"),
    ("permissions", "permissions"),
];
/// Link table registered for the `page` family.
const PAGE_LINKS: &[(&str, &str)] = &[
    ("resources", "resource"),
    ("fonts", "font"),
    ("annotations", "annotation"),
    ("contentStreams", "contentStream"),
];
485
/// Which model families a validation run should report features for.
///
/// Serialized with camelCase names; unknown fields are rejected on
/// deserialization.
#[derive(Clone, Debug, Default, serde::Deserialize, Eq, PartialEq, serde::Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub enum FeatureSelection {
    /// Feature extraction is not requested. This is the default.
    ///
    /// When a policy forces extraction anyway, this variant is resolved to
    /// every registered family, exactly like [`FeatureSelection::All`].
    #[default]
    None,
    /// Report features for every family known to the model registry.
    All,
    /// Report features only for the listed families.
    Families {
        /// Family names to report; each must be known to the model registry,
        /// otherwise configuration validation fails.
        families: Vec<ObjectTypeName>,
    },
}
502
503impl FeatureSelection {
504 #[must_use]
506 pub fn is_enabled(&self) -> bool {
507 !matches!(self, Self::None)
508 }
509}
510
511#[derive(Clone, Debug, Eq, PartialEq)]
513pub struct InputName(Option<PathBuf>);
514
515impl InputName {
516 #[must_use]
518 pub fn memory() -> Self {
519 Self(None)
520 }
521
522 #[must_use]
524 pub fn path(path: impl Into<PathBuf>) -> Self {
525 Self(Some(path.into()))
526 }
527
528 fn summary(&self, kind: InputKind, bytes: Option<u64>) -> InputSummary {
529 InputSummary::new(kind, self.0.clone(), bytes)
530 }
531}
532
533#[derive(Clone)]
535pub struct Validator {
536 options: ValidationOptions,
537 profiles: Arc<dyn ProfileRepository + Send + Sync>,
538}
539
540impl std::fmt::Debug for Validator {
541 fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
542 formatter
543 .debug_struct("Validator")
544 .field("options", &self.options)
545 .finish_non_exhaustive()
546 }
547}
548
impl Validator {
    /// Creates a validator backed by the built-in profile repository.
    ///
    /// # Errors
    ///
    /// Returns an error when the repository cannot resolve profiles for the
    /// configured flavour selection, or when the feature selection or policy
    /// in `options` fails configuration validation.
    pub fn new(options: ValidationOptions) -> Result<Self> {
        let validator = Self {
            options,
            profiles: Arc::new(BuiltinProfileRepository::new()),
        };
        // Resolve profiles once up front so a bad flavour selection is
        // reported at construction time; the resolved profiles are discarded.
        validator
            .profiles
            .profiles_for(&validator.options.flavour)?;
        validate_feature_configuration(&validator.options)?;
        Ok(validator)
    }

    /// Creates a validator that resolves profiles from a caller-supplied
    /// repository.
    ///
    /// # Errors
    ///
    /// Returns an error when `profiles` cannot resolve the configured flavour
    /// selection, or when the feature selection or policy in `options` fails
    /// configuration validation.
    pub fn with_profiles(
        options: ValidationOptions,
        profiles: Arc<dyn ProfileRepository + Send + Sync>,
    ) -> Result<Self> {
        let validator = Self { options, profiles };
        // Same eager checks as `new`.
        validator
            .profiles
            .profiles_for(&validator.options.flavour)?;
        validate_feature_configuration(&validator.options)?;
        Ok(validator)
    }

    /// Opens the file at `path` and validates it as an `InputKind::File` input.
    ///
    /// # Errors
    ///
    /// Returns `PdfvError::Io` carrying the path when the file cannot be
    /// opened, and otherwise whatever the validation pipeline returns.
    #[allow(
        clippy::disallowed_types,
        reason = "core validation is synchronous per spec; async file I/O belongs to the CLI phase"
    )]
    pub fn validate_path(&self, path: impl AsRef<Path>) -> Result<ValidationReport> {
        let path = path.as_ref();
        let file = std::fs::File::open(path).map_err(|source| PdfvError::Io {
            path: Some(path.to_path_buf()),
            source,
        })?;
        let name = InputName::path(path);
        self.validate_reader_with_kind(file, &name, InputKind::File)
    }

    /// Validates a seekable reader as an `InputKind::Memory` input, labelling
    /// the report with `name`.
    ///
    /// # Errors
    ///
    /// Returns whatever the validation pipeline returns.
    #[allow(
        clippy::needless_pass_by_value,
        reason = "public API owns InputName to match the validation facade contract"
    )]
    pub fn validate_reader<R: Read + Seek>(
        &self,
        source: R,
        name: InputName,
    ) -> Result<ValidationReport> {
        self.validate_reader_with_kind(source, &name, InputKind::Memory)
    }

    /// Runs the whole pipeline for one input: parse, profile selection,
    /// profile validation, optional feature extraction, optional policy
    /// evaluation, and report assembly.
    ///
    /// Early returns:
    /// - a `PdfvError::Parse` failure is turned into a report by
    ///   `parse_failed_report`; any other parse-stage error is propagated;
    /// - an encrypted document yields a `ValidationStatus::Encrypted` report;
    /// - an empty profile list yields a `ValidationStatus::Incomplete` report.
    #[allow(
        clippy::too_many_lines,
        reason = "the facade keeps parse, validation, feature, and policy task ordering in one \
                  place so report construction remains auditable"
    )]
    fn validate_reader_with_kind<R: Read + Seek>(
        &self,
        mut source: R,
        name: &InputName,
        kind: InputKind,
    ) -> Result<ValidationReport> {
        let started = Instant::now();
        // Measure the input, then return to the start before parsing.
        let bytes = reader_len(&mut source)?;
        source
            .rewind()
            .map_err(|source| PdfvError::Io { path: None, source })?;
        let source_summary = name.summary(kind, bytes);
        let parser = Parser::new(self.options.resource_limits.clone());
        let parsed = match parser.parse_with_options(
            source,
            crate::ParseOptions {
                password: self.options.password.as_ref(),
            },
        ) {
            Ok(parsed) => parsed,
            Err(PdfvError::Parse(error)) => {
                return parse_failed_report(source_summary, &error, started.elapsed());
            }
            Err(error) => return Err(error),
        };

        let mut parsed = parsed;
        if parsed.is_encrypted() {
            // Encrypted documents are not validated against profiles; only the
            // XMP parse facts and warnings are merged into the report.
            let xmp = parse_document_xmp(&parsed, &self.options.resource_limits, false)?;
            parsed.parse_facts.extend(xmp.parse_facts);
            parsed.warnings.extend(xmp.warnings);
            return base_report(
                source_summary,
                ValidationStatus::Encrypted,
                Vec::new(),
                parsed,
                started.elapsed(),
            );
        }

        let profiles = match &self.options.flavour {
            crate::FlavourSelection::Auto { default } => {
                // Detection supplies the profiles together with its own parse
                // facts and warnings.
                let detected = FlavourDetector::new(Arc::clone(&self.profiles)).detect(
                    &parsed,
                    default.as_ref(),
                    &self.options.resource_limits,
                )?;
                parsed.parse_facts.extend(detected.parse_facts);
                parsed.warnings.extend(detected.warnings);
                detected.profiles
            }
            crate::FlavourSelection::Explicit { .. }
            | crate::FlavourSelection::CustomProfile { .. } => {
                let xmp = parse_document_xmp(&parsed, &self.options.resource_limits, false)?;
                parsed.parse_facts.extend(xmp.parse_facts);
                parsed.warnings.extend(xmp.warnings);
                self.profiles.profiles_for(&self.options.flavour)?
            }
        };
        if profiles.is_empty() {
            return base_report(
                source_summary,
                ValidationStatus::Incomplete,
                Vec::new(),
                parsed,
                started.elapsed(),
            );
        }
        let mut session = ValidationSession::new(
            parsed,
            self.options.resource_limits.clone(),
            self.options.max_failed_assertions_per_rule.get(),
            self.options.record_passed_assertions,
        );
        let mut profile_reports = Vec::with_capacity(profiles.len());
        for profile in &profiles {
            profile_reports.push(session.validate_profile(profile)?);
        }
        // Any unsupported rule makes the result Incomplete, even when a
        // profile is also non-compliant.
        let mut status = if profile_reports
            .iter()
            .any(|report| !report.unsupported_rules.is_empty())
        {
            ValidationStatus::Incomplete
        } else if profile_reports.iter().all(|report| report.is_compliant) {
            ValidationStatus::Valid
        } else {
            ValidationStatus::Invalid
        };
        let flavours = profiles
            .iter()
            .map(|profile| profile.flavour.clone())
            .collect::<Vec<_>>();

        // A policy is evaluated against the feature report, so it forces
        // extraction even when no feature selection is enabled.
        let needs_features =
            self.options.feature_selection.is_enabled() || self.options.policy.is_some();
        let feature_started = Instant::now();
        let feature_report = if needs_features {
            Some(session.extract_features(&self.options.feature_selection)?)
        } else {
            None
        };
        let feature_duration = needs_features.then(|| {
            TaskDuration::from_duration(
                Identifier::unchecked("featureExtraction"),
                feature_started.elapsed(),
            )
        });
        let policy_started = Instant::now();
        let policy_report = match (&self.options.policy, feature_report.as_ref()) {
            (Some(policy), Some(features)) => {
                policy.validate()?;
                let report = evaluate_policy(policy, features)?;
                // A failed policy downgrades only a Valid result; Incomplete
                // and Invalid are left untouched.
                if !report.is_compliant && matches!(status, ValidationStatus::Valid) {
                    status = ValidationStatus::Invalid;
                }
                Some(report)
            }
            (Some(_), None) => {
                // Not reachable through `needs_features` above, which extracts
                // features whenever a policy is set; kept as an explicit error.
                return Err(crate::PolicyError::Evaluation {
                    reason: BoundedText::unchecked("policy evaluation requires feature report"),
                }
                .into());
            }
            (None, _) => None,
        };
        let policy_duration = self.options.policy.is_some().then(|| {
            TaskDuration::from_duration(Identifier::unchecked("policy"), policy_started.elapsed())
        });
        let parse_facts = session.document.parse_facts.clone();
        let warnings = session.document.warnings.clone();
        // The "validate" duration is taken from `started` at this point, so it
        // also covers the feature and policy stages above.
        let mut task_durations = vec![TaskDuration::from_duration(
            Identifier::new("validate")?,
            started.elapsed(),
        )];
        if let Some(duration) = feature_duration {
            task_durations.push(duration);
        }
        if let Some(duration) = policy_duration {
            task_durations.push(duration);
        }
        Ok(ValidationReport::builder()
            .engine_version(ENGINE_VERSION.to_owned())
            .source(source_summary)
            .status(status)
            .flavours(flavours)
            .profile_reports(profile_reports)
            .parse_facts(parse_facts)
            .warnings(warnings)
            .feature_report(feature_report)
            .policy_report(policy_report)
            .task_durations(task_durations)
            .build())
    }
}
779
/// State shared by the profile-validation and feature-extraction passes over
/// one parsed document.
#[derive(Debug)]
pub struct ValidationSession {
    /// The parsed document being traversed.
    document: ParsedDocument,
    /// Resource limits applied while traversing the model graph.
    limits: ResourceLimits,
    /// Forwarded to the per-profile state for each validated profile.
    max_failed_assertions_per_rule: u32,
    /// Forwarded to the per-profile state for each validated profile.
    record_passed_assertions: bool,
}
788
789impl ValidationSession {
790 fn new(
791 document: ParsedDocument,
792 limits: ResourceLimits,
793 max_failed_assertions_per_rule: u32,
794 record_passed_assertions: bool,
795 ) -> Self {
796 Self {
797 document,
798 limits,
799 max_failed_assertions_per_rule,
800 record_passed_assertions,
801 }
802 }
803
804 fn validate_profile(&mut self, profile: &crate::ValidationProfile) -> Result<ProfileReport> {
805 let index = RuleIndex::new(&profile.rules);
806 let graph = ModelGraph::for_rules(&self.document, &self.limits, &profile.rules);
807 let mut evaluator = DefaultRuleEvaluator::new(self.limits.clone());
808 let mut state = ProfileState::new(
809 profile.identity.clone(),
810 self.max_failed_assertions_per_rule,
811 self.record_passed_assertions,
812 );
813 state.register_static_unsupported_rules(&profile.rules);
814 let mut stack = Vec::from([ModelObjectRef::Document(DocumentModel::new(&self.document))]);
815 let mut visited = HashSet::new();
816 let mut deferred = Vec::new();
817
818 while let Some(object) = stack.pop() {
819 let visited_key = object.identity_key();
820 if !visited.insert(visited_key) {
821 continue;
822 }
823 let object_rules = index.rules_for(&object);
824 for rule in object_rules {
825 if matches!(rule.test, crate::RuleExpr::Unsupported { .. }) {
826 continue;
827 }
828 if rule.deferred {
829 deferred.push((object.clone(), rule));
830 } else {
831 state.apply_rule(&object, rule, &mut evaluator)?;
832 }
833 }
834 if u64::try_from(visited.len()).map_err(|_| ValidationError::LimitExceeded {
835 limit: "max_objects",
836 })? > self.limits.max_objects
837 {
838 return Err(ValidationError::LimitExceeded {
839 limit: "max_objects",
840 }
841 .into());
842 }
843 let object_budget = remaining_object_budget(&self.limits, visited.len(), stack.len())?;
844 for linked in object.linked_objects(&graph, object_budget)? {
845 stack.push(linked);
846 }
847 }
848 for (object, rule) in deferred {
849 state.apply_rule(&object, rule, &mut evaluator)?;
850 }
851 Ok(state.finish())
852 }
853
854 fn extract_features(&self, selection: &FeatureSelection) -> Result<FeatureReport> {
855 let registry = ModelRegistry::default_registry();
856 let selected = selected_feature_families(selection, ®istry)?;
857 let graph = ModelGraph::with_all_families(&self.document, &self.limits);
858 let mut stack = Vec::from([ModelObjectRef::Document(DocumentModel::new(&self.document))]);
859 let mut visited = HashSet::new();
860 let mut objects = Vec::new();
861 let mut truncated = false;
862
863 while let Some(object) = stack.pop() {
864 let visited_key = object.identity_key();
865 if !visited.insert(visited_key) {
866 continue;
867 }
868 let object_type = object.object_type();
869 if selected.contains(&object_type)
870 && let Some(feature) = feature_object(®istry, &object, &object_type)?
871 {
872 objects.push(feature);
873 }
874 if u64::try_from(visited.len()).map_err(|_| ValidationError::LimitExceeded {
875 limit: "max_objects",
876 })? > self.limits.max_objects
877 {
878 truncated = true;
879 break;
880 }
881 let object_budget =
882 match remaining_object_budget(&self.limits, visited.len(), stack.len()) {
883 Ok(budget) => budget,
884 Err(error) if is_object_limit_error(&error) => {
885 truncated = true;
886 break;
887 }
888 Err(error) => return Err(error),
889 };
890 let linked_objects = match object.linked_objects(&graph, object_budget) {
891 Ok(objects) => objects,
892 Err(error) if is_object_limit_error(&error) => {
893 truncated = true;
894 break;
895 }
896 Err(error) => return Err(error),
897 };
898 for linked in linked_objects {
899 stack.push(linked);
900 }
901 }
902 let visited_objects = u64::try_from(visited.len()).unwrap_or(u64::MAX);
903 Ok(FeatureReport::builder()
904 .objects(objects)
905 .visited_objects(visited_objects)
906 .selected_families(selected.into_iter().collect())
907 .truncated(truncated)
908 .build())
909 }
910}
911
912fn selected_feature_families(
913 selection: &FeatureSelection,
914 registry: &ModelRegistry,
915) -> Result<BTreeSet<ObjectTypeName>> {
916 match selection {
917 FeatureSelection::None | FeatureSelection::All => {
918 Ok(registry.family_names().cloned().collect())
919 }
920 FeatureSelection::Families { families } => {
921 let mut selected = BTreeSet::new();
922 for family in families {
923 if !registry.has_family(family) {
924 return Err(crate::ConfigError::InvalidValue {
925 field: "extract",
926 reason: BoundedText::unchecked("unknown feature family"),
927 }
928 .into());
929 }
930 selected.insert(family.clone());
931 }
932 Ok(selected)
933 }
934 }
935}
936
937fn is_object_limit_error(error: &PdfvError) -> bool {
938 matches!(
939 error,
940 PdfvError::Validation(ValidationError::LimitExceeded {
941 limit: "max_objects"
942 })
943 )
944}
945
946fn validate_feature_configuration(options: &ValidationOptions) -> Result<()> {
947 let registry = ModelRegistry::default_registry();
948 let _selected = selected_feature_families(&options.feature_selection, ®istry)?;
949 if let Some(policy) = &options.policy {
950 policy.validate()?;
951 validate_policy_schema(policy, ®istry)?;
952 }
953 Ok(())
954}
955
956fn validate_policy_schema(policy: &PolicySet, registry: &ModelRegistry) -> Result<()> {
957 for rule in &policy.rules {
958 if !registry.has_family(&rule.family) {
959 return Err(policy_invalid("family", "unknown policy feature family"));
960 }
961 if !registry.has_family_property(&rule.family, &rule.field) {
962 return Err(policy_invalid(
963 "field",
964 "unknown policy feature field for family",
965 ));
966 }
967 match rule.operator {
968 PolicyOperator::Exists | PolicyOperator::Absent => {
969 if rule.value.is_some() {
970 return Err(policy_invalid(
971 "value",
972 "exists and absent operators do not accept values",
973 ));
974 }
975 }
976 PolicyOperator::Equals | PolicyOperator::NotEquals => {
977 if rule.value.is_none() {
978 return Err(policy_invalid(
979 "value",
980 "comparison operator requires a value",
981 ));
982 }
983 }
984 PolicyOperator::Min | PolicyOperator::Max => {
985 if !matches!(rule.value, Some(PolicyValue::Number(_))) {
986 return Err(policy_invalid(
987 "value",
988 "numeric operator requires a number value",
989 ));
990 }
991 }
992 }
993 }
994 Ok(())
995}
996
997fn policy_invalid(field: &'static str, reason: &'static str) -> PdfvError {
998 crate::PolicyError::InvalidField {
999 field,
1000 reason: BoundedText::unchecked(reason),
1001 }
1002 .into()
1003}
1004
1005fn feature_object(
1006 registry: &ModelRegistry,
1007 object: &ModelObjectRef<'_>,
1008 object_type: &ObjectTypeName,
1009) -> Result<Option<FeatureObject>> {
1010 let Some(properties) = registry.family_property_names(object_type) else {
1011 return Ok(None);
1012 };
1013 let mut values = BTreeMap::new();
1014 for property in properties {
1015 match object.property(&property) {
1016 Ok(value) => {
1017 values.insert(property.clone(), safe_feature_value(&property, value));
1018 }
1019 Err(PdfvError::Profile(crate::ProfileError::UnknownProperty { .. })) => {}
1020 Err(error) => return Err(error),
1021 }
1022 }
1023 Ok(Some(
1024 FeatureObject::builder()
1025 .family(object_type.clone())
1026 .location(object.location())
1027 .context(object.context())
1028 .properties(values)
1029 .build(),
1030 ))
1031}
1032
1033impl From<ModelValue> for FeatureValue {
1034 fn from(value: ModelValue) -> Self {
1035 match value {
1036 ModelValue::Null => Self::Null,
1037 ModelValue::Bool(value) => Self::Bool(value),
1038 ModelValue::Number(value) => Self::Number(value),
1039 ModelValue::String(value) => Self::String(value),
1040 ModelValue::ObjectKey(value) => Self::ObjectKey(value),
1041 ModelValue::List(values) => {
1042 Self::List(values.into_iter().map(FeatureValue::from).collect())
1043 }
1044 }
1045 }
1046}
1047
1048fn safe_feature_value(property: &PropertyName, value: ModelValue) -> FeatureValue {
1049 match value {
1050 ModelValue::String(value)
1051 if !SAFE_FEATURE_STRING_PROPERTIES.contains(&property.as_str()) =>
1052 {
1053 FeatureValue::RedactedString {
1054 bytes: u64::try_from(value.as_str().len()).unwrap_or(u64::MAX),
1055 }
1056 }
1057 ModelValue::List(values) => FeatureValue::List(
1058 values
1059 .into_iter()
1060 .map(|value| safe_feature_value(property, value))
1061 .collect(),
1062 ),
1063 other => FeatureValue::from(other),
1064 }
1065}
1066
1067fn evaluate_policy(policy: &PolicySet, features: &FeatureReport) -> Result<PolicyReport> {
1068 let results = policy
1069 .rules
1070 .iter()
1071 .map(|rule| evaluate_policy_rule(rule, features))
1072 .collect::<Result<Vec<_>>>()?;
1073 let is_compliant = results.iter().all(|result| result.passed);
1074 Ok(PolicyReport::builder()
1075 .name(policy.name.clone())
1076 .is_compliant(is_compliant)
1077 .results(results)
1078 .build())
1079}
1080
1081fn evaluate_policy_rule(rule: &PolicyRule, features: &FeatureReport) -> Result<PolicyRuleResult> {
1082 let matches = features
1083 .objects
1084 .iter()
1085 .filter(|object| object.family == rule.family)
1086 .collect::<Vec<_>>();
1087 let values = matches
1088 .iter()
1089 .filter_map(|object| object.properties.get(&rule.field))
1090 .collect::<Vec<_>>();
1091 let passed = match rule.operator {
1092 PolicyOperator::Exists => !values.is_empty(),
1093 PolicyOperator::Absent => values.is_empty(),
1094 PolicyOperator::Equals => {
1095 let expected = required_policy_value(rule)?;
1096 values
1097 .iter()
1098 .any(|actual| policy_value_matches(actual, expected))
1099 }
1100 PolicyOperator::NotEquals => {
1101 let expected = required_policy_value(rule)?;
1102 values
1103 .iter()
1104 .all(|actual| !policy_value_matches(actual, expected))
1105 }
1106 PolicyOperator::Min => {
1107 let expected = required_policy_number(rule)?;
1108 values
1109 .iter()
1110 .filter_map(|value| feature_number(value))
1111 .any(|actual| actual >= expected)
1112 }
1113 PolicyOperator::Max => {
1114 let expected = required_policy_number(rule)?;
1115 values
1116 .iter()
1117 .filter_map(|value| feature_number(value))
1118 .any(|actual| actual <= expected)
1119 }
1120 };
1121 let matches = u64::try_from(matches.len()).unwrap_or(u64::MAX);
1122 Ok(PolicyRuleResult::builder()
1123 .id(rule.id.clone())
1124 .description(rule.description.clone())
1125 .passed(passed)
1126 .matches(matches)
1127 .message(policy_message(rule, passed, matches)?)
1128 .build())
1129}
1130
1131fn required_policy_value(rule: &PolicyRule) -> Result<&PolicyValue> {
1132 rule.value.as_ref().ok_or_else(|| {
1133 crate::PolicyError::InvalidField {
1134 field: "value",
1135 reason: BoundedText::unchecked("operator requires a comparison value"),
1136 }
1137 .into()
1138 })
1139}
1140
1141fn required_policy_number(rule: &PolicyRule) -> Result<f64> {
1142 match required_policy_value(rule)? {
1143 PolicyValue::Number(value) => Ok(f64::from(*value)),
1144 _ => Err(crate::PolicyError::InvalidField {
1145 field: "value",
1146 reason: BoundedText::unchecked("operator requires a numeric comparison value"),
1147 }
1148 .into()),
1149 }
1150}
1151
1152fn policy_value_matches(actual: &FeatureValue, expected: &PolicyValue) -> bool {
1153 match (actual, expected) {
1154 (FeatureValue::Bool(actual), PolicyValue::Bool(expected)) => actual == expected,
1155 (FeatureValue::Number(actual), PolicyValue::Number(expected)) => {
1156 (*actual - f64::from(*expected)).abs() < f64::EPSILON
1157 }
1158 (FeatureValue::String(actual), PolicyValue::String(expected)) => actual == expected,
1159 _ => false,
1160 }
1161}
1162
1163fn feature_number(value: &FeatureValue) -> Option<f64> {
1164 match value {
1165 FeatureValue::Number(value) if value.is_finite() => Some(*value),
1166 _ => None,
1167 }
1168}
1169
1170fn policy_message(
1171 rule: &PolicyRule,
1172 passed: bool,
1173 matches: u64,
1174) -> std::result::Result<BoundedText, crate::ConfigError> {
1175 let status = if passed { "passed" } else { "failed" };
1176 BoundedText::new(
1177 format!(
1178 "policy rule {} {status} with {matches} matching feature objects",
1179 rule.id.as_str()
1180 ),
1181 256,
1182 )
1183}
1184
1185#[derive(Clone, Debug, Eq, PartialEq)]
1187pub(crate) struct PropertySpec {
1188 pub name: PropertyName,
1190}
1191
1192impl PropertySpec {
1193 fn new(name: &str) -> Self {
1194 Self {
1195 name: PropertyName::unchecked(name),
1196 }
1197 }
1198}
1199
1200#[derive(Clone, Debug, Eq, PartialEq)]
1202pub(crate) struct LinkSpec {
1203 pub name: LinkName,
1205 pub target: ObjectTypeName,
1207}
1208
1209impl LinkSpec {
1210 fn new(name: &'static str, target: &'static str) -> Self {
1211 Self {
1212 name: LinkName(Identifier::unchecked(name)),
1213 target: ObjectTypeName::unchecked(target),
1214 }
1215 }
1216}
1217
/// Description of one model family: its name plus the property and link
/// schemas it exposes. `ModelRegistry` stores implementations behind
/// `Arc<dyn ModelFamily + Send + Sync>`.
pub(crate) trait ModelFamily {
    /// Returns the object type name that identifies this family.
    fn family_name(&self) -> ObjectTypeName;
    /// Returns the properties declared for this family.
    fn property_schema(&self) -> &[PropertySpec];
    /// Returns the links declared for this family.
    fn link_schema(&self) -> &[LinkSpec];
}
1227
1228#[derive(Clone)]
1230pub(crate) struct ModelRegistry {
1231 families: BTreeMap<ObjectTypeName, Arc<dyn ModelFamily + Send + Sync>>,
1232 all_properties: BTreeSet<PropertyName>,
1233}
1234
1235impl std::fmt::Debug for ModelRegistry {
1236 fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1237 formatter
1238 .debug_struct("ModelRegistry")
1239 .field("families", &self.families.keys().collect::<Vec<_>>())
1240 .field("all_properties_len", &self.all_properties.len())
1241 .finish()
1242 }
1243}
1244
1245impl ModelRegistry {
1246 #[must_use]
1248 pub(crate) fn default_registry() -> Self {
1249 let families = [
1250 family("document", DOCUMENT_PROPERTIES, DOCUMENT_LINKS),
1251 family("catalog", CATALOG_PROPERTIES, CATALOG_LINKS),
1252 family("metadata", METADATA_PROPERTIES, EMPTY_LINK_NAMES),
1253 family("page", PAGE_PROPERTIES, PAGE_LINKS),
1254 family("pageTree", PAGE_TREE_PROPERTIES, EMPTY_LINK_NAMES),
1255 family("resource", RESOURCE_PROPERTIES, EMPTY_LINK_NAMES),
1256 family("names", NAMES_PROPERTIES, EMPTY_LINK_NAMES),
1257 family("outline", OUTLINE_PROPERTIES, EMPTY_LINK_NAMES),
1258 family("destination", DESTINATION_PROPERTIES, EMPTY_LINK_NAMES),
1259 family("acroForm", ACRO_FORM_PROPERTIES, EMPTY_LINK_NAMES),
1260 family(
1261 "optionalContentProperties",
1262 OPTIONAL_CONTENT_PROPERTIES,
1263 EMPTY_LINK_NAMES,
1264 ),
1265 family("permissions", PERMISSIONS_PROPERTIES, EMPTY_LINK_NAMES),
1266 family("font", FONT_PROPERTIES, EMPTY_LINK_NAMES),
1267 family("cMap", CMAP_PROPERTIES, EMPTY_LINK_NAMES),
1268 family("embeddedFontFile", STREAM_PROPERTIES, EMPTY_LINK_NAMES),
1269 family("image", IMAGE_PROPERTIES, EMPTY_LINK_NAMES),
1270 family("xObject", XOBJECT_PROPERTIES, EMPTY_LINK_NAMES),
1271 family("contentStream", CONTENT_STREAM_PROPERTIES, EMPTY_LINK_NAMES),
1272 family(
1273 "undefinedOperator",
1274 UNDEFINED_OPERATOR_PROPERTIES,
1275 EMPTY_LINK_NAMES,
1276 ),
1277 family("annotation", ANNOTATION_PROPERTIES, EMPTY_LINK_NAMES),
1278 family("action", ACTION_PROPERTIES, EMPTY_LINK_NAMES),
1279 family("formField", FORM_FIELD_PROPERTIES, EMPTY_LINK_NAMES),
1280 family("colorSpace", COLOR_SPACE_PROPERTIES, EMPTY_LINK_NAMES),
1281 family("extGState", EXT_GSTATE_PROPERTIES, EMPTY_LINK_NAMES),
1282 family("structureTreeRoot", STRUCTURE_PROPERTIES, EMPTY_LINK_NAMES),
1283 family(
1284 "structureElement",
1285 STRUCTURE_ELEMENT_PROPERTIES,
1286 EMPTY_LINK_NAMES,
1287 ),
1288 family("signature", SIGNATURE_PROPERTIES, EMPTY_LINK_NAMES),
1289 family("security", SECURITY_PROPERTIES, EMPTY_LINK_NAMES),
1290 family("outputIntent", OUTPUT_INTENT_PROPERTIES, EMPTY_LINK_NAMES),
1291 family("stream", STREAM_PROPERTIES, EMPTY_LINK_NAMES),
1292 family("object", OBJECT_PROPERTIES, EMPTY_LINK_NAMES),
1293 ];
1294 let mut by_name: BTreeMap<ObjectTypeName, Arc<dyn ModelFamily + Send + Sync>> =
1295 BTreeMap::new();
1296 let mut all_properties = BTreeSet::new();
1297 for family in families {
1298 for property in family.property_schema() {
1299 all_properties.insert(property.name.clone());
1300 }
1301 by_name.insert(family.family_name(), Arc::new(family) as Arc<_>);
1302 }
1303 for family in by_name.values() {
1304 for link in family.link_schema() {
1305 debug_assert!(
1306 by_name.contains_key(&link.target),
1307 "model registry link target is not registered"
1308 );
1309 }
1310 }
1311 for property in DIRECT_PROPERTY_NAMES {
1312 all_properties.insert(PropertyName::unchecked(*property));
1313 }
1314 Self {
1315 families: by_name,
1316 all_properties,
1317 }
1318 }
1319
1320 #[must_use]
1322 pub(crate) fn has_family(&self, family: &ObjectTypeName) -> bool {
1323 self.families.contains_key(family)
1324 }
1325
1326 #[must_use]
1328 pub(crate) fn has_family_property(
1329 &self,
1330 family: &ObjectTypeName,
1331 property: &PropertyName,
1332 ) -> bool {
1333 self.families.get(family).is_some_and(|family| {
1334 family
1335 .property_schema()
1336 .iter()
1337 .any(|spec| spec.name == *property)
1338 })
1339 }
1340
1341 fn family_property_names(&self, family: &ObjectTypeName) -> Option<Vec<PropertyName>> {
1342 self.families.get(family).map(|family| {
1343 family
1344 .property_schema()
1345 .iter()
1346 .map(|property| property.name.clone())
1347 .collect()
1348 })
1349 }
1350
1351 pub(crate) fn family_names(&self) -> impl Iterator<Item = &ObjectTypeName> {
1353 self.families.keys()
1354 }
1355}
1356
/// [`ModelFamily`] implementation backed by owned name, property, and link
/// lists.
#[derive(Debug)]
struct StaticModelFamily {
    name: ObjectTypeName,
    properties: Vec<PropertySpec>,
    links: Vec<LinkSpec>,
}

impl ModelFamily for StaticModelFamily {
    /// Returns a clone of the stored family name.
    fn family_name(&self) -> ObjectTypeName {
        self.name.clone()
    }

    /// Returns the stored property specs.
    fn property_schema(&self) -> &[PropertySpec] {
        &self.properties
    }

    /// Returns the stored link specs.
    fn link_schema(&self) -> &[LinkSpec] {
        &self.links
    }
}
1377
1378fn family(
1379 name: &'static str,
1380 properties: &'static [&'static str],
1381 links: &'static [(&'static str, &'static str)],
1382) -> StaticModelFamily {
1383 StaticModelFamily {
1384 name: ObjectTypeName::unchecked(name),
1385 properties: properties
1386 .iter()
1387 .map(|name| PropertySpec::new(name))
1388 .collect(),
1389 links: links
1390 .iter()
1391 .map(|(name, target)| LinkSpec::new(name, target))
1392 .collect(),
1393 }
1394}
1395
/// Opaque identity of a model object, stored as a string key.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct ObjectIdentity {
    // Identity text, e.g. "document" or "catalog:<number>:<generation>".
    key: String,
}
1401
1402#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
1404pub struct LinkName(Identifier);
1405
1406impl LinkName {
1407 pub fn new(value: impl Into<String>) -> std::result::Result<Self, crate::ConfigError> {
1413 Ok(Self(Identifier::new(value)?))
1414 }
1415}
1416
/// Common interface of the objects that make up the validation model.
pub trait ModelObject {
    /// Returns the identity of this object, if it has one.
    fn id(&self) -> Option<ObjectIdentity>;
    /// Returns the family name of this object.
    fn object_type(&self) -> ObjectTypeName;
    /// Returns the super types of this object.
    fn super_types(&self) -> &[ObjectTypeName];
    /// Returns an optional context string for this object.
    fn extra_context(&self) -> Option<&str>;
    /// Looks up the value of the property called `name`.
    ///
    /// # Errors
    ///
    /// Implementations in this file return an error when the property is not
    /// known for the object or when computing its value fails.
    fn property(&self, name: &PropertyName) -> Result<ModelValue>;
    /// Returns the names of the links this object declares.
    fn links(&self) -> &[LinkName];
    /// Collects the objects linked from this object within `graph`.
    ///
    /// # Errors
    ///
    /// Implementations in this file pass `max_objects` to the helpers that
    /// collect linked objects; the catalog implementation reports a
    /// `max_objects` limit error when that bound is exceeded.
    fn linked_objects<'a>(
        &self,
        graph: &ModelGraph<'a>,
        max_objects: usize,
    ) -> Result<Vec<ModelObjectRef<'a>>>;
}
1446
/// A model object of any supported kind, borrowing from the parsed document.
#[derive(Clone, Debug)]
pub enum ModelObjectRef<'a> {
    /// The document root.
    Document(DocumentModel<'a>),
    /// The document catalog.
    Catalog(CatalogModel<'a>),
    /// The catalog metadata stream.
    Metadata(MetadataModel<'a>),
    /// A page.
    Page(PageModel<'a>),
    /// A font belonging to a page.
    Font(FontModel<'a>),
    /// An annotation belonging to a page.
    Annotation(AnnotationModel<'a>),
    /// An output intent belonging to the catalog.
    OutputIntent(OutputIntentModel<'a>),
    /// A content stream belonging to a page.
    ContentStream(ContentStreamModel<'a>),
    /// A stream object.
    Stream(StreamModel<'a>),
    /// Any other family, represented generically.
    Generic(GenericModel<'a>),
}
1471
1472impl<'a> ModelObjectRef<'a> {
1473 #[must_use]
1475 pub fn document(&self) -> &'a ParsedDocument {
1476 match self {
1477 Self::Document(model) => model.document,
1478 Self::Catalog(model) => model.document,
1479 Self::Metadata(model) => model.document,
1480 Self::Page(model) => model.document,
1481 Self::Font(model) => model.document,
1482 Self::Annotation(model) => model.document,
1483 Self::OutputIntent(model) => model.document,
1484 Self::ContentStream(model) => model.document,
1485 Self::Stream(model) => model.document,
1486 Self::Generic(model) => model.document,
1487 }
1488 }
1489
1490 #[must_use]
1492 pub fn object_type(&self) -> ObjectTypeName {
1493 match self {
1494 Self::Document(model) => model.object_type(),
1495 Self::Catalog(model) => model.object_type(),
1496 Self::Metadata(model) => model.object_type(),
1497 Self::Page(model) => model.object_type(),
1498 Self::Font(model) => model.object_type(),
1499 Self::Annotation(model) => model.object_type(),
1500 Self::OutputIntent(model) => model.object_type(),
1501 Self::ContentStream(model) => model.object_type(),
1502 Self::Stream(model) => model.object_type(),
1503 Self::Generic(model) => model.object_type(),
1504 }
1505 }
1506
1507 pub fn property(&self, name: &PropertyName) -> Result<ModelValue> {
1513 match self {
1514 Self::Document(model) => model.property(name),
1515 Self::Catalog(model) => model.property(name),
1516 Self::Metadata(model) => model.property(name),
1517 Self::Page(model) => model.property(name),
1518 Self::Font(model) => model.property(name),
1519 Self::Annotation(model) => model.property(name),
1520 Self::OutputIntent(model) => model.property(name),
1521 Self::ContentStream(model) => model.property(name),
1522 Self::Stream(model) => model.property(name),
1523 Self::Generic(model) => model.property(name),
1524 }
1525 }
1526
1527 fn location(&self) -> ObjectLocation {
1528 match self {
1529 Self::Document(_) => ObjectLocation {
1530 object: None,
1531 offset: None,
1532 path: Some(BoundedText::unchecked("root")),
1533 },
1534 Self::Catalog(model) => ObjectLocation {
1535 object: Some(model.key),
1536 offset: Some(model.offset),
1537 path: Some(BoundedText::unchecked("root/catalog[0]")),
1538 },
1539 Self::Metadata(model) => ObjectLocation {
1540 object: Some(model.key),
1541 offset: Some(model.offset),
1542 path: Some(BoundedText::unchecked("root/catalog[0]/metadata[0]")),
1543 },
1544 Self::Page(model) => ObjectLocation {
1545 object: Some(model.key),
1546 offset: Some(model.offset),
1547 path: Some(BoundedText::unchecked(format!(
1548 "root/page[{}]",
1549 model.ordinal
1550 ))),
1551 },
1552 Self::Font(model) => ObjectLocation {
1553 object: model.key,
1554 offset: model.offset,
1555 path: Some(BoundedText::unchecked(format!(
1556 "root/page[{}]/font[{}]",
1557 model.page_ordinal,
1558 String::from_utf8_lossy(model.name.as_bytes())
1559 ))),
1560 },
1561 Self::Annotation(model) => ObjectLocation {
1562 object: model.key,
1563 offset: model.offset,
1564 path: Some(BoundedText::unchecked(format!(
1565 "root/page[{}]/annotation[{}]",
1566 model.page_ordinal, model.ordinal
1567 ))),
1568 },
1569 Self::OutputIntent(model) => ObjectLocation {
1570 object: model.key,
1571 offset: model.offset,
1572 path: Some(BoundedText::unchecked(format!(
1573 "root/catalog[0]/outputIntent[{}]",
1574 model.ordinal
1575 ))),
1576 },
1577 Self::ContentStream(model) => ObjectLocation {
1578 object: Some(model.key),
1579 offset: Some(model.offset),
1580 path: Some(BoundedText::unchecked(format!(
1581 "root/page[{}]/contentStream[{}]",
1582 model.page_ordinal, model.ordinal
1583 ))),
1584 },
1585 Self::Stream(model) => ObjectLocation {
1586 object: Some(model.key),
1587 offset: Some(model.offset),
1588 path: Some(BoundedText::unchecked(format!(
1589 "root/stream[{}]",
1590 model.key.number
1591 ))),
1592 },
1593 Self::Generic(model) => ObjectLocation {
1594 object: model.key,
1595 offset: model.offset,
1596 path: Some(BoundedText::unchecked(model.context.clone())),
1597 },
1598 }
1599 }
1600
1601 fn context(&self) -> BoundedText {
1602 match self {
1603 Self::Document(_) => BoundedText::unchecked("root"),
1604 Self::Catalog(_) => BoundedText::unchecked("root/catalog[0]"),
1605 Self::Metadata(_) => BoundedText::unchecked("root/catalog[0]/metadata[0]"),
1606 Self::Page(model) => BoundedText::unchecked(format!("root/page[{}]", model.ordinal)),
1607 Self::Font(model) => BoundedText::unchecked(format!(
1608 "root/page[{}]/font[{}]",
1609 model.page_ordinal,
1610 String::from_utf8_lossy(model.name.as_bytes())
1611 )),
1612 Self::Annotation(model) => BoundedText::unchecked(format!(
1613 "root/page[{}]/annotation[{}]",
1614 model.page_ordinal, model.ordinal
1615 )),
1616 Self::OutputIntent(model) => {
1617 BoundedText::unchecked(format!("root/catalog[0]/outputIntent[{}]", model.ordinal))
1618 }
1619 Self::ContentStream(model) => BoundedText::unchecked(format!(
1620 "root/page[{}]/contentStream[{}]",
1621 model.page_ordinal, model.ordinal
1622 )),
1623 Self::Stream(model) => {
1624 BoundedText::unchecked(format!("root/stream[{}]", model.key.number))
1625 }
1626 Self::Generic(model) => BoundedText::unchecked(model.context.clone()),
1627 }
1628 }
1629
1630 fn identity_key(&self) -> String {
1631 match self {
1632 Self::Document(_) => String::from("document"),
1633 Self::Catalog(model) => {
1634 format!("catalog:{}:{}", model.key.number, model.key.generation)
1635 }
1636 Self::Metadata(model) => {
1637 format!("metadata:{}:{}", model.key.number, model.key.generation)
1638 }
1639 Self::Page(model) => format!("page:{}:{}", model.key.number, model.key.generation),
1640 Self::Font(model) => format!(
1641 "font:{}:{}:{}",
1642 model.page_ordinal,
1643 model.key.map_or(0, |key| key.number.get()),
1644 String::from_utf8_lossy(model.name.as_bytes())
1645 ),
1646 Self::Annotation(model) => format!(
1647 "annotation:{}:{}:{}",
1648 model.page_ordinal,
1649 model.ordinal,
1650 model.key.map_or(0, |key| key.number.get())
1651 ),
1652 Self::OutputIntent(model) => format!(
1653 "outputIntent:{}:{}",
1654 model.ordinal,
1655 model.key.map_or(0, |key| key.number.get())
1656 ),
1657 Self::ContentStream(model) => format!(
1658 "contentStream:{}:{}:{}",
1659 model.page_ordinal, model.key.number, model.key.generation
1660 ),
1661 Self::Stream(model) => format!("stream:{}:{}", model.key.number, model.key.generation),
1662 Self::Generic(model) => format!(
1663 "{}:{}:{}",
1664 model.object_type.as_str(),
1665 model.ordinal,
1666 model.key.map_or(0, |key| key.number.get())
1667 ),
1668 }
1669 }
1670
1671 fn linked_objects(
1672 &self,
1673 graph: &ModelGraph<'a>,
1674 max_objects: usize,
1675 ) -> Result<Vec<ModelObjectRef<'a>>> {
1676 match self {
1677 Self::Document(model) => model.linked_objects(graph, max_objects),
1678 Self::Catalog(model) => model.linked_objects(graph, max_objects),
1679 Self::Metadata(model) => model.linked_objects(graph, max_objects),
1680 Self::Page(model) => model.linked_objects(graph, max_objects),
1681 Self::Font(model) => model.linked_objects(graph, max_objects),
1682 Self::Annotation(model) => model.linked_objects(graph, max_objects),
1683 Self::OutputIntent(model) => model.linked_objects(graph, max_objects),
1684 Self::ContentStream(model) => model.linked_objects(graph, max_objects),
1685 Self::Stream(model) => model.linked_objects(graph, max_objects),
1686 Self::Generic(model) => model.linked_objects(graph, max_objects),
1687 }
1688 }
1689}
1690
/// View over a parsed document from which model objects are produced.
#[derive(Debug)]
pub struct ModelGraph<'a> {
    // Parsed document the models borrow from.
    document: &'a ParsedDocument,
    // Resource limits forwarded to page collection.
    limits: &'a ResourceLimits,
    // Families for which model objects are produced.
    materialized_families: BTreeSet<ObjectTypeName>,
}
1698
/// Arguments for collecting the generic models of one resource category of a
/// page.
#[derive(Clone, Copy, Debug)]
struct ResourceCollection<'a> {
    // Resource dictionary of the page.
    resources: &'a crate::Dictionary,
    // Key inside `resources` holding the category, e.g. "XObject".
    resource_name: &'static str,
    // Family name given to the produced models, e.g. "xObject".
    family: &'static str,
    // Leading part of the context path, e.g. "root/page".
    context_prefix: &'static str,
    // Ordinal of the page owning the resources.
    page_ordinal: usize,
    // Bound forwarded to `push_generic_model`.
    max_objects: usize,
}
1708
impl<'a> ModelGraph<'a> {
    /// Builds a graph that materializes only the families targeted by `rules`.
    ///
    /// Rules whose test is `RuleExpr::Unsupported` do not contribute a family.
    fn for_rules(document: &'a ParsedDocument, limits: &'a ResourceLimits, rules: &[Rule]) -> Self {
        let materialized_families = rules
            .iter()
            .filter(|rule| !matches!(rule.test, crate::RuleExpr::Unsupported { .. }))
            .map(|rule| rule.object_type.clone())
            .collect();
        Self {
            document,
            limits,
            materialized_families,
        }
    }

    /// Builds a graph that materializes every family of the default registry.
    fn with_all_families(document: &'a ParsedDocument, limits: &'a ResourceLimits) -> Self {
        Self {
            document,
            limits,
            materialized_families: ModelRegistry::default_registry()
                .family_names()
                .cloned()
                .collect(),
        }
    }

    /// Returns `true` when the family named `family` is materialized.
    fn materializes(&self, family: &str) -> bool {
        self.materialized_families
            .iter()
            .any(|materialized| materialized.as_str() == family)
    }

    /// Returns `true` when at least one materialized family is not among the
    /// names listed below, which is the condition under which the document
    /// root collects generic models.
    fn materializes_generic_roots(&self) -> bool {
        self.materialized_families.iter().any(|family| {
            !matches!(
                family.as_str(),
                "document"
                    | "catalog"
                    | "metadata"
                    | "page"
                    | "font"
                    | "annotation"
                    | "outputIntent"
                    | "contentStream"
                    | "stream"
                    | "object"
            )
        })
    }

    /// Returns the catalog model, or `None` when the document has no catalog
    /// key or `CatalogModel::new` rejects it.
    fn catalog(&self) -> Option<CatalogModel<'a>> {
        self.document
            .catalog
            .and_then(|key| CatalogModel::new(self.document, key))
    }

    /// Returns the metadata model referenced by `catalog`, if any.
    fn metadata(&self, catalog: &CatalogModel<'_>) -> Option<MetadataModel<'a>> {
        MetadataModel::new(self.document, catalog.metadata)
    }

    /// Collects the page models reachable from `catalog`.
    fn pages(&self, catalog: &CatalogModel<'_>, max_objects: usize) -> Result<Vec<PageModel<'a>>> {
        PageModel::from_catalog(self.document, catalog, self.limits, max_objects)
    }

    /// Collects the font models of `page`.
    fn fonts(&self, page: &PageModel<'_>, max_objects: usize) -> Result<Vec<FontModel<'a>>> {
        FontModel::from_page(self.document, page, max_objects)
    }

    /// Collects the annotation models of `page`.
    fn annotations(
        &self,
        page: &PageModel<'_>,
        max_objects: usize,
    ) -> Result<Vec<AnnotationModel<'a>>> {
        AnnotationModel::from_page(self.document, page, max_objects)
    }

    /// Collects the output intent models of `catalog`.
    fn output_intents(
        &self,
        catalog: &CatalogModel<'_>,
        max_objects: usize,
    ) -> Result<Vec<OutputIntentModel<'a>>> {
        OutputIntentModel::from_catalog(self.document, catalog, max_objects)
    }

    /// Collects the content stream models of `page`.
    fn content_streams(
        &self,
        page: &PageModel<'_>,
        max_objects: usize,
    ) -> Result<Vec<ContentStreamModel<'a>>> {
        ContentStreamModel::from_page(self.document, page, max_objects)
    }

    /// Appends a stream model for every document object that
    /// `StreamModel::from_indirect_with_document` accepts.
    fn push_streams(
        &self,
        objects: &mut Vec<ModelObjectRef<'a>>,
        max_objects: usize,
    ) -> Result<()> {
        for object in self.document.objects.values() {
            let Some(stream) = StreamModel::from_indirect_with_document(self.document, object)
            else {
                continue;
            };
            // The object registered as the catalog is never reported as a stream.
            if Some(stream.key) != self.document.catalog {
                push_linked(objects, ModelObjectRef::Stream(stream), max_objects)?;
            }
        }
        Ok(())
    }

    /// Appends the generic models whose family is materialized.
    fn push_generic_roots(
        &self,
        objects: &mut Vec<ModelObjectRef<'a>>,
        max_objects: usize,
    ) -> Result<()> {
        for model in self.generic_models(max_objects)? {
            if !self.materialized_families.contains(&model.object_type) {
                continue;
            }
            push_linked(objects, ModelObjectRef::Generic(model), max_objects)?;
        }
        Ok(())
    }

    /// Collects every generic model: catalog-level entries first, then the
    /// per-page resources, then classified indirect dictionaries.
    fn generic_models(&self, max_objects: usize) -> Result<Vec<GenericModel<'a>>> {
        let mut models = Vec::new();
        if let Some(catalog) = self.catalog() {
            self.push_catalog_generic_models(&catalog, max_objects, &mut models)?;
            for page in self.pages(&catalog, max_objects)? {
                self.push_page_generic_models(&page, max_objects, &mut models)?;
            }
        }
        self.push_indirect_generic_models(max_objects, &mut models)?;
        Ok(models)
    }

    /// Appends generic models for the catalog entries listed below and for
    /// each value of the catalog `Dests` entry.
    ///
    /// Does nothing when the catalog object is missing or is not a dictionary.
    fn push_catalog_generic_models(
        &self,
        catalog: &CatalogModel<'_>,
        max_objects: usize,
        models: &mut Vec<GenericModel<'a>>,
    ) -> Result<()> {
        let Some(catalog_object) = self.document.objects.get(&catalog.key) else {
            return Ok(());
        };
        let Some(dictionary) = catalog_object.object.as_dictionary() else {
            return Ok(());
        };
        // Each tuple is (family name, catalog key, context path).
        for (family, key_name, context) in [
            ("acroForm", "AcroForm", "root/catalog[0]/acroForm[0]"),
            (
                "structureTreeRoot",
                "StructTreeRoot",
                "root/catalog[0]/structureTreeRoot[0]",
            ),
            (
                "optionalContentProperties",
                "OCProperties",
                "root/catalog[0]/optionalContentProperties[0]",
            ),
            ("names", "Names", "root/catalog[0]/names[0]"),
            ("outline", "Outlines", "root/catalog[0]/outline[0]"),
            ("permissions", "Perms", "root/catalog[0]/permissions[0]"),
        ] {
            if let Some((key, offset, dictionary)) =
                resolve_named_dictionary_from_option(self.document, dictionary.get(key_name))
            {
                push_generic_model(
                    models,
                    GenericModel::new(
                        self.document,
                        family,
                        key,
                        offset,
                        dictionary,
                        // The ordinal is the number of models collected so far.
                        models.len(),
                        context,
                    ),
                    max_objects,
                )?;
            }
        }
        for (ordinal, value) in array_values(dictionary.get("Dests")).enumerate() {
            if let Some((key, offset, dictionary)) = resolve_named_dictionary(self.document, value)
            {
                push_generic_model(
                    models,
                    GenericModel::new(
                        self.document,
                        "destination",
                        key,
                        offset,
                        dictionary,
                        ordinal,
                        format!("root/catalog[0]/destination[{ordinal}]"),
                    ),
                    max_objects,
                )?;
            }
        }
        Ok(())
    }

    /// Appends a `resource` model for the page's `Resources` dictionary and
    /// the models of its `XObject`, `ColorSpace`, and `ExtGState` entries.
    ///
    /// Does nothing when `Resources` does not resolve to a dictionary.
    fn push_page_generic_models(
        &self,
        page: &PageModel<'a>,
        max_objects: usize,
        models: &mut Vec<GenericModel<'a>>,
    ) -> Result<()> {
        if let Some(resources) =
            resolve_dictionary_value(self.document, page.dictionary.get("Resources"))
        {
            push_generic_model(
                models,
                GenericModel::new(
                    self.document,
                    "resource",
                    None,
                    None,
                    resources,
                    page.ordinal,
                    format!("root/page[{}]/resources[0]", page.ordinal),
                ),
                max_objects,
            )?;
            self.push_resource_collection(
                ResourceCollection {
                    resources,
                    resource_name: "XObject",
                    family: "xObject",
                    context_prefix: "root/page",
                    page_ordinal: page.ordinal,
                    max_objects,
                },
                models,
            )?;
            self.push_resource_collection(
                ResourceCollection {
                    resources,
                    resource_name: "ColorSpace",
                    family: "colorSpace",
                    context_prefix: "root/page",
                    page_ordinal: page.ordinal,
                    max_objects,
                },
                models,
            )?;
            self.push_resource_collection(
                ResourceCollection {
                    resources,
                    resource_name: "ExtGState",
                    family: "extGState",
                    context_prefix: "root/page",
                    page_ordinal: page.ordinal,
                    max_objects,
                },
                models,
            )?;
        }
        Ok(())
    }

    /// Appends one generic model per entry of a resource category that
    /// resolves to a dictionary.
    ///
    /// Only a category stored as a direct dictionary is handled; any other
    /// value is skipped.
    fn push_resource_collection(
        &self,
        collection: ResourceCollection<'a>,
        models: &mut Vec<GenericModel<'a>>,
    ) -> Result<()> {
        let Some(crate::CosObject::Dictionary(resources)) =
            collection.resources.get(collection.resource_name)
        else {
            return Ok(());
        };
        for (ordinal, (name, value)) in resources.iter().enumerate() {
            if let Some((key, offset, dictionary)) = resolve_named_dictionary(self.document, value)
            {
                // XObjects may be reclassified by `classify_xobject`; the
                // context path below still uses the collection family.
                let object_family = if collection.family == "xObject" {
                    classify_xobject(dictionary).unwrap_or(collection.family)
                } else {
                    collection.family
                };
                push_generic_model(
                    models,
                    GenericModel::new(
                        self.document,
                        object_family,
                        key,
                        offset,
                        dictionary,
                        ordinal,
                        format!(
                            "{}[{}]/{}[{}]",
                            collection.context_prefix,
                            collection.page_ordinal,
                            collection.family,
                            String::from_utf8_lossy(name.as_bytes())
                        ),
                    ),
                    collection.max_objects,
                )?;
            }
        }
        Ok(())
    }

    /// Appends a generic model for every indirect dictionary object that
    /// `classify_dictionary` assigns to a family, except the families listed
    /// in the `matches!` below.
    fn push_indirect_generic_models(
        &self,
        max_objects: usize,
        models: &mut Vec<GenericModel<'a>>,
    ) -> Result<()> {
        for object in self.document.objects.values() {
            let Some(dictionary) = object.object.as_dictionary() else {
                continue;
            };
            let Some(family) = classify_dictionary(dictionary) else {
                continue;
            };
            if matches!(
                family,
                "catalog" | "page" | "font" | "annotation" | "outputIntent" | "metadata"
            ) {
                continue;
            }
            push_generic_model(
                models,
                GenericModel::new(
                    self.document,
                    family,
                    Some(object.key),
                    Some(object.offset),
                    dictionary,
                    // The ordinal is the number of models collected so far.
                    models.len(),
                    format!("root/{family}[{}]", object.key.number),
                ),
                max_objects,
            )?;
        }
        Ok(())
    }
}
2046
2047#[derive(Clone, Debug)]
2049pub struct DocumentModel<'a> {
2050 document: &'a ParsedDocument,
2051 object_type: ObjectTypeName,
2052 supertypes: Vec<ObjectTypeName>,
2053 links: Vec<LinkName>,
2054}
2055
2056impl<'a> DocumentModel<'a> {
2057 #[must_use]
2059 pub fn new(document: &'a ParsedDocument) -> Self {
2060 Self {
2061 document,
2062 object_type: ObjectTypeName::unchecked("document"),
2063 supertypes: Vec::new(),
2064 links: vec![LinkName(Identifier::unchecked("catalog"))],
2065 }
2066 }
2067}
2068
2069impl ModelObject for DocumentModel<'_> {
2070 fn id(&self) -> Option<ObjectIdentity> {
2071 Some(ObjectIdentity {
2072 key: String::from("document"),
2073 })
2074 }
2075
2076 fn object_type(&self) -> ObjectTypeName {
2077 self.object_type.clone()
2078 }
2079
2080 fn super_types(&self) -> &[ObjectTypeName] {
2081 &self.supertypes
2082 }
2083
2084 fn extra_context(&self) -> Option<&str> {
2085 Some("root")
2086 }
2087
2088 fn property(&self, name: &PropertyName) -> Result<ModelValue> {
2089 match name.as_str() {
2090 "headerOffset" => Ok(ModelValue::Number(u64_to_f64(header_offset(
2091 self.document,
2092 ))?)),
2093 "postEOFDataSize" => Ok(ModelValue::Number(u64_to_f64(post_eof_data_size(
2094 self.document,
2095 ))?)),
2096 "header" => Ok(ModelValue::String(BoundedText::new(
2097 format!(
2098 "%PDF-{}.{}",
2099 self.document.version.major, self.document.version.minor
2100 ),
2101 32,
2102 )?)),
2103 "encrypted" | "isEncrypted" => Ok(ModelValue::Bool(self.document.is_encrypted())),
2104 "hasCatalog" => Ok(ModelValue::Bool(self.document.catalog.is_some())),
2105 "containsXRefStream" => Ok(ModelValue::Bool(contains_xref_stream(self.document))),
2106 "nrIndirects" => Ok(ModelValue::Number(usize_to_f64(
2107 self.document.objects.len(),
2108 )?)),
2109 "containsPDFUAIdentification" => Ok(ModelValue::Bool(contains_xmp_family(
2110 self.document,
2111 "pdfua",
2112 ))),
2113 "containsPDFAIdentification" => {
2114 Ok(ModelValue::Bool(contains_xmp_family(self.document, "pdfa")))
2115 }
2116 "part" => Ok(ModelValue::Number(0.0)),
2117 "partPrefix" | "rev" | "revPrefix" => Ok(ModelValue::Null),
2118 _ => Err(crate::ProfileError::UnknownProperty {
2119 property: BoundedText::unchecked(name.as_str()),
2120 }
2121 .into()),
2122 }
2123 }
2124
2125 fn links(&self) -> &[LinkName] {
2126 &self.links
2127 }
2128
2129 fn linked_objects<'a>(
2130 &self,
2131 graph: &ModelGraph<'a>,
2132 max_objects: usize,
2133 ) -> Result<Vec<ModelObjectRef<'a>>> {
2134 let mut objects = Vec::new();
2135 if let Some(catalog) = graph.catalog() {
2136 push_linked(&mut objects, ModelObjectRef::Catalog(catalog), max_objects)?;
2137 }
2138 if graph.materializes("stream") {
2139 graph.push_streams(&mut objects, max_objects)?;
2140 }
2141 if graph.materializes_generic_roots() {
2142 graph.push_generic_roots(&mut objects, max_objects)?;
2143 }
2144 Ok(objects)
2145 }
2146}
2147
2148#[derive(Clone, Debug)]
2150pub struct CatalogModel<'a> {
2151 document: &'a ParsedDocument,
2152 key: ObjectKey,
2153 offset: u64,
2154 metadata: Option<ObjectKey>,
2155 pages: Option<ObjectKey>,
2156 object_type: ObjectTypeName,
2157 supertypes: Vec<ObjectTypeName>,
2158 links: Vec<LinkName>,
2159}
2160
2161impl<'a> CatalogModel<'a> {
2162 fn new(document: &'a ParsedDocument, key: ObjectKey) -> Option<Self> {
2163 let object = document.objects.get(&key)?;
2164 let dictionary = object.object.as_dictionary()?;
2165 let metadata = match dictionary.get("Metadata") {
2166 Some(crate::CosObject::Reference(key)) => Some(*key),
2167 _ => None,
2168 };
2169 let pages = match dictionary.get("Pages") {
2170 Some(crate::CosObject::Reference(key)) => Some(*key),
2171 _ => None,
2172 };
2173 Some(Self {
2174 document,
2175 key,
2176 offset: object.offset,
2177 metadata,
2178 pages,
2179 object_type: ObjectTypeName::unchecked("catalog"),
2180 supertypes: vec![ObjectTypeName::unchecked("object")],
2181 links: vec![LinkName(Identifier::unchecked("metadata"))],
2182 })
2183 }
2184}
2185
2186impl ModelObject for CatalogModel<'_> {
2187 fn id(&self) -> Option<ObjectIdentity> {
2188 Some(ObjectIdentity {
2189 key: format!("catalog:{}:{}", self.key.number, self.key.generation),
2190 })
2191 }
2192
2193 fn object_type(&self) -> ObjectTypeName {
2194 self.object_type.clone()
2195 }
2196
2197 fn super_types(&self) -> &[ObjectTypeName] {
2198 &self.supertypes
2199 }
2200
2201 fn extra_context(&self) -> Option<&str> {
2202 Some("catalog")
2203 }
2204
2205 fn property(&self, name: &PropertyName) -> Result<ModelValue> {
2206 match name.as_str() {
2207 "hasMetadata" => Ok(ModelValue::Bool(self.metadata.is_some())),
2208 "hasAcroForm" | "containsAcroForm" => Ok(ModelValue::Bool(
2209 self.document
2210 .objects
2211 .get(&self.key)
2212 .and_then(|object| object.object.as_dictionary())
2213 .and_then(|dictionary| dictionary.get("AcroForm"))
2214 .is_some(),
2215 )),
2216 "hasStructTreeRoot" | "containsStructTreeRoot" => Ok(ModelValue::Bool(
2217 self.document
2218 .objects
2219 .get(&self.key)
2220 .and_then(|object| object.object.as_dictionary())
2221 .and_then(|dictionary| dictionary.get("StructTreeRoot"))
2222 .is_some(),
2223 )),
2224 "hasOCProperties" | "containsOCProperties" => Ok(ModelValue::Bool(
2225 self.document
2226 .objects
2227 .get(&self.key)
2228 .and_then(|object| object.object.as_dictionary())
2229 .and_then(|dictionary| dictionary.get("OCProperties"))
2230 .is_some(),
2231 )),
2232 "hasLang" => Ok(ModelValue::Bool(
2233 self.document
2234 .objects
2235 .get(&self.key)
2236 .and_then(|object| object.object.as_dictionary())
2237 .and_then(|dictionary| dictionary.get("Lang"))
2238 .is_some(),
2239 )),
2240 "hasOutlines" => Ok(ModelValue::Bool(
2241 self.document
2242 .objects
2243 .get(&self.key)
2244 .and_then(|object| object.object.as_dictionary())
2245 .and_then(|dictionary| dictionary.get("Outlines"))
2246 .is_some(),
2247 )),
2248 "hasNames" => Ok(ModelValue::Bool(
2249 self.document
2250 .objects
2251 .get(&self.key)
2252 .and_then(|object| object.object.as_dictionary())
2253 .and_then(|dictionary| dictionary.get("Names"))
2254 .is_some(),
2255 )),
2256 "hasDests" => Ok(ModelValue::Bool(
2257 self.document
2258 .objects
2259 .get(&self.key)
2260 .and_then(|object| object.object.as_dictionary())
2261 .and_then(|dictionary| dictionary.get("Dests"))
2262 .is_some(),
2263 )),
2264 "Marked" => Ok(ModelValue::Bool(false)),
2265 _ => self
2266 .document
2267 .objects
2268 .get(&self.key)
2269 .and_then(|object| object.object.as_dictionary())
2270 .map_or_else(
2271 || unknown_property(name),
2272 |dictionary| dictionary_property(dictionary, name, CATALOG_DIRECT_PROPERTIES),
2273 ),
2274 }
2275 }
2276
2277 fn links(&self) -> &[LinkName] {
2278 &self.links
2279 }
2280
2281 fn linked_objects<'a>(
2282 &self,
2283 graph: &ModelGraph<'a>,
2284 max_objects: usize,
2285 ) -> Result<Vec<ModelObjectRef<'a>>> {
2286 let mut objects = graph
2287 .metadata(self)
2288 .map(ModelObjectRef::Metadata)
2289 .into_iter()
2290 .collect::<Vec<_>>();
2291 if objects.len() > max_objects {
2292 return Err(ValidationError::LimitExceeded {
2293 limit: "max_objects",
2294 }
2295 .into());
2296 }
2297 let mut output_intents =
2298 graph.output_intents(self, max_objects.saturating_sub(objects.len()))?;
2299 output_intents.reverse();
2300 for output_intent in output_intents {
2301 push_linked(
2302 &mut objects,
2303 ModelObjectRef::OutputIntent(output_intent),
2304 max_objects,
2305 )?;
2306 }
2307 let mut pages = graph.pages(self, max_objects.saturating_sub(objects.len()))?;
2308 pages.reverse();
2309 for page in pages {
2310 push_linked(&mut objects, ModelObjectRef::Page(page), max_objects)?;
2311 }
2312 Ok(objects)
2313 }
2314}
2315
2316#[derive(Clone, Debug)]
2318pub struct MetadataModel<'a> {
2319 document: &'a ParsedDocument,
2320 key: ObjectKey,
2321 offset: u64,
2322 object_type: ObjectTypeName,
2323 supertypes: Vec<ObjectTypeName>,
2324 links: Vec<LinkName>,
2325}
2326
2327impl<'a> MetadataModel<'a> {
2328 fn new(document: &'a ParsedDocument, key: Option<ObjectKey>) -> Option<Self> {
2329 let key = key?;
2330 let object = document.objects.get(&key)?;
2331 if !matches!(object.object, crate::CosObject::Stream(_)) {
2332 return None;
2333 }
2334 Some(Self {
2335 document,
2336 key,
2337 offset: object.offset,
2338 object_type: ObjectTypeName::unchecked("metadata"),
2339 supertypes: vec![
2340 ObjectTypeName::unchecked("stream"),
2341 ObjectTypeName::unchecked("object"),
2342 ],
2343 links: Vec::new(),
2344 })
2345 }
2346}
2347
2348impl ModelObject for MetadataModel<'_> {
2349 fn id(&self) -> Option<ObjectIdentity> {
2350 Some(ObjectIdentity {
2351 key: format!("metadata:{}:{}", self.key.number, self.key.generation),
2352 })
2353 }
2354
2355 fn object_type(&self) -> ObjectTypeName {
2356 self.object_type.clone()
2357 }
2358
2359 fn super_types(&self) -> &[ObjectTypeName] {
2360 &self.supertypes
2361 }
2362
2363 fn extra_context(&self) -> Option<&str> {
2364 Some("metadata")
2365 }
2366
2367 fn property(&self, name: &PropertyName) -> Result<ModelValue> {
2368 match name.as_str() {
2369 "present" | "catalogMetadata" => Ok(ModelValue::Bool(true)),
2370 "containsPDFAIdentification" => {
2371 Ok(ModelValue::Bool(contains_xmp_family(self.document, "pdfa")))
2372 }
2373 "containsPDFUAIdentification" => Ok(ModelValue::Bool(contains_xmp_family(
2374 self.document,
2375 "pdfua",
2376 ))),
2377 "part" => Ok(ModelValue::Number(xmp_part(self.document).unwrap_or(0.0))),
2378 "partPrefix" => Ok(ModelValue::String(BoundedText::unchecked(
2379 xmp_prefix_for_claim(self.document).unwrap_or("pdfaid"),
2380 ))),
2381 "conformance" => Ok(
2382 xmp_conformance(self.document).map_or(ModelValue::Null, |value| {
2383 ModelValue::String(BoundedText::unchecked(value))
2384 }),
2385 ),
2386 "conformancePrefix" | "revPrefix" | "amdPrefix" | "corrPrefix" => {
2387 Ok(ModelValue::String(BoundedText::unchecked("pdfaid")))
2388 }
2389 "rev" => Ok(ModelValue::Null),
2390 "declarations" => Ok(ModelValue::List(xmp_declarations(self.document))),
2391 _ => self.document.objects.get(&self.key).map_or_else(
2392 || unknown_property(name),
2393 |object| match &object.object {
2394 crate::CosObject::Stream(stream) => {
2395 dictionary_property(&stream.dictionary, name, METADATA_DIRECT_PROPERTIES)
2396 }
2397 _ => unknown_property(name),
2398 },
2399 ),
2400 }
2401 }
2402
2403 fn links(&self) -> &[LinkName] {
2404 &self.links
2405 }
2406
2407 fn linked_objects<'a>(
2408 &self,
2409 _graph: &ModelGraph<'a>,
2410 _max_objects: usize,
2411 ) -> Result<Vec<ModelObjectRef<'a>>> {
2412 Ok(Vec::new())
2413 }
2414}
2415
/// Validation-model view of a single page dictionary.
///
/// Built by `PageModel::from_catalog` while walking the page tree.
#[derive(Clone, Debug)]
pub struct PageModel<'a> {
    /// Parsed document the page belongs to.
    document: &'a ParsedDocument,
    /// Object key of the page dictionary.
    key: ObjectKey,
    /// Offset copied from the indirect object that holds the page.
    offset: u64,
    /// Zero-based position of the page in the order it was collected.
    ordinal: usize,
    /// The page dictionary itself.
    dictionary: &'a crate::Dictionary,
    /// Model type name; the constructor sets it to `"page"`.
    object_type: ObjectTypeName,
    /// Supertype names; the constructor sets a single `"object"` entry.
    supertypes: Vec<ObjectTypeName>,
    /// Link names; the constructor sets `fonts`, `annotations`, `contentStreams`.
    links: Vec<LinkName>,
}
2428
2429impl<'a> PageModel<'a> {
2430 fn from_catalog(
2431 document: &'a ParsedDocument,
2432 catalog: &CatalogModel<'_>,
2433 limits: &ResourceLimits,
2434 max_objects: usize,
2435 ) -> Result<Vec<Self>> {
2436 let Some(pages_root) = catalog.pages else {
2437 return Ok(Vec::new());
2438 };
2439 let mut stack = vec![pages_root];
2440 let mut pages = Vec::new();
2441 let mut visited = HashSet::new();
2442 while let Some(key) = stack.pop() {
2443 if !visited.insert(key) {
2444 continue;
2445 }
2446 let Some(object) = document.objects.get(&key) else {
2447 continue;
2448 };
2449 let Some(dictionary) = object.object.as_dictionary() else {
2450 continue;
2451 };
2452 match dictionary.get("Type") {
2453 Some(crate::CosObject::Name(name)) if name.matches("Page") => {
2454 if pages.len() >= max_objects {
2455 return Err(ValidationError::LimitExceeded {
2456 limit: "max_objects",
2457 }
2458 .into());
2459 }
2460 pages.push(Self {
2461 document,
2462 key,
2463 offset: object.offset,
2464 ordinal: pages.len(),
2465 dictionary,
2466 object_type: ObjectTypeName::unchecked("page"),
2467 supertypes: vec![ObjectTypeName::unchecked("object")],
2468 links: vec![
2469 LinkName(Identifier::unchecked("fonts")),
2470 LinkName(Identifier::unchecked("annotations")),
2471 LinkName(Identifier::unchecked("contentStreams")),
2472 ],
2473 });
2474 }
2475 _ => {
2476 for kid in object_refs_from_array(dictionary.get("Kids"))
2477 .into_iter()
2478 .rev()
2479 {
2480 stack.push(kid);
2481 }
2482 }
2483 }
2484 if u64::try_from(visited.len()).map_err(|_| ValidationError::LimitExceeded {
2485 limit: "max_objects",
2486 })? > limits.max_objects
2487 {
2488 return Err(ValidationError::LimitExceeded {
2489 limit: "max_objects",
2490 }
2491 .into());
2492 }
2493 }
2494 Ok(pages)
2495 }
2496}
2497
2498impl ModelObject for PageModel<'_> {
2499 fn id(&self) -> Option<ObjectIdentity> {
2500 Some(ObjectIdentity {
2501 key: format!("page:{}:{}", self.key.number, self.key.generation),
2502 })
2503 }
2504
2505 fn object_type(&self) -> ObjectTypeName {
2506 self.object_type.clone()
2507 }
2508
2509 fn super_types(&self) -> &[ObjectTypeName] {
2510 &self.supertypes
2511 }
2512
2513 fn extra_context(&self) -> Option<&str> {
2514 Some("page")
2515 }
2516
2517 fn property(&self, name: &PropertyName) -> Result<ModelValue> {
2518 match name.as_str() {
2519 "hasContents" => Ok(ModelValue::Bool(self.dictionary.get("Contents").is_some())),
2520 "hasResources" => Ok(ModelValue::Bool(self.dictionary.get("Resources").is_some())),
2521 "annotationCount" => Ok(ModelValue::Number(usize_to_f64(
2522 object_refs_or_direct_count(self.dictionary.get("Annots")),
2523 )?)),
2524 _ => dictionary_property(self.dictionary, name, PAGE_DIRECT_PROPERTIES),
2525 }
2526 }
2527
2528 fn links(&self) -> &[LinkName] {
2529 &self.links
2530 }
2531
2532 fn linked_objects<'a>(
2533 &self,
2534 graph: &ModelGraph<'a>,
2535 max_objects: usize,
2536 ) -> Result<Vec<ModelObjectRef<'a>>> {
2537 let mut objects = Vec::new();
2538 let mut content_streams = graph.content_streams(self, max_objects)?;
2539 content_streams.reverse();
2540 for content_stream in content_streams {
2541 push_linked(
2542 &mut objects,
2543 ModelObjectRef::ContentStream(content_stream),
2544 max_objects,
2545 )?;
2546 }
2547 let mut annotations = graph.annotations(self, max_objects.saturating_sub(objects.len()))?;
2548 annotations.reverse();
2549 for annotation in annotations {
2550 push_linked(
2551 &mut objects,
2552 ModelObjectRef::Annotation(annotation),
2553 max_objects,
2554 )?;
2555 }
2556 let mut fonts = graph.fonts(self, max_objects.saturating_sub(objects.len()))?;
2557 fonts.reverse();
2558 for font in fonts {
2559 push_linked(&mut objects, ModelObjectRef::Font(font), max_objects)?;
2560 }
2561 Ok(objects)
2562 }
2563}
2564
/// Validation-model view of one font resource of a page.
///
/// Built by `FontModel::from_page` from the entries of the page's `Font`
/// resource dictionary.
#[derive(Clone, Debug)]
pub struct FontModel<'a> {
    /// Parsed document the font belongs to.
    document: &'a ParsedDocument,
    /// Ordinal of the page whose resources listed this font.
    page_ordinal: usize,
    /// Object key of the font dictionary; `None` when the dictionary is direct.
    key: Option<ObjectKey>,
    /// Offset of the indirect object; `None` when the dictionary is direct.
    offset: Option<u64>,
    /// Resource name under which the font is registered in the `Font` dictionary.
    name: PdfName,
    /// The font dictionary itself.
    dictionary: &'a crate::Dictionary,
    /// Model type name; the constructor sets it to `"font"`.
    object_type: ObjectTypeName,
    /// Supertype names; the constructor sets a single `"object"` entry.
    supertypes: Vec<ObjectTypeName>,
    /// Link names reported by `links()`; the constructor leaves this empty.
    links: Vec<LinkName>,
}
2578
2579impl<'a> FontModel<'a> {
2580 fn from_page(
2581 document: &'a ParsedDocument,
2582 page: &PageModel<'_>,
2583 max_objects: usize,
2584 ) -> Result<Vec<Self>> {
2585 let mut fonts = Vec::new();
2586 let Some(page_dictionary) = page_dictionary(document, page.key) else {
2587 return Ok(fonts);
2588 };
2589 let Some(resources) = resolve_dictionary_value(document, page_dictionary.get("Resources"))
2590 else {
2591 return Ok(fonts);
2592 };
2593 let Some(crate::CosObject::Dictionary(fonts_dictionary)) = resources.get("Font") else {
2594 return Ok(fonts);
2595 };
2596 for (name, value) in fonts_dictionary.iter() {
2597 if let Some((key, offset, dictionary)) = resolve_named_dictionary(document, value) {
2598 if fonts.len() >= max_objects {
2599 return Err(ValidationError::LimitExceeded {
2600 limit: "max_objects",
2601 }
2602 .into());
2603 }
2604 fonts.push(Self {
2605 document,
2606 page_ordinal: page.ordinal,
2607 key,
2608 offset,
2609 name: name.clone(),
2610 dictionary,
2611 object_type: ObjectTypeName::unchecked("font"),
2612 supertypes: vec![ObjectTypeName::unchecked("object")],
2613 links: Vec::new(),
2614 });
2615 }
2616 }
2617 Ok(fonts)
2618 }
2619}
2620
2621impl ModelObject for FontModel<'_> {
2622 fn id(&self) -> Option<ObjectIdentity> {
2623 Some(ObjectIdentity {
2624 key: format!(
2625 "font:{}:{}",
2626 self.page_ordinal,
2627 String::from_utf8_lossy(self.name.as_bytes())
2628 ),
2629 })
2630 }
2631
2632 fn object_type(&self) -> ObjectTypeName {
2633 self.object_type.clone()
2634 }
2635
2636 fn super_types(&self) -> &[ObjectTypeName] {
2637 &self.supertypes
2638 }
2639
2640 fn extra_context(&self) -> Option<&str> {
2641 Some("font")
2642 }
2643
2644 fn property(&self, name: &PropertyName) -> Result<ModelValue> {
2645 match name.as_str() {
2646 "embedded" => Ok(ModelValue::Bool(
2647 self.dictionary.get("FontDescriptor").is_some(),
2648 )),
2649 "hasSubtype" => Ok(ModelValue::Bool(self.dictionary.get("Subtype").is_some())),
2650 _ => dictionary_property(self.dictionary, name, FONT_DIRECT_PROPERTIES),
2651 }
2652 }
2653
2654 fn links(&self) -> &[LinkName] {
2655 &self.links
2656 }
2657
2658 fn linked_objects<'a>(
2659 &self,
2660 _graph: &ModelGraph<'a>,
2661 _max_objects: usize,
2662 ) -> Result<Vec<ModelObjectRef<'a>>> {
2663 Ok(Vec::new())
2664 }
2665}
2666
/// Validation-model view of one annotation of a page.
///
/// Built by `AnnotationModel::from_page` from the page's `Annots` array.
#[derive(Clone, Debug)]
pub struct AnnotationModel<'a> {
    /// Parsed document the annotation belongs to.
    document: &'a ParsedDocument,
    /// Ordinal of the page that lists this annotation.
    page_ordinal: usize,
    /// Index of the annotation's entry within the `Annots` array.
    ordinal: usize,
    /// Object key of the annotation dictionary; `None` when it is direct.
    key: Option<ObjectKey>,
    /// Offset of the indirect object; `None` when the dictionary is direct.
    offset: Option<u64>,
    /// The annotation dictionary itself.
    dictionary: &'a crate::Dictionary,
    /// Model type name; the constructor sets it to `"annotation"`.
    object_type: ObjectTypeName,
    /// Supertype names; the constructor sets a single `"object"` entry.
    supertypes: Vec<ObjectTypeName>,
    /// Link names reported by `links()`; the constructor leaves this empty.
    links: Vec<LinkName>,
}
2680
2681impl<'a> AnnotationModel<'a> {
2682 fn from_page(
2683 document: &'a ParsedDocument,
2684 page: &PageModel<'_>,
2685 max_objects: usize,
2686 ) -> Result<Vec<Self>> {
2687 let mut annotations = Vec::new();
2688 let Some(page_dictionary) = page_dictionary(document, page.key) else {
2689 return Ok(annotations);
2690 };
2691 for (ordinal, value) in array_values(page_dictionary.get("Annots")).enumerate() {
2692 if let Some((key, offset, dictionary)) = resolve_named_dictionary(document, value) {
2693 if annotations.len() >= max_objects {
2694 return Err(ValidationError::LimitExceeded {
2695 limit: "max_objects",
2696 }
2697 .into());
2698 }
2699 annotations.push(Self {
2700 document,
2701 page_ordinal: page.ordinal,
2702 ordinal,
2703 key,
2704 offset,
2705 dictionary,
2706 object_type: ObjectTypeName::unchecked("annotation"),
2707 supertypes: vec![ObjectTypeName::unchecked("object")],
2708 links: Vec::new(),
2709 });
2710 }
2711 }
2712 Ok(annotations)
2713 }
2714}
2715
2716impl ModelObject for AnnotationModel<'_> {
2717 fn id(&self) -> Option<ObjectIdentity> {
2718 Some(ObjectIdentity {
2719 key: format!("annotation:{}:{}", self.page_ordinal, self.ordinal),
2720 })
2721 }
2722
2723 fn object_type(&self) -> ObjectTypeName {
2724 self.object_type.clone()
2725 }
2726
2727 fn super_types(&self) -> &[ObjectTypeName] {
2728 &self.supertypes
2729 }
2730
2731 fn extra_context(&self) -> Option<&str> {
2732 Some("annotation")
2733 }
2734
2735 fn property(&self, name: &PropertyName) -> Result<ModelValue> {
2736 match name.as_str() {
2737 "hasSubtype" => Ok(ModelValue::Bool(self.dictionary.get("Subtype").is_some())),
2738 _ => dictionary_property(self.dictionary, name, ANNOTATION_DIRECT_PROPERTIES),
2739 }
2740 }
2741
2742 fn links(&self) -> &[LinkName] {
2743 &self.links
2744 }
2745
2746 fn linked_objects<'a>(
2747 &self,
2748 _graph: &ModelGraph<'a>,
2749 _max_objects: usize,
2750 ) -> Result<Vec<ModelObjectRef<'a>>> {
2751 Ok(Vec::new())
2752 }
2753}
2754
/// Validation-model view of one output intent of the catalog.
///
/// Built by `OutputIntentModel::from_catalog` from the catalog's
/// `OutputIntents` array.
#[derive(Clone, Debug)]
pub struct OutputIntentModel<'a> {
    /// Parsed document the output intent belongs to.
    document: &'a ParsedDocument,
    /// Index of the entry within the `OutputIntents` array.
    ordinal: usize,
    /// Object key of the output-intent dictionary; `None` when it is direct.
    key: Option<ObjectKey>,
    /// Offset of the indirect object; `None` when the dictionary is direct.
    offset: Option<u64>,
    /// The output-intent dictionary itself.
    dictionary: &'a crate::Dictionary,
    /// Model type name; the constructor sets it to `"outputIntent"`.
    object_type: ObjectTypeName,
    /// Supertype names; the constructor sets a single `"object"` entry.
    supertypes: Vec<ObjectTypeName>,
    /// Link names reported by `links()`; the constructor leaves this empty.
    links: Vec<LinkName>,
}
2767
2768impl<'a> OutputIntentModel<'a> {
2769 fn from_catalog(
2770 document: &'a ParsedDocument,
2771 catalog: &CatalogModel<'_>,
2772 max_objects: usize,
2773 ) -> Result<Vec<Self>> {
2774 let Some(catalog_object) = document.objects.get(&catalog.key) else {
2775 return Ok(Vec::new());
2776 };
2777 let Some(catalog_dictionary) = catalog_object.object.as_dictionary() else {
2778 return Ok(Vec::new());
2779 };
2780 let mut output_intents = Vec::new();
2781 for (ordinal, value) in array_values(catalog_dictionary.get("OutputIntents")).enumerate() {
2782 if let Some((key, offset, dictionary)) = resolve_named_dictionary(document, value) {
2783 if output_intents.len() >= max_objects {
2784 return Err(ValidationError::LimitExceeded {
2785 limit: "max_objects",
2786 }
2787 .into());
2788 }
2789 output_intents.push(Self {
2790 document,
2791 ordinal,
2792 key,
2793 offset,
2794 dictionary,
2795 object_type: ObjectTypeName::unchecked("outputIntent"),
2796 supertypes: vec![ObjectTypeName::unchecked("object")],
2797 links: Vec::new(),
2798 });
2799 }
2800 }
2801 Ok(output_intents)
2802 }
2803}
2804
2805impl ModelObject for OutputIntentModel<'_> {
2806 fn id(&self) -> Option<ObjectIdentity> {
2807 Some(ObjectIdentity {
2808 key: format!("outputIntent:{}", self.ordinal),
2809 })
2810 }
2811
2812 fn object_type(&self) -> ObjectTypeName {
2813 self.object_type.clone()
2814 }
2815
2816 fn super_types(&self) -> &[ObjectTypeName] {
2817 &self.supertypes
2818 }
2819
2820 fn extra_context(&self) -> Option<&str> {
2821 Some("outputIntent")
2822 }
2823
2824 fn property(&self, name: &PropertyName) -> Result<ModelValue> {
2825 match name.as_str() {
2826 "hasDestOutputProfile" => Ok(ModelValue::Bool(
2827 self.dictionary.get("DestOutputProfile").is_some(),
2828 )),
2829 _ => dictionary_property(self.dictionary, name, OUTPUT_INTENT_DIRECT_PROPERTIES),
2830 }
2831 }
2832
2833 fn links(&self) -> &[LinkName] {
2834 &self.links
2835 }
2836
2837 fn linked_objects<'a>(
2838 &self,
2839 _graph: &ModelGraph<'a>,
2840 _max_objects: usize,
2841 ) -> Result<Vec<ModelObjectRef<'a>>> {
2842 Ok(Vec::new())
2843 }
2844}
2845
/// Validation-model view of one content stream of a page.
///
/// Built by `push_content_stream` for each `Contents` reference that resolves
/// to a stream object.
#[derive(Clone, Debug)]
pub struct ContentStreamModel<'a> {
    /// Parsed document the stream belongs to.
    document: &'a ParsedDocument,
    /// Ordinal of the page whose `Contents` referenced this stream.
    page_ordinal: usize,
    /// Position among the references of `Contents` (0 for a single reference).
    ordinal: usize,
    /// Object key of the stream.
    key: ObjectKey,
    /// Offset copied from the indirect object that holds the stream.
    offset: u64,
    /// The stream object itself.
    stream: &'a crate::StreamObject,
    /// Model type name; set to `"contentStream"` on construction.
    object_type: ObjectTypeName,
    /// Supertype names; set to `"stream"` followed by `"object"` on construction.
    supertypes: Vec<ObjectTypeName>,
    /// Link names reported by `links()`; left empty on construction.
    links: Vec<LinkName>,
}
2859
2860impl<'a> ContentStreamModel<'a> {
2861 fn from_page(
2862 document: &'a ParsedDocument,
2863 page: &PageModel<'_>,
2864 max_objects: usize,
2865 ) -> Result<Vec<Self>> {
2866 let mut streams = Vec::new();
2867 let Some(page_dictionary) = page_dictionary(document, page.key) else {
2868 return Ok(streams);
2869 };
2870 push_content_streams_from_value(
2871 document,
2872 page,
2873 page_dictionary.get("Contents"),
2874 max_objects,
2875 &mut streams,
2876 )?;
2877 Ok(streams)
2878 }
2879}
2880
2881impl ModelObject for ContentStreamModel<'_> {
2882 fn id(&self) -> Option<ObjectIdentity> {
2883 Some(ObjectIdentity {
2884 key: format!(
2885 "contentStream:{}:{}:{}",
2886 self.page_ordinal, self.key.number, self.key.generation
2887 ),
2888 })
2889 }
2890
2891 fn object_type(&self) -> ObjectTypeName {
2892 self.object_type.clone()
2893 }
2894
2895 fn super_types(&self) -> &[ObjectTypeName] {
2896 &self.supertypes
2897 }
2898
2899 fn extra_context(&self) -> Option<&str> {
2900 Some("contentStream")
2901 }
2902
2903 fn property(&self, name: &PropertyName) -> Result<ModelValue> {
2904 match name.as_str() {
2905 "lengthMatches" => {
2906 Ok(ModelValue::Bool(self.stream.declared_length.is_none_or(
2907 |declared| declared == self.stream.discovered_length,
2908 )))
2909 }
2910 "declaredLength" => Ok(ModelValue::Number(u64_to_f64(
2911 self.stream
2912 .declared_length
2913 .unwrap_or(self.stream.discovered_length),
2914 )?)),
2915 "discoveredLength" => Ok(ModelValue::Number(u64_to_f64(
2916 self.stream.discovered_length,
2917 )?)),
2918 _ => dictionary_property(&self.stream.dictionary, name, STREAM_DIRECT_PROPERTIES),
2919 }
2920 }
2921
2922 fn links(&self) -> &[LinkName] {
2923 &self.links
2924 }
2925
2926 fn linked_objects<'a>(
2927 &self,
2928 _graph: &ModelGraph<'a>,
2929 _max_objects: usize,
2930 ) -> Result<Vec<ModelObjectRef<'a>>> {
2931 Ok(Vec::new())
2932 }
2933}
2934
2935fn resolve_dictionary_value<'a>(
2936 document: &'a ParsedDocument,
2937 value: Option<&'a crate::CosObject>,
2938) -> Option<&'a crate::Dictionary> {
2939 match value {
2940 Some(crate::CosObject::Dictionary(dictionary)) => Some(dictionary),
2941 Some(crate::CosObject::Reference(key)) => document.objects.get(key)?.object.as_dictionary(),
2942 _ => None,
2943 }
2944}
2945
2946fn page_dictionary(document: &ParsedDocument, key: ObjectKey) -> Option<&crate::Dictionary> {
2947 document.objects.get(&key)?.object.as_dictionary()
2948}
2949
2950fn resolve_named_dictionary<'a>(
2951 document: &'a ParsedDocument,
2952 value: &'a crate::CosObject,
2953) -> Option<(Option<ObjectKey>, Option<u64>, &'a crate::Dictionary)> {
2954 match value {
2955 crate::CosObject::Dictionary(dictionary) => Some((None, None, dictionary)),
2956 crate::CosObject::Reference(key) => {
2957 let object = document.objects.get(key)?;
2958 let dictionary = object.object.as_dictionary()?;
2959 Some((Some(*key), Some(object.offset), dictionary))
2960 }
2961 _ => None,
2962 }
2963}
2964
2965fn object_refs_from_array(value: Option<&crate::CosObject>) -> Vec<ObjectKey> {
2966 match value {
2967 Some(crate::CosObject::Array(values)) => values
2968 .iter()
2969 .filter_map(|value| match value {
2970 crate::CosObject::Reference(key) => Some(*key),
2971 _ => None,
2972 })
2973 .collect(),
2974 _ => Vec::new(),
2975 }
2976}
2977
2978fn push_content_streams_from_value<'a>(
2979 document: &'a ParsedDocument,
2980 page: &PageModel<'_>,
2981 value: Option<&crate::CosObject>,
2982 max_objects: usize,
2983 streams: &mut Vec<ContentStreamModel<'a>>,
2984) -> Result<()> {
2985 match value {
2986 Some(crate::CosObject::Reference(key)) => {
2987 push_content_stream(document, page, *key, 0, max_objects, streams)?;
2988 }
2989 Some(crate::CosObject::Array(values)) => {
2990 let mut ordinal = 0_usize;
2991 for value in values {
2992 let crate::CosObject::Reference(key) = value else {
2993 continue;
2994 };
2995 push_content_stream(document, page, *key, ordinal, max_objects, streams)?;
2996 ordinal = ordinal
2997 .checked_add(1)
2998 .ok_or(ValidationError::LimitExceeded {
2999 limit: "max_objects",
3000 })?;
3001 }
3002 }
3003 Some(_) | None => {}
3004 }
3005 Ok(())
3006}
3007
3008fn push_content_stream<'a>(
3009 document: &'a ParsedDocument,
3010 page: &PageModel<'_>,
3011 key: ObjectKey,
3012 ordinal: usize,
3013 max_objects: usize,
3014 streams: &mut Vec<ContentStreamModel<'a>>,
3015) -> Result<()> {
3016 let Some(object) = document.objects.get(&key) else {
3017 return Ok(());
3018 };
3019 let crate::CosObject::Stream(stream) = &object.object else {
3020 return Ok(());
3021 };
3022 if streams.len() >= max_objects {
3023 return Err(ValidationError::LimitExceeded {
3024 limit: "max_objects",
3025 }
3026 .into());
3027 }
3028 streams.push(ContentStreamModel {
3029 document,
3030 page_ordinal: page.ordinal,
3031 ordinal,
3032 key,
3033 offset: object.offset,
3034 stream,
3035 object_type: ObjectTypeName::unchecked("contentStream"),
3036 supertypes: vec![
3037 ObjectTypeName::unchecked("stream"),
3038 ObjectTypeName::unchecked("object"),
3039 ],
3040 links: Vec::new(),
3041 });
3042 Ok(())
3043}
3044
3045fn object_refs_or_direct_count(value: Option<&crate::CosObject>) -> usize {
3046 match value {
3047 Some(crate::CosObject::Array(values)) => values.len(),
3048 Some(_) => 1,
3049 None => 0,
3050 }
3051}
3052
3053fn array_values(value: Option<&crate::CosObject>) -> impl Iterator<Item = &crate::CosObject> {
3054 value
3055 .and_then(|value| match value {
3056 crate::CosObject::Array(values) => Some(values.as_slice()),
3057 _ => None,
3058 })
3059 .into_iter()
3060 .flatten()
3061}
3062
3063fn dictionary_property(
3064 dictionary: &crate::Dictionary,
3065 name: &PropertyName,
3066 allowed_names: &[&str],
3067) -> Result<ModelValue> {
3068 if !allowed_names.contains(&name.as_str()) {
3069 return unknown_property(name);
3070 }
3071 Ok(dictionary
3072 .get(name.as_str())
3073 .cloned()
3074 .map_or(ModelValue::Null, ModelValue::from))
3075}
3076
3077fn unknown_property(name: &PropertyName) -> Result<ModelValue> {
3078 Err(crate::ProfileError::UnknownProperty {
3079 property: BoundedText::unchecked(name.as_str()),
3080 }
3081 .into())
3082}
3083
/// Validation-model view of a dictionary that has no dedicated model type.
///
/// The family name passed to `GenericModel::new` selects both the model type
/// name and the list of directly readable properties.
#[derive(Clone, Debug)]
pub struct GenericModel<'a> {
    /// Parsed document the dictionary belongs to.
    document: &'a ParsedDocument,
    /// Object key supplied by the caller, if any.
    key: Option<ObjectKey>,
    /// Offset supplied by the caller, if any.
    offset: Option<u64>,
    /// The dictionary the properties are read from.
    dictionary: &'a crate::Dictionary,
    /// Model type name; the constructor derives it from the family name.
    object_type: ObjectTypeName,
    /// Supertype names; the constructor sets a single `"object"` entry.
    supertypes: Vec<ObjectTypeName>,
    /// Link names reported by `links()`; the constructor leaves this empty.
    links: Vec<LinkName>,
    /// Property names readable directly from `dictionary`, chosen by
    /// `family_direct_properties`.
    allowed_properties: &'static [&'static str],
    /// Text returned by `extra_context()`.
    context: String,
    /// Ordinal used in the identity string `<type>:<ordinal>`.
    ordinal: usize,
}
3098
3099impl<'a> GenericModel<'a> {
3100 fn new(
3101 document: &'a ParsedDocument,
3102 family: &'static str,
3103 key: Option<ObjectKey>,
3104 offset: Option<u64>,
3105 dictionary: &'a crate::Dictionary,
3106 ordinal: usize,
3107 context: impl Into<String>,
3108 ) -> Self {
3109 Self {
3110 document,
3111 key,
3112 offset,
3113 dictionary,
3114 object_type: ObjectTypeName::unchecked(family),
3115 supertypes: vec![ObjectTypeName::unchecked("object")],
3116 links: Vec::new(),
3117 allowed_properties: family_direct_properties(family),
3118 context: context.into(),
3119 ordinal,
3120 }
3121 }
3122}
3123
3124impl ModelObject for GenericModel<'_> {
3125 fn id(&self) -> Option<ObjectIdentity> {
3126 Some(ObjectIdentity {
3127 key: format!("{}:{}", self.object_type.as_str(), self.ordinal),
3128 })
3129 }
3130
3131 fn object_type(&self) -> ObjectTypeName {
3132 self.object_type.clone()
3133 }
3134
3135 fn super_types(&self) -> &[ObjectTypeName] {
3136 &self.supertypes
3137 }
3138
3139 fn extra_context(&self) -> Option<&str> {
3140 Some(&self.context)
3141 }
3142
3143 fn property(&self, name: &PropertyName) -> Result<ModelValue> {
3144 match (self.object_type.as_str(), name.as_str()) {
3145 ("image", "width") => dictionary_property(
3146 self.dictionary,
3147 &PropertyName::unchecked("Width"),
3148 IMAGE_DIRECT_PROPERTIES,
3149 ),
3150 ("image", "height") => dictionary_property(
3151 self.dictionary,
3152 &PropertyName::unchecked("Height"),
3153 IMAGE_DIRECT_PROPERTIES,
3154 ),
3155 ("contentStream", "operatorCount" | "markedContentCount") => {
3156 Ok(ModelValue::Number(0.0))
3157 }
3158 _ => dictionary_property(self.dictionary, name, self.allowed_properties),
3159 }
3160 }
3161
3162 fn links(&self) -> &[LinkName] {
3163 &self.links
3164 }
3165
3166 fn linked_objects<'a>(
3167 &self,
3168 _graph: &ModelGraph<'a>,
3169 _max_objects: usize,
3170 ) -> Result<Vec<ModelObjectRef<'a>>> {
3171 Ok(Vec::new())
3172 }
3173}
3174
3175fn push_generic_model<'a>(
3176 models: &mut Vec<GenericModel<'a>>,
3177 model: GenericModel<'a>,
3178 max_objects: usize,
3179) -> Result<()> {
3180 if models.len() >= max_objects {
3181 return Err(ValidationError::LimitExceeded {
3182 limit: "max_objects",
3183 }
3184 .into());
3185 }
3186 models.push(model);
3187 Ok(())
3188}
3189
/// Returns the list of property names that may be read directly from the
/// dictionary of a model of the given `family`.
///
/// Family names that have no dedicated list fall back to
/// `DIRECT_PROPERTY_NAMES`.
fn family_direct_properties(family: &str) -> &'static [&'static str] {
    match family {
        "resource" => RESOURCE_DIRECT_PROPERTIES,
        "names" => NAMES_DIRECT_PROPERTIES,
        "outline" => OUTLINES_DIRECT_PROPERTIES,
        "destination" => DESTINATION_DIRECT_PROPERTIES,
        "acroForm" => ACRO_FORM_DIRECT_PROPERTIES,
        "optionalContentProperties" => OPTIONAL_CONTENT_DIRECT_PROPERTIES,
        "permissions" => PERMISSIONS_PROPERTIES,
        "cMap" => CMAP_DIRECT_PROPERTIES,
        "image" => IMAGE_DIRECT_PROPERTIES,
        "xObject" => XOBJECT_DIRECT_PROPERTIES,
        "action" => ACTION_DIRECT_PROPERTIES,
        "formField" => FORM_FIELD_DIRECT_PROPERTIES,
        "colorSpace" => COLOR_SPACE_DIRECT_PROPERTIES,
        "extGState" => EXT_GSTATE_DIRECT_PROPERTIES,
        "structureTreeRoot" => STRUCTURE_DIRECT_PROPERTIES,
        "structureElement" => STRUCTURE_ELEMENT_PROPERTIES,
        "signature" => SIGNATURE_DIRECT_PROPERTIES,
        "security" => SECURITY_DIRECT_PROPERTIES,
        "pageTree" => PAGE_TREE_PROPERTIES,
        // Any family without a dedicated list uses the general one.
        _ => DIRECT_PROPERTY_NAMES,
    }
}
3214
3215fn resolve_named_dictionary_from_option<'a>(
3216 document: &'a ParsedDocument,
3217 value: Option<&'a crate::CosObject>,
3218) -> Option<(Option<ObjectKey>, Option<u64>, &'a crate::Dictionary)> {
3219 match value {
3220 Some(value) => resolve_named_dictionary(document, value),
3221 None => None,
3222 }
3223}
3224
3225fn classify_xobject(dictionary: &crate::Dictionary) -> Option<&'static str> {
3226 match dictionary.get("Subtype") {
3227 Some(crate::CosObject::Name(name)) if name.matches("Image") => Some("image"),
3228 Some(crate::CosObject::Name(name)) if name.matches("Form") => Some("xObject"),
3229 _ => None,
3230 }
3231}
3232
3233fn classify_dictionary(dictionary: &crate::Dictionary) -> Option<&'static str> {
3234 if let Some(crate::CosObject::Name(name)) = dictionary.get("Subtype") {
3235 if name.matches("Image") {
3236 return Some("image");
3237 }
3238 if name.matches("Form") {
3239 return Some("xObject");
3240 }
3241 if name.matches("Widget") {
3242 return Some("formField");
3243 }
3244 }
3245 if let Some(crate::CosObject::Name(name)) = dictionary.get("Type") {
3246 if name.matches("Pages") {
3247 return Some("pageTree");
3248 }
3249 if name.matches("Action") {
3250 return Some("action");
3251 }
3252 if name.matches("StructTreeRoot") {
3253 return Some("structureTreeRoot");
3254 }
3255 if name.matches("StructElem") {
3256 return Some("structureElement");
3257 }
3258 if name.matches("Sig") {
3259 return Some("signature");
3260 }
3261 if name.matches("EmbeddedFile") {
3262 return Some("embeddedFontFile");
3263 }
3264 if name.matches("OCProperties") {
3265 return Some("optionalContentProperties");
3266 }
3267 if name.matches("XObject") {
3268 return classify_xobject(dictionary).or(Some("xObject"));
3269 }
3270 if name.matches("Font") {
3271 return Some("font");
3272 }
3273 if name.matches("Annot") {
3274 return Some("annotation");
3275 }
3276 if name.matches("Metadata") {
3277 return Some("metadata");
3278 }
3279 if name.matches("OutputIntent") {
3280 return Some("outputIntent");
3281 }
3282 if name.matches("Filespec") {
3283 return Some("destination");
3284 }
3285 }
3286 if dictionary.get("Fields").is_some() {
3287 return Some("acroForm");
3288 }
3289 if dictionary.get("Filter").is_some() && dictionary.get("V").is_some() {
3290 return Some("security");
3291 }
3292 if dictionary.get("CMapName").is_some() {
3293 return Some("cMap");
3294 }
3295 if dictionary.get("ByteRange").is_some() {
3296 return Some("signature");
3297 }
3298 None
3299}
3300
/// Validation-model view of an arbitrary stream object.
///
/// Built by `StreamModel::from_indirect_with_document` for indirect objects
/// that hold a stream.
#[derive(Clone, Debug)]
pub struct StreamModel<'a> {
    /// Parsed document the stream belongs to.
    document: &'a ParsedDocument,
    /// Object key copied from the indirect object.
    key: ObjectKey,
    /// Offset copied from the indirect object.
    offset: u64,
    /// The stream object itself.
    stream: &'a crate::StreamObject,
    /// Model type name; the constructor sets it to `"stream"`.
    object_type: ObjectTypeName,
    /// Supertype names; the constructor sets a single `"object"` entry.
    supertypes: Vec<ObjectTypeName>,
    /// Link names reported by `links()`; the constructor leaves this empty.
    links: Vec<LinkName>,
}
3312
3313impl<'a> StreamModel<'a> {
3314 fn from_indirect_with_document(
3315 document: &'a ParsedDocument,
3316 object: &'a IndirectObject,
3317 ) -> Option<Self> {
3318 let crate::CosObject::Stream(stream) = &object.object else {
3319 return None;
3320 };
3321 Some(Self {
3322 document,
3323 key: object.key,
3324 offset: object.offset,
3325 stream,
3326 object_type: ObjectTypeName::unchecked("stream"),
3327 supertypes: vec![ObjectTypeName::unchecked("object")],
3328 links: Vec::new(),
3329 })
3330 }
3331}
3332
3333impl ModelObject for StreamModel<'_> {
3334 fn id(&self) -> Option<ObjectIdentity> {
3335 Some(ObjectIdentity {
3336 key: format!("stream:{}:{}", self.key.number, self.key.generation),
3337 })
3338 }
3339
3340 fn object_type(&self) -> ObjectTypeName {
3341 self.object_type.clone()
3342 }
3343
3344 fn super_types(&self) -> &[ObjectTypeName] {
3345 &self.supertypes
3346 }
3347
3348 fn extra_context(&self) -> Option<&str> {
3349 Some("stream")
3350 }
3351
3352 fn property(&self, name: &PropertyName) -> Result<ModelValue> {
3353 match name.as_str() {
3354 "lengthMatches" => {
3355 Ok(ModelValue::Bool(self.stream.declared_length.is_none_or(
3356 |declared| declared == self.stream.discovered_length,
3357 )))
3358 }
3359 "declaredLength" => Ok(ModelValue::Number(u64_to_f64(
3360 self.stream
3361 .declared_length
3362 .unwrap_or(self.stream.discovered_length),
3363 )?)),
3364 "discoveredLength" => Ok(ModelValue::Number(u64_to_f64(
3365 self.stream.discovered_length,
3366 )?)),
3367 "streamKeywordCRLFCompliant" => {
3368 Ok(ModelValue::Bool(self.stream.stream_keyword_crlf_compliant))
3369 }
3370 "endstreamKeywordEOLCompliant" => Ok(ModelValue::Bool(
3371 self.stream.endstream_keyword_eol_compliant,
3372 )),
3373 "F" | "FFilter" | "FDecodeParms" => Ok(self
3374 .stream
3375 .dictionary
3376 .get(name.as_str())
3377 .cloned()
3378 .map_or(ModelValue::Null, ModelValue::from)),
3379 _ => Err(crate::ProfileError::UnknownProperty {
3380 property: BoundedText::unchecked(name.as_str()),
3381 }
3382 .into()),
3383 }
3384 }
3385
3386 fn links(&self) -> &[LinkName] {
3387 &self.links
3388 }
3389
3390 fn linked_objects<'a>(
3391 &self,
3392 _graph: &ModelGraph<'a>,
3393 _max_objects: usize,
3394 ) -> Result<Vec<ModelObjectRef<'a>>> {
3395 Ok(Vec::new())
3396 }
3397}
3398
/// Lookup table from model type name to the rules that target that type.
struct RuleIndex<'a> {
    /// Rules grouped by `rule.object_type`, each group in input order.
    by_type: BTreeMap<&'a str, Vec<&'a Rule>>,
}
3402
3403impl<'a> RuleIndex<'a> {
3404 fn new(rules: &'a [Rule]) -> Self {
3405 let mut by_type: BTreeMap<&'a str, Vec<&'a Rule>> = BTreeMap::new();
3406 for rule in rules {
3407 by_type
3408 .entry(rule.object_type.as_str())
3409 .or_default()
3410 .push(rule);
3411 }
3412 Self { by_type }
3413 }
3414
3415 fn rules_for(&self, object: &ModelObjectRef<'_>) -> Vec<&'a Rule> {
3416 let mut rules = self
3417 .by_type
3418 .get(object.object_type().as_str())
3419 .cloned()
3420 .unwrap_or_default();
3421 let supertypes = match object {
3422 ModelObjectRef::Document(model) => model.super_types(),
3423 ModelObjectRef::Catalog(model) => model.super_types(),
3424 ModelObjectRef::Metadata(model) => model.super_types(),
3425 ModelObjectRef::Page(model) => model.super_types(),
3426 ModelObjectRef::Font(model) => model.super_types(),
3427 ModelObjectRef::Annotation(model) => model.super_types(),
3428 ModelObjectRef::OutputIntent(model) => model.super_types(),
3429 ModelObjectRef::ContentStream(model) => model.super_types(),
3430 ModelObjectRef::Stream(model) => model.super_types(),
3431 ModelObjectRef::Generic(model) => model.super_types(),
3432 };
3433 for supertype in supertypes {
3434 if let Some(super_rules) = self.by_type.get(supertype.as_str()) {
3435 rules.extend(super_rules.iter().copied());
3436 }
3437 }
3438 rules
3439 }
3440}
3441
/// Mutable bookkeeping for evaluating one profile's rules.
struct ProfileState {
    /// Identity of the profile being evaluated.
    profile: crate::ProfileIdentity,
    /// Maximum number of failed assertions retained per rule id.
    max_failed_assertions_per_rule: u32,
    /// Whether passed outcomes are recorded as assertions.
    record_passed_assertions: bool,
    /// Incremented once per `apply_rule` call.
    checks_executed: u64,
    /// Incremented once per `apply_rule` call.
    rules_executed: u64,
    /// Incremented once per failed outcome.
    failed_rules: u64,
    /// Retained failed assertions, capped per rule.
    failed_assertions: Vec<Assertion>,
    /// Passed assertions; only filled when `record_passed_assertions` is set.
    passed_assertions: Vec<Assertion>,
    /// Rules that could not be evaluated, with the reason.
    unsupported_rules: Vec<UnsupportedRule>,
    /// Number of failed assertions already retained for each rule id.
    retained_failures_by_rule: HashMap<RuleId, u32>,
    /// Ordinal handed to the next assertion; starts at 1.
    next_ordinal: u64,
}
3455
3456impl ProfileState {
3457 fn new(
3458 profile: crate::ProfileIdentity,
3459 max_failed_assertions_per_rule: u32,
3460 record_passed_assertions: bool,
3461 ) -> Self {
3462 Self {
3463 profile,
3464 max_failed_assertions_per_rule,
3465 record_passed_assertions,
3466 checks_executed: 0,
3467 rules_executed: 0,
3468 failed_rules: 0,
3469 failed_assertions: Vec::new(),
3470 passed_assertions: Vec::new(),
3471 unsupported_rules: Vec::new(),
3472 retained_failures_by_rule: HashMap::new(),
3473 next_ordinal: 1,
3474 }
3475 }
3476
3477 fn apply_rule(
3478 &mut self,
3479 object: &ModelObjectRef<'_>,
3480 rule: &Rule,
3481 evaluator: &mut DefaultRuleEvaluator,
3482 ) -> Result<()> {
3483 self.rules_executed =
3484 self.rules_executed
3485 .checked_add(1)
3486 .ok_or(ValidationError::LimitExceeded {
3487 limit: "rules_executed",
3488 })?;
3489 self.checks_executed =
3490 self.checks_executed
3491 .checked_add(1)
3492 .ok_or(ValidationError::LimitExceeded {
3493 limit: "checks_executed",
3494 })?;
3495 let outcome = match evaluator.evaluate(object.clone(), rule) {
3496 Ok(outcome) => outcome,
3497 Err(PdfvError::Profile(error)) => {
3498 self.unsupported_rules.push(UnsupportedRule {
3499 profile_id: self.profile.id.clone(),
3500 rule_id: rule.id.clone(),
3501 expression_fragment: Some(BoundedText::unchecked(format!("{:?}", rule.test))),
3502 reason: BoundedText::new(error.to_string(), 512)?,
3503 references: rule.references.clone(),
3504 });
3505 return Ok(());
3506 }
3507 Err(error) => return Err(error),
3508 };
3509 match outcome {
3510 RuleOutcome::Passed if self.record_passed_assertions => {
3511 let assertion = self.assertion(object, rule, outcome)?;
3512 self.passed_assertions.push(assertion);
3513 }
3514 RuleOutcome::Passed => {}
3515 RuleOutcome::Failed => {
3516 self.failed_rules =
3517 self.failed_rules
3518 .checked_add(1)
3519 .ok_or(ValidationError::LimitExceeded {
3520 limit: "failed_rules",
3521 })?;
3522 let retained = self
3523 .retained_failures_by_rule
3524 .get(&rule.id)
3525 .copied()
3526 .unwrap_or(0);
3527 if retained < self.max_failed_assertions_per_rule {
3528 let assertion = self.assertion(object, rule, outcome)?;
3529 self.failed_assertions.push(assertion);
3530 self.retained_failures_by_rule
3531 .insert(rule.id.clone(), retained.saturating_add(1));
3532 }
3533 }
3534 }
3535 Ok(())
3536 }
3537
3538 fn register_static_unsupported_rules(&mut self, rules: &[Rule]) {
3539 for rule in rules {
3540 if let crate::RuleExpr::Unsupported { fragment, reason } = &rule.test {
3541 self.unsupported_rules.push(UnsupportedRule {
3542 profile_id: self.profile.id.clone(),
3543 rule_id: rule.id.clone(),
3544 expression_fragment: Some(fragment.clone()),
3545 reason: reason.clone(),
3546 references: rule.references.clone(),
3547 });
3548 }
3549 }
3550 }
3551
3552 fn assertion(
3553 &mut self,
3554 object: &ModelObjectRef<'_>,
3555 rule: &Rule,
3556 outcome: RuleOutcome,
3557 ) -> Result<Assertion> {
3558 let ordinal = NonZeroU64::new(self.next_ordinal).ok_or(ValidationError::LimitExceeded {
3559 limit: "assertion_ordinal",
3560 })?;
3561 self.next_ordinal =
3562 self.next_ordinal
3563 .checked_add(1)
3564 .ok_or(ValidationError::LimitExceeded {
3565 limit: "assertion_ordinal",
3566 })?;
3567 Ok(Assertion {
3568 ordinal,
3569 rule_id: rule.id.clone(),
3570 status: outcome.assertion_status(),
3571 description: rule.description.clone(),
3572 location: object.location(),
3573 object_context: Some(object.context()),
3574 message: Some(rule.error.message.clone()),
3575 error_arguments: Vec::<ErrorArgument>::new(),
3576 })
3577 }
3578
3579 fn finish(self) -> ProfileReport {
3580 ProfileReport::builder()
3581 .profile(self.profile)
3582 .is_compliant(self.failed_rules == 0 && self.unsupported_rules.is_empty())
3583 .checks_executed(self.checks_executed)
3584 .rules_executed(self.rules_executed)
3585 .failed_rules(self.failed_rules)
3586 .failed_assertions(self.failed_assertions)
3587 .passed_assertions(self.passed_assertions)
3588 .unsupported_rules(self.unsupported_rules)
3589 .build()
3590 }
3591}
3592
3593fn reader_len<R: Read + Seek>(reader: &mut R) -> Result<Option<u64>> {
3594 let current = reader
3595 .stream_position()
3596 .map_err(|source| PdfvError::Io { path: None, source })?;
3597 let end = reader
3598 .seek(SeekFrom::End(0))
3599 .map_err(|source| PdfvError::Io { path: None, source })?;
3600 reader
3601 .seek(SeekFrom::Start(current))
3602 .map_err(|source| PdfvError::Io { path: None, source })?;
3603 Ok(Some(end))
3604}
3605
3606fn parse_failed_report(
3607 source: InputSummary,
3608 error: &crate::ParseError,
3609 elapsed: std::time::Duration,
3610) -> Result<ValidationReport> {
3611 Ok(ValidationReport::builder()
3612 .engine_version(ENGINE_VERSION.to_owned())
3613 .source(source)
3614 .status(ValidationStatus::ParseFailed)
3615 .flavours(Vec::new())
3616 .profile_reports(Vec::new())
3617 .parse_facts(Vec::new())
3618 .warnings(vec![crate::ValidationWarning::General {
3619 message: BoundedText::new(error.to_string(), 512)?,
3620 }])
3621 .task_durations(vec![TaskDuration::from_duration(
3622 Identifier::new("parse")?,
3623 elapsed,
3624 )])
3625 .build())
3626}
3627
3628fn base_report(
3629 source: InputSummary,
3630 status: ValidationStatus,
3631 profile_reports: Vec<ProfileReport>,
3632 parsed: ParsedDocument,
3633 elapsed: std::time::Duration,
3634) -> Result<ValidationReport> {
3635 Ok(ValidationReport::builder()
3636 .engine_version(ENGINE_VERSION.to_owned())
3637 .source(source)
3638 .status(status)
3639 .flavours(Vec::new())
3640 .profile_reports(profile_reports)
3641 .parse_facts(parsed.parse_facts)
3642 .warnings(parsed.warnings)
3643 .task_durations(vec![TaskDuration::from_duration(
3644 Identifier::new("validate")?,
3645 elapsed,
3646 )])
3647 .build())
3648}
3649
3650fn header_offset(document: &ParsedDocument) -> u64 {
3651 document
3652 .parse_facts
3653 .iter()
3654 .find_map(|fact| match fact {
3655 crate::ParseFact::Header { offset, .. } => Some(*offset),
3656 _ => None,
3657 })
3658 .unwrap_or(0)
3659}
3660
3661fn post_eof_data_size(document: &ParsedDocument) -> u64 {
3662 document
3663 .parse_facts
3664 .iter()
3665 .find_map(|fact| match fact {
3666 crate::ParseFact::PostEofData { bytes } => Some(*bytes),
3667 _ => None,
3668 })
3669 .unwrap_or(0)
3670}
3671
3672fn contains_xref_stream(document: &ParsedDocument) -> bool {
3673 document.parse_facts.iter().any(|fact| {
3674 matches!(
3675 fact,
3676 crate::ParseFact::Xref {
3677 fact: crate::XrefFact::XrefStreamParsed { .. }
3678 | crate::XrefFact::XrefStreamUnsupported,
3679 ..
3680 }
3681 )
3682 })
3683}
3684
3685fn contains_xmp_family(document: &ParsedDocument, family: &str) -> bool {
3686 document.parse_facts.iter().any(|fact| {
3687 matches!(
3688 fact,
3689 crate::ParseFact::Xmp {
3690 fact:
3691 crate::XmpFact::FlavourClaim {
3692 family: claim_family,
3693 ..
3694 },
3695 ..
3696 } if claim_family.as_str() == family
3697 )
3698 })
3699}
3700
3701fn xmp_part(document: &ParsedDocument) -> Option<f64> {
3702 document.parse_facts.iter().find_map(|fact| {
3703 let crate::ParseFact::Xmp {
3704 fact:
3705 crate::XmpFact::FlavourClaim {
3706 family,
3707 display_flavour,
3708 ..
3709 },
3710 ..
3711 } = fact
3712 else {
3713 return None;
3714 };
3715 if family.as_str() == "pdfa" || family.as_str() == "pdfua" {
3716 display_flavour
3717 .as_str()
3718 .split('-')
3719 .nth(1)
3720 .and_then(|value| value.chars().next())
3721 .and_then(|character| character.to_digit(10))
3722 .map(f64::from)
3723 } else {
3724 None
3725 }
3726 })
3727}
3728
3729fn xmp_prefix_for_claim(document: &ParsedDocument) -> Option<&'static str> {
3730 document.parse_facts.iter().find_map(|fact| {
3731 let crate::ParseFact::Xmp {
3732 fact: crate::XmpFact::FlavourClaim { family, .. },
3733 ..
3734 } = fact
3735 else {
3736 return None;
3737 };
3738 match family.as_str() {
3739 "pdfa" => Some("pdfaid"),
3740 "pdfua" => Some("pdfuaid"),
3741 _ => None,
3742 }
3743 })
3744}
3745
3746fn xmp_conformance(document: &ParsedDocument) -> Option<String> {
3747 document.parse_facts.iter().find_map(|fact| {
3748 let crate::ParseFact::Xmp {
3749 fact:
3750 crate::XmpFact::FlavourClaim {
3751 family,
3752 display_flavour,
3753 ..
3754 },
3755 ..
3756 } = fact
3757 else {
3758 return None;
3759 };
3760 if family.as_str() != "pdfa" {
3761 return None;
3762 }
3763 display_flavour
3764 .as_str()
3765 .chars()
3766 .last()
3767 .filter(char::is_ascii_alphabetic)
3768 .map(|character| character.to_ascii_uppercase().to_string())
3769 })
3770}
3771
3772fn xmp_declarations(document: &ParsedDocument) -> Vec<ModelValue> {
3773 document
3774 .parse_facts
3775 .iter()
3776 .filter_map(|fact| {
3777 let crate::ParseFact::Xmp {
3778 fact:
3779 crate::XmpFact::FlavourClaim {
3780 family,
3781 display_flavour,
3782 ..
3783 },
3784 ..
3785 } = fact
3786 else {
3787 return None;
3788 };
3789 if family.as_str() != "wtpdf" {
3790 return None;
3791 }
3792 let declaration = match display_flavour.as_str() {
3793 "wtpdf-1-0-accessibility" => "http://pdfa.org/declarations/wtpdf#accessibility1.0",
3794 "wtpdf-1-0-reuse" => "http://pdfa.org/declarations/wtpdf#reuse1.0",
3795 _ => return None,
3796 };
3797 Some(ModelValue::String(BoundedText::unchecked(declaration)))
3798 })
3799 .collect()
3800}
3801
3802fn u64_to_f64(value: u64) -> Result<f64> {
3803 let bounded = u32::try_from(value).map_err(|_| ValidationError::LimitExceeded {
3804 limit: "numeric_property",
3805 })?;
3806 Ok(f64::from(bounded))
3807}
3808
3809fn usize_to_f64(value: usize) -> Result<f64> {
3810 let bounded = u32::try_from(value).map_err(|_| ValidationError::LimitExceeded {
3811 limit: "numeric_property",
3812 })?;
3813 Ok(f64::from(bounded))
3814}
3815
3816fn remaining_object_budget(
3817 limits: &ResourceLimits,
3818 visited_len: usize,
3819 stack_len: usize,
3820) -> Result<usize> {
3821 let visited = u64::try_from(visited_len).map_err(|_| ValidationError::LimitExceeded {
3822 limit: "max_objects",
3823 })?;
3824 let pending = u64::try_from(stack_len).map_err(|_| ValidationError::LimitExceeded {
3825 limit: "max_objects",
3826 })?;
3827 let consumed = visited
3828 .checked_add(pending)
3829 .ok_or(ValidationError::LimitExceeded {
3830 limit: "max_objects",
3831 })?;
3832 let remaining =
3833 limits
3834 .max_objects
3835 .checked_sub(consumed)
3836 .ok_or(ValidationError::LimitExceeded {
3837 limit: "max_objects",
3838 })?;
3839 usize::try_from(remaining).map_err(|_| {
3840 ValidationError::LimitExceeded {
3841 limit: "max_objects",
3842 }
3843 .into()
3844 })
3845}
3846
3847fn push_linked<'a>(
3848 objects: &mut Vec<ModelObjectRef<'a>>,
3849 object: ModelObjectRef<'a>,
3850 max_objects: usize,
3851) -> Result<()> {
3852 if objects.len() >= max_objects {
3853 return Err(ValidationError::LimitExceeded {
3854 limit: "max_objects",
3855 }
3856 .into());
3857 }
3858 objects.push(object);
3859 Ok(())
3860}
3861
#[cfg(test)]
mod tests {
    use std::{io::Cursor, sync::Arc};

    use super::{
        AnnotationModel, CatalogModel, ContentStreamModel, FontModel, OutputIntentModel, PageModel,
    };
    use crate::{
        BinaryOp, BoundedText, ErrorTemplate, FlavourSelection, Identifier, ModelObject,
        ModelObjectRef, ModelValue, Parser, PdfvError, ProfileIdentity, ProfileRepository,
        PropertyName, ResourceLimits, Rule, RuleExpr, RuleId, ValidationFlavour, ValidationOptions,
        ValidationProfile, Validator,
    };

    /// Repository that hands out one fixed profile for every flavour selection.
    #[derive(Debug)]
    struct StaticRepo(ValidationProfile);

    impl ProfileRepository for StaticRepo {
        fn profiles_for(
            &self,
            _selection: &FlavourSelection,
        ) -> crate::Result<Vec<ValidationProfile>> {
            let Self(profile) = self;
            Ok(vec![profile.clone()])
        }
    }

    /// Builds the `MissingObject` parse error used when a fixture lacks an expected item.
    fn missing_object(what: &'static str) -> crate::ParseError {
        crate::ParseError::MissingObject {
            message: crate::BoundedText::unchecked(what),
        }
    }

    /// Small PDF with a catalog, one page, one font, one annotation, one
    /// content stream and one output intent.
    fn m1_model_pdf() -> &'static [u8] {
        const FIXTURE: &[u8] = br"%PDF-1.7
1 0 obj
<< /Type /Catalog /Pages 2 0 R /OutputIntents [8 0 R] >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> >> /Annots [5 0 R] /Contents 6 0 R >>
endobj
4 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
5 0 obj
<< /Type /Annot /Subtype /Text >>
endobj
6 0 obj
<< /Length 3 >>
stream
q Q
endstream
endobj
7 0 obj
<< /Length 0 >>
stream
endstream
endobj
8 0 obj
<< /Type /OutputIntent /S /GTS_PDFA1 /DestOutputProfile 7 0 R >>
endobj
trailer
<< /Root 1 0 R >>
%%EOF
";
        FIXTURE
    }

    /// Larger PDF whose catalog and page reference the broad set of object
    /// families exercised by the feature and traversal tests below.
    fn m6_model_pdf() -> &'static [u8] {
        const FIXTURE: &[u8] = br"%PDF-1.7
1 0 obj
<< /Type /Catalog /Pages 2 0 R /AcroForm 7 0 R /StructTreeRoot 8 0 R /OCProperties 9 0 R /Names 10 0 R /Outlines 11 0 R /Perms 12 0 R /Dests [21 0 R] >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 4 0 R >> /XObject << /Im1 5 0 R /Fm1 22 0 R >> /ColorSpace << /CS1 13 0 R >> /ExtGState << /GS1 14 0 R >> >> /Annots [6 0 R] /Contents 15 0 R >>
endobj
4 0 obj
<< /Type /Font /Subtype /Type0 /BaseFont /Faux /ToUnicode 16 0 R /FontDescriptor << /FontFile2 20 0 R >> >>
endobj
5 0 obj
<< /Type /XObject /Subtype /Image /Width 1 /Height 1 /ColorSpace /DeviceRGB /BitsPerComponent 8 /Length 0 >>
stream
endstream
endobj
6 0 obj
<< /Type /Annot /Subtype /Widget /FT /Sig /A 17 0 R >>
endobj
7 0 obj
<< /Fields [6 0 R] /SigFlags 3 >>
endobj
8 0 obj
<< /Type /StructTreeRoot /K 18 0 R /RoleMap << /H1 /H >> >>
endobj
9 0 obj
<< /OCGs [] /D << >> >>
endobj
10 0 obj
<< /Dests << /Names [] >> >>
endobj
11 0 obj
<< /Type /Outlines /Count 0 >>
endobj
12 0 obj
<< /DocMDP 19 0 R >>
endobj
13 0 obj
<< /N 3 /Alternate /DeviceRGB >>
endobj
14 0 obj
<< /Type /ExtGState /BM /Normal /CA 1 >>
endobj
15 0 obj
<< /Length 3 >>
stream
q Q
endstream
endobj
16 0 obj
<< /Type /CMap /CMapName /Identity-H >>
endobj
17 0 obj
<< /Type /Action /S /URI /URI (https://example.invalid) >>
endobj
18 0 obj
<< /Type /StructElem /S /Document /K [] >>
endobj
19 0 obj
<< /Type /Sig /Filter /Adobe.PPKLite /ByteRange [0 0 0 0] >>
endobj
20 0 obj
<< /Type /EmbeddedFile /Length 0 >>
stream
endstream
endobj
21 0 obj
<< /D [3 0 R /Fit] >>
endobj
22 0 obj
<< /Type /XObject /Subtype /Form /BBox [0 0 1 1] /Length 0 >>
stream
endstream
endobj
23 0 obj
<< /Filter /Standard /V 1 /R 2 /Length 40 /P -4 >>
endobj
trailer
<< /Root 1 0 R >>
%%EOF
";
        FIXTURE
    }

    #[test]
    fn test_should_materialize_m1_model_wrappers() -> crate::Result<()> {
        let document = Parser::default().parse(Cursor::new(m1_model_pdf()))?;
        let catalog_key = document
            .catalog
            .ok_or_else(|| missing_object("missing catalog"))?;
        let catalog = CatalogModel::new(&document, catalog_key)
            .ok_or_else(|| missing_object("missing catalog model"))?;

        let limits = crate::ResourceLimits::default();
        let pages = PageModel::from_catalog(&document, &catalog, &limits, 16)?;
        let page = pages
            .first()
            .ok_or_else(|| missing_object("missing page"))?;
        let fonts = FontModel::from_page(&document, page, 16)?;
        let annotations = AnnotationModel::from_page(&document, page, 16)?;
        let output_intents = OutputIntentModel::from_catalog(&document, &catalog, 16)?;
        let content_streams = ContentStreamModel::from_page(&document, page, 16)?;

        // The fixture holds exactly one object of each wrapped kind.
        assert_eq!(pages.len(), 1);
        assert_eq!(fonts.len(), 1);
        assert_eq!(annotations.len(), 1);
        assert_eq!(output_intents.len(), 1);
        assert_eq!(content_streams.len(), 1);

        let has_contents = page.property(&PropertyName::new("hasContents")?)?;
        assert_eq!(has_contents, ModelValue::Bool(true));
        Ok(())
    }

    #[test]
    fn test_should_resolve_m1_links_lazily_from_model_graph() -> crate::Result<()> {
        let document = Parser::default().parse(Cursor::new(m1_model_pdf()))?;
        let limits = crate::ResourceLimits::default();
        let graph = super::ModelGraph::with_all_families(&document, &limits);
        let root = ModelObjectRef::Document(super::DocumentModel::new(&document));

        // Depth-first walk that records the context of every object reached.
        let mut pending = vec![root];
        let mut visited_contexts = Vec::new();
        while let Some(object) = pending.pop() {
            visited_contexts.push(object.context().as_str().to_owned());
            pending.extend(object.linked_objects(&graph, 16)?);
        }

        for expected in [
            "root/page[0]",
            "root/page[0]/font[F1]",
            "root/page[0]/annotation[0]",
            "root/catalog[0]/outputIntent[0]",
            "root/page[0]/contentStream[0]",
        ] {
            assert!(
                visited_contexts.iter().any(|value| value == expected),
                "missing context {expected}: {visited_contexts:?}"
            );
        }
        Ok(())
    }

    #[test]
    fn test_should_redact_content_strings_from_feature_report() -> crate::Result<()> {
        let document = Parser::default().parse(Cursor::new(m6_model_pdf()))?;
        let session =
            super::ValidationSession::new(document, crate::ResourceLimits::default(), 100, false);
        let action_family = crate::ObjectTypeName::new("action")?;
        let selection = super::FeatureSelection::Families {
            families: vec![action_family.clone()],
        };
        let report = session.extract_features(&selection)?;

        let action = report
            .objects
            .iter()
            .find(|object| object.family == action_family)
            .ok_or_else(|| missing_object("missing action feature"))?;

        // The URI string must be reported as a redacted value with a non-zero size.
        let uri_property = PropertyName::new("URI")?;
        assert!(matches!(
            action.properties.get(&uri_property),
            Some(crate::FeatureValue::RedactedString { bytes }) if *bytes > 0
        ));
        Ok(())
    }

    #[test]
    fn test_should_truncate_feature_report_on_object_cap() -> crate::Result<()> {
        let document = Parser::default().parse(Cursor::new(m6_model_pdf()))?;
        let single_object_limits = crate::ResourceLimits {
            max_objects: 1,
            ..crate::ResourceLimits::default()
        };
        let session = super::ValidationSession::new(document, single_object_limits, 100, false);

        let report = session.extract_features(&super::FeatureSelection::All)?;

        assert!(report.truncated);
        assert_eq!(report.visited_objects, 1);
        Ok(())
    }

    #[test]
    fn test_should_register_model_family_schema_for_generated_profiles() -> crate::Result<()> {
        const REQUIRED_FAMILIES: &[&str] = &[
            "document",
            "catalog",
            "page",
            "resource",
            "font",
            "cMap",
            "image",
            "contentStream",
            "annotation",
            "action",
            "formField",
            "colorSpace",
            "extGState",
            "structureTreeRoot",
            "structureElement",
            "signature",
            "security",
        ];

        let registry = super::ModelRegistry::default_registry();

        for &family in REQUIRED_FAMILIES {
            assert!(registry.has_family(&crate::ObjectTypeName::new(family)?));
        }

        let structure_element = crate::ObjectTypeName::new("structureElement")?;
        let parent_standard_type = PropertyName::new("parentStandardType")?;
        assert!(registry.has_family_property(&structure_element, &parent_standard_type));
        Ok(())
    }

    #[test]
    fn test_should_materialize_m6_broad_model_families_bounded_iteratively() -> crate::Result<()> {
        const EXPECTED_FAMILIES: &[&str] = &[
            "acroForm",
            "structureTreeRoot",
            "optionalContentProperties",
            "names",
            "outline",
            "destination",
            "permissions",
            "pageTree",
            "resource",
            "image",
            "xObject",
            "colorSpace",
            "extGState",
            "cMap",
            "embeddedFontFile",
            "action",
            "signature",
            "security",
        ];

        let document = Parser::default().parse(Cursor::new(m6_model_pdf()))?;
        let limits = crate::ResourceLimits {
            max_objects: 128,
            ..crate::ResourceLimits::default()
        };
        let graph = super::ModelGraph::with_all_families(&document, &limits);
        let root = ModelObjectRef::Document(super::DocumentModel::new(&document));

        // Iterative walk; each identity is expanded at most once.
        let mut pending = vec![root];
        let mut visited = std::collections::HashSet::new();
        let mut families = std::collections::BTreeSet::new();
        while let Some(object) = pending.pop() {
            if visited.insert(object.identity_key()) {
                families.insert(object.object_type().as_str().to_owned());
                pending.extend(object.linked_objects(&graph, 128)?);
            }
        }

        for &family in EXPECTED_FAMILIES {
            assert!(families.contains(family), "missing {family}: {families:?}");
        }
        Ok(())
    }

    #[test]
    fn test_should_validate_m1_linked_objects_through_lazy_traversal() -> crate::Result<()> {
        let repository = Arc::new(StaticRepo(linked_object_profile()?));
        let validator = Validator::with_profiles(ValidationOptions::default(), repository)?;
        let report =
            validator.validate_reader(Cursor::new(m1_model_pdf()), crate::InputName::memory())?;

        let profile_report =
            report
                .profile_reports
                .first()
                .ok_or(crate::ValidationError::LimitExceeded {
                    limit: "profile_reports",
                })?;
        let failed_contexts: Vec<&str> = profile_report
            .failed_assertions
            .iter()
            .filter_map(|assertion| assertion.object_context.as_ref())
            .map(BoundedText::as_str)
            .collect();

        assert_eq!(profile_report.rules_executed, 5);
        for expected in [
            "root/page[0]",
            "root/page[0]/font[F1]",
            "root/page[0]/annotation[0]",
            "root/catalog[0]/outputIntent[0]",
            "root/page[0]/contentStream[0]",
        ] {
            assert!(
                failed_contexts.contains(&expected),
                "missing failed context {expected}: {failed_contexts:?}"
            );
        }
        Ok(())
    }

    #[test]
    fn test_should_limit_lazy_link_expansion_before_enqueue() -> crate::Result<()> {
        let single_object_limits = ResourceLimits {
            max_objects: 1,
            ..ResourceLimits::default()
        };
        let options = ValidationOptions::builder()
            .resource_limits(single_object_limits)
            .build();

        let outcome = Validator::new(options)?.validate_reader(
            Cursor::new(simple_catalog_pdf()),
            crate::InputName::memory(),
        );
        let error = match outcome {
            Err(error) => error,
            // Validation was expected to hit the object cap.
            Ok(_) => {
                return Err(crate::ValidationError::LimitExceeded {
                    limit: "expected_error",
                }
                .into());
            }
        };

        assert!(matches!(
            error,
            PdfvError::Validation(crate::ValidationError::LimitExceeded {
                limit: "max_objects"
            })
        ));
        Ok(())
    }

    /// Minimal PDF consisting of a bare catalog and a trailer.
    fn simple_catalog_pdf() -> &'static [u8] {
        const FIXTURE: &[u8] = br"%PDF-1.7
1 0 obj
<< /Type /Catalog >>
endobj
trailer
<< /Root 1 0 R >>
%%EOF
";
        FIXTURE
    }

    /// Profile with one always-failing rule per linked object family; the
    /// content-stream rule is marked deferred.
    fn linked_object_profile() -> crate::Result<ValidationProfile> {
        let identity = ProfileIdentity {
            id: Identifier::new("lazy-links")?,
            name: BoundedText::new("lazy links", 64)?,
            version: None,
        };
        let flavour = ValidationFlavour::new("pdfa", std::num::NonZeroU32::MIN, "b")?;
        let rules = vec![
            false_rule("page-rule", "page", false)?,
            false_rule("font-rule", "font", false)?,
            false_rule("annotation-rule", "annotation", false)?,
            false_rule("output-intent-rule", "outputIntent", false)?,
            false_rule("content-stream-deferred", "contentStream", true)?,
        ];
        Ok(ValidationProfile {
            identity,
            flavour,
            rules,
        })
    }

    /// Builds a rule for `object_type` whose test (`true == false`) never holds.
    fn false_rule(id: &str, object_type: &str, deferred: bool) -> crate::Result<Rule> {
        let rule_id = RuleId(Identifier::new(id)?);
        let object_type = crate::ObjectTypeName::new(object_type)?;
        let description = BoundedText::new(id, 64)?;
        let always_false = RuleExpr::Binary {
            op: BinaryOp::Eq,
            left: Box::new(RuleExpr::Bool { value: true }),
            right: Box::new(RuleExpr::Bool { value: false }),
        };
        let error = ErrorTemplate {
            message: BoundedText::new(id, 64)?,
        };
        Ok(Rule {
            id: rule_id,
            object_type,
            deferred,
            tags: Vec::new(),
            description,
            test: always_false,
            error,
            references: Vec::new(),
        })
    }
}