Skip to main content

pdfv_core/
xmp.rs

1//! Bounded XMP packet extraction, identification parsing, and flavour detection.
2
3use std::{collections::BTreeMap, num::NonZeroU32, sync::Arc};
4
5use quick_xml::{Reader, events::Event};
6use serde::{Deserialize, Serialize};
7
8use crate::{
9    BoundedText, CosObject, Identifier, ObjectKey, ParseError, ParseFact, PdfvError,
10    ProfileRepository, ResourceLimits, Result, ValidationFlavour, ValidationProfile,
11    ValidationWarning, XmpFact, display_flavour,
12};
13
/// Namespace URI matched for PDF/A identification properties.
const PDF_A_ID_NS: &str = "http://www.aiim.org/pdfa/ns/id/";
/// Namespace URI matched for PDF/UA identification properties.
const PDF_UA_ID_NS: &str = "http://www.aiim.org/pdfua/ns/id/";
/// Namespace URI matched for PDF Declaration properties (source of WTPDF claims).
const PDF_D_NS: &str = "http://pdfa.org/declarations/";
/// Declaration value that maps to the WTPDF `accessibility` conformance.
const WTPDF_ACCESSIBILITY_DECLARATION: &str = "http://pdfa.org/declarations/wtpdf#accessibility1.0";
/// Declaration value that maps to the WTPDF `reuse` conformance.
const WTPDF_REUSE_DECLARATION: &str = "http://pdfa.org/declarations/wtpdf#reuse1.0";
19
/// Namespace declaration retained from an XMP packet.
///
/// The parser keeps one binding per distinct prefix; when a prefix is declared
/// more than once, the most recently read URI is the one retained.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct NamespaceBinding {
    /// Namespace prefix, or empty for the default namespace.
    pub prefix: Identifier,
    /// Namespace URI.
    pub uri: BoundedText,
}
30
/// Recognized XMP identification schema kind.
///
/// Serialized in camelCase.
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase")]
pub enum XmpIdentificationKind {
    /// PDF/A identification schema.
    PdfA,
    /// PDF/UA identification schema.
    PdfUa,
    /// WTPDF PDF Declaration.
    Wtpdf,
}
43
/// Recognized flavour claim extracted from XMP metadata.
///
/// A claim records both the flavour it maps to and the XMP property
/// (namespace URI plus local name) it was derived from.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct FlavourClaim {
    /// Claim kind.
    pub kind: XmpIdentificationKind,
    /// Validation flavour represented by the claim.
    pub flavour: ValidationFlavour,
    /// Report-safe display spelling.
    pub display_flavour: BoundedText,
    /// Source namespace URI.
    pub namespace_uri: BoundedText,
    /// Source property name.
    pub property: Identifier,
}
60
/// Parsed XMP packet summary.
///
/// Produced by [`XmpParser::parse_packet`]; it carries only the retained
/// namespaces, the recognized claims, and report-safe facts.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[non_exhaustive]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct XmpPacket {
    /// Metadata stream object that supplied the packet.
    pub source_object: ObjectKey,
    /// Packet byte count.
    pub bytes: u64,
    /// Retained namespace declarations.
    pub namespaces: Vec<NamespaceBinding>,
    /// Recognized identification claims.
    pub identification: Vec<FlavourClaim>,
    /// Report-safe parser facts.
    pub facts: Vec<XmpFact>,
}
77
/// Bounded XMP parser.
///
/// Stateless unit type; every limit is supplied per call through
/// [`ResourceLimits`].
#[derive(Clone, Debug, Default)]
pub struct XmpParser;
81
/// Auto flavour detection result.
///
/// Returned by [`FlavourDetector::detect`] for both the detected and the
/// fallback path.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct DetectedFlavours {
    /// Parsed packet, when catalog metadata was present and XML was parseable.
    pub packet: Option<XmpPacket>,
    /// Profiles selected for validation.
    pub profiles: Vec<ValidationProfile>,
    /// Report-safe parse facts generated during detection.
    pub parse_facts: Vec<ParseFact>,
    /// Structured warnings generated during detection.
    pub warnings: Vec<ValidationWarning>,
}
95
/// Report-safe XMP parse result independent of profile selection.
///
/// Returned by `parse_document_xmp`; `packet` is `None` when no metadata
/// stream was found or when the packet could not be parsed.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub(crate) struct XmpParseResult {
    /// Parsed packet, when XML was parseable.
    pub packet: Option<XmpPacket>,
    /// Report-safe parse facts generated during parsing.
    pub parse_facts: Vec<ParseFact>,
    /// Structured warnings generated during parsing.
    pub warnings: Vec<ValidationWarning>,
}
107
/// Profile selector backed by XMP identification claims.
///
/// Cloning shares the same repository through the `Arc`.
#[derive(Clone)]
pub struct FlavourDetector {
    /// Repository queried to turn a flavour selection into validation profiles.
    profiles: Arc<dyn ProfileRepository + Send + Sync>,
}
113
114impl std::fmt::Debug for FlavourDetector {
115    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
116        formatter.write_str("FlavourDetector")
117    }
118}
119
120impl FlavourDetector {
    /// Creates a detector backed by a profile repository.
    ///
    /// The repository is stored as given and queried on every call to
    /// [`FlavourDetector::detect`].
    #[must_use]
    pub fn new(profiles: Arc<dyn ProfileRepository + Send + Sync>) -> Self {
        Self { profiles }
    }
126
127    /// Detects validation profiles from catalog XMP metadata.
128    ///
129    /// # Errors
130    ///
131    /// Returns [`PdfvError`] when a detected or fallback flavour cannot be loaded.
132    pub fn detect(
133        &self,
134        document: &crate::ParsedDocument,
135        default: Option<&ValidationFlavour>,
136        limits: &ResourceLimits,
137    ) -> Result<DetectedFlavours> {
138        let parsed_xmp = parse_document_xmp(document, limits, true)?;
139        let Some(packet) = parsed_xmp.packet else {
140            let mut fallback = self.fallback(default, "catalog metadata stream is missing")?;
141            fallback.parse_facts = parsed_xmp.parse_facts;
142            fallback.warnings = parsed_xmp.warnings;
143            return Ok(fallback);
144        };
145
146        let mut parse_facts = parsed_xmp.parse_facts;
147        let mut warnings = parsed_xmp.warnings;
148        let mut profiles = Vec::new();
149        for claim in &packet.identification {
150            match self
151                .profiles
152                .profiles_for(&crate::FlavourSelection::Explicit {
153                    flavour: claim.flavour.clone(),
154                }) {
155                Ok(mut selected) => profiles.append(&mut selected),
156                Err(error) => warnings.push(ValidationWarning::IncompatibleProfile {
157                    profile_id: Identifier::new(claim.display_flavour.as_str())?,
158                    reason: BoundedText::new(error.to_string(), 512)?,
159                }),
160            }
161        }
162        if profiles.is_empty() {
163            let mut fallback =
164                self.fallback(default, "XMP metadata contains no supported claims")?;
165            fallback.parse_facts.append(&mut parse_facts);
166            fallback.warnings.extend(warnings);
167            fallback.packet = Some(packet);
168            return Ok(fallback);
169        }
170        let compatible_profiles = select_compatible_profiles(profiles, &mut warnings)?;
171        Ok(DetectedFlavours {
172            packet: Some(packet),
173            profiles: compatible_profiles,
174            parse_facts,
175            warnings,
176        })
177    }
178
179    fn fallback(
180        &self,
181        default: Option<&ValidationFlavour>,
182        reason: &'static str,
183    ) -> Result<DetectedFlavours> {
184        let warning = ValidationWarning::AutoDetection {
185            message: BoundedText::unchecked(reason),
186        };
187        self.fallback_with_warning(default, warning)
188    }
189
190    fn fallback_with_warning(
191        &self,
192        default: Option<&ValidationFlavour>,
193        warning: ValidationWarning,
194    ) -> Result<DetectedFlavours> {
195        let warnings = vec![warning];
196        let profiles = if let Some(flavour) = default {
197            self.profiles.profiles_for(&crate::FlavourSelection::Auto {
198                default: Some(flavour.clone()),
199            })?
200        } else {
201            self.profiles
202                .profiles_for(&crate::FlavourSelection::Auto { default: None })?
203        };
204        Ok(DetectedFlavours {
205            packet: None,
206            profiles,
207            parse_facts: Vec::new(),
208            warnings,
209        })
210    }
211}
212
213impl XmpParser {
214    /// Parses one catalog metadata stream into report-safe XMP facts.
215    ///
216    /// # Errors
217    ///
218    /// Returns [`PdfvError`] when resource limits are exceeded or XML is malformed.
219    pub fn parse_packet(
220        &self,
221        source_object: ObjectKey,
222        bytes: &[u8],
223        limits: &ResourceLimits,
224    ) -> Result<XmpPacket> {
225        enforce_xmp_len(bytes.len(), limits.max_xmp_bytes)?;
226        let text = std::str::from_utf8(bytes).map_err(|error| crate::ProfileError::InvalidXml {
227            reason: BoundedText::new(error.to_string(), 512)
228                .unwrap_or_else(|_| BoundedText::unchecked("XMP is not UTF-8")),
229        })?;
230        let parser = PacketBuilder::new(source_object, bytes.len(), limits);
231        parser.parse(text)
232    }
233}
234
/// Mutable state for one pass over an XMP packet.
///
/// Driven by XML events in `parse` and consumed by `finish`.
#[derive(Debug)]
struct PacketBuilder<'a> {
    /// Object key copied into the resulting packet.
    source_object: ObjectKey,
    /// Length of the packet in bytes, reported in the packet and its facts.
    byte_len: usize,
    /// Limits enforced while parsing.
    limits: &'a ResourceLimits,
    /// Current element nesting depth.
    depth: u32,
    /// Number of elements started so far.
    elements: u64,
    /// Every namespace declaration seen, keyed by prefix (last declaration wins).
    namespaces: BTreeMap<String, BoundedText>,
    /// Namespace declarations in scope for the element being processed.
    current_namespaces: BTreeMap<String, BoundedText>,
    /// Captured identification properties keyed by (namespace URI, local name).
    properties: BTreeMap<(String, String), XmpProperty>,
    /// Frames of the elements that are currently open.
    stack: Vec<ElementFrame>,
    /// Report-safe facts collected for the packet.
    facts: Vec<XmpFact>,
    /// Set once an element with local name `xmpmeta` or `RDF` has been seen.
    saw_packet_wrapper: bool,
}
249
250impl<'a> PacketBuilder<'a> {
251    fn new(source_object: ObjectKey, byte_len: usize, limits: &'a ResourceLimits) -> Self {
252        Self {
253            source_object,
254            byte_len,
255            limits,
256            depth: 0,
257            elements: 0,
258            namespaces: BTreeMap::new(),
259            current_namespaces: BTreeMap::new(),
260            properties: BTreeMap::new(),
261            stack: Vec::with_capacity(usize::try_from(limits.max_xmp_depth).unwrap_or(0)),
262            facts: Vec::new(),
263            saw_packet_wrapper: false,
264        }
265    }
266
267    fn parse(mut self, text: &str) -> Result<XmpPacket> {
268        let mut reader = Reader::from_str(text);
269        reader.config_mut().trim_text(true);
270        loop {
271            match reader.read_event().map_err(|error| xmp_xml_error(&error))? {
272                Event::Start(element) => self.start(&element)?,
273                Event::Empty(element) => {
274                    self.start(&element)?;
275                    self.end()?;
276                }
277                Event::Text(text) => {
278                    let decoded =
279                        text.decode()
280                            .map_err(|error| crate::ProfileError::InvalidXml {
281                                reason: bounded_reason(error.to_string()),
282                            })?;
283                    self.text(decoded.as_ref())?;
284                }
285                Event::End(_) => self.end()?,
286                Event::Decl(_) | Event::PI(_) | Event::Comment(_) | Event::CData(_) => {}
287                Event::DocType(_) | Event::GeneralRef(_) => {
288                    return Err(crate::ProfileError::InvalidXml {
289                        reason: BoundedText::unchecked(
290                            "XMP DTD and entity processing are forbidden",
291                        ),
292                    }
293                    .into());
294                }
295                Event::Eof => break,
296            }
297        }
298        self.finish()
299    }
300
301    fn start(&mut self, element: &quick_xml::events::BytesStart<'_>) -> Result<()> {
302        self.depth = self.depth.checked_add(1).ok_or(ParseError::LimitExceeded {
303            limit: "max_xmp_depth",
304        })?;
305        if self.depth > self.limits.max_xmp_depth {
306            return Err(ParseError::LimitExceeded {
307                limit: "max_xmp_depth",
308            }
309            .into());
310        }
311        self.elements = self
312            .elements
313            .checked_add(1)
314            .ok_or(ParseError::LimitExceeded {
315                limit: "max_xmp_elements",
316            })?;
317        if self.elements > self.limits.max_xmp_elements {
318            return Err(ParseError::LimitExceeded {
319                limit: "max_xmp_elements",
320            }
321            .into());
322        }
323        let (prefix, local) = split_xml_name(element.name().as_ref())?;
324        if local.as_str() == "xmpmeta" || local.as_str() == "RDF" {
325            self.saw_packet_wrapper = true;
326        }
327        let previous_namespaces = self.current_namespaces.clone();
328        self.read_namespaces(element)?;
329        let namespace = self.resolve_prefix(&prefix)?;
330        let frame = ElementFrame {
331            namespace,
332            local,
333            text: String::new(),
334            previous_namespaces,
335        };
336        self.capture_attr_properties(element)?;
337        self.stack.push(frame);
338        Ok(())
339    }
340
341    fn end(&mut self) -> Result<()> {
342        let Some(frame) = self.stack.pop() else {
343            return Err(crate::ProfileError::InvalidXml {
344                reason: BoundedText::unchecked("XMP element depth underflow"),
345            }
346            .into());
347        };
348        let value = frame.text.trim().to_owned();
349        if !value.is_empty() && is_identification_property(&frame.namespace, &frame.local) {
350            self.insert_property(&frame, &value)?;
351        }
352        self.current_namespaces = frame.previous_namespaces;
353        self.depth = self.depth.checked_sub(1).ok_or(ParseError::LimitExceeded {
354            limit: "max_xmp_depth",
355        })?;
356        Ok(())
357    }
358
359    fn text(&mut self, value: &str) -> Result<()> {
360        let Some(frame) = self.stack.last_mut() else {
361            return Ok(());
362        };
363        let next_len =
364            frame
365                .text
366                .len()
367                .checked_add(value.len())
368                .ok_or(ParseError::LimitExceeded {
369                    limit: "max_xmp_text_bytes",
370                })?;
371        if next_len > self.limits.max_xmp_text_bytes {
372            return Err(ParseError::LimitExceeded {
373                limit: "max_xmp_text_bytes",
374            }
375            .into());
376        }
377        frame.text.push_str(value);
378        Ok(())
379    }
380
381    fn finish(mut self) -> Result<XmpPacket> {
382        if !self.saw_packet_wrapper {
383            self.facts.push(XmpFact::MissingPacketWrapper);
384        }
385        let identification = self.claims()?;
386        self.facts.push(XmpFact::PacketParsed {
387            bytes: checked_u64_len(self.byte_len, "XMP packet length")?,
388            namespaces: checked_u64_len(self.namespaces.len(), "XMP namespace count")?,
389            claims: checked_u64_len(identification.len(), "XMP claim count")?,
390        });
391        for claim in &identification {
392            self.facts.push(XmpFact::FlavourClaim {
393                family: claim.flavour.family.clone(),
394                display_flavour: claim.display_flavour.clone(),
395                namespace_uri: claim.namespace_uri.clone(),
396            });
397        }
398        let namespaces = self
399            .namespaces
400            .into_iter()
401            .map(|(prefix, uri)| {
402                Ok(NamespaceBinding {
403                    prefix: identifier_allow_empty(prefix)?,
404                    uri,
405                })
406            })
407            .collect::<Result<Vec<_>>>()?;
408        Ok(XmpPacket {
409            source_object: self.source_object,
410            bytes: checked_u64_len(self.byte_len, "XMP packet length")?,
411            namespaces,
412            identification,
413            facts: self.facts,
414        })
415    }
416
417    fn read_namespaces(&mut self, element: &quick_xml::events::BytesStart<'_>) -> Result<()> {
418        let mut attributes = 0_usize;
419        for attr in element.attributes().with_checks(true) {
420            let attr = attr.map_err(|error| crate::ProfileError::InvalidXml {
421                reason: bounded_reason(error.to_string()),
422            })?;
423            attributes = attributes.checked_add(1).ok_or(ParseError::LimitExceeded {
424                limit: "max_xmp_attributes",
425            })?;
426            if attributes > self.limits.max_xmp_attributes {
427                return Err(ParseError::LimitExceeded {
428                    limit: "max_xmp_attributes",
429                }
430                .into());
431            }
432            let key = attr.key.as_ref();
433            let prefix = namespace_decl_prefix(key);
434            if let Some(prefix) = prefix {
435                if self.namespaces.len() >= self.limits.max_xmp_namespaces {
436                    return Err(ParseError::LimitExceeded {
437                        limit: "max_xmp_namespaces",
438                    }
439                    .into());
440                }
441                let value = String::from_utf8_lossy(attr.value.as_ref()).into_owned();
442                let value = BoundedText::new(value, 512)?;
443                self.current_namespaces
444                    .insert(prefix.clone(), value.clone());
445                self.namespaces.insert(prefix, value);
446            }
447        }
448        Ok(())
449    }
450
451    fn capture_attr_properties(
452        &mut self,
453        element: &quick_xml::events::BytesStart<'_>,
454    ) -> Result<()> {
455        for attr in element.attributes().with_checks(true) {
456            let attr = attr.map_err(|error| crate::ProfileError::InvalidXml {
457                reason: bounded_reason(error.to_string()),
458            })?;
459            let key = attr.key.as_ref();
460            if namespace_decl_prefix(key).is_some() {
461                continue;
462            }
463            let (prefix, local) = split_xml_name(key)?;
464            let namespace = self.resolve_prefix(&prefix)?;
465            if is_identification_property(&namespace, &local) {
466                let value = String::from_utf8_lossy(attr.value.as_ref()).into_owned();
467                let frame = ElementFrame {
468                    namespace,
469                    local,
470                    text: String::new(),
471                    previous_namespaces: BTreeMap::new(),
472                };
473                self.insert_property(&frame, value.trim())?;
474            }
475        }
476        Ok(())
477    }
478
479    fn insert_property(&mut self, frame: &ElementFrame, value: &str) -> Result<()> {
480        let text = BoundedText::new(value.to_owned(), self.limits.max_xmp_text_bytes)?;
481        self.properties.insert(
482            (
483                frame.namespace.as_str().to_owned(),
484                frame.local.as_str().to_owned(),
485            ),
486            XmpProperty {
487                namespace: frame.namespace.clone(),
488                local: frame.local.clone(),
489                value: text,
490            },
491        );
492        Ok(())
493    }
494
495    fn resolve_prefix(&self, prefix: &Identifier) -> Result<BoundedText> {
496        if prefix.as_str().is_empty() {
497            return Ok(self
498                .current_namespaces
499                .get("")
500                .cloned()
501                .unwrap_or_else(|| BoundedText::unchecked("")));
502        }
503        self.current_namespaces
504            .get(prefix.as_str())
505            .cloned()
506            .ok_or_else(|| {
507                crate::ProfileError::InvalidXml {
508                    reason: BoundedText::new(
509                        format!("unknown XMP namespace prefix {}", prefix.as_str()),
510                        512,
511                    )
512                    .unwrap_or_else(|_| BoundedText::unchecked("unknown XMP namespace prefix")),
513                }
514                .into()
515            })
516    }
517
518    fn property(&self, namespace: &str, local: &str) -> Option<&XmpProperty> {
519        self.properties
520            .get(&(namespace.to_owned(), local.to_owned()))
521    }
522
523    fn claims(&self) -> Result<Vec<FlavourClaim>> {
524        let mut claims = Vec::new();
525        if let Some(claim) = self.pdfa_claim()? {
526            claims.push(claim);
527        }
528        if let Some(claim) = self.pdfua_claim()? {
529            claims.push(claim);
530        }
531        claims.extend(self.wtpdf_claims()?);
532        Ok(claims)
533    }
534
535    fn pdfa_claim(&self) -> Result<Option<FlavourClaim>> {
536        let Some(part) = self.property(PDF_A_ID_NS, "part") else {
537            return Ok(None);
538        };
539        let part_number = parse_nonzero_part(part.value.as_str(), "PDF/A part")?;
540        let conformance = self
541            .property(PDF_A_ID_NS, "conformance")
542            .map_or("none", |property| property.value.as_str())
543            .to_ascii_lowercase();
544        let flavour = ValidationFlavour::new("pdfa", part_number, conformance)?;
545        Ok(Some(claim_from_property(
546            XmpIdentificationKind::PdfA,
547            &flavour,
548            part,
549        )?))
550    }
551
552    fn pdfua_claim(&self) -> Result<Option<FlavourClaim>> {
553        let Some(part) = self.property(PDF_UA_ID_NS, "part") else {
554            return Ok(None);
555        };
556        let part_number = parse_nonzero_part(part.value.as_str(), "PDF/UA part")?;
557        let conformance = if part_number.get() == 2 {
558            "iso32005"
559        } else {
560            "none"
561        };
562        let flavour = ValidationFlavour::new("pdfua", part_number, conformance)?;
563        Ok(Some(claim_from_property(
564            XmpIdentificationKind::PdfUa,
565            &flavour,
566            part,
567        )?))
568    }
569
570    fn wtpdf_claims(&self) -> Result<Vec<FlavourClaim>> {
571        let mut claims = Vec::new();
572        for property in self
573            .properties
574            .values()
575            .filter(|property| property.namespace.as_str() == PDF_D_NS)
576        {
577            let conformance = match property.value.as_str() {
578                WTPDF_ACCESSIBILITY_DECLARATION => Some("accessibility"),
579                WTPDF_REUSE_DECLARATION => Some("reuse"),
580                _ => None,
581            };
582            if let Some(conformance) = conformance {
583                let flavour = ValidationFlavour::new("wtpdf", NonZeroU32::MIN, conformance)?;
584                claims.push(claim_from_property(
585                    XmpIdentificationKind::Wtpdf,
586                    &flavour,
587                    property,
588                )?);
589            }
590        }
591        Ok(claims)
592    }
593}
594
/// State kept for one open XML element.
#[derive(Clone, Debug)]
struct ElementFrame {
    /// Namespace URI the element's prefix resolved to.
    namespace: BoundedText,
    /// Local (unprefixed) element name.
    local: Identifier,
    /// Character data accumulated directly inside the element.
    text: String,
    /// Namespace scope that was active before the element opened; restored when it closes.
    previous_namespaces: BTreeMap<String, BoundedText>,
}
602
/// Identification property captured from an element or attribute.
#[derive(Clone, Debug)]
struct XmpProperty {
    /// Namespace URI of the property.
    namespace: BoundedText,
    /// Local name of the property.
    local: Identifier,
    /// Trimmed property value.
    value: BoundedText,
}
609
/// Locates the catalog's `Metadata` stream and returns its key and decoded bytes.
///
/// Returns `Ok(None)` when any step of the lookup fails: no catalog, catalog
/// object missing or not a dictionary, `Metadata` absent or not an indirect
/// reference, or the referenced object missing. When the referenced object
/// exists but is not a stream, a warning is pushed before returning `Ok(None)`.
///
/// # Errors
///
/// Propagates stream decoding errors and returns a limit error when the
/// decoded bytes exceed `limits.max_xmp_bytes`.
fn catalog_xmp_bytes(
    document: &crate::ParsedDocument,
    limits: &ResourceLimits,
    warnings: &mut Vec<ValidationWarning>,
) -> Result<Option<(ObjectKey, Vec<u8>)>> {
    let Some(catalog_key) = document.catalog else {
        return Ok(None);
    };
    let Some(catalog) = document.objects.get(&catalog_key) else {
        return Ok(None);
    };
    let Some(dictionary) = catalog.object.as_dictionary() else {
        return Ok(None);
    };
    // Only an indirect reference is followed; any other value is treated as absent.
    let Some(CosObject::Reference(metadata_key)) = dictionary.get("Metadata") else {
        return Ok(None);
    };
    let Some(metadata) = document.objects.get(metadata_key) else {
        return Ok(None);
    };
    let CosObject::Stream(stream) = &metadata.object else {
        warnings.push(ValidationWarning::AutoDetection {
            message: BoundedText::unchecked("catalog metadata is not a stream"),
        });
        return Ok(None);
    };
    // Decode with the tighter of the generic stream limit and the XMP byte limit.
    let mut xmp_limits = limits.clone();
    xmp_limits.max_stream_decode_bytes = limits.max_stream_decode_bytes.min(limits.max_xmp_bytes);
    let bytes = stream.decoded_bytes(&xmp_limits)?;
    enforce_xmp_len(bytes.len(), limits.max_xmp_bytes)?;
    Ok(Some((*metadata_key, bytes)))
}
642
643/// Parses catalog XMP metadata without deciding validation profiles.
644///
645/// # Errors
646///
647/// Returns [`PdfvError`] when XMP byte/depth/count limits are exceeded.
648pub(crate) fn parse_document_xmp(
649    document: &crate::ParsedDocument,
650    limits: &ResourceLimits,
651    report_absent_metadata: bool,
652) -> Result<XmpParseResult> {
653    let mut warnings = Vec::new();
654    let Some((object, bytes)) = catalog_xmp_bytes(document, limits, &mut warnings)? else {
655        if report_absent_metadata {
656            warnings.push(ValidationWarning::AutoDetection {
657                message: BoundedText::unchecked("catalog metadata stream is missing"),
658            });
659        }
660        return Ok(XmpParseResult {
661            packet: None,
662            parse_facts: Vec::new(),
663            warnings,
664        });
665    };
666    if document.is_encrypted() && !looks_like_xml(&bytes) {
667        return Ok(XmpParseResult {
668            packet: None,
669            parse_facts: Vec::new(),
670            warnings,
671        });
672    }
673    let parser = XmpParser;
674    match parser.parse_packet(object, &bytes, limits) {
675        Ok(packet) => {
676            warnings.extend(packet_warnings(&packet)?);
677            let parse_facts = xmp_parse_facts(&packet)?;
678            Ok(XmpParseResult {
679                packet: Some(packet),
680                parse_facts,
681                warnings,
682            })
683        }
684        Err(PdfvError::Parse(ParseError::LimitExceeded { limit })) => {
685            Err(ParseError::LimitExceeded { limit }.into())
686        }
687        Err(error) => {
688            let reason = BoundedText::new(error.to_string(), 512)
689                .unwrap_or_else(|_| BoundedText::unchecked("XMP parse failed"));
690            let fact = malformed_fact(reason.clone());
691            let parse_facts = vec![ParseFact::Xmp { object, fact }];
692            warnings.push(ValidationWarning::AutoDetection { message: reason });
693            Ok(XmpParseResult {
694                packet: None,
695                parse_facts,
696                warnings,
697            })
698        }
699    }
700}
701
702fn malformed_fact(reason: BoundedText) -> XmpFact {
703    if reason.as_str().contains("DTD")
704        || reason.as_str().contains("entity")
705        || reason.as_str().contains("forbidden")
706    {
707        XmpFact::HostileXmlRejected { reason }
708    } else {
709        XmpFact::Malformed { reason }
710    }
711}
712
/// Reports whether the first byte that is not ASCII whitespace is `<`.
///
/// Empty or all-whitespace input yields `false`.
fn looks_like_xml(bytes: &[u8]) -> bool {
    let first_significant = bytes.iter().find(|byte| !byte.is_ascii_whitespace());
    matches!(first_significant, Some(&b'<'))
}
720
721fn select_compatible_profiles(
722    profiles: Vec<ValidationProfile>,
723    warnings: &mut Vec<ValidationWarning>,
724) -> Result<Vec<ValidationProfile>> {
725    let Some(first_group) = profiles
726        .first()
727        .map(|profile| compatibility_group(&profile.flavour))
728    else {
729        return Ok(Vec::new());
730    };
731    let mut selected = Vec::new();
732    for profile in profiles {
733        if compatibility_group(&profile.flavour) == first_group {
734            selected.push(profile);
735        } else {
736            warnings.push(ValidationWarning::IncompatibleProfile {
737                profile_id: profile.identity.id,
738                reason: BoundedText::new(
739                    "detected XMP claim is incompatible with the first selected PDF specification \
740                     generation",
741                    256,
742                )?,
743            });
744        }
745    }
746    Ok(selected)
747}
748
749fn compatibility_group(flavour: &ValidationFlavour) -> &'static str {
750    match (flavour.family.as_str(), flavour.part.get()) {
751        ("pdfa", 1..=3) | ("pdfua", 1) => "pdf-1",
752        _ => "pdf-2",
753    }
754}
755
756fn xmp_parse_facts(packet: &XmpPacket) -> Result<Vec<ParseFact>> {
757    packet
758        .facts
759        .iter()
760        .cloned()
761        .map(|fact| {
762            Ok(ParseFact::Xmp {
763                object: packet.source_object,
764                fact,
765            })
766        })
767        .collect()
768}
769
770fn packet_warnings(packet: &XmpPacket) -> Result<Vec<ValidationWarning>> {
771    packet
772        .facts
773        .iter()
774        .filter_map(|fact| match fact {
775            XmpFact::MissingPacketWrapper => Some("XMP packet wrapper is missing"),
776            XmpFact::Malformed { .. } | XmpFact::HostileXmlRejected { .. } => {
777                Some("XMP metadata has parser warnings")
778            }
779            XmpFact::PacketParsed { .. } | XmpFact::FlavourClaim { .. } => None,
780        })
781        .map(|message| {
782            Ok(ValidationWarning::AutoDetection {
783                message: BoundedText::new(message, 256)?,
784            })
785        })
786        .collect()
787}
788
789fn claim_from_property(
790    kind: XmpIdentificationKind,
791    flavour: &ValidationFlavour,
792    property: &XmpProperty,
793) -> Result<FlavourClaim> {
794    Ok(FlavourClaim {
795        kind,
796        flavour: flavour.clone(),
797        display_flavour: display_flavour(flavour)?,
798        namespace_uri: property.namespace.clone(),
799        property: property.local.clone(),
800    })
801}
802
/// Returns the prefix declared by a namespace-declaration attribute name.
///
/// `xmlns` yields the empty prefix, `xmlns:p` yields `p` (decoded lossily as
/// UTF-8), and any other attribute name yields `None`.
fn namespace_decl_prefix(key: &[u8]) -> Option<String> {
    match key {
        b"xmlns" => Some(String::new()),
        _ => {
            let prefix = key.strip_prefix(b"xmlns:")?;
            Some(String::from_utf8_lossy(prefix).into_owned())
        }
    }
}
810
811fn split_xml_name(name: &[u8]) -> Result<(Identifier, Identifier)> {
812    let split = name.iter().position(|byte| *byte == b':');
813    let (prefix, local) = match split {
814        Some(index) => {
815            let prefix = name.get(..index).unwrap_or_default();
816            let local = name.get(index.saturating_add(1)..).unwrap_or_default();
817            (prefix, local)
818        }
819        None => (&[][..], name),
820    };
821    Ok((
822        identifier_allow_empty(String::from_utf8_lossy(prefix).into_owned())?,
823        Identifier::new(String::from_utf8_lossy(local).into_owned())?,
824    ))
825}
826
827fn identifier_allow_empty(value: String) -> Result<Identifier> {
828    if value.is_empty() {
829        Ok(Identifier::unchecked(""))
830    } else {
831        Identifier::new(value).map_err(Into::into)
832    }
833}
834
835fn is_identification_property(namespace: &BoundedText, local: &Identifier) -> bool {
836    matches!(
837        (namespace.as_str(), local.as_str()),
838        (PDF_A_ID_NS, "part" | "conformance" | "rev")
839            | (PDF_UA_ID_NS, "part" | "rev" | "amd" | "corr")
840            | (PDF_D_NS, "conformsTo" | "declarations" | "value" | "li")
841    )
842}
843
844fn parse_nonzero_part(value: &str, field: &'static str) -> Result<NonZeroU32> {
845    let number = value
846        .parse::<u32>()
847        .map_err(|_| crate::ProfileError::InvalidField {
848            field,
849            reason: BoundedText::unchecked("XMP identification part is not numeric"),
850        })?;
851    NonZeroU32::new(number)
852        .ok_or(crate::ProfileError::InvalidField {
853            field,
854            reason: BoundedText::unchecked("XMP identification part is zero"),
855        })
856        .map_err(Into::into)
857}
858
859fn enforce_xmp_len(len: usize, max: u64) -> Result<()> {
860    if checked_u64_len(len, "XMP byte length")? > max {
861        return Err(ParseError::LimitExceeded {
862            limit: "max_xmp_bytes",
863        }
864        .into());
865    }
866    Ok(())
867}
868
869fn checked_u64_len(len: usize, context: &'static str) -> Result<u64> {
870    u64::try_from(len)
871        .map_err(|_| ParseError::ArithmeticOverflow { context })
872        .map_err(Into::into)
873}
874
875fn xmp_xml_error(error: &quick_xml::Error) -> PdfvError {
876    crate::ProfileError::InvalidXml {
877        reason: bounded_reason(error.to_string()),
878    }
879    .into()
880}
881
882fn bounded_reason(value: String) -> BoundedText {
883    BoundedText::new(value, 512).unwrap_or_else(|_| BoundedText::unchecked("XMP XML error"))
884}
885
#[cfg(test)]
mod tests {
    use std::num::NonZeroU32;

    use super::{XmpIdentificationKind, XmpParser};
    use crate::{ObjectKey, ResourceLimits, XmpFact};

    /// Object key used for every packet parsed in these tests.
    fn packet_key() -> ObjectKey {
        ObjectKey::new(NonZeroU32::MIN, 0)
    }

    /// A PDF/A identification bound to the non-conventional `aid` prefix must
    /// produce exactly one `pdfa-2b` claim and a matching flavour-claim fact.
    #[test]
    fn test_should_parse_pdfa_claim_with_namespace_alias() -> crate::Result<()> {
        let xml = br#"<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <rdf:Description xmlns:aid="http://www.aiim.org/pdfa/ns/id/" aid:part="2" aid:conformance="B"/>
  </rdf:RDF>
</x:xmpmeta>"#;
        let limits = ResourceLimits::default();

        let packet = XmpParser.parse_packet(packet_key(), xml, &limits)?;

        assert_eq!(packet.identification.len(), 1);

        let first_flavour = packet
            .identification
            .first()
            .map(|claim| claim.display_flavour.as_str());
        assert_eq!(first_flavour, Some("pdfa-2b"));

        let has_claim_fact = packet.facts.iter().any(|fact| {
            matches!(
                fact,
                XmpFact::FlavourClaim {
                    family,
                    display_flavour,
                    ..
                } if family.as_str() == "pdfa" && display_flavour.as_str() == "pdfa-2b"
            )
        });
        assert!(has_claim_fact);
        Ok(())
    }

    /// A PDF/UA part attribute and a WTPDF `conformsTo` declaration in the same
    /// description must each yield their own claim.
    #[test]
    fn test_should_parse_pdfua_and_wtpdf_claims() -> crate::Result<()> {
        let xml = br#"<x:xmpmeta xmlns:x="adobe:ns:meta/">
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <rdf:Description xmlns:pdfuaid="http://www.aiim.org/pdfua/ns/id/"
                     xmlns:pdfd="http://pdfa.org/declarations/"
                     pdfuaid:part="2">
      <pdfd:conformsTo>http://pdfa.org/declarations/wtpdf#reuse1.0</pdfd:conformsTo>
    </rdf:Description>
  </rdf:RDF>
</x:xmpmeta>"#;
        let limits = ResourceLimits::default();

        let packet = XmpParser.parse_packet(packet_key(), xml, &limits)?;
        let claims_flavour = |wanted: &str| {
            packet
                .identification
                .iter()
                .any(|claim| claim.display_flavour.as_str() == wanted)
        };

        assert!(claims_flavour("pdfua-2-iso32005"));
        assert!(claims_flavour("wtpdf-1-0-reuse"));
        Ok(())
    }

    /// A prefix rebound inside a nested element must revert to the outer
    /// binding once that element closes, so the trailing `id:` properties are
    /// read as PDF/A.
    #[test]
    fn test_should_restore_scoped_namespace_bindings() -> crate::Result<()> {
        let xml = br#"<x:xmpmeta xmlns:x="adobe:ns:meta/">
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <rdf:Description xmlns:id="http://www.aiim.org/pdfa/ns/id/">
      <wrapper xmlns:id="http://www.aiim.org/pdfua/ns/id/">
        <id:part>2</id:part>
      </wrapper>
      <id:part>3</id:part>
      <id:conformance>U</id:conformance>
    </rdf:Description>
  </rdf:RDF>
</x:xmpmeta>"#;
        let limits = ResourceLimits::default();

        let packet = XmpParser.parse_packet(packet_key(), xml, &limits)?;

        let has_pdfa_3u = packet.identification.iter().any(|claim| {
            claim.kind == XmpIdentificationKind::PdfA
                && claim.display_flavour.as_str() == "pdfa-3u"
        });
        assert!(has_pdfa_3u);
        Ok(())
    }

    /// Packets carrying a DOCTYPE with an external entity must be refused.
    #[test]
    fn test_should_reject_xmp_doctype_and_entities() {
        let xml = br#"<!DOCTYPE x [ <!ENTITY ext SYSTEM "file:///etc/passwd"> ]>
<x:xmpmeta xmlns:x="adobe:ns:meta/">&ext;</x:xmpmeta>"#;
        let limits = ResourceLimits::default();

        let outcome = XmpParser.parse_packet(packet_key(), xml, &limits);

        assert!(outcome.is_err());
    }

    /// A packet longer than `max_xmp_bytes` must fail with the matching limit error.
    #[test]
    fn test_should_enforce_xmp_byte_cap() {
        let tight_limits = ResourceLimits {
            max_xmp_bytes: 8,
            ..ResourceLimits::default()
        };

        let outcome = XmpParser.parse_packet(packet_key(), b"<x:xmpmeta/>", &tight_limits);

        let hit_byte_cap = matches!(
            outcome,
            Err(crate::PdfvError::Parse(crate::ParseError::LimitExceeded {
                limit: "max_xmp_bytes"
            }))
        );
        assert!(hit_byte_cap);
    }
}