Skip to main content

cdx_core/document/
verification.rs

1use crate::content::{Block, Text};
2use crate::{DocumentId, Hasher, Result};
3
4use super::Document;
5
6impl Document {
7    /// Compute the document ID from content and identity metadata.
8    ///
9    /// Per spec §06 §4.1, the document ID is computed by hashing the canonicalized
10    /// semantic identity of the document. This includes:
11    ///
12    /// - **Content blocks** (the document's structural content)
13    /// - **Identity metadata**: title, creator, subject, description, language
14    ///
15    /// The hash explicitly **excludes** presentation layers, signatures, phantom
16    /// data, form data, and collaboration data — these are non-identity concerns
17    /// with their own integrity mechanisms.
18    ///
19    /// # Errors
20    ///
21    /// Returns an error if canonicalization fails.
22    pub fn compute_id(&self) -> Result<DocumentId> {
23        // Build a hashable structure combining content + identity metadata.
24        // Per spec §06 §4.1, the hash boundary includes content blocks and
25        // the subset of Dublin Core metadata that defines document identity.
26        let content_value = serde_json::to_value(&self.content)?;
27        let metadata_value = serde_json::json!({
28            "title": self.dublin_core.terms.title,
29            "creator": serde_json::to_value(&self.dublin_core.terms.creator)?,
30            "subject": serde_json::to_value(&self.dublin_core.terms.subject)?,
31            "description": self.dublin_core.terms.description,
32            "language": self.dublin_core.terms.language,
33        });
34
35        let hashable = serde_json::json!({
36            "content": content_value,
37            "metadata": metadata_value,
38        });
39
40        let canonical = json_canon::to_string(&hashable)?;
41
42        Ok(Hasher::hash(
43            self.manifest.hash_algorithm,
44            canonical.as_bytes(),
45        ))
46    }
47
48    /// Verify the document integrity.
49    ///
50    /// This checks:
51    /// - Content hash matches manifest
52    /// - Document ID is valid (if not pending)
53    ///
54    /// # Errors
55    ///
56    /// Returns an error if verification fails.
57    pub fn verify(&self) -> Result<VerificationReport> {
58        let mut report = VerificationReport {
59            content_valid: true,
60            id_valid: true,
61            errors: Vec::new(),
62        };
63
64        // Verify content hash
65        // Note: must use to_vec_pretty to match what write_to uses
66        if !self.manifest.content.hash.is_pending() {
67            let content_json = serde_json::to_vec_pretty(&self.content)?;
68            let actual_hash = Hasher::hash(self.manifest.content.hash.algorithm(), &content_json);
69
70            if actual_hash != self.manifest.content.hash {
71                report.content_valid = false;
72                report.errors.push(format!(
73                    "Content hash mismatch: expected {}, got {}",
74                    self.manifest.content.hash, actual_hash
75                ));
76            }
77        }
78
79        // Verify document ID
80        if !self.manifest.id.is_pending() {
81            let computed_id = self.compute_id()?;
82            if computed_id != self.manifest.id {
83                report.id_valid = false;
84                report.errors.push(format!(
85                    "Document ID mismatch: expected {}, got {}",
86                    self.manifest.id, computed_id
87                ));
88            }
89        }
90
91        Ok(report)
92    }
93
94    /// Validate extension declarations.
95    ///
96    /// This checks that all extension namespaces used in the document's content
97    /// (blocks and marks) are declared in the manifest's extensions list.
98    ///
99    /// # Returns
100    ///
101    /// An `ExtensionValidationReport` containing:
102    /// - List of used extension namespaces
103    /// - List of declared extension namespaces
104    /// - List of undeclared (used but not declared) namespaces
105    /// - Warnings for any issues found
106    #[must_use]
107    pub fn validate_extensions(&self) -> ExtensionValidationReport {
108        // Collect declared namespaces
109        let declared_namespaces: Vec<String> = self
110            .manifest
111            .extensions
112            .iter()
113            .map(|e| e.namespace().to_string())
114            .collect();
115
116        // Collect used namespaces from content
117        let mut used = std::collections::HashSet::new();
118        Self::collect_extension_namespaces(&self.content.blocks, &mut used);
119
120        let mut used_namespaces: Vec<String> = used.iter().cloned().collect();
121        used_namespaces.sort();
122
123        // Find undeclared namespaces
124        let mut undeclared = Vec::new();
125        let mut warnings = Vec::new();
126        for namespace in &used_namespaces {
127            if !self.manifest.has_extension(namespace) {
128                undeclared.push(namespace.clone());
129                warnings.push(format!(
130                    "Extension namespace '{namespace}' is used but not declared in manifest"
131                ));
132            }
133        }
134
135        ExtensionValidationReport {
136            used_namespaces,
137            declared_namespaces,
138            undeclared,
139            unsupported_required: Vec::new(),
140            warnings,
141        }
142    }
143
144    /// Recursively collect extension namespaces from blocks.
145    fn collect_extension_namespaces(
146        blocks: &[Block],
147        namespaces: &mut std::collections::HashSet<String>,
148    ) {
149        for block in blocks {
150            // Check if this is an extension block
151            if let Some(ext) = block.as_extension() {
152                namespaces.insert(ext.namespace.clone());
153            }
154
155            // Recursively check children and collect marks from text nodes
156            match block {
157                Block::Paragraph { children, .. }
158                | Block::Heading { children, .. }
159                | Block::CodeBlock { children, .. }
160                | Block::DefinitionTerm { children, .. } => {
161                    Self::collect_marks_namespaces(children, namespaces);
162                }
163                Block::List { children, .. }
164                | Block::ListItem { children, .. }
165                | Block::Blockquote { children, .. }
166                | Block::Table { children, .. }
167                | Block::TableRow { children, .. }
168                | Block::DefinitionItem { children, .. }
169                | Block::DefinitionDescription { children, .. } => {
170                    Self::collect_extension_namespaces(children, namespaces);
171                }
172                Block::DefinitionList(dl) => {
173                    Self::collect_extension_namespaces(&dl.children, namespaces);
174                }
175                Block::TableCell(cell) => {
176                    Self::collect_marks_namespaces(&cell.children, namespaces);
177                }
178                Block::Figure(fig) => {
179                    Self::collect_extension_namespaces(&fig.children, namespaces);
180                }
181                Block::FigCaption(fc) => {
182                    Self::collect_marks_namespaces(&fc.children, namespaces);
183                }
184                Block::Admonition(adm) => {
185                    Self::collect_extension_namespaces(&adm.children, namespaces);
186                }
187                Block::Extension(ext) => {
188                    // Already handled above, but also check children
189                    Self::collect_extension_namespaces(&ext.children, namespaces);
190                }
191                // Leaf blocks without children
192                Block::HorizontalRule { .. }
193                | Block::Image(_)
194                | Block::Math(_)
195                | Block::Break { .. }
196                | Block::Measurement(_)
197                | Block::Signature(_)
198                | Block::Svg(_)
199                | Block::Barcode(_) => {}
200            }
201        }
202    }
203
204    /// Collect extension namespaces from text marks.
205    fn collect_marks_namespaces(
206        texts: &[Text],
207        namespaces: &mut std::collections::HashSet<String>,
208    ) {
209        for text in texts {
210            for mark in &text.marks {
211                if let Some(ext) = mark.as_extension() {
212                    namespaces.insert(ext.namespace.clone());
213                }
214            }
215        }
216    }
217}
218
/// Outcome of verifying a document against its manifest.
#[derive(Debug, Clone)]
pub struct VerificationReport {
    /// `true` unless the content hash check found a mismatch.
    pub content_valid: bool,
    /// `true` unless the document ID check found a mismatch.
    pub id_valid: bool,
    /// Human-readable descriptions of the problems that were found.
    pub errors: Vec<String>,
}

impl VerificationReport {
    /// Returns `true` only when both validity flags are set and no error
    /// messages were recorded.
    #[must_use]
    pub fn is_valid(&self) -> bool {
        matches!(
            self,
            Self {
                content_valid: true,
                id_valid: true,
                errors,
            } if errors.is_empty()
        )
    }
}
237
/// Outcome of validating extension declarations.
///
/// Lists the extension namespaces the document content uses, the ones the
/// manifest declares, and the ones that are used without being declared.
#[derive(Debug, Clone, Default)]
pub struct ExtensionValidationReport {
    /// Namespaces referenced by content (blocks and marks).
    pub used_namespaces: Vec<String>,
    /// Namespaces listed in the manifest's extensions.
    pub declared_namespaces: Vec<String>,
    /// Namespaces that are used in content but missing from the manifest.
    pub undeclared: Vec<String>,
    /// Namespaces declared as required that this reader does not support.
    /// (Currently empty since we support all built-in extensions)
    pub unsupported_required: Vec<String>,
    /// Human-readable warning messages.
    pub warnings: Vec<String>,
}

impl ExtensionValidationReport {
    /// Returns `true` when there are no undeclared namespaces and no
    /// unsupported required extensions.
    ///
    /// The `warnings` list is not consulted; use
    /// [`has_warnings`](Self::has_warnings) for that.
    #[must_use]
    pub fn is_valid(&self) -> bool {
        let blocking = [&self.undeclared, &self.unsupported_required];
        blocking.iter().all(|list| list.is_empty())
    }

    /// Returns `true` when at least one warning message was recorded.
    #[must_use]
    pub fn has_warnings(&self) -> bool {
        !self.warnings.is_empty()
    }
}