1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
use crate::content::{Block, Text};
use crate::{DocumentId, Hasher, Result};
use super::Document;
impl Document {
/// Derive the document's identifier from its semantic identity.
///
/// Following spec §06 §4.1, the hashed value is a JSON object with two
/// members:
///
/// - `content`: the serialized document content
/// - `metadata`: the Dublin Core terms `title`, `creator`, `subject`,
///   `description` and `language`
///
/// That object is canonicalized with `json_canon` and the resulting bytes are
/// hashed using the manifest's `hash_algorithm`. Nothing else is fed into the
/// hash: per the spec, presentation layers, signatures, phantom data, form
/// data and collaboration data are non-identity concerns with their own
/// integrity mechanisms.
///
/// # Errors
///
/// Returns an error if serializing the content, creator or subject fails, or
/// if canonicalization fails.
pub fn compute_id(&self) -> Result<DocumentId> {
    let terms = &self.dublin_core.terms;

    // Members are written in the order they are evaluated, so fallible
    // serialization steps run (and may bail out via `?`) top to bottom.
    let identity = serde_json::json!({
        "content": serde_json::to_value(&self.content)?,
        "metadata": {
            "title": terms.title,
            "creator": serde_json::to_value(&terms.creator)?,
            "subject": serde_json::to_value(&terms.subject)?,
            "description": terms.description,
            "language": terms.language,
        },
    });

    // Canonical form makes the hash independent of key order and whitespace.
    let canonical_json = json_canon::to_string(&identity)?;
    let id = Hasher::hash(self.manifest.hash_algorithm, canonical_json.as_bytes());
    Ok(id)
}
/// Check the document against the hashes recorded in its manifest.
///
/// Two independent checks are performed, each skipped when the recorded
/// value is still pending:
///
/// - the hash of the pretty-printed content JSON is compared with the
///   manifest's content hash
/// - the ID produced by [`Document::compute_id`] is compared with the
///   manifest's document ID
///
/// A mismatch does not abort verification; it clears the corresponding flag
/// in the report and appends a human-readable message to `errors`.
///
/// # Errors
///
/// Returns an error if serializing the content or computing the document ID
/// fails.
pub fn verify(&self) -> Result<VerificationReport> {
    let mut errors = Vec::new();

    // The content must be serialized with `to_vec_pretty`, because that is
    // the form `write_to` hashes when the manifest is produced.
    let content_valid = self.manifest.content.hash.is_pending() || {
        let serialized = serde_json::to_vec_pretty(&self.content)?;
        let recomputed = Hasher::hash(self.manifest.content.hash.algorithm(), &serialized);
        let mismatch = recomputed != self.manifest.content.hash;
        if mismatch {
            errors.push(format!(
                "Content hash mismatch: expected {}, got {}",
                self.manifest.content.hash, recomputed
            ));
        }
        !mismatch
    };

    let id_valid = self.manifest.id.is_pending() || {
        let recomputed = self.compute_id()?;
        let mismatch = recomputed != self.manifest.id;
        if mismatch {
            errors.push(format!(
                "Document ID mismatch: expected {}, got {}",
                self.manifest.id, recomputed
            ));
        }
        !mismatch
    };

    Ok(VerificationReport {
        content_valid,
        id_valid,
        errors,
    })
}
/// Cross-check the extensions used by the content against the manifest.
///
/// Every extension namespace referenced by a block or by a text mark in the
/// document content is gathered and compared with the extensions declared in
/// the manifest.
///
/// # Returns
///
/// An `ExtensionValidationReport` holding:
/// - the namespaces used in the content, sorted
/// - the namespaces declared in the manifest
/// - the namespaces that are used but not declared
/// - one warning message per undeclared namespace
///
/// `unsupported_required` is always returned empty by this method.
#[must_use]
pub fn validate_extensions(&self) -> ExtensionValidationReport {
    // Gather the distinct namespaces referenced anywhere in the content and
    // put them in a stable order for reporting.
    let mut seen = std::collections::HashSet::new();
    Self::collect_extension_namespaces(&self.content.blocks, &mut seen);
    let mut used_namespaces: Vec<String> = seen.into_iter().collect();
    used_namespaces.sort();

    // Anything used that the manifest does not know about is undeclared.
    let undeclared: Vec<String> = used_namespaces
        .iter()
        .filter(|namespace| !self.manifest.has_extension(*namespace))
        .cloned()
        .collect();

    let warnings: Vec<String> = undeclared
        .iter()
        .map(|namespace| {
            format!("Extension namespace '{namespace}' is used but not declared in manifest")
        })
        .collect();

    let declared_namespaces: Vec<String> = self
        .manifest
        .extensions
        .iter()
        .map(|extension| extension.namespace().to_string())
        .collect();

    ExtensionValidationReport {
        used_namespaces,
        declared_namespaces,
        undeclared,
        unsupported_required: Vec::new(),
        warnings,
    }
}
/// Walk a block tree and record every extension namespace it references.
///
/// For each block, the namespace reported by `as_extension` (if any) is
/// added to `namespaces`. Blocks that hold text contribute the namespaces of
/// their extension marks; blocks that hold nested blocks are descended into
/// recursively.
fn collect_extension_namespaces(
    blocks: &[Block],
    namespaces: &mut std::collections::HashSet<String>,
) {
    for block in blocks {
        // The block itself may be an extension block.
        namespaces.extend(block.as_extension().map(|ext| ext.namespace.clone()));

        match block {
            // Leaves: nothing nested to inspect.
            Block::HorizontalRule { .. }
            | Block::Image(_)
            | Block::Math(_)
            | Block::Break { .. }
            | Block::Measurement(_)
            | Block::Signature(_)
            | Block::Svg(_)
            | Block::Barcode(_) => {}

            // Text containers: only the marks on their text nodes matter.
            Block::Paragraph { children, .. }
            | Block::Heading { children, .. }
            | Block::CodeBlock { children, .. }
            | Block::DefinitionTerm { children, .. } => {
                Self::collect_marks_namespaces(children, namespaces);
            }
            Block::TableCell(table_cell) => {
                Self::collect_marks_namespaces(&table_cell.children, namespaces);
            }
            Block::FigCaption(caption) => {
                Self::collect_marks_namespaces(&caption.children, namespaces);
            }

            // Block containers: descend into the nested blocks.
            Block::List { children, .. }
            | Block::ListItem { children, .. }
            | Block::Blockquote { children, .. }
            | Block::Table { children, .. }
            | Block::TableRow { children, .. }
            | Block::DefinitionItem { children, .. }
            | Block::DefinitionDescription { children, .. } => {
                Self::collect_extension_namespaces(children, namespaces);
            }
            Block::DefinitionList(definition_list) => {
                Self::collect_extension_namespaces(&definition_list.children, namespaces);
            }
            Block::Figure(figure) => {
                Self::collect_extension_namespaces(&figure.children, namespaces);
            }
            Block::Admonition(admonition) => {
                Self::collect_extension_namespaces(&admonition.children, namespaces);
            }
            Block::Extension(extension) => {
                // The namespace was recorded before the match; the nested
                // blocks still have to be visited.
                Self::collect_extension_namespaces(&extension.children, namespaces);
            }
        }
    }
}
/// Record the namespace of every extension mark attached to the given text nodes.
///
/// Marks for which `as_extension` yields nothing are ignored.
fn collect_marks_namespaces(
    texts: &[Text],
    namespaces: &mut std::collections::HashSet<String>,
) {
    let mark_namespaces = texts
        .iter()
        .flat_map(|text| &text.marks)
        .filter_map(|mark| mark.as_extension().map(|ext| ext.namespace.clone()));
    namespaces.extend(mark_namespaces);
}
}
/// Outcome of verifying a document against its manifest.
#[derive(Debug, Clone)]
pub struct VerificationReport {
    /// `true` when the content hash check did not fail.
    pub content_valid: bool,
    /// `true` when the document ID check did not fail.
    pub id_valid: bool,
    /// Human-readable descriptions of the problems that were found.
    pub errors: Vec<String>,
}

impl VerificationReport {
    /// Returns `true` only if both validity flags are set and no error
    /// message has been recorded.
    #[must_use]
    pub fn is_valid(&self) -> bool {
        matches!(
            self,
            Self {
                content_valid: true,
                id_valid: true,
                errors,
            } if errors.is_empty()
        )
    }
}
/// Outcome of checking extension usage against the manifest.
///
/// The report lists the extension namespaces found in the document content
/// next to those declared in the manifest, and singles out the ones that are
/// used without being declared.
#[derive(Debug, Clone, Default)]
pub struct ExtensionValidationReport {
    /// Namespaces referenced by the content (blocks and marks).
    pub used_namespaces: Vec<String>,
    /// Namespaces listed in the manifest's extension declarations.
    pub declared_namespaces: Vec<String>,
    /// Namespaces that appear in the content but not in the manifest.
    pub undeclared: Vec<String>,
    /// Namespaces declared as required that this reader does not support.
    /// (Currently empty since we support all built-in extensions)
    pub unsupported_required: Vec<String>,
    /// Human-readable warnings.
    pub warnings: Vec<String>,
}

impl ExtensionValidationReport {
    /// Returns `true` when there are no undeclared namespaces and no
    /// unsupported required extensions.
    ///
    /// The `warnings` list is not consulted; use [`Self::has_warnings`] for that.
    #[must_use]
    pub fn is_valid(&self) -> bool {
        [&self.undeclared, &self.unsupported_required]
            .iter()
            .all(|problems| problems.is_empty())
    }

    /// Returns `true` when at least one warning message was recorded.
    #[must_use]
    pub fn has_warnings(&self) -> bool {
        self.warnings.first().is_some()
    }
}