marque_extract/metadata.rs
1//! Metadata extraction and sanitization.
2//!
3//! Surfaces sensitive metadata that document authors are typically unaware of:
4//! author identity, revision history, tracked changes, embedded image EXIF,
5//! template source paths, software version strings, GPS coordinates.
6
7use serde::{Deserialize, Serialize};
8
9/// Complete metadata report for a document.
10#[derive(Debug, Clone, Default, Serialize, Deserialize)]
11pub struct MetadataReport {
12 pub fields: Vec<MetadataField>,
13 pub warnings: Vec<MetadataWarning>,
14}
15
16impl MetadataReport {
17 pub fn has_warnings(&self) -> bool {
18 !self.warnings.is_empty()
19 }
20}
21
22/// A single extracted metadata field.
23#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct MetadataField {
25 pub category: MetadataCategory,
26 pub key: String,
27 pub value: String,
28}
29
30/// Category of metadata field.
31#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
32pub enum MetadataCategory {
33 /// Document properties: author, company, title, subject, keywords.
34 DocumentProperties,
35 /// Revision history, tracked changes, comments with author attribution.
36 RevisionHistory,
37 /// EXIF data from embedded images (GPS, device, timestamp).
38 ImageExif,
39 /// XMP metadata embedded in the document.
40 Xmp,
41 /// Template or base document path — can reveal internal paths or systems.
42 TemplateReference,
43 /// Software and version strings (reveals toolchain).
44 Software,
45 /// Custom/application-defined properties.
46 Custom,
47}
48
49/// A metadata warning — fields that may expose sensitive information.
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct MetadataWarning {
52 pub field: MetadataField,
53 pub severity: WarningSeverity,
54 pub reason: String,
55 /// Whether this field can be automatically stripped.
56 pub strippable: bool,
57}
58
59#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
60pub enum WarningSeverity {
61 /// Low-sensitivity metadata (title, keywords).
62 Info,
63 /// Potentially sensitive (author name, company, software version).
64 Warn,
65 /// High sensitivity — GPS coordinates, revision history with PII.
66 High,
67}