Skip to main content

marque_extract/
metadata.rs

// SPDX-FileCopyrightText: 2026 Knitli Inc.
//
// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0

//! Metadata extraction and sanitization.
//!
//! Surfaces sensitive metadata that document authors are typically unaware of:
//! author identity, revision history, tracked changes, embedded image EXIF,
//! template source paths, software version strings, and GPS coordinates.
10
11use serde::{Deserialize, Serialize};
12
13/// Complete metadata report for a document.
14#[derive(Debug, Clone, Default, Serialize, Deserialize)]
15pub struct MetadataReport {
16    pub fields: Vec<MetadataField>,
17    pub warnings: Vec<MetadataWarning>,
18}
19
20impl MetadataReport {
21    pub fn has_warnings(&self) -> bool {
22        !self.warnings.is_empty()
23    }
24}
25
26/// A single extracted metadata field.
27#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct MetadataField {
29    pub category: MetadataCategory,
30    pub key: String,
31    pub value: String,
32}
33
34/// Category of metadata field.
35#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
36pub enum MetadataCategory {
37    /// Document properties: author, company, title, subject, keywords.
38    DocumentProperties,
39    /// Revision history, tracked changes, comments with author attribution.
40    RevisionHistory,
41    /// EXIF data from embedded images (GPS, device, timestamp).
42    ImageExif,
43    /// XMP metadata embedded in the document.
44    Xmp,
45    /// Template or base document path — can reveal internal paths or systems.
46    TemplateReference,
47    /// Software and version strings (reveals toolchain).
48    Software,
49    /// Custom/application-defined properties.
50    Custom,
51}
52
53/// A metadata warning — fields that may expose sensitive information.
54#[derive(Debug, Clone, Serialize, Deserialize)]
55pub struct MetadataWarning {
56    pub field: MetadataField,
57    pub severity: WarningSeverity,
58    pub reason: String,
59    /// Whether this field can be automatically stripped.
60    pub strippable: bool,
61}
62
63#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
64pub enum WarningSeverity {
65    /// Low-sensitivity metadata (title, keywords).
66    Info,
67    /// Potentially sensitive (author name, company, software version).
68    Warn,
69    /// High sensitivity — GPS coordinates, revision history with PII.
70    High,
71}
72
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    /// A default-constructed report carries no warnings.
    #[test]
    fn metadata_report_has_warnings_empty() {
        assert!(!MetadataReport::default().has_warnings());
    }

    /// A report holding a single warning says that it has warnings.
    #[test]
    fn metadata_report_has_warnings_populated() {
        let author = MetadataField {
            category: MetadataCategory::DocumentProperties,
            key: String::from("Author"),
            value: String::from("John Doe"),
        };
        let report = MetadataReport {
            warnings: vec![MetadataWarning {
                field: author,
                severity: WarningSeverity::Warn,
                reason: String::from("Reveals author identity"),
                strippable: true,
            }],
            // `fields` stays empty, exactly as a default report has it.
            ..MetadataReport::default()
        };
        assert!(report.has_warnings());
    }
}