Skip to main content

marque_extract/
metadata.rs

//! Metadata extraction and sanitization.
//!
//! Surfaces sensitive metadata that document authors are typically unaware of:
//! author identity, revision history, tracked changes, embedded image EXIF,
//! template source paths, software version strings, and GPS coordinates.
6
7use serde::{Deserialize, Serialize};
8
9/// Complete metadata report for a document.
10#[derive(Debug, Clone, Default, Serialize, Deserialize)]
11pub struct MetadataReport {
12    pub fields: Vec<MetadataField>,
13    pub warnings: Vec<MetadataWarning>,
14}
15
16impl MetadataReport {
17    pub fn has_warnings(&self) -> bool {
18        !self.warnings.is_empty()
19    }
20}
21
22/// A single extracted metadata field.
23#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct MetadataField {
25    pub category: MetadataCategory,
26    pub key: String,
27    pub value: String,
28}
29
30/// Category of metadata field.
31#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
32pub enum MetadataCategory {
33    /// Document properties: author, company, title, subject, keywords.
34    DocumentProperties,
35    /// Revision history, tracked changes, comments with author attribution.
36    RevisionHistory,
37    /// EXIF data from embedded images (GPS, device, timestamp).
38    ImageExif,
39    /// XMP metadata embedded in the document.
40    Xmp,
41    /// Template or base document path — can reveal internal paths or systems.
42    TemplateReference,
43    /// Software and version strings (reveals toolchain).
44    Software,
45    /// Custom/application-defined properties.
46    Custom,
47}
48
49/// A metadata warning — fields that may expose sensitive information.
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct MetadataWarning {
52    pub field: MetadataField,
53    pub severity: WarningSeverity,
54    pub reason: String,
55    /// Whether this field can be automatically stripped.
56    pub strippable: bool,
57}
58
59#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
60pub enum WarningSeverity {
61    /// Low-sensitivity metadata (title, keywords).
62    Info,
63    /// Potentially sensitive (author name, company, software version).
64    Warn,
65    /// High sensitivity — GPS coordinates, revision history with PII.
66    High,
67}