//! spdfdiff_types 0.1.1-preview.7
//!
//! Shared semantic PDF diff types, diagnostics, provenance, and limits.
//!
//! Documentation
use serde::{Deserialize, Serialize};
use thiserror::Error;

/// Version string of the diff output schema.
///
/// [`DiffDocument::empty`] copies this value into `DiffDocument::schema_version`.
pub const DIFF_SCHEMA_VERSION: &str = "0.1.0";

/// Error type defined alongside the shared diff types.
///
/// Every variant carries a free-form detail string that is appended to the fixed
/// message prefix given in its `#[error(...)]` attribute.
#[derive(Debug, Error, Clone, PartialEq, Eq)]
pub enum PdfDiffError {
    /// A configured resource limit was exceeded.
    ///
    /// Returned by [`ResourceLimits::check_file_size`] when the input is too large.
    #[error("input exceeds configured resource limit: {0}")]
    ResourceLimitExceeded(String),
    /// The input is not a supported PDF; the payload describes why.
    #[error("input is not a supported PDF: {0}")]
    UnsupportedPdf(String),
    /// The input was rejected as invalid; the payload describes why.
    #[error("invalid input: {0}")]
    InvalidInput(String),
    /// An internal invariant did not hold; the payload names the invariant.
    #[error("internal invariant failed: {0}")]
    InternalInvariant(String),
}

/// Identifier made of a `number` and a `generation`.
///
/// The derived ordering compares `number` first and `generation` second.
///
/// NOTE(review): presumably a PDF indirect-object reference (object number plus
/// generation number) — confirm against the code that produces these values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub struct ObjectId {
    /// First component of the identifier.
    pub number: u32,
    /// Second component of the identifier.
    pub generation: u16,
}

/// A `start`/`end` pair of byte offsets.
///
/// NOTE(review): whether `end` is inclusive or exclusive, and which buffer the
/// offsets index into, is not visible here — confirm with the producer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct ByteRange {
    /// Offset where the range begins.
    pub start: usize,
    /// Offset where the range ends.
    pub end: usize,
}

impl ByteRange {
    /// Creates a range from the two given offsets.
    ///
    /// The values are stored exactly as passed; `start <= end` is not checked.
    #[must_use]
    pub const fn new(start: usize, end: usize) -> Self {
        Self { start, end }
    }
}

/// Marks which side of a comparison a value belongs to.
///
/// NOTE(review): presumably `Old` is the baseline document and `New` the revised
/// one — confirm with the diff driver.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FileRole {
    /// The "old" input.
    Old,
    /// The "new" input.
    New,
}

/// Optional pointers back to where a piece of extracted data came from.
///
/// Every field is optional; [`Provenance::unknown`] returns a value with all of
/// them unset.
///
/// NOTE(review): the per-field descriptions are inferred from the field names and
/// types only — confirm against the code that fills them in.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Provenance {
    /// Which compared file the data belongs to, if known.
    pub file_role: Option<FileRole>,
    /// Object the data is associated with, if known.
    pub object_id: Option<ObjectId>,
    /// Page index, if known. NOTE(review): zero- vs one-based is not shown here.
    pub page_index: Option<usize>,
    /// Stream object involved, if known (presumably a content stream — confirm).
    pub stream_object_id: Option<ObjectId>,
    /// Index of a content operation, if known (presumably within that stream — confirm).
    pub content_op_index: Option<usize>,
    /// Byte span of the source data, if known.
    pub byte_range: Option<ByteRange>,
}

impl Provenance {
    /// Returns a provenance record in which every field is `None`.
    ///
    /// Usable in `const` contexts.
    #[must_use]
    pub const fn unknown() -> Self {
        Self {
            file_role: None,
            object_id: None,
            page_index: None,
            stream_object_id: None,
            content_op_index: None,
            byte_range: None,
        }
    }
}

/// A 2-D point with `f32` coordinates.
///
/// Only `PartialEq` is derived (no `Eq`) because the fields are floats.
/// NOTE(review): the coordinate space and units are not established in this file.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct Point {
    /// Horizontal coordinate.
    pub x: f32,
    /// Vertical coordinate.
    pub y: f32,
}

/// Axis-aligned rectangle stored as two `x` and two `y` coordinates.
///
/// [`Rect::width`] is `x1 - x0` and [`Rect::height`] is `y1 - y0`; nothing in this
/// type enforces `x0 <= x1` or `y0 <= y1`.
/// NOTE(review): the coordinate space and units are not established in this file.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct Rect {
    /// First `x` coordinate (subtracted from `x1` to get the width).
    pub x0: f32,
    /// First `y` coordinate (subtracted from `y1` to get the height).
    pub y0: f32,
    /// Second `x` coordinate.
    pub x1: f32,
    /// Second `y` coordinate.
    pub y1: f32,
}

impl Rect {
    /// Horizontal extent, computed as `x1 - x0`.
    ///
    /// The result is signed: it is negative when `x1` is smaller than `x0`.
    #[must_use]
    pub fn width(self) -> f32 {
        let Self { x0, x1, .. } = self;
        x1 - x0
    }

    /// Vertical extent, computed as `y1 - y0`.
    ///
    /// The result is signed: it is negative when `y1` is smaller than `y0`.
    #[must_use]
    pub fn height(self) -> f32 {
        let Self { y0, y1, .. } = self;
        y1 - y0
    }
}

/// Six `f32` coefficients named `a` through `f`.
///
/// NOTE(review): presumably the PDF-style 2-D affine transform `[a b c d e f]`
/// — confirm against the code that applies it.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct Matrix {
    /// Coefficient `a` (1 in [`Matrix::IDENTITY`]).
    pub a: f32,
    /// Coefficient `b` (0 in [`Matrix::IDENTITY`]).
    pub b: f32,
    /// Coefficient `c` (0 in [`Matrix::IDENTITY`]).
    pub c: f32,
    /// Coefficient `d` (1 in [`Matrix::IDENTITY`]).
    pub d: f32,
    /// Coefficient `e` (0 in [`Matrix::IDENTITY`]).
    pub e: f32,
    /// Coefficient `f` (0 in [`Matrix::IDENTITY`]).
    pub f: f32,
}

impl Matrix {
    /// Matrix whose `a` and `d` coefficients are 1 and whose other coefficients are 0.
    pub const IDENTITY: Self = Self {
        a: 1.0,
        b: 0.0,
        c: 0.0,
        d: 1.0,
        e: 0.0,
        f: 0.0,
    };
}

/// A straight segment described by its two end [`Point`]s.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct LineSegment {
    /// Point where the segment begins.
    pub start: Point,
    /// Point where the segment ends.
    pub end: Point,
}

/// Severity level attached to a [`Diagnostic`].
///
/// No ordering traits are derived, so levels can be tested for equality but not
/// compared with `<` / `>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DiagnosticSeverity {
    /// Informational message; set by [`Diagnostic::info`].
    Info,
    /// Warning; set by [`Diagnostic::warning`].
    Warning,
    /// Error; set by [`Diagnostic::error`].
    Error,
}

/// A message with a severity, a code, and an optional location.
///
/// The constructors on this type leave `object` and `page_index` as `None`;
/// callers set those public fields directly when a location is available.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Diagnostic {
    /// How serious the diagnostic is.
    pub severity: DiagnosticSeverity,
    /// Identifier for the kind of diagnostic (free-form string).
    pub code: String,
    /// Human-readable description.
    pub message: String,
    /// Object the diagnostic refers to, if any.
    pub object: Option<ObjectId>,
    /// Page the diagnostic refers to, if any.
    /// NOTE(review): zero- vs one-based indexing is not shown here — confirm.
    pub page_index: Option<usize>,
}

impl Diagnostic {
    #[must_use]
    pub fn new(
        severity: DiagnosticSeverity,
        code: impl Into<String>,
        message: impl Into<String>,
    ) -> Self {
        Self {
            severity,
            code: code.into(),
            message: message.into(),
            object: None,
            page_index: None,
        }
    }

    #[must_use]
    pub fn info(code: impl Into<String>, message: impl Into<String>) -> Self {
        Self::new(DiagnosticSeverity::Info, code, message)
    }

    #[must_use]
    pub fn warning(code: impl Into<String>, message: impl Into<String>) -> Self {
        Self::new(DiagnosticSeverity::Warning, code, message)
    }

    #[must_use]
    pub fn error(code: impl Into<String>, message: impl Into<String>) -> Self {
        Self::new(DiagnosticSeverity::Error, code, message)
    }
}

/// Upper bounds applied to inputs.
///
/// The `Default` implementation supplies a value for every field. Of these limits,
/// only `max_file_bytes` is enforced by code visible in this file
/// ([`ResourceLimits::check_file_size`]).
/// NOTE(review): the remaining limits are presumably checked by the consuming
/// parser — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct ResourceLimits {
    /// Largest accepted input size in bytes; a length equal to the limit is accepted.
    pub max_file_bytes: usize,
    /// Maximum number of objects.
    pub max_objects: usize,
    /// Maximum indirection depth.
    pub max_indirect_depth: usize,
    /// Maximum size of a stream in bytes (presumably before decoding — confirm).
    pub max_stream_bytes: usize,
    /// Maximum size of a decoded stream in bytes.
    pub max_decoded_stream_bytes: usize,
    /// Maximum number of content operations on a single page.
    pub max_content_ops_per_page: usize,
    /// Maximum number of pages.
    pub max_pages: usize,
}

impl Default for ResourceLimits {
    /// Returns the built-in limits: 100 MiB file, 250 000 objects, indirection depth
    /// 64, 50 MiB stream, 200 MiB decoded stream, 1 000 000 content operations per
    /// page, and 10 000 pages.
    fn default() -> Self {
        // One mebibyte, so the byte-sized limits read in their natural unit.
        const MIB: usize = 1024 * 1024;

        Self {
            max_file_bytes: 100 * MIB,
            max_objects: 250_000,
            max_indirect_depth: 64,
            max_stream_bytes: 50 * MIB,
            max_decoded_stream_bytes: 200 * MIB,
            max_content_ops_per_page: 1_000_000,
            max_pages: 10_000,
        }
    }
}

impl ResourceLimits {
    /// Checks an input length against `max_file_bytes`.
    ///
    /// A length equal to the limit is accepted.
    ///
    /// # Errors
    ///
    /// Returns [`PdfDiffError::ResourceLimitExceeded`] when `byte_len` is greater
    /// than `max_file_bytes`; the payload states both the length and the limit.
    pub fn check_file_size(self, byte_len: usize) -> Result<(), PdfDiffError> {
        // Fast path: the input fits.
        if byte_len <= self.max_file_bytes {
            return Ok(());
        }

        let detail = format!(
            "file has {byte_len} bytes, limit is {}",
            self.max_file_bytes
        );
        Err(PdfDiffError::ResourceLimitExceeded(detail))
    }
}

/// Parsing configuration; currently it only holds the resource limits.
///
/// The derived `Default` uses `ResourceLimits::default()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct ParseConfig {
    /// Resource limits carried by this configuration.
    pub limits: ResourceLimits,
}

/// Category of a [`SemanticChange`].
///
/// NOTE(review): the variant descriptions are inferred from the variant names —
/// confirm against the diff engine that assigns them.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChangeKind {
    /// Content was inserted.
    Inserted,
    /// Content was deleted.
    Deleted,
    /// Content was modified.
    Modified,
    /// Content was moved.
    Moved,
    /// Layout changed.
    LayoutChanged,
    /// Style changed.
    StyleChanged,
    /// Metadata changed.
    MetadataChanged,
    /// An annotation changed.
    AnnotationChanged,
    /// A form field changed.
    FormFieldChanged,
    /// An object changed.
    ObjectChanged,
    /// The change could not be classified.
    Unknown,
}

/// Severity assigned to a [`SemanticChange`].
///
/// No ordering traits are derived, so severities can be tested for equality but
/// not compared with `<` / `>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChangeSeverity {
    /// Critical change.
    Critical,
    /// Major change.
    Major,
    /// Minor change.
    Minor,
    /// Informational change.
    Info,
}

/// Counters summarising a diff; the derived `Default` sets all of them to zero.
///
/// NOTE(review): each counter presumably tallies changes of the [`ChangeKind`]
/// with the matching name; the other kinds have no counter here — confirm whether
/// that is intentional.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct DiffSummary {
    /// Count of inserted items.
    pub inserted: usize,
    /// Count of deleted items.
    pub deleted: usize,
    /// Count of modified items.
    pub modified: usize,
    /// Count of moved items.
    pub moved: usize,
    /// Count of layout changes.
    pub layout_changed: usize,
}

/// Description of one node referenced by a [`SemanticChange`].
///
/// Only `PartialEq` is derived (no `Eq`) because `bbox` contains `f32` values.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SemanticNodeEvidence {
    /// Identifier of the node (free-form string).
    pub node_id: String,
    /// Page the node is on.
    /// NOTE(review): zero- vs one-based numbering is not shown here — confirm.
    pub page: usize,
    /// Bounding box of the node, if available.
    pub bbox: Option<Rect>,
    /// Text of the node, if available.
    pub text: Option<String>,
    /// Provenance records for the node; may be empty.
    pub source: Vec<Provenance>,
}

/// Kind of a [`TextHunk`].
///
/// NOTE(review): the variant descriptions are inferred from the variant names —
/// confirm against the text differ.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum TextHunkKind {
    /// Text is the same on both sides.
    Equal,
    /// Text was inserted.
    Inserted,
    /// Text was deleted.
    Deleted,
    /// Text was replaced.
    Replaced,
}

/// A `start`/`end` pair of positions within text.
///
/// NOTE(review): the unit (bytes, characters, or tokens) and whether `end` is
/// exclusive are not visible in this file — confirm with the producer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct TextRange {
    /// Position where the range begins.
    pub start: usize,
    /// Position where the range ends.
    pub end: usize,
}

impl TextRange {
    /// Creates a range from the two given positions.
    ///
    /// The values are stored exactly as passed; `start <= end` is not checked.
    #[must_use]
    pub const fn new(start: usize, end: usize) -> Self {
        Self { start, end }
    }
}

/// Granularity recorded on a [`TextHunk`].
///
/// NOTE(review): presumably the unit the hunk was computed in — confirm.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum TextHunkGranularity {
    /// Token-level.
    Token,
    /// Character-level.
    Character,
}

/// One piece of a text-level diff attached to a [`SemanticChange`].
///
/// Serialization note: `granularity`, `old_range`, and `new_range` are left out of
/// the output when `None` and default to `None` when missing from the input.
/// `old_text` and `new_text` carry no such attribute, so they are written even
/// when `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TextHunk {
    /// What happened to the text in this hunk.
    pub kind: TextHunkKind,
    /// Granularity of the hunk, if recorded.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub granularity: Option<TextHunkGranularity>,
    /// Range on the old side, if recorded.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub old_range: Option<TextRange>,
    /// Range on the new side, if recorded.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub new_range: Option<TextRange>,
    /// Text on the old side, if any.
    pub old_text: Option<String>,
    /// Text on the new side, if any.
    pub new_text: Option<String>,
}

/// A single change reported in a [`DiffDocument`].
///
/// Only `PartialEq` is derived (no `Eq`) because of the `f32` fields it contains.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SemanticChange {
    /// Identifier of the change (free-form string).
    pub id: String,
    /// Category of the change.
    pub kind: ChangeKind,
    /// Severity of the change.
    pub severity: ChangeSeverity,
    /// Evidence for the node on the old side, if any.
    pub old_node: Option<SemanticNodeEvidence>,
    /// Evidence for the node on the new side, if any.
    pub new_node: Option<SemanticNodeEvidence>,
    /// Text-level hunks; omitted from serialized output when empty and defaulted to
    /// empty when missing from the input.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub text_hunks: Vec<TextHunk>,
    /// Confidence value for the change.
    /// NOTE(review): the expected range (presumably 0.0–1.0) is not enforced or
    /// documented here — confirm.
    pub confidence: f32,
    /// Free-form explanation of the change.
    pub reason: String,
}

/// Top-level diff result: schema version, two fingerprints, a summary, the list
/// of changes, and the diagnostics.
///
/// [`DiffDocument::empty`] builds a document with no changes and no diagnostics.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DiffDocument {
    /// Schema version of this document; `DiffDocument::empty` sets it to
    /// [`DIFF_SCHEMA_VERSION`].
    pub schema_version: String,
    /// Fingerprint string for the old input.
    /// NOTE(review): the fingerprint algorithm is not defined in this file.
    pub old_fingerprint: String,
    /// Fingerprint string for the new input.
    pub new_fingerprint: String,
    /// Aggregate counters for the changes.
    pub summary: DiffSummary,
    /// The individual changes.
    pub changes: Vec<SemanticChange>,
    /// Diagnostics attached to this document.
    pub diagnostics: Vec<Diagnostic>,
}

impl DiffDocument {
    /// Builds a document that records the two fingerprints and nothing else.
    ///
    /// `schema_version` is set to [`DIFF_SCHEMA_VERSION`], `summary` is all zeros,
    /// and both `changes` and `diagnostics` are empty.
    #[must_use]
    pub fn empty(old_fingerprint: impl Into<String>, new_fingerprint: impl Into<String>) -> Self {
        // Evaluate in field order: schema version first, then the two fingerprints.
        let schema_version = DIFF_SCHEMA_VERSION.to_owned();
        let old_fingerprint = old_fingerprint.into();
        let new_fingerprint = new_fingerprint.into();

        Self {
            schema_version,
            old_fingerprint,
            new_fingerprint,
            summary: DiffSummary::default(),
            changes: Vec::new(),
            diagnostics: Vec::new(),
        }
    }
}