pdfluent 1.0.0-beta.5

Pure-Rust PDF SDK with XFA, PDF/A, digital signatures, and WASM support.
Documentation
//! Per-document tier classifier (M6b.3).
//!
//! Given an input PDF (bytes or path), classify it into the M3 corpus tier
//! taxonomy and surface the signals that drove the decision. The classifier
//! is intentionally cheap: it inspects the raw PDF byte stream for textual
//! markers rather than parsing the full XFA template.
//!
//! Signals detected:
//!
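//! - XFA presence (any `/XFA` byte sequence in the file; the `/AcroForm`
//!   dictionary itself is not parsed).
//! - Dynamic XFA (a flowed `layout="…"` token such as `tb` or `lr-tb`, i.e. a
//!   template layout other than `position`).
//! - FormCalc presence (`<script contentType="application/x-formcalc">`,
//!   matched via the `x-formcalc` substring).
//! - JavaScript presence (`application/x-javascript`, matched via the
//!   `x-javascript` substring; a `<script>` without contentType is not yet
//!   detected by the byte-level scan).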
//!
//! Each signal lifts the tier; the highest-tier signal wins.
//!
//! Tier mapping (per M3.6 tier policy):
//!
//! | Tier | Description |
//! |---|---|
//! | A | Static / hybrid PDF without XFA, or static XFA-F |
//! | B | Dynamic XFA without scripts |
//! | C | Dynamic XFA + FormCalc only |
//! | D | Dynamic XFA + JavaScript |
//! | E | Out-of-scope (encrypted, malformed, untriaged) |
//!
//! The classifier never claims fidelity guarantees from the tier alone
//! (M6b.3 §"what not to do"); it returns expected support behaviour for
//! the tier and a `low_confidence` flag when signals are ambiguous.

use serde::{Deserialize, Serialize};
use std::path::Path;

use super::SCHEMA_VERSION;

/// XFA corpus tier (M3 taxonomy).
///
/// Serialised by serde as the upper-cased variant name, i.e. the single
/// letters `"A"` to `"E"`. The enum is `#[non_exhaustive]`, so code outside
/// this crate must keep a wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
#[non_exhaustive]
pub enum XfaTier {
    /// Static / hybrid PDF, or static XFA-F.
    ///
    /// `classify_input` assigns this tier whenever the input lacks either the
    /// `/XFA` marker or a dynamic layout token.
    A,
    /// Dynamic XFA without scripts.
    ///
    /// Assigned when the input is dynamic XFA and neither script marker is found.
    B,
    /// Dynamic XFA with FormCalc only.
    ///
    /// Assigned when the FormCalc marker is found and the JavaScript marker is not.
    C,
    /// Dynamic XFA with JavaScript.
    ///
    /// JavaScript takes precedence: FormCalc may be present as well.
    D,
    /// Out-of-scope (encrypted, malformed, untriaged).
    ///
    /// NOTE(review): `classify_input` in this file never yields this tier;
    /// presumably it is assigned by a separate triage step - confirm.
    E,
}

impl XfaTier {
    /// Stable single-letter tag for serialisation.
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::A => "A",
            Self::B => "B",
            Self::C => "C",
            Self::D => "D",
            Self::E => "E",
        }
    }
}

/// FormCalc presence signal.
///
/// Serialised by serde in snake case (`"none"` / `"present"`).
/// `classify_input` derives it from a search for the `x-formcalc` byte
/// sequence anywhere in the input, independent of whether XFA was found.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum FormCalcSignal {
    /// No FormCalc detected.
    None,
    /// At least one `<script contentType="application/x-formcalc">` block found.
    Present,
}

/// JavaScript presence signal.
///
/// Serialised by serde in snake case (`"none"` / `"present"`).
/// `classify_input` derives it from a search for the `x-javascript` byte
/// sequence anywhere in the input, independent of whether XFA was found.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum JsSignal {
    /// No JavaScript detected.
    None,
    /// At least one JavaScript block found.
    Present,
}

/// Support status for a tier.
///
/// Serialised by serde in snake case (`"fully_supported"`,
/// `"supported_with_caveats"`, `"best_effort"`, `"out_of_scope"`).
/// `classify_input` derives the status purely from the assigned tier.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum SupportStatus {
    /// First-class support; expected to render with high fidelity.
    ///
    /// Reported for tiers A and B.
    FullySupported,
    /// Supported with known caveats; consult tier policy doc.
    ///
    /// Reported for tier C.
    SupportedWithCaveats,
    /// Best-effort; no fidelity contract.
    ///
    /// Reported for tier D.
    BestEffort,
    /// Out of scope.
    ///
    /// Reported for tier E.
    OutOfScope,
}

/// Classification report (M6b.3 public API).
///
/// Produced by `classify_input` / `classify_path`. All fields are plain data
/// and the struct round-trips through serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClassificationReport {
    /// Always equal to [`SCHEMA_VERSION`] at write time.
    pub schema_version: u32,
    /// SDK version that produced the report (taken from `crate::api_version()`).
    pub sdk_version: String,
    /// Tier assignment.
    pub tier: XfaTier,
    /// Whether the input contains an XFA stream at all.
    ///
    /// Set from a raw search for the `/XFA` byte sequence.
    pub has_xfa: bool,
    /// Whether the XFA stream is dynamic (vs static / position-layout).
    ///
    /// True only when both the `/XFA` marker and a dynamic layout token are
    /// present.
    pub is_dynamic_xfa: bool,
    /// FormCalc signal. Reported even when no XFA marker was found.
    pub formcalc: FormCalcSignal,
    /// JavaScript signal. Reported even when no XFA marker was found.
    pub javascript: JsSignal,
    /// Suggested support behaviour for downstream consumers.
    pub support_status: SupportStatus,
    /// Human-readable caveats, if any.
    ///
    /// `classify_input` adds one entry for tier C and one for tier D; the list
    /// is empty for every other tier.
    pub caveats: Vec<String>,
    /// True when the classifier believes its signal is ambiguous and the
    /// tier may be wrong. Consumers should treat this as a hint to fall
    /// back to manual review (M6b.3 risk: "low confidence" marker).
    ///
    /// Set when the `/XFA` marker is present but no dynamic layout token was
    /// found.
    pub low_confidence: bool,
}

/// Classify an input PDF from raw bytes.
///
/// Cheap byte-level classifier. Does not parse XFA templates fully: every
/// signal is a verbatim substring search over `bytes`, so a marker that does
/// not appear literally in the file (for example inside a compressed stream)
/// is not seen.
///
/// Markers searched for:
///
/// - `/XFA`: XFA presence.
/// - `layout="tb"`, `layout="lr-tb"`, `layout="row"`: dynamic (flowed) layout.
/// - `x-formcalc`: FormCalc script.
/// - `x-javascript`: JavaScript script.
///
/// Tier assignment:
///
/// - no XFA marker, or XFA without a dynamic layout token: tier A;
/// - dynamic XFA with JavaScript: tier D (JavaScript dominates FormCalc);
/// - dynamic XFA with FormCalc only: tier C;
/// - dynamic XFA without script markers: tier B.
///
/// Tier E is never produced here. `low_confidence` is set when XFA is present
/// but no dynamic layout token was found. The function never fails; an empty
/// or non-PDF input is simply reported as tier A.
pub fn classify_input(bytes: &[u8]) -> ClassificationReport {
    // Attribute tokens taken as evidence of a dynamic (flowed) template, as
    // opposed to `layout="position"`.
    const DYNAMIC_LAYOUT_TOKENS: [&[u8]; 3] = [
        b"layout=\"tb\"",
        b"layout=\"lr-tb\"",
        b"layout=\"row\"",
    ];

    // Returns true when `needle` occurs verbatim in `haystack`.
    //
    // The window length is always derived from the needle: a hand-written
    // length that disagrees with the needle can never compare equal and
    // silently disables the signal.
    fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
        // `windows(0)` panics, so the empty needle is answered up front.
        needle.is_empty() || haystack.windows(needle.len()).any(|window| window == needle)
    }

    let has_xfa = contains_bytes(bytes, b"/XFA");
    let has_formcalc = contains_bytes(bytes, b"x-formcalc");
    let has_javascript = contains_bytes(bytes, b"x-javascript");

    // Dynamic-vs-static heuristic: a dynamic XFA template typically declares a
    // flowed layout rather than `position`.
    let has_dynamic_layout_token = DYNAMIC_LAYOUT_TOKENS
        .iter()
        .any(|token| contains_bytes(bytes, token));
    let is_dynamic_xfa = has_xfa && has_dynamic_layout_token;

    // `is_dynamic_xfa` already implies `has_xfa`, so one test covers both the
    // "no XFA" and the "static XFA" cases.
    let tier = if !is_dynamic_xfa {
        XfaTier::A
    } else if has_javascript {
        XfaTier::D
    } else if has_formcalc {
        XfaTier::C
    } else {
        XfaTier::B
    };

    let support_status = match tier {
        XfaTier::A | XfaTier::B => SupportStatus::FullySupported,
        XfaTier::C => SupportStatus::SupportedWithCaveats,
        XfaTier::D => SupportStatus::BestEffort,
        XfaTier::E => SupportStatus::OutOfScope,
    };

    let caveats: Vec<String> = match tier {
        XfaTier::C => {
            vec!["FormCalc evaluation is supported but interaction with JS is not.".into()]
        }
        XfaTier::D => {
            vec!["JavaScript-driven XFA layout is best-effort; fidelity is not guaranteed for layout-critical scripts.".into()]
        }
        XfaTier::A | XfaTier::B | XfaTier::E => Vec::new(),
    };

    // Low confidence when XFA is present but the layout token is missing: it
    // could be a static XFA-F, or a dynamic form whose layout attribute is
    // written in a way this scan does not recognise.
    let low_confidence = has_xfa && !has_dynamic_layout_token;

    ClassificationReport {
        schema_version: SCHEMA_VERSION,
        sdk_version: crate::api_version().to_string(),
        tier,
        has_xfa,
        is_dynamic_xfa,
        formcalc: if has_formcalc {
            FormCalcSignal::Present
        } else {
            FormCalcSignal::None
        },
        javascript: if has_javascript {
            JsSignal::Present
        } else {
            JsSignal::None
        },
        support_status,
        caveats,
        low_confidence,
    }
}

/// Read the file at `path` into memory and classify its contents.
///
/// This is a thin convenience wrapper: the whole file is loaded with
/// `std::fs::read` and handed to `classify_input`.
///
/// # Errors
///
/// Returns the `std::io::Error` reported while reading the file.
pub fn classify_path(path: &Path) -> std::io::Result<ClassificationReport> {
    std::fs::read(path).map(|contents| classify_input(&contents))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps `extra` between a minimal `%PDF-1.7` header line and an `%%EOF`
    /// trailer so the fixture loosely resembles a PDF byte stream.
    fn fake_pdf(extra: &[u8]) -> Vec<u8> {
        [&b"%PDF-1.7\n"[..], extra, &b"\n%%EOF\n"[..]].concat()
    }

    /// A document without any `/XFA` marker is a confident tier A.
    #[test]
    fn no_xfa_is_tier_a() {
        let report = classify_input(&fake_pdf(b"/Catalog\n/AcroForm null"));
        assert_eq!(report.tier, XfaTier::A);
        assert!(!report.has_xfa);
        assert_eq!(report.support_status, SupportStatus::FullySupported);
        assert!(!report.low_confidence);
    }

    /// XFA with only a `position` layout stays in tier A but is flagged as
    /// low confidence.
    #[test]
    fn xfa_static_is_tier_a() {
        let report = classify_input(&fake_pdf(
            b"/XFA [(template) (data)] layout=\"position\"",
        ));
        assert_eq!(report.tier, XfaTier::A);
        assert!(report.has_xfa);
        assert!(!report.is_dynamic_xfa);
        assert!(
            report.low_confidence,
            "static-xfa heuristic should be low-confidence"
        );
    }

    /// A flowed layout without script markers is tier B.
    #[test]
    fn dynamic_xfa_no_scripts_is_tier_b() {
        let report = classify_input(&fake_pdf(b"/XFA <subform layout=\"tb\"></subform>"));
        assert_eq!(report.tier, XfaTier::B);
        assert!(report.is_dynamic_xfa);
        assert_eq!(report.support_status, SupportStatus::FullySupported);
    }

    /// A flowed layout plus a FormCalc script is tier C and carries a caveat.
    #[test]
    fn dynamic_xfa_formcalc_is_tier_c() {
        let input = fake_pdf(
            b"/XFA <subform layout=\"tb\"><script contentType=\"application/x-formcalc\">a=1</script></subform>",
        );
        let report = classify_input(&input);
        assert_eq!(report.tier, XfaTier::C);
        assert_eq!(report.formcalc, FormCalcSignal::Present);
        assert_eq!(report.javascript, JsSignal::None);
        assert_eq!(report.support_status, SupportStatus::SupportedWithCaveats);
        assert!(!report.caveats.is_empty());
    }

    /// A flowed layout plus a JavaScript script is tier D (best effort).
    #[test]
    fn dynamic_xfa_javascript_is_tier_d() {
        let input = fake_pdf(
            b"/XFA <subform layout=\"tb\"><script contentType=\"application/x-javascript\">x=1</script></subform>",
        );
        let report = classify_input(&input);
        assert_eq!(report.tier, XfaTier::D);
        assert_eq!(report.javascript, JsSignal::Present);
        assert_eq!(report.support_status, SupportStatus::BestEffort);
    }

    /// When both script markers are present, JavaScript decides the tier.
    #[test]
    fn javascript_dominates_formcalc() {
        let report = classify_input(&fake_pdf(b"/XFA layout=\"tb\" x-formcalc x-javascript"));
        assert_eq!(report.tier, XfaTier::D);
    }

    /// The report survives a JSON serialise / deserialise cycle.
    #[test]
    fn report_round_trips_json() {
        let original = classify_input(&fake_pdf(b"/XFA layout=\"tb\" x-formcalc"));
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: ClassificationReport = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded.tier, original.tier);
        assert_eq!(decoded.schema_version, SCHEMA_VERSION);
    }

    /// Pins the schema version emitted in reports.
    #[test]
    fn schema_version_is_one() {
        let report = classify_input(b"%PDF-1.7\n%%EOF");
        assert_eq!(report.schema_version, 1);
    }

    /// The tier is written to JSON as its bare letter.
    #[test]
    fn tier_serialises_as_single_letter() {
        let report = classify_input(&fake_pdf(b"/XFA layout=\"tb\""));
        let value = serde_json::to_value(&report).unwrap();
        assert_eq!(value["tier"], "B");
    }
}