Skip to main content

pdf_syntax/
leniency.rs

1//! Thread-local decode-leniency event collector.
2//!
3//! Inner parser and filter code calls [`emit`] to record a recovery or
4//! leniency event without changing any behaviour. A caller that wants to
5//! observe events wraps its work with [`activate`] / [`drain`]:
6//!
7//! ```rust,ignore
8//! pdf_syntax::leniency::activate();
9//! let pdf = Pdf::new(bytes)?;                 // events accumulate here
10//! let events = pdf_syntax::leniency::drain(); // returns them and resets
11//! ```
12//!
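//! When no collector is active every [`emit`] call is a no-op — zero heap
//! allocation on the hot path. Per-code deduplication prevents a single
//! corrupt stream from producing thousands of identical events.
//!
//! The collector is thread-local and is only compiled with the `std`
//! feature: events are visible only to a [`drain`] call made on the same
//! thread that emitted them, and without `std` [`emit`] is a no-op stub.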
16
/// Severity of a [`LeniencyEvent`].
///
/// Variants are declared from least to most severe and the derived [`Ord`]
/// follows that declaration order (`Info < Warning < Critical`), so callers
/// can filter with a threshold such as
/// `event.severity >= LeniencySeverity::Warning` or pick the worst severity
/// of a batch with `Iterator::max`. Keep the variants in ascending order of
/// severity when adding new ones.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum LeniencySeverity {
    /// A recovery succeeded; output may differ slightly from the source.
    Info,
    /// A degradation or non-standard recovery occurred; output may be partial.
    Warning,
    /// A hard limit or severe corruption required truncating the output.
    Critical,
}
27
28/// A single decode-leniency or structural-recovery event.
29///
30/// All fields are `'static` — no heap allocation on the emit path.
31#[derive(Debug, Clone, PartialEq, Eq)]
32pub struct LeniencyEvent {
33    /// Short, stable, machine-readable identifier (e.g. `"FLATE_BROKEN_FALLBACK"`).
34    pub code: &'static str,
35    /// Severity classification.
36    pub severity: LeniencySeverity,
37    /// Human-readable description of the event.
38    pub message: &'static str,
39}
40
41impl LeniencyEvent {
42    const fn new(code: &'static str, severity: LeniencySeverity, message: &'static str) -> Self {
43        Self {
44            code,
45            severity,
46            message,
47        }
48    }
49}
50
51// ---------------------------------------------------------------------------
52// Known event constants — constructed once, zero cost to reference.
53// ---------------------------------------------------------------------------
54
55/// Flate stream broken; decoded with a pure-Rust fallback.
56pub const FLATE_BROKEN_FALLBACK: LeniencyEvent = LeniencyEvent::new(
57    "FLATE_BROKEN_FALLBACK",
58    LeniencySeverity::Warning,
59    "A flate stream was broken; the pure-Rust fallback decoder was used. \
60     Output may be truncated or differ from the original.",
61);
62
63/// A bad block header was encountered inside a flate stream.
64pub const FLATE_BAD_BLOCK: LeniencyEvent = LeniencyEvent::new(
65    "FLATE_BAD_BLOCK",
66    LeniencySeverity::Warning,
67    "A bad block header was encountered in a flate stream; the block was skipped.",
68);
69
70/// Premature EOF in an LZW stream; the EOD code was absent.
71pub const LZW_PREMATURE_EOF: LeniencyEvent = LeniencyEvent::new(
72    "LZW_PREMATURE_EOF",
73    LeniencySeverity::Warning,
74    "Premature EOF in an LZW stream; the end-of-data code was absent.",
75);
76
77/// An invalid code was encountered in an LZW stream.
78pub const LZW_INVALID_CODE: LeniencyEvent = LeniencyEvent::new(
79    "LZW_INVALID_CODE",
80    LeniencySeverity::Warning,
81    "An invalid LZW code was encountered; the stream may be truncated.",
82);
83
84/// ASCII-85: a 1-character terminal group was accepted leniently.
85///
86/// A 1-character group is technically malformed (spec requires ≥ 2 chars) but
87/// is common in scanned PDFs. The group is treated as producing zero bytes.
88pub const ASCII85_LENIENT_PARTIAL: LeniencyEvent = LeniencyEvent::new(
89    "ASCII85_LENIENT_PARTIAL",
90    LeniencySeverity::Info,
91    "An ASCII-85 stream contained a 1-character terminal group; accepted leniently \
92     (common in scanned PDFs). No bytes produced for that group.",
93);
94
95/// CCITT: partial row decode — at least one row decoded before an error.
96pub const CCITT_PARTIAL_DECODE: LeniencyEvent = LeniencyEvent::new(
97    "CCITT_PARTIAL_DECODE",
98    LeniencySeverity::Warning,
99    "The CCITT filter encountered an error after decoding at least one row; \
100     partial output returned.",
101);
102
103/// Stream parse failed; a manual fallback parser was used.
104pub const STREAM_PARSE_FALLBACK: LeniencyEvent = LeniencyEvent::new(
105    "STREAM_PARSE_FALLBACK",
106    LeniencySeverity::Warning,
107    "Normal stream parsing failed; a manual fallback parser was used instead.",
108);
109
110/// A cycle was detected in indirect object references.
111pub const INDIRECT_CYCLE: LeniencyEvent = LeniencyEvent::new(
112    "INDIRECT_CYCLE",
113    LeniencySeverity::Warning,
114    "A cycle was detected in indirect object references; resolution was stopped.",
115);
116
117/// Indirect object resolution depth exceeded the 512-level limit.
118pub const INDIRECT_DEPTH_EXCEEDED: LeniencyEvent = LeniencyEvent::new(
119    "INDIRECT_DEPTH_EXCEEDED",
120    LeniencySeverity::Warning,
121    "Indirect object resolution depth exceeded 512; resolution was stopped.",
122);
123
124// ---------------------------------------------------------------------------
125// Thread-local collector — std only.
126// ---------------------------------------------------------------------------
127
128#[cfg(feature = "std")]
129use std::cell::RefCell;
130
#[cfg(feature = "std")]
thread_local! {
    // Per-thread event buffer. `None` means no collector is active on this
    // thread; `activate` stores `Some(Vec::new())` and `drain` takes the
    // value back out, leaving `None` again. `emit` only pushes while the
    // slot holds `Some`.
    static COLLECTOR: RefCell<Option<Vec<LeniencyEvent>>> = const { RefCell::new(None) };
}
135
/// Start collecting leniency events on the current thread.
///
/// Installs a fresh, empty buffer in the thread-local collector; every
/// following [`emit`] on this thread records into it until [`drain`] is
/// called. If a collector was already active, the events it held are dropped
/// and collection restarts from empty.
#[cfg(feature = "std")]
pub fn activate() {
    COLLECTOR.with(|cell| {
        // The previous buffer (if any) is returned by `replace` and dropped here.
        cell.replace(Some(Vec::new()));
    });
}
145
/// Take every collected event and switch the collector off.
///
/// Returns the events recorded on this thread since the most recent
/// [`activate`]. The collector is left inactive afterwards, so further
/// [`emit`] calls are no-ops until [`activate`] is called again. When no
/// collector is active (no prior `activate`, or already drained) an empty
/// `Vec` is returned.
#[cfg(feature = "std")]
pub fn drain() -> alloc::vec::Vec<LeniencyEvent> {
    COLLECTOR.with(|cell| {
        // `take` leaves `None` behind, which deactivates the collector.
        let collected = cell.borrow_mut().take();
        collected.unwrap_or_default()
    })
}
154
/// Record a leniency event in this thread's collector, if one is active.
///
/// Does nothing when no collector is active on the current thread. Events are
/// deduplicated by `code`: once an event with a given code has been recorded,
/// later events with the same code are ignored until the collector is reset
/// by [`activate`] or emptied by [`drain`].
#[cfg(feature = "std")]
pub(crate) fn emit(event: LeniencyEvent) {
    COLLECTOR.with(|cell| {
        let mut slot = cell.borrow_mut();
        // `None` means no collector is active on this thread: nothing to do.
        if let Some(recorded) = slot.as_mut() {
            // Linear scan: the buffer holds at most one entry per distinct
            // code, so it is expected to stay short.
            let is_new_code = recorded.iter().all(|seen| seen.code != event.code);
            if is_new_code {
                recorded.push(event);
            }
        }
    });
}
171
172/// No-op stub for `no_std` builds where thread-locals are unavailable.
173#[cfg(not(feature = "std"))]
174pub(crate) fn emit(_event: LeniencyEvent) {}
175
176// ---------------------------------------------------------------------------
177// Tests
178// ---------------------------------------------------------------------------
179
// `activate` and `drain` only exist with the `std` feature, so the test module
// is gated on it as well; otherwise a test build without `std` would fail to
// compile instead of simply skipping these tests.
#[cfg(all(test, feature = "std"))]
mod tests {
    use super::*;

    // Each test starts by calling `activate()` or `drain()` so it does not
    // depend on collector state left behind by an earlier test on the same
    // thread.

    /// One emitted event comes back from `drain` unchanged.
    #[test]
    fn activate_emit_drain_roundtrip() {
        activate();
        emit(FLATE_BROKEN_FALLBACK);
        let events = drain();
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].code, "FLATE_BROKEN_FALLBACK");
    }

    /// Emitting the same code repeatedly records it once.
    #[test]
    fn per_code_deduplication() {
        activate();
        emit(LZW_PREMATURE_EOF);
        emit(LZW_PREMATURE_EOF);
        emit(LZW_PREMATURE_EOF);
        let events = drain();
        assert_eq!(events.len(), 1, "same code must appear at most once");
    }

    /// Events with different codes are all kept.
    #[test]
    fn multiple_distinct_codes_all_collected() {
        activate();
        emit(FLATE_BROKEN_FALLBACK);
        emit(FLATE_BAD_BLOCK);
        emit(LZW_PREMATURE_EOF);
        let events = drain();
        assert_eq!(events.len(), 3);
        let codes: Vec<&str> = events.iter().map(|e| e.code).collect();
        assert!(codes.contains(&"FLATE_BROKEN_FALLBACK"));
        assert!(codes.contains(&"FLATE_BAD_BLOCK"));
        assert!(codes.contains(&"LZW_PREMATURE_EOF"));
    }

    /// `emit` with no active collector records nothing.
    #[test]
    fn emit_without_activate_is_noop() {
        // Ensure no collector is active by draining first.
        let _ = drain();
        // Now emit — should be a no-op.
        emit(INDIRECT_CYCLE);
        // Drain again — must return empty (collector was never activated).
        let events = drain();
        assert!(events.is_empty());
    }

    /// `drain` with no active collector returns an empty `Vec`.
    #[test]
    fn drain_without_activate_returns_empty() {
        let _ = drain(); // clear any leftover state
        let events = drain();
        assert!(events.is_empty());
    }

    /// A second `activate` throws away what the first one collected.
    #[test]
    fn activate_resets_prior_events() {
        activate();
        emit(FLATE_BROKEN_FALLBACK);
        // Second activate discards the first batch.
        activate();
        emit(INDIRECT_CYCLE);
        let events = drain();
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].code, "INDIRECT_CYCLE");
    }

    /// Deduplication is keyed on `code`, so every constant needs its own.
    #[test]
    fn all_known_constants_have_unique_codes() {
        let all = [
            FLATE_BROKEN_FALLBACK.code,
            FLATE_BAD_BLOCK.code,
            LZW_PREMATURE_EOF.code,
            LZW_INVALID_CODE.code,
            ASCII85_LENIENT_PARTIAL.code,
            CCITT_PARTIAL_DECODE.code,
            STREAM_PARSE_FALLBACK.code,
            INDIRECT_CYCLE.code,
            INDIRECT_DEPTH_EXCEEDED.code,
        ];
        let unique: std::collections::HashSet<&str> = all.iter().copied().collect();
        assert_eq!(unique.len(), all.len(), "duplicate event code found");
    }
}