pdf_redact/
search_redact.rs

1//! Search-and-redact: find text patterns and permanently redact them.
2//!
3//! Combines text extraction (positioned characters) with content stream
4//! surgery to both overlay and remove matched text from the PDF.
5
6use crate::error::{RedactError, Result};
7use crate::redact::{RedactionArea, Redactor};
8use lopdf::{Document, Object, ObjectId};
9use regex::Regex;
10use std::collections::{HashMap, HashSet};
11
/// Options for search-and-redact operations.
///
/// Build a value with [`Default`] or one of the named constructors, then
/// refine it with the chainable setters (`fill_color`, `pages`,
/// `overlay_text`).
#[derive(Clone, Debug)]
pub struct RedactSearchOptions {
    /// `true` (the default) matches case exactly; `false` ignores case.
    pub case_sensitive: bool,
    /// `true` treats the pattern as a regular expression; `false` (the
    /// default) treats it as literal text.
    pub regex: bool,
    /// Fill color `[r, g, b]` for the redaction overlay (default: black).
    pub fill_color: [f64; 3],
    /// Pages to search; `None` (the default) means all pages.
    pub pages: Option<Vec<u32>>,
    /// Optional text drawn on the overlay (e.g., "[REDACTED]").
    pub overlay_text: Option<String>,
}

impl Default for RedactSearchOptions {
    /// Case-sensitive literal search over all pages with a black overlay and
    /// no overlay text.
    fn default() -> Self {
        RedactSearchOptions {
            case_sensitive: true,
            regex: false,
            fill_color: [0.0; 3],
            pages: None,
            overlay_text: None,
        }
    }
}

impl RedactSearchOptions {
    /// Options for an exact, case-sensitive search.
    ///
    /// `pattern` is neither stored nor inspected here; the returned value is
    /// the same as [`Default::default`].
    pub fn exact(pattern: &str) -> Self {
        let _ = pattern; // used by caller
        Default::default()
    }

    /// Options for a case-insensitive search; everything else is default.
    pub fn case_insensitive() -> Self {
        RedactSearchOptions {
            case_sensitive: false,
            ..Default::default()
        }
    }

    /// Options for a regex search; everything else is default.
    pub fn with_regex() -> Self {
        RedactSearchOptions {
            regex: true,
            ..Default::default()
        }
    }

    /// Replace the overlay fill color and return the updated options.
    pub fn fill_color(self, r: f64, g: f64, b: f64) -> Self {
        Self {
            fill_color: [r, g, b],
            ..self
        }
    }

    /// Restrict the search to `pages` and return the updated options.
    pub fn pages(self, pages: Vec<u32>) -> Self {
        Self {
            pages: Some(pages),
            ..self
        }
    }

    /// Set the overlay text and return the updated options.
    pub fn overlay_text(self, text: impl Into<String>) -> Self {
        Self {
            overlay_text: Some(text.into()),
            ..self
        }
    }
}
80
/// Summary returned by a search-and-redact operation.
#[derive(Clone, Debug)]
pub struct SearchRedactReport {
    /// How many text matches were found.
    pub matches_found: usize,
    /// How many redaction areas were applied.
    pub areas_redacted: usize,
    /// How many content operations were removed.
    pub operations_removed: usize,
    /// How many pages were affected.
    pub pages_affected: usize,
    /// Whether metadata was cleaned.
    pub metadata_cleaned: bool,
    /// Bounding box of every redacted area as
    /// `(page_number_1based, [x0, y0, x1, y1])`, in PDF points.
    /// Visual verification tests use these to confirm overlay coverage.
    pub redacted_rects: Vec<(u32, [f64; 4])>,
}
98
99/// Search for text matching a pattern and redact all occurrences.
100///
101/// This performs two operations:
102/// 1. Finds text matches using positioned character extraction
103/// 2. Computes bounding rectangles for matches
104/// 3. Applies redaction (overlay + content removal) via the Redactor
105pub fn search_and_redact(
106    doc: &mut Document,
107    pattern: &str,
108    options: &RedactSearchOptions,
109) -> Result<SearchRedactReport> {
110    let pages = doc.get_pages();
111    let total = pages.len() as u32;
112
113    let page_range: Vec<u32> = match &options.pages {
114        Some(ps) => ps.clone(),
115        None => (1..=total).collect(),
116    };
117
118    // Validate pages.
119    for &p in &page_range {
120        if p == 0 || p > total {
121            return Err(RedactError::PageOutOfRange(p, total));
122        }
123    }
124
125    // Build the search pattern.
126    let matcher = build_matcher(pattern, options)?;
127
128    // Find all matches across pages.
129    let mut all_areas: Vec<RedactionArea> = Vec::new();
130    let mut total_matches = 0;
131    // Per-page match bounding boxes used for position-based op removal fallback.
132    let mut page_bboxes: std::collections::HashMap<u32, Vec<[f64; 4]>> =
133        std::collections::HashMap::new();
134
135    for &page_num in &page_range {
136        let chars = match pdf_extract::extract_positioned_chars(doc, page_num) {
137            Ok(c) => c,
138            Err(_) => continue,
139        };
140
141        if chars.is_empty() {
142            continue;
143        }
144
145        // Build a text string from positioned chars.
146        let text: String = chars.iter().map(|c| c.ch).collect();
147
148        // Build a byte-offset-to-char-index map so regex byte offsets can be
149        // translated back to indices into `chars`.
150        let byte_to_char: Vec<usize> = {
151            let mut map = Vec::with_capacity(text.len() + 1);
152            for (ci, ch) in text.chars().enumerate() {
153                for _ in 0..ch.len_utf8() {
154                    map.push(ci);
155                }
156            }
157            map.push(chars.len()); // sentinel for end-of-string
158            map
159        };
160
161        // Find matches in the text.
162        let match_ranges = matcher.find_all(&text);
163
164        for range in &match_ranges {
165            total_matches += 1;
166
167            // Convert byte offsets to char indices.
168            let char_start = byte_to_char.get(range.start).copied().unwrap_or(0);
169            let char_end = byte_to_char.get(range.end).copied().unwrap_or(chars.len());
170            if char_start >= chars.len() || char_end > chars.len() || char_start >= char_end {
171                continue;
172            }
173
174            // Compute bounding rect from the chars in this range.
175            let matched_chars = &chars[char_start..char_end];
176            if matched_chars.is_empty() {
177                continue;
178            }
179
180            let bbox = compute_bounding_rect(matched_chars);
181
182            // Store bbox for position-based content removal fallback.
183            page_bboxes.entry(page_num).or_default().push(bbox);
184
185            let mut area = RedactionArea::new(page_num, bbox);
186            area = area.with_color(
187                options.fill_color[0],
188                options.fill_color[1],
189                options.fill_color[2],
190            );
191            if let Some(ref overlay) = options.overlay_text {
192                area = area.with_overlay(overlay);
193            }
194            all_areas.push(area);
195        }
196    }
197
198    if all_areas.is_empty() {
199        return Ok(SearchRedactReport {
200            matches_found: 0,
201            areas_redacted: 0,
202            operations_removed: 0,
203            pages_affected: 0,
204            metadata_cleaned: false,
205            redacted_rects: Vec::new(),
206        });
207    }
208
209    // Collect rects before mark_all consumes all_areas.
210    let redacted_rects: Vec<(u32, [f64; 4])> = all_areas.iter().map(|a| (a.page, a.rect)).collect();
211
212    // Apply redactions using the existing Redactor.
213    let mut redactor = Redactor::new();
214    redactor.mark_all(all_areas);
215    let report = redactor.apply(doc)?;
216
217    // Additionally, use ContentEditor to surgically remove matching
218    // text operations from the content stream.
219    let mut extra_ops_removed = 0;
220    for &page_num in &page_range {
221        let bboxes: &[[f64; 4]] = page_bboxes
222            .get(&page_num)
223            .map(|v| v.as_slice())
224            .unwrap_or(&[]);
225        let removed = remove_text_ops_for_page(doc, page_num, pattern, options, bboxes)?;
226        extra_ops_removed += removed;
227    }
228
229    Ok(SearchRedactReport {
230        matches_found: total_matches,
231        areas_redacted: report.areas_redacted,
232        operations_removed: report.operations_removed + extra_ops_removed,
233        pages_affected: report.pages_affected,
234        metadata_cleaned: report.metadata_cleaned,
235        redacted_rects,
236    })
237}
238
239// ---------------------------------------------------------------------------
240// Pattern matching
241// ---------------------------------------------------------------------------
242
243struct TextMatcher {
244    regex: Regex,
245}
246
247struct MatchRange {
248    start: usize,
249    end: usize,
250}
251
252impl TextMatcher {
253    fn find_all(&self, text: &str) -> Vec<MatchRange> {
254        self.regex
255            .find_iter(text)
256            .map(|m| MatchRange {
257                start: m.start(),
258                end: m.end(),
259            })
260            .collect()
261    }
262}
263
264fn build_matcher(pattern: &str, options: &RedactSearchOptions) -> Result<TextMatcher> {
265    let regex_pattern = if options.regex {
266        if options.case_sensitive {
267            pattern.to_string()
268        } else {
269            format!("(?i){}", pattern)
270        }
271    } else {
272        let escaped = regex::escape(pattern);
273        if options.case_sensitive {
274            escaped
275        } else {
276            format!("(?i){}", escaped)
277        }
278    };
279
280    let regex = Regex::new(&regex_pattern)
281        .map_err(|e| RedactError::Other(format!("invalid pattern: {e}")))?;
282
283    Ok(TextMatcher { regex })
284}
285
286// ---------------------------------------------------------------------------
287// Bounding rectangle computation
288// ---------------------------------------------------------------------------
289
290/// Returns true if a text run's position overlaps a single bounding rectangle.
291fn run_overlaps_single_bbox(run: &pdf_manip::text_run::TextRun, bbox: [f64; 4]) -> bool {
292    let run_x1 = run.x + run.width.max(1.0);
293    let tol = 4.0_f64;
294    let x_overlap = run.x < bbox[2] + tol && run_x1 > bbox[0] - tol;
295    let y_overlap = run.y <= bbox[3] + tol && run.y >= bbox[1] - tol;
296    x_overlap && y_overlap
297}
298
299/// Check if a text run is on the same baseline as a match bbox and X-overlaps.
300///
301/// Used for the "covered" check in `apply_per_bbox_combined_fallback` to decide
302/// whether a match bbox is already handled by a text-matched run on the same line.
303///
304/// `bbox[1]` is the text rendering y (baseline) of the matched chars — the same
305/// coordinate that `extract_text_runs` stores in `run.y`.  `bbox[3]` equals
306/// `bbox[1] + font_size` (see `extract_positioned_chars`), so using the full
307/// bbox interval `[bbox[1], bbox[3]]` for the Y check permits runs on adjacent
308/// lines (y ≈ bbox[1] + line_height) to be falsely considered "covering" the bbox.
309/// Using a direct `|run.y - bbox[1]| ≤ 0.5` comparison restricts coverage to
310/// runs on the same text line.  Fixes #474.
311fn run_on_same_baseline(run: &pdf_manip::text_run::TextRun, bbox: [f64; 4]) -> bool {
312    let same_y = (run.y - bbox[1]).abs() <= 0.5;
313    let run_x1 = run.x + run.width.max(1.0);
314    let x_overlap = run.x < bbox[2] + 4.0 && run_x1 > bbox[0] - 4.0;
315    same_y && x_overlap
316}
317
318/// Extract the Latin-1–decoded text from a text-showing content operation.
319///
320/// Decodes Tj/TJ/"/"' string operands by treating each byte as its Latin-1
321/// code point — the same strategy used by `pdf_extract::extract_positioned_chars`.
322/// Used as a last-resort fallback when ToUnicode CMap decoding produces
323/// characters that do not match the search pattern (misleading CMap entries).
324fn raw_text_from_op(op: &lopdf::content::Operation) -> Option<String> {
325    use lopdf::Object;
326    match op.operator.as_str() {
327        "Tj" | "'" => {
328            if let Some(Object::String(ref bytes, _)) = op.operands.first() {
329                Some(bytes.iter().map(|&b| b as char).collect())
330            } else {
331                None
332            }
333        }
334        "TJ" => {
335            if let Some(Object::Array(ref arr)) = op.operands.first() {
336                let s: String = arr
337                    .iter()
338                    .filter_map(|item| match item {
339                        Object::String(ref bytes, _) => {
340                            Some(bytes.iter().map(|&b| b as char).collect::<String>())
341                        }
342                        _ => None,
343                    })
344                    .collect();
345                if s.is_empty() {
346                    None
347                } else {
348                    Some(s)
349                }
350            } else {
351                None
352            }
353        }
354        "\"" => op.operands.get(2).and_then(|obj| match obj {
355            Object::String(ref bytes, _) => Some(bytes.iter().map(|&b| b as char).collect()),
356            _ => None,
357        }),
358        _ => None,
359    }
360}
361
362/// Per-bbox combined fallback: for each match bbox not covered by text-matched
363/// runs, find the correct ops to remove using a two-phase approach:
364///
365/// **Phase 1 — Y-line raw-byte match**: concatenates the raw Latin-1 bytes of
366/// all ops on the same Y-baseline as the bbox and pattern-matches across them.
367/// This correctly handles:
368/// - Words split across adjacent Tj ops (e.g. "(Ar) Tj (e) Tj" → "Are").
369/// - Mid-line x-coordinate drift: `extract_positioned_chars` uses an
370///   approximate char width (0.5 × font_size), which accumulates error across
371///   a long line.  By the time we reach a mid-line word, the bbox x can be off
372///   by tens of units from the actual run.x from `extract_text_runs` (which
373///   uses real font metrics).  Y is always accurate; the raw bytes are the same
374///   Latin-1 decoding that `extract_positioned_chars` uses.  Fixes #XXX.
375///
376/// **Phase 2 — X+Y spatial overlap** (legacy fallback): fires only when Phase 1
377/// finds no raw-byte match on the Y-line (e.g. font encoding means the glyph
378/// bytes are not the expected ASCII codepoints).
379fn apply_per_bbox_combined_fallback(
380    runs: &[pdf_manip::text_run::TextRun],
381    indices_to_remove: &mut Vec<usize>,
382    bboxes: &[[f64; 4]],
383    ops: &[lopdf::content::Operation],
384    matcher: &TextMatcher,
385) {
386    // Snapshot the text-matched indices for O(1) lookups.  Indices added in
387    // this pass must not retroactively cover other bboxes.
388    let text_matched: HashSet<usize> = indices_to_remove.iter().copied().collect();
389    let mut to_add: HashSet<usize> = HashSet::new();
390
391    // Build op_index → run.y map for Y-line lookups.
392    let op_to_y: HashMap<usize, f64> = runs
393        .iter()
394        .flat_map(|run| run.ops_range.clone().map(move |i| (i, run.y)))
395        .collect();
396
397    for &bbox in bboxes {
398        // If this bbox is already handled by a text-matched run on the same
399        // baseline, skip it.  Uses run_on_same_baseline (strict Y check) so
400        // runs from adjacent lines don't falsely "cover" this bbox. Fixes #474.
401        let covered = runs.iter().any(|run| {
402            run_on_same_baseline(run, bbox)
403                && run.ops_range.clone().any(|i| text_matched.contains(&i))
404        });
405        if covered {
406            continue;
407        }
408
409        // ------------------------------------------------------------------
410        // Phase 1: Y-line raw-byte match.
411        //
412        // bbox[1] = y_baseline - 1 (padding from compute_bounding_rect).
413        // run.y = y_baseline.  So |run.y - bbox[1]| = 1.0; use tol = 6.0 to
414        // catch slight y discrepancies between streams.
415        // ------------------------------------------------------------------
416        let bbox_y = bbox[1];
417        let mut y_line: Vec<(usize, &lopdf::content::Operation)> = ops
418            .iter()
419            .enumerate()
420            .filter(|(idx, _)| {
421                op_to_y
422                    .get(idx)
423                    .map(|&y| (y - bbox_y).abs() <= 6.0)
424                    .unwrap_or(false)
425            })
426            .collect();
427        y_line.sort_by_key(|(idx, _)| *idx);
428
429        // Concatenate raw bytes for all ops on this Y-line, tracking which op
430        // contributed each byte so we can map match positions back to op indices.
431        let mut combined = String::new();
432        let mut byte_to_op: Vec<usize> = Vec::new();
433        for &(idx, op) in &y_line {
434            if let Some(raw) = raw_text_from_op(op) {
435                let before = combined.len(); // byte offset before push
436                combined.push_str(&raw);
437                // Each byte in the appended slice belongs to this op.
438                byte_to_op.extend(std::iter::repeat_n(idx, combined.len() - before));
439            }
440        }
441
442        let raw_matches = matcher.find_all(&combined);
443        if !raw_matches.is_empty() {
444            // Phase 1 succeeded: add only the ops that contain the matched bytes.
445            for m in &raw_matches {
446                for i in m.start..m.end {
447                    if let Some(&op_idx) = byte_to_op.get(i) {
448                        if !text_matched.contains(&op_idx) {
449                            to_add.insert(op_idx);
450                        }
451                    }
452                }
453            }
454            continue; // Skip Phase 2 for this bbox.
455        }
456
457        // ------------------------------------------------------------------
458        // Phase 2: X+Y spatial overlap (legacy fallback).
459        //
460        // Used when Phase 1 finds nothing, e.g. when the font's byte→glyph
461        // mapping means the raw Tj bytes do not spell the search word in Latin-1.
462        // ------------------------------------------------------------------
463        for run in runs {
464            if run_overlaps_single_bbox(run, bbox) {
465                for idx in run.ops_range.clone() {
466                    to_add.insert(idx);
467                }
468            }
469        }
470    }
471
472    for idx in to_add {
473        if !text_matched.contains(&idx) {
474            indices_to_remove.push(idx);
475        }
476    }
477}
478
479fn compute_bounding_rect(chars: &[pdf_extract::PositionedChar]) -> [f64; 4] {
480    let mut x0 = f64::MAX;
481    let mut y0 = f64::MAX;
482    let mut x1 = f64::MIN;
483    let mut y1 = f64::MIN;
484
485    for ch in chars {
486        x0 = x0.min(ch.bbox[0]);
487        y0 = y0.min(ch.bbox[1]);
488        x1 = x1.max(ch.bbox[2]);
489        y1 = y1.max(ch.bbox[3]);
490    }
491
492    // Add small padding to ensure complete coverage.
493    [x0 - 1.0, y0 - 1.0, x1 + 1.0, y1 + 1.0]
494}
495
496// ---------------------------------------------------------------------------
497// Content stream surgery
498// ---------------------------------------------------------------------------
499
500/// Remove text-showing operations whose decoded text matches the pattern.
501///
502/// Processes both the page's direct content stream and any Form XObjects
503/// referenced in the page's Resources.
504///
505/// `match_bboxes` contains bounding rectangles (in page space) of all matches
506/// found on this page by the `pdf_extract`-based search.  These are used as a
507/// fallback when decoded text is unreadable (e.g. glyph-indexed fonts without
508/// a ToUnicode CMap): in that case we fall back to spatial matching, removing
509/// every text-showing op whose position overlaps one of the match bboxes.
510fn remove_text_ops_for_page(
511    doc: &mut Document,
512    page_num: u32,
513    pattern: &str,
514    options: &RedactSearchOptions,
515    match_bboxes: &[[f64; 4]],
516) -> Result<usize> {
517    let fonts = match pdf_manip::text_run::FontMap::from_page(doc, page_num) {
518        Ok(f) => f,
519        Err(_) => return Ok(0),
520    };
521
522    let matcher = build_matcher(pattern, options)?;
523
524    // Shared visited set: prevents re-processing the same stream object across
525    // XObjects and annotation AP streams (avoids diamond-DAG / cycle blowup).
526    let mut visited: HashSet<ObjectId> = HashSet::new();
527
528    // Try normal ContentEditor path first.  Falls back to inline-image-aware
529    // path when the content stream contains BI…EI binary image data that
530    // lopdf's decoder cannot handle.
531    let removed = match pdf_manip::content_editor::editor_for_page(doc, page_num) {
532        Ok(editor) => {
533            remove_text_ops_via_editor(doc, page_num, editor, &matcher, &fonts, match_bboxes)?
534        }
535        Err(_) => {
536            remove_text_ops_with_inline_images(doc, page_num, &matcher, &fonts, match_bboxes)?
537        }
538    };
539
540    // Also process Form XObjects referenced in the page's Resources.
541    // Pass match_bboxes so that XObjects whose font encoding prevents text
542    // matching can still be cleaned via the spatial fallback. Fixes #466 bugs 5–6.
543    let removed = removed
544        + remove_text_ops_from_xobjects(
545            doc,
546            page_num,
547            &matcher,
548            &fonts,
549            match_bboxes,
550            &mut visited,
551        )?;
552
553    // Also process annotation appearance streams.  Pass match_bboxes so the
554    // raw-byte fallback can fire for AP streams with misleading ToUnicode CMaps.
555    let removed = removed
556        + remove_text_ops_from_annotations(
557            doc,
558            page_num,
559            &matcher,
560            &fonts,
561            match_bboxes,
562            &mut visited,
563        )?;
564
565    Ok(removed)
566}
567
568/// Normal content-edit path: parse → find matches → remove → write back.
569fn remove_text_ops_via_editor(
570    doc: &mut Document,
571    page_num: u32,
572    editor: pdf_manip::content_editor::ContentEditor,
573    matcher: &TextMatcher,
574    fonts: &pdf_manip::text_run::FontMap,
575    match_bboxes: &[[f64; 4]],
576) -> Result<usize> {
577    let runs = pdf_manip::text_run::extract_text_runs(&editor, fonts);
578
579    let mut indices_to_remove: Vec<usize> = Vec::new();
580    for run in &runs {
581        if !matcher.find_all(&run.text).is_empty() {
582            for idx in run.ops_range.clone() {
583                indices_to_remove.push(idx);
584            }
585        }
586    }
587
588    // Per-bbox combined fallback: for each match bbox not covered by a
589    // text-matched run, use Y-line raw-byte matching (Phase 1) to handle
590    // split words and x-coordinate drift, then fall back to spatial overlap
591    // (Phase 2) when the Y-line match finds nothing.
592    if !match_bboxes.is_empty() {
593        apply_per_bbox_combined_fallback(
594            &runs,
595            &mut indices_to_remove,
596            match_bboxes,
597            editor.operations(),
598            matcher,
599        );
600    }
601
602    // Raw-byte fallback: handles cases where extract_text_runs decodes a font
603    // differently from extract_positioned_chars, causing the word to be found
604    // during bbox computation but missed by both text-matching and spatial
605    // matching on the content stream.  Fixes #476.
606    // Also handles words split across adjacent TJ operators by joining them
607    // on the same Y-line before matching. Fixes #654.
608    if !match_bboxes.is_empty() {
609        let mut text_matched_set: HashSet<usize> = indices_to_remove.iter().cloned().collect();
610
611        // Build op_index → run.y map for Y-line lookups.
612        let op_to_y: HashMap<usize, f64> = runs
613            .iter()
614            .flat_map(|run| run.ops_range.clone().map(move |i| (i, run.y)))
615            .collect();
616
617        let ops = editor.operations();
618
619        // Group ops by Y-line (within tolerance).
620        let mut ops_by_y: HashMap<i64, Vec<usize>> = HashMap::new();
621        for (idx, _) in ops.iter().enumerate() {
622            if let Some(&y) = op_to_y.get(&idx) {
623                let y_bucket = (y * 10.0).round() as i64;
624                ops_by_y.entry(y_bucket).or_default().push(idx);
625            }
626        }
627
628        for (_, mut op_indices) in ops_by_y {
629            op_indices.sort();
630            let mut combined = String::new();
631            let mut byte_to_op: Vec<usize> = Vec::new();
632
633            for &idx in &op_indices {
634                if text_matched_set.contains(&idx) {
635                    continue;
636                }
637                if let Some(raw) = raw_text_from_op(&ops[idx]) {
638                    let before = combined.len();
639                    combined.push_str(&raw);
640                    byte_to_op.extend(std::iter::repeat_n(idx, combined.len() - before));
641                }
642            }
643
644            if combined.is_empty() {
645                continue;
646            }
647
648            let matches = matcher.find_all(&combined);
649            for m in matches {
650                for i in m.start..m.end {
651                    if let Some(&op_idx) = byte_to_op.get(i) {
652                        if !text_matched_set.contains(&op_idx) {
653                            indices_to_remove.push(op_idx);
654                            text_matched_set.insert(op_idx);
655                        }
656                    }
657                }
658            }
659        }
660    }
661
662    if indices_to_remove.is_empty() {
663        return Ok(0);
664    }
665
666    indices_to_remove.sort_unstable();
667    indices_to_remove.dedup();
668
669    let mut new_editor = editor;
670    for &idx in indices_to_remove.iter().rev() {
671        new_editor.remove_range(idx..idx + 1);
672    }
673
674    let removed = indices_to_remove.len();
675    pdf_manip::content_editor::write_editor_to_page(doc, page_num, &new_editor)
676        .map_err(|e| RedactError::Other(format!("write content: {e}")))?;
677
678    Ok(removed)
679}
680
681/// Fallback path for pages whose content stream contains inline images
682/// (`BI … EI`).  Strips the images, edits the remaining text ops, then
683/// reconstructs the stream with the images prepended so z-order is preserved.
684fn remove_text_ops_with_inline_images(
685    doc: &mut Document,
686    page_num: u32,
687    matcher: &TextMatcher,
688    fonts: &pdf_manip::text_run::FontMap,
689    match_bboxes: &[[f64; 4]],
690) -> Result<usize> {
691    let pages = doc.get_pages();
692    let &page_id = match pages.get(&page_num) {
693        Some(id) => id,
694        None => return Ok(0),
695    };
696
697    // Decompressed concatenation of all content streams for this page.
698    let content_bytes = match doc.get_page_content(page_id) {
699        Ok(b) => b,
700        Err(_) => return Ok(0),
701    };
702
703    // Separate inline images from the rest.
704    let (stripped, inline_images) = pdf_manip::content_editor::strip_inline_images(&content_bytes);
705
706    let editor = match pdf_manip::content_editor::ContentEditor::from_stream(&stripped) {
707        Ok(e) => e,
708        Err(_) => return Ok(0),
709    };
710
711    let runs = pdf_manip::text_run::extract_text_runs(&editor, fonts);
712
713    let mut indices_to_remove: Vec<usize> = Vec::new();
714    for run in &runs {
715        if !matcher.find_all(&run.text).is_empty() {
716            for idx in run.ops_range.clone() {
717                indices_to_remove.push(idx);
718            }
719        }
720    }
721
722    // Per-bbox combined fallback (same logic as in remove_text_ops_via_editor).
723    if !match_bboxes.is_empty() {
724        apply_per_bbox_combined_fallback(
725            &runs,
726            &mut indices_to_remove,
727            match_bboxes,
728            editor.operations(),
729            matcher,
730        );
731    }
732
733    // Raw-byte fallback (same logic as in remove_text_ops_via_editor).
734    if !match_bboxes.is_empty() {
735        let mut text_matched_set: HashSet<usize> = indices_to_remove.iter().cloned().collect();
736        let ops = editor.operations();
737        for (idx, op) in ops.iter().enumerate() {
738            if text_matched_set.contains(&idx) {
739                continue;
740            }
741            if let Some(raw_text) = raw_text_from_op(op) {
742                if !matcher.find_all(&raw_text).is_empty() {
743                    indices_to_remove.push(idx);
744                    text_matched_set.insert(idx);
745                }
746            }
747        }
748    }
749
750    if indices_to_remove.is_empty() {
751        return Ok(0);
752    }
753
754    indices_to_remove.sort_unstable();
755    indices_to_remove.dedup();
756
757    let mut new_editor = editor;
758    for &idx in indices_to_remove.iter().rev() {
759        new_editor.remove_range(idx..idx + 1);
760    }
761
762    let removed = indices_to_remove.len();
763
764    // Re-encode the edited ops and prepend the raw inline image blobs so the
765    // visual z-order is preserved (images were originally before the text).
766    let re_encoded = new_editor
767        .encode()
768        .map_err(|e| RedactError::Other(format!("encode: {e}")))?;
769
770    let mut final_content = Vec::new();
771    for img in &inline_images {
772        final_content.extend_from_slice(img);
773        final_content.push(b'\n');
774    }
775    final_content.extend_from_slice(&re_encoded);
776
777    // Compress if it helps.
778    let compressed = {
779        let mut enc = flate2::write::ZlibEncoder::new(Vec::new(), flate2::Compression::default());
780        use std::io::Write as _;
781        if enc.write_all(&final_content).is_ok() {
782            enc.finish().unwrap_or_else(|_| final_content.clone())
783        } else {
784            final_content.clone()
785        }
786    };
787    let (stream_bytes, use_flate) = if compressed.len() < final_content.len() {
788        (compressed, true)
789    } else {
790        (final_content, false)
791    };
792
793    // Write to the first content stream and collapse multiple streams into one.
794    let content_ids = pdf_manip::content_editor::get_content_stream_ids(doc, page_id);
795    if let Some(&first_id) = content_ids.first() {
796        if let Ok(Object::Stream(ref mut s)) = doc.get_object_mut(first_id) {
797            s.content = stream_bytes;
798            if use_flate {
799                s.dict.set("Filter", Object::Name(b"FlateDecode".to_vec()));
800            } else {
801                s.dict.remove(b"Filter");
802            }
803            s.dict
804                .set("Length", Object::Integer(s.content.len() as i64));
805        }
806        if content_ids.len() > 1 {
807            if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
808                page_dict.set("Contents", Object::Reference(first_id));
809            }
810        }
811    }
812
813    Ok(removed)
814}
815
816/// Find and process Form XObjects in the page's Resources/XObject dictionary.
817///
818/// Delegates to `remove_text_ops_from_stream` for each top-level XObject so
819/// that:
820/// - Each XObject's own Resources/Font dict is used for correct CMap decoding.
821/// - Nested Form XObjects (`Do` inside an XObject) are handled recursively.
822///
823/// `match_bboxes` are forwarded so that XObjects with non-decodable font
824/// encodings can still be cleaned via the spatial fallback.
825///
826/// Fixes #457: previously this function used the page-level FontMap and did
827/// not recurse into nested XObjects, leaving redacted text extractable when it
828/// resided in a Form XObject hierarchy.
829fn remove_text_ops_from_xobjects(
830    doc: &mut Document,
831    page_num: u32,
832    matcher: &TextMatcher,
833    fonts: &pdf_manip::text_run::FontMap,
834    match_bboxes: &[[f64; 4]],
835    visited: &mut HashSet<ObjectId>,
836) -> Result<usize> {
837    let pages = doc.get_pages();
838    let &page_id = match pages.get(&page_num) {
839        Some(id) => id,
840        None => return Ok(0),
841    };
842
843    let xobject_ids = collect_form_xobject_ids(doc, page_id);
844    if xobject_ids.is_empty() {
845        return Ok(0);
846    }
847
848    let mut total_removed = 0;
849    for xobj_id in xobject_ids {
850        total_removed +=
851            remove_text_ops_from_stream(doc, xobj_id, matcher, fonts, match_bboxes, visited)?;
852    }
853    Ok(total_removed)
854}
855
856/// Remove matching text ops from annotation appearance streams on a page.
857///
858/// `match_bboxes` are passed to `remove_text_ops_from_stream` to enable the
859/// raw-byte fallback for AP streams whose ToUnicode CMap decodes to unexpected
860/// characters.  Spatial matching is still disabled for AP streams (their
861/// coordinate space is local, not page space), but raw-byte matching only
862/// requires the word to have been found somewhere on the page (non-empty bboxes).
863fn remove_text_ops_from_annotations(
864    doc: &mut Document,
865    page_num: u32,
866    matcher: &TextMatcher,
867    fonts: &pdf_manip::text_run::FontMap,
868    match_bboxes: &[[f64; 4]],
869    visited: &mut HashSet<ObjectId>,
870) -> Result<usize> {
871    let pages = doc.get_pages();
872    let &page_id = match pages.get(&page_num) {
873        Some(id) => id,
874        None => return Ok(0),
875    };
876
877    // Collect appearance stream IDs from annotations.
878    let ap_stream_ids = collect_annotation_appearance_ids(doc, page_id);
879    if ap_stream_ids.is_empty() {
880        return Ok(0);
881    }
882
883    let mut total_removed = 0;
884    for stream_id in ap_stream_ids {
885        total_removed +=
886            remove_text_ops_from_stream(doc, stream_id, matcher, fonts, match_bboxes, visited)?;
887    }
888
889    Ok(total_removed)
890}
891
892/// Remove matching text operations from a single stream object.
893///
894/// Builds a stream-local FontMap from the stream's own Resources/Font dict
895/// (merged with the caller-supplied page-level `page_fonts` as fallback) so
896/// that CMap decoding is correct for Form XObjects and AP streams that define
897/// their own font resources.
898///
899/// `match_bboxes` are page-space bounding rectangles from the pdf_extract pass.
900/// Used for the per-bbox spatial fallback (effective when XObject runs are in
901/// page space) and as a guard for the raw-byte fallback (fires when non-empty,
902/// confirming the word was found on this page).
903fn remove_text_ops_from_stream(
904    doc: &mut Document,
905    stream_id: ObjectId,
906    matcher: &TextMatcher,
907    page_fonts: &pdf_manip::text_run::FontMap,
908    match_bboxes: &[[f64; 4]],
909    visited: &mut HashSet<ObjectId>,
910) -> Result<usize> {
911    // Guard against cycles and diamond-DAG re-processing: if we have already
912    // visited this stream in the current page pass, skip it (#OOM-002874).
913    if !visited.insert(stream_id) {
914        return Ok(0);
915    }
916
917    let content_bytes = match doc.get_object(stream_id) {
918        Ok(Object::Stream(ref s)) => {
919            let mut stream = s.clone();
920            let _ = stream.decompress();
921            stream.content.clone()
922        }
923        _ => return Ok(0),
924    };
925
926    let editor = match pdf_manip::content_editor::ContentEditor::from_stream(&content_bytes) {
927        Ok(e) => e,
928        Err(_) => return Ok(0),
929    };
930
931    // Use the stream's own font resources so CMap decoding is correct.
932    // Fixes #457: XObjects/AP streams often define fonts not present on the page.
933    let stream_fonts =
934        pdf_manip::text_run::FontMap::from_xobject_stream(doc, stream_id, page_fonts);
935    let fonts = &stream_fonts;
936
937    let runs = pdf_manip::text_run::extract_text_runs(&editor, fonts);
938
939    let mut indices_to_remove: Vec<usize> = Vec::new();
940    for run in &runs {
941        if !matcher.find_all(&run.text).is_empty() {
942            for idx in run.ops_range.clone() {
943                indices_to_remove.push(idx);
944            }
945        }
946    }
947
948    // Per-bbox combined fallback: Y-line raw-byte match (Phase 1) + spatial
949    // overlap fallback (Phase 2).  Note: XObject run positions are in local
950    // space, not page space, so spatial matching is most effective for
951    // XObjects without CTM transforms.  Fixes #466 bugs 5–6.
952    if !match_bboxes.is_empty() {
953        apply_per_bbox_combined_fallback(
954            &runs,
955            &mut indices_to_remove,
956            match_bboxes,
957            editor.operations(),
958            matcher,
959        );
960    }
961
962    // Raw-byte fallback: handles cases where extract_text_runs decodes a font
963    // differently from extract_positioned_chars, causing the word to be found
964    // during bbox computation but missed by both text-matching and spatial
965    // matching on the content stream.  Fixes #476.
966    // Also handles words split across adjacent TJ operators by joining them
967    // on the same Y-line before matching. Fixes #654.
968    if !match_bboxes.is_empty() {
969        let mut text_matched_set: HashSet<usize> = indices_to_remove.iter().cloned().collect();
970
971        // Build op_index → run.y map for Y-line lookups.
972        let op_to_y: HashMap<usize, f64> = runs
973            .iter()
974            .flat_map(|run| run.ops_range.clone().map(move |i| (i, run.y)))
975            .collect();
976
977        let ops = editor.operations();
978
979        // Group ops by Y-line (within tolerance).
980        let mut ops_by_y: HashMap<i64, Vec<usize>> = HashMap::new();
981        for (idx, _) in ops.iter().enumerate() {
982            if let Some(&y) = op_to_y.get(&idx) {
983                let y_bucket = (y * 10.0).round() as i64;
984                ops_by_y.entry(y_bucket).or_default().push(idx);
985            }
986        }
987
988        for (_, mut op_indices) in ops_by_y {
989            op_indices.sort();
990            let mut combined = String::new();
991            let mut byte_to_op: Vec<usize> = Vec::new();
992
993            for &idx in &op_indices {
994                if text_matched_set.contains(&idx) {
995                    continue;
996                }
997                if let Some(raw) = raw_text_from_op(&ops[idx]) {
998                    let before = combined.len();
999                    combined.push_str(&raw);
1000                    byte_to_op.extend(std::iter::repeat_n(idx, combined.len() - before));
1001                }
1002            }
1003
1004            if combined.is_empty() {
1005                continue;
1006            }
1007
1008            let matches = matcher.find_all(&combined);
1009            for m in matches {
1010                for i in m.start..m.end {
1011                    if let Some(&op_idx) = byte_to_op.get(i) {
1012                        if !text_matched_set.contains(&op_idx) {
1013                            indices_to_remove.push(op_idx);
1014                            text_matched_set.insert(op_idx);
1015                        }
1016                    }
1017                }
1018            }
1019        }
1020    }
1021
1022    if indices_to_remove.is_empty() {
1023        // Check for nested Form XObjects within this stream (e.g., signature appearances).
1024        let nested_ids = collect_nested_form_xobjects(doc, stream_id);
1025        let mut nested_removed = 0;
1026        for nested_id in nested_ids {
1027            nested_removed +=
1028                remove_text_ops_from_stream(doc, nested_id, matcher, fonts, match_bboxes, visited)?;
1029        }
1030        return Ok(nested_removed);
1031    }
1032
1033    indices_to_remove.sort_unstable();
1034    indices_to_remove.dedup();
1035
1036    let mut new_editor = editor;
1037    for &idx in indices_to_remove.iter().rev() {
1038        new_editor.remove_range(idx..idx + 1);
1039    }
1040
1041    let removed = indices_to_remove.len();
1042
1043    let encoded = new_editor
1044        .encode()
1045        .map_err(|e| RedactError::Other(format!("encode annotation stream: {e}")))?;
1046
1047    if let Ok(Object::Stream(ref mut s)) = doc.get_object_mut(stream_id) {
1048        s.dict.remove(b"Filter");
1049        s.content = encoded;
1050        s.dict
1051            .set("Length", Object::Integer(s.content.len() as i64));
1052    }
1053
1054    // Also recurse into nested Form XObjects.
1055    let nested_ids = collect_nested_form_xobjects(doc, stream_id);
1056    let mut nested_removed = removed;
1057    for nested_id in nested_ids {
1058        nested_removed +=
1059            remove_text_ops_from_stream(doc, nested_id, matcher, fonts, match_bboxes, visited)?;
1060    }
1061
1062    Ok(nested_removed)
1063}
1064
1065/// Collect appearance stream IDs from page annotations.
1066fn collect_annotation_appearance_ids(doc: &Document, page_id: ObjectId) -> Vec<ObjectId> {
1067    let mut result = Vec::new();
1068
1069    let page_dict = match doc.get_object(page_id) {
1070        Ok(Object::Dictionary(ref d)) => d.clone(),
1071        _ => return result,
1072    };
1073
1074    let annots = match page_dict.get(b"Annots") {
1075        Ok(Object::Array(ref arr)) => arr.clone(),
1076        Ok(Object::Reference(id)) => match doc.get_object(*id) {
1077            Ok(Object::Array(ref arr)) => arr.clone(),
1078            _ => return result,
1079        },
1080        _ => return result,
1081    };
1082
1083    for annot_ref in &annots {
1084        let annot_id = match annot_ref {
1085            Object::Reference(id) => *id,
1086            _ => continue,
1087        };
1088
1089        let annot_dict = match doc.get_object(annot_id) {
1090            Ok(Object::Dictionary(ref d)) => d.clone(),
1091            _ => continue,
1092        };
1093
1094        // Get the AP (appearance) dictionary.
1095        let ap_dict = match annot_dict.get(b"AP") {
1096            Ok(Object::Dictionary(ref d)) => d.clone(),
1097            Ok(Object::Reference(id)) => match doc.get_object(*id) {
1098                Ok(Object::Dictionary(ref d)) => d.clone(),
1099                _ => continue,
1100            },
1101            _ => continue,
1102        };
1103
1104        // Get the N (normal appearance) stream.
1105        match ap_dict.get(b"N") {
1106            Ok(Object::Reference(id)) => {
1107                result.push(*id);
1108            }
1109            Ok(Object::Dictionary(ref d)) => {
1110                // Some annotations have a dict of appearance states.
1111                for (_key, val) in d.iter() {
1112                    if let Object::Reference(id) = val {
1113                        result.push(*id);
1114                    }
1115                }
1116            }
1117            _ => {}
1118        }
1119    }
1120
1121    result
1122}
1123
1124/// Collect Form XObject IDs referenced within a stream's Resources or content.
1125fn collect_nested_form_xobjects(doc: &Document, stream_id: ObjectId) -> Vec<ObjectId> {
1126    let mut result = Vec::new();
1127
1128    let stream_dict = match doc.get_object(stream_id) {
1129        Ok(Object::Stream(ref s)) => s.dict.clone(),
1130        _ => return result,
1131    };
1132
1133    // Check the stream's own Resources/XObject dict.
1134    let resources = match stream_dict.get(b"Resources") {
1135        Ok(Object::Dictionary(ref d)) => d.clone(),
1136        Ok(Object::Reference(id)) => match doc.get_object(*id) {
1137            Ok(Object::Dictionary(ref d)) => d.clone(),
1138            _ => return result,
1139        },
1140        _ => return result,
1141    };
1142
1143    let xobject_dict = match resources.get(b"XObject") {
1144        Ok(Object::Dictionary(ref d)) => d.clone(),
1145        Ok(Object::Reference(id)) => match doc.get_object(*id) {
1146            Ok(Object::Dictionary(ref d)) => d.clone(),
1147            _ => return result,
1148        },
1149        _ => return result,
1150    };
1151
1152    for (_key, value) in xobject_dict.iter() {
1153        let obj_id = match value {
1154            Object::Reference(id) => *id,
1155            _ => continue,
1156        };
1157        // Only include Form XObjects, not images.
1158        if let Ok(Object::Stream(ref s)) = doc.get_object(obj_id) {
1159            let is_form = s
1160                .dict
1161                .get(b"Subtype")
1162                .ok()
1163                .and_then(|v| match v {
1164                    Object::Name(ref n) => Some(n.as_slice()),
1165                    _ => None,
1166                })
1167                .map(|n| n == b"Form")
1168                .unwrap_or(false);
1169            if is_form {
1170                result.push(obj_id);
1171            }
1172        }
1173    }
1174
1175    result
1176}
1177
1178/// Collect ObjectIds of Form XObjects from a page's Resources/XObject dictionary.
1179fn collect_form_xobject_ids(doc: &Document, page_id: ObjectId) -> Vec<ObjectId> {
1180    let mut result = Vec::new();
1181
1182    let page_dict = match doc.get_object(page_id) {
1183        Ok(Object::Dictionary(ref d)) => d.clone(),
1184        _ => return result,
1185    };
1186
1187    let resources = match page_dict.get(b"Resources") {
1188        Ok(Object::Dictionary(ref d)) => d.clone(),
1189        Ok(Object::Reference(id)) => match doc.get_object(*id) {
1190            Ok(Object::Dictionary(ref d)) => d.clone(),
1191            _ => return result,
1192        },
1193        _ => return result,
1194    };
1195
1196    let xobject_dict = match resources.get(b"XObject") {
1197        Ok(Object::Dictionary(ref d)) => d.clone(),
1198        Ok(Object::Reference(id)) => match doc.get_object(*id) {
1199            Ok(Object::Dictionary(ref d)) => d.clone(),
1200            _ => return result,
1201        },
1202        _ => return result,
1203    };
1204
1205    for (_key, value) in xobject_dict.iter() {
1206        let obj_id = match value {
1207            Object::Reference(id) => *id,
1208            _ => continue,
1209        };
1210
1211        // Check if it's a Form XObject (Subtype == Form).
1212        if let Ok(Object::Stream(ref s)) = doc.get_object(obj_id) {
1213            let is_form = s
1214                .dict
1215                .get(b"Subtype")
1216                .ok()
1217                .and_then(|v| match v {
1218                    Object::Name(ref n) => Some(n.as_slice()),
1219                    _ => None,
1220                })
1221                .map(|n| n == b"Form")
1222                .unwrap_or(false);
1223            if is_form {
1224                result.push(obj_id);
1225            }
1226        }
1227    }
1228
1229    result
1230}
1231
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{dictionary, Document, Object, Stream};

    /// Hang `page_id` under a fresh one-page `Pages` node, point the page's
    /// `Parent` at it, and register a catalog as the trailer's `Root`.
    fn attach_page_tree(doc: &mut Document, page_id: ObjectId) {
        let pages_dict = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![Object::Reference(page_id)],
            "Count" => 1_i64,
        };
        let pages_id = doc.add_object(Object::Dictionary(pages_dict));

        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
            d.set("Parent", Object::Reference(pages_id));
        }

        let catalog = dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        };
        let catalog_id = doc.add_object(Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));
    }

    /// Report whether the stored bytes of stream `id` contain `needle`.
    ///
    /// The comparison is done on bytes, not on a UTF-8 view, so content that
    /// is not valid UTF-8 cannot make the check pass vacuously.  `needle`
    /// must be non-empty.
    fn stream_contains(doc: &Document, id: ObjectId, needle: &[u8]) -> bool {
        match doc.get_object(id) {
            Ok(Object::Stream(s)) => s.content.windows(needle.len()).any(|w| w == needle),
            _ => panic!("XObject is not a stream after redaction"),
        }
    }

    /// Build a one-page document whose single content stream is `content`.
    ///
    /// The page has one font resource, `F1` (Helvetica), and the trailer
    /// carries an `Info` dictionary so metadata cleaning can be observed.
    fn make_doc_with_text(content: &[u8]) -> Document {
        let mut doc = Document::with_version("1.7");

        let font = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        };
        let font_id = doc.add_object(Object::Dictionary(font));
        let font_resources = dictionary! {
            "F1" => Object::Reference(font_id),
        };
        let resources = dictionary! {
            "Font" => Object::Dictionary(font_resources),
        };

        let content_stream = Stream::new(dictionary! {}, content.to_vec());
        let content_id = doc.add_object(Object::Stream(content_stream));

        let page_dict = dictionary! {
            "Type" => "Page",
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            "Contents" => Object::Reference(content_id),
            "Resources" => Object::Dictionary(resources),
        };
        let page_id = doc.add_object(Object::Dictionary(page_dict));

        attach_page_tree(&mut doc, page_id);

        let info = dictionary! {
            "Title" => Object::String(b"Test".to_vec(), lopdf::StringFormat::Literal),
        };
        let info_id = doc.add_object(Object::Dictionary(info));
        doc.trailer.set("Info", Object::Reference(info_id));

        doc
    }

    #[test]
    fn search_and_redact_exact_match() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (Secret Data) Tj ET");
        let opts = RedactSearchOptions::default();
        let report = search_and_redact(&mut doc, "Secret", &opts).unwrap();
        assert!(report.matches_found >= 1);
        assert!(report.areas_redacted >= 1);
    }

    #[test]
    fn search_and_redact_no_match() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET");
        let opts = RedactSearchOptions::default();
        let report = search_and_redact(&mut doc, "Missing", &opts).unwrap();
        assert_eq!(report.matches_found, 0);
        assert_eq!(report.areas_redacted, 0);
    }

    #[test]
    fn search_and_redact_case_insensitive() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (Secret Data) Tj ET");
        let opts = RedactSearchOptions::case_insensitive();
        let report = search_and_redact(&mut doc, "secret", &opts).unwrap();
        assert!(report.matches_found >= 1);
    }

    #[test]
    fn search_and_redact_regex() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (SSN 123-45-6789) Tj ET");
        let opts = RedactSearchOptions::with_regex();
        let report = search_and_redact(&mut doc, r"\d{3}-\d{2}-\d{4}", &opts).unwrap();
        assert!(report.matches_found >= 1);
    }

    #[test]
    fn search_and_redact_with_overlay() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (Confidential) Tj ET");
        let opts = RedactSearchOptions::default().overlay_text("[REDACTED]");
        let report = search_and_redact(&mut doc, "Confidential", &opts).unwrap();
        assert!(report.matches_found >= 1);
    }

    #[test]
    fn search_and_redact_specific_pages() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (Secret) Tj ET");
        let opts = RedactSearchOptions::default().pages(vec![1]);
        let report = search_and_redact(&mut doc, "Secret", &opts).unwrap();
        assert!(report.matches_found >= 1);
    }

    #[test]
    fn search_and_redact_page_out_of_range() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (Hello) Tj ET");
        let opts = RedactSearchOptions::default().pages(vec![5]);
        let result = search_and_redact(&mut doc, "Hello", &opts);
        assert!(result.is_err());
    }

    #[test]
    fn search_and_redact_cleans_metadata() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (Secret) Tj ET");
        let opts = RedactSearchOptions::default();
        let report = search_and_redact(&mut doc, "Secret", &opts).unwrap();
        assert!(report.metadata_cleaned);
        assert!(doc.trailer.get(b"Info").is_err());
    }

    #[test]
    fn search_and_redact_custom_color() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (Secret) Tj ET");
        let opts = RedactSearchOptions::default().fill_color(1.0, 0.0, 0.0);
        let report = search_and_redact(&mut doc, "Secret", &opts).unwrap();
        assert!(report.matches_found >= 1);
    }

    /// Build a document where the word "Classified" lives only inside a Form
    /// XObject — not in the page's own content stream.  The XObject has its
    /// own Resources/Font dictionary (`FX`, Times-Roman) that differs from
    /// the page-level one (`F1`, Helvetica).
    ///
    /// Returns the document together with the id of the Form XObject so
    /// tests can inspect its bytes directly.  This is the regression fixture
    /// for #457, where the page-level FontMap was used for XObject content.
    fn make_doc_with_xobject_text() -> (Document, ObjectId) {
        let mut doc = Document::with_version("1.7");

        // Font defined only in the XObject's own Resources (not on the page).
        let xobj_font = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Times-Roman",
        };
        let xobj_font_id = doc.add_object(Object::Dictionary(xobj_font));
        let xobj_font_res = dictionary! { "FX" => Object::Reference(xobj_font_id) };
        let xobj_resources = dictionary! {
            "Font" => Object::Dictionary(xobj_font_res),
        };

        // Form XObject stream containing the sensitive text.
        let xobj_content = b"BT /FX 12 Tf 0 0 Td (Classified) Tj ET".to_vec();
        let xobj_stream = Stream::new(
            dictionary! {
                "Type" => "XObject",
                "Subtype" => "Form",
                "BBox" => vec![0.into(), 0.into(), 300_i64.into(), 20_i64.into()],
                "Resources" => Object::Dictionary(xobj_resources),
            },
            xobj_content,
        );
        let xobj_id = doc.add_object(Object::Stream(xobj_stream));

        // Page has a different font (F1/Helvetica) but no reference to FX.
        let page_font = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        };
        let page_font_id = doc.add_object(Object::Dictionary(page_font));
        let page_font_res = dictionary! { "F1" => Object::Reference(page_font_id) };
        let xobj_map = dictionary! { "Xobj1" => Object::Reference(xobj_id) };
        let page_resources = dictionary! {
            "Font" => Object::Dictionary(page_font_res),
            "XObject" => Object::Dictionary(xobj_map),
        };

        // Page content only invokes the XObject — no direct Tj operators.
        let page_content = b"q 1 0 0 1 100 700 cm /Xobj1 Do Q".to_vec();
        let content_stream = Stream::new(dictionary! {}, page_content);
        let content_id = doc.add_object(Object::Stream(content_stream));

        let page_dict = dictionary! {
            "Type" => "Page",
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            "Contents" => Object::Reference(content_id),
            "Resources" => Object::Dictionary(page_resources),
        };
        let page_id = doc.add_object(Object::Dictionary(page_dict));

        attach_page_tree(&mut doc, page_id);

        (doc, xobj_id)
    }

    /// Verify that text inside a Form XObject is removed from the XObject's
    /// content stream after redaction (Fixes #457).
    ///
    /// We check the raw bytes of the XObject stream directly rather than
    /// going through text extraction, so the test does not depend on
    /// pdf_extract being able to parse this minimal synthetic PDF.
    #[test]
    fn redact_removes_text_from_xobject_stream() {
        let (mut doc, xobj_id) = make_doc_with_xobject_text();

        // The caller-level FontMap is empty, so it does not know "FX"; the
        // stream-level helper must pick the font up from the XObject's own
        // Resources in order to decode the text.
        let page_fonts = pdf_manip::text_run::FontMap::empty();
        let matcher_opts = RedactSearchOptions::default();
        let matcher = build_matcher("Classified", &matcher_opts).unwrap();

        // Empty bboxes: only the decoded-text match can remove anything.
        let removed = remove_text_ops_from_stream(
            &mut doc,
            xobj_id,
            &matcher,
            &page_fonts,
            &[],
            &mut HashSet::new(),
        )
        .unwrap();

        assert!(
            removed > 0,
            "Expected at least one op removed from XObject stream, got 0"
        );

        // Confirm the raw bytes of the XObject no longer contain the literal.
        assert!(
            !stream_contains(&doc, xobj_id, b"Classified"),
            "XObject stream still contains 'Classified' after redaction"
        );
    }

    /// When the target word is split across multiple Tj ops (e.g. "(LI) Tj (C) Tj")
    /// and another occurrence of the word is text-matched first (making
    /// `indices_to_remove` non-empty), the per-bbox spatial fallback must still
    /// fire independently for each match_bbox that isn't covered.  Fixes #463
    /// edge case 'LIC'.
    #[test]
    fn redact_split_token_per_bbox_spatial_fallback() {
        // "ALICE" contains "LIC" → text-match succeeds for that occurrence.
        // "(LI) Tj (C) Tj" is a split-token occurrence of "LIC" at a different
        // position; the old global-empty guard would have skipped the spatial
        // fallback because indices_to_remove was already non-empty from ALICE.
        let content = b"BT /F1 12 Tf 0 700 Td (ALICE) Tj 200 0 Td (LI) Tj 30 0 Td (C) Tj ET";
        let mut doc = make_doc_with_text(content);
        let opts = RedactSearchOptions::default();
        let report = search_and_redact(&mut doc, "LIC", &opts).unwrap();
        assert!(report.matches_found >= 1);
        assert!(report.areas_redacted >= 1);
    }

    /// Exercise `remove_text_ops_from_stream` with a non-empty bbox list, the
    /// condition under which the per-bbox and raw-byte fallbacks run
    /// (regression coverage for #463 edge cases '270' / '000').
    ///
    /// NOTE(review): the fixture's font has no ToUnicode CMap, so the decoded
    /// text presumably matches as well; this test therefore checks that the
    /// fallback path runs and the text is gone, not that the fallback alone
    /// was responsible.
    #[test]
    fn redact_xobject_raw_byte_fallback() {
        let (mut doc, xobj_id) = make_doc_with_xobject_text();
        let page_fonts = pdf_manip::text_run::FontMap::empty();
        let matcher_opts = RedactSearchOptions::default();
        let matcher = build_matcher("Classified", &matcher_opts).unwrap();
        // Non-empty bboxes enable the fallback passes in
        // `remove_text_ops_from_stream`.
        let dummy_bboxes = [[0.0_f64, 0.0, 300.0, 20.0]];
        let removed = remove_text_ops_from_stream(
            &mut doc,
            xobj_id,
            &matcher,
            &page_fonts,
            &dummy_bboxes,
            &mut HashSet::new(),
        )
        .unwrap();
        assert!(
            removed > 0,
            "Expected raw-byte fallback to remove ops from XObject stream"
        );
        // A positive count alone does not prove the sensitive bytes are gone.
        assert!(
            !stream_contains(&doc, xobj_id, b"Classified"),
            "XObject stream still contains 'Classified' after redaction"
        );
    }
}
pdf_redact/search_redact.rs

pdf_redact/
search_redact.rs