1use crate::error::{RedactError, Result};
7use crate::redact::{RedactionArea, Redactor};
8use lopdf::{Document, Object, ObjectId};
9use regex::Regex;
10use std::collections::{HashMap, HashSet};
11
/// Settings that control how [`search_and_redact`] matches text and how the
/// resulting redaction areas are painted.
#[derive(Debug, Clone)]
pub struct RedactSearchOptions {
    /// When `false`, the pattern is compiled with the `(?i)` flag so matching
    /// ignores case.
    pub case_sensitive: bool,
    /// When `true`, the pattern is compiled as a regular expression as-is;
    /// otherwise it is escaped and matched literally.
    pub regex: bool,
    /// `[r, g, b]` components forwarded to every redaction area's fill colour.
    pub fill_color: [f64; 3],
    /// 1-based page numbers to search; `None` searches every page.
    pub pages: Option<Vec<u32>>,
    /// Optional text forwarded to every redaction area as its overlay.
    pub overlay_text: Option<String>,
}
26
27impl Default for RedactSearchOptions {
28 fn default() -> Self {
29 Self {
30 case_sensitive: true,
31 regex: false,
32 fill_color: [0.0, 0.0, 0.0],
33 pages: None,
34 overlay_text: None,
35 }
36 }
37}
38
39impl RedactSearchOptions {
40 pub fn exact(pattern: &str) -> Self {
42 let _ = pattern; Self::default()
44 }
45
46 pub fn case_insensitive() -> Self {
48 Self {
49 case_sensitive: false,
50 ..Self::default()
51 }
52 }
53
54 pub fn with_regex() -> Self {
56 Self {
57 regex: true,
58 ..Self::default()
59 }
60 }
61
62 pub fn fill_color(mut self, r: f64, g: f64, b: f64) -> Self {
64 self.fill_color = [r, g, b];
65 self
66 }
67
68 pub fn pages(mut self, pages: Vec<u32>) -> Self {
70 self.pages = Some(pages);
71 self
72 }
73
74 pub fn overlay_text(mut self, text: impl Into<String>) -> Self {
76 self.overlay_text = Some(text.into());
77 self
78 }
79}
80
/// Summary returned by [`search_and_redact`].
#[derive(Debug, Clone)]
pub struct SearchRedactReport {
    /// Number of pattern matches counted during the search.
    pub matches_found: usize,
    /// Number of areas redacted, as reported by the `Redactor`.
    pub areas_redacted: usize,
    /// Operations removed by the `Redactor` plus those removed by the
    /// follow-up text-operator pass.
    pub operations_removed: usize,
    /// Number of pages affected, as reported by the `Redactor`.
    pub pages_affected: usize,
    /// Whether the `Redactor` reported that metadata was cleaned.
    pub metadata_cleaned: bool,
    /// One `(page number, rectangle)` entry per redaction area that was marked.
    pub redacted_rects: Vec<(u32, [f64; 4])>,
}
98
99pub fn search_and_redact(
106 doc: &mut Document,
107 pattern: &str,
108 options: &RedactSearchOptions,
109) -> Result<SearchRedactReport> {
110 let pages = doc.get_pages();
111 let total = pages.len() as u32;
112
113 let page_range: Vec<u32> = match &options.pages {
114 Some(ps) => ps.clone(),
115 None => (1..=total).collect(),
116 };
117
118 for &p in &page_range {
120 if p == 0 || p > total {
121 return Err(RedactError::PageOutOfRange(p, total));
122 }
123 }
124
125 let matcher = build_matcher(pattern, options)?;
127
128 let mut all_areas: Vec<RedactionArea> = Vec::new();
130 let mut total_matches = 0;
131 let mut page_bboxes: std::collections::HashMap<u32, Vec<[f64; 4]>> =
133 std::collections::HashMap::new();
134
135 for &page_num in &page_range {
136 let chars = match pdf_extract::extract_positioned_chars(doc, page_num) {
137 Ok(c) => c,
138 Err(_) => continue,
139 };
140
141 if chars.is_empty() {
142 continue;
143 }
144
145 let text: String = chars.iter().map(|c| c.ch).collect();
147
148 let byte_to_char: Vec<usize> = {
151 let mut map = Vec::with_capacity(text.len() + 1);
152 for (ci, ch) in text.chars().enumerate() {
153 for _ in 0..ch.len_utf8() {
154 map.push(ci);
155 }
156 }
157 map.push(chars.len()); map
159 };
160
161 let match_ranges = matcher.find_all(&text);
163
164 for range in &match_ranges {
165 total_matches += 1;
166
167 let char_start = byte_to_char.get(range.start).copied().unwrap_or(0);
169 let char_end = byte_to_char.get(range.end).copied().unwrap_or(chars.len());
170 if char_start >= chars.len() || char_end > chars.len() || char_start >= char_end {
171 continue;
172 }
173
174 let matched_chars = &chars[char_start..char_end];
176 if matched_chars.is_empty() {
177 continue;
178 }
179
180 let bbox = compute_bounding_rect(matched_chars);
181
182 page_bboxes.entry(page_num).or_default().push(bbox);
184
185 let mut area = RedactionArea::new(page_num, bbox);
186 area = area.with_color(
187 options.fill_color[0],
188 options.fill_color[1],
189 options.fill_color[2],
190 );
191 if let Some(ref overlay) = options.overlay_text {
192 area = area.with_overlay(overlay);
193 }
194 all_areas.push(area);
195 }
196 }
197
198 if all_areas.is_empty() {
199 return Ok(SearchRedactReport {
200 matches_found: 0,
201 areas_redacted: 0,
202 operations_removed: 0,
203 pages_affected: 0,
204 metadata_cleaned: false,
205 redacted_rects: Vec::new(),
206 });
207 }
208
209 let redacted_rects: Vec<(u32, [f64; 4])> = all_areas.iter().map(|a| (a.page, a.rect)).collect();
211
212 let mut redactor = Redactor::new();
214 redactor.mark_all(all_areas);
215 let report = redactor.apply(doc)?;
216
217 let mut extra_ops_removed = 0;
220 for &page_num in &page_range {
221 let bboxes: &[[f64; 4]] = page_bboxes
222 .get(&page_num)
223 .map(|v| v.as_slice())
224 .unwrap_or(&[]);
225 let removed = remove_text_ops_for_page(doc, page_num, pattern, options, bboxes)?;
226 extra_ops_removed += removed;
227 }
228
229 Ok(SearchRedactReport {
230 matches_found: total_matches,
231 areas_redacted: report.areas_redacted,
232 operations_removed: report.operations_removed + extra_ops_removed,
233 pages_affected: report.pages_affected,
234 metadata_cleaned: report.metadata_cleaned,
235 redacted_rects,
236 })
237}
238
239struct TextMatcher {
244 regex: Regex,
245}
246
/// Byte range of a single match inside the searched string.
struct MatchRange {
    /// Byte offset where the match begins.
    start: usize,
    /// Byte offset just past the end of the match.
    end: usize,
}
251
252impl TextMatcher {
253 fn find_all(&self, text: &str) -> Vec<MatchRange> {
254 self.regex
255 .find_iter(text)
256 .map(|m| MatchRange {
257 start: m.start(),
258 end: m.end(),
259 })
260 .collect()
261 }
262}
263
264fn build_matcher(pattern: &str, options: &RedactSearchOptions) -> Result<TextMatcher> {
265 let regex_pattern = if options.regex {
266 if options.case_sensitive {
267 pattern.to_string()
268 } else {
269 format!("(?i){}", pattern)
270 }
271 } else {
272 let escaped = regex::escape(pattern);
273 if options.case_sensitive {
274 escaped
275 } else {
276 format!("(?i){}", escaped)
277 }
278 };
279
280 let regex = Regex::new(®ex_pattern)
281 .map_err(|e| RedactError::Other(format!("invalid pattern: {e}")))?;
282
283 Ok(TextMatcher { regex })
284}
285
286fn run_overlaps_single_bbox(run: &pdf_manip::text_run::TextRun, bbox: [f64; 4]) -> bool {
292 let run_x1 = run.x + run.width.max(1.0);
293 let tol = 4.0_f64;
294 let x_overlap = run.x < bbox[2] + tol && run_x1 > bbox[0] - tol;
295 let y_overlap = run.y <= bbox[3] + tol && run.y >= bbox[1] - tol;
296 x_overlap && y_overlap
297}
298
299fn run_on_same_baseline(run: &pdf_manip::text_run::TextRun, bbox: [f64; 4]) -> bool {
312 let same_y = (run.y - bbox[1]).abs() <= 0.5;
313 let run_x1 = run.x + run.width.max(1.0);
314 let x_overlap = run.x < bbox[2] + 4.0 && run_x1 > bbox[0] - 4.0;
315 same_y && x_overlap
316}
317
318fn raw_text_from_op(op: &lopdf::content::Operation) -> Option<String> {
325 use lopdf::Object;
326 match op.operator.as_str() {
327 "Tj" | "'" => {
328 if let Some(Object::String(ref bytes, _)) = op.operands.first() {
329 Some(bytes.iter().map(|&b| b as char).collect())
330 } else {
331 None
332 }
333 }
334 "TJ" => {
335 if let Some(Object::Array(ref arr)) = op.operands.first() {
336 let s: String = arr
337 .iter()
338 .filter_map(|item| match item {
339 Object::String(ref bytes, _) => {
340 Some(bytes.iter().map(|&b| b as char).collect::<String>())
341 }
342 _ => None,
343 })
344 .collect();
345 if s.is_empty() {
346 None
347 } else {
348 Some(s)
349 }
350 } else {
351 None
352 }
353 }
354 "\"" => op.operands.get(2).and_then(|obj| match obj {
355 Object::String(ref bytes, _) => Some(bytes.iter().map(|&b| b as char).collect()),
356 _ => None,
357 }),
358 _ => None,
359 }
360}
361
362fn apply_per_bbox_combined_fallback(
380 runs: &[pdf_manip::text_run::TextRun],
381 indices_to_remove: &mut Vec<usize>,
382 bboxes: &[[f64; 4]],
383 ops: &[lopdf::content::Operation],
384 matcher: &TextMatcher,
385) {
386 let text_matched: HashSet<usize> = indices_to_remove.iter().copied().collect();
389 let mut to_add: HashSet<usize> = HashSet::new();
390
391 let op_to_y: HashMap<usize, f64> = runs
393 .iter()
394 .flat_map(|run| run.ops_range.clone().map(move |i| (i, run.y)))
395 .collect();
396
397 for &bbox in bboxes {
398 let covered = runs.iter().any(|run| {
402 run_on_same_baseline(run, bbox)
403 && run.ops_range.clone().any(|i| text_matched.contains(&i))
404 });
405 if covered {
406 continue;
407 }
408
409 let bbox_y = bbox[1];
417 let mut y_line: Vec<(usize, &lopdf::content::Operation)> = ops
418 .iter()
419 .enumerate()
420 .filter(|(idx, _)| {
421 op_to_y
422 .get(idx)
423 .map(|&y| (y - bbox_y).abs() <= 6.0)
424 .unwrap_or(false)
425 })
426 .collect();
427 y_line.sort_by_key(|(idx, _)| *idx);
428
429 let mut combined = String::new();
432 let mut byte_to_op: Vec<usize> = Vec::new();
433 for &(idx, op) in &y_line {
434 if let Some(raw) = raw_text_from_op(op) {
435 let before = combined.len(); combined.push_str(&raw);
437 byte_to_op.extend(std::iter::repeat_n(idx, combined.len() - before));
439 }
440 }
441
442 let raw_matches = matcher.find_all(&combined);
443 if !raw_matches.is_empty() {
444 for m in &raw_matches {
446 for i in m.start..m.end {
447 if let Some(&op_idx) = byte_to_op.get(i) {
448 if !text_matched.contains(&op_idx) {
449 to_add.insert(op_idx);
450 }
451 }
452 }
453 }
454 continue; }
456
457 for run in runs {
464 if run_overlaps_single_bbox(run, bbox) {
465 for idx in run.ops_range.clone() {
466 to_add.insert(idx);
467 }
468 }
469 }
470 }
471
472 for idx in to_add {
473 if !text_matched.contains(&idx) {
474 indices_to_remove.push(idx);
475 }
476 }
477}
478
479fn compute_bounding_rect(chars: &[pdf_extract::PositionedChar]) -> [f64; 4] {
480 let mut x0 = f64::MAX;
481 let mut y0 = f64::MAX;
482 let mut x1 = f64::MIN;
483 let mut y1 = f64::MIN;
484
485 for ch in chars {
486 x0 = x0.min(ch.bbox[0]);
487 y0 = y0.min(ch.bbox[1]);
488 x1 = x1.max(ch.bbox[2]);
489 y1 = y1.max(ch.bbox[3]);
490 }
491
492 [x0 - 1.0, y0 - 1.0, x1 + 1.0, y1 + 1.0]
494}
495
496fn remove_text_ops_for_page(
511 doc: &mut Document,
512 page_num: u32,
513 pattern: &str,
514 options: &RedactSearchOptions,
515 match_bboxes: &[[f64; 4]],
516) -> Result<usize> {
517 let fonts = match pdf_manip::text_run::FontMap::from_page(doc, page_num) {
518 Ok(f) => f,
519 Err(_) => return Ok(0),
520 };
521
522 let matcher = build_matcher(pattern, options)?;
523
524 let mut visited: HashSet<ObjectId> = HashSet::new();
527
528 let removed = match pdf_manip::content_editor::editor_for_page(doc, page_num) {
532 Ok(editor) => {
533 remove_text_ops_via_editor(doc, page_num, editor, &matcher, &fonts, match_bboxes)?
534 }
535 Err(_) => {
536 remove_text_ops_with_inline_images(doc, page_num, &matcher, &fonts, match_bboxes)?
537 }
538 };
539
540 let removed = removed
544 + remove_text_ops_from_xobjects(
545 doc,
546 page_num,
547 &matcher,
548 &fonts,
549 match_bboxes,
550 &mut visited,
551 )?;
552
553 let removed = removed
556 + remove_text_ops_from_annotations(
557 doc,
558 page_num,
559 &matcher,
560 &fonts,
561 match_bboxes,
562 &mut visited,
563 )?;
564
565 Ok(removed)
566}
567
568fn remove_text_ops_via_editor(
570 doc: &mut Document,
571 page_num: u32,
572 editor: pdf_manip::content_editor::ContentEditor,
573 matcher: &TextMatcher,
574 fonts: &pdf_manip::text_run::FontMap,
575 match_bboxes: &[[f64; 4]],
576) -> Result<usize> {
577 let runs = pdf_manip::text_run::extract_text_runs(&editor, fonts);
578
579 let mut indices_to_remove: Vec<usize> = Vec::new();
580 for run in &runs {
581 if !matcher.find_all(&run.text).is_empty() {
582 for idx in run.ops_range.clone() {
583 indices_to_remove.push(idx);
584 }
585 }
586 }
587
588 if !match_bboxes.is_empty() {
593 apply_per_bbox_combined_fallback(
594 &runs,
595 &mut indices_to_remove,
596 match_bboxes,
597 editor.operations(),
598 matcher,
599 );
600 }
601
602 if !match_bboxes.is_empty() {
609 let mut text_matched_set: HashSet<usize> = indices_to_remove.iter().cloned().collect();
610
611 let op_to_y: HashMap<usize, f64> = runs
613 .iter()
614 .flat_map(|run| run.ops_range.clone().map(move |i| (i, run.y)))
615 .collect();
616
617 let ops = editor.operations();
618
619 let mut ops_by_y: HashMap<i64, Vec<usize>> = HashMap::new();
621 for (idx, _) in ops.iter().enumerate() {
622 if let Some(&y) = op_to_y.get(&idx) {
623 let y_bucket = (y * 10.0).round() as i64;
624 ops_by_y.entry(y_bucket).or_default().push(idx);
625 }
626 }
627
628 for (_, mut op_indices) in ops_by_y {
629 op_indices.sort();
630 let mut combined = String::new();
631 let mut byte_to_op: Vec<usize> = Vec::new();
632
633 for &idx in &op_indices {
634 if text_matched_set.contains(&idx) {
635 continue;
636 }
637 if let Some(raw) = raw_text_from_op(&ops[idx]) {
638 let before = combined.len();
639 combined.push_str(&raw);
640 byte_to_op.extend(std::iter::repeat_n(idx, combined.len() - before));
641 }
642 }
643
644 if combined.is_empty() {
645 continue;
646 }
647
648 let matches = matcher.find_all(&combined);
649 for m in matches {
650 for i in m.start..m.end {
651 if let Some(&op_idx) = byte_to_op.get(i) {
652 if !text_matched_set.contains(&op_idx) {
653 indices_to_remove.push(op_idx);
654 text_matched_set.insert(op_idx);
655 }
656 }
657 }
658 }
659 }
660 }
661
662 if indices_to_remove.is_empty() {
663 return Ok(0);
664 }
665
666 indices_to_remove.sort_unstable();
667 indices_to_remove.dedup();
668
669 let mut new_editor = editor;
670 for &idx in indices_to_remove.iter().rev() {
671 new_editor.remove_range(idx..idx + 1);
672 }
673
674 let removed = indices_to_remove.len();
675 pdf_manip::content_editor::write_editor_to_page(doc, page_num, &new_editor)
676 .map_err(|e| RedactError::Other(format!("write content: {e}")))?;
677
678 Ok(removed)
679}
680
681fn remove_text_ops_with_inline_images(
685 doc: &mut Document,
686 page_num: u32,
687 matcher: &TextMatcher,
688 fonts: &pdf_manip::text_run::FontMap,
689 match_bboxes: &[[f64; 4]],
690) -> Result<usize> {
691 let pages = doc.get_pages();
692 let &page_id = match pages.get(&page_num) {
693 Some(id) => id,
694 None => return Ok(0),
695 };
696
697 let content_bytes = match doc.get_page_content(page_id) {
699 Ok(b) => b,
700 Err(_) => return Ok(0),
701 };
702
703 let (stripped, inline_images) = pdf_manip::content_editor::strip_inline_images(&content_bytes);
705
706 let editor = match pdf_manip::content_editor::ContentEditor::from_stream(&stripped) {
707 Ok(e) => e,
708 Err(_) => return Ok(0),
709 };
710
711 let runs = pdf_manip::text_run::extract_text_runs(&editor, fonts);
712
713 let mut indices_to_remove: Vec<usize> = Vec::new();
714 for run in &runs {
715 if !matcher.find_all(&run.text).is_empty() {
716 for idx in run.ops_range.clone() {
717 indices_to_remove.push(idx);
718 }
719 }
720 }
721
722 if !match_bboxes.is_empty() {
724 apply_per_bbox_combined_fallback(
725 &runs,
726 &mut indices_to_remove,
727 match_bboxes,
728 editor.operations(),
729 matcher,
730 );
731 }
732
733 if !match_bboxes.is_empty() {
735 let mut text_matched_set: HashSet<usize> = indices_to_remove.iter().cloned().collect();
736 let ops = editor.operations();
737 for (idx, op) in ops.iter().enumerate() {
738 if text_matched_set.contains(&idx) {
739 continue;
740 }
741 if let Some(raw_text) = raw_text_from_op(op) {
742 if !matcher.find_all(&raw_text).is_empty() {
743 indices_to_remove.push(idx);
744 text_matched_set.insert(idx);
745 }
746 }
747 }
748 }
749
750 if indices_to_remove.is_empty() {
751 return Ok(0);
752 }
753
754 indices_to_remove.sort_unstable();
755 indices_to_remove.dedup();
756
757 let mut new_editor = editor;
758 for &idx in indices_to_remove.iter().rev() {
759 new_editor.remove_range(idx..idx + 1);
760 }
761
762 let removed = indices_to_remove.len();
763
764 let re_encoded = new_editor
767 .encode()
768 .map_err(|e| RedactError::Other(format!("encode: {e}")))?;
769
770 let mut final_content = Vec::new();
771 for img in &inline_images {
772 final_content.extend_from_slice(img);
773 final_content.push(b'\n');
774 }
775 final_content.extend_from_slice(&re_encoded);
776
777 let compressed = {
779 let mut enc = flate2::write::ZlibEncoder::new(Vec::new(), flate2::Compression::default());
780 use std::io::Write as _;
781 if enc.write_all(&final_content).is_ok() {
782 enc.finish().unwrap_or_else(|_| final_content.clone())
783 } else {
784 final_content.clone()
785 }
786 };
787 let (stream_bytes, use_flate) = if compressed.len() < final_content.len() {
788 (compressed, true)
789 } else {
790 (final_content, false)
791 };
792
793 let content_ids = pdf_manip::content_editor::get_content_stream_ids(doc, page_id);
795 if let Some(&first_id) = content_ids.first() {
796 if let Ok(Object::Stream(ref mut s)) = doc.get_object_mut(first_id) {
797 s.content = stream_bytes;
798 if use_flate {
799 s.dict.set("Filter", Object::Name(b"FlateDecode".to_vec()));
800 } else {
801 s.dict.remove(b"Filter");
802 }
803 s.dict
804 .set("Length", Object::Integer(s.content.len() as i64));
805 }
806 if content_ids.len() > 1 {
807 if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
808 page_dict.set("Contents", Object::Reference(first_id));
809 }
810 }
811 }
812
813 Ok(removed)
814}
815
816fn remove_text_ops_from_xobjects(
830 doc: &mut Document,
831 page_num: u32,
832 matcher: &TextMatcher,
833 fonts: &pdf_manip::text_run::FontMap,
834 match_bboxes: &[[f64; 4]],
835 visited: &mut HashSet<ObjectId>,
836) -> Result<usize> {
837 let pages = doc.get_pages();
838 let &page_id = match pages.get(&page_num) {
839 Some(id) => id,
840 None => return Ok(0),
841 };
842
843 let xobject_ids = collect_form_xobject_ids(doc, page_id);
844 if xobject_ids.is_empty() {
845 return Ok(0);
846 }
847
848 let mut total_removed = 0;
849 for xobj_id in xobject_ids {
850 total_removed +=
851 remove_text_ops_from_stream(doc, xobj_id, matcher, fonts, match_bboxes, visited)?;
852 }
853 Ok(total_removed)
854}
855
856fn remove_text_ops_from_annotations(
864 doc: &mut Document,
865 page_num: u32,
866 matcher: &TextMatcher,
867 fonts: &pdf_manip::text_run::FontMap,
868 match_bboxes: &[[f64; 4]],
869 visited: &mut HashSet<ObjectId>,
870) -> Result<usize> {
871 let pages = doc.get_pages();
872 let &page_id = match pages.get(&page_num) {
873 Some(id) => id,
874 None => return Ok(0),
875 };
876
877 let ap_stream_ids = collect_annotation_appearance_ids(doc, page_id);
879 if ap_stream_ids.is_empty() {
880 return Ok(0);
881 }
882
883 let mut total_removed = 0;
884 for stream_id in ap_stream_ids {
885 total_removed +=
886 remove_text_ops_from_stream(doc, stream_id, matcher, fonts, match_bboxes, visited)?;
887 }
888
889 Ok(total_removed)
890}
891
892fn remove_text_ops_from_stream(
904 doc: &mut Document,
905 stream_id: ObjectId,
906 matcher: &TextMatcher,
907 page_fonts: &pdf_manip::text_run::FontMap,
908 match_bboxes: &[[f64; 4]],
909 visited: &mut HashSet<ObjectId>,
910) -> Result<usize> {
911 if !visited.insert(stream_id) {
914 return Ok(0);
915 }
916
917 let content_bytes = match doc.get_object(stream_id) {
918 Ok(Object::Stream(ref s)) => {
919 let mut stream = s.clone();
920 let _ = stream.decompress();
921 stream.content.clone()
922 }
923 _ => return Ok(0),
924 };
925
926 let editor = match pdf_manip::content_editor::ContentEditor::from_stream(&content_bytes) {
927 Ok(e) => e,
928 Err(_) => return Ok(0),
929 };
930
931 let stream_fonts =
934 pdf_manip::text_run::FontMap::from_xobject_stream(doc, stream_id, page_fonts);
935 let fonts = &stream_fonts;
936
937 let runs = pdf_manip::text_run::extract_text_runs(&editor, fonts);
938
939 let mut indices_to_remove: Vec<usize> = Vec::new();
940 for run in &runs {
941 if !matcher.find_all(&run.text).is_empty() {
942 for idx in run.ops_range.clone() {
943 indices_to_remove.push(idx);
944 }
945 }
946 }
947
948 if !match_bboxes.is_empty() {
953 apply_per_bbox_combined_fallback(
954 &runs,
955 &mut indices_to_remove,
956 match_bboxes,
957 editor.operations(),
958 matcher,
959 );
960 }
961
962 if !match_bboxes.is_empty() {
969 let mut text_matched_set: HashSet<usize> = indices_to_remove.iter().cloned().collect();
970
971 let op_to_y: HashMap<usize, f64> = runs
973 .iter()
974 .flat_map(|run| run.ops_range.clone().map(move |i| (i, run.y)))
975 .collect();
976
977 let ops = editor.operations();
978
979 let mut ops_by_y: HashMap<i64, Vec<usize>> = HashMap::new();
981 for (idx, _) in ops.iter().enumerate() {
982 if let Some(&y) = op_to_y.get(&idx) {
983 let y_bucket = (y * 10.0).round() as i64;
984 ops_by_y.entry(y_bucket).or_default().push(idx);
985 }
986 }
987
988 for (_, mut op_indices) in ops_by_y {
989 op_indices.sort();
990 let mut combined = String::new();
991 let mut byte_to_op: Vec<usize> = Vec::new();
992
993 for &idx in &op_indices {
994 if text_matched_set.contains(&idx) {
995 continue;
996 }
997 if let Some(raw) = raw_text_from_op(&ops[idx]) {
998 let before = combined.len();
999 combined.push_str(&raw);
1000 byte_to_op.extend(std::iter::repeat_n(idx, combined.len() - before));
1001 }
1002 }
1003
1004 if combined.is_empty() {
1005 continue;
1006 }
1007
1008 let matches = matcher.find_all(&combined);
1009 for m in matches {
1010 for i in m.start..m.end {
1011 if let Some(&op_idx) = byte_to_op.get(i) {
1012 if !text_matched_set.contains(&op_idx) {
1013 indices_to_remove.push(op_idx);
1014 text_matched_set.insert(op_idx);
1015 }
1016 }
1017 }
1018 }
1019 }
1020 }
1021
1022 if indices_to_remove.is_empty() {
1023 let nested_ids = collect_nested_form_xobjects(doc, stream_id);
1025 let mut nested_removed = 0;
1026 for nested_id in nested_ids {
1027 nested_removed +=
1028 remove_text_ops_from_stream(doc, nested_id, matcher, fonts, match_bboxes, visited)?;
1029 }
1030 return Ok(nested_removed);
1031 }
1032
1033 indices_to_remove.sort_unstable();
1034 indices_to_remove.dedup();
1035
1036 let mut new_editor = editor;
1037 for &idx in indices_to_remove.iter().rev() {
1038 new_editor.remove_range(idx..idx + 1);
1039 }
1040
1041 let removed = indices_to_remove.len();
1042
1043 let encoded = new_editor
1044 .encode()
1045 .map_err(|e| RedactError::Other(format!("encode annotation stream: {e}")))?;
1046
1047 if let Ok(Object::Stream(ref mut s)) = doc.get_object_mut(stream_id) {
1048 s.dict.remove(b"Filter");
1049 s.content = encoded;
1050 s.dict
1051 .set("Length", Object::Integer(s.content.len() as i64));
1052 }
1053
1054 let nested_ids = collect_nested_form_xobjects(doc, stream_id);
1056 let mut nested_removed = removed;
1057 for nested_id in nested_ids {
1058 nested_removed +=
1059 remove_text_ops_from_stream(doc, nested_id, matcher, fonts, match_bboxes, visited)?;
1060 }
1061
1062 Ok(nested_removed)
1063}
1064
1065fn collect_annotation_appearance_ids(doc: &Document, page_id: ObjectId) -> Vec<ObjectId> {
1067 let mut result = Vec::new();
1068
1069 let page_dict = match doc.get_object(page_id) {
1070 Ok(Object::Dictionary(ref d)) => d.clone(),
1071 _ => return result,
1072 };
1073
1074 let annots = match page_dict.get(b"Annots") {
1075 Ok(Object::Array(ref arr)) => arr.clone(),
1076 Ok(Object::Reference(id)) => match doc.get_object(*id) {
1077 Ok(Object::Array(ref arr)) => arr.clone(),
1078 _ => return result,
1079 },
1080 _ => return result,
1081 };
1082
1083 for annot_ref in &annots {
1084 let annot_id = match annot_ref {
1085 Object::Reference(id) => *id,
1086 _ => continue,
1087 };
1088
1089 let annot_dict = match doc.get_object(annot_id) {
1090 Ok(Object::Dictionary(ref d)) => d.clone(),
1091 _ => continue,
1092 };
1093
1094 let ap_dict = match annot_dict.get(b"AP") {
1096 Ok(Object::Dictionary(ref d)) => d.clone(),
1097 Ok(Object::Reference(id)) => match doc.get_object(*id) {
1098 Ok(Object::Dictionary(ref d)) => d.clone(),
1099 _ => continue,
1100 },
1101 _ => continue,
1102 };
1103
1104 match ap_dict.get(b"N") {
1106 Ok(Object::Reference(id)) => {
1107 result.push(*id);
1108 }
1109 Ok(Object::Dictionary(ref d)) => {
1110 for (_key, val) in d.iter() {
1112 if let Object::Reference(id) = val {
1113 result.push(*id);
1114 }
1115 }
1116 }
1117 _ => {}
1118 }
1119 }
1120
1121 result
1122}
1123
1124fn collect_nested_form_xobjects(doc: &Document, stream_id: ObjectId) -> Vec<ObjectId> {
1126 let mut result = Vec::new();
1127
1128 let stream_dict = match doc.get_object(stream_id) {
1129 Ok(Object::Stream(ref s)) => s.dict.clone(),
1130 _ => return result,
1131 };
1132
1133 let resources = match stream_dict.get(b"Resources") {
1135 Ok(Object::Dictionary(ref d)) => d.clone(),
1136 Ok(Object::Reference(id)) => match doc.get_object(*id) {
1137 Ok(Object::Dictionary(ref d)) => d.clone(),
1138 _ => return result,
1139 },
1140 _ => return result,
1141 };
1142
1143 let xobject_dict = match resources.get(b"XObject") {
1144 Ok(Object::Dictionary(ref d)) => d.clone(),
1145 Ok(Object::Reference(id)) => match doc.get_object(*id) {
1146 Ok(Object::Dictionary(ref d)) => d.clone(),
1147 _ => return result,
1148 },
1149 _ => return result,
1150 };
1151
1152 for (_key, value) in xobject_dict.iter() {
1153 let obj_id = match value {
1154 Object::Reference(id) => *id,
1155 _ => continue,
1156 };
1157 if let Ok(Object::Stream(ref s)) = doc.get_object(obj_id) {
1159 let is_form = s
1160 .dict
1161 .get(b"Subtype")
1162 .ok()
1163 .and_then(|v| match v {
1164 Object::Name(ref n) => Some(n.as_slice()),
1165 _ => None,
1166 })
1167 .map(|n| n == b"Form")
1168 .unwrap_or(false);
1169 if is_form {
1170 result.push(obj_id);
1171 }
1172 }
1173 }
1174
1175 result
1176}
1177
1178fn collect_form_xobject_ids(doc: &Document, page_id: ObjectId) -> Vec<ObjectId> {
1180 let mut result = Vec::new();
1181
1182 let page_dict = match doc.get_object(page_id) {
1183 Ok(Object::Dictionary(ref d)) => d.clone(),
1184 _ => return result,
1185 };
1186
1187 let resources = match page_dict.get(b"Resources") {
1188 Ok(Object::Dictionary(ref d)) => d.clone(),
1189 Ok(Object::Reference(id)) => match doc.get_object(*id) {
1190 Ok(Object::Dictionary(ref d)) => d.clone(),
1191 _ => return result,
1192 },
1193 _ => return result,
1194 };
1195
1196 let xobject_dict = match resources.get(b"XObject") {
1197 Ok(Object::Dictionary(ref d)) => d.clone(),
1198 Ok(Object::Reference(id)) => match doc.get_object(*id) {
1199 Ok(Object::Dictionary(ref d)) => d.clone(),
1200 _ => return result,
1201 },
1202 _ => return result,
1203 };
1204
1205 for (_key, value) in xobject_dict.iter() {
1206 let obj_id = match value {
1207 Object::Reference(id) => *id,
1208 _ => continue,
1209 };
1210
1211 if let Ok(Object::Stream(ref s)) = doc.get_object(obj_id) {
1213 let is_form = s
1214 .dict
1215 .get(b"Subtype")
1216 .ok()
1217 .and_then(|v| match v {
1218 Object::Name(ref n) => Some(n.as_slice()),
1219 _ => None,
1220 })
1221 .map(|n| n == b"Form")
1222 .unwrap_or(false);
1223 if is_form {
1224 result.push(obj_id);
1225 }
1226 }
1227 }
1228
1229 result
1230}
1231
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{dictionary, Document, Object, Stream};

    /// Adds a single-kid `Pages` node for `page_id`, back-links the page to it
    /// through `Parent`, and returns the node's id.
    fn attach_pages_node(doc: &mut Document, page_id: ObjectId) -> ObjectId {
        let pages_node = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![Object::Reference(page_id)],
            "Count" => 1_i64,
        };
        let pages_id = doc.add_object(Object::Dictionary(pages_node));

        if let Ok(Object::Dictionary(ref mut page)) = doc.get_object_mut(page_id) {
            page.set("Parent", Object::Reference(pages_id));
        }
        pages_id
    }

    /// Adds a catalog pointing at `pages_id` and registers it as the trailer
    /// `Root`.
    fn install_catalog(doc: &mut Document, pages_id: ObjectId) {
        let catalog = dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        };
        let catalog_id = doc.add_object(Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));
    }

    /// Builds a one-page document whose page content is `content`, with a
    /// Helvetica font registered as `F1` and an `Info` dictionary in the
    /// trailer.
    fn make_doc_with_text(content: &[u8]) -> Document {
        let mut doc = Document::with_version("1.7");

        let helvetica = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        };
        let helvetica_id = doc.add_object(Object::Dictionary(helvetica));
        let font_table = dictionary! {
            "F1" => Object::Reference(helvetica_id),
        };
        let resources = dictionary! {
            "Font" => Object::Dictionary(font_table),
        };

        let body = Stream::new(dictionary! {}, content.to_vec());
        let body_id = doc.add_object(Object::Stream(body));

        let page = dictionary! {
            "Type" => "Page",
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            "Contents" => Object::Reference(body_id),
            "Resources" => Object::Dictionary(resources),
        };
        let page_id = doc.add_object(Object::Dictionary(page));

        let pages_id = attach_pages_node(&mut doc, page_id);

        let info = dictionary! {
            "Title" => Object::String(b"Test".to_vec(), lopdf::StringFormat::Literal),
        };
        let info_id = doc.add_object(Object::Dictionary(info));
        doc.trailer.set("Info", Object::Reference(info_id));

        install_catalog(&mut doc, pages_id);

        doc
    }

    /// Builds a one-page document for `content`, runs `search_and_redact`, and
    /// returns the report (panicking if the call fails).
    fn redact_text(
        content: &[u8],
        pattern: &str,
        opts: &RedactSearchOptions,
    ) -> SearchRedactReport {
        let mut doc = make_doc_with_text(content);
        search_and_redact(&mut doc, pattern, opts).unwrap()
    }

    #[test]
    fn search_and_redact_exact_match() {
        let report = redact_text(
            b"BT /F1 12 Tf 100 700 Td (Secret Data) Tj ET",
            "Secret",
            &RedactSearchOptions::default(),
        );
        assert!(report.matches_found >= 1);
        assert!(report.areas_redacted >= 1);
    }

    #[test]
    fn search_and_redact_no_match() {
        let report = redact_text(
            b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET",
            "Missing",
            &RedactSearchOptions::default(),
        );
        assert_eq!(report.matches_found, 0);
        assert_eq!(report.areas_redacted, 0);
    }

    #[test]
    fn search_and_redact_case_insensitive() {
        let report = redact_text(
            b"BT /F1 12 Tf 100 700 Td (Secret Data) Tj ET",
            "secret",
            &RedactSearchOptions::case_insensitive(),
        );
        assert!(report.matches_found >= 1);
    }

    #[test]
    fn search_and_redact_regex() {
        let report = redact_text(
            b"BT /F1 12 Tf 100 700 Td (SSN 123-45-6789) Tj ET",
            r"\d{3}-\d{2}-\d{4}",
            &RedactSearchOptions::with_regex(),
        );
        assert!(report.matches_found >= 1);
    }

    #[test]
    fn search_and_redact_with_overlay() {
        let report = redact_text(
            b"BT /F1 12 Tf 100 700 Td (Confidential) Tj ET",
            "Confidential",
            &RedactSearchOptions::default().overlay_text("[REDACTED]"),
        );
        assert!(report.matches_found >= 1);
    }

    #[test]
    fn search_and_redact_specific_pages() {
        let report = redact_text(
            b"BT /F1 12 Tf 100 700 Td (Secret) Tj ET",
            "Secret",
            &RedactSearchOptions::default().pages(vec![1]),
        );
        assert!(report.matches_found >= 1);
    }

    #[test]
    fn search_and_redact_page_out_of_range() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (Hello) Tj ET");
        let opts = RedactSearchOptions::default().pages(vec![5]);
        let outcome = search_and_redact(&mut doc, "Hello", &opts);
        assert!(outcome.is_err());
    }

    #[test]
    fn search_and_redact_cleans_metadata() {
        let mut doc = make_doc_with_text(b"BT /F1 12 Tf 100 700 Td (Secret) Tj ET");
        let opts = RedactSearchOptions::default();
        let report = search_and_redact(&mut doc, "Secret", &opts).unwrap();
        assert!(report.metadata_cleaned);
        assert!(doc.trailer.get(b"Info").is_err());
    }

    #[test]
    fn search_and_redact_custom_color() {
        let report = redact_text(
            b"BT /F1 12 Tf 100 700 Td (Secret) Tj ET",
            "Secret",
            &RedactSearchOptions::default().fill_color(1.0, 0.0, 0.0),
        );
        assert!(report.matches_found >= 1);
    }

    /// Builds a one-page document whose only page content draws a form
    /// XObject (`Xobj1`). The XObject shows the text `Classified` using its
    /// own font resource `FX`, which the page-level font table (`F1`) does not
    /// contain. Returns the document and the XObject's id.
    fn make_doc_with_xobject_text() -> (Document, ObjectId) {
        let mut doc = Document::with_version("1.7");

        // Font that exists only in the XObject's resources.
        let times = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Times-Roman",
        };
        let times_id = doc.add_object(Object::Dictionary(times));
        let form_font_table = dictionary! { "FX" => Object::Reference(times_id) };
        let form_resources = dictionary! {
            "Font" => Object::Dictionary(form_font_table),
        };

        let form_content = b"BT /FX 12 Tf 0 0 Td (Classified) Tj ET".to_vec();
        let form_stream = Stream::new(
            dictionary! {
                "Type" => "XObject",
                "Subtype" => "Form",
                "BBox" => vec![0.into(), 0.into(), 300_i64.into(), 20_i64.into()],
                "Resources" => Object::Dictionary(form_resources),
            },
            form_content,
        );
        let xobj_id = doc.add_object(Object::Stream(form_stream));

        // Page-level resources: a different font plus the XObject itself.
        let helvetica = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        };
        let helvetica_id = doc.add_object(Object::Dictionary(helvetica));
        let page_font_table = dictionary! { "F1" => Object::Reference(helvetica_id) };
        let xobject_table = dictionary! { "Xobj1" => Object::Reference(xobj_id) };
        let page_resources = dictionary! {
            "Font" => Object::Dictionary(page_font_table),
            "XObject" => Object::Dictionary(xobject_table),
        };

        let page_content = b"q 1 0 0 1 100 700 cm /Xobj1 Do Q".to_vec();
        let body = Stream::new(dictionary! {}, page_content);
        let body_id = doc.add_object(Object::Stream(body));

        let page = dictionary! {
            "Type" => "Page",
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            "Contents" => Object::Reference(body_id),
            "Resources" => Object::Dictionary(page_resources),
        };
        let page_id = doc.add_object(Object::Dictionary(page));

        let pages_id = attach_pages_node(&mut doc, page_id);
        install_catalog(&mut doc, pages_id);

        (doc, xobj_id)
    }

    /// Text inside a form XObject must be removed even when the page-level
    /// font map passed in is empty and no match boxes are supplied.
    #[test]
    fn redact_removes_text_from_xobject_stream() {
        let (mut doc, xobj_id) = make_doc_with_xobject_text();

        let page_fonts = pdf_manip::text_run::FontMap::empty();
        let matcher = build_matcher("Classified", &RedactSearchOptions::default()).unwrap();

        let mut visited = HashSet::new();
        let removed =
            remove_text_ops_from_stream(&mut doc, xobj_id, &matcher, &page_fonts, &[], &mut visited)
                .unwrap();

        assert!(
            removed > 0,
            "Expected at least one op removed from XObject stream, got 0"
        );

        match doc.get_object(xobj_id) {
            Ok(Object::Stream(ref s)) => {
                let content = std::str::from_utf8(&s.content).unwrap_or("");
                assert!(
                    !content.contains("Classified"),
                    "XObject stream still contains 'Classified' after redaction"
                );
            }
            _ => panic!("XObject is not a stream after redaction"),
        }
    }

    /// A token split across several `Tj` operators on one line must still be
    /// found and redacted.
    #[test]
    fn redact_split_token_per_bbox_spatial_fallback() {
        let content = b"BT /F1 12 Tf 0 700 Td (ALICE) Tj 200 0 Td (LI) Tj 30 0 Td (C) Tj ET";
        let report = redact_text(content, "LIC", &RedactSearchOptions::default());
        assert!(report.matches_found >= 1);
        assert!(report.areas_redacted >= 1);
    }

    /// Same XObject fixture, but with a match box supplied so the bbox-driven
    /// stages run as well.
    #[test]
    fn redact_xobject_raw_byte_fallback() {
        let (mut doc, xobj_id) = make_doc_with_xobject_text();
        let page_fonts = pdf_manip::text_run::FontMap::empty();
        let matcher = build_matcher("Classified", &RedactSearchOptions::default()).unwrap();
        let dummy_bboxes = [[0.0_f64, 0.0, 300.0, 20.0]];

        let mut visited = HashSet::new();
        let removed = remove_text_ops_from_stream(
            &mut doc,
            xobj_id,
            &matcher,
            &page_fonts,
            &dummy_bboxes,
            &mut visited,
        )
        .unwrap();
        assert!(
            removed > 0,
            "Expected raw-byte fallback to remove ops from XObject stream"
        );
    }
}