Skip to main content

pdfplumber_core/
dedupe.rs

//! Duplicate character deduplication.
//!
//! Removes duplicate overlapping characters that some PDF generators emit
//! (for a bold effect or due to bugs). Reference: the Python pdfplumber
//! method `Page.dedupe_chars()`.
5
6use crate::text::Char;
7
/// Options for duplicate character detection and removal.
///
/// `PartialEq` is derived so that two configurations can be compared
/// directly (for example in tests or when caching results per option set).
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct DedupeOptions {
    /// Maximum distance (in points) between character positions to consider
    /// them as duplicates. The limit is applied separately to the horizontal
    /// (`x0`) and vertical (`top`) offsets. Default: `1.0`.
    pub tolerance: f64,
    /// Additional character attributes that must match for two characters to be
    /// considered duplicates. Default: `["fontname", "size"]`.
    ///
    /// Supported attribute names: `"fontname"`, `"size"`, `"upright"`,
    /// `"stroking_color"`, `"non_stroking_color"`. Names outside this list are
    /// ignored, i.e. they never prevent two characters from matching.
    pub extra_attrs: Vec<String>,
}
22
23impl Default for DedupeOptions {
24    fn default() -> Self {
25        Self {
26            tolerance: 1.0,
27            extra_attrs: vec!["fontname".to_string(), "size".to_string()],
28        }
29    }
30}
31
32/// Returns whether two characters match on the given attribute name.
33fn attrs_match(a: &Char, b: &Char, attr: &str) -> bool {
34    match attr {
35        "fontname" => a.fontname == b.fontname,
36        "size" => (a.size - b.size).abs() < f64::EPSILON,
37        "upright" => a.upright == b.upright,
38        "stroking_color" => a.stroking_color == b.stroking_color,
39        "non_stroking_color" => a.non_stroking_color == b.non_stroking_color,
40        _ => true, // Unknown attributes are ignored (treated as matching)
41    }
42}
43
44/// Returns whether two characters are duplicates according to the given options.
45///
46/// Two characters are considered duplicates if:
47/// 1. They have the same text content
48/// 2. Their positions (x0, top) are within the tolerance
49/// 3. All specified extra attributes match
50fn is_duplicate(a: &Char, b: &Char, options: &DedupeOptions) -> bool {
51    // Must have the same text
52    if a.text != b.text {
53        return false;
54    }
55
56    // Positions must be within tolerance
57    let dx = (a.bbox.x0 - b.bbox.x0).abs();
58    let dy = (a.bbox.top - b.bbox.top).abs();
59    if dx > options.tolerance || dy > options.tolerance {
60        return false;
61    }
62
63    // All extra attributes must match
64    options
65        .extra_attrs
66        .iter()
67        .all(|attr| attrs_match(a, b, attr))
68}
69
70/// Remove duplicate overlapping characters from a slice.
71///
72/// Iterates through characters in order, keeping the first occurrence and
73/// discarding subsequent duplicates. Two characters are duplicates if their
74/// positions overlap within `tolerance` and the specified `extra_attrs` match.
75///
76/// The original slice is not modified; a new `Vec<Char>` is returned.
77pub fn dedupe_chars(chars: &[Char], options: &DedupeOptions) -> Vec<Char> {
78    let mut kept: Vec<Char> = Vec::with_capacity(chars.len());
79
80    for ch in chars {
81        let dominated = kept.iter().any(|k| is_duplicate(k, ch, options));
82        if !dominated {
83            kept.push(ch.clone());
84        }
85    }
86
87    kept
88}
89
#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::BBox;
    use crate::painting::Color;
    use crate::text::TextDirection;

    /// Builds an upright 12pt Helvetica character with an explicit bounding box.
    fn make_char(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> Char {
        Char {
            text: text.to_string(),
            bbox: BBox::new(x0, top, x1, bottom),
            fontname: "Helvetica".to_string(),
            size: 12.0,
            doctop: top,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
            mcid: None,
            tag: None,
        }
    }

    /// Builds a character with a 10x12 box anchored at (`x0`, `top`) and the
    /// given font name and size; every other field matches `make_char`.
    fn make_char_with_font(text: &str, x0: f64, top: f64, fontname: &str, size: f64) -> Char {
        Char {
            fontname: fontname.to_string(),
            size,
            ..make_char(text, x0, top, x0 + 10.0, top + 12.0)
        }
    }

    #[test]
    fn test_overlapping_identical_chars_deduped() {
        // The second "A" sits within the default tolerance of the first.
        let input = [
            make_char("A", 10.0, 20.0, 20.0, 32.0),
            make_char("A", 10.5, 20.3, 20.5, 32.3),
        ];

        let deduped = dedupe_chars(&input, &DedupeOptions::default());

        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].text, "A");
        // The survivor is the first occurrence.
        assert!((deduped[0].bbox.x0 - 10.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_non_overlapping_chars_preserved() {
        // Same text but 40 points apart horizontally: both must survive.
        let input = [
            make_char("A", 10.0, 20.0, 20.0, 32.0),
            make_char("A", 50.0, 20.0, 60.0, 32.0),
        ];

        let deduped = dedupe_chars(&input, &DedupeOptions::default());

        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_different_text_not_deduped() {
        // Identical position, different glyphs: both must survive.
        let input = [
            make_char("A", 10.0, 20.0, 20.0, 32.0),
            make_char("B", 10.0, 20.0, 20.0, 32.0),
        ];

        let deduped = dedupe_chars(&input, &DedupeOptions::default());

        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_different_font_not_deduped() {
        // Identical text and position, but the font name differs.
        let input = [
            make_char_with_font("A", 10.0, 20.0, "Helvetica", 12.0),
            make_char_with_font("A", 10.0, 20.0, "Times-Roman", 12.0),
        ];

        let deduped = dedupe_chars(&input, &DedupeOptions::default());

        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_different_size_not_deduped() {
        // Identical text, position and font, but the size differs.
        let input = [
            make_char_with_font("A", 10.0, 20.0, "Helvetica", 12.0),
            make_char_with_font("A", 10.0, 20.0, "Helvetica", 14.0),
        ];

        let deduped = dedupe_chars(&input, &DedupeOptions::default());

        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_custom_tolerance() {
        // 2.5 points apart: distinct under tolerance=1.0, merged under 3.0.
        let input = [
            make_char("A", 10.0, 20.0, 20.0, 32.0),
            make_char("A", 12.5, 20.0, 22.5, 32.0),
        ];

        let strict = dedupe_chars(&input, &DedupeOptions::default());
        assert_eq!(strict.len(), 2, "Default tolerance should not merge these");

        let wide_options = DedupeOptions {
            tolerance: 3.0,
            ..DedupeOptions::default()
        };
        let relaxed = dedupe_chars(&input, &wide_options);
        assert_eq!(relaxed.len(), 1, "Wide tolerance should merge these");
    }

    #[test]
    fn test_empty_extra_attrs() {
        // Without extra attributes only text and position are compared, so a
        // font/size mismatch no longer protects the second character.
        let input = [
            make_char_with_font("A", 10.0, 20.0, "Helvetica", 12.0),
            make_char_with_font("A", 10.0, 20.0, "Times-Roman", 14.0),
        ];
        let position_only = DedupeOptions {
            tolerance: 1.0,
            extra_attrs: vec![],
        };

        let deduped = dedupe_chars(&input, &position_only);

        assert_eq!(deduped.len(), 1);
    }

    #[test]
    fn test_multiple_duplicates_keep_first() {
        // Three stacked copies collapse to the first one.
        let input = [
            make_char("A", 10.0, 20.0, 20.0, 32.0),
            make_char("A", 10.2, 20.1, 20.2, 32.1),
            make_char("A", 10.4, 20.2, 20.4, 32.2),
        ];

        let deduped = dedupe_chars(&input, &DedupeOptions::default());

        assert_eq!(deduped.len(), 1);
        assert!((deduped[0].bbox.x0 - 10.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_mixed_chars_only_duplicates_removed() {
        // "Hello" with the leading "H" drawn twice.
        let input = [
            make_char("H", 10.0, 20.0, 20.0, 32.0),
            make_char("H", 10.1, 20.0, 20.1, 32.0), // duplicate of the first H
            make_char("e", 20.0, 20.0, 30.0, 32.0),
            make_char("l", 30.0, 20.0, 40.0, 32.0),
            make_char("l", 40.0, 20.0, 50.0, 32.0), // distinct position, not a duplicate
            make_char("o", 50.0, 20.0, 60.0, 32.0),
        ];

        let deduped = dedupe_chars(&input, &DedupeOptions::default());

        assert_eq!(deduped.len(), 5);
        let texts: Vec<&str> = deduped.iter().map(|c| c.text.as_str()).collect();
        assert_eq!(texts, ["H", "e", "l", "l", "o"]);
    }

    #[test]
    fn test_empty_input() {
        let deduped = dedupe_chars(&[], &DedupeOptions::default());
        assert!(deduped.is_empty());
    }

    #[test]
    fn test_single_char() {
        let input = [make_char("A", 10.0, 20.0, 20.0, 32.0)];
        let deduped = dedupe_chars(&input, &DedupeOptions::default());
        assert_eq!(deduped.len(), 1);
    }

    #[test]
    fn test_color_as_extra_attr() {
        // Same text, position and font; only the fill colour differs.
        let red = Char {
            non_stroking_color: Some(Color::Rgb(1.0, 0.0, 0.0)),
            ..make_char("A", 10.0, 20.0, 20.0, 32.0)
        };
        let blue = Char {
            non_stroking_color: Some(Color::Rgb(0.0, 0.0, 1.0)),
            ..make_char("A", 10.0, 20.0, 20.0, 32.0)
        };

        // Default attributes (fontname, size) ignore colour, so they merge.
        let merged = dedupe_chars(&[red.clone(), blue.clone()], &DedupeOptions::default());
        assert_eq!(merged.len(), 1);

        // Once the fill colour is part of the comparison, both survive.
        let colour_aware = DedupeOptions {
            tolerance: 1.0,
            extra_attrs: vec![
                "fontname".to_string(),
                "size".to_string(),
                "non_stroking_color".to_string(),
            ],
        };
        let separate = dedupe_chars(&[red, blue], &colour_aware);
        assert_eq!(separate.len(), 2);
    }

    #[test]
    fn test_default_options() {
        let DedupeOptions {
            tolerance,
            extra_attrs,
        } = DedupeOptions::default();

        assert!((tolerance - 1.0).abs() < f64::EPSILON);
        assert_eq!(extra_attrs, ["fontname", "size"]);
    }
}