// Source: pdfplumber_core/text.rs

1use crate::geometry::BBox;
2use crate::painting::Color;
3
4/// A single character extracted from a PDF page.
5#[derive(Debug, Clone, PartialEq)]
6#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
7pub struct Char {
8    /// The text content of this character.
9    pub text: String,
10    /// Bounding box in top-left origin coordinates.
11    pub bbox: BBox,
12    /// Font name.
13    pub fontname: String,
14    /// Font size in points.
15    pub size: f64,
16    /// Distance from the top of the first page (accumulates across pages).
17    pub doctop: f64,
18    /// Whether the character is upright (not rotated).
19    pub upright: bool,
20    /// Text direction for this character.
21    pub direction: TextDirection,
22    /// Stroking (outline) color, if any.
23    pub stroking_color: Option<Color>,
24    /// Non-stroking (fill) color, if any.
25    pub non_stroking_color: Option<Color>,
26    /// Current transformation matrix `[a, b, c, d, e, f]` at time of rendering.
27    pub ctm: [f64; 6],
28    /// Raw character code from the PDF content stream.
29    pub char_code: u32,
30    /// Marked content identifier linking this character to a structure tree element.
31    /// Set when the character is inside a marked-content sequence with an MCID.
32    pub mcid: Option<u32>,
33    /// Structure tag for this character (e.g., "P", "H1", "Span").
34    /// Derived from the structure tree element that references this character's MCID.
35    pub tag: Option<String>,
36}
37
38impl Char {
39    /// Resolve the non-stroking (fill) color to RGB.
40    ///
41    /// Converts the `non_stroking_color` to `Color::Rgb` if possible.
42    /// Returns `None` if no color is set or conversion is not possible
43    /// (e.g., `Color::Other` with unknown color space).
44    pub fn resolved_color(&self) -> Option<Color> {
45        self.non_stroking_color
46            .as_ref()
47            .and_then(|c| c.to_rgb())
48            .map(|(r, g, b)| Color::Rgb(r, g, b))
49    }
50}
51
/// Direction in which text flows.
///
/// `Ltr` is the value produced by `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum TextDirection {
    /// Left to right: the default, used for Latin and horizontal CJK text.
    #[default]
    Ltr,
    /// Right to left, as in Arabic and Hebrew.
    Rtl,
    /// Top to bottom, as in vertical CJK writing.
    Ttb,
    /// Bottom to top.
    Btt,
}
66
/// Returns `true` if the character is a CJK ideograph, syllable, or kana.
///
/// Classification is purely by Unicode block; code points that are currently
/// unassigned inside a listed block are still reported as CJK.
///
/// Covers the Unicode blocks used by Chinese, Japanese, and Korean text:
/// - CJK Unified Ideographs (U+4E00–U+9FFF)
/// - CJK Extension A (U+3400–U+4DBF)
/// - CJK Extension B (U+20000–U+2A6DF)
/// - CJK Extensions C–F and I (U+2A700–U+2EE5F)
/// - CJK Extensions G–H (U+30000–U+323AF)
/// - CJK Compatibility Ideographs (U+F900–U+FAFF)
/// - CJK Compatibility Ideographs Supplement (U+2F800–U+2FA1F)
/// - Hiragana (U+3040–U+309F)
/// - Katakana (U+30A0–U+30FF)
/// - Katakana Phonetic Extensions (U+31F0–U+31FF)
/// - Halfwidth Katakana (U+FF66–U+FF9F)
/// - Hangul Syllables (U+AC00–U+D7AF)
/// - Hangul Jamo (U+1100–U+11FF)
/// - Hangul Compatibility Jamo (U+3130–U+318F)
/// - Bopomofo (U+3100–U+312F)
/// - CJK Radicals Supplement (U+2E80–U+2EFF)
/// - Kangxi Radicals (U+2F00–U+2FDF)
///
/// CJK punctuation and fullwidth Latin forms (e.g. U+3000–U+303F,
/// U+FF01–U+FF65) are deliberately not treated as CJK.
pub fn is_cjk(c: char) -> bool {
    matches!(c,
        '\u{4E00}'..='\u{9FFF}'     // CJK Unified Ideographs
        | '\u{3400}'..='\u{4DBF}'   // CJK Extension A
        | '\u{F900}'..='\u{FAFF}'   // CJK Compatibility Ideographs
        | '\u{3040}'..='\u{309F}'   // Hiragana
        | '\u{30A0}'..='\u{30FF}'   // Katakana
        | '\u{31F0}'..='\u{31FF}'   // Katakana Phonetic Extensions
        | '\u{FF66}'..='\u{FF9F}'   // Halfwidth Katakana (U+FF65 is punctuation)
        | '\u{AC00}'..='\u{D7AF}'   // Hangul Syllables
        | '\u{1100}'..='\u{11FF}'   // Hangul Jamo
        | '\u{3130}'..='\u{318F}'   // Hangul Compatibility Jamo
        | '\u{3100}'..='\u{312F}'   // Bopomofo
        | '\u{2E80}'..='\u{2EFF}'   // CJK Radicals Supplement
        | '\u{2F00}'..='\u{2FDF}'   // Kangxi Radicals
        | '\u{20000}'..='\u{2A6DF}' // CJK Extension B
        | '\u{2A700}'..='\u{2EE5F}' // CJK Extensions C–F and I (contiguous blocks)
        | '\u{2F800}'..='\u{2FA1F}' // CJK Compatibility Ideographs Supplement
        | '\u{30000}'..='\u{323AF}' // CJK Extensions G–H (contiguous blocks)
    )
}
96
97/// Returns `true` if the first character of the text is CJK.
98pub fn is_cjk_text(text: &str) -> bool {
99    text.chars().next().is_some_and(is_cjk)
100}
101
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the baseline fixture shared by the tests below: an upright,
    /// left-to-right "A" in 12pt Helvetica with an identity CTM, no stroking
    /// color, no structure info, and the given fill color.
    ///
    /// Tests that care about other fields override them with struct-update
    /// syntax (`Char { field: .., ..make_char(..) }`).
    fn make_char(non_stroking: Option<Color>) -> Char {
        Char {
            text: "A".to_string(),
            bbox: BBox::new(0.0, 0.0, 10.0, 10.0),
            fontname: "Helvetica".to_string(),
            size: 12.0,
            doctop: 0.0,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: non_stroking,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 65,
            mcid: None,
            tag: None,
        }
    }

    // --- Char construction tests ---

    #[test]
    fn test_char_creation_basic() {
        // Spelled out in full on purpose: this literal exercises every field.
        let ch = Char {
            text: "A".to_string(),
            bbox: BBox::new(10.0, 20.0, 20.0, 32.0),
            fontname: "Helvetica".to_string(),
            size: 12.0,
            doctop: 20.0,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: Some(Color::Gray(0.0)),
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 65,
            mcid: None,
            tag: None,
        };
        assert_eq!(ch.text, "A");
        assert_eq!(ch.bbox.x0, 10.0);
        assert_eq!(ch.fontname, "Helvetica");
        assert_eq!(ch.size, 12.0);
        assert_eq!(ch.doctop, 20.0);
        assert!(ch.upright);
        assert_eq!(ch.direction, TextDirection::Ltr);
        assert_eq!(ch.stroking_color, None);
        assert_eq!(ch.non_stroking_color, Some(Color::Gray(0.0)));
        assert_eq!(ch.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(ch.char_code, 65);
        assert_eq!(ch.mcid, None);
        assert_eq!(ch.tag, None);
    }

    #[test]
    fn test_char_with_colors() {
        let ch = Char {
            text: "B".to_string(),
            bbox: BBox::new(30.0, 20.0, 40.0, 32.0),
            fontname: "Times-Roman".to_string(),
            size: 14.0,
            doctop: 820.0,
            stroking_color: Some(Color::Rgb(1.0, 0.0, 0.0)),
            ctm: [2.0, 0.0, 0.0, 2.0, 100.0, 200.0],
            char_code: 66,
            mcid: Some(3),
            tag: Some("P".to_string()),
            ..make_char(Some(Color::Cmyk(0.0, 1.0, 1.0, 0.0)))
        };
        assert_eq!(ch.stroking_color, Some(Color::Rgb(1.0, 0.0, 0.0)));
        assert_eq!(ch.non_stroking_color, Some(Color::Cmyk(0.0, 1.0, 1.0, 0.0)));
        assert_eq!(ch.ctm[4], 100.0);
        assert_eq!(ch.ctm[5], 200.0);
        assert_eq!(ch.doctop, 820.0);
        assert_eq!(ch.mcid, Some(3));
        assert_eq!(ch.tag.as_deref(), Some("P"));
    }

    #[test]
    fn test_char_rotated_text() {
        let ch = Char {
            text: "R".to_string(),
            bbox: BBox::new(50.0, 100.0, 62.0, 110.0),
            fontname: "Courier".to_string(),
            size: 10.0,
            doctop: 100.0,
            upright: false,
            direction: TextDirection::Ttb,
            ctm: [0.0, 1.0, -1.0, 0.0, 50.0, 100.0],
            char_code: 82,
            ..make_char(Some(Color::Gray(0.0)))
        };
        assert!(!ch.upright);
        assert_eq!(ch.direction, TextDirection::Ttb);
    }

    #[test]
    fn test_char_clone_and_equality() {
        let original = make_char(Some(Color::Gray(0.0)));
        let copy = original.clone();
        assert_eq!(original, copy);

        // A single differing field is enough to break equality.
        let retagged = Char {
            mcid: Some(1),
            ..original.clone()
        };
        assert_ne!(original, retagged);
    }

    // --- TextDirection tests ---

    #[test]
    fn test_text_direction_default() {
        let dir = TextDirection::default();
        assert_eq!(dir, TextDirection::Ltr);
    }

    // --- is_cjk / is_cjk_text tests ---

    #[test]
    fn test_is_cjk_chinese() {
        assert!(is_cjk('中'));
        assert!(is_cjk('国'));
        assert!(is_cjk('人'));
    }

    #[test]
    fn test_is_cjk_japanese_hiragana() {
        assert!(is_cjk('あ'));
        assert!(is_cjk('い'));
    }

    #[test]
    fn test_is_cjk_japanese_katakana() {
        assert!(is_cjk('ア'));
        assert!(is_cjk('イ'));
    }

    #[test]
    fn test_is_cjk_korean() {
        assert!(is_cjk('한'));
        assert!(is_cjk('글'));
    }

    #[test]
    fn test_is_cjk_latin() {
        assert!(!is_cjk('A'));
        assert!(!is_cjk('z'));
        assert!(!is_cjk('0'));
        assert!(!is_cjk(' '));
    }

    #[test]
    fn test_is_cjk_documented_block_bounds() {
        // First and last code point of every block listed in the `is_cjk` docs.
        let bounds = [
            ('\u{4E00}', '\u{9FFF}'),   // CJK Unified Ideographs
            ('\u{3400}', '\u{4DBF}'),   // CJK Extension A
            ('\u{20000}', '\u{2A6DF}'), // CJK Extension B
            ('\u{F900}', '\u{FAFF}'),   // CJK Compatibility Ideographs
            ('\u{3040}', '\u{309F}'),   // Hiragana
            ('\u{30A0}', '\u{30FF}'),   // Katakana
            ('\u{AC00}', '\u{D7AF}'),   // Hangul Syllables
            ('\u{1100}', '\u{11FF}'),   // Hangul Jamo
            ('\u{3100}', '\u{312F}'),   // Bopomofo
            ('\u{2E80}', '\u{2EFF}'),   // CJK Radicals Supplement
            ('\u{2F00}', '\u{2FDF}'),   // Kangxi Radicals
        ];
        for &(first, last) in bounds.iter() {
            assert!(is_cjk(first), "U+{:04X} should be CJK", first as u32);
            assert!(is_cjk(last), "U+{:04X} should be CJK", last as u32);
        }
    }

    #[test]
    fn test_is_cjk_rejects_non_cjk_scripts_and_punctuation() {
        let samples = [
            '\u{00E9}',  // Latin small e with acute
            '\u{042F}',  // Cyrillic capital Ya
            '\u{3000}',  // ideographic space
            '\u{3002}',  // ideographic full stop
            '\u{FF21}',  // fullwidth Latin capital A
            '\u{2E7F}',  // just below CJK Radicals Supplement
            '\u{2FE0}',  // just above Kangxi Radicals
            '\u{A000}',  // Yi syllable, just above CJK Unified Ideographs
            '\u{2A6E0}', // just above CJK Extension B
            '\u{1F600}', // emoji
        ];
        for &c in samples.iter() {
            assert!(!is_cjk(c), "U+{:04X} should not be CJK", c as u32);
        }
    }

    #[test]
    fn test_is_cjk_text() {
        assert!(is_cjk_text("中文"));
        assert!(is_cjk_text("한글"));
        assert!(!is_cjk_text("Hello"));
        assert!(!is_cjk_text(""));
    }

    #[test]
    fn test_is_cjk_text_checks_first_char_only() {
        // CJK first, Latin afterwards: still reported as CJK.
        assert!(is_cjk_text("中abc"));
        // CJK appearing later does not count.
        assert!(!is_cjk_text("abc中"));
        // Leading whitespace is the first character, so the result is false.
        assert!(!is_cjk_text(" 中"));
    }

    // --- Char::resolved_color tests ---

    #[test]
    fn test_resolved_color_gray_to_rgb() {
        let ch = make_char(Some(Color::Gray(0.5)));
        let resolved = ch.resolved_color();
        assert_eq!(resolved, Some(Color::Rgb(0.5, 0.5, 0.5)));
    }

    #[test]
    fn test_resolved_color_rgb_identity() {
        let ch = make_char(Some(Color::Rgb(1.0, 0.0, 0.0)));
        let resolved = ch.resolved_color();
        assert_eq!(resolved, Some(Color::Rgb(1.0, 0.0, 0.0)));
    }

    #[test]
    fn test_resolved_color_cmyk_to_rgb() {
        let ch = make_char(Some(Color::Cmyk(0.0, 0.0, 0.0, 0.0)));
        let resolved = ch.resolved_color();
        assert_eq!(resolved, Some(Color::Rgb(1.0, 1.0, 1.0)));
    }

    #[test]
    fn test_resolved_color_none() {
        let ch = make_char(None);
        let resolved = ch.resolved_color();
        assert_eq!(resolved, None);
    }

    #[test]
    fn test_resolved_color_other_returns_none() {
        let ch = make_char(Some(Color::Other(vec![0.1])));
        let resolved = ch.resolved_color();
        assert_eq!(resolved, None);
    }

    #[test]
    fn test_resolved_color_ignores_stroking_color() {
        // Only the fill color is consulted; a stroking color alone resolves to None.
        let ch = Char {
            stroking_color: Some(Color::Rgb(1.0, 0.0, 0.0)),
            ..make_char(None)
        };
        assert_eq!(ch.resolved_color(), None);
    }
}