// pdfplumber_core/text.rs

1use crate::geometry::BBox;
2use crate::painting::Color;
3
4/// A single character extracted from a PDF page.
5#[derive(Debug, Clone, PartialEq)]
6#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
7pub struct Char {
8    /// The text content of this character.
9    pub text: String,
10    /// Bounding box in top-left origin coordinates.
11    pub bbox: BBox,
12    /// Font name.
13    pub fontname: String,
14    /// Font size in points.
15    pub size: f64,
16    /// Distance from the top of the first page (accumulates across pages).
17    pub doctop: f64,
18    /// Whether the character is upright (not rotated).
19    pub upright: bool,
20    /// Text direction for this character.
21    pub direction: TextDirection,
22    /// Stroking (outline) color, if any.
23    pub stroking_color: Option<Color>,
24    /// Non-stroking (fill) color, if any.
25    pub non_stroking_color: Option<Color>,
26    /// Current transformation matrix `[a, b, c, d, e, f]` at time of rendering.
27    pub ctm: [f64; 6],
28    /// Raw character code from the PDF content stream.
29    pub char_code: u32,
30}
31
/// Direction in which text flows.
///
/// [`TextDirection::Ltr`] is the [`Default`] value.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum TextDirection {
    /// Left-to-right: the default, used for Latin and horizontal CJK text.
    #[default]
    Ltr,
    /// Right-to-left, as in Arabic and Hebrew.
    Rtl,
    /// Top-to-bottom, as in vertical CJK writing.
    Ttb,
    /// Bottom-to-top.
    Btt,
}
46
/// Returns `true` if the character is a CJK ideograph, syllable, jamo, or kana.
///
/// Matching is done on whole Unicode blocks, so every code point inside a
/// listed range matches, whether or not it is currently assigned.
///
/// Blocks covered (in code-point order):
/// - Hangul Jamo (U+1100–U+11FF)
/// - CJK Radicals Supplement (U+2E80–U+2EFF)
/// - Kangxi Radicals (U+2F00–U+2FDF)
/// - Hiragana (U+3040–U+309F)
/// - Katakana (U+30A0–U+30FF)
/// - Bopomofo (U+3100–U+312F)
/// - Hangul Compatibility Jamo (U+3130–U+318F)
/// - Katakana Phonetic Extensions (U+31F0–U+31FF)
/// - CJK Extension A (U+3400–U+4DBF)
/// - CJK Unified Ideographs (U+4E00–U+9FFF)
/// - Hangul Jamo Extended-A (U+A960–U+A97F)
/// - Hangul Syllables (U+AC00–U+D7AF)
/// - Hangul Jamo Extended-B (U+D7B0–U+D7FF)
/// - CJK Compatibility Ideographs (U+F900–U+FAFF)
/// - Halfwidth Katakana (U+FF66–U+FF9F)
/// - CJK Extension B (U+20000–U+2A6DF)
/// - CJK Extensions C, D, E, F and I (U+2A700–U+2EE5F, contiguous blocks)
/// - CJK Compatibility Ideographs Supplement (U+2F800–U+2FA1F)
/// - CJK Extensions G and H (U+30000–U+323AF, contiguous blocks)
///
/// CJK punctuation (e.g. U+3000 IDEOGRAPHIC SPACE) and fullwidth Latin forms
/// are deliberately not matched.
pub fn is_cjk(c: char) -> bool {
    matches!(c,
        '\u{1100}'..='\u{11FF}'       // Hangul Jamo
        | '\u{2E80}'..='\u{2EFF}'     // CJK Radicals Supplement
        | '\u{2F00}'..='\u{2FDF}'     // Kangxi Radicals
        | '\u{3040}'..='\u{309F}'     // Hiragana
        | '\u{30A0}'..='\u{30FF}'     // Katakana
        | '\u{3100}'..='\u{312F}'     // Bopomofo
        | '\u{3130}'..='\u{318F}'     // Hangul Compatibility Jamo
        | '\u{31F0}'..='\u{31FF}'     // Katakana Phonetic Extensions
        | '\u{3400}'..='\u{4DBF}'     // CJK Extension A
        | '\u{4E00}'..='\u{9FFF}'     // CJK Unified Ideographs
        | '\u{A960}'..='\u{A97F}'     // Hangul Jamo Extended-A
        | '\u{AC00}'..='\u{D7AF}'     // Hangul Syllables
        | '\u{D7B0}'..='\u{D7FF}'     // Hangul Jamo Extended-B
        | '\u{F900}'..='\u{FAFF}'     // CJK Compatibility Ideographs
        | '\u{FF66}'..='\u{FF9F}'     // Halfwidth Katakana
        | '\u{20000}'..='\u{2A6DF}'   // CJK Extension B
        | '\u{2A700}'..='\u{2EE5F}'   // CJK Extensions C, D, E, F, I
        | '\u{2F800}'..='\u{2FA1F}'   // CJK Compatibility Ideographs Supplement
        | '\u{30000}'..='\u{323AF}'   // CJK Extensions G, H
    )
}
76
77/// Returns `true` if the first character of the text is CJK.
78pub fn is_cjk_text(text: &str) -> bool {
79    text.chars().next().is_some_and(is_cjk)
80}
81
#[cfg(test)]
mod tests {
    use super::*;

    /// Identity transformation matrix shared by the fixtures.
    const IDENTITY_CTM: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    /// Baseline fixture: an upright, left-to-right Helvetica "A" filled in
    /// black. Individual tests override only the fields they care about.
    fn helvetica_a() -> Char {
        Char {
            text: String::from("A"),
            bbox: BBox::new(10.0, 20.0, 20.0, 32.0),
            fontname: String::from("Helvetica"),
            size: 12.0,
            doctop: 20.0,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: Some(Color::Gray(0.0)),
            ctm: IDENTITY_CTM,
            char_code: 65,
        }
    }

    #[test]
    fn test_char_creation_basic() {
        let ch = helvetica_a();

        assert_eq!(ch.text, "A");
        assert_eq!(ch.bbox.x0, 10.0);
        assert_eq!(ch.fontname, "Helvetica");
        assert_eq!(ch.size, 12.0);
        assert_eq!(ch.doctop, 20.0);
        assert!(ch.upright);
        assert_eq!(ch.direction, TextDirection::Ltr);
        assert_eq!(ch.stroking_color, None);
        assert_eq!(ch.non_stroking_color, Some(Color::Gray(0.0)));
        assert_eq!(ch.ctm, IDENTITY_CTM);
        assert_eq!(ch.char_code, 65);
    }

    #[test]
    fn test_char_with_colors() {
        let red_stroke = Color::Rgb(1.0, 0.0, 0.0);
        let red_fill = Color::Cmyk(0.0, 1.0, 1.0, 0.0);

        let ch = Char {
            text: String::from("B"),
            bbox: BBox::new(30.0, 20.0, 40.0, 32.0),
            fontname: String::from("Times-Roman"),
            size: 14.0,
            doctop: 820.0,
            stroking_color: Some(red_stroke.clone()),
            non_stroking_color: Some(red_fill.clone()),
            ctm: [2.0, 0.0, 0.0, 2.0, 100.0, 200.0],
            char_code: 66,
            // `upright` and `direction` keep the baseline values.
            ..helvetica_a()
        };

        assert_eq!(ch.stroking_color, Some(red_stroke));
        assert_eq!(ch.non_stroking_color, Some(red_fill));
        // Translation components of the matrix.
        assert_eq!(ch.ctm[4], 100.0);
        assert_eq!(ch.ctm[5], 200.0);
        assert_eq!(ch.doctop, 820.0);
    }

    #[test]
    fn test_char_rotated_text() {
        let ch = Char {
            text: String::from("R"),
            bbox: BBox::new(50.0, 100.0, 62.0, 110.0),
            fontname: String::from("Courier"),
            size: 10.0,
            doctop: 100.0,
            upright: false,
            direction: TextDirection::Ttb,
            ctm: [0.0, 1.0, -1.0, 0.0, 50.0, 100.0],
            char_code: 82,
            // Colors keep the baseline values (no stroke, black fill).
            ..helvetica_a()
        };

        assert!(!ch.upright);
        assert_eq!(ch.direction, TextDirection::Ttb);
    }

    #[test]
    fn test_text_direction_default() {
        assert_eq!(TextDirection::default(), TextDirection::Ltr);
    }

    #[test]
    fn test_is_cjk_chinese() {
        for c in ['中', '国', '人'] {
            assert!(is_cjk(c));
        }
    }

    #[test]
    fn test_is_cjk_japanese_hiragana() {
        for c in ['あ', 'い'] {
            assert!(is_cjk(c));
        }
    }

    #[test]
    fn test_is_cjk_japanese_katakana() {
        for c in ['ア', 'イ'] {
            assert!(is_cjk(c));
        }
    }

    #[test]
    fn test_is_cjk_korean() {
        for c in ['한', '글'] {
            assert!(is_cjk(c));
        }
    }

    #[test]
    fn test_is_cjk_latin() {
        for c in ['A', 'z', '0', ' '] {
            assert!(!is_cjk(c));
        }
    }

    #[test]
    fn test_is_cjk_text() {
        for text in ["中文", "한글"] {
            assert!(is_cjk_text(text));
        }
        for text in ["Hello", ""] {
            assert!(!is_cjk_text(text));
        }
    }
}