1use crate::ast::{NodeId, PdfAstGraph};
2use crate::parser::cmap::{CMap, CMapParser};
3use crate::parser::content_stream::ContentOperator;
4use crate::types::{PdfDictionary, PdfValue};
5use std::collections::HashMap;
6
/// Walks content-stream operators and collects positioned text spans.
///
/// The extractor keeps a font table built from the resource dictionary, the
/// running graphics/text state mutated by each operator, and the spans
/// emitted so far.
#[allow(dead_code)] // `ast` and `cmaps` are stored but never read by the code visible in this file.
pub struct TextExtractor<'a> {
    /// Document graph supplied to `new`; only stored by the visible code.
    ast: &'a PdfAstGraph,
    /// Resource dictionary whose `Font` entry is read by `load_fonts`.
    page_resources: &'a PdfDictionary,
    /// Font descriptions keyed by resource name (the name used by `SetFont`).
    fonts: HashMap<String, FontInfo>,
    /// CMap table; created empty and not populated by the visible code.
    cmaps: HashMap<String, CMap>,
    /// Spans produced by the text-showing operators, in emission order until sorted.
    text_spans: Vec<TextSpan>,
    /// Matrices and text-state parameters updated by `process_operator`.
    graphics_state: GraphicsState,
    /// Currently selected font description and optional CMap.
    text_state: TextState,
}
18
/// Description of one font resource as used for decoding and measuring text.
#[derive(Debug, Clone)]
pub struct FontInfo {
    /// Font subtype name, e.g. `"Type1"`.
    pub font_type: String,
    /// Base font name.
    pub base_font: String,
    /// Encoding name; `decode_text` recognises `"WinAnsiEncoding"`,
    /// `"MacRomanEncoding"` and `"StandardEncoding"`.
    pub encoding: String,
    /// Optional node id of a ToUnicode object (presumably a CMap stream — TODO confirm).
    pub to_unicode: Option<NodeId>,
    /// Per-code widths; `get_char_width` looks entries up by `char as u32`.
    pub width_map: HashMap<u32, f64>,
    /// Width used when `width_map` has no entry for a code.
    pub default_width: f64,
    /// Font matrix; element 0 scales widths in `get_char_width`.
    pub font_matrix: [f64; 6],
}
29
/// Matrices and text-state parameters tracked while interpreting operators.
///
/// All matrices are six-element arrays `[a, b, c, d, e, f]`.
#[derive(Debug, Clone)]
pub struct GraphicsState {
    /// Current transformation matrix; overwritten by the `SetMatrix` operator.
    pub ctm: [f64; 6],
    /// Text matrix; reset by `BeginText`, set by `SetTextMatrix`, advanced when text is shown.
    pub text_matrix: [f64; 6],
    /// Text line matrix; the start-of-line position that `MoveText` offsets.
    pub text_line_matrix: [f64; 6],
    /// Leading; `MoveTextNextLine` moves by `-leading` vertically.
    pub leading: f64,
    /// Extra advance added after every character.
    pub char_space: f64,
    /// Extra advance added after every space character.
    pub word_space: f64,
    /// Horizontal scaling as a percentage (default 100.0).
    pub horizontal_scale: f64,
    /// Text rise; stored by `SetTextRise` but not read by the visible code.
    pub text_rise: f64,
    /// Resource name given to the most recent `SetFont`, if any.
    pub font: Option<String>,
    /// Font size given to the most recent `SetFont` (default 12.0).
    pub font_size: f64,
    /// Rendering mode; initialised to 0 and not otherwise touched by the visible code.
    pub render_mode: i32,
}
44
/// Font-related state used when decoding shown text.
#[derive(Debug, Clone)]
pub struct TextState {
    /// Copy of the `FontInfo` selected by the last `SetFont` that named a known font.
    pub current_font: Option<FontInfo>,
    /// When present, `decode_text` decodes through this CMap instead of the font encoding.
    pub current_cmap: Option<CMap>,
}
50
/// One run of text produced by a single text-showing operation.
#[derive(Debug, Clone)]
pub struct TextSpan {
    /// Decoded text of the run.
    pub text: String,
    /// X coordinate of the run's origin after applying the text matrix and CTM.
    pub x: f64,
    /// Y coordinate of the run's origin after applying the text matrix and CTM.
    pub y: f64,
    /// Total horizontal advance accumulated over the run's characters.
    pub width: f64,
    /// Height of the run (derived from the font size).
    pub height: f64,
    /// Font resource name active when the run was shown (empty if none was set).
    pub font_name: String,
    /// Font size active when the run was shown.
    pub font_size: f64,
    /// Width of a space character in the run's font and size; used for line merging.
    pub space_width: f64,
    /// Per-character placement, in the same order as `text`.
    pub chars: Vec<CharInfo>,
}
63
/// Placement of a single decoded character within a `TextSpan`.
#[derive(Debug, Clone)]
pub struct CharInfo {
    /// The character, stored as a string.
    pub unicode: String,
    /// X coordinate where the character starts.
    pub x: f64,
    /// Y coordinate of the character.
    pub y: f64,
    /// Width of the character (glyph width scaled by the font size).
    pub width: f64,
    /// Height of the character (derived from the font size).
    pub height: f64,
}
72
73impl<'a> TextExtractor<'a> {
74 pub fn new(ast: &'a PdfAstGraph, page_resources: &'a PdfDictionary) -> Self {
75 TextExtractor {
76 ast,
77 page_resources,
78 fonts: HashMap::new(),
79 cmaps: HashMap::new(),
80 text_spans: Vec::new(),
81 graphics_state: GraphicsState::default(),
82 text_state: TextState {
83 current_font: None,
84 current_cmap: None,
85 },
86 }
87 }
88
89 pub fn extract_text(&mut self, operators: &[ContentOperator]) -> Vec<TextSpan> {
90 self.load_fonts();
92
93 for op in operators {
95 self.process_operator(op);
96 }
97
98 self.text_spans.sort_by(|a, b| {
100 a.y.partial_cmp(&b.y)
101 .unwrap()
102 .then(a.x.partial_cmp(&b.x).unwrap())
103 });
104
105 self.text_spans.clone()
106 }
107
108 fn load_fonts(&mut self) {
109 if let Some(PdfValue::Dictionary(fonts)) = self.page_resources.get("Font") {
110 for (name, font_ref) in fonts.iter() {
111 if let PdfValue::Reference(_obj_id) = font_ref {
112 let font_info = self.parse_font_info(name.as_str(), font_ref);
115 self.fonts.insert(name.to_string(), font_info);
116 }
117 }
118 }
119 }
120
121 fn parse_font_info(&mut self, name: &str, _font_value: &PdfValue) -> FontInfo {
122 FontInfo {
129 font_type: "Type1".to_string(),
130 base_font: name.to_string(),
131 encoding: "StandardEncoding".to_string(),
132 to_unicode: None,
133 width_map: HashMap::new(),
134 default_width: 1000.0,
135 font_matrix: [0.001, 0.0, 0.0, 0.001, 0.0, 0.0],
136 }
137 }
138
139 fn process_operator(&mut self, op: &ContentOperator) {
140 match op {
141 ContentOperator::BeginText => {
142 self.graphics_state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
143 self.graphics_state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
144 }
145
146 ContentOperator::EndText => {
147 }
149
150 ContentOperator::SetFont(name, size) => {
151 self.graphics_state.font = Some(name.clone());
152 self.graphics_state.font_size = *size;
153
154 if let Some(font_info) = self.fonts.get(name) {
156 self.text_state.current_font = Some(font_info.clone());
157 }
158 }
159
160 ContentOperator::SetCharSpace(spacing) => {
161 self.graphics_state.char_space = *spacing;
162 }
163
164 ContentOperator::SetWordSpace(spacing) => {
165 self.graphics_state.word_space = *spacing;
166 }
167
168 ContentOperator::SetHorizontalScale(scale) => {
169 self.graphics_state.horizontal_scale = *scale;
170 }
171
172 ContentOperator::SetLeading(leading) => {
173 self.graphics_state.leading = *leading;
174 }
175
176 ContentOperator::SetTextRise(rise) => {
177 self.graphics_state.text_rise = *rise;
178 }
179
180 ContentOperator::MoveText(tx, ty) => {
181 let tm = &mut self.graphics_state.text_line_matrix;
182 tm[4] += tx;
183 tm[5] += ty;
184 self.graphics_state.text_matrix = *tm;
185 }
186
187 ContentOperator::MoveTextNextLine => {
188 let leading = self.graphics_state.leading;
189 self.process_operator(&ContentOperator::MoveText(0.0, -leading));
190 }
191
192 ContentOperator::SetTextMatrix(a, b, c, d, e, f) => {
193 self.graphics_state.text_matrix = [*a, *b, *c, *d, *e, *f];
194 self.graphics_state.text_line_matrix = [*a, *b, *c, *d, *e, *f];
195 }
196
197 ContentOperator::ShowText(text) => {
198 self.show_text(text);
199 }
200
201 ContentOperator::ShowTextArray(array) => {
202 for element in array {
203 match element {
204 crate::parser::content_stream::TextArrayElement::Text(text) => {
205 self.show_text(text);
206 }
207 crate::parser::content_stream::TextArrayElement::Spacing(spacing) => {
208 let adj = -spacing / 1000.0
210 * self.graphics_state.font_size
211 * self.graphics_state.horizontal_scale
212 / 100.0;
213 self.graphics_state.text_matrix[4] -= adj;
214 }
215 }
216 }
217 }
218
219 ContentOperator::ShowTextNextLine(text) => {
220 self.process_operator(&ContentOperator::MoveTextNextLine);
221 self.show_text(text);
222 }
223
224 ContentOperator::ShowTextWithSpacing(tw, tc, text) => {
225 self.graphics_state.word_space = *tw;
226 self.graphics_state.char_space = *tc;
227 self.process_operator(&ContentOperator::MoveTextNextLine);
228 self.show_text(text);
229 }
230
231 ContentOperator::Save => {
232 }
234
235 ContentOperator::Restore => {
236 }
238
239 ContentOperator::SetMatrix(a, b, c, d, e, f) => {
240 self.graphics_state.ctm = [*a, *b, *c, *d, *e, *f];
241 }
242
243 _ => {
244 }
246 }
247 }
248
249 fn show_text(&mut self, text_bytes: &[u8]) {
250 if self.text_state.current_font.is_none() {
251 return;
252 }
253
254 let font = self.text_state.current_font.as_ref().unwrap();
255 let mut chars = Vec::new();
256 let mut total_width = 0.0;
257
258 let decoded = self.decode_text(text_bytes, font);
260
261 let tm = &self.graphics_state.text_matrix;
263 let ctm = &self.graphics_state.ctm;
264
265 let (x, y) = self.transform_point(0.0, 0.0, tm, ctm);
267
268 for ch in decoded.chars() {
269 let char_width = self.get_char_width(ch, font);
270
271 let char_info = CharInfo {
272 unicode: ch.to_string(),
273 x: x + total_width,
274 y,
275 width: char_width * self.graphics_state.font_size,
276 height: self.graphics_state.font_size,
277 };
278
279 chars.push(char_info);
280
281 total_width += char_width * self.graphics_state.font_size;
283 total_width += self.graphics_state.char_space;
284
285 if ch == ' ' {
286 total_width += self.graphics_state.word_space;
287 }
288 }
289
290 self.graphics_state.text_matrix[4] += total_width;
292
293 if !chars.is_empty() {
295 let span = TextSpan {
296 text: decoded,
297 x,
298 y,
299 width: total_width,
300 height: self.graphics_state.font_size,
301 font_name: self.graphics_state.font.clone().unwrap_or_default(),
302 font_size: self.graphics_state.font_size,
303 space_width: self.get_char_width(' ', font) * self.graphics_state.font_size,
304 chars,
305 };
306
307 self.text_spans.push(span);
308 }
309 }
310
311 fn decode_text(&self, text_bytes: &[u8], font: &FontInfo) -> String {
312 if let Some(cmap) = &self.text_state.current_cmap {
314 return self.decode_with_cmap(text_bytes, cmap);
315 }
316
317 match font.encoding.as_str() {
319 "WinAnsiEncoding" => self.decode_win_ansi(text_bytes),
320 "MacRomanEncoding" => self.decode_mac_roman(text_bytes),
321 "StandardEncoding" => {
322 String::from_utf8_lossy(text_bytes).to_string()
324 }
325 _ => {
326 String::from_utf8_lossy(text_bytes).to_string()
328 }
329 }
330 }
331
332 fn decode_with_cmap(&self, text_bytes: &[u8], cmap: &CMap) -> String {
333 let mut result = String::new();
334 let mut i = 0;
335
336 while i < text_bytes.len() {
337 if i + 1 < text_bytes.len() {
339 let code = &text_bytes[i..i + 2];
340 if let Some(unicode) = CMapParser::new(
341 &mut PdfAstGraph::new(),
342 &crate::parser::reference_resolver::ObjectNodeMap::new(),
343 )
344 .map_code_to_unicode(cmap, code)
345 {
346 result.push_str(&unicode);
347 i += 2;
348 continue;
349 }
350 }
351
352 let code = &text_bytes[i..i + 1];
354 if let Some(unicode) = CMapParser::new(
355 &mut PdfAstGraph::new(),
356 &crate::parser::reference_resolver::ObjectNodeMap::new(),
357 )
358 .map_code_to_unicode(cmap, code)
359 {
360 result.push_str(&unicode);
361 } else {
362 result.push(text_bytes[i] as char);
364 }
365
366 i += 1;
367 }
368
369 result
370 }
371
372 fn decode_win_ansi(&self, text_bytes: &[u8]) -> String {
373 text_bytes
374 .iter()
375 .map(|&b| {
376 if b < 128 {
377 b as char
378 } else {
379 match b {
381 0x80 => '€',
382 0x82 => '‚',
383 0x83 => 'ƒ',
384 0x84 => '„',
385 0x85 => '…',
386 0x86 => '†',
387 0x87 => '‡',
388 0x88 => 'ˆ',
389 0x89 => '‰',
390 0x8A => 'Š',
391 0x8B => '‹',
392 0x8C => 'Œ',
393 0x8E => 'Ž',
394 0x91 => '\'',
395 0x92 => '\'',
396 0x93 => '"',
397 0x94 => '"',
398 0x95 => '•',
399 0x96 => '–',
400 0x97 => '—',
401 0x98 => '˜',
402 0x99 => '™',
403 0x9A => 'š',
404 0x9B => '›',
405 0x9C => 'œ',
406 0x9E => 'ž',
407 0x9F => 'Ÿ',
408 _ => b as char,
409 }
410 }
411 })
412 .collect()
413 }
414
415 fn decode_mac_roman(&self, text_bytes: &[u8]) -> String {
416 String::from_utf8_lossy(text_bytes).to_string()
418 }
419
420 fn get_char_width(&self, ch: char, font: &FontInfo) -> f64 {
421 let code = ch as u32;
423 font.width_map
424 .get(&code)
425 .copied()
426 .unwrap_or(font.default_width)
427 * font.font_matrix[0]
428 }
429
430 fn transform_point(&self, x: f64, y: f64, tm: &[f64; 6], ctm: &[f64; 6]) -> (f64, f64) {
431 let tx = tm[0] * x + tm[2] * y + tm[4];
433 let ty = tm[1] * x + tm[3] * y + tm[5];
434
435 let dx = ctm[0] * tx + ctm[2] * ty + ctm[4];
437 let dy = ctm[1] * tx + ctm[3] * ty + ctm[5];
438
439 (dx, dy)
440 }
441
442 pub fn merge_spans(&mut self) -> Vec<TextLine> {
443 let mut lines = Vec::new();
444 let mut current_line = TextLine::new();
445
446 for span in &self.text_spans {
447 if current_line.should_add_span(span) {
448 current_line.add_span(span.clone());
449 } else {
450 if !current_line.spans.is_empty() {
451 lines.push(current_line);
452 }
453 current_line = TextLine::new();
454 current_line.add_span(span.clone());
455 }
456 }
457
458 if !current_line.spans.is_empty() {
459 lines.push(current_line);
460 }
461
462 lines
463 }
464}
465
466impl Default for GraphicsState {
467 fn default() -> Self {
468 GraphicsState {
469 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
470 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
471 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
472 leading: 0.0,
473 char_space: 0.0,
474 word_space: 0.0,
475 horizontal_scale: 100.0,
476 text_rise: 0.0,
477 font: None,
478 font_size: 12.0,
479 render_mode: 0,
480 }
481 }
482}
483
/// A group of spans that `should_add_span` judged to lie on the same line.
#[derive(Debug, Clone)]
pub struct TextLine {
    /// Spans in the order they were added.
    pub spans: Vec<TextSpan>,
    /// X coordinate taken from the first span added.
    pub x: f64,
    /// Y coordinate taken from the first span added.
    pub y: f64,
    /// Horizontal extent of the line, measured from `x`.
    pub width: f64,
    /// Height taken from the first span added.
    pub height: f64,
}
492
493impl Default for TextLine {
494 fn default() -> Self {
495 Self::new()
496 }
497}
498
499impl TextLine {
500 pub fn new() -> Self {
501 TextLine {
502 spans: Vec::new(),
503 x: 0.0,
504 y: 0.0,
505 width: 0.0,
506 height: 0.0,
507 }
508 }
509
510 pub fn should_add_span(&self, span: &TextSpan) -> bool {
511 if self.spans.is_empty() {
512 return true;
513 }
514
515 let last = &self.spans[self.spans.len() - 1];
516
517 let y_diff = (span.y - last.y).abs();
519 if y_diff > last.height * 0.3 {
520 return false;
521 }
522
523 let expected_x = last.x + last.width;
525 let x_diff = span.x - expected_x;
526
527 x_diff < last.space_width * 3.0
529 }
530
531 pub fn add_span(&mut self, span: TextSpan) {
532 if self.spans.is_empty() {
533 self.x = span.x;
534 self.y = span.y;
535 self.height = span.height;
536 }
537
538 self.width = (span.x + span.width) - self.x;
539 self.spans.push(span);
540 }
541
542 pub fn get_text(&self) -> String {
543 self.spans
544 .iter()
545 .map(|s| s.text.as_str())
546 .collect::<Vec<_>>()
547 .join(" ")
548 }
549}