1use crate::parser::ParseResult;
7use crate::parser::content::{ContentParser, ContentOperation, TextElement};
8use crate::parser::document::PdfDocument;
9use std::io::{Read, Seek};
10
/// Tuning knobs for [`TextExtractor`].
///
/// The `Default` implementation in this file uses `preserve_layout = false`,
/// `space_threshold = 0.2` and `newline_threshold = 10.0`.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractionOptions {
    /// When `true`, positioned [`TextFragment`]s are collected in addition to
    /// the flat page text.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a fraction of the current font size, above
    /// which a space is inserted between two consecutive text runs.
    pub space_threshold: f64,
    /// Vertical distance between two consecutive text runs above which a
    /// newline is inserted.
    pub newline_threshold: f64,
}
21
22impl Default for ExtractionOptions {
23 fn default() -> Self {
24 Self {
25 preserve_layout: false,
26 space_threshold: 0.2,
27 newline_threshold: 10.0,
28 }
29 }
30}
31
32#[derive(Debug, Clone)]
34pub struct ExtractedText {
35 pub text: String,
37 pub fragments: Vec<TextFragment>,
39}
40
/// A single run of shown text together with its estimated placement.
#[derive(Debug, Clone, PartialEq)]
pub struct TextFragment {
    /// Decoded text of the run.
    pub text: String,
    /// X coordinate of the run's origin, obtained by transforming `(0, 0)`
    /// through the text matrix in effect when the run was shown.
    pub x: f64,
    /// Y coordinate of the run's origin (same transformation as `x`).
    pub y: f64,
    /// Estimated width of the run (derived from character count and font
    /// size, not from real glyph metrics).
    pub width: f64,
    /// Height of the run; the extractor stores the font size here.
    pub height: f64,
    /// Font size in effect when the run was shown.
    pub font_size: f64,
}
57
/// Mutable text-related state tracked while walking a page's content
/// operations.
struct TextState {
    /// Current text matrix as `[a, b, c, d, e, f]`; advanced after each shown
    /// string.
    text_matrix: [f64; 6],
    /// Matrix marking the start of the current line; line-move operations are
    /// applied relative to it.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix.
    // NOTE(review): initialised to identity and never updated in the code
    // visible in this file.
    ctm: [f64; 6],
    /// Line leading, set by `ContentOperation::SetLeading`.
    leading: f64,
    /// Character spacing, set by `ContentOperation::SetCharSpacing`.
    char_space: f64,
    /// Word spacing, set by `ContentOperation::SetWordSpacing`.
    word_space: f64,
    /// Horizontal scaling in percent (`100.0` means unscaled).
    horizontal_scale: f64,
    /// Text rise, set by `ContentOperation::SetTextRise`.
    text_rise: f64,
    /// Current font size, set by `ContentOperation::SetFont`.
    font_size: f64,
    /// Name of the current font resource, if one has been selected.
    font_name: Option<String>,
    /// Text render mode, set by `ContentOperation::SetTextRenderMode`.
    render_mode: u8,
}
83
84impl Default for TextState {
85 fn default() -> Self {
86 Self {
87 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
88 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
89 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
90 leading: 0.0,
91 char_space: 0.0,
92 word_space: 0.0,
93 horizontal_scale: 100.0,
94 text_rise: 0.0,
95 font_size: 0.0,
96 font_name: None,
97 render_mode: 0,
98 }
99 }
100}
101
102pub struct TextExtractor {
104 options: ExtractionOptions,
105}
106
107impl TextExtractor {
108 pub fn new() -> Self {
110 Self {
111 options: ExtractionOptions::default(),
112 }
113 }
114
115 pub fn with_options(options: ExtractionOptions) -> Self {
117 Self { options }
118 }
119
120 pub fn extract_from_document<R: Read + Seek>(
122 &self,
123 document: &PdfDocument<R>,
124 ) -> ParseResult<Vec<ExtractedText>> {
125 let page_count = document.page_count()?;
126 let mut results = Vec::new();
127
128 for i in 0..page_count {
129 let text = self.extract_from_page(document, i)?;
130 results.push(text);
131 }
132
133 Ok(results)
134 }
135
136 pub fn extract_from_page<R: Read + Seek>(
138 &self,
139 document: &PdfDocument<R>,
140 page_index: u32,
141 ) -> ParseResult<ExtractedText> {
142 let page = document.get_page(page_index)?;
144
145 let streams = page.content_streams_with_document(document)?;
147
148 let mut extracted_text = String::new();
149 let mut fragments = Vec::new();
150 let mut state = TextState::default();
151 let mut in_text_object = false;
152 let mut last_x = 0.0;
153 let mut last_y = 0.0;
154
155 for stream_data in streams {
157 let operations = ContentParser::parse_content(&stream_data)?;
158
159 for op in operations {
160 match op {
161 ContentOperation::BeginText => {
162 in_text_object = true;
163 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
165 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
166 }
167
168 ContentOperation::EndText => {
169 in_text_object = false;
170 }
171
172 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
173 state.text_matrix = [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
174 state.text_line_matrix = [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
175 }
176
177 ContentOperation::MoveText(tx, ty) => {
178 let new_matrix = multiply_matrix(
180 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
181 &state.text_line_matrix,
182 );
183 state.text_matrix = new_matrix;
184 state.text_line_matrix = new_matrix;
185 }
186
187 ContentOperation::NextLine => {
188 let new_matrix = multiply_matrix(
190 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
191 &state.text_line_matrix,
192 );
193 state.text_matrix = new_matrix;
194 state.text_line_matrix = new_matrix;
195 }
196
197 ContentOperation::ShowText(text) => {
198 if in_text_object {
199 let text_bytes = &text;
200 let decoded = self.decode_text(text_bytes, &state)?;
201
202 let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
204
205 if !extracted_text.is_empty() {
207 let dx = x - last_x;
208 let dy = (y - last_y).abs();
209
210 if dy > self.options.newline_threshold {
211 extracted_text.push('\n');
212 } else if dx > self.options.space_threshold * state.font_size {
213 extracted_text.push(' ');
214 }
215 }
216
217 extracted_text.push_str(&decoded);
218
219 if self.options.preserve_layout {
220 fragments.push(TextFragment {
221 text: decoded.clone(),
222 x,
223 y,
224 width: calculate_text_width(&decoded, state.font_size),
225 height: state.font_size,
226 font_size: state.font_size,
227 });
228 }
229
230 last_x = x + calculate_text_width(&decoded, state.font_size);
232 last_y = y;
233
234 let text_width = calculate_text_width(&decoded, state.font_size);
236 let tx = text_width * state.horizontal_scale / 100.0;
237 state.text_matrix = multiply_matrix(
238 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
239 &state.text_matrix,
240 );
241 }
242 }
243
244 ContentOperation::ShowTextArray(array) => {
245 if in_text_object {
246 for item in array {
247 match item {
248 TextElement::Text(text_bytes) => {
249 let decoded = self.decode_text(&text_bytes, &state)?;
250 extracted_text.push_str(&decoded);
251
252 let text_width = calculate_text_width(&decoded, state.font_size);
254 let tx = text_width * state.horizontal_scale / 100.0;
255 state.text_matrix = multiply_matrix(
256 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
257 &state.text_matrix,
258 );
259 }
260 TextElement::Spacing(adjustment) => {
261 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
263 state.text_matrix = multiply_matrix(
264 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
265 &state.text_matrix,
266 );
267 }
268 }
269 }
270 }
271 }
272
273 ContentOperation::SetFont(name, size) => {
274 state.font_name = Some(name);
275 state.font_size = size as f64;
276 }
277
278 ContentOperation::SetLeading(leading) => {
279 state.leading = leading as f64;
280 }
281
282 ContentOperation::SetCharSpacing(spacing) => {
283 state.char_space = spacing as f64;
284 }
285
286 ContentOperation::SetWordSpacing(spacing) => {
287 state.word_space = spacing as f64;
288 }
289
290 ContentOperation::SetHorizontalScaling(scale) => {
291 state.horizontal_scale = scale as f64;
292 }
293
294 ContentOperation::SetTextRise(rise) => {
295 state.text_rise = rise as f64;
296 }
297
298 ContentOperation::SetTextRenderMode(mode) => {
299 state.render_mode = mode as u8;
300 }
301
302 _ => {
303 }
305 }
306 }
307 }
308
309 Ok(ExtractedText {
310 text: extracted_text,
311 fragments,
312 })
313 }
314
315 fn decode_text(&self, text: &[u8], _state: &TextState) -> ParseResult<String> {
317 use crate::text::encoding::TextEncoding;
320
321 let encoding = TextEncoding::WinAnsiEncoding;
322 Ok(encoding.decode(text))
323 }
324}
325
326impl Default for TextExtractor {
327 fn default() -> Self {
328 Self::new()
329 }
330}
331
/// Multiplies two affine matrices stored as `[a, b, c, d, e, f]`, returning
/// `a × b` (i.e. `a` is applied first, then `b`).
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
343
/// Applies the affine `matrix` (`[a, b, c, d, e, f]`) to the point `(x, y)`
/// and returns the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
350
/// Estimates the width of `text` at `font_size`, assuming every character is
/// half the font size wide.
///
/// Characters are counted rather than UTF-8 bytes, so a decoded non-ASCII
/// character (for example `é`) contributes a single glyph width.
fn calculate_text_width(text: &str, font_size: f64) -> f64 {
    text.chars().count() as f64 * font_size * 0.5
}
356
#[cfg(test)]
mod tests {
    use super::*;

    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
    const TRANSLATION: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
    const SCALE: [f64; 6] = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];

    /// The identity must be neutral on both sides.
    #[test]
    fn test_matrix_multiplication() {
        assert_eq!(multiply_matrix(&IDENTITY, &TRANSLATION), TRANSLATION);
        assert_eq!(multiply_matrix(&TRANSLATION, &IDENTITY), TRANSLATION);
    }

    /// Identity-only checks cannot catch swapped operands, so also pin a
    /// non-commutative pair.
    #[test]
    fn test_matrix_multiplication_order() {
        assert_eq!(
            multiply_matrix(&TRANSLATION, &SCALE),
            [2.0, 0.0, 0.0, 3.0, 20.0, 60.0]
        );
        assert_eq!(
            multiply_matrix(&SCALE, &TRANSLATION),
            [2.0, 0.0, 0.0, 3.0, 10.0, 20.0]
        );
    }

    #[test]
    fn test_transform_point() {
        assert_eq!(transform_point(5.0, 5.0, &TRANSLATION), (15.0, 25.0));
        assert_eq!(transform_point(5.0, 5.0, &SCALE), (10.0, 15.0));
    }

    #[test]
    fn test_calculate_text_width_ascii() {
        assert_eq!(calculate_text_width("", 12.0), 0.0);
        assert_eq!(calculate_text_width("abc", 12.0), 18.0);
    }
}