1use log::warn;
2
3use crate::{
4 content::{Content, Operation},
5 document::Document,
6 encodings::Encoding,
7 error::ParseError,
8 object::Object::Name,
9 xref::{Xref, XrefEntry, XrefType},
10 Error, Result,
11};
12use crate::{parser, Dictionary, Object, ObjectId, Stream};
13use std::{
14 collections::BTreeMap,
15 io::{Cursor, Read},
16};
17
18impl Content<Vec<Operation>> {
19 pub fn decode(data: &[u8]) -> Result<Self> {
21 parser::content(data).ok_or(ParseError::InvalidContentStream.into())
22 }
23
24 pub fn decode_strict(data: &[u8]) -> Result<Self> {
26 parser::content_strict(data).map_err(|e| e.into())
27 }
28}
29
30impl Stream {
31 pub fn decode_content(&self) -> Result<Content<Vec<Operation>>> {
33 Content::decode(&self.content)
34 }
35}
36
37impl Document {
38 pub fn get_and_decode_page_content(&self, page_id: ObjectId) -> Result<Content<Vec<Operation>>> {
40 let content_data = self.get_page_content(page_id)?;
41 Content::decode(&content_data)
42 }
43
44 pub fn add_to_page_content(&mut self, page_id: ObjectId, content: Content<Vec<Operation>>) -> Result<()> {
46 let content_data = Content::encode(&content)?;
47 self.add_page_contents(page_id, content_data)?;
48 Ok(())
49 }
50
51 pub fn extract_text(&self, page_numbers: &[u32]) -> Result<String> {
52 let text_fragments = self.extract_text_chunks(page_numbers);
53 let mut text = String::new();
54 for maybe_text_fragment in text_fragments.into_iter() {
55 let text_fragment = maybe_text_fragment?;
56 text.push_str(&text_fragment);
57 }
58
59 Ok(text)
60 }
61
62 pub fn extract_text_chunks(&self, page_numbers: &[u32]) -> Vec<Result<String>> {
63 let pages: BTreeMap<u32, (u32, u16)> = self.get_pages();
64 page_numbers
65 .iter()
66 .flat_map(|page_number| {
67 let result = self.extract_text_chunks_from_page(&pages, *page_number);
68 match result {
69 Ok(text_chunks) => text_chunks,
70 Err(err) => vec![Err(err)],
71 }
72 })
73 .collect()
74 }
75
76 fn extract_text_chunks_from_page(
77 &self, pages: &BTreeMap<u32, (u32, u16)>, page_number: u32,
78 ) -> Result<Vec<Result<String>>> {
79 let mut collected_chunks_and_errs: Vec<std::result::Result<String, Error>> = Vec::new();
80
81 let page_id = *pages.get(&page_number).ok_or(Error::PageNumberNotFound(page_number))?;
82 let fonts = self.get_page_fonts(page_id)?;
83 let encodings: BTreeMap<Vec<u8>, Encoding> = fonts
84 .into_iter()
85 .filter_map(|(name, font)| match font.get_font_encoding(self) {
86 Ok(it) => Some((name, it)),
87 Err(err) => {
88 collected_chunks_and_errs.push(Err(err));
89 None
90 }
91 })
92 .collect();
93 let content_data = self.get_page_content(page_id)?;
94 let content = Content::decode(&content_data)?;
95
96 let mut current_encoding = None;
98 let mut current_text = String::new();
99 for operation in &content.operations {
100 match operation.operator.as_ref() {
101 "Tf" => {
102 let current_font = operation
103 .operands
104 .first()
105 .ok_or_else(|| Error::Syntax("missing font operand".to_string()))?
106 .as_name();
107 current_encoding = match current_font {
108 Ok(font) => encodings.get(font),
109 Err(err) => {
110 collected_chunks_and_errs.push(Err(err));
111 None
112 }
113 };
114
115 if !current_text.is_empty() {
116 collected_chunks_and_errs.push(Ok(current_text));
117 current_text = String::new();
118 }
119 }
120 "Tj" | "TJ" => match current_encoding {
121 Some(encoding) => {
122 let res = collect_text(&mut current_text, encoding, &operation.operands);
123 if let Err(err) = res {
124 collected_chunks_and_errs.push(Err(err));
125 }
126 }
127 None => warn!("Could not decode extracted text"),
128 },
129 "'" => match current_encoding {
133 Some(encoding) => {
134 if !current_text.ends_with('\n') {
135 current_text.push('\n');
136 }
137 let res = collect_text(&mut current_text, encoding, &operation.operands);
138 if let Err(err) = res {
139 collected_chunks_and_errs.push(Err(err));
140 }
141 }
142 None => warn!("Could not decode extracted text"),
143 },
144 "\"" => match current_encoding {
150 Some(encoding) => {
151 if !current_text.ends_with('\n') {
152 current_text.push('\n');
153 }
154 if let Some(string_operand) = operation.operands.get(2) {
155 let res = collect_text(&mut current_text, encoding, std::slice::from_ref(string_operand));
156 if let Err(err) = res {
157 collected_chunks_and_errs.push(Err(err));
158 }
159 }
160 }
161 None => warn!("Could not decode extracted text"),
162 },
163 "T*" if !current_text.ends_with('\n') => current_text.push('\n'),
168 "T*" => {}
169 "ET" if !current_text.ends_with('\n') => current_text.push('\n'),
170 "ET" => {}
171 _ => {}
172 }
173 }
174 if !current_text.is_empty() {
175 collected_chunks_and_errs.push(Ok(current_text));
176 }
177
178 Ok(collected_chunks_and_errs)
179 }
180
181 pub fn replace_text(
182 &mut self, page_number: u32, text: &str, other_text: &str, default_str: Option<&str>,
183 ) -> Result<()> {
184 let page = page_number.saturating_sub(1) as usize;
185 let page_id = self
186 .page_iter()
187 .nth(page)
188 .ok_or(Error::PageNumberNotFound(page_number))?;
189 let encodings: BTreeMap<Vec<u8>, Encoding> = self
190 .get_page_fonts(page_id)?
191 .into_iter()
192 .map(|(name, font)| font.get_font_encoding(self).map(|it| (name, it)))
193 .collect::<Result<BTreeMap<Vec<u8>, Encoding>>>()?;
194 let content_data = self.get_page_content(page_id)?;
195 let mut content = Content::decode(&content_data)?;
196 let mut current_encoding = None;
197 for operation in &mut content.operations {
198 match operation.operator.as_ref() {
199 "Tf" => {
200 let current_font = operation
201 .operands
202 .first()
203 .ok_or_else(|| Error::Syntax("missing font operand".to_string()))?
204 .as_name()?;
205 current_encoding = encodings.get(current_font);
206 }
207 "Tj" | "TJ" => match current_encoding {
208 Some(encoding) => {
209 try_to_replace_encoded_text(operation, encoding, text, other_text, default_str.unwrap_or(""))?
210 }
211 None => {
212 warn!("Could not decode extracted text, some of the occurances might not be properly replaced")
213 }
214 },
215 _ => {}
216 }
217 }
218 let modified_content = content.encode()?;
219 self.change_page_content(page_id, modified_content)
220 }
221
222 pub fn replace_partial_text(
223 &mut self, page_number: u32, search_text: &str, replacement_text: &str, default_char: Option<&str>,
224 ) -> Result<usize> {
225 let page = page_number.saturating_sub(1) as usize;
226 let page_id = self
227 .page_iter()
228 .nth(page)
229 .ok_or(Error::PageNumberNotFound(page_number))?;
230
231 let encodings: BTreeMap<Vec<u8>, Encoding> = self
232 .get_page_fonts(page_id)?
233 .into_iter()
234 .map(|(name, font)| font.get_font_encoding(self).map(|it| (name, it)))
235 .collect::<Result<BTreeMap<Vec<u8>, Encoding>>>()?;
236
237 let content_data = self.get_page_content(page_id)?;
238 let mut content = Content::decode(&content_data)?;
239 let mut current_encoding = None;
240 let mut replacement_count = 0;
241
242 for operation in &mut content.operations {
243 match operation.operator.as_ref() {
244 "Tf" => {
245 let current_font = operation
246 .operands
247 .first()
248 .ok_or_else(|| Error::Syntax("missing font operand".to_string()))?
249 .as_name()?;
250 current_encoding = encodings.get(current_font);
251 }
252 "Tj" | "TJ" => {
253 if let Some(encoding) = current_encoding {
254 replacement_count += replace_partial_in_operation(
255 operation,
256 encoding,
257 search_text,
258 replacement_text,
259 default_char.unwrap_or("?"),
260 )?;
261 } else {
262 warn!("No encoding found for text operation");
263 }
264 }
265 _ => {}
266 }
267 }
268
269 if replacement_count > 0 {
270 let modified_content = content.encode()?;
271 self.change_page_content(page_id, modified_content)?;
272 }
273
274 Ok(replacement_count)
275 }
276
277 pub fn insert_image(
278 &mut self, page_id: ObjectId, img_object: Stream, position: (f32, f32), size: (f32, f32),
279 ) -> Result<()> {
280 let img_id = self.add_object(img_object);
281 let img_name = format!("X{}", img_id.0);
282
283 self.add_xobject(page_id, img_name.as_bytes(), img_id)?;
284
285 let mut content = self.get_and_decode_page_content(page_id)?;
286 content.operations.push(Operation::new("q", vec![]));
287 content.operations.push(Operation::new(
288 "cm",
289 vec![
290 size.0.into(),
291 0.into(),
292 0.into(),
293 size.1.into(),
294 position.0.into(),
295 position.1.into(),
296 ],
297 ));
298 content
299 .operations
300 .push(Operation::new("Do", vec![Name(img_name.as_bytes().to_vec())]));
301 content.operations.push(Operation::new("Q", vec![]));
302
303 self.change_page_content(page_id, content.encode()?)
304 }
305
306 pub fn insert_form_object(&mut self, page_id: ObjectId, form_obj: Stream) -> Result<()> {
307 let form_id = self.add_object(form_obj);
308 let form_name = format!("X{}", form_id.0);
309
310 let mut content = self.get_and_decode_page_content(page_id)?;
311 content.operations.insert(0, Operation::new("q", vec![]));
312 content.operations.push(Operation::new("Q", vec![]));
313 content
314 .operations
315 .push(Operation::new("Do", vec![Name(form_name.as_bytes().to_vec())]));
316 let modified_content = content.encode()?;
317 self.add_xobject(page_id, form_name, form_id)?;
318
319 self.change_page_content(page_id, modified_content)
320 }
321}
322fn collect_text(text: &mut String, encoding: &Encoding, operands: &[Object]) -> Result<()> {
323 for operand in operands.iter() {
324 match operand {
325 Object::String(bytes, _) => {
326 encoding.write_to_string(bytes, text)?;
327 }
328 Object::Array(arr) => {
329 collect_text(text, encoding, arr)?;
330 text.push(' ');
331 }
332 Object::Integer(i) if *i < -100 => {
333 text.push(' ');
334 }
335 _ => {}
336 }
337 }
338 Ok(())
339}
/// Returns the slice of `s` that starts at character index `start` and
/// spans at most `len` characters.
///
/// Indices count `char`s, not bytes, so the result always falls on valid
/// UTF-8 boundaries. The result is empty when `len` is 0 or when `start` is
/// at or past the end of `s`; it is shorter than `len` when `s` ends first.
pub fn substr(s: &str, start: usize, len: usize) -> &str {
    // Without this guard a zero length would behave like a length of one,
    // because `nth(0)` below already skips a character.
    if len == 0 {
        return "";
    }

    let mut offsets = s.char_indices().map(|(offset, _)| offset).skip(start);

    let Some(begin) = offsets.next() else {
        return "";
    };

    // `begin`'s character is already consumed, so the `len - 1`-th remaining
    // offset is the first byte past the requested span.
    let end = offsets.nth(len - 1).unwrap_or(s.len());

    &s[begin..end]
}
/// Returns the tail of `s` beginning at character index `start`.
///
/// Indices count `char`s, not bytes. The result is empty when `start` is at
/// or past the end of `s`.
pub fn substring(s: &str, start: usize) -> &str {
    match s.char_indices().nth(start) {
        Some((offset, _)) => &s[offset..],
        None => "",
    }
}
363
364fn encode(encoding: &Encoding, txt: &str, default_str: &str) -> Vec<u8> {
365 if txt.chars().count() > 1 {
366 let mut cur = 0;
367 let mut result = Vec::new();
368 while cur < txt.chars().count() {
369 let c = substr(txt, cur, 1);
370 result.extend_from_slice(&encode(encoding, c, default_str));
371 cur += 1;
372 }
373 result
374 } else {
375 let encoded_bytes = Document::encode_text(encoding, txt);
376 if !encoded_bytes.is_empty() {
377 encoded_bytes
378 } else {
379 Document::encode_text(encoding, default_str)
380 }
381 }
382}
383fn try_to_replace_encoded_text(
384 operation: &mut Operation, encoding: &Encoding, text_to_replace: &str, replacement: &str, default_str: &str,
385) -> Result<()> {
386 for operand in &mut operation.operands {
387 match operand {
388 Object::String(bytes, _) => {
389 let decoded_text = Document::decode_text(encoding, bytes)?;
390 if decoded_text == text_to_replace {
391 let encoded_bytes = encode(encoding, replacement, default_str);
392 *bytes = encoded_bytes;
393 }
394 }
395 Object::Array(arr) => {
396 let mut str_collected = String::new();
397 collect_text(&mut str_collected, encoding, arr)?;
398 if str_collected == text_to_replace {
399 let encoded_replacement = encode(encoding, replacement, default_str);
410 let mut placed = false;
411 for item in arr.iter_mut() {
412 if let Object::String(bytes, _f) = item {
413 if placed {
414 *bytes = Vec::new();
415 } else {
416 bytes.clone_from(&encoded_replacement);
417 placed = true;
418 }
419 }
420 }
421 }
422 }
423 _ => {}
424 }
425 }
426
427 Ok(())
428}
429
430fn replace_partial_in_operation(
431 operation: &mut Operation, encoding: &Encoding, search_text: &str, replacement_text: &str, default_char: &str,
432) -> Result<usize> {
433 let mut replacement_count = 0;
434
435 for operand in &mut operation.operands {
436 match operand {
437 Object::String(bytes, _) => {
438 let decoded_text = Document::decode_text(encoding, bytes)?;
439 if decoded_text.contains(search_text) {
440 let new_text = decoded_text.replace(search_text, replacement_text);
441 let encoded_bytes = encode_with_fallback(encoding, &new_text, default_char);
442 *bytes = encoded_bytes;
443 replacement_count += decoded_text.matches(search_text).count();
444 }
445 }
446 Object::Array(arr) => {
447 replacement_count +=
448 replace_partial_in_array(arr, encoding, search_text, replacement_text, default_char)?;
449 }
450 _ => {}
451 }
452 }
453
454 Ok(replacement_count)
455}
456
457fn replace_partial_in_array(
458 arr: &mut [Object], encoding: &Encoding, search_text: &str, replacement_text: &str, default_char: &str,
459) -> Result<usize> {
460 let mut replacement_count = 0;
461
462 for item in arr.iter_mut() {
463 if let Object::String(bytes, _) = item {
464 let decoded_text = Document::decode_text(encoding, bytes)?;
465 if decoded_text.contains(search_text) {
466 let new_text = decoded_text.replace(search_text, replacement_text);
467 let encoded_bytes = encode_with_fallback(encoding, &new_text, default_char);
468 *bytes = encoded_bytes;
469 replacement_count += decoded_text.matches(search_text).count();
470 }
471 }
472 }
473
474 Ok(replacement_count)
475}
476
477fn encode_with_fallback(encoding: &Encoding, text: &str, default_char: &str) -> Vec<u8> {
478 let encoded = Document::encode_text(encoding, text);
479 if !encoded.is_empty() {
480 return encoded;
481 }
482
483 encode(encoding, text, default_char)
484}
485
486pub fn decode_xref_stream(mut stream: Stream) -> Result<(Xref, Dictionary)> {
488 if stream.is_compressed() {
489 stream.decompress()?;
490 }
491 let mut dict = stream.dict;
492 let mut reader = Cursor::new(stream.content);
493 let size = dict
494 .get(b"Size")
495 .and_then(Object::as_i64)
496 .map_err(|_| ParseError::InvalidXref)?;
497 let mut xref = Xref::new(size as u32, XrefType::CrossReferenceStream);
498 {
499 let section_indice = dict
500 .get(b"Index")
501 .and_then(parse_integer_array)
502 .unwrap_or_else(|_| vec![0, size]);
503 let field_widths = dict
504 .get(b"W")
505 .and_then(parse_integer_array)
506 .map_err(|_| ParseError::InvalidXref)?;
507
508 if field_widths.len() < 3
509 || field_widths[0].is_negative()
510 || field_widths[1].is_negative()
511 || field_widths[2].is_negative()
512 {
513 return Err(ParseError::InvalidXref.into());
514 }
515
516 let mut bytes1 = vec![0_u8; field_widths[0] as usize];
517 let mut bytes2 = vec![0_u8; field_widths[1] as usize];
518 let mut bytes3 = vec![0_u8; field_widths[2] as usize];
519
520 for i in 0..section_indice.len() / 2 {
521 let start = section_indice[2 * i];
522 let count = section_indice[2 * i + 1];
523
524 for j in 0..count {
525 let entry_type = if !bytes1.is_empty() {
526 read_big_endian_integer(&mut reader, bytes1.as_mut_slice())?
527 } else {
528 1
529 };
530 match entry_type {
531 0 => {
532 read_big_endian_integer(&mut reader, bytes2.as_mut_slice())?;
534 read_big_endian_integer(&mut reader, bytes3.as_mut_slice())?;
535 }
536 1 => {
537 let offset = read_big_endian_integer(&mut reader, bytes2.as_mut_slice())?;
539 let generation = if !bytes3.is_empty() {
540 read_big_endian_integer(&mut reader, bytes3.as_mut_slice())?
541 } else {
542 0
543 } as u16;
544 xref.insert((start + j) as u32, XrefEntry::Normal { offset, generation });
545 }
546 2 => {
547 let container = read_big_endian_integer(&mut reader, bytes2.as_mut_slice())?;
549 let index = read_big_endian_integer(&mut reader, bytes3.as_mut_slice())? as u16;
550 xref.insert((start + j) as u32, XrefEntry::Compressed { container, index });
551 }
552 _ => {}
553 }
554 }
555 }
556 }
557 dict.remove(b"Length");
558 dict.remove(b"W");
559 dict.remove(b"Index");
560 Ok((xref, dict))
561}
562
/// Fills `buffer` from `reader` and interprets the bytes as a big-endian
/// unsigned integer.
///
/// An empty buffer reads nothing and yields 0. When the buffer is longer
/// than four bytes only the low 32 bits are kept.
///
/// # Errors
///
/// Fails when `reader` cannot supply `buffer.len()` bytes.
fn read_big_endian_integer(reader: &mut Cursor<Vec<u8>>, buffer: &mut [u8]) -> Result<u32> {
    reader.read_exact(buffer)?;
    // After the shift the low byte is zero, so OR-ing the next byte in is
    // the same as adding it.
    Ok(buffer
        .iter()
        .fold(0_u32, |acc, &byte| (acc << 8) | u32::from(byte)))
}
571
572fn parse_integer_array(array: &Object) -> Result<Vec<i64>> {
573 let array = array.as_array()?;
574 let mut out = Vec::with_capacity(array.len());
575
576 for n in array {
577 out.push(n.as_i64()?);
578 }
579
580 Ok(out)
581}
582
#[cfg(test)]
mod tests {
    /// A document saved to disk can be loaded again and re-saved to memory.
    #[cfg(not(feature = "async"))]
    #[test]
    fn load_and_save() {
        use crate::creator::tests::{create_document, save_document};
        use crate::Document;

        use std::fs::File;
        use std::io::Cursor;

        let temp_dir = tempfile::tempdir().unwrap();
        let file_path = temp_dir.path().join("test_1_load_and_save.pdf");

        let mut original = create_document();
        save_document(&file_path, &mut original);

        let mut reloaded = Document::load_from(File::open(file_path).unwrap()).unwrap();

        let mut sink = Cursor::new(Vec::<u8>::new());
        reloaded.save_to(&mut sink).unwrap();
        assert!(!sink.get_ref().is_empty());
    }

    /// Each page of a two-page document yields exactly one text chunk.
    #[test]
    fn extract_text_chunks() {
        use crate::creator::tests::create_document_with_texts;

        let text1 = "Hello world!";
        let text2 = "Ferris is the best!";
        let doc = create_document_with_texts(&[text1, text2]);

        let chunks = doc.extract_text_chunks(&[1, 2]);
        assert_eq!(chunks.len(), 2);

        let first = chunks[0].as_ref().unwrap().trim();
        let second = chunks[1].as_ref().unwrap().trim();
        assert_eq!([first, second], [text1, text2]);
    }

    /// `extract_text` joins the text of all requested pages in order.
    #[test]
    fn extract_text_concatenates_text_from_multiple_pages() {
        use crate::creator::tests::create_document_with_texts;

        let text1 = "Hello world!";
        let text2 = "Ferris is the best!";
        let doc = create_document_with_texts(&[text1, text2]);

        let combined = doc.extract_text(&[1, 2]).unwrap();
        assert_eq!(combined, format!("{text1}\n{text2}\n"));
    }

    /// Every occurrence inside a shown string is replaced and counted.
    #[test]
    fn test_replace_partial_text() {
        use crate::creator::tests::create_document_with_texts;

        let mut doc = create_document_with_texts(&["Hello World! Hello Universe!"]);

        let replaced = doc.replace_partial_text(1, "Hello", "Hi", None).unwrap();
        assert_eq!(replaced, 2);

        let page_text = doc.extract_text(&[1]).unwrap();
        assert!(page_text.contains("Hi World! Hi Universe!"));
    }

    /// Strings shown with the `'` operator are part of the extracted text.
    #[test]
    fn extract_text_handles_apostrophe_show_text_op() {
        use crate::content::Operation;
        use crate::creator::tests::create_document_with_operations;
        use crate::Object;

        let operations = vec![
            Operation::new("BT", vec![]),
            Operation::new("Tf", vec!["F1".into(), 12.into()]),
            Operation::new("Td", vec![100.into(), 700.into()]),
            Operation::new("Tj", vec![Object::string_literal("first")]),
            Operation::new("'", vec![Object::string_literal("second")]),
            Operation::new("'", vec![Object::string_literal("third")]),
            Operation::new("ET", vec![]),
        ];
        let doc = create_document_with_operations(operations);

        let text = doc.extract_text(&[1]).unwrap();
        assert!(text.contains("first"), "Tj string lost: {text:?}");
        assert!(text.contains("second"), "first ' string lost: {text:?}");
        assert!(text.contains("third"), "second ' string lost: {text:?}");
    }

    /// The string operand of the `"` operator is part of the extracted text.
    #[test]
    fn extract_text_handles_quote_show_text_op() {
        use crate::content::Operation;
        use crate::creator::tests::create_document_with_operations;
        use crate::Object;

        let operations = vec![
            Operation::new("BT", vec![]),
            Operation::new("Tf", vec!["F1".into(), 12.into()]),
            Operation::new("Td", vec![100.into(), 700.into()]),
            Operation::new("\"", vec![0.into(), 0.into(), Object::string_literal("from-quote-op")]),
            Operation::new("ET", vec![]),
        ];
        let doc = create_document_with_operations(operations);

        let text = doc.extract_text(&[1]).unwrap();
        assert!(text.contains("from-quote-op"), "\" string operand lost: {text:?}");
    }

    /// A `T*` between two `Tj` strings shows up as a line break.
    #[test]
    fn extract_text_preserves_line_breaks_for_t_star() {
        use crate::content::Operation;
        use crate::creator::tests::create_document_with_operations;
        use crate::Object;

        let operations = vec![
            Operation::new("BT", vec![]),
            Operation::new("Tf", vec!["F1".into(), 12.into()]),
            Operation::new("Td", vec![100.into(), 700.into()]),
            Operation::new("Tj", vec![Object::string_literal("line-one")]),
            Operation::new("T*", vec![]),
            Operation::new("Tj", vec![Object::string_literal("line-two")]),
            Operation::new("ET", vec![]),
        ];
        let doc = create_document_with_operations(operations);

        let text = doc.extract_text(&[1]).unwrap();
        let first_at = text.find("line-one").expect("line-one missing");
        let second_at = text.find("line-two").expect("line-two missing");
        assert!(first_at < second_at, "order wrong: {text:?}");

        let between = &text[first_at + "line-one".len()..second_at];
        assert!(
            between.contains('\n'),
            "T* did not insert a line break between Tj strings: between={between:?}"
        );
    }
}