1use log::warn;
2
3use crate::{Dictionary, Object, ObjectId, Stream, parser};
4use crate::{
5 Error, Result,
6 content::{Content, Operation},
7 document::Document,
8 encodings::Encoding,
9 error::ParseError,
10 object::Object::Name,
11 parser::ParserInput,
12 xref::{Xref, XrefEntry, XrefType},
13};
14use std::{
15 collections::BTreeMap,
16 io::{Cursor, Read},
17};
18
19impl Content<Vec<Operation>> {
20 pub fn decode(data: &[u8]) -> Result<Self> {
22 parser::content(ParserInput::new_extra(data, "content operations"))
23 .ok_or(ParseError::InvalidContentStream.into())
24 }
25}
26
27impl Stream {
28 pub fn decode_content(&self) -> Result<Content<Vec<Operation>>> {
30 Content::decode(&self.content)
31 }
32}
33
34impl Document {
35 pub fn get_and_decode_page_content(
37 &self,
38 page_id: ObjectId,
39 ) -> Result<Content<Vec<Operation>>> {
40 let content_data = self.get_page_content(page_id)?;
41 Content::decode(&content_data)
42 }
43
44 pub fn add_to_page_content(
46 &mut self,
47 page_id: ObjectId,
48 content: Content<Vec<Operation>>,
49 ) -> Result<()> {
50 let content_data = Content::encode(&content)?;
51 self.add_page_contents(page_id, content_data)?;
52 Ok(())
53 }
54
55 pub fn extract_text(&self, page_numbers: &[u32]) -> Result<String> {
56 let text_fragments = self.extract_text_chunks(page_numbers);
57 let mut text = String::new();
58 for maybe_text_fragment in text_fragments.into_iter() {
59 let text_fragment = maybe_text_fragment?;
60 text.push_str(&text_fragment);
61 }
62
63 Ok(text)
64 }
65
66 pub fn extract_text_chunks(&self, page_numbers: &[u32]) -> Vec<Result<String>> {
67 let pages: BTreeMap<u32, (u32, u16)> = self.get_pages();
68 page_numbers
69 .iter()
70 .flat_map(|page_number| {
71 let result = self.extract_text_chunks_from_page(&pages, *page_number);
72 match result {
73 Ok(text_chunks) => text_chunks,
74 Err(err) => vec![Err(err)],
75 }
76 })
77 .collect()
78 }
79
80 fn extract_text_chunks_from_page(
81 &self,
82 pages: &BTreeMap<u32, (u32, u16)>,
83 page_number: u32,
84 ) -> Result<Vec<Result<String>>> {
85 let mut collected_chunks_and_errs: Vec<std::result::Result<String, Error>> = Vec::new();
86
87 let page_id = *pages
88 .get(&page_number)
89 .ok_or(Error::PageNumberNotFound(page_number))?;
90 let fonts = self.get_page_fonts(page_id)?;
91 let encodings: BTreeMap<Vec<u8>, Encoding> = fonts
92 .into_iter()
93 .filter_map(|(name, font)| match font.get_font_encoding(self) {
94 Ok(it) => Some((name, it)),
95 Err(err) => {
96 collected_chunks_and_errs.push(Err(err));
97 None
98 }
99 })
100 .collect();
101 let content_data = self.get_page_content(page_id)?;
102 let content = Content::decode(&content_data)?;
103
104 let mut current_encoding = None;
106 let mut current_text = String::new();
107 for operation in &content.operations {
108 match operation.operator.as_ref() {
109 "Tf" => {
110 let current_font = operation
111 .operands
112 .first()
113 .ok_or_else(|| Error::Syntax("missing font operand".to_string()))?
114 .as_name();
115 current_encoding = match current_font {
116 Ok(font) => encodings.get(font),
117 Err(err) => {
118 collected_chunks_and_errs.push(Err(err));
119 None
120 }
121 };
122
123 if !current_text.is_empty() {
124 collected_chunks_and_errs.push(Ok(current_text));
125 current_text = String::new();
126 }
127 }
128 "Tj" | "TJ" => match current_encoding {
129 Some(encoding) => {
130 let res = collect_text(&mut current_text, encoding, &operation.operands);
131 if let Err(err) = res {
132 collected_chunks_and_errs.push(Err(err));
133 }
134 }
135 None => warn!("Could not decode extracted text"),
136 },
137 "ET" if !current_text.ends_with('\n') => current_text.push('\n'),
138 _ => {}
139 }
140 }
141 if !current_text.is_empty() {
142 collected_chunks_and_errs.push(Ok(current_text));
143 }
144
145 Ok(collected_chunks_and_errs)
146 }
147
148 pub fn replace_text(
149 &mut self,
150 page_number: u32,
151 text: &str,
152 other_text: &str,
153 default_str: Option<&str>,
154 ) -> Result<()> {
155 let page = page_number.saturating_sub(1) as usize;
156 let page_id = self
157 .page_iter()
158 .nth(page)
159 .ok_or(Error::PageNumberNotFound(page_number))?;
160 let encodings: BTreeMap<Vec<u8>, Encoding> = self
161 .get_page_fonts(page_id)?
162 .into_iter()
163 .map(|(name, font)| font.get_font_encoding(self).map(|it| (name, it)))
164 .collect::<Result<BTreeMap<Vec<u8>, Encoding>>>()?;
165 let content_data = self.get_page_content(page_id)?;
166 let mut content = Content::decode(&content_data)?;
167 let mut current_encoding = None;
168 for operation in &mut content.operations {
169 match operation.operator.as_ref() {
170 "Tf" => {
171 let current_font = operation
172 .operands
173 .first()
174 .ok_or_else(|| Error::Syntax("missing font operand".to_string()))?
175 .as_name()?;
176 current_encoding = encodings.get(current_font);
177 }
178 "Tj" | "TJ" => match current_encoding {
179 Some(encoding) => try_to_replace_encoded_text(
180 operation,
181 encoding,
182 text,
183 other_text,
184 default_str.unwrap_or(""),
185 )?,
186 None => {
187 warn!(
188 "Could not decode extracted text, some of the occurances might not be properly replaced"
189 )
190 }
191 },
192 _ => {}
193 }
194 }
195 let modified_content = content.encode()?;
196 self.change_page_content(page_id, modified_content)
197 }
198
199 pub fn replace_partial_text(
200 &mut self,
201 page_number: u32,
202 search_text: &str,
203 replacement_text: &str,
204 default_char: Option<&str>,
205 ) -> Result<usize> {
206 let page = page_number.saturating_sub(1) as usize;
207 let page_id = self
208 .page_iter()
209 .nth(page)
210 .ok_or(Error::PageNumberNotFound(page_number))?;
211
212 let encodings: BTreeMap<Vec<u8>, Encoding> = self
213 .get_page_fonts(page_id)?
214 .into_iter()
215 .map(|(name, font)| font.get_font_encoding(self).map(|it| (name, it)))
216 .collect::<Result<BTreeMap<Vec<u8>, Encoding>>>()?;
217
218 let content_data = self.get_page_content(page_id)?;
219 let mut content = Content::decode(&content_data)?;
220 let mut current_encoding = None;
221 let mut replacement_count = 0;
222
223 for operation in &mut content.operations {
224 match operation.operator.as_ref() {
225 "Tf" => {
226 let current_font = operation
227 .operands
228 .first()
229 .ok_or_else(|| Error::Syntax("missing font operand".to_string()))?
230 .as_name()?;
231 current_encoding = encodings.get(current_font);
232 }
233 "Tj" | "TJ" => {
234 if let Some(encoding) = current_encoding {
235 replacement_count += replace_partial_in_operation(
236 operation,
237 encoding,
238 search_text,
239 replacement_text,
240 default_char.unwrap_or("?"),
241 )?;
242 } else {
243 warn!("No encoding found for text operation");
244 }
245 }
246 _ => {}
247 }
248 }
249
250 if replacement_count > 0 {
251 let modified_content = content.encode()?;
252 self.change_page_content(page_id, modified_content)?;
253 }
254
255 Ok(replacement_count)
256 }
257
258 pub fn insert_image(
259 &mut self,
260 page_id: ObjectId,
261 img_object: Stream,
262 position: (f32, f32),
263 size: (f32, f32),
264 ) -> Result<()> {
265 let img_id = self.add_object(img_object);
266 let img_name = format!("X{}", img_id.0);
267
268 self.add_xobject(page_id, img_name.as_bytes(), img_id)?;
269
270 let mut content = self.get_and_decode_page_content(page_id)?;
271 content.operations.push(Operation::new("q", vec![]));
272 content.operations.push(Operation::new(
273 "cm",
274 vec![
275 size.0.into(),
276 0.into(),
277 0.into(),
278 size.1.into(),
279 position.0.into(),
280 position.1.into(),
281 ],
282 ));
283 content.operations.push(Operation::new(
284 "Do",
285 vec![Name(img_name.as_bytes().to_vec())],
286 ));
287 content.operations.push(Operation::new("Q", vec![]));
288
289 self.change_page_content(page_id, content.encode()?)
290 }
291
292 pub fn insert_form_object(&mut self, page_id: ObjectId, form_obj: Stream) -> Result<()> {
293 let form_id = self.add_object(form_obj);
294 let form_name = format!("X{}", form_id.0);
295
296 let mut content = self.get_and_decode_page_content(page_id)?;
297 content.operations.insert(0, Operation::new("q", vec![]));
298 content.operations.push(Operation::new("Q", vec![]));
299 content.operations.push(Operation::new(
300 "Do",
301 vec![Name(form_name.as_bytes().to_vec())],
302 ));
303 let modified_content = content.encode()?;
304 self.add_xobject(page_id, form_name, form_id)?;
305
306 self.change_page_content(page_id, modified_content)
307 }
308}
309fn collect_text(text: &mut String, encoding: &Encoding, operands: &[Object]) -> Result<()> {
310 for operand in operands.iter() {
311 match operand {
312 Object::String(bytes, _) => {
313 text.push_str(&Document::decode_text(encoding, bytes)?);
314 }
315 Object::Array(arr) => {
316 collect_text(text, encoding, arr)?;
317 text.push(' ');
318 }
319 Object::Integer(i) if *i < -100 => {
320 text.push(' ');
321 }
322 _ => {}
323 }
324 }
325 Ok(())
326}
/// Returns the slice of `s` that starts at character index `start` and spans
/// at most `len` characters.
///
/// Indices count `char`s, not bytes, so the result always falls on valid
/// UTF-8 boundaries. The result is `""` when `start` is at or past the end of
/// `s`, or when `len` is `0`. When fewer than `len` characters remain, the
/// rest of the string is returned.
pub fn substr(s: &str, start: usize, len: usize) -> &str {
    let Some((start_idx, _)) = s.char_indices().nth(start) else {
        return "";
    };
    // A zero-length request is empty; it must not be widened to one character.
    if len == 0 {
        return "";
    }

    // The `len`-th character after `start` marks the exclusive end; running
    // out of characters means the slice extends to the end of `s`.
    let end_idx = s[start_idx..]
        .char_indices()
        .nth(len)
        .map_or(s.len(), |(offset, _)| start_idx + offset);

    &s[start_idx..end_idx]
}
/// Returns the tail of `s` beginning at character index `start`.
///
/// The index counts `char`s, not bytes. When `start` is at or past the end of
/// `s`, the result is `""`.
pub fn substring(s: &str, start: usize) -> &str {
    match s.char_indices().nth(start) {
        Some((byte_offset, _)) => &s[byte_offset..],
        None => "",
    }
}
353
354fn encode(encoding: &Encoding, txt: &str, default_str: &str) -> Vec<u8> {
355 if txt.chars().count() > 1 {
356 let mut cur = 0;
357 let mut result = Vec::new();
358 while cur < txt.chars().count() {
359 let c = substr(txt, cur, 1);
360 result.extend_from_slice(&encode(encoding, c, default_str));
361 cur += 1;
362 }
363 result
364 } else {
365 let encoded_bytes = Document::encode_text(encoding, txt);
366 if !encoded_bytes.is_empty() {
367 encoded_bytes
368 } else {
369 Document::encode_text(encoding, default_str)
370 }
371 }
372}
373fn try_to_replace_encoded_text(
374 operation: &mut Operation,
375 encoding: &Encoding,
376 text_to_replace: &str,
377 replacement: &str,
378 default_str: &str,
379) -> Result<()> {
380 for operand in &mut operation.operands {
381 match operand {
382 Object::String(bytes, _) => {
383 let decoded_text = Document::decode_text(encoding, bytes)?;
384 if decoded_text == text_to_replace {
385 let encoded_bytes = encode(encoding, replacement, default_str);
386 *bytes = encoded_bytes;
387 }
388 }
389 Object::Array(arr) => {
390 let mut str_collected = String::new();
391 collect_text(&mut str_collected, encoding, arr)?;
392 if str_collected == text_to_replace {
393 let s_len = str_collected.chars().count();
394 let r_len = replacement.chars().count();
395 let mut cur = 0;
396 for item in arr.iter_mut() {
397 if let Object::String(bytes, _f) = item {
398 if cur == s_len - 1 {
399 let sub = substring(replacement, cur);
400 let encoded_bytes = encode(encoding, sub, default_str);
401 *bytes = encoded_bytes;
402 break;
403 } else if cur > r_len {
404 *item = Object::Null;
405 } else {
406 let sub = substr(replacement, cur, 1);
407 let encoded_bytes = encode(encoding, sub, default_str);
408 *bytes = encoded_bytes;
409 }
410 cur += 1;
411 }
412 }
413 }
414 }
415 _ => {}
416 }
417 }
418
419 Ok(())
420}
421
422fn replace_partial_in_operation(
423 operation: &mut Operation,
424 encoding: &Encoding,
425 search_text: &str,
426 replacement_text: &str,
427 default_char: &str,
428) -> Result<usize> {
429 let mut replacement_count = 0;
430
431 for operand in &mut operation.operands {
432 match operand {
433 Object::String(bytes, _) => {
434 let decoded_text = Document::decode_text(encoding, bytes)?;
435 if decoded_text.contains(search_text) {
436 let new_text = decoded_text.replace(search_text, replacement_text);
437 let encoded_bytes = encode_with_fallback(encoding, &new_text, default_char);
438 *bytes = encoded_bytes;
439 replacement_count += decoded_text.matches(search_text).count();
440 }
441 }
442 Object::Array(arr) => {
443 replacement_count += replace_partial_in_array(
444 arr,
445 encoding,
446 search_text,
447 replacement_text,
448 default_char,
449 )?;
450 }
451 _ => {}
452 }
453 }
454
455 Ok(replacement_count)
456}
457
458fn replace_partial_in_array(
459 arr: &mut [Object],
460 encoding: &Encoding,
461 search_text: &str,
462 replacement_text: &str,
463 default_char: &str,
464) -> Result<usize> {
465 let mut replacement_count = 0;
466
467 for item in arr.iter_mut() {
468 if let Object::String(bytes, _) = item {
469 let decoded_text = Document::decode_text(encoding, bytes)?;
470 if decoded_text.contains(search_text) {
471 let new_text = decoded_text.replace(search_text, replacement_text);
472 let encoded_bytes = encode_with_fallback(encoding, &new_text, default_char);
473 *bytes = encoded_bytes;
474 replacement_count += decoded_text.matches(search_text).count();
475 }
476 }
477 }
478
479 Ok(replacement_count)
480}
481
482fn encode_with_fallback(encoding: &Encoding, text: &str, default_char: &str) -> Vec<u8> {
483 let encoded = Document::encode_text(encoding, text);
484 if !encoded.is_empty() {
485 return encoded;
486 }
487
488 encode(encoding, text, default_char)
489}
490
491pub fn decode_xref_stream(mut stream: Stream) -> Result<(Xref, Dictionary)> {
493 if stream.is_compressed() {
494 stream.decompress()?;
495 }
496 let mut dict = stream.dict;
497 let mut reader = Cursor::new(stream.content);
498 let size = dict
499 .get(b"Size")
500 .and_then(Object::as_i64)
501 .map_err(|_| ParseError::InvalidXref)?;
502 let mut xref = Xref::new(size as u32, XrefType::CrossReferenceStream);
503 {
504 let section_indice = dict
505 .get(b"Index")
506 .and_then(parse_integer_array)
507 .unwrap_or_else(|_| vec![0, size]);
508 let field_widths = dict
509 .get(b"W")
510 .and_then(parse_integer_array)
511 .map_err(|_| ParseError::InvalidXref)?;
512
513 const MAX_XREF_FIELD_WIDTH: i64 = 8;
518 if field_widths.len() < 3
519 || field_widths[0].is_negative()
520 || field_widths[1].is_negative()
521 || field_widths[2].is_negative()
522 || field_widths[0] > MAX_XREF_FIELD_WIDTH
523 || field_widths[1] > MAX_XREF_FIELD_WIDTH
524 || field_widths[2] > MAX_XREF_FIELD_WIDTH
525 {
526 return Err(ParseError::InvalidXref.into());
527 }
528
529 let mut bytes1 = vec![0_u8; field_widths[0] as usize];
530 let mut bytes2 = vec![0_u8; field_widths[1] as usize];
531 let mut bytes3 = vec![0_u8; field_widths[2] as usize];
532
533 for i in 0..section_indice.len() / 2 {
534 let start = section_indice[2 * i];
535 let count = section_indice[2 * i + 1];
536
537 for j in 0..count {
538 let entry_type = if !bytes1.is_empty() {
539 read_big_endian_integer(&mut reader, bytes1.as_mut_slice())?
540 } else {
541 1
542 };
543 match entry_type {
544 0 => {
545 read_big_endian_integer(&mut reader, bytes2.as_mut_slice())?;
547 read_big_endian_integer(&mut reader, bytes3.as_mut_slice())?;
548 }
549 1 => {
550 let offset = read_big_endian_integer(&mut reader, bytes2.as_mut_slice())?;
552 let generation = if !bytes3.is_empty() {
553 read_big_endian_integer(&mut reader, bytes3.as_mut_slice())?
554 } else {
555 0
556 } as u16;
557 xref.insert((start + j) as u32, XrefEntry::Normal { offset, generation });
558 }
559 2 => {
560 let container =
562 read_big_endian_integer(&mut reader, bytes2.as_mut_slice())?;
563 let index =
564 read_big_endian_integer(&mut reader, bytes3.as_mut_slice())? as u16;
565 xref.insert(
566 (start + j) as u32,
567 XrefEntry::Compressed { container, index },
568 );
569 }
570 _ => {}
571 }
572 }
573 }
574 }
575 dict.remove(b"Length");
576 dict.remove(b"W");
577 dict.remove(b"Index");
578 Ok((xref, dict))
579}
580
581fn read_big_endian_integer(reader: &mut Cursor<Vec<u8>>, buffer: &mut [u8]) -> Result<u32> {
582 reader.read_exact(buffer)?;
583 let mut value = 0;
584 for &mut byte in buffer {
585 value = (value << 8) + u32::from(byte);
586 }
587 Ok(value)
588}
589
590fn parse_integer_array(array: &Object) -> Result<Vec<i64>> {
591 let array = array.as_array()?;
592 let mut out = Vec::with_capacity(array.len());
593
594 for n in array {
595 out.push(n.as_i64()?);
596 }
597
598 Ok(out)
599}
600
#[cfg(test)]
mod tests {
    /// Builds an uncompressed, empty xref stream with `/Size 1` and the given
    /// `/W` field widths.
    fn xref_stream_with_widths(widths: [i64; 3]) -> crate::Stream {
        use crate::{Dictionary, Object, Stream};

        let mut dict = Dictionary::new();
        dict.set("Type", Object::Name(b"XRef".to_vec()));
        dict.set("Size", Object::Integer(1));
        dict.set(
            "W",
            Object::Array(widths.iter().copied().map(Object::Integer).collect()),
        );
        Stream::new(dict, vec![])
    }

    /// A document saved to disk can be loaded again and re-saved to memory.
    #[cfg(not(feature = "async"))]
    #[test]
    fn load_and_save() {
        use crate::Document;
        use crate::creator::tests::{create_document, save_document};

        use std::fs::File;
        use std::io::Cursor;

        let temp_dir = tempfile::tempdir().unwrap();
        let file_path = temp_dir.path().join("test_1_load_and_save.pdf");

        let mut doc = create_document();
        save_document(&file_path, &mut doc);

        let in_file = File::open(file_path).unwrap();
        let mut in_doc = Document::load_from(in_file).unwrap();

        let mut memory_cursor = Cursor::new(Vec::new());
        in_doc.save_to(&mut memory_cursor).unwrap();
        assert!(!memory_cursor.get_ref().is_empty());
    }

    /// Each page yields one chunk holding that page's text.
    #[test]
    fn extract_text_chunks() {
        use crate::creator::tests::create_document_with_texts;

        let text1 = "Hello world!";
        let text2 = "Ferris is the best!";
        let doc = create_document_with_texts(&[text1, text2]);

        let extracted_texts = doc.extract_text_chunks(&[1, 2]);
        assert_eq!(extracted_texts.len(), 2);

        let first = extracted_texts[0].as_ref().unwrap().trim();
        let second = extracted_texts[1].as_ref().unwrap().trim();
        assert_eq!([first, second], [text1, text2]);
    }

    /// Text of several pages is joined, each page ending with a newline.
    #[test]
    fn extract_text_concatenates_text_from_multiple_pages() {
        use crate::creator::tests::create_document_with_texts;

        let text1 = "Hello world!";
        let text2 = "Ferris is the best!";
        let doc = create_document_with_texts(&[text1, text2]);

        let extracted_text = doc.extract_text(&[1, 2]);
        assert_eq!(extracted_text.unwrap(), format!("{text1}\n{text2}\n"));
    }

    /// A `/W` entry far above the supported width is an error, not an
    /// allocation of that size.
    #[test]
    fn xref_stream_oversized_field_width_is_rejected() {
        use super::decode_xref_stream;

        let stream = xref_stream_with_widths([999_999_999, 4, 2]);
        let result = decode_xref_stream(stream);
        assert!(result.is_err(), "oversized W field must be rejected");
    }

    /// A negative `/W` entry is an error.
    #[test]
    fn xref_stream_negative_field_width_is_rejected() {
        use super::decode_xref_stream;

        let stream = xref_stream_with_widths([-1, 4, 2]);
        let result = decode_xref_stream(stream);
        assert!(result.is_err(), "negative W field must be rejected");
    }

    /// Every occurrence inside a string operand is replaced and counted.
    #[test]
    fn test_replace_partial_text() {
        use crate::creator::tests::create_document_with_texts;

        let mut doc = create_document_with_texts(&["Hello World! Hello Universe!"]);

        let replacements = doc.replace_partial_text(1, "Hello", "Hi", None).unwrap();
        assert_eq!(replacements, 2);

        let extracted_text = doc.extract_text(&[1]).unwrap();
        assert!(extracted_text.contains("Hi World! Hi Universe!"));
    }
}