Skip to main content

lopdf/
parser_aux.rs

1use log::warn;
2
3use crate::{
4    content::{Content, Operation},
5    document::Document,
6    encodings::Encoding,
7    error::ParseError,
8    object::Object::Name,
9    xref::{Xref, XrefEntry, XrefType},
10    Error, Result,
11};
12use crate::{parser, Dictionary, Object, ObjectId, Stream};
13use std::{
14    collections::BTreeMap,
15    io::{Cursor, Read},
16};
17
18impl Content<Vec<Operation>> {
19    /// Decode content operations.
20    pub fn decode(data: &[u8]) -> Result<Self> {
21        parser::content(data).ok_or(ParseError::InvalidContentStream.into())
22    }
23
24    /// Strict decode content operations.
25    pub fn decode_strict(data: &[u8]) -> Result<Self> {
26        parser::content_strict(data).map_err(|e| e.into())
27    }
28}
29
30impl Stream {
31    /// Decode content after decoding all stream filters.
32    pub fn decode_content(&self) -> Result<Content<Vec<Operation>>> {
33        Content::decode(&self.content)
34    }
35}
36
37impl Document {
38    /// Get decoded page content;
39    pub fn get_and_decode_page_content(&self, page_id: ObjectId) -> Result<Content<Vec<Operation>>> {
40        let content_data = self.get_page_content(page_id)?;
41        Content::decode(&content_data)
42    }
43
44    /// Add content to a page. All existing content will be unchanged.
45    pub fn add_to_page_content(&mut self, page_id: ObjectId, content: Content<Vec<Operation>>) -> Result<()> {
46        let content_data = Content::encode(&content)?;
47        self.add_page_contents(page_id, content_data)?;
48        Ok(())
49    }
50
51    pub fn extract_text(&self, page_numbers: &[u32]) -> Result<String> {
52        let text_fragments = self.extract_text_chunks(page_numbers);
53        let mut text = String::new();
54        for maybe_text_fragment in text_fragments.into_iter() {
55            let text_fragment = maybe_text_fragment?;
56            text.push_str(&text_fragment);
57        }
58
59        Ok(text)
60    }
61
62    pub fn extract_text_chunks(&self, page_numbers: &[u32]) -> Vec<Result<String>> {
63        let pages: BTreeMap<u32, (u32, u16)> = self.get_pages();
64        page_numbers
65            .iter()
66            .flat_map(|page_number| {
67                let result = self.extract_text_chunks_from_page(&pages, *page_number);
68                match result {
69                    Ok(text_chunks) => text_chunks,
70                    Err(err) => vec![Err(err)],
71                }
72            })
73            .collect()
74    }
75
76    fn extract_text_chunks_from_page(
77        &self, pages: &BTreeMap<u32, (u32, u16)>, page_number: u32,
78    ) -> Result<Vec<Result<String>>> {
79        let mut collected_chunks_and_errs: Vec<std::result::Result<String, Error>> = Vec::new();
80
81        let page_id = *pages.get(&page_number).ok_or(Error::PageNumberNotFound(page_number))?;
82        let fonts = self.get_page_fonts(page_id)?;
83        let encodings: BTreeMap<Vec<u8>, Encoding> = fonts
84            .into_iter()
85            .filter_map(|(name, font)| match font.get_font_encoding(self) {
86                Ok(it) => Some((name, it)),
87                Err(err) => {
88                    collected_chunks_and_errs.push(Err(err));
89                    None
90                }
91            })
92            .collect();
93        let content_data = self.get_page_content(page_id)?;
94        let content = Content::decode(&content_data)?;
95
96        // each text with different encoding is extracted as separate chunk
97        let mut current_encoding = None;
98        let mut current_text = String::new();
99        for operation in &content.operations {
100            match operation.operator.as_ref() {
101                "Tf" => {
102                    let current_font = operation
103                        .operands
104                        .first()
105                        .ok_or_else(|| Error::Syntax("missing font operand".to_string()))?
106                        .as_name();
107                    current_encoding = match current_font {
108                        Ok(font) => encodings.get(font),
109                        Err(err) => {
110                            collected_chunks_and_errs.push(Err(err));
111                            None
112                        }
113                    };
114
115                    if !current_text.is_empty() {
116                        collected_chunks_and_errs.push(Ok(current_text));
117                        current_text = String::new();
118                    }
119                }
120                "Tj" | "TJ" => match current_encoding {
121                    Some(encoding) => {
122                        let res = collect_text(&mut current_text, encoding, &operation.operands);
123                        if let Err(err) = res {
124                            collected_chunks_and_errs.push(Err(err));
125                        }
126                    }
127                    None => warn!("Could not decode extracted text"),
128                },
129                // PDF 32000-1 §9.4.3 — `'` is equivalent to `T* Tj`:
130                // move to next line, then show string from the single
131                // string operand.
132                "'" => match current_encoding {
133                    Some(encoding) => {
134                        if !current_text.ends_with('\n') {
135                            current_text.push('\n');
136                        }
137                        let res = collect_text(&mut current_text, encoding, &operation.operands);
138                        if let Err(err) = res {
139                            collected_chunks_and_errs.push(Err(err));
140                        }
141                    }
142                    None => warn!("Could not decode extracted text"),
143                },
144                // PDF 32000-1 §9.4.3 — `"` is equivalent to
145                // `aw Tw ac Tc T* Tj` with operands `[aw, ac, string]`.
146                // Operands 0/1 set word/character spacing for rendering
147                // and don't affect the extracted character sequence;
148                // operand 2 is the string to show.
149                "\"" => match current_encoding {
150                    Some(encoding) => {
151                        if !current_text.ends_with('\n') {
152                            current_text.push('\n');
153                        }
154                        if let Some(string_operand) = operation.operands.get(2) {
155                            let res = collect_text(&mut current_text, encoding, std::slice::from_ref(string_operand));
156                            if let Err(err) = res {
157                                collected_chunks_and_errs.push(Err(err));
158                            }
159                        }
160                    }
161                    None => warn!("Could not decode extracted text"),
162                },
163                // PDF 32000-1 §9.4.2 — `T*` moves to the start of the
164                // next line. For text extraction we approximate this
165                // as `\n`, matching how the `ET` arm above handles end
166                // of text object.
167                "T*" if !current_text.ends_with('\n') => current_text.push('\n'),
168                "T*" => {}
169                "ET" if !current_text.ends_with('\n') => current_text.push('\n'),
170                "ET" => {}
171                _ => {}
172            }
173        }
174        if !current_text.is_empty() {
175            collected_chunks_and_errs.push(Ok(current_text));
176        }
177
178        Ok(collected_chunks_and_errs)
179    }
180
181    pub fn replace_text(
182        &mut self, page_number: u32, text: &str, other_text: &str, default_str: Option<&str>,
183    ) -> Result<()> {
184        let page = page_number.saturating_sub(1) as usize;
185        let page_id = self
186            .page_iter()
187            .nth(page)
188            .ok_or(Error::PageNumberNotFound(page_number))?;
189        let encodings: BTreeMap<Vec<u8>, Encoding> = self
190            .get_page_fonts(page_id)?
191            .into_iter()
192            .map(|(name, font)| font.get_font_encoding(self).map(|it| (name, it)))
193            .collect::<Result<BTreeMap<Vec<u8>, Encoding>>>()?;
194        let content_data = self.get_page_content(page_id)?;
195        let mut content = Content::decode(&content_data)?;
196        let mut current_encoding = None;
197        for operation in &mut content.operations {
198            match operation.operator.as_ref() {
199                "Tf" => {
200                    let current_font = operation
201                        .operands
202                        .first()
203                        .ok_or_else(|| Error::Syntax("missing font operand".to_string()))?
204                        .as_name()?;
205                    current_encoding = encodings.get(current_font);
206                }
207                "Tj" | "TJ" => match current_encoding {
208                    Some(encoding) => {
209                        try_to_replace_encoded_text(operation, encoding, text, other_text, default_str.unwrap_or(""))?
210                    }
211                    None => {
212                        warn!("Could not decode extracted text, some of the occurances might not be properly replaced")
213                    }
214                },
215                _ => {}
216            }
217        }
218        let modified_content = content.encode()?;
219        self.change_page_content(page_id, modified_content)
220    }
221
222    pub fn replace_partial_text(
223        &mut self, page_number: u32, search_text: &str, replacement_text: &str, default_char: Option<&str>,
224    ) -> Result<usize> {
225        let page = page_number.saturating_sub(1) as usize;
226        let page_id = self
227            .page_iter()
228            .nth(page)
229            .ok_or(Error::PageNumberNotFound(page_number))?;
230
231        let encodings: BTreeMap<Vec<u8>, Encoding> = self
232            .get_page_fonts(page_id)?
233            .into_iter()
234            .map(|(name, font)| font.get_font_encoding(self).map(|it| (name, it)))
235            .collect::<Result<BTreeMap<Vec<u8>, Encoding>>>()?;
236
237        let content_data = self.get_page_content(page_id)?;
238        let mut content = Content::decode(&content_data)?;
239        let mut current_encoding = None;
240        let mut replacement_count = 0;
241
242        for operation in &mut content.operations {
243            match operation.operator.as_ref() {
244                "Tf" => {
245                    let current_font = operation
246                        .operands
247                        .first()
248                        .ok_or_else(|| Error::Syntax("missing font operand".to_string()))?
249                        .as_name()?;
250                    current_encoding = encodings.get(current_font);
251                }
252                "Tj" | "TJ" => {
253                    if let Some(encoding) = current_encoding {
254                        replacement_count += replace_partial_in_operation(
255                            operation,
256                            encoding,
257                            search_text,
258                            replacement_text,
259                            default_char.unwrap_or("?"),
260                        )?;
261                    } else {
262                        warn!("No encoding found for text operation");
263                    }
264                }
265                _ => {}
266            }
267        }
268
269        if replacement_count > 0 {
270            let modified_content = content.encode()?;
271            self.change_page_content(page_id, modified_content)?;
272        }
273
274        Ok(replacement_count)
275    }
276
277    pub fn insert_image(
278        &mut self, page_id: ObjectId, img_object: Stream, position: (f32, f32), size: (f32, f32),
279    ) -> Result<()> {
280        let img_id = self.add_object(img_object);
281        let img_name = format!("X{}", img_id.0);
282
283        self.add_xobject(page_id, img_name.as_bytes(), img_id)?;
284
285        let mut content = self.get_and_decode_page_content(page_id)?;
286        content.operations.push(Operation::new("q", vec![]));
287        content.operations.push(Operation::new(
288            "cm",
289            vec![
290                size.0.into(),
291                0.into(),
292                0.into(),
293                size.1.into(),
294                position.0.into(),
295                position.1.into(),
296            ],
297        ));
298        content
299            .operations
300            .push(Operation::new("Do", vec![Name(img_name.as_bytes().to_vec())]));
301        content.operations.push(Operation::new("Q", vec![]));
302
303        self.change_page_content(page_id, content.encode()?)
304    }
305
306    pub fn insert_form_object(&mut self, page_id: ObjectId, form_obj: Stream) -> Result<()> {
307        let form_id = self.add_object(form_obj);
308        let form_name = format!("X{}", form_id.0);
309
310        let mut content = self.get_and_decode_page_content(page_id)?;
311        content.operations.insert(0, Operation::new("q", vec![]));
312        content.operations.push(Operation::new("Q", vec![]));
313        content
314            .operations
315            .push(Operation::new("Do", vec![Name(form_name.as_bytes().to_vec())]));
316        let modified_content = content.encode()?;
317        self.add_xobject(page_id, form_name, form_id)?;
318
319        self.change_page_content(page_id, modified_content)
320    }
321}
322fn collect_text(text: &mut String, encoding: &Encoding, operands: &[Object]) -> Result<()> {
323    for operand in operands.iter() {
324        match operand {
325            Object::String(bytes, _) => {
326                encoding.write_to_string(bytes, text)?;
327            }
328            Object::Array(arr) => {
329                collect_text(text, encoding, arr)?;
330                text.push(' ');
331            }
332            Object::Integer(i) if *i < -100 => {
333                text.push(' ');
334            }
335            _ => {}
336        }
337    }
338    Ok(())
339}
/// Return the sub-slice of `s` that starts at character index `start` and
/// spans at most `len` characters.
///
/// Indices count Unicode scalar values (`char`s), not bytes, so the result is
/// always cut on valid UTF-8 boundaries. The slice is shortened when fewer
/// than `len` characters remain. An empty string is returned when `len` is 0
/// or when `start` is at or past the end of `s`.
pub fn substr(s: &str, start: usize, len: usize) -> &str {
    // A zero-length request must yield nothing; without this guard the end
    // lookup below would behave like `len == 1`.
    if len == 0 {
        return "";
    }

    let mut boundaries = s.char_indices().map(|(idx, _)| idx);

    let Some(start_idx) = boundaries.nth(start) else {
        return "";
    };

    // `boundaries` now sits just after the first selected character, so the
    // character `len - 1` positions further on is the first one to exclude.
    let end_idx = boundaries.nth(len - 1).unwrap_or(s.len());

    &s[start_idx..end_idx]
}
/// Return the tail of `s` that starts at character index `start`.
///
/// Indices count `char`s, not bytes. An empty string is returned when `start`
/// is at or past the end of `s`.
pub fn substring(s: &str, start: usize) -> &str {
    match s.char_indices().nth(start) {
        Some((byte_offset, _)) => &s[byte_offset..],
        None => "",
    }
}
363
364fn encode(encoding: &Encoding, txt: &str, default_str: &str) -> Vec<u8> {
365    if txt.chars().count() > 1 {
366        let mut cur = 0;
367        let mut result = Vec::new();
368        while cur < txt.chars().count() {
369            let c = substr(txt, cur, 1);
370            result.extend_from_slice(&encode(encoding, c, default_str));
371            cur += 1;
372        }
373        result
374    } else {
375        let encoded_bytes = Document::encode_text(encoding, txt);
376        if !encoded_bytes.is_empty() {
377            encoded_bytes
378        } else {
379            Document::encode_text(encoding, default_str)
380        }
381    }
382}
383fn try_to_replace_encoded_text(
384    operation: &mut Operation, encoding: &Encoding, text_to_replace: &str, replacement: &str, default_str: &str,
385) -> Result<()> {
386    for operand in &mut operation.operands {
387        match operand {
388            Object::String(bytes, _) => {
389                let decoded_text = Document::decode_text(encoding, bytes)?;
390                if decoded_text == text_to_replace {
391                    let encoded_bytes = encode(encoding, replacement, default_str);
392                    *bytes = encoded_bytes;
393                }
394            }
395            Object::Array(arr) => {
396                let mut str_collected = String::new();
397                collect_text(&mut str_collected, encoding, arr)?;
398                if str_collected == text_to_replace {
399                    // The number of `Object::String` items in a `TJ` array is
400                    // not guaranteed to match the character count of the
401                    // decoded text (each string may hold several glyphs, and
402                    // numeric kerning entries are interleaved).
403                    //
404                    // There is no **good** way to interpolate between the OG
405                    // and the replacement, but putting the full encoded replacement
406                    // into the first string slot and emptying out the remaining
407                    // string slots, leaving any numeric kerning entries in place
408                    // seems like the least bad option.
409                    let encoded_replacement = encode(encoding, replacement, default_str);
410                    let mut placed = false;
411                    for item in arr.iter_mut() {
412                        if let Object::String(bytes, _f) = item {
413                            if placed {
414                                *bytes = Vec::new();
415                            } else {
416                                bytes.clone_from(&encoded_replacement);
417                                placed = true;
418                            }
419                        }
420                    }
421                }
422            }
423            _ => {}
424        }
425    }
426
427    Ok(())
428}
429
430fn replace_partial_in_operation(
431    operation: &mut Operation, encoding: &Encoding, search_text: &str, replacement_text: &str, default_char: &str,
432) -> Result<usize> {
433    let mut replacement_count = 0;
434
435    for operand in &mut operation.operands {
436        match operand {
437            Object::String(bytes, _) => {
438                let decoded_text = Document::decode_text(encoding, bytes)?;
439                if decoded_text.contains(search_text) {
440                    let new_text = decoded_text.replace(search_text, replacement_text);
441                    let encoded_bytes = encode_with_fallback(encoding, &new_text, default_char);
442                    *bytes = encoded_bytes;
443                    replacement_count += decoded_text.matches(search_text).count();
444                }
445            }
446            Object::Array(arr) => {
447                replacement_count +=
448                    replace_partial_in_array(arr, encoding, search_text, replacement_text, default_char)?;
449            }
450            _ => {}
451        }
452    }
453
454    Ok(replacement_count)
455}
456
457fn replace_partial_in_array(
458    arr: &mut [Object], encoding: &Encoding, search_text: &str, replacement_text: &str, default_char: &str,
459) -> Result<usize> {
460    let mut replacement_count = 0;
461
462    for item in arr.iter_mut() {
463        if let Object::String(bytes, _) = item {
464            let decoded_text = Document::decode_text(encoding, bytes)?;
465            if decoded_text.contains(search_text) {
466                let new_text = decoded_text.replace(search_text, replacement_text);
467                let encoded_bytes = encode_with_fallback(encoding, &new_text, default_char);
468                *bytes = encoded_bytes;
469                replacement_count += decoded_text.matches(search_text).count();
470            }
471        }
472    }
473
474    Ok(replacement_count)
475}
476
477fn encode_with_fallback(encoding: &Encoding, text: &str, default_char: &str) -> Vec<u8> {
478    let encoded = Document::encode_text(encoding, text);
479    if !encoded.is_empty() {
480        return encoded;
481    }
482
483    encode(encoding, text, default_char)
484}
485
486/// Decode CrossReferenceStream
487pub fn decode_xref_stream(mut stream: Stream) -> Result<(Xref, Dictionary)> {
488    if stream.is_compressed() {
489        stream.decompress()?;
490    }
491    let mut dict = stream.dict;
492    let mut reader = Cursor::new(stream.content);
493    let size = dict
494        .get(b"Size")
495        .and_then(Object::as_i64)
496        .map_err(|_| ParseError::InvalidXref)?;
497    let mut xref = Xref::new(size as u32, XrefType::CrossReferenceStream);
498    {
499        let section_indice = dict
500            .get(b"Index")
501            .and_then(parse_integer_array)
502            .unwrap_or_else(|_| vec![0, size]);
503        let field_widths = dict
504            .get(b"W")
505            .and_then(parse_integer_array)
506            .map_err(|_| ParseError::InvalidXref)?;
507
508        if field_widths.len() < 3
509            || field_widths[0].is_negative()
510            || field_widths[1].is_negative()
511            || field_widths[2].is_negative()
512        {
513            return Err(ParseError::InvalidXref.into());
514        }
515
516        let mut bytes1 = vec![0_u8; field_widths[0] as usize];
517        let mut bytes2 = vec![0_u8; field_widths[1] as usize];
518        let mut bytes3 = vec![0_u8; field_widths[2] as usize];
519
520        for i in 0..section_indice.len() / 2 {
521            let start = section_indice[2 * i];
522            let count = section_indice[2 * i + 1];
523
524            for j in 0..count {
525                let entry_type = if !bytes1.is_empty() {
526                    read_big_endian_integer(&mut reader, bytes1.as_mut_slice())?
527                } else {
528                    1
529                };
530                match entry_type {
531                    0 => {
532                        // free object
533                        read_big_endian_integer(&mut reader, bytes2.as_mut_slice())?;
534                        read_big_endian_integer(&mut reader, bytes3.as_mut_slice())?;
535                    }
536                    1 => {
537                        // normal object
538                        let offset = read_big_endian_integer(&mut reader, bytes2.as_mut_slice())?;
539                        let generation = if !bytes3.is_empty() {
540                            read_big_endian_integer(&mut reader, bytes3.as_mut_slice())?
541                        } else {
542                            0
543                        } as u16;
544                        xref.insert((start + j) as u32, XrefEntry::Normal { offset, generation });
545                    }
546                    2 => {
547                        // compressed object
548                        let container = read_big_endian_integer(&mut reader, bytes2.as_mut_slice())?;
549                        let index = read_big_endian_integer(&mut reader, bytes3.as_mut_slice())? as u16;
550                        xref.insert((start + j) as u32, XrefEntry::Compressed { container, index });
551                    }
552                    _ => {}
553                }
554            }
555        }
556    }
557    dict.remove(b"Length");
558    dict.remove(b"W");
559    dict.remove(b"Index");
560    Ok((xref, dict))
561}
562
563fn read_big_endian_integer(reader: &mut Cursor<Vec<u8>>, buffer: &mut [u8]) -> Result<u32> {
564    reader.read_exact(buffer)?;
565    let mut value = 0;
566    for &mut byte in buffer {
567        value = (value << 8) + u32::from(byte);
568    }
569    Ok(value)
570}
571
572fn parse_integer_array(array: &Object) -> Result<Vec<i64>> {
573    let array = array.as_array()?;
574    let mut out = Vec::with_capacity(array.len());
575
576    for n in array {
577        out.push(n.as_i64()?);
578    }
579
580    Ok(out)
581}
582
#[cfg(test)]
mod tests {
    use crate::content::Operation;
    use crate::creator::tests::{create_document_with_operations, create_document_with_texts};
    use crate::Object;

    /// Wrap `show_ops` in a minimal text object: `BT`, font selection (`F1`
    /// at size 12), a `Td` to (100, 700), the given operations, then `ET`.
    fn text_object(show_ops: Vec<Operation>) -> Vec<Operation> {
        let mut operations = vec![
            Operation::new("BT", vec![]),
            Operation::new("Tf", vec!["F1".into(), 12.into()]),
            Operation::new("Td", vec![100.into(), 700.into()]),
        ];
        operations.extend(show_ops);
        operations.push(Operation::new("ET", vec![]));
        operations
    }

    /// Round trip: save a document to disk, load it back and save it again
    /// into memory.
    #[cfg(not(feature = "async"))]
    #[test]
    fn load_and_save() {
        use crate::creator::tests::{create_document, save_document};
        use crate::Document;
        use std::fs::File;
        use std::io::Cursor;

        // The file lives in a temporary folder that is removed afterwards.
        let temp_dir = tempfile::tempdir().unwrap();
        let file_path = temp_dir.path().join("test_1_load_and_save.pdf");

        let mut doc = create_document();
        save_document(&file_path, &mut doc);

        let mut in_doc = Document::load_from(File::open(file_path).unwrap()).unwrap();

        let mut memory_cursor = Cursor::new(Vec::new());
        in_doc.save_to(&mut memory_cursor).unwrap();
        // The saved document must not be an empty byte vector.
        assert!(!memory_cursor.get_ref().is_empty());
    }

    #[test]
    fn extract_text_chunks() {
        let text1 = "Hello world!";
        let text2 = "Ferris is the best!";
        let doc = create_document_with_texts(&[text1, text2]);

        let extracted_texts = doc.extract_text_chunks(&[1, 2]);

        assert_eq!(extracted_texts.len(), 2);
        let first = extracted_texts[0].as_ref().unwrap().trim();
        let second = extracted_texts[1].as_ref().unwrap().trim();
        assert_eq!([first, second], [text1, text2]);
    }

    #[test]
    fn extract_text_concatenates_text_from_multiple_pages() {
        let text1 = "Hello world!";
        let text2 = "Ferris is the best!";
        let doc = create_document_with_texts(&[text1, text2]);

        let extracted_text = doc.extract_text(&[1, 2]);

        assert_eq!(extracted_text.unwrap(), format!("{text1}\n{text2}\n"));
    }

    #[test]
    fn test_replace_partial_text() {
        let mut doc = create_document_with_texts(&["Hello World! Hello Universe!"]);

        let replacements = doc.replace_partial_text(1, "Hello", "Hi", None).unwrap();
        // Both occurrences are replaced.
        assert_eq!(replacements, 2);

        let extracted_text = doc.extract_text(&[1]).unwrap();
        assert!(extracted_text.contains("Hi World! Hi Universe!"));
    }

    /// PDF 1.7 / ISO 32000-1 §9.4.3 — `'` is equivalent to `T* Tj`: move to
    /// the next line and show a string. Every string shown this way must be
    /// present in the extracted text.
    #[test]
    fn extract_text_handles_apostrophe_show_text_op() {
        let doc = create_document_with_operations(text_object(vec![
            Operation::new("Tj", vec![Object::string_literal("first")]),
            Operation::new("'", vec![Object::string_literal("second")]),
            Operation::new("'", vec![Object::string_literal("third")]),
        ]));

        let text = doc.extract_text(&[1]).unwrap();
        assert!(text.contains("first"), "Tj string lost: {text:?}");
        assert!(text.contains("second"), "first ' string lost: {text:?}");
        assert!(text.contains("third"), "second ' string lost: {text:?}");
    }

    /// PDF 1.7 / ISO 32000-1 §9.4.3 — `"` is equivalent to
    /// `aw Tw ac Tc T* Tj` with operands `[aw, ac, string]`. Extraction must
    /// recover operand 2 (the string); operands 0 and 1 only set rendering
    /// spacing and do not affect the extracted characters.
    #[test]
    fn extract_text_handles_quote_show_text_op() {
        let doc = create_document_with_operations(text_object(vec![Operation::new(
            "\"",
            vec![0.into(), 0.into(), Object::string_literal("from-quote-op")],
        )]));

        let text = doc.extract_text(&[1]).unwrap();
        assert!(text.contains("from-quote-op"), "\" string operand lost: {text:?}");
    }

    /// PDF 1.7 / ISO 32000-1 §9.4.2 — `T*` moves to the start of the next
    /// line. Extraction approximates it with `\n`, so `Tj T* Tj` must yield
    /// two strings separated by a newline instead of running together.
    #[test]
    fn extract_text_preserves_line_breaks_for_t_star() {
        let doc = create_document_with_operations(text_object(vec![
            Operation::new("Tj", vec![Object::string_literal("line-one")]),
            Operation::new("T*", vec![]),
            Operation::new("Tj", vec![Object::string_literal("line-two")]),
        ]));

        let text = doc.extract_text(&[1]).unwrap();
        let one = text.find("line-one").expect("line-one missing");
        let two = text.find("line-two").expect("line-two missing");
        assert!(one < two, "order wrong: {text:?}");

        let between = &text[one + "line-one".len()..two];
        assert!(
            between.contains('\n'),
            "T* did not insert a line break between Tj strings: between={between:?}"
        );
    }
}