Skip to main content

oxidize_pdf/structure/
marked_content.rs

/// Marked Content operators for Tagged PDF (ISO 32000-1 Section 14.6)
///
/// Marked content provides a way to identify portions of a content stream
/// and associate them with structure elements in the structure tree.
///
/// # Operators
///
/// - **BMC** (Begin Marked Content): Simple marked content without properties
/// - **BDC** (Begin Marked Content with Dictionary): Marked content with properties
/// - **EMC** (End Marked Content): Closes the most recent BMC or BDC
///
/// # Thread Safety
///
/// `MarkedContent` performs no internal synchronization: every mutating method
/// takes `&mut self`, so each instance should be used within a single thread.
/// For concurrent PDF generation, create separate `MarkedContent` instances
/// per thread.
///
/// # Example
///
/// ```rust
/// use oxidize_pdf::structure::MarkedContent;
///
/// let mut mc = MarkedContent::new();
///
/// // Begin marked content with an MCID that links it to a structure element
/// mc.begin_with_mcid("P", 0)?;
/// // ... add content (text, graphics, etc.) ...
/// mc.end()?;
///
/// // Get the PDF operators as a string
/// let operators = mc.finish()?;
/// assert!(operators.contains("/P << /MCID 0 >> BDC"));
/// # Ok::<(), oxidize_pdf::PdfError>(())
/// ```
33use crate::error::{PdfError, Result};
34use lazy_static::lazy_static;
35use regex::Regex;
36use std::fmt::Write;
37
// Validation limits enforced by the marked content builder.

/// Longest accepted tag, in bytes (PDF name object limit).
const MAX_TAG_LENGTH: usize = 127;
/// Upper bound on the accumulated operator text: 10 MiB.
const MAX_OPERATIONS_SIZE: usize = 10 << 20;
/// Maximum number of marked content sections that may be open at once.
const MAX_NESTING_DEPTH: usize = 100;
42
lazy_static! {
    /// Pattern a marked content tag must match in full: one or more ASCII
    /// letters, digits, underscores, or hyphens.
    ///
    /// The regex is compiled once, on first use. The pattern is a fixed
    /// literal, so a compilation failure would be a programming error and is
    /// reported by panicking.
    static ref VALID_TAG_PATTERN: Regex = Regex::new(r"^[A-Za-z0-9_-]+$")
        .expect("VALID_TAG_PATTERN regex is hardcoded and must be valid");
}
48
/// Builder that accumulates marked content operators (BMC / BDC / EMC) for a
/// Tagged PDF content stream.
#[derive(Debug, Clone)]
pub struct MarkedContent {
    /// Operator text emitted so far, one operator per line.
    operations: String,
    /// Tags of the sections that are currently open, innermost last. Used to
    /// check that every begin is matched by an end.
    tag_stack: Vec<String>,
}
56
57impl Default for MarkedContent {
58    fn default() -> Self {
59        Self::new()
60    }
61}
62
/// Common marked content properties
///
/// Each variant corresponds to one entry of the property dictionary that is
/// written in front of a BDC operator.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum MarkedContentProperty {
    /// Marked Content ID (links to structure tree)
    MCID(u32),
    /// Language specification
    Lang(String),
    /// Actual text (for accessibility)
    ActualText(String),
    /// Alternate description
    Alt(String),
    /// Expansion of abbreviation
    E(String),
}

impl MarkedContentProperty {
    /// Returns the PDF dictionary key for this property, without the leading `/`.
    fn key(&self) -> &str {
        match self {
            MarkedContentProperty::MCID(_) => "MCID",
            MarkedContentProperty::Lang(_) => "Lang",
            MarkedContentProperty::ActualText(_) => "ActualText",
            MarkedContentProperty::Alt(_) => "Alt",
            MarkedContentProperty::E(_) => "E",
        }
    }

    /// Returns the PDF dictionary value for this property, serialized as PDF syntax.
    ///
    /// `MCID` becomes a bare integer. All other variants carry text and are
    /// serialized as a PDF text string (see [`Self::encode_text_string`]).
    fn value(&self) -> String {
        match self {
            MarkedContentProperty::MCID(id) => id.to_string(),
            MarkedContentProperty::Lang(text)
            | MarkedContentProperty::ActualText(text)
            | MarkedContentProperty::Alt(text)
            | MarkedContentProperty::E(text) => Self::encode_text_string(text),
        }
    }

    /// Serializes `text` as a PDF text string.
    ///
    /// * Text made only of printable ASCII plus tab, LF and CR is written as a
    ///   literal string `( … )`. Backslashes and parentheses are escaped, and
    ///   CR is written as `\r` because a raw CR inside a literal string is read
    ///   back as LF.
    /// * Any other text is written as a hexadecimal string holding UTF-16BE
    ///   code units prefixed with the `FEFF` byte order mark. Raw UTF-8 bytes
    ///   inside a literal string would be interpreted as PDFDocEncoding and
    ///   display as garbage.
    fn encode_text_string(text: &str) -> String {
        let literal_safe = text
            .bytes()
            .all(|b| matches!(b, b'\t' | b'\n' | b'\r' | 0x20..=0x7E));

        if literal_safe {
            let mut literal = String::with_capacity(text.len() + 2);
            literal.push('(');
            for ch in text.chars() {
                match ch {
                    '\\' => literal.push_str("\\\\"),
                    '(' => literal.push_str("\\("),
                    ')' => literal.push_str("\\)"),
                    '\r' => literal.push_str("\\r"),
                    other => literal.push(other),
                }
            }
            literal.push(')');
            literal
        } else {
            // "<FEFF" + four hex digits per UTF-16 code unit + ">"
            let mut hex = String::with_capacity(6 + text.len() * 4);
            hex.push_str("<FEFF");
            for unit in text.encode_utf16() {
                hex.push_str(&format!("{unit:04X}"));
            }
            hex.push('>');
            hex
        }
    }
}
106
107/// Validates a marked content tag name
108///
109/// Tags must be valid PDF name objects: alphanumeric, underscore, or hyphen.
110/// Maximum length is 127 characters.
111fn validate_tag(tag: &str) -> Result<()> {
112    if tag.is_empty() {
113        return Err(PdfError::InvalidOperation(
114            "Marked content tag cannot be empty".to_string(),
115        ));
116    }
117
118    if tag.len() > MAX_TAG_LENGTH {
119        return Err(PdfError::InvalidOperation(format!(
120            "Marked content tag too long: {} characters (max {})",
121            tag.len(),
122            MAX_TAG_LENGTH
123        )));
124    }
125
126    if !VALID_TAG_PATTERN.is_match(tag) {
127        return Err(PdfError::InvalidOperation(format!(
128            "Invalid marked content tag '{}': must contain only alphanumeric, underscore, or hyphen",
129            tag
130        )));
131    }
132
133    Ok(())
134}
135
136impl MarkedContent {
137    /// Creates a new marked content builder
138    pub fn new() -> Self {
139        Self {
140            operations: String::new(),
141            tag_stack: Vec::new(),
142        }
143    }
144
145    /// Checks if size limit has been exceeded
146    fn check_size_limit(&self) -> Result<()> {
147        if self.operations.len() > MAX_OPERATIONS_SIZE {
148            return Err(PdfError::InvalidOperation(format!(
149                "Marked content operations exceed size limit: {} bytes (max {})",
150                self.operations.len(),
151                MAX_OPERATIONS_SIZE
152            )));
153        }
154        Ok(())
155    }
156
157    /// Checks if nesting depth limit has been exceeded
158    fn check_nesting_limit(&self) -> Result<()> {
159        if self.tag_stack.len() >= MAX_NESTING_DEPTH {
160            return Err(PdfError::InvalidOperation(format!(
161                "Marked content nesting too deep: {} levels (max {})",
162                self.tag_stack.len(),
163                MAX_NESTING_DEPTH
164            )));
165        }
166        Ok(())
167    }
168
169    /// Begin marked content without properties (BMC operator)
170    ///
171    /// # Arguments
172    ///
173    /// * `tag` - Structure type tag (e.g., "P" for paragraph, "H1" for heading)
174    ///
175    /// # Example
176    ///
177    /// ```
178    /// use oxidize_pdf::structure::MarkedContent;
179    ///
180    /// let mut mc = MarkedContent::new();
181    /// mc.begin("P");
182    /// // ... add content ...
183    /// mc.end();
184    /// ```
185    pub fn begin(&mut self, tag: &str) -> Result<&mut Self> {
186        validate_tag(tag)?;
187        self.check_nesting_limit()?;
188        self.check_size_limit()?;
189
190        writeln!(&mut self.operations, "/{tag} BMC")
191            .map_err(|e| PdfError::Internal(format!("Failed to write BMC operator: {e}")))?;
192
193        self.tag_stack.push(tag.to_string());
194        Ok(self)
195    }
196
197    /// Begin marked content with properties dictionary (BDC operator)
198    ///
199    /// This is the primary method for Tagged PDF, as it allows specifying
200    /// the MCID (Marked Content ID) that links content to structure elements.
201    ///
202    /// # Arguments
203    ///
204    /// * `tag` - Structure type tag (e.g., "P", "H1", "Figure")
205    /// * `mcid` - Marked Content ID linking to structure tree
206    ///
207    /// # Example
208    ///
209    /// ```
210    /// use oxidize_pdf::structure::MarkedContent;
211    ///
212    /// let mut mc = MarkedContent::new();
213    /// mc.begin_with_mcid("P", 0)?;
214    /// // ... add paragraph content ...
215    /// mc.end()?;
216    /// # Ok::<(), oxidize_pdf::PdfError>(())
217    /// ```
218    pub fn begin_with_mcid(&mut self, tag: &str, mcid: u32) -> Result<&mut Self> {
219        validate_tag(tag)?;
220        self.check_nesting_limit()?;
221        self.check_size_limit()?;
222
223        // BDC operator with inline dictionary containing MCID
224        writeln!(&mut self.operations, "/{tag} << /MCID {mcid} >> BDC")
225            .map_err(|e| PdfError::Internal(format!("Failed to write BDC operator: {e}")))?;
226
227        self.tag_stack.push(tag.to_string());
228        Ok(self)
229    }
230
231    /// Begin marked content with typed properties (BDC operator)
232    ///
233    /// This is a type-safe alternative to `begin_with_properties` that uses
234    /// an enum for common marked content properties.
235    ///
236    /// # Arguments
237    ///
238    /// * `tag` - Structure type tag
239    /// * `properties` - Typed properties (MCID, Lang, ActualText, etc.)
240    ///
241    /// # Example
242    ///
243    /// ```
244    /// use oxidize_pdf::structure::{MarkedContent, MarkedContentProperty};
245    ///
246    /// let mut mc = MarkedContent::new();
247    /// let props = vec![
248    ///     MarkedContentProperty::MCID(0),
249    ///     MarkedContentProperty::Lang("en-US".to_string()),
250    /// ];
251    /// mc.begin_with_typed_properties("P", &props)?;
252    /// # Ok::<(), oxidize_pdf::PdfError>(())
253    /// ```
254    pub fn begin_with_typed_properties(
255        &mut self,
256        tag: &str,
257        properties: &[MarkedContentProperty],
258    ) -> Result<&mut Self> {
259        validate_tag(tag)?;
260        self.check_nesting_limit()?;
261        self.check_size_limit()?;
262
263        // Build properties dictionary
264        write!(&mut self.operations, "/{tag} <<")
265            .map_err(|e| PdfError::Internal(format!("Failed to write BDC operator start: {e}")))?;
266
267        for prop in properties {
268            write!(&mut self.operations, " /{} {}", prop.key(), prop.value()).map_err(|e| {
269                PdfError::Internal(format!("Failed to write property {}: {e}", prop.key()))
270            })?;
271        }
272
273        writeln!(&mut self.operations, " >> BDC")
274            .map_err(|e| PdfError::Internal(format!("Failed to write BDC operator end: {e}")))?;
275
276        self.tag_stack.push(tag.to_string());
277        Ok(self)
278    }
279
280    /// Begin marked content with custom properties dictionary
281    ///
282    /// Allows specifying additional properties beyond MCID.
283    ///
284    /// # Arguments
285    ///
286    /// * `tag` - Structure type tag
287    /// * `properties` - Dictionary entries as key-value pairs
288    ///
289    /// # Example
290    ///
291    /// ```
292    /// use oxidize_pdf::structure::MarkedContent;
293    ///
294    /// let mut mc = MarkedContent::new();
295    /// let props = vec![("MCID", "0"), ("Lang", "(en-US)")];
296    /// mc.begin_with_properties("P", &props)?;
297    /// # Ok::<(), oxidize_pdf::PdfError>(())
298    /// ```
299    pub fn begin_with_properties(
300        &mut self,
301        tag: &str,
302        properties: &[(&str, &str)],
303    ) -> Result<&mut Self> {
304        validate_tag(tag)?;
305        self.check_nesting_limit()?;
306        self.check_size_limit()?;
307
308        // Build properties dictionary
309        write!(&mut self.operations, "/{tag} <<")
310            .map_err(|e| PdfError::Internal(format!("Failed to write BDC operator start: {e}")))?;
311
312        for (key, value) in properties {
313            write!(&mut self.operations, " /{key} {value}")
314                .map_err(|e| PdfError::Internal(format!("Failed to write property {key}: {e}")))?;
315        }
316
317        writeln!(&mut self.operations, " >> BDC")
318            .map_err(|e| PdfError::Internal(format!("Failed to write BDC operator end: {e}")))?;
319
320        self.tag_stack.push(tag.to_string());
321        Ok(self)
322    }
323
324    /// End marked content (EMC operator)
325    ///
326    /// Closes the most recently opened marked content section.
327    ///
328    /// # Errors
329    ///
330    /// Returns an error if there are no open marked content sections.
331    ///
332    /// # Example
333    ///
334    /// ```
335    /// use oxidize_pdf::structure::MarkedContent;
336    ///
337    /// let mut mc = MarkedContent::new();
338    /// mc.begin("P")?;
339    /// mc.end()?; // Closes the "P" section
340    /// # Ok::<(), oxidize_pdf::PdfError>(())
341    /// ```
342    pub fn end(&mut self) -> Result<&mut Self> {
343        if self.tag_stack.is_empty() {
344            return Err(PdfError::InvalidStructure(
345                "Cannot end marked content: no open sections".to_string(),
346            ));
347        }
348
349        self.tag_stack.pop();
350        writeln!(&mut self.operations, "EMC")
351            .map_err(|e| PdfError::Internal(format!("Failed to write EMC operator: {e}")))?;
352
353        Ok(self)
354    }
355
356    /// Returns true if there are open marked content sections
357    pub fn has_open_sections(&self) -> bool {
358        !self.tag_stack.is_empty()
359    }
360
361    /// Returns the number of open marked content sections
362    pub fn open_section_count(&self) -> usize {
363        self.tag_stack.len()
364    }
365
366    /// Get the current tag stack (for debugging/validation)
367    pub fn tag_stack(&self) -> &[String] {
368        &self.tag_stack
369    }
370
371    /// Finishes marked content generation and returns the PDF operators
372    ///
373    /// # Errors
374    ///
375    /// Returns an error if there are still open marked content sections.
376    ///
377    /// # Example
378    ///
379    /// ```
380    /// use oxidize_pdf::structure::MarkedContent;
381    ///
382    /// let mut mc = MarkedContent::new();
383    /// mc.begin_with_mcid("P", 0)?;
384    /// mc.end()?;
385    /// let operators = mc.finish()?;
386    /// assert!(operators.contains("BMC") || operators.contains("BDC"));
387    /// assert!(operators.contains("EMC"));
388    /// # Ok::<(), oxidize_pdf::PdfError>(())
389    /// ```
390    pub fn finish(self) -> Result<String> {
391        if !self.tag_stack.is_empty() {
392            return Err(PdfError::InvalidStructure(format!(
393                "Cannot finish marked content: {} open section(s) remaining: {:?}",
394                self.tag_stack.len(),
395                self.tag_stack
396            )));
397        }
398
399        Ok(self.operations)
400    }
401
402    /// Returns the operations string without consuming self
403    ///
404    /// Unlike `finish()`, this does not validate that all sections are closed.
405    /// Useful for incremental content generation.
406    pub fn operations(&self) -> &str {
407        &self.operations
408    }
409
410    /// Clears all operations and resets the tag stack
411    ///
412    /// Useful for reusing the builder for multiple content sections.
413    pub fn reset(&mut self) {
414        self.operations.clear();
415        self.tag_stack.clear();
416    }
417}
418
#[cfg(test)]
mod tests {
    use super::*;

    // --- Operator output -------------------------------------------------

    /// `begin` followed by `end` emits a BMC/EMC pair.
    #[test]
    fn test_bmc_operator() {
        let mut builder = MarkedContent::new();
        builder.begin("P").unwrap().end().unwrap();

        let stream = builder.finish().unwrap();
        assert!(stream.contains("/P BMC"));
        assert!(stream.contains("EMC"));
    }

    /// `begin_with_mcid` emits a BDC operator with an inline MCID dictionary.
    #[test]
    fn test_bdc_with_mcid() {
        let mut builder = MarkedContent::new();
        builder.begin_with_mcid("P", 42).unwrap().end().unwrap();

        let stream = builder.finish().unwrap();
        assert!(stream.contains("/P << /MCID 42 >> BDC"));
        assert!(stream.contains("EMC"));
    }

    /// Custom key/value pairs are written in order into the BDC dictionary.
    #[test]
    fn test_bdc_with_properties() {
        let entries = vec![("MCID", "0"), ("Lang", "(en-US)")];

        let mut builder = MarkedContent::new();
        builder
            .begin_with_properties("P", &entries)
            .unwrap()
            .end()
            .unwrap();

        let stream = builder.finish().unwrap();
        assert!(stream.contains("/P << /MCID 0 /Lang (en-US) >> BDC"));
    }

    /// Two nested sections produce two BDC and two EMC operators.
    #[test]
    fn test_nested_marked_content() {
        let mut builder = MarkedContent::new();
        builder.begin_with_mcid("Div", 0).unwrap();
        builder.begin_with_mcid("P", 1).unwrap();
        builder.end().unwrap(); // closes P
        builder.end().unwrap(); // closes Div

        let stream = builder.finish().unwrap();
        assert_eq!(stream.matches("BDC").count(), 2);
        assert_eq!(stream.matches("EMC").count(), 2);
    }

    // --- Structural errors -----------------------------------------------

    /// Tags containing spaces or delimiters are rejected.
    #[test]
    fn test_invalid_tag_name() {
        let mut builder = MarkedContent::new();

        let with_space = builder.begin("Invalid Tag");
        assert!(with_space.is_err());

        let with_brackets = builder.begin("Tag<>");
        assert!(with_brackets.is_err());
    }

    /// `end` with nothing open is an error.
    #[test]
    fn test_end_without_begin() {
        let mut builder = MarkedContent::new();
        assert!(builder.end().is_err());
    }

    /// `finish` refuses to return a stream that still has open sections.
    #[test]
    fn test_finish_with_open_sections() {
        let mut builder = MarkedContent::new();
        builder.begin("P").unwrap();
        // Intentionally no matching end().

        assert!(builder.finish().is_err());
    }

    /// The open-section bookkeeping follows every begin and end.
    #[test]
    fn test_tag_stack() {
        let mut builder = MarkedContent::new();
        assert_eq!(builder.open_section_count(), 0);
        assert!(!builder.has_open_sections());

        builder.begin("Div").unwrap();
        assert_eq!(builder.open_section_count(), 1);
        assert!(builder.has_open_sections());

        builder.begin("P").unwrap();
        assert_eq!(builder.open_section_count(), 2);

        builder.end().unwrap();
        assert_eq!(builder.open_section_count(), 1);

        builder.end().unwrap();
        assert_eq!(builder.open_section_count(), 0);
        assert!(!builder.has_open_sections());
    }

    /// `reset` discards both the emitted operators and the open sections.
    #[test]
    fn test_reset() {
        let mut builder = MarkedContent::new();
        builder.begin("P").unwrap().end().unwrap();

        builder.reset();
        assert_eq!(builder.operations().len(), 0);
        assert_eq!(builder.open_section_count(), 0);
    }

    // --- Limits and validation -------------------------------------------

    /// Twenty nested sections can be opened and closed again.
    #[test]
    fn test_deep_nesting() {
        const DEPTH: usize = 20;
        let mut builder = MarkedContent::new();

        for level in 0..DEPTH {
            builder.begin(&format!("Level{}", level)).unwrap();
        }
        assert_eq!(builder.open_section_count(), DEPTH);

        for _ in 0..DEPTH {
            builder.end().unwrap();
        }
        assert_eq!(builder.open_section_count(), 0);

        let stream = builder.finish().unwrap();
        assert_eq!(stream.matches("BMC").count(), DEPTH);
        assert_eq!(stream.matches("EMC").count(), DEPTH);
    }

    /// Opening a 101st nested section fails with a nesting error.
    #[test]
    fn test_nesting_limit_exceeded() {
        let mut builder = MarkedContent::new();

        // Fill the builder up to the 100-level limit.
        for level in 0..100 {
            builder.begin(&format!("L{}", level)).unwrap();
        }

        // One more level must be refused.
        let overflow = builder.begin("TooDeep");
        assert!(overflow.is_err());
        let message = overflow.unwrap_err().to_string();
        assert!(message.contains("nesting too deep"));
    }

    /// Letters, digits, underscores and hyphens are all accepted in tags.
    #[test]
    fn test_tag_validation_alphanumeric() {
        let mut builder = MarkedContent::new();

        for tag in ["P", "H1", "My_Tag", "Tag-123"] {
            assert!(builder.begin(tag).is_ok());
            builder.end().unwrap();
        }
    }

    /// Tags containing special characters are rejected.
    #[test]
    fn test_tag_validation_invalid_chars() {
        let mut builder = MarkedContent::new();

        for tag in ["Tag@Value", "Tag#123", "Tag$Name", "Tag%", "Tag/Path"] {
            assert!(builder.begin(tag).is_err());
        }
    }

    /// A 127-character tag is accepted, a 128-character tag is not.
    #[test]
    fn test_tag_length_limit() {
        let mut builder = MarkedContent::new();

        let at_limit = "A".repeat(127);
        assert!(builder.begin(&at_limit).is_ok());
        builder.end().unwrap();

        let past_limit = "A".repeat(128);
        assert!(builder.begin(&past_limit).is_err());
    }

    // --- Typed properties ------------------------------------------------

    /// Typed properties are serialized as dictionary entries.
    #[test]
    fn test_typed_properties() {
        let entries = vec![
            MarkedContentProperty::MCID(42),
            MarkedContentProperty::Lang("en-US".to_string()),
            MarkedContentProperty::ActualText("Hello".to_string()),
        ];

        let mut builder = MarkedContent::new();
        builder
            .begin_with_typed_properties("P", &entries)
            .unwrap()
            .end()
            .unwrap();

        let stream = builder.finish().unwrap();
        assert!(stream.contains("/MCID 42"));
        assert!(stream.contains("/Lang (en-US)"));
        assert!(stream.contains("/ActualText (Hello)"));
    }

    /// Parentheses and backslashes in text values are escaped.
    #[test]
    fn test_property_string_escaping() {
        let entries = vec![
            MarkedContentProperty::ActualText("Text with (parens)".to_string()),
            MarkedContentProperty::Alt("Text\\with\\backslashes".to_string()),
        ];

        let mut builder = MarkedContent::new();
        builder
            .begin_with_typed_properties("P", &entries)
            .unwrap()
            .end()
            .unwrap();

        let stream = builder.finish().unwrap();
        assert!(stream.contains("\\(") && stream.contains("\\)"));
        assert!(stream.contains("\\\\"));
    }

    /// Repeatedly opening sections eventually trips the buffer size limit.
    #[test]
    fn test_size_limit() {
        let mut builder = MarkedContent::new();

        // A 100-character prefix makes every operator line substantial.
        let prefix = "T".repeat(100);

        // Open and close uniquely tagged sections until one step is refused.
        let mut last_attempt = 0;
        let hit_limit = (0..200_000).any(|attempt| {
            last_attempt = attempt;
            let tag = format!("{}{}", prefix, attempt);
            match builder.begin(&tag) {
                Ok(section) => section.end().is_err(),
                Err(_) => true,
            }
        });

        assert!(
            hit_limit,
            "Expected to hit size limit but completed {} iterations (ops size: {} bytes)",
            last_attempt,
            builder.operations().len()
        );

        // The buffer must have grown close to the limit before the refusal.
        let ops_size = builder.operations().len();
        assert!(
            ops_size > 9_000_000,
            "Operations size {} should be near 10MB limit after hitting size check",
            ops_size
        );
    }

    /// Properties compare by variant and payload.
    #[test]
    fn test_property_enum_equality() {
        let first = MarkedContentProperty::MCID(42);
        let same = MarkedContentProperty::MCID(42);
        let different = MarkedContentProperty::MCID(43);
        assert_eq!(first, same);
        assert_ne!(first, different);

        let lang_a = MarkedContentProperty::Lang("en".to_string());
        let lang_b = MarkedContentProperty::Lang("en".to_string());
        assert_eq!(lang_a, lang_b);
    }

    /// An empty tag is rejected with a dedicated message.
    #[test]
    fn test_empty_tag() {
        let mut builder = MarkedContent::new();
        let outcome = builder.begin("");
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("cannot be empty"));
    }

    /// Exercises `validate_tag` directly with accepted and rejected tags.
    #[test]
    fn test_validate_tag_function() {
        for accepted in ["P", "H1", "My-Tag_123"] {
            assert!(validate_tag(accepted).is_ok());
        }

        let too_long = "A".repeat(128);
        for rejected in ["", "Tag with spaces", "Tag<>", too_long.as_str()] {
            assert!(validate_tag(rejected).is_err());
        }
    }
}