1use crate::error::{PdfError, Result};
34use lazy_static::lazy_static;
35use regex::Regex;
36use std::fmt::Write;
37
38const MAX_TAG_LENGTH: usize = 127; const MAX_OPERATIONS_SIZE: usize = 10 * 1024 * 1024; const MAX_NESTING_DEPTH: usize = 100; lazy_static! {
44 static ref VALID_TAG_PATTERN: Regex = Regex::new(r"^[A-Za-z0-9_-]+$")
46 .expect("VALID_TAG_PATTERN regex is hardcoded and must be valid");
47}
48
/// Incremental builder for a run of PDF marked-content operators.
///
/// The builder appends operator text to an internal buffer and tracks which
/// sections are still open so that unbalanced sequences can be reported.
#[derive(Debug, Clone)]
pub struct MarkedContent {
    /// Operator text emitted so far, one operator per line.
    operations: String,
    /// Tags of the sections that are currently open, outermost first.
    tag_stack: Vec<String>,
}
56
57impl Default for MarkedContent {
58 fn default() -> Self {
59 Self::new()
60 }
61}
62
/// A typed entry for the property dictionary that accompanies a `BDC` operator.
///
/// Each variant maps to the dictionary key of the same name. `MCID` is written
/// as an integer; every other variant is written as a PDF text string.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum MarkedContentProperty {
    /// `/MCID` entry: the marked-content identifier, written as a bare integer.
    MCID(u32),
    /// `/Lang` entry, written as a text string.
    Lang(String),
    /// `/ActualText` entry, written as a text string.
    ActualText(String),
    /// `/Alt` entry, written as a text string.
    Alt(String),
    /// `/E` entry, written as a text string.
    E(String),
}

impl MarkedContentProperty {
    /// Returns the dictionary key for this property, without the leading `/`.
    fn key(&self) -> &'static str {
        match self {
            MarkedContentProperty::MCID(_) => "MCID",
            MarkedContentProperty::Lang(_) => "Lang",
            MarkedContentProperty::ActualText(_) => "ActualText",
            MarkedContentProperty::Alt(_) => "Alt",
            MarkedContentProperty::E(_) => "E",
        }
    }

    /// Returns the serialized PDF object for this property's value.
    ///
    /// `MCID` becomes a decimal integer; the textual variants are encoded by
    /// `encode_text_string`.
    fn value(&self) -> String {
        match self {
            MarkedContentProperty::MCID(id) => id.to_string(),
            MarkedContentProperty::Lang(s)
            | MarkedContentProperty::ActualText(s)
            | MarkedContentProperty::Alt(s)
            | MarkedContentProperty::E(s) => Self::encode_text_string(s),
        }
    }

    /// Serializes `text` as a PDF text string.
    ///
    /// * Text made only of printable ASCII plus tab, LF and CR is written as a
    ///   literal string `( ... )` with `\`, `(` and `)` backslash-escaped. CR is
    ///   written as the `\r` escape because a raw CR inside a literal string is
    ///   read back as LF.
    /// * Anything else (non-ASCII characters or other control characters) is
    ///   written as a hexadecimal string holding UTF-16BE code units preceded by
    ///   the `FEFF` byte-order mark. Raw UTF-8 bytes in a literal string would be
    ///   interpreted as PDFDocEncoding and displayed incorrectly.
    fn encode_text_string(text: &str) -> String {
        let literal_safe = text
            .chars()
            .all(|c| c.is_ascii() && (!c.is_ascii_control() || matches!(c, '\n' | '\r' | '\t')));

        if literal_safe {
            let mut out = String::with_capacity(text.len() + 2);
            out.push('(');
            for c in text.chars() {
                match c {
                    '\\' => out.push_str("\\\\"),
                    '(' => out.push_str("\\("),
                    ')' => out.push_str("\\)"),
                    '\r' => out.push_str("\\r"),
                    other => out.push(other),
                }
            }
            out.push(')');
            out
        } else {
            let mut out = String::from("<FEFF");
            for unit in text.encode_utf16() {
                out.push_str(&format!("{unit:04X}"));
            }
            out.push('>');
            out
        }
    }
}
106
107fn validate_tag(tag: &str) -> Result<()> {
112 if tag.is_empty() {
113 return Err(PdfError::InvalidOperation(
114 "Marked content tag cannot be empty".to_string(),
115 ));
116 }
117
118 if tag.len() > MAX_TAG_LENGTH {
119 return Err(PdfError::InvalidOperation(format!(
120 "Marked content tag too long: {} characters (max {})",
121 tag.len(),
122 MAX_TAG_LENGTH
123 )));
124 }
125
126 if !VALID_TAG_PATTERN.is_match(tag) {
127 return Err(PdfError::InvalidOperation(format!(
128 "Invalid marked content tag '{}': must contain only alphanumeric, underscore, or hyphen",
129 tag
130 )));
131 }
132
133 Ok(())
134}
135
136impl MarkedContent {
137 pub fn new() -> Self {
139 Self {
140 operations: String::new(),
141 tag_stack: Vec::new(),
142 }
143 }
144
145 fn check_size_limit(&self) -> Result<()> {
147 if self.operations.len() > MAX_OPERATIONS_SIZE {
148 return Err(PdfError::InvalidOperation(format!(
149 "Marked content operations exceed size limit: {} bytes (max {})",
150 self.operations.len(),
151 MAX_OPERATIONS_SIZE
152 )));
153 }
154 Ok(())
155 }
156
157 fn check_nesting_limit(&self) -> Result<()> {
159 if self.tag_stack.len() >= MAX_NESTING_DEPTH {
160 return Err(PdfError::InvalidOperation(format!(
161 "Marked content nesting too deep: {} levels (max {})",
162 self.tag_stack.len(),
163 MAX_NESTING_DEPTH
164 )));
165 }
166 Ok(())
167 }
168
169 pub fn begin(&mut self, tag: &str) -> Result<&mut Self> {
186 validate_tag(tag)?;
187 self.check_nesting_limit()?;
188 self.check_size_limit()?;
189
190 writeln!(&mut self.operations, "/{tag} BMC")
191 .map_err(|e| PdfError::Internal(format!("Failed to write BMC operator: {e}")))?;
192
193 self.tag_stack.push(tag.to_string());
194 Ok(self)
195 }
196
197 pub fn begin_with_mcid(&mut self, tag: &str, mcid: u32) -> Result<&mut Self> {
219 validate_tag(tag)?;
220 self.check_nesting_limit()?;
221 self.check_size_limit()?;
222
223 writeln!(&mut self.operations, "/{tag} << /MCID {mcid} >> BDC")
225 .map_err(|e| PdfError::Internal(format!("Failed to write BDC operator: {e}")))?;
226
227 self.tag_stack.push(tag.to_string());
228 Ok(self)
229 }
230
231 pub fn begin_with_typed_properties(
255 &mut self,
256 tag: &str,
257 properties: &[MarkedContentProperty],
258 ) -> Result<&mut Self> {
259 validate_tag(tag)?;
260 self.check_nesting_limit()?;
261 self.check_size_limit()?;
262
263 write!(&mut self.operations, "/{tag} <<")
265 .map_err(|e| PdfError::Internal(format!("Failed to write BDC operator start: {e}")))?;
266
267 for prop in properties {
268 write!(&mut self.operations, " /{} {}", prop.key(), prop.value()).map_err(|e| {
269 PdfError::Internal(format!("Failed to write property {}: {e}", prop.key()))
270 })?;
271 }
272
273 writeln!(&mut self.operations, " >> BDC")
274 .map_err(|e| PdfError::Internal(format!("Failed to write BDC operator end: {e}")))?;
275
276 self.tag_stack.push(tag.to_string());
277 Ok(self)
278 }
279
280 pub fn begin_with_properties(
300 &mut self,
301 tag: &str,
302 properties: &[(&str, &str)],
303 ) -> Result<&mut Self> {
304 validate_tag(tag)?;
305 self.check_nesting_limit()?;
306 self.check_size_limit()?;
307
308 write!(&mut self.operations, "/{tag} <<")
310 .map_err(|e| PdfError::Internal(format!("Failed to write BDC operator start: {e}")))?;
311
312 for (key, value) in properties {
313 write!(&mut self.operations, " /{key} {value}")
314 .map_err(|e| PdfError::Internal(format!("Failed to write property {key}: {e}")))?;
315 }
316
317 writeln!(&mut self.operations, " >> BDC")
318 .map_err(|e| PdfError::Internal(format!("Failed to write BDC operator end: {e}")))?;
319
320 self.tag_stack.push(tag.to_string());
321 Ok(self)
322 }
323
324 pub fn end(&mut self) -> Result<&mut Self> {
343 if self.tag_stack.is_empty() {
344 return Err(PdfError::InvalidStructure(
345 "Cannot end marked content: no open sections".to_string(),
346 ));
347 }
348
349 self.tag_stack.pop();
350 writeln!(&mut self.operations, "EMC")
351 .map_err(|e| PdfError::Internal(format!("Failed to write EMC operator: {e}")))?;
352
353 Ok(self)
354 }
355
356 pub fn has_open_sections(&self) -> bool {
358 !self.tag_stack.is_empty()
359 }
360
361 pub fn open_section_count(&self) -> usize {
363 self.tag_stack.len()
364 }
365
366 pub fn tag_stack(&self) -> &[String] {
368 &self.tag_stack
369 }
370
371 pub fn finish(self) -> Result<String> {
391 if !self.tag_stack.is_empty() {
392 return Err(PdfError::InvalidStructure(format!(
393 "Cannot finish marked content: {} open section(s) remaining: {:?}",
394 self.tag_stack.len(),
395 self.tag_stack
396 )));
397 }
398
399 Ok(self.operations)
400 }
401
402 pub fn operations(&self) -> &str {
407 &self.operations
408 }
409
410 pub fn reset(&mut self) {
414 self.operations.clear();
415 self.tag_stack.clear();
416 }
417}
418
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain begin/end pair emits `BMC` followed by `EMC`.
    #[test]
    fn test_bmc_operator() {
        let mut content = MarkedContent::new();
        content.begin("P").unwrap().end().unwrap();

        let emitted = content.finish().unwrap();
        assert!(emitted.contains("/P BMC"));
        assert!(emitted.contains("EMC"));
    }

    /// `begin_with_mcid` emits a `BDC` operator with an inline MCID dictionary.
    #[test]
    fn test_bdc_with_mcid() {
        let mut content = MarkedContent::new();
        content.begin_with_mcid("P", 42).unwrap().end().unwrap();

        let emitted = content.finish().unwrap();
        assert!(emitted.contains("/P << /MCID 42 >> BDC"));
        assert!(emitted.contains("EMC"));
    }

    /// Raw key/value pairs are written in order into the property dictionary.
    #[test]
    fn test_bdc_with_properties() {
        let raw_pairs = [("MCID", "0"), ("Lang", "(en-US)")];

        let mut content = MarkedContent::new();
        content.begin_with_properties("P", &raw_pairs).unwrap();
        content.end().unwrap();

        let emitted = content.finish().unwrap();
        assert!(emitted.contains("/P << /MCID 0 /Lang (en-US) >> BDC"));
    }

    /// Two nested sections produce two `BDC` and two `EMC` operators.
    #[test]
    fn test_nested_marked_content() {
        let mut content = MarkedContent::new();
        content.begin_with_mcid("Div", 0).unwrap();
        content.begin_with_mcid("P", 1).unwrap();
        content.end().unwrap();
        content.end().unwrap();

        let emitted = content.finish().unwrap();
        assert_eq!(emitted.matches("BDC").count(), 2);
        assert_eq!(emitted.matches("EMC").count(), 2);
    }

    /// Tags containing spaces or angle brackets are rejected.
    #[test]
    fn test_invalid_tag_name() {
        let mut content = MarkedContent::new();

        let with_space = content.begin("Invalid Tag");
        assert!(with_space.is_err());

        let with_brackets = content.begin("Tag<>");
        assert!(with_brackets.is_err());
    }

    /// `end` on a fresh builder is an error.
    #[test]
    fn test_end_without_begin() {
        let mut content = MarkedContent::new();
        let outcome = content.end();
        assert!(outcome.is_err());
    }

    /// `finish` refuses to return while a section is still open.
    #[test]
    fn test_finish_with_open_sections() {
        let mut content = MarkedContent::new();
        content.begin("P").unwrap();

        let outcome = content.finish();
        assert!(outcome.is_err());
    }

    /// The open-section counters follow every begin and end.
    #[test]
    fn test_tag_stack() {
        let mut content = MarkedContent::new();
        assert_eq!(content.open_section_count(), 0);
        assert!(!content.has_open_sections());

        content.begin("Div").unwrap();
        assert_eq!(content.open_section_count(), 1);
        assert!(content.has_open_sections());

        content.begin("P").unwrap();
        assert_eq!(content.open_section_count(), 2);

        content.end().unwrap();
        assert_eq!(content.open_section_count(), 1);

        content.end().unwrap();
        assert_eq!(content.open_section_count(), 0);
        assert!(!content.has_open_sections());
    }

    /// `reset` clears both the operator buffer and the tag stack.
    #[test]
    fn test_reset() {
        let mut content = MarkedContent::new();
        content.begin("P").unwrap();
        content.end().unwrap();

        content.reset();
        assert_eq!(content.operations().len(), 0);
        assert_eq!(content.open_section_count(), 0);
    }

    /// Twenty levels of nesting are accepted and unwound cleanly.
    #[test]
    fn test_deep_nesting() {
        const DEPTH: usize = 20;
        let mut content = MarkedContent::new();

        for level in 0..DEPTH {
            content.begin(&format!("Level{}", level)).unwrap();
        }
        assert_eq!(content.open_section_count(), 20);

        for _ in 0..DEPTH {
            content.end().unwrap();
        }
        assert_eq!(content.open_section_count(), 0);

        let emitted = content.finish().unwrap();
        assert_eq!(emitted.matches("BMC").count(), 20);
        assert_eq!(emitted.matches("EMC").count(), 20);
    }

    /// The 101st simultaneously open section is rejected.
    #[test]
    fn test_nesting_limit_exceeded() {
        let mut content = MarkedContent::new();

        for level in 0..100 {
            content.begin(&format!("L{}", level)).unwrap();
        }

        let outcome = content.begin("TooDeep");
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("nesting too deep"));
    }

    /// Letters, digits, underscores and hyphens are all valid in tags.
    #[test]
    fn test_tag_validation_alphanumeric() {
        let mut content = MarkedContent::new();

        for tag in ["P", "H1", "My_Tag", "Tag-123"] {
            assert!(content.begin(tag).is_ok());
            content.end().unwrap();
        }
    }

    /// Punctuation outside the allowed set is rejected.
    #[test]
    fn test_tag_validation_invalid_chars() {
        let mut content = MarkedContent::new();

        for tag in ["Tag@Value", "Tag#123", "Tag$Name", "Tag%", "Tag/Path"] {
            assert!(content.begin(tag).is_err());
        }
    }

    /// A 127-byte tag is accepted; a 128-byte tag is not.
    #[test]
    fn test_tag_length_limit() {
        let mut content = MarkedContent::new();

        let at_limit = "A".repeat(127);
        assert!(content.begin(&at_limit).is_ok());
        content.end().unwrap();

        let past_limit = "A".repeat(128);
        assert!(content.begin(&past_limit).is_err());
    }

    /// Typed properties are serialized with their keys and formatted values.
    #[test]
    fn test_typed_properties() {
        let typed = [
            MarkedContentProperty::MCID(42),
            MarkedContentProperty::Lang("en-US".to_string()),
            MarkedContentProperty::ActualText("Hello".to_string()),
        ];

        let mut content = MarkedContent::new();
        content.begin_with_typed_properties("P", &typed).unwrap();
        content.end().unwrap();

        let emitted = content.finish().unwrap();
        assert!(emitted.contains("/MCID 42"));
        assert!(emitted.contains("/Lang (en-US)"));
        assert!(emitted.contains("/ActualText (Hello)"));
    }

    /// Parentheses and backslashes in string values are backslash-escaped.
    #[test]
    fn test_property_string_escaping() {
        let typed = [
            MarkedContentProperty::ActualText("Text with (parens)".to_string()),
            MarkedContentProperty::Alt("Text\\with\\backslashes".to_string()),
        ];

        let mut content = MarkedContent::new();
        content.begin_with_typed_properties("P", &typed).unwrap();
        content.end().unwrap();

        let emitted = content.finish().unwrap();
        assert!(emitted.contains("\\(") && emitted.contains("\\)"));
        assert!(emitted.contains("\\\\"));
    }

    /// Repeatedly opening and closing sections eventually trips the size limit.
    #[test]
    fn test_size_limit() {
        let mut content = MarkedContent::new();

        let tag_prefix = "T".repeat(100);
        let mut last_iteration = 0;
        let mut limit_reached = false;

        for i in 0..200_000 {
            last_iteration = i;

            let tag = format!("{}{}", tag_prefix, i);

            // Either a rejected `begin` or a failing `end` counts as hitting
            // the limit; `end` is only attempted when `begin` succeeded.
            let opened = content.begin(&tag).is_ok();
            if !opened || content.end().is_err() {
                limit_reached = true;
                break;
            }
        }

        assert!(
            limit_reached,
            "Expected to hit size limit but completed {} iterations (ops size: {} bytes)",
            last_iteration,
            content.operations().len()
        );

        let buffered = content.operations().len();
        assert!(
            buffered > 9_000_000,
            "Operations size {} should be near 10MB limit after hitting size check",
            buffered
        );
    }

    /// Property equality compares both the variant and its payload.
    #[test]
    fn test_property_enum_equality() {
        let first = MarkedContentProperty::MCID(42);
        let same = MarkedContentProperty::MCID(42);
        let different = MarkedContentProperty::MCID(43);

        assert_eq!(first, same);
        assert_ne!(first, different);

        let lang_a = MarkedContentProperty::Lang("en".to_string());
        let lang_b = MarkedContentProperty::Lang("en".to_string());
        assert_eq!(lang_a, lang_b);
    }

    /// An empty tag is rejected with a dedicated message.
    #[test]
    fn test_empty_tag() {
        let mut content = MarkedContent::new();
        let outcome = content.begin("");
        assert!(outcome.is_err());

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("cannot be empty"));
    }

    /// Direct checks of `validate_tag` for accepted and rejected inputs.
    #[test]
    fn test_validate_tag_function() {
        for accepted in ["P", "H1", "My-Tag_123"] {
            assert!(validate_tag(accepted).is_ok());
        }

        assert!(validate_tag("").is_err());
        assert!(validate_tag("Tag with spaces").is_err());
        assert!(validate_tag("Tag<>").is_err());
        assert!(validate_tag(&"A".repeat(128)).is_err());
    }
}