Skip to main content

pdf_ast/security/
hardening.rs

1use crate::performance::limits::{PerformanceGuard, PerformanceLimits};
2use crate::types::{ObjectId, PdfArray, PdfDictionary, PdfName, PdfString, PdfValue};
3use regex::Regex;
4use std::collections::{HashMap, HashSet};
5
/// Thresholds and deny-lists consulted by `SecurityValidator` and
/// `PdfSanitizer` in this module.
///
/// Use `SecurityLimits::default()` for the baseline policy, or the
/// `permissive()` / `strict()` constructors for looser / tighter presets.
#[derive(Debug, Clone)]
pub struct SecurityLimits {
    /// Maximum string length to prevent memory exhaustion.
    /// Compared against the byte length of the lossily decoded string.
    pub max_string_length: usize,

    /// Maximum array size (number of elements).
    pub max_array_size: usize,

    /// Maximum dictionary size (number of entries).
    pub max_dictionary_size: usize,

    /// Maximum stream size in bytes (length of the stream's `data`).
    pub max_stream_size: usize,

    /// Maximum nesting depth for objects; the root value is at depth 0.
    pub max_nesting_depth: usize,

    /// Maximum number of references to prevent infinite loops.
    /// The validator counts references per object ID and compares each
    /// individual count against this limit.
    pub max_reference_count: usize,

    /// Blacklisted object types that are considered dangerous.
    /// Matched against name values with the leading slash stripped.
    pub forbidden_types: HashSet<String>,

    /// Blacklisted dictionary keys.
    pub forbidden_keys: HashSet<String>,

    /// Patterns that should not appear in string values.
    /// The validator rejects a string on the first match; the sanitizer
    /// replaces every match with a `[SANITIZED]` marker.
    pub forbidden_patterns: Vec<Regex>,

    /// Enable JavaScript validation (inspects `JS` / `JavaScript` entries).
    pub validate_javascript: bool,

    /// Enable form field validation.
    pub validate_forms: bool,

    /// Enable annotation validation.
    pub validate_annotations: bool,

    /// Maximum number of pages (dictionaries whose `Type` is `Page`).
    pub max_pages: usize,

    /// Maximum file size in bytes.
    pub max_file_size: usize,
}
50
51impl Default for SecurityLimits {
52    fn default() -> Self {
53        let mut forbidden_types = HashSet::new();
54        forbidden_types.insert("Launch".to_string());
55        forbidden_types.insert("ImportData".to_string());
56        forbidden_types.insert("JavaScript".to_string());
57        forbidden_types.insert("ResetForm".to_string());
58        forbidden_types.insert("SubmitForm".to_string());
59
60        let mut forbidden_keys = HashSet::new();
61        forbidden_keys.insert("JS".to_string());
62        forbidden_keys.insert("JavaScript".to_string());
63        forbidden_keys.insert("Launch".to_string());
64        forbidden_keys.insert("URI".to_string());
65
66        let mut forbidden_patterns = Vec::new();
67        // JavaScript patterns
68        if let Ok(js_pattern) = Regex::new(r"(?i)(javascript|eval|function|var|let|const)") {
69            forbidden_patterns.push(js_pattern);
70        }
71        // File system patterns
72        if let Ok(fs_pattern) = Regex::new(r"(?i)(\.\.[\\/]|file://|[a-z]:\\)") {
73            forbidden_patterns.push(fs_pattern);
74        }
75        // Network patterns
76        if let Ok(net_pattern) = Regex::new(r"(?i)(https?://|ftp://|ldap://)") {
77            forbidden_patterns.push(net_pattern);
78        }
79
80        Self {
81            max_string_length: 1_000_000,
82            max_array_size: 100_000,
83            max_dictionary_size: 10_000,
84            max_stream_size: 50_000_000, // 50MB
85            max_nesting_depth: 50,
86            max_reference_count: 1_000_000,
87            forbidden_types,
88            forbidden_keys,
89            forbidden_patterns,
90            validate_javascript: true,
91            validate_forms: true,
92            validate_annotations: true,
93            max_pages: 10_000,
94            max_file_size: 100_000_000, // 100MB
95        }
96    }
97}
98
99impl SecurityLimits {
100    pub fn permissive() -> Self {
101        Self {
102            max_string_length: 10_000_000,
103            max_array_size: 1_000_000,
104            max_dictionary_size: 100_000,
105            max_stream_size: 500_000_000, // 500MB
106            max_nesting_depth: 200,
107            max_reference_count: 10_000_000,
108            validate_javascript: false,
109            validate_forms: false,
110            validate_annotations: false,
111            max_pages: 100_000,
112            max_file_size: 1_000_000_000, // 1GB
113            ..Default::default()
114        }
115    }
116
117    pub fn strict() -> Self {
118        let mut limits = Self::default();
119
120        // Add more forbidden types
121        limits.forbidden_types.insert("GoTo".to_string());
122        limits.forbidden_types.insert("GoToR".to_string());
123        limits.forbidden_types.insert("Movie".to_string());
124        limits.forbidden_types.insert("Sound".to_string());
125        limits.forbidden_types.insert("Rendition".to_string());
126
127        // Stricter limits
128        limits.max_string_length = 100_000;
129        limits.max_array_size = 10_000;
130        limits.max_dictionary_size = 1_000;
131        limits.max_stream_size = 10_000_000; // 10MB
132        limits.max_nesting_depth = 20;
133        limits.max_pages = 1_000;
134        limits.max_file_size = 10_000_000; // 10MB
135
136        limits
137    }
138}
139
/// A reason why a PDF value or file was rejected by the security checks.
///
/// For the size/count variants the first field is the observed value and the
/// second is the configured limit that it exceeded.
#[derive(Debug, Clone)]
pub enum SecurityViolation {
    /// A string exceeded the maximum length: `(length, limit)`.
    StringTooLong(usize, usize),
    /// An array exceeded the maximum element count: `(size, limit)`.
    ArrayTooLarge(usize, usize),
    /// A dictionary exceeded the maximum entry count: `(size, limit)`.
    DictionaryTooLarge(usize, usize),
    /// A stream exceeded the maximum byte size: `(size, limit)`.
    StreamTooLarge(usize, usize),
    /// Objects were nested too deeply: `(depth, limit)`.
    NestingTooDeep(usize, usize),
    /// An object was referenced too many times: `(count, limit)`.
    TooManyReferences(usize, usize),
    /// A forbidden object type name was encountered.
    ForbiddenObjectType(String),
    /// A forbidden dictionary key was encountered.
    ForbiddenDictionaryKey(String),
    /// A string matched a forbidden pattern: `(pattern, full content)`.
    SuspiciousPattern(String, String),
    /// A script matched a dangerous JavaScript pattern; holds the script.
    MaliciousJavaScript(String),
    /// A form-related entry was considered dangerous; holds a description.
    DangerousForm(String),
    /// An annotation-related entry was considered suspicious; holds a description.
    SuspiciousAnnotation(String),
    /// The document has too many pages: `(count, limit)`.
    TooManyPages(usize, usize),
    /// The file is too large: `(size, limit)`.
    FileTooLarge(usize, usize),
}

/// Returns at most the first 100 bytes of `text` for use in messages.
///
/// The cut is moved back to the nearest character boundary so that slicing
/// never panics on multi-byte UTF-8 content.
fn display_excerpt(text: &str) -> &str {
    const MAX_BYTES: usize = 100;

    if text.len() <= MAX_BYTES {
        return text;
    }

    // Index 0 is always a boundary, so this loop terminates.
    let mut end = MAX_BYTES;
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    &text[..end]
}

impl std::fmt::Display for SecurityViolation {
    /// Writes a one-line, human-readable description of the violation.
    ///
    /// Potentially long payloads (matched content, scripts) are shortened to
    /// an excerpt of at most 100 bytes.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            SecurityViolation::StringTooLong(len, max) => {
                write!(f, "String too long: {} > {} characters", len, max)
            }
            SecurityViolation::ArrayTooLarge(size, max) => {
                write!(f, "Array too large: {} > {} elements", size, max)
            }
            SecurityViolation::DictionaryTooLarge(size, max) => {
                write!(f, "Dictionary too large: {} > {} entries", size, max)
            }
            SecurityViolation::StreamTooLarge(size, max) => {
                write!(f, "Stream too large: {} > {} bytes", size, max)
            }
            SecurityViolation::NestingTooDeep(depth, max) => {
                write!(f, "Nesting too deep: {} > {} levels", depth, max)
            }
            SecurityViolation::TooManyReferences(count, max) => {
                write!(f, "Too many references: {} > {}", count, max)
            }
            SecurityViolation::ForbiddenObjectType(obj_type) => {
                write!(f, "Forbidden object type: {}", obj_type)
            }
            SecurityViolation::ForbiddenDictionaryKey(key) => {
                write!(f, "Forbidden dictionary key: {}", key)
            }
            SecurityViolation::SuspiciousPattern(pattern, content) => write!(
                f,
                "Suspicious pattern '{}' in content: {}",
                pattern,
                display_excerpt(content)
            ),
            SecurityViolation::MaliciousJavaScript(script) => write!(
                f,
                "Malicious JavaScript detected: {}",
                display_excerpt(script)
            ),
            SecurityViolation::DangerousForm(form_desc) => {
                write!(f, "Dangerous form field: {}", form_desc)
            }
            SecurityViolation::SuspiciousAnnotation(annot_desc) => {
                write!(f, "Suspicious annotation: {}", annot_desc)
            }
            SecurityViolation::TooManyPages(count, max) => {
                write!(f, "Too many pages: {} > {}", count, max)
            }
            SecurityViolation::FileTooLarge(size, max) => {
                write!(f, "File too large: {} > {} bytes", size, max)
            }
        }
    }
}

impl std::error::Error for SecurityViolation {}
213
/// Walks PDF values and rejects anything that violates a `SecurityLimits`
/// policy, while accumulating counters that can be read back through
/// `get_statistics`.
pub struct SecurityValidator {
    /// Policy this validator enforces.
    limits: SecurityLimits,
    /// Guard created with the label "security_validation"; its stats are
    /// exposed through `get_statistics`.
    performance_guard: PerformanceGuard,
    /// Number of times each object ID has been seen as a reference.
    reference_counts: HashMap<ObjectId, usize>,
    /// Depth value reported as `max_depth_reached` by `get_statistics`.
    current_depth: usize,
    /// Number of dictionaries with `Type` = `Page` seen so far.
    page_count: usize,
}
221
222impl SecurityValidator {
223    pub fn new(limits: SecurityLimits, performance_limits: PerformanceLimits) -> Self {
224        Self {
225            limits,
226            performance_guard: PerformanceGuard::new(performance_limits, "security_validation"),
227            reference_counts: HashMap::new(),
228            current_depth: 0,
229            page_count: 0,
230        }
231    }
232
233    pub fn validate_file_size(&self, size: usize) -> Result<(), SecurityViolation> {
234        if size > self.limits.max_file_size {
235            return Err(SecurityViolation::FileTooLarge(
236                size,
237                self.limits.max_file_size,
238            ));
239        }
240        Ok(())
241    }
242
243    pub fn validate_value(&mut self, value: &PdfValue) -> Result<(), SecurityViolation> {
244        self.validate_value_recursive(value, 0)
245    }
246
247    fn validate_value_recursive(
248        &mut self,
249        value: &PdfValue,
250        depth: usize,
251    ) -> Result<(), SecurityViolation> {
252        // Check nesting depth
253        if depth > self.limits.max_nesting_depth {
254            return Err(SecurityViolation::NestingTooDeep(
255                depth,
256                self.limits.max_nesting_depth,
257            ));
258        }
259
260        match value {
261            PdfValue::String(s) => self.validate_string(s),
262            PdfValue::Array(arr) => self.validate_array(arr, depth),
263            PdfValue::Dictionary(dict) => self.validate_dictionary(dict, depth),
264            PdfValue::Stream(stream) => self.validate_stream(stream, depth),
265            PdfValue::Reference(reference) => self.validate_reference(&reference.id()),
266            PdfValue::Name(name) => self.validate_name(name),
267            _ => Ok(()),
268        }
269    }
270
271    fn validate_string(&self, string: &PdfString) -> Result<(), SecurityViolation> {
272        let content = string.to_string_lossy();
273
274        // Check length
275        if content.len() > self.limits.max_string_length {
276            return Err(SecurityViolation::StringTooLong(
277                content.len(),
278                self.limits.max_string_length,
279            ));
280        }
281
282        // Check for forbidden patterns
283        for pattern in &self.limits.forbidden_patterns {
284            if let Some(_matched) = pattern.find(&content) {
285                return Err(SecurityViolation::SuspiciousPattern(
286                    pattern.as_str().to_string(),
287                    content,
288                ));
289            }
290        }
291
292        Ok(())
293    }
294
295    fn validate_array(&mut self, array: &PdfArray, depth: usize) -> Result<(), SecurityViolation> {
296        // Check size
297        if array.len() > self.limits.max_array_size {
298            return Err(SecurityViolation::ArrayTooLarge(
299                array.len(),
300                self.limits.max_array_size,
301            ));
302        }
303
304        // Validate each element
305        for element in array.iter() {
306            self.validate_value_recursive(element, depth + 1)?;
307        }
308
309        Ok(())
310    }
311
312    fn validate_dictionary(
313        &mut self,
314        dict: &PdfDictionary,
315        depth: usize,
316    ) -> Result<(), SecurityViolation> {
317        self.validate_dictionary_size(dict)?;
318
319        for (key, value) in dict.iter() {
320            self.validate_forbidden_key(key)?;
321            self.validate_forbidden_object_type(key, value)?;
322            self.validate_content_security(key, value)?;
323            self.validate_value_recursive(value, depth + 1)?;
324        }
325
326        self.update_page_count_if_page(dict)
327    }
328
329    fn validate_dictionary_size(&self, dict: &PdfDictionary) -> Result<(), SecurityViolation> {
330        if dict.len() > self.limits.max_dictionary_size {
331            return Err(SecurityViolation::DictionaryTooLarge(
332                dict.len(),
333                self.limits.max_dictionary_size,
334            ));
335        }
336        Ok(())
337    }
338
339    fn validate_forbidden_key(&self, key: &PdfName) -> Result<(), SecurityViolation> {
340        if self.limits.forbidden_keys.contains(&key.to_string()) {
341            return Err(SecurityViolation::ForbiddenDictionaryKey(key.to_string()));
342        }
343        Ok(())
344    }
345
346    fn validate_forbidden_object_type(
347        &self,
348        key: &PdfName,
349        value: &PdfValue,
350    ) -> Result<(), SecurityViolation> {
351        if key != "Type" && key != "S" {
352            return Ok(());
353        }
354
355        if let PdfValue::Name(type_name) = value {
356            let type_str = type_name.without_slash();
357            if self.limits.forbidden_types.contains(type_str) {
358                return Err(SecurityViolation::ForbiddenObjectType(type_str.to_string()));
359            }
360        }
361        Ok(())
362    }
363
364    fn validate_content_security(
365        &self,
366        key: &PdfName,
367        value: &PdfValue,
368    ) -> Result<(), SecurityViolation> {
369        if self.limits.validate_javascript {
370            self.validate_javascript_content(key.as_str(), value)?;
371        }
372        if self.limits.validate_forms {
373            self.validate_form_content(key.as_str(), value)?;
374        }
375        if self.limits.validate_annotations {
376            self.validate_annotation_content(key.as_str(), value)?;
377        }
378        Ok(())
379    }
380
381    fn update_page_count_if_page(&mut self, dict: &PdfDictionary) -> Result<(), SecurityViolation> {
382        let is_page = dict
383            .get("Type")
384            .and_then(|v| v.as_name())
385            .map(|n| n.without_slash())
386            == Some("Page");
387
388        if !is_page {
389            return Ok(());
390        }
391
392        self.page_count += 1;
393        if self.page_count > self.limits.max_pages {
394            return Err(SecurityViolation::TooManyPages(
395                self.page_count,
396                self.limits.max_pages,
397            ));
398        }
399        Ok(())
400    }
401
402    fn validate_stream(
403        &mut self,
404        stream: &crate::types::PdfStream,
405        depth: usize,
406    ) -> Result<(), SecurityViolation> {
407        // Check stream size
408        if stream.data.len() > self.limits.max_stream_size {
409            return Err(SecurityViolation::StreamTooLarge(
410                stream.data.len(),
411                self.limits.max_stream_size,
412            ));
413        }
414
415        // Validate stream dictionary
416        self.validate_dictionary(&stream.dict, depth)
417    }
418
419    fn validate_reference(
420        &mut self,
421        reference: &crate::types::ObjectId,
422    ) -> Result<(), SecurityViolation> {
423        // Count reference usage to detect potential DoS
424        let count = self.reference_counts.entry(*reference).or_insert(0);
425        *count += 1;
426
427        if *count > self.limits.max_reference_count {
428            return Err(SecurityViolation::TooManyReferences(
429                *count,
430                self.limits.max_reference_count,
431            ));
432        }
433
434        Ok(())
435    }
436
437    fn validate_name(&self, name: &PdfName) -> Result<(), SecurityViolation> {
438        let name_str = name.without_slash();
439
440        // Check for forbidden object types
441        if self.limits.forbidden_types.contains(name_str) {
442            return Err(SecurityViolation::ForbiddenObjectType(name_str.to_string()));
443        }
444
445        Ok(())
446    }
447
448    fn validate_javascript_content(
449        &self,
450        key: &str,
451        value: &PdfValue,
452    ) -> Result<(), SecurityViolation> {
453        if key == "JS" || key == "JavaScript" {
454            if let Some(content) = value.as_string() {
455                let script = content.to_string_lossy();
456
457                // Basic JavaScript security checks
458                let dangerous_patterns = [
459                    r"eval\s*\(",
460                    r"Function\s*\(",
461                    r"document\.",
462                    r"window\.",
463                    r"XMLHttpRequest",
464                    r"fetch\s*\(",
465                    r"\.innerHTML",
466                    r"\.outerHTML",
467                    r"createElement",
468                ];
469
470                for pattern in &dangerous_patterns {
471                    if let Ok(regex) = Regex::new(pattern) {
472                        if regex.is_match(&script) {
473                            return Err(SecurityViolation::MaliciousJavaScript(script));
474                        }
475                    }
476                }
477            }
478        }
479        Ok(())
480    }
481
482    fn validate_form_content(&self, key: &str, value: &PdfValue) -> Result<(), SecurityViolation> {
483        if key == "FT" || key == "Ff" {
484            // Check for dangerous form field types
485            if let Some(field_type) = value.as_name() {
486                let ft = field_type.without_slash();
487                if ft == "Sig" && key == "FT" {
488                    // Signature fields can be dangerous
489                    return Err(SecurityViolation::DangerousForm(
490                        "Signature field detected".to_string(),
491                    ));
492                }
493            }
494        }
495
496        if key == "A" || key == "AA" {
497            // Action dictionaries in forms can execute code
498            return Err(SecurityViolation::DangerousForm(
499                "Form action detected".to_string(),
500            ));
501        }
502
503        Ok(())
504    }
505
506    fn validate_annotation_content(
507        &self,
508        key: &str,
509        value: &PdfValue,
510    ) -> Result<(), SecurityViolation> {
511        if key == "Subtype" && value.as_name().map(|n| n.without_slash()) == Some("Widget") {
512            // Widget annotations can be interactive
513            return Err(SecurityViolation::SuspiciousAnnotation(
514                "Interactive widget annotation".to_string(),
515            ));
516        }
517
518        if key == "A" || key == "AA" {
519            // Action dictionaries in annotations
520            return Err(SecurityViolation::SuspiciousAnnotation(
521                "Annotation with actions".to_string(),
522            ));
523        }
524
525        if key == "Movie" || key == "Sound" {
526            // Multimedia annotations
527            return Err(SecurityViolation::SuspiciousAnnotation(
528                "Multimedia annotation".to_string(),
529            ));
530        }
531
532        Ok(())
533    }
534
535    pub fn get_statistics(&self) -> SecurityStatistics {
536        SecurityStatistics {
537            reference_counts: self.reference_counts.clone(),
538            page_count: self.page_count,
539            max_depth_reached: self.current_depth,
540            performance_stats: self.performance_guard.get_stats(),
541        }
542    }
543}
544
/// Snapshot of the counters collected by a `SecurityValidator`, as returned
/// by `SecurityValidator::get_statistics`.
#[derive(Debug, Clone)]
pub struct SecurityStatistics {
    /// How many times each object ID was seen as a reference.
    pub reference_counts: HashMap<ObjectId, usize>,
    /// Number of page dictionaries counted.
    pub page_count: usize,
    /// Copied from the validator's `current_depth` field.
    pub max_depth_reached: usize,
    /// Stats reported by the validator's performance guard.
    pub performance_stats: crate::performance::limits::PerformanceStats,
}
552
/// Sanitize PDF content by removing dangerous elements.
///
/// Works in place on `PdfValue`s: forbidden keys and types are removed,
/// forbidden string patterns are replaced, and oversized strings, arrays and
/// streams are truncated according to the configured `SecurityLimits`.
pub struct PdfSanitizer {
    /// Policy that decides what gets removed, replaced or truncated.
    limits: SecurityLimits,
}
557
558impl PdfSanitizer {
559    pub fn new(limits: SecurityLimits) -> Self {
560        Self { limits }
561    }
562
563    pub fn sanitize_value(&self, value: &mut PdfValue) -> bool {
564        match value {
565            PdfValue::String(s) => self.sanitize_string(s),
566            PdfValue::Array(arr) => self.sanitize_array(arr),
567            PdfValue::Dictionary(dict) => self.sanitize_dictionary(dict),
568            PdfValue::Stream(stream) => self.sanitize_stream(stream),
569            _ => true,
570        }
571    }
572
573    fn sanitize_string(&self, string: &mut PdfString) -> bool {
574        let mut content = string.to_string_lossy();
575        let _original_len = content.len();
576
577        // Remove content matching forbidden patterns
578        for pattern in &self.limits.forbidden_patterns {
579            content = pattern.replace_all(&content, "[SANITIZED]").to_string();
580        }
581
582        // Truncate if too long
583        if content.len() > self.limits.max_string_length {
584            content.truncate(self.limits.max_string_length);
585            content.push_str("[TRUNCATED]");
586        }
587
588        if content != string.to_string_lossy() {
589            *string = PdfString::new_literal(content.as_bytes());
590            return false; // Content was modified
591        }
592
593        true
594    }
595
596    fn sanitize_array(&self, array: &mut PdfArray) -> bool {
597        let mut all_clean = true;
598
599        // Truncate if too large
600        if array.len() > self.limits.max_array_size {
601            array.truncate(self.limits.max_array_size);
602            all_clean = false;
603        }
604
605        // Sanitize each element
606        for element in array.iter_mut() {
607            if !self.sanitize_value(element) {
608                all_clean = false;
609            }
610        }
611
612        all_clean
613    }
614
615    fn sanitize_dictionary(&self, dict: &mut PdfDictionary) -> bool {
616        let keys_removed = self.remove_forbidden_keys(dict);
617        let type_removed = self.remove_forbidden_type(dict);
618        let values_modified = self.sanitize_dictionary_values(dict);
619
620        !keys_removed && !type_removed && !values_modified
621    }
622
623    fn remove_forbidden_keys(&self, dict: &mut PdfDictionary) -> bool {
624        let keys_to_remove: Vec<_> = dict
625            .keys()
626            .filter(|key| self.limits.forbidden_keys.contains(key.without_slash()))
627            .cloned()
628            .collect();
629
630        let removed_any = !keys_to_remove.is_empty();
631        for key in keys_to_remove {
632            dict.remove(key.as_str());
633        }
634        removed_any
635    }
636
637    fn remove_forbidden_type(&self, dict: &mut PdfDictionary) -> bool {
638        let should_remove = dict
639            .get("Type")
640            .and_then(|v| v.as_name())
641            .map(|type_name| {
642                self.limits
643                    .forbidden_types
644                    .contains(type_name.without_slash())
645            })
646            .unwrap_or(false);
647
648        if should_remove {
649            dict.remove("Type");
650        }
651        should_remove
652    }
653
654    fn sanitize_dictionary_values(&self, dict: &mut PdfDictionary) -> bool {
655        let mut any_modified = false;
656        let keys: Vec<_> = dict.keys().cloned().collect();
657
658        for key in keys {
659            if let Some(mut value) = dict.remove(key.as_str()) {
660                if !self.sanitize_value(&mut value) {
661                    any_modified = true;
662                }
663                dict.insert(key, value);
664            }
665        }
666        any_modified
667    }
668
669    fn sanitize_stream(&self, stream: &mut crate::types::PdfStream) -> bool {
670        let mut all_clean = true;
671
672        // Truncate stream data if too large
673        if stream.data.len() > self.limits.max_stream_size {
674            stream.data.truncate(self.limits.max_stream_size);
675            all_clean = false;
676        }
677
678        // Sanitize stream dictionary
679        if !self.sanitize_dictionary(&mut stream.dict) {
680            all_clean = false;
681        }
682
683        all_clean
684    }
685}
686
// Unit tests for the limit presets, the validator and the sanitizer.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::*;

    /// The strict preset keeps the default deny-lists and tightens the
    /// string length limit.
    #[test]
    fn test_security_limits() {
        let limits = SecurityLimits::strict();
        assert!(limits.forbidden_types.contains("JavaScript"));
        assert!(limits.forbidden_keys.contains("JS"));
        assert!(limits.max_string_length < SecurityLimits::default().max_string_length);
    }

    /// Over-long strings and strings matching a forbidden pattern are both
    /// rejected by the validator.
    #[test]
    fn test_string_validation() {
        let limits = SecurityLimits::default();
        let perf_limits = PerformanceLimits::default();
        let validator = SecurityValidator::new(limits, perf_limits);

        // Test long string (2,000,000 bytes vs. the default 1,000,000 limit)
        let long_string = PdfString::new_literal(vec![b'a'; 2_000_000]);
        assert!(validator.validate_string(&long_string).is_err());

        // Test JavaScript pattern
        let js_string = PdfString::new_literal(b"function evil() { eval('bad'); }");
        assert!(validator.validate_string(&js_string).is_err());
    }

    /// The sanitizer removes a forbidden `Type` entry and a forbidden key
    /// and reports the dictionary as modified.
    #[test]
    fn test_sanitizer() {
        let limits = SecurityLimits::default();
        let sanitizer = PdfSanitizer::new(limits);

        let mut dict = PdfDictionary::new();
        dict.insert(
            "Type".to_string(),
            PdfValue::Name(PdfName::new("JavaScript")),
        );
        dict.insert(
            "JS".to_string(),
            PdfValue::String(PdfString::new_literal(b"alert('xss')")),
        );

        let clean = sanitizer.sanitize_dictionary(&mut dict);
        assert!(!clean); // Should not be clean
        assert!(!dict.contains_key("Type")); // JavaScript type should be removed
        assert!(!dict.contains_key("JS")); // JS key should be removed
    }

    /// Repeated references to the same object are counted per object ID.
    #[test]
    fn test_reference_counting() {
        let limits = SecurityLimits::default();
        let perf_limits = PerformanceLimits::default();
        let mut validator = SecurityValidator::new(limits, perf_limits);

        let obj_id = ObjectId {
            number: 1,
            generation: 0,
        };

        // Validate same reference multiple times
        for _ in 0..10 {
            assert!(validator.validate_reference(&obj_id).is_ok());
        }

        assert_eq!(validator.reference_counts[&obj_id], 10);
    }
}