Skip to main content

pdf_ast/parser/
content_analyzer.rs

1use crate::ast::{AstNode, NodeId, NodeMetadata, NodeType};
2use crate::types::{PdfString, PdfValue};
3use std::collections::HashMap;
4
/// Analyzes PDF content streams, extracting content operators and flagging
/// potentially malicious content.
///
/// Detection is keyword based: both lists are matched as plain, case-sensitive
/// substrings of the decoded stream text.
#[derive(Debug, Clone)]
pub struct ContentAnalyzer {
    /// Substrings whose presence marks content as suspicious (script APIs,
    /// shell/command names, PDF action and form dictionary keys).
    suspicious_keywords: Vec<&'static str>,
    /// Substrings used to guess whether a stream contains JavaScript source.
    js_keywords: Vec<&'static str>,
}
10
11impl ContentAnalyzer {
12    pub fn new() -> Self {
13        Self {
14            suspicious_keywords: vec![
15                "eval",
16                "unescape",
17                "fromCharCode",
18                "String.fromCharCode",
19                "document.write",
20                "innerHTML",
21                "createElement",
22                "appendChild",
23                "exec",
24                "system",
25                "shell",
26                "cmd",
27                "powershell",
28                "bash",
29                "ActiveXObject",
30                "WScript",
31                "Shell.Application",
32                "getAnnots",
33                "print",
34                "mailDoc",
35                "importDataObject",
36                "launch",
37                "submitForm",
38                "resetForm",
39                "exportValues",
40                "/F ",
41                "/FT ",
42                "/Ff ",
43                "/V ",
44                "/DV ",
45                "/AA ",
46                "/OpenAction",
47                "/Names",
48                "/AcroForm",
49                "/XFA",
50            ],
51            js_keywords: vec![
52                "function",
53                "var",
54                "let",
55                "const",
56                "if",
57                "for",
58                "while",
59                "try",
60                "catch",
61                "throw",
62                "return",
63                "new",
64                "this",
65                "app.",
66                "doc.",
67                "field.",
68                "event.",
69                "util.",
70                "AFNumber_Format",
71                "AFPercent_Format",
72                "AFDate_Format",
73            ],
74        }
75    }
76
77    /// Analyze a content stream and extract all operators and suspicious content
78    pub fn analyze_content_stream(&self, stream_data: &[u8], node_id: usize) -> Vec<AstNode> {
79        let mut nodes = Vec::new();
80        let mut next_id = node_id;
81
82        // Try to interpret as text first
83        if let Ok(content) = String::from_utf8(stream_data.to_vec()) {
84            nodes.extend(self.parse_text_content(&content, &mut next_id));
85        }
86
87        // Parse as PDF operators
88        nodes.extend(self.parse_pdf_operators(stream_data, &mut next_id));
89
90        nodes
91    }
92
93    /// Parse text content looking for JavaScript and suspicious patterns
94    fn parse_text_content(&self, content: &str, next_id: &mut usize) -> Vec<AstNode> {
95        let mut nodes = Vec::new();
96
97        // Check for JavaScript code
98        if self.contains_javascript(content) {
99            let js_node = self.create_js_node(content, next_id);
100            nodes.push(js_node);
101        }
102
103        // Check for suspicious patterns
104        for suspicious in self.find_suspicious_patterns(content) {
105            let suspicious_node = self.create_suspicious_node(&suspicious, next_id);
106            nodes.push(suspicious_node);
107        }
108
109        // Check for external references (URLs, file paths)
110        for external_ref in self.find_external_references(content) {
111            let ref_node = self.create_external_ref_node(&external_ref, next_id);
112            nodes.push(ref_node);
113        }
114
115        nodes
116    }
117
118    /// Parse PDF content stream operators (BT, ET, Tf, Tj, etc.)
119    fn parse_pdf_operators(&self, data: &[u8], next_id: &mut usize) -> Vec<AstNode> {
120        let mut nodes = Vec::new();
121
122        if let Ok(content) = String::from_utf8(data.to_vec()) {
123            let tokens = self.tokenize_content_stream(&content);
124            let mut i = 0;
125
126            while i < tokens.len() {
127                if let Some(operator) = self.identify_operator(&tokens, i) {
128                    let op_node = self.create_operator_node(&operator, next_id);
129                    nodes.push(op_node);
130                    i += operator.token_count;
131                } else {
132                    i += 1;
133                }
134            }
135        }
136
137        nodes
138    }
139
140    /// Tokenize content stream into PDF tokens
141    fn tokenize_content_stream(&self, content: &str) -> Vec<String> {
142        let mut tokens = Vec::new();
143        let mut current_token = String::new();
144        let mut in_string = false;
145        let mut in_hex_string = false;
146        let mut escape_next = false;
147
148        for ch in content.chars() {
149            match ch {
150                '(' if !in_hex_string && !escape_next => {
151                    if !current_token.is_empty() {
152                        tokens.push(current_token.clone());
153                        current_token.clear();
154                    }
155                    in_string = true;
156                    current_token.push(ch);
157                }
158                ')' if in_string && !escape_next => {
159                    current_token.push(ch);
160                    tokens.push(current_token.clone());
161                    current_token.clear();
162                    in_string = false;
163                }
164                '<' if !in_string => {
165                    if !current_token.is_empty() {
166                        tokens.push(current_token.clone());
167                        current_token.clear();
168                    }
169                    in_hex_string = true;
170                    current_token.push(ch);
171                }
172                '>' if in_hex_string => {
173                    current_token.push(ch);
174                    tokens.push(current_token.clone());
175                    current_token.clear();
176                    in_hex_string = false;
177                }
178                '\\' if in_string => {
179                    current_token.push(ch);
180                    escape_next = true;
181                }
182                c if c.is_whitespace() && !in_string && !in_hex_string => {
183                    if !current_token.is_empty() {
184                        tokens.push(current_token.clone());
185                        current_token.clear();
186                    }
187                }
188                _ => {
189                    current_token.push(ch);
190                    if escape_next {
191                        escape_next = false;
192                    }
193                }
194            }
195        }
196
197        if !current_token.is_empty() {
198            tokens.push(current_token);
199        }
200
201        tokens
202    }
203
204    /// Identify PDF operators and their operands
205    fn identify_operator(&self, tokens: &[String], index: usize) -> Option<ContentOperator> {
206        if index >= tokens.len() {
207            return None;
208        }
209
210        let token = &tokens[index];
211
212        match token.as_str() {
213            // Text operators
214            "BT" => Some(ContentOperator {
215                operator: "BT".to_string(),
216                operands: vec![],
217                operator_type: OperatorType::TextBegin,
218                token_count: 1,
219                suspicious: false,
220            }),
221            "ET" => Some(ContentOperator {
222                operator: "ET".to_string(),
223                operands: vec![],
224                operator_type: OperatorType::TextEnd,
225                token_count: 1,
226                suspicious: false,
227            }),
228            "Tf" if index >= 2 => Some(ContentOperator {
229                operator: "Tf".to_string(),
230                operands: vec![tokens[index - 2].clone(), tokens[index - 1].clone()],
231                operator_type: OperatorType::TextFont,
232                token_count: 3,
233                suspicious: false,
234            }),
235            "Tj" if index >= 1 => Some(ContentOperator {
236                operator: "Tj".to_string(),
237                operands: vec![tokens[index - 1].clone()],
238                operator_type: OperatorType::TextShow,
239                token_count: 2,
240                suspicious: self.is_suspicious_text(&tokens[index - 1]),
241            }),
242            "TJ" if index >= 1 => Some(ContentOperator {
243                operator: "TJ".to_string(),
244                operands: vec![tokens[index - 1].clone()],
245                operator_type: OperatorType::TextShowArray,
246                token_count: 2,
247                suspicious: self.is_suspicious_text(&tokens[index - 1]),
248            }),
249            // Graphics operators
250            "q" => Some(ContentOperator {
251                operator: "q".to_string(),
252                operands: vec![],
253                operator_type: OperatorType::GraphicsSave,
254                token_count: 1,
255                suspicious: false,
256            }),
257            "Q" => Some(ContentOperator {
258                operator: "Q".to_string(),
259                operands: vec![],
260                operator_type: OperatorType::GraphicsRestore,
261                token_count: 1,
262                suspicious: false,
263            }),
264            // XObject operators
265            "Do" if index >= 1 => Some(ContentOperator {
266                operator: "Do".to_string(),
267                operands: vec![tokens[index - 1].clone()],
268                operator_type: OperatorType::XObject,
269                token_count: 2,
270                suspicious: false,
271            }),
272            _ => None,
273        }
274    }
275
276    /// Check if text content contains JavaScript
277    fn contains_javascript(&self, content: &str) -> bool {
278        let js_count = self
279            .js_keywords
280            .iter()
281            .filter(|&keyword| content.contains(keyword))
282            .count();
283
284        js_count >= 2 || content.contains("function") || content.contains("eval")
285    }
286
287    /// Find suspicious patterns in content
288    fn find_suspicious_patterns(&self, content: &str) -> Vec<SuspiciousPattern> {
289        let mut patterns = Vec::new();
290
291        for &keyword in &self.suspicious_keywords {
292            if content.contains(keyword) {
293                patterns.push(SuspiciousPattern {
294                    pattern: keyword.to_string(),
295                    content: content.to_string(),
296                    risk_level: self.assess_risk_level(keyword),
297                });
298            }
299        }
300
301        patterns
302    }
303
304    /// Find external references (URLs, file paths)
305    fn find_external_references(&self, content: &str) -> Vec<ExternalReference> {
306        let mut refs = Vec::new();
307
308        // Simple URL detection
309        if content.contains("http://") || content.contains("https://") {
310            refs.push(ExternalReference {
311                ref_type: "URL".to_string(),
312                target: content.to_string(),
313                suspicious: true,
314            });
315        }
316
317        // File path detection
318        if content.contains("file://") || content.contains("\\\\") || content.contains("C:\\") {
319            refs.push(ExternalReference {
320                ref_type: "FilePath".to_string(),
321                target: content.to_string(),
322                suspicious: true,
323            });
324        }
325
326        refs
327    }
328
329    fn is_suspicious_text(&self, text: &str) -> bool {
330        self.suspicious_keywords
331            .iter()
332            .any(|&keyword| text.contains(keyword))
333    }
334
335    fn assess_risk_level(&self, keyword: &str) -> RiskLevel {
336        match keyword {
337            k if k.contains("eval") || k.contains("exec") || k.contains("shell") => RiskLevel::High,
338            k if k.contains("ActiveX") || k.contains("launch") || k.contains("system") => {
339                RiskLevel::High
340            }
341            k if k.contains("JavaScript") || k.contains("unescape") => RiskLevel::Medium,
342            _ => RiskLevel::Low,
343        }
344    }
345
346    fn create_js_node(&self, content: &str, next_id: &mut usize) -> AstNode {
347        let node_id = NodeId(*next_id);
348        *next_id += 1;
349
350        let mut properties = HashMap::new();
351        properties.insert("js_content".to_string(), content.to_string());
352        properties.insert("risk_level".to_string(), "high".to_string());
353
354        AstNode::new(
355            node_id,
356            NodeType::EmbeddedJS,
357            PdfValue::String(PdfString::new_literal(content.as_bytes())),
358        )
359        .with_metadata(NodeMetadata {
360            properties,
361            ..Default::default()
362        })
363    }
364
365    fn create_suspicious_node(&self, pattern: &SuspiciousPattern, next_id: &mut usize) -> AstNode {
366        let node_id = NodeId(*next_id);
367        *next_id += 1;
368
369        let mut properties = HashMap::new();
370        properties.insert("pattern".to_string(), pattern.pattern.clone());
371        properties.insert(
372            "risk_level".to_string(),
373            format!("{:?}", pattern.risk_level),
374        );
375
376        AstNode::new(
377            node_id,
378            NodeType::SuspiciousAction,
379            PdfValue::String(PdfString::new_literal(pattern.content.as_bytes())),
380        )
381        .with_metadata(NodeMetadata {
382            properties,
383            ..Default::default()
384        })
385    }
386
387    fn create_external_ref_node(
388        &self,
389        ext_ref: &ExternalReference,
390        next_id: &mut usize,
391    ) -> AstNode {
392        let node_id = NodeId(*next_id);
393        *next_id += 1;
394
395        let mut properties = HashMap::new();
396        properties.insert("ref_type".to_string(), ext_ref.ref_type.clone());
397        properties.insert("target".to_string(), ext_ref.target.clone());
398        properties.insert("suspicious".to_string(), ext_ref.suspicious.to_string());
399
400        AstNode::new(
401            node_id,
402            NodeType::ExternalReference,
403            PdfValue::String(PdfString::new_literal(ext_ref.target.as_bytes())),
404        )
405        .with_metadata(NodeMetadata {
406            properties,
407            ..Default::default()
408        })
409    }
410
411    fn create_operator_node(&self, operator: &ContentOperator, next_id: &mut usize) -> AstNode {
412        let node_id = NodeId(*next_id);
413        *next_id += 1;
414
415        let mut properties = HashMap::new();
416        properties.insert("operator".to_string(), operator.operator.clone());
417        properties.insert("operands".to_string(), operator.operands.join(" "));
418        properties.insert("type".to_string(), format!("{:?}", operator.operator_type));
419        properties.insert("suspicious".to_string(), operator.suspicious.to_string());
420
421        let node_type = match operator.operator_type {
422            OperatorType::TextBegin
423            | OperatorType::TextEnd
424            | OperatorType::TextFont
425            | OperatorType::TextShow
426            | OperatorType::TextShowArray => NodeType::TextOperator,
427            OperatorType::GraphicsSave | OperatorType::GraphicsRestore => {
428                NodeType::GraphicsOperator
429            }
430            _ => NodeType::ContentOperator,
431        };
432
433        AstNode::new(
434            node_id,
435            node_type,
436            PdfValue::String(PdfString::new_literal(operator.operator.as_bytes())),
437        )
438        .with_metadata(NodeMetadata {
439            properties,
440            ..Default::default()
441        })
442    }
443}
444
445#[derive(Debug, Clone)]
446pub struct ContentOperator {
447    pub operator: String,
448    pub operands: Vec<String>,
449    pub operator_type: OperatorType,
450    pub token_count: usize,
451    pub suspicious: bool,
452}
453
/// Category of a content stream operator.
///
/// The type is a plain tag, so it is `Copy` and can be compared and hashed
/// directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum OperatorType {
    /// `BT` — begin a text object.
    TextBegin,
    /// `ET` — end a text object.
    TextEnd,
    /// `Tf` — select font and size.
    TextFont,
    /// `Tj` — show a text string.
    TextShow,
    /// `TJ` — show text from an array.
    TextShowArray,
    /// `q` — save the graphics state.
    GraphicsSave,
    /// `Q` — restore the graphics state.
    GraphicsRestore,
    /// `Do` — paint an XObject.
    XObject,
    /// Any operator without a dedicated category.
    Other,
}
466
467#[derive(Debug, Clone)]
468pub struct SuspiciousPattern {
469    pub pattern: String,
470    pub content: String,
471    pub risk_level: RiskLevel,
472}
473
/// A reference to something outside the document (URL or file path).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExternalReference {
    /// Kind of reference, for example `"URL"` or `"FilePath"`.
    pub ref_type: String,
    /// The text the reference was detected in.
    pub target: String,
    /// Whether the reference should be treated as suspicious.
    pub suspicious: bool,
}
480
/// Severity rating attached to a finding.
///
/// Variants are declared from least to most severe, so the derived ordering
/// gives `Low < Medium < High < Critical` and findings can be compared or
/// reduced with `max`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum RiskLevel {
    /// Least severe rating.
    Low,
    /// Above `Low`, below `High`.
    Medium,
    /// Above `Medium`, below `Critical`.
    High,
    /// Most severe rating.
    Critical,
}
488
489impl Default for ContentAnalyzer {
490    fn default() -> Self {
491        Self::new()
492    }
493}