1use crate::ast::{AstNode, NodeId, NodeMetadata, NodeType};
2use crate::types::{PdfString, PdfValue};
3use std::collections::HashMap;
4
/// Heuristic scanner for PDF content streams.
///
/// Holds the two keyword tables that the substring-based checks in the
/// analyzer's methods are driven by.
#[derive(Debug, Clone)]
pub struct ContentAnalyzer {
    /// Substrings whose presence in a stream is reported as a suspicious pattern.
    suspicious_keywords: Vec<&'static str>,
    /// Substrings counted when deciding whether a stream looks like JavaScript.
    js_keywords: Vec<&'static str>,
}
10
11impl ContentAnalyzer {
12 pub fn new() -> Self {
13 Self {
14 suspicious_keywords: vec![
15 "eval",
16 "unescape",
17 "fromCharCode",
18 "String.fromCharCode",
19 "document.write",
20 "innerHTML",
21 "createElement",
22 "appendChild",
23 "exec",
24 "system",
25 "shell",
26 "cmd",
27 "powershell",
28 "bash",
29 "ActiveXObject",
30 "WScript",
31 "Shell.Application",
32 "getAnnots",
33 "print",
34 "mailDoc",
35 "importDataObject",
36 "launch",
37 "submitForm",
38 "resetForm",
39 "exportValues",
40 "/F ",
41 "/FT ",
42 "/Ff ",
43 "/V ",
44 "/DV ",
45 "/AA ",
46 "/OpenAction",
47 "/Names",
48 "/AcroForm",
49 "/XFA",
50 ],
51 js_keywords: vec![
52 "function",
53 "var",
54 "let",
55 "const",
56 "if",
57 "for",
58 "while",
59 "try",
60 "catch",
61 "throw",
62 "return",
63 "new",
64 "this",
65 "app.",
66 "doc.",
67 "field.",
68 "event.",
69 "util.",
70 "AFNumber_Format",
71 "AFPercent_Format",
72 "AFDate_Format",
73 ],
74 }
75 }
76
77 pub fn analyze_content_stream(&self, stream_data: &[u8], node_id: usize) -> Vec<AstNode> {
79 let mut nodes = Vec::new();
80 let mut next_id = node_id;
81
82 if let Ok(content) = String::from_utf8(stream_data.to_vec()) {
84 nodes.extend(self.parse_text_content(&content, &mut next_id));
85 }
86
87 nodes.extend(self.parse_pdf_operators(stream_data, &mut next_id));
89
90 nodes
91 }
92
93 fn parse_text_content(&self, content: &str, next_id: &mut usize) -> Vec<AstNode> {
95 let mut nodes = Vec::new();
96
97 if self.contains_javascript(content) {
99 let js_node = self.create_js_node(content, next_id);
100 nodes.push(js_node);
101 }
102
103 for suspicious in self.find_suspicious_patterns(content) {
105 let suspicious_node = self.create_suspicious_node(&suspicious, next_id);
106 nodes.push(suspicious_node);
107 }
108
109 for external_ref in self.find_external_references(content) {
111 let ref_node = self.create_external_ref_node(&external_ref, next_id);
112 nodes.push(ref_node);
113 }
114
115 nodes
116 }
117
118 fn parse_pdf_operators(&self, data: &[u8], next_id: &mut usize) -> Vec<AstNode> {
120 let mut nodes = Vec::new();
121
122 if let Ok(content) = String::from_utf8(data.to_vec()) {
123 let tokens = self.tokenize_content_stream(&content);
124 let mut i = 0;
125
126 while i < tokens.len() {
127 if let Some(operator) = self.identify_operator(&tokens, i) {
128 let op_node = self.create_operator_node(&operator, next_id);
129 nodes.push(op_node);
130 i += operator.token_count;
131 } else {
132 i += 1;
133 }
134 }
135 }
136
137 nodes
138 }
139
140 fn tokenize_content_stream(&self, content: &str) -> Vec<String> {
142 let mut tokens = Vec::new();
143 let mut current_token = String::new();
144 let mut in_string = false;
145 let mut in_hex_string = false;
146 let mut escape_next = false;
147
148 for ch in content.chars() {
149 match ch {
150 '(' if !in_hex_string && !escape_next => {
151 if !current_token.is_empty() {
152 tokens.push(current_token.clone());
153 current_token.clear();
154 }
155 in_string = true;
156 current_token.push(ch);
157 }
158 ')' if in_string && !escape_next => {
159 current_token.push(ch);
160 tokens.push(current_token.clone());
161 current_token.clear();
162 in_string = false;
163 }
164 '<' if !in_string => {
165 if !current_token.is_empty() {
166 tokens.push(current_token.clone());
167 current_token.clear();
168 }
169 in_hex_string = true;
170 current_token.push(ch);
171 }
172 '>' if in_hex_string => {
173 current_token.push(ch);
174 tokens.push(current_token.clone());
175 current_token.clear();
176 in_hex_string = false;
177 }
178 '\\' if in_string => {
179 current_token.push(ch);
180 escape_next = true;
181 }
182 c if c.is_whitespace() && !in_string && !in_hex_string => {
183 if !current_token.is_empty() {
184 tokens.push(current_token.clone());
185 current_token.clear();
186 }
187 }
188 _ => {
189 current_token.push(ch);
190 if escape_next {
191 escape_next = false;
192 }
193 }
194 }
195 }
196
197 if !current_token.is_empty() {
198 tokens.push(current_token);
199 }
200
201 tokens
202 }
203
204 fn identify_operator(&self, tokens: &[String], index: usize) -> Option<ContentOperator> {
206 if index >= tokens.len() {
207 return None;
208 }
209
210 let token = &tokens[index];
211
212 match token.as_str() {
213 "BT" => Some(ContentOperator {
215 operator: "BT".to_string(),
216 operands: vec![],
217 operator_type: OperatorType::TextBegin,
218 token_count: 1,
219 suspicious: false,
220 }),
221 "ET" => Some(ContentOperator {
222 operator: "ET".to_string(),
223 operands: vec![],
224 operator_type: OperatorType::TextEnd,
225 token_count: 1,
226 suspicious: false,
227 }),
228 "Tf" if index >= 2 => Some(ContentOperator {
229 operator: "Tf".to_string(),
230 operands: vec![tokens[index - 2].clone(), tokens[index - 1].clone()],
231 operator_type: OperatorType::TextFont,
232 token_count: 3,
233 suspicious: false,
234 }),
235 "Tj" if index >= 1 => Some(ContentOperator {
236 operator: "Tj".to_string(),
237 operands: vec![tokens[index - 1].clone()],
238 operator_type: OperatorType::TextShow,
239 token_count: 2,
240 suspicious: self.is_suspicious_text(&tokens[index - 1]),
241 }),
242 "TJ" if index >= 1 => Some(ContentOperator {
243 operator: "TJ".to_string(),
244 operands: vec![tokens[index - 1].clone()],
245 operator_type: OperatorType::TextShowArray,
246 token_count: 2,
247 suspicious: self.is_suspicious_text(&tokens[index - 1]),
248 }),
249 "q" => Some(ContentOperator {
251 operator: "q".to_string(),
252 operands: vec![],
253 operator_type: OperatorType::GraphicsSave,
254 token_count: 1,
255 suspicious: false,
256 }),
257 "Q" => Some(ContentOperator {
258 operator: "Q".to_string(),
259 operands: vec![],
260 operator_type: OperatorType::GraphicsRestore,
261 token_count: 1,
262 suspicious: false,
263 }),
264 "Do" if index >= 1 => Some(ContentOperator {
266 operator: "Do".to_string(),
267 operands: vec![tokens[index - 1].clone()],
268 operator_type: OperatorType::XObject,
269 token_count: 2,
270 suspicious: false,
271 }),
272 _ => None,
273 }
274 }
275
276 fn contains_javascript(&self, content: &str) -> bool {
278 let js_count = self
279 .js_keywords
280 .iter()
281 .filter(|&keyword| content.contains(keyword))
282 .count();
283
284 js_count >= 2 || content.contains("function") || content.contains("eval")
285 }
286
287 fn find_suspicious_patterns(&self, content: &str) -> Vec<SuspiciousPattern> {
289 let mut patterns = Vec::new();
290
291 for &keyword in &self.suspicious_keywords {
292 if content.contains(keyword) {
293 patterns.push(SuspiciousPattern {
294 pattern: keyword.to_string(),
295 content: content.to_string(),
296 risk_level: self.assess_risk_level(keyword),
297 });
298 }
299 }
300
301 patterns
302 }
303
304 fn find_external_references(&self, content: &str) -> Vec<ExternalReference> {
306 let mut refs = Vec::new();
307
308 if content.contains("http://") || content.contains("https://") {
310 refs.push(ExternalReference {
311 ref_type: "URL".to_string(),
312 target: content.to_string(),
313 suspicious: true,
314 });
315 }
316
317 if content.contains("file://") || content.contains("\\\\") || content.contains("C:\\") {
319 refs.push(ExternalReference {
320 ref_type: "FilePath".to_string(),
321 target: content.to_string(),
322 suspicious: true,
323 });
324 }
325
326 refs
327 }
328
329 fn is_suspicious_text(&self, text: &str) -> bool {
330 self.suspicious_keywords
331 .iter()
332 .any(|&keyword| text.contains(keyword))
333 }
334
335 fn assess_risk_level(&self, keyword: &str) -> RiskLevel {
336 match keyword {
337 k if k.contains("eval") || k.contains("exec") || k.contains("shell") => RiskLevel::High,
338 k if k.contains("ActiveX") || k.contains("launch") || k.contains("system") => {
339 RiskLevel::High
340 }
341 k if k.contains("JavaScript") || k.contains("unescape") => RiskLevel::Medium,
342 _ => RiskLevel::Low,
343 }
344 }
345
346 fn create_js_node(&self, content: &str, next_id: &mut usize) -> AstNode {
347 let node_id = NodeId(*next_id);
348 *next_id += 1;
349
350 let mut properties = HashMap::new();
351 properties.insert("js_content".to_string(), content.to_string());
352 properties.insert("risk_level".to_string(), "high".to_string());
353
354 AstNode::new(
355 node_id,
356 NodeType::EmbeddedJS,
357 PdfValue::String(PdfString::new_literal(content.as_bytes())),
358 )
359 .with_metadata(NodeMetadata {
360 properties,
361 ..Default::default()
362 })
363 }
364
365 fn create_suspicious_node(&self, pattern: &SuspiciousPattern, next_id: &mut usize) -> AstNode {
366 let node_id = NodeId(*next_id);
367 *next_id += 1;
368
369 let mut properties = HashMap::new();
370 properties.insert("pattern".to_string(), pattern.pattern.clone());
371 properties.insert(
372 "risk_level".to_string(),
373 format!("{:?}", pattern.risk_level),
374 );
375
376 AstNode::new(
377 node_id,
378 NodeType::SuspiciousAction,
379 PdfValue::String(PdfString::new_literal(pattern.content.as_bytes())),
380 )
381 .with_metadata(NodeMetadata {
382 properties,
383 ..Default::default()
384 })
385 }
386
387 fn create_external_ref_node(
388 &self,
389 ext_ref: &ExternalReference,
390 next_id: &mut usize,
391 ) -> AstNode {
392 let node_id = NodeId(*next_id);
393 *next_id += 1;
394
395 let mut properties = HashMap::new();
396 properties.insert("ref_type".to_string(), ext_ref.ref_type.clone());
397 properties.insert("target".to_string(), ext_ref.target.clone());
398 properties.insert("suspicious".to_string(), ext_ref.suspicious.to_string());
399
400 AstNode::new(
401 node_id,
402 NodeType::ExternalReference,
403 PdfValue::String(PdfString::new_literal(ext_ref.target.as_bytes())),
404 )
405 .with_metadata(NodeMetadata {
406 properties,
407 ..Default::default()
408 })
409 }
410
411 fn create_operator_node(&self, operator: &ContentOperator, next_id: &mut usize) -> AstNode {
412 let node_id = NodeId(*next_id);
413 *next_id += 1;
414
415 let mut properties = HashMap::new();
416 properties.insert("operator".to_string(), operator.operator.clone());
417 properties.insert("operands".to_string(), operator.operands.join(" "));
418 properties.insert("type".to_string(), format!("{:?}", operator.operator_type));
419 properties.insert("suspicious".to_string(), operator.suspicious.to_string());
420
421 let node_type = match operator.operator_type {
422 OperatorType::TextBegin
423 | OperatorType::TextEnd
424 | OperatorType::TextFont
425 | OperatorType::TextShow
426 | OperatorType::TextShowArray => NodeType::TextOperator,
427 OperatorType::GraphicsSave | OperatorType::GraphicsRestore => {
428 NodeType::GraphicsOperator
429 }
430 _ => NodeType::ContentOperator,
431 };
432
433 AstNode::new(
434 node_id,
435 node_type,
436 PdfValue::String(PdfString::new_literal(operator.operator.as_bytes())),
437 )
438 .with_metadata(NodeMetadata {
439 properties,
440 ..Default::default()
441 })
442 }
443}
444
445#[derive(Debug, Clone)]
446pub struct ContentOperator {
447 pub operator: String,
448 pub operands: Vec<String>,
449 pub operator_type: OperatorType,
450 pub token_count: usize,
451 pub suspicious: bool,
452}
453
/// Classification of a recognised content-stream operator.
///
/// The variant names are also what the `Debug` representation prints.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OperatorType {
    /// `BT`
    TextBegin,
    /// `ET`
    TextEnd,
    /// `Tf`
    TextFont,
    /// `Tj`
    TextShow,
    /// `TJ`
    TextShowArray,
    /// `q`
    GraphicsSave,
    /// `Q`
    GraphicsRestore,
    /// `Do`
    XObject,
    /// Any operator without a dedicated variant.
    Other,
}
466
467#[derive(Debug, Clone)]
468pub struct SuspiciousPattern {
469 pub pattern: String,
470 pub content: String,
471 pub risk_level: RiskLevel,
472}
473
/// A reference to something outside the document, such as a URL or file path.
#[derive(Clone, Debug)]
pub struct ExternalReference {
    /// Kind of reference, as a free-form label (e.g. `"URL"`).
    pub ref_type: String,
    /// The text the reference was detected in.
    pub target: String,
    /// Whether the reference is considered suspicious.
    pub suspicious: bool,
}
480
/// Severity attached to a finding.
///
/// Variants are declared in ascending order of severity, so the derived
/// ordering satisfies `Low < Medium < High < Critical`. The variant names are
/// also what the `Debug` representation prints.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum RiskLevel {
    /// Lowest severity.
    Low,
    /// Above `Low`, below `High`.
    Medium,
    /// Above `Medium`, below `Critical`.
    High,
    /// Highest severity.
    Critical,
}
488
489impl Default for ContentAnalyzer {
490 fn default() -> Self {
491 Self::new()
492 }
493}