1use crate::types::{PdfDictionary, PdfStream, PdfValue};
2use quick_xml::events::Event;
3use quick_xml::Reader;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6
/// Parsed XFA content: the list of packets read from an AcroForm's `XFA` entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct XfaDocument {
    /// Parsed packets, in the order they were encountered while parsing.
    pub packets: Vec<XfaPacket>,
}
11
/// Summary of script-related nodes found across all packets of an [`XfaDocument`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct XfaScriptStats {
    /// Number of nodes that were classified as script-related.
    pub script_nodes: usize,
    /// `true` when `script_nodes` is greater than zero.
    pub has_scripts: bool,
    /// Names of the script-related nodes, sorted and with duplicates removed.
    pub script_node_names: Vec<String>,
}
18
/// One XFA packet: a named chunk of XML parsed into a node tree.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct XfaPacket {
    /// Packet label: `"xfa"` when the `XFA` entry is a single stream, otherwise the
    /// name taken from the `XFA` array (or `"packet"` if that entry is neither a
    /// string nor a name).
    pub name: String,
    /// Root node of the parsed XML tree.
    pub root: XfaNode,
    /// Length in bytes of the data the packet was parsed from.
    pub source_len: usize,
}
25
/// A single XML element in a parsed XFA packet.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct XfaNode {
    /// Element name as reported by the XML reader, lossily converted to UTF-8.
    pub name: String,
    /// Attribute values keyed by attribute name; a repeated name keeps the last value.
    pub attributes: HashMap<String, String>,
    /// Concatenation of the element's non-empty, whitespace-trimmed text and CDATA
    /// fragments, or `None` when there were none.
    pub text: Option<String>,
    /// Child elements in document order.
    pub children: Vec<XfaNode>,
}
33
34impl XfaDocument {
35 pub fn from_acroform(acroform: &PdfDictionary) -> Result<Self, String> {
36 let packets = parse_xfa_packets(acroform)?;
37 Ok(Self { packets })
38 }
39
40 pub fn is_empty(&self) -> bool {
41 self.packets.is_empty()
42 }
43
44 pub fn script_stats(&self) -> XfaScriptStats {
45 let mut count = 0usize;
46 let mut names = Vec::new();
47 for packet in &self.packets {
48 count_scripts(&packet.root, &mut count, &mut names);
49 }
50 XfaScriptStats {
51 script_nodes: count,
52 has_scripts: count > 0,
53 script_node_names: unique_names(names),
54 }
55 }
56}
57
58pub fn parse_xfa_packets(acroform: &PdfDictionary) -> Result<Vec<XfaPacket>, String> {
59 let xfa_value = match acroform.get("XFA") {
60 Some(value) => value,
61 None => return Ok(Vec::new()),
62 };
63
64 let mut packets = Vec::new();
65
66 match xfa_value {
67 PdfValue::Stream(stream) => {
68 if let Some(packet) = parse_xfa_packet("xfa", stream)? {
69 packets.push(packet);
70 }
71 }
72 PdfValue::Array(items) => {
73 let mut iter = items.iter();
74 while let Some(name_value) = iter.next() {
75 let packet_name = match name_value {
76 PdfValue::String(s) => s.decode_pdf_encoding(),
77 PdfValue::Name(n) => n.without_slash().to_string(),
78 _ => "packet".to_string(),
79 };
80
81 if let Some(packet_value) = iter.next() {
82 if let Some(packet) = parse_xfa_value(&packet_name, packet_value)? {
83 packets.push(packet);
84 }
85 }
86 }
87 }
88 _ => return Ok(Vec::new()),
89 }
90
91 Ok(packets)
92}
93
94fn parse_xfa_value(name: &str, value: &PdfValue) -> Result<Option<XfaPacket>, String> {
95 match value {
96 PdfValue::Stream(stream) => parse_xfa_packet(name, stream),
97 PdfValue::String(s) => parse_xfa_from_bytes(name, s.as_bytes()),
98 _ => Ok(None),
99 }
100}
101
102fn parse_xfa_packet(name: &str, stream: &PdfStream) -> Result<Option<XfaPacket>, String> {
103 let data = match stream.decode() {
104 Ok(decoded) => decoded,
105 Err(_) => stream.raw_data().map(|d| d.to_vec()).unwrap_or_default(),
106 };
107 parse_xfa_from_bytes(name, &data)
108}
109
110fn parse_xfa_from_bytes(name: &str, bytes: &[u8]) -> Result<Option<XfaPacket>, String> {
111 if bytes.is_empty() {
112 return Ok(None);
113 }
114
115 let xml = String::from_utf8_lossy(bytes).to_string();
116 let root = parse_xml_root(&xml)?;
117
118 Ok(Some(XfaPacket {
119 name: name.to_string(),
120 root,
121 source_len: bytes.len(),
122 }))
123}
124
125fn parse_xml_root(xml: &str) -> Result<XfaNode, String> {
126 let mut reader = Reader::from_str(xml);
127 reader.trim_text(true);
128
129 let mut buf = Vec::new();
130 let mut stack: Vec<XfaNode> = Vec::new();
131 let mut root: Option<XfaNode> = None;
132
133 loop {
134 match reader.read_event_into(&mut buf) {
135 Ok(Event::Start(e)) => {
136 let node = XfaNode {
137 name: String::from_utf8_lossy(e.name().as_ref()).to_string(),
138 attributes: parse_attributes(&reader, &e)?,
139 text: None,
140 children: Vec::new(),
141 };
142 stack.push(node);
143 }
144 Ok(Event::Empty(e)) => {
145 let node = XfaNode {
146 name: String::from_utf8_lossy(e.name().as_ref()).to_string(),
147 attributes: parse_attributes(&reader, &e)?,
148 text: None,
149 children: Vec::new(),
150 };
151 if let Some(parent) = stack.last_mut() {
152 parent.children.push(node);
153 } else if root.is_none() {
154 root = Some(node);
155 }
156 }
157 Ok(Event::Text(e)) => {
158 if let Some(current) = stack.last_mut() {
159 let text = e.unescape().map_err(|e| e.to_string())?;
160 let new_text = text.trim();
161 if !new_text.is_empty() {
162 let existing = current.text.take().unwrap_or_default();
163 let combined = if existing.is_empty() {
164 new_text.to_string()
165 } else {
166 format!("{}{}", existing, new_text)
167 };
168 current.text = Some(combined);
169 }
170 }
171 }
172 Ok(Event::CData(e)) => {
173 if let Some(current) = stack.last_mut() {
174 let text = String::from_utf8_lossy(e.as_ref()).to_string();
175 let new_text = text.trim();
176 if !new_text.is_empty() {
177 let existing = current.text.take().unwrap_or_default();
178 let combined = if existing.is_empty() {
179 new_text.to_string()
180 } else {
181 format!("{}{}", existing, new_text)
182 };
183 current.text = Some(combined);
184 }
185 }
186 }
187 Ok(Event::End(_)) => {
188 if let Some(node) = stack.pop() {
189 if let Some(parent) = stack.last_mut() {
190 parent.children.push(node);
191 } else {
192 root = Some(node);
193 }
194 }
195 }
196 Ok(Event::Eof) => break,
197 Err(e) => return Err(format!("XFA XML parse error: {}", e)),
198 _ => {}
199 }
200 buf.clear();
201 }
202
203 root.ok_or_else(|| "XFA XML document has no root element".to_string())
204}
205
206fn parse_attributes(
207 reader: &Reader<&[u8]>,
208 element: &quick_xml::events::BytesStart,
209) -> Result<HashMap<String, String>, String> {
210 let mut attrs = HashMap::new();
211 for attr in element.attributes() {
212 let attr = attr.map_err(|e| e.to_string())?;
213 let key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
214 let value = attr
215 .decode_and_unescape_value(reader)
216 .map_err(|e| e.to_string())?
217 .to_string();
218 attrs.insert(key, value);
219 }
220 Ok(attrs)
221}
222
223fn count_scripts(node: &XfaNode, count: &mut usize, names: &mut Vec<String>) {
224 if is_script_node(&node.name) || has_script_attribute(node) {
225 *count += 1;
226 names.push(node.name.clone());
227 }
228 for child in &node.children {
229 count_scripts(child, count, names);
230 }
231}
232
/// Reports whether an element name denotes a script-related XFA node.
///
/// Only the local part of the name (the text after the last `:`) is compared, so
/// a namespace-prefixed element such as `xfa:script` is recognised the same way
/// as `script`. The comparison is case-sensitive.
fn is_script_node(name: &str) -> bool {
    // `rsplit` always yields at least one item, so the fallback is never taken.
    let local_name = name.rsplit(':').next().unwrap_or(name);
    matches!(
        local_name,
        "script"
            | "event"
            | "calculate"
            | "validate"
            | "execute"
            | "exec"
            | "init"
            | "preSubmit"
            | "postSubmit"
            | "preOpen"
            | "postOpen"
    )
}
249
250fn has_script_attribute(node: &XfaNode) -> bool {
251 node.attributes
252 .get("runAt")
253 .map(|v| !v.is_empty())
254 .unwrap_or(false)
255 || node
256 .attributes
257 .get("script")
258 .map(|v| !v.is_empty())
259 .unwrap_or(false)
260}
261
/// Returns `names` in ascending order with duplicates removed.
fn unique_names(names: Vec<String>) -> Vec<String> {
    // An ordered set both sorts and de-duplicates in one pass.
    names
        .into_iter()
        .collect::<std::collections::BTreeSet<String>>()
        .into_iter()
        .collect()
}
267
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::PdfString;

    /// A nested document parses into a tree rooted at the outermost element.
    #[test]
    fn parse_simple_xfa_xml() {
        let parsed =
            parse_xml_root(r#"<xfa><form><field name="a">1</field></form></xfa>"#).unwrap();
        assert_eq!(parsed.name, "xfa");
        assert_eq!(parsed.children.len(), 1);
    }

    /// Bytes taken from a PDF string become a packet carrying the given label.
    #[test]
    fn parse_xfa_packet_from_string() {
        let source = PdfString::new_literal(b"<xfa><data>ok</data></xfa>");
        let parsed = parse_xfa_from_bytes("form", source.as_bytes()).unwrap();
        let packet = parsed.unwrap();
        assert_eq!(packet.name, "form");
        assert_eq!(packet.root.name, "xfa");
    }

    /// An `event`/`script` pair is reported by the script statistics.
    #[test]
    fn xfa_script_detection() {
        let source = PdfString::new_literal(
            b"<xfa><form><event><script>app.alert('x')</script></event></form></xfa>",
        );
        let parsed = parse_xfa_from_bytes("form", source.as_bytes()).unwrap();
        let doc = XfaDocument {
            packets: vec![parsed.unwrap()],
        };
        let stats = doc.script_stats();
        assert!(stats.has_scripts);
        assert!(stats.script_nodes >= 1);
    }
}