Skip to main content

pdf_ast/forms/
xfa.rs

1use crate::types::{PdfDictionary, PdfStream, PdfValue};
2use quick_xml::events::Event;
3use quick_xml::Reader;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6
7#[derive(Debug, Clone, Serialize, Deserialize)]
8pub struct XfaDocument {
9    pub packets: Vec<XfaPacket>,
10}
11
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct XfaScriptStats {
14    pub script_nodes: usize,
15    pub has_scripts: bool,
16    pub script_node_names: Vec<String>,
17}
18
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct XfaPacket {
21    pub name: String,
22    pub root: XfaNode,
23    pub source_len: usize,
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct XfaNode {
28    pub name: String,
29    pub attributes: HashMap<String, String>,
30    pub text: Option<String>,
31    pub children: Vec<XfaNode>,
32}
33
34impl XfaDocument {
35    pub fn from_acroform(acroform: &PdfDictionary) -> Result<Self, String> {
36        let packets = parse_xfa_packets(acroform)?;
37        Ok(Self { packets })
38    }
39
40    pub fn is_empty(&self) -> bool {
41        self.packets.is_empty()
42    }
43
44    pub fn script_stats(&self) -> XfaScriptStats {
45        let mut count = 0usize;
46        let mut names = Vec::new();
47        for packet in &self.packets {
48            count_scripts(&packet.root, &mut count, &mut names);
49        }
50        XfaScriptStats {
51            script_nodes: count,
52            has_scripts: count > 0,
53            script_node_names: unique_names(names),
54        }
55    }
56}
57
58pub fn parse_xfa_packets(acroform: &PdfDictionary) -> Result<Vec<XfaPacket>, String> {
59    let xfa_value = match acroform.get("XFA") {
60        Some(value) => value,
61        None => return Ok(Vec::new()),
62    };
63
64    let mut packets = Vec::new();
65
66    match xfa_value {
67        PdfValue::Stream(stream) => {
68            if let Some(packet) = parse_xfa_packet("xfa", stream)? {
69                packets.push(packet);
70            }
71        }
72        PdfValue::Array(items) => {
73            let mut iter = items.iter();
74            while let Some(name_value) = iter.next() {
75                let packet_name = match name_value {
76                    PdfValue::String(s) => s.decode_pdf_encoding(),
77                    PdfValue::Name(n) => n.without_slash().to_string(),
78                    _ => "packet".to_string(),
79                };
80
81                if let Some(packet_value) = iter.next() {
82                    if let Some(packet) = parse_xfa_value(&packet_name, packet_value)? {
83                        packets.push(packet);
84                    }
85                }
86            }
87        }
88        _ => return Ok(Vec::new()),
89    }
90
91    Ok(packets)
92}
93
94fn parse_xfa_value(name: &str, value: &PdfValue) -> Result<Option<XfaPacket>, String> {
95    match value {
96        PdfValue::Stream(stream) => parse_xfa_packet(name, stream),
97        PdfValue::String(s) => parse_xfa_from_bytes(name, s.as_bytes()),
98        _ => Ok(None),
99    }
100}
101
102fn parse_xfa_packet(name: &str, stream: &PdfStream) -> Result<Option<XfaPacket>, String> {
103    let data = match stream.decode() {
104        Ok(decoded) => decoded,
105        Err(_) => stream.raw_data().map(|d| d.to_vec()).unwrap_or_default(),
106    };
107    parse_xfa_from_bytes(name, &data)
108}
109
110fn parse_xfa_from_bytes(name: &str, bytes: &[u8]) -> Result<Option<XfaPacket>, String> {
111    if bytes.is_empty() {
112        return Ok(None);
113    }
114
115    let xml = String::from_utf8_lossy(bytes).to_string();
116    let root = parse_xml_root(&xml)?;
117
118    Ok(Some(XfaPacket {
119        name: name.to_string(),
120        root,
121        source_len: bytes.len(),
122    }))
123}
124
125fn parse_xml_root(xml: &str) -> Result<XfaNode, String> {
126    let mut reader = Reader::from_str(xml);
127    reader.trim_text(true);
128
129    let mut buf = Vec::new();
130    let mut stack: Vec<XfaNode> = Vec::new();
131    let mut root: Option<XfaNode> = None;
132
133    loop {
134        match reader.read_event_into(&mut buf) {
135            Ok(Event::Start(e)) => {
136                let node = XfaNode {
137                    name: String::from_utf8_lossy(e.name().as_ref()).to_string(),
138                    attributes: parse_attributes(&reader, &e)?,
139                    text: None,
140                    children: Vec::new(),
141                };
142                stack.push(node);
143            }
144            Ok(Event::Empty(e)) => {
145                let node = XfaNode {
146                    name: String::from_utf8_lossy(e.name().as_ref()).to_string(),
147                    attributes: parse_attributes(&reader, &e)?,
148                    text: None,
149                    children: Vec::new(),
150                };
151                if let Some(parent) = stack.last_mut() {
152                    parent.children.push(node);
153                } else if root.is_none() {
154                    root = Some(node);
155                }
156            }
157            Ok(Event::Text(e)) => {
158                if let Some(current) = stack.last_mut() {
159                    let text = e.unescape().map_err(|e| e.to_string())?;
160                    let new_text = text.trim();
161                    if !new_text.is_empty() {
162                        let existing = current.text.take().unwrap_or_default();
163                        let combined = if existing.is_empty() {
164                            new_text.to_string()
165                        } else {
166                            format!("{}{}", existing, new_text)
167                        };
168                        current.text = Some(combined);
169                    }
170                }
171            }
172            Ok(Event::CData(e)) => {
173                if let Some(current) = stack.last_mut() {
174                    let text = String::from_utf8_lossy(e.as_ref()).to_string();
175                    let new_text = text.trim();
176                    if !new_text.is_empty() {
177                        let existing = current.text.take().unwrap_or_default();
178                        let combined = if existing.is_empty() {
179                            new_text.to_string()
180                        } else {
181                            format!("{}{}", existing, new_text)
182                        };
183                        current.text = Some(combined);
184                    }
185                }
186            }
187            Ok(Event::End(_)) => {
188                if let Some(node) = stack.pop() {
189                    if let Some(parent) = stack.last_mut() {
190                        parent.children.push(node);
191                    } else {
192                        root = Some(node);
193                    }
194                }
195            }
196            Ok(Event::Eof) => break,
197            Err(e) => return Err(format!("XFA XML parse error: {}", e)),
198            _ => {}
199        }
200        buf.clear();
201    }
202
203    root.ok_or_else(|| "XFA XML document has no root element".to_string())
204}
205
206fn parse_attributes(
207    reader: &Reader<&[u8]>,
208    element: &quick_xml::events::BytesStart,
209) -> Result<HashMap<String, String>, String> {
210    let mut attrs = HashMap::new();
211    for attr in element.attributes() {
212        let attr = attr.map_err(|e| e.to_string())?;
213        let key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
214        let value = attr
215            .decode_and_unescape_value(reader)
216            .map_err(|e| e.to_string())?
217            .to_string();
218        attrs.insert(key, value);
219    }
220    Ok(attrs)
221}
222
223fn count_scripts(node: &XfaNode, count: &mut usize, names: &mut Vec<String>) {
224    if is_script_node(&node.name) || has_script_attribute(node) {
225        *count += 1;
226        names.push(node.name.clone());
227    }
228    for child in &node.children {
229        count_scripts(child, count, names);
230    }
231}
232
/// Returns `true` when `name` is one of the element names treated as
/// script-related. The comparison is exact and case-sensitive.
fn is_script_node(name: &str) -> bool {
    const SCRIPT_ELEMENT_NAMES: [&str; 11] = [
        "script",
        "event",
        "calculate",
        "validate",
        "execute",
        "exec",
        "init",
        "preSubmit",
        "postSubmit",
        "preOpen",
        "postOpen",
    ];

    SCRIPT_ELEMENT_NAMES
        .iter()
        .any(|candidate| *candidate == name)
}
249
250fn has_script_attribute(node: &XfaNode) -> bool {
251    node.attributes
252        .get("runAt")
253        .map(|v| !v.is_empty())
254        .unwrap_or(false)
255        || node
256            .attributes
257            .get("script")
258            .map(|v| !v.is_empty())
259            .unwrap_or(false)
260}
261
/// Returns the given names sorted in ascending order with duplicates removed.
fn unique_names(mut names: Vec<String>) -> Vec<String> {
    // An ordered set sorts and de-duplicates in a single pass; the original
    // vector is then refilled so its allocation is reused.
    let ordered: std::collections::BTreeSet<String> = names.drain(..).collect();
    names.extend(ordered);
    names
}
267
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::PdfString;

    /// Parses `bytes` as a packet named `"form"`, panicking on failure.
    fn form_packet(bytes: &[u8]) -> XfaPacket {
        let source = PdfString::new_literal(bytes);
        parse_xfa_from_bytes("form", source.as_bytes())
            .unwrap()
            .unwrap()
    }

    /// The outermost element of a nested document becomes the root.
    #[test]
    fn parse_simple_xfa_xml() {
        let root = parse_xml_root(r#"<xfa><form><field name="a">1</field></form></xfa>"#).unwrap();
        assert_eq!(root.name, "xfa");
        assert_eq!(root.children.len(), 1);
    }

    /// Bytes taken from a PDF string parse into a packet with the given name.
    #[test]
    fn parse_xfa_packet_from_string() {
        let packet = form_packet(b"<xfa><data>ok</data></xfa>");
        assert_eq!(packet.name, "form");
        assert_eq!(packet.root.name, "xfa");
    }

    /// An `event`/`script` pair is reported by the script statistics.
    #[test]
    fn xfa_script_detection() {
        let packet =
            form_packet(b"<xfa><form><event><script>app.alert('x')</script></event></form></xfa>");
        let stats = XfaDocument {
            packets: vec![packet],
        }
        .script_stats();
        assert!(stats.has_scripts);
        assert!(stats.script_nodes >= 1);
    }
}