Skip to main content

pdfluent_forms/
parse.rs

1//! AcroForm dictionary parser (B.1).
2
3use crate::flags::FieldFlags;
4use crate::tree::*;
5use pdf_syntax::object::dict::keys;
6use pdf_syntax::object::{Array, Dict, Name, Object, ObjectIdentifier, Rect};
7use pdf_syntax::Pdf;
8use std::collections::BTreeSet;
9
/// Deepest level of the AcroForm field tree that the parser will descend to.
///
/// Acts as a hard stop for `/Kids` recursion when a form is malformed or
/// deliberately hostile; genuine field hierarchies are shallow, so legitimate
/// documents never get near this limit.
const MAX_FIELD_DEPTH: usize = 100;
13
14/// Parse the AcroForm dictionary from a PDF document and build a field tree.
15pub fn parse_acroform(pdf: &Pdf) -> Option<FieldTree> {
16    let xref = pdf.xref();
17    let catalog: Dict<'_> = xref.get(xref.root_id())?;
18    let acroform: Dict<'_> = catalog.get(keys::ACRO_FORM)?;
19    let mut tree = FieldTree::new();
20
21    if let Some(da) = get_string_value(&acroform, keys::DA) {
22        tree.document_da = Some(da);
23    }
24    if let Some(q) = acroform.get::<u32>(keys::Q) {
25        tree.document_quadding = Some(parse_quadding(q));
26    }
27    if let Some(na) = acroform.get::<bool>(keys::NEED_APPEARANCES) {
28        tree.need_appearances = na;
29    }
30    if let Some(sf) = acroform.get::<u32>(keys::SIG_FLAGS) {
31        tree.sig_flags = sf;
32    }
33
34    if let Some(fields_arr) = acroform.get::<Array<'_>>(keys::FIELDS) {
35        // `visited` breaks cyclic /Kids references and shared-node blowups;
36        // `depth` bounds deep nesting.
37        let mut visited = BTreeSet::new();
38        for field_dict in fields_arr.iter::<Dict<'_>>() {
39            parse_field_recursive(&field_dict, &mut tree, None, 0, &mut visited);
40        }
41    }
42
43    if let Some(co_arr) = acroform.get::<Array<'_>>(keys::CO) {
44        for co_obj in co_arr.iter::<Object<'_>>() {
45            if let Object::Dict(co_dict) = co_obj {
46                if let Some(obj_id) = co_dict.obj_id() {
47                    let target = (obj_id.obj_number, obj_id.gen_number);
48                    if let Some(id) = find_by_object_id(&tree, target) {
49                        tree.calculation_order.push(id);
50                    }
51                }
52            }
53        }
54    }
55
56    assign_page_indices(pdf, &mut tree);
57    Some(tree)
58}
59
60fn parse_field_recursive(
61    dict: &Dict<'_>,
62    tree: &mut FieldTree,
63    parent: Option<FieldId>,
64    depth: usize,
65    visited: &mut BTreeSet<ObjectIdentifier>,
66) {
67    // Bound deep nesting and break cyclic/shared /Kids references so a malformed
68    // form cannot overflow the stack or blow up memory.
69    if depth >= MAX_FIELD_DEPTH {
70        return;
71    }
72    if let Some(id) = dict.obj_id() {
73        if !visited.insert(id) {
74            return;
75        }
76    }
77
78    let partial_name = get_string_value(dict, keys::T).unwrap_or_default();
79    let field_type = dict.get::<Name>(keys::FT).and_then(|n| match n.as_ref() {
80        b"Tx" => Some(FieldType::Text),
81        b"Btn" => Some(FieldType::Button),
82        b"Ch" => Some(FieldType::Choice),
83        b"Sig" => Some(FieldType::Signature),
84        _ => None,
85    });
86    let flags = dict
87        .get::<u32>(keys::FF)
88        .map(FieldFlags::from_bits)
89        .unwrap_or_default();
90    let rect = dict
91        .get::<Rect>(keys::RECT)
92        .map(|r| [r.x0 as f32, r.y0 as f32, r.x1 as f32, r.y1 as f32]);
93    let appearance_state = dict
94        .get::<Name>(keys::AS)
95        .map(|n| crate::encoding::decode_name_bytes(n.as_ref()));
96    let object_id = dict.obj_id().map(|oid| (oid.obj_number, oid.gen_number));
97    let on_state = parse_on_state(dict);
98
99    let node = FieldNode {
100        partial_name,
101        alternate_name: get_string_value(dict, keys::TU),
102        mapping_name: get_string_value(dict, keys::TM),
103        field_type,
104        flags,
105        value: parse_field_value(dict, keys::V),
106        default_value: parse_field_value(dict, keys::DV),
107        default_appearance: get_string_value(dict, keys::DA),
108        quadding: dict.get::<u32>(keys::Q).map(parse_quadding),
109        max_len: dict.get::<u32>(keys::MAX_LEN),
110        options: parse_options(dict),
111        top_index: dict.get::<u32>(keys::TI),
112        rect,
113        appearance_state,
114        on_state,
115        page_index: None,
116        parent,
117        children: vec![],
118        object_id,
119        has_actions: dict.contains_key(keys::AA),
120        mk: parse_mk(dict),
121        border_style: parse_border_style(dict),
122    };
123    let id = tree.alloc(node);
124    if let Some(pid) = parent {
125        tree.get_mut(pid).children.push(id);
126    }
127    if let Some(kids_arr) = dict.get::<Array<'_>>(keys::KIDS) {
128        for kid_dict in kids_arr.iter::<Dict<'_>>() {
129            parse_field_recursive(&kid_dict, tree, Some(id), depth + 1, visited);
130        }
131    }
132}
133
134/// Extract the on-state for a button widget: the first non-`Off` key of the
135/// widget's `/AP /N` sub-dictionary (the same resolution rule pdfium's
136/// `GetOnStateName` and mupdf's `pdf_button_field_on_state` use).
137fn parse_on_state(dict: &Dict<'_>) -> Option<String> {
138    let ap: Dict<'_> = dict.get(keys::AP)?;
139    let n: Dict<'_> = ap.get(keys::N)?;
140    let mut found = None;
141    for key in n.keys() {
142        let bytes: &[u8] = key.as_ref();
143        if bytes != b"Off" {
144            found = Some(crate::encoding::decode_name_bytes(bytes));
145            break;
146        }
147    }
148    found
149}
150
151fn parse_field_value(dict: &Dict<'_>, key: &[u8]) -> Option<FieldValue> {
152    let obj: Object<'_> = dict.get(key)?;
153    match obj {
154        Object::String(s) => Some(FieldValue::Text(crate::encoding::decode_pdf_text_bytes(
155            s.as_bytes(),
156        ))),
157        Object::Name(n) => Some(FieldValue::Text(crate::encoding::decode_name_bytes(
158            n.as_ref(),
159        ))),
160        Object::Array(arr) => {
161            let vals: Vec<String> = arr
162                .iter::<Object<'_>>()
163                .filter_map(|o| match o {
164                    Object::String(s) => Some(crate::encoding::decode_pdf_text_bytes(s.as_bytes())),
165                    Object::Name(n) => Some(crate::encoding::decode_name_bytes(n.as_ref())),
166                    _ => None,
167                })
168                .collect();
169            Some(FieldValue::StringArray(vals))
170        }
171        _ => None,
172    }
173}
174
175fn parse_options(dict: &Dict<'_>) -> Vec<ChoiceOption> {
176    let Some(arr) = dict.get::<Array<'_>>(keys::OPT) else {
177        return vec![];
178    };
179    arr.iter::<Object<'_>>()
180        .filter_map(|obj| match obj {
181            Object::String(s) => {
182                let text = crate::encoding::decode_pdf_text_bytes(s.as_bytes());
183                Some(ChoiceOption {
184                    export: text.clone(),
185                    display: text,
186                })
187            }
188            Object::Array(pair) => {
189                let items: Vec<Object<'_>> = pair.iter::<Object<'_>>().collect();
190                if items.len() >= 2 {
191                    Some(ChoiceOption {
192                        export: obj_to_string(&items[0]).unwrap_or_default(),
193                        display: obj_to_string(&items[1]).unwrap_or_default(),
194                    })
195                } else {
196                    None
197                }
198            }
199            _ => None,
200        })
201        .collect()
202}
203
204fn parse_mk(dict: &Dict<'_>) -> Option<MkDict> {
205    let mk_dict: Dict<'_> = dict.get(keys::MK)?;
206    Some(MkDict {
207        border_color: parse_color_array(&mk_dict, keys::BC),
208        background_color: parse_color_array(&mk_dict, keys::BG),
209        caption: get_string_value(&mk_dict, keys::CA),
210        rollover_caption: get_string_value(&mk_dict, &b"RC"[..]),
211        alternate_caption: get_string_value(&mk_dict, keys::AC),
212        text_position: mk_dict.get::<u32>(&b"TP"[..]),
213        rotation: mk_dict.get::<u32>(&b"R"[..]),
214    })
215}
216
217fn parse_color_array(dict: &Dict<'_>, key: &[u8]) -> Option<Vec<f32>> {
218    let arr: Array<'_> = dict.get(key)?;
219    let vals: Vec<f32> = arr.iter::<f32>().collect();
220    if vals.is_empty() {
221        None
222    } else {
223        Some(vals)
224    }
225}
226
227fn parse_border_style(dict: &Dict<'_>) -> Option<BorderStyle> {
228    let bs_dict: Dict<'_> = dict.get(keys::BS)?;
229    Some(BorderStyle {
230        width: bs_dict.get::<f32>(&b"W"[..]).unwrap_or(1.0),
231        style: bs_dict
232            .get::<Name>(&b"S"[..])
233            .and_then(|n| n.as_ref().first().copied())
234            .unwrap_or(b'S'),
235    })
236}
237
238fn parse_quadding(q: u32) -> Quadding {
239    match q {
240        1 => Quadding::Center,
241        2 => Quadding::Right,
242        _ => Quadding::Left,
243    }
244}
245
246fn get_string_value(dict: &Dict<'_>, key: &[u8]) -> Option<String> {
247    obj_to_string(&dict.get::<Object<'_>>(key)?)
248}
249
250fn obj_to_string(obj: &Object<'_>) -> Option<String> {
251    match obj {
252        Object::String(s) => Some(crate::encoding::decode_pdf_text_bytes(s.as_bytes())),
253        Object::Name(n) => Some(crate::encoding::decode_name_bytes(n.as_ref())),
254        _ => None,
255    }
256}
257
258fn find_by_object_id(tree: &FieldTree, target: (i32, i32)) -> Option<FieldId> {
259    tree.all_ids()
260        .find(|&id| tree.get(id).object_id == Some(target))
261}
262
263fn assign_page_indices(pdf: &Pdf, tree: &mut FieldTree) {
264    let pages = pdf.pages();
265    for (page_idx, page) in pages.iter().enumerate() {
266        let raw = page.raw();
267        let Some(annots_arr) = raw.get::<Array<'_>>(keys::ANNOTS) else {
268            continue;
269        };
270        for annot_obj in annots_arr.iter::<Object<'_>>() {
271            if let Object::Dict(annot_dict) = annot_obj {
272                if let Some(annot_oid) = annot_dict.obj_id() {
273                    let target = (annot_oid.obj_number, annot_oid.gen_number);
274                    if let Some(fid) = find_by_object_id(tree, target) {
275                        tree.get_mut(fid).page_index = Some(page_idx);
276                    }
277                }
278            }
279        }
280    }
281}
282
#[cfg(test)]
mod tests {
    use super::*;

    /// Serialises a minimal six-object PDF whose AcroForm has two fields that
    /// list each other in `/Kids`, i.e. a cyclic field tree.
    fn cyclic_form_pdf() -> Vec<u8> {
        let bodies: [&[u8]; 6] = [
            b"<< /Type /Catalog /Pages 2 0 R /AcroForm 4 0 R >>",
            b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
            b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 200] >>",
            b"<< /Fields [5 0 R] >>",
            b"<< /T (A) /Kids [6 0 R] >>",
            b"<< /T (B) /Kids [5 0 R] >>", // /Kids points back to A -> cycle
        ];

        let mut out = b"%PDF-1.7\n".to_vec();
        // Byte offset of each object, in object-number order, for the xref table.
        let mut offsets = Vec::with_capacity(bodies.len());
        for (idx, body) in bodies.iter().enumerate() {
            offsets.push(out.len());
            out.extend_from_slice(format!("{} 0 obj\n", idx + 1).as_bytes());
            out.extend_from_slice(body);
            out.extend_from_slice(b"\nendobj\n");
        }

        let xref_off = out.len();
        out.extend_from_slice(b"xref\n0 7\n0000000000 65535 f \n");
        for o in &offsets {
            out.extend_from_slice(format!("{o:010} 00000 n \n").as_bytes());
        }
        out.extend_from_slice(
            format!("trailer\n<< /Size 7 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF")
                .as_bytes(),
        );
        out
    }

    #[test]
    fn quadding_values() {
        let cases = [
            (0, Quadding::Left),
            (1, Quadding::Center),
            (2, Quadding::Right),
        ];
        for (raw, expected) in cases {
            assert_eq!(parse_quadding(raw), expected);
        }
    }

    #[test]
    fn cyclic_field_tree_terminates_and_is_bounded() {
        // Without the visited/depth guard, the mutual /Kids references would
        // make parsing recurse until the stack overflows.
        let pdf = Pdf::new(cyclic_form_pdf()).expect("load cyclic-form PDF");
        let tree = parse_acroform(&pdf).expect("acroform parses");
        assert!(
            tree.len() <= 2,
            "cyclic /Kids must not inflate the field tree; got {}",
            tree.len()
        );
    }
}