Skip to main content

edgeparse_core/pdf/
form_extractor.rs

1//! AcroForm field extraction from PDF documents.
2//!
3//! Reads form fields (text inputs, checkboxes, radio buttons, dropdowns)
4//! and their current values from the PDF's AcroForm dictionary.
5
6use lopdf::{Document, Object};
7use serde::{Deserialize, Serialize};
8
9/// Type of form field.
10#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
11pub enum FormFieldType {
12    /// Text input field
13    Text,
14    /// Checkbox
15    Checkbox,
16    /// Radio button
17    RadioButton,
18    /// Dropdown / combo box
19    Choice,
20    /// Push button
21    Button,
22    /// Signature field
23    Signature,
24    /// Unknown field type
25    Unknown,
26}
27
28/// A single form field extracted from the PDF.
29#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct FormField {
31    /// Field name (fully qualified)
32    pub name: String,
33    /// Field type
34    pub field_type: FormFieldType,
35    /// Current value (if any)
36    pub value: Option<String>,
37    /// Default value
38    pub default_value: Option<String>,
39    /// Whether the field is read-only
40    pub read_only: bool,
41    /// Whether the field is required
42    pub required: bool,
43    /// Page number (1-based) if determinable
44    pub page_number: Option<u32>,
45}
46
47/// Extract all AcroForm fields from a PDF document.
48pub fn extract_form_fields(doc: &Document) -> Vec<FormField> {
49    let mut fields = Vec::new();
50
51    // Get the AcroForm dictionary from the document catalog
52    let catalog = match doc.catalog() {
53        Ok(c) => c,
54        Err(_) => return fields,
55    };
56
57    let acroform = match catalog.get(b"AcroForm") {
58        Ok(obj) => resolve(doc, obj),
59        Err(_) => return fields,
60    };
61
62    let acroform_dict = match acroform.as_dict() {
63        Ok(d) => d,
64        Err(_) => return fields,
65    };
66
67    // Get the Fields array
68    let fields_array = match acroform_dict.get(b"Fields") {
69        Ok(obj) => resolve(doc, obj),
70        Err(_) => return fields,
71    };
72
73    let fields_arr = match fields_array.as_array() {
74        Ok(a) => a,
75        Err(_) => return fields,
76    };
77
78    // Process each field reference
79    for field_ref in fields_arr {
80        let field_obj = resolve(doc, field_ref);
81        if let Ok(dict) = field_obj.as_dict() {
82            extract_field(doc, dict, "", &mut fields);
83        }
84    }
85
86    fields
87}
88
89/// Recursively extract a field and its children (for hierarchical fields).
90fn extract_field(
91    doc: &Document,
92    dict: &lopdf::Dictionary,
93    parent_name: &str,
94    fields: &mut Vec<FormField>,
95) {
96    // Get partial name
97    let partial_name = extract_string(dict, b"T").unwrap_or_default();
98    let full_name = if parent_name.is_empty() {
99        partial_name.clone()
100    } else if partial_name.is_empty() {
101        parent_name.to_string()
102    } else {
103        format!("{parent_name}.{partial_name}")
104    };
105
106    // Check for Kids (child fields)
107    if let Ok(kids_obj) = dict.get(b"Kids") {
108        let kids = resolve(doc, kids_obj);
109        if let Ok(kids_arr) = kids.as_array() {
110            for kid_ref in kids_arr {
111                let kid_obj = resolve(doc, kid_ref);
112                if let Ok(kid_dict) = kid_obj.as_dict() {
113                    extract_field(doc, kid_dict, &full_name, fields);
114                }
115            }
116            // If the field has kids and no own widget, don't add it as a leaf
117            if dict.get(b"Subtype").is_err() {
118                return;
119            }
120        }
121    }
122
123    // Determine field type
124    let field_type = determine_field_type(dict);
125
126    // Get value
127    let value = extract_string(dict, b"V");
128    let default_value = extract_string(dict, b"DV");
129
130    // Get flags
131    let ff = dict
132        .get(b"Ff")
133        .ok()
134        .and_then(|o| {
135            if let Object::Integer(i) = resolve(doc, o) {
136                Some(*i as u32)
137            } else {
138                None
139            }
140        })
141        .unwrap_or(0);
142
143    let read_only = (ff & 1) != 0;
144    let required = (ff & 2) != 0;
145
146    fields.push(FormField {
147        name: full_name,
148        field_type,
149        value,
150        default_value,
151        read_only,
152        required,
153        page_number: None,
154    });
155}
156
157/// Determine the field type from the /FT entry.
158fn determine_field_type(dict: &lopdf::Dictionary) -> FormFieldType {
159    match dict.get(b"FT") {
160        Ok(obj) => {
161            if let Object::Name(name) = obj {
162                match name.as_slice() {
163                    b"Tx" => FormFieldType::Text,
164                    b"Btn" => {
165                        // Distinguish checkbox vs radio vs push button
166                        let ff = dict
167                            .get(b"Ff")
168                            .ok()
169                            .and_then(|o| {
170                                if let Object::Integer(i) = o {
171                                    Some(*i as u32)
172                                } else {
173                                    None
174                                }
175                            })
176                            .unwrap_or(0);
177                        if (ff & 0x10000) != 0 {
178                            FormFieldType::Button // push button
179                        } else if (ff & 0x8000) != 0 {
180                            FormFieldType::RadioButton
181                        } else {
182                            FormFieldType::Checkbox
183                        }
184                    }
185                    b"Ch" => FormFieldType::Choice,
186                    b"Sig" => FormFieldType::Signature,
187                    _ => FormFieldType::Unknown,
188                }
189            } else {
190                FormFieldType::Unknown
191            }
192        }
193        Err(_) => FormFieldType::Unknown,
194    }
195}
196
197/// Extract a string value from a dictionary field.
198fn extract_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
199    dict.get(key).ok().and_then(|obj| match obj {
200        Object::String(bytes, _) => Some(String::from_utf8_lossy(bytes).to_string()),
201        Object::Name(bytes) => Some(String::from_utf8_lossy(bytes).to_string()),
202        _ => None,
203    })
204}
205
206/// Resolve an object reference, following indirect references.
207fn resolve<'a>(doc: &'a Document, obj: &'a Object) -> &'a Object {
208    match obj {
209        Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
210        _ => obj,
211    }
212}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a dictionary with `/FT` set to `field_type` and, optionally, `/Ff` set to `flags`.
    fn field_dict(field_type: &[u8], flags: Option<i64>) -> lopdf::Dictionary {
        let mut dict = lopdf::Dictionary::new();
        dict.set("FT", Object::Name(field_type.to_vec()));
        if let Some(bits) = flags {
            dict.set("Ff", Object::Integer(bits));
        }
        dict
    }

    #[test]
    fn test_form_field_type_default() {
        let dict = lopdf::Dictionary::new();
        assert_eq!(determine_field_type(&dict), FormFieldType::Unknown);
    }

    #[test]
    fn test_form_field_type_text() {
        let dict = field_dict(b"Tx", None);
        assert_eq!(determine_field_type(&dict), FormFieldType::Text);
    }

    #[test]
    fn test_form_field_type_checkbox() {
        let dict = field_dict(b"Btn", None);
        assert_eq!(determine_field_type(&dict), FormFieldType::Checkbox);
    }

    #[test]
    fn test_form_field_type_radio() {
        let dict = field_dict(b"Btn", Some(0x8000));
        assert_eq!(determine_field_type(&dict), FormFieldType::RadioButton);
    }

    #[test]
    fn test_form_field_type_push_button() {
        let dict = field_dict(b"Btn", Some(0x10000));
        assert_eq!(determine_field_type(&dict), FormFieldType::Button);
    }

    #[test]
    fn test_form_field_type_push_button_wins_over_radio() {
        // Both flags set: the push-button check comes first.
        let dict = field_dict(b"Btn", Some(0x10000 | 0x8000));
        assert_eq!(determine_field_type(&dict), FormFieldType::Button);
    }

    #[test]
    fn test_form_field_type_choice_and_signature() {
        assert_eq!(
            determine_field_type(&field_dict(b"Ch", None)),
            FormFieldType::Choice
        );
        assert_eq!(
            determine_field_type(&field_dict(b"Sig", None)),
            FormFieldType::Signature
        );
    }

    #[test]
    fn test_form_field_type_unrecognised() {
        // Unknown name.
        assert_eq!(
            determine_field_type(&field_dict(b"Bogus", None)),
            FormFieldType::Unknown
        );
        // `/FT` present but not a name.
        let mut dict = lopdf::Dictionary::new();
        dict.set("FT", Object::Integer(1));
        assert_eq!(determine_field_type(&dict), FormFieldType::Unknown);
    }

    #[test]
    fn test_extract_string_value() {
        let mut dict = lopdf::Dictionary::new();
        dict.set(
            "V",
            Object::String(b"Hello".to_vec(), lopdf::StringFormat::Literal),
        );
        assert_eq!(extract_string(&dict, b"V"), Some("Hello".to_string()));
    }

    #[test]
    fn test_extract_string_name_value() {
        // Checkbox/radio values are stored as names.
        let mut dict = lopdf::Dictionary::new();
        dict.set("V", Object::Name(b"Yes".to_vec()));
        assert_eq!(extract_string(&dict, b"V"), Some("Yes".to_string()));
    }

    #[test]
    fn test_extract_string_wrong_type() {
        let mut dict = lopdf::Dictionary::new();
        dict.set("V", Object::Integer(7));
        assert_eq!(extract_string(&dict, b"V"), None);
    }

    #[test]
    fn test_extract_string_missing() {
        let dict = lopdf::Dictionary::new();
        assert_eq!(extract_string(&dict, b"V"), None);
    }
}
261}