Skip to main content

_formatparse/parser/
format_parser.rs

1use crate::parser::matching::FieldCaptureSlices;
2use crate::result::ParseResult;
3use fancy_regex::Regex;
4use formatparse_core::count_capturing_groups;
5use formatparse_core::parser::MAX_FIELDS;
6use formatparse_core::{FieldSpec, FieldType};
7use pyo3::exceptions::PyValueError;
8use pyo3::prelude::*;
9use std::collections::HashMap;
10use std::sync::Arc;
11
/// Field layout produced at pattern-compile time (narrow interface for matchers).
///
/// The vectors are built per field from the same `field_specs` list in
/// `FormatParser::new_with_extra_types`, so index `i` refers to the same field in each.
pub(crate) struct CompiledFields {
    /// Field specifications returned by `parse_pattern`.
    pub field_specs: Vec<FieldSpec>,
    /// Field names returned by `parse_pattern`; `None` entries are presumably
    /// positional (unnamed) fields — confirm against `parse_pattern`.
    pub field_names: Vec<Option<String>>,
    /// Normalized counterparts of `field_names`, also returned by `parse_pattern`.
    pub normalized_names: Vec<Option<String>>,
    /// Pre-computed capture-group count per field: for nested fields, the number of
    /// capturing groups in the nested regex body; otherwise the result of
    /// `validate_custom_type_pattern` when converters were supplied, else `0`.
    pub custom_type_groups: Vec<usize>,
    /// `true` for fields whose name contains `'['`.
    pub has_nested_dict_fields: Vec<bool>,
    /// `Some(parser)` for fields of type `FieldType::Nested`, `None` for all others.
    pub nested_parsers: Vec<Option<Arc<FormatParser>>>,
    /// Number of fields; set to `field_specs.len()` at compile time.
    pub field_count: usize,
}
22
23impl CompiledFields {
24    pub fn capture_slices(&self) -> FieldCaptureSlices<'_> {
25        FieldCaptureSlices {
26            field_specs: &self.field_specs,
27            field_names: &self.field_names,
28            normalized_names: &self.normalized_names,
29            custom_type_groups: &self.custom_type_groups,
30            has_nested_dict_fields: &self.has_nested_dict_fields,
31            nested_parsers: &self.nested_parsers,
32        }
33    }
34}
35
#[pyclass(module = "_formatparse")]
/// Compiled format pattern for parsing strings.
///
/// Construct with :func:`formatparse.compile` (or ``FormatParser(pattern, extra_types=...)`` in Python).
///
/// **Custom types:** converters passed as ``extra_types`` at compile time are stored
/// and merged with any ``extra_types`` passed per call to ``parse`` or ``search``
/// (per-call keys override stored keys).
///
/// **Pickling:** Only the pattern string is serialized. If the parser was built
/// with ``extra_types``, those converters are **not** restored after unpickling;
/// call ``compile(pattern, extra_types=...)`` again with the same mapping.
pub struct FormatParser {
    #[pyo3(get)]
    // Pattern text as returned by `prepare_compiled_pattern` at construction time.
    // Note: This field is actually used in __getstate__, format getter, and accessed from Python.
    // The dead_code warning is a false positive - the compiler doesn't recognize PyO3 getter usage.
    pub pattern: String,
    // Regex built from the anchored regex string; used by `parse_internal` for
    // case-sensitive matching.
    pub(crate) regex: Regex,
    // Regex string kept for the `_expression` property (second value returned by
    // `parse_pattern`).
    pub(crate) regex_str: String,
    // Case-insensitive variant of `regex`. When `None`, `parse_internal` falls back
    // to the case-sensitive `regex`.
    pub(crate) regex_case_insensitive: Option<Regex>,
    // Pre-compiled search regex (case-sensitive, no anchors), derived from a second
    // `parse_pattern` pass.
    pub(crate) search_regex: Regex,
    // Pre-compiled search regex (case-insensitive, no anchors). When `None`, searches
    // fall back to `search_regex`.
    pub(crate) search_regex_case_insensitive: Option<Regex>,
    // Per-field layout computed once at compile time.
    pub(crate) fields: CompiledFields,
    // NOTE(review): not read anywhere in this file, hence the lint allowance;
    // presumably consumed elsewhere or kept for future use — confirm.
    #[allow(dead_code)]
    // Map normalized -> original field name.
    pub(crate) name_mapping: std::collections::HashMap<String, String>,
    // `extra_types` supplied at compile time, stored for use during conversion.
    pub(crate) stored_extra_types: Option<HashMap<String, PyObject>>,
    // True iff parse("") can use the empty-field fast path (issue #16).
    pub(crate) allows_empty_default_string_match: bool,
}
64
65impl FormatParser {
66    /// Returns true when this parser matches a cache lookup: same normalized pattern and
67    /// the same `extra_types` fingerprint as [`crate::extract_extra_types_identity`].
68    /// Used after an LRU hit on the hash key to rule out collisions.
69    pub(crate) fn matches_pattern_cache_request(
70        &self,
71        py: Python<'_>,
72        normalized_pattern: &str,
73        extra_types: &Option<HashMap<String, PyObject>>,
74    ) -> bool {
75        if self.pattern != normalized_pattern {
76            return false;
77        }
78        let requested = crate::extract_extra_types_identity(py, extra_types);
79        let stored = crate::extract_extra_types_identity(py, &self.stored_extra_types);
80        requested == stored
81    }
82
    /// Compiles `pattern` without any custom type converters.
    ///
    /// Delegates to [`FormatParser::new_with_extra_types`] with `extra_types` set to
    /// `None`, and returns whatever error that constructor reports.
    pub fn new(pattern: &str) -> PyResult<Self> {
        Self::new_with_extra_types(pattern, None)
    }
86
87    pub fn new_with_extra_types(
88        pattern: &str,
89        extra_types: Option<HashMap<String, PyObject>>,
90    ) -> PyResult<Self> {
91        let pattern_owned = crate::pattern_normalize::prepare_compiled_pattern(pattern)?;
92        let custom_patterns = Python::with_gil(|py| -> PyResult<HashMap<String, String>> {
93            let mut patterns = HashMap::new();
94            if let Some(ref extra_types_map) = extra_types {
95                for (name, converter_obj) in extra_types_map {
96                    // Try to get the pattern attribute from the converter function
97                    let converter_ref = converter_obj.bind(py);
98                    if let Ok(pattern_attr) = converter_ref.getattr("pattern") {
99                        if let Ok(pattern_str) = pattern_attr.extract::<String>() {
100                            patterns.insert(name.clone(), pattern_str);
101                        }
102                    }
103                }
104            }
105            Ok(patterns)
106        })?;
107
108        let (
109            regex_str_with_anchors,
110            regex_str,
111            field_specs,
112            field_names,
113            normalized_names,
114            name_mapping,
115            allows_empty_default_string_match,
116        ) = formatparse_core::parser::pattern::parse_pattern(
117            &pattern_owned,
118            &custom_patterns,
119            true,
120            0,
121        )
122        .map_err(crate::parser::pattern::pattern_compile_error_to_py)?;
123
124        // Search/findall use a separate compile path without "empty delimited" `.*?` groups so
125        // unanchored matching does not stop early (e.g. `{}, {}` on "Hello, World").
126        let (regex_str_search_anchored, _, _, _, _, _, _) =
127            formatparse_core::parser::pattern::parse_pattern(
128                &pattern_owned,
129                &custom_patterns,
130                false,
131                0,
132            )
133            .map_err(crate::parser::pattern::pattern_compile_error_to_py)?;
134
135        // Validate field count
136        if field_specs.len() > MAX_FIELDS {
137            return Err(PyValueError::new_err(format!(
138                "Pattern contains {} fields, which exceeds the maximum allowed count of {}",
139                field_specs.len(),
140                MAX_FIELDS
141            )));
142        }
143
144        let nested_parsers: Vec<Option<Arc<FormatParser>>> =
145            Python::with_gil(|py| -> PyResult<_> {
146                let mut out = Vec::with_capacity(field_specs.len());
147                for spec in &field_specs {
148                    if matches!(spec.field_type, FieldType::Nested) {
149                        let sub = spec.nested_subpattern.as_ref().ok_or_else(|| {
150                            PyValueError::new_err("internal error: nested field missing subpattern")
151                        })?;
152                        let cloned_et = extra_types.as_ref().map(|m| {
153                            m.iter()
154                                .map(|(k, v)| (k.clone(), v.clone_ref(py)))
155                                .collect::<HashMap<_, _>>()
156                        });
157                        out.push(Some(Arc::new(FormatParser::new_with_extra_types(
158                            sub, cloned_et,
159                        )?)));
160                    } else {
161                        out.push(None);
162                    }
163                }
164                Ok(out)
165            })?;
166        // Pre-compute custom type validation results (pattern_groups per field)
167        // This avoids calling validate_custom_type_pattern for every match
168        let custom_type_groups = Python::with_gil(|py| -> PyResult<Vec<usize>> {
169            let mut groups = Vec::with_capacity(field_specs.len());
170            let empty_map = std::collections::HashMap::new();
171            let custom_converters = extra_types
172                .as_ref()
173                .map(|et| et as &HashMap<String, PyObject>)
174                .unwrap_or(&empty_map);
175
176            for spec in &field_specs {
177                let pattern_groups = if matches!(spec.field_type, FieldType::Nested) {
178                    spec.nested_regex_body
179                        .as_ref()
180                        .map(|b| count_capturing_groups(b))
181                        .unwrap_or(0)
182                } else if !custom_converters.is_empty() {
183                    crate::parser::matching::validate_custom_type_pattern(
184                        spec,
185                        custom_converters,
186                        py,
187                    )?
188                } else {
189                    0
190                };
191                groups.push(pattern_groups);
192            }
193            Ok(groups)
194        })?;
195
196        // Pre-compute which fields have nested dict names (contain '[')
197        // This avoids checking original_name.contains('[') in the hot path
198        let has_nested_dict_fields: Vec<bool> = field_names
199            .iter()
200            .map(|name_opt| name_opt.as_ref().map(|n| n.contains('[')).unwrap_or(false))
201            .collect();
202
203        // Build regex with DOTALL flag
204        let regex = formatparse_core::build_regex(&regex_str_with_anchors)
205            .map_err(crate::error::core_error_to_py_err)?;
206
207        let regex_search_anchored = formatparse_core::build_regex(&regex_str_search_anchored)
208            .map_err(crate::error::core_error_to_py_err)?;
209
210        // Build case-insensitive regex
211        let regex_case_insensitive =
212            formatparse_core::build_case_insensitive_regex(&regex_str_with_anchors);
213
214        // Pre-compile search regex variants (without anchors)
215        let search_regex =
216            formatparse_core::build_search_regex(regex_search_anchored.as_str(), true)
217                .map_err(crate::error::core_error_to_py_err)?;
218        let search_regex_case_insensitive =
219            formatparse_core::build_search_regex(regex_search_anchored.as_str(), false).ok();
220
221        let field_count = field_specs.len();
222        Ok(Self {
223            pattern: pattern_owned,
224            regex,
225            regex_str,
226            regex_case_insensitive,
227            search_regex,
228            search_regex_case_insensitive,
229            fields: CompiledFields {
230                field_specs,
231                field_names,
232                normalized_names,
233                custom_type_groups,
234                has_nested_dict_fields,
235                nested_parsers,
236                field_count,
237            },
238            name_mapping,
239            stored_extra_types: extra_types,
240            allows_empty_default_string_match,
241        })
242    }
243
244    pub fn search_pattern(
245        &self,
246        string: &str,
247        case_sensitive: bool,
248        extra_types: Option<HashMap<String, PyObject>>,
249        evaluate_result: bool,
250    ) -> PyResult<Option<PyObject>> {
251        // Use pre-compiled search regex
252        let search_regex = if case_sensitive {
253            &self.search_regex
254        } else {
255            self.search_regex_case_insensitive
256                .as_ref()
257                .unwrap_or(&self.search_regex)
258        };
259
260        Python::with_gil(|py| {
261            if search_regex
262                .captures(string)
263                .map_err(crate::error::fancy_regex_match_error)?
264                .is_some()
265            {
266                let extra_types_ref = if let Some(ref et) = extra_types {
267                    et
268                } else {
269                    &HashMap::new()
270                };
271                let f = &self.fields;
272                return crate::parser::matching::match_with_regex(
273                    search_regex,
274                    &crate::parser::matching::RegexMatchContext {
275                        string,
276                        pattern: &self.pattern,
277                        field_specs: &f.field_specs,
278                        field_names: &f.field_names,
279                        normalized_names: &f.normalized_names,
280                        nested_parsers: &f.nested_parsers,
281                        py,
282                        custom_converters: extra_types_ref,
283                        evaluate_result,
284                    },
285                );
286            }
287            Ok(None)
288        })
289    }
290
291    pub(crate) fn parse_internal(
292        &self,
293        string: &str,
294        case_sensitive: bool,
295        extra_types: Option<&HashMap<String, PyObject>>,
296        evaluate_result: bool,
297    ) -> PyResult<Option<PyObject>> {
298        Python::with_gil(|py| {
299            let empty = HashMap::<String, PyObject>::new();
300            let extra_types_ref = extra_types.unwrap_or(&empty);
301
302            // Use existing regex (custom type handling is done in convert_value)
303            let regex = if case_sensitive {
304                &self.regex
305            } else {
306                self.regex_case_insensitive.as_ref().unwrap_or(&self.regex)
307            };
308
309            let f = &self.fields;
310            if string.is_empty()
311                && self.allows_empty_default_string_match
312                && !f.field_specs.is_empty()
313            {
314                if let Some(obj) = crate::parser::matching::match_empty_default_string_parse(
315                    &self.pattern,
316                    &f.field_specs,
317                    &f.field_names,
318                    &f.normalized_names,
319                    py,
320                    extra_types_ref,
321                    evaluate_result,
322                )? {
323                    return Ok(Some(obj));
324                }
325            }
326
327            crate::parser::matching::match_with_regex(
328                regex,
329                &crate::parser::matching::RegexMatchContext {
330                    string,
331                    pattern: &self.pattern,
332                    field_specs: &f.field_specs,
333                    field_names: &f.field_names,
334                    normalized_names: &f.normalized_names,
335                    nested_parsers: &f.nested_parsers,
336                    py,
337                    custom_converters: extra_types_ref,
338                    evaluate_result,
339                },
340            )
341        })
342    }
343
344    /// Get the search regex for a given case sensitivity
345    pub(crate) fn get_search_regex(&self, case_sensitive: bool) -> &Regex {
346        if case_sensitive {
347            &self.search_regex
348        } else {
349            self.search_regex_case_insensitive
350                .as_ref()
351                .unwrap_or(&self.search_regex)
352        }
353    }
354
355    /// Parse one capture slice with this parser's pattern (nested fields, issue #12).
356    pub(crate) fn parse_nested_capture(
357        &self,
358        py: Python<'_>,
359        slice: &str,
360        custom_converters: &HashMap<String, PyObject>,
361    ) -> PyResult<Option<Py<ParseResult>>> {
362        use pyo3::types::PyAnyMethods;
363        let mut merged = HashMap::new();
364        if let Some(ref stored) = self.stored_extra_types {
365            for (k, v) in stored {
366                merged.insert(k.clone(), v.clone_ref(py));
367            }
368        }
369        for (k, v) in custom_converters {
370            merged.insert(k.clone(), v.clone_ref(py));
371        }
372        let opt = self.parse_internal(slice, true, Some(&merged), true)?;
373        let Some(obj) = opt else {
374            return Ok(None);
375        };
376        let bound = obj.bind(py);
377        let pr = bound.downcast::<ParseResult>().map_err(|_| {
378            PyValueError::new_err("internal error: nested parse did not return ParseResult")
379        })?;
380        Ok(Some(pr.clone().unbind()))
381    }
382}
383
384impl Clone for FormatParser {
385    fn clone(&self) -> Self {
386        Python::with_gil(|py| Self {
387            pattern: self.pattern.clone(),
388            regex: self.regex.clone(),
389            regex_str: self.regex_str.clone(),
390            regex_case_insensitive: self.regex_case_insensitive.clone(),
391            search_regex: self.search_regex.clone(),
392            search_regex_case_insensitive: self.search_regex_case_insensitive.clone(),
393            fields: CompiledFields {
394                field_specs: self.fields.field_specs.clone(),
395                field_names: self.fields.field_names.clone(),
396                normalized_names: self.fields.normalized_names.clone(),
397                custom_type_groups: self.fields.custom_type_groups.clone(),
398                has_nested_dict_fields: self.fields.has_nested_dict_fields.clone(),
399                nested_parsers: self.fields.nested_parsers.clone(),
400                field_count: self.fields.field_count,
401            },
402            name_mapping: self.name_mapping.clone(),
403            stored_extra_types: self.stored_extra_types.as_ref().map(|m| {
404                m.iter()
405                    .map(|(k, v)| (k.clone(), v.clone_ref(py)))
406                    .collect()
407            }),
408            allows_empty_default_string_match: self.allows_empty_default_string_match,
409        })
410    }
411}