formatparse-pyo3 0.8.1

PyO3 bindings for formatparse (native _formatparse extension; use PyPI for Python installs)
Documentation
use crate::error;
use crate::parser::findall_iter::FindallIter;
use crate::parser::format_parser::{CompiledFields, FormatParser};
use fancy_regex::Regex;
use formatparse_core::count_capturing_groups;
use formatparse_core::parser::validate_input_length;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::{PyAnyMethods, PyString, PyTuple};
use pyo3::IntoPyObjectExt;
use std::collections::HashMap;

#[pymethods]
impl FormatParser {
    #[new]
    #[pyo3(signature = (pattern=None, extra_types=None))]
    fn new_py(
        pattern: Option<&str>,
        extra_types: Option<HashMap<String, PyObject>>,
    ) -> PyResult<Self> {
        match pattern {
            Some(p) => Self::new_with_extra_types(p, extra_types),
            None => {
                // Create a dummy instance for unpickling - __setstate__ will initialize it properly
                // We need to create a valid but minimal instance
                let empty_regex =
                    Regex::new("^$").map_err(|e| crate::error::regex_error(&e.to_string()))?;
                Ok(Self {
                    pattern: String::new(),
                    regex: empty_regex.clone(),
                    regex_str: String::new(),
                    regex_case_insensitive: None,
                    search_regex: empty_regex.clone(),
                    search_regex_case_insensitive: None,
                    fields: CompiledFields {
                        field_specs: Vec::new(),
                        field_names: Vec::new(),
                        normalized_names: Vec::new(),
                        custom_type_groups: Vec::new(),
                        has_nested_dict_fields: Vec::new(),
                        nested_parsers: Vec::new(),
                        field_count: 0,
                    },
                    name_mapping: HashMap::new(),
                    stored_extra_types: None,
                    allows_empty_default_string_match: false,
                })
            }
        }
    }

    /// Parse a string using this compiled pattern.
    ///
    /// Merges ``extra_types`` from compile time with any ``extra_types`` passed here
    /// (call-time wins on duplicate keys).
    #[pyo3(signature = (string, case_sensitive=false, extra_types=None, evaluate_result=true))]
    fn parse(
        &self,
        string: &str,
        case_sensitive: bool,
        extra_types: Option<HashMap<String, PyObject>>,
        evaluate_result: bool,
    ) -> PyResult<Option<PyObject>> {
        // Validate input length
        validate_input_length(string).map_err(PyValueError::new_err)?;

        // Check for null bytes
        if string.contains('\0') {
            return Err(PyValueError::new_err("Input string contains null byte"));
        }
        // Merge stored extra_types with provided extra_types (provided takes precedence)
        let merged_extra_types =
            Python::with_gil(|py| -> PyResult<Option<HashMap<String, PyObject>>> {
                let mut merged = if let Some(ref stored) = self.stored_extra_types {
                    stored
                        .iter()
                        .map(|(k, v)| (k.clone(), v.clone_ref(py)))
                        .collect()
                } else {
                    HashMap::new()
                };
                if let Some(ref provided) = extra_types {
                    for (k, v) in provided {
                        merged.insert(k.clone(), v.clone_ref(py));
                    }
                }
                Ok(Some(merged))
            })?;
        self.parse_internal(
            string,
            case_sensitive,
            merged_extra_types.as_ref(),
            evaluate_result,
        )
    }

    /// Get the list of named field names (returns normalized names for compatibility)
    #[getter]
    fn named_fields(&self) -> Vec<String> {
        // Return normalized names (without hyphens/dots) for compatibility with original parse library
        self.fields
            .normalized_names
            .iter()
            .filter_map(|n| n.clone())
            .collect()
    }

    /// Raw regex body for this pattern (no ``^``/``$``, no ``(?s)`` prefix).
    ///
    /// Intended for **composition** (GitHub issue #7): embed this string as a custom
    /// type’s ``pattern`` when building a parent pattern via ``extra_types``. Do not use
    /// :meth:`_expression` for that purpose; it applies display-oriented transforms that are
    /// not guaranteed to be valid regex fragments.
    #[getter]
    fn regex_subpattern(&self) -> String {
        self.regex_str.clone()
    }

    /// Number of capturing groups in :meth:`regex_subpattern`.
    ///
    /// Use as ``regex_group_count`` on a converter object when composing (see
    /// :func:`formatparse.composed_type`).
    #[getter]
    fn regex_capturing_group_count(&self) -> usize {
        count_capturing_groups(&self.regex_str)
    }

    /// Get the internal regex expression string (for testing)
    /// Returns a canonical format with literal spaces instead of \s+ for compatibility
    #[getter]
    fn _expression(&self) -> String {
        let mut result = self.regex_str.clone();

        // Replace \s+ between capturing groups with literal spaces for canonical format
        // This matches the original parse library's _expression format
        result = result.replace(r")\s+(", ") (");
        // Also replace )\s*( with ) ( for backward compatibility
        result = result.replace(r")\s*(", ") (");

        // Simplify float patterns to match expected format
        // Our pattern: ([+-]?(?:\d+\.\d+|\.\d+|\d+\.)(?:[eE][+-]?\d+)?)
        // Expected: ([-+ ]?\d*\.\d+)
        // Replace the complex float pattern with the simpler one
        result = result.replace(
            r"([+-]?(?:\d+\.\d+|\.\d+|\d+\.)(?:[eE][+-]?\d+)?)",
            r"([-+ ]?\d*\.\d+)",
        );

        // For alignment patterns like {:>} that produce "( *(.+?))", we need to unwrap
        // the outer capturing group to get " *(.+?)" (no outer wrapper)
        // Only do this for patterns that start with "(" and end with ")" and contain nested groups
        if result.starts_with("(") && result.ends_with(")") {
            let inner = &result[1..result.len() - 1];
            // Check if inner already starts with a space and contains a capturing group
            if inner.starts_with(" *(") && inner.ends_with(")") {
                // This is a simple wrapper, unwrap it
                result = inner.to_string();
            }
        }

        result
    }

    /// Get the format object for formatting values into the pattern
    #[getter]
    fn format(&self) -> Format {
        Format {
            pattern: self.pattern.clone(),
        }
    }

    /// Search for the pattern in a string.
    ///
    /// Merges ``extra_types`` from compile time with any ``extra_types`` passed here
    /// (call-time wins on duplicate keys), same as :meth:`parse`.
    #[pyo3(signature = (string, case_sensitive=true, extra_types=None, evaluate_result=true))]
    fn search(
        &self,
        string: &str,
        case_sensitive: bool,
        extra_types: Option<HashMap<String, PyObject>>,
        evaluate_result: bool,
    ) -> PyResult<Option<PyObject>> {
        // Validate input length
        validate_input_length(string).map_err(PyValueError::new_err)?;

        // Check for null bytes
        if string.contains('\0') {
            return Err(PyValueError::new_err("Input string contains null byte"));
        }

        let merged_extra_types =
            Python::with_gil(|py| -> PyResult<Option<HashMap<String, PyObject>>> {
                let mut merged = if let Some(ref stored) = self.stored_extra_types {
                    stored
                        .iter()
                        .map(|(k, v)| (k.clone(), v.clone_ref(py)))
                        .collect()
                } else {
                    HashMap::new()
                };
                if let Some(ref provided) = extra_types {
                    for (k, v) in provided {
                        merged.insert(k.clone(), v.clone_ref(py));
                    }
                }
                Ok(Some(merged))
            })?;

        self.search_pattern(string, case_sensitive, merged_extra_types, evaluate_result)
    }

    /// Yield non-overlapping matches from ``string`` one at a time.
    ///
    /// Same matching rules as :func:`findall`, but each ``__next__`` converts at most one
    /// match, lowering peak memory when you stream results. This does **not** read
    /// arbitrary file chunks with backtracking; pair with line-based file iteration only
    /// when matches cannot span line breaks (see GitHub issue #13).
    #[pyo3(signature = (string, case_sensitive=false, extra_types=None, evaluate_result=true))]
    fn findall_iter(
        &self,
        py: Python<'_>,
        string: &str,
        case_sensitive: bool,
        extra_types: Option<HashMap<String, PyObject>>,
        evaluate_result: bool,
    ) -> PyResult<Py<FindallIter>> {
        validate_input_length(string).map_err(PyValueError::new_err)?;

        if string.contains('\0') {
            return Err(PyValueError::new_err("Input string contains null byte"));
        }

        let merged_extra_types =
            Python::with_gil(|py| -> PyResult<Option<HashMap<String, PyObject>>> {
                let mut merged = if let Some(ref stored) = self.stored_extra_types {
                    stored
                        .iter()
                        .map(|(k, v)| (k.clone(), v.clone_ref(py)))
                        .collect()
                } else {
                    HashMap::new()
                };
                if let Some(ref provided) = extra_types {
                    for (k, v) in provided {
                        merged.insert(k.clone(), v.clone_ref(py));
                    }
                }
                Ok(Some(merged))
            })?;

        let merged_map = merged_extra_types.unwrap_or_default();
        let arc = std::sync::Arc::new(self.clone());
        Py::new(
            py,
            FindallIter::new(
                arc,
                string.to_string(),
                case_sensitive,
                evaluate_result,
                merged_map,
            ),
        )
    }

    /// Pickle support: rebuild with ``compile(pattern)`` only (no ``extra_types``).
    ///
    /// Custom type converters cannot be serialized reliably; use
    /// ``compile(pattern, extra_types=...)`` after unpickling if you need them.
    fn __reduce__(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
        let m = py.import("_formatparse")?;
        let compile_fn = m.getattr("compile")?;
        let args = PyTuple::new(py, [&self.pattern])?;
        PyTuple::new(py, [compile_fn.as_any(), args.as_any()])?.into_py_any(py)
    }

    /// Pickle state: pattern string only (see class doc for ``extra_types``).
    fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
        use pyo3::types::PyDict;
        let state = PyDict::new(py);
        state.set_item("pattern", &self.pattern)?;
        state.into_py_any(py)
    }

    /// Restore from pickle state (pattern only; ``extra_types`` are not recovered).
    fn __setstate__(&mut self, _py: Python, state: &Bound<'_, PyAny>) -> PyResult<()> {
        use pyo3::types::PyDict;
        let dict = state.downcast::<PyDict>()?;
        let pattern: String = dict
            .get_item("pattern")?
            .ok_or_else(|| error::missing_field_error("pattern"))?
            .extract()?;

        // Reconstruct the parser from the pattern
        let reconstructed = Self::new_with_extra_types(&pattern, None)?;

        // Copy all fields from reconstructed parser
        self.pattern = reconstructed.pattern;
        self.regex_str = reconstructed.regex_str;
        self.regex = reconstructed.regex;
        self.regex_case_insensitive = reconstructed.regex_case_insensitive;
        self.search_regex = reconstructed.search_regex;
        self.search_regex_case_insensitive = reconstructed.search_regex_case_insensitive;
        self.fields = reconstructed.fields;
        self.name_mapping = reconstructed.name_mapping;
        self.stored_extra_types = reconstructed.stored_extra_types;
        self.allows_empty_default_string_match = reconstructed.allows_empty_default_string_match;
        Ok(())
    }
}

/// Format object that formats values into a pattern string
#[pyclass]
pub struct Format {
    pattern: String,
}

#[pymethods]
impl Format {
    /// Format values into the pattern string using Python's format() method
    fn format(&self, py: Python, args: &Bound<'_, PyAny>) -> PyResult<String> {
        // Use Python's string format method to format values into the pattern
        let pattern_obj = PyString::new(py, &self.pattern);
        let format_method = pattern_obj.getattr("format")?;

        // Call format with the args (can be a single value, tuple, or *args)
        let result = if let Ok(tuple) = args.downcast::<PyTuple>() {
            format_method.call1(tuple)?
        } else {
            // Single argument
            format_method.call1((args,))?
        };
        result.extract()
    }
}