Skip to main content

_formatparse/
lib.rs

1//! formatparse-pyo3: PyO3 bindings for formatparse
2//!
3//! formatparse-pyo3 provides Python bindings for the formatparse-core library.
4
5use pyo3::exceptions::PyNotImplementedError;
6use pyo3::prelude::*;
7use pyo3::types::{PyDict, PyList};
8use pyo3::IntoPyObjectExt;
9use std::collections::HashMap;
10
11mod datetime;
12mod error;
13mod match_rs;
14mod parser;
15mod pattern_cache;
16mod pattern_normalize;
17mod result;
18mod results;
19mod types;
20
21pub(crate) use pattern_cache::extract_extra_types_identity;
22use pattern_cache::get_or_create_parser;
23
24pub use datetime::FixedTzOffset;
25pub use parser::{FindallIter, Format, FormatParser};
26pub use result::*;
27pub use results::Results;
28pub use types::conversion::*;
29// Core types come from formatparse-core
30pub use formatparse_core::strftime_to_regex;
31pub use formatparse_core::{FieldSpec, FieldType};
32pub use match_rs::Match;
33
34pub use error::PatternParseMismatch;
35
36/// Parse a string using a format specification
37#[pyfunction]
38#[pyo3(signature = (pattern, string, extra_types=None, case_sensitive=false, evaluate_result=true))]
39fn parse(
40    pattern: &str,
41    string: &str,
42    extra_types: Option<HashMap<String, PyObject>>,
43    case_sensitive: bool,
44    evaluate_result: bool,
45) -> PyResult<Option<PyObject>> {
46    // Validate input lengths
47    formatparse_core::validate_input_length(string)
48        .map_err(pyo3::exceptions::PyValueError::new_err)?;
49
50    // Check for null bytes in inputs
51    if string.contains('\0') {
52        return Err(pyo3::exceptions::PyValueError::new_err(
53            "Input string contains null byte",
54        ));
55    }
56
57    // Use cached parser if available
58    let extra_types_cloned = Python::with_gil(|py| -> Option<HashMap<String, PyObject>> {
59        extra_types.as_ref().map(|et| {
60            et.iter()
61                .map(|(k, v)| (k.clone(), v.clone_ref(py)))
62                .collect()
63        })
64    });
65    match get_or_create_parser(pattern, extra_types_cloned) {
66        Ok(parser) => parser.parse_internal(
67            string,
68            case_sensitive,
69            extra_types.as_ref(),
70            evaluate_result,
71        ),
72        Err(e) => Python::with_gil(|py| {
73            if e.is_instance_of::<PyNotImplementedError>(py) {
74                return Err(e);
75            }
76            if e.is_instance_of::<crate::error::PatternParseMismatch>(py) {
77                return Ok(None);
78            }
79            Err(e)
80        }),
81    }
82}
83
84/// Parse many strings with the same pattern, compiling the pattern once.
85///
86/// Each input string uses the same semantics as `parse` (including
87/// `extra_types`, `case_sensitive`, and `evaluate_result`). Non-matches
88/// become Python `None` at that index in the returned list.
89#[pyfunction]
90#[pyo3(signature = (pattern, strings, extra_types=None, case_sensitive=false, evaluate_result=true))]
91fn parse_batch(
92    pattern: &str,
93    strings: Vec<String>,
94    extra_types: Option<HashMap<String, PyObject>>,
95    case_sensitive: bool,
96    evaluate_result: bool,
97) -> PyResult<PyObject> {
98    for s in &strings {
99        formatparse_core::validate_input_length(s)
100            .map_err(pyo3::exceptions::PyValueError::new_err)?;
101        if s.contains('\0') {
102            return Err(pyo3::exceptions::PyValueError::new_err(
103                "Input string contains null byte",
104            ));
105        }
106    }
107
108    let extra_types_cloned = Python::with_gil(|py| -> Option<HashMap<String, PyObject>> {
109        extra_types.as_ref().map(|et| {
110            et.iter()
111                .map(|(k, v)| (k.clone(), v.clone_ref(py)))
112                .collect()
113        })
114    });
115
116    let parser = match get_or_create_parser(pattern, extra_types_cloned) {
117        Ok(p) => p,
118        Err(e) => {
119            return Python::with_gil(|py| -> PyResult<PyObject> {
120                if e.is_instance_of::<PyNotImplementedError>(py) {
121                    return Err(e);
122                }
123                if e.is_instance_of::<crate::error::PatternParseMismatch>(py) {
124                    let none_obj = py.None().into_py_any(py)?;
125                    let mut out: Vec<PyObject> = Vec::with_capacity(strings.len());
126                    for _ in 0..strings.len() {
127                        out.push(none_obj.clone_ref(py));
128                    }
129                    let items: Vec<_> = out.iter().map(|o| o.bind(py)).collect();
130                    return PyList::new(py, items)?.into_py_any(py);
131                }
132                Err(e)
133            });
134        }
135    };
136
137    Python::with_gil(|py| -> PyResult<PyObject> {
138        let mut out: Vec<PyObject> = Vec::with_capacity(strings.len());
139        for s in &strings {
140            match parser.parse_internal(s, case_sensitive, extra_types.as_ref(), evaluate_result)? {
141                Some(obj) => out.push(obj),
142                None => out.push(py.None().into_py_any(py)?),
143            }
144        }
145        let items: Vec<_> = out.iter().map(|o| o.bind(py)).collect();
146        PyList::new(py, items)?.into_py_any(py)
147    })
148}
149
150/// Search for a pattern in a string
151#[pyfunction]
152#[pyo3(signature = (pattern, string, pos=0, endpos=None, extra_types=None, case_sensitive=true, evaluate_result=true))]
153fn search(
154    pattern: &str,
155    string: &str,
156    pos: usize,
157    endpos: Option<usize>,
158    extra_types: Option<HashMap<String, PyObject>>,
159    case_sensitive: bool,
160    evaluate_result: bool,
161) -> PyResult<Option<PyObject>> {
162    // Validate pos parameter
163    if pos > string.len() {
164        return Ok(None);
165    }
166
167    // Validate endpos parameter
168    let end = endpos.unwrap_or(string.len());
169    if end > string.len() {
170        return Ok(None);
171    }
172    if end < pos {
173        return Ok(None);
174    }
175
176    // Validate input lengths
177    formatparse_core::validate_input_length(string)
178        .map_err(pyo3::exceptions::PyValueError::new_err)?;
179
180    // Check for null bytes in inputs
181    if string.contains('\0') {
182        return Err(pyo3::exceptions::PyValueError::new_err(
183            "Input string contains null byte",
184        ));
185    }
186
187    let extra_types_cloned = Python::with_gil(|py| -> Option<HashMap<String, PyObject>> {
188        extra_types.as_ref().map(|et| {
189            et.iter()
190                .map(|(k, v)| (k.clone(), v.clone_ref(py)))
191                .collect()
192        })
193    });
194    let parser = get_or_create_parser(pattern, extra_types_cloned)?;
195    let search_string = &string[pos..end];
196
197    if let Some(result) =
198        parser.search_pattern(search_string, case_sensitive, extra_types, evaluate_result)?
199    {
200        // Adjust positions if it's a ParseResult (not Match)
201        Python::with_gil(|py| {
202            if let Ok(parse_result) = result.bind(py).downcast::<ParseResult>() {
203                let result_value = parse_result.borrow();
204                let adjusted = result_value.clone().with_offset(pos);
205                // Py::new() is already optimized when GIL is held
206                Ok(Some(Py::new(py, adjusted)?.into_py_any(py)?))
207            } else {
208                // It's a Match object - we need to adjust its span
209                // For now, just return it as-is (Match spans are relative to search start)
210                Ok(Some(result))
211            }
212        })
213    } else {
214        Ok(None)
215    }
216}
217
218/// Find all matches of a pattern in a string
219#[pyfunction]
220#[pyo3(signature = (pattern, string, extra_types=None, case_sensitive=false, evaluate_result=true))]
221fn findall(
222    pattern: &str,
223    string: &str,
224    extra_types: Option<HashMap<String, PyObject>>,
225    case_sensitive: bool,
226    evaluate_result: bool,
227) -> PyResult<PyObject> {
228    // Validate input lengths
229    formatparse_core::validate_input_length(string)
230        .map_err(pyo3::exceptions::PyValueError::new_err)?;
231
232    // Check for null bytes in inputs
233    if string.contains('\0') {
234        return Err(pyo3::exceptions::PyValueError::new_err(
235            "Input string contains null byte",
236        ));
237    }
238
239    let extra_types_cloned = Python::with_gil(|py| -> Option<HashMap<String, PyObject>> {
240        extra_types.as_ref().map(|et| {
241            et.iter()
242                .map(|(k, v)| (k.clone(), v.clone_ref(py)))
243                .collect()
244        })
245    });
246    let parser = get_or_create_parser(pattern, extra_types_cloned)?;
247    crate::parser::findall_engine::findall_matches(
248        parser,
249        string,
250        extra_types.as_ref(),
251        case_sensitive,
252        evaluate_result,
253    )
254}
255
256/// Iterator over non-overlapping matches (same scan as :func:`findall`, one item per step).
257///
258/// See :class:`FindallIter` for memory semantics and limitations (issue #13 MVP).
259#[pyfunction]
260#[pyo3(signature = (pattern, string, extra_types=None, case_sensitive=false, evaluate_result=true))]
261fn findall_iter(
262    py: Python<'_>,
263    pattern: &str,
264    string: &str,
265    extra_types: Option<HashMap<String, PyObject>>,
266    case_sensitive: bool,
267    evaluate_result: bool,
268) -> PyResult<Py<FindallIter>> {
269    formatparse_core::validate_input_length(string)
270        .map_err(pyo3::exceptions::PyValueError::new_err)?;
271
272    if string.contains('\0') {
273        return Err(pyo3::exceptions::PyValueError::new_err(
274            "Input string contains null byte",
275        ));
276    }
277
278    let extra_types_cloned = Python::with_gil(|py| -> Option<HashMap<String, PyObject>> {
279        extra_types.as_ref().map(|et| {
280            et.iter()
281                .map(|(k, v)| (k.clone(), v.clone_ref(py)))
282                .collect()
283        })
284    });
285    let parser = get_or_create_parser(pattern, extra_types_cloned)?;
286
287    let et_map = Python::with_gil(|py| -> HashMap<String, PyObject> {
288        extra_types
289            .as_ref()
290            .map(|et| {
291                et.iter()
292                    .map(|(k, v)| (k.clone(), v.clone_ref(py)))
293                    .collect()
294            })
295            .unwrap_or_default()
296    });
297
298    Py::new(
299        py,
300        FindallIter::new(
301            parser,
302            string.to_string(),
303            case_sensitive,
304            evaluate_result,
305            et_map,
306        ),
307    )
308}
309
310/// Compile a pattern into a FormatParser for reuse.
311///
312/// Uses the same LRU cache as the `parse`, `search`, and `findall` bindings:
313/// `compile` with the same pattern and equivalent `extra_types` keys avoids
314/// rebuilding compiled regexes (see GitHub issue #29).
315#[pyfunction]
316#[pyo3(signature = (pattern, extra_types=None))]
317fn compile(
318    pattern: &str,
319    extra_types: Option<HashMap<String, PyObject>>,
320) -> PyResult<FormatParser> {
321    let extra_types_cloned = Python::with_gil(|py| -> Option<HashMap<String, PyObject>> {
322        extra_types.as_ref().map(|et| {
323            et.iter()
324                .map(|(k, v)| (k.clone(), v.clone_ref(py)))
325                .collect()
326        })
327    });
328    let arc = get_or_create_parser(pattern, extra_types_cloned)?;
329    Ok((*arc).clone())
330}
331
332/// Extract format specification components from a format string
333#[pyfunction]
334#[pyo3(signature = (format_string, _match_dict=None))]
335fn extract_format(
336    format_string: &str,
337    _match_dict: Option<&Bound<'_, PyDict>>,
338) -> PyResult<PyObject> {
339    use crate::types::FieldSpec;
340
341    // Parse the format spec string
342    let mut spec = FieldSpec::new();
343    formatparse_core::parser::pattern::parse_format_spec(format_string, &mut spec)
344        .map_err(crate::parser::pattern::pattern_compile_error_to_py)?;
345    formatparse_core::parser::pattern::validate_multiline_mvp(&spec)
346        .map_err(crate::parser::pattern::pattern_compile_error_to_py)?;
347
348    // Extract type from the original format_string (preserve original type chars like 'o', 'x', 'b')
349    // Parse the format spec to extract the type characters that come after width/precision/alignment
350    let type_str: String = if format_string == "%" {
351        "%".to_string()
352    } else {
353        // Parse format spec to find where type starts
354        // Format: [[fill]align][sign][#][0][width][,][.precision][type]
355        let chars: Vec<char> = format_string.chars().collect();
356        let mut i = 0;
357        let len = chars.len();
358
359        // Skip fill and align
360        if i < len && (chars[i] == '<' || chars[i] == '>' || chars[i] == '^' || chars[i] == '=') {
361            i += 1;
362        } else if i + 1 < len {
363            let ch = chars[i];
364            let next_ch = chars[i + 1];
365            if (next_ch == '<' || next_ch == '>' || next_ch == '^' || next_ch == '=')
366                && ch != next_ch
367            {
368                i += 2; // Skip fill + align
369            }
370        }
371
372        // Skip sign
373        if i < len && (chars[i] == '+' || chars[i] == '-' || chars[i] == ' ') {
374            i += 1;
375        }
376
377        // Skip #
378        if i < len && chars[i] == '#' {
379            i += 1;
380        }
381
382        // Skip 0
383        if i < len && chars[i] == '0' {
384            i += 1;
385        }
386
387        // Skip width (digits)
388        while i < len && chars[i].is_ascii_digit() {
389            i += 1;
390        }
391
392        // Skip comma
393        if i < len && chars[i] == ',' {
394            i += 1;
395        }
396
397        // Skip precision (.digits)
398        if i < len && chars[i] == '.' {
399            i += 1;
400            while i < len && chars[i].is_ascii_digit() {
401                i += 1;
402            }
403        }
404
405        // Type is the rest
406        if i < len {
407            format_string[i..].to_string()
408        } else {
409            "s".to_string() // Default
410        }
411    };
412
413    // Build result dictionary
414    Python::with_gil(|py| {
415        let result = PyDict::new(py);
416        result.set_item("type", type_str)?;
417
418        // Extract width
419        if let Some(width) = spec.width {
420            result.set_item("width", width.to_string())?;
421        }
422
423        // Extract precision
424        if let Some(precision) = spec.precision {
425            result.set_item("precision", precision.to_string())?;
426        }
427
428        // Extract alignment
429        if let Some(align) = spec.alignment {
430            result.set_item("align", align.to_string())?;
431        }
432
433        // Extract fill
434        if let Some(fill) = spec.fill {
435            result.set_item("fill", fill.to_string())?;
436        }
437
438        // Extract zero padding
439        if spec.zero_pad {
440            result.set_item("zero", true)?;
441        }
442
443        result.into_py_any(py)
444    })
445}
446
447/// Python module definition
448#[pymodule]
449fn _formatparse(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
450    m.add(
451        "PatternParseMismatch",
452        py.get_type::<crate::error::PatternParseMismatch>(),
453    )?;
454    m.add_function(wrap_pyfunction!(parse, m)?)?;
455    m.add_function(wrap_pyfunction!(parse_batch, m)?)?;
456    m.add_function(wrap_pyfunction!(search, m)?)?;
457    m.add_function(wrap_pyfunction!(findall, m)?)?;
458    m.add_function(wrap_pyfunction!(findall_iter, m)?)?;
459    m.add_function(wrap_pyfunction!(compile, m)?)?;
460    m.add_function(wrap_pyfunction!(extract_format, m)?)?;
461    m.add_class::<ParseResult>()?;
462    m.add_class::<FormatParser>()?;
463    m.add_class::<Format>()?;
464    m.add_class::<FixedTzOffset>()?;
465    m.add_class::<Match>()?;
466    m.add_class::<Results>()?;
467    m.add_class::<FindallIter>()?;
468    Ok(())
469}