1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
use crate::error;
use crate::parser::findall_iter::FindallIter;
use crate::parser::format_parser::{CompiledFields, FormatParser};
use fancy_regex::Regex;
use formatparse_core::count_capturing_groups;
use formatparse_core::parser::validate_input_length;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::{PyAnyMethods, PyString, PyTuple};
use pyo3::IntoPyObjectExt;
use std::collections::HashMap;
#[pymethods]
impl FormatParser {
#[new]
#[pyo3(signature = (pattern=None, extra_types=None))]
fn new_py(
pattern: Option<&str>,
extra_types: Option<HashMap<String, PyObject>>,
) -> PyResult<Self> {
match pattern {
Some(p) => Self::new_with_extra_types(p, extra_types),
None => {
// Create a dummy instance for unpickling - __setstate__ will initialize it properly
// We need to create a valid but minimal instance
let empty_regex =
Regex::new("^$").map_err(|e| crate::error::regex_error(&e.to_string()))?;
Ok(Self {
pattern: String::new(),
regex: empty_regex.clone(),
regex_str: String::new(),
regex_case_insensitive: None,
search_regex: empty_regex.clone(),
search_regex_case_insensitive: None,
fields: CompiledFields {
field_specs: Vec::new(),
field_names: Vec::new(),
normalized_names: Vec::new(),
custom_type_groups: Vec::new(),
has_nested_dict_fields: Vec::new(),
nested_parsers: Vec::new(),
field_count: 0,
},
name_mapping: HashMap::new(),
stored_extra_types: None,
allows_empty_default_string_match: false,
})
}
}
}
/// Parse a string using this compiled pattern.
///
/// Merges ``extra_types`` from compile time with any ``extra_types`` passed here
/// (call-time wins on duplicate keys).
#[pyo3(signature = (string, case_sensitive=false, extra_types=None, evaluate_result=true))]
fn parse(
&self,
string: &str,
case_sensitive: bool,
extra_types: Option<HashMap<String, PyObject>>,
evaluate_result: bool,
) -> PyResult<Option<PyObject>> {
// Validate input length
validate_input_length(string).map_err(PyValueError::new_err)?;
// Check for null bytes
if string.contains('\0') {
return Err(PyValueError::new_err("Input string contains null byte"));
}
// Merge stored extra_types with provided extra_types (provided takes precedence)
let merged_extra_types =
Python::with_gil(|py| -> PyResult<Option<HashMap<String, PyObject>>> {
let mut merged = if let Some(ref stored) = self.stored_extra_types {
stored
.iter()
.map(|(k, v)| (k.clone(), v.clone_ref(py)))
.collect()
} else {
HashMap::new()
};
if let Some(ref provided) = extra_types {
for (k, v) in provided {
merged.insert(k.clone(), v.clone_ref(py));
}
}
Ok(Some(merged))
})?;
self.parse_internal(
string,
case_sensitive,
merged_extra_types.as_ref(),
evaluate_result,
)
}
/// Get the list of named field names (returns normalized names for compatibility)
#[getter]
fn named_fields(&self) -> Vec<String> {
// Return normalized names (without hyphens/dots) for compatibility with original parse library
self.fields
.normalized_names
.iter()
.filter_map(|n| n.clone())
.collect()
}
/// Raw regex body for this pattern (no ``^``/``$``, no ``(?s)`` prefix).
///
/// Intended for **composition** (GitHub issue #7): embed this string as a custom
/// type’s ``pattern`` when building a parent pattern via ``extra_types``. Do not use
/// :meth:`_expression` for that purpose; it applies display-oriented transforms that are
/// not guaranteed to be valid regex fragments.
#[getter]
fn regex_subpattern(&self) -> String {
self.regex_str.clone()
}
/// Number of capturing groups in :meth:`regex_subpattern`.
///
/// Use as ``regex_group_count`` on a converter object when composing (see
/// :func:`formatparse.composed_type`).
#[getter]
fn regex_capturing_group_count(&self) -> usize {
count_capturing_groups(&self.regex_str)
}
/// Get the internal regex expression string (for testing)
/// Returns a canonical format with literal spaces instead of \s+ for compatibility
#[getter]
fn _expression(&self) -> String {
let mut result = self.regex_str.clone();
// Replace \s+ between capturing groups with literal spaces for canonical format
// This matches the original parse library's _expression format
result = result.replace(r")\s+(", ") (");
// Also replace )\s*( with ) ( for backward compatibility
result = result.replace(r")\s*(", ") (");
// Simplify float patterns to match expected format
// Our pattern: ([+-]?(?:\d+\.\d+|\.\d+|\d+\.)(?:[eE][+-]?\d+)?)
// Expected: ([-+ ]?\d*\.\d+)
// Replace the complex float pattern with the simpler one
result = result.replace(
r"([+-]?(?:\d+\.\d+|\.\d+|\d+\.)(?:[eE][+-]?\d+)?)",
r"([-+ ]?\d*\.\d+)",
);
// For alignment patterns like {:>} that produce "( *(.+?))", we need to unwrap
// the outer capturing group to get " *(.+?)" (no outer wrapper)
// Only do this for patterns that start with "(" and end with ")" and contain nested groups
if result.starts_with("(") && result.ends_with(")") {
let inner = &result[1..result.len() - 1];
// Check if inner already starts with a space and contains a capturing group
if inner.starts_with(" *(") && inner.ends_with(")") {
// This is a simple wrapper, unwrap it
result = inner.to_string();
}
}
result
}
/// Get the format object for formatting values into the pattern
#[getter]
fn format(&self) -> Format {
Format {
pattern: self.pattern.clone(),
}
}
/// Search for the pattern in a string.
///
/// Merges ``extra_types`` from compile time with any ``extra_types`` passed here
/// (call-time wins on duplicate keys), same as :meth:`parse`.
#[pyo3(signature = (string, case_sensitive=true, extra_types=None, evaluate_result=true))]
fn search(
&self,
string: &str,
case_sensitive: bool,
extra_types: Option<HashMap<String, PyObject>>,
evaluate_result: bool,
) -> PyResult<Option<PyObject>> {
// Validate input length
validate_input_length(string).map_err(PyValueError::new_err)?;
// Check for null bytes
if string.contains('\0') {
return Err(PyValueError::new_err("Input string contains null byte"));
}
let merged_extra_types =
Python::with_gil(|py| -> PyResult<Option<HashMap<String, PyObject>>> {
let mut merged = if let Some(ref stored) = self.stored_extra_types {
stored
.iter()
.map(|(k, v)| (k.clone(), v.clone_ref(py)))
.collect()
} else {
HashMap::new()
};
if let Some(ref provided) = extra_types {
for (k, v) in provided {
merged.insert(k.clone(), v.clone_ref(py));
}
}
Ok(Some(merged))
})?;
self.search_pattern(string, case_sensitive, merged_extra_types, evaluate_result)
}
/// Yield non-overlapping matches from ``string`` one at a time.
///
/// Same matching rules as :func:`findall`, but each ``__next__`` converts at most one
/// match, lowering peak memory when you stream results. This does **not** read
/// arbitrary file chunks with backtracking; pair with line-based file iteration only
/// when matches cannot span line breaks (see GitHub issue #13).
#[pyo3(signature = (string, case_sensitive=false, extra_types=None, evaluate_result=true))]
fn findall_iter(
&self,
py: Python<'_>,
string: &str,
case_sensitive: bool,
extra_types: Option<HashMap<String, PyObject>>,
evaluate_result: bool,
) -> PyResult<Py<FindallIter>> {
validate_input_length(string).map_err(PyValueError::new_err)?;
if string.contains('\0') {
return Err(PyValueError::new_err("Input string contains null byte"));
}
let merged_extra_types =
Python::with_gil(|py| -> PyResult<Option<HashMap<String, PyObject>>> {
let mut merged = if let Some(ref stored) = self.stored_extra_types {
stored
.iter()
.map(|(k, v)| (k.clone(), v.clone_ref(py)))
.collect()
} else {
HashMap::new()
};
if let Some(ref provided) = extra_types {
for (k, v) in provided {
merged.insert(k.clone(), v.clone_ref(py));
}
}
Ok(Some(merged))
})?;
let merged_map = merged_extra_types.unwrap_or_default();
let arc = std::sync::Arc::new(self.clone());
Py::new(
py,
FindallIter::new(
arc,
string.to_string(),
case_sensitive,
evaluate_result,
merged_map,
),
)
}
/// Pickle support: rebuild with ``compile(pattern)`` only (no ``extra_types``).
///
/// Custom type converters cannot be serialized reliably; use
/// ``compile(pattern, extra_types=...)`` after unpickling if you need them.
fn __reduce__(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
let m = py.import("_formatparse")?;
let compile_fn = m.getattr("compile")?;
let args = PyTuple::new(py, [&self.pattern])?;
PyTuple::new(py, [compile_fn.as_any(), args.as_any()])?.into_py_any(py)
}
/// Pickle state: pattern string only (see class doc for ``extra_types``).
fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
use pyo3::types::PyDict;
let state = PyDict::new(py);
state.set_item("pattern", &self.pattern)?;
state.into_py_any(py)
}
/// Restore from pickle state (pattern only; ``extra_types`` are not recovered).
fn __setstate__(&mut self, _py: Python, state: &Bound<'_, PyAny>) -> PyResult<()> {
use pyo3::types::PyDict;
let dict = state.downcast::<PyDict>()?;
let pattern: String = dict
.get_item("pattern")?
.ok_or_else(|| error::missing_field_error("pattern"))?
.extract()?;
// Reconstruct the parser from the pattern
let reconstructed = Self::new_with_extra_types(&pattern, None)?;
// Copy all fields from reconstructed parser
self.pattern = reconstructed.pattern;
self.regex_str = reconstructed.regex_str;
self.regex = reconstructed.regex;
self.regex_case_insensitive = reconstructed.regex_case_insensitive;
self.search_regex = reconstructed.search_regex;
self.search_regex_case_insensitive = reconstructed.search_regex_case_insensitive;
self.fields = reconstructed.fields;
self.name_mapping = reconstructed.name_mapping;
self.stored_extra_types = reconstructed.stored_extra_types;
self.allows_empty_default_string_match = reconstructed.allows_empty_default_string_match;
Ok(())
}
}
/// Format object that formats values into a pattern string
#[pyclass]
pub struct Format {
pattern: String,
}
#[pymethods]
impl Format {
/// Format values into the pattern string using Python's format() method
fn format(&self, py: Python, args: &Bound<'_, PyAny>) -> PyResult<String> {
// Use Python's string format method to format values into the pattern
let pattern_obj = PyString::new(py, &self.pattern);
let format_method = pattern_obj.getattr("format")?;
// Call format with the args (can be a single value, tuple, or *args)
let result = if let Ok(tuple) = args.downcast::<PyTuple>() {
format_method.call1(tuple)?
} else {
// Single argument
format_method.call1((args,))?
};
result.extract()
}
}