Skip to main content

_formatparse/parser/
findall_iter.rs

1use crate::parser::format_parser::FormatParser;
2use crate::parser::matching::{match_with_captures, match_with_captures_raw, CapturedMatchContext};
3use formatparse_core::FieldType;
4use pyo3::prelude::*;
5use pyo3::IntoPyObjectExt;
6use std::collections::HashMap;
7use std::sync::Arc;
8
9/// Incremental iterator over ``findall``-style matches (issue #13 MVP).
10///
11/// Yields one match at a time using the same non-overlapping scan as :func:`findall`,
12/// without building a full :class:`Results` or list first. This lowers peak memory when
13/// you only consume matches sequentially. It does **not** implement arbitrary chunked
14/// file I/O or cross-chunk backtracking; use line-by-line reads only when your pattern
15/// cannot span physical line breaks.
16#[pyclass(module = "_formatparse", name = "FindallIter")]
17pub struct FindallIter {
18    parser: Arc<FormatParser>,
19    haystack: String,
20    case_sensitive: bool,
21    evaluate_result: bool,
22    fast_path: bool,
23    extra_types: HashMap<String, PyObject>,
24    last_end: usize,
25    search_pos: usize,
26}
27
28impl FindallIter {
29    pub fn new(
30        parser: Arc<FormatParser>,
31        haystack: String,
32        case_sensitive: bool,
33        evaluate_result: bool,
34        extra_types: HashMap<String, PyObject>,
35    ) -> Self {
36        let has_custom_converters = !extra_types.is_empty();
37        let has_nested_dicts = parser.fields.has_nested_dict_fields.iter().any(|&b| b);
38        let has_nested_format_fields = parser
39            .fields
40            .field_specs
41            .iter()
42            .any(|s| matches!(s.field_type, FieldType::Nested));
43        let fast_path = !has_custom_converters
44            && evaluate_result
45            && !has_nested_dicts
46            && !has_nested_format_fields;
47        Self {
48            parser,
49            haystack,
50            case_sensitive,
51            evaluate_result,
52            fast_path,
53            extra_types,
54            last_end: 0,
55            search_pos: 0,
56        }
57    }
58}
59
60#[pymethods]
61impl FindallIter {
62    fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
63        slf
64    }
65
66    fn __next__(mut slf: PyRefMut<'_, Self>, py: Python<'_>) -> PyResult<Option<PyObject>> {
67        if slf.fast_path {
68            loop {
69                if slf.search_pos > slf.haystack.len() {
70                    return Ok(None);
71                }
72                let search_regex = slf.parser.get_search_regex(slf.case_sensitive);
73                let Some(caps) = search_regex
74                    .captures_from_pos(&slf.haystack, slf.search_pos)
75                    .map_err(crate::error::fancy_regex_match_error)?
76                else {
77                    return Ok(None);
78                };
79                let Some(m0) = caps.get(0) else {
80                    return Err(pyo3::exceptions::PyRuntimeError::new_err(
81                        "regex match missing capture group 0",
82                    ));
83                };
84                let match_start = m0.start();
85                let match_end = m0.end();
86
87                if match_start < slf.last_end {
88                    slf.search_pos = slf.last_end.max(match_start.saturating_add(1));
89                    continue;
90                }
91
92                let slices = slf.parser.fields.capture_slices();
93
94                match match_with_captures_raw(&caps, &slf.haystack, match_start, &slices) {
95                    Ok(Some(raw_data)) => {
96                        slf.last_end = match_end;
97                        if match_start == match_end {
98                            slf.last_end += 1;
99                        }
100                        slf.search_pos = slf.last_end;
101                        let pr = raw_data.to_parse_result(py)?;
102                        return Ok(Some(pr.into_py_any(py)?));
103                    }
104                    Ok(None) => {
105                        slf.search_pos = match_start.saturating_add(1);
106                        continue;
107                    }
108                    Err(_) => {
109                        slf.fast_path = false;
110                        if slf.last_end == 0 {
111                            slf.search_pos = 0;
112                        }
113                        break;
114                    }
115                }
116            }
117        }
118
119        loop {
120            if slf.search_pos > slf.haystack.len() {
121                return Ok(None);
122            }
123            let search_regex = slf.parser.get_search_regex(slf.case_sensitive);
124            let Some(caps) = search_regex
125                .captures_from_pos(&slf.haystack, slf.search_pos)
126                .map_err(crate::error::fancy_regex_match_error)?
127            else {
128                return Ok(None);
129            };
130            let Some(m0) = caps.get(0) else {
131                return Err(pyo3::exceptions::PyRuntimeError::new_err(
132                    "regex match missing capture group 0",
133                ));
134            };
135            let match_start = m0.start();
136            let match_end = m0.end();
137
138            if match_start < slf.last_end {
139                slf.search_pos = slf.last_end.max(match_start.saturating_add(1));
140                continue;
141            }
142
143            let ctx = CapturedMatchContext {
144                pattern: &slf.parser.pattern,
145                fields: slf.parser.fields.capture_slices(),
146                py,
147                custom_converters: &slf.extra_types,
148                evaluate_result: slf.evaluate_result,
149            };
150
151            match match_with_captures(&caps, &ctx)? {
152                Some(result) => {
153                    slf.last_end = match_end;
154                    if match_start == match_end {
155                        slf.last_end += 1;
156                    }
157                    slf.search_pos = slf.last_end;
158                    return Ok(Some(result));
159                }
160                None => {
161                    slf.search_pos = match_start.saturating_add(1);
162                    continue;
163                }
164            }
165        }
166    }
167
168    fn __repr__(&self) -> String {
169        "<FindallIter>".to_string()
170    }
171}