readcon_core/iterators.rs
1//=============================================================================
2// The Public API - A clean iterator for users of our library
3//=============================================================================
4
5use crate::parser::{parse_single_frame, parse_velocity_section};
6use crate::{error, types};
7use std::iter::Peekable;
8use std::path::Path;
9
10/// An iterator that lazily parses simulation frames from a `.con` or `.convel`
11/// file's contents.
12///
13/// This struct wraps an iterator over the lines of a string and, upon each iteration,
14/// attempts to parse a complete `ConFrame`. Velocity sections are detected
15/// automatically: if a blank line follows the coordinate blocks, the velocity
16/// data is parsed into the atoms.
17///
18/// The iterator yields items of type `Result<ConFrame, ParseError>`, allowing for
19/// robust error handling for each frame.
20pub struct ConFrameIterator<'a> {
21 lines: Peekable<std::str::Lines<'a>>,
22}
23
24impl<'a> ConFrameIterator<'a> {
25 /// Creates a new `ConFrameIterator` from a string slice of the entire file.
26 ///
27 /// # Arguments
28 ///
29 /// * `file_contents` - A string slice containing the text of one or more `.con` frames.
30 pub fn new(file_contents: &'a str) -> Self {
31 ConFrameIterator {
32 lines: file_contents.lines().peekable(),
33 }
34 }
35
36 /// Skips the next frame without fully parsing its atomic data.
37 ///
38 /// This is more efficient than `next()` if you only need to advance the
39 /// iterator. It reads the frame's header to determine how many lines to skip,
40 /// including any velocity section if present.
41 ///
42 /// # Returns
43 ///
44 /// * `Some(Ok(()))` on a successful skip.
45 /// * `Some(Err(ParseError::...))` if there's an error parsing the header.
46 /// * `None` if the iterator is already at the end.
47 pub fn forward(&mut self) -> Option<Result<(), error::ParseError>> {
48 // Skip frame by parsing only required header fields to avoid full parsing overhead
49 if self.lines.peek().is_none() {
50 return None;
51 }
52
53 // Manually consume the first 6 lines of the header, which we don't need for skipping.
54 for _ in 0..6 {
55 if self.lines.next().is_none() {
56 return Some(Err(error::ParseError::IncompleteHeader));
57 }
58 }
59
60 // Line 7: natm_types. We need to parse this.
61 let natm_types: usize = match self.lines.next() {
62 Some(line) => match crate::parser::parse_line_of_n::<usize>(line, 1) {
63 Ok(v) => v[0],
64 Err(e) => return Some(Err(e)),
65 },
66 None => return Some(Err(error::ParseError::IncompleteHeader)),
67 };
68
69 // Line 8: natms_per_type. We need this to sum the total number of atoms.
70 let natms_per_type: Vec<usize> = match self.lines.next() {
71 Some(line) => match crate::parser::parse_line_of_n(line, natm_types) {
72 Ok(v) => v,
73 Err(e) => return Some(Err(e)),
74 },
75 None => return Some(Err(error::ParseError::IncompleteHeader)),
76 };
77
78 // Line 9: masses_per_type. We just need to consume this line.
79 if self.lines.next().is_none() {
80 return Some(Err(error::ParseError::IncompleteHeader));
81 }
82
83 // Calculate how many more lines to skip for coordinate blocks.
84 let total_atoms: usize = natms_per_type.iter().sum();
85 // For each atom type, there is a symbol line and a "Coordinates..." line.
86 let non_atom_lines = natm_types * 2;
87 let lines_to_skip = total_atoms + non_atom_lines;
88
89 // Advance the iterator by skipping the coordinate block lines.
90 for _ in 0..lines_to_skip {
91 if self.lines.next().is_none() {
92 // The file ended before the header's promise was fulfilled.
93 return Some(Err(error::ParseError::IncompleteFrame));
94 }
95 }
96
97 // Check for an optional velocity section (blank separator followed by
98 // velocity blocks with the same structure as coordinate blocks).
99 if let Some(line) = self.lines.peek() {
100 if line.trim().is_empty() {
101 // Consume the blank separator
102 self.lines.next();
103 // Skip the velocity blocks: same structure as coordinate blocks
104 let vel_lines_to_skip = total_atoms + non_atom_lines;
105 for _ in 0..vel_lines_to_skip {
106 if self.lines.next().is_none() {
107 return Some(Err(error::ParseError::IncompleteVelocitySection));
108 }
109 }
110 }
111 }
112
113 Some(Ok(()))
114 }
115}
116
117impl<'a> Iterator for ConFrameIterator<'a> {
118 /// The type of item yielded by the iterator.
119 ///
120 /// Each item is a `Result` that contains a successfully parsed `ConFrame` or a
121 /// `ParseError` if the frame's data is malformed.
122 type Item = Result<types::ConFrame, error::ParseError>;
123
124 /// Advances the iterator and attempts to parse the next frame.
125 ///
126 /// This method will return `None` only when there are no more lines to consume.
127 /// If there are lines but they do not form a complete frame, it will return
128 /// `Some(Err(ParseError::...))`.
129 fn next(&mut self) -> Option<Self::Item> {
130 // If there are no more lines at all, the iterator is exhausted.
131 if self.lines.peek().is_none() {
132 return None;
133 }
134 // Otherwise, attempt to parse the next frame from the available lines.
135 let mut frame = match parse_single_frame(&mut self.lines) {
136 Ok(f) => f,
137 Err(e) => return Some(Err(e)),
138 };
139 // Attempt to parse optional velocity section
140 match parse_velocity_section(&mut self.lines, &frame.header, &mut frame.atom_data) {
141 Ok(_) => {}
142 Err(e) => return Some(Err(e)),
143 }
144 Some(Ok(frame))
145 }
146}
147
148/// Size threshold below which we use `read_to_string` instead of mmap.
149/// For small files, the fixed overhead of mmap (VMA creation, page fault,
150/// munmap) exceeds the cost of a simple `read` syscall + heap allocation.
151/// 64 KiB is a conservative cutoff used by ripgrep and similar tools.
152const MMAP_THRESHOLD: u64 = 64 * 1024;
153
154/// Reads file contents, choosing between `read_to_string` (small files) and
155/// mmap (large files) based on [`MMAP_THRESHOLD`].
156fn read_file_contents(path: &Path) -> Result<FileContents, Box<dyn std::error::Error>> {
157 let file = std::fs::File::open(path)?;
158 let metadata = file.metadata()?;
159 if metadata.len() < MMAP_THRESHOLD {
160 let contents = std::fs::read_to_string(path)?;
161 Ok(FileContents::Owned(contents))
162 } else {
163 let mmap = unsafe { memmap2::Mmap::map(&file)? };
164 Ok(FileContents::Mapped(mmap))
165 }
166}
167
168/// Holds file contents either as an owned String or a memory-mapped region.
169enum FileContents {
170 Owned(String),
171 Mapped(memmap2::Mmap),
172}
173
174impl FileContents {
175 fn as_str(&self) -> Result<&str, std::str::Utf8Error> {
176 match self {
177 FileContents::Owned(s) => Ok(s.as_str()),
178 FileContents::Mapped(m) => std::str::from_utf8(m),
179 }
180 }
181}
182
183/// Reads all frames from a file.
184///
185/// For files smaller than 64 KiB, uses a simple `read_to_string` to avoid
186/// the fixed overhead of mmap (VMA creation, page fault, munmap). For larger
187/// trajectory files, uses memory-mapped I/O to let the OS page cache handle
188/// the data.
189pub fn read_all_frames(path: &Path) -> Result<Vec<types::ConFrame>, Box<dyn std::error::Error>> {
190 let contents = read_file_contents(path)?;
191 let text = contents.as_str()?;
192 let iter = ConFrameIterator::new(text);
193 let frames: Result<Vec<_>, _> = iter.collect();
194 Ok(frames?)
195}
196
197/// Reads only the first frame from a file.
198///
199/// More efficient than `read_all_frames` for single-frame access because it
200/// stops parsing after the first frame rather than collecting all of them.
201pub fn read_first_frame(path: &Path) -> Result<types::ConFrame, Box<dyn std::error::Error>> {
202 let contents = read_file_contents(path)?;
203 let text = contents.as_str()?;
204 let mut iter = ConFrameIterator::new(text);
205 match iter.next() {
206 Some(Ok(frame)) => Ok(frame),
207 Some(Err(e)) => Err(Box::new(e)),
208 None => Err("No frames found in file".into()),
209 }
210}
211
/// Parses frames in parallel using rayon, splitting on frame boundaries.
///
/// Phase 1: a sequential scan reads just enough of every header to find the
/// byte offset at which each frame starts.
/// Phase 2: each frame's slice of `file_contents` is parsed independently on
/// the rayon thread pool.
///
/// The returned vector holds one entry per detected frame, in file order. A
/// frame that is truncated or malformed produces an `Err` entry. If the scan
/// cannot read a frame's header, that frame becomes the last entry and extends
/// to the end of the input.
///
/// Requires the `parallel` feature.
#[cfg(feature = "parallel")]
pub fn parse_frames_parallel(
    file_contents: &str,
) -> Vec<Result<types::ConFrame, error::ParseError>> {
    use rayon::prelude::*;

    // Phase 1: locate the start of every frame.
    let starts = frame_start_offsets(file_contents);

    // Phase 2: parse each frame's text chunk in parallel.
    (0..starts.len())
        .into_par_iter()
        .map(|i| {
            let begin = starts[i];
            // A frame ends where the next one begins; the last one runs to the end.
            let end = starts.get(i + 1).copied().unwrap_or(file_contents.len());
            ConFrameIterator::new(&file_contents[begin..end])
                .next()
                .unwrap_or(Err(error::ParseError::IncompleteFrame))
        })
        .collect()
}

/// Returns the byte offset within `file_contents` at which each frame starts.
///
/// Only the header fields that determine a frame's length are parsed (the
/// number of atom types and the atom count per type). Scanning stops after
/// the first frame whose header is missing or unparsable; that frame's start
/// offset is still recorded so the caller can report the error for it.
#[cfg(feature = "parallel")]
fn frame_start_offsets(file_contents: &str) -> Vec<usize> {
    let lines: Vec<&str> = file_contents.lines().collect();
    // `str::lines` yields sub-slices of `file_contents`, so the distance of a
    // line's pointer from the start of the text is its exact byte offset. This
    // is O(1) per frame and stays correct for both `\n` and `\r\n` endings.
    let base = file_contents.as_ptr() as usize;

    let mut starts = Vec::new();
    let mut idx = 0usize;
    while let Some(first_line) = lines.get(idx) {
        starts.push(first_line.as_ptr() as usize - base);

        // Header lines 1-6 are irrelevant for the length; line 7 holds the
        // number of atom types.
        let natm_types = match lines
            .get(idx + 6)
            .and_then(|line| crate::parser::parse_line_of_n::<usize>(line, 1).ok())
            .and_then(|values| values.first().copied())
        {
            Some(count) => count,
            None => break,
        };

        // Header line 8: number of atoms for each type.
        let natms_per_type: Vec<usize> = match lines
            .get(idx + 7)
            .and_then(|line| crate::parser::parse_line_of_n::<usize>(line, natm_types).ok())
        {
            Some(counts) => counts,
            None => break,
        };

        // The counts come from the file, so saturate instead of overflowing;
        // a saturated index simply ends the scan.
        let total_atoms = natms_per_type
            .iter()
            .fold(0usize, |acc, &count| acc.saturating_add(count));
        // Each atom type contributes a symbol line and a "Coordinates..." line.
        let block_lines = total_atoms.saturating_add(natm_types.saturating_mul(2));

        // Nine header lines (including the masses line) plus the coordinate blocks.
        idx = idx.saturating_add(9).saturating_add(block_lines);

        // Optional velocity section: blank separator plus blocks of the same size.
        if lines.get(idx).map_or(false, |line| line.trim().is_empty()) {
            idx = idx.saturating_add(1).saturating_add(block_lines);
        }
    }
    starts
}