readcon_core/iterators.rs
1//=============================================================================
2// The Public API - A clean iterator for users of our library
3//=============================================================================
4
5use crate::parser::{parse_single_frame, parse_velocity_section};
6use crate::{error, types};
7use std::iter::Peekable;
8use std::path::Path;
9
/// Lazy, frame-at-a-time reader over the text of a `.con` or `.convel` file.
///
/// The iterator borrows the file contents as a string slice and walks it line
/// by line. Each call to `next()` tries to assemble one complete `ConFrame`
/// from the lines that follow the current position. A velocity section is
/// picked up automatically: when a blank line follows the coordinate blocks,
/// the velocity data after it is parsed into the frame's atoms.
///
/// Items are `Result<ConFrame, ParseError>`, so a malformed frame is reported
/// to the caller as an error value rather than aborting iteration.
pub struct ConFrameIterator<'a> {
    /// Cursor over the remaining lines of the borrowed file text. It is
    /// peekable so that end-of-input and the blank velocity separator can be
    /// detected without consuming a line.
    lines: Peekable<std::str::Lines<'a>>,
}
23
24impl<'a> ConFrameIterator<'a> {
25 /// Creates a new `ConFrameIterator` from a string slice of the entire file.
26 ///
27 /// # Arguments
28 ///
29 /// * `file_contents` - A string slice containing the text of one or more `.con` frames.
30 pub fn new(file_contents: &'a str) -> Self {
31 ConFrameIterator {
32 lines: file_contents.lines().peekable(),
33 }
34 }
35
36 /// Skips the next frame without fully parsing its atomic data.
37 ///
38 /// This is more efficient than `next()` if you only need to advance the
39 /// iterator. It reads the frame's header to determine how many lines to skip,
40 /// including any velocity section if present.
41 ///
42 /// # Returns
43 ///
44 /// * `Some(Ok(()))` on a successful skip.
45 /// * `Some(Err(ParseError::...))` if there's an error parsing the header.
46 /// * `None` if the iterator is already at the end.
47 pub fn forward(&mut self) -> Option<Result<(), error::ParseError>> {
48 // Skip frame by parsing only required header fields to avoid full parsing overhead
49 if self.lines.peek().is_none() {
50 return None;
51 }
52
53 // Manually consume the first 6 lines of the header, which we don't need for skipping.
54 for _ in 0..6 {
55 if self.lines.next().is_none() {
56 return Some(Err(error::ParseError::IncompleteHeader));
57 }
58 }
59
60 // Line 7: natm_types. We need to parse this.
61 let natm_types: usize = match self.lines.next() {
62 Some(line) => match crate::parser::parse_line_of_n::<usize>(line, 1) {
63 Ok(v) => v[0],
64 Err(e) => return Some(Err(e)),
65 },
66 None => return Some(Err(error::ParseError::IncompleteHeader)),
67 };
68
69 // Line 8: natms_per_type. We need this to sum the total number of atoms.
70 let natms_per_type: Vec<usize> = match self.lines.next() {
71 Some(line) => match crate::parser::parse_line_of_n(line, natm_types) {
72 Ok(v) => v,
73 Err(e) => return Some(Err(e)),
74 },
75 None => return Some(Err(error::ParseError::IncompleteHeader)),
76 };
77
78 // Line 9: masses_per_type. We just need to consume this line.
79 if self.lines.next().is_none() {
80 return Some(Err(error::ParseError::IncompleteHeader));
81 }
82
83 // Calculate how many more lines to skip for coordinate blocks.
84 let total_atoms: usize = natms_per_type.iter().sum();
85 // For each atom type, there is a symbol line and a "Coordinates..." line.
86 let non_atom_lines = natm_types * 2;
87 let lines_to_skip = total_atoms + non_atom_lines;
88
89 // Advance the iterator by skipping the coordinate block lines.
90 for _ in 0..lines_to_skip {
91 if self.lines.next().is_none() {
92 // The file ended before the header's promise was fulfilled.
93 return Some(Err(error::ParseError::IncompleteFrame));
94 }
95 }
96
97 // Check for an optional velocity section (blank separator followed by
98 // velocity blocks with the same structure as coordinate blocks).
99 if let Some(line) = self.lines.peek() {
100 if line.trim().is_empty() {
101 // Consume the blank separator
102 self.lines.next();
103 // Skip the velocity blocks: same structure as coordinate blocks
104 let vel_lines_to_skip = total_atoms + non_atom_lines;
105 for _ in 0..vel_lines_to_skip {
106 if self.lines.next().is_none() {
107 return Some(Err(error::ParseError::IncompleteVelocitySection));
108 }
109 }
110 }
111 }
112
113 Some(Ok(()))
114 }
115}
116
117impl<'a> Iterator for ConFrameIterator<'a> {
118 /// The type of item yielded by the iterator.
119 ///
120 /// Each item is a `Result` that contains a successfully parsed `ConFrame` or a
121 /// `ParseError` if the frame's data is malformed.
122 type Item = Result<types::ConFrame, error::ParseError>;
123
124 /// Advances the iterator and attempts to parse the next frame.
125 ///
126 /// This method will return `None` only when there are no more lines to consume.
127 /// If there are lines but they do not form a complete frame, it will return
128 /// `Some(Err(ParseError::...))`.
129 fn next(&mut self) -> Option<Self::Item> {
130 // If there are no more lines at all, the iterator is exhausted.
131 if self.lines.peek().is_none() {
132 return None;
133 }
134 // Otherwise, attempt to parse the next frame from the available lines.
135 let mut frame = match parse_single_frame(&mut self.lines) {
136 Ok(f) => f,
137 Err(e) => return Some(Err(e)),
138 };
139 // Attempt to parse optional velocity section
140 match parse_velocity_section(&mut self.lines, &frame.header, &mut frame.atom_data) {
141 Ok(_) => {}
142 Err(e) => return Some(Err(e)),
143 }
144 Some(Ok(frame))
145 }
146}
147
148/// Reads all frames from a file using memory-mapped I/O.
149///
150/// This avoids the `read_to_string` allocation for large trajectory files by
151/// letting the OS page cache handle the data. The mmap is read-only and the
152/// file contents are validated as UTF-8 before parsing.
153pub fn read_all_frames(path: &Path) -> Result<Vec<types::ConFrame>, Box<dyn std::error::Error>> {
154 let file = std::fs::File::open(path)?;
155 let mmap = unsafe { memmap2::Mmap::map(&file)? };
156 let contents = std::str::from_utf8(&mmap)?;
157 let iter = ConFrameIterator::new(contents);
158 let frames: Result<Vec<_>, _> = iter.collect();
159 Ok(frames?)
160}
161
/// Parses frames in parallel using rayon, splitting on frame boundaries.
///
/// Phase 1: a single sequential scan reads just enough of each frame header
/// (the atom-type count and the per-type atom counts) to work out how many
/// lines the frame spans, and records the byte offset at which each frame
/// starts.
/// Phase 2: each frame's text slice is parsed independently with rayon.
///
/// Byte offsets are taken from the position of each line slice inside
/// `file_contents`, so they are exact for both `\n` and `\r\n` line endings.
///
/// If a header cannot be read during the scan, scanning stops and everything
/// from that frame's start to the end of the input becomes the final chunk;
/// parsing that chunk then yields the corresponding error in the output.
///
/// The results are returned in file order, one entry per detected frame.
///
/// Requires the `parallel` feature.
#[cfg(feature = "parallel")]
pub fn parse_frames_parallel(
    file_contents: &str,
) -> Vec<Result<types::ConFrame, error::ParseError>> {
    use rayon::prelude::*;

    // Phase 1: locate the byte offset of every frame start.
    let lines: Vec<&str> = file_contents.lines().collect();
    let total_lines = lines.len();
    // Every element of `lines` is a subslice of `file_contents`, so its start
    // address minus the base address is its exact byte offset, regardless of
    // which line terminator preceded it.
    let base_addr = file_contents.as_ptr() as usize;

    let mut boundaries: Vec<usize> = Vec::new();
    let mut line_idx = 0usize;

    while line_idx < total_lines {
        // Record the byte offset of this frame's start.
        boundaries.push(lines[line_idx].as_ptr() as usize - base_addr);

        // Skip 6 header lines (prebox1, prebox2, boxl, angles, postbox1, postbox2).
        if line_idx + 6 >= total_lines {
            break;
        }
        line_idx += 6;

        // Line 7: natm_types
        let natm_types: usize = match lines
            .get(line_idx)
            .map(|l| crate::parser::parse_line_of_n::<usize>(l, 1))
        {
            Some(Ok(v)) => v[0],
            _ => break,
        };
        line_idx += 1;

        // Line 8: natms_per_type
        let natms_per_type: Vec<usize> = match lines
            .get(line_idx)
            .map(|l| crate::parser::parse_line_of_n::<usize>(l, natm_types))
        {
            Some(Ok(v)) => v,
            _ => break,
        };
        line_idx += 1;

        // Line 9: masses (just skip)
        line_idx += 1;

        // Coordinate blocks: one line per atom plus a symbol line and a
        // "Coordinates..." line per atom type. The counts come from the file,
        // so saturate instead of overflowing on corrupt headers.
        let total_atoms = natms_per_type
            .iter()
            .fold(0usize, |acc, &count| acc.saturating_add(count));
        let coord_lines = total_atoms.saturating_add(natm_types.saturating_mul(2));
        line_idx = line_idx.saturating_add(coord_lines);

        // Optional velocity section: blank separator, then blocks of the same
        // size as the coordinate blocks.
        if lines
            .get(line_idx)
            .map_or(false, |l| l.trim().is_empty())
        {
            line_idx = line_idx.saturating_add(1).saturating_add(coord_lines);
        }
    }

    // Phase 2: parse each frame chunk in parallel. Offsets are strictly
    // increasing line starts, so every slice below is in-bounds and falls on
    // a character boundary.
    let num_frames = boundaries.len();
    (0..num_frames)
        .into_par_iter()
        .map(|i| {
            let start = boundaries[i];
            let end = boundaries
                .get(i + 1)
                .copied()
                .unwrap_or(file_contents.len());
            let chunk = &file_contents[start..end];
            match ConFrameIterator::new(chunk).next() {
                Some(result) => result,
                None => Err(error::ParseError::IncompleteFrame),
            }
        })
        .collect()
}