Skip to main content

readcon_core/
iterators.rs

1//=============================================================================
2// The Public API - A clean iterator for users of our library
3//=============================================================================
4
5use crate::parser::{parse_single_frame, parse_velocity_section};
6use crate::{error, types};
7use std::iter::Peekable;
8use std::path::Path;
9
/// A lazy, frame-by-frame parser over the text of a `.con` or `.convel` file.
///
/// The iterator wraps a peekable line iterator over the borrowed input and,
/// on every call to `next`, tries to parse one complete `ConFrame`. After the
/// coordinate blocks of a frame it also attempts to parse an optional
/// velocity section into the frame's atoms.
///
/// Items are `Result<ConFrame, ParseError>`, so a malformed frame is reported
/// as an `Err` item rather than aborting the iteration.
///
/// The type is `Clone` (cloning only copies the cursor into the borrowed
/// text, so a clone can be used to look ahead) and `Debug`.
#[derive(Debug, Clone)]
pub struct ConFrameIterator<'a> {
    /// Remaining, not-yet-consumed lines of the input text.
    lines: Peekable<std::str::Lines<'a>>,
}
23
24impl<'a> ConFrameIterator<'a> {
25    /// Creates a new `ConFrameIterator` from a string slice of the entire file.
26    ///
27    /// # Arguments
28    ///
29    /// * `file_contents` - A string slice containing the text of one or more `.con` frames.
30    pub fn new(file_contents: &'a str) -> Self {
31        ConFrameIterator {
32            lines: file_contents.lines().peekable(),
33        }
34    }
35
36    /// Skips the next frame without fully parsing its atomic data.
37    ///
38    /// This is more efficient than `next()` if you only need to advance the
39    /// iterator. It reads the frame's header to determine how many lines to skip,
40    /// including any velocity section if present.
41    ///
42    /// # Returns
43    ///
44    /// * `Some(Ok(()))` on a successful skip.
45    /// * `Some(Err(ParseError::...))` if there's an error parsing the header.
46    /// * `None` if the iterator is already at the end.
47    pub fn forward(&mut self) -> Option<Result<(), error::ParseError>> {
48        // Skip frame by parsing only required header fields to avoid full parsing overhead
49        if self.lines.peek().is_none() {
50            return None;
51        }
52
53        // Manually consume the first 6 lines of the header, which we don't need for skipping.
54        for _ in 0..6 {
55            if self.lines.next().is_none() {
56                return Some(Err(error::ParseError::IncompleteHeader));
57            }
58        }
59
60        // Line 7: natm_types. We need to parse this.
61        let natm_types: usize = match self.lines.next() {
62            Some(line) => match crate::parser::parse_line_of_n::<usize>(line, 1) {
63                Ok(v) => v[0],
64                Err(e) => return Some(Err(e)),
65            },
66            None => return Some(Err(error::ParseError::IncompleteHeader)),
67        };
68
69        // Line 8: natms_per_type. We need this to sum the total number of atoms.
70        let natms_per_type: Vec<usize> = match self.lines.next() {
71            Some(line) => match crate::parser::parse_line_of_n(line, natm_types) {
72                Ok(v) => v,
73                Err(e) => return Some(Err(e)),
74            },
75            None => return Some(Err(error::ParseError::IncompleteHeader)),
76        };
77
78        // Line 9: masses_per_type. We just need to consume this line.
79        if self.lines.next().is_none() {
80            return Some(Err(error::ParseError::IncompleteHeader));
81        }
82
83        // Calculate how many more lines to skip for coordinate blocks.
84        let total_atoms: usize = natms_per_type.iter().sum();
85        // For each atom type, there is a symbol line and a "Coordinates..." line.
86        let non_atom_lines = natm_types * 2;
87        let lines_to_skip = total_atoms + non_atom_lines;
88
89        // Advance the iterator by skipping the coordinate block lines.
90        for _ in 0..lines_to_skip {
91            if self.lines.next().is_none() {
92                // The file ended before the header's promise was fulfilled.
93                return Some(Err(error::ParseError::IncompleteFrame));
94            }
95        }
96
97        // Check for an optional velocity section (blank separator followed by
98        // velocity blocks with the same structure as coordinate blocks).
99        if let Some(line) = self.lines.peek() {
100            if line.trim().is_empty() {
101                // Consume the blank separator
102                self.lines.next();
103                // Skip the velocity blocks: same structure as coordinate blocks
104                let vel_lines_to_skip = total_atoms + non_atom_lines;
105                for _ in 0..vel_lines_to_skip {
106                    if self.lines.next().is_none() {
107                        return Some(Err(error::ParseError::IncompleteVelocitySection));
108                    }
109                }
110            }
111        }
112
113        Some(Ok(()))
114    }
115}
116
117impl<'a> Iterator for ConFrameIterator<'a> {
118    /// The type of item yielded by the iterator.
119    ///
120    /// Each item is a `Result` that contains a successfully parsed `ConFrame` or a
121    /// `ParseError` if the frame's data is malformed.
122    type Item = Result<types::ConFrame, error::ParseError>;
123
124    /// Advances the iterator and attempts to parse the next frame.
125    ///
126    /// This method will return `None` only when there are no more lines to consume.
127    /// If there are lines but they do not form a complete frame, it will return
128    /// `Some(Err(ParseError::...))`.
129    fn next(&mut self) -> Option<Self::Item> {
130        // If there are no more lines at all, the iterator is exhausted.
131        if self.lines.peek().is_none() {
132            return None;
133        }
134        // Otherwise, attempt to parse the next frame from the available lines.
135        let mut frame = match parse_single_frame(&mut self.lines) {
136            Ok(f) => f,
137            Err(e) => return Some(Err(e)),
138        };
139        // Attempt to parse optional velocity section
140        match parse_velocity_section(&mut self.lines, &frame.header, &mut frame.atom_data) {
141            Ok(_) => {}
142            Err(e) => return Some(Err(e)),
143        }
144        Some(Ok(frame))
145    }
146}
147
/// Size threshold, in bytes, below which we use `read_to_string` instead of mmap.
///
/// For small files, the fixed overhead of mmap (VMA creation, page fault,
/// munmap) is expected to exceed the cost of a simple `read` syscall plus a
/// heap allocation. The value is 64 KiB.
///
/// NOTE(review): the cutoff is a heuristic. An earlier version of this comment
/// attributed it to ripgrep and similar tools; that attribution is unverified,
/// so confirm it (or benchmark) before relying on it.
const MMAP_THRESHOLD: u64 = 64 * 1024;
153
154/// Reads file contents, choosing between `read_to_string` (small files) and
155/// mmap (large files) based on [`MMAP_THRESHOLD`].
156fn read_file_contents(path: &Path) -> Result<FileContents, Box<dyn std::error::Error>> {
157    let file = std::fs::File::open(path)?;
158    let metadata = file.metadata()?;
159    if metadata.len() < MMAP_THRESHOLD {
160        let contents = std::fs::read_to_string(path)?;
161        Ok(FileContents::Owned(contents))
162    } else {
163        let mmap = unsafe { memmap2::Mmap::map(&file)? };
164        Ok(FileContents::Mapped(mmap))
165    }
166}
167
/// Holds file contents either as an owned String or a memory-mapped region.
///
/// Both variants are exposed as text through `as_str`.
enum FileContents {
    /// Text read fully into memory; `read_file_contents` produces this for
    /// files below `MMAP_THRESHOLD`.
    Owned(String),
    /// A memory-mapped view of the file. The bytes are not checked for UTF-8
    /// validity until `as_str` is called.
    Mapped(memmap2::Mmap),
}
173
174impl FileContents {
175    fn as_str(&self) -> Result<&str, std::str::Utf8Error> {
176        match self {
177            FileContents::Owned(s) => Ok(s.as_str()),
178            FileContents::Mapped(m) => std::str::from_utf8(m),
179        }
180    }
181}
182
183/// Reads all frames from a file.
184///
185/// For files smaller than 64 KiB, uses a simple `read_to_string` to avoid
186/// the fixed overhead of mmap (VMA creation, page fault, munmap). For larger
187/// trajectory files, uses memory-mapped I/O to let the OS page cache handle
188/// the data.
189pub fn read_all_frames(path: &Path) -> Result<Vec<types::ConFrame>, Box<dyn std::error::Error>> {
190    let contents = read_file_contents(path)?;
191    let text = contents.as_str()?;
192    let iter = ConFrameIterator::new(text);
193    let frames: Result<Vec<_>, _> = iter.collect();
194    Ok(frames?)
195}
196
197/// Reads only the first frame from a file.
198///
199/// More efficient than `read_all_frames` for single-frame access because it
200/// stops parsing after the first frame rather than collecting all of them.
201pub fn read_first_frame(path: &Path) -> Result<types::ConFrame, Box<dyn std::error::Error>> {
202    let contents = read_file_contents(path)?;
203    let text = contents.as_str()?;
204    let mut iter = ConFrameIterator::new(text);
205    match iter.next() {
206        Some(Ok(frame)) => Ok(frame),
207        Some(Err(e)) => Err(Box::new(e)),
208        None => Err("No frames found in file".into()),
209    }
210}
211
/// Parses frames in parallel using rayon, splitting on frame boundaries.
///
/// Phase 1 is a single sequential scan over the lines. For each frame it
/// reads only the header fields that determine the frame's length (number of
/// atom types and atoms per type) and records the byte offset at which the
/// frame starts.
///
/// Phase 2 slices the input at those offsets and parses every slice
/// independently on the rayon thread pool.
///
/// The result holds one entry per detected frame, in file order. If a header
/// cannot be read during the scan, scanning stops there; the text from that
/// frame's start to the end of the input is still handed to the parser as the
/// final chunk, so the problem surfaces as an `Err` entry.
///
/// Byte offsets are taken from each line's actual position in
/// `file_contents`, so both `\n` and `\r\n` line endings are handled.
///
/// Requires the `parallel` feature.
#[cfg(feature = "parallel")]
pub fn parse_frames_parallel(
    file_contents: &str,
) -> Vec<Result<types::ConFrame, error::ParseError>> {
    use rayon::prelude::*;

    // Phase 1: locate the byte offset of every frame start.
    let lines: Vec<&str> = file_contents.lines().collect();
    let total_lines = lines.len();
    // Every element of `lines` is a subslice of `file_contents`, so the
    // distance between the two start addresses is the line's byte offset.
    let base_addr = file_contents.as_ptr() as usize;

    let mut boundaries: Vec<usize> = Vec::new();
    let mut line_idx = 0usize;

    while line_idx < total_lines {
        boundaries.push(lines[line_idx].as_ptr() as usize - base_addr);

        // Skip 6 header lines (prebox1, prebox2, boxl, angles, postbox1, postbox2).
        if line_idx + 6 >= total_lines {
            break;
        }
        line_idx += 6;

        // Line 7: natm_types.
        let natm_types: usize = match lines
            .get(line_idx)
            .and_then(|&line| crate::parser::parse_line_of_n::<usize>(line, 1).ok())
            .and_then(|values| values.first().copied())
        {
            Some(count) => count,
            None => break,
        };
        line_idx += 1;

        // Line 8: natms_per_type.
        let natms_per_type: Vec<usize> = match lines
            .get(line_idx)
            .and_then(|&line| crate::parser::parse_line_of_n::<usize>(line, natm_types).ok())
        {
            Some(counts) => counts,
            None => break,
        };
        line_idx += 1;

        // Line 9: masses (just skip).
        line_idx += 1;

        // Coordinate blocks: per type, a symbol line and a "Coordinates..."
        // line plus one line per atom. The counts come from the file, so use
        // saturating arithmetic; an absurd header then simply ends the scan.
        let total_atoms = natms_per_type
            .iter()
            .fold(0usize, |acc, &count| acc.saturating_add(count));
        let coord_lines = total_atoms.saturating_add(natm_types.saturating_mul(2));
        line_idx = line_idx.saturating_add(coord_lines);

        // Optional velocity section: blank separator, then blocks of the same size.
        if lines
            .get(line_idx)
            .map_or(false, |line| line.trim().is_empty())
        {
            line_idx = line_idx.saturating_add(1).saturating_add(coord_lines);
        }
    }

    // Each chunk runs from its frame start to the next frame start (or to the
    // end of the input for the last frame). Offsets are line starts, so every
    // slice boundary is a valid char boundary.
    let chunk_ends = boundaries
        .iter()
        .skip(1)
        .copied()
        .chain(std::iter::once(file_contents.len()));
    let chunks: Vec<&str> = boundaries
        .iter()
        .zip(chunk_ends)
        .map(|(&start, end)| &file_contents[start..end])
        .collect();

    // Phase 2: parse each frame chunk in parallel; order is preserved.
    chunks
        .into_par_iter()
        .map(|chunk| {
            ConFrameIterator::new(chunk)
                .next()
                .unwrap_or(Err(error::ParseError::IncompleteFrame))
        })
        .collect()
}