Skip to main content

readcon_core/
parser.rs

1use crate::error::ParseError;
2use crate::types::{AtomDatum, ConFrame, FrameHeader};
3use std::iter::Peekable;
4use std::rc::Rc;
5
6/// Parses a line of whitespace-separated f64 values using fast-float2.
7///
8/// This is the hot-path parser for coordinate and velocity lines. It uses
9/// `fast_float2::parse` instead of `str::parse::<f64>()` for better throughput
10/// on the numeric-heavy atom data lines.
11///
12/// # Arguments
13///
14/// * `line` - A string slice representing a single line of data.
15/// * `n` - The exact number of f64 values expected on the line.
16pub fn parse_line_of_n_f64(line: &str, n: usize) -> Result<Vec<f64>, ParseError> {
17    let mut values = Vec::with_capacity(n);
18    for token in line.split_ascii_whitespace() {
19        let val: f64 = fast_float2::parse(token)
20            .map_err(|_| ParseError::InvalidNumberFormat(format!("invalid float: {token}")))?;
21        values.push(val);
22    }
23    if values.len() == n {
24        Ok(values)
25    } else {
26        Err(ParseError::InvalidVectorLength {
27            expected: n,
28            found: values.len(),
29        })
30    }
31}
32
33/// Parses a line of whitespace-separated values into a vector of a specific type.
34///
35/// This generic helper function takes a string slice, splits it by whitespace,
36/// and attempts to parse each substring into the target type `T`. The type `T`
37/// must implement `std::str::FromStr`.
38///
39/// # Arguments
40///
41/// * `line` - A string slice representing a single line of data.
42/// * `n` - The exact number of values expected on the line.
43///
44/// # Errors
45///
46/// * `ParseError::InvalidVectorLength` if the number of parsed values is not equal to `n`.
47/// * Propagates any error from the `parse()` method of the target type `T`.
48///
49/// # Example
50///
51/// ```
52/// use readcon_core::parser::parse_line_of_n;
53/// let line = "10.5 20.0 30.5";
54/// let values: Vec<f64> = parse_line_of_n(line, 3).unwrap();
55/// assert_eq!(values, vec![10.5, 20.0, 30.5]);
56///
57/// let result = parse_line_of_n::<i32>(line, 2);
58/// assert!(result.is_err());
59/// ```
60pub fn parse_line_of_n<T: std::str::FromStr>(line: &str, n: usize) -> Result<Vec<T>, ParseError>
61where
62    ParseError: From<<T as std::str::FromStr>::Err>,
63{
64    let values: Vec<T> = line
65        .split_whitespace()
66        .map(|s| s.parse::<T>())
67        .collect::<Result<_, _>>()?;
68
69    if values.len() == n {
70        Ok(values)
71    } else {
72        Err(ParseError::InvalidVectorLength {
73            expected: n,
74            found: values.len(),
75        })
76    }
77}
78
79/// Parses the 9-line header of a `.con` file frame from an iterator.
80///
81/// This function consumes the next 9 lines from the given line iterator to
82/// construct a `FrameHeader`. The iterator is advanced by 9 lines on success.
83///
84/// # Arguments
85///
86/// * `lines` - A mutable reference to an iterator that yields string slices.
87///
88/// # Errors
89///
90/// * `ParseError::IncompleteHeader` if the iterator has fewer than 9 lines remaining.
91/// * Propagates any errors from `parse_line_of_n` if the numeric data within
92///   the header is malformed.
93///
94/// # Panics
95///
96/// This function will panic if the intermediate vectors for box dimensions or angles,
97/// after being successfully parsed, cannot be converted into fixed-size arrays.
98/// This should not happen if `parse_line_of_n` is used correctly with `n=3`.
99pub fn parse_frame_header<'a>(
100    lines: &mut impl Iterator<Item = &'a str>,
101) -> Result<FrameHeader, ParseError> {
102    let prebox1 = lines
103        .next()
104        .ok_or(ParseError::IncompleteHeader)?
105        .to_string();
106    let prebox2 = lines
107        .next()
108        .ok_or(ParseError::IncompleteHeader)?
109        .to_string();
110    let boxl_vec = parse_line_of_n_f64(lines.next().ok_or(ParseError::IncompleteHeader)?, 3)?;
111    let angles_vec = parse_line_of_n_f64(lines.next().ok_or(ParseError::IncompleteHeader)?, 3)?;
112    let postbox1 = lines
113        .next()
114        .ok_or(ParseError::IncompleteHeader)?
115        .to_string();
116    let postbox2 = lines
117        .next()
118        .ok_or(ParseError::IncompleteHeader)?
119        .to_string();
120    let natm_types =
121        parse_line_of_n::<usize>(lines.next().ok_or(ParseError::IncompleteHeader)?, 1)?[0];
122    let natms_per_type = parse_line_of_n::<usize>(
123        lines.next().ok_or(ParseError::IncompleteHeader)?,
124        natm_types,
125    )?;
126    let masses_per_type = parse_line_of_n_f64(
127        lines.next().ok_or(ParseError::IncompleteHeader)?,
128        natm_types,
129    )?;
130    Ok(FrameHeader {
131        prebox_header: [prebox1, prebox2],
132        boxl: boxl_vec.try_into().unwrap(),
133        angles: angles_vec.try_into().unwrap(),
134        postbox_header: [postbox1, postbox2],
135        natm_types,
136        natms_per_type,
137        masses_per_type,
138    })
139}
140
141/// Parses a complete frame from a `.con` file, including its header and atomic data.
142///
143/// This function first parses the complete frame header and then uses the information within it
144/// (specifically the number of atom types and atoms per type) to parse the subsequent
145/// atom coordinate blocks.
146///
147/// # Arguments
148///
149/// * `lines` - A mutable reference to an iterator that yields string slices for the frame.
150///
151/// # Errors
152///
153/// * `ParseError::IncompleteFrame` if the iterator ends before all expected
154///   atomic data has been read.
155/// * Propagates any errors from the underlying calls to `parse_frame_header` and
156///   `parse_line_of_n`.
157///
158/// # Example
159///
160/// ```
161/// use readcon_core::parser::parse_single_frame;
162///
163/// let frame_text = r#"
164///PREBOX LINE 1
165///PREBOX LINE 2
166///10.0 10.0 10.0
167///90.0 90.0 90.0
168///POSTBOX LINE 1
169///POSTBOX LINE 2
170///2
171///1 1
172///12.011 1.008
173///C
174///Coordinates of Component 1
175///1.0 1.0 1.0 0.0 1
176///H
177///Coordinates of Component 2
178///2.0 2.0 2.0 0.0 2
179/// "#;
180///
181/// let mut lines = frame_text.trim().lines();
182/// let con_frame = parse_single_frame(&mut lines).unwrap();
183///
184/// assert_eq!(con_frame.header.natm_types, 2);
185/// assert_eq!(con_frame.atom_data.len(), 2);
186/// assert_eq!(&*con_frame.atom_data[0].symbol, "C");
187/// assert_eq!(con_frame.atom_data[1].atom_id, 2);
188/// ```
189pub fn parse_single_frame<'a>(
190    lines: &mut impl Iterator<Item = &'a str>,
191) -> Result<ConFrame, ParseError> {
192    let header = parse_frame_header(lines)?;
193    let total_atoms: usize = header.natms_per_type.iter().sum();
194    let mut atom_data = Vec::with_capacity(total_atoms);
195
196    for num_atoms in &header.natms_per_type {
197        // Create a reference-counted string for the symbol once per component.
198        let symbol = Rc::new(
199            lines
200                .next()
201                .ok_or(ParseError::IncompleteFrame)?
202                .trim()
203                .to_string(),
204        );
205        // Consume and discard the "Coordinates of Component X" line.
206        lines.next().ok_or(ParseError::IncompleteFrame)?;
207        for _ in 0..*num_atoms {
208            let coord_line = lines.next().ok_or(ParseError::IncompleteFrame)?;
209            let vals = parse_line_of_n_f64(coord_line, 5)?;
210            atom_data.push(AtomDatum {
211                // This is now a cheap reference-count increment, not a full string clone.
212                symbol: Rc::clone(&symbol),
213                x: vals[0],
214                y: vals[1],
215                z: vals[2],
216                is_fixed: vals[3] != 0.0,
217                atom_id: vals[4] as u64,
218                vx: None,
219                vy: None,
220                vz: None,
221            });
222        }
223    }
224    Ok(ConFrame { header, atom_data })
225}
226
227/// Attempts to parse an optional velocity section following coordinate blocks.
228///
229/// In `.convel` files, after all coordinate blocks there is a blank separator line
230/// followed by per-component velocity blocks with the same structure as coordinate
231/// blocks (symbol line, "Velocities of Component N" line, then atom lines with
232/// `vx vy vz fixed atomID`).
233///
234/// This function peeks at the next line. If it is blank (or contains only whitespace),
235/// it consumes the blank line and parses velocity data into the existing `atom_data`.
236/// If the next line is not blank (or is absent), no velocities are parsed.
237///
238/// Returns `Ok(true)` if velocities were found and parsed, `Ok(false)` otherwise.
239pub fn parse_velocity_section<'a, I>(
240    lines: &mut Peekable<I>,
241    header: &FrameHeader,
242    atom_data: &mut [AtomDatum],
243) -> Result<bool, ParseError>
244where
245    I: Iterator<Item = &'a str>,
246{
247    // Peek at the next line to check for blank separator
248    match lines.peek() {
249        Some(line) if line.trim().is_empty() => {
250            // Consume the blank separator
251            lines.next();
252        }
253        _ => return Ok(false),
254    }
255
256    let mut atom_idx = 0;
257    for (type_idx, &num_atoms) in header.natms_per_type.iter().enumerate() {
258        // Symbol line
259        let _symbol = lines
260            .next()
261            .ok_or(ParseError::IncompleteVelocitySection)?
262            .trim();
263
264        // "Velocities of Component N" line
265        let comp_line = lines
266            .next()
267            .ok_or(ParseError::IncompleteVelocitySection)?;
268        // Validate it looks like a velocity header (optional strictness)
269        if !comp_line.contains("Velocities of Component") {
270            return Err(ParseError::IncompleteVelocitySection);
271        }
272        let _ = type_idx; // suppress unused warning
273
274        for _ in 0..num_atoms {
275            let vel_line = lines
276                .next()
277                .ok_or(ParseError::IncompleteVelocitySection)?;
278            let vals = parse_line_of_n_f64(vel_line, 5)?;
279            if atom_idx < atom_data.len() {
280                atom_data[atom_idx].vx = Some(vals[0]);
281                atom_data[atom_idx].vy = Some(vals[1]);
282                atom_data[atom_idx].vz = Some(vals[2]);
283                // vals[3] is fixed flag, vals[4] is atom_id (redundant with coords)
284            }
285            atom_idx += 1;
286        }
287    }
288
289    Ok(true)
290}
291
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the nine header lines of a frame. The first six lines are the
    /// same in every test; only the type count, the per-type atom counts and
    /// the per-type masses vary.
    fn header_lines(
        natm_types: &'static str,
        natms_per_type: &'static str,
        masses_per_type: &'static str,
    ) -> Vec<&'static str> {
        vec![
            "PREBOX1",
            "PREBOX2",
            "10.0 20.0 30.0",
            "90.0 90.0 90.0",
            "POSTBOX1",
            "POSTBOX2",
            natm_types,
            natms_per_type,
            masses_per_type,
        ]
    }

    #[test]
    fn test_parse_line_of_n_success() {
        let parsed = parse_line_of_n::<f64>("1.0 2.5 -3.0", 3).unwrap();
        assert_eq!(parsed, vec![1.0, 2.5, -3.0]);
    }

    #[test]
    fn test_parse_line_of_n_too_short() {
        let outcome = parse_line_of_n::<f64>("1.0 2.5", 3);
        assert!(matches!(
            outcome,
            Err(ParseError::InvalidVectorLength {
                expected: 3,
                found: 2
            })
        ));
    }

    #[test]
    fn test_parse_line_of_n_too_long() {
        let outcome = parse_line_of_n::<f64>("1.0 2.5 -3.0 4.0", 3);
        assert!(matches!(
            outcome,
            Err(ParseError::InvalidVectorLength {
                expected: 3,
                found: 4
            })
        ));
    }

    #[test]
    fn test_parse_line_of_n_invalid_float() {
        let outcome = parse_line_of_n::<f64>("1.0 abc -3.0", 3);
        assert!(matches!(outcome, Err(ParseError::InvalidNumberFormat(_))));
    }

    #[test]
    fn test_parse_frame_header_success() {
        let mut input = header_lines("2", "1 1", "12.011 1.008").into_iter();
        let header = parse_frame_header(&mut input).unwrap_or_else(|e| {
            panic!(
                "Parsing failed when it should have succeeded. Error: {:?}",
                e
            )
        });

        assert_eq!(header.prebox_header, ["PREBOX1", "PREBOX2"]);
        assert_eq!(header.boxl, [10.0, 20.0, 30.0]);
        assert_eq!(header.angles, [90.0, 90.0, 90.0]);
        assert_eq!(header.postbox_header, ["POSTBOX1", "POSTBOX2"]);
        assert_eq!(header.natm_types, 2);
        assert_eq!(header.natms_per_type, vec![1, 1]);
        assert_eq!(header.masses_per_type, vec![12.011, 1.008]);
    }

    #[test]
    fn test_parse_frame_header_missing_line() {
        let mut lines = header_lines("2", "1 1", "12.011 1.008");
        // Drop the masses_per_type line so that only eight lines remain.
        lines.pop();

        let mut input = lines.into_iter();
        let outcome = parse_frame_header(&mut input);
        assert!(matches!(outcome, Err(ParseError::IncompleteHeader)));
    }

    #[test]
    fn test_parse_frame_header_invalid_natms_per_type() {
        // Three per-type counts although natm_types is 2.
        let mut input = header_lines("2", "1 1 1", "12.011 1.008").into_iter();
        let outcome = parse_frame_header(&mut input);
        assert!(matches!(
            outcome,
            Err(ParseError::InvalidVectorLength {
                expected: 2,
                found: 3
            })
        ));
    }

    #[test]
    fn test_parse_single_frame_success() {
        let mut lines = header_lines("2", "3 3", "12.011 1.008");
        lines.extend([
            "1",
            "Coordinates of Component 1",
            "0.0 0.0 0.0 0.0 1",
            "1.0940 0.0 0.0 0.0 2",
            "-0.5470 0.9499 0.0 0.0 3",
            "2",
            "Coordinates of Component 2",
            "5.0 5.0 5.0 0.0 4",
            "6.0940 5.0 5.0 0.0 5",
            "5.5470 5.9499 5.0 0.0 6",
        ]);

        let mut input = lines.into_iter();
        let frame = parse_single_frame(&mut input).unwrap();

        assert_eq!(frame.header.natm_types, 2);
        assert_eq!(frame.header.natms_per_type, vec![3, 3]);
        assert_eq!(frame.header.masses_per_type, vec![12.011, 1.008]);
        assert_eq!(frame.atom_data.len(), 6);

        let first = &frame.atom_data[0];
        assert_eq!(&*first.symbol, "1");
        assert_eq!(first.atom_id, 1);

        let last = &frame.atom_data[5];
        assert_eq!(&*last.symbol, "2");
        assert_eq!(last.atom_id, 6);
    }

    #[test]
    fn test_parse_single_frame_missing_line() {
        let mut lines = header_lines("2", "3 3", "12.011 1.008");
        // Only the first component is present; the symbol line of the second
        // component is missing.
        lines.extend([
            "1",
            "Coordinates of Component 1",
            "0.0 0.0 0.0 0.0 1",
            "1.0940 0.0 0.0 0.0 2",
            "-0.5470 0.9499 0.0 0.0 3",
        ]);

        let mut input = lines.into_iter();
        let outcome = parse_single_frame(&mut input);
        assert!(matches!(outcome, Err(ParseError::IncompleteFrame)));
    }

    #[test]
    fn test_parse_single_frame_invalid_atom_coords() {
        let mut lines = header_lines("2", "3 3", "12.011 1.008");
        lines.extend([
            "1",
            "Coordinates of Component 1",
            "0.0 0.0 0.0 0.0 1",
            "1.0940 0.0 0.0 0.0 2",
            "-0.5470 0.9499 0.0 0.0 3",
            "2",
            "Coordinates of Component 2",
            "5.0 5.0 5.0 0.0", // Missing atom_id
            "6.0940 5.0 5.0 0.0 5",
            "5.5470 5.9499 5.0 0.0 6",
        ]);

        let mut input = lines.into_iter();
        let outcome = parse_single_frame(&mut input);
        assert!(matches!(
            outcome,
            Err(ParseError::InvalidVectorLength {
                expected: 5,
                found: 4
            })
        ));
    }

    #[test]
    fn test_parse_velocity_section_present() {
        let mut lines = header_lines("2", "1 1", "63.546 1.008");
        lines.extend([
            "Cu",
            "Coordinates of Component 1",
            "0.0 0.0 0.0 1.0 0",
            "H",
            "Coordinates of Component 2",
            "1.0 2.0 3.0 0.0 1",
            "",
            "Cu",
            "Velocities of Component 1",
            "0.1 0.2 0.3 1.0 0",
            "H",
            "Velocities of Component 2",
            "0.4 0.5 0.6 0.0 1",
        ]);
        let mut input = lines.into_iter().peekable();

        // The coordinate part (header plus two blocks, 15 lines) comes first.
        let mut frame = parse_single_frame(&mut input).expect("coordinate parsing should succeed");
        assert!(!frame.has_velocities());

        // The remaining lines are the blank separator and the velocity blocks.
        let has_vel = parse_velocity_section(&mut input, &frame.header, &mut frame.atom_data)
            .expect("velocity parsing should succeed");
        assert!(has_vel);

        let velocity_of = |index: usize| {
            let atom = &frame.atom_data[index];
            (atom.vx, atom.vy, atom.vz)
        };
        assert_eq!(velocity_of(0), (Some(0.1), Some(0.2), Some(0.3)));
        assert_eq!(velocity_of(1), (Some(0.4), Some(0.5), Some(0.6)));
    }

    #[test]
    fn test_parse_velocity_section_absent() {
        let mut lines = header_lines("1", "1", "12.011");
        lines.extend(["C", "Coordinates of Component 1", "0.0 0.0 0.0 0.0 1"]);
        let mut input = lines.into_iter().peekable();

        let mut frame = parse_single_frame(&mut input).expect("parse should succeed");
        let has_vel = parse_velocity_section(&mut input, &frame.header, &mut frame.atom_data)
            .expect("should succeed with no velocities");

        assert!(!has_vel);
        assert_eq!(frame.atom_data[0].vx, None);
    }
}