alpha_g_analysis/
lib.rs

1use midasio::file::{initial_timestamp_unchecked, run_number_unchecked, TryFileViewFromBytesError};
2use std::ffi::{OsStr, OsString};
3use std::io::Read;
4use std::path::{Path, PathBuf};
5use thiserror::Error;
6
/// Known ALPHA-g file extensions.
///
/// Values are obtained from a path's extension through the
/// `TryFrom<&OsStr>` implementation.
#[derive(Clone, Copy, Debug)]
enum Extension {
    /// The `mid` extension.
    Mid,
    /// The `lz4` extension.
    Lz4,
    // If a new extension is added, remember to update the `TryFrom<&OsStr>`
    // implementation.
}
15
/// The error type returned when conversion from [`OsStr`] to a known ALPHA-g
/// file extension fails.
///
/// The error message includes the debug representation of the rejected
/// extension.
#[derive(Debug, Error)]
#[error("unknown conversion from `{extension:?}`")]
pub struct TryExtensionFromOsStrError {
    // Owned copy of the extension that could not be converted.
    extension: OsString,
}
23
24impl TryFrom<&OsStr> for Extension {
25    type Error = TryExtensionFromOsStrError;
26
27    fn try_from(extension: &OsStr) -> Result<Self, Self::Error> {
28        match extension.to_str() {
29            Some("mid") => Ok(Self::Mid),
30            Some("lz4") => Ok(Self::Lz4),
31            _ => Err(TryExtensionFromOsStrError {
32                extension: extension.to_owned(),
33            }),
34        }
35    }
36}
37
/// The error type for I/O operations on ALPHA-g files.
#[derive(Debug, Error)]
pub enum AlphaIOError {
    /// The error type for I/O operations of the Read, Write, Seek, and
    /// associated traits.
    #[error("io error")]
    IoError(#[from] std::io::Error),
    /// Unknown file extension.
    #[error("unknown file extension")]
    UnknownExtension(#[from] TryExtensionFromOsStrError),
    /// MIDAS file format error.
    #[error("midas file format error")]
    MidasFileFormatError(#[from] TryFileViewFromBytesError),
    /// Bad run number.
    #[error("bad run number in `{}` (expected `{expected}`, found `{found}`)", .path.display())]
    BadRunNumber {
        // Path of the file with the unexpected run number.
        path: PathBuf,
        // Run number the file was required to have.
        expected: u32,
        // Run number actually read from the file.
        found: u32,
    },
    /// Duplicate files by their initial timestamp.
    // `path1` and `path2` are the two files sharing the same initial timestamp.
    #[error("duplicate initial timestamp in `{}` and `{}`", .path1.display(), .path2.display())]
    DuplicateInitialTimestamp { path1: PathBuf, path2: PathBuf },
}
62
63/// Read the entire contents of a file (auto-detecting compression).
64///
65/// The compression algorithm is detected based on the file extension. This is a
66/// convenience function for using [`std::fs::read`] and handling the known
67/// compression algorithms used to store ALPHA-g data.
68pub fn read<P: AsRef<Path>>(path: P) -> Result<Vec<u8>, AlphaIOError> {
69    match Extension::try_from(path.as_ref().extension().unwrap_or_default())? {
70        Extension::Mid => Ok(std::fs::read(&path)?),
71        Extension::Lz4 => {
72            let file = std::fs::File::open(&path)?;
73            let mut decoder = lz4::Decoder::new(file)?;
74            let mut contents = Vec::new();
75            std::io::copy(&mut decoder, &mut contents)?;
76            Ok(contents)
77        }
78    }
79}
80
81/// Sort all the files of an individual run by their initial ODB dump timestamp.
82///
83/// Returns an error if:
84/// - Not all files correspond to the same run number.
85/// - Two files have the same initial timestamp.
86///
87/// # Panics
88///
89/// Panics if the input iterator is empty.
90pub fn sort_run_files<P: AsRef<Path>>(
91    files: impl IntoIterator<Item = P>,
92) -> Result<(u32, Vec<P>), AlphaIOError> {
93    let mut files = files
94        .into_iter()
95        .map(|path| {
96            let mut file = std::fs::File::open(&path)?;
97            // The first 12 bytes contain both the run number and the initial
98            // timestamp.
99            let mut buffer = [0; 12];
100            match Extension::try_from(path.as_ref().extension().unwrap_or_default())? {
101                Extension::Mid => {
102                    file.read_exact(&mut buffer)?;
103                }
104                Extension::Lz4 => {
105                    let mut decoder = lz4::Decoder::new(&mut file)?;
106                    decoder.read_exact(&mut buffer)?;
107                }
108            }
109
110            let run_number = run_number_unchecked(&buffer)?;
111            let initial_timestamp = initial_timestamp_unchecked(&buffer)?;
112
113            Ok((run_number, initial_timestamp, path))
114        })
115        .collect::<Result<Vec<_>, AlphaIOError>>()?;
116
117    assert!(!files.is_empty());
118    let expected_run_number = files[0].0;
119    for (run_number, _, path) in &files {
120        if *run_number != expected_run_number {
121            return Err(AlphaIOError::BadRunNumber {
122                path: path.as_ref().to_owned(),
123                expected: expected_run_number,
124                found: *run_number,
125            });
126        }
127    }
128
129    files.sort_unstable_by_key(|(_, initial_timestamp, _)| *initial_timestamp);
130    // These are sorted, so it is enough to check for consecutive duplicates.
131    for window in files.windows(2) {
132        if window[0].1 == window[1].1 {
133            return Err(AlphaIOError::DuplicateInitialTimestamp {
134                path1: window[0].2.as_ref().to_owned(),
135                path2: window[1].2.as_ref().to_owned(),
136            });
137        }
138    }
139
140    Ok((
141        expected_run_number,
142        files.into_iter().map(|(_, _, path)| path).collect(),
143    ))
144}