mod file_iterator;
mod find_files;
mod schema;
use std::path::{Path, PathBuf};
use ahash::{HashMap, HashMapExt};
use eyre::Result;
use itertools::Itertools;
use smallvec::{smallvec, SmallVec};
pub use schema::*;
/// Inline capacity of the `SmallVec`s that hold carry-over column names and
/// positions; the longest list built in `get_carry_over_columns` has 5 entries.
const MAX_CARRYOVER_VALUES: usize = 6;
use self::{file_iterator::File, find_files::find_data_files};
/// Per-file bookkeeping stored in `Files`, one entry per base file name.
#[derive(Clone, Default)]
struct FileMetadata {
/// Paths of every `.gz` file found for this base name; sorted in `Files::new`.
locations: Vec<PathBuf>,
/// Column names for this file, filled in from the `MRFILES` listing
/// (empty until `Files::init_file_columns` finds a matching record).
columns: Vec<String>,
/// Column positions computed by `get_carry_over_columns` for this file.
carry_over_columns: CarryOverColumns,
}
/// Index of the data files discovered under a directory, with per-file
/// column metadata.
pub struct Files {
// Keyed by base file name, i.e. the file name up to (not including) its first `.`.
files: HashMap<String, FileMetadata>,
}
impl Files {
pub fn new(dir: &Path) -> Result<Self> {
let dir = find_data_files(dir)?;
let mut files = HashMap::new();
let entries = glob::glob(&format!("{}/*/*.gz", dir.display()))?;
for file in entries {
let file = file?;
if !file.metadata()?.is_file() {
continue;
}
let name = file.file_name().unwrap_or_default().to_string_lossy();
let base_name = name.split('.').next().unwrap_or_default().to_string();
files
.entry(base_name)
.or_insert_with(FileMetadata::default)
.locations
.push(file);
}
for (_, file) in files.iter_mut() {
file.locations.sort_unstable();
}
let mut slf = Self { files };
slf.init_file_columns()?;
Ok(slf)
}
pub fn get_file_stream(&self, filename: &str) -> Result<File> {
let locations = self
.files
.get(filename)
.ok_or_else(|| eyre::eyre!("No file named {}", filename,))?;
File::new(locations)
}
fn init_file_columns(&mut self) -> Result<()> {
let mut mrfiles = self.get_file_stream("MRFILES")?;
for line in mrfiles.records() {
let line = line?;
let filename = line.get(0).unwrap_or_default();
let basename = filename.split('.').next().unwrap_or_default();
let columns = line.get(2).unwrap_or_default();
let columns = columns
.split(',')
.map(|s| s.to_string())
.collect::<Vec<_>>();
if let Some(f) = self.files.get_mut(basename) {
f.columns = columns;
f.carry_over_columns = get_carry_over_columns(basename, &f.columns);
}
}
Ok(())
}
}
/// Column positions selected for a file by `get_carry_over_columns`.
// NOTE(review): how consumers use these positions is not visible in this file.
#[derive(Clone, Default)]
struct CarryOverColumns {
/// Position of the `PTR` column; only looked up for `MRHIER`, `None` otherwise
/// or when the column is absent.
ptr_column: Option<u8>,
/// Positions of the file's carry-over columns that are present, in ascending order.
columns: SmallVec<[u8; MAX_CARRYOVER_VALUES]>,
}
/// Resolves the carry-over column positions for the file named `basename`.
///
/// `columns` is the file's ordered list of column names. A fixed set of
/// column names is associated with `MRSAT`, `MRHIER` and `MRREL`; every other
/// file gets an empty set. Names missing from `columns` are skipped, and the
/// positions that are found are returned in ascending order. For `MRHIER` the
/// position of the `PTR` column is reported as well.
///
/// Positions are stored as `u8`. A position that does not fit (256 or above)
/// is treated as absent rather than being wrapped to an unrelated column.
fn get_carry_over_columns(basename: &str, columns: &[String]) -> CarryOverColumns {
    // Position of `name` within `columns`, or `None` when the column is
    // missing or its position cannot be represented as a `u8`.
    let index_of = |name: &str| -> Option<u8> {
        columns
            .iter()
            .position(|column| column.as_str() == name)
            .and_then(|position| <u8 as std::convert::TryFrom<usize>>::try_from(position).ok())
    };

    let (ptr, column_names): (bool, SmallVec<[&str; MAX_CARRYOVER_VALUES]>) = match basename {
        "MRSAT" => (false, smallvec!["CUI", "METAUI", "STYPE", "SAB"]),
        "MRHIER" => (true, smallvec!["CUI", "AUI", "SAB", "RELA"]),
        "MRREL" => (false, smallvec!["CUI1", "AUI1", "STYPE1", "STYPE2", "SAB"]),
        _ => (false, smallvec![]),
    };

    CarryOverColumns {
        ptr_column: if ptr { index_of("PTR") } else { None },
        columns: column_names
            .into_iter()
            .filter_map(index_of)
            .sorted()
            .collect(),
    }
}