use crate::{standardize_collection, CollectionData, LblError};
use polars::prelude::*;
use std::collections::HashMap;
use std::fs::File;
use std::path::{Path, PathBuf};
/// Loads a single data file and runs it through `standardize_collection`.
///
/// # Arguments
///
/// * `path` - File to read; the reader is chosen by [`load_raw_data`] from the
///   file extension.
/// * `metadata` - Collection metadata to standardize against. When `None`, the
///   metadata is derived from the directory layout of `path` via
///   [`parse_path_metadata`].
///
/// # Errors
///
/// Returns any error produced by [`load_raw_data`] or by
/// `standardize_collection`.
pub fn load_file(path: PathBuf, metadata: Option<&CollectionData>) -> Result<DataFrame, LblError> {
    // Declared up front so that path-derived metadata outlives the `match`
    // and can be handed out by reference just like caller-supplied metadata.
    let inferred;
    let metadata = match metadata {
        Some(explicit) => explicit,
        None => {
            inferred = parse_path_metadata(&path);
            &inferred
        }
    };

    // `path` is not needed after this point, so it is moved rather than cloned.
    let raw = load_raw_data(path)?;
    standardize_collection(raw, metadata)
}
/// Derives collection metadata from the directories that contain `path`.
///
/// For a path shaped like `<network>/<collection>/<file>`, the name of the
/// immediate parent directory is stored in `collection` and the name of the
/// grandparent directory in `network`. A field is left at its
/// `CollectionData::default()` value whenever the corresponding directory (or
/// its final name component) does not exist. Names that are not valid UTF-8
/// are converted lossily.
pub fn parse_path_metadata(path: &Path) -> CollectionData {
    /// Final component of `dir` as an owned, lossily converted string.
    fn dir_name(dir: &Path) -> Option<String> {
        dir.file_name()
            .map(|name| name.to_string_lossy().into_owned())
    }

    let collection_dir = path.parent();
    let network_dir = collection_dir.and_then(Path::parent);

    let mut data = CollectionData::default();
    // Only overwrite a field when a name was actually found, so the defaults
    // survive otherwise.
    if let Some(collection) = collection_dir.and_then(dir_name) {
        data.collection = Some(collection);
    }
    if let Some(network) = network_dir.and_then(dir_name) {
        data.network = Some(network);
    }
    data
}
/// A collection of file paths that can be flattened into one list.
///
/// This trait is the bound on the `paths` argument of `load_files`. This file
/// implements it for `Vec<PathBuf>`, for `HashMap<String, Vec<PathBuf>>`, and
/// for `HashMap<String, HashMap<String, Vec<PathBuf>>>`.
pub trait FlattenPathBufVec {
/// Returns every path held by `self` as a single owned `Vec<PathBuf>`.
fn flatten_into_pathbuf(&self) -> Vec<PathBuf>;
}
/// A plain list of paths is already flat.
impl FlattenPathBufVec for Vec<PathBuf> {
    /// Returns an owned copy of the list, preserving element order.
    fn flatten_into_pathbuf(&self) -> Vec<PathBuf> {
        self.to_vec()
    }
}
/// Flattens a one-level grouping of paths, discarding the group keys.
impl FlattenPathBufVec for HashMap<String, Vec<PathBuf>> {
    /// Concatenates the path lists of all entries.
    ///
    /// Paths from the same entry stay in their original relative order; the
    /// order in which entries are visited follows the map's iteration order.
    fn flatten_into_pathbuf(&self) -> Vec<PathBuf> {
        self.values().flatten().cloned().collect()
    }
}
/// Flattens a two-level grouping of paths, discarding both levels of keys.
impl FlattenPathBufVec for HashMap<String, HashMap<String, Vec<PathBuf>>> {
    /// Concatenates every path list found in every inner map.
    ///
    /// Paths from the same list stay in their original relative order; the
    /// order in which outer and inner entries are visited follows each map's
    /// iteration order.
    fn flatten_into_pathbuf(&self) -> Vec<PathBuf> {
        let mut flattened = Vec::new();
        for inner_map in self.values() {
            for group in inner_map.values() {
                flattened.extend(group.iter().cloned());
            }
        }
        flattened
    }
}
pub fn load_files<T: FlattenPathBufVec>(
paths: &T,
metadata: Option<&CollectionData>,
) -> Result<DataFrame, LblError> {
let paths = paths.flatten_into_pathbuf();
let dfs: Result<Vec<DataFrame>, LblError> =
paths.into_iter().map(|p| load_file(p, metadata)).collect();
let mut dfs = dfs?;
let mut final_df = dfs.remove(0);
for df in dfs {
final_df = final_df.vstack(&df)?;
}
Ok(final_df)
}
/// Reads a file into a `DataFrame`, choosing the reader from the file
/// extension.
///
/// Supported extensions are `csv` and `parquet`, matched ASCII
/// case-insensitively so that names such as `data.CSV` are accepted too.
///
/// # Errors
///
/// * `LblError::LblError("Unsupported file type: …")` when the extension is
///   missing, not valid UTF-8, or not one of the supported types. The file is
///   not opened in that case.
/// * Any error from opening the file or from the underlying reader.
pub fn load_raw_data(path: PathBuf) -> Result<DataFrame, LblError> {
    // A missing or non-UTF-8 extension is treated as the empty string and
    // falls through to the "unsupported" arm below.
    let extension = path
        .extension()
        .and_then(std::ffi::OsStr::to_str)
        .unwrap_or("");

    match extension.to_ascii_lowercase().as_str() {
        "csv" => {
            let file = File::open(&path)?;
            Ok(CsvReader::new(file).finish()?)
        }
        "parquet" => {
            let file = File::open(&path)?;
            Ok(ParquetReader::new(file).finish()?)
        }
        // Report the extension exactly as it appears in the path.
        _ => Err(LblError::LblError(format!(
            "Unsupported file type: {}",
            extension
        ))),
    }
}