lbl/filesystem/
load.rs

1use crate::{standardize_collection, CollectionData, LblError};
2use polars::prelude::*;
3use std::collections::HashMap;
4use std::fs::File;
5use std::path::{Path, PathBuf};
6
7/// load label file
8pub fn load_file(path: PathBuf, metadata: Option<&CollectionData>) -> Result<DataFrame, LblError> {
9    let metadata_owned;
10    let metadata_ref = match metadata {
11        Some(metadata) => metadata,
12        None => {
13            metadata_owned = parse_path_metadata(&path);
14            &metadata_owned
15        }
16    };
17    standardize_collection(load_raw_data(path.clone())?, metadata_ref)
18}
19
20/// parse path metadata
21pub fn parse_path_metadata(path: &Path) -> CollectionData {
22    let mut data = CollectionData::default();
23    if let Some(collection_dir) = path.parent() {
24        if let Some(collection_name) = collection_dir.file_name() {
25            data.collection = Some(collection_name.to_string_lossy().into_owned());
26        }
27
28        if let Some(network_dir) = collection_dir.parent() {
29            if let Some(network_name) = network_dir.file_name() {
30                data.network = Some(network_name.to_string_lossy().into_owned());
31            }
32        }
33    }
34
35    data
36}
37
/// A container of file paths that can be collapsed into one flat list.
pub trait FlattenPathBufVec {
    /// Return every path held by `self` as a single `Vec<PathBuf>`.
    fn flatten_into_pathbuf(&self) -> Vec<PathBuf>;
}
43
44impl FlattenPathBufVec for Vec<PathBuf> {
45    fn flatten_into_pathbuf(&self) -> Vec<PathBuf> {
46        self.clone()
47    }
48}
49
50impl FlattenPathBufVec for HashMap<String, Vec<PathBuf>> {
51    fn flatten_into_pathbuf(&self) -> Vec<PathBuf> {
52        self.iter()
53            .flat_map(|(_key, value)| value.clone())
54            .collect()
55    }
56}
57
58impl FlattenPathBufVec for HashMap<String, HashMap<String, Vec<PathBuf>>> {
59    fn flatten_into_pathbuf(&self) -> Vec<PathBuf> {
60        self.iter()
61            .flat_map(|(_outer_key, inner_map)| {
62                inner_map
63                    .iter()
64                    .flat_map(|(_inner_key, value)| value.clone())
65            })
66            .collect()
67    }
68}
69
70/// load label files
71pub fn load_files<T: FlattenPathBufVec>(
72    paths: &T,
73    metadata: Option<&CollectionData>,
74) -> Result<DataFrame, LblError> {
75    let paths = paths.flatten_into_pathbuf();
76    let dfs: Result<Vec<DataFrame>, LblError> =
77        paths.into_iter().map(|p| load_file(p, metadata)).collect();
78    let mut dfs = dfs?;
79
80    let mut final_df = dfs.remove(0);
81    for df in dfs {
82        final_df = final_df.vstack(&df)?;
83    }
84    Ok(final_df)
85}
86
87/// load raw label file
88pub fn load_raw_data(path: PathBuf) -> Result<DataFrame, LblError> {
89    let extension = path
90        .extension()
91        .and_then(std::ffi::OsStr::to_str)
92        .unwrap_or("");
93    match extension {
94        "csv" => {
95            let file = File::open(path)?;
96            let df = CsvReader::new(file).finish()?;
97            Ok(df)
98        }
99        "parquet" => {
100            let file = File::open(path)?;
101            let df = ParquetReader::new(file).finish()?;
102            Ok(df)
103        }
104        _ => Err(LblError::LblError(format!(
105            "Unsupported file type: {}",
106            extension
107        ))),
108    }
109}