lace/data/
data_source.rs

//! Type of the data source, e.g., a CSV file or a polars `DataFrame`.
2use super::error::DefaultCodebookError;
3use lace_codebook::Codebook;
4use polars::frame::DataFrame;
5use std::fmt;
6
7#[cfg(feature = "formats")]
8use std::ffi::{OsStr, OsString};
9#[cfg(feature = "formats")]
10use std::path::PathBuf;
11
12/// Denotes the source type of the data to be analyzed
13#[cfg(not(feature = "formats"))]
14#[derive(Debug, Clone, PartialEq)]
15pub enum DataSource {
16    /// Polars DataFrame
17    Polars(DataFrame),
18    /// Empty (A void datasource).
19    Empty,
20}
21
/// Denotes where the data to be analyzed comes from.
///
/// This definition is compiled when the `formats` feature is enabled. The
/// file-backed variants each wrap the path of the file.
#[cfg(feature = "formats")]
#[derive(Clone, Debug, PartialEq)]
pub enum DataSource {
    /// Path to a CSV file
    Csv(PathBuf),
    /// Path to a file in the Apache IPC data format (e.g. Arrow V2)
    Ipc(PathBuf),
    /// Path to a JSON or JSON Lines file
    Json(PathBuf),
    /// Path to a file in the Parquet data format
    Parquet(PathBuf),
    /// Data held in a polars `DataFrame`
    Polars(DataFrame),
    /// No data (a void data source)
    Empty,
}
39
/// Error for a path whose extension is not CSV, JSON, IPC, or Parquet.
///
/// The wrapped value is the extension that was found on the path, or `None`
/// when the path has no extension at all.
#[cfg(feature = "formats")]
#[derive(Debug, Clone, PartialEq)]
pub struct UnknownExtension(pub Option<OsString>);
44
#[cfg(feature = "formats")]
impl fmt::Display for UnknownExtension {
    /// Writes `Unknown Extension: ` followed by the `Debug` rendering of the
    /// wrapped optional extension.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self(extension) = self;
        write!(f, "Unknown Extension: {:?}", extension)
    }
}
51
// `Debug` is derived and `Display` is implemented for `UnknownExtension`, so
// the default method bodies of `Error` are all that is needed here.
#[cfg(feature = "formats")]
impl std::error::Error for UnknownExtension {}
54
/// Infers the data source type from a path's extension (case-insensitive).
///
/// | Extension          | Variant   |
/// |--------------------|-----------|
/// | `csv`, `csv.gz`    | `Csv`     |
/// | `json`, `jsonl`    | `Json`    |
/// | `parquet`          | `Parquet` |
/// | `arrow`, `ipc`     | `Ipc`     |
///
/// # Errors
///
/// Returns [`UnknownExtension`] holding the path's final extension (if any)
/// when the extension is missing, is not valid UTF-8, or is not listed above.
/// A `.gz` file is only accepted when it is a gzipped CSV (`*.csv.gz`).
#[cfg(feature = "formats")]
impl TryFrom<PathBuf> for DataSource {
    type Error = UnknownExtension;

    fn try_from(value: PathBuf) -> Result<Self, Self::Error> {
        // Lower-cased final extension; `None` if absent or not valid UTF-8.
        let ext = value
            .extension()
            .and_then(OsStr::to_str)
            .map(str::to_lowercase);

        // `Path::extension` only yields the part after the last dot, so a
        // `.csv.gz` file shows up as `gz`. Look at the extension of the file
        // stem to tell a gzipped CSV apart from any other gzipped file.
        let is_gzipped_csv = value
            .file_stem()
            .and_then(|stem| std::path::Path::new(stem).extension())
            .map_or(false, |inner| inner.eq_ignore_ascii_case("csv"));

        match ext.as_deref() {
            Some("csv") => Ok(Self::Csv(value)),
            Some("gz") if is_gzipped_csv => Ok(Self::Csv(value)),
            Some("json" | "jsonl") => Ok(Self::Json(value)),
            Some("parquet") => Ok(Self::Parquet(value)),
            Some("arrow" | "ipc") => Ok(Self::Ipc(value)),
            _ => Err(UnknownExtension(
                value.extension().map(OsStr::to_os_string),
            )),
        }
    }
}
80
/// Extracts the file path from a file-backed data source.
///
/// # Errors
///
/// Returns a static message for `DataSource::Empty` and
/// `DataSource::Polars`, neither of which carries a path.
#[cfg(feature = "formats")]
impl TryFrom<DataSource> for PathBuf {
    type Error = &'static str;

    fn try_from(src: DataSource) -> Result<Self, Self::Error> {
        use DataSource::{Csv, Empty, Ipc, Json, Parquet, Polars};

        match src {
            Csv(path) | Ipc(path) | Json(path) | Parquet(path) => Ok(path),
            Polars(_) => Err("DataSource::Polars has no corresponding path"),
            Empty => Err("DataSource::EMPTY has no path information"),
        }
    }
}
99
100impl From<DataFrame> for DataSource {
101    fn from(value: DataFrame) -> Self {
102        Self::Polars(value)
103    }
104}
105
/// Human-readable description of the data source.
///
/// * File-backed variants print their path. Non-UTF-8 paths are rendered
///   lossily rather than being reported as empty.
/// * `Polars` prints `polars::DataFrame` followed by the frame's shape, the
///   same rendering used when the `formats` feature is disabled.
/// * `Empty` prints `EMPTY`.
#[cfg(feature = "formats")]
impl fmt::Display for DataSource {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Self::Csv(path)
            | Self::Ipc(path)
            | Self::Json(path)
            | Self::Parquet(path) => write!(f, "{}", path.display()),
            Self::Polars(df) => {
                write!(f, "polars::DataFrame {:?}", df.shape())
            }
            Self::Empty => write!(f, "EMPTY"),
        }
    }
}
118
119#[cfg(not(feature = "formats"))]
120impl fmt::Display for DataSource {
121    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
122        match self {
123            Self::Polars(df) => {
124                write!(f, "polars::DataFrame {:?}", df.shape())
125            }
126            Self::Empty => {
127                write!(f, "Empty")
128            }
129        }
130    }
131}
132
#[cfg(feature = "formats")]
impl DataSource {
    /// Returns an owned copy of the source's path as an `OsString`.
    ///
    /// Yields `None` for `Polars` and `Empty`, which carry no path.
    pub fn to_os_string(&self) -> Option<OsString> {
        let path = match self {
            Self::Csv(path)
            | Self::Ipc(path)
            | Self::Json(path)
            | Self::Parquet(path) => path,
            Self::Polars(_) | Self::Empty => return None,
        };
        Some(path.as_os_str().to_os_string())
    }

    /// Generate a default `Codebook` from the source data
    ///
    /// Each variant is dispatched to the matching builder in
    /// `crate::codebook`; `Empty` yields `Codebook::default()`.
    ///
    /// # Errors
    ///
    /// Any error from the underlying builder is converted into a
    /// `DefaultCodebookError` and returned.
    pub fn default_codebook(&self) -> Result<Codebook, DefaultCodebookError> {
        use crate::codebook::{data, formats};

        let built = match self {
            Self::Ipc(path) => {
                formats::codebook_from_ipc(path, None, None, None, false)
            }
            Self::Csv(path) => {
                formats::codebook_from_csv(path, None, None, None, false)
            }
            Self::Json(path) => {
                formats::codebook_from_json(path, None, None, None, false)
            }
            Self::Parquet(path) => {
                formats::codebook_from_parquet(path, None, None, None, false)
            }
            Self::Polars(df) => {
                data::df_to_codebook(df, None, None, None, false)
            }
            Self::Empty => Ok(Codebook::default()),
        };

        // `?` performs the conversion into `DefaultCodebookError`.
        Ok(built?)
    }
}
170
171#[cfg(not(feature = "formats"))]
172impl DataSource {
173    /// Generate a default `Codebook` from the source data
174    pub fn default_codebook(&self) -> Result<Codebook, DefaultCodebookError> {
175        use crate::codebook::data;
176        let codebook = match &self {
177            DataSource::Polars(df) => {
178                data::df_to_codebook(df, None, None, None, false)
179            }
180            DataSource::Empty => Ok(Codebook::default()),
181        }?;
182        Ok(codebook)
183    }
184}