entrenar/hf_pipeline/dataset/
fetcher.rs1use std::path::PathBuf;
4
5use crate::hf_pipeline::error::{FetchError, Result};
6
7use super::dataset_impl::Dataset;
8use super::options::DatasetOptions;
9
/// Handle for fetching datasets: carries an optional access token and a cache location.
///
/// Construct with `HfDatasetFetcher::new` and adjust the cache location with the
/// `cache_dir` builder method.
pub struct HfDatasetFetcher {
    /// Access token, if one was available (`new` reads it from the `HF_TOKEN`
    /// environment variable).
    // The token is stored but no fetch path in this file consumes it yet,
    // hence the lint allowance.
    #[allow(dead_code)]
    token: Option<String>,
    /// Directory intended for cached dataset files.
    cache_dir: PathBuf,
}

// `Debug` is written by hand rather than derived so that the secret token value can
// never end up in logs or panic messages; only its presence/absence is reported.
impl std::fmt::Debug for HfDatasetFetcher {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("HfDatasetFetcher")
            .field("token", &self.token.as_ref().map(|_| "<redacted>"))
            .field("cache_dir", &self.cache_dir)
            .finish()
    }
}
18
19impl HfDatasetFetcher {
20 pub fn new() -> Result<Self> {
22 let cache_dir = dirs::cache_dir()
23 .unwrap_or_else(|| PathBuf::from("."))
24 .join("huggingface")
25 .join("datasets");
26
27 Ok(Self { token: std::env::var("HF_TOKEN").ok(), cache_dir })
28 }
29
30 #[must_use]
32 pub fn cache_dir(mut self, path: impl Into<PathBuf>) -> Self {
33 self.cache_dir = path.into();
34 self
35 }
36
37 pub fn fetch(&self, dataset_id: &str, options: DatasetOptions) -> Result<Dataset> {
44 if dataset_id.is_empty() {
46 return Err(FetchError::InvalidRepoId { repo_id: dataset_id.into() });
47 }
48
49 let num_examples = options.max_examples.unwrap_or(1000);
51 let mut dataset = Dataset::mock(num_examples, 128);
52
53 if options.shuffle {
54 if let Some(seed) = options.seed {
55 dataset.shuffle(seed);
56 }
57 }
58
59 Ok(dataset)
60 }
61
62 pub fn load_parquet(&self, path: &std::path::Path) -> Result<Dataset> {
64 if !path.exists() {
65 return Err(FetchError::FileNotFound {
66 repo: path.parent().unwrap_or(path).display().to_string(),
67 file: path.file_name().unwrap_or_default().to_string_lossy().into(),
68 });
69 }
70
71 Ok(Dataset::mock(100, 128))
73 }
74}
75
76impl Default for HfDatasetFetcher {
77 fn default() -> Self {
78 Self::new().expect("Failed to create dataset fetcher")
79 }
80}