Skip to main content

entrenar/hf_pipeline/dataset/
fetcher.rs

1//! HuggingFace dataset fetcher
2
3use std::path::PathBuf;
4
5use crate::hf_pipeline::error::{FetchError, Result};
6
7use super::dataset_impl::Dataset;
8use super::options::DatasetOptions;
9
/// Fetcher for HuggingFace datasets.
///
/// Holds the optional access token and the on-disk cache location that the
/// fetch/load methods of this type operate with.
pub struct HfDatasetFetcher {
    /// Optional HuggingFace access token.
    // Not read anywhere in this file yet (fetching is still mocked), hence the
    // lint suppression; drop the `allow` once real API calls consume the token.
    #[allow(dead_code)]
    token: Option<String>,
    /// Directory under which downloaded datasets are cached.
    cache_dir: PathBuf,
}
18
19impl HfDatasetFetcher {
20    /// Create new fetcher
21    pub fn new() -> Result<Self> {
22        let cache_dir = dirs::cache_dir()
23            .unwrap_or_else(|| PathBuf::from("."))
24            .join("huggingface")
25            .join("datasets");
26
27        Ok(Self { token: std::env::var("HF_TOKEN").ok(), cache_dir })
28    }
29
30    /// Set cache directory
31    #[must_use]
32    pub fn cache_dir(mut self, path: impl Into<PathBuf>) -> Self {
33        self.cache_dir = path.into();
34        self
35    }
36
37    /// Fetch dataset from HuggingFace
38    ///
39    /// # Arguments
40    ///
41    /// * `dataset_id` - Dataset ID (e.g., "wikitext", "squad")
42    /// * `options` - Fetch options
43    pub fn fetch(&self, dataset_id: &str, options: DatasetOptions) -> Result<Dataset> {
44        // Validate dataset ID
45        if dataset_id.is_empty() {
46            return Err(FetchError::InvalidRepoId { repo_id: dataset_id.into() });
47        }
48
49        // For now, create mock dataset (actual HF API integration later)
50        let num_examples = options.max_examples.unwrap_or(1000);
51        let mut dataset = Dataset::mock(num_examples, 128);
52
53        if options.shuffle {
54            if let Some(seed) = options.seed {
55                dataset.shuffle(seed);
56            }
57        }
58
59        Ok(dataset)
60    }
61
62    /// Load dataset from local parquet file
63    pub fn load_parquet(&self, path: &std::path::Path) -> Result<Dataset> {
64        if !path.exists() {
65            return Err(FetchError::FileNotFound {
66                repo: path.parent().unwrap_or(path).display().to_string(),
67                file: path.file_name().unwrap_or_default().to_string_lossy().into(),
68            });
69        }
70
71        // Mock implementation - actual parquet parsing later
72        Ok(Dataset::mock(100, 128))
73    }
74}
75
76impl Default for HfDatasetFetcher {
77    fn default() -> Self {
78        Self::new().expect("Failed to create dataset fetcher")
79    }
80}