// xet_runtime/config/groups/data.rs

use std::time::Duration;

use crate::utils::ByteSize;

5crate::config_group!({
6
7    /// Gives the minimum spacing in number of chunks between global dedup queries
8    /// sent to the server to limit the number of simultaneous queries.
9    ///
10    /// The default value is 256, which means that the server will receive a query at most
11    /// for every 256 chunks or 4MB of data.
12    ///
13    /// Use the environment variable `HF_XET_DATA_MIN_SPACING_BETWEEN_GLOBAL_DEDUP_QUERIES` to set this value.
14    ref min_spacing_between_global_dedup_queries: usize = 256;
15
16    /// scheme for a local filesystem based CAS server
17    ///
18    /// The default value is "local://".
19    ///
20    /// Use the environment variable `HF_XET_DATA_LOCAL_CAS_SCHEME` to set this value.
21    ref local_cas_scheme: String = "local://".to_owned();
22
23    /// The maximum number of files to ingest at once on the upload path.
24    /// High performance mode (enabled via HF_XET_HIGH_PERFORMANCE or HF_XET_HP)
25    /// automatically sets this to 100 via XetConfig::with_high_performance().
26    ///
27    /// The default value is 8.
28    ///
29    /// Use the environment variable `HF_XET_DATA_MAX_CONCURRENT_FILE_INGESTION` to set this value.
30    ref max_concurrent_file_ingestion: usize = 8;
31
32    /// The maximum number of files to ingest at once on the download path.
33    ///
34    /// The default value is 8.
35    ///
36    /// Use the environment variable `HF_XET_DATA_MAX_CONCURRENT_FILE_DOWNLOADS` to set this value.
37    ref max_concurrent_file_downloads: usize = 8;
38
39    /// The maximum block size from a file to process at once.
40    ///
41    /// The default value is 8mb.
42    ///
43    /// Use the environment variable `HF_XET_DATA_INGESTION_BLOCK_SIZE` to set this value.
44    ref ingestion_block_size : ByteSize = ByteSize::from("8mb");
45
46    /// How often to send updates on file progress, in milliseconds.  Disables batching
47    /// if set to 0.
48    ///
49    /// The default value is 200ms.
50    ///
51    /// Use the environment variable `HF_XET_DATA_PROGRESS_UPDATE_INTERVAL` to set this value.
52    ref progress_update_interval : Duration = Duration::from_millis(200);
53
54    /// Half-life duration for the exponentially weighted moving average used
55    /// to estimate progress completion speed. Older rate observations are
56    /// exponentially decayed with this half-life.
57    ///
58    /// The default value is 10sec.
59    ///
60    /// Use the environment variable `HF_XET_DATA_PROGRESS_UPDATE_SPEED_SAMPLING_WINDOW` to set this value.
61    ref progress_update_speed_sampling_window: Duration = Duration::from_secs(10);
62
63    /// Minimum number of speed observations before reporting a rate.
64    /// Until this many updates have been recorded, the completion rate
65    /// is reported as unknown (None). This avoids displaying noisy
66    /// initial estimates.
67    ///
68    /// The default value is 4.
69    ///
70    /// Use the environment variable `HF_XET_DATA_PROGRESS_UPDATE_SPEED_MIN_OBSERVATIONS` to set this value.
71    ref progress_update_speed_min_observations: u32 = 4;
72
73    /// How often do we flush new xorb data to disk on a long running upload session?
74    ///
75    /// The default value is 20sec.
76    ///
77    /// Use the environment variable `HF_XET_DATA_SESSION_XORB_METADATA_FLUSH_INTERVAL` to set this value.
78    ref session_xorb_metadata_flush_interval : Duration = Duration::from_secs(20);
79
80    /// Force a flush of the xorb metadata every this many xorbs, if more are created
81    /// in this time window.
82    ///
83    /// The default value is 64.
84    ///
85    /// Use the environment variable `HF_XET_DATA_SESSION_XORB_METADATA_FLUSH_MAX_COUNT` to set this value.
86    ref session_xorb_metadata_flush_max_count : usize = 64;
87
88    /// Default CAS endpoint
89    ///
90    /// The default value is "http://localhost:8080".
91    ///
92    /// Use the environment variable `HF_XET_DATA_DEFAULT_CAS_ENDPOINT` to set this value.
93    ref default_cas_endpoint: String = "http://localhost:8080".to_string();
94
95    /// Whether to aggregate progress updates before sending them.
96    /// When enabled, progress updates are batched and sent at regular intervals
97    /// to reduce overhead.
98    ///
99    /// The default value is true.
100    ///
101    /// Use the environment variable `HF_XET_DATA_AGGREGATE_PROGRESS` to set this value.
102    ref aggregate_progress: bool = true;
103
104    /// Default prefix used for CAS and shard operations.
105    ///
106    /// The default value is "default".
107    ///
108    /// Use the environment variable `HF_XET_DATA_DEFAULT_PREFIX` to set this value.
109    ref default_prefix: String = "default".to_string();
110
111    /// Subdirectory name for staging data within the endpoint cache directory.
112    ///
113    /// The default value is "staging".
114    ///
115    /// Use the environment variable `HF_XET_DATA_STAGING_SUBDIR` to set this value.
116    ref staging_subdir: String = "staging".to_string();
117
118});