xet_runtime/config/groups/data.rs
1use std::time::Duration;
2
3use crate::utils::ByteSize;
4
5crate::config_group!({
6
7 /// Gives the minimum spacing in number of chunks between global dedup queries
8 /// sent to the server to limit the number of simultaneous queries.
9 ///
10 /// The default value is 256, which means that the server will receive a query at most
11 /// for every 256 chunks or 4MB of data.
12 ///
13 /// Use the environment variable `HF_XET_DATA_MIN_SPACING_BETWEEN_GLOBAL_DEDUP_QUERIES` to set this value.
14 ref min_spacing_between_global_dedup_queries: usize = 256;
15
16 /// scheme for a local filesystem based CAS server
17 ///
18 /// The default value is "local://".
19 ///
20 /// Use the environment variable `HF_XET_DATA_LOCAL_CAS_SCHEME` to set this value.
21 ref local_cas_scheme: String = "local://".to_owned();
22
23 /// The maximum number of files to ingest at once on the upload path.
24 /// High performance mode (enabled via HF_XET_HIGH_PERFORMANCE or HF_XET_HP)
25 /// automatically sets this to 100 via XetConfig::with_high_performance().
26 ///
27 /// The default value is 8.
28 ///
29 /// Use the environment variable `HF_XET_DATA_MAX_CONCURRENT_FILE_INGESTION` to set this value.
30 ref max_concurrent_file_ingestion: usize = 8;
31
32 /// The maximum number of files to ingest at once on the download path.
33 ///
34 /// The default value is 8.
35 ///
36 /// Use the environment variable `HF_XET_DATA_MAX_CONCURRENT_FILE_DOWNLOADS` to set this value.
37 ref max_concurrent_file_downloads: usize = 8;
38
39 /// The maximum block size from a file to process at once.
40 ///
41 /// The default value is 8mb.
42 ///
43 /// Use the environment variable `HF_XET_DATA_INGESTION_BLOCK_SIZE` to set this value.
44 ref ingestion_block_size : ByteSize = ByteSize::from("8mb");
45
46 /// How often to send updates on file progress, in milliseconds. Disables batching
47 /// if set to 0.
48 ///
49 /// The default value is 200ms.
50 ///
51 /// Use the environment variable `HF_XET_DATA_PROGRESS_UPDATE_INTERVAL` to set this value.
52 ref progress_update_interval : Duration = Duration::from_millis(200);
53
54 /// Half-life duration for the exponentially weighted moving average used
55 /// to estimate progress completion speed. Older rate observations are
56 /// exponentially decayed with this half-life.
57 ///
58 /// The default value is 10sec.
59 ///
60 /// Use the environment variable `HF_XET_DATA_PROGRESS_UPDATE_SPEED_SAMPLING_WINDOW` to set this value.
61 ref progress_update_speed_sampling_window: Duration = Duration::from_secs(10);
62
63 /// Minimum number of speed observations before reporting a rate.
64 /// Until this many updates have been recorded, the completion rate
65 /// is reported as unknown (None). This avoids displaying noisy
66 /// initial estimates.
67 ///
68 /// The default value is 4.
69 ///
70 /// Use the environment variable `HF_XET_DATA_PROGRESS_UPDATE_SPEED_MIN_OBSERVATIONS` to set this value.
71 ref progress_update_speed_min_observations: u32 = 4;
72
73 /// How often do we flush new xorb data to disk on a long running upload session?
74 ///
75 /// The default value is 20sec.
76 ///
77 /// Use the environment variable `HF_XET_DATA_SESSION_XORB_METADATA_FLUSH_INTERVAL` to set this value.
78 ref session_xorb_metadata_flush_interval : Duration = Duration::from_secs(20);
79
80 /// Force a flush of the xorb metadata every this many xorbs, if more are created
81 /// in this time window.
82 ///
83 /// The default value is 64.
84 ///
85 /// Use the environment variable `HF_XET_DATA_SESSION_XORB_METADATA_FLUSH_MAX_COUNT` to set this value.
86 ref session_xorb_metadata_flush_max_count : usize = 64;
87
88 /// Default CAS endpoint
89 ///
90 /// The default value is "http://localhost:8080".
91 ///
92 /// Use the environment variable `HF_XET_DATA_DEFAULT_CAS_ENDPOINT` to set this value.
93 ref default_cas_endpoint: String = "http://localhost:8080".to_string();
94
95 /// Whether to aggregate progress updates before sending them.
96 /// When enabled, progress updates are batched and sent at regular intervals
97 /// to reduce overhead.
98 ///
99 /// The default value is true.
100 ///
101 /// Use the environment variable `HF_XET_DATA_AGGREGATE_PROGRESS` to set this value.
102 ref aggregate_progress: bool = true;
103
104 /// Default prefix used for CAS and shard operations.
105 ///
106 /// The default value is "default".
107 ///
108 /// Use the environment variable `HF_XET_DATA_DEFAULT_PREFIX` to set this value.
109 ref default_prefix: String = "default".to_string();
110
111 /// Subdirectory name for staging data within the endpoint cache directory.
112 ///
113 /// The default value is "staging".
114 ///
115 /// Use the environment variable `HF_XET_DATA_STAGING_SUBDIR` to set this value.
116 ref staging_subdir: String = "staging".to_string();
117
118});