1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
//! Include / exclude glob filtering for `apr pull` selective download
//! (CRUX-A-04).
//!
//! Contract: `contracts/crux-A-04-v1.yaml`.
//!
//! Pure classifier — takes a remote file list plus `--include` /
//! `--exclude` glob patterns and returns the subset that should be
//! downloaded. No I/O, no network, no filesystem access. The
//! integration-level claim ("`apr pull` actually downloads exactly
//! this set") is discharged by a separate network-gated harness.
//!
//! Formula (from contract): `Selected(R, I, X) = (if I == ∅ then R else
//! R ∩ I) \ X`. `--exclude` wins over `--include` for any overlap.
//! Glob semantics are fnmatch-style (`*`, `?`, `[abc]`, `**`) to match
//! `huggingface_hub`'s `snapshot_download(allow_patterns, ignore_patterns)`.
use glob::Pattern;
/// Failure raised while turning user-supplied glob strings into matchers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GlobFilterError {
    /// A glob string was rejected by `glob::Pattern::new`. Carries the raw
    /// pattern text verbatim so it can be echoed back to the operator.
    InvalidPattern(String),
}

impl std::error::Error for GlobFilterError {}

impl std::fmt::Display for GlobFilterError {
    /// Renders as `invalid glob pattern: "<pattern>"`, with the pattern
    /// shown in its `Debug` (quoted and escaped) form.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Single-variant enum, so this destructuring is irrefutable; adding
        // a variant later turns it into a compile error rather than a
        // silently wrong message.
        let Self::InvalidPattern(pattern) = self;
        write!(f, "invalid glob pattern: {:?}", pattern)
    }
}
/// Report whether at least one compiled pattern in `patterns` accepts
/// `path`. An empty pattern slice never matches.
fn any_match(path: &str, patterns: &[Pattern]) -> bool {
    for pattern in patterns {
        if pattern.matches(path) {
            // First hit decides; remaining patterns are not consulted.
            return true;
        }
    }
    false
}
/// Turn raw glob strings into compiled `glob::Pattern`s, in input order.
///
/// Stops at the first string that `Pattern::new` rejects and reports it
/// as [`GlobFilterError::InvalidPattern`] carrying that string verbatim;
/// later entries are not examined.
fn compile_patterns(raw: &[&str]) -> Result<Vec<Pattern>, GlobFilterError> {
    let mut compiled = Vec::with_capacity(raw.len());
    for &source in raw {
        match Pattern::new(source) {
            Ok(pattern) => compiled.push(pattern),
            // The parser's own error detail is dropped; only the offending
            // pattern text is surfaced to the caller.
            Err(_) => return Err(GlobFilterError::InvalidPattern(source.to_string())),
        }
    }
    Ok(compiled)
}
/// Compute the download set for `files` under the CRUX-A-04
/// `glob_selection_set` formula: `(if include == ∅ then files else
/// files ∩ include) \ exclude`.
///
/// - An empty `include` list admits every file.
/// - A file matching any `exclude` glob is dropped even if it also
///   matches an `include` glob.
/// - Survivors are returned in the same relative order as in `files`.
///
/// This is the algorithm-level piece (CRUX-A-04 ALGO-001/002/003, the
/// sub-claims of FALSIFY-001/002/003) that the integration-level
/// `apr pull --include/--exclude` download-set check builds on.
///
/// # Errors
///
/// Returns [`GlobFilterError::InvalidPattern`] if any glob fails to
/// compile. `include` is compiled before `exclude`, so when both contain
/// a bad pattern the `include` one is reported.
pub fn select_files<S: AsRef<str>>(
    files: &[S],
    include: &[&str],
    exclude: &[&str],
) -> Result<Vec<String>, GlobFilterError> {
    let allow = compile_patterns(include)?;
    let deny = compile_patterns(exclude)?;
    // No include globs at all means "everything is a candidate".
    let take_all = allow.is_empty();

    let keep = |candidate: &str| {
        (take_all || any_match(candidate, &allow)) && !any_match(candidate, &deny)
    };

    // Sized for the worst case (nothing filtered out), as a single allocation.
    let mut selected = Vec::with_capacity(files.len());
    selected.extend(
        files
            .iter()
            .map(|entry| entry.as_ref())
            .filter(|&candidate| keep(candidate))
            .map(str::to_owned),
    );
    Ok(selected)
}
/// Decide whether one `path` would survive the include/exclude filter.
///
/// Applies the same rule as `select_files` to a single path: the path is
/// selected when it matches no `exclude` glob and either `include` is
/// empty or the path matches at least one `include` glob.
///
/// # Errors
///
/// Returns [`GlobFilterError::InvalidPattern`] if any glob fails to
/// compile; `include` is compiled before `exclude`.
pub fn is_selected(
    path: &str,
    include: &[&str],
    exclude: &[&str],
) -> Result<bool, GlobFilterError> {
    let allow = compile_patterns(include)?;
    let deny = compile_patterns(exclude)?;

    // Exclusion takes precedence, so it can settle the answer by itself.
    if any_match(path, &deny) {
        return Ok(false);
    }
    Ok(allow.is_empty() || any_match(path, &allow))
}
// Unit tests for the selection logic. Every test feeds in-memory path
// lists to `select_files` / `is_selected`; nothing here touches the
// network or the filesystem.
#[cfg(test)]
mod tests {
use super::*;
// Shared fixture: ten flat (no subdirectory) file names. It contains
// exactly one `.safetensors`, one `.bin` and one `.h5` entry, which the
// count-based assertions below rely on.
fn sample_repo() -> Vec<&'static str> {
// Approximates the gpt2 file tree used in the FALSIFY tests.
vec![
"config.json",
"tokenizer.json",
"tokenizer_config.json",
"special_tokens_map.json",
"vocab.json",
"merges.txt",
"model.safetensors",
"pytorch_model.bin",
"tf_model.h5",
"README.md",
]
}
#[test]
fn empty_include_empty_exclude_selects_everything() {
// Contract: empty --include means "take everything", empty
// --exclude means "drop nothing".
let files = sample_repo();
let got = select_files(&files, &[], &[]).unwrap();
assert_eq!(got.len(), files.len());
}
#[test]
fn include_safetensors_falsify_001_algorithm_sub_claim() {
// CRUX-A-04 ALGO-001 sub-claim of FALSIFY-001: `--include
// '*.safetensors'` MUST restrict the output to only files
// whose path matches the glob. Matches the `find ! -name`
// check in the shell falsification test.
let files = sample_repo();
let got = select_files(&files, &["*.safetensors"], &[]).unwrap();
assert_eq!(got, vec!["model.safetensors"]);
for f in &got {
assert!(f.ends_with(".safetensors"), "unexpected leak: {f}");
}
}
#[test]
fn exclude_bin_falsify_002_algorithm_sub_claim() {
// CRUX-A-04 ALGO-002 sub-claim of FALSIFY-002: `--exclude
// '*.bin'` MUST drop all files whose path matches the glob.
let files = sample_repo();
let got = select_files(&files, &[], &["*.bin"]).unwrap();
assert!(!got.iter().any(|f| f.ends_with(".bin")));
// All non-.bin files should be retained. The fixture holds exactly
// one `.bin` entry (`pytorch_model.bin`), hence `len() - 1`.
assert_eq!(got.len(), files.len() - 1);
}
#[test]
fn exclude_wins_over_include_falsify_003_algorithm_sub_claim() {
// CRUX-A-04 ALGO-003 sub-claim of FALSIFY-003: precedence rule
// — `--include '*.json' --exclude 'config.json'` keeps *.json
// files EXCEPT config.json.
let files = sample_repo();
let got = select_files(&files, &["*.json"], &["config.json"]).unwrap();
assert!(!got.iter().any(|f| f == "config.json"));
// But other .json files survive.
assert!(got.iter().any(|f| f == "tokenizer.json"));
assert!(got.iter().any(|f| f == "vocab.json"));
// And no non-json file leaks in.
for f in &got {
assert!(f.ends_with(".json"), "non-json leak: {f}");
}
}
#[test]
fn multiple_include_globs_union_semantics() {
// Several --include globs are OR-ed: a file matching ANY of them is
// kept. The fixture has one `.safetensors` and one `.bin` file, so
// exactly two paths are expected.
let files = sample_repo();
let got = select_files(&files, &["*.safetensors", "*.bin"], &[]).unwrap();
assert_eq!(got.len(), 2);
assert!(got.contains(&"model.safetensors".to_string()));
assert!(got.contains(&"pytorch_model.bin".to_string()));
}
#[test]
fn multiple_exclude_globs_union_semantics() {
// Several --exclude globs are OR-ed: a file matching ANY of them is
// dropped.
let files = sample_repo();
let got = select_files(&files, &[], &["*.bin", "*.h5"]).unwrap();
assert!(!got.iter().any(|f| f.ends_with(".bin")));
assert!(!got.iter().any(|f| f.ends_with(".h5")));
}
#[test]
fn question_mark_matches_single_char() {
// `?` must consume exactly one character, so the two-character stem
// in `ab.json` is rejected while `a.json` and `b.json` pass.
let files = vec!["a.json", "ab.json", "b.json"];
let got = select_files(&files, &["?.json"], &[]).unwrap();
assert!(got.contains(&"a.json".to_string()));
assert!(got.contains(&"b.json".to_string()));
assert!(!got.contains(&"ab.json".to_string()));
}
#[test]
fn recursive_glob_matches_subdirs() {
// HuggingFace repos commonly nest weights under subdirs
// (`model-00001-of-00002.safetensors` is flat; LoRA adapters
// often live under `adapters/`). Confirm `**` behaves.
let files = vec![
"adapters/lora.safetensors",
"adapters/nested/deep.safetensors",
"model.safetensors",
];
let got = select_files(&files, &["adapters/**/*.safetensors"], &[]).unwrap();
// `**` is expected to cover both zero and one intermediate directory.
assert!(got.contains(&"adapters/lora.safetensors".to_string()));
assert!(got.contains(&"adapters/nested/deep.safetensors".to_string()));
// The literal `adapters/` prefix keeps the top-level file out.
assert!(!got.contains(&"model.safetensors".to_string()));
}
#[test]
fn invalid_include_glob_is_error() {
// `glob` 0.3 rejects unclosed `[` as invalid.
let files = vec!["a.json"];
let err = select_files(&files, &["a["], &[]).unwrap_err();
// The error must carry the offending pattern text back verbatim.
match err {
GlobFilterError::InvalidPattern(p) => assert_eq!(p, "a["),
}
}
#[test]
fn invalid_exclude_glob_is_error() {
// Exclude patterns are validated too, even with an empty include list.
let files = vec!["a.json"];
assert!(select_files(&files, &[], &["a["]).is_err());
}
#[test]
fn selection_is_deterministic() {
// Same inputs → same output across invocations. Matches
// `download_idempotence` invariant in the contract.
let files = sample_repo();
let a = select_files(&files, &["*.json"], &["config.json"]).unwrap();
let b = select_files(&files, &["*.json"], &["config.json"]).unwrap();
assert_eq!(a, b);
}
#[test]
fn selection_preserves_input_order() {
// Deterministic iteration order eases downstream manifest
// generation — the output must match the input order.
let files = vec!["z.json", "a.json", "m.json"];
let got = select_files(&files, &["*.json"], &[]).unwrap();
assert_eq!(got, vec!["z.json", "a.json", "m.json"]);
}
#[test]
fn empty_repo_produces_empty_output() {
// An empty file list yields an empty selection, not an error, even
// when an include glob is supplied.
let files: Vec<&str> = vec![];
let got = select_files(&files, &["*.safetensors"], &[]).unwrap();
assert!(got.is_empty());
}
#[test]
fn include_matches_no_file_produces_empty() {
// If `--include` matches nothing, output is empty — NOT a
// fallback to "everything". Matches the formula exactly.
let files = sample_repo();
let got = select_files(&files, &["*.nonexistent"], &[]).unwrap();
assert!(got.is_empty());
}
#[test]
fn is_selected_agrees_with_select_files() {
// Cross-check: for every fixture path, the single-path helper must
// give the same verdict as membership in the batch result.
let files = sample_repo();
let inc = ["*.json"];
let exc = ["config.json"];
let whole = select_files(&files, &inc, &exc).unwrap();
for f in &files {
let single = is_selected(f, &inc, &exc).unwrap();
assert_eq!(single, whole.iter().any(|w| w == f), "disagreement on {f}",);
}
}
#[test]
fn falsify_001_gpt2_shape_only_safetensors_survive() {
// CRUX-A-04 FALSIFY-001 algorithm-level: the shell test's
// predicate is `find ! -name '*.safetensors' | grep -q .`.
// Algorithm-level equivalent: every output path ends with
// .safetensors AND model.safetensors is present.
let files = sample_repo();
let got = select_files(&files, &["*.safetensors"], &[]).unwrap();
assert!(!got.is_empty());
assert!(got.iter().all(|f| f.ends_with(".safetensors")));
assert!(got.contains(&"model.safetensors".to_string()));
}
#[test]
fn falsify_002_gpt2_shape_zero_bin_files() {
// CRUX-A-04 FALSIFY-002 algorithm-level: shell test counts
// .bin files and asserts 0.
let files = sample_repo();
let got = select_files(&files, &[], &["*.bin"]).unwrap();
assert_eq!(got.iter().filter(|f| f.ends_with(".bin")).count(), 0);
}
#[test]
fn falsify_003_gpt2_shape_config_dropped_others_kept() {
// CRUX-A-04 FALSIFY-003 algorithm-level: shell test asserts
// `! -f "$TMP/config.json"` AND `ls "$TMP"/*.json` succeeds.
let files = sample_repo();
let got = select_files(&files, &["*.json"], &["config.json"]).unwrap();
assert!(!got.iter().any(|f| f == "config.json"));
// Mirrors the `ls *.json` half: at least one `.json` path must remain.
assert!(got.iter().any(|f| f.ends_with(".json")));
}
}