tokf 0.2.33

Config-driven CLI tool that compresses command output before it reaches an LLM context
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
pub mod cache;
pub mod types;
pub mod variant;

use std::path::{Path, PathBuf};

use anyhow::Context;
use include_dir::{Dir, DirEntry, include_dir};

use types::{CommandPattern, FilterConfig};

/// Built-in filter definitions, embedded into the binary at compile time from
/// the `filters/` directory under the crate's manifest directory.
static STDLIB: Dir<'static> = include_dir!("$CARGO_MANIFEST_DIR/filters");

/// Priority assigned to embedded stdlib filters — always lower than any user-installed filter.
///
/// Priorities are sorted ascending (a smaller number wins), so the largest
/// `u8` puts built-in filters last. Search-dir priorities are derived from the
/// dir's index in the slice, so this only ties with a search dir at index 255
/// or beyond.
pub const STDLIB_PRIORITY: u8 = u8::MAX;

/// Look up a built-in filter by its path inside the embedded stdlib.
///
/// `relative_path` is relative to the stdlib root, e.g. `git/push.toml`.
/// Returns `None` when no such file is embedded or when its bytes are not
/// valid UTF-8.
pub fn get_embedded_filter(relative_path: &Path) -> Option<&'static str> {
    let embedded = STDLIB.get_file(relative_path)?;
    embedded.contents_utf8()
}

/// List the embedded files of one stdlib directory as `(path, utf8_content)` pairs.
///
/// `dir_path` is relative to the stdlib root (e.g. `"cargo/build_test"`).
/// An unknown directory yields an empty vec, and files whose contents are not
/// valid UTF-8 are left out.
pub fn get_embedded_dir_files(dir_path: &Path) -> Vec<(PathBuf, &'static str)> {
    let mut pairs = Vec::new();
    if let Some(embedded_dir) = STDLIB.get_dir(dir_path) {
        for file in embedded_dir.files() {
            if let Some(text) = file.contents_utf8() {
                pairs.push((file.path().to_path_buf(), text));
            }
        }
    }
    pairs
}

/// Assemble the default filter search dirs, highest priority first:
///
/// 1. `.tokf/filters` joined onto the current working directory (repo-local).
///    Joining onto the CWD makes the entry absolute, so it keeps pointing at
///    the same place if the CWD changes later. Omitted when the CWD cannot be
///    determined.
/// 2. `filters` inside the user-level directory reported by
///    `crate::paths::user_dir()`. Omitted when that returns `None`.
///
/// The embedded stdlib is not listed here: `discover_all_filters` always
/// appends it after these dirs, so no binary-adjacent path is needed.
pub fn default_search_dirs() -> Vec<PathBuf> {
    let repo_local = std::env::current_dir()
        .ok()
        .map(|cwd| cwd.join(".tokf/filters"));
    let user_level = crate::paths::user_dir().map(|user| user.join("filters"));

    repo_local.into_iter().chain(user_level).collect()
}

/// Try to load a filter from `path`. Returns `Ok(Some(config))` on success,
/// `Ok(None)` if the file does not exist, or `Err` for other I/O / parse errors.
///
/// # Errors
///
/// Returns an error if the file exists but cannot be read or contains invalid TOML.
pub fn try_load_filter(path: &Path) -> anyhow::Result<Option<FilterConfig>> {
    let content = match std::fs::read_to_string(path) {
        Ok(c) => c,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
        Err(e) => {
            return Err(anyhow::Error::new(e)
                .context(format!("failed to read filter file: {}", path.display())));
        }
    };
    let config: FilterConfig = toml::from_str(&content)
        .with_context(|| format!("failed to parse filter file: {}", path.display()))?;
    Ok(Some(config))
}

/// Specificity score of a command pattern: the number of whitespace-separated
/// words that are not the `*` wildcard. A higher score means a more specific
/// pattern.
pub fn pattern_specificity(pattern: &str) -> usize {
    let mut literal_words = 0;
    for token in pattern.split_whitespace() {
        if token != "*" {
            literal_words += 1;
        }
    }
    literal_words
}

/// Return the part of `word` after its last path separator (`/` or `\`),
/// or the whole word when it contains no separator.
///
/// Examples: `/usr/bin/ls` -> `ls`, `./mvnw` -> `mvnw`, `git` -> `git`.
/// A word ending in a separator yields the empty string.
fn extract_basename(word: &str) -> &str {
    // Splitting from the right always yields at least one piece, and the
    // first piece is whatever follows the final separator.
    word.rsplit(['/', '\\']).next().unwrap_or(word)
}

/// Walk over leading flag-like tokens in `words` until `target` is reached.
///
/// This lets global flags sit between a command name and its subcommand:
/// for `git -C /path log` matched against the pattern `git log`, the caller
/// passes `["-C", "/path", "log"]` with target `log`, and `-C /path` is
/// stepped over.
///
/// Returns the number of elements of `words` consumed **including** `target`,
/// or `None` if a non-flag word other than `target` is reached first or the
/// words run out.
///
/// Skipping rules for a token that starts with `-`:
/// - it contains `=` (`--flag=value`): the token alone is skipped;
/// - otherwise the flag is skipped, and the following word is skipped with it
///   as the flag's value unless that word starts with `-` or equals `target`.
///
/// # Ambiguity note
///
/// A word equal to `target` is never consumed as a flag value; it is matched
/// as the target. For `["-C", "log", "log"]` with target `log` this returns
/// `Some(2)`: the first `log` is the match and the second stays with the
/// caller's remaining arguments.
fn skip_flags_to_match(words: &[&str], target: &str) -> Option<usize> {
    let mut cursor = 0;
    while let Some(&token) = words.get(cursor) {
        if token == target {
            return Some(cursor + 1);
        }
        if !token.starts_with('-') {
            // Non-flag, non-target: cannot skip transparently.
            return None;
        }

        // Step over the flag token itself.
        cursor += 1;
        if token.contains('=') {
            // `--flag=value` carries its value inline; nothing more to skip.
            continue;
        }

        // A following word that is neither a flag nor the target is taken to
        // be this flag's value.
        let next_is_value = words
            .get(cursor)
            .is_some_and(|next| !next.starts_with('-') && *next != target);
        if next_is_value {
            cursor += 1;
        }
    }
    None
}

/// Match `pattern` against the start of `words`, returning `words_consumed`
/// on success and `None` otherwise.
///
/// Matching rules:
///
/// - Trailing words beyond the pattern are allowed (prefix semantics).
/// - An empty pattern or an empty `words` slice never matches.
/// - The first pattern word is compared by basename on both sides, so
///   `/usr/bin/git` matches the pattern `git` and the pattern `./mvnw`
///   matches `mvnw`.
/// - A `*` pattern word matches any single non-empty token but is **not**
///   counted in `words_consumed`, so the wildcard-matched word stays in the
///   caller's `remaining_args` and is available for `{args}` substitution in
///   `run` templates.
/// - Before any literal pattern word after the first, flag-like tokens
///   (`-f`, `--flag`, `--flag=value`, `-f value`) are skipped transparently,
///   so `git -C /path log` matches the pattern `git log`.
///
/// # Returned count
///
/// The position in `words` advances independently of the pattern position:
/// skipped flag tokens move it forward without using up a pattern word. The
/// returned count therefore includes the skipped flags and may exceed the
/// number of pattern words. This is intentional: the caller uses
/// `command_args[..consumed]` as the full command prefix, global flags
/// included, when re-executing the command.
pub fn pattern_matches_prefix(pattern: &str, words: &[&str]) -> Option<usize> {
    let pattern_words: Vec<&str> = pattern.split_whitespace().collect();
    if pattern_words.is_empty() || words.is_empty() {
        return None;
    }

    // Position in `words`, covering matched tokens and skipped flags alike.
    let mut cursor = 0;
    // Number of words matched by `*`; subtracted from the result at the end.
    let mut wildcard_hits = 0;

    for (pat_idx, &expected) in pattern_words.iter().enumerate() {
        // Pattern words left over but no input words left: no match.
        let actual = *words.get(cursor)?;

        if expected == "*" {
            if actual.is_empty() {
                return None;
            }
            cursor += 1;
            wildcard_hits += 1;
            continue;
        }

        let is_command_word = pat_idx == 0;
        // Only the command word itself is compared by basename.
        let (have, want) = if is_command_word {
            (extract_basename(actual), extract_basename(expected))
        } else {
            (actual, expected)
        };

        if have == want {
            cursor += 1;
            continue;
        }
        if is_command_word {
            // Flags cannot precede the command name.
            return None;
        }
        // Try to reach `expected` by stepping over global flag tokens.
        cursor += skip_flags_to_match(&words[cursor..], expected)?;
    }

    Some(cursor - wildcard_hits)
}

/// Collect every `.toml` file found anywhere beneath `dir`, returned in
/// sorted path order.
///
/// Hidden entries (names starting with `.`) are skipped, both files and
/// directories. A directory that does not exist or cannot be read simply
/// contributes nothing, so the result may be empty.
pub fn discover_filter_files(dir: &Path) -> Vec<PathBuf> {
    let mut found: Vec<PathBuf> = Vec::new();
    collect_filter_files(dir, &mut found);
    found.sort();
    found
}

/// Recursive worker for `discover_filter_files`: appends every `.toml` file
/// under `dir` to `files`.
///
/// Entries are visited in file-name order. Names starting with `.` are
/// skipped, directories are descended into, and unreadable directories or
/// entries are ignored.
fn collect_filter_files(dir: &Path, files: &mut Vec<PathBuf>) {
    let mut children: Vec<_> = match std::fs::read_dir(dir) {
        // Entries that fail to read are dropped rather than reported.
        Ok(listing) => listing.flatten().collect(),
        Err(_) => return,
    };
    children.sort_by_key(|child| child.file_name());

    for child in &children {
        if child.file_name().to_string_lossy().starts_with('.') {
            continue;
        }

        let child_path = child.path();
        if child_path.is_dir() {
            collect_filter_files(&child_path, files);
            continue;
        }
        if matches!(child_path.extension(), Some(ext) if ext == "toml") {
            files.push(child_path);
        }
    }
}

/// A discovered filter with its config, source path, and priority level.
///
/// Instances are built by `discover_all_filters`, both for filter files found
/// in the search dirs and for filters embedded in the binary.
pub struct ResolvedFilter {
    /// Parsed contents of the filter's TOML definition.
    pub config: FilterConfig,
    /// Canonical SHA-256 hash of the filter config (matches `tokf publish` hash).
    ///
    /// Empty if hashing failed: `discover_all_filters` falls back to the
    /// default (empty) string in that case.
    pub hash: String,
    /// Absolute path to the filter file (or `<built-in>/…` for embedded filters).
    pub source_path: PathBuf,
    /// Path relative to its source search dir (for display).
    ///
    /// Also the basis of the filter's display name, see `matches_name`.
    pub relative_path: PathBuf,
    /// 0 = repo-local, 1 = user-level, [`STDLIB_PRIORITY`] = built-in.
    ///
    /// For on-disk filters this is the index of the search dir the filter was
    /// found in; a lower number wins when filters are sorted.
    pub priority: u8,
}

impl ResolvedFilter {
    /// Try each of this filter's command patterns against `words` in order
    /// and return the `words_consumed` of the first pattern that matches, or
    /// `None` if none does.
    pub fn matches(&self, words: &[&str]) -> Option<usize> {
        self.config
            .command
            .patterns()
            .iter()
            .find_map(|pattern| pattern_matches_prefix(pattern, words))
    }

    /// Highest specificity among this filter's patterns (used for sorting);
    /// `0` when the filter has no patterns.
    pub fn specificity(&self) -> usize {
        let mut best = 0;
        for pattern in self.config.command.patterns() {
            best = best.max(pattern_specificity(pattern));
        }
        best
    }

    /// Whether `name` equals this filter's display name, i.e. its relative
    /// path with the extension (`.toml`) removed.
    pub fn matches_name(&self, name: &str) -> bool {
        let display_name = self.relative_path.with_extension("");
        display_name.to_string_lossy() == name
    }

    /// Human-readable label for the priority level: `local` for 0, `user`
    /// for 1, and `built-in` for everything else.
    pub const fn priority_label(&self) -> &'static str {
        if self.priority == 0 {
            "local"
        } else if self.priority == 1 {
            "user"
        } else {
            "built-in"
        }
    }
}

/// Gather the filters from every dir in `search_dirs` plus the embedded
/// stdlib, ordered by `(priority ASC, specificity DESC)`.
///
/// A filter's priority is the index of its search dir in the slice. Embedded
/// stdlib filters are added with priority [`STDLIB_PRIORITY`], so local (0)
/// and user (1) filters shadow built-in ones.
///
/// Deduplication: after sorting, only the first filter for each command
/// pattern (keyed by the `first()` pattern string) is kept.
///
/// # Errors
///
/// Missing directories, unreadable files and invalid TOML (on disk or
/// embedded) are skipped silently rather than reported. As written, this
/// function always returns `Ok`; the `Result` is kept for callers.
pub fn discover_all_filters(search_dirs: &[PathBuf]) -> anyhow::Result<Vec<ResolvedFilter>> {
    let mut resolved: Vec<ResolvedFilter> = Vec::new();

    // On-disk filters: the position of the dir in the slice is its priority.
    for (index, search_dir) in search_dirs.iter().enumerate() {
        let priority = u8::try_from(index).unwrap_or(u8::MAX);

        for source_path in discover_filter_files(search_dir) {
            let config = match try_load_filter(&source_path) {
                Ok(Some(config)) => config,
                // Vanished, unreadable or malformed files are ignored.
                Ok(None) | Err(_) => continue,
            };

            let relative_path = source_path
                .strip_prefix(search_dir)
                .unwrap_or(&source_path)
                .to_path_buf();
            let hash = tokf_common::hash::canonical_hash(&config).unwrap_or_default();

            resolved.push(ResolvedFilter {
                config,
                hash,
                source_path,
                relative_path,
                priority,
            });
        }
    }

    // Embedded stdlib filters go in at STDLIB_PRIORITY so they sort after the
    // on-disk ones. A failed glob lookup contributes nothing.
    for entry in STDLIB.find("**/*.toml").into_iter().flatten() {
        let DirEntry::File(file) = entry else {
            continue;
        };
        let text = file.contents_utf8().unwrap_or("");
        let Ok(config) = toml::from_str::<FilterConfig>(text) else {
            continue; // silently skip invalid embedded TOML
        };

        let relative_path = file.path().to_path_buf();
        let hash = tokf_common::hash::canonical_hash(&config).unwrap_or_default();
        resolved.push(ResolvedFilter {
            config,
            hash,
            source_path: PathBuf::from("<built-in>").join(&relative_path),
            relative_path,
            priority: STDLIB_PRIORITY,
        });
    }

    // Lower priority number first; within one priority, higher specificity
    // first. The sort is stable, so ties keep their discovery order.
    resolved.sort_by_cached_key(|filter| (filter.priority, std::cmp::Reverse(filter.specificity())));

    // Keep only the first (winning) filter for each canonical command pattern.
    let mut seen_patterns = std::collections::HashSet::new();
    resolved.retain(|filter| seen_patterns.insert(filter.config.command.first().to_string()));

    Ok(resolved)
}

/// Build a rewrite regex pattern for a command pattern string.
///
/// The generated regex mirrors the two runtime matching behaviours:
///
/// 1. **Basename matching** — the first word allows an optional leading path
///    prefix (`/usr/bin/`, `./`, …), so `/usr/bin/git push` matches the
///    pattern `git push`. A path prefix on the pattern word itself is
///    stripped first, so `./mvnw test` and `mvnw test` produce the same regex.
///
/// 2. **Transparent global flags** — before every literal pattern word after
///    the first, any number of flag segments are tolerated, following the
///    rules of `skip_flags_to_match`. A flag segment is a token starting with
///    `-` that either
///    - contains `=` (`--flag=value`, also `--flag=`) and stands alone, or
///    - contains no `=` and is optionally followed by one value token that
///      does not start with `-`.
///
///    The value token is optional in the regex, so a flag directly followed
///    by the pattern word still matches: the optional group is simply left
///    empty when that is the only way for the whole regex to match.
///    Wildcards (`*`) use plain `\s+\S+` and do not tolerate flags.
///
/// The regex is anchored at both ends and finishes with the single capturing
/// group `(\s.*)?`, which holds any trailing arguments. An empty pattern
/// produces `^(\s.*)?$`.
///
/// # Examples
///
/// Pattern `"git log"` produces a regex that matches all of:
/// - `git log`
/// - `git log --oneline`
/// - `git -C /path log`
/// - `/usr/bin/git --no-pager -C /repo log --oneline`
///
/// and does not match `git --git-dir=/x extra log`, because a `--flag=value`
/// token takes no separate value word.
pub fn command_pattern_to_regex(pattern: &str) -> String {
    // Zero or more flag segments. The `=value` form and the separate-value
    // form are alternatives of one optional group, so a token that already
    // carries `=` can never swallow the following word as well. `[^=\s]*` and
    // `\S*` (rather than `+`) accept a bare `-` and an empty `--flag=` value,
    // both of which the runtime matcher skips too.
    const FLAG_SEGMENTS: &str = r"(?:\s+-[^=\s]*(?:=\S*|\s+[^-\s]\S*)?)*";

    let words: Vec<&str> = pattern.split_whitespace().collect();
    if words.is_empty() {
        return "^(\\s.*)?$".to_string();
    }

    let mut out = String::from("^");

    for (i, &word) in words.iter().enumerate() {
        if i == 0 {
            if word == "*" {
                out.push_str(r"\S+");
            } else {
                // Strip any path prefix from the pattern word itself (e.g.
                // `./mvnw` → `mvnw`) so that `command = "./mvnw test"` and
                // `command = "mvnw test"` produce identical regexes.
                let basename = extract_basename(word);
                // Allow an optional leading path prefix (e.g. `/usr/bin/` or
                // `./` or `C:\tools\`) so that `/usr/bin/git` and
                // `C:\tools\git` both match the pattern `git`.
                out.push_str(r"(?:[^\s]*[\\/])?");
                out.push_str(&regex::escape(basename));
            }
        } else if word == "*" {
            // Wildcard: require exactly one whitespace-separated token.
            out.push_str(r"\s+\S+");
        } else {
            // Literal word: tolerate global flags in front of it, then
            // require the word itself as a separate token.
            out.push_str(FLAG_SEGMENTS);
            out.push_str(r"\s+");
            out.push_str(&regex::escape(word));
        }
    }

    // Anything after the matched prefix must start on a token boundary.
    out.push_str(r"(\s.*)?$");
    out
}

/// Pair every pattern of `command` with its rewrite regex.
///
/// Each element is `(pattern, regex)`, where `regex` is the string produced
/// by `command_pattern_to_regex` for that pattern. Order follows
/// `command.patterns()`.
pub fn command_pattern_regexes(command: &CommandPattern) -> Vec<(String, String)> {
    let patterns = command.patterns();
    let mut pairs = Vec::with_capacity(patterns.len());
    for pattern in patterns {
        pairs.push((pattern.clone(), command_pattern_to_regex(pattern)));
    }
    pairs
}

#[cfg(test)]
mod tests;
#[cfg(test)]
mod tests_basename;
#[cfg(test)]
mod tests_discovery;
#[cfg(test)]
mod tests_matching;