#![forbid(unsafe_code)]
use tokmd_scan::normalize_slashes as normalize_path;
use tokmd_types::{FileClassification, InclusionPolicy};
/// Default maximum share of the total token budget one file may consume.
pub const DEFAULT_MAX_FILE_PCT: f64 = 0.15;
/// Default hard ceiling on tokens taken from a single file.
pub const DEFAULT_MAX_FILE_TOKENS: usize = 16_000;
/// Default tokens-per-line ratio above which a file is treated as a dense data blob.
pub const DEFAULT_DENSE_THRESHOLD: f64 = 50.0;
/// Well-known dependency lockfile basenames (matched against the file name only).
const LOCKFILES: &[&str] = &[
"Cargo.lock",
"package-lock.json",
"pnpm-lock.yaml",
"yarn.lock",
"poetry.lock",
"Pipfile.lock",
"go.sum",
"composer.lock",
"Gemfile.lock",
];
/// Basename suffixes that trigger smart exclusion, paired with the reason label.
const SMART_EXCLUDE_SUFFIXES: &[(&str, &str)] = &[
(".min.js", "minified"),
(".min.css", "minified"),
(".js.map", "sourcemap"),
(".css.map", "sourcemap"),
];
/// "Spine" documents: entries containing '/' match as path suffixes,
/// bare entries match the basename (see `is_spine_file`).
const SPINE_PATTERNS: &[&str] = &[
"README.md",
"README",
"README.rst",
"README.txt",
"ROADMAP.md",
"docs/ROADMAP.md",
"CONTRIBUTING.md",
"Cargo.toml",
"package.json",
"pyproject.toml",
"go.mod",
"docs/architecture.md",
"docs/design.md",
"tokmd.toml",
"cockpit.toml",
];
/// Substrings of a basename that indicate machine-generated code
/// (protobuf output, tree-sitter artifacts, Dart codegen, ...).
const GENERATED_PATTERNS: &[&str] = &[
"node-types.json",
"grammar.json",
".generated.",
".pb.go",
".pb.rs",
"_pb2.py",
".g.dart",
".freezed.dart",
];
/// Directory names whose contents count as vendored third-party code.
const VENDORED_DIRS: &[&str] = &["vendor/", "third_party/", "third-party/", "node_modules/"];
/// Directory names whose contents count as test fixtures / golden data.
const FIXTURE_DIRS: &[&str] = &[
"fixtures/",
"testdata/",
"test_data/",
"__snapshots__/",
"golden/",
];
/// Return a short reason label when `path` should be excluded outright —
/// `"lockfile"` for known lockfile basenames, `"minified"` / `"sourcemap"`
/// for the corresponding asset suffixes — or `None` otherwise.
#[must_use]
pub fn smart_exclude_reason(path: &str) -> Option<&'static str> {
    // Split on either separator so raw Windows-style paths get the same
    // basename extraction as the normalized paths used by `classify_file`;
    // the old '/'-only split left the whole path as the "basename".
    let basename = path
        .rsplit(|c| c == '/' || c == '\\')
        .next()
        .unwrap_or(path);
    if LOCKFILES.contains(&basename) {
        return Some("lockfile");
    }
    SMART_EXCLUDE_SUFFIXES
        .iter()
        .find(|(suffix, _)| basename.ends_with(suffix))
        .map(|&(_, reason)| reason)
}
/// Report whether `path` is a "spine" document (README, manifest,
/// architecture doc, ...) that should always be surfaced.
///
/// Patterns containing '/' match the whole normalized path or a
/// '/'-anchored suffix of it; bare patterns match the basename exactly.
#[must_use]
pub fn is_spine_file(path: &str) -> bool {
    let normalized = normalize_path(path);
    let basename = normalized.rsplit('/').next().unwrap_or(&normalized);
    SPINE_PATTERNS.iter().any(|&pattern| {
        if pattern.contains('/') {
            // `strip_suffix` covers both "exact match" (empty prefix) and
            // "ends with /<pattern>" (prefix ending in '/') without the
            // per-pattern `format!` allocation the previous code paid.
            matches!(
                normalized.strip_suffix(pattern),
                Some(prefix) if prefix.is_empty() || prefix.ends_with('/')
            )
        } else {
            basename == pattern
        }
    })
}
/// Classify a file by path, token count, and density heuristics.
///
/// Returns a sorted, de-duplicated list of every classification that
/// applies; an empty vector means the file looks like ordinary source.
/// `tokens` and `lines` feed the density check: more than
/// `dense_threshold` tokens per line marks the file as a data blob.
#[must_use]
pub fn classify_file(
    path: &str,
    tokens: usize,
    lines: usize,
    dense_threshold: f64,
) -> Vec<FileClassification> {
    let mut classes = Vec::new();
    let normalized = normalize_path(path);
    let basename = normalized.rsplit('/').next().unwrap_or(&normalized);
    if LOCKFILES.contains(&basename) {
        classes.push(FileClassification::Lockfile);
    }
    if basename.ends_with(".min.js") || basename.ends_with(".min.css") {
        classes.push(FileClassification::Minified);
    }
    if basename.ends_with(".js.map") || basename.ends_with(".css.map") {
        classes.push(FileClassification::Sourcemap);
    }
    // Substring match subsumes equality, so a separate `==` test is redundant.
    if GENERATED_PATTERNS.iter().any(|pat| basename.contains(pat)) {
        classes.push(FileClassification::Generated);
    }
    // True when any *directory* component of the path equals an entry in
    // `dirs` (trailing '/' in the table stripped). Component-wise matching
    // fixes the old substring/prefix tests, which misclassified paths such
    // as "vendorize.rs" (starts_with "vendor") or "libvendor/x" (contains
    // "vendor/").
    let dir_component_match = |dirs: &[&str]| {
        let mut components = normalized.split('/');
        components.next_back(); // drop the basename; only directories count
        components.any(|component| dirs.iter().any(|dir| component == dir.trim_end_matches('/')))
    };
    if dir_component_match(VENDORED_DIRS) {
        classes.push(FileClassification::Vendored);
    }
    if dir_component_match(FIXTURE_DIRS) {
        classes.push(FileClassification::Fixture);
    }
    // Guard against division by zero for zero-line files.
    let effective_lines = lines.max(1);
    let tokens_per_line = tokens as f64 / effective_lines as f64;
    if tokens_per_line > dense_threshold {
        classes.push(FileClassification::DataBlob);
    }
    classes.sort();
    classes.dedup();
    classes
}
/// Compute the per-file token cap: the smaller of `max_file_pct` of the
/// budget and the hard token ceiling (`DEFAULT_MAX_FILE_TOKENS` when
/// `max_file_tokens` is `None`). A `usize::MAX` budget means "unlimited".
#[must_use]
pub fn compute_file_cap(budget: usize, max_file_pct: f64, max_file_tokens: Option<usize>) -> usize {
    // An unlimited budget short-circuits to an unlimited cap.
    if budget == usize::MAX {
        return usize::MAX;
    }
    let hard_cap = max_file_tokens.unwrap_or(DEFAULT_MAX_FILE_TOKENS);
    let pct_cap = (budget as f64 * max_file_pct) as usize;
    if pct_cap < hard_cap { pct_cap } else { hard_cap }
}
/// Decide how an oversized file is included, given its token count, the
/// per-file cap, and its classifications.
///
/// Files within the cap are included in full with no reason string.
/// Oversized files tagged generated / data-blob / vendored are skipped;
/// any other oversized file gets head+tail truncation. The reason string
/// explains the decision for display.
#[must_use]
pub fn assign_policy(
    tokens: usize,
    file_cap: usize,
    classifications: &[FileClassification],
) -> (InclusionPolicy, Option<String>) {
    if tokens <= file_cap {
        return (InclusionPolicy::Full, None);
    }
    // Only these classes are low-value enough to drop entirely when oversized.
    let should_skip = classifications.iter().any(|c| {
        matches!(
            c,
            FileClassification::Generated
                | FileClassification::DataBlob
                | FileClassification::Vendored
        )
    });
    if should_skip {
        let labels = classifications
            .iter()
            .map(classification_name)
            .collect::<Vec<_>>()
            .join("+");
        let reason = format!("{labels} file exceeds cap ({tokens} > {file_cap} tokens)");
        (InclusionPolicy::Skip, Some(reason))
    } else {
        let reason = format!("file exceeds cap ({tokens} > {file_cap} tokens); head+tail included");
        (InclusionPolicy::HeadTail, Some(reason))
    }
}
/// Map a classification variant to the stable snake_case label used in
/// skip-reason strings.
fn classification_name(classification: &FileClassification) -> &'static str {
    // Arms sorted by label; the match is exhaustive, so adding a variant
    // upstream is a compile error here rather than a silent gap.
    match classification {
        FileClassification::DataBlob => "data_blob",
        FileClassification::Fixture => "fixture",
        FileClassification::Generated => "generated",
        FileClassification::Lockfile => "lockfile",
        FileClassification::Minified => "minified",
        FileClassification::Sourcemap => "sourcemap",
        FileClassification::Vendored => "vendored",
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Lockfiles match by basename; suffix rules apply to nested paths too.
#[test]
fn smart_exclude_reason_detects_lockfiles_and_sourcemaps() {
assert_eq!(smart_exclude_reason("Cargo.lock"), Some("lockfile"));
assert_eq!(smart_exclude_reason("dist/app.js.map"), Some("sourcemap"));
assert_eq!(smart_exclude_reason("src/main.rs"), None);
}
// Bare patterns match the basename anywhere; '/'-patterns match as
// '/'-anchored path suffixes.
#[test]
fn is_spine_file_matches_basename_and_document_paths() {
assert!(is_spine_file("README.md"));
assert!(is_spine_file("nested/docs/architecture.md"));
assert!(!is_spine_file("src/main.rs"));
}
// 50_000 tokens over 5 lines = 10_000 tokens/line, well past the 50.0
// density threshold, and the basename hits GENERATED_PATTERNS.
#[test]
fn classify_file_detects_generated_and_dense_blob() {
let classes = classify_file("src/node-types.json", 50_000, 5, 50.0);
assert!(classes.contains(&FileClassification::Generated));
assert!(classes.contains(&FileClassification::DataBlob));
}
// Oversized generated files are skipped, with the class name in the reason.
#[test]
fn assign_policy_skips_oversized_generated_files() {
let (policy, reason) = assign_policy(20_000, 16_000, &[FileClassification::Generated]);
assert_eq!(policy, InclusionPolicy::Skip);
assert!(reason.unwrap_or_default().contains("generated"));
}
}