use std::collections::BTreeSet;
use std::io::Read;
use std::path::{Component, Path, PathBuf};
use crate::index::languages;
use crate::index::security;
/// Largest file size in bytes (10 MiB) accepted by discovery; it is handed to
/// the walker settings and re-checked in `is_safe_text_file`.
const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
/// Only the first 64 KiB of a JS-family file are searched for generated-code markers.
const GENERATED_JS_MARKER_SCAN_BYTES: usize = 64 * 1024;
/// Number of leading bytes (256 KiB) read from a JS-family file for the
/// generated/minified analysis.
const GENERATED_JS_ANALYSIS_READ_BYTES: u64 = 256 * 1024;
/// Files smaller than this (128 KiB) are never treated as minified bundles.
const MINIFIED_JS_MIN_BYTES: usize = 128 * 1024;
/// A single line of at least this many bytes (20 KiB) marks the sample as minified.
const MINIFIED_JS_LONG_LINE_BYTES: usize = 20 * 1024;
/// Upper bound on the non-empty line count for the average-line-length rule.
const MINIFIED_JS_MAX_LINES: usize = 20;
/// Average non-empty line length (2 KiB) at or above which a sample with few
/// lines is considered minified.
const MINIFIED_JS_AVG_LINE_BYTES: usize = 2 * 1024;
/// Project configuration file, relative to the root, that may extend the hidden allowlist.
const GCODE_CONFIG_PATH: &str = ".gobby/gcode.json";
/// Root-relative glob patterns for hidden files that are always considered,
/// in addition to any patterns listed in the project configuration.
const DEFAULT_HIDDEN_ALLOWLIST_PATTERNS: &[&str] = &[
".gobby/plans/**/*.md",
".github/workflows/**/*.yml",
".github/workflows/**/*.yaml",
];
/// Lowercase phrases that mark a JS-family file as generated; they are
/// compared against an ASCII-lowercased copy of the file prefix.
const GENERATED_JS_MARKERS: &[&str] = &[
"generated by",
"do not edit",
"@generated",
"auto-generated",
"automatically generated",
];
/// How an accepted file should be indexed, as decided by `classify_file`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileClassification {
/// The path maps to a language known to `languages::detect_language`.
Ast,
/// Indexed as plain content: either no language was detected or the file is
/// allowlisted hidden metadata (plans / workflow YAML).
ContentOnly,
}
/// Walks `root` and returns `(ast_files, content_only_files)`.
///
/// Two sources feed the result: the regular directory walker (configured
/// through the shared walker settings, with the `hidden` flag set and a
/// `MAX_FILE_SIZE` cap) and the hidden-path allowlist, which is expanded
/// separately. Every path is classified through `push_classified_file`, so a
/// file reachable through both sources is recorded only once.
pub fn discover_files(root: &Path, exclude_patterns: &[String]) -> (Vec<PathBuf>, Vec<PathBuf>) {
    let mut ast_files = Vec::new();
    let mut text_files = Vec::new();
    let mut visited = BTreeSet::new();

    // Single funnel for both discovery sources.
    let mut record = |path: &Path| {
        push_classified_file(
            root,
            path,
            exclude_patterns,
            &mut ast_files,
            &mut text_files,
            &mut visited,
        );
    };

    let mut settings = gobby_core::indexing::WalkerSettings::new(root);
    settings.max_filesize = Some(MAX_FILE_SIZE);
    let mut builder = settings.into_walker();
    builder.hidden(true);

    // Entries the walker failed to read are dropped by `flatten`.
    for entry in builder.build().flatten() {
        let path = entry.path();
        if path.is_file() {
            record(path);
        }
    }

    // Hidden files are only picked up when an allowlist pattern matches them.
    for path in HiddenPathAllowlist::load(root).discover(root) {
        record(path.as_path());
    }

    (ast_files, text_files)
}
/// Decides whether `path` should be indexed and, if so, how.
///
/// Returns `None` when the file fails the safety checks or looks like a
/// generated/minified JS bundle. Allowlisted hidden metadata is always
/// `ContentOnly`; otherwise a file is `Ast` when a language is detected for
/// its path and `ContentOnly` when none is.
pub fn classify_file(
    root: &Path,
    path: &Path,
    exclude_patterns: &[String],
) -> Option<FileClassification> {
    // The safety check runs first; the bundle check only runs for files that pass it.
    if !is_safe_text_file(root, path, exclude_patterns) || is_generated_js_bundle(path) {
        return None;
    }

    let classification = if is_hidden_metadata_content_only(root, path) {
        FileClassification::ContentOnly
    } else if languages::detect_language(&path.to_string_lossy()).is_some() {
        FileClassification::Ast
    } else {
        FileClassification::ContentOnly
    };
    Some(classification)
}
/// Returns `true` only when `classify_file` labels `path` as `ContentOnly`.
pub fn is_content_indexable(root: &Path, path: &Path, exclude_patterns: &[String]) -> bool {
    let classification = classify_file(root, path, exclude_patterns);
    classification == Some(FileClassification::ContentOnly)
}
/// Derives a language label for a content-only file from its extension.
///
/// The extension is lowercased; `md`/`markdown` map to `"markdown"`,
/// `yml`/`yaml` map to `"yaml"`, any other extension is returned as-is, and a
/// missing or empty extension yields `"text"`.
pub fn content_language(path: &Path) -> String {
    let lowered = match path.extension() {
        Some(ext) if !ext.is_empty() => ext.to_string_lossy().to_lowercase(),
        _ => return String::from("text"),
    };

    let alias = match lowered.as_str() {
        "md" | "markdown" => Some("markdown"),
        "yml" | "yaml" => Some("yaml"),
        _ => None,
    };
    match alias {
        Some(name) => name.to_string(),
        None => lowered,
    }
}
/// Classifies `path` and appends it to the matching output list, at most once.
///
/// Deduplication is keyed on the canonicalized path, falling back to the path
/// as given when canonicalization fails. A path is marked as seen before it is
/// classified, so rejected files are not re-examined either.
fn push_classified_file(
    root: &Path,
    path: &Path,
    exclude_patterns: &[String],
    candidates: &mut Vec<PathBuf>,
    content_only: &mut Vec<PathBuf>,
    seen: &mut BTreeSet<PathBuf>,
) {
    let identity = match path.canonicalize() {
        Ok(resolved) => resolved,
        Err(_) => path.to_path_buf(),
    };
    let first_visit = seen.insert(identity);
    if !first_visit {
        return;
    }

    // Pick the destination list; unclassifiable files are dropped.
    let bucket = match classify_file(root, path, exclude_patterns) {
        Some(FileClassification::Ast) => candidates,
        Some(FileClassification::ContentOnly) => content_only,
        None => return,
    };
    bucket.push(path.to_path_buf());
}
/// Root-relative glob patterns naming hidden files that may be indexed even
/// though the main walk is configured with the `hidden` flag set.
#[derive(Debug, Clone, PartialEq, Eq)]
struct HiddenPathAllowlist {
    /// Normalized, validated and globstar-expanded patterns.
    patterns: Vec<String>,
}

impl HiddenPathAllowlist {
    /// Builds the allowlist from the built-in defaults followed by any
    /// patterns found in the project's configuration file.
    fn load(root: &Path) -> Self {
        let defaults = DEFAULT_HIDDEN_ALLOWLIST_PATTERNS
            .iter()
            .map(|pattern| String::from(*pattern));
        let combined: Vec<String> = defaults
            .chain(read_project_hidden_allowlist(root))
            .collect();
        Self::from_patterns(combined)
    }

    /// Normalizes raw patterns: trims whitespace, converts backslashes to
    /// forward slashes, drops invalid patterns, and adds the zero-depth
    /// variant of the first `/**/` segment.
    fn from_patterns(patterns: Vec<String>) -> Self {
        let mut normalized = Vec::new();
        for raw in patterns {
            let pattern = raw.trim().replace('\\', "/");
            if is_valid_allowlist_pattern(&pattern) {
                normalized.extend(expand_zero_depth_globstar(&pattern));
            }
        }
        Self {
            patterns: normalized,
        }
    }

    /// Expands every pattern under `root` and returns the matching regular
    /// files that have a hidden component, sorted and without duplicates.
    fn discover(&self, root: &Path) -> Vec<PathBuf> {
        let mut found = BTreeSet::new();
        for pattern in &self.patterns {
            // Skip patterns that cannot be made absolute or fail to parse.
            let matches = absolute_glob_pattern(root, pattern)
                .and_then(|absolute| glob::glob(&absolute).ok());
            let Some(matches) = matches else {
                continue;
            };
            found.extend(
                matches
                    .flatten()
                    .filter(|candidate| candidate.is_file() && is_hidden_path(root, candidate)),
            );
        }
        found.into_iter().collect()
    }
}
/// Reads the string entries of `index.hidden_allowlist` from the project
/// configuration file under `root`.
///
/// Best effort: a missing or unreadable file, invalid JSON, or a missing or
/// non-array key all produce an empty list, and non-string array entries are
/// skipped.
fn read_project_hidden_allowlist(root: &Path) -> Vec<String> {
    let config_path = root.join(GCODE_CONFIG_PATH);
    let parsed = std::fs::read_to_string(config_path)
        .ok()
        .and_then(|raw| serde_json::from_str::<serde_json::Value>(&raw).ok());
    let Some(config) = parsed else {
        return Vec::new();
    };

    let entries = config
        .get("index")
        .and_then(|section| section.get("hidden_allowlist"))
        .and_then(serde_json::Value::as_array);
    let Some(entries) = entries else {
        return Vec::new();
    };

    entries
        .iter()
        .filter_map(|entry| entry.as_str())
        .map(str::to_owned)
        .collect()
}
/// Accepts only non-empty, root-relative patterns that cannot climb out of
/// the project: absolute paths and any `..`, prefix or root component are
/// rejected.
fn is_valid_allowlist_pattern(pattern: &str) -> bool {
    let candidate = Path::new(pattern);
    if pattern.is_empty() || candidate.is_absolute() {
        return false;
    }
    candidate.components().all(|part| match part {
        Component::Normal(_) | Component::CurDir => true,
        Component::ParentDir | Component::Prefix(_) | Component::RootDir => false,
    })
}
/// Returns `pattern` and, when it contains `/**/`, a second pattern with the
/// first `/**/` collapsed to `/` so files directly inside the prefix
/// directory are matched as well.
fn expand_zero_depth_globstar(pattern: &str) -> Vec<String> {
    let collapsed = pattern
        .split_once("/**/")
        .map(|(head, tail)| format!("{head}/{tail}"));
    std::iter::once(pattern.to_owned())
        .chain(collapsed)
        .collect()
}
/// Joins `root` and a relative glob `pattern` into one absolute glob string.
///
/// The root is escaped so glob metacharacters in the directory name are taken
/// literally. Returns `None` when `root` is not valid UTF-8.
fn absolute_glob_pattern(root: &Path, pattern: &str) -> Option<String> {
    let escaped_root = glob::Pattern::escape(root.to_str()?);
    Some(format!("{escaped_root}/{pattern}"))
}
/// Reports whether any component of `path`, taken relative to `root` when
/// possible, starts with a dot. The special components `.` and `..` do not
/// count, and components that are not valid UTF-8 are ignored.
fn is_hidden_path(root: &Path, path: &Path) -> bool {
    let relative = match path.strip_prefix(root) {
        Ok(rest) => rest,
        // Not under `root`: inspect the whole path instead.
        Err(_) => path,
    };
    for part in relative.components() {
        let Some(name) = part.as_os_str().to_str() else {
            continue;
        };
        if name.starts_with('.') && !matches!(name, "." | "..") {
            return true;
        }
    }
    false
}
/// Recognizes hidden metadata that is always indexed as content only:
/// Markdown files under `.gobby/plans/` and YAML files under
/// `.github/workflows/`, both relative to `root`.
fn is_hidden_metadata_content_only(root: &Path, path: &Path) -> bool {
    let relative = path.strip_prefix(root).unwrap_or(path);
    // Only ordinary, UTF-8 components take part in the prefix match.
    let names: Vec<&str> = relative
        .components()
        .filter_map(|part| match part {
            Component::Normal(name) => name.to_str(),
            _ => None,
        })
        .collect();

    // Both prefixes need at least one more component after the directory pair.
    match names.as_slice() {
        [".gobby", "plans", _, ..] => path_has_extension(path, &["md"]),
        [".github", "workflows", _, ..] => path_has_extension(path, &["yml", "yaml"]),
        _ => false,
    }
}
/// Checks whether the ASCII-lowercased extension of `path` equals one of
/// `extensions`. Entries in `extensions` are compared exactly, so they are
/// expected to be lowercase. Paths with no UTF-8 extension never match.
fn path_has_extension(path: &Path, extensions: &[&str]) -> bool {
    let Some(raw) = path.extension().and_then(|ext| ext.to_str()) else {
        return false;
    };
    let lowered = raw.to_ascii_lowercase();
    extensions.iter().any(|candidate| *candidate == lowered)
}
/// Gatekeeper applied to every file before classification.
///
/// A file passes only if it is a regular file, clears the `security` module's
/// path-validation, symlink, exclusion and secret-extension checks, is
/// non-empty and no larger than `MAX_FILE_SIZE`, and is not detected as
/// binary. The checks run in that order and stop at the first failure.
fn is_safe_text_file(root: &Path, path: &Path, exclude_patterns: &[String]) -> bool {
    let passes_path_checks = path.is_file()
        && security::validate_path(path, root)
        && security::is_symlink_safe(path, root)
        && !security::should_exclude_path(root, path, exclude_patterns)
        && !security::has_secret_extension(path);
    if !passes_path_checks {
        return false;
    }

    match path.metadata() {
        // The binary sniff runs last, only for files within the size bounds.
        Ok(meta) if (1..=MAX_FILE_SIZE).contains(&meta.len()) => !security::is_binary(path),
        _ => false,
    }
}
/// Heuristically detects generated or minified JS-family files.
///
/// Only files with a JS-family extension are inspected. Such a file is
/// flagged when its leading bytes contain a generated-code marker, or when it
/// is at least `MINIFIED_JS_MIN_BYTES` long and its prefix looks minified.
/// Failure to read the metadata or the prefix yields `false`.
fn is_generated_js_bundle(path: &Path) -> bool {
    if !is_js_family_file(path) {
        return false;
    }
    let file_len = match path.metadata() {
        Ok(metadata) => metadata.len(),
        Err(_) => return false,
    };
    let prefix = match read_file_prefix(path, GENERATED_JS_ANALYSIS_READ_BYTES) {
        Ok(bytes) => bytes,
        Err(_) => return false,
    };

    contains_generated_js_marker(&prefix)
        || (file_len >= MINIFIED_JS_MIN_BYTES as u64 && looks_minified_js_bundle(&prefix))
}
/// Reads at most `max_bytes` bytes from the start of the file at `path`.
///
/// The result is shorter than `max_bytes` when the file is. The initial
/// allocation is bounded by the file's reported size, so small files do not
/// reserve a `max_bytes`-sized buffer and a very large `max_bytes` cannot
/// trigger a capacity-overflow panic.
///
/// # Errors
/// Returns any I/O error raised while opening or reading the file.
fn read_file_prefix(path: &Path, max_bytes: u64) -> std::io::Result<Vec<u8>> {
    let file = std::fs::File::open(path)?;
    // The size is only a preallocation hint; fall back to growing on demand
    // when it is unavailable or does not fit in `usize`.
    let size_hint = file.metadata().map(|meta| meta.len()).unwrap_or(0);
    let capacity = usize::try_from(size_hint.min(max_bytes)).unwrap_or(0);
    let mut bytes = Vec::with_capacity(capacity);
    file.take(max_bytes).read_to_end(&mut bytes)?;
    Ok(bytes)
}
/// Returns `true` when the extension of `path` is `js`, `jsx`, `cjs` or
/// `mjs`, compared without regard to ASCII case.
fn is_js_family_file(path: &Path) -> bool {
    const JS_FAMILY: [&str; 4] = ["js", "jsx", "cjs", "mjs"];
    let Some(ext) = path.extension().and_then(|ext| ext.to_str()) else {
        return false;
    };
    JS_FAMILY.iter().any(|known| ext.eq_ignore_ascii_case(known))
}
/// Searches the first `GENERATED_JS_MARKER_SCAN_BYTES` bytes of `bytes` for
/// any generated-code marker, ignoring ASCII case. Invalid UTF-8 is decoded
/// lossily before the comparison.
fn contains_generated_js_marker(bytes: &[u8]) -> bool {
    let window = &bytes[..bytes.len().min(GENERATED_JS_MARKER_SCAN_BYTES)];
    let haystack = String::from_utf8_lossy(window).to_ascii_lowercase();
    for &marker in GENERATED_JS_MARKERS {
        if haystack.contains(marker) {
            return true;
        }
    }
    false
}
/// Line-shape heuristic for minified JavaScript.
///
/// Samples shorter than `MINIFIED_JS_MIN_BYTES` are never flagged. Otherwise
/// the sample is split on `\n`, empty lines are ignored, and it is flagged
/// when the longest line reaches `MINIFIED_JS_LONG_LINE_BYTES`, or when it has
/// at most `MINIFIED_JS_MAX_LINES` lines whose average length reaches
/// `MINIFIED_JS_AVG_LINE_BYTES`.
fn looks_minified_js_bundle(bytes: &[u8]) -> bool {
    if bytes.len() < MINIFIED_JS_MIN_BYTES {
        return false;
    }

    // (non-empty line count, summed length, longest length)
    let (lines, total, longest) = bytes
        .split(|&byte| byte == b'\n')
        .map(|line| line.len())
        .filter(|&len| len > 0)
        .fold((0usize, 0usize, 0usize), |(count, sum, max), len| {
            (count + 1, sum + len, max.max(len))
        });

    if lines == 0 {
        return false;
    }
    if longest >= MINIFIED_JS_LONG_LINE_BYTES {
        return true;
    }
    lines <= MINIFIED_JS_MAX_LINES && total / lines >= MINIFIED_JS_AVG_LINE_BYTES
}
// Unit tests for file discovery and classification. Each test builds a
// throwaway directory tree with `tempfile` and checks the outcome of
// `discover_files`, `classify_file` or `content_language`.
#[cfg(test)]
mod tests {
use super::*;
// Writes `contents` to `root/rel`, creating parent directories as needed.
fn write_file(root: &Path, rel: &str, contents: &[u8]) {
let path = root.join(rel);
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).expect("create parent");
}
std::fs::write(path, contents).expect("write file");
}
// Converts absolute paths into sorted, root-relative strings so they can be
// compared against literal expectations.
fn rels(root: &Path, paths: Vec<PathBuf>) -> Vec<String> {
let mut rels: Vec<String> = paths
.into_iter()
.map(|path| {
path.strip_prefix(root)
.expect("path under root")
.to_string_lossy()
.to_string()
})
.collect();
rels.sort();
rels
}
// End-to-end discovery: source files land in the AST list, other text files
// in the content-only list, and the binary, secret-looking and excluded
// fixtures are absent from both.
#[test]
fn discovers_ast_and_content_only_text_files() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(root, "README.md", b"# Title\n");
write_file(root, "skills/gcode/SKILL.md", b"# gcode\n");
write_file(root, "src/lib.rs", b"fn main() {}\n");
write_file(root, "src/module.mjs", b"export const value = 1;\n");
write_file(root, "docs/reference.markdown", b"# Reference\n");
write_file(root, "docs/guide.rst", b"Guide\n=====\n");
write_file(root, "notes.txt", b"plain notes\n");
write_file(root, "config/app.properties", b"mode=dev\n");
write_file(root, "config/app.toml", b"mode = 'dev'\n");
write_file(root, "scripts/setup.sh", b"#!/usr/bin/env bash\n");
write_file(root, "Dockerfile", b"FROM rust:latest\n");
write_file(root, "image.bin", b"PNG\0binary");
write_file(root, "api_key.txt", b"secret-ish\n");
write_file(root, "target/generated.txt", b"generated\n");
let excludes = vec!["target".to_string()];
let (ast, content_only) = discover_files(root, &excludes);
assert_eq!(rels(root, ast), vec!["src/lib.rs", "src/module.mjs"]);
assert_eq!(
rels(root, content_only),
vec![
"Dockerfile",
"README.md",
"config/app.properties",
"config/app.toml",
"docs/guide.rst",
"docs/reference.markdown",
"notes.txt",
"scripts/setup.sh",
"skills/gcode/SKILL.md"
]
);
}
// A text file without an extension is content-only and labelled "text".
#[test]
fn classifies_extensionless_text_as_content_only() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(root, "Makefile", b"test:\n\tcargo test\n");
let excludes = Vec::new();
assert_eq!(
classify_file(root, &root.join("Makefile"), &excludes),
Some(FileClassification::ContentOnly)
);
assert_eq!(content_language(&root.join("Makefile")), "text");
}
// Both `.md` and `.markdown` map to the "markdown" label.
#[test]
fn classifies_markdown_content_language_as_markdown() {
assert_eq!(content_language(Path::new("README.md")), "markdown");
assert_eq!(
content_language(Path::new("docs/guide.markdown")),
"markdown"
);
assert_eq!(
content_language(Path::new("skills/gcode/SKILL.md")),
"markdown"
);
}
// Both `.yml` and `.yaml` map to the "yaml" label.
#[test]
fn classifies_yaml_content_language_as_yaml() {
assert_eq!(
content_language(Path::new(".github/workflows/ci.yml")),
"yaml"
);
assert_eq!(
content_language(Path::new(".github/workflows/release.yaml")),
"yaml"
);
}
// `.mjs` sources are AST-indexable while Markdown stays content-only.
#[test]
fn classifies_mjs_as_ast_and_markdown_as_content_only() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(root, "src/module.mjs", b"export const value = 1;\n");
write_file(root, "README.md", b"# Title\n");
write_file(root, "docs/guide.markdown", b"# Guide\n");
let excludes = Vec::new();
assert_eq!(
classify_file(root, &root.join("src/module.mjs"), &excludes),
Some(FileClassification::Ast)
);
assert_eq!(
classify_file(root, &root.join("README.md"), &excludes),
Some(FileClassification::ContentOnly)
);
assert_eq!(
classify_file(root, &root.join("docs/guide.markdown"), &excludes),
Some(FileClassification::ContentOnly)
);
}
// Workflow YAML under `.github/workflows` is classified as content-only.
#[test]
fn classifies_github_workflow_yaml_as_content_only() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(root, ".github/workflows/ci.yml", b"name: ci\n");
write_file(root, ".github/workflows/release.yaml", b"name: release\n");
let excludes = Vec::new();
assert_eq!(
classify_file(root, &root.join(".github/workflows/ci.yml"), &excludes),
Some(FileClassification::ContentOnly)
);
assert_eq!(
classify_file(
root,
&root.join(".github/workflows/release.yaml"),
&excludes
),
Some(FileClassification::ContentOnly)
);
}
// The built-in allowlist surfaces plans and workflow files, including ones
// in nested directories.
#[test]
fn discovers_default_hidden_metadata_allowlist() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(root, "src/lib.rs", b"fn main() {}\n");
write_file(root, ".gobby/plans/foo.md", b"# Plan\n");
write_file(root, ".gobby/plans/nested/bar.md", b"# Nested\n");
write_file(root, ".github/workflows/ci.yml", b"name: ci\n");
write_file(root, ".github/workflows/release.yaml", b"name: release\n");
let (ast, content_only) = discover_files(root, &[]);
assert_eq!(rels(root, ast), vec!["src/lib.rs"]);
assert_eq!(
rels(root, content_only),
vec![
".github/workflows/ci.yml",
".github/workflows/release.yaml",
".gobby/plans/foo.md",
".gobby/plans/nested/bar.md",
]
);
}
// Hidden files that match no allowlist pattern are not discovered.
#[test]
fn skips_non_allowlisted_hidden_metadata_by_default() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(root, ".github/ISSUE_TEMPLATE/bug.md", b"# Bug\n");
write_file(root, ".gobby/gcode.json", br#"{"id":"project"}"#);
write_file(root, ".gobby/project.json", br#"{"id":"project"}"#);
write_file(root, ".gobby/wiki/page.md", b"# Wiki\n");
write_file(root, ".gobby/screenshots/shot.md", b"# Screenshot\n");
write_file(root, ".gobby/tasks.jsonl", b"{}\n");
write_file(root, ".gobby/memories.jsonl", b"{}\n");
let (ast, content_only) = discover_files(root, &[]);
assert!(rels(root, ast).is_empty());
assert!(rels(root, content_only).is_empty());
}
// Patterns listed under `index.hidden_allowlist` in `.gobby/gcode.json`
// extend the allowlist; files outside the pattern stay hidden.
#[test]
fn discovers_project_hidden_allowlist_from_gcode_json() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(
root,
".gobby/gcode.json",
br#"{"index":{"hidden_allowlist":[".custom/agent-docs/**/*.md"]}}"#,
);
write_file(root, ".custom/agent-docs/guide.md", b"# Guide\n");
write_file(root, ".custom/agent-docs/nested/runbook.md", b"# Runbook\n");
write_file(root, ".custom/other.md", b"# Other\n");
let (ast, content_only) = discover_files(root, &[]);
assert!(rels(root, ast).is_empty());
assert_eq!(
rels(root, content_only),
vec![
".custom/agent-docs/guide.md",
".custom/agent-docs/nested/runbook.md",
]
);
}
// Exclude patterns take precedence over the hidden allowlist.
#[test]
fn excludes_win_over_allowlisted_hidden_paths() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(root, ".gobby/plans/foo.md", b"# Plan\n");
write_file(root, ".github/workflows/ci.yml", b"name: ci\n");
let excludes = vec![".gobby".to_string(), "workflows".to_string()];
let (ast, content_only) = discover_files(root, &excludes);
assert!(rels(root, ast).is_empty());
assert!(rels(root, content_only).is_empty());
}
// JS-family files carrying a generated-code marker (in any letter case) are
// rejected outright.
#[test]
fn skips_js_family_files_with_generated_markers() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
let excludes = Vec::new();
for (rel, marker) in [
("src/setup.mjs", "Generated by gcode setup"),
("src/app.js", "DO NOT EDIT"),
("src/view.jsx", "@generated"),
("src/runtime.cjs", "auto-generated"),
] {
write_file(
root,
rel,
format!("// {marker}\nexport const value = 1;\n").as_bytes(),
);
assert_eq!(classify_file(root, &root.join(rel), &excludes), None);
}
}
// Hand-written `.mjs` without markers remains AST-indexable.
#[test]
fn keeps_ordinary_mjs_source_ast_indexable() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(
root,
"src/config.mjs",
b"export function loadConfig() {\n return { mode: 'dev' };\n}\n",
);
let excludes = Vec::new();
assert_eq!(
classify_file(root, &root.join("src/config.mjs"), &excludes),
Some(FileClassification::Ast)
);
}
// A file above the minified size threshold consisting of one very long line
// is rejected.
#[test]
fn skips_large_minified_js_bundles() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
let mut bundle = b"var bundle='".to_vec();
bundle.extend(std::iter::repeat_n(b'a', MINIFIED_JS_MIN_BYTES));
bundle.extend(b"';\n");
write_file(root, "src/bundle.js", &bundle);
let excludes = Vec::new();
assert_eq!(
classify_file(root, &root.join("src/bundle.js"), &excludes),
None
);
}
// Same heuristic for a single-line bundle that ends with a newline.
#[test]
fn skips_single_line_minified_js_bundle_with_newline() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
let mut bundle = b"(()=>{const bundle='".to_vec();
bundle.extend(std::iter::repeat_n(b'a', MINIFIED_JS_MIN_BYTES));
bundle.extend(b"';})();\n");
write_file(root, "dist/app.js", &bundle);
let excludes = Vec::new();
assert_eq!(
classify_file(root, &root.join("dist/app.js"), &excludes),
None
);
}
// Same heuristic when the single line has no trailing newline.
#[test]
fn skips_single_line_minified_js_bundle_without_newline() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
let mut bundle = b"(()=>{const bundle='".to_vec();
bundle.extend(std::iter::repeat_n(b'a', MINIFIED_JS_MIN_BYTES));
bundle.extend(b"';})();");
write_file(root, "dist/app.js", &bundle);
let excludes = Vec::new();
assert_eq!(
classify_file(root, &root.join("dist/app.js"), &excludes),
None
);
}
// A nested directory that merely shares a name with an exclude pattern
// ("build") is still indexed.
#[test]
fn classifies_source_build_directory_as_ast_indexable() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(
root,
"src/gobby/build/workspaces.py",
b"class WorkspaceBuilder:\n pass\n",
);
let excludes = vec!["build".to_string(), "dist".to_string()];
assert_eq!(
classify_file(root, &root.join("src/gobby/build/workspaces.py"), &excludes),
Some(FileClassification::Ast)
);
}
// The same exclude pattern does apply to a `build` directory at the root.
#[test]
fn skips_root_build_directory() {
let tmp = tempfile::tempdir().expect("tempdir");
let root = tmp.path();
write_file(root, "build/generated.py", b"class Generated:\n pass\n");
let excludes = vec!["build".to_string(), "dist".to_string()];
assert_eq!(
classify_file(root, &root.join("build/generated.py"), &excludes),
None
);
}
// Source-level guard: this file must obtain its walker through the shared
// gobby_core walker settings rather than constructing a walk builder
// directly. Both needles are assembled with `concat` so the test's own text
// does not contain them verbatim.
#[test]
fn walker_consumes_gobby_core_walker_settings() {
let source = include_str!("walker.rs");
let settings = ["gobby_core", "::indexing::WalkerSettings"].concat();
let direct_builder = ["WalkBuilder", "::new(root)"].concat();
assert!(source.contains(&settings));
assert!(!source.contains(&direct_builder));
}
}