Skip to main content

gobby_code/index/walker/
classification.rs

1use std::path::Path;
2
3use crate::index::languages;
4use crate::index::security;
5use crate::index::{MAX_DATA_LANGUAGE_AST_SIZE, MAX_FILE_SIZE};
6
7use super::generated::is_generated_js_bundle;
8use super::hidden::{
9    HiddenPathAllowlist, is_generated_wiki_metadata, is_hidden_metadata_content_only,
10    is_hidden_path,
11};
12use super::types::{DiscoveryOptions, FileClassification};
13
14/// Classify an individual file for indexing.
15pub fn classify_file(
16    root: &Path,
17    path: &Path,
18    exclude_patterns: &[impl AsRef<str>],
19) -> Option<FileClassification> {
20    if !is_safe_text_file(root, path, exclude_patterns) {
21        return None;
22    }
23    if is_generated_wiki_metadata(root, path) {
24        return None;
25    }
26    if is_generated_js_bundle(path) {
27        return None;
28    }
29
30    if is_hidden_metadata_content_only(root, path) {
31        return Some(FileClassification::ContentOnly);
32    }
33
34    if let Some(lang) = languages::detect_language(&path.to_string_lossy()) {
35        // Oversized data files (JSON/YAML) would emit one `property` symbol per
36        // key; route them content-only so they don't bloat the graph/vector/FTS
37        // projections. `is_safe_text_file` already bounded len to (0, MAX_FILE_SIZE],
38        // so this is one extra `stat` on the data-language branch only (gobby-cli #678).
39        if languages::is_data_language(lang)
40            && path
41                .metadata()
42                .map(|m| m.len() > MAX_DATA_LANGUAGE_AST_SIZE)
43                .unwrap_or(false)
44        {
45            Some(FileClassification::ContentOnly)
46        } else {
47            Some(FileClassification::Ast)
48        }
49    } else {
50        Some(FileClassification::ContentOnly)
51    }
52}
53
54/// Classify an explicitly requested file with discovery filters applied to that
55/// one path instead of walking the whole project root.
56pub fn classify_explicit_file_with_options(
57    root: &Path,
58    path: &Path,
59    exclude_patterns: &[impl AsRef<str>],
60    options: DiscoveryOptions,
61) -> Option<FileClassification> {
62    if options.respect_gitignore && !explicit_path_visible(root, path, options) {
63        return None;
64    }
65    classify_file(root, path, exclude_patterns)
66}
67
68/// Return true when `path` is an unsupported, safe text file suitable for chunks.
69pub fn is_content_indexable(
70    root: &Path,
71    path: &Path,
72    exclude_patterns: &[impl AsRef<str>],
73) -> bool {
74    matches!(
75        classify_file(root, path, exclude_patterns),
76        Some(FileClassification::ContentOnly)
77    )
78}
79
/// Derive the language label used for a content-only file.
///
/// The label is the lowercased file extension, with `md`/`markdown` folded to
/// `"markdown"` and `yml`/`yaml` folded to `"yaml"`. A path with no extension
/// (or an empty one) is labelled `"text"`.
pub fn content_language(path: &Path) -> String {
    // A missing extension collapses to the empty string so that both the
    // "none" and "empty" cases land in the same match arm below.
    let lowered = path
        .extension()
        .map(|raw| raw.to_string_lossy().to_lowercase())
        .unwrap_or_default();

    match lowered.as_str() {
        "" => "text".to_string(),
        "md" | "markdown" => "markdown".to_string(),
        "yml" | "yaml" => "yaml".to_string(),
        _ => lowered,
    }
}
94
95fn explicit_path_visible(root: &Path, path: &Path, options: DiscoveryOptions) -> bool {
96    if is_hidden_path(root, path) && !HiddenPathAllowlist::load(root).matches(root, path) {
97        return false;
98    }
99
100    let walk_root = path.parent().unwrap_or(root);
101    let mut settings = gobby_core::indexing::WalkerSettings::new(walk_root);
102    settings.respect_gitignore = options.respect_gitignore;
103    settings.max_filesize = Some(MAX_FILE_SIZE);
104    let mut builder = settings.into_walker();
105    builder.hidden(false);
106    builder.max_depth(Some(1));
107    builder
108        .build()
109        .flatten()
110        .any(|entry| entry.path().is_file() && same_existing_path(entry.path(), path))
111}
112
/// Compare two paths after canonicalizing each one.
///
/// A path that cannot be canonicalized is compared in its given form instead,
/// so the comparison never fails — it just becomes a plain path equality
/// check for that side.
fn same_existing_path(left: &Path, right: &Path) -> bool {
    let resolve = |p: &Path| match p.canonicalize() {
        Ok(canonical) => canonical,
        Err(_) => p.to_path_buf(),
    };
    resolve(left) == resolve(right)
}
118
119fn is_safe_text_file(root: &Path, path: &Path, exclude_patterns: &[impl AsRef<str>]) -> bool {
120    if !path.is_file() {
121        return false;
122    }
123    if !security::validate_path(path, root) {
124        return false;
125    }
126    if !security::is_symlink_safe(path, root) {
127        return false;
128    }
129    if security::should_exclude_path(root, path, exclude_patterns) {
130        return false;
131    }
132    if security::has_secret_extension(path) {
133        return false;
134    }
135
136    let Ok(meta) = path.metadata() else {
137        return false;
138    };
139    if meta.len() == 0 || meta.len() > MAX_FILE_SIZE {
140        return false;
141    }
142
143    !security::is_binary(path)
144}