gobby_code/index/walker/
classification.rs1use std::path::Path;
2
3use crate::index::languages;
4use crate::index::security;
5use crate::index::{MAX_DATA_LANGUAGE_AST_SIZE, MAX_FILE_SIZE};
6
7use super::generated::is_generated_js_bundle;
8use super::hidden::{
9 HiddenPathAllowlist, is_generated_wiki_metadata, is_hidden_metadata_content_only,
10 is_hidden_path,
11};
12use super::types::{DiscoveryOptions, FileClassification};
13
14pub fn classify_file(
16 root: &Path,
17 path: &Path,
18 exclude_patterns: &[impl AsRef<str>],
19) -> Option<FileClassification> {
20 if !is_safe_text_file(root, path, exclude_patterns) {
21 return None;
22 }
23 if is_generated_wiki_metadata(root, path) {
24 return None;
25 }
26 if is_generated_js_bundle(path) {
27 return None;
28 }
29
30 if is_hidden_metadata_content_only(root, path) {
31 return Some(FileClassification::ContentOnly);
32 }
33
34 if let Some(lang) = languages::detect_language(&path.to_string_lossy()) {
35 if languages::is_data_language(lang)
40 && path
41 .metadata()
42 .map(|m| m.len() > MAX_DATA_LANGUAGE_AST_SIZE)
43 .unwrap_or(false)
44 {
45 Some(FileClassification::ContentOnly)
46 } else {
47 Some(FileClassification::Ast)
48 }
49 } else {
50 Some(FileClassification::ContentOnly)
51 }
52}
53
54pub fn classify_explicit_file_with_options(
57 root: &Path,
58 path: &Path,
59 exclude_patterns: &[impl AsRef<str>],
60 options: DiscoveryOptions,
61) -> Option<FileClassification> {
62 if options.respect_gitignore && !explicit_path_visible(root, path, options) {
63 return None;
64 }
65 classify_file(root, path, exclude_patterns)
66}
67
68pub fn is_content_indexable(
70 root: &Path,
71 path: &Path,
72 exclude_patterns: &[impl AsRef<str>],
73) -> bool {
74 matches!(
75 classify_file(root, path, exclude_patterns),
76 Some(FileClassification::ContentOnly)
77 )
78}
79
/// Derives a language label for a content-only file from its extension.
///
/// The extension is lower-cased (lossily, if it is not valid Unicode) and
/// then mapped:
/// * `md` / `markdown` -> `"markdown"`
/// * `yml` / `yaml` -> `"yaml"`
/// * missing or empty extension -> `"text"`
/// * anything else -> the lower-cased extension itself
pub fn content_language(path: &Path) -> String {
    let lowered = match path.extension() {
        Some(raw) => raw.to_string_lossy().to_lowercase(),
        // Represent "no extension" as an empty string so a single match
        // below covers both the missing and the empty case.
        None => String::new(),
    };

    match lowered.as_str() {
        "" => "text".to_string(),
        "md" | "markdown" => "markdown".to_string(),
        "yml" | "yaml" => "yaml".to_string(),
        _ => lowered,
    }
}
94
95fn explicit_path_visible(root: &Path, path: &Path, options: DiscoveryOptions) -> bool {
96 if is_hidden_path(root, path) && !HiddenPathAllowlist::load(root).matches(root, path) {
97 return false;
98 }
99
100 let walk_root = path.parent().unwrap_or(root);
101 let mut settings = gobby_core::indexing::WalkerSettings::new(walk_root);
102 settings.respect_gitignore = options.respect_gitignore;
103 settings.max_filesize = Some(MAX_FILE_SIZE);
104 let mut builder = settings.into_walker();
105 builder.hidden(false);
106 builder.max_depth(Some(1));
107 builder
108 .build()
109 .flatten()
110 .any(|entry| entry.path().is_file() && same_existing_path(entry.path(), path))
111}
112
/// Compares two paths after resolving each with `Path::canonicalize`.
///
/// When canonicalization fails for a path (for example because it does not
/// exist), that path is compared as given instead.
fn same_existing_path(left: &Path, right: &Path) -> bool {
    // Best-effort resolution: fall back to the unmodified path on error.
    let resolve = |candidate: &Path| {
        candidate
            .canonicalize()
            .unwrap_or_else(|_| candidate.to_path_buf())
    };

    resolve(left) == resolve(right)
}
118
119fn is_safe_text_file(root: &Path, path: &Path, exclude_patterns: &[impl AsRef<str>]) -> bool {
120 if !path.is_file() {
121 return false;
122 }
123 if !security::validate_path(path, root) {
124 return false;
125 }
126 if !security::is_symlink_safe(path, root) {
127 return false;
128 }
129 if security::should_exclude_path(root, path, exclude_patterns) {
130 return false;
131 }
132 if security::has_secret_extension(path) {
133 return false;
134 }
135
136 let Ok(meta) = path.metadata() else {
137 return false;
138 };
139 if meta.len() == 0 || meta.len() > MAX_FILE_SIZE {
140 return false;
141 }
142
143 !security::is_binary(path)
144}