1use std::collections::HashMap;
2use std::io::BufRead;
3use std::process::Command;
4use std::sync::{Arc, OnceLock};
5use std::time::SystemTime;
6
7use argyph_embed::Embedder;
8use argyph_fs::ChangedPath;
9use argyph_fs::FileEntry;
10use argyph_graph::edge::Edge;
11use argyph_graph::graph::SymbolOutline;
12use argyph_graph::selector::SymbolSelector;
13use argyph_pack::{self, DefaultPacker, PackContext, PackRequest, PackResult, PackScope, Packer};
14use argyph_parse::types::Symbol;
15use argyph_parse::SymbolId;
16use argyph_store::Store;
17use camino::{Utf8Path, Utf8PathBuf};
18use regex::Regex;
19
20use crate::error::{CoreError, Result};
21
/// Path-based restriction on which indexed files a text search scans.
///
/// Both lists hold glob patterns. `None` means "no restriction" for that
/// list; the default value has both lists unset and therefore matches every
/// path.
#[derive(Debug, Clone, Default)]
pub struct SearchFilter {
    /// Include globs: when set, a path must match at least one of them.
    pub paths_glob: Option<Vec<String>>,
    /// Exclude globs: when set, a path matching any of them is rejected.
    pub exclude_glob: Option<Vec<String>>,
}
26
27pub struct SearchHit {
28 pub file: Utf8PathBuf,
29 pub line: u64,
30 pub column: u64,
31 pub match_text: String,
32}
33
34pub struct SearchResult {
35 pub hits: Vec<SearchHit>,
36 pub truncated: bool,
37}
38
/// Number of indexed files attributed to one language.
#[derive(Debug, Clone)]
pub struct LanguageSummary {
    /// Language name as recorded on the indexed file entries.
    pub name: String,
    /// How many indexed files carry this language.
    pub files: u64,
}

/// Snapshot of the repository's git state, gathered by running `git`.
#[derive(Debug, Clone)]
pub struct GitInfo {
    /// Output of `git rev-parse --abbrev-ref HEAD`.
    pub branch: String,
    /// Output of `git rev-parse --short HEAD`.
    pub head_short: String,
    /// Whether git reported uncommitted changes.
    pub dirty: bool,
}

/// High-level summary of a repository.
#[derive(Debug, Clone)]
pub struct RepoOverview {
    /// Per-language file counts, largest first.
    pub languages: Vec<LanguageSummary>,
    /// Well-known entry-point paths that are present in the index.
    pub entry_points: Vec<String>,
    /// Leading lines of the repository README, or empty when none could be read.
    pub readme_excerpt: String,
    /// Depth-limited, indented listing of indexed paths.
    pub tree: String,
    /// Git state, or `None` when it could not be determined.
    pub git: Option<GitInfo>,
}
57
/// One chunk returned by a semantic (hybrid) search.
#[derive(Debug, Clone)]
pub struct SemanticHit {
    /// Identifier of the matching chunk, as reported by the store.
    pub chunk_id: String,
    /// Text content of the chunk.
    pub chunk_text: String,
    /// File the chunk belongs to.
    pub file: String,
    /// Byte range of the chunk, as reported by the store.
    pub byte_range: (u32, u32),
    /// Line range of the chunk, as reported by the store.
    pub line_range: (u32, u32),
    /// Score assigned to the hit by the store.
    pub score: f32,
    /// Lower-cased debug rendering of the store's hit-source value.
    pub source: String,
}

/// Outcome of a semantic search.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// Hits in the order returned by the store.
    pub hits: Vec<SemanticHit>,
    /// `total_embedded` counter copied from the store's search result.
    pub total_embedded: usize,
    /// `total_chunks` counter copied from the store's search result.
    pub total_chunks: usize,
}
73
/// Query facade over a [`Store`], with an optional embedder used for semantic search.
pub struct Index {
    /// Backing store that the query methods delegate to.
    store: Arc<dyn Store>,
    /// Shared set-once slot for the embedder; semantic search fails while it is unset.
    embedder: Arc<OnceLock<Arc<dyn Embedder>>>,
}
82
83impl Index {
84 pub(crate) fn new(store: Arc<dyn Store>, embedder: Arc<OnceLock<Arc<dyn Embedder>>>) -> Self {
85 Self { store, embedder }
86 }
87
88 pub fn protocol_version() -> &'static str {
89 "0.1.0"
90 }
91
92 pub async fn get_file(&self, path: &Utf8Path) -> Result<Option<FileEntry>> {
93 Ok(self.store.get_file(path).await?)
94 }
95
96 pub async fn list_files(&self) -> Result<Vec<FileEntry>> {
97 Ok(self.store.list_files().await?)
98 }
99
100 pub async fn status(&self) -> Result<IndexStatus> {
101 let files = self.store.list_files().await?;
102 Ok(IndexStatus {
103 protocol_version: Self::protocol_version().to_string(),
104 file_count: files.len() as u64,
105 snapshot_at: SystemTime::now(),
106 })
107 }
108
109 pub async fn search_text(
110 &self,
111 root: &Utf8Path,
112 pattern: &str,
113 regex: bool,
114 case_sensitive: bool,
115 max_results: u64,
116 filter: Option<SearchFilter>,
117 ) -> Result<SearchResult> {
118 let max = max_results.clamp(1, 1000);
119
120 let re = build_regex(pattern, regex, case_sensitive)?;
121
122 let files = self.store.list_files().await?;
123 let files: Vec<_> = files
124 .into_iter()
125 .filter(|f| match &filter {
126 Some(filt) => path_matches_filter(f.path.as_str(), filt),
127 None => true,
128 })
129 .collect();
130
131 let mut hits = Vec::new();
132 'outer: for entry in &files {
133 let file_path = root.join(entry.path.as_str());
134 let f = match std::fs::File::open(file_path.as_str()) {
135 Ok(f) => f,
136 Err(_) => continue,
137 };
138 let reader = std::io::BufReader::new(f);
139 for (line_no, line_result) in reader.lines().enumerate() {
140 let line = match line_result {
141 Ok(l) => l,
142 Err(_) => continue,
143 };
144 for mat in re.find_iter(&line) {
145 hits.push(SearchHit {
146 file: entry.path.clone(),
147 line: (line_no + 1) as u64,
148 column: (mat.start() + 1) as u64,
149 match_text: mat.as_str().to_string(),
150 });
151 if hits.len() >= max as usize {
152 break 'outer;
153 }
154 }
155 }
156 }
157
158 let total: usize = files
159 .iter()
160 .filter_map(|f| {
161 let fp = root.join(f.path.as_str());
162 std::fs::read_to_string(fp.as_str())
163 .ok()
164 .map(|c| re.find_iter(&c).count())
165 })
166 .sum();
167 let truncated = total > max as usize;
168
169 Ok(SearchResult { hits, truncated })
170 }
171
172 pub async fn search_semantic(
173 &self,
174 query: &str,
175 k: usize,
176 filter: Option<&argyph_store::search::SearchFilter>,
177 ) -> Result<SemanticResult> {
178 let embedder = self.embedder.get().ok_or_else(|| {
179 CoreError::Embed("no embedder configured — cannot perform semantic search".into())
180 })?;
181
182 let query_vec = embedder
183 .embed_query(query)
184 .await
185 .map_err(|e| CoreError::Embed(format!("{e}")))?;
186
187 let result = self
188 .store
189 .search_hybrid(query, &query_vec, k, filter.unwrap_or(&Default::default()))
190 .await?;
191
192 Ok(SemanticResult {
193 hits: result
194 .hits
195 .into_iter()
196 .map(|h| SemanticHit {
197 chunk_id: h.chunk_id,
198 chunk_text: h.chunk_text,
199 file: h.file,
200 byte_range: h.byte_range,
201 line_range: h.line_range,
202 score: h.score,
203 source: format!("{:?}", h.source).to_lowercase(),
204 })
205 .collect(),
206 total_embedded: result.total_embedded,
207 total_chunks: result.total_chunks,
208 })
209 }
210
211 pub async fn overview(&self, root: &Utf8Path, max_tree_depth: u64) -> Result<RepoOverview> {
212 let depth = max_tree_depth.clamp(1, 6) as usize;
213 let files = self.store.list_files().await?;
214
215 let mut lang_counts: HashMap<String, u64> = HashMap::new();
216 for f in &files {
217 if let Some(lang) = &f.language {
218 *lang_counts.entry(lang.to_string()).or_default() += 1;
219 }
220 }
221 let mut languages: Vec<LanguageSummary> = lang_counts
222 .into_iter()
223 .map(|(name, count)| LanguageSummary { name, files: count })
224 .collect();
225 languages.sort_by(|a, b| b.files.cmp(&a.files));
226
227 let entry_points: Vec<String> = [
228 "src/main.rs",
229 "src/lib.rs",
230 "main.rs",
231 "lib.rs",
232 "src/index.ts",
233 "src/index.js",
234 "src/index.py",
235 ]
236 .iter()
237 .filter(|p| files.iter().any(|f| f.path.as_str() == **p))
238 .map(|s| s.to_string())
239 .collect();
240
241 let readme_excerpt = Self::read_readme(root);
242 let tree = Self::build_tree(&files, depth);
243 let git = Self::get_git_info(root);
244
245 Ok(RepoOverview {
246 languages,
247 entry_points,
248 readme_excerpt,
249 tree,
250 git,
251 })
252 }
253
254 pub async fn find_symbol(&self, name: &str, file: Option<&Utf8Path>) -> Result<Vec<Symbol>> {
255 Ok(self.store.find_symbol(name, file).await?)
256 }
257
258 pub async fn find_references(&self, sel: &SymbolSelector) -> Result<Vec<Edge>> {
259 Ok(self.store.find_references(sel).await?)
260 }
261
262 pub async fn get_callers(&self, sel: &SymbolSelector) -> Result<Vec<Edge>> {
263 Ok(self.store.get_callers(sel).await?)
264 }
265
266 pub async fn get_callees(&self, sel: &SymbolSelector) -> Result<Vec<Edge>> {
267 Ok(self.store.get_callees(sel).await?)
268 }
269
270 pub async fn get_imports(&self, file: &Utf8Path) -> Result<Vec<Edge>> {
271 Ok(self.store.get_imports(file).await?)
272 }
273
274 pub async fn get_symbol_outline(&self, file: &Utf8Path) -> Result<Vec<SymbolOutline>> {
275 Ok(self.store.get_symbol_outline(file).await?)
276 }
277
278 pub async fn reindex(&self, root: &Utf8Path, changes: &[ChangedPath]) -> Result<()> {
279 crate::tiers::incremental_reindex(root, &*self.store, changes).await
280 }
281
282 pub async fn pack(&self, root: &Utf8Path, req: &PackRequest) -> Result<PackResult> {
283 let packer = DefaultPacker::new().map_err(|e| CoreError::Io(std::io::Error::other(e)))?;
284 let ctx = IndexPackContext {
285 index: self,
286 root: root.to_owned(),
287 };
288 packer
289 .pack(req, &ctx)
290 .map_err(|e| CoreError::Io(std::io::Error::other(e)))
291 }
292
293 fn build_tree(files: &[FileEntry], depth: usize) -> String {
296 let mut paths: Vec<&str> = files.iter().map(|f| f.path.as_str()).collect();
297 paths.sort();
298 paths.truncate(500);
299 let mut out = String::new();
300 let mut prev: Vec<&str> = vec![];
301 for path in &paths {
302 let parts: Vec<&str> = path.split('/').collect();
303 let common = prev.iter().zip(&parts).filter(|(a, b)| a == b).count();
304 if common < depth {
305 for (i, part) in parts.iter().enumerate().skip(common).take(depth - common) {
306 let indent = " ".repeat(i);
307 out.push_str(&format!("{indent}{part}/\n"));
308 }
309 }
310 prev = parts;
311 }
312 out
313 }
314
315 fn read_readme(root: &camino::Utf8Path) -> String {
316 for name in &["README.md", "README", "readme.md"] {
317 let path = root.join(name);
318 if let Ok(content) = std::fs::read_to_string(path.as_str()) {
319 return content.lines().take(10).collect::<Vec<_>>().join("\n");
320 }
321 }
322 String::new()
323 }
324
325 fn get_git_info(root: &camino::Utf8Path) -> Option<GitInfo> {
326 let git_dir = root.join(".git");
327 if !git_dir.exists() {
328 return None;
329 }
330 let run = |args: &[&str]| -> Option<String> {
331 Command::new("git")
332 .args(args)
333 .current_dir(root.as_str())
334 .output()
335 .ok()
336 .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
337 };
338 let branch = run(&["rev-parse", "--abbrev-ref", "HEAD"])?;
339 let head_short = run(&["rev-parse", "--short", "HEAD"])?;
340 let dirty = Command::new("git")
341 .args(["diff", "--quiet"])
342 .current_dir(root.as_str())
343 .status()
344 .ok()
345 .map(|s| !s.success())?;
346 Some(GitInfo {
347 branch,
348 head_short,
349 dirty,
350 })
351 }
352}
353
/// Adapter that lets a packer read repository data through an [`Index`].
struct IndexPackContext<'a> {
    /// Index used to answer file, symbol and edge queries.
    index: &'a Index,
    /// Repository root that relative file paths are joined onto when reading content.
    root: Utf8PathBuf,
}
358
359impl PackContext for IndexPackContext<'_> {
360 fn list_files(&self, scope: &PackScope) -> Vec<Utf8PathBuf> {
361 let files = tokio::runtime::Handle::current()
362 .block_on(self.index.list_files())
363 .unwrap_or_default();
364 let paths: Vec<Utf8PathBuf> = files.into_iter().map(|f| f.path).collect();
365 match scope {
366 PackScope::All => paths,
367 PackScope::Paths(requested) => {
368 let requested_set: std::collections::HashSet<_> = requested.iter().collect();
369 paths
370 .into_iter()
371 .filter(|p| requested_set.contains(p))
372 .collect()
373 }
374 PackScope::Symbol(name) => {
375 let indexed_set: std::collections::HashSet<_> = paths.iter().collect();
376 let syms = tokio::runtime::Handle::current()
377 .block_on(self.index.find_symbol(name, None))
378 .unwrap_or_default();
379 let mut file_set: std::collections::HashSet<Utf8PathBuf> =
380 std::collections::HashSet::new();
381 for sym in &syms {
382 file_set.insert(sym.file.clone());
383 let selector = SymbolSelector::ById(sym.id.clone());
384 if let Ok(callees) = tokio::runtime::Handle::current()
385 .block_on(self.index.get_callees(&selector))
386 {
387 for edge in &callees {
388 if let Some(f) = file_from_symbol_id(&edge.to) {
389 file_set.insert(f);
390 }
391 }
392 }
393 if let Ok(refs) = tokio::runtime::Handle::current()
394 .block_on(self.index.find_references(&selector))
395 {
396 for edge in &refs {
397 if let Some(f) = file_from_symbol_id(&edge.from) {
398 file_set.insert(f);
399 }
400 }
401 }
402 }
403 file_set
404 .into_iter()
405 .filter(|p| indexed_set.contains(p))
406 .collect()
407 }
408 }
409 }
410
411 fn read(&self, file: &Utf8Path) -> argyph_pack::Result<String> {
412 let full_path = self.root.join(file.as_str());
413 std::fs::read_to_string(full_path.as_str())
414 .map_err(|e| argyph_pack::PackError::Io(e.to_string()))
415 }
416
417 fn modified(&self, file: &Utf8Path) -> Option<SystemTime> {
418 tokio::runtime::Handle::current()
419 .block_on(self.index.get_file(file))
420 .ok()
421 .flatten()
422 .map(|entry| entry.modified)
423 }
424
425 fn in_edges(&self, file: &Utf8Path) -> argyph_pack::Result<usize> {
426 tokio::runtime::Handle::current()
427 .block_on(self.index.get_imports(file))
428 .map(|edges| edges.len())
429 .map_err(|e| argyph_pack::PackError::Io(e.to_string()))
430 }
431}
432
433fn build_regex(pattern: &str, regex: bool, case_sensitive: bool) -> Result<Regex> {
434 let pat = if regex {
435 pattern.to_string()
436 } else {
437 regex::escape(pattern)
438 };
439 regex::RegexBuilder::new(&pat)
440 .case_insensitive(!case_sensitive)
441 .build()
442 .map_err(|e| crate::CoreError::Io(std::io::Error::new(std::io::ErrorKind::InvalidInput, e)))
443}
444
445fn path_matches_filter(path: &str, filter: &SearchFilter) -> bool {
446 let globs_ok = filter
447 .paths_glob
448 .as_ref()
449 .is_none_or(|globs| globs.iter().any(|g| glob_match(g, path)));
450 let excludes_ok = filter
451 .exclude_glob
452 .as_ref()
453 .is_none_or(|globs| !globs.iter().any(|g| glob_match(g, path)));
454 globs_ok && excludes_ok
455}
456
457fn glob_match(glob: &str, path: &str) -> bool {
458 let cleaned = glob.trim_start_matches('!');
459 if let Ok(re) = glob_to_regex(cleaned) {
460 re.is_match(path)
461 } else {
462 path.contains(cleaned)
463 }
464}
465
466fn glob_to_regex(glob: &str) -> std::result::Result<Regex, regex::Error> {
467 let mut pattern = String::from("^");
468 let chars: Vec<char> = glob.chars().collect();
469 let mut i = 0;
470 while i < chars.len() {
471 match chars[i] {
472 '*' if i + 1 < chars.len() && chars[i + 1] == '*' => {
473 pattern.push_str(".*");
474 i += 1;
475 }
476 '*' => pattern.push_str("[^/]*"),
477 '?' => pattern.push_str("[^/]"),
478 '.' | '+' | '(' | ')' | '|' | '^' | '$' | '{' | '}' | '[' | ']' | '\\' => {
479 pattern.push('\\');
480 pattern.push(chars[i]);
481 }
482 c => pattern.push(c),
483 }
484 i += 1;
485 }
486 pattern.push('$');
487 Regex::new(&pattern)
488}
489
/// Summary of the index state as returned by `Index::status`.
#[derive(Debug, Clone)]
pub struct IndexStatus {
    /// Protocol version string reported by the index.
    pub protocol_version: String,
    /// Number of files currently listed by the store.
    pub file_count: u64,
    /// Time at which this status was produced.
    pub snapshot_at: SystemTime,
}
497
498fn file_from_symbol_id(id: &SymbolId) -> Option<Utf8PathBuf> {
499 let s = id.as_str();
500 let (prefix, _) = s.rsplit_once("::")?;
501 let (file, _) = prefix.rsplit_once("::")?;
502 Some(Utf8PathBuf::from(file))
503}
504
#[cfg(test)]
// Tests may unwrap freely: a failed unwrap is a test failure.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// `**` crosses directory boundaries.
    #[test]
    fn glob_star_star_matches_subdirs() {
        let re = glob_to_regex("src/**").unwrap();
        for path in ["src/main.rs", "src/auth/mod.rs"] {
            assert!(re.is_match(path));
        }
    }

    /// A single `*` stays within one path segment.
    #[test]
    fn glob_single_star_no_slash() {
        let re = glob_to_regex("*.rs").unwrap();
        let matches_root_file = re.is_match("main.rs");
        let matches_nested_file = re.is_match("src/main.rs");
        assert!(matches_root_file);
        assert!(!matches_nested_file);
    }

    /// Literal, case-sensitive patterns match only the exact casing.
    #[test]
    fn build_regex_literal() {
        let re = build_regex("fn main", false, true).unwrap();
        let matches_exact = re.is_match("fn main() {}");
        let matches_upper = re.is_match("FN MAIN");
        assert!(matches_exact);
        assert!(!matches_upper);
    }

    /// Case-insensitive patterns match regardless of casing.
    #[test]
    fn build_regex_case_insensitive() {
        let re = build_regex("fn", false, false).unwrap();
        for haystack in ["fn main", "FN MAIN"] {
            assert!(re.is_match(haystack));
        }
    }
}
537}