use std::path::Path;
use std::time::SystemTime;
use anyhow::{Context, Result};
use sha2::{Digest, Sha256};
use tracing::warn;
use walkdir::WalkDir;
use crate::db::Database;
use crate::languages::{detect_language, get_extractor, Extractor};
use crate::types::FileInfo;
/// Aggregate counters describing what a single [`index_directory`] run did.
#[derive(Debug, Default, serde::Serialize)]
pub struct IndexResult {
    /// Files whose symbols were (re)extracted this run.
    pub files_indexed: u32,
    /// Files left untouched because change detection found them unchanged.
    pub files_skipped: u32,
    /// Previously indexed files removed because they no longer exist on disk.
    pub files_removed: u32,
    /// Total symbols inserted across all indexed files.
    pub symbols_added: u32,
    /// Total raw (unresolved) edges inserted across all indexed files.
    pub edges_added: u32,
    /// Edges successfully resolved to target symbols after the walk.
    pub edges_resolved: u32,
}
/// Walks `root`, (re)indexes every file whose language is supported, removes
/// database entries for files that have disappeared, and resolves cross-file
/// edges. With `force`, all change-detection shortcuts are bypassed.
///
/// Change detection is two-tiered: a git diff against the last indexed commit
/// (when one is recorded and still valid) prunes most files cheaply, and a
/// SHA-256 content hash catches the rest. Errors from the DB layer propagate;
/// per-file walk/read/extract errors are logged and the file is skipped.
pub fn index_directory(db: &Database, root: &Path, force: bool) -> Result<IndexResult> {
    let mut result = IndexResult::default();
    let root = root.canonicalize().context("Failed to resolve root path")?;
    // One extractor instance per language, created lazily on first use.
    let mut extractors: std::collections::HashMap<&'static str, Box<dyn Extractor>> =
        std::collections::HashMap::new();
    // Relative paths seen during this walk; used afterwards to prune deleted files.
    let mut current_files = std::collections::HashSet::new();
    let last_commit = if force {
        None
    } else {
        db.get_metadata("last_commit")?
    };
    // `None` means "no usable git information" — fall back to hash comparison only.
    let changed_files = if force {
        None
    } else {
        git_changed_files(&root, last_commit.as_deref())
    };
    for entry in WalkDir::new(&root)
        .follow_links(true)
        .into_iter()
        .filter_entry(|e| !is_ignored(e))
    {
        let entry = match entry {
            Ok(e) => e,
            Err(e) => {
                // Unreadable directory entries are logged, never fatal.
                warn!(error = %e, "directory walk error");
                continue;
            }
        };
        if !entry.file_type().is_file() {
            continue;
        }
        let path = entry.path();
        let rel_path = match path.strip_prefix(&root) {
            Ok(p) => p.to_string_lossy().to_string(),
            Err(_) => continue,
        };
        // Unsupported languages are skipped entirely (and not tracked for removal).
        let lang = match detect_language(Path::new(&rel_path)) {
            Some(l) => l,
            None => continue,
        };
        current_files.insert(rel_path.clone());
        // Tier 1: git says the file is unchanged AND we already have it indexed.
        // NOTE(review): `rel_path` uses OS separators while git emits '/';
        // on Windows these would never match and changed files could be
        // skipped here — confirm the supported platforms.
        if !force {
            if let Some(ref changed) = changed_files {
                if !changed.contains(&rel_path) && db.get_file(&rel_path)?.is_some() {
                    result.files_skipped += 1;
                    continue;
                }
            }
        }
        let source = match std::fs::read_to_string(path) {
            Ok(s) => s,
            // InvalidData = non-UTF-8 content (likely binary); skip quietly.
            Err(e) if e.kind() == std::io::ErrorKind::InvalidData => continue, Err(e) => {
                warn!(file = %rel_path, error = %e, "cannot read file");
                continue;
            }
        };
        let hash = file_hash(&source);
        // Tier 2: identical content hash means nothing to re-extract.
        if !force {
            if let Ok(Some(existing)) = db.get_file(&rel_path) {
                if existing.hash == hash {
                    result.files_skipped += 1;
                    continue;
                }
            }
        }
        let modified = file_modified(path);
        let extractor = extractors
            .entry(lang)
            .or_insert_with(|| get_extractor(lang).expect("lang was validated by detect_language"))
            .as_mut();
        let extraction = match extractor.extract(&source, &rel_path) {
            Ok(e) => e,
            Err(err) => {
                warn!(file = %rel_path, error = %err, "extraction failed");
                continue;
            }
        };
        // Replace (not merge) this file's previous symbols/edges before inserting.
        db.clear_file_data(&rel_path)?;
        let num_symbols = extraction.symbols.len() as u32;
        let num_edges = extraction.edges.len() as u32;
        db.insert_symbols(&extraction.symbols)?;
        db.insert_edges(&extraction.edges)?;
        // Store source snippets for non-import symbols (id, name, content, header).
        let contents: Vec<(String, String, String, String)> = extraction
            .symbols
            .iter()
            .filter(|sym| sym.kind != crate::types::SymbolKind::Import)
            .filter_map(|sym| {
                extract_symbol_content(&source, sym)
                    .map(|(content, header)| (sym.id.clone(), sym.name.clone(), content, header))
            })
            .collect();
        if !contents.is_empty() {
            db.insert_symbol_contents(&contents)?;
        }
        db.upsert_file(&FileInfo {
            path: rel_path,
            last_modified: modified,
            hash,
            language: lang.to_string(),
            num_symbols,
        })?;
        result.files_indexed += 1;
        result.symbols_added += num_symbols;
        result.edges_added += num_edges;
    }
    // Drop DB entries for files that were deleted (or became unsupported) on disk.
    let all_indexed = db.all_files()?;
    for indexed_path in all_indexed {
        if !current_files.contains(&indexed_path) {
            db.remove_file(&indexed_path)?;
            result.files_removed += 1;
        }
    }
    result.edges_resolved = db.resolve_edges()?;
    // Record HEAD so the next run can diff against it.
    if let Some(commit) = git_head_commit(&root) {
        db.set_metadata("last_commit", &commit)?;
    }
    Ok(result)
}
/// Walk filter: prunes ignored directories. Files always pass through here;
/// file-level filtering happens later via language detection.
fn is_ignored(entry: &walkdir::DirEntry) -> bool {
    if !entry.file_type().is_dir() {
        return false;
    }
    is_ignored_dirname(&entry.file_name().to_string_lossy())
}
/// Returns true for directory names that should never be indexed: anything
/// dot-prefixed (covering `.git`, `.venv`, `.mypy_cache`, etc.) plus a fixed
/// set of well-known build/dependency directories.
pub fn is_ignored_dirname(name: &str) -> bool {
    if name.starts_with('.') {
        return true;
    }
    matches!(
        name,
        "node_modules"
            | "__pycache__"
            | "venv"
            | "env"
            | "target"
            | "dist"
            | "build"
            | "vendor"
    )
}
/// Lowercase hex SHA-256 digest of the file contents, used for change detection.
fn file_hash(content: &str) -> String {
    let digest = Sha256::digest(content.as_bytes());
    format!("{:x}", digest)
}
/// Modification time of `path` as seconds since the Unix epoch, or 0.0 when
/// the metadata/mtime cannot be obtained.
fn file_modified(path: &Path) -> f64 {
    let Ok(meta) = path.metadata() else { return 0.0 };
    let Ok(mtime) = meta.modified() else { return 0.0 };
    mtime
        .duration_since(SystemTime::UNIX_EPOCH)
        .map_or(0.0, |d| d.as_secs_f64())
}
/// Collects the set of paths git considers changed since `last_commit`:
/// committed diffs to HEAD, plus untracked, unstaged, and staged files.
/// Returns `None` when there is no recorded commit, the commit is unknown to
/// the repo, or the primary diff fails — callers then fall back to hashing.
fn git_changed_files(
    root: &Path,
    last_commit: Option<&str>,
) -> Option<std::collections::HashSet<String>> {
    let commit = last_commit?;
    // The recorded commit may have been rebased/gc'd away; verify it exists.
    if !git_cmd(root, &["cat-file", "-t", commit])?.status.success() {
        return None;
    }
    let diff = git_cmd(root, &["diff", "--name-only", commit, "HEAD"])?;
    if !diff.status.success() {
        return None;
    }
    let mut changed: std::collections::HashSet<String> =
        parse_git_lines(&diff.stdout).collect();
    // Best-effort extras: untracked, unstaged, and staged changes. Each query
    // is independently skipped on failure rather than aborting the whole set.
    let extra_queries: [&[&str]; 3] = [
        &["ls-files", "--others", "--exclude-standard"],
        &["diff", "--name-only"],
        &["diff", "--name-only", "--cached"],
    ];
    for args in extra_queries {
        if let Some(out) = git_cmd(root, args) {
            if out.status.success() {
                changed.extend(parse_git_lines(&out.stdout));
            }
        }
    }
    Some(changed)
}
/// Current HEAD commit hash of the repository at `root`, or `None` when git
/// is unavailable, `root` is not a repo, or the output is not valid UTF-8.
fn git_head_commit(root: &Path) -> Option<String> {
    let out = git_cmd(root, &["rev-parse", "HEAD"])?;
    if !out.status.success() {
        return None;
    }
    let text = String::from_utf8(out.stdout).ok()?;
    Some(text.trim().to_string())
}
/// Runs `git <args>` with `root` as the working directory and stdin closed.
/// `None` means the process could not be spawned (e.g. git not installed);
/// a non-zero exit still yields `Some` — callers must check `status`.
fn git_cmd(root: &Path, args: &[&str]) -> Option<std::process::Output> {
    let mut cmd = std::process::Command::new("git");
    cmd.current_dir(root)
        .stdin(std::process::Stdio::null())
        .args(args);
    cmd.output().ok()
}
/// Splits raw git stdout into non-empty lines as owned `String`s.
/// The eager collect is deliberate: the lossily-decoded `Cow` is a local, so
/// a lazy iterator could not borrow from it beyond this function's scope.
fn parse_git_lines(stdout: &[u8]) -> impl Iterator<Item = String> + '_ {
    let decoded = String::from_utf8_lossy(stdout);
    let lines: Vec<String> = decoded
        .lines()
        .filter(|line| !line.is_empty())
        .map(str::to_owned)
        .collect();
    lines.into_iter()
}
/// Largest byte offset `<= index` that falls on a UTF-8 char boundary,
/// clamped to `s.len()`. (Stable stand-in for `str::floor_char_boundary`.)
fn floor_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        s.len()
    } else {
        // Offset 0 is always a boundary, so this search cannot fail.
        (0..=index)
            .rev()
            .find(|&i| s.is_char_boundary(i))
            .unwrap_or(0)
    }
}
/// Upper bound (bytes) on stored symbol content; longer bodies are truncated.
const MAX_CONTENT_BYTES: usize = 2048;
/// Symbols whose trimmed content is shorter than this are not stored at all.
const MIN_CONTENT_BYTES: usize = 50;
/// Slices the source text belonging to `sym` (capped at `MAX_CONTENT_BYTES`,
/// snapped to UTF-8 boundaries) and builds a `// File/Type/Name` header for
/// it. Returns `None` for imports, out-of-range spans, and bodies whose
/// trimmed length is below `MIN_CONTENT_BYTES`.
fn extract_symbol_content(source: &str, sym: &crate::types::Symbol) -> Option<(String, String)> {
    // Import statements carry no useful body text.
    if sym.kind == crate::types::SymbolKind::Import {
        return None;
    }
    let (start, end) = (sym.start_byte as usize, sym.end_byte as usize);
    if start >= end || end > source.len() {
        return None;
    }
    // Snap the start forward to the next char boundary; `source.len()` is
    // always a boundary, so the search cannot fail.
    let safe_start = (start..=source.len())
        .find(|&i| source.is_char_boundary(i))
        .unwrap_or(source.len());
    // Cap the span length, snapping the end backward to a boundary.
    let safe_end = floor_char_boundary(source, end.min(safe_start + MAX_CONTENT_BYTES));
    if safe_start >= safe_end {
        return None;
    }
    let raw = &source[safe_start..safe_end];
    // Too little substance after trimming (empty implies < MIN_CONTENT_BYTES).
    if raw.trim().len() < MIN_CONTENT_BYTES {
        return None;
    }
    let header = format!(
        "// File: {}\n// Type: {}\n// Name: {}",
        sym.file_path, sym.kind, sym.name
    );
    Some((raw.to_string(), header))
}
#[cfg(test)]
mod tests {
    use super::*;
    // Same input must always produce the same digest.
    #[test]
    fn test_file_hash_deterministic() {
        let h1 = file_hash("def foo(): pass");
        let h2 = file_hash("def foo(): pass");
        assert_eq!(h1, h2);
    }
    // Different inputs must produce different digests.
    #[test]
    fn test_file_hash_different_content() {
        let h1 = file_hash("def foo(): pass");
        let h2 = file_hash("def bar(): pass");
        assert_ne!(h1, h2);
    }
    // End-to-end check of the walk filter against real directory entries.
    #[test]
    fn test_is_ignored_directories() {
        let tmp = std::env::temp_dir().join("cartog_test_ignored");
        let _ = std::fs::remove_dir_all(&tmp);
        std::fs::create_dir_all(&tmp).unwrap();
        let ignored_dirs = [
            ".git",
            "node_modules",
            "__pycache__",
            "target",
            "dist",
            "build",
            ".venv",
        ];
        let allowed_dirs = ["src", "lib", "tests", "docs"];
        for name in ignored_dirs.iter().chain(allowed_dirs.iter()) {
            std::fs::create_dir_all(tmp.join(name)).unwrap();
        }
        let entries: Vec<_> = WalkDir::new(&tmp)
            .min_depth(1)
            .max_depth(1)
            .into_iter()
            .filter_map(|e| e.ok())
            .collect();
        for entry in &entries {
            let name = entry.file_name().to_string_lossy();
            if ignored_dirs.contains(&name.as_ref()) {
                assert!(is_ignored(entry), "{name} should be ignored");
            }
            if allowed_dirs.contains(&name.as_ref()) {
                assert!(!is_ignored(entry), "{name} should NOT be ignored");
            }
        }
        let _ = std::fs::remove_dir_all(&tmp);
    }
    // No recorded commit means no git-based change detection.
    #[test]
    fn test_git_changed_files_no_commit() {
        let result = git_changed_files(Path::new("."), None);
        assert!(result.is_none());
    }
    // A commit hash unknown to the repo must disable git-based detection.
    #[test]
    fn test_git_changed_files_invalid_commit() {
        let result = git_changed_files(
            Path::new("."),
            Some("0000000000000000000000000000000000000000"),
        );
        assert!(result.is_none());
    }
    // With a real HEAD, the changed-file set should be computable.
    // (Skips silently when this checkout has no git repo available.)
    #[test]
    fn test_git_changed_files_valid_head() {
        let head = git_head_commit(Path::new("."));
        if let Some(commit) = head {
            let result = git_changed_files(Path::new("."), Some(&commit));
            assert!(result.is_some());
        }
    }
    // Second run skips unchanged files; `force` re-indexes everything.
    // (Skips silently when the fixture directory is absent.)
    #[test]
    fn test_index_directory_force() {
        use crate::db::Database;
        let db = Database::open_memory().unwrap();
        let fixtures = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/auth");
        if fixtures.exists() {
            let r1 = index_directory(&db, &fixtures, false).unwrap();
            assert!(r1.files_indexed > 0);
            let r2 = index_directory(&db, &fixtures, false).unwrap();
            assert_eq!(r2.files_indexed, 0);
            assert!(r2.files_skipped > 0);
            let r3 = index_directory(&db, &fixtures, true).unwrap();
            assert_eq!(r3.files_indexed, r1.files_indexed);
            assert_eq!(r3.files_skipped, 0);
        }
    }
    // ASCII: every byte offset is already a boundary.
    #[test]
    fn test_floor_char_boundary_ascii() {
        let s = "hello world";
        assert_eq!(floor_char_boundary(s, 5), 5);
        assert_eq!(floor_char_boundary(s, 0), 0);
        assert_eq!(floor_char_boundary(s, 100), s.len());
    }
    // Multibyte: offsets inside '─' (bytes 3..6) must snap back to 3.
    #[test]
    fn test_floor_char_boundary_multibyte() {
        let s = "abc─def";
        assert_eq!(floor_char_boundary(s, 3), 3); assert_eq!(floor_char_boundary(s, 4), 3); assert_eq!(floor_char_boundary(s, 5), 3); assert_eq!(floor_char_boundary(s, 6), 6); }
    // Truncation at MAX_CONTENT_BYTES must never split a multibyte char.
    #[test]
    fn test_extract_symbol_content_truncates_at_char_boundary() {
        let padding = "x".repeat(MAX_CONTENT_BYTES - 1);
        let source = format!("{padding}─after");
        let sym = crate::types::Symbol::new(
            "test_sym",
            crate::types::SymbolKind::Function,
            "test.rb",
            1,
            100,
            0,
            source.len() as u32,
        );
        let result = extract_symbol_content(&source, &sym);
        assert!(result.is_some());
        let (content, _header) = result.unwrap();
        assert_eq!(content.len(), MAX_CONTENT_BYTES - 1);
        assert!(content.is_char_boundary(content.len()));
    }
}