i-self 0.4.3

Personal developer-companion CLI: scans your repos, indexes code semantically, watches your activity, and moves AI-agent sessions between tools (Claude Code, Aider, Goose, OpenAI Codex CLI, Continue.dev, OpenCode).
#![allow(dead_code)]

use super::{CodeEmbedding, EmbeddingGenerator, EmbeddingMetadata, SemanticConfig};
use anyhow::Result;
use std::path::Path;
use tokio::fs;
use tracing::{info, debug, warn};
use walkdir::WalkDir;

/// Indexes code files and generates embeddings
pub struct SemanticIndex {
    /// Produces vector embeddings for code chunks; built from `config` in `new`.
    generator: EmbeddingGenerator,
    /// Configuration retained alongside the generator. NOTE(review): a clone
    /// is already handed to `EmbeddingGenerator::new`, and no visible method
    /// reads this copy — confirm it is needed before removing.
    config: SemanticConfig,
}

impl SemanticIndex {
    /// Build an index whose embeddings come from an `EmbeddingGenerator`
    /// constructed from `config`.
    ///
    /// # Errors
    /// Propagates any failure from `EmbeddingGenerator::new`.
    pub fn new(config: SemanticConfig) -> Result<Self> {
        let generator = EmbeddingGenerator::new(config.clone())?;
        Ok(Self { generator, config })
    }

    /// Index a directory of code files.
    ///
    /// Walks `path`, embeds every recognized code file, and returns all
    /// generated embeddings tagged with `repository`. Files that fail to
    /// index are logged and skipped rather than aborting the whole run.
    pub async fn index_directory(&self, path: &Path, repository: &str) -> Result<Vec<CodeEmbedding>> {
        info!("Indexing directory: {}", path.display());

        let code_files = self.find_code_files(path).await?;
        info!("Found {} code files to index", code_files.len());

        let mut embeddings = Vec::new();
        for file_path in code_files {
            match self.index_file(&file_path, repository).await {
                Ok(file_embeddings) => {
                    debug!("Indexed {} chunks from {}", file_embeddings.len(), file_path.display());
                    embeddings.extend(file_embeddings);
                }
                Err(e) => {
                    warn!("Failed to index {}: {}", file_path.display(), e);
                }
            }
        }

        info!("Generated {} total embeddings", embeddings.len());
        Ok(embeddings)
    }

    /// Like `index_directory` but consults a per-file cache keyed by content
    /// SHA-256. Files whose content hasn't changed since last embed are
    /// skipped — their cached embeddings flow straight through to the
    /// returned vector. Files that are new, modified, or deleted are
    /// embedded fresh; deleted files drop out of the cache.
    ///
    /// The caller is responsible for persisting the (mutated) cache via
    /// `EmbeddingStorage::save_cache`. Returns `(embeddings, hits, misses)`
    /// for surfacing in CLI output.
    pub async fn index_directory_incremental(
        &self,
        path: &Path,
        repository: &str,
        cache: &mut crate::semantic::storage::IndexCache,
    ) -> Result<(Vec<CodeEmbedding>, usize, usize)> {
        info!("Incrementally indexing directory: {}", path.display());

        let code_files = self.find_code_files(path).await?;
        let mut embeddings = Vec::new();
        let mut hits = 0usize;
        let mut misses = 0usize;
        let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

        for file_path in &code_files {
            let path_key = file_path.to_string_lossy().to_string();
            seen.insert(path_key.clone());

            let content = match tokio::fs::read_to_string(file_path).await {
                Ok(c) => c,
                Err(e) => {
                    warn!("read {}: {}", file_path.display(), e);
                    continue;
                }
            };
            let hash = sha256_hex(&content);

            // Cache hit: same path + same content hash → reuse stored embeddings.
            if let Some(entry) = cache.files.get(&path_key) {
                if entry.content_hash == hash {
                    embeddings.extend(entry.embeddings.iter().cloned());
                    hits += 1;
                    continue;
                }
            }

            // Cache miss: embed fresh from the content we already read for
            // hashing (avoids the second disk read `index_file` would do).
            misses += 1;
            match self.index_content(file_path, &content, repository).await {
                Ok(file_embeddings) => {
                    cache.files.insert(
                        path_key,
                        crate::semantic::storage::FileCacheEntry {
                            content_hash: hash,
                            embeddings: file_embeddings.clone(),
                        },
                    );
                    embeddings.extend(file_embeddings);
                }
                Err(e) => warn!("Failed to index {}: {}", file_path.display(), e),
            }
        }

        // Drop entries for files that no longer exist under this root. We
        // only do this for paths that *would* have been walked — if the
        // user has separate indices in different repos, those stay alive.
        // Match on a path-separator boundary: a raw `starts_with(root)`
        // would let root "/repo" evict entries belonging to "/repo2".
        let root_str = path.to_string_lossy().to_string();
        let root_prefix = if root_str.ends_with(std::path::MAIN_SEPARATOR) {
            root_str.clone()
        } else {
            format!("{}{}", root_str, std::path::MAIN_SEPARATOR)
        };
        cache
            .files
            .retain(|k, _| !(k == &root_str || k.starts_with(&root_prefix)) || seen.contains(k));

        info!(
            "Generated {} embeddings — {} cached, {} fresh ({} files total)",
            embeddings.len(),
            hits,
            misses,
            code_files.len()
        );
        Ok((embeddings, hits, misses))
    }

    /// Index a single code file: read it from disk, then embed its content.
    async fn index_file(&self, file_path: &Path, repository: &str) -> Result<Vec<CodeEmbedding>> {
        let content = fs::read_to_string(file_path).await?;
        self.index_content(file_path, &content, repository).await
    }

    /// Chunk `content`, embed every chunk, and wrap the vectors with metadata.
    /// Shared by `index_file` and the incremental path (which already holds
    /// the file content for hashing).
    async fn index_content(
        &self,
        file_path: &Path,
        content: &str,
        repository: &str,
    ) -> Result<Vec<CodeEmbedding>> {
        // NOTE(review): stripping only the immediate parent makes
        // `relative_path` effectively just the file name, not a path relative
        // to the indexed root — confirm whether that is intended. Preserved
        // as-is because cached embeddings carry this value in `source_file`.
        let relative_path = file_path
            .strip_prefix(file_path.parent().unwrap_or(file_path))
            .unwrap_or(file_path)
            .to_string_lossy()
            .to_string();

        // Chunk the code; nothing to embed for empty/unchunkable content.
        let chunks = self.generator.chunk_code(content, &relative_path);
        if chunks.is_empty() {
            return Ok(Vec::new());
        }

        // Generate embeddings for all chunks in one batch call.
        let texts: Vec<String> = chunks.iter().map(|c| c.content.clone()).collect();
        let vectors = self.generator.embed(&texts).await?;

        // Pair each chunk with its vector and attach searchable metadata.
        let embeddings: Vec<CodeEmbedding> = chunks
            .into_iter()
            .zip(vectors.into_iter())
            .map(|(chunk, embedding)| CodeEmbedding {
                id: uuid::Uuid::new_v4().to_string(),
                content: chunk.content,
                embedding,
                metadata: EmbeddingMetadata {
                    source_file: relative_path.clone(),
                    repository: repository.to_string(),
                    language: chunk.language,
                    start_line: chunk.start_line,
                    end_line: chunk.end_line,
                    function_name: chunk.function_name,
                    tags: extract_tags(&relative_path, content),
                },
                created_at: chrono::Utc::now(),
            })
            .collect();

        Ok(embeddings)
    }

    /// Find all code files under `path`: max depth 10, symlinks not followed,
    /// hidden/vendored directories skipped, extension must be a known code type.
    async fn find_code_files(&self, path: &Path) -> Result<Vec<std::path::PathBuf>> {
        let mut files = Vec::new();

        for entry in WalkDir::new(path)
            .follow_links(false)
            .max_depth(10)
            .into_iter()
            .filter_map(|e| e.ok())
        {
            let entry_path = entry.path();

            if !entry_path.is_file() {
                continue;
            }

            // Skip hidden and common non-code directories.
            if should_skip_path(entry_path) {
                continue;
            }

            // Keep only recognized code files.
            if is_code_file(entry_path) {
                files.push(entry_path.to_path_buf());
            }
        }

        Ok(files)
    }

    /// Index a GitHub repository: shallow-clone into a temp dir, then index it.
    ///
    /// `token`, if provided, is embedded in the clone URL for private repos.
    /// NOTE(review): the token then appears in the `git` process arguments,
    /// which may be visible to other local processes.
    ///
    /// # Errors
    /// Fails if the temp dir cannot be created, `git` cannot be spawned, or
    /// the clone exits non-zero (stderr is included in the error).
    pub async fn index_github_repo(&self, owner: &str, repo: &str, token: Option<&str>) -> Result<Vec<CodeEmbedding>> {
        use std::process::Command;

        let temp_dir = tempfile::tempdir()?;

        // Bug fix: the old code appended the authenticated URL *after* the
        // positional `<repo> <dir>` arguments, producing
        // `git clone --depth 1 <url> <dir> --origin origin <auth_url>`,
        // which git rejects ("Too many arguments") — the token path never
        // worked. Build the single clone URL up front instead.
        let clone_url = match token {
            Some(token) => format!("https://{}@github.com/{}/{}.git", token, owner, repo),
            None => format!("https://github.com/{}/{}.git", owner, repo),
        };

        // Log the token-free form so credentials never reach the logs.
        info!("Cloning repository: https://github.com/{}/{}.git", owner, repo);

        // `.arg(temp_dir.path())` accepts the OsStr path directly, avoiding
        // the previous `to_str().unwrap()` panic on non-UTF-8 temp paths.
        let output = Command::new("git")
            .args(["clone", "--depth", "1", &clone_url])
            .arg(temp_dir.path())
            .output()?;

        if !output.status.success() {
            anyhow::bail!(
                "Failed to clone repository: {}",
                String::from_utf8_lossy(&output.stderr)
            );
        }

        // Index the cloned repository; temp dir is removed on drop.
        self.index_directory(temp_dir.path(), &format!("{}/{}", owner, repo))
            .await
    }
}

/// Lowercase hex SHA-256 of the input. Used to fingerprint file content for
/// the embedding cache; not security-relevant.
fn sha256_hex(content: &str) -> String {
    use sha2::{Digest, Sha256};
    // Hash in one shot, then render each byte as two lowercase hex digits.
    Sha256::digest(content.as_bytes())
        .iter()
        .map(|byte| format!("{:02x}", byte))
        .collect()
}

/// True when `path` carries a recognized source-code extension
/// (case-insensitive). Files without an extension are never code files.
fn is_code_file(path: &Path) -> bool {
    const CODE_EXTENSIONS: &[&str] = &[
        "rs", "py", "js", "ts", "jsx", "tsx", "go", "java", "kt", "scala",
        "rb", "php", "c", "cpp", "cc", "cxx", "h", "hpp", "cs", "swift",
        "m", "mm", "r", "pl", "pm", "lua", "sh", "bash", "zsh", "ps1",
        "sql", "html", "css", "scss", "sass", "less", "vue", "svelte",
    ];

    // Non-UTF-8 extensions yield None and are rejected, matching the
    // original's behavior (a lossy conversion could never equal a known ext).
    path.extension()
        .and_then(|ext| ext.to_str())
        .map_or(false, |ext| CODE_EXTENSIONS.contains(&ext.to_lowercase().as_str()))
}

/// True when `path` lies inside a directory that should not be indexed:
/// vendored/build-output directories or any hidden (dot-prefixed) component.
///
/// Bug fix: only `Component::Normal` segments are inspected now. The old
/// code ran `starts_with('.')` on *every* component, so a relative root like
/// `./src` contributed a `CurDir` (".") component and every file under it was
/// skipped — indexing the current directory indexed nothing. `..`, `/`, and
/// Windows prefixes likewise no longer disqualify a path.
fn should_skip_path(path: &Path) -> bool {
    const SKIP_DIRS: &[&str] = &[
        "node_modules", "target", "build", "dist", ".git",
        "__pycache__", ".next", "out", "vendor", "bin", "obj",
        ".idea", ".vscode", "coverage", ".nyc_output",
    ];

    path.components().any(|component| match component {
        // Non-UTF-8 names can't match a skip entry; treat them as indexable.
        std::path::Component::Normal(name) => name
            .to_str()
            .map_or(false, |s| s.starts_with('.') || SKIP_DIRS.contains(&s)),
        // RootDir / Prefix / CurDir / ParentDir never mark a path hidden.
        _ => false,
    })
}

/// Derive lightweight search tags for a chunk: at most one language tag from
/// the file extension, plus content heuristics (async code, tests, TODOs).
fn extract_tags(file_path: &str, content: &str) -> Vec<String> {
    let ext = std::path::Path::new(file_path)
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("");

    // Language tag from a small extension → name table.
    let language = match ext {
        "rs" => Some("rust"),
        "py" => Some("python"),
        "js" => Some("javascript"),
        "ts" => Some("typescript"),
        "go" => Some("go"),
        _ => None,
    };
    let mut tags: Vec<String> = language.map(str::to_string).into_iter().collect();

    // Content-based heuristics. These are plain substring checks (so e.g.
    // "latest" also counts as "test") — kept identical to the original.
    let markers = [
        (
            content.contains("async fn") || content.contains("async def"),
            "async",
        ),
        (
            content.contains("test") || content.contains("#[test]"),
            "testing",
        ),
        (
            content.contains("TODO") || content.contains("FIXME"),
            "has-todos",
        ),
    ];
    for (present, tag) in markers {
        if present {
            tags.push(tag.to_string());
        }
    }

    tags
}