use anyhow::Result;
use clap::Parser;
use rayon::prelude::*;
use sift::embed::{AutoEmbedder, EmbedConfig, Embedder};
use sift::index::CodeIndex;
use sift::parser::{parse_file, LanguageId};
use sift::query::QueryEngine;
use std::path::{Path, PathBuf};
use std::time::Instant;
// Command-line interface: one variant per subcommand.
// NOTE: this header is a plain `//` comment on purpose; the top-level help text
// comes from the explicit `about = ...` in `#[command]`. The `///` comments on
// variants and fields below are picked up by clap derive as `--help` text.
#[derive(Parser)]
#[command(name = "sift", version, about = "Structural codebase index for LLM tooling")]
enum Cli {
    /// Build a structural index of a source tree
    Index {
        /// Path of the source tree to index
        path: String,
        /// File to write the index to [default: <PATH>/.sift/index.bin]
        #[arg(short, long)]
        output: Option<String>,
        /// Also set up an embedding backend so symbols can be embedded
        #[arg(long)]
        embed: bool,
    },
    /// Run a query against a saved index and print matches as JSON
    Query {
        /// Query text, e.g. "define <name>", "calls <name>", "file <path>"
        query: String,
        /// Index file to read [default: .sift/index.bin]
        #[arg(short, long)]
        index: Option<String>,
        /// Enable semantic search using the configured embedding backend
        #[arg(long)]
        embed: bool,
    },
    /// Print the built-in usage guide
    Skill,
}
/// Program entry point: parses the command line and forwards to the handler
/// for the selected subcommand, returning whatever that handler returns.
fn main() -> Result<()> {
    let cli = Cli::parse();
    match cli {
        Cli::Skill => cmd_skill(),
        Cli::Query {
            query,
            index,
            embed,
        } => cmd_query(&query, index.as_deref(), embed),
        Cli::Index {
            path,
            output,
            embed,
        } => cmd_index(&path, output.as_deref(), embed),
    }
}
/// Indexes the source tree at `path` and writes the resulting index to disk.
///
/// Steps, in order: validate and canonicalize `path`, resolve the output
/// location, optionally set up an embedder (`embed`), walk the tree for
/// supported files, parse them in parallel, build the `CodeIndex`, save it,
/// and print a summary including timings.
///
/// # Errors
/// Fails when `path` does not exist or cannot be canonicalized, when no
/// supported source files are found, when none of the files could be parsed,
/// or when saving the index fails. Problems with individual walk entries,
/// individual files, or embedder setup are reported as warnings only.
fn cmd_index(path: &str, output: Option<&str>, embed: bool) -> Result<()> {
    let start = Instant::now();

    let root = Path::new(path);
    if !root.exists() {
        anyhow::bail!("path does not exist: {}", path);
    }
    let root = root.canonicalize()?;
    let out_path = resolve_output_path(&root, output);

    let embedder: Option<AutoEmbedder> = if embed { index_embedder() } else { None };

    println!("Indexing {}...", root.display());
    let files = discover_source_files(&root);
    if files.is_empty() {
        anyhow::bail!("no supported source files found in {}", root.display());
    }
    println!("Found {} parseable files", files.len());

    // Parse in parallel; a file that fails to parse is reported and dropped.
    let results: Vec<_> = files
        .par_iter()
        .filter_map(|path| match parse_file(path) {
            Ok(parsed) => Some(parsed),
            Err(e) => {
                eprintln!("  warn: {}: {:#}", path.display(), e);
                None
            }
        })
        .collect();
    if results.is_empty() {
        anyhow::bail!("no files could be parsed in {}", root.display());
    }
    println!("Parsed {} files in {:?}", results.len(), start.elapsed());

    let index = CodeIndex::build(results, &root, embedder.as_ref().map(|e| e as &dyn Embedder));
    index.save(&out_path)?;

    let embedded = index.symbols.iter().filter(|s| s.embedding.is_some()).count();
    println!(
        "Index saved to {} ({} symbols, {} calls, {} imports, {} embedded) in {:?}",
        out_path.display(),
        index.symbols.len(),
        index.calls.len(),
        index.imports.len(),
        embedded,
        start.elapsed(),
    );
    Ok(())
}

/// Sets up the embedder used while indexing, or returns `None` (after printing
/// a warning) when it cannot be constructed.
fn index_embedder() -> Option<AutoEmbedder> {
    let config = EmbedConfig::load();

    // Heuristic pre-check so the user gets setup hints before any lower-level
    // error from the embedder constructor.
    let has_local = cfg!(feature = "candle");
    let has_api = config.api_key.is_some()
        || std::env::var("OPENAI_API_KEY").is_ok()
        || config.api_url.is_some();
    if !has_local && !has_api {
        eprintln!("warn: --embed specified but no embedding backend configured");
        eprintln!("  Set SIFT_EMBED_API_KEY for OpenAI/API embedding, or");
        eprintln!("  set SIFT_EMBED_API_URL for a local API (e.g. Ollama), or");
        eprintln!("  build with `--features candle` for local embeddings");
    }

    match AutoEmbedder::new(&config) {
        Ok(e) => Some(e),
        Err(e) => {
            eprintln!("warn: embedding disabled: {}", e);
            None
        }
    }
}

/// Walks `root` (with the `ignore` crate's standard filters) and collects every
/// regular file that `LanguageId::from_path` recognizes, skipping anything
/// located under a `target` component *inside* `root`.
fn discover_source_files(root: &Path) -> Vec<PathBuf> {
    let walk = ignore::WalkBuilder::new(root)
        .standard_filters(true)
        .build();

    let mut files: Vec<PathBuf> = Vec::new();
    for entry in walk {
        let entry = match entry {
            Ok(e) => e,
            Err(e) => {
                eprintln!("  warn: skipping entry: {:#}", e);
                continue;
            }
        };
        if !entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
            continue;
        }
        let path = entry.path();
        // Only look at components below `root`: checking the absolute path would
        // exclude every file whenever the project itself lives under some
        // ancestor directory that happens to be named `target`.
        let relative = path.strip_prefix(root).unwrap_or(path);
        if relative.components().any(|c| c.as_os_str() == "target") {
            continue;
        }
        if LanguageId::from_path(path).is_some() {
            files.push(path.to_path_buf());
        }
    }
    files
}
/// Executes one query against a saved index and prints the outcome.
///
/// `query_str` is trimmed before use. `index` names the index file; when it is
/// absent, `.sift/index.bin` (relative to the current directory) is used. With
/// `embed` set, an embedder is attached to the query engine; if it cannot be
/// created a warning is printed and the query runs without it.
///
/// Output is `No results` when nothing matched, otherwise pretty-printed JSON.
///
/// # Errors
/// Fails when the trimmed query is empty, the index file does not exist, the
/// index cannot be loaded, or the results cannot be serialized.
fn cmd_query(query_str: &str, index: Option<&str>, embed: bool) -> Result<()> {
    let trimmed = query_str.trim();
    if trimmed.is_empty() {
        anyhow::bail!("query string is empty — try: define <name>, calls <name>, file <path>, etc.");
    }

    let index_path = index
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from(".sift/index.bin"));
    if !index_path.exists() {
        anyhow::bail!(
            "index not found at {} — run `sift index <path>` first",
            index_path.display()
        );
    }
    let loaded = CodeIndex::load(&index_path)?;

    // `None` means embeddings were not requested; `Some(Err(_))` means they were
    // requested but the backend could not be set up.
    let embedder_attempt = if embed {
        let config = EmbedConfig::load();
        Some(AutoEmbedder::new(&config))
    } else {
        None
    };
    let engine = match embedder_attempt {
        Some(Ok(embedder)) => QueryEngine::with_embedder(&loaded, Box::new(embedder)),
        Some(Err(err)) => {
            eprintln!("warn: semantic search disabled: {}", err);
            QueryEngine::new(&loaded)
        }
        None => QueryEngine::new(&loaded),
    };

    let hits = engine.execute(trimmed);
    if hits.is_empty() {
        println!("No results");
    } else {
        let rendered = serde_json::to_string_pretty(&hits)?;
        println!("{rendered}");
    }
    Ok(())
}
/// Prints the built-in Markdown usage guide to stdout.
///
/// The guide is a single raw string literal embedded below. It describes the
/// query forms, embedding configuration, and the JSON output format. The
/// function takes no input and returns `Ok(())` after printing.
fn cmd_skill() -> Result<()> {
// Raw string (`r#"..."#`) so the embedded double quotes need no escaping.
// Everything up to the closing `"#` is emitted verbatim at runtime.
let skill = r#"# sift: Codebase Structural Index
sift builds a structural index of your codebase (symbols, call graphs, imports)
using tree-sitter. It supports Rust, Python, JavaScript, TypeScript, TSX, Go,
C, C++, Java, Ruby, Zig, and Bash — with no API keys or network required.
## When to use sift
Use sift instead of reading files directly when you need to:
- Find where a symbol is defined
- Trace callers/callees of a function
- Find all implementations of an interface/trait
- Discover code relationships across files
- Search for symbols by name pattern
- Find relevant code by describing what it does (semantic search, requires --embed)
sift returns *minimal structured data* — just enough to understand relationships,
not full source code. If you need the actual implementation, read the file directly.
## Available commands
### `sift query "define <name>"`
Find all definitions whose name matches. Returns symbol kind, file, and line range.
### `sift query "calls <name>"`
Find all callers of functions/methods named <name>. Returns file and line per call site.
### `sift query "callees <name>"`
Find all functions called by definitions named <name>.
### `sift query "implements <name>"`
Find all implementations of traits/interfaces named <name>.
### `sift query "file <path>"`
List all symbols defined in a given file.
### `sift query "symbols matching <pattern>"`
Case-insensitive substring search across all symbol names.
### `sift query "semantic <description>"`
Semantic search using embeddings (requires --embed on both index and query).
Embeds the description and returns top-10 symbols ranked by cosine similarity.
Each result includes a `score` field (0.0-1.0). Example:
sift index --embed .
sift query --embed "semantic calculate monthly revenue"
Config via env vars or config file (later wins):
- `~/.config/sift/config.toml`
- `.sift/config.toml` (project-level)
- `SIFT_EMBED_*` env vars
Example `.sift/config.toml`:
[embed]
backend = "api"
api_url = "http://localhost:11434/v1/embeddings"
api_model = "nomic-embed-text"
Env var reference:
SIFT_EMBED_API_KEY # API key (optional with local backends like Ollama)
SIFT_EMBED_API_URL # default: https://api.openai.com/v1/embeddings
SIFT_EMBED_API_MODEL # default: text-embedding-3-small
SIFT_EMBED_BACKEND # "auto" (default), "api", or "local"
SIFT_EMBED_MODEL_PATH # path to local model files (candle feature)
If no backend is available, sift prints a warning explaining how to configure one.
### `sift query "files"`
List all indexed files (relative paths).
## JSON output format
```json
[
{"type": "definition", "name": "...", "kind": "function", "file": "src/foo.rs", "line": 10, "end_line": 42, "doc": "/// Doc comment text"},
{"type": "call", "caller": "foo", "callee": "bar", "file": "src/foo.rs", "line": 15},
{"type": "semantic", "name": "...", "kind": "function", "file": "src/bar.rs", "line": 5, "end_line": 20, "score": 0.92, "doc": "/// Doc comment text"}
]
```
"#;
// `println!` appends one more newline after the literal's own trailing newline.
println!("{skill}");
Ok(())
}
/// Decides which file the index is written to.
///
/// * With an explicit `output`, that path is returned verbatim and nothing is
///   created on disk.
/// * Otherwise the default `<root>/.sift/index.bin` is returned and the `.sift`
///   directory is created if missing.
///
/// Directory creation is best-effort: a failure does not abort (the signature
/// has no error channel), but it is reported on stderr instead of being
/// silently dropped, so a later save error has visible context.
fn resolve_output_path(root: &Path, output: Option<&str>) -> PathBuf {
    if let Some(explicit) = output {
        return PathBuf::from(explicit);
    }

    let dir = root.join(".sift");
    if let Err(err) = std::fs::create_dir_all(&dir) {
        eprintln!("warn: could not create {}: {}", dir.display(), err);
    }
    dir.join("index.bin")
}