use crate::cli::args::{OutputFormat, QueryArgs};
use crate::cli::commands::{create_framework, print_success, print_warning, truncate_preview};
use crate::cli::error::{CliError, Result};
use crate::encoder::TextEncoder;
use crate::retrieval::bm25::Bm25Index;
use crate::retrieval::hybrid::{compute_weights, merge_results};
use std::path::Path;
/// Run the `query` subcommand: validate CLI flags, execute keyword (BM25)
/// and/or semantic (HDC) retrieval, merge and filter the ranked results,
/// and print them in the requested output format.
///
/// # Arguments
/// * `args` - parsed query flags (query text, `top_k`, `min_score`,
///   `keyword_weight`, mode toggles, `code_aware`, `compact`)
/// * `db_path` - optional database location forwarded to `create_framework`
/// * `format` - `Json`, `Table`, or `Quiet` (concept ids only)
///
/// # Errors
/// * `CliError::Validation` - out-of-range `min_score`/`keyword_weight`,
///   zero `top_k`, or both `--semantic-only` and `--keyword-only` set
/// * `CliError::Persistence` - the semantic probe fails
/// * `CliError::Output` - JSON serialization of the results fails
pub async fn run_query(
    args: QueryArgs,
    db_path: Option<&Path>,
    format: OutputFormat,
) -> Result<()> {
    // ---- Flag validation, before any expensive setup. ----
    if args.min_score < 0.0 || args.min_score > 1.0 {
        return Err(CliError::Validation(format!(
            "min-score must be between 0.0 and 1.0, got {}",
            args.min_score
        )));
    }
    if args.top_k == 0 {
        return Err(CliError::Validation("top-k must be at least 1".into()));
    }
    if let Some(kw) = args.keyword_weight {
        if !(0.0..=1.0).contains(&kw) {
            return Err(CliError::Validation(format!(
                "keyword-weight must be between 0.0 and 1.0, got {}",
                kw
            )));
        }
    }
    // The two mode flags are mutually exclusive; each one disables a retriever.
    if args.semantic_only && args.keyword_only {
        return Err(CliError::Validation(
            "cannot use both --semantic-only and --keyword-only".into(),
        ));
    }
    let framework = create_framework(db_path).await?;
    // Select the encoder variant based on --code-aware.
    let encoder = if args.code_aware {
        TextEncoder::new_code_aware()
    } else {
        TextEncoder::new()
    };
    let query_tokens = tokenize_query(&args.text, args.code_aware);
    // Default is hybrid retrieval; each `--*-only` flag turns one side off.
    let use_bm25 = !args.semantic_only;
    let use_hdc = !args.keyword_only;
    // Semantic retrieval: encode the query text and probe the framework for
    // the top-k nearest concepts.
    let hdc_results = if use_hdc {
        let query_vector = encoder.encode(&args.text);
        Some(
            framework
                .probe(query_vector, args.top_k)
                .await
                .map_err(|e| CliError::Persistence(format!("query operation failed: {}", e)))?,
        )
    } else {
        None
    };
    // Keyword retrieval: build a BM25 index over all stored concepts and
    // search it. An empty index (no indexable text) is treated the same as
    // keyword retrieval being disabled.
    let bm25_results = if use_bm25 {
        let bm25_index = build_bm25_index(&framework).await?;
        if bm25_index.is_empty() {
            None
        } else {
            Some(bm25_index.search(&query_tokens, args.top_k))
        }
    } else {
        None
    };
    // Combine whichever result lists were produced. With both present, an
    // explicit --keyword-weight overrides the token-count heuristic; the
    // weights tuple places the keyword weight first.
    let merged_results = match (bm25_results, hdc_results) {
        (Some(bm25), Some(hdc)) => {
            let weights = if let Some(kw) = args.keyword_weight {
                (kw as f32, (1.0 - kw) as f32)
            } else {
                compute_weights(query_tokens.len())
            };
            merge_results(&bm25, &hdc, weights)
        }
        (Some(bm25), None) => bm25,
        (None, Some(hdc)) => hdc,
        (None, None) => Vec::new(),
    };
    // Drop everything below the user's score threshold.
    let filtered: Vec<_> = merged_results
        .into_iter()
        .filter(|(_, score)| *score >= args.min_score as f32)
        .collect();
    match format {
        OutputFormat::Json => {
            // Enrich each hit with its stored metadata; lookup failures and
            // missing concepts degrade to empty metadata instead of erroring.
            let mut results_json: Vec<serde_json::Value> = Vec::new();
            for (id, score) in &filtered {
                let concept = framework.get_concept(id).await.ok().flatten();
                let metadata_json = concept
                    .as_ref()
                    .map(|c| serde_json::to_value(&c.metadata).unwrap_or(serde_json::json!({})))
                    .unwrap_or(serde_json::json!({}));
                // Preview text and source path may be stored under either of
                // two key names; first match wins.
                let text = metadata_json
                    .get("text_preview")
                    .or_else(|| metadata_json.get("content_preview"))
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string();
                let path = metadata_json
                    .get("source")
                    .or_else(|| metadata_json.get("path"))
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string();
                // --compact truncates previews to 200 characters.
                let display_text = if args.compact {
                    truncate_preview(&text, 200)
                } else {
                    text
                };
                results_json.push(serde_json::json!({
                    "score": score,
                    "text": display_text,
                    "path": path,
                    "metadata": metadata_json
                }));
            }
            println!(
                "{}",
                serde_json::to_string(&results_json)
                    .map_err(|e| CliError::Output(format!("failed to serialize results: {}", e)))?
            );
        }
        OutputFormat::Table => {
            if filtered.is_empty() {
                print_warning("no similar concepts found", format);
            } else {
                print_success(&format!("Found {} results", filtered.len()), format);
                println!("{:<40} {:>12}", "CONCEPT ID", "SCORE");
                println!("{:-<40} {:-<12}", "", "");
                for (id, score) in &filtered {
                    println!("{:<40} {:>12.4}", id, score);
                }
            }
        }
        OutputFormat::Quiet => {
            // Quiet mode: one concept id per line, no scores or decoration.
            for (id, _) in &filtered {
                println!("{}", id);
            }
        }
    }
    Ok(())
}
/// Lowercase the query text and split it into search tokens.
///
/// With `code_aware` set, words are additionally broken apart on code
/// separators (see [`tokenize_code`]); otherwise tokens are plain
/// whitespace-delimited words.
fn tokenize_query(text: &str, code_aware: bool) -> Vec<String> {
    let lowered = text.to_lowercase();
    if code_aware {
        tokenize_code(&lowered)
    } else {
        lowered.split_whitespace().map(str::to_string).collect()
    }
}
fn tokenize_code(text: &str) -> Vec<String> {
let mut tokens = Vec::new();
for word in text.split_whitespace() {
let parts = split_on_separators(word);
tokens.extend(parts);
}
tokens
}
/// Split a whitespace-free word into sub-tokens on common code separators.
///
/// Separators are the two-character path separator `::` and the single
/// characters `_`, `-`, `.`, `/`. Empty fragments (from leading, trailing,
/// or repeated separators) are dropped, so `"foo_bar::baz"` yields
/// `["foo", "bar", "baz"]` and `"::"` yields `[]`. A lone `:` is not a
/// separator: `"a:b"` stays one token, and `"a:::b"` yields `["a", ":b"]`
/// because `::` matches are consumed greedily left-to-right.
fn split_on_separators(word: &str) -> Vec<String> {
    word.split("::")
        .flat_map(|part| part.split(['_', '-', '.', '/']))
        .filter(|token| !token.is_empty())
        .map(str::to_string)
        .collect()
}
/// Build an in-memory BM25 index over every concept currently stored in
/// the framework's singularity.
///
/// Document text is taken from the `text_preview` metadata field, falling
/// back to `content_preview`; concepts with neither (or with empty text)
/// are skipped. Documents are tokenized by a plain lowercase whitespace
/// split.
///
/// NOTE(review): documents are NOT tokenized with the code-aware splitting
/// that `tokenize_query` applies in `--code-aware` mode, so a code-aware
/// query token like "foo" may not match a document token "foo_bar" —
/// confirm this asymmetry is intended.
async fn build_bm25_index(
    framework: &crate::framework::ChaoticSemanticFramework,
) -> Result<Bm25Index> {
    let singularity = framework.singularity();
    let guard = singularity.read().await;
    let mut index = Bm25Index::new();
    for concept in guard.all_concepts() {
        // Prefer `text_preview`; `content_preview` is the fallback key.
        let preview = concept
            .metadata
            .get("text_preview")
            .or_else(|| concept.metadata.get("content_preview"))
            .and_then(|v| v.as_str())
            .unwrap_or("");
        let tokens: Vec<String> = preview
            .to_lowercase()
            .split_whitespace()
            .map(str::to_string)
            .collect();
        // Concepts without usable text contribute nothing to the index.
        if tokens.is_empty() {
            continue;
        }
        index.add_document(&concept.id, &tokens);
    }
    Ok(index)
}