use crate::config::Config;
use anyhow::Result;
pub use octolib::embedding::{
count_tokens, create_embedding_provider_from_parts, split_texts_into_token_limited_batches,
truncate_output, EmbeddingProvider, InputType,
};
pub use octolib::embedding::types::{parse_provider_model, EmbeddingProviderType};
/// Re-exports every public item of `octolib::embedding::types` under a local
/// `types` module, so callers can reach those items through this module.
pub mod types {
pub use octolib::embedding::types::*;
}
/// Re-exports every public item of `octolib::embedding::provider` under a local
/// `provider` module, so callers can reach those items through this module.
pub mod provider {
pub use octolib::embedding::provider::*;
}
/// Settings that drive embedding generation in this module.
///
/// Model strings are expected in `provider:model` form: the generation
/// functions in this file split them at the first `:` and return an error
/// when no `:` is present.
#[derive(Debug, Clone)]
pub struct EmbeddingGenerationConfig {
/// Model string used when the content is flagged as code.
pub code_model: String,
/// Model string used for non-code content.
pub text_model: String,
/// Batch size forwarded to `octolib::embedding::generate_embeddings_batch`.
pub batch_size: usize,
/// Per-batch token limit forwarded to the same octolib call.
/// NOTE(review): presumably caps the tokens sent per request — confirm against octolib.
pub max_tokens_per_batch: usize,
}
impl Default for EmbeddingGenerationConfig {
    /// Builds the fallback configuration: Voyage models for code and text,
    /// a batch size of 16 and a limit of 100 000 tokens per batch.
    fn default() -> Self {
        const DEFAULT_CODE_MODEL: &str = "voyage:voyage-code-3";
        const DEFAULT_TEXT_MODEL: &str = "voyage:voyage-3.5-lite";
        const DEFAULT_BATCH_SIZE: usize = 16;
        const DEFAULT_MAX_TOKENS_PER_BATCH: usize = 100_000;

        Self {
            code_model: String::from(DEFAULT_CODE_MODEL),
            text_model: String::from(DEFAULT_TEXT_MODEL),
            batch_size: DEFAULT_BATCH_SIZE,
            max_tokens_per_batch: DEFAULT_MAX_TOKENS_PER_BATCH,
        }
    }
}
impl From<&Config> for EmbeddingGenerationConfig {
    /// Extracts the embedding-related settings from the application config.
    ///
    /// Model strings are cloned from `config.embedding`; the batch size and
    /// the per-batch token limit are copied from `config.index`.
    fn from(config: &Config) -> Self {
        let models = &config.embedding;
        let limits = &config.index;

        Self {
            max_tokens_per_batch: limits.embeddings_max_tokens_per_batch,
            batch_size: limits.embeddings_batch_size,
            text_model: models.text_model.clone(),
            code_model: models.code_model.clone(),
        }
    }
}
pub async fn generate_embeddings(
contents: &str,
is_code: bool,
config: &Config,
) -> Result<Vec<f32>> {
let embedding_config = EmbeddingGenerationConfig::from(config);
let model_string = if is_code {
&embedding_config.code_model
} else {
&embedding_config.text_model
};
let (provider, model) = if let Some((p, m)) = model_string.split_once(':') {
(p, m)
} else {
return Err(anyhow::anyhow!("Invalid model format: {}", model_string));
};
octolib::embedding::generate_embeddings(contents, provider, model).await
}
pub async fn generate_embeddings_batch(
texts: Vec<String>,
is_code: bool,
config: &Config,
input_type: InputType,
) -> Result<Vec<Vec<f32>>> {
let embedding_config = EmbeddingGenerationConfig::from(config);
let model_string = if is_code {
&embedding_config.code_model
} else {
&embedding_config.text_model
};
let (provider, model) = if let Some((p, m)) = model_string.split_once(':') {
(p, m)
} else {
return Err(anyhow::anyhow!("Invalid model format: {}", model_string));
};
octolib::embedding::generate_embeddings_batch(
texts,
provider,
model,
input_type,
embedding_config.batch_size,
embedding_config.max_tokens_per_batch,
)
.await
}
/// Query embeddings produced by `generate_search_embeddings`.
///
/// A field is `None` when the requested search mode did not ask for that
/// kind of embedding.
#[derive(Debug, Clone)]
pub struct SearchModeEmbeddings {
/// Embedding of the query computed with the code model (modes `code` and `all`).
pub code_embeddings: Option<Vec<f32>>,
/// Embedding of the query computed with the text model (modes `docs`, `text` and `all`).
/// In mode `all` with identical code and text model strings this is a copy of the code embedding.
pub text_embeddings: Option<Vec<f32>>,
}
/// Generates the query embeddings required by a search mode.
///
/// * `"code"` — embeds the query with the code model only.
/// * `"docs"` / `"text"` — embeds the query with the text model only.
/// * `"all"` — produces both embeddings. When the configured code and text
///   model strings are equal, the query is embedded once and the vector is
///   reused for both fields; otherwise the code embedding is generated
///   first, then the text embedding.
///
/// # Errors
///
/// Returns an error for any other `mode`, or when `generate_embeddings`
/// fails.
pub async fn generate_search_embeddings(
    query: &str,
    mode: &str,
    config: &Config,
) -> Result<SearchModeEmbeddings> {
    let (code_embeddings, text_embeddings) = match mode {
        "code" => (Some(generate_embeddings(query, true, config).await?), None),
        "docs" | "text" => (None, Some(generate_embeddings(query, false, config).await?)),
        "all" => {
            let models = EmbeddingGenerationConfig::from(config);
            if models.code_model == models.text_model {
                // Same model for both kinds: one request is enough.
                let shared = generate_embeddings(query, true, config).await?;
                (Some(shared.clone()), Some(shared))
            } else {
                let for_code = generate_embeddings(query, true, config).await?;
                let for_text = generate_embeddings(query, false, config).await?;
                (Some(for_code), Some(for_text))
            }
        }
        _ => {
            return Err(anyhow::anyhow!(
                "Invalid search mode '{}'. Use 'all', 'code', 'docs', or 'text'.",
                mode
            ));
        }
    };

    Ok(SearchModeEmbeddings {
        code_embeddings,
        text_embeddings,
    })
}
/// Returns the lowercase hexadecimal SHA-256 digest of `contents` followed
/// by `file_path`.
///
/// The two inputs are fed to the hasher back to back, with no separator
/// between them.
pub fn calculate_unique_content_hash(contents: &str, file_path: &str) -> String {
    use sha2::{Digest, Sha256};

    let mut digest = Sha256::new();
    for part in [contents, file_path] {
        digest.update(part.as_bytes());
    }
    format!("{:x}", digest.finalize())
}
/// Returns the lowercase hexadecimal SHA-256 digest of `contents`,
/// `file_path` and the decimal text of `start_line` and `end_line`, in that
/// order.
///
/// All parts are fed to the hasher back to back, with no separators between
/// them.
pub fn calculate_content_hash_with_lines(
    contents: &str,
    file_path: &str,
    start_line: usize,
    end_line: usize,
) -> String {
    use sha2::{Digest, Sha256};

    // Decimal start and end joined directly: the same byte stream as hashing
    // the two numbers one after the other.
    let line_range = format!("{}{}", start_line, end_line);

    let mut digest = Sha256::new();
    digest.update(contents.as_bytes());
    digest.update(file_path.as_bytes());
    digest.update(line_range.as_bytes());
    format!("{:x}", digest.finalize())
}
/// Returns the lowercase hexadecimal SHA-256 digest of `contents`.
pub fn calculate_content_hash(contents: &str) -> String {
    use sha2::{Digest, Sha256};

    // One-shot hashing: equivalent to new() + update() + finalize().
    format!("{:x}", Sha256::digest(contents.as_bytes()))
}