use std::collections::HashSet;
use std::path::Path;
use std::sync::Arc;
use cached::proc_macro::cached;
use fff_search::{
parse_grep_query, FFFMode, FilePicker, FilePickerOptions, GrepMode, GrepSearchOptions,
SharedFrecency,
};
use terraphim_config::Haystack;
use terraphim_persistence::Persistable;
use terraphim_types::{Document, DocumentType, Index};
use tokio::fs as tfs;
use super::IndexMiddleware;
use crate::Result;
/// Returns the largest byte offset `<= index` that lies on a UTF-8 character
/// boundary of `s`.
///
/// Offsets at or beyond the end of the string are clamped to `s.len()`, so the
/// result is always safe to use as the end of a `&s[..n]` slice.
fn floor_char_boundary(s: &str, index: usize) -> usize {
    let len = s.len();
    if index >= len {
        return len;
    }
    // Offset 0 is always a boundary, so the search cannot come up empty; the
    // fallback only exists to avoid an `unwrap`.
    (0..=index)
        .rev()
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(0)
}
/// Haystack indexer that searches files through the `fff_search` crate.
///
/// An instance counts as "stateful" (see `is_stateful`) when either optional
/// field below is populated; stateful instances bypass the memoized search
/// path in `IndexMiddleware::index`.
pub struct FffIndexer {
    /// Optional shared knowledge-graph path scorer, installed via
    /// `with_kg_scorer`. When present, `index_inner` raises the grep page
    /// limit from 200 to 1000.
    kg_scorer: Option<Arc<terraphim_file_search::kg_scorer::KgPathScorer>>,
    /// Optional shared frecency handle. `Default` populates it when the
    /// `FFF_FRECENCY_PATH` environment variable points at a tracker that can
    /// be opened.
    frecency: Option<SharedFrecency>,
}
impl Default for FffIndexer {
    /// Builds an indexer without a knowledge-graph scorer.
    ///
    /// If `FFF_FRECENCY_PATH` is set and a tracker can be opened at that
    /// location, the tracker is installed into a fresh `SharedFrecency`;
    /// otherwise the indexer has no frecency handle. Failures while opening
    /// or installing the tracker are deliberately ignored (best effort).
    fn default() -> Self {
        let frecency = match std::env::var("FFF_FRECENCY_PATH") {
            Ok(path) => match fff_search::FrecencyTracker::open(&path) {
                Ok(tracker) => {
                    let shared = SharedFrecency::default();
                    // The handle is kept even if installing the tracker
                    // fails; the outcome of `init` is intentionally discarded.
                    let _ = shared.init(tracker);
                    Some(shared)
                }
                Err(_) => None,
            },
            Err(_) => None,
        };

        Self {
            kg_scorer: None,
            frecency,
        }
    }
}
/// Memoized search entry point used for stateless indexers.
///
/// Results are cached (bounded to 64 entries, successful results only) under a
/// key built from the haystack location, the needle and the haystack's extra
/// parameters. The parameters are sorted before being formatted: `HashMap`
/// iteration order is unspecified, so formatting the map directly could yield
/// different keys for identical configurations and defeat the cache.
///
/// NOTE(review): no expiry is configured on the cache here, so a cached result
/// appears to be reused even after files in the haystack change — confirm this
/// is intended.
#[cached(
    result = true,
    size = 64,
    key = "String",
    convert = r#"{
        let mut params: Vec<_> = haystack.get_extra_parameters().iter().collect();
        params.sort_unstable();
        format!("{}::{}::{:?}", haystack.location, needle, params)
    }"#
)]
async fn cached_fff_index(needle: &str, haystack: &Haystack) -> Result<Index> {
    // A fresh indexer per cache miss keeps the cached function free of borrowed state.
    let indexer = FffIndexer::default();
    indexer.index_inner(needle, haystack).await
}
impl IndexMiddleware for FffIndexer {
    /// Searches `haystack` for `needle`.
    ///
    /// Stateless indexers go through the memoized `cached_fff_index`; indexers
    /// carrying a scorer or frecency handle always run a fresh search on
    /// `self`.
    async fn index(&self, needle: &str, haystack: &Haystack) -> Result<Index> {
        if !self.is_stateful() {
            return cached_fff_index(needle, haystack).await;
        }
        self.index_inner(needle, haystack).await
    }
}
impl FffIndexer {
pub fn new() -> Self {
Self::default()
}
/// Reports whether this indexer carries a knowledge-graph scorer or a
/// frecency handle. Only indexers with neither are routed through the
/// memoized search path.
pub(crate) fn is_stateful(&self) -> bool {
    let stateless = self.kg_scorer.is_none() && self.frecency.is_none();
    !stateless
}
/// Resolves the file extensions a haystack wants indexed.
///
/// Lookup order in the haystack's extra parameters:
/// 1. `extensions` — comma-separated list,
/// 2. `extension` — same format, accepted as an alias,
/// 3. `type` equal to `markdown` or `md` — yields `md` and `markdown`,
/// 4. otherwise just `md`.
///
/// List entries are trimmed, a leading `.` is stripped (extensions are later
/// compared against `Path::extension`, which never includes the dot) and
/// empty entries are dropped. A list that ends up empty is treated as absent,
/// so lookup continues with the next rule instead of filtering out every file.
fn allowed_extensions(haystack: &Haystack) -> Vec<String> {
    let params = haystack.get_extra_parameters();

    for key in ["extensions", "extension"] {
        if let Some(value) = params.get(key) {
            let parsed: Vec<String> = value
                .split(',')
                .map(|entry| entry.trim().trim_start_matches('.'))
                .filter(|entry| !entry.is_empty())
                .map(String::from)
                .collect();
            if !parsed.is_empty() {
                return parsed;
            }
        }
    }

    if params
        .get("type")
        .is_some_and(|v| v == "markdown" || v == "md")
    {
        return vec!["md".to_string(), "markdown".to_string()];
    }

    vec!["md".to_string()]
}
/// Returns `true` when the extension of `relative_path` exactly matches one of
/// the entries in `allowed`.
///
/// The comparison is case-sensitive. Paths without an extension, or whose
/// extension is not valid UTF-8, are rejected.
fn file_extension_allowed(relative_path: &str, allowed: &[String]) -> bool {
    let ext = match Path::new(relative_path).extension() {
        Some(os_ext) => os_ext.to_str(),
        None => None,
    };
    match ext {
        Some(ext) => allowed.iter().any(|candidate| candidate.as_str() == ext),
        None => false,
    }
}
pub fn with_kg_scorer(
mut self,
scorer: Arc<terraphim_file_search::kg_scorer::KgPathScorer>,
) -> Self {
self.kg_scorer = Some(scorer);
self
}
/// Writes `document.body` back to the file named by `document.url`.
///
/// A body containing both `<` and `>` is treated as HTML and converted to
/// Markdown with `html2md` before writing; any other body is written as-is.
///
/// # Errors
///
/// Returns the I/O error from the write, e.g. when the parent directory is
/// missing (which is additionally logged as a warning beforehand).
pub async fn update_document(&self, document: &Document) -> Result<()> {
    let path = Path::new(&document.url);

    // `Path::parent` yields `Some("")` for a bare file name. An empty path
    // never "exists", so skip it rather than warn about a directory that is
    // really the current working directory.
    if let Some(parent) = path.parent() {
        if !parent.as_os_str().is_empty() && !parent.exists() {
            log::warn!("Parent directory does not exist for {:?}", path);
        }
    }

    // Only allocate when a conversion actually happens; otherwise write the
    // document body straight from the borrowed string.
    let converted;
    let content: &str = if document.body.contains('<') && document.body.contains('>') {
        log::debug!("Converting HTML content to Markdown for file {:?}", path);
        converted = html2md::parse_html(&document.body);
        &converted
    } else {
        &document.body
    };

    log::info!("Writing updated document back to markdown file: {:?}", path);
    tfs::write(path, content).await?;
    Ok(())
}
/// Derives the index id for a file: the path is prefixed with `fff_` and the
/// result is passed through `normalize_key`.
///
/// `normalize_key` is an instance method, so a throwaway `Document` is built
/// purely to have a receiver; none of its field values are passed to the call.
fn normalize_document_id(&self, file_path: &str) -> String {
    let prefixed = format!("fff_{}", file_path);
    let placeholder = Document {
        id: "dummy".to_string(),
        title: "dummy".to_string(),
        body: "dummy".to_string(),
        url: "dummy".to_string(),
        description: None,
        summarization: None,
        stub: None,
        tags: None,
        rank: None,
        source_haystack: None,
        doc_type: DocumentType::KgEntry,
        synonyms: None,
        route: None,
        priority: None,
        quality_score: None,
    };
    placeholder.normalize_key(&prefixed)
}
async fn index_inner(&self, needle: &str, haystack: &Haystack) -> Result<Index> {
let haystack_path = Path::new(&haystack.location);
log::debug!(
"FffIndexer::index called with needle: '{}' haystack: {:?}",
needle,
haystack_path
);
if !haystack_path.exists() {
log::warn!("Haystack path does not exist: {:?}", haystack_path);
return Ok(Index::default());
}
let mut picker = FilePicker::new(FilePickerOptions {
base_path: haystack.location.clone(),
mode: FFFMode::Ai,
watch: false,
cache_budget: None,
..FilePickerOptions::default()
})
.map_err(|e| crate::Error::FileSearch(e.to_string()))?;
picker
.collect_files()
.map_err(|e| crate::Error::FileSearch(e.to_string()))?;
let allowed = Self::allowed_extensions(haystack);
let files: Vec<_> = picker
.get_files()
.iter()
.filter(|f| Self::file_extension_allowed(&f.relative_path(&picker), &allowed))
.collect();
log::debug!(
"Found {} files (extensions: {:?}) in haystack: {:?}",
files.len(),
allowed,
haystack_path
);
if files.is_empty() {
return Ok(Index::default());
}
if let Some(ref frecency) = self.frecency {
log::trace!("Frecency tracker configured for fff-search indexer");
if let Ok(guard) = frecency.read() {
let _ = guard.as_ref();
}
}
let fff_query = parse_grep_query(needle);
let options = GrepSearchOptions {
max_file_size: 10 * 1024 * 1024,
max_matches_per_file: 200,
smart_case: true,
file_offset: 0,
page_limit: if self.kg_scorer.is_some() { 1000 } else { 200 },
mode: GrepMode::PlainText,
time_budget_ms: 0,
before_context: 0,
after_context: 0,
classify_definitions: false,
..GrepSearchOptions::default()
};
let result = picker.grep(&fff_query, &options);
log::debug!(
"fff-search returned {} matches across {} files",
result.matches.len(),
result.files.len()
);
let mut index = Index::default();
let mut processed_files: HashSet<usize> = HashSet::new();
for m in &result.matches {
let file_index = m.file_index;
if processed_files.contains(&file_index) {
continue;
}
processed_files.insert(file_index);
let file = match result.files.get(file_index) {
Some(f) => f,
None => {
log::warn!("Match referenced invalid file_index: {}", file_index);
continue;
}
};
let relative_path = file.relative_path(&picker);
if !Self::file_extension_allowed(&relative_path, &allowed) {
continue;
}
let full_path = haystack_path.join(relative_path);
let path_str = full_path.to_string_lossy().to_string();
let body = match tfs::read_to_string(&full_path).await {
Ok(body) => body,
Err(e) => {
log::warn!("Failed to read file: {} - {:?}", full_path.display(), e);
continue;
}
};
let title = full_path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("")
.to_string();
let description = {
let cleaned = m.line_content.trim();
if cleaned.is_empty() {
None
} else if cleaned.len() > 200 {
let safe_end = floor_char_boundary(cleaned, 197);
Some(format!("{}...", &cleaned[..safe_end]))
} else {
Some(cleaned.to_string())
}
};
let document = Document {
id: self.normalize_document_id(&path_str),
title,
url: path_str,
body,
description,
summarization: None,
stub: None,
tags: None,
rank: None,
source_haystack: None, doc_type: DocumentType::KgEntry,
synonyms: None,
route: None,
priority: None,
quality_score: None,
};
log::debug!(
"Inserting document into index: {} ({})",
document.title,
document.id
);
index.insert(document.id.clone(), document);
}
log::debug!(
"FffIndexer completed: {} documents in final index",
index.len()
);
Ok(index)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Builds an indexer with no scorer and no frecency handle.
    ///
    /// `FffIndexer::default()` reads `FFF_FRECENCY_PATH`, so using it here
    /// would make the tests depend on the environment they run in.
    fn stateless_indexer() -> FffIndexer {
        FffIndexer {
            kg_scorer: None,
            frecency: None,
        }
    }

    /// Builds a read-only haystack at `location` with the given extra parameters.
    fn haystack_with(location: &str, extra_parameters: HashMap<String, String>) -> Haystack {
        Haystack {
            location: location.to_string(),
            service: terraphim_config::ServiceType::Ripgrep,
            read_only: true,
            fetch_content: false,
            atomic_server_secret: None,
            extra_parameters,
        }
    }

    #[test]
    fn test_normalize_document_id() {
        let indexer = stateless_indexer();
        let id = indexer.normalize_document_id("/path/to/test.md");
        assert!(id.starts_with("fff_"));
        assert!(id.contains("test_md"));
    }

    #[test]
    fn test_normalize_document_id_with_spaces() {
        let indexer = stateless_indexer();
        let id = indexer.normalize_document_id("/path/to/my file.md");
        assert!(id.starts_with("fff_"));
        assert!(id.contains("my_file_md"));
    }

    #[test]
    fn test_allowed_extensions_defaults_to_markdown() {
        let haystack = haystack_with("test", HashMap::new());
        let allowed = FffIndexer::allowed_extensions(&haystack);
        assert_eq!(allowed, vec!["md"]);
    }

    #[test]
    fn test_allowed_extensions_parses_comma_list() {
        let mut params = HashMap::new();
        params.insert("extensions".to_string(), "rs,toml,md".to_string());
        let haystack = haystack_with("crates", params);
        let allowed = FffIndexer::allowed_extensions(&haystack);
        assert_eq!(allowed, vec!["rs", "toml", "md"]);
    }

    #[test]
    fn test_file_extension_allowed() {
        let allowed = vec!["rs".to_string(), "md".to_string()];
        assert!(FffIndexer::file_extension_allowed("lib.rs", &allowed));
        assert!(FffIndexer::file_extension_allowed("main.md", &allowed));
        assert!(!FffIndexer::file_extension_allowed("Cargo.toml", &allowed));
        assert!(!FffIndexer::file_extension_allowed("lib.py", &allowed));
        assert!(!FffIndexer::file_extension_allowed("lib", &allowed));
    }

    #[test]
    fn test_is_stateful_returns_false_when_no_scorer_or_frecency() {
        // Constructed directly so the assertion holds even when
        // FFF_FRECENCY_PATH is set for the test process.
        let indexer = stateless_indexer();
        assert!(!indexer.is_stateful());
    }

    #[test]
    fn test_allowed_extensions_type_markdown() {
        let mut params = HashMap::new();
        params.insert("type".to_string(), "markdown".to_string());
        let haystack = haystack_with("docs", params);
        let allowed = FffIndexer::allowed_extensions(&haystack);
        assert!(allowed.contains(&"md".to_string()));
    }
}