use crate::config::Config;
use crate::embedding::calculate_unique_content_hash;
use crate::indexer::file_utils::FileUtils;
use crate::indexer::markdown_processor::parse_markdown_content;
use crate::indexer::text_processing::{TextChunkWithLines, TextProcessor};
use crate::state::SharedState;
use crate::store::{DocumentBlock, Store, TextBlock};
use anyhow::Result;
/// Reports whether `path` has an extension accepted for text indexing.
///
/// This is a module-level convenience wrapper; the decision itself is made by
/// [`FileUtils::is_allowed_text_extension`].
pub fn is_allowed_text_extension(path: &std::path::Path) -> bool {
    FileUtils::is_allowed_text_extension(path)
}
/// Reports whether `path` names a Markdown file, judged by its extension.
///
/// The extension is lowercased before comparison, so `README.MD` and
/// `guide.Markdown` both match. Returns `false` when the path has no
/// extension or when the extension is not valid UTF-8.
pub fn is_markdown_file(path: &std::path::Path) -> bool {
    path.extension()
        .and_then(|ext| ext.to_str())
        .map(|ext| ext.to_lowercase())
        .map_or(false, |ext| matches!(ext.as_str(), "md" | "markdown"))
}
/// Reports whether `contents` should be treated as text.
///
/// This is a module-level convenience wrapper; the decision itself is made by
/// [`FileUtils::is_text_file`].
pub fn is_text_file(contents: &str) -> bool {
    FileUtils::is_text_file(contents)
}
/// Splits `content` into chunks, each carrying its line range.
///
/// `chunk_size` and `overlap` are forwarded unchanged to
/// [`TextProcessor::chunk_text`], which performs the actual splitting.
pub fn chunk_text(content: &str, chunk_size: usize, overlap: usize) -> Vec<TextChunkWithLines> {
    TextProcessor::chunk_text(content, chunk_size, overlap)
}
/// Chunks a plain-text file and queues the chunks that still need indexing.
///
/// `contents` is split with [`chunk_text`] using `config.index.chunk_size` and
/// `config.index.chunk_overlap`. Each chunk is hashed together with a
/// `"<file_path>#<chunk index>"` key. A chunk is appended to
/// `text_blocks_batch` when `force_reindex` is set in `state`, or when the
/// store reports no existing `"text_blocks"` entry for its hash.
///
/// # Errors
///
/// Returns the error from `store.content_exists` if the lookup fails; chunks
/// pushed before the failure remain in `text_blocks_batch`.
pub async fn process_text_file(
    store: &Store,
    contents: &str,
    file_path: &str,
    text_blocks_batch: &mut Vec<TextBlock>,
    config: &Config,
    state: SharedState,
) -> Result<()> {
    // Read the flag once up front; the temporary returned by `state.read()` is
    // dropped at the end of this statement, before any `.await` below.
    let force_reindex = state.read().force_reindex;
    let chunks = chunk_text(
        contents,
        config.index.chunk_size,
        config.index.chunk_overlap,
    );

    // Consume the chunks by value so each chunk's content is moved into its
    // `TextBlock` instead of being cloned.
    for (chunk_idx, chunk) in chunks.into_iter().enumerate() {
        let chunk_hash = calculate_unique_content_hash(
            &chunk.content,
            &format!("{}#{}", file_path, chunk_idx),
        );

        // With `force_reindex` set, the store lookup is skipped entirely.
        let exists = !force_reindex && store.content_exists(&chunk_hash, "text_blocks").await?;
        if exists {
            continue;
        }

        text_blocks_batch.push(TextBlock {
            path: file_path.to_string(),
            language: "text".to_string(),
            content: chunk.content,
            start_line: chunk.start_line,
            end_line: chunk.end_line,
            hash: chunk_hash,
            distance: None,
        });
    }
    Ok(())
}
/// Parses a Markdown file into document blocks and queues the ones that still
/// need indexing.
///
/// Blocks come from [`parse_markdown_content`]. A block is appended to
/// `document_blocks_batch` when `force_reindex` is set in `state`, or when the
/// store reports no existing `"document_blocks"` entry for the block's hash.
///
/// # Errors
///
/// Returns the error from `store.content_exists` if the lookup fails; blocks
/// pushed before the failure remain in `document_blocks_batch`.
pub async fn process_markdown_file(
    store: &Store,
    contents: &str,
    file_path: &str,
    document_blocks_batch: &mut Vec<DocumentBlock>,
    config: &Config,
    state: SharedState,
) -> Result<()> {
    let force_reindex = state.read().force_reindex;
    let parsed_blocks = parse_markdown_content(contents, file_path, config);

    for block in parsed_blocks {
        // `||` short-circuits, so the store is only queried when a forced
        // reindex was not requested.
        let needs_indexing = force_reindex
            || !store
                .content_exists(&block.hash, "document_blocks")
                .await?;
        if needs_indexing {
            document_blocks_batch.push(block);
        }
    }
    Ok(())
}