use crate::config::Config;
use crate::error::{MemvidError, Result};
use crate::ml::{EmbeddingConfig, EmbeddingModel};
use crate::qr::QrEncoder;
use crate::storage::EncodingStats;
use crate::text::{ChunkMetadata, ChunkingStrategy, PdfProcessor, TextChunker};
use crate::video::encoder::VideoEncoder;
use regex;
use std::path::Path;
/// Stages text chunks in memory and turns them into a QR-code video plus a
/// companion index database.
///
/// Chunks are collected by the `add_*` methods and written out by
/// `build_video` / `build_video_with_progress`; the `append_*` methods extend
/// an already-written video and index.
pub struct MemvidEncoder {
/// Configuration passed to `new`, or the default when `None` was given.
config: Config,
/// Chunks staged so far, in insertion order.
chunks: Vec<ChunkMetadata>,
/// Turns chunk text into QR frames.
qr_encoder: QrEncoder,
/// Chunker built from `config.chunking` with the sentence strategy; used by
/// `add_pdf` and `add_text_file`.
text_chunker: TextChunker,
/// Encodes QR frames into the output video file.
video_encoder: VideoEncoder,
/// Produces embeddings for chunk text during encoding.
embedding_model: EmbeddingModel,
}
impl MemvidEncoder {
/// Creates an encoder from `config`, falling back to the default
/// configuration when `None` is passed.
///
/// # Errors
/// Propagates a failure to build the text chunker, and returns
/// `MemvidError::Generic` (after logging) when the embedding model cannot be
/// initialized.
pub async fn new(config: Option<Config>) -> Result<Self> {
    let config = config.unwrap_or_default();

    let qr_encoder = QrEncoder::new(config.qr.clone());
    let text_chunker = TextChunker::new(config.chunking.clone(), ChunkingStrategy::Sentence)?;
    let video_encoder = VideoEncoder::new(config.video.clone());

    // The encoder is unusable without embeddings, so an initialization
    // failure is logged and turned into a hard error.
    let embedding_model = EmbeddingModel::new(EmbeddingConfig::default())
        .await
        .map_err(|e| {
            log::error!(
                "❌ Failed to initialize embedding model: {}. Encoding will not proceed.",
                e
            );
            MemvidError::Generic(e.to_string())
        })?;
    log::info!(
        "🧠 Embedding model initialized - semantic embeddings will be generated during encoding"
    );

    Ok(Self {
        config,
        chunks: Vec::new(),
        qr_encoder,
        text_chunker,
        video_encoder,
        embedding_model,
    })
}
/// Splits `text` into chunks and stages them.
///
/// A one-off sentence chunker is built from the encoder's chunking
/// configuration with `chunk_size` and `overlap` overridden. The new chunks
/// get consecutive ids continuing after the chunks already staged.
///
/// # Errors
/// Propagates failures from building the chunker or chunking the text.
pub async fn add_text(&mut self, text: &str, chunk_size: usize, overlap: usize) -> Result<()> {
    let mut chunking = self.config.chunking.clone();
    chunking.chunk_size = chunk_size;
    chunking.overlap = overlap;

    let chunker = TextChunker::new(chunking, ChunkingStrategy::Sentence)?;
    let mut produced = chunker.chunk_text(text, None)?;

    // Continue the id sequence from the chunks that are already staged.
    let first_id = self.chunks.len();
    for (chunk, id) in produced.iter_mut().zip(first_id..) {
        chunk.id = id;
    }

    let added = produced.len();
    self.chunks.extend(produced);
    log::info!(
        "Added {} chunks from text. Total: {}",
        added,
        self.chunks.len()
    );
    Ok(())
}
/// Extracts the text of the PDF at `pdf_path`, chunks it with the encoder's
/// own chunker and stages the result, recording the path as the chunk source.
///
/// # Errors
/// Returns `MemvidError::Pdf` when the path does not exist; propagates
/// failures from text extraction and chunking.
pub async fn add_pdf<P: AsRef<Path>>(&mut self, pdf_path: P) -> Result<()> {
    let pdf = pdf_path.as_ref();
    if !pdf.exists() {
        let message = format!("PDF file not found: {}", pdf.display());
        return Err(MemvidError::Pdf(message));
    }

    let extracted = PdfProcessor::extract_text(pdf)?;
    let origin = pdf.to_string_lossy().to_string();
    let mut produced = self.text_chunker.chunk_text(&extracted, Some(origin))?;

    // Continue the id sequence from the chunks that are already staged.
    let first_id = self.chunks.len();
    for (chunk, id) in produced.iter_mut().zip(first_id..) {
        chunk.id = id;
    }

    let added = produced.len();
    self.chunks.extend(produced);
    log::info!(
        "Added {} chunks from PDF {}. Total: {}",
        added,
        pdf.display(),
        self.chunks.len()
    );
    Ok(())
}
/// Reads the file at `file_path` as UTF-8 text, chunks it with the encoder's
/// own chunker and stages the result, recording the path as the chunk source.
///
/// # Errors
/// Returns `MemvidError::Io` when the file cannot be read; propagates
/// chunking failures.
pub async fn add_text_file<P: AsRef<Path>>(&mut self, file_path: P) -> Result<()> {
    let file = file_path.as_ref();
    let contents = std::fs::read_to_string(file).map_err(MemvidError::Io)?;
    let origin = file.to_string_lossy().to_string();
    let mut produced = self.text_chunker.chunk_text(&contents, Some(origin))?;

    // Continue the id sequence from the chunks that are already staged.
    let first_id = self.chunks.len();
    for (chunk, id) in produced.iter_mut().zip(first_id..) {
        chunk.id = id;
    }

    let added = produced.len();
    self.chunks.extend(produced);
    log::info!(
        "Added {} chunks from text file {}. Total: {}",
        added,
        file.display(),
        self.chunks.len()
    );
    Ok(())
}
/// Reads a Markdown file, strips its markup and stages the remaining text
/// using the configured chunk size and overlap.
///
/// The chunks are added through `add_text`, so they carry no source path.
///
/// # Errors
/// Returns `MemvidError::Io` when the file cannot be read; propagates
/// failures from `add_text`.
pub async fn add_markdown_file<P: AsRef<Path>>(&mut self, file_path: P) -> Result<()> {
    let raw = std::fs::read_to_string(file_path).map_err(MemvidError::Io)?;
    let plain = self.extract_text_from_markdown(&raw);

    // Copy the settings out first so the shared borrow of `self` ends
    // before `add_text` borrows it mutably.
    let chunking = &self.config.chunking;
    let (chunk_size, overlap) = (chunking.chunk_size, chunking.overlap);

    self.add_text(&plain, chunk_size, overlap).await
}
/// Encodes every staged chunk into a QR-code video and writes the index.
///
/// Steps: embed all chunk texts in one batch, render one QR frame per chunk
/// (recording the frame number on the chunk), encode the frames into
/// `output_file`, then store the chunks in the database at `index_file`.
///
/// # Errors
/// Returns `MemvidError::Generic` when no chunks are staged, when the
/// embedding model returns a different number of embeddings than there are
/// chunks, or when a frame number does not fit in `u32`. Failures from
/// embedding, QR encoding, video encoding and index saving are propagated.
pub async fn build_video(
    &mut self,
    output_file: &str,
    index_file: &str,
) -> Result<EncodingStats> {
    if self.chunks.is_empty() {
        return Err(MemvidError::Generic(
            "No chunks added for encoding".to_string(),
        ));
    }
    let start_time = std::time::Instant::now();
    log::info!("Starting video encoding for {} chunks", self.chunks.len());
    log::info!(
        "🧠 Generating semantic embeddings for {} chunks...",
        self.chunks.len()
    );
    let chunk_texts: Vec<String> = self.chunks.iter().map(|c| c.text.clone()).collect();
    let embeddings = self.embedding_model.encode_batch(&chunk_texts)?;
    // `zip` stops at the shorter side, so a short batch would silently leave
    // trailing chunks without an embedding. Fail loudly instead.
    if embeddings.len() != self.chunks.len() {
        return Err(MemvidError::Generic(format!(
            "Embedding model returned {} embeddings for {} chunks",
            embeddings.len(),
            self.chunks.len()
        )));
    }
    for (chunk, embedding) in self.chunks.iter_mut().zip(embeddings.iter()) {
        chunk.embedding = Some(embedding.clone());
    }
    log::info!("✅ Generated embeddings for {} chunks", self.chunks.len());

    log::info!("Generating QR code frames...");
    let mut qr_images = Vec::with_capacity(self.chunks.len());
    for (frame_num, chunk) in self.chunks.iter_mut().enumerate() {
        let qr_frame = self.qr_encoder.encode_text(&chunk.text)?;
        qr_images.push(qr_frame.image);
        // Checked conversion: a plain `as u32` would wrap and point several
        // chunks at the same frame.
        let frame = <u32 as std::convert::TryFrom<usize>>::try_from(frame_num).map_err(|_| {
            MemvidError::Generic(format!(
                "Frame number {} does not fit in a 32-bit frame index",
                frame_num
            ))
        })?;
        chunk.frame = Some(frame);
        // Report progress after every 100 completed frames.
        if (frame_num + 1) % 100 == 0 {
            log::info!("Generated {} QR frames", frame_num + 1);
        }
    }
    log::info!("Generated {} QR frames, encoding video...", qr_images.len());

    self.video_encoder
        .encode_frames(&qr_images, output_file)
        .await?;
    // Best effort: the size is informational, so a metadata failure is
    // reported as 0 rather than failing the whole build.
    let video_file_size = std::fs::metadata(output_file).map(|m| m.len()).unwrap_or(0);
    self.save_index(index_file).await?;

    let processing_time = start_time.elapsed().as_secs_f64();
    log::info!(
        "Video encoding completed: {} chunks → {} frames → {} MB in {:.2}s",
        self.chunks.len(),
        qr_images.len(),
        video_file_size as f64 / 1_048_576.0,
        processing_time
    );
    Ok(EncodingStats {
        total_chunks: self.chunks.len(),
        total_frames: qr_images.len(),
        processing_time,
        video_file_size,
    })
}
/// Same pipeline as `build_video`, reporting progress through
/// `progress_callback` as a percentage.
///
/// The callback receives `0.0` at the start, `25.0` once embeddings are
/// done, a value climbing from 25 to 65 after each QR frame, `65.0` before
/// video encoding, `90.0` after it, and `100.0` once the index is saved.
///
/// # Errors
/// Returns `MemvidError::Generic` when no chunks are staged, when the
/// embedding model returns a different number of embeddings than there are
/// chunks, or when a frame number does not fit in `u32`. Failures from
/// embedding, QR encoding, video encoding and index saving are propagated.
pub async fn build_video_with_progress<F>(
    &mut self,
    output_file: &str,
    index_file: &str,
    progress_callback: F,
) -> Result<EncodingStats>
where
    F: Fn(f32),
{
    if self.chunks.is_empty() {
        return Err(MemvidError::Generic(
            "No chunks added for encoding".to_string(),
        ));
    }
    let start_time = std::time::Instant::now();
    let total_chunks = self.chunks.len();
    progress_callback(0.0);

    log::info!(
        "🧠 Generating semantic embeddings for {} chunks...",
        self.chunks.len()
    );
    let chunk_texts: Vec<String> = self.chunks.iter().map(|c| c.text.clone()).collect();
    let embeddings = self.embedding_model.encode_batch(&chunk_texts)?;
    // `zip` stops at the shorter side, so a short batch would silently leave
    // trailing chunks without an embedding. Fail loudly instead.
    if embeddings.len() != total_chunks {
        return Err(MemvidError::Generic(format!(
            "Embedding model returned {} embeddings for {} chunks",
            embeddings.len(),
            total_chunks
        )));
    }
    for (chunk, embedding) in self.chunks.iter_mut().zip(embeddings.iter()) {
        chunk.embedding = Some(embedding.clone());
    }
    log::info!("✅ Generated embeddings for {} chunks", self.chunks.len());
    progress_callback(25.0);

    log::info!("Generating QR code frames...");
    let mut qr_images = Vec::with_capacity(total_chunks);
    for (frame_num, chunk) in self.chunks.iter_mut().enumerate() {
        let qr_frame = self.qr_encoder.encode_text(&chunk.text)?;
        qr_images.push(qr_frame.image);
        // Checked conversion: a plain `as u32` would wrap and point several
        // chunks at the same frame.
        let frame = <u32 as std::convert::TryFrom<usize>>::try_from(frame_num).map_err(|_| {
            MemvidError::Generic(format!(
                "Frame number {} does not fit in a 32-bit frame index",
                frame_num
            ))
        })?;
        chunk.frame = Some(frame);
        // QR generation covers the 25%..65% band of the overall progress.
        let qr_progress = 25.0 + (frame_num + 1) as f32 / total_chunks as f32 * 40.0;
        progress_callback(qr_progress);
    }
    progress_callback(65.0);

    log::info!("Encoding video...");
    self.video_encoder
        .encode_frames(&qr_images, output_file)
        .await?;
    progress_callback(90.0);
    self.save_index(index_file).await?;
    progress_callback(100.0);

    let processing_time = start_time.elapsed().as_secs_f64();
    // Best effort: the size is informational, so a metadata failure is
    // reported as 0 rather than failing the whole build.
    let video_file_size = std::fs::metadata(output_file).map(|m| m.len()).unwrap_or(0);
    log::info!(
        "Video encoding completed: {} chunks → {} frames → {} MB in {:.2}s",
        self.chunks.len(),
        qr_images.len(),
        video_file_size as f64 / 1_048_576.0,
        processing_time
    );
    Ok(EncodingStats {
        total_chunks: self.chunks.len(),
        total_frames: qr_images.len(),
        processing_time,
        video_file_size,
    })
}
/// Writes all staged chunks to the database at `database_file`, creating the
/// parent directory first when the path has one.
///
/// # Errors
/// Returns `MemvidError::Io` when the directory cannot be created;
/// propagates database failures.
async fn save_index(&self, database_file: &str) -> Result<()> {
    // Make sure the directory that will hold the database exists.
    Path::new(database_file)
        .parent()
        .map(std::fs::create_dir_all)
        .transpose()
        .map_err(MemvidError::Io)?;

    let mut db = crate::storage::Database::new(database_file)?;
    db.insert_chunks(&self.chunks)?;

    let db_stats = db.get_stats()?;
    log::info!(
        "Saved {} chunks to SQLite database {} ({} bytes)",
        db_stats.chunk_count,
        database_file,
        db_stats.file_size_bytes
    );
    Ok(())
}
/// Discards every staged chunk and logs that the buffer was cleared.
pub fn clear(&mut self) {
self.chunks.clear();
log::info!("Cleared all chunks");
}
/// Returns a snapshot of the staged chunks.
///
/// `total_frames` counts the chunks that already have a frame number;
/// `processing_time` and `video_file_size` are always zero here.
pub fn get_stats(&self) -> EncodingStats {
    let total_chunks = self.chunks.len();
    let total_frames = self
        .chunks
        .iter()
        .filter(|chunk| chunk.frame.is_some())
        .count();

    EncodingStats {
        total_chunks,
        total_frames,
        processing_time: 0.0,
        video_file_size: 0,
    }
}
/// Returns the number of chunks currently staged in the encoder.
pub fn chunk_count(&self) -> usize {
self.chunks.len()
}
/// Returns the configuration this encoder was created with.
pub fn config(&self) -> &Config {
&self.config
}
/// Strips common Markdown markup from `content` and returns plain text with
/// all whitespace runs collapsed to single spaces.
///
/// Removed: fenced and inline code, heading markers, horizontal rules,
/// images and HTML tags. Links, bold and italic keep their inner text.
///
/// The rules are applied in order, and the order matters:
/// * line-anchored rules use the `(?m)` flag so `^`/`$` match at every line,
///   not only at the start and end of the whole document;
/// * horizontal rules are removed before emphasis, otherwise a `***` line is
///   partially consumed by the italic rule and never recognised;
/// * images are removed before links, otherwise the link rule rewrites
///   `![alt](url)` to `!alt` and the image rule never matches.
fn extract_text_from_markdown(&self, content: &str) -> String {
    // (pattern, replacement) pairs, applied top to bottom.
    const RULES: [(&str, &str); 13] = [
        // Fenced code blocks, then inline code.
        (r"```[\s\S]*?```", " "),
        (r"`[^`]*`", " "),
        // Heading markers at the start of any line.
        (r"(?m)^#{1,6}\s+", ""),
        // Horizontal rules (`---`, `***`), tolerating trailing blanks / CR.
        (r"(?m)^-{3,}[ \t\r]*$", " "),
        (r"(?m)^\*{3,}[ \t\r]*$", " "),
        // Images are dropped entirely; links keep their label.
        (r"!\[[^\]]*\]\([^\)]*\)", " "),
        (r"\[([^\]]*)\]\([^\)]*\)", "$1"),
        // Bold / italic with asterisks, then with underscores.
        (r"\*\*([^\*]*)\*\*", "$1"),
        (r"\*([^\*]*)\*", "$1"),
        (r"__([^_]*)__", "$1"),
        (r"_([^_]*)_", "$1"),
        // HTML tags.
        (r"<[^>]*>", " "),
        // Collapse every whitespace run (including newlines) last.
        (r"\s+", " "),
    ];

    let mut text = content.to_string();
    for &(pattern, replacement) in RULES.iter() {
        // The patterns are compile-time constants, so a failure here is a
        // programming error rather than a runtime condition.
        let re = regex::Regex::new(pattern).expect("hard-coded markdown pattern is valid");
        text = re.replace_all(&text, replacement).into_owned();
    }
    text.trim().to_string()
}
/// Stages pre-split `chunks` as they are, without running the chunker.
///
/// Each string becomes one chunk with a consecutive id continuing after the
/// chunks already staged, no source or page, offset 0, and no frame or
/// embedding yet. Always returns `Ok(())`.
pub fn add_chunks(&mut self, chunks: Vec<String>) -> Result<()> {
    let added = chunks.len();
    let first_id = self.chunks.len();

    self.chunks
        .extend(chunks.into_iter().enumerate().map(|(i, text)| ChunkMetadata {
            id: first_id + i,
            // Measure the length before `text` is moved into the struct.
            length: text.len(),
            text,
            source: None,
            page: None,
            offset: 0,
            frame: None,
            embedding: None,
        }));

    log::info!(
        "Added {} chunks directly. Total: {}",
        added,
        self.chunks.len()
    );
    Ok(())
}
/// Always returns `true`: `new` fails when the embedding model cannot be
/// initialized, so a constructed encoder always has one.
pub fn has_embeddings(&self) -> bool {
true
}
/// Appends `new_chunks` to an already-built video and its index.
///
/// The new chunks are embedded and rendered as QR frames, encoded into a
/// temporary video next to `existing_video_file`, concatenated onto the
/// existing video, and finally inserted into the database at
/// `existing_index_file`. Chunk ids continue from the database's current
/// chunk count and frame numbers from the video's current frame count.
///
/// The chunks staged in the encoder are neither read nor modified. The
/// returned stats count only the appended chunks; `video_file_size` is the
/// size of the combined video.
///
/// Note that the video is replaced before the database is updated, so a
/// database failure leaves the video with frames that are not indexed.
///
/// # Errors
/// Returns `MemvidError::Generic` when `new_chunks` is empty, when the
/// embedding model returns a different number of embeddings than there are
/// chunks, or when a frame offset does not fit in `u32`; `MemvidError::Io`
/// when the combined video cannot be moved into place. Failures from video
/// inspection, the database, QR/video encoding and concatenation are
/// propagated. Temporary video files are removed on failure.
pub async fn append_chunks(
    &mut self,
    existing_video_file: &str,
    existing_index_file: &str,
    new_chunks: Vec<String>,
) -> Result<EncodingStats> {
    if new_chunks.is_empty() {
        return Err(MemvidError::Generic(
            "No new chunks provided for appending".to_string(),
        ));
    }
    let start_time = std::time::Instant::now();
    log::info!(
        "📚 INCREMENTAL UPDATE: Appending {} new chunks to existing knowledge base",
        new_chunks.len()
    );

    let video_decoder = crate::video::decoder::VideoDecoder::new()?;
    let video_info = video_decoder.get_video_info(existing_video_file).await?;
    let current_frame_count = video_info.frame_count;
    log::info!("Current video has {} frames", current_frame_count);

    let existing_db = crate::storage::Database::new(existing_index_file)?;
    let existing_chunk_count = existing_db.get_chunk_count()?;
    log::info!("Current database has {} chunks", existing_chunk_count);

    log::info!("🧠 Generating embeddings for new chunks...");
    let embeddings = self.embedding_model.encode_batch(&new_chunks)?;
    // `zip` stops at the shorter side; a short batch would index fewer
    // chunks than frames are appended, desynchronising video and database.
    if embeddings.len() != new_chunks.len() {
        return Err(MemvidError::Generic(format!(
            "Embedding model returned {} embeddings for {} chunks",
            embeddings.len(),
            new_chunks.len()
        )));
    }

    let mut new_chunk_metadata = Vec::with_capacity(new_chunks.len());
    for (i, (text, embedding)) in new_chunks.iter().zip(embeddings.iter()).enumerate() {
        let chunk_id = existing_chunk_count + i;
        // Checked conversion instead of a wrapping `as u32` cast.
        let frame_offset = <u32 as std::convert::TryFrom<usize>>::try_from(i).map_err(|_| {
            MemvidError::Generic(format!(
                "Frame offset {} does not fit in a 32-bit frame index",
                i
            ))
        })?;
        let frame_num = current_frame_count + frame_offset;
        new_chunk_metadata.push(crate::text::ChunkMetadata {
            id: chunk_id,
            text: text.clone(),
            source: Some("incremental_update".to_string()),
            page: None,
            offset: 0,
            length: text.len(),
            frame: Some(frame_num),
            embedding: Some(embedding.clone()),
        });
    }

    log::info!("Generating QR frames for new chunks...");
    let mut new_qr_images = Vec::with_capacity(new_chunks.len());
    for chunk_text in &new_chunks {
        let qr_frame = self.qr_encoder.encode_text(chunk_text)?;
        new_qr_images.push(qr_frame.image);
    }

    let temp_new_video = format!("{}.new_frames.mp4", existing_video_file);
    let encode_result = self
        .video_encoder
        .encode_frames(&new_qr_images, &temp_new_video)
        .await;
    if encode_result.is_err() {
        // Do not leave a partially written temp video behind.
        let _ = std::fs::remove_file(&temp_new_video);
    }
    encode_result?;

    log::info!("Concatenating new frames to existing video...");
    let temp_combined_video = format!("{}.temp_combined.mp4", existing_video_file);
    let concat_result = self
        .concatenate_videos(existing_video_file, &temp_new_video, &temp_combined_video)
        .await;
    // The new-frames video is no longer needed whether or not the
    // concatenation succeeded.
    let _ = std::fs::remove_file(&temp_new_video);
    if concat_result.is_err() {
        let _ = std::fs::remove_file(&temp_combined_video);
    }
    concat_result?;

    if let Err(e) = std::fs::rename(&temp_combined_video, existing_video_file) {
        let _ = std::fs::remove_file(&temp_combined_video);
        return Err(MemvidError::Io(e));
    }

    log::info!("Updating database with new chunks...");
    let mut updated_db = crate::storage::Database::new(existing_index_file)?;
    updated_db.insert_chunks(&new_chunk_metadata)?;

    let processing_time = start_time.elapsed().as_secs_f64();
    // Best effort: the size is informational only.
    let final_video_size = std::fs::metadata(existing_video_file)
        .map(|m| m.len())
        .unwrap_or(0);
    log::info!(
        "✅ INCREMENTAL UPDATE COMPLETED: Added {} chunks in {:.2}s",
        new_chunks.len(),
        processing_time
    );
    log::info!(
        "Final video: {} frames, {} MB",
        current_frame_count + new_chunks.len() as u32,
        final_video_size as f64 / 1_048_576.0
    );
    Ok(EncodingStats {
        total_chunks: new_chunks.len(),
        total_frames: new_chunks.len(),
        processing_time,
        video_file_size: final_video_size,
    })
}
/// Concatenates `video1` and `video2` into `output` by running the `ffmpeg`
/// command-line tool with its concat demuxer in stream-copy mode.
///
/// A temporary list file named `<output>.concat_list.txt` is written for
/// ffmpeg and removed again whether or not the run succeeds.
///
/// NOTE(review): `Command::status` blocks the calling thread until ffmpeg
/// exits, even though this is an `async fn`.
///
/// # Errors
/// Returns `MemvidError::Video` when FFmpeg initialization fails, when the
/// `ffmpeg` process cannot be started, or when it exits unsuccessfully;
/// `MemvidError::Io` when an input path cannot be resolved or the list file
/// cannot be written.
async fn concatenate_videos(&self, video1: &str, video2: &str, output: &str) -> Result<()> {
    log::info!("Concatenating {} + {} → {}", video1, video2, output);
    ffmpeg_next::init()
        .map_err(|e| MemvidError::Video(format!("FFmpeg init failed: {}", e)))?;

    // Builds one `file '<absolute path>'` line for the concat list. A missing
    // input becomes an I/O error instead of a panic, and single quotes in the
    // path are escaped the way ffmpeg's quoting rules require (`'` -> `'\''`).
    let list_entry = |video: &str| -> Result<String> {
        let absolute = std::path::Path::new(video)
            .canonicalize()
            .map_err(MemvidError::Io)?;
        let escaped = absolute.display().to_string().replace('\'', "'\\''");
        Ok(format!("file '{}'", escaped))
    };
    let concat_content = format!("{}\n{}", list_entry(video1)?, list_entry(video2)?);

    let concat_list_file = format!("{}.concat_list.txt", output);
    std::fs::write(&concat_list_file, concat_content).map_err(MemvidError::Io)?;

    let run_result = std::process::Command::new("ffmpeg")
        .args([
            "-f",
            "concat",
            "-safe",
            "0",
            "-i",
            concat_list_file.as_str(),
            "-c",
            "copy",
            "-y",
            output,
        ])
        .status();
    // Remove the list file before looking at the result so it is cleaned up
    // even when ffmpeg could not be started.
    let _ = std::fs::remove_file(&concat_list_file);

    let output_status = run_result
        .map_err(|e| MemvidError::Video(format!("Failed to execute ffmpeg: {}", e)))?;
    if !output_status.success() {
        return Err(MemvidError::Video(format!(
            "FFmpeg concatenation failed with exit code: {:?}",
            output_status.code()
        )));
    }
    log::info!("Successfully concatenated videos");
    Ok(())
}
/// Appends conversation turns to an existing video and index.
///
/// Each `(human, assistant)` pair becomes two chunks, `"Human: …"` followed
/// by `"Assistant: …"`, which are then handed to `append_chunks`.
///
/// # Errors
/// Propagates any error from `append_chunks`, including its error for an
/// empty chunk list when `conversations` is empty.
pub async fn append_conversation_history(
    &mut self,
    existing_video_file: &str,
    existing_index_file: &str,
    conversations: Vec<(String, String)>,
) -> Result<EncodingStats> {
    let turn_count = conversations.len();
    let mut chunks = Vec::with_capacity(turn_count * 2);

    for (index, (human_msg, ai_response)) in conversations.iter().enumerate() {
        chunks.extend([
            format!("Human: {}", human_msg),
            format!("Assistant: {}", ai_response),
        ]);
        // Log on the first turn and then on every tenth one.
        if index % 10 == 0 {
            log::info!(
                "Prepared {}/{} conversations for storage",
                index + 1,
                turn_count
            );
        }
    }

    log::info!(
        "💬 Storing {} conversation turns ({} chunks) to knowledge base",
        turn_count,
        chunks.len()
    );
    self.append_chunks(existing_video_file, existing_index_file, chunks)
        .await
}
/// Chunks the document at `document_path` and appends those chunks to an
/// existing video and index.
///
/// `.pdf` files go through `add_pdf`; `.txt` and `.md` files go through
/// `add_text_file`. The extension check ignores ASCII case.
///
/// Only the chunks produced by this document are appended; chunks that were
/// already staged in the encoder are left out. The document's chunks remain
/// staged in the encoder afterwards.
///
/// # Errors
/// Returns `MemvidError::Generic` for an unsupported extension; propagates
/// failures from reading/chunking the document and from `append_chunks`.
pub async fn append_document_chunks(
    &mut self,
    existing_video_file: &str,
    existing_index_file: &str,
    document_path: &str,
) -> Result<EncodingStats> {
    log::info!("📄 Processing new document: {}", document_path);

    // Remember where the staged chunks end so that only this document's
    // chunks are appended, not everything staged earlier.
    let staged_before = self.chunks.len();

    let lowered = document_path.to_ascii_lowercase();
    if lowered.ends_with(".pdf") {
        self.add_pdf(std::path::Path::new(document_path)).await?;
    } else if lowered.ends_with(".txt") || lowered.ends_with(".md") {
        self.add_text_file(std::path::Path::new(document_path))
            .await?;
    } else {
        return Err(MemvidError::Generic(format!(
            "Unsupported document format: {}",
            document_path
        )));
    }

    let new_chunks: Vec<String> = self
        .chunks
        .iter()
        .skip(staged_before)
        .map(|c| c.text.clone())
        .collect();
    log::info!("Extracted {} chunks from document", new_chunks.len());
    self.append_chunks(existing_video_file, existing_index_file, new_chunks)
        .await
}
}
// Tests for `MemvidEncoder`. Every test constructs the encoder with
// `MemvidEncoder::new(None).await.unwrap()`, so each one panics if the
// embedding model cannot be initialized in the test environment.
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
// A freshly created encoder has no staged chunks.
#[tokio::test]
async fn test_encoder_creation() {
let encoder = MemvidEncoder::new(None).await.unwrap();
assert_eq!(encoder.chunk_count(), 0);
}
// Text shorter than the requested chunk size (1024) yields exactly one chunk.
#[tokio::test]
async fn test_add_text() {
let mut encoder = MemvidEncoder::new(None).await.unwrap();
let test_text = "This is a longer test text that should definitely meet the minimum chunk size requirement for proper chunking. The text needs to be at least 100 characters long to create a valid chunk.";
encoder.add_text(test_text, 1024, 32).await.unwrap();
assert_eq!(encoder.chunk_count(), 1);
}
// Content written to a temporary file is read back and staged as at least one chunk.
#[tokio::test]
async fn test_add_text_file() {
let mut temp_file = NamedTempFile::new().unwrap();
writeln!(temp_file, "This is test content that is long enough to meet the minimum chunk size requirement. We need at least 100 characters to create a valid chunk for processing.").unwrap();
let mut encoder = MemvidEncoder::new(None).await.unwrap();
encoder.add_text_file(temp_file.path()).await.unwrap();
assert!(encoder.chunk_count() > 0);
}
// `clear` removes every staged chunk.
#[tokio::test]
async fn test_clear() {
let mut encoder = MemvidEncoder::new(None).await.unwrap();
let test_text = "This is test content that is long enough to meet the minimum chunk size requirement for proper chunking and processing. The text needs to be sufficient.";
encoder.add_text(test_text, 1024, 32).await.unwrap();
assert!(encoder.chunk_count() > 0);
encoder.clear();
assert_eq!(encoder.chunk_count(), 0);
}
// `add_chunks` stages the given strings unchanged and in order.
#[tokio::test]
async fn test_add_chunks_direct() {
let mut encoder = MemvidEncoder::new(None).await.unwrap();
let chunks = vec![
"chunk1".to_string(),
"chunk2".to_string(),
"chunk3".to_string(),
];
encoder.add_chunks(chunks.clone()).unwrap();
assert_eq!(encoder.chunk_count(), 3);
assert_eq!(encoder.chunks[0].text, "chunk1");
assert_eq!(encoder.chunks[1].text, "chunk2");
assert_eq!(encoder.chunks[2].text, "chunk3");
}
// Text well beyond the requested chunk size (500) is split into several non-empty chunks.
#[tokio::test]
async fn test_add_text_chunking() {
let mut encoder = MemvidEncoder::new(None).await.unwrap();
let test_text =
"This is a long test document that should be split into multiple chunks. ".repeat(20);
encoder.add_text(&test_text, 500, 50).await.unwrap();
assert!(encoder.chunk_count() > 1);
for chunk in &encoder.chunks {
assert!(!chunk.text.is_empty());
}
}
// End-to-end: builds a video and index in a temporary directory and checks
// that both files exist and that the returned stats match the three chunks.
#[tokio::test]
async fn test_build_video_integration() {
let mut encoder = MemvidEncoder::new(None).await.unwrap();
let chunks = vec![
"Test chunk 1: Important information".to_string(),
"Test chunk 2: More data here".to_string(),
"Test chunk 3: Final piece of info".to_string(),
];
encoder.add_chunks(chunks).unwrap();
let temp_dir = tempfile::tempdir().unwrap();
let video_file = temp_dir.path().join("test.mp4");
let index_file = temp_dir.path().join("test_index.db");
let stats = encoder
.build_video(video_file.to_str().unwrap(), index_file.to_str().unwrap())
.await
.unwrap();
assert!(video_file.exists());
assert!(index_file.exists());
assert_eq!(stats.total_chunks, 3);
assert_eq!(stats.total_frames, 3);
assert!(stats.video_file_size > 0);
assert!(stats.processing_time > 0.0);
}
// Checks `get_stats` after staging three chunks directly.
// NOTE(review): only `total_chunks` is checked against the encoder (twice);
// the average-size assertion is computed from the local input vector alone.
#[tokio::test]
async fn test_encoder_stats_detailed() {
let mut encoder = MemvidEncoder::new(None).await.unwrap();
let chunks = vec![
"short".to_string(),
"medium length chunk".to_string(),
"this is a longer chunk with more text".to_string(),
];
encoder.add_chunks(chunks.clone()).unwrap();
let stats = encoder.get_stats();
assert_eq!(stats.total_chunks, 3);
let total_chars: usize = chunks.iter().map(|c| c.len()).sum();
assert_eq!(stats.total_chunks, 3);
let avg_chunk_size = total_chars as f64 / chunks.len() as f64;
assert!(avg_chunk_size > 0.0);
}
}