use serde::{Deserialize, Serialize};
use std::path::Path;
// Chunk-size presets for `ChunkerConfig`. All values share the unit of chunk
// offsets (presumably characters or bytes — confirm against the chunker
// implementation).

// Balanced defaults used by `ChunkerConfig::default()`.
const DEFAULT_TARGET_CHUNK_SIZE: usize = 1500;
const DEFAULT_MAX_CHUNK_SIZE: usize = 2500;
const DEFAULT_MIN_CHUNK_SIZE: usize = 100;
const DEFAULT_OVERLAP_SIZE: usize = 200;

// Finer-grained preset used by `ChunkerConfig::small()`.
const SMALL_TARGET_CHUNK_SIZE: usize = 800;
const SMALL_MAX_CHUNK_SIZE: usize = 1200;
const SMALL_MIN_CHUNK_SIZE: usize = 50;
const SMALL_OVERLAP_SIZE: usize = 100;

// Coarser preset used by `ChunkerConfig::large()`.
const LARGE_TARGET_CHUNK_SIZE: usize = 2000;
const LARGE_MAX_CHUNK_SIZE: usize = 3500;
const LARGE_MIN_CHUNK_SIZE: usize = 200;
const LARGE_OVERLAP_SIZE: usize = 300;
/// File formats the document pipeline recognizes, plus an [`DocumentType::Unknown`]
/// fallback for anything else.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DocumentType {
    /// Adobe PDF (`.pdf`).
    Pdf,
    /// Markdown text (`.md`, `.markdown`).
    Markdown,
    /// Plain text (`.txt`, `.text`).
    PlainText,
    /// Office Open XML word-processing document (`.docx`).
    Docx,
    /// Unrecognized or missing extension; reported as unsupported by
    /// `is_supported`.
    Unknown,
}
impl DocumentType {
    /// Detect the document type from a file path's extension.
    ///
    /// Paths without an extension (or with a non-UTF-8 extension) map to
    /// [`DocumentType::Unknown`].
    pub fn from_path(path: &Path) -> Self {
        match path.extension().and_then(|e| e.to_str()) {
            Some(ext) => Self::from_extension(ext),
            None => Self::Unknown,
        }
    }

    /// Detect the document type from a bare extension string,
    /// case-insensitively (e.g. `"PDF"` and `"pdf"` are equivalent).
    pub fn from_extension(ext: &str) -> Self {
        let lowered = ext.to_lowercase();
        match lowered.as_str() {
            "pdf" => Self::Pdf,
            "md" | "markdown" => Self::Markdown,
            "txt" | "text" => Self::PlainText,
            "docx" => Self::Docx,
            _ => Self::Unknown,
        }
    }

    /// Canonical lowercase file extension for this type; empty string for
    /// `Unknown`.
    pub fn extension(&self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Markdown => "md",
            Self::PlainText => "txt",
            Self::Docx => "docx",
            Self::Unknown => "",
        }
    }

    /// MIME type for this document type; `Unknown` maps to the generic
    /// `application/octet-stream`.
    pub fn mime_type(&self) -> &'static str {
        match self {
            Self::Pdf => "application/pdf",
            Self::Markdown => "text/markdown",
            Self::PlainText => "text/plain",
            Self::Docx => {
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            }
            Self::Unknown => "application/octet-stream",
        }
    }

    /// `true` for every concrete format; `false` only for `Unknown`.
    pub fn is_supported(&self) -> bool {
        !matches!(self, Self::Unknown)
    }
}
impl std::fmt::Display for DocumentType {
    /// Human-readable label for UI/log output (distinct from `extension()`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Pdf => "PDF",
            Self::Markdown => "Markdown",
            Self::PlainText => "Plain Text",
            Self::Docx => "DOCX",
            Self::Unknown => "Unknown",
        };
        f.write_str(label)
    }
}
/// Descriptive record for one ingested document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// Unique identifier of the document.
    pub document_id: String,
    /// Conversation this document is scoped to, if any.
    pub conversation_id: Option<String>,
    /// Project this document is scoped to, if any.
    pub project_id: Option<String>,
    /// Original file name (including extension).
    pub file_name: String,
    /// Detected document format.
    pub file_type: DocumentType,
    /// Size of the original file in bytes.
    pub file_size_bytes: u64,
    /// Number of chunks the document was split into (0 until chunking).
    pub chunk_count: u32,
    /// Hash of the file contents (algorithm not visible in this file —
    /// confirm with the ingestion code).
    pub file_hash: String,
    /// Unix timestamp in seconds (UTC) when this record was created.
    pub created_at: i64,
    /// Page count, when the format is paginated (e.g. PDF).
    pub page_count: Option<u32>,
    /// Document title, when one was extracted.
    pub title: Option<String>,
}
impl DocumentMetadata {
    /// Create a metadata record for a newly ingested file.
    ///
    /// `created_at` is stamped with the current UTC time; scoping ids,
    /// counts, and title start unset and are filled in via the `with_*`
    /// builder methods.
    pub fn new(
        document_id: String,
        file_name: String,
        file_type: DocumentType,
        file_size_bytes: u64,
        file_hash: String,
    ) -> Self {
        Self {
            document_id,
            file_name,
            file_type,
            file_size_bytes,
            file_hash,
            conversation_id: None,
            project_id: None,
            chunk_count: 0,
            created_at: chrono::Utc::now().timestamp(),
            page_count: None,
            title: None,
        }
    }

    /// Scope this document to a conversation.
    pub fn with_conversation(self, conversation_id: String) -> Self {
        Self {
            conversation_id: Some(conversation_id),
            ..self
        }
    }

    /// Scope this document to a project.
    pub fn with_project(self, project_id: String) -> Self {
        Self {
            project_id: Some(project_id),
            ..self
        }
    }

    /// Record how many chunks the document was split into.
    pub fn with_chunk_count(self, count: u32) -> Self {
        Self {
            chunk_count: count,
            ..self
        }
    }

    /// Record the page count (for paginated formats such as PDF).
    pub fn with_page_count(self, count: u32) -> Self {
        Self {
            page_count: Some(count),
            ..self
        }
    }

    /// Record an extracted title.
    pub fn with_title(self, title: String) -> Self {
        Self {
            title: Some(title),
            ..self
        }
    }
}
/// One contiguous piece of a document's extracted text, ready for indexing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
    /// Deterministic id in the form `"{document_id}:{chunk_index}"`.
    pub chunk_id: String,
    /// Id of the parent document.
    pub document_id: String,
    /// Chunk text.
    pub content: String,
    /// Start offset of the chunk in the source text (unit: presumably
    /// bytes — confirm with the chunker implementation).
    pub start_offset: usize,
    /// End offset of the chunk in the source text (exclusive, per `len()`).
    pub end_offset: usize,
    /// Index of this chunk within its document.
    pub chunk_index: u32,
    /// Total number of chunks in the document.
    pub total_chunks: u32,
    /// Source page number, when known (e.g. PDFs).
    pub page_number: Option<u32>,
    /// Section heading the chunk falls under, when known.
    pub section: Option<String>,
}
impl DocumentChunk {
    /// Create a chunk of `document_id` covering `[start_offset, end_offset)`.
    ///
    /// The chunk id is derived deterministically as
    /// `"{document_id}:{chunk_index}"`; page number and section start unset
    /// and can be attached via `with_page` / `with_section`.
    pub fn new(
        document_id: String,
        content: String,
        start_offset: usize,
        end_offset: usize,
        chunk_index: u32,
        total_chunks: u32,
    ) -> Self {
        let chunk_id = format!("{}:{}", document_id, chunk_index);
        Self {
            chunk_id,
            document_id,
            content,
            start_offset,
            end_offset,
            chunk_index,
            total_chunks,
            page_number: None,
            section: None,
        }
    }

    /// Attach the source page number (e.g. for PDF chunks).
    pub fn with_page(mut self, page: u32) -> Self {
        self.page_number = Some(page);
        self
    }

    /// Attach the section heading this chunk falls under.
    pub fn with_section(mut self, section: String) -> Self {
        self.section = Some(section);
        self
    }

    /// Length of the span this chunk covers in the source document.
    ///
    /// Uses `saturating_sub` so a malformed chunk with
    /// `end_offset < start_offset` reports 0 instead of panicking in debug
    /// builds (or wrapping to a huge value in release builds).
    pub fn len(&self) -> usize {
        self.end_offset.saturating_sub(self.start_offset)
    }

    /// `true` when the chunk carries no text.
    ///
    /// NOTE(review): this checks `content`, not the offset span, so it can
    /// disagree with `len()` if the two were populated inconsistently.
    pub fn is_empty(&self) -> bool {
        self.content.is_empty()
    }
}
/// Tuning knobs for the text chunker.
///
/// All sizes share the unit of chunk offsets (presumably characters or
/// bytes — confirm with the chunker implementation).
#[derive(Debug, Clone)]
pub struct ChunkerConfig {
    /// Preferred chunk size the chunker aims for.
    pub target_chunk_size: usize,
    /// Hard upper bound on a single chunk's size.
    pub max_chunk_size: usize,
    /// Lower bound on a chunk's size (enforcement is up to the chunker).
    pub min_chunk_size: usize,
    /// Overlap carried between consecutive chunks.
    pub overlap_size: usize,
    /// Prefer splitting at header boundaries when possible.
    pub respect_headers: bool,
    /// Prefer splitting at paragraph boundaries when possible.
    pub respect_paragraphs: bool,
}
impl Default for ChunkerConfig {
    /// Balanced defaults (the `DEFAULT_*` constants) with both
    /// structure-respecting options enabled.
    fn default() -> Self {
        Self {
            target_chunk_size: DEFAULT_TARGET_CHUNK_SIZE,
            max_chunk_size: DEFAULT_MAX_CHUNK_SIZE,
            min_chunk_size: DEFAULT_MIN_CHUNK_SIZE,
            overlap_size: DEFAULT_OVERLAP_SIZE,
            respect_headers: true,
            respect_paragraphs: true,
        }
    }
}
impl ChunkerConfig {
    /// Preset tuned for smaller chunks (finer retrieval granularity).
    /// Non-size options are inherited from `Default`.
    pub fn small() -> Self {
        let base = Self::default();
        Self {
            target_chunk_size: SMALL_TARGET_CHUNK_SIZE,
            max_chunk_size: SMALL_MAX_CHUNK_SIZE,
            min_chunk_size: SMALL_MIN_CHUNK_SIZE,
            overlap_size: SMALL_OVERLAP_SIZE,
            ..base
        }
    }

    /// Preset tuned for larger chunks (more context per chunk).
    /// Non-size options are inherited from `Default`.
    pub fn large() -> Self {
        let base = Self::default();
        Self {
            target_chunk_size: LARGE_TARGET_CHUNK_SIZE,
            max_chunk_size: LARGE_MAX_CHUNK_SIZE,
            min_chunk_size: LARGE_MIN_CHUNK_SIZE,
            overlap_size: LARGE_OVERLAP_SIZE,
            ..base
        }
    }
}
/// Parameters for a document search query.
#[derive(Debug, Clone)]
pub struct DocumentSearchRequest {
    /// Free-text query.
    pub query: String,
    /// Restrict results to this conversation, if set.
    pub conversation_id: Option<String>,
    /// Restrict results to this project, if set.
    pub project_id: Option<String>,
    /// Maximum number of results (defaults to 10 in `new`).
    pub limit: usize,
    /// Minimum score for a result to be returned (defaults to 0.5 in `new`).
    pub min_score: f32,
    /// Combine vector and keyword scoring (defaults to `true` in `new`).
    pub hybrid: bool,
    /// Restrict results to a single document format, if set.
    pub file_type: Option<DocumentType>,
}
impl DocumentSearchRequest {
    /// Result cap used when `with_limit` is never called.
    pub const DEFAULT_LIMIT: usize = 10;
    /// Score cutoff used when `with_min_score` is never called.
    pub const DEFAULT_MIN_SCORE: f32 = 0.5;

    /// Create a request with default settings: hybrid search enabled,
    /// [`Self::DEFAULT_LIMIT`] results, [`Self::DEFAULT_MIN_SCORE`] cutoff,
    /// and no scoping filters.
    pub fn new(query: impl Into<String>) -> Self {
        Self {
            query: query.into(),
            conversation_id: None,
            project_id: None,
            limit: Self::DEFAULT_LIMIT,
            min_score: Self::DEFAULT_MIN_SCORE,
            hybrid: true,
            file_type: None,
        }
    }

    /// Restrict results to a single conversation.
    ///
    /// Accepts any `Into<String>` for consistency with [`Self::new`];
    /// existing callers passing `String` are unaffected.
    pub fn with_conversation(mut self, conversation_id: impl Into<String>) -> Self {
        self.conversation_id = Some(conversation_id.into());
        self
    }

    /// Restrict results to a single project.
    pub fn with_project(mut self, project_id: impl Into<String>) -> Self {
        self.project_id = Some(project_id.into());
        self
    }

    /// Cap the number of results returned.
    pub fn with_limit(mut self, limit: usize) -> Self {
        self.limit = limit;
        self
    }

    /// Drop results scoring below `min_score`.
    pub fn with_min_score(mut self, min_score: f32) -> Self {
        self.min_score = min_score;
        self
    }

    /// Enable or disable hybrid (vector + keyword) search.
    pub fn with_hybrid(mut self, hybrid: bool) -> Self {
        self.hybrid = hybrid;
        self
    }

    /// Only return chunks from documents of the given type.
    pub fn with_file_type(mut self, file_type: DocumentType) -> Self {
        self.file_type = Some(file_type);
        self
    }
}
/// One search hit: a chunk's text plus provenance and scoring breakdown.
#[derive(Debug, Clone)]
pub struct DocumentSearchResult {
    /// Id of the matching chunk (`"{document_id}:{chunk_index}"`).
    pub chunk_id: String,
    /// Id of the document the chunk belongs to.
    pub document_id: String,
    /// File name of the source document.
    pub file_name: String,
    /// Chunk text.
    pub content: String,
    /// Overall ranking score; starts equal to `vector_score` and may be
    /// replaced by a combined value via `with_combined_score`.
    pub score: f32,
    /// Raw vector-similarity component.
    pub vector_score: f32,
    /// Keyword (lexical) component, when hybrid search produced one.
    pub keyword_score: Option<f32>,
    /// Index of the chunk within its document.
    pub chunk_index: u32,
    /// Total chunks in the document.
    pub total_chunks: u32,
    /// Section heading the chunk fell under, if tracked.
    pub section: Option<String>,
    /// Source page number, if tracked.
    pub page_number: Option<u32>,
}
impl DocumentSearchResult {
pub fn from_chunk(chunk: &DocumentChunk, file_name: String, vector_score: f32) -> Self {
Self {
chunk_id: chunk.chunk_id.clone(),
document_id: chunk.document_id.clone(),
file_name,
content: chunk.content.clone(),
score: vector_score,
vector_score,
keyword_score: None,
chunk_index: chunk.chunk_index,
total_chunks: chunk.total_chunks,
section: chunk.section.clone(),
page_number: chunk.page_number,
}
}
pub fn with_combined_score(mut self, score: f32) -> Self {
self.score = score;
self
}
pub fn with_keyword_score(mut self, score: f32) -> Self {
self.keyword_score = Some(score);
self
}
}
/// Raw text-extraction output for a single file, before chunking.
#[derive(Debug, Clone)]
pub struct ExtractedDocument {
    /// Full extracted text.
    pub content: String,
    /// Format of the source file.
    pub file_type: DocumentType,
    /// Page count, when the extractor reports one.
    pub page_count: Option<usize>,
    /// Title, when the extractor finds one.
    pub title: Option<String>,
    /// Non-fatal problems encountered during extraction.
    pub warnings: Vec<String>,
}
impl ExtractedDocument {
    /// Wrap freshly extracted text with its detected type; page count,
    /// title, and warnings start empty.
    pub fn new(content: String, file_type: DocumentType) -> Self {
        Self {
            content,
            file_type,
            page_count: None,
            title: None,
            warnings: Vec::new(),
        }
    }

    /// Record the extractor-reported page count.
    pub fn with_page_count(self, count: usize) -> Self {
        Self {
            page_count: Some(count),
            ..self
        }
    }

    /// Record an extracted title.
    pub fn with_title(self, title: String) -> Self {
        Self {
            title: Some(title),
            ..self
        }
    }

    /// Append a non-fatal extraction warning.
    pub fn with_warning(mut self, warning: String) -> Self {
        self.warnings.push(warning);
        self
    }

    /// `true` when the content is blank after trimming whitespace — note
    /// this is stricter than `len() == 0` (whitespace-only text counts as
    /// empty here).
    pub fn is_empty(&self) -> bool {
        self.content.trim().is_empty()
    }

    /// Raw content length in bytes (untrimmed).
    pub fn len(&self) -> usize {
        self.content.len()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // Path-based detection, including the Unknown fallback for an
    // unrecognized extension.
    #[test]
    fn test_document_type_from_path() {
        assert_eq!(
            DocumentType::from_path(&PathBuf::from("test.pdf")),
            DocumentType::Pdf
        );
        assert_eq!(
            DocumentType::from_path(&PathBuf::from("README.md")),
            DocumentType::Markdown
        );
        assert_eq!(
            DocumentType::from_path(&PathBuf::from("notes.txt")),
            DocumentType::PlainText
        );
        assert_eq!(
            DocumentType::from_path(&PathBuf::from("doc.docx")),
            DocumentType::Docx
        );
        assert_eq!(
            DocumentType::from_path(&PathBuf::from("file.xyz")),
            DocumentType::Unknown
        );
    }

    // Extension matching is case-insensitive and accepts aliases
    // ("markdown", "text").
    #[test]
    fn test_document_type_from_extension() {
        assert_eq!(DocumentType::from_extension("PDF"), DocumentType::Pdf);
        assert_eq!(
            DocumentType::from_extension("markdown"),
            DocumentType::Markdown
        );
        assert_eq!(DocumentType::from_extension("TXT"), DocumentType::PlainText);
    }

    #[test]
    fn test_document_type_mime_types() {
        assert_eq!(DocumentType::Pdf.mime_type(), "application/pdf");
        assert_eq!(DocumentType::Markdown.mime_type(), "text/markdown");
        assert_eq!(DocumentType::PlainText.mime_type(), "text/plain");
    }

    // Only Unknown is unsupported.
    #[test]
    fn test_document_type_is_supported() {
        assert!(DocumentType::Pdf.is_supported());
        assert!(DocumentType::Markdown.is_supported());
        assert!(DocumentType::PlainText.is_supported());
        assert!(DocumentType::Docx.is_supported());
        assert!(!DocumentType::Unknown.is_supported());
    }

    // Every with_* builder sets its field and chains fluently.
    #[test]
    fn test_document_metadata_builder() {
        let meta = DocumentMetadata::new(
            "doc-123".to_string(),
            "test.pdf".to_string(),
            DocumentType::Pdf,
            1024,
            "abc123".to_string(),
        )
        .with_conversation("conv-456".to_string())
        .with_project("proj-789".to_string())
        .with_chunk_count(10)
        .with_page_count(5)
        .with_title("Test Document".to_string());
        assert_eq!(meta.document_id, "doc-123");
        assert_eq!(meta.conversation_id, Some("conv-456".to_string()));
        assert_eq!(meta.project_id, Some("proj-789".to_string()));
        assert_eq!(meta.chunk_count, 10);
        assert_eq!(meta.page_count, Some(5));
        assert_eq!(meta.title, Some("Test Document".to_string()));
    }

    // chunk_id is "{document_id}:{chunk_index}"; len() is the offset span.
    #[test]
    fn test_document_chunk_creation() {
        let chunk = DocumentChunk::new(
            "doc-123".to_string(),
            "Hello world".to_string(),
            0,
            11,
            0,
            5,
        );
        assert_eq!(chunk.chunk_id, "doc-123:0");
        assert_eq!(chunk.len(), 11);
        assert!(!chunk.is_empty());
    }

    // Builders override the defaults set by new() (limit 10, min_score 0.5,
    // hybrid true).
    #[test]
    fn test_search_request_builder() {
        let request = DocumentSearchRequest::new("test query")
            .with_conversation("conv-123".to_string())
            .with_limit(20)
            .with_min_score(0.7)
            .with_hybrid(false)
            .with_file_type(DocumentType::Pdf);
        assert_eq!(request.query, "test query");
        assert_eq!(request.conversation_id, Some("conv-123".to_string()));
        assert_eq!(request.limit, 20);
        // Exact f32 comparison is fine: the same literal is stored and read.
        assert_eq!(request.min_score, 0.7);
        assert!(!request.hybrid);
        assert_eq!(request.file_type, Some(DocumentType::Pdf));
    }

    #[test]
    fn test_extracted_document() {
        let doc = ExtractedDocument::new("Hello world".to_string(), DocumentType::PlainText)
            .with_page_count(1)
            .with_title("Test".to_string())
            .with_warning("Some warning".to_string());
        assert_eq!(doc.len(), 11);
        assert!(!doc.is_empty());
        assert_eq!(doc.page_count, Some(1));
        assert_eq!(doc.title, Some("Test".to_string()));
        assert_eq!(doc.warnings.len(), 1);
    }
}