// rexis_rag/document.rs

1//! # RRAG Document Types
2//!
3//! Core document handling with zero-copy optimizations and efficient processing.
4//! Designed for Rust's ownership system and memory efficiency.
5
6use crate::{RragError, RragResult};
7use serde::{Deserialize, Serialize};
8use std::borrow::Cow;
9use std::collections::HashMap;
10use uuid::Uuid;
11
/// Document metadata: string keys mapped to arbitrary JSON values.
/// (`Cow` is used for `Document::content`, not here — this is a plain map.)
pub type Metadata = HashMap<String, serde_json::Value>;
14
/// Core document type optimized for Rust patterns
///
/// Uses `Cow<'static, str>` for flexible string handling:
/// - `Cow::Borrowed` for `'static` literals (zero-copy)
/// - `Cow::Owned` for runtime-built strings (e.g. after processing)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Document {
    /// Unique document identifier (a UUID v4 string when built via `Document::new`)
    pub id: String,

    /// Document content; (de)serialized as a plain string via `cow_str_serde`
    #[serde(with = "cow_str_serde")]
    pub content: Cow<'static, str>,

    /// Free-form document metadata (string keys -> JSON values)
    pub metadata: Metadata,

    /// Content hash for deduplication; `None` until `with_content_hash` is called
    pub content_hash: Option<String>,

    /// Document creation timestamp (UTC, captured at construction time)
    pub created_at: chrono::DateTime<chrono::Utc>,
}
38
39impl Document {
40    /// Create a new document with generated ID
41    pub fn new(content: impl Into<Cow<'static, str>>) -> Self {
42        let content = content.into();
43        Self {
44            id: Uuid::new_v4().to_string(),
45            content,
46            metadata: HashMap::new(),
47            content_hash: None,
48            created_at: chrono::Utc::now(),
49        }
50    }
51
52    /// Create document with specific ID
53    pub fn with_id(id: impl Into<String>, content: impl Into<Cow<'static, str>>) -> Self {
54        let content = content.into();
55        Self {
56            id: id.into(),
57            content,
58            metadata: HashMap::new(),
59            content_hash: None,
60            created_at: chrono::Utc::now(),
61        }
62    }
63
64    /// Add metadata using builder pattern
65    pub fn with_metadata(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
66        self.metadata.insert(key.into(), value);
67        self
68    }
69
70    /// Add multiple metadata entries
71    pub fn with_metadata_map(mut self, metadata: Metadata) -> Self {
72        self.metadata.extend(metadata);
73        self
74    }
75
76    /// Generate content hash for deduplication
77    pub fn with_content_hash(mut self) -> Self {
78        self.content_hash = Some(Self::hash_content(&self.content));
79        self
80    }
81
82    /// Get content as string slice
83    pub fn content_str(&self) -> &str {
84        &self.content
85    }
86
87    /// Get content length in characters
88    pub fn content_length(&self) -> usize {
89        self.content.chars().count()
90    }
91
92    /// Check if document is empty
93    pub fn is_empty(&self) -> bool {
94        self.content.trim().is_empty()
95    }
96
97    /// Generate hash for content deduplication
98    fn hash_content(content: &str) -> String {
99        // Simple hash implementation - in production, use a proper hash function
100        use std::collections::hash_map::DefaultHasher;
101        use std::hash::{Hash, Hasher};
102
103        let mut hasher = DefaultHasher::new();
104        content.hash(&mut hasher);
105        format!("{:x}", hasher.finish())
106    }
107}
108
/// Document chunk for processing pipelines
///
/// Represents a portion of a document with positional information
/// and overlap handling for better context preservation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
    /// ID of the parent `Document` this chunk was cut from
    pub document_id: String,

    /// Chunk content (owned `String`; chunking always allocates)
    pub content: String,

    /// Zero-based index of this chunk within the document
    pub chunk_index: usize,

    /// Start position in the original document
    /// (NOTE(review): `DocumentChunker` fills this with cumulative byte
    /// offsets over the chunk texts, which do not account for overlap —
    /// verify before using as precise character positions)
    pub start_position: usize,

    /// End position in the original document (same caveat as `start_position`)
    pub end_position: usize,

    /// Overlap with previous chunk (characters); 0 unless set via `with_overlap`
    pub overlap_previous: usize,

    /// Overlap with next chunk (characters); 0 unless set via `with_overlap`
    pub overlap_next: usize,

    /// Chunk metadata (inherited from document + chunk-specific entries)
    pub metadata: Metadata,
}
139
140impl DocumentChunk {
141    /// Create a new document chunk
142    pub fn new(
143        document_id: impl Into<String>,
144        content: impl Into<String>,
145        chunk_index: usize,
146        start_position: usize,
147        end_position: usize,
148    ) -> Self {
149        Self {
150            document_id: document_id.into(),
151            content: content.into(),
152            chunk_index,
153            start_position,
154            end_position,
155            overlap_previous: 0,
156            overlap_next: 0,
157            metadata: HashMap::new(),
158        }
159    }
160
161    /// Set overlap information
162    pub fn with_overlap(mut self, previous: usize, next: usize) -> Self {
163        self.overlap_previous = previous;
164        self.overlap_next = next;
165        self
166    }
167
168    /// Add metadata
169    pub fn with_metadata(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
170        self.metadata.insert(key.into(), value);
171        self
172    }
173
174    /// Get chunk length
175    pub fn length(&self) -> usize {
176        self.content.len()
177    }
178
179    /// Check if chunk is empty
180    pub fn is_empty(&self) -> bool {
181        self.content.trim().is_empty()
182    }
183}
184
/// Document chunking strategy
#[derive(Debug, Clone)]
pub enum ChunkingStrategy {
    /// Fixed size chunking with overlap between consecutive chunks
    FixedSize {
        /// Size of each chunk in characters
        size: usize,
        /// Number of characters to overlap between chunks
        overlap: usize,
    },

    /// Sentence-based chunking (the chunker splits naively on `.`/`!`/`?`)
    Sentence {
        /// Maximum number of sentences per chunk
        max_sentences: usize,
        /// Number of sentences to overlap between chunks
        overlap_sentences: usize,
    },

    /// Paragraph-based chunking (paragraphs are blank-line separated)
    Paragraph {
        /// Maximum number of paragraphs per chunk
        max_paragraphs: usize,
    },

    /// Semantic chunking (requires embeddings).
    /// Not yet implemented: `DocumentChunker::chunk_document` returns an
    /// error for this variant.
    Semantic {
        /// Similarity threshold for semantic boundaries
        similarity_threshold: f32,
    },
}
216
217impl Default for ChunkingStrategy {
218    fn default() -> Self {
219        Self::FixedSize {
220            size: 512,
221            overlap: 64,
222        }
223    }
224}
225
/// Document chunker with various strategies
///
/// Holds only the configured [`ChunkingStrategy`] and takes `&self` in its
/// methods, so a single chunker can be reused across documents.
pub struct DocumentChunker {
    /// Strategy applied by `chunk_document`
    strategy: ChunkingStrategy,
}
230
231impl DocumentChunker {
232    /// Create chunker with default strategy
233    pub fn new() -> Self {
234        Self {
235            strategy: ChunkingStrategy::default(),
236        }
237    }
238
239    /// Create chunker with specific strategy
240    pub fn with_strategy(strategy: ChunkingStrategy) -> Self {
241        Self { strategy }
242    }
243
244    /// Chunk a document into smaller pieces
245    pub fn chunk_document(&self, document: &Document) -> RragResult<Vec<DocumentChunk>> {
246        let content = document.content_str();
247
248        let chunks = match &self.strategy {
249            ChunkingStrategy::FixedSize { size, overlap } => {
250                self.chunk_fixed_size(content, *size, *overlap)
251            }
252            ChunkingStrategy::Sentence {
253                max_sentences,
254                overlap_sentences,
255            } => self.chunk_by_sentences(content, *max_sentences, *overlap_sentences),
256            ChunkingStrategy::Paragraph { max_paragraphs } => {
257                self.chunk_by_paragraphs(content, *max_paragraphs)
258            }
259            ChunkingStrategy::Semantic { .. } => {
260                // Placeholder for semantic chunking
261                return Err(RragError::document_processing(
262                    "Semantic chunking not yet implemented",
263                ));
264            }
265        };
266
267        // Convert to DocumentChunk structs
268        let mut document_chunks = Vec::new();
269        let mut current_position = 0;
270
271        for (i, chunk_content) in chunks.iter().enumerate() {
272            let start_pos = current_position;
273            let end_pos = start_pos + chunk_content.len();
274
275            let mut chunk = DocumentChunk::new(&document.id, chunk_content, i, start_pos, end_pos);
276
277            // Inherit document metadata
278            chunk.metadata = document.metadata.clone();
279
280            // Add chunk-specific metadata
281            chunk = chunk
282                .with_metadata(
283                    "chunk_total",
284                    serde_json::Value::Number(chunks.len().into()),
285                )
286                .with_metadata(
287                    "chunk_strategy",
288                    serde_json::Value::String(
289                        match &self.strategy {
290                            ChunkingStrategy::FixedSize { .. } => "fixed_size",
291                            ChunkingStrategy::Sentence { .. } => "sentence",
292                            ChunkingStrategy::Paragraph { .. } => "paragraph",
293                            ChunkingStrategy::Semantic { .. } => "semantic",
294                        }
295                        .to_string(),
296                    ),
297                );
298
299            document_chunks.push(chunk);
300            current_position = end_pos;
301        }
302
303        Ok(document_chunks)
304    }
305
306    /// Fixed size chunking implementation
307    fn chunk_fixed_size(&self, content: &str, size: usize, overlap: usize) -> Vec<String> {
308        if content.len() <= size {
309            return vec![content.to_string()];
310        }
311
312        let mut chunks = Vec::new();
313        let mut start = 0;
314
315        while start < content.len() {
316            let end = std::cmp::min(start + size, content.len());
317            let chunk = &content[start..end];
318            chunks.push(chunk.to_string());
319
320            if end >= content.len() {
321                break;
322            }
323
324            start = if overlap >= end { 0 } else { end - overlap };
325        }
326
327        chunks
328    }
329
330    /// Sentence-based chunking implementation
331    fn chunk_by_sentences(
332        &self,
333        content: &str,
334        max_sentences: usize,
335        overlap_sentences: usize,
336    ) -> Vec<String> {
337        // Simple sentence splitting - in production, use a proper NLP library
338        let sentences: Vec<&str> = content
339            .split(|c| c == '.' || c == '!' || c == '?')
340            .map(|s| s.trim())
341            .filter(|s| !s.is_empty())
342            .collect();
343
344        if sentences.len() <= max_sentences {
345            return vec![content.to_string()];
346        }
347
348        let mut chunks = Vec::new();
349        let mut start = 0;
350
351        while start < sentences.len() {
352            let end = std::cmp::min(start + max_sentences, sentences.len());
353            let chunk_sentences = &sentences[start..end];
354            let chunk = chunk_sentences.join(". ") + ".";
355            chunks.push(chunk);
356
357            if end >= sentences.len() {
358                break;
359            }
360
361            start = if overlap_sentences >= end {
362                0
363            } else {
364                end - overlap_sentences
365            };
366        }
367
368        chunks
369    }
370
371    /// Paragraph-based chunking implementation
372    fn chunk_by_paragraphs(&self, content: &str, max_paragraphs: usize) -> Vec<String> {
373        let paragraphs: Vec<&str> = content
374            .split("\n\n")
375            .map(|p| p.trim())
376            .filter(|p| !p.is_empty())
377            .collect();
378
379        if paragraphs.len() <= max_paragraphs {
380            return vec![content.to_string()];
381        }
382
383        let mut chunks = Vec::new();
384        let mut current_chunk = Vec::new();
385
386        for paragraph in paragraphs {
387            current_chunk.push(paragraph);
388
389            if current_chunk.len() >= max_paragraphs {
390                chunks.push(current_chunk.join("\n\n"));
391                current_chunk.clear();
392            }
393        }
394
395        // Add remaining paragraphs
396        if !current_chunk.is_empty() {
397            chunks.push(current_chunk.join("\n\n"));
398        }
399
400        chunks
401    }
402}
403
404impl Default for DocumentChunker {
405    fn default() -> Self {
406        Self::new()
407    }
408}
409
410/// Custom serde module for Cow<str> handling
411mod cow_str_serde {
412    use serde::{Deserialize, Deserializer, Serialize, Serializer};
413    use std::borrow::Cow;
414
415    pub fn serialize<S>(cow: &Cow<'static, str>, serializer: S) -> Result<S::Ok, S::Error>
416    where
417        S: Serializer,
418    {
419        cow.as_ref().serialize(serializer)
420    }
421
422    pub fn deserialize<'de, D>(deserializer: D) -> Result<Cow<'static, str>, D::Error>
423    where
424        D: Deserializer<'de>,
425    {
426        let s = String::deserialize(deserializer)?;
427        Ok(Cow::Owned(s))
428    }
429}
430
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_document_creation() {
        let document = Document::new("Test content")
            .with_metadata("source", serde_json::Value::String("test".to_string()));

        assert_eq!(document.content_str(), "Test content");
        assert!(!document.id.is_empty());

        // Metadata round-trips as a JSON string value.
        let source = document.metadata.get("source").unwrap();
        assert_eq!(source.as_str().unwrap(), "test");
    }

    #[test]
    fn test_document_chunk() {
        let chunk = DocumentChunk::new("doc1", "chunk content", 0, 0, 13)
            .with_overlap(0, 5)
            .with_metadata("test", serde_json::Value::String("value".to_string()));

        assert_eq!(chunk.document_id, "doc1");
        assert_eq!(chunk.content, "chunk content");
        assert_eq!(chunk.length(), 13);
        assert_eq!(chunk.overlap_next, 5);
    }

    #[test]
    fn test_fixed_size_chunking() {
        let strategy = ChunkingStrategy::FixedSize {
            size: 10,
            overlap: 3,
        };
        let chunker = DocumentChunker::with_strategy(strategy);

        let document = Document::new("This is a test document for chunking");
        let pieces = chunker.chunk_document(&document).unwrap();

        assert!(!pieces.is_empty());
        assert!(pieces[0].content.len() <= 10);
    }

    #[test]
    fn test_sentence_chunking() {
        let strategy = ChunkingStrategy::Sentence {
            max_sentences: 2,
            overlap_sentences: 1,
        };
        let chunker = DocumentChunker::with_strategy(strategy);

        let text = "First sentence. Second sentence. Third sentence. Fourth sentence.";
        let pieces = chunker.chunk_document(&Document::new(text)).unwrap();

        assert!(!pieces.is_empty());
    }

    #[test]
    fn test_document_hash() {
        let first = Document::new("Same content").with_content_hash();
        let second = Document::new("Same content").with_content_hash();
        let other = Document::new("Different content").with_content_hash();

        // Identical content hashes equal; different content hashes differ.
        assert_eq!(first.content_hash, second.content_hash);
        assert_ne!(first.content_hash, other.content_hash);
    }

    #[test]
    fn test_empty_document() {
        // Whitespace-only content counts as empty.
        assert!(Document::new("   ").is_empty());
        assert!(!Document::new("content").is_empty());
    }
}