1use crate::{RragError, RragResult};
7use serde::{Deserialize, Serialize};
8use std::borrow::Cow;
9use std::collections::HashMap;
10use uuid::Uuid;
11
/// Free-form metadata: string keys mapped to arbitrary JSON values.
pub type Metadata = HashMap<String, serde_json::Value>;
14
/// A piece of text together with an identifier, metadata and bookkeeping
/// fields. Serializable and deserializable through serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Document {
    /// Identifier of the document.
    pub id: String,

    /// The document text.
    ///
    /// Stored as `Cow<'static, str>`, so it can hold either a borrowed
    /// `'static` string or an owned `String`. (De)serialization goes through
    /// the `cow_str_serde` module.
    #[serde(with = "cow_str_serde")]
    pub content: Cow<'static, str>,

    /// Key/value metadata attached to the document.
    pub metadata: Metadata,

    /// Optional hash of `content`; `None` when no hash has been computed.
    pub content_hash: Option<String>,

    /// Creation timestamp in UTC.
    pub created_at: chrono::DateTime<chrono::Utc>,
}
38
39impl Document {
40 pub fn new(content: impl Into<Cow<'static, str>>) -> Self {
42 let content = content.into();
43 Self {
44 id: Uuid::new_v4().to_string(),
45 content,
46 metadata: HashMap::new(),
47 content_hash: None,
48 created_at: chrono::Utc::now(),
49 }
50 }
51
52 pub fn with_id(id: impl Into<String>, content: impl Into<Cow<'static, str>>) -> Self {
54 let content = content.into();
55 Self {
56 id: id.into(),
57 content,
58 metadata: HashMap::new(),
59 content_hash: None,
60 created_at: chrono::Utc::now(),
61 }
62 }
63
64 pub fn with_metadata(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
66 self.metadata.insert(key.into(), value);
67 self
68 }
69
70 pub fn with_metadata_map(mut self, metadata: Metadata) -> Self {
72 self.metadata.extend(metadata);
73 self
74 }
75
76 pub fn with_content_hash(mut self) -> Self {
78 self.content_hash = Some(Self::hash_content(&self.content));
79 self
80 }
81
82 pub fn content_str(&self) -> &str {
84 &self.content
85 }
86
87 pub fn content_length(&self) -> usize {
89 self.content.chars().count()
90 }
91
92 pub fn is_empty(&self) -> bool {
94 self.content.trim().is_empty()
95 }
96
97 fn hash_content(content: &str) -> String {
99 use std::collections::hash_map::DefaultHasher;
101 use std::hash::{Hash, Hasher};
102
103 let mut hasher = DefaultHasher::new();
104 content.hash(&mut hasher);
105 format!("{:x}", hasher.finish())
106 }
107}
108
/// One piece of a document produced by chunking, with its position
/// bookkeeping and metadata. Serializable and deserializable through serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
    /// Id of the document this chunk was taken from.
    pub document_id: String,

    /// Text of the chunk.
    pub content: String,

    /// Zero-based position of this chunk within the chunks of its document.
    pub chunk_index: usize,

    /// Start offset recorded for the chunk by whoever constructed it.
    pub start_position: usize,

    /// End offset recorded for the chunk by whoever constructed it.
    pub end_position: usize,

    /// Amount of overlap with the previous chunk; `0` unless set through
    /// `with_overlap`. NOTE(review): the unit is not fixed by this type.
    pub overlap_previous: usize,

    /// Amount of overlap with the next chunk; `0` unless set through
    /// `with_overlap`. NOTE(review): the unit is not fixed by this type.
    pub overlap_next: usize,

    /// Key/value metadata attached to the chunk.
    pub metadata: Metadata,
}
139
140impl DocumentChunk {
141 pub fn new(
143 document_id: impl Into<String>,
144 content: impl Into<String>,
145 chunk_index: usize,
146 start_position: usize,
147 end_position: usize,
148 ) -> Self {
149 Self {
150 document_id: document_id.into(),
151 content: content.into(),
152 chunk_index,
153 start_position,
154 end_position,
155 overlap_previous: 0,
156 overlap_next: 0,
157 metadata: HashMap::new(),
158 }
159 }
160
161 pub fn with_overlap(mut self, previous: usize, next: usize) -> Self {
163 self.overlap_previous = previous;
164 self.overlap_next = next;
165 self
166 }
167
168 pub fn with_metadata(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
170 self.metadata.insert(key.into(), value);
171 self
172 }
173
174 pub fn length(&self) -> usize {
176 self.content.len()
177 }
178
179 pub fn is_empty(&self) -> bool {
181 self.content.trim().is_empty()
182 }
183}
184
/// The ways a `DocumentChunker` can split document text.
#[derive(Debug, Clone)]
pub enum ChunkingStrategy {
    /// Fixed-length windows over the text.
    FixedSize {
        /// Maximum chunk length. The chunker compares it against `str::len`,
        /// i.e. it is measured in bytes.
        size: usize,
        /// How far each chunk reaches back into the end of the previous one,
        /// in the same unit as `size`.
        overlap: usize,
    },

    /// Groups of sentences. The chunker treats `.`, `!` and `?` as sentence
    /// terminators.
    Sentence {
        /// Maximum number of sentences per chunk.
        max_sentences: usize,
        /// Number of trailing sentences repeated at the start of the next chunk.
        overlap_sentences: usize,
    },

    /// Groups of paragraphs. The chunker splits paragraphs on `"\n\n"`.
    Paragraph {
        /// Maximum number of paragraphs per chunk.
        max_paragraphs: usize,
    },

    /// Similarity-based splitting. `DocumentChunker::chunk_document` currently
    /// rejects this variant with a "not yet implemented" error.
    Semantic {
        /// Similarity threshold. Not read by any code visible in this file.
        similarity_threshold: f32,
    },
}
216
217impl Default for ChunkingStrategy {
218 fn default() -> Self {
219 Self::FixedSize {
220 size: 512,
221 overlap: 64,
222 }
223 }
224}
225
/// Splits a `Document` into `DocumentChunk`s according to a `ChunkingStrategy`.
pub struct DocumentChunker {
    /// Strategy applied by `chunk_document`.
    strategy: ChunkingStrategy,
}
230
231impl DocumentChunker {
232 pub fn new() -> Self {
234 Self {
235 strategy: ChunkingStrategy::default(),
236 }
237 }
238
239 pub fn with_strategy(strategy: ChunkingStrategy) -> Self {
241 Self { strategy }
242 }
243
244 pub fn chunk_document(&self, document: &Document) -> RragResult<Vec<DocumentChunk>> {
246 let content = document.content_str();
247
248 let chunks = match &self.strategy {
249 ChunkingStrategy::FixedSize { size, overlap } => {
250 self.chunk_fixed_size(content, *size, *overlap)
251 }
252 ChunkingStrategy::Sentence {
253 max_sentences,
254 overlap_sentences,
255 } => self.chunk_by_sentences(content, *max_sentences, *overlap_sentences),
256 ChunkingStrategy::Paragraph { max_paragraphs } => {
257 self.chunk_by_paragraphs(content, *max_paragraphs)
258 }
259 ChunkingStrategy::Semantic { .. } => {
260 return Err(RragError::document_processing(
262 "Semantic chunking not yet implemented",
263 ));
264 }
265 };
266
267 let mut document_chunks = Vec::new();
269 let mut current_position = 0;
270
271 for (i, chunk_content) in chunks.iter().enumerate() {
272 let start_pos = current_position;
273 let end_pos = start_pos + chunk_content.len();
274
275 let mut chunk = DocumentChunk::new(&document.id, chunk_content, i, start_pos, end_pos);
276
277 chunk.metadata = document.metadata.clone();
279
280 chunk = chunk
282 .with_metadata(
283 "chunk_total",
284 serde_json::Value::Number(chunks.len().into()),
285 )
286 .with_metadata(
287 "chunk_strategy",
288 serde_json::Value::String(
289 match &self.strategy {
290 ChunkingStrategy::FixedSize { .. } => "fixed_size",
291 ChunkingStrategy::Sentence { .. } => "sentence",
292 ChunkingStrategy::Paragraph { .. } => "paragraph",
293 ChunkingStrategy::Semantic { .. } => "semantic",
294 }
295 .to_string(),
296 ),
297 );
298
299 document_chunks.push(chunk);
300 current_position = end_pos;
301 }
302
303 Ok(document_chunks)
304 }
305
306 fn chunk_fixed_size(&self, content: &str, size: usize, overlap: usize) -> Vec<String> {
308 if content.len() <= size {
309 return vec![content.to_string()];
310 }
311
312 let mut chunks = Vec::new();
313 let mut start = 0;
314
315 while start < content.len() {
316 let end = std::cmp::min(start + size, content.len());
317 let chunk = &content[start..end];
318 chunks.push(chunk.to_string());
319
320 if end >= content.len() {
321 break;
322 }
323
324 start = if overlap >= end { 0 } else { end - overlap };
325 }
326
327 chunks
328 }
329
330 fn chunk_by_sentences(
332 &self,
333 content: &str,
334 max_sentences: usize,
335 overlap_sentences: usize,
336 ) -> Vec<String> {
337 let sentences: Vec<&str> = content
339 .split(|c| c == '.' || c == '!' || c == '?')
340 .map(|s| s.trim())
341 .filter(|s| !s.is_empty())
342 .collect();
343
344 if sentences.len() <= max_sentences {
345 return vec![content.to_string()];
346 }
347
348 let mut chunks = Vec::new();
349 let mut start = 0;
350
351 while start < sentences.len() {
352 let end = std::cmp::min(start + max_sentences, sentences.len());
353 let chunk_sentences = &sentences[start..end];
354 let chunk = chunk_sentences.join(". ") + ".";
355 chunks.push(chunk);
356
357 if end >= sentences.len() {
358 break;
359 }
360
361 start = if overlap_sentences >= end {
362 0
363 } else {
364 end - overlap_sentences
365 };
366 }
367
368 chunks
369 }
370
371 fn chunk_by_paragraphs(&self, content: &str, max_paragraphs: usize) -> Vec<String> {
373 let paragraphs: Vec<&str> = content
374 .split("\n\n")
375 .map(|p| p.trim())
376 .filter(|p| !p.is_empty())
377 .collect();
378
379 if paragraphs.len() <= max_paragraphs {
380 return vec![content.to_string()];
381 }
382
383 let mut chunks = Vec::new();
384 let mut current_chunk = Vec::new();
385
386 for paragraph in paragraphs {
387 current_chunk.push(paragraph);
388
389 if current_chunk.len() >= max_paragraphs {
390 chunks.push(current_chunk.join("\n\n"));
391 current_chunk.clear();
392 }
393 }
394
395 if !current_chunk.is_empty() {
397 chunks.push(current_chunk.join("\n\n"));
398 }
399
400 chunks
401 }
402}
403
404impl Default for DocumentChunker {
405 fn default() -> Self {
406 Self::new()
407 }
408}
409
410mod cow_str_serde {
412 use serde::{Deserialize, Deserializer, Serialize, Serializer};
413 use std::borrow::Cow;
414
415 pub fn serialize<S>(cow: &Cow<'static, str>, serializer: S) -> Result<S::Ok, S::Error>
416 where
417 S: Serializer,
418 {
419 cow.as_ref().serialize(serializer)
420 }
421
422 pub fn deserialize<'de, D>(deserializer: D) -> Result<Cow<'static, str>, D::Error>
423 where
424 D: Deserializer<'de>,
425 {
426 let s = String::deserialize(deserializer)?;
427 Ok(Cow::Owned(s))
428 }
429}
430
#[cfg(test)]
mod tests {
    use super::*;

    /// A new document keeps its content, gets a non-empty id and stores
    /// metadata added through the builder.
    #[test]
    fn test_document_creation() {
        let source = serde_json::Value::String("test".to_string());
        let document = Document::new("Test content").with_metadata("source", source);

        assert_eq!(document.content_str(), "Test content");
        assert!(!document.id.is_empty());
        assert_eq!(
            document.metadata.get("source").unwrap().as_str().unwrap(),
            "test"
        );
    }

    /// Chunk construction and the builder methods keep the supplied values.
    #[test]
    fn test_document_chunk() {
        let tag = serde_json::Value::String("value".to_string());
        let piece = DocumentChunk::new("doc1", "chunk content", 0, 0, 13)
            .with_overlap(0, 5)
            .with_metadata("test", tag);

        assert_eq!(piece.document_id, "doc1");
        assert_eq!(piece.content, "chunk content");
        assert_eq!(piece.length(), 13);
        assert_eq!(piece.overlap_next, 5);
    }

    /// Fixed-size chunking yields at least one chunk and respects the size limit.
    #[test]
    fn test_fixed_size_chunking() {
        let strategy = ChunkingStrategy::FixedSize {
            size: 10,
            overlap: 3,
        };
        let chunker = DocumentChunker::with_strategy(strategy);
        let document = Document::new("This is a test document for chunking");

        let pieces = chunker.chunk_document(&document).unwrap();

        assert!(!pieces.is_empty());
        assert!(pieces[0].content.len() <= 10);
    }

    /// Sentence chunking yields at least one chunk.
    #[test]
    fn test_sentence_chunking() {
        let strategy = ChunkingStrategy::Sentence {
            max_sentences: 2,
            overlap_sentences: 1,
        };
        let chunker = DocumentChunker::with_strategy(strategy);
        let document =
            Document::new("First sentence. Second sentence. Third sentence. Fourth sentence.");

        let pieces = chunker.chunk_document(&document).unwrap();

        assert!(!pieces.is_empty());
    }

    /// Equal content hashes equally; different content hashes differently.
    #[test]
    fn test_document_hash() {
        let first = Document::new("Same content").with_content_hash();
        let twin = Document::new("Same content").with_content_hash();
        let other = Document::new("Different content").with_content_hash();

        assert_eq!(first.content_hash, twin.content_hash);
        assert_ne!(first.content_hash, other.content_hash);
    }

    /// Whitespace-only content counts as empty; real content does not.
    #[test]
    fn test_empty_document() {
        let blank = Document::new("   ");
        assert!(blank.is_empty());

        let filled = Document::new("content");
        assert!(!filled.is_empty());
    }
}