// graphrag_core/nlp/semantic_chunking.rs
1//! Semantic Chunking
2//!
3//! This module provides intelligent text chunking strategies that respect:
4//! - Sentence boundaries
5//! - Paragraph structure
6//! - Topic coherence
7//! - Semantic similarity
8//!
9//! ## Chunking Strategies
10//!
11//! 1. **Sentence-based**: Chunks at sentence boundaries
12//! 2. **Paragraph-based**: Chunks at paragraph breaks
13//! 3. **Topic-based**: Chunks when topic shifts detected
14//! 4. **Semantic**: Chunks based on embedding similarity
15//! 5. **Hybrid**: Combines multiple strategies
16
17use serde::{Deserialize, Serialize};
18
19/// Chunking strategy
20#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
21pub enum ChunkingStrategy {
22    /// Fixed-size chunks (character count)
23    FixedSize,
24    /// Sentence boundary-based chunks
25    Sentence,
26    /// Paragraph boundary-based chunks
27    Paragraph,
28    /// Topic shift detection
29    Topic,
30    /// Semantic similarity-based
31    Semantic,
32    /// Hybrid approach (combines multiple strategies)
33    Hybrid,
34}
35
36/// Chunking configuration
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct ChunkingConfig {
39    /// Chunking strategy
40    pub strategy: ChunkingStrategy,
41    /// Target chunk size (characters or sentences)
42    pub target_size: usize,
43    /// Minimum chunk size
44    pub min_size: usize,
45    /// Maximum chunk size
46    pub max_size: usize,
47    /// Overlap size (characters or sentences)
48    pub overlap: usize,
49    /// Similarity threshold for semantic chunking (0.0 to 1.0)
50    pub similarity_threshold: f32,
51}
52
53impl Default for ChunkingConfig {
54    fn default() -> Self {
55        Self {
56            strategy: ChunkingStrategy::Sentence,
57            target_size: 500,
58            min_size: 100,
59            max_size: 1000,
60            overlap: 50,
61            similarity_threshold: 0.7,
62        }
63    }
64}
65
66/// Text chunk with metadata
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct SemanticChunk {
69    /// Chunk text
70    pub text: String,
71    /// Start position in original text
72    pub start: usize,
73    /// End position in original text
74    pub end: usize,
75    /// Sentence count in chunk
76    pub sentence_count: usize,
77    /// Paragraph count in chunk
78    pub paragraph_count: usize,
79    /// Coherence score (higher = more coherent)
80    pub coherence: f32,
81}
82
83/// Semantic chunker
84pub struct SemanticChunker {
85    config: ChunkingConfig,
86}
87
88impl SemanticChunker {
89    /// Create new semantic chunker
90    pub fn new(config: ChunkingConfig) -> Self {
91        Self { config }
92    }
93
94    /// Create with default configuration
95    pub fn default_config() -> Self {
96        Self {
97            config: ChunkingConfig::default(),
98        }
99    }
100
101    /// Chunk text according to strategy
102    pub fn chunk(&self, text: &str) -> Vec<SemanticChunk> {
103        match self.config.strategy {
104            ChunkingStrategy::FixedSize => self.chunk_fixed_size(text),
105            ChunkingStrategy::Sentence => self.chunk_by_sentences(text),
106            ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(text),
107            ChunkingStrategy::Topic => self.chunk_by_topic(text),
108            ChunkingStrategy::Semantic => self.chunk_by_similarity(text),
109            ChunkingStrategy::Hybrid => self.chunk_hybrid(text),
110        }
111    }
112
113    /// Fixed-size chunking (baseline)
114    fn chunk_fixed_size(&self, text: &str) -> Vec<SemanticChunk> {
115        let mut chunks = Vec::new();
116        let chars: Vec<char> = text.chars().collect();
117        let total_len = chars.len();
118        let mut start = 0;
119
120        while start < total_len {
121            let end = (start + self.config.target_size).min(total_len);
122            let chunk_text: String = chars[start..end].iter().collect();
123
124            chunks.push(SemanticChunk {
125                text: chunk_text,
126                start,
127                end,
128                sentence_count: 0, // Will be calculated if needed
129                paragraph_count: 0,
130                coherence: 1.0,
131            });
132
133            start += self.config.target_size - self.config.overlap;
134        }
135
136        chunks
137    }
138
139    /// Sentence-based chunking
140    fn chunk_by_sentences(&self, text: &str) -> Vec<SemanticChunk> {
141        let sentences = self.split_sentences(text);
142        let mut chunks = Vec::new();
143        let mut current_chunk = Vec::new();
144        let mut current_size = 0;
145        let mut chunk_start = 0;
146
147        for sentence in sentences.iter() {
148            let sentence_len = sentence.len();
149
150            // Check if adding this sentence exceeds max size
151            if current_size + sentence_len > self.config.max_size && !current_chunk.is_empty() {
152                // Create chunk from accumulated sentences
153                let chunk_text = current_chunk.join(" ");
154                let chunk_end = chunk_start + chunk_text.len();
155
156                chunks.push(SemanticChunk {
157                    text: chunk_text,
158                    start: chunk_start,
159                    end: chunk_end,
160                    sentence_count: current_chunk.len(),
161                    paragraph_count: self.count_paragraphs(&current_chunk.join(" ")),
162                    coherence: self.calculate_coherence(&current_chunk),
163                });
164
165                // Start new chunk with overlap
166                let overlap_sentences = if current_chunk.len() > 1 {
167                    vec![current_chunk.last().unwrap().clone()]
168                } else {
169                    Vec::new()
170                };
171
172                chunk_start = chunk_end - overlap_sentences.join(" ").len();
173                current_chunk = overlap_sentences;
174                current_size = current_chunk.iter().map(|s| s.len()).sum();
175            }
176
177            current_chunk.push(sentence.clone());
178            current_size += sentence_len;
179
180            // Check if we've reached target size
181            if current_size >= self.config.target_size {
182                let chunk_text = current_chunk.join(" ");
183                let chunk_end = chunk_start + chunk_text.len();
184
185                chunks.push(SemanticChunk {
186                    text: chunk_text,
187                    start: chunk_start,
188                    end: chunk_end,
189                    sentence_count: current_chunk.len(),
190                    paragraph_count: self.count_paragraphs(&current_chunk.join(" ")),
191                    coherence: self.calculate_coherence(&current_chunk),
192                });
193
194                // Start new chunk with overlap
195                let overlap_sentences = if current_chunk.len() > 1 {
196                    vec![current_chunk.last().unwrap().clone()]
197                } else {
198                    Vec::new()
199                };
200
201                chunk_start = chunk_end - overlap_sentences.join(" ").len();
202                current_chunk = overlap_sentences;
203                current_size = current_chunk.iter().map(|s| s.len()).sum();
204            }
205        }
206
207        // Add remaining sentences as final chunk
208        if !current_chunk.is_empty() && current_chunk.join(" ").len() >= self.config.min_size {
209            let chunk_text = current_chunk.join(" ");
210            let chunk_end = chunk_start + chunk_text.len();
211
212            chunks.push(SemanticChunk {
213                text: chunk_text,
214                start: chunk_start,
215                end: chunk_end,
216                sentence_count: current_chunk.len(),
217                paragraph_count: self.count_paragraphs(&current_chunk.join(" ")),
218                coherence: self.calculate_coherence(&current_chunk),
219            });
220        }
221
222        chunks
223    }
224
225    /// Paragraph-based chunking
226    fn chunk_by_paragraphs(&self, text: &str) -> Vec<SemanticChunk> {
227        let paragraphs: Vec<&str> = text
228            .split("\n\n")
229            .filter(|p| !p.trim().is_empty())
230            .collect();
231
232        let mut chunks = Vec::new();
233        let mut current_chunk = Vec::new();
234        let mut current_size = 0;
235        let mut chunk_start = 0;
236
237        for paragraph in paragraphs {
238            let para_len = paragraph.len();
239
240            if current_size + para_len > self.config.max_size && !current_chunk.is_empty() {
241                // Create chunk
242                let chunk_text = current_chunk.join("\n\n");
243                let chunk_end = chunk_start + chunk_text.len();
244
245                chunks.push(SemanticChunk {
246                    text: chunk_text.clone(),
247                    start: chunk_start,
248                    end: chunk_end,
249                    sentence_count: self.count_sentences(&chunk_text),
250                    paragraph_count: current_chunk.len(),
251                    coherence: self.calculate_coherence(&current_chunk),
252                });
253
254                chunk_start = chunk_end;
255                current_chunk = Vec::new();
256                current_size = 0;
257            }
258
259            current_chunk.push(paragraph.to_string());
260            current_size += para_len;
261        }
262
263        // Add remaining chunk
264        if !current_chunk.is_empty() {
265            let chunk_text = current_chunk.join("\n\n");
266            let chunk_end = chunk_start + chunk_text.len();
267
268            chunks.push(SemanticChunk {
269                text: chunk_text.clone(),
270                start: chunk_start,
271                end: chunk_end,
272                sentence_count: self.count_sentences(&chunk_text),
273                paragraph_count: current_chunk.len(),
274                coherence: self.calculate_coherence(&current_chunk),
275            });
276        }
277
278        chunks
279    }
280
281    /// Topic-based chunking (simplified TextTiling algorithm)
282    fn chunk_by_topic(&self, text: &str) -> Vec<SemanticChunk> {
283        let sentences = self.split_sentences(text);
284        let mut chunks = Vec::new();
285
286        // Calculate lexical cohesion scores between adjacent sentences
287        let mut boundaries = vec![0]; // Start of text is always a boundary
288
289        for i in 1..sentences.len() {
290            let cohesion = self.lexical_cohesion(&sentences[i - 1], &sentences[i]);
291
292            // If cohesion is low, mark as potential boundary
293            if cohesion < self.config.similarity_threshold {
294                boundaries.push(i);
295            }
296        }
297
298        boundaries.push(sentences.len()); // End of text is always a boundary
299
300        // Create chunks from boundaries
301        let mut text_pos = 0;
302        for window in boundaries.windows(2) {
303            let start_idx = window[0];
304            let end_idx = window[1];
305
306            let chunk_sentences = &sentences[start_idx..end_idx];
307            let chunk_text = chunk_sentences.join(" ");
308            let chunk_len = chunk_text.len();
309
310            if chunk_len >= self.config.min_size {
311                chunks.push(SemanticChunk {
312                    text: chunk_text,
313                    start: text_pos,
314                    end: text_pos + chunk_len,
315                    sentence_count: chunk_sentences.len(),
316                    paragraph_count: self.count_paragraphs(&chunk_sentences.join(" ")),
317                    coherence: self.calculate_coherence(chunk_sentences),
318                });
319            }
320
321            text_pos += chunk_len;
322        }
323
324        chunks
325    }
326
327    /// Semantic similarity-based chunking
328    ///
329    /// Uses lexical cohesion (word overlap) to measure similarity between sentences.
330    /// Creates chunk boundaries where similarity drops below threshold.
331    fn chunk_by_similarity(&self, text: &str) -> Vec<SemanticChunk> {
332        let sentences = self.split_sentences(text);
333
334        if sentences.is_empty() {
335            return vec![];
336        }
337
338        if sentences.len() == 1 {
339            let sentence = &sentences[0];
340            return vec![SemanticChunk {
341                text: sentence.clone(),
342                start: 0,
343                end: sentence.len(),
344                sentence_count: 1,
345                paragraph_count: 1,
346                coherence: 1.0,
347            }];
348        }
349
350        // Calculate similarity between adjacent sentences
351        let mut similarities = Vec::new();
352        for i in 0..sentences.len() - 1 {
353            let similarity = self.lexical_cohesion(&sentences[i], &sentences[i + 1]);
354            similarities.push(similarity);
355        }
356
357        // Identify chunk boundaries where similarity < threshold
358        let mut boundaries = vec![0]; // Start with first sentence
359        for (i, &similarity) in similarities.iter().enumerate() {
360            if similarity < self.config.similarity_threshold {
361                // Low similarity → create boundary
362                boundaries.push(i + 1);
363            }
364        }
365        boundaries.push(sentences.len()); // End boundary
366
367        // Create chunks from boundaries
368        let mut chunks: Vec<SemanticChunk> = Vec::new();
369        let mut text_pos = 0;
370
371        for window in boundaries.windows(2) {
372            let start_idx = window[0];
373            let end_idx = window[1];
374
375            let chunk_sentences = &sentences[start_idx..end_idx];
376            let chunk_text = chunk_sentences.join(" ");
377            let chunk_len = chunk_text.len();
378
379            // Skip empty chunks
380            if chunk_text.trim().is_empty() {
381                continue;
382            }
383
384            // Check size constraints
385            if chunk_len < self.config.min_size && !chunks.is_empty() {
386                // Merge with previous chunk if too small
387                if let Some(last_chunk) = chunks.last_mut() {
388                    last_chunk.text.push(' ');
389                    last_chunk.text.push_str(&chunk_text);
390                    last_chunk.end = text_pos + chunk_len;
391                    last_chunk.sentence_count += chunk_sentences.len();
392                    last_chunk.paragraph_count = self.count_paragraphs(&last_chunk.text);
393                    last_chunk.coherence =
394                        self.calculate_coherence(&self.split_sentences(&last_chunk.text));
395                    text_pos += chunk_len + 1; // +1 for space
396                    continue;
397                }
398            }
399
400            // Split large chunks by sentence boundaries
401            if chunk_len > self.config.max_size {
402                // Further split by sentences while respecting max_size
403                let mut current_text = String::new();
404                let mut current_start = text_pos;
405                let mut current_sentences = Vec::new();
406
407                for sentence in chunk_sentences {
408                    if current_text.len() + sentence.len() > self.config.max_size
409                        && !current_text.is_empty()
410                    {
411                        // Create chunk
412                        chunks.push(SemanticChunk {
413                            text: current_text.trim().to_string(),
414                            start: current_start,
415                            end: current_start + current_text.len(),
416                            sentence_count: current_sentences.len(),
417                            paragraph_count: self.count_paragraphs(&current_text),
418                            coherence: self.calculate_coherence(&current_sentences),
419                        });
420
421                        current_start += current_text.len() + 1;
422                        current_text = String::new();
423                        current_sentences.clear();
424                    }
425
426                    if !current_text.is_empty() {
427                        current_text.push(' ');
428                    }
429                    current_text.push_str(sentence);
430                    current_sentences.push(sentence.clone());
431                }
432
433                // Add remaining text
434                if !current_text.is_empty() {
435                    chunks.push(SemanticChunk {
436                        text: current_text.trim().to_string(),
437                        start: current_start,
438                        end: current_start + current_text.len(),
439                        sentence_count: current_sentences.len(),
440                        paragraph_count: self.count_paragraphs(&current_text),
441                        coherence: self.calculate_coherence(&current_sentences),
442                    });
443                }
444
445                text_pos += chunk_len + 1;
446            } else {
447                // Normal chunk
448                chunks.push(SemanticChunk {
449                    text: chunk_text.clone(),
450                    start: text_pos,
451                    end: text_pos + chunk_len,
452                    sentence_count: chunk_sentences.len(),
453                    paragraph_count: self.count_paragraphs(&chunk_text),
454                    coherence: self.calculate_coherence(chunk_sentences),
455                });
456
457                text_pos += chunk_len + 1; // +1 for space between chunks
458            }
459        }
460
461        chunks
462    }
463
464    /// Hybrid chunking strategy
465    fn chunk_hybrid(&self, text: &str) -> Vec<SemanticChunk> {
466        // Start with paragraph boundaries
467        let para_chunks = self.chunk_by_paragraphs(text);
468
469        // Further split large paragraphs by sentences
470        let mut final_chunks = Vec::new();
471
472        for chunk in para_chunks {
473            if chunk.text.len() > self.config.max_size {
474                // Split by sentences
475                let mut temp_config = self.config.clone();
476                temp_config.strategy = ChunkingStrategy::Sentence;
477                let sub_chunker = SemanticChunker::new(temp_config);
478                let sub_chunks = sub_chunker.chunk(&chunk.text);
479                final_chunks.extend(sub_chunks);
480            } else {
481                final_chunks.push(chunk);
482            }
483        }
484
485        final_chunks
486    }
487
488    /// Split text into sentences (simple heuristic)
489    fn split_sentences(&self, text: &str) -> Vec<String> {
490        let mut sentences = Vec::new();
491        let mut current = String::new();
492
493        for c in text.chars() {
494            current.push(c);
495
496            // Simple sentence boundary detection
497            if matches!(c, '.' | '!' | '?') {
498                if let Some(next) = current.chars().last() {
499                    if next.is_whitespace() || !current.trim().is_empty() {
500                        sentences.push(current.trim().to_string());
501                        current = String::new();
502                    }
503                }
504            }
505        }
506
507        // Add remaining text
508        if !current.trim().is_empty() {
509            sentences.push(current.trim().to_string());
510        }
511
512        sentences
513    }
514
515    /// Count sentences in text
516    fn count_sentences(&self, text: &str) -> usize {
517        text.chars()
518            .filter(|c| matches!(c, '.' | '!' | '?'))
519            .count()
520    }
521
522    /// Count paragraphs in text
523    fn count_paragraphs(&self, text: &str) -> usize {
524        text.split("\n\n")
525            .filter(|p| !p.trim().is_empty())
526            .count()
527            .max(1)
528    }
529
530    /// Calculate lexical cohesion between two texts (word overlap)
531    fn lexical_cohesion(&self, text1: &str, text2: &str) -> f32 {
532        let text1_lower = text1.to_lowercase();
533        let words1: std::collections::HashSet<_> = text1_lower.split_whitespace().collect();
534
535        let text2_lower = text2.to_lowercase();
536        let words2: std::collections::HashSet<_> = text2_lower.split_whitespace().collect();
537
538        let intersection = words1.intersection(&words2).count();
539        let union = words1.union(&words2).count();
540
541        if union == 0 {
542            0.0
543        } else {
544            intersection as f32 / union as f32
545        }
546    }
547
548    /// Calculate coherence score for a chunk (based on word overlap)
549    fn calculate_coherence(&self, sentences: &[String]) -> f32 {
550        if sentences.len() < 2 {
551            return 1.0;
552        }
553
554        let mut total_cohesion = 0.0;
555        for window in sentences.windows(2) {
556            total_cohesion += self.lexical_cohesion(&window[0], &window[1]);
557        }
558
559        total_cohesion / (sentences.len() - 1) as f32
560    }
561}
562
563/// Chunking statistics
564#[derive(Debug, Clone, Serialize, Deserialize)]
565pub struct ChunkingStats {
566    /// Total chunks created
567    pub total_chunks: usize,
568    /// Average chunk size (characters)
569    pub avg_chunk_size: f32,
570    /// Minimum chunk size
571    pub min_chunk_size: usize,
572    /// Maximum chunk size
573    pub max_chunk_size: usize,
574    /// Average coherence score
575    pub avg_coherence: f32,
576    /// Average sentences per chunk
577    pub avg_sentences_per_chunk: f32,
578}
579
580impl ChunkingStats {
581    /// Calculate statistics from chunks
582    pub fn from_chunks(chunks: &[SemanticChunk]) -> Self {
583        if chunks.is_empty() {
584            return Self {
585                total_chunks: 0,
586                avg_chunk_size: 0.0,
587                min_chunk_size: 0,
588                max_chunk_size: 0,
589                avg_coherence: 0.0,
590                avg_sentences_per_chunk: 0.0,
591            };
592        }
593
594        let total_chunks = chunks.len();
595        let sizes: Vec<usize> = chunks.iter().map(|c| c.text.len()).collect();
596        let avg_chunk_size = sizes.iter().sum::<usize>() as f32 / total_chunks as f32;
597        let min_chunk_size = *sizes.iter().min().unwrap();
598        let max_chunk_size = *sizes.iter().max().unwrap();
599
600        let avg_coherence = chunks.iter().map(|c| c.coherence).sum::<f32>() / total_chunks as f32;
601        let avg_sentences_per_chunk =
602            chunks.iter().map(|c| c.sentence_count).sum::<usize>() as f32 / total_chunks as f32;
603
604        Self {
605            total_chunks,
606            avg_chunk_size,
607            min_chunk_size,
608            max_chunk_size,
609            avg_coherence,
610            avg_sentences_per_chunk,
611        }
612    }
613}
614
#[cfg(test)]
mod tests {
    use super::*;

    // Two paragraphs, six sentences in total; exercises both sentence and
    // paragraph boundary handling. (The `\` line continuations strip the
    // following line's leading whitespace, so the text has no extra spaces.)
    const TEST_TEXT: &str = "This is the first sentence. This is the second sentence. \
                              This is the third sentence.\n\n\
                              This is a new paragraph with different content. \
                              It has multiple sentences too. \
                              And here is another one.";

    // Fixed-size chunks must never exceed the configured max size.
    #[test]
    fn test_fixed_size_chunking() {
        let config = ChunkingConfig {
            strategy: ChunkingStrategy::FixedSize,
            target_size: 50,
            min_size: 10,
            max_size: 100,
            overlap: 10,
            similarity_threshold: 0.7,
        };

        let chunker = SemanticChunker::new(config);
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.text.len() <= 100);
        }
    }

    // Sentence chunks must contain at least one sentence and respect min_size.
    #[test]
    fn test_sentence_chunking() {
        let config = ChunkingConfig {
            strategy: ChunkingStrategy::Sentence,
            target_size: 100,
            min_size: 20,
            max_size: 200,
            overlap: 20,
            similarity_threshold: 0.7,
        };

        let chunker = SemanticChunker::new(config);
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.sentence_count > 0);
            assert!(chunk.text.len() >= 20);
        }
    }

    // Paragraph chunks always report at least one paragraph
    // (count_paragraphs clamps to a minimum of 1).
    #[test]
    fn test_paragraph_chunking() {
        let config = ChunkingConfig {
            strategy: ChunkingStrategy::Paragraph,
            target_size: 100,
            min_size: 20,
            max_size: 500,
            overlap: 0,
            similarity_threshold: 0.7,
        };

        let chunker = SemanticChunker::new(config);
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.paragraph_count > 0);
        }
    }

    // Topic chunking is heuristic; with a low threshold it should still
    // produce at least one chunk for this text.
    #[test]
    fn test_topic_chunking() {
        let config = ChunkingConfig {
            strategy: ChunkingStrategy::Topic,
            target_size: 100,
            min_size: 20,
            max_size: 300,
            overlap: 0,
            similarity_threshold: 0.3,
        };

        let chunker = SemanticChunker::new(config);
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
    }

    // Hybrid = paragraph split followed by sentence re-split of large chunks.
    #[test]
    fn test_hybrid_chunking() {
        let config = ChunkingConfig {
            strategy: ChunkingStrategy::Hybrid,
            target_size: 100,
            min_size: 20,
            max_size: 150,
            overlap: 10,
            similarity_threshold: 0.7,
        };

        let chunker = SemanticChunker::new(config);
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
    }

    // Stats must be internally consistent with the chunks they summarize.
    #[test]
    fn test_chunking_stats() {
        let chunker = SemanticChunker::default_config();
        let chunks = chunker.chunk(TEST_TEXT);
        let stats = ChunkingStats::from_chunks(&chunks);

        assert_eq!(stats.total_chunks, chunks.len());
        assert!(stats.avg_chunk_size > 0.0);
        assert!(stats.avg_coherence >= 0.0 && stats.avg_coherence <= 1.0);
    }

    // The heuristic splitter breaks at every '.', '!' and '?' and trims
    // surrounding whitespace.
    #[test]
    fn test_sentence_splitting() {
        let chunker = SemanticChunker::default_config();
        let sentences = chunker.split_sentences("Hello world. How are you? I am fine!");

        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world.");
        assert_eq!(sentences[1], "How are you?");
        assert_eq!(sentences[2], "I am fine!");
    }

    // Jaccard word overlap: shared words raise the score; unrelated text
    // scores lower than related text.
    #[test]
    fn test_lexical_cohesion() {
        let chunker = SemanticChunker::default_config();

        let cohesion1 =
            chunker.lexical_cohesion("The cat sat on the mat", "The cat was very happy");
        assert!(cohesion1 > 0.0);

        let cohesion2 =
            chunker.lexical_cohesion("The cat sat on the mat", "Quantum physics is complex");
        assert!(cohesion2 < cohesion1);
    }
}
754}