infiniloom_engine/embedding/
hierarchy.rs

1//! Hierarchical chunking for improved RAG recall
2//!
3//! This module provides hierarchical chunking that creates summary chunks for
4//! container types (classes, structs, modules) with references to their children.
5//! This enables RAG systems to retrieve both high-level overviews and specific
6//! implementation details.
7//!
8//! # Hierarchy Levels
9//!
10//! 1. **Container Summary**: Class/struct with docstring, signature, and child list
11//! 2. **Child Chunks**: Individual methods, fields, nested types
12//!
13//! # Example Output
14//!
15//! For a class `UserService`:
16//! - Summary chunk: Contains class docstring, signature, and list of method names
17//! - Method chunks: Individual `get_user()`, `create_user()`, etc.
18//!
19//! # Usage
20//!
21//! ```rust,ignore
22//! use infiniloom_engine::embedding::hierarchy::HierarchyBuilder;
23//!
24//! let builder = HierarchyBuilder::new();
25//! let hierarchical_chunks = builder.build_hierarchy(&chunks);
26//! ```
27
28use std::collections::HashMap;
29
30use serde::{Deserialize, Serialize};
31
32use super::hasher::hash_content;
33use super::types::{ChunkContext, ChunkKind, ChunkSource, EmbedChunk};
34
35/// Configuration for hierarchical chunking
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct HierarchyConfig {
38    /// Generate summary chunks for classes
39    pub summarize_classes: bool,
40
41    /// Generate summary chunks for structs
42    pub summarize_structs: bool,
43
44    /// Generate summary chunks for modules
45    pub summarize_modules: bool,
46
47    /// Minimum number of children to generate a summary
48    pub min_children_for_summary: usize,
49
50    /// Include child signatures in summary
51    pub include_child_signatures: bool,
52
53    /// Maximum number of children to list in summary
54    pub max_children_in_summary: usize,
55}
56
57impl Default for HierarchyConfig {
58    fn default() -> Self {
59        Self {
60            summarize_classes: true,
61            summarize_structs: true,
62            summarize_modules: false, // Often too broad
63            min_children_for_summary: 2,
64            include_child_signatures: true,
65            max_children_in_summary: 20,
66        }
67    }
68}
69
70/// Reference to a child chunk
71#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct ChildReference {
73    /// Child chunk ID
74    pub id: String,
75
76    /// Child symbol name
77    pub name: String,
78
79    /// Child kind (method, field, etc.)
80    pub kind: ChunkKind,
81
82    /// Child signature (optional)
83    #[serde(skip_serializing_if = "Option::is_none")]
84    pub signature: Option<String>,
85
86    /// Brief description from docstring (first line)
87    #[serde(skip_serializing_if = "Option::is_none")]
88    pub brief: Option<String>,
89}
90
91/// Summary chunk for a container type (class, struct, module)
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct HierarchySummary {
94    /// The container chunk this summarizes
95    pub container_id: String,
96
97    /// Container symbol name
98    pub container_name: String,
99
100    /// Container kind
101    pub container_kind: ChunkKind,
102
103    /// List of child references
104    pub children: Vec<ChildReference>,
105
106    /// Total number of children (may exceed max_children_in_summary)
107    pub total_children: usize,
108}
109
110/// Builder for hierarchical chunks
111pub struct HierarchyBuilder {
112    config: HierarchyConfig,
113}
114
115impl Default for HierarchyBuilder {
116    fn default() -> Self {
117        Self::new()
118    }
119}
120
121impl HierarchyBuilder {
122    /// Create a new hierarchy builder with default config
123    pub fn new() -> Self {
124        Self {
125            config: HierarchyConfig::default(),
126        }
127    }
128
129    /// Create with custom configuration
130    pub fn with_config(config: HierarchyConfig) -> Self {
131        Self { config }
132    }
133
134    /// Build hierarchy from chunks, returning summary chunks
135    ///
136    /// This identifies parent-child relationships and generates summary chunks
137    /// for containers that meet the threshold.
138    pub fn build_hierarchy(&self, chunks: &[EmbedChunk]) -> Vec<EmbedChunk> {
139        // Group children by parent
140        let mut parent_children: HashMap<String, Vec<&EmbedChunk>> = HashMap::new();
141
142        // Find all chunks that have a parent
143        for chunk in chunks {
144            if let Some(ref parent) = chunk.source.parent {
145                let key = format!("{}:{}", chunk.source.file, parent);
146                parent_children.entry(key).or_default().push(chunk);
147            }
148        }
149
150        // Find container chunks and create summaries
151        let mut summaries = Vec::new();
152
153        for chunk in chunks {
154            if !self.should_summarize(&chunk.kind) {
155                continue;
156            }
157
158            let key = format!("{}:{}", chunk.source.file, chunk.source.symbol);
159            let children = parent_children.get(&key);
160
161            if let Some(children) = children {
162                if children.len() >= self.config.min_children_for_summary {
163                    if let Some(summary) = self.create_summary_chunk(chunk, children) {
164                        summaries.push(summary);
165                    }
166                }
167            }
168        }
169
170        summaries
171    }
172
173    /// Check if a chunk kind should have a summary
174    fn should_summarize(&self, kind: &ChunkKind) -> bool {
175        match kind {
176            ChunkKind::Class => self.config.summarize_classes,
177            ChunkKind::Struct => self.config.summarize_structs,
178            ChunkKind::Module => self.config.summarize_modules,
179            ChunkKind::Interface | ChunkKind::Trait => self.config.summarize_classes,
180            _ => false,
181        }
182    }
183
184    /// Create a summary chunk for a container
185    fn create_summary_chunk(
186        &self,
187        container: &EmbedChunk,
188        children: &[&EmbedChunk],
189    ) -> Option<EmbedChunk> {
190        // Build child references
191        let mut child_refs: Vec<ChildReference> = children
192            .iter()
193            .take(self.config.max_children_in_summary)
194            .map(|child| ChildReference {
195                id: child.id.clone(),
196                name: child.source.symbol.clone(),
197                kind: child.kind,
198                signature: if self.config.include_child_signatures {
199                    child.context.signature.clone()
200                } else {
201                    None
202                },
203                brief: child.context.docstring.as_ref().and_then(|d| {
204                    d.lines().next().map(|s| {
205                        let s = s.trim();
206                        if s.len() > 100 {
207                            format!("{}...", &s[..97])
208                        } else {
209                            s.to_string()
210                        }
211                    })
212                }),
213            })
214            .collect();
215
216        // Sort by name for determinism
217        child_refs.sort_by(|a, b| a.name.cmp(&b.name));
218
219        // Build summary content
220        let summary_content = self.build_summary_content(container, &child_refs, children.len());
221
222        // Hash the summary content
223        let hash = hash_content(&summary_content);
224
225        // Create tags for the summary
226        let mut tags = vec!["summary".to_string(), "hierarchy".to_string()];
227        tags.extend(container.context.tags.iter().cloned());
228
229        Some(EmbedChunk {
230            id: hash.short_id,
231            full_hash: hash.full_hash,
232            content: summary_content,
233            tokens: 0, // Will be computed by caller if needed
234            kind: container.kind,
235            source: ChunkSource {
236                repo: container.source.repo.clone(),
237                file: container.source.file.clone(),
238                lines: container.source.lines,
239                symbol: format!("{}_summary", container.source.symbol),
240                fqn: container.source.fqn.as_ref().map(|f| format!("{}_summary", f)),
241                language: container.source.language.clone(),
242                parent: container.source.parent.clone(),
243                visibility: container.source.visibility,
244                is_test: container.source.is_test,
245            },
246            context: ChunkContext {
247                docstring: container.context.docstring.clone(),
248                comments: Vec::new(),
249                signature: container.context.signature.clone(),
250                calls: Vec::new(), // Summary doesn't have direct calls
251                called_by: Vec::new(),
252                imports: container.context.imports.clone(),
253                tags,
254                lines_of_code: 0,
255                max_nesting_depth: 0,
256            },
257            part: None,
258        })
259    }
260
261    /// Build the summary content string
262    fn build_summary_content(
263        &self,
264        container: &EmbedChunk,
265        child_refs: &[ChildReference],
266        total_children: usize,
267    ) -> String {
268        let mut content = String::new();
269
270        // Add container signature if available
271        if let Some(ref sig) = container.context.signature {
272            content.push_str(sig);
273            content.push('\n');
274        }
275
276        // Add docstring if available
277        if let Some(ref doc) = container.context.docstring {
278            content.push('\n');
279            content.push_str(doc);
280            content.push('\n');
281        }
282
283        // Add child list
284        content.push_str("\n/* Members:\n");
285
286        for child in child_refs {
287            content.push_str(" * - ");
288            content.push_str(&child.name);
289
290            if let Some(ref sig) = child.signature {
291                // Compact signature (remove body, keep just declaration)
292                let sig_line = sig.lines().next().unwrap_or(sig).trim();
293                if sig_line != child.name {
294                    content.push_str(": ");
295                    content.push_str(sig_line);
296                }
297            }
298
299            if let Some(ref brief) = child.brief {
300                content.push_str(" - ");
301                content.push_str(brief);
302            }
303
304            content.push('\n');
305        }
306
307        if total_children > child_refs.len() {
308            content.push_str(&format!(
309                " * ... and {} more\n",
310                total_children - child_refs.len()
311            ));
312        }
313
314        content.push_str(" */\n");
315
316        content
317    }
318
319    /// Enrich existing chunks with hierarchy metadata
320    ///
321    /// This adds `hierarchy` information to chunk context tags
322    pub fn enrich_chunks(&self, chunks: &mut [EmbedChunk]) {
323        // Build parent -> children map
324        let mut parent_children: HashMap<String, Vec<String>> = HashMap::new();
325
326        for chunk in chunks.iter() {
327            if let Some(ref parent) = chunk.source.parent {
328                let key = format!("{}:{}", chunk.source.file, parent);
329                parent_children
330                    .entry(key)
331                    .or_default()
332                    .push(chunk.source.symbol.clone());
333            }
334        }
335
336        // Enrich container chunks with child count
337        for chunk in chunks.iter_mut() {
338            let key = format!("{}:{}", chunk.source.file, chunk.source.symbol);
339            if let Some(children) = parent_children.get(&key) {
340                chunk
341                    .context
342                    .tags
343                    .push(format!("has-children:{}", children.len()));
344            }
345
346            // Mark chunks that have a parent
347            if chunk.source.parent.is_some() {
348                chunk.context.tags.push("has-parent".to_string());
349            }
350        }
351    }
352}
353
354/// Get the hierarchy summary for a specific container
355pub fn get_hierarchy_summary(
356    chunks: &[EmbedChunk],
357    container_symbol: &str,
358    file: &str,
359) -> Option<HierarchySummary> {
360    // Find the container
361    let container = chunks
362        .iter()
363        .find(|c| c.source.symbol == container_symbol && c.source.file == file)?;
364
365    // Find children
366    let children: Vec<ChildReference> = chunks
367        .iter()
368        .filter(|c| {
369            c.source.parent.as_deref() == Some(container_symbol) && c.source.file == file
370        })
371        .map(|c| ChildReference {
372            id: c.id.clone(),
373            name: c.source.symbol.clone(),
374            kind: c.kind,
375            signature: c.context.signature.clone(),
376            brief: c.context.docstring.as_ref().and_then(|d| {
377                d.lines().next().map(|s| s.trim().to_string())
378            }),
379        })
380        .collect();
381
382    Some(HierarchySummary {
383        container_id: container.id.clone(),
384        container_name: container.source.symbol.clone(),
385        container_kind: container.kind,
386        total_children: children.len(),
387        children,
388    })
389}
390
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::types::{RepoIdentifier, Visibility};

    /// Fabricate a chunk located in `test.rs` with fixed metadata; only the
    /// id, symbol, kind, parent link, signature and docstring vary.
    fn create_test_chunk(
        id: &str,
        symbol: &str,
        kind: ChunkKind,
        parent: Option<&str>,
        signature: Option<&str>,
        docstring: Option<&str>,
    ) -> EmbedChunk {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: "test.rs".to_string(),
            lines: (1, 10),
            symbol: symbol.to_string(),
            fqn: Some(format!("test::{}", symbol)),
            language: "Rust".to_string(),
            parent: parent.map(String::from),
            visibility: Visibility::Public,
            is_test: false,
        };

        let context = ChunkContext {
            docstring: docstring.map(String::from),
            comments: Vec::new(),
            signature: signature.map(String::from),
            calls: Vec::new(),
            called_by: Vec::new(),
            imports: Vec::new(),
            tags: Vec::new(),
            lines_of_code: 10,
            max_nesting_depth: 2,
        };

        EmbedChunk {
            id: id.to_string(),
            full_hash: format!("{}_full", id),
            content: format!("content of {}", symbol),
            tokens: 100,
            kind,
            source,
            context,
            part: None,
        }
    }

    /// Fixture for a method chunk that belongs to `parent`.
    fn method(
        id: &str,
        symbol: &str,
        parent: &str,
        signature: Option<&str>,
        docstring: Option<&str>,
    ) -> EmbedChunk {
        create_test_chunk(
            id,
            symbol,
            ChunkKind::Method,
            Some(parent),
            signature,
            docstring,
        )
    }

    /// Fixture for a parentless chunk with neither signature nor docstring.
    fn bare_container(id: &str, symbol: &str, kind: ChunkKind) -> EmbedChunk {
        create_test_chunk(id, symbol, kind, None, None, None)
    }

    #[test]
    fn test_build_hierarchy_basic() {
        let service = create_test_chunk(
            "c1",
            "UserService",
            ChunkKind::Class,
            None,
            Some("class UserService"),
            Some("Service for user management"),
        );
        let chunks = vec![
            service,
            method(
                "c2",
                "get_user",
                "UserService",
                Some("fn get_user(&self, id: u64) -> User"),
                Some("Get a user by ID"),
            ),
            method(
                "c3",
                "create_user",
                "UserService",
                Some("fn create_user(&self, data: UserData) -> User"),
                Some("Create a new user"),
            ),
            method(
                "c4",
                "delete_user",
                "UserService",
                Some("fn delete_user(&self, id: u64)"),
                Some("Delete a user"),
            ),
        ];

        let summaries = HierarchyBuilder::new().build_hierarchy(&chunks);

        assert_eq!(summaries.len(), 1);
        let summary = &summaries[0];
        assert!(summary.source.symbol.contains("summary"));
        for expected in &["Members:", "get_user", "create_user", "delete_user"] {
            assert!(summary.content.contains(*expected));
        }
    }

    #[test]
    fn test_hierarchy_min_children() {
        let chunks = vec![
            create_test_chunk(
                "c1",
                "SmallClass",
                ChunkKind::Class,
                None,
                Some("class SmallClass"),
                None,
            ),
            method("c2", "only_method", "SmallClass", None, None),
        ];

        // The threshold of two children is not met by a single method.
        let config = HierarchyConfig {
            min_children_for_summary: 2,
            ..Default::default()
        };
        let summaries = HierarchyBuilder::with_config(config).build_hierarchy(&chunks);

        assert!(summaries.is_empty());
    }

    #[test]
    fn test_hierarchy_enrich_chunks() {
        let mut chunks = vec![
            bare_container("c1", "MyClass", ChunkKind::Class),
            method("c2", "method1", "MyClass", None, None),
            method("c3", "method2", "MyClass", None, None),
        ];

        HierarchyBuilder::new().enrich_chunks(&mut chunks);

        // The container is tagged with its child count.
        let container_tagged = chunks[0]
            .context
            .tags
            .iter()
            .any(|tag| tag.starts_with("has-children:"));
        assert!(container_tagged);

        // Both methods are tagged as having a parent.
        for child in &chunks[1..] {
            assert!(child.context.tags.contains(&"has-parent".to_string()));
        }
    }

    #[test]
    fn test_get_hierarchy_summary() {
        let chunks = vec![
            create_test_chunk(
                "c1",
                "MyStruct",
                ChunkKind::Struct,
                None,
                Some("struct MyStruct"),
                None,
            ),
            create_test_chunk(
                "c2",
                "field1",
                ChunkKind::Variable,
                Some("MyStruct"),
                None,
                None,
            ),
            create_test_chunk(
                "c3",
                "new",
                ChunkKind::Function,
                Some("MyStruct"),
                Some("fn new() -> Self"),
                Some("Create a new instance"),
            ),
        ];

        let lookup = get_hierarchy_summary(&chunks, "MyStruct", "test.rs");
        assert!(lookup.is_some());
        let summary = lookup.unwrap();

        assert_eq!(summary.container_name, "MyStruct");
        assert_eq!(summary.total_children, 2);

        let child_names: Vec<&str> = summary.children.iter().map(|c| c.name.as_str()).collect();
        assert!(child_names.contains(&"field1"));
        assert!(child_names.contains(&"new"));
    }

    #[test]
    fn test_summary_content_format() {
        let chunks = vec![
            create_test_chunk(
                "c1",
                "Calculator",
                ChunkKind::Class,
                None,
                Some("pub struct Calculator"),
                Some("A simple calculator"),
            ),
            method(
                "c2",
                "add",
                "Calculator",
                Some("fn add(&self, a: i32, b: i32) -> i32"),
                Some("Add two numbers"),
            ),
            method(
                "c3",
                "subtract",
                "Calculator",
                Some("fn subtract(&self, a: i32, b: i32) -> i32"),
                Some("Subtract two numbers"),
            ),
        ];

        let summaries = HierarchyBuilder::new().build_hierarchy(&chunks);
        assert_eq!(summaries.len(), 1);

        // Every structural piece of the rendered summary must be present.
        let content = &summaries[0].content;
        let expected_fragments = [
            "pub struct Calculator",
            "A simple calculator",
            "/* Members:",
            " * - add",
            " * - subtract",
            " */",
        ];
        for fragment in &expected_fragments {
            assert!(content.contains(*fragment));
        }
    }

    #[test]
    fn test_config_options() {
        let builder = HierarchyBuilder::with_config(HierarchyConfig {
            summarize_classes: true,
            summarize_structs: false,
            summarize_modules: false,
            min_children_for_summary: 1,
            include_child_signatures: false,
            max_children_in_summary: 5,
        });

        let class_chunks = vec![
            bare_container("c1", "MyClass", ChunkKind::Class),
            method("c2", "m1", "MyClass", None, None),
        ];
        let struct_chunks = vec![
            bare_container("s1", "MyStruct", ChunkKind::Struct),
            create_test_chunk("s2", "f1", ChunkKind::Variable, Some("MyStruct"), None, None),
        ];

        // Classes are enabled, so the class gets a summary.
        assert_eq!(builder.build_hierarchy(&class_chunks).len(), 1);

        // Structs are disabled, so the struct gets none.
        assert_eq!(builder.build_hierarchy(&struct_chunks).len(), 0);
    }
}
663}