// infiniloom_engine/embedding/hierarchy.rs

//! Hierarchical chunking for improved RAG recall
//!
//! This module provides hierarchical chunking that creates summary chunks for
//! container types (classes, structs, modules) with references to their children.
//! This enables RAG systems to retrieve both high-level overviews and specific
//! implementation details.
//!
//! # Hierarchy Levels
//!
//! 1. **Container Summary**: Class/struct with docstring, signature, and child list
//! 2. **Child Chunks**: Individual methods, fields, nested types
//!
//! # Example Output
//!
//! For a class `UserService`:
//! - Summary chunk: Contains class docstring, signature, and list of method names
//! - Method chunks: Individual `get_user()`, `create_user()`, etc.
//!
//! # Usage
//!
//! ```rust,ignore
//! use infiniloom_engine::embedding::hierarchy::HierarchyBuilder;
//!
//! let builder = HierarchyBuilder::new();
//! // Returns only the newly generated summary chunks; the input chunks are not modified.
//! let summary_chunks = builder.build_hierarchy(&chunks);
//! ```
27
28use std::collections::HashMap;
29
30use serde::{Deserialize, Serialize};
31
32use super::hasher::hash_content;
33use super::types::{ChunkContext, ChunkKind, ChunkSource, EmbedChunk};
34
35/// Configuration for hierarchical chunking
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct HierarchyConfig {
38    /// Generate summary chunks for classes
39    pub summarize_classes: bool,
40
41    /// Generate summary chunks for structs
42    pub summarize_structs: bool,
43
44    /// Generate summary chunks for modules
45    pub summarize_modules: bool,
46
47    /// Minimum number of children to generate a summary
48    pub min_children_for_summary: usize,
49
50    /// Include child signatures in summary
51    pub include_child_signatures: bool,
52
53    /// Maximum number of children to list in summary
54    pub max_children_in_summary: usize,
55}
56
57impl Default for HierarchyConfig {
58    fn default() -> Self {
59        Self {
60            summarize_classes: true,
61            summarize_structs: true,
62            summarize_modules: false, // Often too broad
63            min_children_for_summary: 2,
64            include_child_signatures: true,
65            max_children_in_summary: 20,
66        }
67    }
68}
69
70/// Reference to a child chunk
71#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct ChildReference {
73    /// Child chunk ID
74    pub id: String,
75
76    /// Child symbol name
77    pub name: String,
78
79    /// Child kind (method, field, etc.)
80    pub kind: ChunkKind,
81
82    /// Child signature (optional)
83    #[serde(skip_serializing_if = "Option::is_none")]
84    pub signature: Option<String>,
85
86    /// Brief description from docstring (first line)
87    #[serde(skip_serializing_if = "Option::is_none")]
88    pub brief: Option<String>,
89}
90
91/// Summary chunk for a container type (class, struct, module)
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct HierarchySummary {
94    /// The container chunk this summarizes
95    pub container_id: String,
96
97    /// Container symbol name
98    pub container_name: String,
99
100    /// Container kind
101    pub container_kind: ChunkKind,
102
103    /// List of child references
104    pub children: Vec<ChildReference>,
105
106    /// Total number of children (may exceed max_children_in_summary)
107    pub total_children: usize,
108}
109
110/// Builder for hierarchical chunks
111pub struct HierarchyBuilder {
112    config: HierarchyConfig,
113}
114
115impl Default for HierarchyBuilder {
116    fn default() -> Self {
117        Self::new()
118    }
119}
120
121impl HierarchyBuilder {
122    /// Create a new hierarchy builder with default config
123    pub fn new() -> Self {
124        Self { config: HierarchyConfig::default() }
125    }
126
127    /// Create with custom configuration
128    pub fn with_config(config: HierarchyConfig) -> Self {
129        Self { config }
130    }
131
132    /// Build hierarchy from chunks, returning summary chunks
133    ///
134    /// This identifies parent-child relationships and generates summary chunks
135    /// for containers that meet the threshold.
136    pub fn build_hierarchy(&self, chunks: &[EmbedChunk]) -> Vec<EmbedChunk> {
137        // Group children by parent
138        let mut parent_children: HashMap<String, Vec<&EmbedChunk>> = HashMap::new();
139
140        // Find all chunks that have a parent
141        for chunk in chunks {
142            if let Some(ref parent) = chunk.source.parent {
143                let key = format!("{}:{}", chunk.source.file, parent);
144                parent_children.entry(key).or_default().push(chunk);
145            }
146        }
147
148        // Find container chunks and create summaries
149        let mut summaries = Vec::new();
150
151        for chunk in chunks {
152            if !self.should_summarize(&chunk.kind) {
153                continue;
154            }
155
156            let key = format!("{}:{}", chunk.source.file, chunk.source.symbol);
157            let children = parent_children.get(&key);
158
159            if let Some(children) = children {
160                if children.len() >= self.config.min_children_for_summary {
161                    if let Some(summary) = self.create_summary_chunk(chunk, children) {
162                        summaries.push(summary);
163                    }
164                }
165            }
166        }
167
168        summaries
169    }
170
171    /// Check if a chunk kind should have a summary
172    fn should_summarize(&self, kind: &ChunkKind) -> bool {
173        match kind {
174            ChunkKind::Class => self.config.summarize_classes,
175            ChunkKind::Struct => self.config.summarize_structs,
176            ChunkKind::Module => self.config.summarize_modules,
177            ChunkKind::Interface | ChunkKind::Trait => self.config.summarize_classes,
178            _ => false,
179        }
180    }
181
182    /// Create a summary chunk for a container
183    fn create_summary_chunk(
184        &self,
185        container: &EmbedChunk,
186        children: &[&EmbedChunk],
187    ) -> Option<EmbedChunk> {
188        // Build child references
189        let mut child_refs: Vec<ChildReference> = children
190            .iter()
191            .take(self.config.max_children_in_summary)
192            .map(|child| ChildReference {
193                id: child.id.clone(),
194                name: child.source.symbol.clone(),
195                kind: child.kind,
196                signature: if self.config.include_child_signatures {
197                    child.context.signature.clone()
198                } else {
199                    None
200                },
201                brief: child.context.docstring.as_ref().and_then(|d| {
202                    d.lines().next().map(|s| {
203                        let s = s.trim();
204                        if s.len() > 100 {
205                            format!("{}...", &s[..97])
206                        } else {
207                            s.to_owned()
208                        }
209                    })
210                }),
211            })
212            .collect();
213
214        // Sort by name for determinism
215        child_refs.sort_by(|a, b| a.name.cmp(&b.name));
216
217        // Build summary content
218        let summary_content = self.build_summary_content(container, &child_refs, children.len());
219
220        // Hash the summary content
221        let hash = hash_content(&summary_content);
222
223        // Create tags for the summary
224        let mut tags = vec!["summary".to_owned(), "hierarchy".to_owned()];
225        tags.extend(container.context.tags.iter().cloned());
226
227        Some(EmbedChunk {
228            id: hash.short_id,
229            full_hash: hash.full_hash,
230            content: summary_content,
231            tokens: 0, // Will be computed by caller if needed
232            kind: container.kind,
233            source: ChunkSource {
234                repo: container.source.repo.clone(),
235                file: container.source.file.clone(),
236                lines: container.source.lines,
237                symbol: format!("{}_summary", container.source.symbol),
238                fqn: container
239                    .source
240                    .fqn
241                    .as_ref()
242                    .map(|f| format!("{}_summary", f)),
243                language: container.source.language.clone(),
244                parent: container.source.parent.clone(),
245                visibility: container.source.visibility,
246                is_test: container.source.is_test,
247                module_path: container.source.module_path.clone(),
248                parent_chunk_id: None,
249            },
250            children_ids: Vec::new(),
251            context: ChunkContext {
252                docstring: container.context.docstring.clone(),
253                comments: Vec::new(),
254                signature: container.context.signature.clone(),
255                calls: Vec::new(), // Summary doesn't have direct calls
256                called_by: Vec::new(),
257                imports: container.context.imports.clone(),
258                tags,
259                keywords: container.context.keywords.clone(),
260                context_prefix: container.context.context_prefix.clone(),
261                summary: None,
262                qualified_calls: Vec::new(),
263                unresolved_calls: Vec::new(),
264                identifiers: container.context.identifiers.clone(),
265                type_signature: None,
266                parameter_types: Vec::new(),
267                return_type: None,
268                error_types: Vec::new(),
269                lines_of_code: 0,
270                max_nesting_depth: 0,
271                git: container.context.git.clone(),
272                complexity_score: None,
273                dependents_count: None,
274            },
275            repr: "code".to_owned(),
276            code_chunk_id: None,
277            part: None,
278        })
279    }
280
281    /// Build the summary content string
282    fn build_summary_content(
283        &self,
284        container: &EmbedChunk,
285        child_refs: &[ChildReference],
286        total_children: usize,
287    ) -> String {
288        let mut content = String::new();
289
290        // Add container signature if available
291        if let Some(ref sig) = container.context.signature {
292            content.push_str(sig);
293            content.push('\n');
294        }
295
296        // Add docstring if available
297        if let Some(ref doc) = container.context.docstring {
298            content.push('\n');
299            content.push_str(doc);
300            content.push('\n');
301        }
302
303        // Add child list
304        content.push_str("\n/* Members:\n");
305
306        for child in child_refs {
307            content.push_str(" * - ");
308            content.push_str(&child.name);
309
310            if let Some(ref sig) = child.signature {
311                // Compact signature (remove body, keep just declaration)
312                let sig_line = sig.lines().next().unwrap_or(sig).trim();
313                if sig_line != child.name {
314                    content.push_str(": ");
315                    content.push_str(sig_line);
316                }
317            }
318
319            if let Some(ref brief) = child.brief {
320                content.push_str(" - ");
321                content.push_str(brief);
322            }
323
324            content.push('\n');
325        }
326
327        if total_children > child_refs.len() {
328            content.push_str(&format!(" * ... and {} more\n", total_children - child_refs.len()));
329        }
330
331        content.push_str(" */\n");
332
333        content
334    }
335
336    /// Enrich existing chunks with hierarchy metadata
337    ///
338    /// This adds `hierarchy` information to chunk context tags
339    pub fn enrich_chunks(&self, chunks: &mut [EmbedChunk]) {
340        // Build parent -> children map
341        let mut parent_children: HashMap<String, Vec<String>> = HashMap::new();
342
343        for chunk in chunks.iter() {
344            if let Some(ref parent) = chunk.source.parent {
345                let key = format!("{}:{}", chunk.source.file, parent);
346                parent_children
347                    .entry(key)
348                    .or_default()
349                    .push(chunk.source.symbol.clone());
350            }
351        }
352
353        // Enrich container chunks with child count
354        for chunk in chunks.iter_mut() {
355            let key = format!("{}:{}", chunk.source.file, chunk.source.symbol);
356            if let Some(children) = parent_children.get(&key) {
357                chunk
358                    .context
359                    .tags
360                    .push(format!("has-children:{}", children.len()));
361            }
362
363            // Mark chunks that have a parent
364            if chunk.source.parent.is_some() {
365                chunk.context.tags.push("has-parent".to_owned());
366            }
367        }
368    }
369}
370
371/// Get the hierarchy summary for a specific container
372pub fn get_hierarchy_summary(
373    chunks: &[EmbedChunk],
374    container_symbol: &str,
375    file: &str,
376) -> Option<HierarchySummary> {
377    // Find the container
378    let container = chunks
379        .iter()
380        .find(|c| c.source.symbol == container_symbol && c.source.file == file)?;
381
382    // Find children
383    let children: Vec<ChildReference> = chunks
384        .iter()
385        .filter(|c| c.source.parent.as_deref() == Some(container_symbol) && c.source.file == file)
386        .map(|c| ChildReference {
387            id: c.id.clone(),
388            name: c.source.symbol.clone(),
389            kind: c.kind,
390            signature: c.context.signature.clone(),
391            brief: c
392                .context
393                .docstring
394                .as_ref()
395                .and_then(|d| d.lines().next().map(|s| s.trim().to_owned())),
396        })
397        .collect();
398
399    Some(HierarchySummary {
400        container_id: container.id.clone(),
401        container_name: container.source.symbol.clone(),
402        container_kind: container.kind,
403        total_children: children.len(),
404        children,
405    })
406}
407
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::types::{RepoIdentifier, Visibility};

    /// Build a minimal chunk located in `test.rs` with the given identity,
    /// kind, optional parent symbol, signature and docstring.
    fn create_test_chunk(
        id: &str,
        symbol: &str,
        kind: ChunkKind,
        parent: Option<&str>,
        signature: Option<&str>,
        docstring: Option<&str>,
    ) -> EmbedChunk {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: "test.rs".to_owned(),
            lines: (1, 10),
            symbol: symbol.to_owned(),
            fqn: Some(format!("test::{}", symbol)),
            language: "Rust".to_owned(),
            parent: parent.map(String::from),
            visibility: Visibility::Public,
            is_test: false,
            module_path: None,
            parent_chunk_id: None,
        };

        let context = ChunkContext {
            docstring: docstring.map(String::from),
            comments: Vec::new(),
            signature: signature.map(String::from),
            calls: Vec::new(),
            called_by: Vec::new(),
            imports: Vec::new(),
            tags: Vec::new(),
            keywords: Vec::new(),
            context_prefix: None,
            summary: None,
            qualified_calls: Vec::new(),
            unresolved_calls: Vec::new(),
            identifiers: None,
            type_signature: None,
            parameter_types: Vec::new(),
            return_type: None,
            error_types: Vec::new(),
            lines_of_code: 10,
            max_nesting_depth: 2,
            git: None,
            complexity_score: None,
            dependents_count: None,
        };

        EmbedChunk {
            id: id.to_owned(),
            full_hash: format!("{}_full", id),
            content: format!("content of {}", symbol),
            tokens: 100,
            kind,
            source,
            children_ids: Vec::new(),
            context,
            repr: "code".to_string(),
            code_chunk_id: None,
            part: None,
        }
    }

    #[test]
    fn test_build_hierarchy_basic() {
        // (id, name, signature, docstring) for each method of `UserService`.
        let methods = [
            ("c2", "get_user", "fn get_user(&self, id: u64) -> User", "Get a user by ID"),
            (
                "c3",
                "create_user",
                "fn create_user(&self, data: UserData) -> User",
                "Create a new user",
            ),
            ("c4", "delete_user", "fn delete_user(&self, id: u64)", "Delete a user"),
        ];

        let mut chunks = vec![create_test_chunk(
            "c1",
            "UserService",
            ChunkKind::Class,
            None,
            Some("class UserService"),
            Some("Service for user management"),
        )];
        chunks.extend(methods.iter().map(|&(id, name, sig, doc)| {
            create_test_chunk(id, name, ChunkKind::Method, Some("UserService"), Some(sig), Some(doc))
        }));

        let summaries = HierarchyBuilder::new().build_hierarchy(&chunks);

        assert_eq!(summaries.len(), 1);
        let summary = &summaries[0];
        assert!(summary.source.symbol.contains("summary"));
        assert!(summary.content.contains("Members:"));
        for &(_, name, _, _) in &methods {
            assert!(summary.content.contains(name));
        }
    }

    #[test]
    fn test_hierarchy_min_children() {
        let container = create_test_chunk(
            "c1",
            "SmallClass",
            ChunkKind::Class,
            None,
            Some("class SmallClass"),
            None,
        );
        let lone_child =
            create_test_chunk("c2", "only_method", ChunkKind::Method, Some("SmallClass"), None, None);
        let chunks = vec![container, lone_child];

        // Requires at least 2 children.
        let config = HierarchyConfig { min_children_for_summary: 2, ..Default::default() };
        let summaries = HierarchyBuilder::with_config(config).build_hierarchy(&chunks);

        // Only 1 child, no summary.
        assert!(summaries.is_empty());
    }

    #[test]
    fn test_hierarchy_enrich_chunks() {
        let mut chunks = vec![
            create_test_chunk("c1", "MyClass", ChunkKind::Class, None, None, None),
            create_test_chunk("c2", "method1", ChunkKind::Method, Some("MyClass"), None, None),
            create_test_chunk("c3", "method2", ChunkKind::Method, Some("MyClass"), None, None),
        ];

        HierarchyBuilder::new().enrich_chunks(&mut chunks);

        // The container carries a child-count tag.
        let container_tags = &chunks[0].context.tags;
        assert!(container_tags.iter().any(|tag| tag.starts_with("has-children:")));

        // Both children carry the parent marker.
        let parent_tag = "has-parent".to_owned();
        for child in &chunks[1..] {
            assert!(child.context.tags.contains(&parent_tag));
        }
    }

    #[test]
    fn test_get_hierarchy_summary() {
        let chunks = vec![
            create_test_chunk(
                "c1",
                "MyStruct",
                ChunkKind::Struct,
                None,
                Some("struct MyStruct"),
                None,
            ),
            create_test_chunk("c2", "field1", ChunkKind::Variable, Some("MyStruct"), None, None),
            create_test_chunk(
                "c3",
                "new",
                ChunkKind::Function,
                Some("MyStruct"),
                Some("fn new() -> Self"),
                Some("Create a new instance"),
            ),
        ];

        let found = get_hierarchy_summary(&chunks, "MyStruct", "test.rs");
        assert!(found.is_some());

        let summary = found.unwrap();
        assert_eq!(summary.container_name, "MyStruct");
        assert_eq!(summary.total_children, 2);
        for expected in ["field1", "new"].iter() {
            assert!(summary.children.iter().any(|child| child.name == *expected));
        }
    }

    #[test]
    fn test_summary_content_format() {
        let chunks = vec![
            create_test_chunk(
                "c1",
                "Calculator",
                ChunkKind::Class,
                None,
                Some("pub struct Calculator"),
                Some("A simple calculator"),
            ),
            create_test_chunk(
                "c2",
                "add",
                ChunkKind::Method,
                Some("Calculator"),
                Some("fn add(&self, a: i32, b: i32) -> i32"),
                Some("Add two numbers"),
            ),
            create_test_chunk(
                "c3",
                "subtract",
                ChunkKind::Method,
                Some("Calculator"),
                Some("fn subtract(&self, a: i32, b: i32) -> i32"),
                Some("Subtract two numbers"),
            ),
        ];

        let summaries = HierarchyBuilder::new().build_hierarchy(&chunks);
        assert_eq!(summaries.len(), 1);

        // Every structural element of the summary text must be present.
        let content = &summaries[0].content;
        let expected_fragments = [
            "pub struct Calculator",
            "A simple calculator",
            "/* Members:",
            " * - add",
            " * - subtract",
            " */",
        ];
        for fragment in expected_fragments.iter() {
            assert!(content.contains(fragment));
        }
    }

    #[test]
    fn test_config_options() {
        let builder = HierarchyBuilder::with_config(HierarchyConfig {
            summarize_classes: true,
            summarize_structs: false,
            summarize_modules: false,
            min_children_for_summary: 1,
            include_child_signatures: false,
            max_children_in_summary: 5,
        });

        let class_chunks = vec![
            create_test_chunk("c1", "MyClass", ChunkKind::Class, None, None, None),
            create_test_chunk("c2", "m1", ChunkKind::Method, Some("MyClass"), None, None),
        ];
        let struct_chunks = vec![
            create_test_chunk("s1", "MyStruct", ChunkKind::Struct, None, None, None),
            create_test_chunk("s2", "f1", ChunkKind::Variable, Some("MyStruct"), None, None),
        ];

        // Classes are enabled, so the class gets a summary.
        assert_eq!(builder.build_hierarchy(&class_chunks).len(), 1);

        // Structs are disabled, so the struct gets none.
        assert_eq!(builder.build_hierarchy(&struct_chunks).len(), 0);
    }
}
672        assert_eq!(builder.build_hierarchy(&class_chunks).len(), 1);
673
674        // Struct should NOT get summary (disabled)
675        assert_eq!(builder.build_hierarchy(&struct_chunks).len(), 0);
676    }
677}