infiniloom_engine/embedding/
hierarchy.rs

1//! Hierarchical chunking for improved RAG recall
2//!
3//! This module provides hierarchical chunking that creates summary chunks for
4//! container types (classes, structs, modules) with references to their children.
5//! This enables RAG systems to retrieve both high-level overviews and specific
6//! implementation details.
7//!
8//! # Hierarchy Levels
9//!
10//! 1. **Container Summary**: Class/struct with docstring, signature, and child list
11//! 2. **Child Chunks**: Individual methods, fields, nested types
12//!
13//! # Example Output
14//!
15//! For a class `UserService`:
16//! - Summary chunk: Contains class docstring, signature, and list of method names
17//! - Method chunks: Individual `get_user()`, `create_user()`, etc.
18//!
19//! # Usage
20//!
21//! ```rust,ignore
22//! use infiniloom_engine::embedding::hierarchy::HierarchyBuilder;
23//!
24//! let builder = HierarchyBuilder::new();
25//! let hierarchical_chunks = builder.build_hierarchy(&chunks);
26//! ```
27
28use std::collections::HashMap;
29
30use serde::{Deserialize, Serialize};
31
32use super::hasher::hash_content;
33use super::types::{ChunkContext, ChunkKind, ChunkSource, EmbedChunk};
34
35/// Configuration for hierarchical chunking
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct HierarchyConfig {
38    /// Generate summary chunks for classes
39    pub summarize_classes: bool,
40
41    /// Generate summary chunks for structs
42    pub summarize_structs: bool,
43
44    /// Generate summary chunks for modules
45    pub summarize_modules: bool,
46
47    /// Minimum number of children to generate a summary
48    pub min_children_for_summary: usize,
49
50    /// Include child signatures in summary
51    pub include_child_signatures: bool,
52
53    /// Maximum number of children to list in summary
54    pub max_children_in_summary: usize,
55}
56
57impl Default for HierarchyConfig {
58    fn default() -> Self {
59        Self {
60            summarize_classes: true,
61            summarize_structs: true,
62            summarize_modules: false, // Often too broad
63            min_children_for_summary: 2,
64            include_child_signatures: true,
65            max_children_in_summary: 20,
66        }
67    }
68}
69
70/// Reference to a child chunk
71#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct ChildReference {
73    /// Child chunk ID
74    pub id: String,
75
76    /// Child symbol name
77    pub name: String,
78
79    /// Child kind (method, field, etc.)
80    pub kind: ChunkKind,
81
82    /// Child signature (optional)
83    #[serde(skip_serializing_if = "Option::is_none")]
84    pub signature: Option<String>,
85
86    /// Brief description from docstring (first line)
87    #[serde(skip_serializing_if = "Option::is_none")]
88    pub brief: Option<String>,
89}
90
91/// Summary chunk for a container type (class, struct, module)
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct HierarchySummary {
94    /// The container chunk this summarizes
95    pub container_id: String,
96
97    /// Container symbol name
98    pub container_name: String,
99
100    /// Container kind
101    pub container_kind: ChunkKind,
102
103    /// List of child references
104    pub children: Vec<ChildReference>,
105
106    /// Total number of children (may exceed max_children_in_summary)
107    pub total_children: usize,
108}
109
110/// Builder for hierarchical chunks
111pub struct HierarchyBuilder {
112    config: HierarchyConfig,
113}
114
115impl Default for HierarchyBuilder {
116    fn default() -> Self {
117        Self::new()
118    }
119}
120
121impl HierarchyBuilder {
122    /// Create a new hierarchy builder with default config
123    pub fn new() -> Self {
124        Self { config: HierarchyConfig::default() }
125    }
126
127    /// Create with custom configuration
128    pub fn with_config(config: HierarchyConfig) -> Self {
129        Self { config }
130    }
131
132    /// Build hierarchy from chunks, returning summary chunks
133    ///
134    /// This identifies parent-child relationships and generates summary chunks
135    /// for containers that meet the threshold.
136    pub fn build_hierarchy(&self, chunks: &[EmbedChunk]) -> Vec<EmbedChunk> {
137        // Group children by parent
138        let mut parent_children: HashMap<String, Vec<&EmbedChunk>> = HashMap::new();
139
140        // Find all chunks that have a parent
141        for chunk in chunks {
142            if let Some(ref parent) = chunk.source.parent {
143                let key = format!("{}:{}", chunk.source.file, parent);
144                parent_children.entry(key).or_default().push(chunk);
145            }
146        }
147
148        // Find container chunks and create summaries
149        let mut summaries = Vec::new();
150
151        for chunk in chunks {
152            if !self.should_summarize(&chunk.kind) {
153                continue;
154            }
155
156            let key = format!("{}:{}", chunk.source.file, chunk.source.symbol);
157            let children = parent_children.get(&key);
158
159            if let Some(children) = children {
160                if children.len() >= self.config.min_children_for_summary {
161                    if let Some(summary) = self.create_summary_chunk(chunk, children) {
162                        summaries.push(summary);
163                    }
164                }
165            }
166        }
167
168        summaries
169    }
170
171    /// Check if a chunk kind should have a summary
172    fn should_summarize(&self, kind: &ChunkKind) -> bool {
173        match kind {
174            ChunkKind::Class => self.config.summarize_classes,
175            ChunkKind::Struct => self.config.summarize_structs,
176            ChunkKind::Module => self.config.summarize_modules,
177            ChunkKind::Interface | ChunkKind::Trait => self.config.summarize_classes,
178            _ => false,
179        }
180    }
181
182    /// Create a summary chunk for a container
183    fn create_summary_chunk(
184        &self,
185        container: &EmbedChunk,
186        children: &[&EmbedChunk],
187    ) -> Option<EmbedChunk> {
188        // Build child references
189        let mut child_refs: Vec<ChildReference> = children
190            .iter()
191            .take(self.config.max_children_in_summary)
192            .map(|child| ChildReference {
193                id: child.id.clone(),
194                name: child.source.symbol.clone(),
195                kind: child.kind,
196                signature: if self.config.include_child_signatures {
197                    child.context.signature.clone()
198                } else {
199                    None
200                },
201                brief: child.context.docstring.as_ref().and_then(|d| {
202                    d.lines().next().map(|s| {
203                        let s = s.trim();
204                        if s.len() > 100 {
205                            format!("{}...", &s[..97])
206                        } else {
207                            s.to_owned()
208                        }
209                    })
210                }),
211            })
212            .collect();
213
214        // Sort by name for determinism
215        child_refs.sort_by(|a, b| a.name.cmp(&b.name));
216
217        // Build summary content
218        let summary_content = self.build_summary_content(container, &child_refs, children.len());
219
220        // Hash the summary content
221        let hash = hash_content(&summary_content);
222
223        // Create tags for the summary
224        let mut tags = vec!["summary".to_owned(), "hierarchy".to_owned()];
225        tags.extend(container.context.tags.iter().cloned());
226
227        Some(EmbedChunk {
228            id: hash.short_id,
229            full_hash: hash.full_hash,
230            content: summary_content,
231            tokens: 0, // Will be computed by caller if needed
232            kind: container.kind,
233            source: ChunkSource {
234                repo: container.source.repo.clone(),
235                file: container.source.file.clone(),
236                lines: container.source.lines,
237                symbol: format!("{}_summary", container.source.symbol),
238                fqn: container
239                    .source
240                    .fqn
241                    .as_ref()
242                    .map(|f| format!("{}_summary", f)),
243                language: container.source.language.clone(),
244                parent: container.source.parent.clone(),
245                visibility: container.source.visibility,
246                is_test: container.source.is_test,
247            },
248            context: ChunkContext {
249                docstring: container.context.docstring.clone(),
250                comments: Vec::new(),
251                signature: container.context.signature.clone(),
252                calls: Vec::new(), // Summary doesn't have direct calls
253                called_by: Vec::new(),
254                imports: container.context.imports.clone(),
255                tags,
256                lines_of_code: 0,
257                max_nesting_depth: 0,
258            },
259            part: None,
260        })
261    }
262
263    /// Build the summary content string
264    fn build_summary_content(
265        &self,
266        container: &EmbedChunk,
267        child_refs: &[ChildReference],
268        total_children: usize,
269    ) -> String {
270        let mut content = String::new();
271
272        // Add container signature if available
273        if let Some(ref sig) = container.context.signature {
274            content.push_str(sig);
275            content.push('\n');
276        }
277
278        // Add docstring if available
279        if let Some(ref doc) = container.context.docstring {
280            content.push('\n');
281            content.push_str(doc);
282            content.push('\n');
283        }
284
285        // Add child list
286        content.push_str("\n/* Members:\n");
287
288        for child in child_refs {
289            content.push_str(" * - ");
290            content.push_str(&child.name);
291
292            if let Some(ref sig) = child.signature {
293                // Compact signature (remove body, keep just declaration)
294                let sig_line = sig.lines().next().unwrap_or(sig).trim();
295                if sig_line != child.name {
296                    content.push_str(": ");
297                    content.push_str(sig_line);
298                }
299            }
300
301            if let Some(ref brief) = child.brief {
302                content.push_str(" - ");
303                content.push_str(brief);
304            }
305
306            content.push('\n');
307        }
308
309        if total_children > child_refs.len() {
310            content.push_str(&format!(" * ... and {} more\n", total_children - child_refs.len()));
311        }
312
313        content.push_str(" */\n");
314
315        content
316    }
317
318    /// Enrich existing chunks with hierarchy metadata
319    ///
320    /// This adds `hierarchy` information to chunk context tags
321    pub fn enrich_chunks(&self, chunks: &mut [EmbedChunk]) {
322        // Build parent -> children map
323        let mut parent_children: HashMap<String, Vec<String>> = HashMap::new();
324
325        for chunk in chunks.iter() {
326            if let Some(ref parent) = chunk.source.parent {
327                let key = format!("{}:{}", chunk.source.file, parent);
328                parent_children
329                    .entry(key)
330                    .or_default()
331                    .push(chunk.source.symbol.clone());
332            }
333        }
334
335        // Enrich container chunks with child count
336        for chunk in chunks.iter_mut() {
337            let key = format!("{}:{}", chunk.source.file, chunk.source.symbol);
338            if let Some(children) = parent_children.get(&key) {
339                chunk
340                    .context
341                    .tags
342                    .push(format!("has-children:{}", children.len()));
343            }
344
345            // Mark chunks that have a parent
346            if chunk.source.parent.is_some() {
347                chunk.context.tags.push("has-parent".to_owned());
348            }
349        }
350    }
351}
352
353/// Get the hierarchy summary for a specific container
354pub fn get_hierarchy_summary(
355    chunks: &[EmbedChunk],
356    container_symbol: &str,
357    file: &str,
358) -> Option<HierarchySummary> {
359    // Find the container
360    let container = chunks
361        .iter()
362        .find(|c| c.source.symbol == container_symbol && c.source.file == file)?;
363
364    // Find children
365    let children: Vec<ChildReference> = chunks
366        .iter()
367        .filter(|c| c.source.parent.as_deref() == Some(container_symbol) && c.source.file == file)
368        .map(|c| ChildReference {
369            id: c.id.clone(),
370            name: c.source.symbol.clone(),
371            kind: c.kind,
372            signature: c.context.signature.clone(),
373            brief: c
374                .context
375                .docstring
376                .as_ref()
377                .and_then(|d| d.lines().next().map(|s| s.trim().to_owned())),
378        })
379        .collect();
380
381    Some(HierarchySummary {
382        container_id: container.id.clone(),
383        container_name: container.source.symbol.clone(),
384        container_kind: container.kind,
385        total_children: children.len(),
386        children,
387    })
388}
389
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::types::{RepoIdentifier, Visibility};

    /// Build a public, non-test chunk located in `test.rs` (lines 1-10) with
    /// the given identity, kind, parent and context fields.
    fn create_test_chunk(
        id: &str,
        symbol: &str,
        kind: ChunkKind,
        parent: Option<&str>,
        signature: Option<&str>,
        docstring: Option<&str>,
    ) -> EmbedChunk {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: "test.rs".to_owned(),
            lines: (1, 10),
            symbol: symbol.to_owned(),
            fqn: Some(format!("test::{}", symbol)),
            language: "Rust".to_owned(),
            parent: parent.map(str::to_owned),
            visibility: Visibility::Public,
            is_test: false,
        };

        let context = ChunkContext {
            docstring: docstring.map(str::to_owned),
            comments: Vec::new(),
            signature: signature.map(str::to_owned),
            calls: Vec::new(),
            called_by: Vec::new(),
            imports: Vec::new(),
            tags: Vec::new(),
            lines_of_code: 10,
            max_nesting_depth: 2,
        };

        EmbedChunk {
            id: id.to_owned(),
            full_hash: format!("{}_full", id),
            content: format!("content of {}", symbol),
            tokens: 100,
            kind,
            source,
            context,
            part: None,
        }
    }

    /// Shorthand for a chunk that has neither a signature nor a docstring.
    fn bare_chunk(id: &str, symbol: &str, kind: ChunkKind, parent: Option<&str>) -> EmbedChunk {
        create_test_chunk(id, symbol, kind, parent, None, None)
    }

    #[test]
    fn test_build_hierarchy_basic() {
        let owner = Some("UserService");
        let chunks = vec![
            create_test_chunk(
                "c1",
                "UserService",
                ChunkKind::Class,
                None,
                Some("class UserService"),
                Some("Service for user management"),
            ),
            create_test_chunk(
                "c2",
                "get_user",
                ChunkKind::Method,
                owner,
                Some("fn get_user(&self, id: u64) -> User"),
                Some("Get a user by ID"),
            ),
            create_test_chunk(
                "c3",
                "create_user",
                ChunkKind::Method,
                owner,
                Some("fn create_user(&self, data: UserData) -> User"),
                Some("Create a new user"),
            ),
            create_test_chunk(
                "c4",
                "delete_user",
                ChunkKind::Method,
                owner,
                Some("fn delete_user(&self, id: u64)"),
                Some("Delete a user"),
            ),
        ];

        let summaries = HierarchyBuilder::new().build_hierarchy(&chunks);

        // Exactly one container, summarized with all three methods listed
        assert_eq!(summaries.len(), 1);
        let summary = &summaries[0];
        assert!(summary.source.symbol.contains("summary"));
        assert!(summary.content.contains("Members:"));
        for member in ["get_user", "create_user", "delete_user"] {
            assert!(summary.content.contains(member));
        }
    }

    #[test]
    fn test_hierarchy_min_children() {
        let chunks = vec![
            create_test_chunk(
                "c1",
                "SmallClass",
                ChunkKind::Class,
                None,
                Some("class SmallClass"),
                None,
            ),
            bare_chunk("c2", "only_method", ChunkKind::Method, Some("SmallClass")),
        ];

        // Requires at least 2 children
        let config = HierarchyConfig { min_children_for_summary: 2, ..Default::default() };
        let builder = HierarchyBuilder::with_config(config);

        // Only 1 child, no summary
        assert!(builder.build_hierarchy(&chunks).is_empty());
    }

    #[test]
    fn test_hierarchy_enrich_chunks() {
        let mut chunks = vec![
            bare_chunk("c1", "MyClass", ChunkKind::Class, None),
            bare_chunk("c2", "method1", ChunkKind::Method, Some("MyClass")),
            bare_chunk("c3", "method2", ChunkKind::Method, Some("MyClass")),
        ];

        HierarchyBuilder::new().enrich_chunks(&mut chunks);

        // Container should have child count tag
        let container_tags = &chunks[0].context.tags;
        assert!(container_tags.iter().any(|tag| tag.starts_with("has-children:")));

        // Children should have parent tag
        for child in &chunks[1..] {
            assert!(child.context.tags.iter().any(|tag| tag == "has-parent"));
        }
    }

    #[test]
    fn test_get_hierarchy_summary() {
        let chunks = vec![
            create_test_chunk(
                "c1",
                "MyStruct",
                ChunkKind::Struct,
                None,
                Some("struct MyStruct"),
                None,
            ),
            bare_chunk("c2", "field1", ChunkKind::Variable, Some("MyStruct")),
            create_test_chunk(
                "c3",
                "new",
                ChunkKind::Function,
                Some("MyStruct"),
                Some("fn new() -> Self"),
                Some("Create a new instance"),
            ),
        ];

        let summary = get_hierarchy_summary(&chunks, "MyStruct", "test.rs")
            .expect("summary should exist for MyStruct");

        assert_eq!(summary.container_name, "MyStruct");
        assert_eq!(summary.total_children, 2);
        for expected in ["field1", "new"] {
            assert!(summary.children.iter().any(|child| child.name == expected));
        }
    }

    #[test]
    fn test_summary_content_format() {
        let chunks = vec![
            create_test_chunk(
                "c1",
                "Calculator",
                ChunkKind::Class,
                None,
                Some("pub struct Calculator"),
                Some("A simple calculator"),
            ),
            create_test_chunk(
                "c2",
                "add",
                ChunkKind::Method,
                Some("Calculator"),
                Some("fn add(&self, a: i32, b: i32) -> i32"),
                Some("Add two numbers"),
            ),
            create_test_chunk(
                "c3",
                "subtract",
                ChunkKind::Method,
                Some("Calculator"),
                Some("fn subtract(&self, a: i32, b: i32) -> i32"),
                Some("Subtract two numbers"),
            ),
        ];

        let summaries = HierarchyBuilder::new().build_hierarchy(&chunks);
        assert_eq!(summaries.len(), 1);

        // Check content structure
        let content = &summaries[0].content;
        let expected_fragments = [
            "pub struct Calculator",
            "A simple calculator",
            "/* Members:",
            " * - add",
            " * - subtract",
            " */",
        ];
        for fragment in expected_fragments {
            assert!(content.contains(fragment));
        }
    }

    #[test]
    fn test_config_options() {
        let builder = HierarchyBuilder::with_config(HierarchyConfig {
            summarize_classes: true,
            summarize_structs: false,
            summarize_modules: false,
            min_children_for_summary: 1,
            include_child_signatures: false,
            max_children_in_summary: 5,
        });

        let class_chunks = vec![
            bare_chunk("c1", "MyClass", ChunkKind::Class, None),
            bare_chunk("c2", "m1", ChunkKind::Method, Some("MyClass")),
        ];

        let struct_chunks = vec![
            bare_chunk("s1", "MyStruct", ChunkKind::Struct, None),
            bare_chunk("s2", "f1", ChunkKind::Variable, Some("MyStruct")),
        ];

        // Class should get summary
        assert_eq!(builder.build_hierarchy(&class_chunks).len(), 1);

        // Struct should NOT get summary (disabled)
        assert_eq!(builder.build_hierarchy(&struct_chunks).len(), 0);
    }
}