// graphrag_core/core/metadata.rs

//! Chunk metadata for semantic enrichment
//!
//! This module defines metadata structures that enhance text chunks with
//! semantic information, including document structure (chapter, section,
//! subsection), keywords, summaries, and positional information.
6
7use serde::{Deserialize, Serialize};
8
/// Metadata associated with a text chunk, providing semantic context
///
/// This structure enriches chunks with information about their position in the
/// document hierarchy, extracted keywords, automatically generated summaries,
/// and other contextual information useful for retrieval and understanding.
///
/// Every field is either an `Option` or a collection, so the derived `Default`
/// value is a record with nothing populated (`None` / empty everywhere).
/// `Eq` is not derived because the struct contains `f32` fields.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
pub struct ChunkMetadata {
    /// Chapter name/title this chunk belongs to
    ///
    /// Examples: "Chapter 1: Introduction", "Part I"
    pub chapter: Option<String>,

    /// Section name/title within the chapter
    ///
    /// Examples: "1.1 Background", "Introduction"
    pub section: Option<String>,

    /// Subsection name/title within the section
    ///
    /// Examples: "1.1.1 Historical Context", "Early Development"
    pub subsection: Option<String>,

    /// Automatically detected or manually assigned topic
    ///
    /// Examples: "Machine Learning", "Neural Networks", "Data Processing"
    pub topic: Option<String>,

    /// Extracted keywords from the chunk content (TF-IDF or similar)
    ///
    /// Ordered by relevance/importance, typically 3-10 keywords
    ///
    /// NOTE(review): unlike `custom`, this field carries no `#[serde(default)]`,
    /// so deserialization fails when the key is absent — confirm that is intended.
    pub keywords: Vec<String>,

    /// Automatically generated summary of the chunk content
    ///
    /// Typically 1-3 sentences capturing the main points
    pub summary: Option<String>,

    /// Hierarchical level in document structure (0 = root/chapter, 1 = section, 2 = subsection, etc.)
    ///
    /// Used to understand the depth of this chunk in the document hierarchy
    pub structural_level: Option<u8>,

    /// Relative position in the document (0.0 to 1.0)
    ///
    /// 0.0 = beginning, 0.5 = middle, 1.0 = end
    /// Useful for positional weighting in retrieval
    ///
    /// The range is a convention: the field is a plain `f32` and assigning it
    /// directly performs no check. The `with_position` builder clamps its input.
    pub position_in_document: Option<f32>,

    /// Full heading path from document root to this chunk
    ///
    /// Example: ["Chapter 1", "Section 1.1", "Subsection 1.1.1"]
    /// Provides complete context of the chunk's location in document hierarchy
    ///
    /// NOTE(review): unlike `custom`, this field carries no `#[serde(default)]`,
    /// so deserialization fails when the key is absent — confirm that is intended.
    pub heading_path: Vec<String>,

    /// Confidence score for metadata extraction (0.0 to 1.0)
    ///
    /// Indicates how confident the system is about the assigned metadata.
    /// The range is a convention; the type itself does not enforce it.
    pub confidence: Option<f32>,

    /// Custom metadata key-value pairs
    ///
    /// Allows for extensibility with domain-specific metadata.
    /// `#[serde(default)]` lets input that omits this key deserialize to an
    /// empty map instead of producing an error.
    #[serde(default)]
    pub custom: std::collections::HashMap<String, String>,
}
74
75impl ChunkMetadata {
76    /// Create a new empty metadata instance
77    pub fn new() -> Self {
78        Self::default()
79    }
80
81    /// Create metadata with chapter information
82    pub fn with_chapter(mut self, chapter: String) -> Self {
83        self.chapter = Some(chapter);
84        self
85    }
86
87    /// Create metadata with section information
88    pub fn with_section(mut self, section: String) -> Self {
89        self.section = Some(section);
90        self
91    }
92
93    /// Create metadata with subsection information
94    pub fn with_subsection(mut self, subsection: String) -> Self {
95        self.subsection = Some(subsection);
96        self
97    }
98
99    /// Create metadata with keywords
100    pub fn with_keywords(mut self, keywords: Vec<String>) -> Self {
101        self.keywords = keywords;
102        self
103    }
104
105    /// Create metadata with summary
106    pub fn with_summary(mut self, summary: String) -> Self {
107        self.summary = Some(summary);
108        self
109    }
110
111    /// Create metadata with structural level
112    pub fn with_structural_level(mut self, level: u8) -> Self {
113        self.structural_level = Some(level);
114        self
115    }
116
117    /// Create metadata with position in document
118    pub fn with_position(mut self, position: f32) -> Self {
119        self.position_in_document = Some(position.clamp(0.0, 1.0));
120        self
121    }
122
123    /// Create metadata with heading path
124    pub fn with_heading_path(mut self, path: Vec<String>) -> Self {
125        self.heading_path = path;
126        self
127    }
128
129    /// Add a custom metadata field
130    pub fn add_custom(mut self, key: String, value: String) -> Self {
131        self.custom.insert(key, value);
132        self
133    }
134
135    /// Check if metadata has any structural information (chapter, section, or subsection)
136    pub fn has_structure_info(&self) -> bool {
137        self.chapter.is_some() || self.section.is_some() || self.subsection.is_some()
138    }
139
140    /// Check if metadata has semantic enrichment (keywords or summary)
141    pub fn has_semantic_info(&self) -> bool {
142        !self.keywords.is_empty() || self.summary.is_some()
143    }
144
145    /// Get the deepest level heading (subsection > section > chapter)
146    pub fn get_deepest_heading(&self) -> Option<&String> {
147        self.subsection
148            .as_ref()
149            .or(self.section.as_ref())
150            .or(self.chapter.as_ref())
151    }
152
153    /// Get full hierarchical context as a formatted string
154    ///
155    /// Example: "Chapter 1 > Section 1.1 > Subsection 1.1.1"
156    pub fn get_hierarchy_string(&self) -> Option<String> {
157        if self.heading_path.is_empty() {
158            return None;
159        }
160        Some(self.heading_path.join(" > "))
161    }
162
163    /// Calculate completeness score (0.0 to 1.0) based on populated fields
164    ///
165    /// Higher scores indicate more complete metadata
166    pub fn completeness_score(&self) -> f32 {
167        let mut score = 0.0;
168        let total = 9.0;
169
170        if self.chapter.is_some() {
171            score += 1.0;
172        }
173        if self.section.is_some() {
174            score += 1.0;
175        }
176        if self.subsection.is_some() {
177            score += 1.0;
178        }
179        if self.topic.is_some() {
180            score += 1.0;
181        }
182        if !self.keywords.is_empty() {
183            score += 1.0;
184        }
185        if self.summary.is_some() {
186            score += 1.0;
187        }
188        if self.structural_level.is_some() {
189            score += 1.0;
190        }
191        if self.position_in_document.is_some() {
192            score += 1.0;
193        }
194        if !self.heading_path.is_empty() {
195            score += 1.0;
196        }
197
198        score / total
199    }
200}
201
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed record has nothing populated and scores zero.
    #[test]
    fn test_metadata_creation() {
        let empty = ChunkMetadata::new();

        assert!(empty.chapter.is_none());
        assert!(empty.keywords.is_empty());
        assert_eq!(empty.completeness_score(), 0.0);
    }

    /// Chained builder calls populate the corresponding fields.
    #[test]
    fn test_metadata_builder() {
        let keywords = vec![String::from("test"), String::from("metadata")];
        let built = ChunkMetadata::new()
            .with_chapter(String::from("Chapter 1"))
            .with_section(String::from("Section 1.1"))
            .with_keywords(keywords)
            .with_summary(String::from("This is a test summary."));

        assert_eq!(built.chapter.as_deref(), Some("Chapter 1"));
        assert_eq!(built.section.as_deref(), Some("Section 1.1"));
        assert_eq!(built.keywords.len(), 2);
        assert!(built.has_structure_info());
        assert!(built.has_semantic_info());
    }

    /// The heading path is rendered as a " > "-separated string.
    #[test]
    fn test_heading_hierarchy() {
        let path: Vec<String> = ["Chapter 1", "Section 1.1", "Subsection 1.1.1"]
            .iter()
            .map(|heading| String::from(*heading))
            .collect();

        let rendered = ChunkMetadata::new()
            .with_heading_path(path)
            .get_hierarchy_string();

        assert_eq!(
            rendered.as_deref(),
            Some("Chapter 1 > Section 1.1 > Subsection 1.1.1")
        );
    }

    /// The deepest populated heading wins: subsection, then section, then chapter.
    #[test]
    fn test_deepest_heading() {
        let mut record = ChunkMetadata::new();
        assert_eq!(record.get_deepest_heading(), None);

        record.chapter = Some(String::from("Chapter 1"));
        assert_eq!(
            record.get_deepest_heading().map(String::as_str),
            Some("Chapter 1")
        );

        record.section = Some(String::from("Section 1.1"));
        assert_eq!(
            record.get_deepest_heading().map(String::as_str),
            Some("Section 1.1")
        );

        record.subsection = Some(String::from("Subsection 1.1.1"));
        assert_eq!(
            record.get_deepest_heading().map(String::as_str),
            Some("Subsection 1.1.1")
        );
    }

    /// A partially populated record scores strictly between 0 and 1.
    #[test]
    fn test_completeness_score() {
        let mut record = ChunkMetadata::new();
        assert_eq!(record.completeness_score(), 0.0);

        record.chapter = Some(String::from("Chapter 1"));
        record.keywords = vec![String::from("test")];
        record.summary = Some(String::from("Summary"));

        let partial = record.completeness_score();
        assert!(partial > 0.0);
        assert!(partial < 1.0);
    }

    /// Out-of-range positions are clamped into 0.0..=1.0.
    #[test]
    fn test_position_clamping() {
        for &(raw, expected) in &[(1.5_f32, 1.0_f32), (-0.5, 0.0)] {
            let record = ChunkMetadata::new().with_position(raw);
            assert_eq!(record.position_in_document, Some(expected));
        }
    }

    /// Custom key-value pairs accumulate across `add_custom` calls.
    #[test]
    fn test_custom_metadata() {
        let record = ChunkMetadata::new()
            .add_custom(String::from("author"), String::from("John Doe"))
            .add_custom(String::from("date"), String::from("2024-01-01"));

        assert_eq!(record.custom.len(), 2);
        assert_eq!(
            record.custom.get("author").map(String::as_str),
            Some("John Doe")
        );
    }

    /// A record survives a JSON round trip unchanged.
    #[test]
    fn test_serialization() {
        let original = ChunkMetadata::new()
            .with_chapter(String::from("Chapter 1"))
            .with_keywords(vec![String::from("test")])
            .with_position(0.5);

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded = serde_json::from_str::<ChunkMetadata>(&encoded).unwrap();

        assert_eq!(original, decoded);
    }
}