agentroot_core/llm/
metadata_generator.rs

//! Metadata generation for documents
2
3use crate::error::Result;
4use crate::index::ast_chunker::ChunkType;
5use async_trait::async_trait;
6use serde::{Deserialize, Serialize};
7
8/// Metadata generation trait
9#[async_trait]
10pub trait MetadataGenerator: Send + Sync {
11    /// Generate comprehensive metadata for a document
12    async fn generate_metadata(
13        &self,
14        content: &str,
15        context: &MetadataContext,
16    ) -> Result<DocumentMetadata>;
17
18    /// Get model name
19    fn model_name(&self) -> &str;
20}
21
22/// Context information for metadata generation
23#[derive(Debug, Clone)]
24pub struct MetadataContext {
25    /// Provider type (file, github, url, pdf, sql)
26    pub source_type: String,
27    /// Programming language (if applicable)
28    pub language: Option<String>,
29    /// File extension (if applicable)
30    pub file_extension: Option<String>,
31    /// Collection name
32    pub collection_name: String,
33    /// Provider configuration (JSON)
34    pub provider_config: Option<String>,
35    /// Document creation timestamp
36    pub created_at: String,
37    /// Document modification timestamp
38    pub modified_at: String,
39    /// AST chunk types found in document
40    pub existing_structure: Option<Vec<ChunkType>>,
41}
42
43impl MetadataContext {
44    /// Create new metadata context
45    pub fn new(source_type: String, collection_name: String) -> Self {
46        Self {
47            source_type,
48            collection_name,
49            language: None,
50            file_extension: None,
51            provider_config: None,
52            created_at: String::new(),
53            modified_at: String::new(),
54            existing_structure: None,
55        }
56    }
57
58    /// Set language
59    pub fn with_language(mut self, language: String) -> Self {
60        self.language = Some(language);
61        self
62    }
63
64    /// Set file extension
65    pub fn with_extension(mut self, extension: String) -> Self {
66        self.file_extension = Some(extension);
67        self
68    }
69
70    /// Set provider config
71    pub fn with_provider_config(mut self, config: String) -> Self {
72        self.provider_config = Some(config);
73        self
74    }
75
76    /// Set timestamps
77    pub fn with_timestamps(mut self, created_at: String, modified_at: String) -> Self {
78        self.created_at = created_at;
79        self.modified_at = modified_at;
80        self
81    }
82
83    /// Set existing structure
84    pub fn with_structure(mut self, structure: Vec<ChunkType>) -> Self {
85        self.existing_structure = Some(structure);
86        self
87    }
88}
89
90/// Generated metadata result
91#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
92pub struct DocumentMetadata {
93    /// Document summary (100-200 words)
94    pub summary: String,
95    /// Semantic title (improved from filename)
96    pub semantic_title: String,
97    /// Keywords for search (5-10 terms)
98    pub keywords: Vec<String>,
99    /// Document category (tutorial, reference, config, etc.)
100    pub category: String,
101    /// Purpose/intent description
102    pub intent: String,
103    /// Related concepts/entities
104    pub concepts: Vec<String>,
105    /// Difficulty level (beginner, intermediate, advanced)
106    pub difficulty: String,
107    /// Suggested search queries
108    pub suggested_queries: Vec<String>,
109}
110
111impl DocumentMetadata {
112    /// Create new empty metadata
113    pub fn new() -> Self {
114        Self {
115            summary: String::new(),
116            semantic_title: String::new(),
117            keywords: Vec::new(),
118            category: String::new(),
119            intent: String::new(),
120            concepts: Vec::new(),
121            difficulty: String::new(),
122            suggested_queries: Vec::new(),
123        }
124    }
125
126    /// Create metadata with basic fields
127    pub fn basic(title: String, summary: String) -> Self {
128        Self {
129            summary,
130            semantic_title: title,
131            keywords: Vec::new(),
132            category: "unknown".to_string(),
133            intent: String::new(),
134            concepts: Vec::new(),
135            difficulty: "intermediate".to_string(),
136            suggested_queries: Vec::new(),
137        }
138    }
139
140    /// Validate metadata completeness
141    pub fn is_complete(&self) -> bool {
142        !self.summary.is_empty()
143            && !self.semantic_title.is_empty()
144            && !self.keywords.is_empty()
145            && !self.category.is_empty()
146    }
147
148    /// Convert to JSON string
149    pub fn to_json(&self) -> Result<String> {
150        serde_json::to_string(self).map_err(|e| e.into())
151    }
152
153    /// Parse from JSON string
154    pub fn from_json(json: &str) -> Result<Self> {
155        serde_json::from_str(json).map_err(|e| e.into())
156    }
157}
158
159impl Default for DocumentMetadata {
160    fn default() -> Self {
161        Self::new()
162    }
163}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// The builder must record every value it was given and leave the
    /// fields it was not given at the defaults set by `new`.
    #[test]
    fn test_metadata_context_builder() {
        let context = MetadataContext::new("file".to_string(), "test-collection".to_string())
            .with_language("rust".to_string())
            .with_extension("rs".to_string())
            .with_timestamps("2024-01-01".to_string(), "2024-01-02".to_string());

        assert_eq!(context.source_type, "file");
        assert_eq!(context.collection_name, "test-collection");
        assert_eq!(context.language, Some("rust".to_string()));
        assert_eq!(context.file_extension, Some("rs".to_string()));
        // The timestamps are set above, so they must be verified too.
        assert_eq!(context.created_at, "2024-01-01");
        assert_eq!(context.modified_at, "2024-01-02");
        // Fields the chain never touched keep their `new` defaults.
        assert_eq!(context.provider_config, None);
        assert!(context.existing_structure.is_none());
    }

    /// A freshly constructed context has no optional data and empty timestamps.
    #[test]
    fn test_metadata_context_defaults() {
        let context = MetadataContext::new("url".to_string(), "docs".to_string());

        assert_eq!(context.source_type, "url");
        assert_eq!(context.collection_name, "docs");
        assert_eq!(context.language, None);
        assert_eq!(context.file_extension, None);
        assert_eq!(context.provider_config, None);
        assert!(context.created_at.is_empty());
        assert!(context.modified_at.is_empty());
        assert!(context.existing_structure.is_none());
    }

    /// `with_provider_config` stores the configuration string unchanged.
    #[test]
    fn test_metadata_context_with_provider_config() {
        let config = r#"{"depth": 2}"#.to_string();
        let context = MetadataContext::new("github".to_string(), "repos".to_string())
            .with_provider_config(config.clone());

        assert_eq!(context.provider_config, Some(config));
    }

    /// `basic` fills in the title, summary and fixed defaults, which is not
    /// enough to count as complete because no keywords are present.
    #[test]
    fn test_document_metadata_basic() {
        let metadata = DocumentMetadata::basic(
            "Test Document".to_string(),
            "This is a test summary.".to_string(),
        );

        assert_eq!(metadata.semantic_title, "Test Document");
        assert_eq!(metadata.summary, "This is a test summary.");
        assert_eq!(metadata.category, "unknown");
        assert_eq!(metadata.difficulty, "intermediate");
        assert!(metadata.keywords.is_empty());
        assert!(!metadata.is_complete());
    }

    /// `Default` and `new` must agree, and the empty value is incomplete.
    #[test]
    fn test_document_metadata_default_matches_new() {
        let metadata = DocumentMetadata::default();

        assert_eq!(metadata, DocumentMetadata::new());
        assert!(!metadata.is_complete());
    }

    #[test]
    fn test_document_metadata_complete() {
        let metadata = DocumentMetadata {
            summary: "A comprehensive test".to_string(),
            semantic_title: "Test".to_string(),
            keywords: vec!["test".to_string()],
            category: "test".to_string(),
            intent: "Testing".to_string(),
            concepts: vec!["testing".to_string()],
            difficulty: "beginner".to_string(),
            suggested_queries: vec!["how to test".to_string()],
        };

        assert!(metadata.is_complete());
    }

    /// Serializing and then parsing must give back an equal value.
    #[test]
    fn test_metadata_json_serialization() {
        let metadata = DocumentMetadata {
            summary: "Test summary".to_string(),
            semantic_title: "Test Title".to_string(),
            keywords: vec!["test".to_string(), "rust".to_string()],
            category: "tutorial".to_string(),
            intent: "Learn testing".to_string(),
            concepts: vec!["unit testing".to_string()],
            difficulty: "beginner".to_string(),
            suggested_queries: vec!["rust testing".to_string()],
        };

        let json = metadata.to_json().unwrap();
        let parsed = DocumentMetadata::from_json(&json).unwrap();

        assert_eq!(metadata, parsed);
    }

    /// Malformed JSON and JSON lacking the required fields are both rejected.
    #[test]
    fn test_metadata_json_rejects_invalid_input() {
        assert!(DocumentMetadata::from_json("not json").is_err());
        assert!(DocumentMetadata::from_json("{}").is_err());
    }

    #[test]
    fn test_metadata_context_with_structure() {
        let structure = vec![ChunkType::Function, ChunkType::Struct];
        let context = MetadataContext::new("file".to_string(), "code".to_string())
            .with_structure(structure.clone());

        assert_eq!(context.existing_structure, Some(structure));
    }
}