scribe_selection/
demotion.rs

//! Multi-Fidelity Demotion System for V4 variant
//!
//! Implements progressive content reduction: FULL → CHUNK → SIGNATURE
//! - Intelligent content reduction when approaching budget limits
//! - Maintains most important information while reducing token usage
//! - Progressive degradation preserves critical functionality
//! - Language-specific semantic chunking and signature extraction using tree-sitter AST parsing
9use serde::{Serialize, Deserialize};
10use std::collections::HashMap;
11use scribe_core::{Result, ScribeError};
12use crate::ast_parser::{AstParser, AstLanguage, AstChunk as AstParserChunk, AstSignature};
13
14/// Content fidelity levels for demotion system
15#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
16pub enum FidelityMode {
17    /// Complete file content
18    Full,
19    /// Important chunks only
20    Chunk,
21    /// Type signatures and interfaces only
22    Signature,
23}
24
25/// Result of applying demotion to a file
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct DemotionResult {
28    pub original_path: String,
29    pub original_tokens: usize,
30    pub demoted_tokens: usize,
31    pub fidelity_mode: FidelityMode,
32    pub content: String,
33    pub chunks_kept: usize,
34    pub chunks_total: usize,
35    pub compression_ratio: f64,
36    pub quality_score: f64, // How much important info was preserved
37}
38
39/// Information about a code chunk
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct ChunkInfo {
42    pub start_line: usize,
43    pub end_line: usize,
44    pub chunk_type: String,
45    pub content: String,
46    pub importance_score: f64,
47    pub estimated_tokens: usize,
48    pub dependencies: Vec<String>, // Other chunks this depends on
49}
50
51/// Splits code into semantic chunks for selective demotion using tree-sitter AST parsing
52pub struct CodeChunker {
53    language_cache: HashMap<String, Option<AstLanguage>>,
54    ast_parser: AstParser,
55}
56
57impl CodeChunker {
58    pub fn new() -> Result<Self> {
59        Ok(Self {
60            language_cache: HashMap::new(),
61            ast_parser: AstParser::new()?,
62        })
63    }
64
65    pub fn detect_language(&mut self, file_path: &str) -> Option<AstLanguage> {
66        if let Some(cached) = self.language_cache.get(file_path) {
67            return cached.clone();
68        }
69
70        let ext = file_path.split('.').last().unwrap_or("");
71        let language = AstLanguage::from_extension(ext);
72
73        self.language_cache.insert(file_path.to_string(), language.clone());
74        language
75    }
76
77    pub fn chunk_content(&mut self, content: &str, file_path: &str) -> Result<Vec<ChunkInfo>> {
78        let language = match self.detect_language(file_path) {
79            Some(lang) => lang,
80            None => return Ok(self.chunk_generic(content)),
81        };
82
83        // Use AST parser to get semantic chunks
84        // Create a temporary file path with the correct extension
85        let temp_path = format!("temp.{}", match language {
86            AstLanguage::Python => "py",
87            AstLanguage::JavaScript => "js", 
88            AstLanguage::TypeScript => "ts",
89            AstLanguage::Go => "go",
90            AstLanguage::Rust => "rs",
91        });
92        let ast_chunks = self.ast_parser.parse_chunks(content, &temp_path)?;
93        
94        let mut chunks = Vec::new();
95        for ast_chunk in ast_chunks {
96            let chunk = ChunkInfo {
97                start_line: ast_chunk.start_line,
98                end_line: ast_chunk.end_line,
99                chunk_type: ast_chunk.chunk_type,
100                content: ast_chunk.content.clone(),
101                importance_score: ast_chunk.importance_score,
102                estimated_tokens: ast_chunk.content.len() / 4, // Rough estimate
103                dependencies: Vec::new(), // Could be enhanced with dependency analysis
104            };
105            chunks.push(chunk);
106        }
107
108        Ok(chunks)
109    }
110
111    fn chunk_generic(&self, content: &str) -> Vec<ChunkInfo> {
112        let lines: Vec<&str> = content.split('\n').collect();
113        let chunk_size = 20; // Lines per chunk
114        let mut chunks = Vec::new();
115
116        for (i, chunk_lines) in lines.chunks(chunk_size).enumerate() {
117            let start_line = i * chunk_size + 1;
118            let end_line = start_line + chunk_lines.len() - 1;
119            let content = chunk_lines.join("\n");
120            
121            let chunk = ChunkInfo {
122                start_line,
123                end_line,
124                chunk_type: "generic".to_string(),
125                content: content.clone(),
126                importance_score: 0.5, // Default score for generic chunks
127                estimated_tokens: content.len() / 4,
128                dependencies: Vec::new(),
129            };
130            chunks.push(chunk);
131        }
132
133        chunks
134    }
135
136    pub fn select_chunks_by_budget(&self, chunks: &[ChunkInfo], token_budget: usize) -> Vec<usize> {
137        // Sort chunks by importance score (descending)
138        let mut indexed_chunks: Vec<(usize, &ChunkInfo)> = chunks.iter().enumerate().collect();
139        indexed_chunks.sort_by(|a, b| b.1.importance_score.partial_cmp(&a.1.importance_score).unwrap());
140
141        let mut selected_indices = Vec::new();
142        let mut used_tokens = 0;
143
144        for (index, chunk) in indexed_chunks {
145            if used_tokens + chunk.estimated_tokens <= token_budget {
146                selected_indices.push(index);
147                used_tokens += chunk.estimated_tokens;
148            }
149        }
150
151        // Sort indices to maintain original order
152        selected_indices.sort();
153        selected_indices
154    }
155}
156
157impl Default for CodeChunker {
158    fn default() -> Self {
159        Self::new().expect("Failed to create CodeChunker")
160    }
161}
162
163/// Extracts type signatures and interfaces for the highest fidelity reduction using tree-sitter
164pub struct SignatureExtractor {
165    ast_parser: AstParser,
166}
167
168impl SignatureExtractor {
169    pub fn new() -> Result<Self> {
170        Ok(Self {
171            ast_parser: AstParser::new()?,
172        })
173    }
174
175    pub fn extract_signatures(&mut self, content: &str, file_path: &str) -> Result<Vec<String>> {
176        let language = AstLanguage::from_extension(
177            file_path.split('.').last().unwrap_or("")
178        );
179
180        let language = match language {
181            Some(lang) => lang,
182            None => return Ok(vec![self.extract_generic_signatures(content)]),
183        };
184
185        // Use AST parser to extract signatures
186        // Create a temporary file path with the correct extension  
187        let temp_path = format!("temp.{}", match language {
188            AstLanguage::Python => "py",
189            AstLanguage::JavaScript => "js", 
190            AstLanguage::TypeScript => "ts",
191            AstLanguage::Go => "go",
192            AstLanguage::Rust => "rs",
193        });
194        let signatures = self.ast_parser.extract_signatures(content, &temp_path)?;
195        
196        Ok(signatures.into_iter().map(|sig| {
197            format!("{}:{} // {}", sig.name, sig.signature_type, sig.signature)
198        }).collect())
199    }
200
201    fn extract_generic_signatures(&self, content: &str) -> String {
202        // For unknown file types, try to extract function-like patterns
203        let lines: Vec<&str> = content.lines().collect();
204        let mut signatures = Vec::new();
205
206        for line in lines {
207            let trimmed = line.trim();
208            if trimmed.is_empty() || trimmed.starts_with("//") || trimmed.starts_with("#") {
209                continue;
210            }
211
212            // Look for function-like patterns (very basic heuristic)
213            if trimmed.contains("(") && trimmed.contains(")") && 
214               (trimmed.contains("def ") || trimmed.contains("function ") || 
215                trimmed.contains("fn ") || trimmed.contains("func ")) {
216                signatures.push(trimmed.to_string());
217            }
218        }
219
220        signatures.join("\n")
221    }
222}
223
224impl Default for SignatureExtractor {
225    fn default() -> Self {
226        Self::new().expect("Failed to create SignatureExtractor")
227    }
228}
229
230/// Main demotion engine that orchestrates the progressive content reduction
231pub struct DemotionEngine {
232    chunker: CodeChunker,
233    signature_extractor: SignatureExtractor,
234}
235
236impl DemotionEngine {
237    pub fn new() -> Result<Self> {
238        Ok(Self {
239            chunker: CodeChunker::new()?,
240            signature_extractor: SignatureExtractor::new()?,
241        })
242    }
243
244    pub fn demote_content(
245        &mut self,
246        content: &str,
247        file_path: &str,
248        target_mode: FidelityMode,
249        token_budget: Option<usize>,
250    ) -> Result<DemotionResult> {
251        let original_tokens = content.len() / 4; // Rough estimate
252
253        match target_mode {
254            FidelityMode::Full => {
255                Ok(DemotionResult {
256                    original_path: file_path.to_string(),
257                    original_tokens,
258                    demoted_tokens: original_tokens,
259                    fidelity_mode: FidelityMode::Full,
260                    content: content.to_string(),
261                    chunks_kept: 1,
262                    chunks_total: 1,
263                    compression_ratio: 1.0,
264                    quality_score: 1.0,
265                })
266            }
267            FidelityMode::Chunk => {
268                self.demote_to_chunks(content, file_path, token_budget, original_tokens)
269            }
270            FidelityMode::Signature => {
271                self.demote_to_signatures(content, file_path, original_tokens)
272            }
273        }
274    }
275
276    fn demote_to_chunks(
277        &mut self,
278        content: &str,
279        file_path: &str,
280        token_budget: Option<usize>,
281        original_tokens: usize,
282    ) -> Result<DemotionResult> {
283        let chunks = self.chunker.chunk_content(content, file_path)?;
284        let chunks_total = chunks.len();
285
286        let selected_indices = if let Some(budget) = token_budget {
287            self.chunker.select_chunks_by_budget(&chunks, budget)
288        } else {
289            // Keep all chunks if no budget specified
290            (0..chunks.len()).collect()
291        };
292
293        let chunks_kept = selected_indices.len();
294        let selected_chunks: Vec<String> = selected_indices
295            .iter()
296            .map(|&i| chunks[i].content.clone())
297            .collect();
298
299        let demoted_content = selected_chunks.join("\n\n// ... [content omitted] ...\n\n");
300        let demoted_tokens = demoted_content.len() / 4;
301
302        let quality_score = if chunks_total > 0 {
303            selected_indices
304                .iter()
305                .map(|&i| chunks[i].importance_score)
306                .sum::<f64>() / chunks_total as f64
307        } else {
308            0.0
309        };
310
311        Ok(DemotionResult {
312            original_path: file_path.to_string(),
313            original_tokens,
314            demoted_tokens,
315            fidelity_mode: FidelityMode::Chunk,
316            content: demoted_content,
317            chunks_kept,
318            chunks_total,
319            compression_ratio: demoted_tokens as f64 / original_tokens as f64,
320            quality_score,
321        })
322    }
323
324    fn demote_to_signatures(
325        &mut self,
326        content: &str,
327        file_path: &str,
328        original_tokens: usize,
329    ) -> Result<DemotionResult> {
330        let signatures = self.signature_extractor.extract_signatures(content, file_path)?;
331        let demoted_content = signatures.join("\n");
332        let demoted_tokens = demoted_content.len() / 4;
333
334        Ok(DemotionResult {
335            original_path: file_path.to_string(),
336            original_tokens,
337            demoted_tokens,
338            fidelity_mode: FidelityMode::Signature,
339            content: demoted_content,
340            chunks_kept: signatures.len(),
341            chunks_total: signatures.len(), // For signatures, kept = total
342            compression_ratio: demoted_tokens as f64 / original_tokens as f64,
343            quality_score: 0.8, // Signatures preserve high-level structure
344        })
345    }
346}
347
348impl Default for DemotionEngine {
349    fn default() -> Self {
350        Self::new().expect("Failed to create DemotionEngine")
351    }
352}
353
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension-based language detection covers all supported languages
    /// and rejects unknown extensions.
    #[test]
    fn test_language_detection() {
        let mut chunker = CodeChunker::new().unwrap();

        assert_eq!(chunker.detect_language("test.py"), Some(AstLanguage::Python));
        assert_eq!(chunker.detect_language("test.js"), Some(AstLanguage::JavaScript));
        assert_eq!(chunker.detect_language("test.ts"), Some(AstLanguage::TypeScript));
        assert_eq!(chunker.detect_language("test.go"), Some(AstLanguage::Go));
        assert_eq!(chunker.detect_language("test.rs"), Some(AstLanguage::Rust));
        assert_eq!(chunker.detect_language("test.txt"), None);
    }

    /// Fidelity modes are correctly represented and distinct.
    #[test]
    fn test_fidelity_modes() {
        // Engine construction must succeed; the binding is otherwise unused,
        // so it carries a leading underscore to avoid an unused-variable
        // warning (the previous `engine` binding warned on every build).
        let _engine = DemotionEngine::new().unwrap();

        assert_eq!(FidelityMode::Full as u8, 0);
        assert_ne!(FidelityMode::Chunk, FidelityMode::Signature);
    }

    /// Budget selection keeps the high-importance chunk and drops the one
    /// that would exceed the remaining budget.
    #[test]
    fn test_chunk_budget_selection() {
        let chunker = CodeChunker::new().unwrap();

        let chunks = vec![
            ChunkInfo {
                start_line: 1,
                end_line: 5,
                chunk_type: "function".to_string(),
                content: "def test(): pass".to_string(),
                importance_score: 0.8,
                estimated_tokens: 10,
                dependencies: vec![],
            },
            ChunkInfo {
                start_line: 6,
                end_line: 10,
                chunk_type: "comment".to_string(),
                content: "# This is a comment".to_string(),
                importance_score: 0.2,
                estimated_tokens: 5,
                dependencies: vec![],
            },
        ];

        // Budget 12 fits the 10-token function but not both chunks (15).
        let selected = chunker.select_chunks_by_budget(&chunks, 12);
        assert_eq!(selected, vec![0]); // Should select the function with higher importance
    }
}