// scribe_selection/demotion.rs

1//! Multi-Fidelity Demotion System for V4 variant
2//!
3//! Implements progressive content reduction: FULL → CHUNK → SIGNATURE
4//! - Intelligent content reduction when approaching budget limits
5//! - Maintains most important information while reducing token usage
6//! - Progressive degradation preserves critical functionality
7//! - Language-specific semantic chunking and signature extraction using tree-sitter AST parsing
8
9use crate::ast_parser::{AstLanguage, AstParser};
10use regex::Regex;
11use scribe_core::tokenization::{utils as token_utils, TokenCounter};
12use scribe_core::{Result, ScribeError};
13use serde::{Deserialize, Serialize};
14use std::cell::RefCell;
15use std::collections::{HashMap, HashSet};
16use std::path::Path;
17use std::rc::Rc;
18
/// Content fidelity levels for demotion system.
///
/// Ordered from most to least verbose; demotion steps a file down the
/// FULL → CHUNK → SIGNATURE ladder as the token budget tightens.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FidelityMode {
    /// Complete file content, unmodified.
    Full,
    /// Important chunks only, selected by importance score.
    Chunk,
    /// Type signatures and interfaces only.
    Signature,
}
29
/// Result of applying demotion to a file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DemotionResult {
    /// Path of the file the demotion was applied to.
    pub original_path: String,
    /// Estimated token count of the untouched content.
    pub original_tokens: usize,
    /// Estimated token count after demotion (clamped to at least 1).
    pub demoted_tokens: usize,
    /// Fidelity level the content was reduced to.
    pub fidelity_mode: FidelityMode,
    /// The demoted content itself.
    pub content: String,
    /// Number of chunks retained (for Signature mode this equals the
    /// signature count).
    pub chunks_kept: usize,
    /// Total number of chunks found in the file.
    pub chunks_total: usize,
    /// demoted_tokens / original_tokens.
    pub compression_ratio: f64,
    pub quality_score: f64, // How much important info was preserved
}
43
/// Information about a code chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkInfo {
    /// 1-based line where the chunk starts in the original file.
    pub start_line: usize,
    /// 1-based inclusive line where the chunk ends.
    pub end_line: usize,
    /// Kind of chunk, e.g. an AST node kind, or "generic" for the
    /// fixed-size fallback chunking of unknown file types.
    pub chunk_type: String,
    /// Raw source text of the chunk.
    pub content: String,
    /// Relative importance used to rank chunks under a budget
    /// (higher = keep first); generic chunks default to 0.5.
    pub importance_score: f64,
    /// Token estimate for `content`, computed via the global token counter.
    pub estimated_tokens: usize,
    pub dependencies: Vec<String>, // Other chunks this depends on
}
55
/// Splits code into semantic chunks for selective demotion using tree-sitter AST parsing.
pub struct CodeChunker {
    /// Memoizes per-path language detection; `None` means "unsupported extension".
    language_cache: HashMap<String, Option<AstLanguage>>,
    /// Shared tree-sitter parser; `RefCell` provides the mutable access
    /// parsing needs through the shared `Rc`.
    ast_parser: Rc<RefCell<AstParser>>,
}
61
62impl CodeChunker {
63    pub fn new(ast_parser: Rc<RefCell<AstParser>>) -> Self {
64        Self {
65            language_cache: HashMap::new(),
66            ast_parser,
67        }
68    }
69
70    pub fn detect_language(&mut self, file_path: &str) -> Option<AstLanguage> {
71        if let Some(cached) = self.language_cache.get(file_path) {
72            return cached.clone();
73        }
74
75        let ext = file_path.split('.').last().unwrap_or("");
76        let language = AstLanguage::from_extension(ext);
77
78        self.language_cache
79            .insert(file_path.to_string(), language.clone());
80        language
81    }
82
83    pub fn chunk_content(&mut self, content: &str, file_path: &str) -> Result<Vec<ChunkInfo>> {
84        let language = match self.detect_language(file_path) {
85            Some(lang) => lang,
86            None => return Ok(self.chunk_generic(content, file_path)),
87        };
88
89        // Use AST parser to get semantic chunks
90        // Create a temporary file path with the correct extension
91        let temp_path = format!(
92            "temp.{}",
93            match language {
94                AstLanguage::Python => "py",
95                AstLanguage::JavaScript => "js",
96                AstLanguage::TypeScript => "ts",
97                AstLanguage::Go => "go",
98                AstLanguage::Rust => "rs",
99            }
100        );
101        let ast_chunks = self
102            .ast_parser
103            .borrow_mut()
104            .parse_chunks(content, &temp_path)?;
105
106        let mut chunks = Vec::new();
107        for ast_chunk in ast_chunks {
108            let chunk = ChunkInfo {
109                start_line: ast_chunk.start_line,
110                end_line: ast_chunk.end_line,
111                chunk_type: ast_chunk.chunk_type,
112                content: ast_chunk.content.clone(),
113                importance_score: ast_chunk.importance_score,
114                estimated_tokens: estimate_tokens_for_content(&ast_chunk.content, file_path),
115                dependencies: Vec::new(), // Could be enhanced with dependency analysis
116            };
117            chunks.push(chunk);
118        }
119
120        Ok(chunks)
121    }
122
123    fn chunk_generic(&self, content: &str, file_path: &str) -> Vec<ChunkInfo> {
124        let lines: Vec<&str> = content.split('\n').collect();
125        let chunk_size = 20; // Lines per chunk
126        let mut chunks = Vec::new();
127
128        for (i, chunk_lines) in lines.chunks(chunk_size).enumerate() {
129            let start_line = i * chunk_size + 1;
130            let end_line = start_line + chunk_lines.len() - 1;
131            let content = chunk_lines.join("\n");
132
133            let chunk = ChunkInfo {
134                start_line,
135                end_line,
136                chunk_type: "generic".to_string(),
137                content: content.clone(),
138                importance_score: 0.5, // Default score for generic chunks
139                estimated_tokens: estimate_tokens_for_content(&content, file_path),
140                dependencies: Vec::new(),
141            };
142            chunks.push(chunk);
143        }
144
145        chunks
146    }
147
148    pub fn select_chunks_by_budget(&self, chunks: &[ChunkInfo], token_budget: usize) -> Vec<usize> {
149        // Sort chunks by importance score (descending)
150        let mut indexed_chunks: Vec<(usize, &ChunkInfo)> = chunks.iter().enumerate().collect();
151        indexed_chunks.sort_by(|a, b| {
152            b.1.importance_score
153                .partial_cmp(&a.1.importance_score)
154                .unwrap()
155        });
156
157        let mut selected_indices = Vec::new();
158        let mut used_tokens = 0;
159
160        for (index, chunk) in indexed_chunks {
161            if used_tokens + chunk.estimated_tokens <= token_budget {
162                selected_indices.push(index);
163                used_tokens += chunk.estimated_tokens;
164            }
165        }
166
167        // Sort indices to maintain original order
168        selected_indices.sort();
169        selected_indices
170    }
171}
172
173impl Default for CodeChunker {
174    fn default() -> Self {
175        let ast_parser = Rc::new(RefCell::new(
176            AstParser::new().expect("Failed to create AstParser"),
177        ));
178        Self::new(ast_parser)
179    }
180}
181
/// Extracts type signatures and interfaces for the highest fidelity reduction using tree-sitter.
pub struct SignatureExtractor {
    /// Shared tree-sitter parser (same instance the chunker uses when
    /// constructed via `DemotionEngine::new`).
    ast_parser: Rc<RefCell<AstParser>>,
}
186
187impl SignatureExtractor {
188    pub fn new(ast_parser: Rc<RefCell<AstParser>>) -> Self {
189        Self { ast_parser }
190    }
191
192    pub fn extract_signatures(&mut self, content: &str, file_path: &str) -> Result<Vec<String>> {
193        let language = AstLanguage::from_extension(file_path.split('.').last().unwrap_or(""));
194
195        let language = match language {
196            Some(lang) => lang,
197            None => return Ok(vec![self.extract_generic_signatures(content)]),
198        };
199
200        // Use AST parser to extract signatures
201        // Create a temporary file path with the correct extension
202        let temp_path = format!(
203            "temp.{}",
204            match language {
205                AstLanguage::Python => "py",
206                AstLanguage::JavaScript => "js",
207                AstLanguage::TypeScript => "ts",
208                AstLanguage::Go => "go",
209                AstLanguage::Rust => "rs",
210            }
211        );
212        let signatures = self
213            .ast_parser
214            .borrow_mut()
215            .extract_signatures(content, &temp_path)?;
216
217        Ok(signatures
218            .into_iter()
219            .map(|sig| format!("{}:{} // {}", sig.name, sig.signature_type, sig.signature))
220            .collect())
221    }
222
223    fn extract_generic_signatures(&self, content: &str) -> String {
224        // For unknown file types, try to extract function-like patterns
225        let lines: Vec<&str> = content.lines().collect();
226        let mut signatures = Vec::new();
227
228        for line in lines {
229            let trimmed = line.trim();
230            if trimmed.is_empty() || trimmed.starts_with("//") || trimmed.starts_with("#") {
231                continue;
232            }
233
234            // Look for function-like patterns (very basic heuristic)
235            if trimmed.contains("(")
236                && trimmed.contains(")")
237                && (trimmed.contains("def ")
238                    || trimmed.contains("function ")
239                    || trimmed.contains("fn ")
240                    || trimmed.contains("func "))
241            {
242                signatures.push(trimmed.to_string());
243            }
244        }
245
246        signatures.join("\n")
247    }
248}
249
250impl Default for SignatureExtractor {
251    fn default() -> Self {
252        let ast_parser = Rc::new(RefCell::new(
253            AstParser::new().expect("Failed to create AstParser"),
254        ));
255        Self::new(ast_parser)
256    }
257}
258
/// Main demotion engine that orchestrates the progressive content reduction.
pub struct DemotionEngine {
    /// Produces semantic chunks for `FidelityMode::Chunk`.
    chunker: CodeChunker,
    /// Produces declaration signatures for `FidelityMode::Signature`.
    signature_extractor: SignatureExtractor,
}
264
265impl DemotionEngine {
266    pub fn new() -> Result<Self> {
267        let ast_parser = Rc::new(RefCell::new(AstParser::new()?));
268        Ok(Self {
269            chunker: CodeChunker::new(ast_parser.clone()),
270            signature_extractor: SignatureExtractor::new(ast_parser),
271        })
272    }
273
274    pub fn demote_content(
275        &mut self,
276        content: &str,
277        file_path: &str,
278        target_mode: FidelityMode,
279        token_budget: Option<usize>,
280    ) -> Result<DemotionResult> {
281        let original_tokens = estimate_tokens_for_content(content, file_path);
282
283        match target_mode {
284            FidelityMode::Full => Ok(DemotionResult {
285                original_path: file_path.to_string(),
286                original_tokens,
287                demoted_tokens: original_tokens,
288                fidelity_mode: FidelityMode::Full,
289                content: content.to_string(),
290                chunks_kept: 1,
291                chunks_total: 1,
292                compression_ratio: 1.0,
293                quality_score: 1.0,
294            }),
295            FidelityMode::Chunk => {
296                self.demote_to_chunks(content, file_path, token_budget, original_tokens)
297            }
298            FidelityMode::Signature => {
299                self.demote_to_signatures(content, file_path, original_tokens)
300            }
301        }
302    }
303
304    fn demote_to_chunks(
305        &mut self,
306        content: &str,
307        file_path: &str,
308        token_budget: Option<usize>,
309        original_tokens: usize,
310    ) -> Result<DemotionResult> {
311        let chunks = self.chunker.chunk_content(content, file_path)?;
312        let chunks_total = chunks.len();
313
314        let selected_indices = if let Some(budget) = token_budget {
315            self.chunker.select_chunks_by_budget(&chunks, budget)
316        } else {
317            // Keep all chunks if no budget specified
318            (0..chunks.len()).collect()
319        };
320
321        let chunks_kept = selected_indices.len();
322        let selected_chunks: Vec<String> = selected_indices
323            .iter()
324            .map(|&i| chunks[i].content.clone())
325            .collect();
326
327        let demoted_content = if selected_chunks.is_empty() {
328            let structure = extract_symbol_signatures(content, file_path);
329            if structure.is_empty() {
330                // Fallback: create basic structure summary if no chunks extracted
331                let lines: Vec<&str> = content.lines().collect();
332                lines
333                    .iter()
334                    .filter(|line| !line.trim().is_empty())
335                    .take(10)
336                    .map(|s| s.to_string())
337                    .collect::<Vec<_>>()
338                    .join("\n")
339            } else {
340                structure.join("\n")
341            }
342        } else {
343            selected_chunks.join("\n\n// ... [content omitted] ...\n\n")
344        };
345
346        let demoted_tokens = if demoted_content.is_empty() {
347            if std::env::var("SCRIBE_DEBUG").is_ok() {
348                eprintln!(
349                    "CHUNK DEMOTION BUG: Empty demoted content for {}",
350                    file_path
351                );
352            }
353            1 // Minimum tokens for empty content
354        } else {
355            let tokens = estimate_tokens_for_content(&demoted_content, file_path);
356            if std::env::var("SCRIBE_DEBUG").is_ok() {
357                eprintln!(
358                    "CHUNK DEMOTION DEBUG: {} has {} chars -> {} tokens",
359                    file_path,
360                    demoted_content.len(),
361                    std::cmp::max(1, tokens)
362                );
363            }
364            std::cmp::max(1, tokens)
365        };
366
367        let quality_score = if chunks_total > 0 {
368            selected_indices
369                .iter()
370                .map(|&i| chunks[i].importance_score)
371                .sum::<f64>()
372                / chunks_total as f64
373        } else {
374            0.0
375        };
376
377        Ok(DemotionResult {
378            original_path: file_path.to_string(),
379            original_tokens,
380            demoted_tokens,
381            fidelity_mode: FidelityMode::Chunk,
382            content: demoted_content,
383            chunks_kept,
384            chunks_total,
385            compression_ratio: demoted_tokens as f64 / original_tokens as f64,
386            quality_score,
387        })
388    }
389
390    fn demote_to_signatures(
391        &mut self,
392        content: &str,
393        file_path: &str,
394        original_tokens: usize,
395    ) -> Result<DemotionResult> {
396        let signatures = self
397            .signature_extractor
398            .extract_signatures(content, file_path)?;
399
400        // If no signatures extracted, fall back to basic fallback
401        let demoted_content = if signatures.is_empty() {
402            let mut fallback = extract_symbol_signatures(content, file_path);
403            if fallback.is_empty() {
404                match self
405                    .chunker
406                    .ast_parser
407                    .borrow_mut()
408                    .parse_chunks(content, file_path)
409                {
410                    Ok(chunks) => {
411                        for chunk in chunks {
412                            if let Some(name) = chunk.name {
413                                fallback.push(format!("{} {}", chunk.chunk_type, name));
414                            }
415                        }
416
417                        if fallback.is_empty() {
418                            self.signature_extractor.extract_generic_signatures(content)
419                        } else {
420                            fallback.join("\n")
421                        }
422                    }
423                    Err(_) => self.signature_extractor.extract_generic_signatures(content),
424                }
425            } else {
426                fallback.join("\n")
427            }
428        } else {
429            signatures.join("\n")
430        };
431
432        // Better token estimation based on actual content
433        let demoted_tokens = if demoted_content.is_empty() {
434            if std::env::var("SCRIBE_DEBUG").is_ok() {
435                eprintln!("DEMOTION BUG: Empty demoted content for {}", file_path);
436            }
437            1 // Minimum tokens for empty content
438        } else {
439            let tokens = estimate_tokens_for_content(&demoted_content, file_path);
440            if std::env::var("SCRIBE_DEBUG").is_ok() {
441                eprintln!(
442                    "DEMOTION DEBUG: {} has {} chars -> {} tokens",
443                    file_path,
444                    demoted_content.len(),
445                    std::cmp::max(1, tokens)
446                );
447            }
448            std::cmp::max(1, tokens)
449        };
450
451        Ok(DemotionResult {
452            original_path: file_path.to_string(),
453            original_tokens,
454            demoted_tokens,
455            fidelity_mode: FidelityMode::Signature,
456            content: demoted_content,
457            chunks_kept: signatures.len(),
458            chunks_total: signatures.len(), // For signatures, kept = total
459            compression_ratio: demoted_tokens as f64 / original_tokens as f64,
460            quality_score: 0.8, // Signatures preserve high-level structure
461        })
462    }
463}
464
465impl Default for DemotionEngine {
466    fn default() -> Self {
467        Self::new().expect("Failed to create DemotionEngine")
468    }
469}
470
471fn estimate_tokens_for_content(content: &str, file_path: &str) -> usize {
472    let path_hint = Path::new(file_path);
473    TokenCounter::global()
474        .estimate_file_tokens(content, path_hint)
475        .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(content))
476}
477
478fn extract_symbol_signatures(content: &str, file_path: &str) -> Vec<String> {
479    let extension = Path::new(file_path)
480        .extension()
481        .and_then(|ext| ext.to_str())
482        .unwrap_or("")
483        .to_lowercase();
484
485    let pattern = match extension.as_str() {
486        "rs" => r"(?m)^\s*(pub\s+)?(async\s+)?(fn|struct|enum|trait)\s+[A-Za-z0-9_]+",
487        "py" => r"(?m)^\s*(def|class)\s+[A-Za-z0-9_]+",
488        "ts" | "tsx" | "js" | "jsx" => {
489            r"(?m)^\s*(export\s+)?(async\s+)?(function|class)\s+[A-Za-z0-9_]+"
490        }
491        "go" => r"(?m)^\s*func\s+[A-Za-z0-9_]+",
492        "java" => r"(?m)^\s*(public\s+)?(class|interface|enum)\s+[A-Za-z0-9_]+",
493        "cs" => r"(?m)^\s*(public\s+)?(class|interface|struct)\s+[A-Za-z0-9_]+",
494        _ => r"(?m)^\s*(fn|function|def|class)\s+[A-Za-z0-9_]+",
495    };
496
497    let regex = match Regex::new(pattern) {
498        Ok(re) => re,
499        Err(_) => return Vec::new(),
500    };
501
502    let mut seen = std::collections::HashSet::new();
503    let mut results = Vec::new();
504
505    for mat in regex.find_iter(content) {
506        let line = mat.as_str().trim().to_string();
507        if seen.insert(line.clone()) {
508            results.push(line);
509        }
510    }
511
512    results
513}
514
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension-based language detection, including the negative case.
    #[test]
    fn test_language_detection() {
        let ast_parser = Rc::new(RefCell::new(AstParser::new().unwrap()));
        let mut chunker = CodeChunker::new(ast_parser);

        assert_eq!(
            chunker.detect_language("test.py"),
            Some(AstLanguage::Python)
        );
        assert_eq!(
            chunker.detect_language("test.js"),
            Some(AstLanguage::JavaScript)
        );
        assert_eq!(
            chunker.detect_language("test.ts"),
            Some(AstLanguage::TypeScript)
        );
        assert_eq!(chunker.detect_language("test.go"), Some(AstLanguage::Go));
        assert_eq!(chunker.detect_language("test.rs"), Some(AstLanguage::Rust));
        assert_eq!(chunker.detect_language("test.txt"), None);
    }

    /// Engine construction succeeds and fidelity variants are distinct.
    #[test]
    fn test_fidelity_modes() {
        // Constructing the engine exercises parser setup; the binding is
        // intentionally unused beyond that (underscore avoids the warning).
        let _engine = DemotionEngine::new().unwrap();

        // Test that all fidelity modes are correctly represented.
        assert_eq!(FidelityMode::Full as u8, 0);
        assert_ne!(FidelityMode::Chunk, FidelityMode::Signature);
    }

    /// Budget selection prefers the higher-importance chunk when both
    /// cannot fit.
    #[test]
    fn test_chunk_budget_selection() {
        let ast_parser = Rc::new(RefCell::new(AstParser::new().unwrap()));
        let chunker = CodeChunker::new(ast_parser);

        let chunks = vec![
            ChunkInfo {
                start_line: 1,
                end_line: 5,
                chunk_type: "function".to_string(),
                content: "def test(): pass".to_string(),
                importance_score: 0.8,
                estimated_tokens: 10,
                dependencies: vec![],
            },
            ChunkInfo {
                start_line: 6,
                end_line: 10,
                chunk_type: "comment".to_string(),
                content: "# This is a comment".to_string(),
                importance_score: 0.2,
                estimated_tokens: 5,
                dependencies: vec![],
            },
        ];

        let selected = chunker.select_chunks_by_budget(&chunks, 12);
        assert_eq!(selected, vec![0]); // Should select the function with higher importance
    }
}