1use serde::{Serialize, Deserialize};
10use std::collections::HashMap;
11use scribe_core::{Result, ScribeError};
12use crate::ast_parser::{AstParser, AstLanguage, AstChunk as AstParserChunk, AstSignature};
13
/// How much of a file's content is retained after demotion.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FidelityMode {
    /// Entire file content kept verbatim (identity transform).
    Full,
    /// Only the most important chunks are kept.
    Chunk,
    /// Only declaration signatures are kept.
    Signature,
}
24
/// Outcome of demoting one file's content to a lower fidelity level.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DemotionResult {
    /// Path of the file the content came from.
    pub original_path: String,
    /// Estimated token count of the original content (len / 4 heuristic).
    pub original_tokens: usize,
    /// Estimated token count of the demoted content (same heuristic).
    pub demoted_tokens: usize,
    /// Fidelity level the content was demoted to.
    pub fidelity_mode: FidelityMode,
    /// The demoted content itself.
    pub content: String,
    /// Number of chunks retained in the output.
    pub chunks_kept: usize,
    /// Total number of chunks the content was split into.
    pub chunks_total: usize,
    /// demoted_tokens / original_tokens.
    pub compression_ratio: f64,
    /// Heuristic estimate of how much information survived demotion.
    pub quality_score: f64,
}
38
/// A contiguous region of a source file produced by chunking.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkInfo {
    /// 1-based first line of the chunk.
    pub start_line: usize,
    /// 1-based last line of the chunk (inclusive).
    pub end_line: usize,
    /// Kind label, e.g. "function" or "generic" for line-window fallback chunks.
    pub chunk_type: String,
    /// Raw text of the chunk.
    pub content: String,
    /// Relative importance used for budget selection (higher = keep first).
    pub importance_score: f64,
    /// Token estimate for budgeting (content.len() / 4 heuristic).
    pub estimated_tokens: usize,
    /// Identifiers this chunk depends on; currently always empty — TODO confirm
    /// whether dependency tracking is populated elsewhere.
    pub dependencies: Vec<String>,
}
50
/// Splits file content into chunks, AST-aware when the language is known.
pub struct CodeChunker {
    /// Memoized extension-based language detection, keyed by file path.
    language_cache: HashMap<String, Option<AstLanguage>>,
    /// Parser used for language-aware chunking.
    ast_parser: AstParser,
}
56
57impl CodeChunker {
58 pub fn new() -> Result<Self> {
59 Ok(Self {
60 language_cache: HashMap::new(),
61 ast_parser: AstParser::new()?,
62 })
63 }
64
65 pub fn detect_language(&mut self, file_path: &str) -> Option<AstLanguage> {
66 if let Some(cached) = self.language_cache.get(file_path) {
67 return cached.clone();
68 }
69
70 let ext = file_path.split('.').last().unwrap_or("");
71 let language = AstLanguage::from_extension(ext);
72
73 self.language_cache.insert(file_path.to_string(), language.clone());
74 language
75 }
76
77 pub fn chunk_content(&mut self, content: &str, file_path: &str) -> Result<Vec<ChunkInfo>> {
78 let language = match self.detect_language(file_path) {
79 Some(lang) => lang,
80 None => return Ok(self.chunk_generic(content)),
81 };
82
83 let temp_path = format!("temp.{}", match language {
86 AstLanguage::Python => "py",
87 AstLanguage::JavaScript => "js",
88 AstLanguage::TypeScript => "ts",
89 AstLanguage::Go => "go",
90 AstLanguage::Rust => "rs",
91 });
92 let ast_chunks = self.ast_parser.parse_chunks(content, &temp_path)?;
93
94 let mut chunks = Vec::new();
95 for ast_chunk in ast_chunks {
96 let chunk = ChunkInfo {
97 start_line: ast_chunk.start_line,
98 end_line: ast_chunk.end_line,
99 chunk_type: ast_chunk.chunk_type,
100 content: ast_chunk.content.clone(),
101 importance_score: ast_chunk.importance_score,
102 estimated_tokens: ast_chunk.content.len() / 4, dependencies: Vec::new(), };
105 chunks.push(chunk);
106 }
107
108 Ok(chunks)
109 }
110
111 fn chunk_generic(&self, content: &str) -> Vec<ChunkInfo> {
112 let lines: Vec<&str> = content.split('\n').collect();
113 let chunk_size = 20; let mut chunks = Vec::new();
115
116 for (i, chunk_lines) in lines.chunks(chunk_size).enumerate() {
117 let start_line = i * chunk_size + 1;
118 let end_line = start_line + chunk_lines.len() - 1;
119 let content = chunk_lines.join("\n");
120
121 let chunk = ChunkInfo {
122 start_line,
123 end_line,
124 chunk_type: "generic".to_string(),
125 content: content.clone(),
126 importance_score: 0.5, estimated_tokens: content.len() / 4,
128 dependencies: Vec::new(),
129 };
130 chunks.push(chunk);
131 }
132
133 chunks
134 }
135
136 pub fn select_chunks_by_budget(&self, chunks: &[ChunkInfo], token_budget: usize) -> Vec<usize> {
137 let mut indexed_chunks: Vec<(usize, &ChunkInfo)> = chunks.iter().enumerate().collect();
139 indexed_chunks.sort_by(|a, b| b.1.importance_score.partial_cmp(&a.1.importance_score).unwrap());
140
141 let mut selected_indices = Vec::new();
142 let mut used_tokens = 0;
143
144 for (index, chunk) in indexed_chunks {
145 if used_tokens + chunk.estimated_tokens <= token_budget {
146 selected_indices.push(index);
147 used_tokens += chunk.estimated_tokens;
148 }
149 }
150
151 selected_indices.sort();
153 selected_indices
154 }
155}
156
157impl Default for CodeChunker {
158 fn default() -> Self {
159 Self::new().expect("Failed to create CodeChunker")
160 }
161}
162
/// Extracts declaration signatures from source content via the AST parser,
/// with a line-based heuristic fallback for unknown languages.
pub struct SignatureExtractor {
    /// Parser used for language-aware signature extraction.
    ast_parser: AstParser,
}
167
168impl SignatureExtractor {
169 pub fn new() -> Result<Self> {
170 Ok(Self {
171 ast_parser: AstParser::new()?,
172 })
173 }
174
175 pub fn extract_signatures(&mut self, content: &str, file_path: &str) -> Result<Vec<String>> {
176 let language = AstLanguage::from_extension(
177 file_path.split('.').last().unwrap_or("")
178 );
179
180 let language = match language {
181 Some(lang) => lang,
182 None => return Ok(vec![self.extract_generic_signatures(content)]),
183 };
184
185 let temp_path = format!("temp.{}", match language {
188 AstLanguage::Python => "py",
189 AstLanguage::JavaScript => "js",
190 AstLanguage::TypeScript => "ts",
191 AstLanguage::Go => "go",
192 AstLanguage::Rust => "rs",
193 });
194 let signatures = self.ast_parser.extract_signatures(content, &temp_path)?;
195
196 Ok(signatures.into_iter().map(|sig| {
197 format!("{}:{} // {}", sig.name, sig.signature_type, sig.signature)
198 }).collect())
199 }
200
201 fn extract_generic_signatures(&self, content: &str) -> String {
202 let lines: Vec<&str> = content.lines().collect();
204 let mut signatures = Vec::new();
205
206 for line in lines {
207 let trimmed = line.trim();
208 if trimmed.is_empty() || trimmed.starts_with("//") || trimmed.starts_with("#") {
209 continue;
210 }
211
212 if trimmed.contains("(") && trimmed.contains(")") &&
214 (trimmed.contains("def ") || trimmed.contains("function ") ||
215 trimmed.contains("fn ") || trimmed.contains("func ")) {
216 signatures.push(trimmed.to_string());
217 }
218 }
219
220 signatures.join("\n")
221 }
222}
223
224impl Default for SignatureExtractor {
225 fn default() -> Self {
226 Self::new().expect("Failed to create SignatureExtractor")
227 }
228}
229
/// Orchestrates demotion of file content to a target fidelity level.
pub struct DemotionEngine {
    /// Produces chunked representations for `FidelityMode::Chunk`.
    chunker: CodeChunker,
    /// Produces signature-only representations for `FidelityMode::Signature`.
    signature_extractor: SignatureExtractor,
}
235
236impl DemotionEngine {
237 pub fn new() -> Result<Self> {
238 Ok(Self {
239 chunker: CodeChunker::new()?,
240 signature_extractor: SignatureExtractor::new()?,
241 })
242 }
243
244 pub fn demote_content(
245 &mut self,
246 content: &str,
247 file_path: &str,
248 target_mode: FidelityMode,
249 token_budget: Option<usize>,
250 ) -> Result<DemotionResult> {
251 let original_tokens = content.len() / 4; match target_mode {
254 FidelityMode::Full => {
255 Ok(DemotionResult {
256 original_path: file_path.to_string(),
257 original_tokens,
258 demoted_tokens: original_tokens,
259 fidelity_mode: FidelityMode::Full,
260 content: content.to_string(),
261 chunks_kept: 1,
262 chunks_total: 1,
263 compression_ratio: 1.0,
264 quality_score: 1.0,
265 })
266 }
267 FidelityMode::Chunk => {
268 self.demote_to_chunks(content, file_path, token_budget, original_tokens)
269 }
270 FidelityMode::Signature => {
271 self.demote_to_signatures(content, file_path, original_tokens)
272 }
273 }
274 }
275
276 fn demote_to_chunks(
277 &mut self,
278 content: &str,
279 file_path: &str,
280 token_budget: Option<usize>,
281 original_tokens: usize,
282 ) -> Result<DemotionResult> {
283 let chunks = self.chunker.chunk_content(content, file_path)?;
284 let chunks_total = chunks.len();
285
286 let selected_indices = if let Some(budget) = token_budget {
287 self.chunker.select_chunks_by_budget(&chunks, budget)
288 } else {
289 (0..chunks.len()).collect()
291 };
292
293 let chunks_kept = selected_indices.len();
294 let selected_chunks: Vec<String> = selected_indices
295 .iter()
296 .map(|&i| chunks[i].content.clone())
297 .collect();
298
299 let demoted_content = selected_chunks.join("\n\n// ... [content omitted] ...\n\n");
300 let demoted_tokens = demoted_content.len() / 4;
301
302 let quality_score = if chunks_total > 0 {
303 selected_indices
304 .iter()
305 .map(|&i| chunks[i].importance_score)
306 .sum::<f64>() / chunks_total as f64
307 } else {
308 0.0
309 };
310
311 Ok(DemotionResult {
312 original_path: file_path.to_string(),
313 original_tokens,
314 demoted_tokens,
315 fidelity_mode: FidelityMode::Chunk,
316 content: demoted_content,
317 chunks_kept,
318 chunks_total,
319 compression_ratio: demoted_tokens as f64 / original_tokens as f64,
320 quality_score,
321 })
322 }
323
324 fn demote_to_signatures(
325 &mut self,
326 content: &str,
327 file_path: &str,
328 original_tokens: usize,
329 ) -> Result<DemotionResult> {
330 let signatures = self.signature_extractor.extract_signatures(content, file_path)?;
331 let demoted_content = signatures.join("\n");
332 let demoted_tokens = demoted_content.len() / 4;
333
334 Ok(DemotionResult {
335 original_path: file_path.to_string(),
336 original_tokens,
337 demoted_tokens,
338 fidelity_mode: FidelityMode::Signature,
339 content: demoted_content,
340 chunks_kept: signatures.len(),
341 chunks_total: signatures.len(), compression_ratio: demoted_tokens as f64 / original_tokens as f64,
343 quality_score: 0.8, })
345 }
346}
347
348impl Default for DemotionEngine {
349 fn default() -> Self {
350 Self::new().expect("Failed to create DemotionEngine")
351 }
352}
353
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension-to-language mapping, including the unknown-extension case.
    #[test]
    fn test_language_detection() {
        let mut chunker = CodeChunker::new().unwrap();

        assert_eq!(chunker.detect_language("test.py"), Some(AstLanguage::Python));
        assert_eq!(chunker.detect_language("test.js"), Some(AstLanguage::JavaScript));
        assert_eq!(chunker.detect_language("test.ts"), Some(AstLanguage::TypeScript));
        assert_eq!(chunker.detect_language("test.go"), Some(AstLanguage::Go));
        assert_eq!(chunker.detect_language("test.rs"), Some(AstLanguage::Rust));
        assert_eq!(chunker.detect_language("test.txt"), None);
    }

    /// Variant identity and ordering invariants of `FidelityMode`.
    #[test]
    fn test_fidelity_modes() {
        // Exercise the fallible constructor; the binding is intentionally
        // unused (the original un-prefixed `engine` produced a warning).
        let _engine = DemotionEngine::new().unwrap();

        // `Full` must remain the first variant (discriminant 0).
        assert_eq!(FidelityMode::Full as u8, 0);
        assert_ne!(FidelityMode::Chunk, FidelityMode::Signature);
    }

    /// Budget selection keeps the important chunk and drops the one that
    /// would exceed the budget (10 + 5 > 12).
    #[test]
    fn test_chunk_budget_selection() {
        let chunker = CodeChunker::new().unwrap();

        let chunks = vec![
            ChunkInfo {
                start_line: 1,
                end_line: 5,
                chunk_type: "function".to_string(),
                content: "def test(): pass".to_string(),
                importance_score: 0.8,
                estimated_tokens: 10,
                dependencies: vec![],
            },
            ChunkInfo {
                start_line: 6,
                end_line: 10,
                chunk_type: "comment".to_string(),
                content: "# This is a comment".to_string(),
                importance_score: 0.2,
                estimated_tokens: 5,
                dependencies: vec![],
            },
        ];

        let selected = chunker.select_chunks_by_budget(&chunks, 12);
        assert_eq!(selected, vec![0]);
    }
}