1use crate::ast_parser::{AstLanguage, AstParser};
10use regex::Regex;
11use scribe_core::tokenization::{utils as token_utils, TokenCounter};
12use scribe_core::{Result, ScribeError};
13use serde::{Deserialize, Serialize};
14use std::cell::RefCell;
15use std::collections::{HashMap, HashSet};
16use std::path::Path;
17use std::rc::Rc;
18
/// How much of a file's content is retained after demotion.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FidelityMode {
    /// Keep the entire file content unchanged.
    Full,
    /// Keep only the most important chunks (AST-based or line windows).
    Chunk,
    /// Keep only declaration signatures.
    Signature,
}
29
/// Outcome of demoting a single file to a lower fidelity level,
/// with the demoted content plus size/quality metrics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DemotionResult {
    /// Path of the file that was demoted.
    pub original_path: String,
    /// Estimated token count of the original content.
    pub original_tokens: usize,
    /// Estimated token count of the demoted content.
    pub demoted_tokens: usize,
    /// Fidelity level the content was demoted to.
    pub fidelity_mode: FidelityMode,
    /// The demoted content.
    pub content: String,
    /// Number of chunks retained in the output.
    pub chunks_kept: usize,
    /// Total number of chunks found in the original content.
    pub chunks_total: usize,
    /// demoted_tokens / original_tokens.
    pub compression_ratio: f64,
    /// Heuristic quality estimate (higher is better; 1.0 for full fidelity).
    pub quality_score: f64, }
43
/// A single chunk of file content with location, scoring, and size metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkInfo {
    /// 1-based first line of the chunk in the source file.
    pub start_line: usize,
    /// 1-based last line of the chunk (inclusive).
    pub end_line: usize,
    /// Kind of chunk, e.g. "function" or "generic".
    pub chunk_type: String,
    /// The chunk's raw text.
    pub content: String,
    /// Relative importance used for budget selection (0.5 for generic chunks).
    pub importance_score: f64,
    /// Estimated token count of `content`.
    pub estimated_tokens: usize,
    /// Names this chunk depends on; currently always left empty by the chunker.
    pub dependencies: Vec<String>, }
55
/// Splits file content into scored chunks: AST-based for recognized languages,
/// fixed-size line windows otherwise.
pub struct CodeChunker {
    /// Cache of file path -> detected language (`None` = unsupported).
    language_cache: HashMap<String, Option<AstLanguage>>,
    /// AST parser, shared (via Rc<RefCell>) with other engine components.
    ast_parser: Rc<RefCell<AstParser>>,
}
61
62impl CodeChunker {
63 pub fn new(ast_parser: Rc<RefCell<AstParser>>) -> Self {
64 Self {
65 language_cache: HashMap::new(),
66 ast_parser,
67 }
68 }
69
70 pub fn detect_language(&mut self, file_path: &str) -> Option<AstLanguage> {
71 if let Some(cached) = self.language_cache.get(file_path) {
72 return cached.clone();
73 }
74
75 let ext = file_path.split('.').last().unwrap_or("");
76 let language = AstLanguage::from_extension(ext);
77
78 self.language_cache
79 .insert(file_path.to_string(), language.clone());
80 language
81 }
82
83 pub fn chunk_content(&mut self, content: &str, file_path: &str) -> Result<Vec<ChunkInfo>> {
84 let language = match self.detect_language(file_path) {
85 Some(lang) => lang,
86 None => return Ok(self.chunk_generic(content, file_path)),
87 };
88
89 let temp_path = format!(
92 "temp.{}",
93 match language {
94 AstLanguage::Python => "py",
95 AstLanguage::JavaScript => "js",
96 AstLanguage::TypeScript => "ts",
97 AstLanguage::Go => "go",
98 AstLanguage::Rust => "rs",
99 }
100 );
101 let ast_chunks = self
102 .ast_parser
103 .borrow_mut()
104 .parse_chunks(content, &temp_path)?;
105
106 let mut chunks = Vec::new();
107 for ast_chunk in ast_chunks {
108 let chunk = ChunkInfo {
109 start_line: ast_chunk.start_line,
110 end_line: ast_chunk.end_line,
111 chunk_type: ast_chunk.chunk_type,
112 content: ast_chunk.content.clone(),
113 importance_score: ast_chunk.importance_score,
114 estimated_tokens: estimate_tokens_for_content(&ast_chunk.content, file_path),
115 dependencies: Vec::new(), };
117 chunks.push(chunk);
118 }
119
120 Ok(chunks)
121 }
122
123 fn chunk_generic(&self, content: &str, file_path: &str) -> Vec<ChunkInfo> {
124 let lines: Vec<&str> = content.split('\n').collect();
125 let chunk_size = 20; let mut chunks = Vec::new();
127
128 for (i, chunk_lines) in lines.chunks(chunk_size).enumerate() {
129 let start_line = i * chunk_size + 1;
130 let end_line = start_line + chunk_lines.len() - 1;
131 let content = chunk_lines.join("\n");
132
133 let chunk = ChunkInfo {
134 start_line,
135 end_line,
136 chunk_type: "generic".to_string(),
137 content: content.clone(),
138 importance_score: 0.5, estimated_tokens: estimate_tokens_for_content(&content, file_path),
140 dependencies: Vec::new(),
141 };
142 chunks.push(chunk);
143 }
144
145 chunks
146 }
147
148 pub fn select_chunks_by_budget(&self, chunks: &[ChunkInfo], token_budget: usize) -> Vec<usize> {
149 let mut indexed_chunks: Vec<(usize, &ChunkInfo)> = chunks.iter().enumerate().collect();
151 indexed_chunks.sort_by(|a, b| {
152 b.1.importance_score
153 .partial_cmp(&a.1.importance_score)
154 .unwrap()
155 });
156
157 let mut selected_indices = Vec::new();
158 let mut used_tokens = 0;
159
160 for (index, chunk) in indexed_chunks {
161 if used_tokens + chunk.estimated_tokens <= token_budget {
162 selected_indices.push(index);
163 used_tokens += chunk.estimated_tokens;
164 }
165 }
166
167 selected_indices.sort();
169 selected_indices
170 }
171}
172
173impl Default for CodeChunker {
174 fn default() -> Self {
175 let ast_parser = Rc::new(RefCell::new(
176 AstParser::new().expect("Failed to create AstParser"),
177 ));
178 Self::new(ast_parser)
179 }
180}
181
/// Extracts declaration signatures from file content, preferring the AST
/// parser and falling back to a line-based heuristic.
pub struct SignatureExtractor {
    /// AST parser, shared (via Rc<RefCell>) with other engine components.
    ast_parser: Rc<RefCell<AstParser>>,
}
186
187impl SignatureExtractor {
188 pub fn new(ast_parser: Rc<RefCell<AstParser>>) -> Self {
189 Self { ast_parser }
190 }
191
192 pub fn extract_signatures(&mut self, content: &str, file_path: &str) -> Result<Vec<String>> {
193 let language = AstLanguage::from_extension(file_path.split('.').last().unwrap_or(""));
194
195 let language = match language {
196 Some(lang) => lang,
197 None => return Ok(vec![self.extract_generic_signatures(content)]),
198 };
199
200 let temp_path = format!(
203 "temp.{}",
204 match language {
205 AstLanguage::Python => "py",
206 AstLanguage::JavaScript => "js",
207 AstLanguage::TypeScript => "ts",
208 AstLanguage::Go => "go",
209 AstLanguage::Rust => "rs",
210 }
211 );
212 let signatures = self
213 .ast_parser
214 .borrow_mut()
215 .extract_signatures(content, &temp_path)?;
216
217 Ok(signatures
218 .into_iter()
219 .map(|sig| format!("{}:{} // {}", sig.name, sig.signature_type, sig.signature))
220 .collect())
221 }
222
223 fn extract_generic_signatures(&self, content: &str) -> String {
224 let lines: Vec<&str> = content.lines().collect();
226 let mut signatures = Vec::new();
227
228 for line in lines {
229 let trimmed = line.trim();
230 if trimmed.is_empty() || trimmed.starts_with("//") || trimmed.starts_with("#") {
231 continue;
232 }
233
234 if trimmed.contains("(")
236 && trimmed.contains(")")
237 && (trimmed.contains("def ")
238 || trimmed.contains("function ")
239 || trimmed.contains("fn ")
240 || trimmed.contains("func "))
241 {
242 signatures.push(trimmed.to_string());
243 }
244 }
245
246 signatures.join("\n")
247 }
248}
249
250impl Default for SignatureExtractor {
251 fn default() -> Self {
252 let ast_parser = Rc::new(RefCell::new(
253 AstParser::new().expect("Failed to create AstParser"),
254 ));
255 Self::new(ast_parser)
256 }
257}
258
/// Orchestrates content demotion, combining chunk-level and signature-level
/// strategies over a single shared AST parser.
pub struct DemotionEngine {
    /// Chunk-level demotion backend.
    chunker: CodeChunker,
    /// Signature-level demotion backend (shares the chunker's AST parser).
    signature_extractor: SignatureExtractor,
}
264
265impl DemotionEngine {
266 pub fn new() -> Result<Self> {
267 let ast_parser = Rc::new(RefCell::new(AstParser::new()?));
268 Ok(Self {
269 chunker: CodeChunker::new(ast_parser.clone()),
270 signature_extractor: SignatureExtractor::new(ast_parser),
271 })
272 }
273
274 pub fn demote_content(
275 &mut self,
276 content: &str,
277 file_path: &str,
278 target_mode: FidelityMode,
279 token_budget: Option<usize>,
280 ) -> Result<DemotionResult> {
281 let original_tokens = estimate_tokens_for_content(content, file_path);
282
283 match target_mode {
284 FidelityMode::Full => Ok(DemotionResult {
285 original_path: file_path.to_string(),
286 original_tokens,
287 demoted_tokens: original_tokens,
288 fidelity_mode: FidelityMode::Full,
289 content: content.to_string(),
290 chunks_kept: 1,
291 chunks_total: 1,
292 compression_ratio: 1.0,
293 quality_score: 1.0,
294 }),
295 FidelityMode::Chunk => {
296 self.demote_to_chunks(content, file_path, token_budget, original_tokens)
297 }
298 FidelityMode::Signature => {
299 self.demote_to_signatures(content, file_path, original_tokens)
300 }
301 }
302 }
303
304 fn demote_to_chunks(
305 &mut self,
306 content: &str,
307 file_path: &str,
308 token_budget: Option<usize>,
309 original_tokens: usize,
310 ) -> Result<DemotionResult> {
311 let chunks = self.chunker.chunk_content(content, file_path)?;
312 let chunks_total = chunks.len();
313
314 let selected_indices = if let Some(budget) = token_budget {
315 self.chunker.select_chunks_by_budget(&chunks, budget)
316 } else {
317 (0..chunks.len()).collect()
319 };
320
321 let chunks_kept = selected_indices.len();
322 let selected_chunks: Vec<String> = selected_indices
323 .iter()
324 .map(|&i| chunks[i].content.clone())
325 .collect();
326
327 let demoted_content = if selected_chunks.is_empty() {
328 let structure = extract_symbol_signatures(content, file_path);
329 if structure.is_empty() {
330 let lines: Vec<&str> = content.lines().collect();
332 lines
333 .iter()
334 .filter(|line| !line.trim().is_empty())
335 .take(10)
336 .map(|s| s.to_string())
337 .collect::<Vec<_>>()
338 .join("\n")
339 } else {
340 structure.join("\n")
341 }
342 } else {
343 selected_chunks.join("\n\n// ... [content omitted] ...\n\n")
344 };
345
346 let demoted_tokens = if demoted_content.is_empty() {
347 if std::env::var("SCRIBE_DEBUG").is_ok() {
348 eprintln!(
349 "CHUNK DEMOTION BUG: Empty demoted content for {}",
350 file_path
351 );
352 }
353 1 } else {
355 let tokens = estimate_tokens_for_content(&demoted_content, file_path);
356 if std::env::var("SCRIBE_DEBUG").is_ok() {
357 eprintln!(
358 "CHUNK DEMOTION DEBUG: {} has {} chars -> {} tokens",
359 file_path,
360 demoted_content.len(),
361 std::cmp::max(1, tokens)
362 );
363 }
364 std::cmp::max(1, tokens)
365 };
366
367 let quality_score = if chunks_total > 0 {
368 selected_indices
369 .iter()
370 .map(|&i| chunks[i].importance_score)
371 .sum::<f64>()
372 / chunks_total as f64
373 } else {
374 0.0
375 };
376
377 Ok(DemotionResult {
378 original_path: file_path.to_string(),
379 original_tokens,
380 demoted_tokens,
381 fidelity_mode: FidelityMode::Chunk,
382 content: demoted_content,
383 chunks_kept,
384 chunks_total,
385 compression_ratio: demoted_tokens as f64 / original_tokens as f64,
386 quality_score,
387 })
388 }
389
390 fn demote_to_signatures(
391 &mut self,
392 content: &str,
393 file_path: &str,
394 original_tokens: usize,
395 ) -> Result<DemotionResult> {
396 let signatures = self
397 .signature_extractor
398 .extract_signatures(content, file_path)?;
399
400 let demoted_content = if signatures.is_empty() {
402 let mut fallback = extract_symbol_signatures(content, file_path);
403 if fallback.is_empty() {
404 match self
405 .chunker
406 .ast_parser
407 .borrow_mut()
408 .parse_chunks(content, file_path)
409 {
410 Ok(chunks) => {
411 for chunk in chunks {
412 if let Some(name) = chunk.name {
413 fallback.push(format!("{} {}", chunk.chunk_type, name));
414 }
415 }
416
417 if fallback.is_empty() {
418 self.signature_extractor.extract_generic_signatures(content)
419 } else {
420 fallback.join("\n")
421 }
422 }
423 Err(_) => self.signature_extractor.extract_generic_signatures(content),
424 }
425 } else {
426 fallback.join("\n")
427 }
428 } else {
429 signatures.join("\n")
430 };
431
432 let demoted_tokens = if demoted_content.is_empty() {
434 if std::env::var("SCRIBE_DEBUG").is_ok() {
435 eprintln!("DEMOTION BUG: Empty demoted content for {}", file_path);
436 }
437 1 } else {
439 let tokens = estimate_tokens_for_content(&demoted_content, file_path);
440 if std::env::var("SCRIBE_DEBUG").is_ok() {
441 eprintln!(
442 "DEMOTION DEBUG: {} has {} chars -> {} tokens",
443 file_path,
444 demoted_content.len(),
445 std::cmp::max(1, tokens)
446 );
447 }
448 std::cmp::max(1, tokens)
449 };
450
451 Ok(DemotionResult {
452 original_path: file_path.to_string(),
453 original_tokens,
454 demoted_tokens,
455 fidelity_mode: FidelityMode::Signature,
456 content: demoted_content,
457 chunks_kept: signatures.len(),
458 chunks_total: signatures.len(), compression_ratio: demoted_tokens as f64 / original_tokens as f64,
460 quality_score: 0.8, })
462 }
463}
464
impl Default for DemotionEngine {
    // Panics if the AST parser cannot be initialized; prefer `new()` when a
    // fallible construction path is acceptable.
    fn default() -> Self {
        Self::new().expect("Failed to create DemotionEngine")
    }
}
470
471fn estimate_tokens_for_content(content: &str, file_path: &str) -> usize {
472 let path_hint = Path::new(file_path);
473 TokenCounter::global()
474 .estimate_file_tokens(content, path_hint)
475 .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(content))
476}
477
478fn extract_symbol_signatures(content: &str, file_path: &str) -> Vec<String> {
479 let extension = Path::new(file_path)
480 .extension()
481 .and_then(|ext| ext.to_str())
482 .unwrap_or("")
483 .to_lowercase();
484
485 let pattern = match extension.as_str() {
486 "rs" => r"(?m)^\s*(pub\s+)?(async\s+)?(fn|struct|enum|trait)\s+[A-Za-z0-9_]+",
487 "py" => r"(?m)^\s*(def|class)\s+[A-Za-z0-9_]+",
488 "ts" | "tsx" | "js" | "jsx" => {
489 r"(?m)^\s*(export\s+)?(async\s+)?(function|class)\s+[A-Za-z0-9_]+"
490 }
491 "go" => r"(?m)^\s*func\s+[A-Za-z0-9_]+",
492 "java" => r"(?m)^\s*(public\s+)?(class|interface|enum)\s+[A-Za-z0-9_]+",
493 "cs" => r"(?m)^\s*(public\s+)?(class|interface|struct)\s+[A-Za-z0-9_]+",
494 _ => r"(?m)^\s*(fn|function|def|class)\s+[A-Za-z0-9_]+",
495 };
496
497 let regex = match Regex::new(pattern) {
498 Ok(re) => re,
499 Err(_) => return Vec::new(),
500 };
501
502 let mut seen = std::collections::HashSet::new();
503 let mut results = Vec::new();
504
505 for mat in regex.find_iter(content) {
506 let line = mat.as_str().trim().to_string();
507 if seen.insert(line.clone()) {
508 results.push(line);
509 }
510 }
511
512 results
513}
514
#[cfg(test)]
mod tests {
    use super::*;

    /// Detection maps known extensions to their language and others to None.
    #[test]
    fn test_language_detection() {
        let ast_parser = Rc::new(RefCell::new(AstParser::new().unwrap()));
        let mut chunker = CodeChunker::new(ast_parser);

        assert_eq!(
            chunker.detect_language("test.py"),
            Some(AstLanguage::Python)
        );
        assert_eq!(
            chunker.detect_language("test.js"),
            Some(AstLanguage::JavaScript)
        );
        assert_eq!(
            chunker.detect_language("test.ts"),
            Some(AstLanguage::TypeScript)
        );
        assert_eq!(chunker.detect_language("test.go"), Some(AstLanguage::Go));
        assert_eq!(chunker.detect_language("test.rs"), Some(AstLanguage::Rust));
        assert_eq!(chunker.detect_language("test.txt"), None);
    }

    /// Engine construction succeeds and the fidelity enum behaves as expected.
    #[test]
    fn test_fidelity_modes() {
        // Underscored: the binding only exercises DemotionEngine::new's
        // wiring; the assertions below touch just the enum (fixes the
        // unused-variable warning from `let engine`).
        let _engine = DemotionEngine::new().unwrap();

        assert_eq!(FidelityMode::Full as u8, 0);
        assert_ne!(FidelityMode::Chunk, FidelityMode::Signature);
    }

    /// Budget selection keeps the high-importance chunk and drops the one
    /// that would exceed the budget.
    #[test]
    fn test_chunk_budget_selection() {
        let ast_parser = Rc::new(RefCell::new(AstParser::new().unwrap()));
        let chunker = CodeChunker::new(ast_parser);

        let chunks = vec![
            ChunkInfo {
                start_line: 1,
                end_line: 5,
                chunk_type: "function".to_string(),
                content: "def test(): pass".to_string(),
                importance_score: 0.8,
                estimated_tokens: 10,
                dependencies: vec![],
            },
            ChunkInfo {
                start_line: 6,
                end_line: 10,
                chunk_type: "comment".to_string(),
                content: "# This is a comment".to_string(),
                importance_score: 0.2,
                estimated_tokens: 5,
                dependencies: vec![],
            },
        ];

        // Budget 12: chunk 0 (10 tokens) fits; adding chunk 1 would need 15.
        let selected = chunker.select_chunks_by_budget(&chunks, 12);
        assert_eq!(selected, vec![0]);
    }
}