Skip to main content

sqz_engine/
file_reader.rs

1//! Multi-mode file reader supporting 8 reading modes for optimal compression.
2//!
3//! Modes: `full`, `map`, `signatures`, `diff`, `aggressive`, `entropy`, `task`, `lines`.
4
5use std::collections::HashMap;
6use std::ops::Range;
7use std::path::Path;
8
9use rusqlite::{params, Connection};
10use serde::{Deserialize, Serialize};
11
12use crate::ast_parser::AstParser;
13use crate::error::{Result, SqzError};
14
15// ── ReadMode ──────────────────────────────────────────────────────────────────
16
/// The 8 file reading modes.
///
/// Serialized with lowercase variant names via serde (`"full"`, `"map"`, …).
/// The `Lines` variant carries a 0-based, end-exclusive line range (it is
/// clamped to the file and widened with context by `FileReader::read_lines`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum FileReadMode {
    /// Complete file content, no compression.
    Full,
    /// Structural overview (~50 tokens for a 300-line file).
    Map,
    /// Function/class signatures only via AST parser.
    Signatures,
    /// Changes since last cached read.
    Diff,
    /// Maximum compression (entropy + signatures combined).
    Aggressive,
    /// High-information sections only (Shannon entropy).
    Entropy,
    /// Task-relevant sections based on current intent via FTS5.
    Task,
    /// Specific line ranges with surrounding context.
    Lines(Range<usize>),
}
38
39// ── ReadResult ────────────────────────────────────────────────────────────────
40
/// Result of a file read operation.
#[derive(Debug, Clone)]
pub struct ReadResult {
    /// The (possibly compressed) content produced by the selected mode.
    pub content: String,
    /// Name of the mode that produced this result (e.g. "full", "map").
    pub mode: String,
    /// Approximate token count of the original source (~4 chars per token).
    pub tokens_original: u32,
    /// Approximate token count of `content`.
    pub tokens_result: u32,
}
49
50// ── Entropy helpers ───────────────────────────────────────────────────────────
51
/// A logical block of source code with its entropy score.
#[derive(Debug, Clone)]
pub struct BlockEntropy {
    /// 0-based index of the block's first line.
    pub start_line: usize,
    /// 0-based end index, exclusive (one past the last line of the block).
    pub end_line: usize,
    /// Shannon entropy of `text` as computed by `shannon_entropy`.
    pub entropy: f64,
    /// The block's text, internal newlines preserved, no trailing newline.
    pub text: String,
}
60
/// Compute Shannon entropy (bits per character) for a string.
///
/// Frequencies are computed over `char`s, so the divisor must also be the
/// character count. Returns 0.0 for an empty string.
fn shannon_entropy(text: &str) -> f64 {
    if text.is_empty() {
        return 0.0;
    }
    let mut freq: HashMap<char, usize> = HashMap::new();
    let mut total_chars: usize = 0;
    for ch in text.chars() {
        *freq.entry(ch).or_insert(0) += 1;
        total_chars += 1;
    }
    // BUG FIX: the divisor was `text.len()` (a BYTE count) while frequencies
    // were per CHARACTER. For any non-ASCII text the probabilities then fail
    // to sum to 1 and the entropy is wrong (e.g. "ßßß" scored 0.5 instead of
    // the correct 0.0).
    let total = total_chars as f64;
    // Every count is >= 1, so p > 0 always holds and needs no guard.
    freq.values()
        .map(|&count| count as f64 / total)
        .map(|p| -p * p.log2())
        .sum()
}
80
81/// Split source into logical blocks (separated by blank lines) and compute
82/// entropy for each block.
83fn compute_block_entropies(source: &str) -> Vec<BlockEntropy> {
84    let lines: Vec<&str> = source.lines().collect();
85    let mut blocks = Vec::new();
86    let mut block_start = 0;
87    let mut current_block = String::new();
88
89    for (i, line) in lines.iter().enumerate() {
90        if line.trim().is_empty() {
91            if !current_block.trim().is_empty() {
92                blocks.push(BlockEntropy {
93                    start_line: block_start,
94                    end_line: i,
95                    entropy: shannon_entropy(&current_block),
96                    text: current_block.clone(),
97                });
98            }
99            current_block.clear();
100            block_start = i + 1;
101        } else {
102            if current_block.is_empty() {
103                block_start = i;
104            }
105            if !current_block.is_empty() {
106                current_block.push('\n');
107            }
108            current_block.push_str(line);
109        }
110    }
111    // Flush last block
112    if !current_block.trim().is_empty() {
113        blocks.push(BlockEntropy {
114            start_line: block_start,
115            end_line: lines.len(),
116            entropy: shannon_entropy(&current_block),
117            text: current_block,
118        });
119    }
120    blocks
121}
122
123/// Return blocks above the given percentile threshold.
124fn filter_high_entropy(blocks: &[BlockEntropy], percentile: f64) -> Vec<&BlockEntropy> {
125    if blocks.is_empty() {
126        return Vec::new();
127    }
128    let mut entropies: Vec<f64> = blocks.iter().map(|b| b.entropy).collect();
129    entropies.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
130    let idx = ((percentile / 100.0) * (entropies.len() as f64 - 1.0)).round() as usize;
131    let threshold = entropies[idx.min(entropies.len() - 1)];
132    blocks.iter().filter(|b| b.entropy >= threshold).collect()
133}
134
135// ── FTS5 task-mode helpers ────────────────────────────────────────────────────
136
137/// Index file chunks into an in-memory FTS5 table and return sections matching
138/// the intent via BM25 ranking.
139fn fts5_task_filter(source: &str, intent: &str) -> Result<String> {
140    let chunks = chunk_by_blocks(source);
141    if chunks.is_empty() {
142        return Ok(String::new());
143    }
144
145    let conn = Connection::open_in_memory()
146        .map_err(|e| SqzError::Other(format!("FTS5 in-memory open failed: {e}")))?;
147
148    conn.execute_batch(
149        r#"
150        CREATE VIRTUAL TABLE IF NOT EXISTS file_fts USING fts5(
151            chunk_id,
152            body,
153            tokenize='porter ascii'
154        );
155        "#,
156    )
157    .map_err(|e| SqzError::Other(format!("FTS5 schema creation failed: {e}")))?;
158
159    for (i, chunk) in chunks.iter().enumerate() {
160        conn.execute(
161            "INSERT INTO file_fts(chunk_id, body) VALUES (?1, ?2)",
162            params![i.to_string(), chunk],
163        )
164        .map_err(|e| SqzError::Other(format!("FTS5 insert failed: {e}")))?;
165    }
166
167    // Sanitize intent for FTS5 query
168    let sanitized: String = intent
169        .chars()
170        .map(|c| if c.is_alphanumeric() || c.is_whitespace() { c } else { ' ' })
171        .collect();
172    let terms: Vec<&str> = sanitized.split_whitespace().collect();
173    if terms.is_empty() {
174        // No usable terms — return full content
175        return Ok(source.to_string());
176    }
177
178    let fts_query = terms.join(" OR ");
179
180    let mut stmt = conn
181        .prepare(
182            r#"SELECT body FROM file_fts
183               WHERE file_fts MATCH ?1
184               ORDER BY rank
185               LIMIT 20"#,
186        )
187        .map_err(|e| SqzError::Other(format!("FTS5 query prepare failed: {e}")))?;
188
189    let rows = stmt
190        .query_map(params![fts_query], |row| row.get::<_, String>(0))
191        .map_err(|e| SqzError::Other(format!("FTS5 query failed: {e}")))?;
192
193    let mut results = Vec::new();
194    for row in rows {
195        results.push(row.map_err(|e| SqzError::Other(format!("FTS5 row read failed: {e}")))?);
196    }
197
198    if results.is_empty() {
199        // No matches — return full content as fallback
200        return Ok(source.to_string());
201    }
202
203    Ok(results.join("\n\n"))
204}
205
/// Chunk source by blank-line-separated blocks (same strategy as sandbox).
///
/// Paragraphs longer than `MAX_CHUNK_BYTES` are further split on line
/// boundaries; if nothing else was produced, the trimmed whole text becomes
/// a single fallback chunk.
fn chunk_by_blocks(text: &str) -> Vec<String> {
    const MAX_CHUNK_BYTES: usize = 512;
    let mut out: Vec<String> = Vec::new();

    for raw_para in text.split("\n\n") {
        let para = raw_para.trim();
        if para.is_empty() {
            continue;
        }
        if para.len() <= MAX_CHUNK_BYTES {
            out.push(para.to_owned());
            continue;
        }
        // Oversized paragraph: accumulate lines until adding the next one
        // would exceed the budget, then flush and continue. A single line
        // longer than the budget still becomes one (oversized) chunk.
        let mut acc = String::new();
        for line in para.lines() {
            let would_overflow =
                !acc.is_empty() && acc.len() + line.len() + 1 > MAX_CHUNK_BYTES;
            if would_overflow {
                out.push(std::mem::take(&mut acc));
            }
            if !acc.is_empty() {
                acc.push('\n');
            }
            acc.push_str(line);
        }
        if !acc.is_empty() {
            out.push(acc);
        }
    }

    if out.is_empty() && !text.trim().is_empty() {
        out.push(text.trim().to_owned());
    }
    out
}
241
242// ── Approximate token count ───────────────────────────────────────────────────
243
/// Approximate token count: roughly 4 bytes per token, rounded up.
fn approx_tokens(s: &str) -> u32 {
    // Integer ceiling of len/4, saturating at u32::MAX like the float cast.
    let tokens = s.len() / 4 + usize::from(s.len() % 4 != 0);
    tokens.try_into().unwrap_or(u32::MAX)
}
247
248// ── FileReader ────────────────────────────────────────────────────────────────
249
/// Multi-mode file reader that produces compressed output based on the
/// selected reading mode.
pub struct FileReader {
    // Parser used by the map/signatures/aggressive modes.
    ast_parser: AstParser,
    // Percentile cut-off for entropy filtering (blocks at/above are kept).
    entropy_percentile: f64,
    // Number of context lines shown around diff hunks and line ranges.
    context_lines: usize,
}
257
258impl FileReader {
259    /// Create a new `FileReader` with default settings.
260    ///
261    /// - `entropy_percentile`: retain blocks above this percentile (default 60.0).
262    /// - `context_lines`: lines of context around line ranges (default 3).
263    pub fn new() -> Self {
264        Self {
265            ast_parser: AstParser::new(),
266            entropy_percentile: 60.0,
267            context_lines: 3,
268        }
269    }
270
271    /// Create a `FileReader` with custom entropy percentile and context lines.
272    pub fn with_config(entropy_percentile: f64, context_lines: usize) -> Self {
273        Self {
274            ast_parser: AstParser::new(),
275            entropy_percentile,
276            context_lines,
277        }
278    }
279
280    /// Read a file using the specified mode.
281    ///
282    /// - `path`: file path (used for language detection in signatures/map modes).
283    /// - `source`: the file content as a string.
284    /// - `mode`: one of the 8 reading modes.
285    /// - `intent`: optional task intent for `Task` mode.
286    /// - `cached_content`: optional previously cached content for `Diff` mode.
287    pub fn read(
288        &self,
289        path: &Path,
290        source: &str,
291        mode: &FileReadMode,
292        intent: Option<&str>,
293        cached_content: Option<&str>,
294    ) -> Result<ReadResult> {
295        let tokens_original = approx_tokens(source);
296
297        match mode {
298            FileReadMode::Full => self.read_full(source, tokens_original),
299            FileReadMode::Map => self.read_map(path, source, tokens_original),
300            FileReadMode::Signatures => self.read_signatures(path, source, tokens_original),
301            FileReadMode::Diff => self.read_diff(source, cached_content, tokens_original),
302            FileReadMode::Aggressive => self.read_aggressive(path, source, tokens_original),
303            FileReadMode::Entropy => self.read_entropy(source, tokens_original),
304            FileReadMode::Task => self.read_task(source, intent, tokens_original),
305            FileReadMode::Lines(range) => {
306                self.read_lines(source, range.clone(), tokens_original)
307            }
308        }
309    }
310
311    /// Full mode: return the complete file content unchanged.
312    fn read_full(&self, source: &str, tokens_original: u32) -> Result<ReadResult> {
313        Ok(ReadResult {
314            content: source.to_string(),
315            mode: "full".to_string(),
316            tokens_original,
317            tokens_result: tokens_original,
318        })
319    }
320
    /// Map mode: produce a structural overview ≤50 tokens for a typical
    /// 300-line file. Shows module hierarchy, exports, dependencies, and
    /// type signatures in a compact format.
    ///
    /// The output is assembled as a list of lines ("parts"): a header, then
    /// optional AST-derived summaries (imports/types/structs/fns), trimmed
    /// from the end until the result fits a 50-token budget. When the AST
    /// yields nothing (or trimming removed every detail), a keyword-based
    /// section count is appended instead.
    fn read_map(&self, path: &Path, source: &str, tokens_original: u32) -> Result<ReadResult> {
        let lang = detect_language(path);
        let mut parts: Vec<String> = Vec::new();

        // File header: "<name> (<N> lines)". Always the first part; the
        // budget loop below never removes it (it stops at parts.len() == 1).
        let line_count = source.lines().count();
        parts.push(format!(
            "# {} ({} lines)",
            path.file_name()
                .map(|n| n.to_string_lossy().to_string())
                .unwrap_or_default(),
            line_count
        ));

        if let Some(lang) = &lang {
            if self.ast_parser.is_supported(lang) {
                if let Ok(summary) = self.ast_parser.extract_signatures(source, lang) {
                    // Imports (compact count only)
                    if !summary.imports.is_empty() {
                        let count = summary.imports.len();
                        parts.push(format!("imports: {count}"));
                    }
                    // Deduplicate type names and show count
                    if !summary.types.is_empty() {
                        let mut names: Vec<&str> =
                            summary.types.iter().map(|t| t.name.as_str()).collect();
                        names.sort_unstable();
                        names.dedup();
                        parts.push(format!("types({}): {}", names.len(), names.join(", ")));
                    }
                    // Deduplicate class/struct names and show count
                    if !summary.classes.is_empty() {
                        let mut names: Vec<&str> =
                            summary.classes.iter().map(|c| c.name.as_str()).collect();
                        names.sort_unstable();
                        names.dedup();
                        parts.push(format!("structs({}): {}", names.len(), names.join(", ")));
                    }
                    // Deduplicate function names and show count
                    if !summary.functions.is_empty() {
                        let mut names: Vec<&str> =
                            summary.functions.iter().map(|f| f.name.as_str()).collect();
                        names.sort_unstable();
                        names.dedup();
                        parts.push(format!("fns({}): {}", names.len(), names.join(", ")));
                    }
                }
            }
        }

        // Enforce ≤50 token budget: truncate parts if needed
        const MAP_TOKEN_BUDGET: u32 = 50;
        loop {
            let content = parts.join("\n");
            if approx_tokens(&content) <= MAP_TOKEN_BUDGET || parts.len() <= 1 {
                break;
            }
            // Remove the last detail line to shrink output
            parts.pop();
        }

        // If AST didn't produce much, fall back to line-count summary
        // NOTE(review): this branch also fires when the budget loop trimmed
        // every detail part, and the appended "sections" line is not
        // re-checked against the budget (it is short, so overflow is
        // unlikely) — confirm this is acceptable.
        if parts.len() <= 1 {
            // Simple structural scan: count indentation-based sections
            let mut section_count = 0u32;
            for line in source.lines() {
                let trimmed = line.trim();
                if trimmed.starts_with("fn ")
                    || trimmed.starts_with("pub fn ")
                    || trimmed.starts_with("def ")
                    || trimmed.starts_with("class ")
                    || trimmed.starts_with("function ")
                    || trimmed.starts_with("struct ")
                    || trimmed.starts_with("impl ")
                    || trimmed.starts_with("trait ")
                {
                    section_count += 1;
                }
            }
            if section_count > 0 {
                parts.push(format!("sections: {section_count}"));
            }
        }

        let content = parts.join("\n");
        let tokens_result = approx_tokens(&content);

        Ok(ReadResult {
            content,
            mode: "map".to_string(),
            tokens_original,
            tokens_result,
        })
    }
418
419    /// Signatures mode: extract function/class signatures via AST parser.
420    fn read_signatures(
421        &self,
422        path: &Path,
423        source: &str,
424        tokens_original: u32,
425    ) -> Result<ReadResult> {
426        let lang = detect_language(path);
427        if let Some(lang) = &lang {
428            if self.ast_parser.is_supported(lang) {
429                let summary = self.ast_parser.extract_signatures(source, lang)?;
430                let content = summary.to_text();
431                let tokens_result = approx_tokens(&content);
432                return Ok(ReadResult {
433                    content,
434                    mode: "signatures".to_string(),
435                    tokens_original,
436                    tokens_result,
437                });
438            }
439        }
440        // Fallback: return full content for unsupported languages
441        Ok(ReadResult {
442            content: source.to_string(),
443            mode: "signatures".to_string(),
444            tokens_original,
445            tokens_result: tokens_original,
446        })
447    }
448
449    /// Diff mode: compare against cached version, return only changes with
450    /// surrounding context lines.
451    fn read_diff(
452        &self,
453        source: &str,
454        cached_content: Option<&str>,
455        tokens_original: u32,
456    ) -> Result<ReadResult> {
457        let cached = match cached_content {
458            Some(c) => c,
459            None => {
460                // No cached version — return full content
461                return Ok(ReadResult {
462                    content: source.to_string(),
463                    mode: "diff".to_string(),
464                    tokens_original,
465                    tokens_result: tokens_original,
466                });
467            }
468        };
469
470        if source == cached {
471            let content = "(no changes)".to_string();
472            return Ok(ReadResult {
473                content,
474                mode: "diff".to_string(),
475                tokens_original,
476                tokens_result: approx_tokens("(no changes)"),
477            });
478        }
479
480        let new_lines: Vec<&str> = source.lines().collect();
481        let old_lines: Vec<&str> = cached.lines().collect();
482
483        // Find changed line indices using a simple line-by-line comparison
484        let mut changed_lines: Vec<usize> = Vec::new();
485        let max_len = new_lines.len().max(old_lines.len());
486        for i in 0..max_len {
487            let new_line = new_lines.get(i).copied().unwrap_or("");
488            let old_line = old_lines.get(i).copied().unwrap_or("");
489            if new_line != old_line {
490                changed_lines.push(i);
491            }
492        }
493
494        if changed_lines.is_empty() {
495            let content = "(no changes)".to_string();
496            return Ok(ReadResult {
497                content,
498                mode: "diff".to_string(),
499                tokens_original,
500                tokens_result: approx_tokens("(no changes)"),
501            });
502        }
503
504        // Build output with context around changed lines
505        let ctx = self.context_lines;
506        let mut included: Vec<bool> = vec![false; new_lines.len()];
507        for &line_idx in &changed_lines {
508            let start = line_idx.saturating_sub(ctx);
509            let end = (line_idx + ctx + 1).min(new_lines.len());
510            for j in start..end {
511                included[j] = true;
512            }
513        }
514
515        let mut output = Vec::new();
516        let mut in_range = false;
517        for (i, line) in new_lines.iter().enumerate() {
518            if included[i] {
519                if !in_range {
520                    output.push(format!("@@ line {} @@", i + 1));
521                    in_range = true;
522                }
523                let marker = if changed_lines.contains(&i) {
524                    ">"
525                } else {
526                    " "
527                };
528                output.push(format!("{marker} {line}"));
529            } else {
530                in_range = false;
531            }
532        }
533
534        let content = output.join("\n");
535        let tokens_result = approx_tokens(&content);
536
537        Ok(ReadResult {
538            content,
539            mode: "diff".to_string(),
540            tokens_original,
541            tokens_result,
542        })
543    }
544
545    /// Aggressive mode: maximum compression combining entropy filtering and
546    /// signature extraction.
547    fn read_aggressive(
548        &self,
549        path: &Path,
550        source: &str,
551        tokens_original: u32,
552    ) -> Result<ReadResult> {
553        // First try signatures
554        let lang = detect_language(path);
555        let sig_content = if let Some(lang) = &lang {
556            if self.ast_parser.is_supported(lang) {
557                self.ast_parser
558                    .extract_signatures(source, lang)
559                    .ok()
560                    .map(|s| s.to_text())
561            } else {
562                None
563            }
564        } else {
565            None
566        };
567
568        // Then entropy filter on the original source
569        let blocks = compute_block_entropies(source);
570        let high = filter_high_entropy(&blocks, self.entropy_percentile);
571        let entropy_content: String = high.iter().map(|b| b.text.as_str()).collect::<Vec<_>>().join("\n\n");
572
573        // Combine: prefer signatures if available, append high-entropy blocks
574        // that aren't already covered
575        let content = match sig_content {
576            Some(sigs) if !sigs.is_empty() => {
577                if entropy_content.is_empty() {
578                    sigs
579                } else {
580                    format!("{sigs}\n\n// --- high-entropy blocks ---\n{entropy_content}")
581                }
582            }
583            _ => {
584                if entropy_content.is_empty() {
585                    source.to_string()
586                } else {
587                    entropy_content
588                }
589            }
590        };
591
592        let tokens_result = approx_tokens(&content).min(tokens_original);
593        Ok(ReadResult {
594            content,
595            mode: "aggressive".to_string(),
596            tokens_original,
597            tokens_result,
598        })
599    }
600
601    /// Entropy mode: compute Shannon entropy per block, return only
602    /// high-entropy blocks.
603    fn read_entropy(&self, source: &str, tokens_original: u32) -> Result<ReadResult> {
604        let blocks = compute_block_entropies(source);
605        let high = filter_high_entropy(&blocks, self.entropy_percentile);
606
607        if high.is_empty() {
608            return Ok(ReadResult {
609                content: source.to_string(),
610                mode: "entropy".to_string(),
611                tokens_original,
612                tokens_result: tokens_original,
613            });
614        }
615
616        let content: String = high
617            .iter()
618            .map(|b| format!("// lines {}-{}\n{}", b.start_line + 1, b.end_line, b.text))
619            .collect::<Vec<_>>()
620            .join("\n\n");
621
622        let tokens_result = approx_tokens(&content);
623
624        // If line annotations pushed us above full mode, fall back to raw
625        // high-entropy text without annotations.
626        if tokens_result > tokens_original {
627            let plain: String = high
628                .iter()
629                .map(|b| b.text.as_str())
630                .collect::<Vec<_>>()
631                .join("\n\n");
632            let plain_tokens = approx_tokens(&plain).min(tokens_original);
633            return Ok(ReadResult {
634                content: plain,
635                mode: "entropy".to_string(),
636                tokens_original,
637                tokens_result: plain_tokens,
638            });
639        }
640
641        Ok(ReadResult {
642            content,
643            mode: "entropy".to_string(),
644            tokens_original,
645            tokens_result,
646        })
647    }
648
649    /// Task mode: use current intent to select relevant sections via FTS5.
650    fn read_task(
651        &self,
652        source: &str,
653        intent: Option<&str>,
654        tokens_original: u32,
655    ) -> Result<ReadResult> {
656        let intent = match intent {
657            Some(i) if !i.trim().is_empty() => i,
658            _ => {
659                // No intent — return full content
660                return Ok(ReadResult {
661                    content: source.to_string(),
662                    mode: "task".to_string(),
663                    tokens_original,
664                    tokens_result: tokens_original,
665                });
666            }
667        };
668
669        let content = fts5_task_filter(source, intent)?;
670        let tokens_result = approx_tokens(&content);
671
672        Ok(ReadResult {
673            content,
674            mode: "task".to_string(),
675            tokens_original,
676            tokens_result,
677        })
678    }
679
680    /// Lines mode: extract specific line ranges with context.
681    fn read_lines(
682        &self,
683        source: &str,
684        range: Range<usize>,
685        tokens_original: u32,
686    ) -> Result<ReadResult> {
687        let lines: Vec<&str> = source.lines().collect();
688        let total = lines.len();
689
690        // Clamp range to valid bounds
691        let start = range.start.min(total);
692        let end = range.end.min(total);
693
694        if start >= end {
695            return Ok(ReadResult {
696                content: String::new(),
697                mode: "lines".to_string(),
698                tokens_original,
699                tokens_result: 0,
700            });
701        }
702
703        // Add context lines
704        let ctx_start = start.saturating_sub(self.context_lines);
705        let ctx_end = (end + self.context_lines).min(total);
706
707        let mut output = Vec::new();
708        output.push(format!("// lines {}-{} (of {})", start + 1, end, total));
709        for i in ctx_start..ctx_end {
710            let marker = if i >= start && i < end { ">" } else { " " };
711            output.push(format!("{marker} {:4} | {}", i + 1, lines[i]));
712        }
713
714        let content = output.join("\n");
715        let tokens_result = approx_tokens(&content);
716
717        Ok(ReadResult {
718            content,
719            mode: "lines".to_string(),
720            tokens_original,
721            tokens_result,
722        })
723    }
724
    /// Access the underlying AST parser.
    ///
    /// Borrowed accessor; callers can use the shared parser instead of
    /// constructing a second instance.
    pub fn ast_parser(&self) -> &AstParser {
        &self.ast_parser
    }
729
    /// Get the configured entropy percentile.
    ///
    /// This is the percentile cut-off used by the entropy-based modes
    /// (`entropy` and `aggressive`) when filtering blocks.
    pub fn entropy_percentile(&self) -> f64 {
        self.entropy_percentile
    }
734}
735
736impl Default for FileReader {
737    fn default() -> Self {
738        Self::new()
739    }
740}
741
742// ── Language detection ────────────────────────────────────────────────────────
743
/// Detect programming language from file extension.
///
/// Returns `None` for files with no extension, a non-UTF-8 extension, or an
/// unrecognized one. Matching is case-sensitive.
fn detect_language(path: &Path) -> Option<String> {
    let ext = path.extension()?.to_str()?;
    let lang = match ext {
        "rs" => Some("rust"),
        "py" => Some("python"),
        "js" | "mjs" | "cjs" => Some("javascript"),
        "ts" | "tsx" => Some("typescript"),
        "go" => Some("go"),
        "java" => Some("java"),
        "c" | "h" => Some("c"),
        "cpp" | "cc" | "cxx" | "hpp" => Some("cpp"),
        "rb" => Some("ruby"),
        "sh" | "bash" => Some("bash"),
        "json" => Some("json"),
        "html" | "htm" => Some("html"),
        "css" => Some("css"),
        "cs" => Some("csharp"),
        "kt" | "kts" => Some("kotlin"),
        "swift" => Some("swift"),
        "toml" => Some("toml"),
        "yml" | "yaml" => Some("yaml"),
        _ => None,
    };
    lang.map(str::to_owned)
}
770
771// ── Public helpers ────────────────────────────────────────────────────────────
772
/// Compute Shannon entropy for a string (public for use by EntropyAnalyzer).
///
/// Thin public wrapper over the private `shannon_entropy` helper.
pub fn compute_entropy(text: &str) -> f64 {
    shannon_entropy(text)
}
777
/// Compute block entropies for source code (public for use by EntropyAnalyzer).
///
/// Thin public wrapper over the private `compute_block_entropies` helper.
pub fn analyze_block_entropies(source: &str) -> Vec<BlockEntropy> {
    compute_block_entropies(source)
}
782
783// ── Tests ─────────────────────────────────────────────────────────────────────
784
785#[cfg(test)]
786mod tests {
787    use super::*;
788    use std::path::Path;
789
    // Fixture: a small but realistic Rust module — imports, a struct, an
    // impl with two methods, a free function, a type alias, and a private
    // helper — enough surface for signature extraction to work on.
    fn sample_rust_source() -> &'static str {
        r#"use std::collections::HashMap;
use std::path::Path;

/// A configuration struct.
pub struct Config {
    pub name: String,
    pub value: i32,
}

impl Config {
    pub fn new(name: &str, value: i32) -> Self {
        Self {
            name: name.to_string(),
            value,
        }
    }

    pub fn validate(&self) -> bool {
        !self.name.is_empty() && self.value > 0
    }
}

pub fn process(config: &Config) -> String {
    let mut result = String::new();
    for i in 0..config.value {
        result.push_str(&format!("item {}: {}\n", i, config.name));
    }
    result
}

pub type ConfigMap = HashMap<String, Config>;

fn internal_helper() -> i32 {
    42
}
"#
    }
828
829    fn large_source(lines: usize) -> String {
830        let mut src = String::new();
831        src.push_str("use std::collections::HashMap;\n\n");
832        src.push_str("pub struct MyStruct {\n    field: i32,\n}\n\n");
833        for i in 0..lines.saturating_sub(6) {
834            src.push_str(&format!("// line {i}: some content here\n"));
835        }
836        src
837    }
838
839    #[test]
840    fn test_full_mode_returns_unchanged() {
841        let reader = FileReader::new();
842        let source = "hello world\nline two\n";
843        let result = reader
844            .read(Path::new("test.txt"), source, &FileReadMode::Full, None, None)
845            .unwrap();
846        assert_eq!(result.content, source);
847        assert_eq!(result.mode, "full");
848        assert_eq!(result.tokens_original, result.tokens_result);
849    }
850
851    #[test]
852    fn test_map_mode_compact_output() {
853        let reader = FileReader::new();
854        let source = &large_source(300);
855        let result = reader
856            .read(
857                Path::new("test.rs"),
858                source,
859                &FileReadMode::Map,
860                None,
861                None,
862            )
863            .unwrap();
864        assert_eq!(result.mode, "map");
865        // Map mode should produce ≤50 tokens for a ~300-line file
866        assert!(
867            result.tokens_result <= 50,
868            "map mode produced {} tokens, expected ≤50",
869            result.tokens_result
870        );
871    }
872
873    #[test]
874    fn test_signatures_mode_extracts_signatures() {
875        let reader = FileReader::new();
876        let source = sample_rust_source();
877        let result = reader
878            .read(
879                Path::new("test.rs"),
880                source,
881                &FileReadMode::Signatures,
882                None,
883                None,
884            )
885            .unwrap();
886        assert_eq!(result.mode, "signatures");
887        assert!(result.content.contains("use std::collections::HashMap"));
888        assert!(result.tokens_result < result.tokens_original);
889    }
890
891    #[test]
892    fn test_signatures_mode_unsupported_language_fallback() {
893        let reader = FileReader::new();
894        let source = "some content";
895        let result = reader
896            .read(
897                Path::new("test.xyz"),
898                source,
899                &FileReadMode::Signatures,
900                None,
901                None,
902            )
903            .unwrap();
904        // Unsupported language: returns full content
905        assert_eq!(result.content, source);
906    }
907
908    #[test]
909    fn test_diff_mode_no_cached() {
910        let reader = FileReader::new();
911        let source = "line 1\nline 2\n";
912        let result = reader
913            .read(
914                Path::new("test.txt"),
915                source,
916                &FileReadMode::Diff,
917                None,
918                None,
919            )
920            .unwrap();
921        // No cached content — returns full
922        assert_eq!(result.content, source);
923    }
924
925    #[test]
926    fn test_diff_mode_no_changes() {
927        let reader = FileReader::new();
928        let source = "line 1\nline 2\n";
929        let result = reader
930            .read(
931                Path::new("test.txt"),
932                source,
933                &FileReadMode::Diff,
934                None,
935                Some(source),
936            )
937            .unwrap();
938        assert_eq!(result.content, "(no changes)");
939    }
940
941    #[test]
942    fn test_diff_mode_with_changes() {
943        let reader = FileReader::new();
944        let old = "line 1\nline 2\nline 3\nline 4\nline 5\n";
945        let new = "line 1\nline 2 modified\nline 3\nline 4\nline 5\n";
946        let result = reader
947            .read(
948                Path::new("test.txt"),
949                new,
950                &FileReadMode::Diff,
951                None,
952                Some(old),
953            )
954            .unwrap();
955        assert!(result.content.contains("line 2 modified"));
956        assert!(result.content.contains("@@"));
957        // Diff output includes only changed sections, not the full file
958        assert_ne!(result.content, new);
959    }
960
961    #[test]
962    fn test_entropy_mode_filters_blocks() {
963        let reader = FileReader::new();
964        // Create source with varying entropy: some complex code, some boilerplate
965        let source = r#"
966fn complex_algorithm(data: &[u8]) -> Vec<u8> {
967    let mut result = Vec::new();
968    for (i, &byte) in data.iter().enumerate() {
969        let transformed = byte ^ (i as u8).wrapping_mul(0x5A);
970        result.push(transformed.rotate_left(3));
971    }
972    result
973}
974
975// boilerplate
976// boilerplate
977// boilerplate
978// boilerplate
979// boilerplate
980
981pub fn another_complex_fn(x: f64, y: f64) -> f64 {
982    let theta = x.atan2(y);
983    let r = (x * x + y * y).sqrt();
984    r * theta.sin() + theta.cos() * r.ln()
985}
986"#;
987        let result = reader
988            .read(
989                Path::new("test.rs"),
990                source,
991                &FileReadMode::Entropy,
992                None,
993                None,
994            )
995            .unwrap();
996        assert_eq!(result.mode, "entropy");
997    }
998
999    #[test]
1000    fn test_task_mode_no_intent_returns_full() {
1001        let reader = FileReader::new();
1002        let source = "some content\n";
1003        let result = reader
1004            .read(
1005                Path::new("test.txt"),
1006                source,
1007                &FileReadMode::Task,
1008                None,
1009                None,
1010            )
1011            .unwrap();
1012        assert_eq!(result.content, source);
1013    }
1014
1015    #[test]
1016    fn test_task_mode_with_intent() {
1017        let reader = FileReader::new();
1018        let source = r#"
1019fn authentication_handler(req: Request) -> Response {
1020    let token = req.header("Authorization");
1021    validate_token(token)
1022}
1023
1024fn database_query(sql: &str) -> Vec<Row> {
1025    let conn = get_connection();
1026    conn.execute(sql)
1027}
1028
1029fn logging_middleware(req: Request) -> Request {
1030    println!("Request: {}", req.path());
1031    req
1032}
1033"#;
1034        let result = reader
1035            .read(
1036                Path::new("test.rs"),
1037                source,
1038                &FileReadMode::Task,
1039                Some("authentication token validation"),
1040                None,
1041            )
1042            .unwrap();
1043        assert_eq!(result.mode, "task");
1044        // Should include the authentication section
1045        assert!(result.content.contains("authentication") || result.content.contains("token"));
1046    }
1047
1048    #[test]
1049    fn test_lines_mode_extracts_range() {
1050        let reader = FileReader::new();
1051        let source = "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\nline 10\n";
1052        let result = reader
1053            .read(
1054                Path::new("test.txt"),
1055                source,
1056                &FileReadMode::Lines(3..6),
1057                None,
1058                None,
1059            )
1060            .unwrap();
1061        assert_eq!(result.mode, "lines");
1062        assert!(result.content.contains("line 4"));
1063        assert!(result.content.contains("line 5"));
1064        assert!(result.content.contains("line 6"));
1065    }
1066
1067    #[test]
1068    fn test_lines_mode_empty_range() {
1069        let reader = FileReader::new();
1070        let source = "line 1\nline 2\n";
1071        let result = reader
1072            .read(
1073                Path::new("test.txt"),
1074                source,
1075                &FileReadMode::Lines(5..5),
1076                None,
1077                None,
1078            )
1079            .unwrap();
1080        assert!(result.content.is_empty());
1081    }
1082
1083    #[test]
1084    fn test_aggressive_mode_compresses() {
1085        let reader = FileReader::new();
1086        let source = sample_rust_source();
1087        let result = reader
1088            .read(
1089                Path::new("test.rs"),
1090                source,
1091                &FileReadMode::Aggressive,
1092                None,
1093                None,
1094            )
1095            .unwrap();
1096        assert_eq!(result.mode, "aggressive");
1097        // Aggressive should produce fewer tokens than full
1098        assert!(
1099            result.tokens_result <= result.tokens_original,
1100            "aggressive mode should compress: {} vs {}",
1101            result.tokens_result,
1102            result.tokens_original
1103        );
1104    }
1105
1106    #[test]
1107    fn test_shannon_entropy_empty() {
1108        assert_eq!(shannon_entropy(""), 0.0);
1109    }
1110
1111    #[test]
1112    fn test_shannon_entropy_single_char() {
1113        assert_eq!(shannon_entropy("aaaa"), 0.0);
1114    }
1115
1116    #[test]
1117    fn test_shannon_entropy_varied() {
1118        let e = shannon_entropy("abcdefghij");
1119        // 10 distinct chars, each appearing once: entropy = log2(10) ≈ 3.32
1120        assert!(e > 3.0, "entropy of varied text should be high: {e}");
1121    }
1122
1123    #[test]
1124    fn test_detect_language() {
1125        assert_eq!(detect_language(Path::new("foo.rs")), Some("rust".into()));
1126        assert_eq!(detect_language(Path::new("bar.py")), Some("python".into()));
1127        assert_eq!(detect_language(Path::new("baz.js")), Some("javascript".into()));
1128        assert_eq!(detect_language(Path::new("qux.ts")), Some("typescript".into()));
1129        assert_eq!(detect_language(Path::new("no_ext")), None);
1130    }
1131
1132    #[test]
1133    fn test_file_read_mode_enum_variants() {
1134        // Ensure all 8 modes exist
1135        let modes: Vec<FileReadMode> = vec![
1136            FileReadMode::Full,
1137            FileReadMode::Map,
1138            FileReadMode::Signatures,
1139            FileReadMode::Diff,
1140            FileReadMode::Aggressive,
1141            FileReadMode::Entropy,
1142            FileReadMode::Task,
1143            FileReadMode::Lines(0..10),
1144        ];
1145        assert_eq!(modes.len(), 8);
1146    }
1147
1148    #[test]
1149    fn test_block_entropies_computation() {
1150        let source = "fn foo() {\n    let x = 1;\n}\n\nfn bar() {\n    let y = 2;\n}\n";
1151        let blocks = compute_block_entropies(source);
1152        assert_eq!(blocks.len(), 2);
1153        assert!(blocks[0].entropy > 0.0);
1154        assert!(blocks[1].entropy > 0.0);
1155    }
1156
1157    #[test]
1158    fn test_default_creates_reader() {
1159        let reader = FileReader::default();
1160        assert_eq!(reader.entropy_percentile(), 60.0);
1161    }
1162
1163    #[test]
1164    fn test_with_config() {
1165        let reader = FileReader::with_config(75.0, 5);
1166        assert_eq!(reader.entropy_percentile(), 75.0);
1167    }
1168
1169    // ── Property-based tests ──────────────────────────────────────────────
1170
1171    use proptest::prelude::*;
1172
1173    /// Generate random source code content of a given line count.
1174    /// Produces a mix of struct definitions, function signatures, comments,
1175    /// and blank lines to simulate realistic Rust source files.
1176    fn arb_source_code(min_lines: usize, max_lines: usize) -> impl Strategy<Value = String> {
1177        proptest::collection::vec(
1178            prop_oneof![
1179                Just("use std::collections::HashMap;\n".to_string()),
1180                Just("pub struct Foo {\n    field: i32,\n}\n".to_string()),
1181                Just("pub fn bar(x: i32) -> i32 {\n    x + 1\n}\n".to_string()),
1182                Just("// a comment line\n".to_string()),
1183                Just("\n".to_string()),
1184                Just("fn helper() -> bool { true }\n".to_string()),
1185                Just("let val = compute(a, b, c);\n".to_string()),
1186                Just("impl Foo {\n    pub fn new() -> Self { Self { field: 0 } }\n}\n".to_string()),
1187            ],
1188            min_lines..=max_lines,
1189        )
1190        .prop_map(|chunks| chunks.join(""))
1191    }
1192
1193    // ── Property 36: Multi-mode file reading compression ratio ────────────
1194    //
1195    // **Validates: Requirements 31.2, 31.3**
1196    //
1197    // For any file of at least 100 lines:
1198    //   (a) `map` mode SHALL produce output of at most 50 tokens.
1199    //   (b) Non-full modes (map, signatures, entropy, aggressive) SHALL
1200    //       produce fewer or equal tokens compared to full mode.
1201
    proptest! {
        #[test]
        fn prop36_map_mode_token_limit(
            source in arb_source_code(40, 80),
        ) {
            // Each generated chunk expands to 1-3 lines, so 40-80 chunks
            // yield roughly 40-240 lines; the prop_assume below discards
            // any sample that falls short of the 100-line threshold.
            let line_count = source.lines().count();
            // Only test files with at least 100 lines
            prop_assume!(line_count >= 100);

            let reader = FileReader::new();
            let path = Path::new("test.rs");

            let map_result = reader
                .read(path, &source, &FileReadMode::Map, None, None)
                .unwrap();

            // Requirement 31.2: map mode must fit in a 50-token budget.
            prop_assert!(
                map_result.tokens_result <= 50,
                "map mode produced {} tokens for a {}-line file, expected ≤50",
                map_result.tokens_result,
                line_count
            );
        }

        #[test]
        fn prop36_non_full_modes_compress(
            source in arb_source_code(30, 80),
        ) {
            // NOTE(review): with chunks of at most 3 lines, a 30-chunk sample
            // tops out at ~90 lines and can never pass the assume below —
            // small samples are always rejected. Consider raising the lower
            // bound (e.g. to 40) to waste fewer generated cases.
            let line_count = source.lines().count();
            prop_assume!(line_count >= 100);

            let reader = FileReader::new();
            let path = Path::new("test.rs");

            // Full mode is the baseline every other mode is compared against.
            let full_result = reader
                .read(path, &source, &FileReadMode::Full, None, None)
                .unwrap();
            let full_tokens = full_result.tokens_result;

            // map mode
            let map_result = reader
                .read(path, &source, &FileReadMode::Map, None, None)
                .unwrap();
            prop_assert!(
                map_result.tokens_result <= full_tokens,
                "map ({}) should be ≤ full ({})",
                map_result.tokens_result, full_tokens
            );

            // signatures mode
            let sig_result = reader
                .read(path, &source, &FileReadMode::Signatures, None, None)
                .unwrap();
            prop_assert!(
                sig_result.tokens_result <= full_tokens,
                "signatures ({}) should be ≤ full ({})",
                sig_result.tokens_result, full_tokens
            );

            // entropy mode
            let ent_result = reader
                .read(path, &source, &FileReadMode::Entropy, None, None)
                .unwrap();
            prop_assert!(
                ent_result.tokens_result <= full_tokens,
                "entropy ({}) should be ≤ full ({})",
                ent_result.tokens_result, full_tokens
            );

            // aggressive mode
            let agg_result = reader
                .read(path, &source, &FileReadMode::Aggressive, None, None)
                .unwrap();
            prop_assert!(
                agg_result.tokens_result <= full_tokens,
                "aggressive ({}) should be ≤ full ({})",
                agg_result.tokens_result, full_tokens
            );
        }
    }
1283}