// batless 0.6.0 - Docs.rs

//! File information structure for batless
//!
//! This module defines the FileInfo structure that holds all information
//! about a processed file, including content, metadata, and processing results.

use crate::summary_item::SummaryItem;
use serde::{Deserialize, Serialize};

/// Information about a processed file.
///
/// Holds the processed content together with metadata about the original
/// file (size, language, encoding), truncation flags, and optional analysis
/// results such as tokens and summary items. The type derives `Serialize` and
/// `Deserialize`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileInfo {
    /// The processed lines of the file; `processed_lines()` reports its length
    pub lines: Vec<String>,
    /// Original lines before any summary transformations (`None` when not preserved)
    pub original_lines: Option<Vec<String>>,
    /// Total number of lines in the original file
    pub total_lines: usize,
    /// Whether `total_lines` reflects the entire file (defaults to `true`)
    pub total_lines_exact: bool,
    /// Total number of bytes in the original file
    pub total_bytes: usize,
    /// Whether the file was truncated during processing; `truncation_reason()`
    /// returns `None` whenever this is `false`
    pub truncated: bool,
    /// Whether truncation was due to line limit
    pub truncated_by_lines: bool,
    /// Whether truncation was due to byte limit
    pub truncated_by_bytes: bool,
    /// Whether truncation was due to context fitting
    pub truncated_by_context: bool,
    /// Detected or specified language
    pub language: Option<String>,
    /// Detected encoding of the file (`new()` initialises it to `"UTF-8"`)
    pub encoding: String,
    /// Syntax errors encountered during processing; `is_success()` is `true`
    /// only while this list is empty
    pub syntax_errors: Vec<String>,
    /// Extracted tokens (if requested)
    pub tokens: Option<Vec<String>>,
    /// Total number of tokens extracted (including truncated samples).
    /// When set, `token_count()` reports this value instead of `tokens.len()`,
    /// and `tokens_truncated()` compares it against `tokens.len()`.
    pub token_total: Option<usize>,
    /// Summary items with line numbers (if in summary mode)
    pub summary_lines: Option<Vec<SummaryItem>>,
    /// SHA-256 hex digest of file content (only populated when --hash is passed)
    pub file_hash: Option<String>,
    /// Estimated LLM token count for the output content (heuristic)
    pub estimated_llm_tokens: Option<u64>,
    /// AI model used for token estimation (e.g. "Claude", "Gpt4")
    pub token_model: Option<String>,
    /// Ratio of original line count to stripped line count
    /// (set when strip_comments or strip_blank_lines is active)
    pub compression_ratio: Option<f64>,
}

impl FileInfo {
    /// Create an empty `FileInfo`.
    ///
    /// All collections are empty, all optional fields are `None`, all
    /// truncation flags are `false`, `total_lines_exact` is `true`, and the
    /// encoding is `"UTF-8"`. This is the single source of defaults for the
    /// other constructors.
    pub fn new() -> Self {
        Self {
            lines: Vec::new(),
            original_lines: None,
            total_lines: 0,
            total_lines_exact: true,
            total_bytes: 0,
            truncated: false,
            truncated_by_lines: false,
            truncated_by_bytes: false,
            truncated_by_context: false,
            language: None,
            encoding: "UTF-8".to_string(),
            syntax_errors: Vec::new(),
            tokens: None,
            token_total: None,
            summary_lines: None,
            file_hash: None,
            estimated_llm_tokens: None,
            token_model: None,
            compression_ratio: None,
        }
    }

    /// Create a `FileInfo` with basic metadata.
    ///
    /// Sets `total_lines`, `total_bytes`, `language` and `encoding` from the
    /// arguments; every other field takes the same default as [`FileInfo::new`].
    pub fn with_metadata(
        total_lines: usize,
        total_bytes: usize,
        language: Option<String>,
        encoding: String,
    ) -> Self {
        // Struct update syntax keeps the defaults defined in exactly one place,
        // so a newly added field only has to be initialised in `new()`.
        Self {
            total_lines,
            total_bytes,
            language,
            encoding,
            ..Self::new()
        }
    }

    /// Set the processed lines.
    pub fn with_lines(mut self, lines: Vec<String>) -> Self {
        self.lines = lines;
        self
    }

    /// Set line/byte truncation information.
    ///
    /// `by_lines` and `by_bytes` replace the corresponding reason flags.
    /// `truncated` is stored as given, except that it stays `true` while a
    /// context truncation recorded earlier is still flagged, so the overall
    /// flag never contradicts `truncated_by_context`.
    pub fn with_truncation(mut self, truncated: bool, by_lines: bool, by_bytes: bool) -> Self {
        self.truncated_by_lines = by_lines;
        self.truncated_by_bytes = by_bytes;
        // Do not clobber a truncation that was recorded by `with_context_truncation`.
        self.truncated = truncated || self.truncated_by_context;
        self
    }

    /// Set context-based truncation.
    ///
    /// `truncated` replaces the `truncated_by_context` flag. The overall
    /// `truncated` flag stays `true` while a line or byte truncation recorded
    /// earlier is still flagged, so passing `false` here cannot hide it.
    pub fn with_context_truncation(mut self, truncated: bool) -> Self {
        self.truncated_by_context = truncated;
        // Do not clobber a truncation that was recorded by `with_truncation`.
        self.truncated = truncated || self.truncated_by_lines || self.truncated_by_bytes;
        self
    }

    /// Append a syntax error to `syntax_errors`.
    pub fn add_syntax_error(&mut self, error: String) {
        self.syntax_errors.push(error);
    }

    /// Set the extracted tokens.
    pub fn with_tokens(mut self, tokens: Option<Vec<String>>) -> Self {
        self.tokens = tokens;
        self
    }

    /// Store the total number of tokens identified.
    pub fn with_token_total(mut self, total: Option<usize>) -> Self {
        self.token_total = total;
        self
    }

    /// Set the file hash.
    pub fn with_file_hash(mut self, hash: Option<String>) -> Self {
        self.file_hash = hash;
        self
    }

    /// Set the estimated LLM token count and the model used for estimation.
    pub fn with_estimated_llm_tokens(mut self, tokens: Option<u64>, model: Option<String>) -> Self {
        self.estimated_llm_tokens = tokens;
        self.token_model = model;
        self
    }

    /// Set the compression ratio (original lines / stripped lines).
    pub fn with_compression_ratio(mut self, ratio: Option<f64>) -> Self {
        self.compression_ratio = ratio;
        self
    }

    /// Set the summary items.
    pub fn with_summary_lines(mut self, summary_lines: Option<Vec<SummaryItem>>) -> Self {
        self.summary_lines = summary_lines;
        self
    }

    /// Preserve the original lines before summary transformations.
    pub fn with_original_lines(mut self, original: Option<Vec<String>>) -> Self {
        self.original_lines = original;
        self
    }

    /// Mark whether `total_lines` is exact.
    pub fn with_total_lines_exact(mut self, exact: bool) -> Self {
        self.total_lines_exact = exact;
        self
    }

    /// Check if the file was processed successfully, i.e. no syntax errors
    /// were recorded.
    pub fn is_success(&self) -> bool {
        self.syntax_errors.is_empty()
    }

    /// Get the number of processed lines (the length of `lines`).
    pub fn processed_lines(&self) -> usize {
        self.lines.len()
    }

    /// Get the processing ratio (processed lines / total lines).
    ///
    /// Returns `1.0` when `total_lines` is zero so the division is never
    /// performed with a zero denominator.
    pub fn processing_ratio(&self) -> f64 {
        if self.total_lines == 0 {
            1.0
        } else {
            self.processed_lines() as f64 / self.total_lines as f64
        }
    }

    /// Check if any tokens were extracted (`tokens` is present and non-empty).
    pub fn has_tokens(&self) -> bool {
        self.tokens.as_ref().is_some_and(|t| !t.is_empty())
    }

    /// Check if a summary was generated (`summary_lines` is present and non-empty).
    pub fn has_summary(&self) -> bool {
        self.summary_lines.as_ref().is_some_and(|s| !s.is_empty())
    }

    /// Get the number of tokens (if any).
    ///
    /// Prefers `token_total` when it is set, falls back to the length of
    /// `tokens`, and returns `0` when neither is available.
    pub fn token_count(&self) -> usize {
        self.token_total
            .or_else(|| self.tokens.as_ref().map(Vec::len))
            .unwrap_or(0)
    }

    /// Check if the displayed token list was truncated.
    ///
    /// `true` only when both `token_total` and `tokens` are present and the
    /// total exceeds the number of stored tokens.
    pub fn tokens_truncated(&self) -> bool {
        matches!(
            (&self.token_total, &self.tokens),
            (Some(total), Some(tokens)) if *total > tokens.len()
        )
    }

    /// Get the number of summary lines (`0` when there is no summary).
    pub fn summary_line_count(&self) -> usize {
        self.summary_lines.as_ref().map_or(0, Vec::len)
    }

    /// Get the truncation reason as a human-readable string.
    ///
    /// Returns `None` when `truncated` is `false`. Otherwise the active
    /// reasons are joined with `" and "` in the order line limit, byte limit,
    /// context fitting; if no reason flag is set the result is
    /// `"unknown reason"`.
    pub fn truncation_reason(&self) -> Option<String> {
        if !self.truncated {
            return None;
        }

        let mut reasons = Vec::new();
        if self.truncated_by_lines {
            reasons.push("line limit");
        }
        if self.truncated_by_bytes {
            reasons.push("byte limit");
        }
        if self.truncated_by_context {
            reasons.push("context fitting");
        }

        if reasons.is_empty() {
            Some("unknown reason".to_string())
        } else {
            Some(reasons.join(" and "))
        }
    }

    /// Get a summary of processing statistics.
    ///
    /// Copies the scalar metadata and derives the remaining values from the
    /// helper methods on this type (`processed_lines`, `truncation_reason`,
    /// `token_count`, `tokens_truncated`, `summary_line_count`).
    pub fn get_stats_summary(&self) -> ProcessingStats {
        ProcessingStats {
            total_lines: self.total_lines,
            total_lines_exact: self.total_lines_exact,
            processed_lines: self.processed_lines(),
            total_bytes: self.total_bytes,
            truncated: self.truncated,
            truncation_reason: self.truncation_reason(),
            has_syntax_errors: !self.syntax_errors.is_empty(),
            error_count: self.syntax_errors.len(),
            language: self.language.clone(),
            encoding: self.encoding.clone(),
            token_count: self.token_count(),
            tokens_truncated: self.tokens_truncated(),
            summary_line_count: self.summary_line_count(),
        }
    }
}

impl Default for FileInfo {
    fn default() -> Self {
        Self::new()
    }
}

/// Processing statistics summary.
///
/// A flattened snapshot of a [`FileInfo`], produced by
/// [`FileInfo::get_stats_summary`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessingStats {
    /// Total number of lines, copied from `FileInfo::total_lines`
    pub total_lines: usize,
    /// Whether `total_lines` is exact, copied from `FileInfo::total_lines_exact`
    pub total_lines_exact: bool,
    /// Number of processed lines (length of `FileInfo::lines`)
    pub processed_lines: usize,
    /// Total number of bytes, copied from `FileInfo::total_bytes`
    pub total_bytes: usize,
    /// Whether the file was truncated, copied from `FileInfo::truncated`
    pub truncated: bool,
    /// Human-readable truncation reason as returned by `FileInfo::truncation_reason`
    pub truncation_reason: Option<String>,
    /// `true` when at least one syntax error was recorded
    pub has_syntax_errors: bool,
    /// Number of recorded syntax errors
    pub error_count: usize,
    /// Language, cloned from `FileInfo::language`
    pub language: Option<String>,
    /// Encoding, cloned from `FileInfo::encoding`
    pub encoding: String,
    /// Token count as returned by `FileInfo::token_count`
    pub token_count: usize,
    /// Whether the token list was truncated, as returned by `FileInfo::tokens_truncated`
    pub tokens_truncated: bool,
    /// Number of summary items as returned by `FileInfo::summary_line_count`
    pub summary_line_count: usize,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `new()` must start with empty collections, cleared flags and unset options.
    #[test]
    fn test_new_file_info() {
        let info = FileInfo::new();
        assert_eq!(info.lines.len(), 0);
        assert!(info.original_lines.is_none());
        assert_eq!(info.total_lines, 0);
        assert!(info.total_lines_exact);
        assert_eq!(info.total_bytes, 0);
        assert!(!info.truncated);
        assert!(!info.truncated_by_lines);
        assert!(!info.truncated_by_bytes);
        assert!(!info.truncated_by_context);
        assert_eq!(info.language, None);
        assert_eq!(info.encoding, "UTF-8");
        assert_eq!(info.syntax_errors.len(), 0);
        assert_eq!(info.tokens, None);
        assert!(info.token_total.is_none());
        assert_eq!(info.summary_lines, None);
        // Optional metadata fields start unset as well.
        assert!(info.file_hash.is_none());
        assert!(info.estimated_llm_tokens.is_none());
        assert!(info.token_model.is_none());
        assert!(info.compression_ratio.is_none());
    }

    /// `Default` delegates to `new()`, so the same defaults must be visible.
    #[test]
    fn test_default_matches_new() {
        let info = FileInfo::default();
        assert!(info.lines.is_empty());
        assert!(info.total_lines_exact);
        assert!(!info.truncated);
        assert_eq!(info.encoding, "UTF-8");
        assert!(info.tokens.is_none());
    }

    /// `with_metadata` sets the four metadata fields and leaves the rest at defaults.
    #[test]
    fn test_with_metadata() {
        let info =
            FileInfo::with_metadata(100, 1024, Some("rust".to_string()), "UTF-8".to_string());
        assert_eq!(info.total_lines, 100);
        assert!(info.total_lines_exact);
        assert_eq!(info.total_bytes, 1024);
        assert_eq!(info.language, Some("rust".to_string()));
        assert_eq!(info.encoding, "UTF-8");
        // Everything not passed in keeps its default value.
        assert!(info.lines.is_empty());
        assert!(!info.truncated);
        assert!(info.syntax_errors.is_empty());
        assert!(info.file_hash.is_none());
    }

    /// Chained builders each update only their own fields.
    #[test]
    fn test_builder_pattern() {
        let lines = vec!["line1".to_string(), "line2".to_string()];
        let tokens = vec!["token1".to_string(), "token2".to_string()];
        let summary = vec![SummaryItem::new("fn main()", 1, Some(3), "function")];

        let info = FileInfo::new()
            .with_lines(lines.clone())
            .with_truncation(true, true, false)
            .with_original_lines(Some(lines.clone()))
            .with_total_lines_exact(false)
            .with_tokens(Some(tokens.clone()))
            .with_summary_lines(Some(summary));

        assert_eq!(info.lines, lines);
        assert_eq!(info.original_lines.as_ref(), Some(&lines));
        assert!(info.truncated);
        assert!(info.truncated_by_lines);
        assert!(!info.truncated_by_bytes);
        assert_eq!(info.tokens, Some(tokens));
        assert!(!info.total_lines_exact);
        assert_eq!(info.summary_lines.as_ref().map(Vec::len), Some(1));
    }

    /// Builders for the optional metadata fields store their arguments unchanged.
    #[test]
    // Exact comparison is intentional: the ratio is stored and read back unmodified.
    #[allow(clippy::float_cmp)]
    fn test_optional_metadata_builders() {
        let info = FileInfo::new()
            .with_token_total(Some(7))
            .with_file_hash(Some("abc123".to_string()))
            .with_estimated_llm_tokens(Some(42), Some("Claude".to_string()))
            .with_compression_ratio(Some(2.0));

        assert_eq!(info.token_total, Some(7));
        assert_eq!(info.file_hash.as_deref(), Some("abc123"));
        assert_eq!(info.estimated_llm_tokens, Some(42));
        assert_eq!(info.token_model.as_deref(), Some("Claude"));
        assert_eq!(info.compression_ratio, Some(2.0));
    }

    /// Context truncation sets both the overall flag and the context reason.
    #[test]
    fn test_context_truncation() {
        let info = FileInfo::new().with_context_truncation(true);
        assert!(info.truncated);
        assert!(info.truncated_by_context);
        assert!(!info.truncated_by_lines);
        assert!(!info.truncated_by_bytes);
        assert_eq!(
            info.truncation_reason(),
            Some("context fitting".to_string())
        );
    }

    #[test]
    // Exact comparison is intentional: 50/100 and the 1.0 fallback are exactly representable.
    #[allow(clippy::float_cmp)]
    fn test_processing_ratio() {
        let mut info = FileInfo::new();
        info.total_lines = 100;
        info.lines = vec!["line".to_string(); 50];

        assert_eq!(info.processing_ratio(), 0.5);

        // Test edge case with zero total lines
        info.total_lines = 0;
        assert_eq!(info.processing_ratio(), 1.0);
    }

    #[test]
    fn test_truncation_reason() {
        let mut info = FileInfo::new();
        assert_eq!(info.truncation_reason(), None);

        info.truncated = true;
        info.truncated_by_lines = true;
        assert_eq!(info.truncation_reason(), Some("line limit".to_string()));

        info.truncated_by_bytes = true;
        assert_eq!(
            info.truncation_reason(),
            Some("line limit and byte limit".to_string())
        );

        info.truncated_by_lines = false;
        assert_eq!(info.truncation_reason(), Some("byte limit".to_string()));

        // Truncated without any reason flag falls back to the generic message.
        info.truncated_by_bytes = false;
        assert_eq!(info.truncation_reason(), Some("unknown reason".to_string()));

        // All three reasons are reported in a fixed order.
        info.truncated_by_lines = true;
        info.truncated_by_bytes = true;
        info.truncated_by_context = true;
        assert_eq!(
            info.truncation_reason(),
            Some("line limit and byte limit and context fitting".to_string())
        );

        // Reason flags are ignored once the overall flag is cleared.
        info.truncated = false;
        assert_eq!(info.truncation_reason(), None);
    }

    #[test]
    fn test_helper_methods() {
        let mut info = FileInfo::new();

        // Test success status
        assert!(info.is_success());
        info.add_syntax_error("test error".to_string());
        assert!(!info.is_success());

        // Test token and summary checks
        assert!(!info.has_tokens());
        assert!(!info.has_summary());
        assert_eq!(info.token_count(), 0);
        assert_eq!(info.summary_line_count(), 0);

        info.tokens = Some(vec!["token".to_string()]);
        info.token_total = Some(5);
        info.summary_lines = Some(vec![SummaryItem::new("summary", 1, None, "other")]);

        assert!(info.has_tokens());
        assert!(info.tokens_truncated());
        assert!(info.has_summary());
        assert_eq!(info.token_count(), 5);
        assert_eq!(info.summary_line_count(), 1);
    }

    /// `token_count` falls back to the stored list and `tokens_truncated`
    /// requires the total to exceed it.
    #[test]
    fn test_token_count_fallback() {
        let mut info =
            FileInfo::new().with_tokens(Some(vec!["a".to_string(), "b".to_string()]));

        // Without an explicit total the stored tokens are counted.
        assert_eq!(info.token_count(), 2);
        assert!(!info.tokens_truncated());

        // A total equal to the stored count is not a truncation.
        info.token_total = Some(2);
        assert_eq!(info.token_count(), 2);
        assert!(!info.tokens_truncated());

        // An empty token list does not count as "has tokens".
        info.tokens = Some(Vec::new());
        assert!(!info.has_tokens());
    }

    #[test]
    fn test_stats_summary() {
        let mut info =
            FileInfo::with_metadata(100, 2048, Some("rust".to_string()), "UTF-8".to_string());
        info.lines = vec!["line".to_string(); 50];
        info.truncated = true;
        info.truncated_by_lines = true;
        info.total_lines_exact = false;
        info.add_syntax_error("test error".to_string());
        info.tokens = Some(vec!["token1".to_string(), "token2".to_string()]);
        info.token_total = Some(5);

        let stats = info.get_stats_summary();
        assert_eq!(stats.total_lines, 100);
        assert!(!stats.total_lines_exact);
        assert_eq!(stats.processed_lines, 50);
        assert_eq!(stats.total_bytes, 2048);
        assert!(stats.truncated);
        assert_eq!(stats.truncation_reason, Some("line limit".to_string()));
        assert!(stats.has_syntax_errors);
        assert_eq!(stats.error_count, 1);
        assert_eq!(stats.language, Some("rust".to_string()));
        assert_eq!(stats.encoding, "UTF-8");
        assert_eq!(stats.token_count, 5);
        assert!(stats.tokens_truncated);
        assert_eq!(stats.summary_line_count, 0);
    }
}