// opencrabs 0.3.58

//! Read File Tool
//!
//! Allows reading file contents from the filesystem.

use super::error::{Result, ToolError, validate_file_path};
use super::hashline::hash::{format_hashline, hash_line};
use super::r#trait::{Tool, ToolCapability, ToolExecutionContext, ToolResult};
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use tokio::fs;
use tokio::io::{AsyncBufReadExt, BufReader};

/// Size (10 MiB) above which a file is treated as "large": `execute` then
/// streams it through the buffered reader and may attach a warning.
const LARGE_FILE_THRESHOLD: u64 = 10 << 20;

/// Hard ceiling on file size (100 MiB); `execute` rejects anything bigger.
const MAX_FILE_SIZE: u64 = 100 << 20;

/// Upper bound on the number of lines a single buffered read will return.
const MAX_LINES: usize = 100 * 1_000;

/// Decide whether `path` points at binary media that must not be read as text.
///
/// The decision is made purely from the (case-insensitive) file extension, so
/// no file I/O happens here.
///
/// # Returns
///
/// * `Some(message)` — the path looks like an image, video or document; the
///   message names the tool the agent should call instead.
/// * `None` — the path has no extension, the extension is not valid UTF-8, or
///   it is not one of the recognised media types.
fn media_tool_redirect(path: &std::path::Path) -> Option<String> {
    let ext = path.extension()?.to_str()?.to_lowercase();
    let p = path.display();

    let message = match ext.as_str() {
        // Both spellings of the TIFF extension are accepted, mirroring the
        // "jpg" / "jpeg" pair.
        "png" | "jpg" | "jpeg" | "gif" | "webp" | "bmp" | "svg" | "heic" | "heif" | "tiff"
        | "tif" => format!(
            "'{p}' is an image — reading it as text would be garbage. Call \
             analyze_image(image='{p}', question='...') to view it with a vision model."
        ),
        "mp4" | "m4v" | "mov" | "webm" | "mkv" | "avi" | "3gp" | "flv" => format!(
            "'{p}' is a video. Call analyze_video(path='{p}', question='...') to view it."
        ),
        "pdf" | "docx" | "doc" | "pptx" | "xlsx" | "epub" => {
            // Only PDFs get the extra hint about rendering pages to images.
            let pdf_hint = if ext == "pdf" {
                ", or pdf_to_images then analyze_image for scanned/figure pages"
            } else {
                ""
            };
            format!(
                "'{p}' is a document. Call parse_document(path='{p}') to read its text{pdf_hint}."
            )
        }
        _ => return None,
    };

    Some(message)
}

/// Read file tool.
///
/// Stateless unit struct; its behaviour lives in the `Tool` implementation,
/// which exposes it under the name `read_file`.
pub struct ReadTool;

/// Arguments accepted by the `read_file` tool, deserialized from the JSON
/// input in both `validate_input` and `execute`.
#[derive(Debug, Deserialize, Serialize)]
struct ReadInput {
    /// Path to the file to read
    path: String,

    /// Optional: Start line (0-indexed), i.e. the number of leading lines to
    /// skip before output begins.
    #[serde(skip_serializing_if = "Option::is_none")]
    start_line: Option<usize>,

    /// Optional: Number of lines to read (the buffered reader caps this at
    /// `MAX_LINES`).
    #[serde(skip_serializing_if = "Option::is_none")]
    line_count: Option<usize>,

    /// Optional: Output with hashline tags (HASH|content format, where HASH is a 2-char content hash).
    /// Treated as `false` when absent.
    #[serde(default)]
    hashline: Option<bool>,
}

#[async_trait]
impl Tool for ReadTool {
    fn name(&self) -> &str {
        "read_file"
    }

    fn description(&self) -> &str {
        "Read contents of a file from the filesystem. Can optionally read specific line ranges."
    }

    fn input_schema(&self) -> Value {
        serde_json::json!({
            "type": "object",
            "properties": {
                "path": {
                    "type": "string",
                    "description": "Path to the file to read (absolute or relative to working directory)"
                },
                "start_line": {
                    "type": "integer",
                    "description": "Optional: Starting line number (0-indexed)",
                    "minimum": 0
                },
                "line_count": {
                    "type": "integer",
                    "description": "Optional: Number of lines to read from start_line",
                    "minimum": 1
                },
                "hashline": {
                    "type": "boolean",
                    "description": "Optional: Output lines in HASH|content format (2-char content hash) for use with hashline_edit tool. Default: false."
                }
            },
            "required": ["path"]
        })
    }

    fn capabilities(&self) -> Vec<ToolCapability> {
        vec![ToolCapability::ReadFiles]
    }

    fn requires_approval(&self) -> bool {
        false // Reading files is generally safe
    }

    fn validate_input(&self, input: &Value) -> Result<()> {
        let _: ReadInput = serde_json::from_value(input.clone())
            .map_err(|e| ToolError::InvalidInput(format!("Invalid input: {}", e)))?;
        Ok(())
    }

    async fn execute(&self, input: Value, context: &ToolExecutionContext) -> Result<ToolResult> {
        let input: ReadInput = serde_json::from_value(input)?;

        // Validate path: safety check, existence, and file type
        let path = match validate_file_path(&input.path, &context.working_dir()) {
            Ok(p) => p,
            Err(msg) => return Ok(ToolResult::error(msg)),
        };

        // Bounce binary media (images/video/docs) to the right tool — reading
        // their bytes as text is meaningless, and the model otherwise loops on
        // read_file for a dropped screenshot instead of calling analyze_image.
        if let Some(redirect) = media_tool_redirect(&path) {
            return Ok(ToolResult::error(redirect));
        }

        // Check file size to prevent memory exhaustion
        let metadata = fs::metadata(&path).await.map_err(ToolError::Io)?;
        let file_size = metadata.len();

        if file_size > MAX_FILE_SIZE {
            return Ok(ToolResult::error(format!(
                "File too large: {} MB exceeds maximum {} MB. Use start_line and line_count to read portions.",
                file_size / (1024 * 1024),
                MAX_FILE_SIZE / (1024 * 1024)
            )));
        }

        let is_large_file = file_size > LARGE_FILE_THRESHOLD;

        let is_hashline = input.hashline.unwrap_or(false);

        // For large files or line-range requests, use buffered streaming
        let (output, total_lines, warning) =
            if input.start_line.is_some() || input.line_count.is_some() || is_large_file {
                self.read_with_buffer(&path, input.start_line, input.line_count, is_large_file)
                    .await?
            } else {
                // Small file: read entire contents directly
                let contents = fs::read_to_string(&path).await.map_err(ToolError::Io)?;
                let line_count = contents.lines().count();
                (contents, line_count, None)
            };

        // Apply hashline formatting if requested
        let output = if is_hashline {
            let file_start_line = input.start_line.unwrap_or(0) + 1; // convert 0-indexed to 1-indexed

            // First pass: compute all hashes and detect collisions
            let lines_with_hashes: Vec<(usize, String, &str)> = output
                .lines()
                .enumerate()
                .map(|(i, line)| {
                    let line_num = file_start_line + i;
                    let hash = hash_line(line);
                    (line_num, hash, line)
                })
                .collect();

            // Build reverse lookup to detect collisions
            let mut hash_to_lines: std::collections::HashMap<&str, Vec<usize>> =
                std::collections::HashMap::new();
            for (line_num, hash, _) in &lines_with_hashes {
                hash_to_lines
                    .entry(hash.as_str())
                    .or_default()
                    .push(*line_num);
            }

            // Identify collision hashes (appear on multiple lines) - own the strings
            let collision_hashes: std::collections::HashSet<String> = hash_to_lines
                .iter()
                .filter(|(_, lines)| lines.len() > 1)
                .map(|(hash, _)| hash.to_string())
                .collect();

            // Second pass: format output, marking collision lines
            let mut formatted_lines = Vec::new();
            for (_line_num, hash, line) in lines_with_hashes {
                if collision_hashes.contains(&hash) {
                    // Collision: don't show hash, add instruction
                    formatted_lines.push(format!("COLLISION|{}", line));
                } else {
                    formatted_lines.push(format_hashline(0, &hash, line));
                }
            }

            // Add collision warning at the end if any collisions detected
            if !collision_hashes.is_empty() {
                let collision_count = collision_hashes.len();
                formatted_lines.push(String::new());
                formatted_lines.push(format!(
                    "[WARNING: {} line(s) have hash collisions and cannot be edited with hashline_edit. Use the conventional edit_file tool with search/replace instead.]",
                    collision_count
                ));
            }

            formatted_lines.join("\n")
        } else {
            output
        };

        let output_len = output.len();
        let mut result = ToolResult::success(output)
            .with_metadata("path".to_string(), path.display().to_string())
            .with_metadata("bytes".to_string(), output_len.to_string())
            .with_metadata("total_lines".to_string(), total_lines.to_string());

        // Add warning for large files
        if let Some(warn_msg) = warning {
            result = result.with_metadata("warning".to_string(), warn_msg);
        }

        Ok(result)
    }
}

impl ReadTool {
    /// Read a line range from `path` using buffered, line-at-a-time I/O.
    ///
    /// # Parameters
    ///
    /// * `start_line` — number of leading lines to skip (0-indexed start);
    ///   `None` means start at the top.
    /// * `line_count` — number of lines wanted; `None` means "as many as
    ///   allowed". Either way at most `MAX_LINES` lines are returned.
    /// * `is_large_file` — whether the caller classified the file as large;
    ///   only influences the advisory warning.
    ///
    /// # Returns
    ///
    /// `(text, total_lines, warning)` where `text` is the selected lines
    /// joined with `\n` (no trailing newline), `total_lines` is the line count
    /// of the whole file, and `warning` is set when the output was cut at
    /// `MAX_LINES` or when a large file was read without an explicit
    /// `line_count`.
    ///
    /// # Errors
    ///
    /// * `ToolError::Io` if the file cannot be opened or read.
    /// * `ToolError::InvalidInput` if the file ends before `start_line` lines
    ///   have been skipped.
    async fn read_with_buffer(
        &self,
        path: &std::path::Path,
        start_line: Option<usize>,
        line_count: Option<usize>,
        is_large_file: bool,
    ) -> Result<(String, usize, Option<String>)> {
        let file = fs::File::open(path).await.map_err(ToolError::Io)?;
        let mut lines = BufReader::new(file).lines();

        let start = start_line.unwrap_or(0);
        // A missing line_count behaves like an unbounded request; either way
        // the MAX_LINES cap applies.
        let requested = line_count.unwrap_or(usize::MAX);
        let max_lines = requested.min(MAX_LINES);
        let cap_applies = requested > MAX_LINES;

        // Running count of every line seen in the file so far.
        let mut total_lines = 0usize;

        // Skip lines before start
        while total_lines < start {
            match lines.next_line().await.map_err(ToolError::Io)? {
                Some(_) => total_lines += 1,
                None => {
                    return Err(ToolError::InvalidInput(format!(
                        "Start line {} exceeds file length {}",
                        start, total_lines
                    )));
                }
            }
        }

        // Read requested lines
        let mut output = String::new();
        let mut lines_read = 0usize;
        while lines_read < max_lines {
            match lines.next_line().await.map_err(ToolError::Io)? {
                Some(line) => {
                    // Separate on the number of lines emitted, not on whether
                    // `output` is empty: otherwise blank lines at the start of
                    // the range would be silently dropped.
                    if lines_read > 0 {
                        output.push('\n');
                    }
                    output.push_str(&line);
                    lines_read += 1;
                    total_lines += 1;
                }
                None => break,
            }
        }

        // Count the remaining lines without keeping them in memory.
        let mut remaining = 0usize;
        while lines.next_line().await.map_err(ToolError::Io)?.is_some() {
            remaining += 1;
        }
        total_lines += remaining;

        // Truncation only happened if the MAX_LINES cap (rather than the
        // caller's own line_count) stopped the read AND lines were left over.
        let truncated = cap_applies && lines_read == MAX_LINES && remaining > 0;

        let warning = if truncated {
            Some(format!(
                "Output truncated at {} lines. File has {} total lines. Use start_line and line_count for pagination.",
                MAX_LINES, total_lines
            ))
        } else if is_large_file && line_count.is_none() {
            Some(format!(
                "Large file ({} lines). Consider using start_line and line_count for better performance.",
                total_lines
            ))
        } else {
            None
        };

        Ok((output, total_lines, warning))
    }
}