use super::path_input::required_path_field;
use super::schemas::read_file_input_schema;
use super::{DEFAULT_LIMIT, FILE_SIZE_LINE_COUNT_LIMIT, MAX_FILE_SIZE, MAX_LIMIT};
use crate::tools::spec::{
ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec, optional_str, optional_u64,
};
use async_trait::async_trait;
use regex::Regex;
use serde_json::{Value, json};
use std::fs;
use std::io::{BufRead, BufReader, Read};
use std::path::Path;
use std::process::{Command, Stdio};
use std::sync::LazyLock;
use zagens_config::workspace_meta_file_read;
/// Tool that reads a workspace file and returns its text content.
///
/// PDF and OOXML (DOCX/XLSX/PPTX) files are routed to dedicated extractors;
/// everything else is treated as plain text and returned as a window of lines.
pub struct ReadFileTool;

#[async_trait]
impl ToolSpec for ReadFileTool {
    /// Identifier under which the tool is registered.
    fn name(&self) -> &'static str {
        "read_file"
    }

    /// Human/model-facing summary of what the tool does.
    fn description(&self) -> &'static str {
        "Read a file from the workspace. Plain text uses line paging (start_line or offset + limit) with streaming newline decode (low memory); files starting with UTF-16/UTF-32 BOM use full-file decode. PDFs: `pdftotext` or `pdf-extract`. DOCX/XLSX/PPTX: extracts text from OOXML ZIP."
    }

    /// JSON schema describing the accepted input object.
    fn input_schema(&self) -> Value {
        read_file_input_schema()
    }

    /// The tool never writes, and is declared sandboxable.
    fn capabilities(&self) -> Vec<ToolCapability> {
        vec![ToolCapability::ReadOnly, ToolCapability::Sandboxable]
    }

    /// Reads do not interfere with each other, so parallel calls are allowed.
    fn supports_parallel(&self) -> bool {
        true
    }

    /// Reads the requested file.
    ///
    /// Recognised input keys (besides the path field extracted by
    /// `required_path_field`): `pages` (PDF only), `start_line` / `offset`
    /// (1-based first line, `start_line` wins when both are given) and `limit`
    /// (clamped to `1..=MAX_LIMIT`, default `DEFAULT_LIMIT`).
    ///
    /// # Errors
    ///
    /// Returns a `ToolError` when the path cannot be resolved, when the file is
    /// larger than `MAX_FILE_SIZE`, or when reading/decoding fails.
    async fn execute(&self, input: Value, context: &ToolContext) -> Result<ToolResult, ToolError> {
        let path_str = required_path_field(&input, "read_file")?;
        let file_path = context.resolve_path(path_str)?;
        let pages = optional_str(&input, "pages");

        // Binary document formats have their own extractors and ignore paging.
        if is_pdf(&file_path)? {
            return read_pdf(&file_path, pages);
        }
        if is_docx(&file_path)? {
            return read_docx(&file_path);
        }
        if is_xlsx(&file_path)? {
            return read_xlsx(&file_path);
        }
        if is_pptx(&file_path)? {
            return read_pptx(&file_path);
        }

        // `start_line` takes precedence over `offset`; both are 1-based and a
        // value of 0 is treated as 1.
        let start_line = match (
            input.get("start_line").and_then(Value::as_u64),
            input.get("offset").and_then(Value::as_u64),
        ) {
            (Some(s), _) => s.max(1),
            (None, Some(o)) => o.max(1),
            (None, None) => 1,
        };
        let limit =
            optional_u64(&input, "limit", DEFAULT_LIMIT as u64).clamp(1, MAX_LIMIT as u64) as usize;

        // A failed metadata lookup is not fatal here: the size is simply
        // unknown and the read below reports the real I/O error.
        let size_bytes = fs::metadata(&file_path).ok().map(|m| m.len());
        if let Some(size) = size_bytes
            && size > MAX_FILE_SIZE
        {
            return Err(ToolError::execution_failed(format!(
                "[TOO_LARGE] 文件 {} 大小 {} 超过读取上限 ({}MB)",
                file_path.display(),
                size,
                MAX_FILE_SIZE / 1024 / 1024
            )));
        }

        // Total line counts are only reported for files up to the size limit.
        let sniff_totals = size_bytes.is_some_and(|s| s <= FILE_SIZE_LINE_COUNT_LIMIT);
        // Saturate instead of truncating so an absurd `start_line` cannot wrap
        // around to a small skip count on narrow targets.
        let skip = usize::try_from(start_line.saturating_sub(1)).unwrap_or(usize::MAX);

        let (collected, truncated, total_lines_known, encoding_used, encoding_detected_via) =
            if file_needs_bulk_text_decode(&file_path)? {
                // UTF-16/UTF-32 BOM: newline bytes cannot be found by a byte
                // scan, so decode the whole file and page over decoded lines.
                let bytes =
                    fs::read(&file_path).map_err(|e| map_plain_read_io_error(&file_path, e))?;
                let (text, encoding_used, encoding_detected_via) = detect_and_decode(&bytes);
                let all_lines: Vec<&str> = text.lines().collect();
                // `skip`/`take` never add `skip + limit`, which could overflow
                // for a caller-supplied `start_line` near `u64::MAX`.
                let collected: Vec<String> = all_lines
                    .iter()
                    .skip(skip)
                    .take(limit)
                    .map(|line| (*line).to_owned())
                    .collect();
                let truncated = skip.saturating_add(collected.len()) < all_lines.len();
                let total_lines_known = sniff_totals.then_some(all_lines.len());
                (
                    collected,
                    truncated,
                    total_lines_known,
                    encoding_used,
                    encoding_detected_via,
                )
            } else {
                read_plain_lines_stream(&file_path, skip, limit, sniff_totals)
                    .map_err(|e| map_plain_read_io_error(&file_path, e))?
            };

        let mut content = collected.join("\n");

        // For long files, prepend the symbol-index summary when one exists for
        // this workspace-relative path. Every failure here is silently ignored.
        if let Some(total) = total_lines_known
            && total >= 500
        {
            let rel = file_path
                .strip_prefix(&context.workspace)
                .unwrap_or(&file_path)
                .to_string_lossy()
                .replace('\\', "/");
            let index_path = workspace_meta_file_read(&context.workspace, "symbols.json");
            if let Ok(raw) = std::fs::read_to_string(&index_path)
                && let Ok(index) = serde_json::from_str::<crate::symbol_index::SymbolIndex>(&raw)
                && let Some(summary) = crate::symbol_index::format_file_summary(&index, &rel, total)
            {
                content = format!("{summary}\n\n---\n\n{content}");
            }
        }

        // Tell the caller how to request the next window when lines remain.
        if truncated && !collected.is_empty() {
            let line_range = format!(
                "第 {}-{} 行",
                start_line,
                start_line + collected.len() as u64 - 1
            );
            let next = start_line + collected.len() as u64;
            if let Some(t) = total_lines_known {
                content.push_str(&format!(
                    "\n\n... ({} 行,共 {} 行; 下一窗口设 start_line={} 或 offset={} 接续)",
                    line_range, t, next, next,
                ));
            } else {
                content.push_str(&format!(
                    "\n\n... ({} 行; 下一窗口设 start_line={} 或 offset={} 接续 — 文件中还有更多行)",
                    line_range, next, next,
                ));
            }
        }

        let mut metadata = json!({
            "path": file_path.to_string_lossy(),
            "lines_read": collected.len(),
            "truncated": truncated,
            "encoding_used": encoding_used,
            "encoding_detected_via": encoding_detected_via,
        });
        if let Some(s) = size_bytes {
            metadata["size_bytes"] = json!(s);
        }
        if let Some(t) = total_lines_known {
            metadata["total_lines"] = json!(t);
        }
        Ok(ToolResult::success(content).with_metadata(metadata))
    }
}
/// Reports whether `path` should be handled as a PDF.
///
/// A `.pdf` extension (any ASCII case) is accepted without touching the file.
/// Otherwise the first four bytes are compared against the `%PDF` magic; a file
/// that cannot be opened or is shorter than four bytes is not a PDF.
///
/// The function currently never returns `Err`.
pub(in crate::tools::file) fn is_pdf(path: &Path) -> Result<bool, ToolError> {
    use std::io::Read;

    let has_pdf_extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"));
    if has_pdf_extension {
        return Ok(true);
    }

    let Ok(mut file) = fs::File::open(path) else {
        return Ok(false);
    };
    let mut magic = [0u8; 4];
    let starts_with_magic = file.read_exact(&mut magic).is_ok() && &magic == b"%PDF";
    Ok(starts_with_magic)
}
/// Parses a `pages` argument of the form `N` or `N-M` into an inclusive,
/// 1-based `(first, last)` page range.
///
/// Surrounding whitespace (and whitespace around the dash) is ignored.
/// Returns `None` for empty input, non-numeric parts, page `0`, or a range
/// whose end is before its start.
pub(in crate::tools::file) fn parse_pages_arg(spec: &str) -> Option<(u32, u32)> {
    let spec = spec.trim();
    if spec.is_empty() {
        return None;
    }

    let (first, last) = match spec.split_once('-') {
        Some((lo, hi)) => (
            lo.trim().parse::<u32>().ok()?,
            hi.trim().parse::<u32>().ok()?,
        ),
        None => {
            let page = spec.parse::<u32>().ok()?;
            (page, page)
        }
    };

    (first >= 1 && last >= first).then_some((first, last))
}
/// Decodes a whole file buffer to text, guessing its encoding.
///
/// Returns `(text, encoding_label, detection_method)`. Detection order:
///
/// 1. empty input → `"utf-8"` / `"empty"`;
/// 2. a BOM recognised by `encoding_rs` → that encoding / `"bom"`;
/// 3. valid UTF-8 → `"utf-8"` / `"default"`;
/// 4. GB18030 decoding without errors → `"gb18030"` / `"fallback"`;
/// 5. otherwise Windows-1252 (which accepts any byte sequence) / `"fallback"`.
///
/// Malformed sequences in the BOM and Windows-1252 paths are replaced rather
/// than reported.
pub(crate) fn detect_and_decode(bytes: &[u8]) -> (String, String, String) {
    if bytes.is_empty() {
        return (String::new(), "utf-8".into(), "empty".into());
    }

    if let Some((enc, bom_len)) = encoding_rs::Encoding::for_bom(bytes) {
        // The BOM is already stripped here, so decode WITHOUT further BOM
        // handling: `Encoding::decode` would sniff again and could switch
        // encodings if the payload happens to start with BOM-like bytes.
        let (text, _had_errors) = enc.decode_without_bom_handling(&bytes[bom_len..]);
        let label = enc.name().to_ascii_lowercase();
        return (text.into_owned(), label, "bom".into());
    }

    if let Ok(text) = std::str::from_utf8(bytes) {
        return (text.to_owned(), "utf-8".into(), "default".into());
    }

    // No BOM is present at this point, so BOM handling is irrelevant for the
    // fallbacks; the explicit variant keeps all three paths consistent.
    let (text, had_errors) = encoding_rs::GB18030.decode_without_bom_handling(bytes);
    if !had_errors {
        return (text.into_owned(), "gb18030".into(), "fallback".into());
    }

    let (text, _had_errors) = encoding_rs::WINDOWS_1252.decode_without_bom_handling(bytes);
    let label = "windows-1252 (gb18030 had errors)".to_string();
    (text.into_owned(), label, "fallback".into())
}
/// Guesses an encoding label for a byte sample without returning decoded text.
///
/// Returns `None` for an empty sample. Otherwise the label is, in order of
/// preference: the lowercase name of the BOM-indicated encoding, `"utf-8"` if
/// the sample is valid UTF-8, `"gb18030"` if it decodes as GB18030 without
/// errors, or `"windows-1252-likely"` as the last resort.
pub fn sniff_encoding_label(sample: &[u8]) -> Option<String> {
    if sample.is_empty() {
        return None;
    }

    let label = if let Some((bom_encoding, _)) = encoding_rs::Encoding::for_bom(sample) {
        bom_encoding.name().to_ascii_lowercase()
    } else if std::str::from_utf8(sample).is_ok() {
        String::from("utf-8")
    } else {
        let (_, _, gb_had_errors) = encoding_rs::GB18030.decode(sample);
        if gb_had_errors {
            String::from("windows-1252-likely")
        } else {
            String::from("gb18030")
        }
    };
    Some(label)
}
/// Encoding that was used to decode a single physical line in the streaming
/// reader; tallied per line to build the file-level encoding summary.
#[derive(Clone, Copy)]
enum PhysicalLineEnc {
/// The line was valid UTF-8 (also used for empty lines).
Utf8,
/// The line was not valid UTF-8 but decoded as GB18030 without errors.
Gb18030,
/// Last-resort fallback when neither UTF-8 nor GB18030 decoded cleanly.
Win1252,
}
/// Converts an I/O error raised while reading `path` into a `ToolError`.
///
/// "Not found" and "permission denied" get tagged messages (`[NOT_FOUND]`,
/// `[PERMISSION]`); every other error kind uses a generic message.
fn map_plain_read_io_error(path: &Path, e: std::io::Error) -> ToolError {
    use std::io::ErrorKind;

    let shown = path.display();
    let message = match e.kind() {
        ErrorKind::NotFound => format!("[NOT_FOUND] 文件 {shown} 不存在: {e}"),
        ErrorKind::PermissionDenied => format!("[PERMISSION] 没有权限读取 {shown}: {e}"),
        _ => format!("Failed to read {shown}: {e}"),
    };
    ToolError::execution_failed(message)
}
/// Reports whether the file starts with a UTF-16 or UTF-32 byte-order mark.
///
/// Such files are decoded in one piece by the caller instead of being streamed
/// line by line. Only the first four bytes (at most) are read.
///
/// # Errors
///
/// Open and read failures are mapped through `map_plain_read_io_error`.
fn file_needs_bulk_text_decode(path: &Path) -> Result<bool, ToolError> {
    let mut head = [0u8; 4];
    let filled = fs::File::open(path)
        .and_then(|mut file| file.read(&mut head))
        .map_err(|e| map_plain_read_io_error(path, e))?;

    // Fewer than two bytes match none of the patterns. The UTF-32LE mark
    // (FF FE 00 00) is covered by the UTF-16LE prefix FF FE.
    let has_wide_bom = matches!(
        &head[..filled],
        [0xFF, 0xFE, ..] | [0xFE, 0xFF, ..] | [0x00, 0x00, 0xFE, 0xFF]
    );
    Ok(has_wide_bom)
}
/// Removes one trailing line terminator from `b`, if present.
///
/// Exactly one of `\r\n`, `\n` or `\r` is stripped, tried in that order; any
/// earlier terminators are left in place.
fn trim_line_terminator(b: &[u8]) -> &[u8] {
    b.strip_suffix(b"\r\n")
        .or_else(|| b.strip_suffix(b"\n"))
        .or_else(|| b.strip_suffix(b"\r"))
        .unwrap_or(b)
}
/// Decodes one physical line (raw bytes, terminator included) to text and
/// reports which encoding was used.
///
/// The trailing line terminator is removed first. When `strip_utf8_bom` is
/// set, a leading UTF-8 BOM is dropped as well (the caller sets this for the
/// first line of a file). Decoding tries UTF-8, then GB18030 (accepted only
/// when it decodes without errors), then falls back to Windows-1252.
fn decode_physical_line(bytes: &[u8], strip_utf8_bom: bool) -> (String, PhysicalLineEnc) {
    let mut slice = trim_line_terminator(bytes);
    if strip_utf8_bom {
        slice = slice.strip_prefix(b"\xEF\xBB\xBF").unwrap_or(slice);
    }

    // Validate once and reuse the result; an empty slice is valid UTF-8.
    if let Ok(text) = std::str::from_utf8(slice) {
        return (text.to_owned(), PhysicalLineEnc::Utf8);
    }

    // Decode WITHOUT BOM handling: `Encoding::decode` sniffs for a BOM first,
    // so a legacy-encoded line that happens to start with FF FE / FE FF /
    // EF BB BF would be decoded as UTF-16/UTF-8 while still being labelled
    // GB18030 or Windows-1252.
    let (gb_text, gb_had_errors) = encoding_rs::GB18030.decode_without_bom_handling(slice);
    if !gb_had_errors {
        return (gb_text.into_owned(), PhysicalLineEnc::Gb18030);
    }

    let (fallback_text, _) = encoding_rs::WINDOWS_1252.decode_without_bom_handling(slice);
    (fallback_text.into_owned(), PhysicalLineEnc::Win1252)
}
/// Builds the file-level encoding label from per-line encoding counts.
///
/// `utf`, `gbk` and `win` are the number of lines decoded as UTF-8, GB18030
/// and Windows-1252. When at most one of them is non-zero the plain encoding
/// name is returned (`"utf-8"` when all are zero); otherwise a `mixed(...)`
/// label listing all three counts.
fn summarize_physical_line_encoding(utf: u64, gbk: u64, win: u64) -> String {
    match (utf > 0, gbk > 0, win > 0) {
        // Only UTF-8 lines, or no lines at all.
        (_, false, false) => String::from("utf-8"),
        (false, true, false) => String::from("gb18030"),
        (false, false, true) => String::from("windows-1252"),
        _ => format!("mixed(utf8_lines={utf}, gb18030_lines={gbk}, windows1252_lines={win})"),
    }
}
/// Result of [`read_plain_lines_stream`]: `(lines, truncated, total_lines,
/// encoding_used, encoding_detected_via)`.
type PlainLinesStreamResult =
    Result<(Vec<String>, bool, Option<usize>, String, String), std::io::Error>;

/// Streams `path` line by line and returns a window of decoded lines.
///
/// * `skip` – number of leading lines to pass over.
/// * `limit` – maximum number of lines to return.
/// * `sniff_totals` – whether to report the total line count.
///
/// The whole file is always scanned: every line is decoded so the encoding
/// summary covers the full file, and `truncated` is set when lines remain
/// after the returned window. The detection method is always
/// `"streaming-line"`.
///
/// # Errors
///
/// Propagates any I/O error from opening or reading the file.
fn read_plain_lines_stream(
    path: &Path,
    skip: usize,
    limit: usize,
    sniff_totals: bool,
) -> PlainLinesStreamResult {
    let mut reader = BufReader::new(fs::File::open(path)?);
    let lines_to_skip = skip as u64;

    let mut raw_line = Vec::new();
    let mut window = Vec::new();
    let mut lines_seen: u64 = 0;
    let (mut utf8_lines, mut gb18030_lines, mut win1252_lines) = (0u64, 0u64, 0u64);

    while reader.read_until(b'\n', &mut raw_line)? != 0 {
        lines_seen += 1;
        // Only the very first line may carry a UTF-8 BOM.
        let (text, encoding) = decode_physical_line(&raw_line, lines_seen == 1);
        raw_line.clear();

        match encoding {
            PhysicalLineEnc::Utf8 => utf8_lines += 1,
            PhysicalLineEnc::Gb18030 => gb18030_lines += 1,
            PhysicalLineEnc::Win1252 => win1252_lines += 1,
        }

        if lines_seen > lines_to_skip && window.len() < limit {
            window.push(text);
        }
    }

    // Lines at or after the window start; more than `limit` means some were
    // left out.
    let available = lines_seen.saturating_sub(lines_to_skip);
    let truncated = available > limit as u64;
    let total_lines_known = sniff_totals.then_some(lines_seen as usize);
    let encoding_used =
        summarize_physical_line_encoding(utf8_lines, gb18030_lines, win1252_lines);

    Ok((
        window,
        truncated,
        total_lines_known,
        encoding_used,
        String::from("streaming-line"),
    ))
}
// Text-run extractors for OOXML parts. Capture group 1 is the element text.
//
// * `(?s)` lets `.` match newlines, so text that spans several lines (for
//   example multi-line spreadsheet cells) is not skipped. A skipped `<si>`
//   would shift every later shared-string index.
// * The opening tag must be exactly the element name, optionally followed by
//   attributes. The previous `<w:t[^>]*>` form also matched `<w:tbl>`,
//   `<w:tab/>`, `<a:tbl>` and similar, and then captured raw XML up to the
//   next closing tag.
// * The attribute part may not end in `/`, so self-closing empty elements are
//   not mistaken for an opening tag.

/// Text run (`<w:t>`) inside a WordprocessingML document.
static DOCX_WT_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?s)<w:t(?:\s[^>]*[^>/])?>(.*?)</w:t>").unwrap());
/// One shared-string item (`<si>`) in `xl/sharedStrings.xml`.
static XLSX_SI_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?s)<si(?:\s[^>]*[^>/])?>(.*?)</si>").unwrap());
/// Text element (`<t>`) inside a SpreadsheetML string item.
static XLSX_T_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?s)<t(?:\s[^>]*[^>/])?>(.*?)</t>").unwrap());
/// Text run (`<a:t>`) inside a DrawingML slide.
static PPTX_AT_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?s)<a:t(?:\s[^>]*[^>/])?>(.*?)</a:t>").unwrap());
/// Reports whether `path` has a `.docx` extension (ASCII case-insensitive).
///
/// Only the file name is inspected; the function currently never returns `Err`.
fn is_docx(path: &Path) -> Result<bool, ToolError> {
    let has_docx_extension = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.eq_ignore_ascii_case("docx"),
        None => false,
    };
    Ok(has_docx_extension)
}
/// Reports whether `path` has a `.xlsx` extension (ASCII case-insensitive).
///
/// Only the file name is inspected; the function currently never returns `Err`.
fn is_xlsx(path: &Path) -> Result<bool, ToolError> {
    let has_xlsx_extension = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.eq_ignore_ascii_case("xlsx"),
        None => false,
    };
    Ok(has_xlsx_extension)
}
/// Reports whether `path` has a `.pptx` extension (ASCII case-insensitive).
///
/// Only the file name is inspected; the function currently never returns `Err`.
fn is_pptx(path: &Path) -> Result<bool, ToolError> {
    let has_pptx_extension = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.eq_ignore_ascii_case("pptx"),
        None => false,
    };
    Ok(has_pptx_extension)
}
/// Extracts the plain text of a DOCX file.
///
/// Opens the file as a ZIP archive, reads `word/document.xml`, and emits one
/// output line per paragraph (split on `</w:p>`), each being the concatenation
/// of that paragraph's `<w:t>` runs. Paragraphs that are empty after trimming
/// are skipped. If nothing remains, a placeholder message is returned instead.
///
/// # Errors
///
/// Fails when the file cannot be opened, is not a readable ZIP archive, or
/// `word/document.xml` is missing or unreadable.
pub(crate) fn read_docx(path: &Path) -> Result<ToolResult, ToolError> {
    let size_bytes = fs::metadata(path).ok().map(|meta| meta.len());

    let file = match fs::File::open(path) {
        Ok(handle) => handle,
        Err(e) => {
            return Err(ToolError::execution_failed(format!(
                "[NOT_FOUND] 无法打开 DOCX 文件 {}: {e}",
                path.display()
            )));
        }
    };
    let mut archive = match zip::ZipArchive::new(file) {
        Ok(opened) => opened,
        Err(e) => {
            return Err(ToolError::execution_failed(format!(
                "[BINARY] 无法解析 DOCX/ZIP {}: {e}",
                path.display()
            )));
        }
    };

    let mut doc_xml = String::new();
    {
        let mut entry = archive.by_name("word/document.xml").map_err(|e| {
            ToolError::execution_failed(format!(
                "[BINARY] word/document.xml not found in {}: {e}",
                path.display()
            ))
        })?;
        entry.read_to_string(&mut doc_xml).map_err(|e| {
            ToolError::execution_failed(format!(
                "Failed to read word/document.xml from {}: {e}",
                path.display()
            ))
        })?;
    }

    // One output line per paragraph that still has text after trimming.
    let paragraphs: Vec<String> = doc_xml
        .split("</w:p>")
        .filter_map(|paragraph_xml| {
            let runs: String = DOCX_WT_RE
                .captures_iter(paragraph_xml)
                .filter_map(|caps| caps.get(1))
                .map(|run| run.as_str())
                .collect();
            let text = runs.trim();
            (!text.is_empty()).then(|| text.to_owned())
        })
        .collect();

    let metadata = json!({
        "path": path.to_string_lossy(),
        "kind": "docx",
        "size_bytes": size_bytes,
    });
    let body = if paragraphs.is_empty() {
        String::from("[DOCX] 文件内容为空或仅包含非文本元素。")
    } else {
        paragraphs.join("\n")
    };
    Ok(ToolResult::success(body).with_metadata(metadata))
}
fn read_xlsx(path: &Path) -> Result<ToolResult, ToolError> {
let size_bytes = fs::metadata(path).map(|m| m.len()).ok();
let file = fs::File::open(path).map_err(|e| {
ToolError::execution_failed(format!(
"[NOT_FOUND] 无法打开 XLSX 文件 {}: {e}",
path.display()
))
})?;
let mut archive = zip::ZipArchive::new(file).map_err(|e| {
ToolError::execution_failed(format!(
"[BINARY] 无法解析 XLSX/ZIP {}: {e}",
path.display()
))
})?;
let mut shared_strings: Vec<String> = Vec::new();
if let Ok(mut entry) = archive.by_name("xl/sharedStrings.xml") {
let mut ss_xml = String::new();
entry.read_to_string(&mut ss_xml).ok();
for si_cap in XLSX_SI_RE.captures_iter(&ss_xml) {
let si_text = si_cap.get(1).map(|m| m.as_str()).unwrap_or("");
let mut merged = String::new();
for t_cap in XLSX_T_RE.captures_iter(si_text) {
if let Some(tm) = t_cap.get(1) {
merged.push_str(tm.as_str());
}
}
shared_strings.push(merged);
}
}
let mut sheet_names: Vec<String> = Vec::new();
if let Ok(mut entry) = archive.by_name("xl/workbook.xml") {
let mut wb_xml = String::new();
entry.read_to_string(&mut wb_xml).ok();
let name_re = regex::Regex::new(r#"name="([^"]*)""#).unwrap();
for cap in name_re.captures_iter(&wb_xml) {
sheet_names.push(cap[1].to_string());
}
}
let sheet_re =
regex::Regex::new(r#"<c r="([A-Z]+)(\d+)"(?:\s+t="([^"]*)")?>(?:<v>([^<]*)</v>)?</c>"#)
.unwrap();
let inline_re = regex::Regex::new(
r#"<c r="([A-Z]+)(\d+)"[^>]*t="inlineStr"[^>]*>.*?<t[^>]*>(.*?)</t>.*?</c>"#,
)
.unwrap();
let mut result = String::new();
for i in 1.. {
let sheet_path = format!("xl/worksheets/sheet{i}.xml");
let sheet_xml = match archive.by_name(&sheet_path) {
Ok(mut entry) => {
let mut s = String::new();
entry.read_to_string(&mut s).ok();
s
}
Err(_) => break,
};
let name = sheet_names
.get(i - 1)
.cloned()
.unwrap_or_else(|| format!("Sheet{i}"));
if !result.is_empty() {
result.push('\n');
}
result.push_str(&format!("=== Sheet: {name} ===\n"));
let mut rows: std::collections::BTreeMap<u64, Vec<(String, String)>> =
std::collections::BTreeMap::new();
for cap in inline_re.captures_iter(&sheet_xml) {
let col = cap.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
let row: u64 = cap
.get(2)
.and_then(|m| m.as_str().parse().ok())
.unwrap_or(0);
let text = cap.get(3).map(|m| m.as_str()).unwrap_or("");
rows.entry(row).or_default().push((col, text.to_string()));
}
for cap in sheet_re.captures_iter(&sheet_xml) {
let col = cap.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
let row: u64 = cap
.get(2)
.and_then(|m| m.as_str().parse().ok())
.unwrap_or(0);
let t_type = cap.get(3).map(|m| m.as_str()).unwrap_or("");
let val = cap.get(4).map(|m| m.as_str()).unwrap_or("");
if t_type == "inlineStr" {
continue; }
let cell_text = if t_type == "s" {
let idx: usize = val.parse().unwrap_or(0);
shared_strings.get(idx).cloned().unwrap_or_default()
} else {
val.to_string()
};
rows.entry(row).or_default().push((col, cell_text));
}
for cells in rows.values() {
let line: Vec<String> = cells
.iter()
.map(|(col, txt)| format!("[{col}] {txt}"))
.collect();
result.push_str(&line.join(" "));
result.push('\n');
}
}
if result.is_empty() {
return Ok(
ToolResult::success("[XLSX] 文件内容为空或无有效数据。").with_metadata(json!({
"path": path.to_string_lossy(),
"kind": "xlsx",
"size_bytes": size_bytes,
})),
);
}
Ok(
ToolResult::success(result.trim_end().to_string()).with_metadata(json!({
"path": path.to_string_lossy(),
"kind": "xlsx",
"size_bytes": size_bytes,
})),
)
}
/// Extracts the text of a PPTX presentation.
///
/// Walks `ppt/slides/slide1.xml`, `slide2.xml`, … until one is missing. Each
/// slide that contains text is emitted as a `=== Slide N ===` header followed
/// by one line per paragraph (split on `</a:p>`), each being the concatenation
/// of that paragraph's `<a:t>` runs. Paragraphs are kept on separate lines so
/// a title and its bullets no longer run together. If no slide has text, a
/// placeholder message is returned instead.
///
/// Unreadable slide entries are treated as empty (best effort).
///
/// # Errors
///
/// Fails when the file cannot be opened or is not a readable ZIP archive.
pub(crate) fn read_pptx(path: &Path) -> Result<ToolResult, ToolError> {
    let size_bytes = fs::metadata(path).map(|m| m.len()).ok();
    let file = fs::File::open(path).map_err(|e| {
        ToolError::execution_failed(format!(
            "[NOT_FOUND] 无法打开 PPTX 文件 {}: {e}",
            path.display()
        ))
    })?;
    let mut archive = zip::ZipArchive::new(file).map_err(|e| {
        ToolError::execution_failed(format!(
            "[BINARY] 无法解析 PPTX/ZIP {}: {e}",
            path.display()
        ))
    })?;

    let mut result = String::new();
    for i in 1.. {
        let slide_path = format!("ppt/slides/slide{i}.xml");
        let slide_xml = match archive.by_name(&slide_path) {
            Ok(mut entry) => {
                let mut s = String::new();
                entry.read_to_string(&mut s).ok();
                s
            }
            // The first missing slide number ends the scan.
            Err(_) => break,
        };

        // One output line per paragraph that still has text after trimming.
        let paragraphs: Vec<String> = slide_xml
            .split("</a:p>")
            .filter_map(|paragraph_xml| {
                let runs: String = PPTX_AT_RE
                    .captures_iter(paragraph_xml)
                    .filter_map(|cap| cap.get(1))
                    .map(|m| m.as_str())
                    .collect();
                let text = runs.trim();
                (!text.is_empty()).then(|| text.to_owned())
            })
            .collect();
        if paragraphs.is_empty() {
            continue;
        }

        if !result.is_empty() {
            result.push('\n');
        }
        result.push_str(&format!("=== Slide {i} ===\n"));
        result.push_str(&paragraphs.join("\n"));
    }

    if result.is_empty() {
        return Ok(
            ToolResult::success("[PPTX] 文件内容为空或仅包含非文本元素。").with_metadata(json!({
                "path": path.to_string_lossy(),
                "kind": "pptx",
                "size_bytes": size_bytes,
            })),
        );
    }
    Ok(ToolResult::success(result).with_metadata(json!({
        "path": path.to_string_lossy(),
        "kind": "pptx",
        "size_bytes": size_bytes,
    })))
}
/// Extracts text from a PDF.
///
/// The external `pdftotext` program (run with `-layout`, plus `-f`/`-l` when
/// `pages` is given) is tried first. Only when that program is not installed
/// does the function fall back to the in-process `pdf-extract` crate, which
/// always returns the whole document regardless of `pages`.
///
/// When the fallback cannot produce text either, a successful result carrying
/// a `binary_unavailable` JSON payload (with an install hint) is returned
/// rather than an error.
///
/// # Errors
///
/// * invalid `pages` value;
/// * `pdftotext` could not be launched for a reason other than "not found",
///   did not complete, or exited unsuccessfully;
/// * the `binary_unavailable` payload could not be serialized.
pub(crate) fn read_pdf(path: &Path, pages: Option<&str>) -> Result<ToolResult, ToolError> {
    const POPPLER_HINT: &str = "install poppler for better PDF support (macOS: `brew install poppler`; Debian/Ubuntu: `apt install poppler-utils`)";

    let size_bytes = fs::metadata(path).ok().map(|meta| meta.len());

    // Validate the page range up front; an unparsable value is a caller error.
    let valid_pages = match pages {
        None => None,
        Some(spec) => match parse_pages_arg(spec) {
            Some(range) => Some(range),
            None => {
                return Err(ToolError::invalid_input(format!(
                    "invalid `pages` value `{spec}` (expected `N` or `N-M`, e.g. `1-5`)"
                )));
            }
        },
    };

    let mut cmd = Command::new("pdftotext");
    cmd.arg("-layout");
    if let Some((first_page, last_page)) = valid_pages {
        cmd.arg("-f").arg(first_page.to_string());
        cmd.arg("-l").arg(last_page.to_string());
    }
    // "-" as the output file makes pdftotext write to stdout.
    cmd.arg(path)
        .arg("-")
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());

    match cmd.spawn() {
        Ok(child) => {
            let output = child.wait_with_output().map_err(|e| {
                ToolError::execution_failed(format!("pdftotext failed to complete: {e}"))
            })?;
            if !output.status.success() {
                let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
                return Err(ToolError::execution_failed(format!(
                    "pdftotext failed (exit {:?}): {stderr}",
                    output.status.code()
                )));
            }
            let text = String::from_utf8_lossy(&output.stdout).into_owned();
            let mut metadata = json!({
                "path": path.to_string_lossy(),
                "kind": "pdf",
                "extractor": "pdftotext",
                "size_bytes": size_bytes,
            });
            if let Some((first_page, last_page)) = valid_pages {
                metadata["pages"] = json!(format!("{}-{}", first_page, last_page));
            }
            return Ok(ToolResult::success(text).with_metadata(metadata));
        }
        Err(e) if e.kind() != std::io::ErrorKind::NotFound => {
            return Err(ToolError::execution_failed(format!(
                "failed to launch pdftotext: {e}"
            )));
        }
        // pdftotext is not installed: continue with the pdf-extract fallback.
        Err(_) => {}
    }

    // Builds the "could not extract" response shared by all fallback failures.
    let unavailable = |reason: &str, detail: Option<String>| -> Result<ToolResult, ToolError> {
        let mut payload = json!({
            "type": "binary_unavailable",
            "path": path.display().to_string(),
            "kind": "pdf",
            "reason": reason,
        });
        if let Some(detail) = detail {
            payload["detail"] = json!(detail);
        }
        payload["hint"] = json!(POPPLER_HINT);
        ToolResult::json(&payload)
            .map_err(|e| ToolError::execution_failed(format!("failed to serialize response: {e}")))
    };

    let bytes = match fs::read(path) {
        Ok(contents) => contents,
        Err(e) => {
            return unavailable(
                "pdftotext not installed and failed to read file for pdf-extract",
                Some(e.to_string()),
            );
        }
    };
    let text = match pdf_extract::extract_text_from_mem(&bytes) {
        Ok(extracted) => extracted,
        Err(e) => {
            return unavailable(
                "pdftotext not installed and pdf-extract failed",
                Some(e.to_string()),
            );
        }
    };
    if text.trim().is_empty() {
        return unavailable(
            "pdf-extract returned empty text — the PDF may be scanned, encrypted, or uses unsupported features",
            None,
        );
    }

    let mut metadata = json!({
        "path": path.to_string_lossy(),
        "kind": "pdf",
        "extractor": "pdf-extract",
        "fallback_from_missing_pdftotext": true,
        "size_bytes": size_bytes,
    });
    // The fallback cannot honour a page range; say so in both text and metadata.
    let note = if valid_pages.is_some() {
        metadata["pdf_extract_pages_note"] =
            json!("pages only apply when pdftotext is installed; full document returned");
        "\n\n[注意: pdf-extract 不支持分页,已返回全文。安装 poppler 可启用 --pages 功能。]\n"
    } else {
        ""
    };
    Ok(ToolResult::success(format!("{note}{text}")).with_metadata(metadata))
}