use std::fs;
use std::path::Path;
use pdf_oxide::converters::ConversionOptions;
use pdf_oxide::PdfDocument;
use serde_json::{json, Value};
pub fn run(args: &Value) -> Result<Value, (i32, String)> {
let file_path = args
.get("file_path")
.and_then(|v| v.as_str())
.ok_or((-32602, "Missing required parameter: file_path".to_string()))?;
let output_path = args
.get("output_path")
.and_then(|v| v.as_str())
.ok_or((-32602, "Missing required parameter: output_path".to_string()))?;
let format = args
.get("format")
.and_then(|v| v.as_str())
.unwrap_or("text");
let pages_str = args.get("pages").and_then(|v| v.as_str());
let password = args.get("password").and_then(|v| v.as_str());
let images = args
.get("images")
.and_then(|v| v.as_bool())
.unwrap_or(false);
let embed_images = args
.get("embed_images")
.and_then(|v| v.as_bool())
.unwrap_or(true);
if !matches!(format, "text" | "markdown" | "html") {
return Err((-32602, format!("Invalid format: {format}. Must be text, markdown, or html")));
}
let mut doc =
PdfDocument::open(file_path).map_err(|e| (-32603, format!("Failed to open PDF: {e}")))?;
if let Some(pw) = password {
let ok = doc
.authenticate(pw.as_bytes())
.map_err(|e| (-32603, format!("Authentication error: {e}")))?;
if !ok {
return Err((-32603, "Incorrect password".to_string()));
}
}
let page_count = doc
.page_count()
.map_err(|e| (-32603, format!("Failed to get page count: {e}")))?;
let page_indices = match pages_str {
Some(s) => parse_page_ranges(s)?,
None => (0..page_count).collect(),
};
for &idx in &page_indices {
if idx >= page_count {
return Err((
-32602,
format!("Page {} out of range (document has {} pages)", idx + 1, page_count),
));
}
}
let output_dir = Path::new(output_path)
.parent()
.unwrap_or(Path::new("."))
.to_path_buf();
let opts = ConversionOptions {
embed_images,
image_output_dir: if !embed_images {
Some(output_dir.to_string_lossy().into_owned())
} else {
None
},
..Default::default()
};
let content = extract_pages(&mut doc, &page_indices, format, &opts)?;
if let Some(parent) = Path::new(output_path).parent() {
if !parent.as_os_str().is_empty() {
fs::create_dir_all(parent)
.map_err(|e| (-32603, format!("Failed to create output directory: {e}")))?;
}
}
fs::write(output_path, &content)
.map_err(|e| (-32603, format!("Failed to write output: {e}")))?;
let mut images_extracted = 0;
if images {
let img_dir = output_dir.join(
Path::new(output_path)
.file_stem()
.unwrap_or_default()
.to_string_lossy()
.as_ref(),
);
fs::create_dir_all(&img_dir)
.map_err(|e| (-32603, format!("Failed to create image directory: {e}")))?;
for &page_idx in &page_indices {
match doc.extract_images_to_files(page_idx, &img_dir, None, Some(images_extracted + 1))
{
Ok(refs) => images_extracted += refs.len(),
Err(e) => {
eprintln!("Warning: image extraction failed for page {}: {e}", page_idx + 1)
},
}
}
}
let file_size = fs::metadata(output_path).map(|m| m.len()).unwrap_or(0);
let mut message = format!(
"Extracted {} page(s) as {} to {} ({} bytes)",
page_indices.len(),
format,
output_path,
file_size
);
if images_extracted > 0 {
message.push_str(&format!(". {} image(s) saved.", images_extracted));
}
Ok(json!({
"content": [{ "type": "text", "text": message }]
}))
}
/// Converts every page listed in `page_indices` to `format` and joins the results.
///
/// `format` must be `"text"`, `"markdown"`, or `"html"`; any other value reaches
/// `unreachable!()` and panics, so callers validate it first. Pages are joined with a
/// blank line for text, a `---` line for Markdown, and a single newline for HTML.
///
/// # Errors
/// Stops at the first page whose conversion fails and returns `-32603` with a message
/// naming that page (1-based).
fn extract_pages(
    doc: &mut PdfDocument,
    page_indices: &[usize],
    format: &str,
    opts: &ConversionOptions,
) -> Result<String, (i32, String)> {
    // Chosen up front; it depends only on the format.
    let joiner = match format {
        "text" => "\n\n",
        "markdown" => "\n---\n\n",
        "html" => "\n",
        _ => unreachable!(),
    };

    // Collecting into `Result` short-circuits on the first failing page.
    let rendered = page_indices
        .iter()
        .map(|&page| match format {
            "text" => doc.extract_text(page).map_err(|e| {
                (-32603, format!("Text extraction failed on page {}: {e}", page + 1))
            }),
            "markdown" => doc.to_markdown(page, opts).map_err(|e| {
                (-32603, format!("Markdown conversion failed on page {}: {e}", page + 1))
            }),
            "html" => doc.to_html(page, opts).map_err(|e| {
                (-32603, format!("HTML conversion failed on page {}: {e}", page + 1))
            }),
            _ => unreachable!(),
        })
        .collect::<Result<Vec<_>, (i32, String)>>()?;

    Ok(rendered.join(joiner))
}
/// Parses a 1-based page specification such as `"1-3,7,10-12"` into 0-based page indices.
///
/// The input is a comma-separated list of single pages (`7`) and inclusive ranges (`1-3`).
/// Whitespace around items and numbers is ignored and empty items are skipped. Indices are
/// returned in the order written; duplicates are kept. The result is not checked against
/// any document length; that is left to the caller.
///
/// # Errors
/// Returns code `-32602` when an item is not a number, a page number is `0`, a range has
/// `start > end`, the list contains no pages, or a range is so large that memory for it
/// cannot be reserved.
fn parse_page_ranges(input: &str) -> Result<Vec<usize>, (i32, String)> {
    /// Parses one trimmed page number, reporting the offending text on failure.
    fn page_number(token: &str) -> Result<usize, (i32, String)> {
        let token = token.trim();
        token
            .parse()
            .map_err(|_| (-32602, format!("Invalid page number: '{token}'")))
    }

    let mut pages: Vec<usize> = Vec::new();
    for part in input.split(',').map(str::trim).filter(|p| !p.is_empty()) {
        // A single page `n` is handled as the one-element range `n-n`.
        let (start, end) = match part.split_once('-') {
            Some((lo, hi)) => (page_number(lo)?, page_number(hi)?),
            None => {
                let page = page_number(part)?;
                (page, page)
            }
        };
        if start == 0 || end == 0 {
            return Err((-32602, "Page numbers start at 1".to_string()));
        }
        if start > end {
            return Err((-32602, format!("Invalid range: {start}-{end} (start > end)")));
        }

        // Reserve fallibly so an absurd range (e.g. `1-18446744073709551615`) is reported
        // as an error instead of aborting on allocation failure. `start >= 1` and
        // `end >= start`, so the span cannot overflow.
        let span = end - start + 1;
        pages
            .try_reserve(span)
            .map_err(|_| (-32602, format!("Page range too large: {start}-{end}")))?;
        pages.extend(start - 1..end);
    }

    if pages.is_empty() {
        return Err((-32602, "No page numbers specified".to_string()));
    }
    Ok(pages)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Builds the path of a fixture: the parent of `CARGO_MANIFEST_DIR`, then
    /// `tests/fixtures/<name>`.
    fn fixture_path(name: &str) -> PathBuf {
        let mut p = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        p.pop();
        p.push("tests/fixtures");
        p.push(name);
        p
    }

    /// Returns the summary text carried by a successful `run` result.
    fn message_of(result: &Value) -> &str {
        result["content"][0]["text"]
            .as_str()
            .expect("result should contain a text message")
    }

    #[test]
    fn test_parse_page_ranges_single() {
        assert_eq!(parse_page_ranges("1").unwrap(), vec![0]);
        assert_eq!(parse_page_ranges("5").unwrap(), vec![4]);
    }

    #[test]
    fn test_parse_page_ranges_range() {
        assert_eq!(parse_page_ranges("1-3").unwrap(), vec![0, 1, 2]);
    }

    #[test]
    fn test_parse_page_ranges_mixed() {
        assert_eq!(parse_page_ranges("1-3,7,10-12").unwrap(), vec![0, 1, 2, 6, 9, 10, 11]);
    }

    #[test]
    fn test_parse_page_ranges_whitespace() {
        assert_eq!(parse_page_ranges(" 1 - 2 , 4 ").unwrap(), vec![0, 1, 3]);
    }

    #[test]
    fn test_parse_page_ranges_zero_rejected() {
        assert!(parse_page_ranges("0").is_err());
    }

    #[test]
    fn test_parse_page_ranges_invalid_range() {
        assert!(parse_page_ranges("5-3").is_err());
    }

    #[test]
    fn test_parse_page_ranges_empty() {
        assert!(parse_page_ranges("").is_err());
    }

    #[test]
    fn test_parse_page_ranges_non_numeric() {
        assert!(parse_page_ranges("abc").is_err());
    }

    #[test]
    fn test_extract_text_to_file() {
        let pdf = fixture_path("simple.pdf");
        let dir = tempfile::tempdir().unwrap();
        let out = dir.path().join("out.txt");
        let args = json!({
            "file_path": pdf.to_str().unwrap(),
            "output_path": out.to_str().unwrap(),
            "format": "text"
        });
        let result = run(&args).expect("extract should succeed");
        assert!(message_of(&result).contains("1 page(s)"));
        assert!(out.exists());
    }

    #[test]
    fn test_extract_markdown_to_file() {
        let pdf = fixture_path("simple.pdf");
        let dir = tempfile::tempdir().unwrap();
        let out = dir.path().join("out.md");
        let args = json!({
            "file_path": pdf.to_str().unwrap(),
            "output_path": out.to_str().unwrap(),
            "format": "markdown"
        });
        let result = run(&args).expect("extract should succeed");
        // Match the "as <format> to" part of the summary, not just the bare word.
        assert!(message_of(&result).contains("as markdown to"));
        assert!(out.exists());
    }

    #[test]
    fn test_extract_html_to_file() {
        let pdf = fixture_path("simple.pdf");
        let dir = tempfile::tempdir().unwrap();
        let out = dir.path().join("out.html");
        let args = json!({
            "file_path": pdf.to_str().unwrap(),
            "output_path": out.to_str().unwrap(),
            "format": "html"
        });
        let result = run(&args).expect("extract should succeed");
        // The output path ends in "out.html", so a bare `contains("html")` would pass for
        // any format; check the format slot of the summary instead.
        assert!(message_of(&result).contains("as html to"));
        assert!(out.exists());
    }

    #[test]
    fn test_extract_default_format_is_text() {
        let pdf = fixture_path("simple.pdf");
        let dir = tempfile::tempdir().unwrap();
        let out = dir.path().join("out.txt");
        let args = json!({
            "file_path": pdf.to_str().unwrap(),
            "output_path": out.to_str().unwrap()
        });
        let result = run(&args).expect("extract should succeed");
        assert!(message_of(&result).contains("as text to"));
    }

    #[test]
    fn test_extract_page_out_of_range() {
        let pdf = fixture_path("simple.pdf");
        let dir = tempfile::tempdir().unwrap();
        let out = dir.path().join("out.txt");
        let args = json!({
            "file_path": pdf.to_str().unwrap(),
            "output_path": out.to_str().unwrap(),
            "pages": "999"
        });
        let err = run(&args).unwrap_err();
        assert_eq!(err.0, -32602);
        assert!(err.1.contains("out of range"));
    }

    #[test]
    fn test_extract_creates_parent_dirs() {
        let pdf = fixture_path("simple.pdf");
        let dir = tempfile::tempdir().unwrap();
        let out = dir.path().join("nested/deep/out.txt");
        let args = json!({
            "file_path": pdf.to_str().unwrap(),
            "output_path": out.to_str().unwrap()
        });
        run(&args).expect("should create nested dirs");
        assert!(out.exists());
    }

    #[test]
    fn test_extract_nonexistent_pdf() {
        let dir = tempfile::tempdir().unwrap();
        let out = dir.path().join("out.txt");
        let args = json!({
            "file_path": "/does/not/exist.pdf",
            "output_path": out.to_str().unwrap()
        });
        let err = run(&args).unwrap_err();
        assert_eq!(err.0, -32603);
        assert!(err.1.contains("Failed to open PDF"));
    }

    #[test]
    fn test_extract_invalid_format() {
        let pdf = fixture_path("simple.pdf");
        let dir = tempfile::tempdir().unwrap();
        let out = dir.path().join("out.txt");
        let args = json!({
            "file_path": pdf.to_str().unwrap(),
            "output_path": out.to_str().unwrap(),
            "format": "csv"
        });
        let err = run(&args).unwrap_err();
        assert_eq!(err.0, -32602);
        assert!(err.1.contains("csv"));
    }

    #[test]
    fn test_missing_file_path() {
        let args = json!({ "output_path": "/tmp/out.txt" });
        let err = run(&args).unwrap_err();
        assert_eq!(err.0, -32602);
        assert!(err.1.contains("file_path"));
    }

    #[test]
    fn test_missing_output_path() {
        let args = json!({ "file_path": "test.pdf" });
        let err = run(&args).unwrap_err();
        assert_eq!(err.0, -32602);
        assert!(err.1.contains("output_path"));
    }
}