use argentor_core::{ArgentorResult, ToolCall, ToolResult};
use argentor_skills::skill::{Skill, SkillDescriptor};
use async_trait::async_trait;
use base64::Engine;
use serde_json::{json, Value};
/// Skill exposing heuristic PDF inspection operations (`extract_text`,
/// `extract_metadata`, `count_pages`, `extract_page_range`).
///
/// The skill works by scanning the document bytes for textual markers; it does
/// not build a PDF object model.
pub struct PdfLoaderSkill {
/// Descriptor (name, description, JSON parameter schema) built once in `new`
/// and handed out by reference from `Skill::descriptor`.
descriptor: SkillDescriptor,
}
impl PdfLoaderSkill {
    /// Builds the skill together with its static descriptor.
    ///
    /// The descriptor advertises the tool name `pdf_loader`, a JSON schema in
    /// which `operation` and `data` are required, no required capabilities, and
    /// no approval requirement.
    pub fn new() -> Self {
        // JSON schema describing the arguments accepted by `execute`.
        let parameters_schema = json!({
            "type": "object",
            "properties": {
                "operation": {
                    "type": "string",
                    "enum": ["extract_text", "extract_metadata", "count_pages", "extract_page_range"],
                    "description": "The PDF operation to perform"
                },
                "data": {
                    "type": "string",
                    "description": "PDF content (base64-encoded bytes or raw PDF string starting with '%PDF-')"
                },
                "encoding": {
                    "type": "string",
                    "enum": ["base64", "raw"],
                    "description": "Encoding of 'data'. Default: auto-detect."
                },
                "start_page": {
                    "type": "integer",
                    "description": "Start page (1-indexed) for extract_page_range"
                },
                "end_page": {
                    "type": "integer",
                    "description": "End page (1-indexed, inclusive) for extract_page_range"
                }
            },
            "required": ["operation", "data"]
        });

        let descriptor = SkillDescriptor {
            name: String::from("pdf_loader"),
            description: String::from(
                "PDF document loader: extract_text, extract_metadata, count_pages, extract_page_range. Input as base64-encoded bytes or PDF string.",
            ),
            parameters_schema,
            required_capabilities: Vec::new(),
            requires_approval: false,
        };

        Self { descriptor }
    }
}
/// `Default` delegates to `PdfLoaderSkill::new`, so both construction paths
/// yield the same descriptor.
impl Default for PdfLoaderSkill {
fn default() -> Self {
Self::new()
}
}
fn decode_input(data: &str, encoding: Option<&str>) -> Result<Vec<u8>, String> {
match encoding {
Some("raw") => Ok(data.as_bytes().to_vec()),
Some("base64") => base64::engine::general_purpose::STANDARD
.decode(data)
.map_err(|e| format!("Invalid base64: {e}")),
_ => {
if data.starts_with("%PDF-") {
Ok(data.as_bytes().to_vec())
} else {
base64::engine::general_purpose::STANDARD
.decode(data)
.map_err(|e| format!("Invalid base64 (and no PDF header): {e}"))
}
}
}
}
/// Checks that `bytes` begins with the `%PDF-` magic and returns the version
/// text that follows it (e.g. `"1.4"`).
///
/// The version runs from just after the magic up to the first `\n` / `\r`
/// found within the first 10 bytes. When no line break is found there, the
/// version is cut at byte 8 — or at the end of the data if it is shorter than
/// that, so very short inputs such as `%PDF-1` no longer cause an
/// out-of-bounds slice panic.
///
/// # Errors
///
/// * `"Data too short to be a PDF"` when fewer than 5 bytes are supplied.
/// * `"Missing %PDF- header"` when the first 5 bytes are not `%PDF-`.
fn validate_pdf(bytes: &[u8]) -> Result<String, String> {
    const MAGIC: &[u8] = b"%PDF-";
    // Fallback end of the version field when the header has no line break.
    const DEFAULT_HEADER_END: usize = 8;

    if bytes.len() < MAGIC.len() {
        return Err("Data too short to be a PDF".to_string());
    }
    if !bytes.starts_with(MAGIC) {
        return Err("Missing %PDF- header".to_string());
    }

    // A line break cannot occur inside the magic itself, so any position found
    // here is >= MAGIC.len() and < bytes.len().
    let header_end = bytes
        .iter()
        .take(10)
        .position(|&b| b == b'\n' || b == b'\r')
        // Clamp to the data length: the input may be shorter than 8 bytes.
        .unwrap_or_else(|| bytes.len().min(DEFAULT_HEADER_END));

    Ok(String::from_utf8_lossy(&bytes[MAGIC.len()..header_end]).into_owned())
}
/// Counts page objects by scanning for `/Type` keys whose value is `/Page`.
///
/// The bytes are read as lossy UTF-8. Every occurrence of `/Type` is examined:
/// after skipping whitespace the value must start with `/Page`, and the
/// character right after `/Page` (if any) must not be an ASCII letter — this
/// is what keeps `/Pages` (the page-tree node) from being counted.
fn count_pages_internal(bytes: &[u8]) -> usize {
    const TYPE_KEY: &str = "/Type";
    const PAGE_NAME: &str = "/Page";

    let text = String::from_utf8_lossy(bytes);

    text.match_indices(TYPE_KEY)
        .filter(|&(pos, _)| {
            let value = text[pos + TYPE_KEY.len()..].trim_start();
            match value.strip_prefix(PAGE_NAME) {
                // A trailing ASCII letter means a longer name such as `/Pages`.
                Some(tail) => !tail
                    .chars()
                    .next()
                    .is_some_and(|ch| ch.is_ascii_alphabetic()),
                None => false,
            }
        })
        .count()
}
/// Heuristically extracts text by collecting every parenthesised literal
/// string (`( ... )`) found in the lossy-UTF-8 view of `bytes`.
///
/// Rules applied while scanning:
///
/// * Inside a literal, a backslash escapes the next character; `\n`, `\r` and
///   `\t` become the corresponding control characters, any other escaped
///   character is taken literally.
/// * Balanced, unescaped parentheses nested inside a literal are kept as part
///   of the text (`(a (b) c)` yields `a (b) c`).
/// * A `)` that appears outside any literal is ignored, so stray parentheses
///   elsewhere in the file cannot desynchronise the scanner.
/// * A completed literal is kept only if it is pure ASCII and contains at
///   least one non-control character.
///
/// The kept literals are joined with single spaces and all runs of whitespace
/// are collapsed.
fn extract_text_internal(bytes: &[u8]) -> String {
    let haystack = String::from_utf8_lossy(bytes);
    let mut output = String::new();
    let mut buffer = String::new();
    // Current parenthesis nesting level; 0 means "outside any literal".
    let mut depth: usize = 0;
    let mut escape = false;

    for c in haystack.chars() {
        if escape {
            buffer.push(match c {
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                other => other,
            });
            escape = false;
            continue;
        }

        match c {
            // Escapes are only meaningful inside a literal.
            '\\' if depth > 0 => escape = true,
            '(' => {
                if depth == 0 {
                    buffer.clear();
                } else {
                    // Nested balanced parenthesis belongs to the string.
                    buffer.push('(');
                }
                depth += 1;
            }
            // The guard makes an unmatched `)` outside a literal a no-op
            // instead of pushing the nesting level below zero.
            ')' if depth > 0 => {
                depth -= 1;
                if depth == 0 {
                    let has_printable = buffer.chars().any(|ch| !ch.is_control());
                    if has_printable && buffer.is_ascii() {
                        output.push_str(&buffer);
                        output.push(' ');
                    }
                    buffer.clear();
                } else {
                    buffer.push(')');
                }
            }
            _ if depth > 0 => buffer.push(c),
            _ => {}
        }
    }

    output.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Heuristically extracts document-information entries from `bytes`.
///
/// For each of `Title`, `Author`, `Subject`, `Keywords`, `Creator`,
/// `Producer`, `CreationDate` and `ModDate`, the first occurrence of
/// `/<Field>` in the lossy-UTF-8 view of the data is located. If (after
/// optional whitespace) it is followed by a parenthesised literal string, the
/// literal's content is stored under the lower-cased field name. Fields whose
/// first occurrence is not followed by `(` are omitted.
///
/// Inside the literal, a backslash escapes the next character: `\n`, `\r` and
/// `\t` are decoded to the corresponding control characters, and any other
/// escaped character is taken literally. Balanced nested parentheses are kept
/// in the value. An unterminated literal runs to the end of the data.
///
/// Returns a JSON object; it is empty when no field was found.
fn extract_metadata_internal(bytes: &[u8]) -> Value {
    let haystack = String::from_utf8_lossy(bytes);
    let mut meta = json!({});

    for field in [
        "Title",
        "Author",
        "Subject",
        "Keywords",
        "Creator",
        "Producer",
        "CreationDate",
        "ModDate",
    ] {
        let needle = format!("/{field}");
        let Some(idx) = haystack.find(&needle) else {
            continue;
        };
        let rest = haystack[idx + needle.len()..].trim_start();
        if !rest.starts_with('(') {
            continue;
        }

        // `rest` starts with '(', so `depth` is >= 1 for every character after
        // the first one until the matching ')' ends the loop.
        let mut depth: usize = 0;
        let mut escape = false;
        let mut value = String::new();
        for c in rest.chars() {
            if escape {
                // Decode the common control escapes instead of emitting the
                // bare letter that followed the backslash.
                value.push(match c {
                    'n' => '\n',
                    'r' => '\r',
                    't' => '\t',
                    other => other,
                });
                escape = false;
                continue;
            }
            match c {
                '\\' => escape = true,
                '(' => {
                    depth += 1;
                    // The outermost '(' is a delimiter, not content.
                    if depth > 1 {
                        value.push(c);
                    }
                }
                ')' => {
                    depth -= 1;
                    if depth == 0 {
                        break;
                    }
                    value.push(c);
                }
                _ => value.push(c),
            }
        }
        meta[field.to_lowercase()] = Value::String(value);
    }

    meta
}
#[async_trait]
impl Skill for PdfLoaderSkill {
    /// Returns the descriptor that was built when the skill was constructed.
    fn descriptor(&self) -> &SkillDescriptor {
        &self.descriptor
    }

    /// Runs one PDF operation described by `call.arguments`.
    ///
    /// Steps: read the required `operation` and `data` string arguments, decode
    /// `data` (honouring the optional `encoding`), validate the `%PDF-` header,
    /// then dispatch on `operation`. Every successful response is a JSON object
    /// serialised to a string and includes `pdf_version`.
    ///
    /// All user-facing failures (missing arguments, undecodable data, missing
    /// header, invalid page range, unknown operation) are reported as
    /// `Ok(ToolResult::error(..))`; this method itself does not return `Err`.
    ///
    /// For `extract_page_range`, `start_page` defaults to 1 and `end_page`
    /// defaults to `start_page` when they are absent or not unsigned integers.
    /// The range is validated but the full document text is returned, as the
    /// response's `note` field states.
    async fn execute(&self, call: ToolCall) -> ArgentorResult<ToolResult> {
        let Some(operation) = call.arguments["operation"].as_str() else {
            return Ok(ToolResult::error(
                &call.id,
                "Missing required parameter: 'operation'",
            ));
        };
        let Some(data) = call.arguments["data"].as_str() else {
            return Ok(ToolResult::error(
                &call.id,
                "Missing required parameter: 'data'",
            ));
        };
        let encoding = call.arguments["encoding"].as_str();

        let bytes = match decode_input(data, encoding) {
            Ok(decoded) => decoded,
            Err(e) => return Ok(ToolResult::error(&call.id, e)),
        };
        let version = match validate_pdf(&bytes) {
            Ok(v) => v,
            Err(e) => return Ok(ToolResult::error(&call.id, e)),
        };

        let payload = match operation {
            "extract_text" => {
                let text = extract_text_internal(&bytes);
                json!({
                    "text": text,
                    "length": text.len(),
                    "pdf_version": version,
                })
            }
            "extract_metadata" => {
                let meta = extract_metadata_internal(&bytes);
                json!({
                    "metadata": meta,
                    "pdf_version": version,
                })
            }
            "count_pages" => {
                let count = count_pages_internal(&bytes);
                json!({
                    "pages": count,
                    "pdf_version": version,
                })
            }
            "extract_page_range" => {
                let start = call.arguments["start_page"].as_u64().unwrap_or(1);
                let end = call.arguments["end_page"].as_u64().unwrap_or(start);
                if start == 0 || end < start {
                    return Ok(ToolResult::error(
                        &call.id,
                        "Invalid range: start_page must be >=1 and end_page >= start_page",
                    ));
                }
                let text = extract_text_internal(&bytes);
                let total = count_pages_internal(&bytes);
                json!({
                    "text": text,
                    "start_page": start,
                    "end_page": end,
                    "total_pages": total,
                    // Reported here as well so all four operations expose the
                    // same `pdf_version` field.
                    "pdf_version": version,
                    "note": "Simple parser does not map text to specific pages; full text returned",
                })
            }
            _ => {
                return Ok(ToolResult::error(
                    &call.id,
                    format!("Unknown operation: '{operation}'. Supported: extract_text, extract_metadata, count_pages, extract_page_range"),
                ))
            }
        };

        Ok(ToolResult::success(&call.id, payload.to_string()))
    }
}
// Unit tests for the PDF loader skill. Most tests drive the skill through
// `execute` with a hand-written minimal PDF; one calls `count_pages_internal`
// directly.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
use super::*;
// Minimal textual PDF: a catalog, a page tree with one page, one content
// stream containing the literal "(Hello World)", and an info dictionary with
// Title / Author / Subject entries.
const SAMPLE_PDF: &str = "%PDF-1.4\n\
1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n\
2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n\
3 0 obj\n<< /Type /Page /Parent 2 0 R /Contents 4 0 R >>\nendobj\n\
4 0 obj\n<< /Length 44 >>\nstream\nBT /F1 12 Tf (Hello World) Tj ET\nendstream\nendobj\n\
5 0 obj\n<< /Title (Sample Document) /Author (Argentor Test) /Subject (Testing) >>\nendobj\n\
xref\n0 6\n0000000000 65535 f\n\
trailer\n<< /Size 6 /Info 5 0 R >>\nstartxref\n0\n%%EOF\n";
// Builds a `ToolCall` addressed to `pdf_loader` with the given JSON arguments.
fn make_call(args: Value) -> ToolCall {
ToolCall {
id: "test".to_string(),
name: "pdf_loader".to_string(),
arguments: args,
}
}
// extract_text returns text containing at least one literal from the sample.
#[tokio::test]
async fn test_extract_text() {
let skill = PdfLoaderSkill::new();
let call =
make_call(json!({"operation": "extract_text", "data": SAMPLE_PDF, "encoding": "raw"}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error, "Result: {}", result.content);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
let text = parsed["text"].as_str().unwrap();
assert!(
text.contains("Hello World") || text.contains("Sample Document"),
"Expected extracted text, got: {text}"
);
}
// count_pages reports exactly one page for the sample document.
#[tokio::test]
async fn test_count_pages() {
let skill = PdfLoaderSkill::new();
let call =
make_call(json!({"operation": "count_pages", "data": SAMPLE_PDF, "encoding": "raw"}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
assert_eq!(parsed["pages"], 1);
}
// extract_metadata exposes the info-dictionary entries under lower-cased keys.
#[tokio::test]
async fn test_extract_metadata() {
let skill = PdfLoaderSkill::new();
let call = make_call(
json!({"operation": "extract_metadata", "data": SAMPLE_PDF, "encoding": "raw"}),
);
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
let meta = &parsed["metadata"];
assert_eq!(meta["title"], "Sample Document");
assert_eq!(meta["author"], "Argentor Test");
assert_eq!(meta["subject"], "Testing");
}
// A valid page range succeeds and echoes the requested bounds.
#[tokio::test]
async fn test_extract_page_range() {
let skill = PdfLoaderSkill::new();
let call = make_call(json!({
"operation": "extract_page_range",
"data": SAMPLE_PDF,
"encoding": "raw",
"start_page": 1,
"end_page": 1
}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
assert_eq!(parsed["start_page"], 1);
assert_eq!(parsed["end_page"], 1);
}
// end_page < start_page is rejected.
#[tokio::test]
async fn test_invalid_page_range() {
let skill = PdfLoaderSkill::new();
let call = make_call(json!({
"operation": "extract_page_range",
"data": SAMPLE_PDF,
"encoding": "raw",
"start_page": 5,
"end_page": 2
}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
}
// Raw data without the %PDF- magic is rejected with a header-related message.
#[tokio::test]
async fn test_invalid_pdf_header() {
let skill = PdfLoaderSkill::new();
let call =
make_call(json!({"operation": "extract_text", "data": "not a pdf", "encoding": "raw"}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
assert!(result.content.contains("PDF") || result.content.contains("header"));
}
// Data shorter than the 5-byte magic is rejected.
#[tokio::test]
async fn test_too_short_data() {
let skill = PdfLoaderSkill::new();
let call = make_call(json!({"operation": "extract_text", "data": "%P", "encoding": "raw"}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
}
// Explicit base64 encoding round-trips to the same page count.
#[tokio::test]
async fn test_base64_encoded() {
let skill = PdfLoaderSkill::new();
let encoded = base64::engine::general_purpose::STANDARD.encode(SAMPLE_PDF.as_bytes());
let call =
make_call(json!({"operation": "count_pages", "data": encoded, "encoding": "base64"}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error, "Result: {}", result.content);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
assert_eq!(parsed["pages"], 1);
}
// With no `encoding`, data starting with %PDF- is auto-detected as raw.
#[tokio::test]
async fn test_auto_detect_raw() {
let skill = PdfLoaderSkill::new();
let call = make_call(json!({"operation": "count_pages", "data": SAMPLE_PDF}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error);
}
// Undecodable base64 yields an error mentioning base64 / Invalid.
#[tokio::test]
async fn test_invalid_base64() {
let skill = PdfLoaderSkill::new();
let call = make_call(
json!({"operation": "extract_text", "data": "!!!not-base64!!!", "encoding": "base64"}),
);
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
assert!(result.content.contains("base64") || result.content.contains("Invalid"));
}
// A call without `operation` is an error result.
#[tokio::test]
async fn test_missing_operation() {
let skill = PdfLoaderSkill::new();
let call = make_call(json!({"data": SAMPLE_PDF}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
}
// A call without `data` is an error result.
#[tokio::test]
async fn test_missing_data() {
let skill = PdfLoaderSkill::new();
let call = make_call(json!({"operation": "extract_text"}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
}
// An unsupported operation name is reported as "Unknown operation".
#[tokio::test]
async fn test_unknown_operation() {
let skill = PdfLoaderSkill::new();
let call =
make_call(json!({"operation": "redact_pii", "data": SAMPLE_PDF, "encoding": "raw"}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
assert!(result.content.contains("Unknown operation"));
}
// The version from the header line ("1.4") is surfaced as `pdf_version`.
#[tokio::test]
async fn test_pdf_version_extracted() {
let skill = PdfLoaderSkill::new();
let call =
make_call(json!({"operation": "count_pages", "data": SAMPLE_PDF, "encoding": "raw"}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
assert_eq!(parsed["pdf_version"], "1.4");
}
// The descriptor advertises the tool name `pdf_loader`.
#[test]
fn test_descriptor_name() {
let skill = PdfLoaderSkill::new();
assert_eq!(skill.descriptor().name, "pdf_loader");
}
// `/Type /Pages` (the page-tree node) must not be counted as a page.
#[test]
fn test_count_pages_ignores_pages_collection() {
let pdf = b"%PDF-1.4\n<< /Type /Pages /Count 3 >>\n<< /Type /Page >>\n";
assert_eq!(count_pages_internal(pdf), 1);
}
}