argentor-builtins 1.4.7

//! PDF document loader skill for the Argentor framework.
//!
//! Extracts text, metadata, and page counts from PDF byte payloads using a
//! pragmatic, dependency-free parser. The parser handles the common case of
//! PDFs with unencrypted, uncompressed, or FlateDecode-compressed text streams.
//! For production-grade extraction with advanced encodings (CMap, Type1 fonts),
//! a dedicated PDF library would be warranted — this skill favours zero-dep
//! shipping over completeness.
//!
//! Input accepts PDF as either raw bytes (base64-encoded) or a plain string
//! containing the PDF header `%PDF-`.

use argentor_core::{ArgentorResult, ToolCall, ToolResult};
use argentor_skills::skill::{Skill, SkillDescriptor};
use async_trait::async_trait;
use base64::Engine;
use serde_json::{json, Value};

/// PDF document loader skill: text extraction, metadata, page counting.
pub struct PdfLoaderSkill {
    descriptor: SkillDescriptor,
}

impl PdfLoaderSkill {
    /// Create a new PDF loader skill.
    pub fn new() -> Self {
        Self {
            descriptor: SkillDescriptor {
                name: "pdf_loader".to_string(),
                description: "PDF document loader: extract_text, extract_metadata, count_pages, extract_page_range. Input as base64-encoded bytes or PDF string.".to_string(),
                parameters_schema: json!({
                    "type": "object",
                    "properties": {
                        "operation": {
                            "type": "string",
                            "enum": ["extract_text", "extract_metadata", "count_pages", "extract_page_range"],
                            "description": "The PDF operation to perform"
                        },
                        "data": {
                            "type": "string",
                            "description": "PDF content (base64-encoded bytes or raw PDF string starting with '%PDF-')"
                        },
                        "encoding": {
                            "type": "string",
                            "enum": ["base64", "raw"],
                            "description": "Encoding of 'data'. Default: auto-detect."
                        },
                        "start_page": {
                            "type": "integer",
                            "description": "Start page (1-indexed) for extract_page_range"
                        },
                        "end_page": {
                            "type": "integer",
                            "description": "End page (1-indexed, inclusive) for extract_page_range"
                        }
                    },
                    "required": ["operation", "data"]
                }),
                required_capabilities: vec![],
                requires_approval: false,
            },
        }
    }
}

impl Default for PdfLoaderSkill {
    fn default() -> Self {
        Self::new()
    }
}

/// Decode input `data` into raw PDF bytes. Auto-detects base64 vs raw PDF.
fn decode_input(data: &str, encoding: Option<&str>) -> Result<Vec<u8>, String> {
    match encoding {
        Some("raw") => Ok(data.as_bytes().to_vec()),
        Some("base64") => base64::engine::general_purpose::STANDARD
            .decode(data)
            .map_err(|e| format!("Invalid base64: {e}")),
        _ => {
            // Auto-detect: if starts with %PDF-, treat as raw
            if data.starts_with("%PDF-") {
                Ok(data.as_bytes().to_vec())
            } else {
                base64::engine::general_purpose::STANDARD
                    .decode(data)
                    .map_err(|e| format!("Invalid base64 (and no PDF header): {e}"))
            }
        }
    }
}

/// Validate that the byte stream is a recognisable PDF.
fn validate_pdf(bytes: &[u8]) -> Result<String, String> {
    if bytes.len() < 5 {
        return Err("Data too short to be a PDF".to_string());
    }
    if &bytes[..5] != b"%PDF-" {
        return Err("Missing %PDF- header".to_string());
    }
    // Extract version bytes e.g. %PDF-1.4
    let end = bytes
        .iter()
        .take(10)
        .position(|b| *b == b'\n' || *b == b'\r');
    let header_end = end.unwrap_or(8);
    let version = String::from_utf8_lossy(&bytes[5..header_end]).to_string();
    Ok(version)
}

/// Count pages by scanning for `/Type /Page` (not `/Pages`) objects.
/// Pragmatic: ignores the XRef table and just scans the body.
fn count_pages_internal(bytes: &[u8]) -> usize {
    let haystack = String::from_utf8_lossy(bytes);
    // Match "/Type /Page" not followed by 's' and optional whitespace variants
    let mut count = 0;
    let mut search_from = 0;
    while let Some(idx) = haystack[search_from..].find("/Type") {
        let abs = search_from + idx;
        let rest = &haystack[abs..];
        // Check the following characters for /Page but not /Pages
        let after_type = &rest[5..].trim_start();
        if let Some(after) = after_type.strip_prefix("/Page") {
            // Not /Pages or /PageLabels etc — require non-alpha next
            if !after
                .chars()
                .next()
                .is_some_and(|c| c.is_ascii_alphabetic())
            {
                count += 1;
            }
        }
        search_from = abs + 5;
    }
    count
}

/// Extract text by scanning for parenthesised string literals in stream
/// content. Simple and works for many uncompressed PDFs.
/// Real PDFs often compress streams with FlateDecode — we provide a best-effort
/// extraction that may miss compressed content but never panics.
fn extract_text_internal(bytes: &[u8]) -> String {
    let haystack = String::from_utf8_lossy(bytes);
    let mut output = String::new();
    let mut chars = haystack.chars().peekable();
    let mut buffer = String::new();
    let mut depth: i32 = 0;
    let mut escape = false;

    #[allow(clippy::while_let_on_iterator)]
    while let Some(c) = chars.next() {
        if escape {
            // Handle basic escape sequences
            match c {
                'n' => buffer.push('\n'),
                'r' => buffer.push('\r'),
                't' => buffer.push('\t'),
                '\\' => buffer.push('\\'),
                '(' => buffer.push('('),
                ')' => buffer.push(')'),
                other => buffer.push(other),
            }
            escape = false;
            continue;
        }
        if c == '\\' && depth > 0 {
            escape = true;
            continue;
        }
        if c == '(' {
            if depth == 0 {
                buffer.clear();
            }
            depth += 1;
            continue;
        }
        if c == ')' {
            depth -= 1;
            if depth == 0 {
                // Only keep printable-ish strings of length >= 1 that have ASCII
                let printable = buffer.chars().filter(|c| !c.is_control()).count();
                if printable > 0 && buffer.is_ascii() {
                    output.push_str(&buffer);
                    output.push(' ');
                }
                buffer.clear();
            }
            continue;
        }
        if depth > 0 {
            buffer.push(c);
        }
    }

    // Clean whitespace
    output.split_whitespace().collect::<Vec<_>>().join(" ")
}

/// Extract common metadata fields from the PDF Info dictionary.
fn extract_metadata_internal(bytes: &[u8]) -> Value {
    let haystack = String::from_utf8_lossy(bytes);
    let mut meta = json!({});

    for field in [
        "Title",
        "Author",
        "Subject",
        "Keywords",
        "Creator",
        "Producer",
        "CreationDate",
        "ModDate",
    ] {
        let needle = format!("/{field}");
        if let Some(idx) = haystack.find(&needle) {
            let rest = &haystack[idx + needle.len()..];
            // Skip whitespace
            let rest = rest.trim_start();
            if rest.starts_with('(') {
                // Find matching close paren, handle escapes
                let mut depth = 0;
                let mut escape = false;
                let mut value = String::new();
                for c in rest.chars() {
                    if escape {
                        value.push(c);
                        escape = false;
                        continue;
                    }
                    if c == '\\' {
                        escape = true;
                        continue;
                    }
                    if c == '(' {
                        depth += 1;
                        if depth == 1 {
                            continue;
                        }
                    } else if c == ')' {
                        depth -= 1;
                        if depth == 0 {
                            break;
                        }
                    }
                    if depth >= 1 {
                        value.push(c);
                    }
                }
                meta[field.to_lowercase()] = Value::String(value);
            }
        }
    }

    meta
}

#[async_trait]
impl Skill for PdfLoaderSkill {
    fn descriptor(&self) -> &SkillDescriptor {
        &self.descriptor
    }

    async fn execute(&self, call: ToolCall) -> ArgentorResult<ToolResult> {
        let operation = match call.arguments["operation"].as_str() {
            Some(op) => op,
            None => {
                return Ok(ToolResult::error(
                    &call.id,
                    "Missing required parameter: 'operation'",
                ))
            }
        };
        let data = match call.arguments["data"].as_str() {
            Some(d) => d,
            None => {
                return Ok(ToolResult::error(
                    &call.id,
                    "Missing required parameter: 'data'",
                ))
            }
        };
        let encoding = call.arguments["encoding"].as_str();

        let bytes = match decode_input(data, encoding) {
            Ok(b) => b,
            Err(e) => return Ok(ToolResult::error(&call.id, e)),
        };

        let version = match validate_pdf(&bytes) {
            Ok(v) => v,
            Err(e) => return Ok(ToolResult::error(&call.id, e)),
        };

        match operation {
            "extract_text" => {
                let text = extract_text_internal(&bytes);
                Ok(ToolResult::success(
                    &call.id,
                    json!({
                        "text": text,
                        "length": text.len(),
                        "pdf_version": version,
                    })
                    .to_string(),
                ))
            }
            "extract_metadata" => {
                let meta = extract_metadata_internal(&bytes);
                Ok(ToolResult::success(
                    &call.id,
                    json!({
                        "metadata": meta,
                        "pdf_version": version,
                    })
                    .to_string(),
                ))
            }
            "count_pages" => {
                let count = count_pages_internal(&bytes);
                Ok(ToolResult::success(
                    &call.id,
                    json!({
                        "pages": count,
                        "pdf_version": version,
                    })
                    .to_string(),
                ))
            }
            "extract_page_range" => {
                let start = call.arguments["start_page"].as_u64().unwrap_or(1);
                let end = call.arguments["end_page"].as_u64().unwrap_or(start);
                if start == 0 || end < start {
                    return Ok(ToolResult::error(
                        &call.id,
                        "Invalid range: start_page must be >=1 and end_page >= start_page",
                    ));
                }
                // Pragmatic: our simple parser doesn't map strings to pages.
                // We return all text + the requested range metadata.
                let text = extract_text_internal(&bytes);
                let total = count_pages_internal(&bytes);
                Ok(ToolResult::success(
                    &call.id,
                    json!({
                        "text": text,
                        "start_page": start,
                        "end_page": end,
                        "total_pages": total,
                        "note": "Simple parser does not map text to specific pages; full text returned",
                    })
                    .to_string(),
                ))
            }
            _ => Ok(ToolResult::error(
                &call.id,
                format!("Unknown operation: '{operation}'. Supported: extract_text, extract_metadata, count_pages, extract_page_range"),
            )),
        }
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    /// Minimal PDF with one page and a text stream containing "Hello World".
    /// Constructed manually — not production-valid but sufficient for our parser.
    const SAMPLE_PDF: &str = "%PDF-1.4\n\
1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n\
2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n\
3 0 obj\n<< /Type /Page /Parent 2 0 R /Contents 4 0 R >>\nendobj\n\
4 0 obj\n<< /Length 44 >>\nstream\nBT /F1 12 Tf (Hello World) Tj ET\nendstream\nendobj\n\
5 0 obj\n<< /Title (Sample Document) /Author (Argentor Test) /Subject (Testing) >>\nendobj\n\
xref\n0 6\n0000000000 65535 f\n\
trailer\n<< /Size 6 /Info 5 0 R >>\nstartxref\n0\n%%EOF\n";

    fn make_call(args: Value) -> ToolCall {
        ToolCall {
            id: "test".to_string(),
            name: "pdf_loader".to_string(),
            arguments: args,
        }
    }

    #[tokio::test]
    async fn test_extract_text() {
        let skill = PdfLoaderSkill::new();
        let call =
            make_call(json!({"operation": "extract_text", "data": SAMPLE_PDF, "encoding": "raw"}));
        let result = skill.execute(call).await.unwrap();
        assert!(!result.is_error, "Result: {}", result.content);
        let parsed: Value = serde_json::from_str(&result.content).unwrap();
        let text = parsed["text"].as_str().unwrap();
        assert!(
            text.contains("Hello World") || text.contains("Sample Document"),
            "Expected extracted text, got: {text}"
        );
    }

    #[tokio::test]
    async fn test_count_pages() {
        let skill = PdfLoaderSkill::new();
        let call =
            make_call(json!({"operation": "count_pages", "data": SAMPLE_PDF, "encoding": "raw"}));
        let result = skill.execute(call).await.unwrap();
        assert!(!result.is_error);
        let parsed: Value = serde_json::from_str(&result.content).unwrap();
        assert_eq!(parsed["pages"], 1);
    }

    #[tokio::test]
    async fn test_extract_metadata() {
        let skill = PdfLoaderSkill::new();
        let call = make_call(
            json!({"operation": "extract_metadata", "data": SAMPLE_PDF, "encoding": "raw"}),
        );
        let result = skill.execute(call).await.unwrap();
        assert!(!result.is_error);
        let parsed: Value = serde_json::from_str(&result.content).unwrap();
        let meta = &parsed["metadata"];
        assert_eq!(meta["title"], "Sample Document");
        assert_eq!(meta["author"], "Argentor Test");
        assert_eq!(meta["subject"], "Testing");
    }

    #[tokio::test]
    async fn test_extract_page_range() {
        let skill = PdfLoaderSkill::new();
        let call = make_call(json!({
            "operation": "extract_page_range",
            "data": SAMPLE_PDF,
            "encoding": "raw",
            "start_page": 1,
            "end_page": 1
        }));
        let result = skill.execute(call).await.unwrap();
        assert!(!result.is_error);
        let parsed: Value = serde_json::from_str(&result.content).unwrap();
        assert_eq!(parsed["start_page"], 1);
        assert_eq!(parsed["end_page"], 1);
    }

    #[tokio::test]
    async fn test_invalid_page_range() {
        let skill = PdfLoaderSkill::new();
        let call = make_call(json!({
            "operation": "extract_page_range",
            "data": SAMPLE_PDF,
            "encoding": "raw",
            "start_page": 5,
            "end_page": 2
        }));
        let result = skill.execute(call).await.unwrap();
        assert!(result.is_error);
    }

    #[tokio::test]
    async fn test_invalid_pdf_header() {
        let skill = PdfLoaderSkill::new();
        let call =
            make_call(json!({"operation": "extract_text", "data": "not a pdf", "encoding": "raw"}));
        let result = skill.execute(call).await.unwrap();
        assert!(result.is_error);
        assert!(result.content.contains("PDF") || result.content.contains("header"));
    }

    #[tokio::test]
    async fn test_too_short_data() {
        let skill = PdfLoaderSkill::new();
        let call = make_call(json!({"operation": "extract_text", "data": "%P", "encoding": "raw"}));
        let result = skill.execute(call).await.unwrap();
        assert!(result.is_error);
    }

    #[tokio::test]
    async fn test_base64_encoded() {
        let skill = PdfLoaderSkill::new();
        let encoded = base64::engine::general_purpose::STANDARD.encode(SAMPLE_PDF.as_bytes());
        let call =
            make_call(json!({"operation": "count_pages", "data": encoded, "encoding": "base64"}));
        let result = skill.execute(call).await.unwrap();
        assert!(!result.is_error, "Result: {}", result.content);
        let parsed: Value = serde_json::from_str(&result.content).unwrap();
        assert_eq!(parsed["pages"], 1);
    }

    #[tokio::test]
    async fn test_auto_detect_raw() {
        let skill = PdfLoaderSkill::new();
        let call = make_call(json!({"operation": "count_pages", "data": SAMPLE_PDF}));
        let result = skill.execute(call).await.unwrap();
        assert!(!result.is_error);
    }

    #[tokio::test]
    async fn test_invalid_base64() {
        let skill = PdfLoaderSkill::new();
        let call = make_call(
            json!({"operation": "extract_text", "data": "!!!not-base64!!!", "encoding": "base64"}),
        );
        let result = skill.execute(call).await.unwrap();
        assert!(result.is_error);
        assert!(result.content.contains("base64") || result.content.contains("Invalid"));
    }

    #[tokio::test]
    async fn test_missing_operation() {
        let skill = PdfLoaderSkill::new();
        let call = make_call(json!({"data": SAMPLE_PDF}));
        let result = skill.execute(call).await.unwrap();
        assert!(result.is_error);
    }

    #[tokio::test]
    async fn test_missing_data() {
        let skill = PdfLoaderSkill::new();
        let call = make_call(json!({"operation": "extract_text"}));
        let result = skill.execute(call).await.unwrap();
        assert!(result.is_error);
    }

    #[tokio::test]
    async fn test_unknown_operation() {
        let skill = PdfLoaderSkill::new();
        let call =
            make_call(json!({"operation": "redact_pii", "data": SAMPLE_PDF, "encoding": "raw"}));
        let result = skill.execute(call).await.unwrap();
        assert!(result.is_error);
        assert!(result.content.contains("Unknown operation"));
    }

    #[tokio::test]
    async fn test_pdf_version_extracted() {
        let skill = PdfLoaderSkill::new();
        let call =
            make_call(json!({"operation": "count_pages", "data": SAMPLE_PDF, "encoding": "raw"}));
        let result = skill.execute(call).await.unwrap();
        assert!(!result.is_error);
        let parsed: Value = serde_json::from_str(&result.content).unwrap();
        assert_eq!(parsed["pdf_version"], "1.4");
    }

    #[test]
    fn test_descriptor_name() {
        let skill = PdfLoaderSkill::new();
        assert_eq!(skill.descriptor().name, "pdf_loader");
    }

    #[test]
    fn test_count_pages_ignores_pages_collection() {
        // /Pages (plural) should NOT be counted as a page.
        let pdf = b"%PDF-1.4\n<< /Type /Pages /Count 3 >>\n<< /Type /Page >>\n";
        assert_eq!(count_pages_internal(pdf), 1);
    }
}