// npcrs 0.1.11
//
// Rust core for the NPC system — agent kernel, jinx executor, LLM client
// Documentation
use crate::error::{NpcError, Result};
use std::path::Path;

/// Text content of a loaded file together with basic metadata about it.
///
/// The field descriptions below reflect how `load_file_contents` populates
/// the struct; other constructors (if any) may fill it differently.
#[derive(Debug, Clone)]
pub struct FileContent {
    /// Extracted or decoded text of the file.
    pub content: String,
    /// Lowercased file extension without the dot; empty when the path has none.
    pub file_type: String,
    /// The path the file was loaded from, exactly as passed by the caller.
    pub path: String,
    /// Number of raw bytes read from disk (not the length of `content`).
    pub size: usize,
}

/// Reads the file at `path` and returns its text plus basic metadata.
///
/// The lowercased extension selects how the text is produced:
/// - `pdf`: delegated to `extract_pdf_text`;
/// - `html` / `htm`: bytes are lossily decoded as UTF-8, then passed through
///   `super::text::strip_html`;
/// - anything else: bytes are lossily decoded as UTF-8.
///
/// `size` in the result is the number of raw bytes read, not the length of
/// the extracted text.
///
/// # Errors
/// Returns `NpcError::FileLoad` when the file cannot be read.
pub fn load_file_contents(path: &str) -> Result<FileContent> {
    let bytes = std::fs::read(path).map_err(|source| NpcError::FileLoad {
        path: path.to_string(),
        source,
    })?;

    let file_type = Path::new(path)
        .extension()
        .and_then(|os| os.to_str())
        .map(str::to_lowercase)
        .unwrap_or_default();

    let content = if file_type == "pdf" {
        extract_pdf_text(path)
    } else {
        let decoded = String::from_utf8_lossy(&bytes).to_string();
        if matches!(file_type.as_str(), "html" | "htm") {
            super::text::strip_html(&decoded)
        } else {
            decoded
        }
    };

    Ok(FileContent {
        content,
        file_type,
        path: path.to_string(),
        size: bytes.len(),
    })
}

/// Extracts plain text from the PDF at `path` using the external `pdftotext` tool.
///
/// Runs `pdftotext -nopgbrk <path> -` and returns its standard output, lossily
/// decoded as UTF-8. If the process cannot be started or exits unsuccessfully,
/// the placeholder `[PDF extraction failed for <path>]` is returned instead of
/// an error.
fn extract_pdf_text(path: &str) -> String {
    let outcome = std::process::Command::new("pdftotext")
        .arg("-nopgbrk")
        .arg(path)
        .arg("-")
        .output();

    match outcome {
        Ok(out) if out.status.success() => String::from_utf8_lossy(&out.stdout).to_string(),
        // Spawn failure and non-zero exit are deliberately folded together.
        _ => format!("[PDF extraction failed for {}]", path),
    }
}

/// Reads the file at `path` as UTF-8 text and returns it unchanged.
///
/// # Errors
/// Returns `NpcError::FileLoad` when the file cannot be read as a UTF-8 string.
pub fn load_txt(path: &str) -> Result<String> {
    match std::fs::read_to_string(path) {
        Ok(text) => Ok(text),
        Err(source) => Err(NpcError::FileLoad {
            path: path.into(),
            source,
        }),
    }
}

/// Reads the CSV file at `path` and returns its raw text.
///
/// No CSV parsing is performed; the content is returned exactly as read.
///
/// # Errors
/// Returns `NpcError::FileLoad` when the file cannot be read as a UTF-8 string.
pub fn load_csv(path: &str) -> Result<String> {
    let raw = std::fs::read_to_string(path).map_err(|source| NpcError::FileLoad {
        path: path.into(),
        source,
    })?;
    Ok(raw)
}

/// Reads the JSON file at `path` and returns it pretty-printed.
///
/// If the content does not parse as JSON, or pretty-printing fails, the raw
/// file text is returned unchanged rather than an error.
///
/// # Errors
/// Returns `NpcError::FileLoad` when the file cannot be read as a UTF-8 string.
pub fn load_json(path: &str) -> Result<String> {
    let raw = std::fs::read_to_string(path).map_err(|source| NpcError::FileLoad {
        path: path.into(),
        source,
    })?;

    // Any failure along the parse/format path falls back to the raw text.
    let pretty = serde_json::from_str::<serde_json::Value>(&raw)
        .ok()
        .and_then(|value| serde_json::to_string_pretty(&value).ok());

    Ok(pretty.unwrap_or(raw))
}

pub fn load_excel(path: &str) -> Result<String> {
    use calamine::{Reader, open_workbook_auto};
    let mut workbook = open_workbook_auto(path)
        .map_err(|e| NpcError::Other(format!("Excel open failed: {}", e)))?;
    let mut output = String::new();
    for sheet_name in workbook.sheet_names().to_vec() {
        if let Ok(range) = workbook.worksheet_range(&sheet_name) {
            output.push_str(&format!("--- {} ---\n", sheet_name));
            for row in range.rows() {
                let cells: Vec<String> = row.iter().map(|c| format!("{}", c)).collect();
                output.push_str(&cells.join("\t"));
                output.push('\n');
            }
            output.push('\n');
        }
    }
    Ok(output)
}

/// Reads the image at `path` and returns a description line followed by a
/// base64 `data:` URL of its bytes.
///
/// The output has the form
/// `[Image: <path> (<n> bytes)]\ndata:image/<subtype>;base64,<payload>`.
///
/// The MIME subtype is derived from the file extension (defaulting to `png`
/// when there is none). The extension is lowercased and the common aliases
/// that are not valid subtypes on their own are normalised: `jpg`/`jpe` →
/// `jpeg`, `svg` → `svg+xml`, `tif` → `tiff`. Other extensions are used as-is.
///
/// # Errors
/// Returns `NpcError::FileLoad` when the file cannot be read.
pub fn load_image(path: &str) -> Result<String> {
    use base64::Engine;

    let raw = std::fs::read(path).map_err(|e| NpcError::FileLoad {
        path: path.into(),
        source: e,
    })?;
    let b64 = base64::engine::general_purpose::STANDARD.encode(&raw);

    let ext = Path::new(path)
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("png")
        .to_ascii_lowercase();
    // `image/jpg`, `image/svg` and `image/tif` are not registered media types,
    // so consumers of the data URL may refuse them.
    let subtype = match ext.as_str() {
        "jpg" | "jpe" => "jpeg",
        "svg" => "svg+xml",
        "tif" => "tiff",
        other => other,
    };

    Ok(format!(
        "[Image: {} ({} bytes)]\ndata:image/{};base64,{}",
        path,
        raw.len(),
        subtype,
        b64
    ))
}

/// Returns the text of the PDF at `path`.
///
/// Thin public wrapper around `extract_pdf_text`; unlike the other loaders in
/// this file it returns a plain `String` rather than a `Result`.
pub fn load_pdf(path: &str) -> String {
    extract_pdf_text(path)
}

/// Extracts the visible text of the DOCX file at `path`.
///
/// Opens the file as a zip archive, reads `word/document.xml`, and concatenates
/// the contents of every `<w:t>` text run, emitting a newline at the end of
/// each paragraph (`</w:p>`). XML entities inside runs are left undecoded.
///
/// Extraction is best-effort: if `word/document.xml` is missing or cannot be
/// read as UTF-8, an empty string is returned.
///
/// # Errors
/// Returns `NpcError::FileLoad` when the file cannot be opened and
/// `NpcError::Other` when it is not a readable zip archive.
pub fn load_docx(path: &str) -> Result<String> {
    let file = std::fs::File::open(path).map_err(|e| NpcError::FileLoad {
        path: path.into(),
        source: e,
    })?;
    let mut archive = zip::ZipArchive::new(file)
        .map_err(|e| NpcError::Other(format!("DOCX zip open failed: {}", e)))?;

    let mut xml = String::new();
    if let Ok(mut doc_xml) = archive.by_name("word/document.xml") {
        // Best-effort: on a read failure `xml` stays empty and so does the result.
        std::io::Read::read_to_string(&mut doc_xml, &mut xml).ok();
    }

    // One pass over the raw XML, matching either a text run or a paragraph end,
    // so paragraph breaks land in document order. `<w:t` must be followed by
    // whitespace or `>` so that `<w:tbl>`, `<w:tc>`, `<w:tab/>` etc. are not
    // mistaken for text runs. `(?s)` lets a run span line breaks.
    let token = regex::Regex::new(r"(?s)<w:t(?:\s[^>]*)?>(.*?)</w:t>|</w:p>")
        .expect("static DOCX token regex is valid");

    let mut text = String::new();
    for cap in token.captures_iter(&xml) {
        match cap.get(1) {
            Some(run) => text.push_str(run.as_str()),
            // No capture group means the `</w:p>` alternative matched.
            None => text.push('\n'),
        }
    }
    Ok(text)
}

/// Extracts the text of every slide in the PPTX file at `path`.
///
/// Walks the zip archive in index order and, for each entry named
/// `ppt/slides/slide*.xml`, appends the contents of every `<a:t>` text run
/// followed by a space, then a newline at the end of the slide. Entries that
/// cannot be opened are skipped, and a slide that cannot be read as UTF-8
/// contributes only its newline.
///
/// # Errors
/// Returns `NpcError::FileLoad` when the file cannot be opened and
/// `NpcError::Other` when it is not a readable zip archive.
pub fn load_pptx(path: &str) -> Result<String> {
    let file = std::fs::File::open(path).map_err(|e| NpcError::FileLoad {
        path: path.into(),
        source: e,
    })?;
    let mut archive = zip::ZipArchive::new(file)
        .map_err(|e| NpcError::Other(format!("PPTX zip open failed: {}", e)))?;

    // Compiled once up front rather than once per slide.
    let run_text = regex::Regex::new(r"<a:t>(.*?)</a:t>")
        .expect("static PPTX text-run regex is valid");

    let mut text = String::new();
    for i in 0..archive.len() {
        let mut entry = match archive.by_index(i) {
            Ok(entry) => entry,
            Err(_) => continue,
        };

        let is_slide = {
            let name = entry.name();
            name.starts_with("ppt/slides/slide") && name.ends_with(".xml")
        };
        if !is_slide {
            continue;
        }

        let mut xml = String::new();
        // Best-effort: an unreadable slide leaves `xml` empty.
        std::io::Read::read_to_string(&mut entry, &mut xml).ok();

        for cap in run_text.captures_iter(&xml) {
            text.push_str(&cap[1]);
            text.push(' ');
        }
        text.push('\n');
    }
    Ok(text)
}

/// Reads the HTML file at `path` and returns it after passing the markup
/// through `super::text::strip_html`.
///
/// # Errors
/// Returns `NpcError::FileLoad` when the file cannot be read as a UTF-8 string.
pub fn load_html(path: &str) -> Result<String> {
    std::fs::read_to_string(path)
        .map(|markup| super::text::strip_html(&markup))
        .map_err(|source| NpcError::FileLoad {
            path: path.into(),
            source,
        })
}

/// Returns a transcript of the audio file at `path`, or a placeholder.
///
/// Calls `super::audio::transcribe_audio_file(path, None)`. When that call
/// fails or yields an empty transcript, the string
/// `[Audio file at <path>; no transcript]` is returned instead. This function
/// never returns `Err`.
pub fn load_audio(path: &str) -> Result<String> {
    let transcript = super::audio::transcribe_audio_file(path, None)
        .ok()
        .filter(|text| !text.is_empty())
        .unwrap_or_else(|| format!("[Audio file at {}; no transcript]", path));
    Ok(transcript)
}

/// Returns a summary of the video file at `path`, or a placeholder.
///
/// Calls `super::video::summarize_video_file(path, None, 600)`. When that call
/// fails, the string `[Video file at <path>]` is returned instead. This
/// function never returns `Err`.
// NOTE(review): the meaning of the `None` and `600` arguments is defined by
// `summarize_video_file` and is not visible from this file.
pub fn load_video(path: &str) -> Result<String> {
    let summary = super::video::summarize_video_file(path, None, 600)
        .unwrap_or_else(|_| format!("[Video file at {}]", path));
    Ok(summary)
}

/// Splits `content` into consecutive chunks of at most `chunk_size` bytes.
///
/// Chunk ends are moved back to the nearest UTF-8 character boundary so no
/// character is ever split. Each chunk is trimmed of surrounding whitespace,
/// and chunks that are empty after trimming are dropped.
///
/// A chunk always contains at least one whole character: if `chunk_size` is
/// `0` or smaller than the next character's encoded length, that single
/// character is emitted as its own chunk. This guarantees termination for
/// every input.
pub fn chunk_text_simple(content: &str, chunk_size: usize) -> Vec<String> {
    let mut chunks = Vec::new();
    let mut start = 0;
    while start < content.len() {
        // Saturating add: a huge `chunk_size` must not overflow.
        let mut end = start.saturating_add(chunk_size).min(content.len());
        while end > start && !content.is_char_boundary(end) {
            end -= 1;
        }
        if end == start {
            // The budget cannot fit the next character; take exactly that
            // character so the loop always makes progress.
            end = start
                + content[start..]
                    .chars()
                    .next()
                    .map_or(1, char::len_utf8);
        }
        let chunk = content[start..end].trim();
        if !chunk.is_empty() {
            chunks.push(chunk.to_string());
        }
        start = end;
    }
    chunks
}

/// Loads the file at `path` and splits its text into chunks.
///
/// `chunk_size` is the maximum chunk length passed to `chunk_text_simple`;
/// it defaults to 8000 when `None`. A file with empty content yields an empty
/// vector. A load failure does not propagate as an error: it is reported as a
/// single-element vector containing `Error loading <path>: <error>`.
pub fn load_file_contents_chunked(path: &str, chunk_size: Option<usize>) -> Vec<String> {
    let limit = chunk_size.unwrap_or(8000);

    let loaded = match load_file_contents(path) {
        Ok(loaded) => loaded,
        Err(e) => return vec![format!("Error loading {}: {}", path, e)],
    };

    if loaded.content.is_empty() {
        return Vec::new();
    }
    chunk_text_simple(&loaded.content, limit)
}

/// Classifies a file extension (without the leading dot) into a broad category.
///
/// Matching is case-insensitive. Returns one of `"images"`, `"videos"`,
/// `"documents"`, `"audio"`, `"archives"`, or `"unknown"` when the extension
/// is not in any of the known lists.
pub fn extension_category(ext: &str) -> &'static str {
    // Category name paired with the uppercase extensions that belong to it.
    const CATEGORIES: [(&str, &[&str]); 5] = [
        (
            "images",
            &["PNG", "JPG", "JPEG", "GIF", "SVG", "WEBP", "BMP", "TIFF"],
        ),
        (
            "videos",
            &["MP4", "AVI", "MOV", "WMV", "MPG", "MPEG", "WEBM", "MKV"],
        ),
        (
            "documents",
            &[
                "DOCX", "PPTX", "PDF", "XLSX", "XLS", "TXT", "CSV", "MD", "HTML", "HTM",
            ],
        ),
        ("audio", &["MP3", "WAV", "M4A", "AAC", "FLAC", "OGG"]),
        ("archives", &["ZIP", "RAR", "7Z", "TAR", "GZ"]),
    ];

    let wanted = ext.to_uppercase();
    for (category, extensions) in CATEGORIES.iter() {
        if extensions.iter().any(|known| *known == wanted) {
            return category;
        }
    }
    "unknown"
}