#[cfg(feature = "python")]
use pyo3::prelude::*;
#[cfg(feature = "python")]
use crate::{UniversalProcessor, ProcessingParams, UniversalOutput, DocumentChunk, DocumentMetadata};
#[cfg(feature = "python")]
use crate::processors::DocumentProcessor;
#[cfg(feature = "python")]
use std::path::Path;
/// Python-facing handle for the document-processing pipeline.
///
/// The type holds no fields; it only serves as the receiver for the
/// processing methods exposed to Python.
#[cfg(feature = "python")]
#[pyclass]
pub struct PyUniversalProcessor {}
#[cfg(feature = "python")]
#[pymethods]
impl PyUniversalProcessor {
    /// Creates a new processor handle.
    #[new]
    pub fn new() -> Self {
        Self {}
    }

    /// Processes the file at `file_path` and returns its metadata and chunks.
    ///
    /// When `params` is omitted, `PyProcessingParams::default()` is used.
    ///
    /// # Errors
    ///
    /// Any error reported by the underlying `UniversalProcessor` is raised in
    /// Python as a `RuntimeError` carrying the error's display text.
    #[pyo3(signature = (file_path, params=None))]
    pub fn process_file(
        &self,
        file_path: &str,
        params: Option<PyProcessingParams>,
    ) -> PyResult<PyUniversalOutput> {
        let processor = UniversalProcessor::new();
        let params = ProcessingParams::from(params.unwrap_or_default());
        processor
            .process_file(Path::new(file_path), Some(params))
            .map(PyUniversalOutput::from)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(e.to_string()))
    }

    /// Returns the extensions reported by `UniversalProcessor::supported_extensions`,
    /// each converted to an owned string.
    #[staticmethod]
    pub fn get_supported_extensions() -> Vec<String> {
        UniversalProcessor::supported_extensions()
            .iter()
            .map(|s| s.to_string())
            .collect()
    }

    /// Processes in-memory text with the plain-text processor.
    ///
    /// The content is handed to `TxtProcessor::process_content` as UTF-8 bytes
    /// together with the fixed label `"text_content"` (presumably used as the
    /// source name — confirm against `TxtProcessor`). When `params` is omitted,
    /// `PyProcessingParams::default()` is used.
    ///
    /// # Errors
    ///
    /// Any error reported by the text processor is raised in Python as a
    /// `RuntimeError` carrying the error's display text.
    #[pyo3(signature = (content, params=None))]
    pub fn process_text_content(
        &self,
        content: &str,
        params: Option<PyProcessingParams>,
    ) -> PyResult<PyUniversalOutput> {
        let params = ProcessingParams::from(params.unwrap_or_default());
        let txt_processor = crate::processors::txt::TxtProcessor::new();
        // The parameters are passed by reference (`&params`).
        txt_processor
            .process_content(content.as_bytes(), "text_content", &params)
            .map(PyUniversalOutput::from)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyRuntimeError, _>(e.to_string()))
    }
}
/// Processing options as exposed to Python.
///
/// Every field is readable and writable from Python. The values are
/// translated into the crate's `ProcessingParams` before processing.
#[cfg(feature = "python")]
#[pyclass]
#[derive(Clone)]
pub struct PyProcessingParams {
    /// Forwarded as `max_chunk_size`.
    #[pyo3(get, set)]
    pub chunk_size: usize,

    /// Forwarded as `chunk_overlap`.
    #[pyo3(get, set)]
    pub overlap: usize,

    /// Forwarded as `text_cleaning`.
    #[pyo3(get, set)]
    pub clean_text: bool,

    /// NOTE(review): not forwarded by the conversion to `ProcessingParams`
    /// visible in this file — confirm whether it is meant to have an effect.
    #[pyo3(get, set)]
    pub extract_metadata: bool,

    /// When `true`, the converted parameters carry
    /// `{"preserve_formatting": true}` in `format_specific`.
    #[pyo3(get, set)]
    pub preserve_formatting: bool,
}
#[cfg(feature = "python")]
#[pymethods]
impl PyProcessingParams {
    /// Builds a parameter set from the given values.
    ///
    /// From Python every argument is optional; the defaults are
    /// `chunk_size=800`, `overlap=100`, `clean_text=True`,
    /// `extract_metadata=True` and `preserve_formatting=False`.
    #[new]
    #[pyo3(signature = (
        chunk_size=800,
        overlap=100,
        clean_text=true,
        extract_metadata=true,
        preserve_formatting=false
    ))]
    pub fn new(
        chunk_size: usize,
        overlap: usize,
        clean_text: bool,
        extract_metadata: bool,
        preserve_formatting: bool,
    ) -> Self {
        Self {
            chunk_size,
            overlap,
            clean_text,
            extract_metadata,
            preserve_formatting,
        }
    }
}
#[cfg(feature = "python")]
impl Default for PyProcessingParams {
    /// Returns the same values the Python constructor uses when called
    /// without arguments.
    fn default() -> Self {
        Self {
            chunk_size: 800,
            overlap: 100,
            clean_text: true,
            extract_metadata: true,
            preserve_formatting: false,
        }
    }
}
#[cfg(feature = "python")]
impl From<PyProcessingParams> for ProcessingParams {
    /// Translates the Python-facing options into the crate's processing
    /// parameters.
    ///
    /// `language_detection` is always set to `false`, and `extract_metadata`
    /// is not carried over.
    fn from(py_params: PyProcessingParams) -> Self {
        let PyProcessingParams {
            chunk_size,
            overlap,
            clean_text,
            extract_metadata: _,
            preserve_formatting,
        } = py_params;

        // Only an enabled flag produces a JSON payload; otherwise the slot is null.
        let format_specific = if preserve_formatting {
            serde_json::json!({"preserve_formatting": true})
        } else {
            serde_json::Value::Null
        };

        Self {
            max_chunk_size: chunk_size,
            chunk_overlap: overlap,
            text_cleaning: clean_text,
            language_detection: false,
            format_specific,
        }
    }
}
/// Read-only document metadata as exposed to Python.
#[cfg(feature = "python")]
#[pyclass]
#[derive(Clone)]
pub struct PyDocumentMetadata {
    /// File name taken from the source metadata.
    #[pyo3(get)]
    pub filename: String,

    /// File path taken from the source metadata.
    #[pyo3(get)]
    pub filepath: String,

    /// String form of the source document type.
    #[pyo3(get)]
    pub document_type: String,

    /// File size taken from the source metadata.
    #[pyo3(get)]
    pub file_size: u64,

    /// Creation timestamp formatted as RFC 3339, or `"Unknown"` when absent.
    #[pyo3(get)]
    pub created_at: String,

    /// Modification timestamp formatted as RFC 3339, or `"Unknown"` when absent.
    #[pyo3(get)]
    pub modified_at: String,

    /// Document title, if the source metadata has one.
    #[pyo3(get)]
    pub title: Option<String>,

    /// Document author, if the source metadata has one.
    #[pyo3(get)]
    pub author: Option<String>,
}
#[cfg(feature = "python")]
impl From<DocumentMetadata> for PyDocumentMetadata {
    /// Converts crate metadata into its Python-facing form.
    ///
    /// Missing timestamps become the literal string `"Unknown"`; present
    /// ones are rendered with `to_rfc3339`.
    fn from(metadata: DocumentMetadata) -> Self {
        // NOTE(review): the doubled `to_string` is kept as-is; whether the
        // first call already yields an owned `String` is not visible here.
        let document_type = metadata.document_type.to_string().to_string();

        let created_at = metadata
            .created_at
            .map_or_else(|| "Unknown".to_string(), |dt| dt.to_rfc3339());
        let modified_at = metadata
            .modified_at
            .map_or_else(|| "Unknown".to_string(), |dt| dt.to_rfc3339());

        Self {
            filename: metadata.filename,
            filepath: metadata.filepath,
            document_type,
            file_size: metadata.file_size,
            created_at,
            modified_at,
            title: metadata.title,
            author: metadata.author,
        }
    }
}
/// A single chunk of processed text as exposed to Python (read-only).
#[cfg(feature = "python")]
#[pyclass]
#[derive(Clone)]
pub struct PyDocumentChunk {
    /// Chunk identifier taken from the source chunk.
    #[pyo3(get)]
    pub id: String,

    /// Text content of the chunk.
    #[pyo3(get)]
    pub content: String,

    /// Index of the chunk, taken from the source chunk.
    #[pyo3(get)]
    pub chunk_index: usize,

    /// Size recorded in the source chunk's metadata.
    #[pyo3(get)]
    pub size: usize,

    /// Number of words in the chunk.
    #[pyo3(get)]
    pub word_count: usize,

    /// Character count of the chunk.
    /// NOTE(review): the conversion in this file fills this from the same
    /// metadata `size` value as `size` — confirm that is a character count.
    #[pyo3(get)]
    pub char_count: usize,
}
#[cfg(feature = "python")]
impl From<DocumentChunk> for PyDocumentChunk {
    /// Converts a crate chunk into its Python-facing form.
    ///
    /// The word count is read from the `"word_count"` entry of the chunk's
    /// format-specific metadata when that entry is an unsigned integer;
    /// otherwise it is computed by splitting the content on whitespace.
    fn from(chunk: DocumentChunk) -> Self {
        let recorded_words = chunk
            .metadata
            .format_specific
            .as_ref()
            .and_then(|extra| extra.get("word_count"))
            .and_then(|value| value.as_u64());

        let word_count = match recorded_words {
            Some(count) => count as usize,
            None => chunk.content.split_whitespace().count(),
        };

        // NOTE(review): `size` and `char_count` are both taken from
        // `metadata.size`; confirm that value is a character count and not
        // a byte length.
        let size = chunk.metadata.size;

        Self {
            id: chunk.id,
            content: chunk.content,
            chunk_index: chunk.chunk_index,
            size,
            word_count,
            char_count: size,
        }
    }
}
/// Result of processing one document, as exposed to Python (read-only).
#[cfg(feature = "python")]
#[pyclass]
#[derive(Clone)]
pub struct PyUniversalOutput {
    /// Metadata describing the processed document.
    #[pyo3(get)]
    pub document_metadata: PyDocumentMetadata,

    /// The chunks produced for the document.
    #[pyo3(get)]
    pub chunks: Vec<PyDocumentChunk>,
}
#[cfg(feature = "python")]
#[pymethods]
impl PyUniversalOutput {
    /// Sum of `word_count` over all chunks.
    pub fn total_word_count(&self) -> usize {
        self.chunks.iter().map(|c| c.word_count).sum::<usize>()
    }

    /// Sum of `char_count` over all chunks.
    pub fn total_char_count(&self) -> usize {
        self.chunks.iter().map(|c| c.char_count).sum::<usize>()
    }

    /// Number of chunks in this output.
    pub fn chunk_count(&self) -> usize {
        self.chunks.len()
    }

    /// Returns a copy of every chunk's text, in chunk order.
    pub fn get_text_chunks(&self) -> Vec<String> {
        let mut texts = Vec::with_capacity(self.chunks.len());
        for chunk in &self.chunks {
            texts.push(chunk.content.clone());
        }
        texts
    }

    /// Serializes the metadata, the chunks and the aggregate totals to a
    /// pretty-printed JSON string.
    ///
    /// # Errors
    ///
    /// A serialization failure is raised in Python as a `ValueError`.
    pub fn to_json(&self) -> PyResult<String> {
        let meta = &self.document_metadata;

        let metadata_json = serde_json::json!({
            "filename": meta.filename,
            "filepath": meta.filepath,
            "document_type": meta.document_type,
            "file_size": meta.file_size,
            "created_at": meta.created_at,
            "modified_at": meta.modified_at,
            "title": meta.title,
            "author": meta.author
        });

        let chunks_json: Vec<_> = self
            .chunks
            .iter()
            .map(|chunk| {
                serde_json::json!({
                    "id": chunk.id,
                    "content": chunk.content,
                    "chunk_index": chunk.chunk_index,
                    "size": chunk.size,
                    "word_count": chunk.word_count,
                    "char_count": chunk.char_count
                })
            })
            .collect();

        let document = serde_json::json!({
            "document_metadata": metadata_json,
            "chunks": chunks_json,
            "total_chunks": self.chunks.len(),
            "total_word_count": self.total_word_count(),
            "total_char_count": self.total_char_count()
        });

        match serde_json::to_string_pretty(&document) {
            Ok(text) => Ok(text),
            Err(e) => Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(e.to_string())),
        }
    }
}
#[cfg(feature = "python")]
impl From<UniversalOutput> for PyUniversalOutput {
    /// Converts a crate output into its Python-facing form, converting the
    /// metadata and each chunk in order.
    fn from(output: UniversalOutput) -> Self {
        let document_metadata: PyDocumentMetadata = output.document_metadata.into();

        let mut chunks = Vec::new();
        chunks.extend(output.chunks.into_iter().map(PyDocumentChunk::from));

        Self {
            document_metadata,
            chunks,
        }
    }
}
/// Defines the `doc_loader` Python module: registers the wrapper classes and
/// three module-level convenience functions.
#[cfg(feature = "python")]
#[pymodule]
#[pyo3(name = "doc_loader")]
fn python_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<PyUniversalProcessor>()?;
    m.add_class::<PyProcessingParams>()?;
    m.add_class::<PyDocumentMetadata>()?;
    m.add_class::<PyDocumentChunk>()?;
    m.add_class::<PyUniversalOutput>()?;

    /// Processes the file at `file_path` with default parameters, optionally
    /// overriding the chunk size.
    #[pyfn(m)]
    #[pyo3(signature = (file_path, chunk_size=None))]
    fn process_file(file_path: &str, chunk_size: Option<usize>) -> PyResult<PyUniversalOutput> {
        let defaults = PyProcessingParams::default();
        let params = PyProcessingParams {
            chunk_size: chunk_size.unwrap_or(defaults.chunk_size),
            ..defaults
        };
        PyUniversalProcessor::new().process_file(file_path, Some(params))
    }

    /// Processes in-memory text with default parameters, optionally
    /// overriding the chunk size.
    #[pyfn(m)]
    #[pyo3(signature = (content, chunk_size=None))]
    fn process_text(content: &str, chunk_size: Option<usize>) -> PyResult<PyUniversalOutput> {
        let defaults = PyProcessingParams::default();
        let params = PyProcessingParams {
            chunk_size: chunk_size.unwrap_or(defaults.chunk_size),
            ..defaults
        };
        PyUniversalProcessor::new().process_text_content(content, Some(params))
    }

    /// Returns the extensions reported by the universal processor, each
    /// converted to an owned string.
    #[pyfn(m)]
    fn supported_extensions() -> Vec<String> {
        let extensions = crate::processors::UniversalProcessor::supported_extensions();
        extensions.iter().map(|ext| ext.to_string()).collect()
    }

    Ok(())
}