use pyo3::prelude::*;
use pyo3::types::PyBytes;
use crate::python::types::PyExtractionResult;
use crate::python::errors::map_rust_error;
/// Extract content and metadata from a file at the given path.
///
/// This function automatically detects the file format and uses the appropriate
/// parser to extract text content and metadata. The GIL is released during I/O
/// and parsing operations, enabling concurrent processing from multiple threads.
///
/// Args:
///     path (str): Path to the file to extract from
///
/// Returns:
///     ExtractionResult: Object containing mime_type, content, metadata, and
///         detection_confidence
///
/// Raises:
///     IOError: If the file cannot be read or does not exist
///     ValueError: If the file format is unsupported or the file is corrupted
///     RuntimeError: If detection fails or extraction is only partially successful
///
/// Example:
///     >>> import omniparse
///     >>> result = omniparse.extract_from_path("document.pdf")
///     >>> print(f"Type: {result.mime_type}")
///     >>> print(f"Content: {result.content}")
///     >>> print(f"Author: {result.metadata.get('author', 'Unknown')}")
// NOTE(review): the exception types listed under "Raises" are produced by
// `map_rust_error`, which is defined elsewhere; keep the two in sync.
#[pyfunction]
#[pyo3(
    signature = (path),
    // Must mirror `signature`: `path` is accepted both positionally and by
    // keyword, so no positional-only marker (`/`) is advertised.
    text_signature = "(path)",
    name = "extract_from_path"
)]
pub fn extract_from_path(py: Python, path: String) -> PyResult<PyExtractionResult> {
    // The closure only uses the owned Rust `String`, so the GIL can be released
    // for the whole read + parse and other Python threads keep running.
    py.allow_threads(|| {
        let inner = crate::extract_from_path(&path).map_err(map_rust_error)?;
        Ok(PyExtractionResult { inner })
    })
}
/// Extract content and metadata from raw bytes.
///
/// This function is useful when you have file data in memory rather than on disk.
/// You can optionally provide a MIME type hint to improve detection accuracy. The
/// GIL is released during parsing operations for concurrent processing.
///
/// Args:
///     data (bytes): Raw bytes of the file content
///     mime_hint (str, optional): MIME type hint to assist detection
///         (e.g., "application/pdf", "text/plain"). Defaults to None.
///
/// Returns:
///     ExtractionResult: Object containing mime_type, content, metadata, and
///         detection_confidence
///
/// Raises:
///     ValueError: If the data format is unsupported or corrupted
///     RuntimeError: If detection fails or extraction is only partially successful
///
/// Example:
///     >>> import omniparse
///     >>> with open("document.pdf", "rb") as f:
///     ...     data = f.read()
///     >>> result = omniparse.extract_from_bytes(data, mime_hint="application/pdf")
///     >>> print(result.mime_type)
///     'application/pdf'
// NOTE(review): the exception types listed under "Raises" are produced by
// `map_rust_error`, which is defined elsewhere; keep the two in sync.
#[pyfunction]
#[pyo3(
    signature = (data, mime_hint=None),
    // Must mirror `signature`: both parameters may be passed by keyword (the
    // example above uses `mime_hint=...`), so no positional-only marker (`/`).
    text_signature = "(data, mime_hint=None)",
    name = "extract_from_bytes"
)]
pub fn extract_from_bytes(
    py: Python,
    data: &Bound<'_, PyBytes>,
    mime_hint: Option<String>,
) -> PyResult<PyExtractionResult> {
    // Take plain Rust borrows while the GIL is still held: the `Bound` handle
    // itself must not be used inside `allow_threads`, but `&[u8]` and
    // `Option<&str>` can be. `data` keeps the bytes object referenced for the
    // duration of this call.
    let bytes = data.as_bytes();
    let hint = mime_hint.as_deref();

    py.allow_threads(|| {
        let inner = crate::extract_from_bytes(bytes, hint).map_err(map_rust_error)?;
        Ok(PyExtractionResult { inner })
    })
}
/// Get a list of all supported MIME types.
///
/// Returns all MIME types that have registered parsers in the system. Use this
/// to check what formats are supported before attempting extraction.
///
/// Returns:
///     list[str]: List of MIME type strings that can be parsed by omniparse
///
/// Example:
///     >>> import omniparse
///     >>> formats = omniparse.supported_mime_types()
///     >>> isinstance(formats, list)
///     True
///     >>> print("PDF supported:", "application/pdf" in formats)
///     PDF supported: True
#[pyfunction]
#[pyo3(
    text_signature = "()",
    name = "supported_mime_types"
)]
pub fn supported_mime_types() -> Vec<String> {
    // Thin wrapper: the `Vec<String>` is converted to a Python list of `str`
    // by PyO3 on return.
    crate::supported_mime_types()
}
/// Check if a specific MIME type is supported.
///
/// This is a convenience function to quickly check if a format can be processed
/// without needing to iterate through all supported types.
///
/// Args:
///     mime_type (str): MIME type string to check (e.g., "application/pdf")
///
/// Returns:
///     bool: True if the MIME type is supported, False otherwise
///
/// Example:
///     >>> import omniparse
///     >>> if omniparse.is_mime_supported("application/pdf"):
///     ...     result = omniparse.extract_from_path("document.pdf")
///     ... else:
///     ...     print("PDF format not supported")
#[pyfunction]
#[pyo3(
    signature = (mime_type),
    // Must mirror `signature`: `mime_type` is accepted both positionally and
    // by keyword, so no positional-only marker (`/`) is advertised.
    text_signature = "(mime_type)",
    name = "is_mime_supported"
)]
pub fn is_mime_supported(mime_type: String) -> bool {
    // Thin wrapper over the crate-level lookup.
    crate::is_mime_supported(&mime_type)
}