Skip to main content

hanzo_extract/
lib.rs

1//! # Hanzo Extract
2//!
3//! Content extraction with built-in sanitization via `hanzo-guard`.
4//!
5//! This crate provides utilities for extracting text content from various sources
6//! (web pages, PDFs, etc.) and sanitizing the output for safe use with LLMs.
7//!
8//! ## Features
9//!
10//! - **Web Extraction**: Fetch and extract clean text from web pages
11//! - **PDF Extraction**: Extract text from PDF documents
12//! - **Sanitization**: Automatic PII redaction and injection detection via `hanzo-guard`
13//!
14//! ## Example
15//!
16//! ```rust,ignore
17//! use hanzo_extract::{WebExtractor, ExtractorConfig};
18//!
19//! #[tokio::main]
20//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
21//!     let extractor = WebExtractor::new(ExtractorConfig::default());
22//!     let result = extractor.extract("https://example.com").await?;
23//!     println!("Extracted: {}", result.text);
24//!     Ok(())
25//! }
26//! ```
27//!
28//! ## Architecture
29//!
30//! ```text
31//! ┌─────────────┐     ┌──────────────┐     ┌─────────────────┐
32//! │   Source    │ ──► │  Extractor   │ ──► │  Hanzo Guard    │
33//! │ (URL/PDF)   │     │ (Text Parse) │     │ (Sanitization)  │
34//! └─────────────┘     └──────────────┘     └─────────────────┘
35//!                                                   │
36//!                                                   ▼
37//!                                          ┌─────────────────┐
38//!                                          │  Clean Output   │
39//!                                          │ (LLM-Ready)     │
40//!                                          └─────────────────┘
41//! ```
42
43pub mod config;
44pub mod error;
45pub mod result;
46
47#[cfg(feature = "web")]
48pub mod web;
49
50#[cfg(feature = "pdf")]
51pub mod pdf;
52
53#[cfg(feature = "sanitize")]
54pub mod sanitize;
55
56#[cfg(feature = "conversations")]
57pub mod conversations;
58
59pub use config::ExtractorConfig;
60pub use error::{ExtractError, Result};
61pub use result::ExtractResult;
62
63#[cfg(feature = "web")]
64pub use web::WebExtractor;
65
66#[cfg(feature = "pdf")]
67pub use pdf::PdfExtractor;
68
69/// Common trait for all extractors; every implementor must also be `Send + Sync`.
70#[async_trait::async_trait]
71pub trait Extractor: Send + Sync {
72    /// Extract text content from the given source, returning a `Result<ExtractResult>`.
73    async fn extract(&self, source: &str) -> Result<ExtractResult>;
74
75    /// Extract and sanitize content; only declared when the `sanitize` feature is enabled.
76    #[cfg(feature = "sanitize")]
77    async fn extract_sanitized(&self, source: &str) -> Result<ExtractResult>;
78}