hanzo_extract/lib.rs
1//! # Hanzo Extract
2//!
3//! Content extraction with built-in sanitization via `hanzo-guard`.
4//!
5//! This crate provides utilities for extracting text content from various sources
6//! (web pages, PDFs, etc.) and sanitizing the output for safe use with LLMs.
7//!
8//! ## Features
9//!
10//! - **Web Extraction**: Fetch and extract clean text from web pages
11//! - **PDF Extraction**: Extract text from PDF documents
12//! - **Sanitization**: Automatic PII redaction and injection detection via `hanzo-guard`
13//!
14//! ## Example
15//!
//! ```rust,ignore
//! use hanzo_extract::{WebExtractor, ExtractorConfig};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let extractor = WebExtractor::new(ExtractorConfig::default());
//!     let result = extractor.extract("https://example.com").await?;
//!     println!("Extracted: {}", result.text);
//!     Ok(())
//! }
//! ```
27//!
28//! ## Architecture
29//!
//! ```text
//! ┌─────────────┐     ┌──────────────┐     ┌─────────────────┐
//! │   Source    │ ──► │  Extractor   │ ──► │   Hanzo Guard   │
//! │  (URL/PDF)  │     │ (Text Parse) │     │ (Sanitization)  │
//! └─────────────┘     └──────────────┘     └─────────────────┘
//!                                                   │
//!                                                   ▼
//!                                          ┌─────────────────┐
//!                                          │  Clean Output   │
//!                                          │  (LLM-Ready)    │
//!                                          └─────────────────┘
//! ```
42
43pub mod config;
44pub mod error;
45pub mod result;
46
47#[cfg(feature = "web")]
48pub mod web;
49
50#[cfg(feature = "pdf")]
51pub mod pdf;
52
53#[cfg(feature = "sanitize")]
54pub mod sanitize;
55
56#[cfg(feature = "conversations")]
57pub mod conversations;
58
59pub use config::ExtractorConfig;
60pub use error::{ExtractError, Result};
61pub use result::ExtractResult;
62
63#[cfg(feature = "web")]
64pub use web::WebExtractor;
65
66#[cfg(feature = "pdf")]
67pub use pdf::PdfExtractor;
68
69/// Common trait for all extractors
70#[async_trait::async_trait]
71pub trait Extractor: Send + Sync {
72 /// Extract text content from the given source
73 async fn extract(&self, source: &str) -> Result<ExtractResult>;
74
75 /// Extract and sanitize content
76 #[cfg(feature = "sanitize")]
77 async fn extract_sanitized(&self, source: &str) -> Result<ExtractResult>;
78}