Skip to main content

content_extractor_rl/
lib.rs

1//! Content Extractor RL - RL-based HTML article extraction library
2//!
3//! This library provides functionality for extracting article content from HTML
4//! using reinforcement learning with fallback to heuristic-based extraction.
5
6// ============================================================================
7// FILE: crates/content-extractor-rl/src/lib.rs
8// ============================================================================
9
10pub mod config;
11pub mod text_utils;
12pub mod html_parser;
13pub mod site_profile;
14pub mod baseline_extractor;
15pub mod evaluation;
16
17pub use evaluation::{
18    GroundTruthData, GroundTruthEvaluator, EvaluationMetrics,
19    algorithm_comparison::{AlgorithmComparator, ComparisonReport},
20};
21pub mod models;
22pub use models::ModelMetadata;
23
24pub mod environment;
25pub mod replay_buffer;
26pub mod reward;
27pub mod curriculum;
28
29pub mod agents;
30pub use agents::{AgentFactory, AlgorithmType, RLAgent};
31pub use cli_utils::*;
32pub use checkpoint::{Checkpoint, CheckpointManager};
33
34pub mod training;
35pub mod hyperparameter_tuner;
36pub mod plotting;
37pub mod device;
38
39// Optional MLflow integration
40#[cfg(feature = "mlflow-rs")]
41pub mod mlflow;
42
43// Re-exports
44pub use config::Config;
45pub use site_profile::{SiteProfile, SiteProfileMemory};
46pub use baseline_extractor::BaselineExtractor;
47pub use environment::ArticleExtractionEnvironment;
48pub use training::{train_standard, train_with_improvements, TrainingMetrics};
49pub use hyperparameter_tuner::{TPEOptimizer, Hyperparameters, HyperparameterSpace, TrialResult};
50
51pub use plotting::{TrainingPlotter, PlotConfig};
52pub use device::{get_device, cuda_is_available, get_device_info, print_device_info};
53
54pub mod checkpoint;
55pub mod cli_utils;
56
57
58
59#[cfg(feature = "mlflow-rs")]
60pub use mlflow::{MlflowTracker, create_tracker};
61
/// Crate-wide shorthand for [`std::result::Result`] with the error type fixed
/// to [`ExtractionError`].
pub type Result<T> = std::result::Result<T, ExtractionError>;

/// Error type shared by the fallible operations of this crate.
///
/// Only [`ExtractionError::IoError`] stores the originating error value; every
/// other variant carries a plain `String` message.
#[derive(Debug)]
pub enum ExtractionError {
    /// An underlying [`std::io::Error`], stored as-is.
    IoError(std::io::Error),
    /// A parsing problem; `serde_json::Error` values are converted into this
    /// variant using their string form.
    ParseError(String),
    /// A network-related failure, described by the message.
    NetworkError(String),
    /// A model-related failure, described by the message.
    ModelError(String),
    /// The extraction itself did not succeed, described by the message.
    ExtractionFailed(String),
    /// A `candle_core::Error`, kept as its string form.
    CandleError(String),
    /// A runtime failure, described by the message.
    RuntimeError(String),
    /// An MLflow-related failure; `anyhow::Error` values are also converted
    /// into this variant.
    MlflowError(String),
}
77
78impl From<anyhow::Error> for ExtractionError {
79    fn from(err: anyhow::Error) -> Self {
80        ExtractionError::MlflowError(format!("{}", err))
81    }
82}
83
84impl std::fmt::Display for ExtractionError {
85    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
86        match self {
87            ExtractionError::IoError(e) => write!(f, "IO error: {}", e),
88            ExtractionError::ParseError(e) => write!(f, "Parse error: {}", e),
89            ExtractionError::NetworkError(e) => write!(f, "Network error: {}", e),
90            ExtractionError::ModelError(e) => write!(f, "Model error: {}", e),
91            ExtractionError::ExtractionFailed(e) => write!(f, "Extraction failed: {}", e),
92            ExtractionError::CandleError(e) => write!(f, "Candle error: {}", e),
93            ExtractionError::RuntimeError(e) => write!(f, "Runtime error: {}", e),
94            ExtractionError::MlflowError(e) => write!(f, "MLFlow error: {}", e),
95        }
96    }
97}
98
99impl std::error::Error for ExtractionError {}
100
101impl From<std::io::Error> for ExtractionError {
102    fn from(err: std::io::Error) -> Self {
103        ExtractionError::IoError(err)
104    }
105}
106
107impl From<serde_json::Error> for ExtractionError {
108    fn from(err: serde_json::Error) -> Self {
109        ExtractionError::ParseError(err.to_string())
110    }
111}
112
113impl From<candle_core::Error> for ExtractionError {
114    fn from(err: candle_core::Error) -> Self {
115        ExtractionError::CandleError(err.to_string())
116    }
117}
118
119/// Extracted article result
120#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
121pub struct ExtractedArticle {
122    pub url: String,
123    pub title: Option<String>,
124    pub date: Option<String>,
125    pub content: String,
126    pub quality_score: f32,
127    pub method: String,
128    pub xpath: Option<String>,
129}
130
131/// Batch extraction result
132#[derive(Debug, serde::Serialize, serde::Deserialize)]
133pub struct BatchExtractionResult {
134    pub articles: Vec<ExtractedArticle>,
135}