// content_extractor_rl/lib.rs

1//! Content Extractor RL - RL-based HTML article extraction library
2//!
3//! This library provides functionality for extracting article content from HTML
4//! using reinforcement learning with fallback to heuristic-based extraction.
5
6// ============================================================================
7// FILE: crates/content-extractor-rl/src/lib.rs
8// ============================================================================
9
10pub mod config;
11pub mod text_utils;
12pub mod html_parser;
13pub mod site_profile;
14pub mod baseline_extractor;
15pub mod evaluation;
16
17pub use evaluation::{
18    GroundTruthData, GroundTruthEvaluator, EvaluationMetrics,
19    algorithm_comparison::{AlgorithmComparator, ComparisonReport},
20};
21pub mod models;
22pub use models::ModelMetadata;
23
24pub mod environment;
25pub mod node_features;
26pub mod node_classifier;
27pub use node_classifier::{NodeClassifier, HybridExtractor, train_classifier};
28pub mod replay_buffer;
29pub mod reward;
30pub mod curriculum;
31
32pub mod agents;
33pub use agents::{AgentFactory, AlgorithmType, RLAgent};
34pub use cli_utils::*;
35pub use checkpoint::{Checkpoint, CheckpointManager};
36
37pub mod training;
38pub mod hyperparameter_tuner;
39pub mod plotting;
40pub mod device;
41
42// Optional MLflow integration
43#[cfg(feature = "mlflow-rs")]
44pub mod mlflow;
45
46// Re-exports
47pub use config::Config;
48pub use site_profile::{SiteProfile, SiteProfileMemory};
49pub use baseline_extractor::BaselineExtractor;
50pub use environment::ArticleExtractionEnvironment;
51pub use training::{train_standard, train_with_improvements, TrainingMetrics, TrainingSample};
52pub use hyperparameter_tuner::{TPEOptimizer, Hyperparameters, HyperparameterSpace, TrialResult};
53
54pub use plotting::{TrainingPlotter, PlotConfig};
55pub use device::{get_device, cuda_is_available, get_device_info, print_device_info};
56
57pub mod checkpoint;
58pub mod cli_utils;
59
60
61
62#[cfg(feature = "mlflow-rs")]
63pub use mlflow::{MlflowTracker, create_tracker};
64
65/// Result type for article extraction operations
66pub type Result<T> = std::result::Result<T, ExtractionError>;
67
/// Errors that can occur during article extraction.
///
/// Apart from [`ExtractionError::IoError`], every variant carries a
/// human-readable message as a `String`.
#[derive(Debug)]
pub enum ExtractionError {
    /// An underlying I/O failure; produced by `From<std::io::Error>`.
    IoError(std::io::Error),
    /// A parsing failure; `From<serde_json::Error>` maps JSON errors here.
    ParseError(String),
    /// A network-related failure, carried as a message.
    NetworkError(String),
    /// A model-related failure, carried as a message.
    ModelError(String),
    /// Extraction itself failed, carried as a message.
    ExtractionFailed(String),
    /// A `candle_core` failure; produced by `From<candle_core::Error>`.
    CandleError(String),
    /// A runtime failure, carried as a message.
    RuntimeError(String),
    /// An MLflow failure; `From<anyhow::Error>` maps every `anyhow` error here.
    MlflowError(String),
}

81impl From<anyhow::Error> for ExtractionError {
82    fn from(err: anyhow::Error) -> Self {
83        ExtractionError::MlflowError(format!("{}", err))
84    }
85}
86
87impl std::fmt::Display for ExtractionError {
88    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
89        match self {
90            ExtractionError::IoError(e) => write!(f, "IO error: {}", e),
91            ExtractionError::ParseError(e) => write!(f, "Parse error: {}", e),
92            ExtractionError::NetworkError(e) => write!(f, "Network error: {}", e),
93            ExtractionError::ModelError(e) => write!(f, "Model error: {}", e),
94            ExtractionError::ExtractionFailed(e) => write!(f, "Extraction failed: {}", e),
95            ExtractionError::CandleError(e) => write!(f, "Candle error: {}", e),
96            ExtractionError::RuntimeError(e) => write!(f, "Runtime error: {}", e),
97            ExtractionError::MlflowError(e) => write!(f, "MLFlow error: {}", e),
98        }
99    }
100}
101
102impl std::error::Error for ExtractionError {}
103
104impl From<std::io::Error> for ExtractionError {
105    fn from(err: std::io::Error) -> Self {
106        ExtractionError::IoError(err)
107    }
108}
109
110impl From<serde_json::Error> for ExtractionError {
111    fn from(err: serde_json::Error) -> Self {
112        ExtractionError::ParseError(err.to_string())
113    }
114}
115
116impl From<candle_core::Error> for ExtractionError {
117    fn from(err: candle_core::Error) -> Self {
118        ExtractionError::CandleError(err.to_string())
119    }
120}
121
122/// Extracted article result
123#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
124pub struct ExtractedArticle {
125    pub url: String,
126    pub title: Option<String>,
127    pub date: Option<String>,
128    pub content: String,
129    pub quality_score: f32,
130    pub method: String,
131    pub xpath: Option<String>,
132}
133
134/// Batch extraction result
135#[derive(Debug, serde::Serialize, serde::Deserialize)]
136pub struct BatchExtractionResult {
137    pub articles: Vec<ExtractedArticle>,
138}