Skip to main content

content_extractor_rl/
curriculum.rs

1
2//! Curriculum learning manager
3
4// ============================================================================
5// FILE: crates/content-extractor-rl/src/curriculum.rs
6// ============================================================================
7
/// Curriculum-learning gate: holds a difficulty threshold that is raised as
/// training progresses and decides whether an HTML document is easy enough
/// for the current stage.
///
/// Difficulty scores produced by [`estimate_difficulty`](Self::estimate_difficulty)
/// are clamped to `[0.0, 1.0]`; a new manager starts at a threshold of `0.3`
/// and never exceeds `1.0`.
#[derive(Debug, Clone, PartialEq)]
pub struct CurriculumManager {
    /// Highest difficulty score currently accepted by `is_appropriate`.
    current_threshold: f32,
    /// Upper bound that `current_threshold` is never raised above.
    max_threshold: f32,
    /// Amount added to `current_threshold` on each qualifying update.
    increment_rate: f32,
}

impl CurriculumManager {
    /// Number of episodes between two consecutive threshold increases.
    const EPISODES_PER_INCREMENT: usize = 100;

    /// Create a new curriculum manager.
    ///
    /// The threshold starts at `0.3`, is capped at `1.0`, and grows by `0.01`
    /// per qualifying call to [`update_threshold`](Self::update_threshold).
    pub fn new() -> Self {
        Self {
            current_threshold: 0.3,
            max_threshold: 1.0,
            increment_rate: 0.01,
        }
    }

    /// Update the difficulty threshold for the given episode number.
    ///
    /// The threshold is raised by the increment rate only when `episode` is a
    /// multiple of 100 (note that this includes episode `0`); the result is
    /// capped at the maximum threshold. All other episode numbers leave the
    /// threshold untouched.
    pub fn update_threshold(&mut self, episode: usize) {
        if episode.is_multiple_of(Self::EPISODES_PER_INCREMENT) {
            self.current_threshold =
                (self.current_threshold + self.increment_rate).min(self.max_threshold);
        }
    }

    /// Get the current difficulty threshold.
    pub fn get_threshold(&self) -> f32 {
        self.current_threshold
    }

    /// Estimate how hard an HTML document is, as a score in `[0.0, 1.0]`.
    ///
    /// The score is a sum of heuristic penalties:
    ///
    /// * document size in bytes: `+0.3` above 100 000, `+0.2` above 50 000;
    /// * number of `<script` openings: `+0.3` above 20, `+0.2` above 10;
    /// * number of `<div` openings: `+0.2` above 100;
    /// * presence of an `<article` opening: `-0.2`.
    ///
    /// Tag probes are ASCII case-insensitive, so `<SCRIPT>` and `<script>`
    /// are counted alike. The probes are plain substring matches, not a real
    /// HTML parse.
    pub fn estimate_difficulty(&self, html: &str) -> f32 {
        // HTML tag names are case-insensitive. Fold ASCII case once and run
        // every probe against the same folded copy so all three heuristics
        // agree on what counts as a tag.
        let folded = html.to_ascii_lowercase();
        let script_count = folded.matches("<script").count();
        let div_count = folded.matches("<div").count();
        let has_article = folded.contains("<article");

        // ASCII folding never changes the byte length, so the original
        // length is the one being measured.
        let html_len = html.len();

        let size_penalty: f32 = if html_len > 100_000 {
            0.3
        } else if html_len > 50_000 {
            0.2
        } else {
            0.0
        };

        let script_penalty: f32 = if script_count > 20 {
            0.3
        } else if script_count > 10 {
            0.2
        } else {
            0.0
        };

        let div_penalty: f32 = if div_count > 100 { 0.2 } else { 0.0 };

        // A semantic <article> element makes the main content easier to find.
        let article_bonus: f32 = if has_article { 0.2 } else { 0.0 };

        (size_penalty + script_penalty + div_penalty - article_bonus).clamp(0.0, 1.0)
    }

    /// Check whether `html` is appropriate for the current curriculum stage,
    /// i.e. its estimated difficulty does not exceed the current threshold.
    pub fn is_appropriate(&self, html: &str) -> bool {
        self.estimate_difficulty(html) <= self.current_threshold
    }
}

impl Default for CurriculumManager {
    /// Equivalent to [`CurriculumManager::new`].
    fn default() -> Self {
        Self::new()
    }
}
81
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh manager reports the initial threshold of 0.3, and reporting an
    /// episode number divisible by 100 moves the threshold above that value.
    #[test]
    fn test_curriculum_manager() {
        let mut curriculum = CurriculumManager::new();
        let initial = curriculum.get_threshold();
        assert_eq!(initial, 0.3);

        curriculum.update_threshold(100);
        let raised = curriculum.get_threshold();
        assert!(raised > 0.3);
    }
}