// content_extractor_rl/curriculum.rs
/// Tracks the difficulty ceiling of a training curriculum and scores HTML
/// documents against it.
///
/// The ceiling starts at `0.3` and is raised in steps of `0.01` by
/// [`CurriculumManager::update_threshold`] until it reaches `1.0`, so documents
/// with a higher estimated difficulty become acceptable over time.
#[derive(Debug, Clone, PartialEq)]
pub struct CurriculumManager {
    /// Highest estimated difficulty currently accepted by `is_appropriate`.
    current_threshold: f32,
    /// Upper bound that `current_threshold` is never raised beyond.
    max_threshold: f32,
    /// Amount added to `current_threshold` on each qualifying update.
    increment_rate: f32,
}

impl CurriculumManager {
    /// The threshold is raised only on episodes that are a multiple of this.
    const EPISODES_PER_STEP: usize = 100;

    /// Creates a manager with an initial threshold of `0.3`, a maximum of
    /// `1.0` and an increment of `0.01` per step.
    pub fn new() -> Self {
        Self {
            current_threshold: 0.3,
            max_threshold: 1.0,
            increment_rate: 0.01,
        }
    }

    /// Raises the threshold by one increment when `episode` is a multiple of
    /// 100, never exceeding the configured maximum.
    ///
    /// Episode `0` counts as a multiple of 100 and therefore also triggers an
    /// increment. Every other episode leaves the threshold untouched.
    pub fn update_threshold(&mut self, episode: usize) {
        if episode.is_multiple_of(Self::EPISODES_PER_STEP) {
            let raised = self.current_threshold + self.increment_rate;
            self.current_threshold = raised.min(self.max_threshold);
        }
    }

    /// Returns the current difficulty threshold.
    pub fn get_threshold(&self) -> f32 {
        self.current_threshold
    }

    /// Scores how hard `html` is expected to be, as a value in `0.0..=1.0`.
    ///
    /// The score is the sum of these contributions, clamped to that range:
    ///
    /// * size in bytes: `+0.3` above 100 000, otherwise `+0.2` above 50 000;
    /// * `<script` occurrences: `+0.3` above 20, otherwise `+0.2` above 10;
    /// * `<div` occurrences: `+0.2` above 100;
    /// * at least one `<article` occurrence: `-0.2`.
    ///
    /// Tag probes ignore ASCII case, so `<SCRIPT>` and `<script>` count alike.
    pub fn estimate_difficulty(&self, html: &str) -> f32 {
        // HTML tag names are ASCII case-insensitive. Normalise once and run
        // every probe against the same copy so all three agree on casing.
        let lowered = html.to_ascii_lowercase();
        let script_count = lowered.matches("<script").count();
        let div_count = lowered.matches("<div").count();
        let has_article = lowered.contains("<article");

        let size_score: f32 = match html.len() {
            n if n > 100_000 => 0.3,
            n if n > 50_000 => 0.2,
            _ => 0.0,
        };
        let script_score: f32 = match script_count {
            n if n > 20 => 0.3,
            n if n > 10 => 0.2,
            _ => 0.0,
        };
        let div_score: f32 = if div_count > 100 { 0.2 } else { 0.0 };
        // An <article> element is treated as a sign of an easier page.
        let article_bonus: f32 = if has_article { 0.2 } else { 0.0 };

        (size_score + script_score + div_score - article_bonus).clamp(0.0, 1.0)
    }

    /// Returns `true` when the estimated difficulty of `html` does not exceed
    /// the current threshold.
    pub fn is_appropriate(&self, html: &str) -> bool {
        self.estimate_difficulty(html) <= self.current_threshold
    }
}

impl Default for CurriculumManager {
    /// Equivalent to [`CurriculumManager::new`].
    fn default() -> Self {
        Self::new()
    }
}
81
82#[cfg(test)]
83mod tests {
84 use super::*;
85
86 #[test]
87 fn test_curriculum_manager() {
88 let mut manager = CurriculumManager::new();
89 assert_eq!(manager.get_threshold(), 0.3);
90
91 manager.update_threshold(100);
92 assert!(manager.get_threshold() > 0.3);
93 }
94}