//! Exploration strategies of IQN.
use serde::{Deserialize, Serialize};
use std::default::Default;
use tch::Tensor;
/// Exploration strategies for IQN.
///
/// Each variant wraps the state of one concrete explorer.
#[allow(clippy::upper_case_acronyms)]
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum IqnExplorer {
    // Softmax action selection is currently disabled:
    // Softmax(Softmax),
    /// Epsilon-greedy action selection; see [`EpsilonGreedy`].
    EpsilonGreedy(EpsilonGreedy),
}
// /// Softmax explorer for IQN.
// pub struct Softmax {}
// #[allow(clippy::new_without_default)]
// impl Softmax {
// /// Constructs softmax explorer.
// pub fn new() -> Self { Self {} }
// /// Takes an action based on the observation and the critic.
// pub fn action<M>(&mut self, qnet: &M, obs: &Tensor) -> Tensor where
// M: Model1<Input=Tensor, Output=Tensor>,
// {
// let a = qnet.forward(obs);
// a.softmax(-1, tch::Kind::Float).multinomial(1, true)
// }
// }
/// Epsilon-greedy explorer for IQN.
///
/// The exploration rate used by `action` is
/// `max(eps_start - d * n_opts, eps_final)` with
/// `d = (eps_start - eps_final) / final_step`, i.e. it moves linearly away
/// from `eps_start` as calls accumulate and never drops below `eps_final`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct EpsilonGreedy {
    /// Counter incremented once per call to `action`.
    n_opts: usize,
    /// Exploration rate used on the very first call.
    eps_start: f64,
    /// Lower bound of the exploration rate.
    eps_final: f64,
    /// Number of calls over which the rate goes from `eps_start` to `eps_final`.
    final_step: usize,
}
impl Default for EpsilonGreedy {
    /// Builds an explorer whose call counter starts at zero and whose
    /// exploration rate goes from `1.0` to `0.02` over `100_000` steps.
    fn default() -> Self {
        let (eps_start, eps_final) = (1.0, 0.02);
        let final_step = 100_000;
        Self {
            n_opts: 0,
            eps_start,
            eps_final,
            final_step,
        }
    }
}
#[allow(clippy::new_without_default)]
impl EpsilonGreedy {
    /// Constructs an epsilon-greedy explorer with an explicit schedule.
    ///
    /// The call counter starts at zero. The explorer is returned already
    /// wrapped in [`IqnExplorer::EpsilonGreedy`].
    pub fn with_params(eps_start: f64, eps_final: f64, final_step: usize) -> IqnExplorer {
        let explorer = Self {
            n_opts: 0,
            eps_start,
            eps_final,
            final_step,
        };
        IqnExplorer::EpsilonGreedy(explorer)
    }

    /// Constructs an epsilon-greedy explorer with the given `final_step`,
    /// a starting rate of `1.0` and a final rate of `0.02`.
    ///
    /// TODO: improve interface.
    pub fn with_final_step(final_step: usize) -> IqnExplorer {
        Self::with_params(1.0, 0.02, final_step)
    }

    /// Picks actions from `action_value`, exploring with the current rate.
    ///
    /// One random number is drawn per call, so the explore/exploit decision
    /// is shared by every row of `action_value`. The call counter is
    /// incremented on every call regardless of the branch taken.
    ///
    /// * Exploit: returns `action_value.argmax(-1, true)`.
    /// * Explore: returns a tensor built from `size()[0]` uniformly drawn
    ///   indices in `0..size()[1]`, stored as `i32` values.
    ///
    /// NOTE(review): the explore branch produces a 1-D tensor from `i32`
    /// data while the exploit branch keeps the reduced dimension; confirm
    /// that callers accept both shapes/kinds.
    ///
    /// # Panics
    ///
    /// In the explore branch, indexing `size()` panics when `action_value`
    /// has fewer than two dimensions. Presumably the random draw also panics
    /// when the second dimension is zero (empty range) — verify against
    /// `fastrand`.
    pub fn action(&mut self, action_value: Tensor) -> Tensor {
        // Per-step decrement of the linear schedule.
        let slope = (self.eps_start - self.eps_final) / (self.final_step as f64);
        let eps = f64::max(
            self.eps_start - slope * self.n_opts as f64,
            self.eps_final,
        );
        let explore = fastrand::f64() < eps;
        self.n_opts += 1;

        if !explore {
            return action_value.argmax(-1, true);
        }

        let shape = action_value.size();
        let batch_size = shape[0];
        let n_actions = shape[1] as u32;
        let samples: Vec<i32> = (0..batch_size)
            .map(|_| fastrand::u32(..n_actions) as i32)
            .collect();
        Tensor::of_slice(samples.as_slice())
    }
}