pub trait Environment {
    type State;
    type Observation;
    type Action;
    type Feedback;

    fn initial_state(&self, rng: &mut Prng) -> Self::State;
    fn observe(&self, state: &Self::State, rng: &mut Prng) -> Self::Observation;
    fn step(
        &self,
        state: Self::State,
        action: &Self::Action,
        rng: &mut Prng,
        logger: &mut dyn StatsLogger
    ) -> (Successor<Self::State>, Self::Feedback);

    fn run<T, L>(
        self,
        actor: T,
        seed: SimSeed,
        logger: L,
    ) -> Steps<Self, T, Prng, L>
    where
        T: Actor<Self::Observation, Self::Action>,
        L: StatsLogger,
        Self: Sized,
    { ... }
}

A reinforcement learning environment.

Formally, this is a Partially Observable Markov Decision Process (POMDP), but with arbitrary feedback in place of scalar reward values, and with episodes. An episode is a sequence of environment steps starting with Environment::initial_state and ending when Environment::step returns a Successor that terminates or interrupts the episode rather than continuing to a new state.
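For illustration, the trait's methods compose into an episode roughly as follows. This is only a sketch: the Successor variant names (Continue, Terminate, Interrupt) are assumptions based on the description above, and choose_action stands in for an arbitrary policy.

fn run_one_episode<E: Environment>(
    env: &E,
    rng: &mut Prng,
    logger: &mut dyn StatsLogger,
    mut choose_action: impl FnMut(&E::Observation) -> E::Action,
) -> Vec<E::Feedback> {
    let mut feedbacks = Vec::new();
    // An episode starts by sampling an initial state.
    let mut state = env.initial_state(rng);
    loop {
        // The agent only ever sees an observation of the state.
        let observation = env.observe(&state, rng);
        let action = choose_action(&observation);
        let (successor, feedback) = env.step(state, &action, rng, logger);
        feedbacks.push(feedback);
        match successor {
            // Continue within the same episode from the next state.
            Successor::Continue(next_state) => state = next_state,
            // Any terminating or interrupting successor ends the episode.
            _ => break,
        }
    }
    feedbacks
}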

This trait encodes the dynamics of a reinforcement learning environment. The actual state is represented by the State associated type.

Design Discussion

State

The use of an explicit State associated type allows the type system to manage episode lifetimes; there is no possibility of an incomplete reset between episodes. However, it forces users of this trait to handle State when they might prefer it to be a hidden internal implementation detail.

Once Generic Associated Types are stable, an alternative Environment trait could have an Episode<'a> associated type where Episode provides a step method and internally manages state. However, the generic Episode<'a> approach would make it difficult to store an environment and an episode together. Something similar could be done without GAT using an Episode<'a, E: Environment>(&'a E, E::State) struct, with the same drawback.
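To make the alternative concrete, the non-GAT Episode struct mentioned above might look roughly like the following. This is purely hypothetical and not part of the trait; it also assumes a Successor::Continue variant carrying the next state.

/// Hypothetical episode handle: borrows the environment and hides its state.
struct Episode<'a, E: Environment> {
    env: &'a E,
    /// `None` once the episode has terminated or been interrupted.
    state: Option<E::State>,
}

impl<'a, E: Environment> Episode<'a, E> {
    fn new(env: &'a E, rng: &mut Prng) -> Self {
        Episode { env, state: Some(env.initial_state(rng)) }
    }

    /// Take one step; returns `None` once the episode is over.
    fn step(
        &mut self,
        action: &E::Action,
        rng: &mut Prng,
        logger: &mut dyn StatsLogger,
    ) -> Option<E::Feedback> {
        let state = self.state.take()?;
        let (successor, feedback) = self.env.step(state, action, rng, logger);
        if let Successor::Continue(next) = successor {
            self.state = Some(next);
        }
        Some(feedback)
    }
}

Because Episode borrows the environment, a value that owns both the environment and a live episode is awkward to express, which is the drawback noted above.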

Random State

The episode is not responsible for managing its own pseudo-random state. This avoids having to frequently re-initialize the random number generator on each episode and simplifies state definitions.
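For example, a single generator can serve any number of episodes; each call simply draws from it. A minimal sketch, assuming Prng implements rand::SeedableRng as most generators do:

use rand::SeedableRng;

// One PRNG, reused across episodes; no per-episode re-seeding is needed.
fn sample_initial_states<E: Environment>(env: &E, n: usize) -> Vec<E::State> {
    let mut rng = Prng::seed_from_u64(0);
    (0..n).map(|_| env.initial_state(&mut rng)).collect()
}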

Required Associated Types

type State

Environment state type. Not necessarily observable by the agent.

type Observation

Observation of the state provided to the agent.

type Action

Action selected by the agent.

type Feedback

Feedback provided to a learning agent as the result of each step. Reward, for example.

This is distinguished from an observation in that it is only used as part of the training or evaluation process. Unless an agent is explicitly updated within an episode, its actions cannot depend on the feedback of previous steps.

Required Methods

fn initial_state(&self, rng: &mut Prng) -> Self::State

Sample a state for the start of a new episode.

rng is a source of randomness for sampling the initial state. This includes seeding any pseudo-random number generators used by the environment, which must be stored within State.
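For instance, an environment with stochastic dynamics of its own might seed a generator held inside the state, as in this hypothetical sketch (the NoisyState type and its fields are invented for illustration, and Prng is assumed to implement the usual rand traits):

use rand::{Rng, SeedableRng};

// Hypothetical state carrying its own generator, seeded once per episode
// from the `rng` passed to `initial_state`.
struct NoisyState {
    position: f64,
    episode_rng: Prng,
}

fn make_initial_state(rng: &mut Prng) -> NoisyState {
    NoisyState {
        position: 0.0,
        episode_rng: Prng::seed_from_u64(rng.gen()),
    }
}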

fn observe(&self, state: &Self::State, rng: &mut Prng) -> Self::Observation

Generate an observation for a given state.

fn step(
    &self,
    state: Self::State,
    action: &Self::Action,
    rng: &mut Prng,
    logger: &mut dyn StatsLogger,
) -> (Successor<Self::State>, Self::Feedback)

Perform a state transition in response to an action.

Args
  • state - The initial state.
  • action - The action to take at this state.
  • rng - A source of randomness for sampling the state transition.
  • logger - Logger for any auxiliary information.
Returns
  • successor - The resulting state or episode outcome.
  • feedback - Feedback to the agent's learning process.
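Putting the three required methods together, a complete environment can be as small as the following sketch. The GuessTheCoin type is invented for illustration; Successor::Terminate is assumed from the episode description above, and Prng is assumed to implement rand::Rng.

use rand::Rng;

/// Hypothetical one-step environment: the agent guesses a hidden coin flip
/// and receives a reward of 1.0 for a correct guess, 0.0 otherwise.
struct GuessTheCoin;

impl Environment for GuessTheCoin {
    type State = bool;     // the hidden coin value
    type Observation = (); // nothing is revealed before acting
    type Action = bool;    // the agent's guess
    type Feedback = f64;   // scalar reward

    fn initial_state(&self, rng: &mut Prng) -> Self::State {
        rng.gen() // flip the hidden coin
    }

    fn observe(&self, _state: &Self::State, _rng: &mut Prng) -> Self::Observation {}

    fn step(
        &self,
        state: Self::State,
        action: &Self::Action,
        _rng: &mut Prng,
        _logger: &mut dyn StatsLogger,
    ) -> (Successor<Self::State>, Self::Feedback) {
        let reward = if *action == state { 1.0 } else { 0.0 };
        // Every episode ends after a single step.
        (Successor::Terminate, reward)
    }
}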

Provided Methods

fn run<T, L>(self, actor: T, seed: SimSeed, logger: L) -> Steps<Self, T, Prng, L>

Run this environment with the given actor. The returned Steps value is an iterator yielding a PartialStep<Self::Observation, Self::Action, Self::Feedback> for each environment step.
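Since Steps is an iterator, a simulation can be consumed with ordinary iterator adapters. A sketch, generic over the environment; constructing a SimSeed or a concrete StatsLogger is not shown here.

// Collect the first 1000 steps of a simulation.
fn collect_steps<E, T, L>(
    env: E,
    actor: T,
    seed: SimSeed,
    logger: L,
) -> Vec<PartialStep<E::Observation, E::Action, E::Feedback>>
where
    E: Environment,
    T: Actor<E::Observation, E::Action>,
    L: StatsLogger,
{
    env.run(actor, seed, logger).take(1000).collect()
}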
