// border_core/base/env.rs

//! Environment interface for reinforcement learning.
//!
//! This module defines the core interface for environments in reinforcement learning.
//! An environment represents a Markov Decision Process (MDP) where an agent can interact
//! through actions and receive observations and rewards in return.

/// Represents a reinforcement learning environment, typically modeled as a Markov Decision Process (MDP).
///
/// This trait defines the interface for environments in reinforcement learning. It provides methods for:
/// - Building the environment with specific configurations
/// - Performing steps in the environment
/// - Resetting the environment to its initial state
/// - Handling episode termination and truncation
///
/// # Associated Types
///
/// * `Config` - Configuration parameters for the environment
/// * `Obs` - The type of observations returned by the environment
/// * `Act` - The type of actions accepted by the environment
/// * `Info` - Additional information returned with each step
///
/// # Examples
///
/// A typical interaction with an environment might look like:
/// ```ignore
/// let config = EnvConfig::default();
/// let mut env = Env::build(&config, 42)?;
/// let mut obs = env.reset(None)?;
///
/// loop {
///     let action = agent.sample(&obs);
///     let (step, _) = env.step(&action);
///     obs = step.obs;
///
///     if step.is_done() {
///         break;
///     }
/// }
/// ```
40use super::{Act, Info, Obs, Step};
41use crate::record::Record;
42use anyhow::Result;
43
44/// Environment interface for reinforcement learning.
45pub trait Env {
46    /// Configuration parameters for the environment.
47    ///
48    /// This type should contain all necessary parameters to build and configure
49    /// the environment, such as environment-specific settings, rendering options,
50    /// or difficulty levels.
51    type Config: Clone;
52
53    /// The type of observations returned by the environment.
54    ///
55    /// Observations represent the state of the environment as perceived by the agent.
56    /// This type must implement the [`Obs`] trait.
57    type Obs: Obs;
58
59    /// The type of actions accepted by the environment.
60    ///
61    /// Actions represent the decisions made by the agent that affect the environment.
62    /// This type must implement the [`Act`] trait.
63    type Act: Act;
64
65    /// Additional information returned with each step.
66    ///
67    /// This type can be used to provide extra information about the environment's state
68    /// that isn't part of the observation. It must implement the [`Info`] trait.
69    type Info: Info;
70
71    /// Builds a new instance of the environment with the given configuration and random seed.
72    ///
73    /// # Arguments
74    ///
75    /// * `config` - Configuration parameters for the environment
76    /// * `seed` - Random seed for reproducibility
77    ///
78    /// # Returns
79    ///
80    /// A new instance of the environment or an error if construction fails
81    fn build(config: &Self::Config, seed: i64) -> Result<Self>
82    where
83        Self: Sized;
84
85    /// Performs a single step in the environment.
86    ///
87    /// This method advances the environment by one time step, applying the given action
88    /// and returning the resulting observation, reward, and termination information.
89    ///
90    /// # Arguments
91    ///
92    /// * `a` - The action to apply to the environment
93    ///
94    /// # Returns
95    ///
96    /// A tuple containing:
97    /// 1. A [`Step`] object with the next observation, reward, and termination info
98    /// 2. A [`Record`] with additional environment-specific information
99    fn step(&mut self, a: &Self::Act) -> (Step<Self>, Record)
100    where
101        Self: Sized;
102
103    /// Resets the environment to its initial state.
104    ///
105    /// This method resets the environment when:
106    /// - `is_done` is `None` (initial reset)
107    /// - `is_done[0] == 1` (episode termination)
108    ///
109    /// # Arguments
110    ///
111    /// * `is_done` - Optional vector indicating which environments to reset
112    ///
113    /// # Note
114    ///
115    /// While the interface supports vectorized environments through `is_done`,
116    /// the current implementation only supports single environments.
117    /// Therefore, `is_done.len()` is expected to be 1.
118    fn reset(&mut self, is_done: Option<&Vec<i8>>) -> Result<Self::Obs>;
119
120    /// Performs a step and automatically resets the environment if the episode ends.
121    ///
122    /// This is a convenience method that combines [`step`] and [`reset`] operations.
123    /// If the step results in episode termination, the environment is automatically
124    /// reset and the initial observation is included in the returned step.
125    ///
126    /// # Arguments
127    ///
128    /// * `a` - The action to apply to the environment
129    ///
130    /// # Returns
131    ///
132    /// A tuple containing:
133    /// 1. A [`Step`] object with the next observation, reward, and termination info
134    /// 2. A [`Record`] with additional environment-specific information
135    ///
136    /// [`step`]: Env::step
137    /// [`reset`]: Env::reset
138    fn step_with_reset(&mut self, a: &Self::Act) -> (Step<Self>, Record)
139    where
140        Self: Sized,
141    {
142        let (step, record) = self.step(a);
143        assert_eq!(step.is_terminated.len(), 1);
144        let step = if step.is_done() {
145            let init_obs = self.reset(None).unwrap();
146            Step {
147                act: step.act,
148                obs: step.obs,
149                reward: step.reward,
150                is_terminated: step.is_terminated,
151                is_truncated: step.is_truncated,
152                info: step.info,
153                init_obs: Some(init_obs),
154            }
155        } else {
156            step
157        };
158
159        (step, record)
160    }
161
162    /// Resets the environment with a specific index.
163    ///
164    /// This method is primarily used during evaluation to control the initial state
165    /// of the environment. The index can be used in various ways, such as:
166    /// - As a random seed for deterministic initialization
167    /// - To select specific starting conditions
168    /// - To control the difficulty level
169    ///
170    /// # Arguments
171    ///
172    /// * `ix` - An index used to control the reset behavior
173    ///
174    /// # Note
175    ///
176    /// This method is called by the [`Trainer`] during evaluation to ensure
177    /// consistent testing conditions.
178    ///
179    /// [`Trainer`]: crate::Trainer
180    fn reset_with_index(&mut self, ix: usize) -> Result<Self::Obs>;
181}