border_core/base/step.rs
1//! Step processing interface for reinforcement learning.
2//!
3//! This module defines the core interfaces for processing environment steps in reinforcement learning.
4//! It provides structures and traits for handling the transition between states, including
5//! observations, actions, rewards, and episode termination information.
6
7use super::Env;
8
/// Marker trait for extra data that accompanies an environment step.
///
/// The trait declares no methods: implementing it only tags a type as
/// usable for step-level side information (for example diagnostics about
/// the environment's state or the agent's action).
///
/// The unit type `()` implements this trait, so `()` can be used where no
/// additional information is needed.
///
/// # Examples
///
/// ```ignore
/// #[derive(Debug)]
/// struct CustomInfo {
///     velocity: f32,
///     position: (f32, f32),
/// }
///
/// impl Info for CustomInfo {}
/// ```
pub trait Info {}

// `()` stands in as the "no additional information" type.
impl Info for () {}
29
30/// Represents a single step in the environment, containing the action taken,
31/// the resulting observation, reward, and episode status.
32///
33/// This struct encapsulates all the information produced by an environment
34/// during a single interaction step. It is used to create transitions of the form
35/// `(o_t, a_t, o_t+1, r_t)` for training reinforcement learning agents.
36///
37/// # Type Parameters
38///
39/// * `E` - The environment type that produced this step
40///
41/// # Fields
42///
43/// * `act` - The action taken by the agent
44/// * `obs` - The observation received from the environment
45/// * `reward` - The reward received for the action
46/// * `is_terminated` - Flags indicating if the episode has terminated
47/// * `is_truncated` - Flags indicating if the episode has been truncated
48/// * `info` - Additional environment-specific information
49/// * `init_obs` - The initial observation of the next episode (if applicable)
50///
51/// # Examples
52///
53/// ```ignore
54/// let step = Step::new(
55/// observation,
56/// action,
57/// vec![0.5], // reward
58/// vec![0], // not terminated
59/// vec![0], // not truncated
60/// info,
61/// None, // no initial observation
62/// );
63///
64/// if step.is_done() {
65/// // Handle episode completion
66/// }
67/// ```
68pub struct Step<E: Env> {
69 /// The action taken by the agent in this step.
70 pub act: E::Act,
71
72 /// The observation received from the environment after taking the action.
73 pub obs: E::Obs,
74
75 /// The reward received for taking the action.
76 pub reward: Vec<f32>,
77
78 /// Flags indicating if the episode has terminated.
79 /// A value of 1 indicates termination.
80 pub is_terminated: Vec<i8>,
81
82 /// Flags indicating if the episode has been truncated.
83 /// A value of 1 indicates truncation.
84 pub is_truncated: Vec<i8>,
85
86 /// Additional environment-specific information.
87 pub info: E::Info,
88
89 /// The initial observation of the next episode, if applicable.
90 /// This is used when an episode ends and a new one begins.
91 pub init_obs: Option<E::Obs>,
92}
93
94impl<E: Env> Step<E> {
95 /// Constructs a new [`Step`] object with the given components.
96 ///
97 /// # Arguments
98 ///
99 /// * `obs` - The observation received from the environment
100 /// * `act` - The action taken by the agent
101 /// * `reward` - The reward received for the action
102 /// * `is_terminated` - Flags indicating episode termination
103 /// * `is_truncated` - Flags indicating episode truncation
104 /// * `info` - Additional environment-specific information
105 /// * `init_obs` - The initial observation of the next episode
106 ///
107 /// # Returns
108 ///
109 /// A new [`Step`] object containing all the provided information
110 pub fn new(
111 obs: E::Obs,
112 act: E::Act,
113 reward: Vec<f32>,
114 is_terminated: Vec<i8>,
115 is_truncated: Vec<i8>,
116 info: E::Info,
117 init_obs: Option<E::Obs>,
118 ) -> Self {
119 Step {
120 act,
121 obs,
122 reward,
123 is_terminated,
124 is_truncated,
125 info,
126 init_obs,
127 }
128 }
129
130 /// Checks if the episode has ended, either through termination or truncation.
131 ///
132 /// # Returns
133 ///
134 /// `true` if the episode has ended, `false` otherwise
135 #[inline]
136 pub fn is_done(&self) -> bool {
137 self.is_terminated[0] == 1 || self.is_truncated[0] == 1
138 }
139}
140
141/// Processes environment steps and produces items for a replay buffer.
142///
143/// This trait defines the interface for converting [`Step`] objects into items
144/// that can be stored in a replay buffer. It is used by the [`Trainer`] to
145/// transform environment interactions into training samples.
146///
147/// # Type Parameters
148///
149/// * `E` - The environment type
150///
151/// # Associated Types
152///
153/// * `Config` - Configuration parameters for the processor
154/// * `Output` - The type of items produced by the processor
155///
156/// # Examples
157///
158/// ```ignore
159/// struct SimpleProcessor;
160///
161/// impl<E: Env> StepProcessor<E> for SimpleProcessor {
162/// type Config = ();
163/// type Output = (E::Obs, E::Act, E::Obs, f32);
164///
165/// fn build(_: &Self::Config) -> Self {
166/// Self
167/// }
168///
169/// fn reset(&mut self, _: E::Obs) {}
170///
171/// fn process(&mut self, step: Step<E>) -> Self::Output {
172/// (step.init_obs.unwrap(), step.act, step.obs, step.reward[0])
173/// }
174/// }
175/// ```
176///
177/// [`Trainer`]: crate::Trainer
178pub trait StepProcessor<E: Env> {
179 /// Configuration parameters for the processor.
180 ///
181 /// This type must implement `Clone` to support building multiple instances
182 /// with the same configuration.
183 type Config: Clone;
184
185 /// The type of items produced by the processor.
186 ///
187 /// This type should match the `Item` type of the replay buffer that will
188 /// store the processed steps.
189 type Output;
190
191 /// Builds a new processor with the given configuration.
192 ///
193 /// # Arguments
194 ///
195 /// * `config` - The configuration parameters
196 ///
197 /// # Returns
198 ///
199 /// A new instance of the processor
200 fn build(config: &Self::Config) -> Self;
201
202 /// Resets the processor with a new initial observation.
203 ///
204 /// This method is called at the start of each episode to initialize
205 /// the processor with the first observation.
206 ///
207 /// # Arguments
208 ///
209 /// * `init_obs` - The initial observation of the episode
210 fn reset(&mut self, init_obs: E::Obs);
211
212 /// Processes a step and produces an item for the replay buffer.
213 ///
214 /// This method transforms a [`Step`] object into an item that can be
215 /// stored in a replay buffer. The transformation typically involves
216 /// creating a transition tuple of the form `(o_t, a_t, o_t+1, r_t)`.
217 ///
218 /// # Arguments
219 ///
220 /// * `step` - The step to process
221 ///
222 /// # Returns
223 ///
224 /// An item ready to be stored in a replay buffer
225 fn process(&mut self, step: Step<E>) -> Self::Output;
226}