continuous_rl/continuous_rl.rs

#![allow(clippy::pedantic, clippy::unnecessary_wraps)]
//! Quantum Continuous Reinforcement Learning Example
//!
//! This example demonstrates quantum reinforcement learning algorithms for
//! continuous action spaces, such as QDDPG and QSAC, focusing here on QDDPG.
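//!
//! It walks through four steps: sanity-checking the pendulum environment,
//! training a quantum DDPG agent on pendulum control, measuring two classical
//! baselines (random and proportional control), and implementing a custom
//! continuous environment via the `ContinuousEnvironment` trait.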

use quantrs2_ml::autodiff::optimizers::Adam;
use quantrs2_ml::prelude::*;
use scirs2_core::ndarray::Array1;
use scirs2_core::random::prelude::*;

fn main() -> Result<()> {
    println!("=== Quantum Continuous RL Demo ===\n");

    // Step 1: Test pendulum environment
    println!("1. Testing Pendulum Environment...");
    test_pendulum_dynamics()?;

    // Step 2: Train QDDPG on pendulum
    println!("\n2. Training Quantum DDPG on Pendulum Control...");
    train_qddpg_pendulum()?;

    // Step 3: Compare with random policy
    println!("\n3. Comparing with Random Policy...");
    compare_policies()?;

    // Step 4: Demonstrate custom continuous environment
    println!("\n4. Custom Continuous Environment Example...");
    custom_environment_demo()?;

    println!("\n=== Continuous RL Demo Complete ===");

    Ok(())
}

/// Test pendulum environment dynamics
fn test_pendulum_dynamics() -> Result<()> {
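    // The pendulum observation is [θ_cos, θ_sin, θ_dot] (cosine and sine of the
    // pole angle plus the angular velocity), and the action is a single torque
    // within the bounds reported by `action_bounds()` ([-2, 2] here).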
    let mut env = PendulumEnvironment::new();

    println!("   Initial state: {:?}", env.state());
    println!("   Action bounds: {:?}", env.action_bounds());

    // Run a few steps with different actions
    let actions = vec![
        Array1::from_vec(vec![0.0]),  // No torque
        Array1::from_vec(vec![2.0]),  // Max positive torque
        Array1::from_vec(vec![-2.0]), // Max negative torque
    ];

    for (i, action) in actions.iter().enumerate() {
        let state = env.reset();
        let (next_state, reward, done) = env.step(action.clone())?;

        println!("\n   Step {} with action {:.1}:", i + 1, action[0]);
        println!(
            "     State: [θ_cos={:.3}, θ_sin={:.3}, θ_dot={:.3}]",
            state[0], state[1], state[2]
        );
        println!(
            "     Next: [θ_cos={:.3}, θ_sin={:.3}, θ_dot={:.3}]",
            next_state[0], next_state[1], next_state[2]
        );
        println!("     Reward: {reward:.3}, Done: {done}");
    }

    Ok(())
}

/// Train QDDPG on pendulum control
fn train_qddpg_pendulum() -> Result<()> {
    let state_dim = 3;
    let action_dim = 1;
    let action_bounds = vec![(-2.0, 2.0)];
    let num_qubits = 4;
    let buffer_capacity = 10000;

    // Create QDDPG agent
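    // Constructor arguments: the environment interface sizes and bounds
    // (state_dim, action_dim, action_bounds), the number of qubits available to
    // the agent's quantum circuits, and the capacity of the replay buffer.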
    let mut agent = QuantumDDPG::new(
        state_dim,
        action_dim,
        action_bounds,
        num_qubits,
        buffer_capacity,
    )?;

    // Create environment
    let mut env = PendulumEnvironment::new();

    // Separate Adam optimizers (learning rate 0.001) for the actor and the critic
    let mut actor_optimizer = Adam::new(0.001);
    let mut critic_optimizer = Adam::new(0.001);

    // Train for a few episodes (reduced for demo)
    let episodes = 50;
    println!("   Training QDDPG for {episodes} episodes...");

    let rewards = agent.train(
        &mut env,
        episodes,
        &mut actor_optimizer,
        &mut critic_optimizer,
    )?;
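    // `train` returns one reward value per episode; the statistics below average
    // the first and last 10 episodes of the 50-episode run.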

    // Print training statistics
    let avg_initial = rewards[..10].iter().sum::<f64>() / 10.0;
    let avg_final = rewards[rewards.len() - 10..].iter().sum::<f64>() / 10.0;

    println!("\n   Training Statistics:");
    println!("   - Average initial reward: {avg_initial:.2}");
    println!("   - Average final reward: {avg_final:.2}");
    println!("   - Improvement: {:.2}", avg_final - avg_initial);

    // Test trained agent
    println!("\n   Testing trained agent...");
    test_trained_agent(&agent, &mut env)?;

    Ok(())
}

/// Test a trained agent
fn test_trained_agent(agent: &QuantumDDPG, env: &mut dyn ContinuousEnvironment) -> Result<()> {
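    // Roll out the learned policy greedily (exploration disabled) for a few
    // episodes, capping each rollout at 200 steps.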
    let test_episodes = 5;
    let mut test_rewards = Vec::new();

    for episode in 0..test_episodes {
        let mut state = env.reset();
        let mut episode_reward = 0.0;
        let mut done = false;
        let mut steps = 0;

        while !done && steps < 200 {
            let action = agent.get_action(&state, false)?; // No exploration
            let (next_state, reward, is_done) = env.step(action.clone())?;

            state = next_state;
            episode_reward += reward;
            done = is_done;
            steps += 1;
        }

        test_rewards.push(episode_reward);
        println!(
            "   Test episode {}: Reward = {:.2}, Steps = {}",
            episode + 1,
            episode_reward,
            steps
        );
    }

    let avg_test = test_rewards.iter().sum::<f64>() / f64::from(test_episodes);
    println!("   Average test reward: {avg_test:.2}");

    Ok(())
}

/// Compare baseline policies (random vs. simple proportional control)
fn compare_policies() -> Result<()> {
    let mut env = PendulumEnvironment::new();
    let episodes = 10;
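    // Two classical baselines over the same environment: uniformly random torques
    // within the [-2, 2] bounds, and a simple proportional controller on the angle.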

    // Random policy performance
    println!("   Random Policy Performance:");
    let mut random_rewards = Vec::new();

    for _ in 0..episodes {
        let mut state = env.reset();
        let mut episode_reward = 0.0;
        let mut done = false;

        while !done {
            // Random action in bounds: 4 * u - 2 maps u ~ U[0, 1) onto the torque range [-2, 2)
            let action = Array1::from_vec(vec![4.0f64.mul_add(thread_rng().gen::<f64>(), -2.0)]);

            let (next_state, reward, is_done) = env.step(action)?;
            state = next_state;
            episode_reward += reward;
            done = is_done;
        }

        random_rewards.push(episode_reward);
    }

    let avg_random = random_rewards.iter().sum::<f64>() / f64::from(episodes);
    println!("   Average random policy reward: {avg_random:.2}");

    // Simple control policy (proportional control)
    println!("\n   Simple Control Policy Performance:");
    let mut control_rewards = Vec::new();

    for _ in 0..episodes {
        let mut state = env.reset();
        let mut episode_reward = 0.0;
        let mut done = false;

        while !done {
            // Proportional control: torque = -k * theta
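            // state = [θ_cos, θ_sin, θ_dot], so atan2(θ_sin, θ_cos) recovers the angle;
            // assuming the usual convention that θ = 0 is upright, the clamped torque
            // -2θ pushes the pendulum back toward the target.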
            let theta = state[1].atan2(state[0]); // Reconstruct angle
            let action = Array1::from_vec(vec![(-2.0 * theta).clamp(-2.0, 2.0)]);

            let (next_state, reward, is_done) = env.step(action)?;
            state = next_state;
            episode_reward += reward;
            done = is_done;
        }

        control_rewards.push(episode_reward);
    }

    let avg_control = control_rewards.iter().sum::<f64>() / f64::from(episodes);
    println!("   Average control policy reward: {avg_control:.2}");

    println!("\n   Performance Summary:");
    println!("   - Random policy: {avg_random:.2}");
    println!("   - Simple control: {avg_control:.2}");
    println!("   - Improvement: {:.2}", avg_control - avg_random);

    Ok(())
}

/// Custom continuous environment example
fn custom_environment_demo() -> Result<()> {
    // Define a simple 2D navigation environment
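    // The agent starts at a random position in [-5, 5]^2 and must reach the fixed
    // goal at (5, 5); the reward is the negative Euclidean distance to the goal,
    // and an episode ends when the agent is within 0.5 of the goal or after 50 steps.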
    struct Navigation2D {
        position: Array1<f64>,
        goal: Array1<f64>,
        max_steps: usize,
        current_step: usize,
    }

    impl Navigation2D {
        fn new() -> Self {
            Self {
                position: Array1::zeros(2),
                goal: Array1::from_vec(vec![5.0, 5.0]),
                max_steps: 50,
                current_step: 0,
            }
        }
    }

    impl ContinuousEnvironment for Navigation2D {
        fn state(&self) -> Array1<f64> {
            // State includes position and relative goal position
            let mut state = Array1::zeros(4);
            state[0] = self.position[0];
            state[1] = self.position[1];
            state[2] = self.goal[0] - self.position[0];
            state[3] = self.goal[1] - self.position[1];
            state
        }

        fn action_bounds(&self) -> Vec<(f64, f64)> {
            vec![(-1.0, 1.0), (-1.0, 1.0)] // Velocity in x and y
        }

        fn step(&mut self, action: Array1<f64>) -> Result<(Array1<f64>, f64, bool)> {
            // Update position
            self.position = &self.position + &action;

            // Compute distance to goal
            let distance = (self.position[0] - self.goal[0]).hypot(self.position[1] - self.goal[1]);

            // Reward is negative distance (closer is better)
            let reward = -distance;

            self.current_step += 1;
            let done = distance < 0.5 || self.current_step >= self.max_steps;

            Ok((self.state(), reward, done))
        }

        fn reset(&mut self) -> Array1<f64> {
            self.position = Array1::from_vec(vec![
                10.0f64.mul_add(thread_rng().gen::<f64>(), -5.0),
                10.0f64.mul_add(thread_rng().gen::<f64>(), -5.0),
            ]);
            self.current_step = 0;
            self.state()
        }

        fn state_dim(&self) -> usize {
            4
        }
        fn action_dim(&self) -> usize {
            2
        }
    }
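    // A custom environment implements the `ContinuousEnvironment` methods shown
    // above: `state`, `action_bounds`, `step`, `reset`, `state_dim`, and `action_dim`.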

    println!("   Created 2D Navigation Environment");

    let mut nav_env = Navigation2D::new();
    let state = nav_env.reset();

    println!("   Initial position: [{:.2}, {:.2}]", state[0], state[1]);
    println!("   Goal position: [5.00, 5.00]");
    println!("   Action space: 2D velocity vectors in [-1, 1]");

    // Demonstrate a few steps
    println!("\n   Taking some steps:");
    for i in 0..3 {
        // Random velocity of up to 0.5 per axis: 0.5 * (2u - 1) with u ~ U[0, 1)
        let action = Array1::from_vec(vec![
            0.5 * 2.0f64.mul_add(thread_rng().gen::<f64>(), -1.0),
            0.5 * 2.0f64.mul_add(thread_rng().gen::<f64>(), -1.0),
        ]);

        let (next_state, reward, done) = nav_env.step(action.clone())?;

        println!(
            "   Step {}: action=[{:.2}, {:.2}], pos=[{:.2}, {:.2}], reward={:.2}, done={}",
            i + 1,
            action[0],
            action[1],
            next_state[0],
            next_state[1],
            reward,
            done
        );
    }

    println!("\n   This demonstrates how to create custom continuous environments");
    println!("   for quantum RL algorithms!");

    Ok(())
}