continuous_rl/continuous_rl.rs

//! Quantum Continuous Reinforcement Learning Example
//!
//! This example demonstrates quantum reinforcement learning algorithms for
//! continuous action spaces such as QDDPG and QSAC, walking through QDDPG on
//! pendulum control and a custom 2D navigation environment.

use ndarray::Array1;
use quantrs2_ml::autodiff::optimizers::Adam;
use quantrs2_ml::prelude::*;

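// `PendulumEnvironment`, `QuantumDDPG`, and the `ContinuousEnvironment` trait used
// below are assumed to be re-exported by the prelude import above.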
fn main() -> Result<()> {
    println!("=== Quantum Continuous RL Demo ===\n");

    // Step 1: Test pendulum environment
    println!("1. Testing Pendulum Environment...");
    test_pendulum_dynamics()?;

    // Step 2: Train QDDPG on pendulum
    println!("\n2. Training Quantum DDPG on Pendulum Control...");
    train_qddpg_pendulum()?;

    // Step 3: Compare with random policy
    println!("\n3. Comparing with Random Policy...");
    compare_policies()?;

    // Step 4: Demonstrate custom continuous environment
    println!("\n4. Custom Continuous Environment Example...");
    custom_environment_demo()?;

    println!("\n=== Continuous RL Demo Complete ===");

    Ok(())
}

/// Test pendulum environment dynamics
fn test_pendulum_dynamics() -> Result<()> {
    let mut env = PendulumEnvironment::new();

    println!("   Initial state: {:?}", env.state());
    println!("   Action bounds: {:?}", env.action_bounds());
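    // The pendulum observation is [cos θ, sin θ, θ_dot]; the single action is a
    // torque value within the bounds printed above.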

    // Reset the environment and take a single step with each of these actions
    let actions = vec![
        Array1::from_vec(vec![0.0]),  // No torque
        Array1::from_vec(vec![2.0]),  // Max positive torque
        Array1::from_vec(vec![-2.0]), // Max negative torque
    ];

    for (i, action) in actions.iter().enumerate() {
        let state = env.reset();
        let (next_state, reward, done) = env.step(action.clone())?;

        println!("\n   Step {} with action {:.1}:", i + 1, action[0]);
        println!(
            "     State: [θ_cos={:.3}, θ_sin={:.3}, θ_dot={:.3}]",
            state[0], state[1], state[2]
        );
        println!(
            "     Next: [θ_cos={:.3}, θ_sin={:.3}, θ_dot={:.3}]",
            next_state[0], next_state[1], next_state[2]
        );
        println!("     Reward: {:.3}, Done: {}", reward, done);
    }

    Ok(())
}

/// Train QDDPG on pendulum control
fn train_qddpg_pendulum() -> Result<()> {
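    // Pendulum: a 3-dimensional observation [cos θ, sin θ, θ_dot] and one torque
    // action bounded to [-2, 2]; `num_qubits` and `buffer_capacity` size the
    // quantum circuits and the replay buffer.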
    let state_dim = 3;
    let action_dim = 1;
    let action_bounds = vec![(-2.0, 2.0)];
    let num_qubits = 4;
    let buffer_capacity = 10000;

    // Create QDDPG agent
    let mut agent = QuantumDDPG::new(
        state_dim,
        action_dim,
        action_bounds,
        num_qubits,
        buffer_capacity,
    )?;

    // Create environment
    let mut env = PendulumEnvironment::new();

    // Create optimizers
    let mut actor_optimizer = Adam::new(0.001);
    let mut critic_optimizer = Adam::new(0.001);
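    // One Adam optimizer each for the actor and the critic (0.001 is assumed to
    // be the learning rate)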

    // Train for a few episodes (reduced for demo)
    let episodes = 50;
    println!("   Training QDDPG for {} episodes...", episodes);

    let rewards = agent.train(
        &mut env,
        episodes,
        &mut actor_optimizer,
        &mut critic_optimizer,
    )?;
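    // `train` is expected to return one cumulative reward per episode; the
    // slices below compare the first and last 10 episodes.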

    // Print training statistics
    let avg_initial = rewards[..10].iter().sum::<f64>() / 10.0;
    let avg_final = rewards[rewards.len() - 10..].iter().sum::<f64>() / 10.0;

    println!("\n   Training Statistics:");
    println!("   - Average initial reward: {:.2}", avg_initial);
    println!("   - Average final reward: {:.2}", avg_final);
    println!("   - Improvement: {:.2}", avg_final - avg_initial);

    // Test trained agent
    println!("\n   Testing trained agent...");
    test_trained_agent(&agent, &mut env)?;

    Ok(())
}

/// Test a trained agent
fn test_trained_agent(agent: &QuantumDDPG, env: &mut dyn ContinuousEnvironment) -> Result<()> {
    let test_episodes = 5;
    let mut test_rewards = Vec::new();

    for episode in 0..test_episodes {
        let mut state = env.reset();
        let mut episode_reward = 0.0;
        let mut done = false;
        let mut steps = 0;

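        // Roll out one evaluation episode with exploration disabled, capped at
        // 200 steps in case the environment never reports `done`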
        while !done && steps < 200 {
            let action = agent.get_action(&state, false)?; // No exploration
            let (next_state, reward, is_done) = env.step(action)?;

            state = next_state;
            episode_reward += reward;
            done = is_done;
            steps += 1;
        }

        test_rewards.push(episode_reward);
        println!(
            "   Test episode {}: Reward = {:.2}, Steps = {}",
            episode + 1,
            episode_reward,
            steps
        );
    }

    let avg_test = test_rewards.iter().sum::<f64>() / test_episodes as f64;
    println!("   Average test reward: {:.2}", avg_test);

    Ok(())
}

/// Compare a random policy with a simple proportional-control baseline
fn compare_policies() -> Result<()> {
    let mut env = PendulumEnvironment::new();
    let episodes = 10;

    // Random policy performance
    println!("   Random Policy Performance:");
    let mut random_rewards = Vec::new();

    for _ in 0..episodes {
        env.reset();
        let mut episode_reward = 0.0;
        let mut done = false;

        while !done {
            // Random action in bounds
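            // rand::random::<f64>() is uniform in [0, 1), so this yields a torque in [-2, 2)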
            let action = Array1::from_vec(vec![4.0 * rand::random::<f64>() - 2.0]);

            let (_, reward, is_done) = env.step(action)?;
            episode_reward += reward;
            done = is_done;
        }

        random_rewards.push(episode_reward);
    }

    let avg_random = random_rewards.iter().sum::<f64>() / episodes as f64;
    println!("   Average random policy reward: {:.2}", avg_random);

    // Simple control policy (proportional control)
    println!("\n   Simple Control Policy Performance:");
    let mut control_rewards = Vec::new();

    for _ in 0..episodes {
        let mut state = env.reset();
        let mut episode_reward = 0.0;
        let mut done = false;

        while !done {
            // Proportional control: torque = -k * theta
            let theta = state[1].atan2(state[0]); // Reconstruct angle
            let action = Array1::from_vec(vec![(-2.0 * theta).clamp(-2.0, 2.0)]);
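            // atan2(sin θ, cos θ) recovers θ in (-π, π], so the torque -2θ always
            // pushes the pendulum back toward θ = 0, saturating at the action bounds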

            let (next_state, reward, is_done) = env.step(action)?;
            state = next_state;
            episode_reward += reward;
            done = is_done;
        }

        control_rewards.push(episode_reward);
    }

    let avg_control = control_rewards.iter().sum::<f64>() / episodes as f64;
    println!("   Average control policy reward: {:.2}", avg_control);

    println!("\n   Performance Summary:");
    println!("   - Random policy: {:.2}", avg_random);
    println!("   - Simple control: {:.2}", avg_control);
    println!("   - Improvement: {:.2}", avg_control - avg_random);

    Ok(())
}

/// Custom continuous environment example
fn custom_environment_demo() -> Result<()> {
    // Define a simple 2D navigation environment
    struct Navigation2D {
        position: Array1<f64>,
        goal: Array1<f64>,
        max_steps: usize,
        current_step: usize,
    }

    impl Navigation2D {
        fn new() -> Self {
            Self {
                position: Array1::zeros(2),
                goal: Array1::from_vec(vec![5.0, 5.0]),
                max_steps: 50,
                current_step: 0,
            }
        }
    }

    impl ContinuousEnvironment for Navigation2D {
        fn state(&self) -> Array1<f64> {
            // State includes position and relative goal position
            let mut state = Array1::zeros(4);
            state[0] = self.position[0];
            state[1] = self.position[1];
            state[2] = self.goal[0] - self.position[0];
            state[3] = self.goal[1] - self.position[1];
            state
        }

        fn action_bounds(&self) -> Vec<(f64, f64)> {
            vec![(-1.0, 1.0), (-1.0, 1.0)] // Velocity in x and y
        }

        fn step(&mut self, action: Array1<f64>) -> Result<(Array1<f64>, f64, bool)> {
            // Update position
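            // (the action is applied as-is; it is not clamped to the declared bounds)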
            self.position = &self.position + &action;

            // Compute distance to goal
            let distance = ((self.position[0] - self.goal[0]).powi(2)
                + (self.position[1] - self.goal[1]).powi(2))
            .sqrt();

            // Reward is negative distance (closer is better)
            let reward = -distance;

            self.current_step += 1;
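            // Terminate once the agent is within 0.5 units of the goal or the
            // step budget is exhausted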
            let done = distance < 0.5 || self.current_step >= self.max_steps;

            Ok((self.state(), reward, done))
        }

        fn reset(&mut self) -> Array1<f64> {
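            // Start each episode from a uniformly random position in [-5, 5) x [-5, 5)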
            self.position = Array1::from_vec(vec![
                10.0 * rand::random::<f64>() - 5.0,
                10.0 * rand::random::<f64>() - 5.0,
            ]);
            self.current_step = 0;
            self.state()
        }

        fn state_dim(&self) -> usize {
            4
        }
        fn action_dim(&self) -> usize {
            2
        }
    }

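    // Because Navigation2D implements the same ContinuousEnvironment trait as
    // PendulumEnvironment, it can be trained with the same QuantumDDPG code,
    // given matching state and action dimensions.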
    println!("   Created 2D Navigation Environment");

    let mut nav_env = Navigation2D::new();
    let state = nav_env.reset();

    println!("   Initial position: [{:.2}, {:.2}]", state[0], state[1]);
    println!("   Goal position: [5.00, 5.00]");
    println!("   Action space: 2D velocity vectors in [-1, 1]");

    // Demonstrate a few steps
    println!("\n   Taking some steps:");
    for i in 0..3 {
        let action = Array1::from_vec(vec![
            0.5 * (2.0 * rand::random::<f64>() - 1.0),
            0.5 * (2.0 * rand::random::<f64>() - 1.0),
        ]);

        let (next_state, reward, done) = nav_env.step(action.clone())?;

        println!(
            "   Step {}: action=[{:.2}, {:.2}], pos=[{:.2}, {:.2}], reward={:.2}, done={}",
            i + 1,
            action[0],
            action[1],
            next_state[0],
            next_state[1],
            reward,
            done
        );
    }

    println!("\n   This demonstrates how to create custom continuous environments");
    println!("   for quantum RL algorithms!");

    Ok(())
}