quantrs2_tytan/sampler/gpu/
armin.rs

1//! GPU-accelerated Armin Sampler Implementation
2
3use scirs2_core::ndarray::{Array, Ix2};
4#[cfg(all(feature = "gpu", feature = "dwave"))]
5use scirs2_core::random::{rngs::StdRng, thread_rng, Rng, SeedableRng};
6use std::collections::HashMap;
7
8#[cfg(all(feature = "gpu", feature = "dwave"))]
9use super::super::evaluate_qubo_energy;
10use super::super::{SampleResult, Sampler, SamplerError, SamplerResult};
11
12#[cfg(all(feature = "gpu", feature = "dwave"))]
13use ocl::{
14    self,
15    enums::{DeviceInfo, DeviceInfoResult},
16    Context, DeviceType, Program,
17};
18
19/// GPU-accelerated Sampler (Armin)
20///
21/// This sampler uses GPU acceleration to find solutions to
22/// QUBO/HOBO problems. It is based on parallel tempering and
23/// is optimized for large problems.
24#[cfg(feature = "gpu")]
25pub struct ArminSampler {
26    /// Random number generator seed
27    seed: Option<u64>,
28    /// Whether to use GPU ("GPU") or CPU ("CPU")
29    mode: String,
30    /// Device to use (e.g., "cuda:0")
31    device: String,
32    /// Whether to show verbose output
33    verbose: bool,
34}
35
36#[cfg(feature = "gpu")]
37impl ArminSampler {
38    /// Create a new GPU-accelerated sampler
39    ///
40    /// # Arguments
41    ///
42    /// * `seed` - An optional random seed for reproducibility
43    #[must_use]
44    pub fn new(seed: Option<u64>) -> Self {
45        Self {
46            seed,
47            mode: "GPU".to_string(),
48            device: "cuda:0".to_string(),
49            verbose: true,
50        }
51    }
52
53    /// Create a new GPU-accelerated sampler with custom parameters
54    ///
55    /// # Arguments
56    ///
57    /// * `seed` - An optional random seed for reproducibility
58    /// * `mode` - Whether to use GPU ("GPU") or CPU ("CPU")
59    /// * `device` - Device to use (e.g., "cuda:0")
60    /// * `verbose` - Whether to show verbose output
61    #[must_use]
62    pub fn with_params(seed: Option<u64>, mode: &str, device: &str, verbose: bool) -> Self {
63        Self {
64            seed,
65            mode: mode.to_string(),
66            device: device.to_string(),
67            verbose,
68        }
69    }
70
71    /// Run GPU-accelerated annealing using OpenCL
72    #[cfg(all(feature = "gpu", feature = "dwave"))]
73    fn run_gpu_annealing(
74        &self,
75        n_vars: usize,
76        h_vector: &[f64],
77        j_matrix: &[f64],
78        num_shots: usize,
79    ) -> Result<Vec<Vec<bool>>, ocl::Error> {
80        use ocl::flags;
81        use ocl::{Buffer, Context, Device, Kernel, Platform, Program, Queue};
82
83        // Check problem size
84        if n_vars > 2048 {
85            if self.verbose {
86                println!(
87                    "Problem size too large for standard OpenCL kernel. Using chunked approach."
88                );
89            }
90            return self.run_gpu_annealing_chunked(n_vars, h_vector, j_matrix, num_shots);
91        }
92
93        // Display progress if verbose
94        if self.verbose {
95            println!("Initializing GPU with {n_vars} variables and {num_shots} shots");
96        }
97
98        // Set up OpenCL environment
99        let platform = if self.device.contains("cpu") {
100            // Find CPU platform
101            Platform::list()
102                .into_iter()
103                .find(|p| p.name().unwrap_or_default().to_lowercase().contains("cpu"))
104                .unwrap_or_else(Platform::default)
105        } else {
106            // Default platform (typically GPU)
107            Platform::default()
108        };
109
110        if self.verbose {
111            println!("Using platform: {}", platform.name().unwrap_or_default());
112        }
113
114        // Find appropriate device
115        let device = if self.device.contains("cpu") {
116            // CPU device
117            Device::list_all(platform)
118                .unwrap_or_default()
119                .into_iter()
120                .find(|d| {
121                    matches!(d.info(DeviceInfo::Type).ok(), Some(DeviceInfoResult::Type(dt)) if dt == DeviceType::default().cpu())
122                })
123                .map_or_else(|| Device::first(platform), Ok)?
124        } else {
125            // GPU device
126            Device::list_all(platform)
127                .unwrap_or_default()
128                .into_iter()
129                .find(|d| {
130                    matches!(d.info(DeviceInfo::Type).ok(), Some(DeviceInfoResult::Type(dt)) if dt == DeviceType::default().gpu())
131                })
132                .map_or_else(|| Device::first(platform), Ok)?
133        };
134
135        if self.verbose {
136            println!("Using device: {}", device.name().unwrap_or_default());
137        }
138
139        // Build context and queue
140        let context = Context::builder()
141            .platform(platform)
142            .devices(device)
143            .build()?;
144        let queue = Queue::new(&context, device, None)?;
145
146        // Use GPU kernel from kernels module
147        let src = super::kernels::SIMULATED_ANNEALING_KERNEL;
148
149        // Compile the program
150        let context = Context::builder().devices(device).build()?;
151        let program = Program::builder()
152            .devices(device)
153            .src(src)
154            .build(&context)?;
155
156        // Set up buffers
157        let h_buffer = Buffer::<f32>::builder()
158            .queue(queue.clone())
159            .flags(flags::MEM_READ_ONLY)
160            .len(n_vars)
161            .build()?;
162
163        let j_buffer = Buffer::<f32>::builder()
164            .queue(queue.clone())
165            .flags(flags::MEM_READ_ONLY)
166            .len(n_vars * n_vars)
167            .build()?;
168
169        let solutions_buffer = Buffer::<u8>::builder()
170            .queue(queue)
171            .flags(flags::MEM_WRITE_ONLY)
172            .len(num_shots * n_vars)
173            .build()?;
174
175        // Convert h_vector and j_matrix to f32
176        let h_vec_f32: Vec<f32> = h_vector.iter().map(|&x| x as f32).collect();
177        let j_mat_f32: Vec<f32> = j_matrix.iter().map(|&x| x as f32).collect();
178
179        // Transfer data to GPU
180        h_buffer.write(&h_vec_f32).enq()?;
181        j_buffer.write(&j_mat_f32).enq()?;
182
183        // Set up kernel parameters
184        let init_temp = 10.0f32;
185        let mut final_temp = 0.1f32;
186        let sweeps = if n_vars < 100 {
187            1000
188        } else if n_vars < 500 {
189            2000
190        } else {
191            5000
192        };
193
194        if self.verbose {
195            println!("Running {sweeps} sweeps with temperature range [{final_temp}, {init_temp}]");
196        }
197
198        // Create a seed based on input seed or random value
199        let seed_val = self.seed.unwrap_or_else(|| thread_rng().gen());
200
201        // Set up and run standard simulated annealing kernel
202        let mut kernel = Kernel::builder()
203            .program(&program)
204            .name("simulated_annealing")
205            .global_work_size(num_shots)
206            .arg(n_vars as i32)
207            .arg(&h_buffer)
208            .arg(&j_buffer)
209            .arg(&solutions_buffer)
210            .arg(num_shots as i32)
211            .arg(init_temp)
212            .arg(final_temp)
213            .arg(sweeps)
214            .arg(seed_val)
215            .build()?;
216
217        // Execute kernel
218        unsafe {
219            kernel.enq()?;
220        }
221
222        // Read results
223        let mut solutions_data = vec![0u8; num_shots * n_vars];
224        solutions_buffer.read(&mut solutions_data).enq()?;
225
226        // Convert to Vec<Vec<bool>>
227        let mut results = Vec::with_capacity(num_shots);
228        for i in 0..num_shots {
229            let mut solution = Vec::with_capacity(n_vars);
230            for j in 0..n_vars {
231                solution.push(solutions_data[i * n_vars + j] != 0);
232            }
233            results.push(solution);
234        }
235
236        Ok(results)
237    }
238
239    /// Chunked GPU annealing for very large problems
240    #[cfg(all(feature = "gpu", feature = "dwave"))]
241    fn run_gpu_annealing_chunked(
242        &self,
243        n_vars: usize,
244        h_vector: &[f64],
245        j_matrix: &[f64],
246        num_shots: usize,
247    ) -> Result<Vec<Vec<bool>>, ocl::Error> {
248        // For problems too large to fit in a single kernel, we chunk the problem
249        // into smaller subproblems and solve iteratively
250
251        if self.verbose {
252            println!("Using chunked approach for large problem: {n_vars} variables");
253        }
254
255        // Maximum number of variables to process in a single chunk
256        const MAX_CHUNK_SIZE: usize = 1024;
257
258        // Calculate number of chunks needed
259        let num_chunks = n_vars.div_ceil(MAX_CHUNK_SIZE);
260
261        if self.verbose {
262            println!(
263                "Processing in {num_chunks} chunks of at most {MAX_CHUNK_SIZE} variables each"
264            );
265        }
266
267        // Initialize random number generator
268        let mut rng = if let Some(seed) = self.seed {
269            StdRng::seed_from_u64(seed)
270        } else {
271            let seed: u64 = thread_rng().gen();
272            StdRng::seed_from_u64(seed)
273        };
274
275        // Initialize random solutions for all shots
276        let mut solutions: Vec<Vec<bool>> = Vec::with_capacity(num_shots);
277        for _ in 0..num_shots {
278            let mut solution = Vec::with_capacity(n_vars);
279            for _ in 0..n_vars {
280                solution.push(rng.gen_bool(0.5));
281            }
282            solutions.push(solution);
283        }
284
285        // Track energies for each solution
286        let mut energies = vec![0.0; num_shots];
287
288        // Initialize energies
289        for (i, solution) in solutions.iter().enumerate() {
290            energies[i] = evaluate_qubo_energy(solution, h_vector, j_matrix, n_vars);
291        }
292
293        // Process each chunk iteratively
294        for chunk_idx in 0..num_chunks {
295            // Calculate start and end indices for this chunk
296            let start_var = chunk_idx * MAX_CHUNK_SIZE;
297            let end_var = std::cmp::min((chunk_idx + 1) * MAX_CHUNK_SIZE, n_vars);
298            let chunk_size = end_var - start_var;
299
300            if self.verbose {
301                println!(
302                    "Processing chunk {}/{}: variables {}..{}",
303                    chunk_idx + 1,
304                    num_chunks,
305                    start_var,
306                    end_var - 1
307                );
308            }
309
310            // Extract subproblem
311            let mut chunk_h = Vec::with_capacity(chunk_size);
312            let mut chunk_j = Vec::with_capacity(chunk_size * chunk_size);
313
314            // Extract linear terms for this chunk
315            for i in start_var..end_var {
316                chunk_h.push(h_vector[i]);
317            }
318
319            // Extract quadratic terms for this chunk
320            for i in start_var..end_var {
321                for j in start_var..end_var {
322                    chunk_j.push(j_matrix[i * n_vars + j]);
323                }
324            }
325
326            // Adjust linear terms based on fixed variables outside this chunk
327            for sol_idx in 0..solutions.len() {
328                let mut adjusted_h = chunk_h.clone();
329
330                // Add contributions from fixed variables
331                for i in start_var..end_var {
332                    for j in 0..n_vars {
333                        if (j < start_var || j >= end_var) && solutions[sol_idx][j] {
334                            adjusted_h[i - start_var] += j_matrix[i * n_vars + j];
335                        }
336                    }
337                }
338
339                // Process this specific solution's subproblem
340                let mut chunk_solution = Vec::with_capacity(chunk_size);
341                for i in start_var..end_var {
342                    chunk_solution.push(solutions[sol_idx][i]);
343                }
344
345                // Optimize just this chunk using GPU
346                let optimized_chunk = self.optimize_chunk(
347                    &chunk_solution,
348                    &adjusted_h,
349                    &chunk_j,
350                    chunk_size,
351                    self.seed.map(|s| s + sol_idx as u64),
352                )?;
353
354                // Update the original solution with optimized chunk
355                for (i, &val) in optimized_chunk.iter().enumerate() {
356                    solutions[sol_idx][start_var + i] = val;
357                }
358
359                // Update energy
360                energies[sol_idx] =
361                    evaluate_qubo_energy(&solutions[sol_idx], h_vector, j_matrix, n_vars);
362            }
363        }
364
365        // Sort solutions by energy
366        let mut solution_pairs: Vec<(Vec<bool>, f64)> =
367            solutions.into_iter().zip(energies).collect();
368        solution_pairs
369            .sort_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
370
371        // Return sorted solutions
372        Ok(solution_pairs.into_iter().map(|(sol, _)| sol).collect())
373    }
374
375    /// Optimize a single chunk of variables
376    #[cfg(all(feature = "gpu", feature = "dwave"))]
377    fn optimize_chunk(
378        &self,
379        initial_state: &[bool],
380        h_vector: &[f64],
381        j_matrix: &[f64],
382        n_vars: usize,
383        seed: Option<u64>,
384    ) -> Result<Vec<bool>, ocl::Error> {
385        use ocl::flags;
386        use ocl::{Buffer, Context, Device, Kernel, Platform, Program, Queue};
387
388        // Set up OpenCL environment (same as in run_gpu_annealing)
389        let platform = Platform::default();
390        let device = Device::first(platform)?;
391        let context = Context::builder()
392            .platform(platform)
393            .devices(device)
394            .build()?;
395        let queue = Queue::new(&context, device, None)?;
396
397        // Use chunk optimization kernel from kernels module
398        let src = super::kernels::OPTIMIZE_CHUNK_KERNEL;
399
400        // Compile the program
401        let program = Program::builder()
402            .devices(device)
403            .src(src)
404            .build(&context)?;
405
406        // Set up buffers
407        let h_buffer = Buffer::<f32>::builder()
408            .queue(queue.clone())
409            .flags(flags::MEM_READ_ONLY)
410            .len(n_vars)
411            .build()?;
412
413        let j_buffer = Buffer::<f32>::builder()
414            .queue(queue.clone())
415            .flags(flags::MEM_READ_ONLY)
416            .len(n_vars * n_vars)
417            .build()?;
418
419        let initial_buffer = Buffer::<u8>::builder()
420            .queue(queue.clone())
421            .flags(flags::MEM_READ_ONLY)
422            .len(n_vars)
423            .build()?;
424
425        let result_buffer = Buffer::<u8>::builder()
426            .queue(queue)
427            .flags(flags::MEM_WRITE_ONLY)
428            .len(n_vars)
429            .build()?;
430
431        // Convert data types
432        let h_vec_f32: Vec<f32> = h_vector.iter().map(|&x| x as f32).collect();
433        let j_mat_f32: Vec<f32> = j_matrix.iter().map(|&x| x as f32).collect();
434        let initial_u8: Vec<u8> = initial_state.iter().map(|&b| u8::from(b)).collect();
435
436        // Transfer data to device
437        h_buffer.write(&h_vec_f32).enq()?;
438        j_buffer.write(&j_mat_f32).enq()?;
439        initial_buffer.write(&initial_u8).enq()?;
440
441        // Set kernel parameters
442        let mut kernel = Kernel::builder()
443            .program(&program)
444            .name("optimize_chunk")
445            .global_work_size(1) // Only one optimization task
446            .arg(n_vars as i32)
447            .arg(&h_buffer)
448            .arg(&j_buffer)
449            .arg(&initial_buffer)
450            .arg(&result_buffer)
451            .arg(5000i32) // More sweeps for thorough optimization of a chunk
452            .arg(5.0f32)  // Higher initial temperature
453            .arg(0.01f32) // Lower final temperature
454            .arg(seed.unwrap_or_else(|| thread_rng().gen()))
455            .build()?;
456
457        // Execute kernel
458        unsafe {
459            kernel.enq()?;
460        }
461
462        // Read result
463        let mut result_u8 = vec![0u8; n_vars];
464        result_buffer.read(&mut result_u8).enq()?;
465
466        // Convert back to bool
467        let mut result = result_u8.iter().map(|&b| b != 0).collect();
468
469        Ok(result)
470    }
471
472    #[cfg(not(all(feature = "gpu", feature = "dwave")))]
473    fn run_gpu_annealing(
474        &self,
475        _n_vars: usize,
476        _h_vector: &[f64],
477        _j_matrix: &[f64],
478        _num_shots: usize,
479    ) -> Result<Vec<Vec<bool>>, String> {
480        Err("GPU support not enabled. Rebuild with '--features gpu,dwave'".to_string())
481    }
482
483    #[cfg(not(all(feature = "gpu", feature = "dwave")))]
484    fn run_gpu_annealing_chunked(
485        &self,
486        _n_vars: usize,
487        _h_vector: &[f64],
488        _j_matrix: &[f64],
489        _num_shots: usize,
490    ) -> Result<Vec<Vec<bool>>, String> {
491        Err("GPU support not enabled. Rebuild with '--features gpu,dwave'".to_string())
492    }
493
494    #[cfg(not(all(feature = "gpu", feature = "dwave")))]
495    fn optimize_chunk(
496        &self,
497        _initial_state: &[bool],
498        _h_vector: &[f64],
499        _j_matrix: &[f64],
500        _n_vars: usize,
501        _seed: Option<u64>,
502    ) -> Result<Vec<bool>, String> {
503        Err("GPU support not enabled. Rebuild with '--features gpu,dwave'".to_string())
504    }
505}
506
507#[cfg(feature = "gpu")]
508impl Sampler for ArminSampler {
509    fn run_qubo(
510        &self,
511        qubo: &(Array<f64, Ix2>, HashMap<String, usize>),
512        shots: usize,
513    ) -> SamplerResult<Vec<SampleResult>> {
514        // Extract matrix and variable mapping
515        let (matrix, var_map) = qubo;
516
517        // Get the problem dimension
518        let n_vars = var_map.len();
519
520        // Map from indices back to variable names
521        let idx_to_var: HashMap<usize, String> = var_map
522            .iter()
523            .map(|(var, &idx)| (idx, var.clone()))
524            .collect();
525
526        // Determine compute resources based on mode
527        let is_gpu = self.mode.to_uppercase() == "GPU";
528        let device_info = if is_gpu {
529            format!("Using GPU device: {}", self.device)
530        } else {
531            "Using CPU acceleration".to_string()
532        };
533
534        if self.verbose {
535            println!("{device_info}");
536            println!("Problem size: {n_vars} variables");
537        }
538
539        // Convert QUBO matrix to appropriate format for OpenCL
540        let mut h_vector = Vec::with_capacity(n_vars);
541        let mut j_matrix = Vec::with_capacity(n_vars * n_vars);
542
543        // Extract diagonal (linear) terms
544        for i in 0..n_vars {
545            h_vector.push(matrix[[i, i]]);
546        }
547
548        // Extract off-diagonal (quadratic) terms
549        for i in 0..n_vars {
550            for j in 0..n_vars {
551                if i == j {
552                    j_matrix.push(0.0); // Zero on diagonal in J matrix
553                } else {
554                    j_matrix.push(matrix[[i, j]]);
555                }
556            }
557        }
558
559        // Set up OpenCL and run GPU annealing
560        #[cfg(all(feature = "gpu", feature = "dwave"))]
561        let ocl_result = self.run_gpu_annealing(n_vars, &h_vector, &j_matrix, shots);
562
563        #[cfg(not(all(feature = "gpu", feature = "dwave")))]
564        let ocl_result: Result<Vec<Vec<i32>>, SamplerError> = Err(SamplerError::GpuError(
565            "GPU support not enabled".to_string(),
566        ));
567
568        #[cfg(all(feature = "gpu", feature = "dwave"))]
569        match ocl_result {
570            Ok(binary_solutions) => {
571                // Process results
572                let mut solution_counts: HashMap<Vec<bool>, (f64, usize)> = HashMap::new();
573
574                for solution in binary_solutions {
575                    // Calculate energy using our helper function
576                    let mut energy = evaluate_qubo_energy(&solution, &h_vector, &j_matrix, n_vars);
577
578                    // Update solution counts
579                    let entry = solution_counts.entry(solution).or_insert((energy, 0));
580                    entry.1 += 1;
581                }
582
583                // Convert to SampleResult format
584                let mut results: Vec<SampleResult> = solution_counts
585                    .into_iter()
586                    .map(|(bin_solution, (energy, count))| {
587                        // Convert to variable assignments
588                        let assignments: HashMap<String, bool> = bin_solution
589                            .iter()
590                            .enumerate()
591                            .filter_map(|(idx, &value)| {
592                                idx_to_var
593                                    .get(&idx)
594                                    .map(|var_name| (var_name.clone(), value))
595                            })
596                            .collect();
597
598                        SampleResult {
599                            assignments,
600                            energy,
601                            occurrences: count,
602                        }
603                    })
604                    .collect();
605
606                // Sort by energy
607                results.sort_by(|a, b| {
608                    a.energy
609                        .partial_cmp(&b.energy)
610                        .unwrap_or(std::cmp::Ordering::Equal)
611                });
612
613                // Limit to requested number of shots
614                if results.len() > shots {
615                    results.truncate(shots);
616                }
617
618                Ok(results)
619            }
620            Err(e) => Err(SamplerError::GpuError(e.to_string())),
621        }
622
623        #[cfg(not(all(feature = "gpu", feature = "dwave")))]
624        match ocl_result {
625            Ok(_) => unreachable!("GPU support not enabled"),
626            Err(e) => Err(e),
627        }
628    }
629
630    fn run_hobo(
631        &self,
632        hobo: &(
633            Array<f64, scirs2_core::ndarray::IxDyn>,
634            HashMap<String, usize>,
635        ),
636        shots: usize,
637    ) -> SamplerResult<Vec<SampleResult>> {
638        // Handle QUBO case directly
639        if hobo.0.ndim() == 2 {
640            let matrix = hobo
641                .0
642                .clone()
643                .into_dimensionality::<scirs2_core::ndarray::Ix2>()
644                .map_err(|e| {
645                    SamplerError::InvalidParameter(format!(
646                        "Failed to convert HOBO to QUBO dimensionality: {e}"
647                    ))
648                })?;
649            let qubo = (matrix, hobo.1.clone());
650            return self.run_qubo(&qubo, shots);
651        }
652
653        // For higher-order problems, we could implement specialized kernels
654        // For now, return an error suggesting quadratization
655        Err(SamplerError::InvalidParameter(
656            "GPU acceleration for HOBO problems not yet implemented. Consider quadratization to QUBO format.".to_string()
657        ))
658    }
659}
660
661// Fallback implementation when GPU feature is not enabled
662#[cfg(not(feature = "gpu"))]
663pub struct ArminSampler {
664    _seed: Option<u64>,
665}
666
667#[cfg(not(feature = "gpu"))]
668impl ArminSampler {
669    #[must_use]
670    pub const fn new(_seed: Option<u64>) -> Self {
671        Self { _seed }
672    }
673
674    #[must_use]
675    pub const fn with_params(
676        _seed: Option<u64>,
677        _mode: &str,
678        _device: &str,
679        _verbose: bool,
680    ) -> Self {
681        Self { _seed }
682    }
683}
684
685#[cfg(not(feature = "gpu"))]
686impl Sampler for ArminSampler {
687    fn run_qubo(
688        &self,
689        _qubo: &(Array<f64, Ix2>, HashMap<String, usize>),
690        _shots: usize,
691    ) -> SamplerResult<Vec<SampleResult>> {
692        Err(SamplerError::GpuError(
693            "GPU support not enabled. Rebuild with '--features gpu'".to_string(),
694        ))
695    }
696
697    fn run_hobo(
698        &self,
699        _hobo: &(
700            Array<f64, scirs2_core::ndarray::IxDyn>,
701            HashMap<String, usize>,
702        ),
703        _shots: usize,
704    ) -> SamplerResult<Vec<SampleResult>> {
705        Err(SamplerError::GpuError(
706            "GPU support not enabled. Rebuild with '--features gpu'".to_string(),
707        ))
708    }
709}