Skip to main content

quantrs2_tytan/
gpu_samplers.rs

1//! GPU-accelerated samplers with SciRS2 integration.
2//!
3//! This module provides high-performance GPU samplers for solving QUBO and HOBO problems
4//! using CUDA kernels via SciRS2, with support for multi-GPU distributed sampling.
5
6#![allow(dead_code)]
7
8use crate::sampler::{SampleResult, Sampler, SamplerError, SamplerResult};
9use scirs2_core::ndarray::{Array, ArrayD, Ix2, IxDyn};
10use scirs2_core::random::{thread_rng, Rng, RngExt};
11use std::collections::HashMap;
12use std::sync::{Arc, Mutex};
13
14#[cfg(feature = "scirs")]
15use scirs2_core::gpu;
16
17// Stubs for missing GPU functionality
18#[cfg(feature = "scirs")]
/// Reports how many GPU devices are available.
///
/// Stub: always reports exactly one device.
const fn get_device_count() -> usize {
    // Placeholder value returned while real device discovery is not wired in.
    const STUB_DEVICE_COUNT: usize = 1;
    STUB_DEVICE_COUNT
}
23
24#[cfg(feature = "scirs")]
/// Placeholder handle for a GPU device context; holds no state in this stub.
#[derive(Debug)]
struct GpuContext;
26
27#[cfg(feature = "scirs")]
/// Basic hardware description reported for a GPU device.
///
/// Plain data, so it derives the usual value-type traits.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct DeviceInfo {
    /// Device memory in megabytes.
    memory_mb: usize,
    /// Number of compute units on the device.
    compute_units: usize,
}
32
/// Result type shared by the stubbed GPU context operations.
#[cfg(feature = "scirs")]
type GpuStubResult<T> = Result<T, Box<dyn std::error::Error>>;

#[cfg(feature = "scirs")]
impl GpuContext {
    /// Opens a context on a device. The stub ignores `_device_id` and always succeeds.
    fn new(_device_id: u32) -> GpuStubResult<Self> {
        Ok(GpuContext)
    }

    /// Describes the device. The stub reports fixed values: 8192 MB and 64 compute units.
    const fn get_device_info(&self) -> DeviceInfo {
        DeviceInfo {
            compute_units: 64,
            memory_mb: 8192,
        }
    }

    /// Reserves a memory pool of `_size`. The stub does nothing and always succeeds.
    fn allocate_memory_pool(&self, _size: usize) -> GpuStubResult<()> {
        Ok(())
    }

    /// Allocates a typed buffer of `_count` elements. The stub returns an empty placeholder.
    fn allocate<T>(&self, _count: usize) -> GpuStubResult<GpuBuffer<T>> {
        let buffer = GpuBuffer::new();
        Ok(buffer)
    }

    /// Fills `_buffer` with random states derived from `_seed`. The stub is a no-op.
    fn init_random_states(&self, _buffer: &GpuBuffer<u8>, _seed: u64) -> GpuStubResult<()> {
        Ok(())
    }

    /// Launches the kernel `_name` with the given grid/block sizes and arguments.
    /// The stub does nothing and always succeeds.
    fn launch_kernel(
        &self,
        _name: &str,
        _grid: usize,
        _block: usize,
        _args: &[KernelArg],
    ) -> GpuStubResult<()> {
        Ok(())
    }

    /// Waits for outstanding device work. The stub returns immediately.
    fn synchronize(&self) -> GpuStubResult<()> {
        Ok(())
    }
}
76
77#[cfg(feature = "scirs")]
/// Placeholder for a matrix resident on the GPU; holds no data in this stub.
#[derive(Debug)]
struct GpuMatrix;
79
80#[cfg(feature = "scirs")]
/// Typed placeholder for a device-side buffer of `T`; stores no elements in this stub.
#[derive(Debug)]
struct GpuBuffer<T> {
    /// Ties the buffer to its element type without storing any elements.
    _phantom: std::marker::PhantomData<T>,
}
84
#[cfg(feature = "scirs")]
impl<T> GpuBuffer<T> {
    /// Creates an empty placeholder buffer.
    const fn new() -> Self {
        GpuBuffer {
            _phantom: std::marker::PhantomData::<T>,
        }
    }

    /// Copies device contents into `_host_data`.
    ///
    /// The stub leaves the host slice untouched and always succeeds.
    fn copy_to_host(&self, _host_data: &mut [T]) -> Result<(), Box<dyn std::error::Error>> {
        Result::Ok(())
    }

    /// Wraps this buffer as a kernel argument; the stub yields the bare `Buffer` marker.
    const fn as_kernel_arg(&self) -> KernelArg {
        let arg = KernelArg::Buffer;
        arg
    }
}
101
102#[cfg(feature = "scirs")]
/// Argument passed to a GPU kernel launch.
#[derive(Debug, Clone, Copy, PartialEq)]
enum KernelArg {
    /// A device buffer or matrix (the stub carries no handle).
    Buffer,
    /// A 32-bit floating-point scalar.
    Scalar(f32),
    /// A 32-bit signed integer.
    Integer(i32),
}
108
#[cfg(feature = "scirs")]
impl GpuMatrix {
    /// Uploads `_matrix` through the mixed-precision path.
    ///
    /// The stub ignores both arguments and always succeeds.
    fn from_host_mixed(
        _ctx: &GpuContext,
        _matrix: &Array<f64, Ix2>,
    ) -> Result<Self, Box<dyn std::error::Error>> {
        Ok(GpuMatrix)
    }

    /// Uploads `_matrix` through the regular path.
    ///
    /// The stub ignores both arguments and always succeeds.
    fn from_host(
        _ctx: &GpuContext,
        _matrix: &Array<f64, Ix2>,
    ) -> Result<Self, Box<dyn std::error::Error>> {
        Ok(GpuMatrix)
    }

    /// Wraps this matrix as a kernel argument; the stub yields the bare `Buffer` marker.
    const fn as_kernel_arg(&self) -> KernelArg {
        let arg = KernelArg::Buffer;
        arg
    }
}
129
/// GPU-accelerated sampler with CUDA kernels via SciRS2.
///
/// Configuration is set through [`EnhancedArminSampler::new`] and the `with_*` builders.
#[derive(Debug)]
pub struct EnhancedArminSampler {
    /// Random seed for reproducibility; `None` draws a fresh seed per run.
    seed: Option<u64>,
    /// GPU device ID
    device_id: usize,
    /// Number of parallel runs per batch
    batch_size: usize,
    /// Temperature at the start of the annealing schedule
    initial_temp: f64,
    /// Temperature at the end of the annealing schedule
    final_temp: f64,
    /// Number of sweeps per run
    sweeps: usize,
    /// Enable multi-GPU distribution
    multi_gpu: bool,
    /// Memory pool size in MB
    memory_pool_mb: usize,
    /// Enable asynchronous execution
    async_mode: bool,
    /// Mixed precision computation
    use_mixed_precision: bool,
    /// Verbose output
    verbose: bool,
}
154
155impl EnhancedArminSampler {
156    /// Create a new enhanced GPU sampler
157    pub const fn new(device_id: usize) -> Self {
158        Self {
159            seed: None,
160            device_id,
161            batch_size: 1024,
162            initial_temp: 10.0,
163            final_temp: 0.01,
164            sweeps: 1000,
165            multi_gpu: false,
166            memory_pool_mb: 1024,
167            async_mode: true,
168            use_mixed_precision: true,
169            verbose: false,
170        }
171    }
172
173    /// Enable multi-GPU mode
174    pub const fn with_multi_gpu(mut self, enable: bool) -> Self {
175        self.multi_gpu = enable;
176        self
177    }
178
179    /// Set batch size for parallel runs
180    pub const fn with_batch_size(mut self, size: usize) -> Self {
181        self.batch_size = size;
182        self
183    }
184
185    /// Set temperature schedule
186    pub const fn with_temperature(mut self, initial: f64, final_: f64) -> Self {
187        self.initial_temp = initial;
188        self.final_temp = final_;
189        self
190    }
191
192    /// Set number of sweeps
193    pub const fn with_sweeps(mut self, sweeps: usize) -> Self {
194        self.sweeps = sweeps;
195        self
196    }
197
198    /// Set memory pool size
199    pub const fn with_memory_pool(mut self, size_mb: usize) -> Self {
200        self.memory_pool_mb = size_mb;
201        self
202    }
203
204    /// Enable mixed precision computation
205    pub const fn with_mixed_precision(mut self, enable: bool) -> Self {
206        self.use_mixed_precision = enable;
207        self
208    }
209
210    /// Run GPU annealing with optimized kernels
211    #[cfg(feature = "scirs")]
212    fn run_gpu_optimized(
213        &self,
214        qubo: &Array<f64, Ix2>,
215        var_map: &HashMap<String, usize>,
216        shots: usize,
217    ) -> SamplerResult<Vec<SampleResult>> {
218        let n_vars = var_map.len();
219
220        // Initialize GPU context
221        let device_id_u32: u32 = self.device_id.try_into().map_err(|_| {
222            SamplerError::InvalidParameter(format!(
223                "Device ID {} is too large for u32",
224                self.device_id
225            ))
226        })?;
227        let ctx = GpuContext::new(device_id_u32)
228            .map_err(|e| SamplerError::GpuError(format!("Failed to initialize GPU: {e}")))?;
229
230        if self.verbose {
231            let info = ctx.get_device_info();
232            println!(
233                "GPU Device: {} MB memory, {} compute units",
234                info.memory_mb, info.compute_units
235            );
236        }
237
238        // Allocate memory pool
239        ctx.allocate_memory_pool(self.memory_pool_mb * 1024 * 1024)
240            .map_err(|e| SamplerError::GpuError(format!("Memory pool allocation failed: {e}")))?;
241
242        // Transfer QUBO matrix to GPU
243        let gpu_qubo = if self.use_mixed_precision {
244            // Convert to FP16 for mixed precision
245            GpuMatrix::from_host_mixed(&ctx, qubo)
246                .map_err(|e| SamplerError::GpuError(format!("Matrix transfer failed: {e}")))?
247        } else {
248            GpuMatrix::from_host(&ctx, qubo)
249                .map_err(|e| SamplerError::GpuError(format!("Matrix transfer failed: {e}")))?
250        };
251
252        // Run annealing in batches
253        let mut all_results = Vec::new();
254        let num_batches = shots.div_ceil(self.batch_size);
255
256        for batch in 0..num_batches {
257            let batch_size = std::cmp::min(self.batch_size, shots - batch * self.batch_size);
258
259            if self.verbose {
260                println!(
261                    "Processing batch {}/{} ({} samples)",
262                    batch + 1,
263                    num_batches,
264                    batch_size
265                );
266            }
267
268            // Launch CUDA kernel for parallel tempering
269            let states = self.launch_annealing_kernel(&ctx, &gpu_qubo, n_vars, batch_size)?;
270
271            // Convert GPU results to SampleResult
272            let batch_results = self.process_gpu_results(states, var_map)?;
273            all_results.extend(batch_results);
274        }
275
276        // Sort by energy
277        all_results.sort_by(|a, b| {
278            a.energy
279                .partial_cmp(&b.energy)
280                .unwrap_or(std::cmp::Ordering::Equal)
281        });
282
283        Ok(all_results)
284    }
285
286    /// Launch optimized CUDA annealing kernel
287    #[cfg(feature = "scirs")]
288    fn launch_annealing_kernel(
289        &self,
290        ctx: &GpuContext,
291        gpu_qubo: &GpuMatrix,
292        n_vars: usize,
293        batch_size: usize,
294    ) -> SamplerResult<Vec<Vec<bool>>> {
295        // CUDA kernel parameters
296        let block_size = 256;
297        let grid_size = batch_size.div_ceil(block_size);
298
299        // Allocate device memory for states
300        let states_size = batch_size * n_vars;
301        let d_states = ctx
302            .allocate::<u8>(states_size)
303            .map_err(|e| SamplerError::GpuError(format!("State allocation failed: {e}")))?;
304
305        // Allocate device memory for energies
306        let d_energies = ctx
307            .allocate::<f32>(batch_size)
308            .map_err(|e| SamplerError::GpuError(format!("Energy allocation failed: {e}")))?;
309
310        // Initialize random states on GPU
311        ctx.init_random_states(
312            &d_states,
313            self.seed.unwrap_or_else(|| thread_rng().random()),
314        )
315        .map_err(|e| SamplerError::GpuError(format!("Random init failed: {e}")))?;
316
317        // Launch parallel tempering kernel
318        let kernel_name = if self.use_mixed_precision {
319            "parallel_tempering_mixed_precision"
320        } else {
321            "parallel_tempering_fp32"
322        };
323
324        ctx.launch_kernel(
325            kernel_name,
326            grid_size,
327            block_size,
328            &[
329                gpu_qubo.as_kernel_arg(),
330                d_states.as_kernel_arg(),
331                d_energies.as_kernel_arg(),
332                KernelArg::Integer(n_vars as i32),
333                KernelArg::Integer(batch_size as i32),
334                KernelArg::Scalar(self.initial_temp as f32),
335                KernelArg::Scalar(self.final_temp as f32),
336                KernelArg::Integer(self.sweeps as i32),
337            ],
338        )
339        .map_err(|e| SamplerError::GpuError(format!("Kernel launch failed: {e}")))?;
340
341        // Synchronize if not in async mode
342        if !self.async_mode {
343            ctx.synchronize()
344                .map_err(|e| SamplerError::GpuError(format!("Synchronization failed: {e}")))?;
345        }
346
347        // Copy results back to host
348        let mut host_states = vec![0u8; states_size];
349        d_states
350            .copy_to_host(&mut host_states)
351            .map_err(|e| SamplerError::GpuError(format!("Result transfer failed: {e}")))?;
352
353        // Convert to boolean vectors
354        let mut results = Vec::new();
355        for i in 0..batch_size {
356            let start = i * n_vars;
357            let end = start + n_vars;
358            let state: Vec<bool> = host_states[start..end].iter().map(|&x| x != 0).collect();
359            results.push(state);
360        }
361
362        Ok(results)
363    }
364
365    /// Process GPU results into SampleResult format
366    fn process_gpu_results(
367        &self,
368        states: Vec<Vec<bool>>,
369        var_map: &HashMap<String, usize>,
370    ) -> SamplerResult<Vec<SampleResult>> {
371        let idx_to_var: HashMap<usize, String> = var_map
372            .iter()
373            .map(|(var, &idx)| (idx, var.clone()))
374            .collect();
375
376        let mut results = Vec::new();
377
378        for state in states {
379            // Create variable assignments
380            let mut assignments: HashMap<String, bool> = HashMap::new();
381            for (idx, &value) in state.iter().enumerate() {
382                let var_name = idx_to_var.get(&idx).ok_or_else(|| {
383                    SamplerError::InvalidParameter(format!(
384                        "Variable index {} not found in variable map",
385                        idx
386                    ))
387                })?;
388                assignments.insert(var_name.clone(), value);
389            }
390
391            // Energy will be calculated on GPU in real implementation
392            let energy = 0.0; // Placeholder
393
394            results.push(SampleResult {
395                assignments,
396                energy,
397                occurrences: 1,
398            });
399        }
400
401        Ok(results)
402    }
403
404    /// Fallback implementation when SciRS2 is not available
405    #[cfg(not(feature = "scirs"))]
406    fn run_gpu_optimized(
407        &self,
408        _qubo: &Array<f64, Ix2>,
409        _var_map: &HashMap<String, usize>,
410        _shots: usize,
411    ) -> SamplerResult<Vec<SampleResult>> {
412        Err(SamplerError::GpuError(
413            "GPU acceleration requires SciRS2 feature".to_string(),
414        ))
415    }
416}
417
418impl Sampler for EnhancedArminSampler {
419    fn run_qubo(
420        &self,
421        qubo: &(Array<f64, Ix2>, HashMap<String, usize>),
422        shots: usize,
423    ) -> SamplerResult<Vec<SampleResult>> {
424        let (matrix, var_map) = qubo;
425
426        if self.multi_gpu {
427            self.run_multi_gpu(matrix, var_map, shots)
428        } else {
429            self.run_gpu_optimized(matrix, var_map, shots)
430        }
431    }
432
433    fn run_hobo(
434        &self,
435        _hobo: &(ArrayD<f64>, HashMap<String, usize>),
436        _shots: usize,
437    ) -> SamplerResult<Vec<SampleResult>> {
438        // For HOBO, we need to use tensor decomposition
439        // This is handled by MIKASAmpler
440        Err(SamplerError::InvalidParameter(
441            "Use MIKASAmpler for HOBO problems".to_string(),
442        ))
443    }
444}
445
446impl EnhancedArminSampler {
447    /// Run sampling across multiple GPUs
448    #[cfg(feature = "scirs")]
449    fn run_multi_gpu(
450        &self,
451        qubo: &Array<f64, Ix2>,
452        var_map: &HashMap<String, usize>,
453        shots: usize,
454    ) -> SamplerResult<Vec<SampleResult>> {
455        let num_gpus = get_device_count();
456
457        if num_gpus <= 1 {
458            return self.run_gpu_optimized(qubo, var_map, shots);
459        }
460
461        if self.verbose {
462            println!("Using {num_gpus} GPUs for distributed sampling");
463        }
464
465        // Distribute shots across GPUs
466        let shots_per_gpu = shots / num_gpus;
467        let remainder = shots % num_gpus;
468
469        let mut results = Arc::new(Mutex::new(Vec::new()));
470        let mut handles = Vec::new();
471
472        // Launch sampling on each GPU in parallel
473        for gpu_id in 0..num_gpus {
474            let gpu_shots = if gpu_id < remainder {
475                shots_per_gpu + 1
476            } else {
477                shots_per_gpu
478            };
479
480            let qubo_clone = qubo.clone();
481            let var_map_clone = var_map.clone();
482            let results_clone = Arc::clone(&results);
483            let sampler = self.clone_with_device(gpu_id);
484
485            let handle = std::thread::spawn(move || {
486                match sampler.run_gpu_optimized(&qubo_clone, &var_map_clone, gpu_shots) {
487                    Ok(gpu_results) => {
488                        let mut all_results = results_clone
489                            .lock()
490                            .expect("Results mutex poisoned - a GPU thread panicked");
491                        all_results.extend(gpu_results);
492                    }
493                    Err(e) => {
494                        eprintln!("GPU {gpu_id} failed: {e}");
495                    }
496                }
497            });
498
499            handles.push(handle);
500        }
501
502        // Wait for all GPUs to complete
503        for handle in handles {
504            handle.join().expect("GPU thread panicked");
505        }
506
507        let mut final_results = results
508            .lock()
509            .expect("Results mutex poisoned - a GPU thread panicked")
510            .clone();
511        final_results.sort_by(|a, b| {
512            a.energy
513                .partial_cmp(&b.energy)
514                .unwrap_or(std::cmp::Ordering::Equal)
515        });
516
517        Ok(final_results)
518    }
519
520    /// Clone sampler with different device
521    fn clone_with_device(&self, device_id: usize) -> Self {
522        Self {
523            device_id,
524            ..self.clone()
525        }
526    }
527
528    #[cfg(not(feature = "scirs"))]
529    fn run_multi_gpu(
530        &self,
531        qubo: &Array<f64, Ix2>,
532        var_map: &HashMap<String, usize>,
533        shots: usize,
534    ) -> SamplerResult<Vec<SampleResult>> {
535        self.run_gpu_optimized(qubo, var_map, shots)
536    }
537}
538
539// Make sampler cloneable
540impl Clone for EnhancedArminSampler {
541    fn clone(&self) -> Self {
542        Self {
543            seed: self.seed,
544            device_id: self.device_id,
545            batch_size: self.batch_size,
546            initial_temp: self.initial_temp,
547            final_temp: self.final_temp,
548            sweeps: self.sweeps,
549            multi_gpu: self.multi_gpu,
550            memory_pool_mb: self.memory_pool_mb,
551            async_mode: self.async_mode,
552            use_mixed_precision: self.use_mixed_precision,
553            verbose: self.verbose,
554        }
555    }
556}
557
558/// GPU-accelerated HOBO sampler (MIKASA)
559pub struct MIKASAmpler {
560    /// Base configuration from ArminSampler
561    base_config: EnhancedArminSampler,
562    /// Tensor decomposition rank
563    decomposition_rank: usize,
564    /// Use CP decomposition
565    use_cp_decomposition: bool,
566    /// Tensor contraction order optimization
567    optimize_contraction: bool,
568}
569
570impl MIKASAmpler {
571    /// Create new MIKASA sampler for HOBO problems
572    pub const fn new(device_id: usize) -> Self {
573        Self {
574            base_config: EnhancedArminSampler::new(device_id),
575            decomposition_rank: 50,
576            use_cp_decomposition: true,
577            optimize_contraction: true,
578        }
579    }
580
581    /// Set tensor decomposition rank
582    pub const fn with_rank(mut self, rank: usize) -> Self {
583        self.decomposition_rank = rank;
584        self
585    }
586
587    /// Enable/disable CP decomposition
588    pub const fn with_cp_decomposition(mut self, enable: bool) -> Self {
589        self.use_cp_decomposition = enable;
590        self
591    }
592}
593
594impl Sampler for MIKASAmpler {
595    fn run_qubo(
596        &self,
597        qubo: &(Array<f64, Ix2>, HashMap<String, usize>),
598        shots: usize,
599    ) -> SamplerResult<Vec<SampleResult>> {
600        // Delegate to base sampler for QUBO
601        self.base_config.run_qubo(qubo, shots)
602    }
603
604    fn run_hobo(
605        &self,
606        hobo: &(ArrayD<f64>, HashMap<String, usize>),
607        shots: usize,
608    ) -> SamplerResult<Vec<SampleResult>> {
609        let (tensor, var_map) = hobo;
610
611        // Apply tensor decomposition for efficient GPU computation
612        #[cfg(feature = "scirs")]
613        {
614            self.run_hobo_gpu(tensor, var_map, shots)
615        }
616
617        #[cfg(not(feature = "scirs"))]
618        {
619            Err(SamplerError::GpuError(
620                "HOBO GPU acceleration requires SciRS2 feature".to_string(),
621            ))
622        }
623    }
624}
625
626impl MIKASAmpler {
627    #[cfg(feature = "scirs")]
628    fn run_hobo_gpu(
629        &self,
630        tensor: &ArrayD<f64>,
631        var_map: &HashMap<String, usize>,
632        shots: usize,
633    ) -> SamplerResult<Vec<SampleResult>> {
634        // Stub tensor contraction functionality
635        use scirs2_core::ndarray::{Array, IxDyn};
636        let cp_decomposition = |_: &ArrayD<f64>| -> Result<
637            (Vec<usize>, Vec<Array<f64, IxDyn>>, f64),
638            Box<dyn std::error::Error>,
639        > { Ok((vec![], vec![Array::zeros(IxDyn(&[1]))], 0.0f64)) };
640        let optimize_contraction_order = |_: &[usize]| -> Vec<usize> { vec![] };
641
642        let n_vars = var_map.len();
643        let order = tensor.ndim();
644
645        if self.base_config.verbose {
646            println!("Processing {order}-order tensor with {n_vars} variables");
647        }
648
649        // Apply tensor decomposition if beneficial
650        if self.use_cp_decomposition && order > 2 {
651            // Perform CP decomposition
652            let (factors, core_tensors, reconstruction_error) = cp_decomposition(tensor)
653                .map_err(|e| SamplerError::GpuError(format!("CP decomposition failed: {e}")))?;
654
655            let decomposed = DecomposedTensor {
656                factors,
657                core_tensors,
658                reconstruction_error,
659            };
660
661            if self.base_config.verbose {
662                println!("Decomposed tensor to rank {}", self.decomposition_rank);
663            }
664
665            // Run GPU sampling on decomposed form
666            self.run_decomposed_hobo_gpu(decomposed, var_map, shots)
667        } else {
668            // Direct GPU computation for low-order tensors
669            self.run_direct_hobo_gpu(tensor, var_map, shots)
670        }
671    }
672
673    #[cfg(feature = "scirs")]
674    fn run_decomposed_hobo_gpu(
675        &self,
676        decomposed: DecomposedTensor,
677        var_map: &HashMap<String, usize>,
678        shots: usize,
679    ) -> SamplerResult<Vec<SampleResult>> {
680        // Placeholder for decomposed tensor GPU implementation
681        Err(SamplerError::InvalidParameter(
682            "Decomposed HOBO GPU sampling not yet implemented".to_string(),
683        ))
684    }
685
686    #[cfg(feature = "scirs")]
687    fn run_direct_hobo_gpu(
688        &self,
689        tensor: &ArrayD<f64>,
690        var_map: &HashMap<String, usize>,
691        shots: usize,
692    ) -> SamplerResult<Vec<SampleResult>> {
693        // Placeholder for direct tensor GPU implementation
694        Err(SamplerError::InvalidParameter(
695            "Direct HOBO GPU sampling not yet implemented".to_string(),
696        ))
697    }
698}
699
/// Placeholder for the result of a CP tensor decomposition.
#[cfg(feature = "scirs")]
#[derive(Debug, Clone)]
struct DecomposedTensor {
    /// First component returned by the decomposition routine.
    factors: Vec<usize>,
    /// Tensors returned by the decomposition routine.
    core_tensors: Vec<Array<f64, IxDyn>>,
    /// Reconstruction error reported by the decomposition routine.
    reconstruction_error: f64,
}
708
709/// Asynchronous GPU sampling pipeline
710pub struct AsyncGpuPipeline {
711    /// Number of pipeline stages
712    num_stages: usize,
713    /// Queue depth per stage
714    queue_depth: usize,
715    /// Base sampler
716    sampler: EnhancedArminSampler,
717}
718
719impl AsyncGpuPipeline {
720    /// Create new asynchronous pipeline
721    pub const fn new(sampler: EnhancedArminSampler) -> Self {
722        Self {
723            num_stages: 3,
724            queue_depth: 4,
725            sampler,
726        }
727    }
728
729    /// Run pipelined sampling
730    pub fn run_pipelined(
731        &self,
732        qubo: &Array<f64, Ix2>,
733        var_map: &HashMap<String, usize>,
734        shots: usize,
735    ) -> SamplerResult<Vec<SampleResult>> {
736        // Pipeline stages:
737        // 1. Initialize states on GPU
738        // 2. Run annealing kernels
739        // 3. Transfer results back
740
741        // This would implement overlapped execution of the three stages
742        // for maximum throughput
743
744        self.sampler
745            .run_qubo(&(qubo.clone(), var_map.clone()), shots)
746    }
747}
748
#[cfg(test)]
mod tests {
    #[cfg(feature = "scirs")]
    use super::EnhancedArminSampler;
    // Only the feature-gated test calls trait methods, so the import is gated as well;
    // otherwise it is an unused import when `scirs` is disabled.
    #[cfg(feature = "scirs")]
    use crate::sampler::Sampler;
    #[cfg(feature = "scirs")]
    use scirs2_core::ndarray::Array;
    #[cfg(feature = "scirs")]
    use std::collections::HashMap;

    /// Runs a 3-variable QUBO through the sampler and checks the results come back
    /// non-empty and sorted by ascending energy.
    #[test]
    #[cfg(feature = "scirs")]
    fn test_enhanced_armin_sampler() {
        let sampler = EnhancedArminSampler::new(0)
            .with_batch_size(256)
            .with_sweeps(100);

        // Create small QUBO problem
        let mut qubo = Array::zeros((3, 3));
        qubo[[0, 0]] = -1.0;
        qubo[[1, 1]] = -1.0;
        qubo[[2, 2]] = -1.0;
        qubo[[0, 1]] = 2.0;
        qubo[[1, 0]] = 2.0;

        let var_map: HashMap<String, usize> = [("x0", 0), ("x1", 1), ("x2", 2)]
            .into_iter()
            .map(|(name, index)| (name.to_string(), index))
            .collect();

        // Run sampler
        match sampler.run_qubo(&(qubo, var_map), 10) {
            Ok(results) => {
                assert!(!results.is_empty());
                // Check that results are sorted by energy
                assert!(results
                    .windows(2)
                    .all(|pair| pair[0].energy <= pair[1].energy));
            }
            Err(e) => {
                // GPU might not be available in test environment
                println!("GPU test skipped: {}", e);
            }
        }
    }
}