quantrs2_tytan/gpu_samplers.rs

//! GPU-accelerated samplers with SciRS2 integration.
//!
//! This module provides high-performance GPU samplers for solving QUBO and HOBO problems
//! using CUDA kernels via SciRS2, with support for multi-GPU distributed sampling.
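//!
//! An illustrative usage sketch (not compiled here; `qubo_matrix` and `var_map` stand in
//! for a problem built elsewhere, and a CUDA-capable device 0 plus the `scirs` feature
//! are assumed):
//!
//! ```ignore
//! let sampler = EnhancedArminSampler::new(0)
//!     .with_batch_size(512)
//!     .with_sweeps(2_000)
//!     .with_temperature(10.0, 0.01);
//!
//! // `qubo_matrix: Array<f64, Ix2>`, `var_map: HashMap<String, usize>`
//! let results = sampler.run_qubo(&(qubo_matrix, var_map), 1_000).expect("sampling failed");
//! println!("best energy: {}", results[0].energy);
//! ```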

#![allow(dead_code)]

use crate::sampler::{SampleResult, Sampler, SamplerError, SamplerResult};
use scirs2_core::ndarray::{Array, ArrayD, Ix2, IxDyn};
use scirs2_core::random::{thread_rng, Rng};
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

#[cfg(feature = "scirs")]
use scirs2_core::gpu;

// Stubs for missing GPU functionality
#[cfg(feature = "scirs")]
const fn get_device_count() -> usize {
    // Placeholder
    1
}

#[cfg(feature = "scirs")]
struct GpuContext;

#[cfg(feature = "scirs")]
struct DeviceInfo {
    memory_mb: usize,
    compute_units: usize,
}

#[cfg(feature = "scirs")]
impl GpuContext {
    fn new(_device_id: u32) -> Result<Self, Box<dyn std::error::Error>> {
        Ok(Self)
    }

    const fn get_device_info(&self) -> DeviceInfo {
        DeviceInfo {
            memory_mb: 8192,
            compute_units: 64,
        }
    }

    fn allocate_memory_pool(&self, _size: usize) -> Result<(), Box<dyn std::error::Error>> {
        Ok(())
    }

    fn allocate<T>(&self, _count: usize) -> Result<GpuBuffer<T>, Box<dyn std::error::Error>> {
        Ok(GpuBuffer::new())
    }

    fn init_random_states(
        &self,
        _buffer: &GpuBuffer<u8>,
        _seed: u64,
    ) -> Result<(), Box<dyn std::error::Error>> {
        Ok(())
    }

    fn launch_kernel(
        &self,
        _name: &str,
        _grid: usize,
        _block: usize,
        _args: &[KernelArg],
    ) -> Result<(), Box<dyn std::error::Error>> {
        Ok(())
    }

    fn synchronize(&self) -> Result<(), Box<dyn std::error::Error>> {
        Ok(())
    }
}

#[cfg(feature = "scirs")]
struct GpuMatrix;

#[cfg(feature = "scirs")]
struct GpuBuffer<T> {
    _phantom: std::marker::PhantomData<T>,
}

#[cfg(feature = "scirs")]
impl<T> GpuBuffer<T> {
    const fn new() -> Self {
        Self {
            _phantom: std::marker::PhantomData,
        }
    }

    fn copy_to_host(&self, _host_data: &mut [T]) -> Result<(), Box<dyn std::error::Error>> {
        Ok(())
    }

    const fn as_kernel_arg(&self) -> KernelArg {
        KernelArg::Buffer
    }
}

#[cfg(feature = "scirs")]
enum KernelArg {
    Buffer,
    Scalar(f32),
    Integer(i32),
}

#[cfg(feature = "scirs")]
impl GpuMatrix {
    fn from_host_mixed(
        _ctx: &GpuContext,
        _matrix: &Array<f64, Ix2>,
    ) -> Result<Self, Box<dyn std::error::Error>> {
        Ok(Self)
    }

    fn from_host(
        _ctx: &GpuContext,
        _matrix: &Array<f64, Ix2>,
    ) -> Result<Self, Box<dyn std::error::Error>> {
        Ok(Self)
    }

    const fn as_kernel_arg(&self) -> KernelArg {
        KernelArg::Buffer
    }
}

/// GPU-accelerated sampler with CUDA kernels via SciRS2
pub struct EnhancedArminSampler {
    /// Random seed for reproducibility
    seed: Option<u64>,
    /// GPU device ID
    device_id: usize,
    /// Number of parallel runs per batch
    batch_size: usize,
    /// Temperature schedule parameters
    initial_temp: f64,
    final_temp: f64,
    /// Number of sweeps per run
    sweeps: usize,
    /// Enable multi-GPU distribution
    multi_gpu: bool,
    /// Memory pool size in MB
    memory_pool_mb: usize,
    /// Enable asynchronous execution
    async_mode: bool,
    /// Mixed precision computation
    use_mixed_precision: bool,
    /// Verbose output
    verbose: bool,
}

impl EnhancedArminSampler {
    /// Create a new enhanced GPU sampler
    pub const fn new(device_id: usize) -> Self {
        Self {
            seed: None,
            device_id,
            batch_size: 1024,
            initial_temp: 10.0,
            final_temp: 0.01,
            sweeps: 1000,
            multi_gpu: false,
            memory_pool_mb: 1024,
            async_mode: true,
            use_mixed_precision: true,
            verbose: false,
        }
    }

    /// Enable multi-GPU mode
    pub const fn with_multi_gpu(mut self, enable: bool) -> Self {
        self.multi_gpu = enable;
        self
    }

    /// Set batch size for parallel runs
    pub const fn with_batch_size(mut self, size: usize) -> Self {
        self.batch_size = size;
        self
    }

    /// Set temperature schedule
    pub const fn with_temperature(mut self, initial: f64, final_: f64) -> Self {
        self.initial_temp = initial;
        self.final_temp = final_;
        self
    }

    /// Set number of sweeps
    pub const fn with_sweeps(mut self, sweeps: usize) -> Self {
        self.sweeps = sweeps;
        self
    }

    /// Set memory pool size
    pub const fn with_memory_pool(mut self, size_mb: usize) -> Self {
        self.memory_pool_mb = size_mb;
        self
    }

    /// Enable mixed precision computation
    pub const fn with_mixed_precision(mut self, enable: bool) -> Self {
        self.use_mixed_precision = enable;
        self
    }
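
    // NOTE: the two setters below are not in the original file; they are minimal
    // sketches exposing the existing `seed` and `verbose` fields, which the builder
    // methods above do not otherwise cover.

    /// Set the random seed for reproducible sampling (illustrative addition)
    pub const fn with_seed(mut self, seed: u64) -> Self {
        self.seed = Some(seed);
        self
    }

    /// Enable verbose progress output (illustrative addition)
    pub const fn with_verbose(mut self, enable: bool) -> Self {
        self.verbose = enable;
        self
    }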

    /// Run GPU annealing with optimized kernels
    #[cfg(feature = "scirs")]
    fn run_gpu_optimized(
        &self,
        qubo: &Array<f64, Ix2>,
        var_map: &HashMap<String, usize>,
        shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        let n_vars = var_map.len();

        // Initialize GPU context
        let device_id_u32: u32 = self.device_id.try_into().map_err(|_| {
            SamplerError::InvalidParameter(format!(
                "Device ID {} is too large for u32",
                self.device_id
            ))
        })?;
        let ctx = GpuContext::new(device_id_u32)
            .map_err(|e| SamplerError::GpuError(format!("Failed to initialize GPU: {e}")))?;

        if self.verbose {
            let info = ctx.get_device_info();
            println!(
                "GPU Device: {} MB memory, {} compute units",
                info.memory_mb, info.compute_units
            );
        }

        // Allocate memory pool
        ctx.allocate_memory_pool(self.memory_pool_mb * 1024 * 1024)
            .map_err(|e| SamplerError::GpuError(format!("Memory pool allocation failed: {e}")))?;

        // Transfer QUBO matrix to GPU
        let gpu_qubo = if self.use_mixed_precision {
            // Convert to FP16 for mixed precision
            GpuMatrix::from_host_mixed(&ctx, qubo)
                .map_err(|e| SamplerError::GpuError(format!("Matrix transfer failed: {e}")))?
        } else {
            GpuMatrix::from_host(&ctx, qubo)
                .map_err(|e| SamplerError::GpuError(format!("Matrix transfer failed: {e}")))?
        };

        // Run annealing in batches
        let mut all_results = Vec::new();
        let num_batches = shots.div_ceil(self.batch_size);

        for batch in 0..num_batches {
            let batch_size = std::cmp::min(self.batch_size, shots - batch * self.batch_size);

            if self.verbose {
                println!(
                    "Processing batch {}/{} ({} samples)",
                    batch + 1,
                    num_batches,
                    batch_size
                );
            }

            // Launch CUDA kernel for parallel tempering
            let states = self.launch_annealing_kernel(&ctx, &gpu_qubo, n_vars, batch_size)?;

            // Convert GPU results to SampleResult
            let batch_results = self.process_gpu_results(states, var_map)?;
            all_results.extend(batch_results);
        }

        // Sort by energy
        all_results.sort_by(|a, b| {
            a.energy
                .partial_cmp(&b.energy)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        Ok(all_results)
    }

    /// Launch optimized CUDA annealing kernel
    #[cfg(feature = "scirs")]
    fn launch_annealing_kernel(
        &self,
        ctx: &GpuContext,
        gpu_qubo: &GpuMatrix,
        n_vars: usize,
        batch_size: usize,
    ) -> SamplerResult<Vec<Vec<bool>>> {
        // CUDA kernel parameters
        let block_size = 256;
        let grid_size = batch_size.div_ceil(block_size);

        // Allocate device memory for states
        let states_size = batch_size * n_vars;
        let d_states = ctx
            .allocate::<u8>(states_size)
            .map_err(|e| SamplerError::GpuError(format!("State allocation failed: {e}")))?;

        // Allocate device memory for energies
        let d_energies = ctx
            .allocate::<f32>(batch_size)
            .map_err(|e| SamplerError::GpuError(format!("Energy allocation failed: {e}")))?;

        // Initialize random states on GPU
        ctx.init_random_states(&d_states, self.seed.unwrap_or_else(|| thread_rng().gen()))
            .map_err(|e| SamplerError::GpuError(format!("Random init failed: {e}")))?;

        // Launch parallel tempering kernel
        let kernel_name = if self.use_mixed_precision {
            "parallel_tempering_mixed_precision"
        } else {
            "parallel_tempering_fp32"
        };

        ctx.launch_kernel(
            kernel_name,
            grid_size,
            block_size,
            &[
                gpu_qubo.as_kernel_arg(),
                d_states.as_kernel_arg(),
                d_energies.as_kernel_arg(),
                KernelArg::Integer(n_vars as i32),
                KernelArg::Integer(batch_size as i32),
                KernelArg::Scalar(self.initial_temp as f32),
                KernelArg::Scalar(self.final_temp as f32),
                KernelArg::Integer(self.sweeps as i32),
            ],
        )
        .map_err(|e| SamplerError::GpuError(format!("Kernel launch failed: {e}")))?;

        // Synchronize if not in async mode
        if !self.async_mode {
            ctx.synchronize()
                .map_err(|e| SamplerError::GpuError(format!("Synchronization failed: {e}")))?;
        }

        // Copy results back to host
        let mut host_states = vec![0u8; states_size];
        d_states
            .copy_to_host(&mut host_states)
            .map_err(|e| SamplerError::GpuError(format!("Result transfer failed: {e}")))?;

        // Convert to boolean vectors
        let mut results = Vec::new();
        for i in 0..batch_size {
            let start = i * n_vars;
            let end = start + n_vars;
            let state: Vec<bool> = host_states[start..end].iter().map(|&x| x != 0).collect();
            results.push(state);
        }

        Ok(results)
    }

    /// Process GPU results into SampleResult format
    fn process_gpu_results(
        &self,
        states: Vec<Vec<bool>>,
        var_map: &HashMap<String, usize>,
    ) -> SamplerResult<Vec<SampleResult>> {
        let idx_to_var: HashMap<usize, String> = var_map
            .iter()
            .map(|(var, &idx)| (idx, var.clone()))
            .collect();

        let mut results = Vec::new();

        for state in states {
            // Create variable assignments
            let mut assignments: HashMap<String, bool> = HashMap::new();
            for (idx, &value) in state.iter().enumerate() {
                let var_name = idx_to_var.get(&idx).ok_or_else(|| {
                    SamplerError::InvalidParameter(format!(
                        "Variable index {idx} not found in variable map"
                    ))
                })?;
                assignments.insert(var_name.clone(), value);
            }

            // Energy will be calculated on GPU in real implementation
            let energy = 0.0; // Placeholder

            results.push(SampleResult {
                assignments,
                energy,
                occurrences: 1,
            });
        }

        Ok(results)
    }
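
    // The kernel stubs above do not return per-sample energies yet, so
    // `process_gpu_results` stores a placeholder energy of 0.0. The helper below is a
    // minimal host-side sketch (not part of the original file and not wired into the
    // pipeline) of how the QUBO energy x^T Q x could be evaluated for a returned state.
    fn compute_qubo_energy_host(state: &[bool], qubo: &Array<f64, Ix2>) -> f64 {
        let mut energy = 0.0;
        for (i, &xi) in state.iter().enumerate() {
            if !xi {
                continue;
            }
            for (j, &xj) in state.iter().enumerate() {
                if xj {
                    energy += qubo[[i, j]];
                }
            }
        }
        energy
    }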

    /// Fallback implementation when SciRS2 is not available
    #[cfg(not(feature = "scirs"))]
    fn run_gpu_optimized(
        &self,
        _qubo: &Array<f64, Ix2>,
        _var_map: &HashMap<String, usize>,
        _shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        Err(SamplerError::GpuError(
            "GPU acceleration requires SciRS2 feature".to_string(),
        ))
    }
}

impl Sampler for EnhancedArminSampler {
    fn run_qubo(
        &self,
        qubo: &(Array<f64, Ix2>, HashMap<String, usize>),
        shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        let (matrix, var_map) = qubo;

        if self.multi_gpu {
            self.run_multi_gpu(matrix, var_map, shots)
        } else {
            self.run_gpu_optimized(matrix, var_map, shots)
        }
    }

    fn run_hobo(
        &self,
        _hobo: &(ArrayD<f64>, HashMap<String, usize>),
        _shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        // HOBO problems require tensor decomposition, which is handled by MIKASAmpler
        Err(SamplerError::InvalidParameter(
            "Use MIKASAmpler for HOBO problems".to_string(),
        ))
    }
}

impl EnhancedArminSampler {
    /// Run sampling across multiple GPUs
    #[cfg(feature = "scirs")]
    fn run_multi_gpu(
        &self,
        qubo: &Array<f64, Ix2>,
        var_map: &HashMap<String, usize>,
        shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        let num_gpus = get_device_count();

        if num_gpus <= 1 {
            return self.run_gpu_optimized(qubo, var_map, shots);
        }

        if self.verbose {
            println!("Using {num_gpus} GPUs for distributed sampling");
        }

        // Distribute shots across GPUs
        let shots_per_gpu = shots / num_gpus;
        let remainder = shots % num_gpus;
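        // The first `remainder` GPUs take one extra shot each, e.g. 10 shots across
        // 3 GPUs are distributed as 4 / 3 / 3.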

        let results = Arc::new(Mutex::new(Vec::new()));
        let mut handles = Vec::new();

        // Launch sampling on each GPU in parallel
        for gpu_id in 0..num_gpus {
            let gpu_shots = if gpu_id < remainder {
                shots_per_gpu + 1
            } else {
                shots_per_gpu
            };

            let qubo_clone = qubo.clone();
            let var_map_clone = var_map.clone();
            let results_clone = Arc::clone(&results);
            let sampler = self.clone_with_device(gpu_id);

            let handle = std::thread::spawn(move || {
                match sampler.run_gpu_optimized(&qubo_clone, &var_map_clone, gpu_shots) {
                    Ok(gpu_results) => {
                        let mut all_results = results_clone
                            .lock()
                            .expect("Results mutex poisoned - a GPU thread panicked");
                        all_results.extend(gpu_results);
                    }
                    Err(e) => {
                        eprintln!("GPU {gpu_id} failed: {e}");
                    }
                }
            });

            handles.push(handle);
        }

        // Wait for all GPUs to complete
        for handle in handles {
            handle.join().expect("GPU thread panicked");
        }

        let mut final_results = results
            .lock()
            .expect("Results mutex poisoned - a GPU thread panicked")
            .clone();
        final_results.sort_by(|a, b| {
            a.energy
                .partial_cmp(&b.energy)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        Ok(final_results)
    }

    /// Clone sampler with different device
    fn clone_with_device(&self, device_id: usize) -> Self {
        Self {
            device_id,
            ..self.clone()
        }
    }

    #[cfg(not(feature = "scirs"))]
    fn run_multi_gpu(
        &self,
        qubo: &Array<f64, Ix2>,
        var_map: &HashMap<String, usize>,
        shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        self.run_gpu_optimized(qubo, var_map, shots)
    }
}

// Make sampler cloneable
impl Clone for EnhancedArminSampler {
    fn clone(&self) -> Self {
        Self {
            seed: self.seed,
            device_id: self.device_id,
            batch_size: self.batch_size,
            initial_temp: self.initial_temp,
            final_temp: self.final_temp,
            sweeps: self.sweeps,
            multi_gpu: self.multi_gpu,
            memory_pool_mb: self.memory_pool_mb,
            async_mode: self.async_mode,
            use_mixed_precision: self.use_mixed_precision,
            verbose: self.verbose,
        }
    }
}

/// GPU-accelerated HOBO sampler (MIKASA)
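///
/// Illustrative configuration sketch (not compiled here); the builder methods used
/// below are defined in this module:
///
/// ```ignore
/// let sampler = MIKASAmpler::new(0)
///     .with_rank(32)
///     .with_cp_decomposition(true);
/// // HOBO tensors go through `run_hobo`; QUBO inputs are delegated to the base sampler.
/// ```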
pub struct MIKASAmpler {
    /// Base configuration (an EnhancedArminSampler)
    base_config: EnhancedArminSampler,
    /// Tensor decomposition rank
    decomposition_rank: usize,
    /// Use CP decomposition
    use_cp_decomposition: bool,
    /// Tensor contraction order optimization
    optimize_contraction: bool,
}

impl MIKASAmpler {
    /// Create a new MIKASA sampler for HOBO problems
    pub const fn new(device_id: usize) -> Self {
        Self {
            base_config: EnhancedArminSampler::new(device_id),
            decomposition_rank: 50,
            use_cp_decomposition: true,
            optimize_contraction: true,
        }
    }

    /// Set tensor decomposition rank
    pub const fn with_rank(mut self, rank: usize) -> Self {
        self.decomposition_rank = rank;
        self
    }

    /// Enable/disable CP decomposition
    pub const fn with_cp_decomposition(mut self, enable: bool) -> Self {
        self.use_cp_decomposition = enable;
        self
    }
}

impl Sampler for MIKASAmpler {
    fn run_qubo(
        &self,
        qubo: &(Array<f64, Ix2>, HashMap<String, usize>),
        shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        // Delegate to base sampler for QUBO
        self.base_config.run_qubo(qubo, shots)
    }

    fn run_hobo(
        &self,
        hobo: &(ArrayD<f64>, HashMap<String, usize>),
        shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        let (tensor, var_map) = hobo;

        // Apply tensor decomposition for efficient GPU computation
        #[cfg(feature = "scirs")]
        {
            self.run_hobo_gpu(tensor, var_map, shots)
        }

        #[cfg(not(feature = "scirs"))]
        {
            Err(SamplerError::GpuError(
                "HOBO GPU acceleration requires SciRS2 feature".to_string(),
            ))
        }
    }
}

impl MIKASAmpler {
    #[cfg(feature = "scirs")]
    fn run_hobo_gpu(
        &self,
        tensor: &ArrayD<f64>,
        var_map: &HashMap<String, usize>,
        shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        // Stub tensor contraction functionality
        let cp_decomposition = |_: &ArrayD<f64>| -> Result<
            (Vec<usize>, Vec<Array<f64, IxDyn>>, f64),
            Box<dyn std::error::Error>,
        > { Ok((vec![], vec![Array::zeros(IxDyn(&[1]))], 0.0f64)) };
        let _optimize_contraction_order = |_: &[usize]| -> Vec<usize> { vec![] };

        let n_vars = var_map.len();
        let order = tensor.ndim();

        if self.base_config.verbose {
            println!("Processing {order}-order tensor with {n_vars} variables");
        }

        // Apply tensor decomposition if beneficial
        if self.use_cp_decomposition && order > 2 {
            // Perform CP decomposition
            let (factors, core_tensors, reconstruction_error) = cp_decomposition(tensor)
                .map_err(|e| SamplerError::GpuError(format!("CP decomposition failed: {e}")))?;

            let decomposed = DecomposedTensor {
                factors,
                core_tensors,
                reconstruction_error,
            };

            if self.base_config.verbose {
                println!("Decomposed tensor to rank {}", self.decomposition_rank);
            }

            // Run GPU sampling on decomposed form
            self.run_decomposed_hobo_gpu(decomposed, var_map, shots)
        } else {
            // Direct GPU computation for low-order tensors
            self.run_direct_hobo_gpu(tensor, var_map, shots)
        }
    }

    #[cfg(feature = "scirs")]
    fn run_decomposed_hobo_gpu(
        &self,
        _decomposed: DecomposedTensor,
        _var_map: &HashMap<String, usize>,
        _shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        // Placeholder for decomposed tensor GPU implementation
        Err(SamplerError::InvalidParameter(
            "Decomposed HOBO GPU sampling not yet implemented".to_string(),
        ))
    }

    #[cfg(feature = "scirs")]
    fn run_direct_hobo_gpu(
        &self,
        _tensor: &ArrayD<f64>,
        _var_map: &HashMap<String, usize>,
        _shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        // Placeholder for direct tensor GPU implementation
        Err(SamplerError::InvalidParameter(
            "Direct HOBO GPU sampling not yet implemented".to_string(),
        ))
    }
}

// Placeholder for decomposed tensor type
#[cfg(feature = "scirs")]
struct DecomposedTensor {
    // CP decomposition components
    factors: Vec<usize>,
    core_tensors: Vec<Array<f64, IxDyn>>,
    reconstruction_error: f64,
}

/// Asynchronous GPU sampling pipeline
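///
/// Illustrative usage sketch (not compiled here; `qubo_matrix` and `var_map` stand in
/// for a problem built elsewhere). Overlapped stage execution is not yet implemented,
/// so `run_pipelined` currently delegates to the wrapped sampler:
///
/// ```ignore
/// let pipeline = AsyncGpuPipeline::new(EnhancedArminSampler::new(0));
/// let results = pipeline.run_pipelined(&qubo_matrix, &var_map, 1_000).expect("sampling failed");
/// ```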
pub struct AsyncGpuPipeline {
    /// Number of pipeline stages
    num_stages: usize,
    /// Queue depth per stage
    queue_depth: usize,
    /// Base sampler
    sampler: EnhancedArminSampler,
}

impl AsyncGpuPipeline {
    /// Create a new asynchronous pipeline
    pub const fn new(sampler: EnhancedArminSampler) -> Self {
        Self {
            num_stages: 3,
            queue_depth: 4,
            sampler,
        }
    }

    /// Run pipelined sampling
    pub fn run_pipelined(
        &self,
        qubo: &Array<f64, Ix2>,
        var_map: &HashMap<String, usize>,
        shots: usize,
    ) -> SamplerResult<Vec<SampleResult>> {
        // Pipeline stages:
        // 1. Initialize states on GPU
        // 2. Run annealing kernels
        // 3. Transfer results back

        // Overlapped execution of the three stages for maximum throughput is not yet
        // implemented; for now, delegate to the base sampler.
        self.sampler
            .run_qubo(&(qubo.clone(), var_map.clone()), shots)
    }
}

#[cfg(test)]
mod tests {
    #[cfg(feature = "scirs")]
    use super::EnhancedArminSampler;
    #[cfg(feature = "scirs")]
    use crate::sampler::Sampler;
    #[cfg(feature = "scirs")]
    use scirs2_core::ndarray::Array;
    #[cfg(feature = "scirs")]
    use std::collections::HashMap;

    #[test]
    #[cfg(feature = "scirs")]
    fn test_enhanced_armin_sampler() {
        let sampler = EnhancedArminSampler::new(0)
            .with_batch_size(256)
            .with_sweeps(100);

        // Create small QUBO problem
        let mut qubo = Array::zeros((3, 3));
        qubo[[0, 0]] = -1.0;
        qubo[[1, 1]] = -1.0;
        qubo[[2, 2]] = -1.0;
        qubo[[0, 1]] = 2.0;
        qubo[[1, 0]] = 2.0;

        let mut var_map = HashMap::new();
        var_map.insert("x0".to_string(), 0);
        var_map.insert("x1".to_string(), 1);
        var_map.insert("x2".to_string(), 2);

        // Run sampler
        match sampler.run_qubo(&(qubo, var_map), 10) {
            Ok(results) => {
                assert!(!results.is_empty());
                // Check that results are sorted by energy
                for i in 1..results.len() {
                    assert!(results[i - 1].energy <= results[i].energy);
                }
            }
            Err(e) => {
                // GPU might not be available in test environment
                println!("GPU test skipped: {e}");
            }
        }
    }
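
    // Additional test (not in the original file): exercises the builder defaults and
    // overrides without touching the GPU path, as a sanity check on configuration
    // plumbing. It compiles with or without the `scirs` feature.
    #[test]
    fn test_builder_configuration() {
        let sampler = super::EnhancedArminSampler::new(1)
            .with_batch_size(64)
            .with_sweeps(10)
            .with_temperature(5.0, 0.1)
            .with_memory_pool(256)
            .with_mixed_precision(false)
            .with_multi_gpu(true);

        assert_eq!(sampler.device_id, 1);
        assert_eq!(sampler.batch_size, 64);
        assert_eq!(sampler.sweeps, 10);
        assert_eq!(sampler.initial_temp, 5.0);
        assert_eq!(sampler.final_temp, 0.1);
        assert_eq!(sampler.memory_pool_mb, 256);
        assert!(!sampler.use_mixed_precision);
        assert!(sampler.multi_gpu);
    }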
}