quantrs2-sim 0.1.3

//! # AutoParallelEngine - caching Methods
//!
//! This module contains method implementations for `AutoParallelEngine`.
//!
//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)

use crate::distributed_simulator::{DistributedQuantumSimulator, DistributedSimulatorConfig};
use crate::large_scale_simulator::{LargeScaleQuantumSimulator, LargeScaleSimulatorConfig};
use quantrs2_circuit::builder::{Circuit, Simulator};
use quantrs2_core::{
    error::{QuantRS2Error, QuantRS2Result},
    gate::GateOp,
    qubit::QubitId,
};
use scirs2_core::parallel_ops::{current_num_threads, IndexedParallelIterator, ParallelIterator};
use scirs2_core::Complex64;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet, VecDeque};
use std::sync::{Arc, Barrier, Mutex, RwLock};
use uuid::Uuid;

use super::types::{
    AutoParallelConfig, DependencyGraph, GateNode, HardwareCharacteristics, LoadBalancer,
    OptimizationRecommendation, ParallelPerformanceStats, ParallelTask, RecommendationComplexity,
    RecommendationType, TaskPriority,
};

use super::autoparallelengine_type::AutoParallelEngine;

impl AutoParallelEngine {
    /// Create a new automatic parallelization engine
    #[must_use]
    pub fn new(config: AutoParallelConfig) -> Self {
        let num_threads = config.max_threads;
        Self {
            config,
            analysis_cache: Arc::new(RwLock::new(HashMap::new())),
            performance_stats: Arc::new(Mutex::new(ParallelPerformanceStats::default())),
            load_balancer: Arc::new(Mutex::new(LoadBalancer::new(num_threads))),
        }
    }
    /// Execute a circuit using automatic parallelization
    pub fn execute_parallel<const N: usize>(
        &self,
        circuit: &Circuit<N>,
        simulator: &mut LargeScaleQuantumSimulator,
    ) -> QuantRS2Result<Vec<Complex64>> {
        let analysis = self.analyze_circuit(circuit)?;
        if analysis.tasks.len() < self.config.min_gates_for_parallel {
            return Self::execute_sequential(circuit, simulator);
        }
        let barrier = Arc::new(Barrier::new(self.config.max_threads));
        let shared_state = Arc::new(RwLock::new(simulator.get_dense_state()?));
        let task_results = Arc::new(Mutex::new(Vec::new()));
        self.execute_parallel_tasks(&analysis.tasks, shared_state.clone(), task_results, barrier)?;
        let final_state = shared_state
            .read()
            .expect("shared state read lock should not be poisoned")
            .clone();
        Ok(final_state)
    }
    /// Execute distributed tasks across nodes
    pub(super) fn execute_distributed_tasks(
        &self,
        distributed_tasks: &[Vec<ParallelTask>],
        distributed_sim: &DistributedQuantumSimulator,
    ) -> QuantRS2Result<Vec<Vec<Complex64>>> {
        use scirs2_core::parallel_ops::{parallel_map, IndexedParallelIterator, ParallelIterator};
        let cluster_status = distributed_sim.get_cluster_status();
        let num_nodes = cluster_status.len();
        let node_results: Vec<Vec<Complex64>> =
            parallel_map(&(0..num_nodes).collect::<Vec<_>>(), |&node_id| {
                let tasks = &distributed_tasks[node_id];
                let mut node_result = Vec::new();
                for task in tasks {
                    let task_result = Self::execute_task_on_node(task, node_id);
                    node_result.extend(task_result);
                }
                node_result
            });
        Ok(node_results)
    }
    /// Execute a single task on a specific node
    pub(super) const fn execute_task_on_node(
        task: &ParallelTask,
        node_id: usize,
    ) -> Vec<Complex64> {
        Vec::new()
    }
    /// Build dependency graph for the circuit
    pub(super) fn build_dependency_graph<const N: usize>(
        &self,
        circuit: &Circuit<N>,
    ) -> QuantRS2Result<DependencyGraph> {
        let gates = circuit.gates();
        let mut nodes = Vec::with_capacity(gates.len());
        let mut edges: HashMap<usize, Vec<usize>> = HashMap::new();
        let mut reverse_edges: HashMap<usize, Vec<usize>> = HashMap::new();
        for (i, gate) in gates.iter().enumerate() {
            let qubits: HashSet<QubitId> = gate.qubits().into_iter().collect();
            let cost = Self::estimate_gate_cost(gate.as_ref());
            nodes.push(GateNode {
                gate_index: i,
                gate: gate.clone(),
                qubits,
                layer: 0,
                cost,
            });
            edges.insert(i, Vec::new());
            reverse_edges.insert(i, Vec::new());
        }
        for i in 0..nodes.len() {
            for j in (i + 1)..nodes.len() {
                if !nodes[i].qubits.is_disjoint(&nodes[j].qubits) {
                    if let Some(edge_list) = edges.get_mut(&i) {
                        edge_list.push(j);
                    }
                    if let Some(reverse_edge_list) = reverse_edges.get_mut(&j) {
                        reverse_edge_list.push(i);
                    }
                }
            }
        }
        let layers = Self::compute_topological_layers(&nodes, &edges)?;
        for (layer_idx, layer) in layers.iter().enumerate() {
            for &node_idx in layer {
                if let Some(node) = nodes.get_mut(node_idx) {
                    node.layer = layer_idx;
                }
            }
        }
        Ok(DependencyGraph {
            nodes,
            edges,
            reverse_edges,
            layers,
        })
    }
    /// Compute topological layers for parallel execution
    pub(super) fn compute_topological_layers(
        nodes: &[GateNode],
        edges: &HashMap<usize, Vec<usize>>,
    ) -> QuantRS2Result<Vec<Vec<usize>>> {
        let mut in_degree: HashMap<usize, usize> = HashMap::new();
        let mut layers = Vec::new();
        let mut queue = VecDeque::new();
        for i in 0..nodes.len() {
            in_degree.insert(i, 0);
        }
        for to_list in edges.values() {
            for &to in to_list {
                if let Some(degree) = in_degree.get_mut(&to) {
                    *degree += 1;
                }
            }
        }
        for i in 0..nodes.len() {
            if in_degree[&i] == 0 {
                queue.push_back(i);
            }
        }
        while !queue.is_empty() {
            let mut current_layer = Vec::new();
            let layer_size = queue.len();
            for _ in 0..layer_size {
                if let Some(node) = queue.pop_front() {
                    current_layer.push(node);
                    if let Some(neighbors) = edges.get(&node) {
                        for &neighbor in neighbors {
                            let new_degree = in_degree[&neighbor] - 1;
                            in_degree.insert(neighbor, new_degree);
                            if new_degree == 0 {
                                queue.push_back(neighbor);
                            }
                        }
                    }
                }
            }
            if !current_layer.is_empty() {
                layers.push(current_layer);
            }
        }
        Ok(layers)
    }
    /// Dependency-based parallelization strategy
    pub(super) fn dependency_based_parallelization(
        &self,
        graph: &DependencyGraph,
    ) -> QuantRS2Result<Vec<ParallelTask>> {
        let mut tasks = Vec::new();
        for layer in &graph.layers {
            if layer.len() > 1 {
                let chunks = self.partition_layer_into_tasks(layer, graph)?;
                for chunk in chunks {
                    let task = self.create_parallel_task(chunk, graph)?;
                    tasks.push(task);
                }
            } else {
                if let Some(&gate_idx) = layer.first() {
                    let task = self.create_parallel_task(vec![gate_idx], graph)?;
                    tasks.push(task);
                }
            }
        }
        Ok(tasks)
    }
    /// Layer-based parallelization strategy
    pub(super) fn layer_based_parallelization(
        &self,
        graph: &DependencyGraph,
    ) -> QuantRS2Result<Vec<ParallelTask>> {
        let mut tasks = Vec::new();
        for layer in &graph.layers {
            let max_gates_per_task = self.config.resource_constraints.max_gates_per_thread;
            for chunk in layer.chunks(max_gates_per_task) {
                let task = self.create_parallel_task(chunk.to_vec(), graph)?;
                tasks.push(task);
            }
        }
        Ok(tasks)
    }
    /// Qubit partitioning parallelization strategy
    pub(super) fn qubit_partitioning_parallelization<const N: usize>(
        &self,
        circuit: &Circuit<N>,
        graph: &DependencyGraph,
    ) -> QuantRS2Result<Vec<ParallelTask>> {
        let qubit_partitions = self.partition_qubits(circuit)?;
        let mut tasks = Vec::new();
        for partition in qubit_partitions {
            let mut partition_gates = Vec::new();
            for (i, node) in graph.nodes.iter().enumerate() {
                if node.qubits.iter().all(|q| partition.contains(q)) {
                    partition_gates.push(i);
                }
            }
            if !partition_gates.is_empty() {
                let task = self.create_parallel_task(partition_gates, graph)?;
                tasks.push(task);
            }
        }
        Ok(tasks)
    }
    /// Hybrid parallelization strategy
    pub(super) fn hybrid_parallelization<const N: usize>(
        &self,
        circuit: &Circuit<N>,
        graph: &DependencyGraph,
    ) -> QuantRS2Result<Vec<ParallelTask>> {
        let dependency_tasks = self.dependency_based_parallelization(graph)?;
        let layer_tasks = self.layer_based_parallelization(graph)?;
        let partition_tasks = self.qubit_partitioning_parallelization(circuit, graph)?;
        let strategies = vec![
            ("dependency", dependency_tasks),
            ("layer", layer_tasks),
            ("partition", partition_tasks),
        ];
        let best_strategy = strategies.into_iter().max_by(|(_, tasks_a), (_, tasks_b)| {
            let efficiency_a = Self::calculate_strategy_efficiency(tasks_a);
            let efficiency_b = Self::calculate_strategy_efficiency(tasks_b);
            efficiency_a
                .partial_cmp(&efficiency_b)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        match best_strategy {
            Some((_, tasks)) => Ok(tasks),
            None => Ok(Vec::new()),
        }
    }
    /// Aggressive parallelization for highly independent circuits
    pub(super) fn aggressive_parallelization(
        &self,
        graph: &DependencyGraph,
    ) -> QuantRS2Result<Vec<ParallelTask>> {
        let mut tasks = Vec::new();
        let mut visited = vec![false; graph.nodes.len()];
        for (idx, node) in graph.nodes.iter().enumerate() {
            if visited[idx] {
                continue;
            }
            let mut parallel_group = vec![idx];
            visited[idx] = true;
            for (other_idx, other_node) in graph.nodes.iter().enumerate() {
                if visited[other_idx] {
                    continue;
                }
                if !Self::gates_have_dependency(idx, other_idx, graph)
                    && !Self::gates_share_qubits(&node.qubits, &other_node.qubits)
                {
                    parallel_group.push(other_idx);
                    visited[other_idx] = true;
                }
            }
            if !parallel_group.is_empty() {
                tasks.push(self.create_parallel_task(parallel_group, graph)?);
            }
        }
        Ok(tasks)
    }
    /// Calculate gate type distribution
    pub(super) fn calculate_gate_distribution(
        gates: &[Arc<dyn GateOp + Send + Sync>],
    ) -> HashMap<String, usize> {
        let mut distribution = HashMap::new();
        for gate in gates {
            let gate_type = format!("{gate:?}");
            *distribution.entry(gate_type).or_insert(0) += 1;
        }
        distribution
    }
    /// Merge small tasks together
    pub(super) fn merge_small_tasks(
        &self,
        tasks: Vec<ParallelTask>,
    ) -> QuantRS2Result<Vec<ParallelTask>> {
        let mut merged = Vec::new();
        let mut current_batch = Vec::new();
        let mut current_cost = 0.0;
        const COST_THRESHOLD: f64 = 10.0;
        for task in tasks {
            if task.cost < COST_THRESHOLD {
                current_batch.push(task);
                if let Some(last_task) = current_batch.last() {
                    current_cost += last_task.cost;
                }
                if current_cost >= COST_THRESHOLD {
                    merged.push(Self::merge_task_batch(current_batch)?);
                    current_batch = Vec::new();
                    current_cost = 0.0;
                }
            } else {
                merged.push(task);
            }
        }
        if !current_batch.is_empty() {
            merged.push(Self::merge_task_batch(current_batch)?);
        }
        Ok(merged)
    }
    /// Split large tasks for better parallelism
    pub(super) fn split_large_tasks(tasks: Vec<ParallelTask>) -> QuantRS2Result<Vec<ParallelTask>> {
        let mut split_tasks = Vec::new();
        const COST_THRESHOLD: f64 = 100.0;
        for task in tasks {
            if task.cost > COST_THRESHOLD && task.gates.len() > 4 {
                let mid = task.gates.len() / 2;
                let (gates1, gates2) = task.gates.split_at(mid);
                split_tasks.push(ParallelTask {
                    id: Uuid::new_v4(),
                    gates: gates1.to_vec(),
                    qubits: task.qubits.clone(),
                    cost: task.cost / 2.0,
                    memory_requirement: task.memory_requirement / 2,
                    dependencies: task.dependencies.clone(),
                    priority: task.priority,
                });
                split_tasks.push(ParallelTask {
                    id: Uuid::new_v4(),
                    gates: gates2.to_vec(),
                    qubits: task.qubits.clone(),
                    cost: task.cost / 2.0,
                    memory_requirement: task.memory_requirement / 2,
                    dependencies: HashSet::new(),
                    priority: task.priority,
                });
            } else {
                split_tasks.push(task);
            }
        }
        Ok(split_tasks)
    }
    /// Merge a batch of tasks into one
    pub(super) fn merge_task_batch(batch: Vec<ParallelTask>) -> QuantRS2Result<ParallelTask> {
        let mut merged_gates = Vec::new();
        let mut merged_qubits = HashSet::new();
        let mut merged_cost = 0.0;
        let mut merged_memory = 0;
        let mut merged_deps = HashSet::new();
        let mut max_priority = TaskPriority::Low;
        for task in batch {
            merged_gates.extend(task.gates);
            merged_qubits.extend(task.qubits);
            merged_cost += task.cost;
            merged_memory += task.memory_requirement;
            merged_deps.extend(task.dependencies);
            if task.priority as u8 > max_priority as u8 {
                max_priority = task.priority;
            }
        }
        Ok(ParallelTask {
            id: Uuid::new_v4(),
            gates: merged_gates,
            qubits: merged_qubits,
            cost: merged_cost,
            memory_requirement: merged_memory,
            dependencies: merged_deps,
            priority: max_priority,
        })
    }
    /// Cache-optimized parallelization
    pub(super) fn cache_optimized_parallelization(
        &self,
        graph: &DependencyGraph,
        hw_char: &HardwareCharacteristics,
    ) -> QuantRS2Result<Vec<ParallelTask>> {
        let max_task_size = hw_char.l2_cache_size / (16 * 2);
        let mut tasks = Vec::new();
        let mut current_group = Vec::new();
        let mut current_size = 0;
        for (idx, node) in graph.nodes.iter().enumerate() {
            let gate_size = (1 << node.qubits.len()) * 16;
            if current_size + gate_size > max_task_size && !current_group.is_empty() {
                tasks.push(self.create_parallel_task(current_group, graph)?);
                current_group = Vec::new();
                current_size = 0;
            }
            current_group.push(idx);
            current_size += gate_size;
        }
        if !current_group.is_empty() {
            tasks.push(self.create_parallel_task(current_group, graph)?);
        }
        Ok(tasks)
    }
    /// SIMD-optimized parallelization
    pub(super) fn simd_optimized_parallelization(
        &self,
        graph: &DependencyGraph,
        hw_char: &HardwareCharacteristics,
    ) -> QuantRS2Result<Vec<ParallelTask>> {
        let mut rotation_gates = Vec::new();
        let mut other_gates = Vec::new();
        for (idx, node) in graph.nodes.iter().enumerate() {
            if Self::is_rotation_gate(node.gate.as_ref()) {
                rotation_gates.push(idx);
            } else {
                other_gates.push(idx);
            }
        }
        let mut tasks = Vec::new();
        let vec_width = hw_char.simd_width / 128;
        for chunk in rotation_gates.chunks(vec_width) {
            tasks.push(self.create_parallel_task(chunk.to_vec(), graph)?);
        }
        for idx in other_gates {
            tasks.push(self.create_parallel_task(vec![idx], graph)?);
        }
        Ok(tasks)
    }
    /// NUMA-aware parallelization
    pub(super) fn numa_aware_parallelization(
        &self,
        graph: &DependencyGraph,
        hw_char: &HardwareCharacteristics,
    ) -> QuantRS2Result<Vec<ParallelTask>> {
        let num_nodes = hw_char.num_numa_nodes;
        let mut node_tasks: Vec<Vec<usize>> = vec![Vec::new(); num_nodes];
        for (idx, node) in graph.nodes.iter().enumerate() {
            let numa_node = Self::select_numa_node(node, num_nodes);
            node_tasks[numa_node].push(idx);
        }
        let mut tasks = Vec::new();
        for node_task_indices in node_tasks {
            if !node_task_indices.is_empty() {
                tasks.push(self.create_parallel_task(node_task_indices, graph)?);
            }
        }
        Ok(tasks)
    }
    /// Refine tasks for cache efficiency
    pub(super) fn refine_for_cache(
        tasks: Vec<ParallelTask>,
        hw_char: &HardwareCharacteristics,
    ) -> QuantRS2Result<Vec<ParallelTask>> {
        let max_cache_size = hw_char.l2_cache_size;
        let mut refined = Vec::new();
        for task in tasks {
            if task.memory_requirement > max_cache_size {
                let mid = task.gates.len() / 2;
                let (gates1, gates2) = task.gates.split_at(mid);
                refined.push(ParallelTask {
                    id: Uuid::new_v4(),
                    gates: gates1.to_vec(),
                    qubits: task.qubits.clone(),
                    cost: task.cost / 2.0,
                    memory_requirement: task.memory_requirement / 2,
                    dependencies: task.dependencies.clone(),
                    priority: task.priority,
                });
                refined.push(ParallelTask {
                    id: Uuid::new_v4(),
                    gates: gates2.to_vec(),
                    qubits: task.qubits,
                    cost: task.cost / 2.0,
                    memory_requirement: task.memory_requirement / 2,
                    dependencies: HashSet::new(),
                    priority: task.priority,
                });
            } else {
                refined.push(task);
            }
        }
        Ok(refined)
    }
    /// Create a parallel task from a group of gate indices
    pub(super) fn create_parallel_task(
        &self,
        gate_indices: Vec<usize>,
        graph: &DependencyGraph,
    ) -> QuantRS2Result<ParallelTask> {
        let mut gates = Vec::new();
        let mut qubits = HashSet::new();
        let mut total_cost = 0.0;
        let mut memory_requirement = 0;
        for &idx in &gate_indices {
            if let Some(node) = graph.nodes.get(idx) {
                gates.push(node.gate.clone());
                qubits.extend(&node.qubits);
                total_cost += node.cost;
                memory_requirement += Self::estimate_gate_memory(node.gate.as_ref());
            }
        }
        let dependencies = self.calculate_task_dependencies(&gate_indices, graph)?;
        Ok(ParallelTask {
            id: Uuid::new_v4(),
            gates,
            qubits,
            cost: total_cost,
            memory_requirement,
            dependencies,
            priority: TaskPriority::Normal,
        })
    }
    /// Calculate task dependencies
    pub(super) fn calculate_task_dependencies(
        &self,
        gate_indices: &[usize],
        graph: &DependencyGraph,
    ) -> QuantRS2Result<HashSet<Uuid>> {
        let mut dependencies = HashSet::new();
        for &gate_idx in gate_indices {
            if let Some(parent_indices) = graph.reverse_edges.get(&gate_idx) {
                for &parent_idx in parent_indices {
                    if !gate_indices.contains(&parent_idx) {
                        let dep_uuid = Self::generate_gate_dependency_uuid(parent_idx);
                        dependencies.insert(dep_uuid);
                    }
                }
            }
        }
        Ok(dependencies)
    }
    /// Partition layer into parallel tasks
    pub(super) fn partition_layer_into_tasks(
        &self,
        layer: &[usize],
        graph: &DependencyGraph,
    ) -> QuantRS2Result<Vec<Vec<usize>>> {
        let max_gates_per_task = self.config.resource_constraints.max_gates_per_thread;
        let mut chunks = Vec::new();
        for chunk in layer.chunks(max_gates_per_task) {
            chunks.push(chunk.to_vec());
        }
        Ok(chunks)
    }
    /// Partition qubits into independent subsystems
    pub(super) fn partition_qubits<const N: usize>(
        &self,
        circuit: &Circuit<N>,
    ) -> QuantRS2Result<Vec<HashSet<QubitId>>> {
        let mut partitions = Vec::new();
        let mut used_qubits = HashSet::new();
        for i in 0..N {
            let qubit = QubitId::new(i as u32);
            if used_qubits.insert(qubit) {
                let mut partition = HashSet::new();
                partition.insert(qubit);
                partitions.push(partition);
            }
        }
        Ok(partitions)
    }
    /// Generate optimization recommendations
    pub(super) fn generate_optimization_recommendations<const N: usize>(
        &self,
        circuit: &Circuit<N>,
        graph: &DependencyGraph,
        tasks: &[ParallelTask],
    ) -> Vec<OptimizationRecommendation> {
        let mut recommendations = Vec::new();
        if graph.layers.iter().any(|layer| layer.len() == 1) {
            recommendations.push(OptimizationRecommendation {
                recommendation_type: RecommendationType::GateReordering,
                description: "Consider reordering gates to create larger parallel layers"
                    .to_string(),
                expected_improvement: 0.2,
                complexity: RecommendationComplexity::Medium,
            });
        }
        let task_costs: Vec<f64> = tasks.iter().map(|t| t.cost).collect();
        let cost_variance = Self::calculate_variance(&task_costs);
        if cost_variance > 0.5 {
            recommendations.push(OptimizationRecommendation {
                recommendation_type: RecommendationType::ResourceAllocation,
                description: "Task costs are unbalanced, consider load balancing optimization"
                    .to_string(),
                expected_improvement: 0.15,
                complexity: RecommendationComplexity::Low,
            });
        }
        recommendations
    }
    /// Execute circuit sequentially (fallback)
    pub(super) fn execute_sequential<const N: usize>(
        circuit: &Circuit<N>,
        simulator: &LargeScaleQuantumSimulator,
    ) -> QuantRS2Result<Vec<Complex64>> {
        let result = simulator.run(circuit)?;
        Ok(Vec::new())
    }
    /// Compute hash for circuit caching
    pub(super) fn compute_circuit_hash<const N: usize>(circuit: &Circuit<N>) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut hasher = DefaultHasher::new();
        circuit.num_gates().hash(&mut hasher);
        circuit.num_qubits().hash(&mut hasher);
        for gate in circuit.gates() {
            gate.name().hash(&mut hasher);
            gate.qubits().len().hash(&mut hasher);
        }
        hasher.finish()
    }
}