use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
use scirs2_core::numeric::{Float, FromPrimitive};
use std::cmp::Ordering;
use std::fmt::Debug;
use super::distance;
use super::DistanceMetric;
use crate::error::{ClusteringError, Result};
/// Output of an HDBSCAN run.
#[derive(Debug, Clone)]
pub struct HDBSCANResult<F: Float> {
    /// Cluster label per sample; `-1` marks noise points.
    pub labels: Array1<i32>,
    /// Per-sample membership strength, normalised to `[0, 1]`.
    pub probabilities: Array1<F>,
    /// Condensed cluster tree, when retained.
    pub condensed_tree: Option<CondensedTree<F>>,
    /// Single-linkage dendrogram, when retained (needed by `dbscan_clustering`).
    pub single_linkage_tree: Option<SingleLinkageTree<F>>,
    /// Per-cluster centroids, when requested via `StoreCenter`.
    pub centroids: Option<Array2<F>>,
    /// Per-cluster medoid sample indices, when requested via `StoreCenter`.
    pub medoids: Option<Array1<usize>>,
}
/// Single-linkage dendrogram in parallel-array form; entry `i` describes the
/// `i`-th merge. Child ids below the sample count are original points;
/// larger ids refer to earlier merges.
#[derive(Debug, Clone)]
pub struct SingleLinkageTree<F: Float> {
    /// Left child id of each merge.
    pub left_child: Vec<i32>,
    /// Right child id of each merge.
    pub right_child: Vec<i32>,
    /// Distance at which each merge happened.
    pub distances: Vec<F>,
    /// Number of points in the cluster produced by each merge.
    pub sizes: Vec<usize>,
}
/// Condensed cluster tree in parallel-array form. Each entry is an edge from
/// `parent[i]` to `child[i]` appearing at density `lambda_val[i]`
/// (lambda = 1 / distance). Negative child ids encode individual sample
/// points as `-(point + 1)`.
#[derive(Debug, Clone)]
pub struct CondensedTree<F: Float> {
    /// Parent cluster id of each edge.
    pub parent: Vec<i32>,
    /// Child id (cluster, or encoded sample point when negative).
    pub child: Vec<i32>,
    /// Lambda (inverse distance) at which the child splits off.
    pub lambda_val: Vec<F>,
    /// Size of the child cluster (1 for individual points).
    pub sizes: Vec<usize>,
}
/// Edge of the minimum spanning tree built over the mutual-reachability graph.
#[derive(Debug, Clone, PartialEq)]
#[allow(dead_code)]
struct MSTElement<F: Float> {
    point1: usize,
    point2: usize,
    distance: F,
}

// NOTE(review): `Eq` is asserted although `F` may be a NaN-capable float type;
// this assumes distances are never NaN — TODO confirm upstream.
impl<F: Float> Eq for MSTElement<F> {}

impl<F: Float> PartialOrd for MSTElement<F> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        // Delegate to `cmp` so the two orderings can never disagree.
        Some(self.cmp(other))
    }
}

impl<F: Float> Ord for MSTElement<F> {
    fn cmp(&self, other: &Self) -> Ordering {
        // Deliberately reversed (`other` vs `self`) so a max-heap such as
        // `BinaryHeap` pops the *smallest* distance first; incomparable
        // (NaN) distances compare as equal.
        other
            .distance
            .partial_cmp(&self.distance)
            .unwrap_or(Ordering::Equal)
    }
}
/// Tuning parameters for [`hdbscan`].
#[derive(Debug, Clone)]
pub struct HDBSCANOptions<F: Float> {
    /// Minimum number of points for a group to be considered a cluster.
    pub min_cluster_size: usize,
    /// Neighbourhood size for core distances; defaults to `min_cluster_size`
    /// when `None`.
    pub minsamples: Option<usize>,
    /// Distance threshold for cluster-selection merging (0 disables it).
    pub cluster_selection_epsilon: F,
    /// Optional upper bound on cluster size.
    pub max_cluster_size: Option<usize>,
    /// Flat-cluster extraction strategy (EOM or Leaf).
    pub cluster_selection_method: ClusterSelectionMethod,
    /// Permit returning a single cluster covering (nearly) all points.
    pub allow_single_cluster: bool,
    /// Which cluster centers to compute, if any.
    pub store_centers: Option<StoreCenter>,
    /// Distance metric for core/mutual-reachability distances.
    pub metric: DistanceMetric,
    /// Distance scaling factor (currently unused by this implementation —
    /// NOTE(review): confirm whether `alpha` should weight the metric).
    pub alpha: F,
}
/// Strategy for extracting flat clusters from the condensed tree.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClusterSelectionMethod {
    /// Excess of Mass: choose the most stable clusters across the hierarchy.
    EOM,
    /// Take every leaf of the condensed tree as a cluster.
    Leaf,
}
/// Which cluster-center representation(s) to compute and store in the result.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StoreCenter {
    /// Arithmetic mean of each cluster's members.
    Centroid,
    /// The member minimising total distance to all other members.
    Medoid,
    /// Both centroids and medoids.
    Both,
}
impl<F: Float + FromPrimitive> Default for HDBSCANOptions<F> {
    /// Defaults mirror common HDBSCAN library settings:
    /// `min_cluster_size = 5`, EOM selection, Euclidean metric, no stored
    /// centers, single-cluster results disallowed.
    fn default() -> Self {
        Self {
            min_cluster_size: 5,
            minsamples: None,
            cluster_selection_epsilon: F::zero(),
            max_cluster_size: None,
            cluster_selection_method: ClusterSelectionMethod::EOM,
            allow_single_cluster: false,
            store_centers: None,
            metric: DistanceMetric::Euclidean,
            alpha: F::one(),
        }
    }
}
/// Runs HDBSCAN (Hierarchical Density-Based Spatial Clustering of
/// Applications with Noise) on `data`, where rows are samples.
///
/// Pipeline: core distances -> mutual-reachability matrix -> minimum
/// spanning tree -> single-linkage tree -> condensed tree -> flat cluster
/// extraction -> optional cluster centers.
///
/// # Errors
/// Returns `InvalidInput` for empty data, or when `minsamples` /
/// `min_cluster_size` is below 2.
#[allow(dead_code)]
pub fn hdbscan<F>(
    data: ArrayView2<F>,
    options: Option<HDBSCANOptions<F>>,
) -> Result<HDBSCANResult<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let nsamples = data.shape()[0];
    if nsamples == 0 {
        return Err(ClusteringError::InvalidInput("Empty input data".into()));
    }
    let opts = options.unwrap_or_default();
    // minsamples falls back to min_cluster_size, mirroring common HDBSCAN APIs.
    let minsamples = opts.minsamples.unwrap_or(opts.min_cluster_size);
    if minsamples < 2 {
        return Err(ClusteringError::InvalidInput(
            "minsamples must be at least 2".into(),
        ));
    }
    if opts.min_cluster_size < 2 {
        return Err(ClusteringError::InvalidInput(
            "min_cluster_size must be at least 2".into(),
        ));
    }
    // Density estimate per point, then a distance matrix that inflates
    // pairwise distances in sparse regions.
    let core_distances = compute_core_distances(data, minsamples, opts.metric)?;
    let mutual_reachability = compute_mutual_reachability(data, &core_distances, opts.metric)?;
    let mst = build_mst(&mutual_reachability)?;
    let single_linkage_tree = mst_to_single_linkage(&mst, nsamples)?;
    let condensed_tree = condense_tree(&single_linkage_tree, opts.min_cluster_size)?;
    let (labels, probabilities) = extract_clusters(
        &condensed_tree,
        opts.cluster_selection_method,
        opts.allow_single_cluster,
        nsamples,
    )?;
    // Cluster centers are optional extras.
    let (centroids, medoids) = if opts.store_centers.is_some() {
        compute_centers(data, &labels, &opts.store_centers)?
    } else {
        (None, None)
    };
    Ok(HDBSCANResult {
        labels,
        probabilities,
        condensed_tree: Some(condensed_tree),
        single_linkage_tree: Some(single_linkage_tree),
        centroids,
        medoids,
    })
}
/// `(centroids, medoids)` pair returned by [`compute_centers`].
type CentersResult<F> = (Option<Array2<F>>, Option<Array1<usize>>);

/// Computes per-cluster centroids and/or medoids according to
/// `store_centers`. Noise points (label `< 0`) are excluded.
///
/// NOTE(review): medoid distances are hard-coded to Euclidean and ignore the
/// metric configured in `HDBSCANOptions` — confirm whether the metric should
/// be threaded through here.
#[allow(dead_code)]
fn compute_centers<F>(
    data: ArrayView2<F>,
    labels: &Array1<i32>,
    store_centers: &Option<StoreCenter>,
) -> Result<CentersResult<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    if store_centers.is_none() {
        return Ok((None, None));
    }
    // Safe: is_none() was just checked.
    let store_centers = store_centers.expect("Operation failed");
    let compute_centroids =
        store_centers == StoreCenter::Centroid || store_centers == StoreCenter::Both;
    let compute_medoids =
        store_centers == StoreCenter::Medoid || store_centers == StoreCenter::Both;
    // Cluster labels are dense 0..k, so the count is max(label) + 1.
    let n_clusters = labels
        .iter()
        .filter(|&&l| l >= 0)
        .fold(0, |max, &l| max.max(l + 1));
    if n_clusters == 0 {
        return Ok((None, None));
    }
    let nsamples = data.shape()[0];
    let n_features = data.shape()[1];
    let centroids = if compute_centroids {
        // Sum members per cluster, then divide by the member count.
        let mut centroids = Array2::<F>::zeros((n_clusters as usize, n_features));
        let mut counts = vec![0; n_clusters as usize];
        for i in 0..nsamples {
            let label = labels[i];
            if label >= 0 {
                let cluster_idx = label as usize;
                counts[cluster_idx] += 1;
                for j in 0..n_features {
                    centroids[[cluster_idx, j]] = centroids[[cluster_idx, j]] + data[[i, j]];
                }
            }
        }
        for i in 0..n_clusters as usize {
            if counts[i] > 0 {
                for j in 0..n_features {
                    centroids[[i, j]] =
                        centroids[[i, j]] / F::from_usize(counts[i]).expect("Operation failed");
                }
            }
        }
        Some(centroids)
    } else {
        None
    };
    let medoids = if compute_medoids {
        let mut medoids = Vec::with_capacity(n_clusters as usize);
        for cluster_idx in 0..n_clusters {
            let cluster_points: Vec<usize> = labels
                .iter()
                .enumerate()
                .filter(|(_, &l)| l == cluster_idx)
                .map(|(i, _)| i)
                .collect();
            // Empty cluster should not occur for dense labels; index 0 is a
            // placeholder to keep medoid positions aligned with cluster ids.
            if cluster_points.is_empty() {
                medoids.push(0);
                continue;
            }
            // O(k^2) scan: the medoid minimises the total distance to all
            // other members of its cluster.
            let mut min_dist_sum = F::infinity();
            let mut medoid_idx = cluster_points[0];
            for &point_idx in &cluster_points {
                let mut dist_sum = F::zero();
                for &other_idx in &cluster_points {
                    if point_idx != other_idx {
                        let point1 = data.row(point_idx).to_vec();
                        let point2 = data.row(other_idx).to_vec();
                        let dist = distance::euclidean(&point1, &point2);
                        dist_sum = dist_sum + dist;
                    }
                }
                if dist_sum < min_dist_sum {
                    min_dist_sum = dist_sum;
                    medoid_idx = point_idx;
                }
            }
            medoids.push(medoid_idx);
        }
        Some(Array1::from(medoids))
    } else {
        None
    };
    Ok((centroids, medoids))
}
/// Extracts a flat DBSCAN-style clustering from a fitted HDBSCAN result by
/// cutting the single-linkage tree at `cut_distance`.
///
/// Merges occurring at lambda (= 1 / distance) at or above the cut are kept;
/// connected components with more than one member become clusters with dense
/// `0..k` labels, and singletons stay noise (`-1`).
///
/// # Errors
/// * `InvalidInput` if the result has no stored single-linkage tree.
/// * `InvalidInput` if `cut_distance` is not positive.
#[allow(dead_code)]
pub fn dbscan_clustering<F>(
    hdbscan_result: &HDBSCANResult<F>,
    cut_distance: F,
) -> Result<Array1<i32>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let single_linkage_tree = match &hdbscan_result.single_linkage_tree {
        Some(tree) => tree,
        None => {
            // BUGFIX: error message previously read "HDBSCAN _result" — a
            // leftover of a mechanical rename.
            return Err(ClusteringError::InvalidInput(
                "HDBSCAN result does not contain a single-linkage tree".into(),
            ))
        }
    };
    // Work in lambda space (lambda = 1 / distance): larger lambda == denser.
    let cut_lambda = if cut_distance > F::zero() {
        F::one() / cut_distance
    } else {
        return Err(ClusteringError::InvalidInput(
            "cut_distance must be positive".into(),
        ));
    };
    let nsamples = hdbscan_result.labels.len();
    let mut union_find = UnionFind::new(nsamples);
    let lambdas: Vec<F> = single_linkage_tree
        .distances
        .iter()
        .map(|&d| {
            if d > F::zero() {
                F::one() / d
            } else {
                F::infinity()
            }
        })
        .collect();
    for (i, &lambda) in lambdas.iter().enumerate() {
        // Skip merges that happen farther apart than the cut distance.
        if lambda < cut_lambda {
            continue;
        }
        let left = single_linkage_tree.left_child[i];
        let right = single_linkage_tree.right_child[i];
        if left < nsamples as i32 && left >= 0 && right < nsamples as i32 && right >= 0 {
            // Both children are original points: union them directly.
            union_find.union(left as usize, right as usize);
        } else if left < nsamples as i32 && left >= 0 {
            // Right child is an internal node: union `left` with all of its leaves.
            let right_points = get_leaves(right, single_linkage_tree, nsamples as i32);
            for &point in &right_points {
                if point >= 0 && point < nsamples as i32 {
                    union_find.union(left as usize, point as usize);
                }
            }
        } else if right < nsamples as i32 && right >= 0 {
            // Mirror case: left child is internal.
            let left_points = get_leaves(left, single_linkage_tree, nsamples as i32);
            for &point in &left_points {
                if point >= 0 && point < nsamples as i32 {
                    union_find.union(right as usize, point as usize);
                }
            }
        } else {
            // Both children are internal: bridge them via one representative
            // leaf of the left subtree (its other leaves were already joined
            // by earlier, denser merges).
            let left_points = get_leaves(left, single_linkage_tree, nsamples as i32);
            let right_points = get_leaves(right, single_linkage_tree, nsamples as i32);
            if !left_points.is_empty() && !right_points.is_empty() {
                let left_rep = left_points[0];
                for &point in &right_points {
                    if point >= 0
                        && point < nsamples as i32
                        && left_rep >= 0
                        && left_rep < nsamples as i32
                    {
                        union_find.union(left_rep as usize, point as usize);
                    }
                }
            }
        }
    }
    // Relabel: components with more than one member become clusters numbered
    // in order of first appearance; singletons remain noise.
    let mut labels = vec![-1; nsamples];
    let mut cluster_map = std::collections::HashMap::new();
    let mut next_label = 0;
    for (i, label) in labels.iter_mut().enumerate().take(nsamples) {
        let root = union_find.find(i);
        if union_find.size(root) > 1 {
            let cluster_label = *cluster_map.entry(root).or_insert_with(|| {
                let label = next_label;
                next_label += 1;
                label
            });
            *label = cluster_label;
        }
    }
    Ok(Array1::from(labels))
}
/// Collects the leaf (original sample) ids under `node` in the
/// single-linkage tree. Ids below `nsamples` are leaves; ids at or above it
/// refer to internal merge nodes indexed by `id - nsamples`.
///
/// Negative or out-of-range internal ids yield an empty result.
#[allow(dead_code)]
fn get_leaves(node: i32, tree: &SingleLinkageTree<impl Float>, nsamples: i32) -> Vec<i32> {
    // NOTE: parameter renamed from `_node` — the leading underscore wrongly
    // signalled an unused binding.
    let mut leaves = Vec::new();
    // Base case: `node` is an original sample.
    if node < nsamples && node >= 0 {
        leaves.push(node);
        return leaves;
    }
    // Negative ids wrap to a huge usize here and are rejected by the bound
    // check below, matching the original behaviour.
    let node_idx = (node - nsamples) as usize;
    if node_idx >= tree.left_child.len() {
        return leaves;
    }
    let left = tree.left_child[node_idx];
    let right = tree.right_child[node_idx];
    leaves.extend(get_leaves(left, tree, nsamples));
    leaves.extend(get_leaves(right, tree, nsamples));
    leaves
}
/// Mutual-reachability distance between two points: the largest of their
/// pairwise distance and their two core distances. This inflates distances
/// in sparse regions so density-based linkage favours dense areas.
#[allow(dead_code)]
fn mutual_reachability_distance<F: Float>(distance: F, core_dist1: F, coredist2: F) -> F {
    let core_max = core_dist1.max(coredist2);
    core_max.max(distance)
}
/// Computes each sample's core distance: the distance to its
/// `(minsamples - 1)`-th nearest *other* point (i.e. the `minsamples`-th
/// neighbour counting the point itself).
///
/// # Errors
/// `InvalidInput` when `minsamples` exceeds the number of samples.
#[allow(dead_code)]
fn compute_core_distances<F>(
    data: ArrayView2<F>,
    minsamples: usize,
    metric: DistanceMetric,
) -> Result<Array1<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let nsamples = data.shape()[0];
    if minsamples > nsamples {
        return Err(ClusteringError::InvalidInput(format!(
            "minsamples ({}) cannot be larger than the number of samples ({})",
            minsamples, nsamples
        )));
    }
    let mut core_distances = Array1::<F>::zeros(nsamples);
    // Full symmetric pairwise-distance matrix (zero diagonal).
    let mut distances = Array2::<F>::zeros((nsamples, nsamples));
    for i in 0..nsamples {
        for j in (i + 1)..nsamples {
            let point1 = data.row(i).to_vec();
            let point2 = data.row(j).to_vec();
            let dist = match metric {
                DistanceMetric::Euclidean => distance::euclidean(&point1, &point2),
                DistanceMetric::Manhattan => distance::manhattan(&point1, &point2),
                DistanceMetric::Chebyshev => distance::chebyshev(&point1, &point2),
                // NOTE(review): Minkowski order is hard-coded to p = 3 —
                // confirm this is intended rather than configurable.
                DistanceMetric::Minkowski => distance::minkowski(
                    &point1,
                    &point2,
                    F::from(3.0).expect("Failed to convert constant to float"),
                ),
            };
            distances[[i, j]] = dist;
            distances[[j, i]] = dist;
        }
    }
    for i in 0..nsamples {
        // Distances from point i to all other points, sorted ascending.
        let mut row_distances: Vec<F> = Vec::with_capacity(nsamples.saturating_sub(1));
        for j in 0..nsamples {
            if i != j {
                row_distances.push(distances[[i, j]]);
            }
        }
        row_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
        // BUGFIX (off-by-one): the previous guard `minsamples - 1 < len`
        // rejected the valid case `minsamples == nsamples` and silently fell
        // back to the nearest neighbour. Index `minsamples - 2` is in bounds
        // whenever `minsamples - 2 < len`.
        if minsamples > 1 && minsamples - 2 < row_distances.len() {
            core_distances[i] = row_distances[minsamples - 2];
        } else {
            core_distances[i] = if row_distances.is_empty() {
                F::zero()
            } else {
                row_distances[0]
            };
        }
    }
    Ok(core_distances)
}
/// Builds the symmetric mutual-reachability matrix: entry `(i, j)` is
/// `max(d(i, j), core(i), core(j))`; the diagonal carries each point's own
/// core distance.
#[allow(dead_code)]
fn compute_mutual_reachability<F>(
    data: ArrayView2<F>,
    core_distances: &Array1<F>,
    metric: DistanceMetric,
) -> Result<Array2<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let n = data.shape()[0];
    let mut reachability = Array2::<F>::zeros((n, n));
    for row in 0..n {
        for col in (row + 1)..n {
            let a = data.row(row).to_vec();
            let b = data.row(col).to_vec();
            // Raw pairwise distance under the configured metric.
            let base_dist = match metric {
                DistanceMetric::Euclidean => distance::euclidean(&a, &b),
                DistanceMetric::Manhattan => distance::manhattan(&a, &b),
                DistanceMetric::Chebyshev => distance::chebyshev(&a, &b),
                DistanceMetric::Minkowski => distance::minkowski(
                    &a,
                    &b,
                    F::from(3.0).expect("Failed to convert constant to float"),
                ),
            };
            let mrd =
                mutual_reachability_distance(base_dist, core_distances[row], core_distances[col]);
            reachability[[row, col]] = mrd;
            reachability[[col, row]] = mrd;
        }
        reachability[[row, row]] = core_distances[row];
    }
    Ok(reachability)
}
/// Builds a minimum spanning tree over the dense mutual-reachability matrix
/// using Prim's algorithm (O(n^2), appropriate for a dense matrix).
///
/// Returns `nsamples - 1` edges as `(source, dest, weight)` in insertion
/// order; an empty matrix yields an empty edge list.
///
/// # Errors
/// `ComputationError` when the graph is disconnected (no finite edge can
/// extend the tree).
#[allow(dead_code)]
fn build_mst<F>(distances: &Array2<F>) -> Result<Vec<(usize, usize, F)>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let nsamples = distances.shape()[0];
    // BUGFIX: guard the empty matrix — `nsamples - 1` below would underflow
    // and panic.
    if nsamples == 0 {
        return Ok(Vec::new());
    }
    let mut mst_edges = Vec::with_capacity(nsamples - 1);
    let mut in_mst = vec![false; nsamples];
    // Cheapest known connection from the growing tree to each outside node,
    // and the tree node providing it.
    let mut min_distances = vec![F::infinity(); nsamples];
    let mut source_nodes = vec![0; nsamples];
    let mut current_node = 0;
    min_distances[current_node] = F::zero();
    for _ in 0..(nsamples - 1) {
        in_mst[current_node] = true;
        // Relax edges from the node just added to the tree.
        for j in 0..nsamples {
            if !in_mst[j] {
                let distance = distances[[current_node, j]];
                if distance < min_distances[j] {
                    min_distances[j] = distance;
                    source_nodes[j] = current_node;
                }
            }
        }
        // Pick the cheapest frontier node.
        let mut min_dist = F::infinity();
        let mut next_node = 0;
        for j in 0..nsamples {
            if !in_mst[j] && min_distances[j] < min_dist {
                min_dist = min_distances[j];
                next_node = j;
            }
        }
        if min_dist.is_infinite() {
            return Err(ClusteringError::ComputationError(
                "Graph is disconnected; HDBSCAN requires a connected graph.".into(),
            ));
        }
        mst_edges.push((source_nodes[next_node], next_node, min_dist));
        current_node = next_node;
    }
    Ok(mst_edges)
}
/// Disjoint-set (union-find) structure with path compression and
/// union-by-size, used to track merging clusters.
struct UnionFind {
    parent: Vec<usize>,
    size: Vec<usize>,
}

impl UnionFind {
    /// Creates `n` singleton sets, each element its own root with size 1.
    fn new(n: usize) -> Self {
        UnionFind {
            parent: (0..n).collect(),
            size: vec![1; n],
        }
    }

    /// Returns the root of `x`'s set, re-pointing every node on the path
    /// directly at the root (full path compression, done iteratively).
    fn find(&mut self, x: usize) -> usize {
        // Pass 1: locate the root.
        let mut root = x;
        while self.parent[root] != root {
            root = self.parent[root];
        }
        // Pass 2: compress the path.
        let mut node = x;
        while self.parent[node] != root {
            let next = self.parent[node];
            self.parent[node] = root;
            node = next;
        }
        root
    }

    /// Merges the sets containing `x` and `y`, attaching the smaller tree
    /// below the larger (ties keep `x`'s root), and returns the merged root.
    fn union(&mut self, x: usize, y: usize) -> usize {
        let root_x = self.find(x);
        let root_y = self.find(y);
        if root_x == root_y {
            return root_x;
        }
        let (winner, loser) = if self.size[root_x] < self.size[root_y] {
            (root_y, root_x)
        } else {
            (root_x, root_y)
        };
        self.parent[loser] = winner;
        self.size[winner] += self.size[loser];
        winner
    }

    /// Number of elements in the set containing `x`.
    fn size(&mut self, x: usize) -> usize {
        let root = self.find(x);
        self.size[root]
    }
}
/// Converts an MST into a single-linkage dendrogram by replaying its edges
/// in ascending weight order.
///
/// Internal cluster nodes are numbered `nsamples..(2 * nsamples - 1)`; child
/// ids below `nsamples` are original points.
#[allow(dead_code)]
fn mst_to_single_linkage<F>(
    mst: &[(usize, usize, F)],
    nsamples: usize,
) -> Result<SingleLinkageTree<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    // Guard the degenerate case so `nsamples - 1` cannot underflow below.
    if nsamples == 0 {
        return Ok(SingleLinkageTree {
            left_child: Vec::new(),
            right_child: Vec::new(),
            distances: Vec::new(),
            sizes: Vec::new(),
        });
    }
    let mut sorted_mst = mst.to_vec();
    sorted_mst.sort_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(Ordering::Equal));
    let mut left_child = Vec::with_capacity(nsamples - 1);
    let mut right_child = Vec::with_capacity(nsamples - 1);
    let mut distances = Vec::with_capacity(nsamples - 1);
    let mut sizes = Vec::with_capacity(nsamples - 1);
    // Slots for every sample plus every internal merge node.
    let total_nodes = nsamples + (nsamples - 1);
    let mut union_find = UnionFind::new(total_nodes);
    let mut next_id = nsamples;
    for (source, dest, distance) in sorted_mst {
        let cluster1 = union_find.find(source);
        let cluster2 = union_find.find(dest);
        if cluster1 == cluster2 {
            continue;
        }
        let size1 = union_find.size(cluster1);
        let size2 = union_find.size(cluster2);
        left_child.push(cluster1 as i32);
        right_child.push(cluster2 as i32);
        distances.push(distance);
        sizes.push(size1 + size2);
        // Re-parent both merged roots onto the freshly created cluster node.
        // (The previous `union(cluster1, cluster2)` call was redundant: its
        // only effect was overwritten by these assignments.)
        union_find.parent[cluster1] = next_id;
        union_find.parent[cluster2] = next_id;
        // BUGFIX: record the merged size on the new cluster node. It was
        // previously left at its initial value of 1, so every later merge
        // involving this cluster reported a size of 1 and the dendrogram's
        // `sizes` were wrong for nested merges.
        union_find.size[next_id] = size1 + size2;
        next_id += 1;
    }
    Ok(SingleLinkageTree {
        left_child,
        right_child,
        distances,
        sizes,
    })
}
/// Condenses the single-linkage tree: each merge is examined in order, and
/// only split sides that reach `min_cluster_size` persist as clusters; smaller
/// sides fall out as individual points.
///
/// Sample points are encoded as negative child ids (`-(point + 1)`); cluster
/// nodes keep positive ids. Edges with negligible lambda (below 1/1000 of the
/// maximum) are filtered out at the end.
#[allow(dead_code)]
fn condense_tree<F>(
    single_linkage_tree: &SingleLinkageTree<F>,
    min_cluster_size: usize,
) -> Result<CondensedTree<F>>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let n_merges = single_linkage_tree.distances.len();
    let nsamples = n_merges + 1;
    let mut parent = Vec::new();
    let mut child = Vec::new();
    let mut lambda_val = Vec::new();
    let mut sizes = Vec::new();
    // lambda = 1 / distance; zero-distance merges get lambda 0 here.
    let mut lambdas: Vec<F> = single_linkage_tree
        .distances
        .iter()
        .map(|&d| {
            if d > F::zero() {
                F::one() / d
            } else {
                F::zero()
            }
        })
        .collect();
    lambdas.push(F::zero());
    // Noise floor for the final filtering pass.
    let min_lambda = if lambdas.is_empty() {
        F::zero()
    } else {
        let max_val = lambdas.iter().fold(F::zero(), |max, &val| max.max(val));
        max_val / F::from(1000.0).expect("Failed to convert constant to float")
    };
    // node_sizes[id]: 1 for sample points, then the recorded merge sizes.
    let mut node_sizes = vec![1; nsamples];
    for &merge_size in &single_linkage_tree.sizes {
        node_sizes.push(merge_size);
    }
    let mut cluster_map = std::collections::HashMap::new();
    let mut next_cluster_id = nsamples;
    // BUGFIX (encoding): `&current_lambda` had been corrupted to
    // `¤t_lambda`, which does not compile.
    for (i, &current_lambda) in lambdas.iter().enumerate().take(n_merges) {
        let left = single_linkage_tree.left_child[i];
        let right = single_linkage_tree.right_child[i];
        let current_parent = nsamples + i;
        let left_size = node_sizes[left as usize];
        let right_size = node_sizes[right as usize];
        if left_size >= min_cluster_size {
            // Left side survives as a cluster; assign it a stable mapped id.
            let mapped_left = *cluster_map.entry(left).or_insert_with(|| {
                let id = next_cluster_id;
                next_cluster_id += 1;
                id
            });
            parent.push(current_parent as i32);
            child.push(mapped_left as i32);
            lambda_val.push(current_lambda);
            sizes.push(left_size);
        } else if left >= 0 && left < nsamples as i32 {
            // Left side is a single point falling out of the cluster.
            parent.push(current_parent as i32);
            child.push(-(left + 1));
            lambda_val.push(current_lambda);
            sizes.push(1);
        }
        if right_size >= min_cluster_size {
            let mapped_right = *cluster_map.entry(right).or_insert_with(|| {
                let id = next_cluster_id;
                next_cluster_id += 1;
                id
            });
            parent.push(current_parent as i32);
            child.push(mapped_right as i32);
            lambda_val.push(current_lambda);
            sizes.push(right_size);
        } else if right >= 0 && right < nsamples as i32 {
            parent.push(current_parent as i32);
            child.push(-(right + 1));
            lambda_val.push(current_lambda);
            sizes.push(1);
        }
        cluster_map.insert(current_parent as i32, current_parent);
    }
    // Drop edges whose lambda is below the noise floor computed above.
    let mut filtered_parent = Vec::new();
    let mut filtered_child = Vec::new();
    let mut filtered_lambda = Vec::new();
    let mut filtered_sizes = Vec::new();
    for i in 0..parent.len() {
        if lambda_val[i] >= min_lambda {
            filtered_parent.push(parent[i]);
            filtered_child.push(child[i]);
            filtered_lambda.push(lambda_val[i]);
            filtered_sizes.push(sizes[i]);
        }
    }
    Ok(CondensedTree {
        parent: filtered_parent,
        child: filtered_child,
        lambda_val: filtered_lambda,
        sizes: filtered_sizes,
    })
}
/// Selects flat clusters from the condensed tree and assigns per-point
/// labels and probabilities.
///
/// `Leaf` takes every leaf cluster; `EOM` (Excess of Mass) accumulates
/// subtree stabilities bottom-up, then walks top-down keeping each node that
/// is at least as stable as its best child. Points outside any selected
/// cluster become noise (`-1`).
#[allow(dead_code)]
fn extract_clusters<F>(
    condensed_tree: &CondensedTree<F>,
    method: ClusterSelectionMethod,
    allow_single_cluster: bool,
    nsamples: usize,
) -> Result<(Array1<i32>, Array1<F>)>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    // The root is the largest parent id in the condensed tree.
    let root: i32 = condensed_tree.parent.iter().fold(0, |max, &p| max.max(p));
    // A node is a leaf if it never appears as a parent; negative ids are
    // encoded sample points and always count as leaves.
    let mut is_leaf = std::collections::HashSet::new();
    let mut parent_set = std::collections::HashSet::new();
    for &parent in &condensed_tree.parent {
        parent_set.insert(parent);
    }
    for &child in &condensed_tree.child {
        if !parent_set.contains(&child) || child < 0 {
            is_leaf.insert(child);
        }
    }
    if method == ClusterSelectionMethod::Leaf {
        // Leaf selection: every non-point leaf becomes a cluster.
        let leaf_clusters: Vec<i32> = is_leaf.iter().filter(|&&node| node >= 0).cloned().collect();
        let (labels, probabilities) =
            assign_points_to_clusters(condensed_tree, &leaf_clusters, root, nsamples)?;
        return Ok((labels, probabilities));
    }
    // EOM: process edges from the largest lambda (densest) downwards so
    // children are scored before their parents.
    let mut subtree_stability = std::collections::HashMap::new();
    let mut nodes_by_lambda: Vec<(usize, F)> = condensed_tree
        .lambda_val
        .iter()
        .enumerate()
        .map(|(i, &lambda)| (i, lambda))
        .collect();
    nodes_by_lambda.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal));
    for (idx, _) in nodes_by_lambda {
        let child = condensed_tree.child[idx];
        let parent = condensed_tree.parent[idx];
        let lambda = condensed_tree.lambda_val[idx];
        let size = condensed_tree.sizes[idx];
        if is_leaf.contains(&child) && child >= 0 {
            // Leaf stability: the lambda at which the cluster appears,
            // weighted by its size.
            let stability = lambda * F::from_usize(size).expect("Operation failed");
            subtree_stability.insert(child, stability);
        } else if child >= 0 {
            let child_stability = *subtree_stability.get(&child).unwrap_or(&F::zero());
            subtree_stability.insert(child, child_stability);
        }
        if child >= 0 {
            let child_lambda = lambda;
            let child_stability = *subtree_stability.get(&child).unwrap_or(&F::zero());
            // Lambda of the edge entering the parent (zero when the parent
            // is the root or has no incoming edge).
            let parent_lambda = condensed_tree
                .lambda_val
                .iter()
                .zip(condensed_tree.parent.iter())
                .find(|(_, &p)| p == parent && p != root)
                .map(|(l_, _)| *l_)
                .unwrap_or(F::zero());
            let lambda_diff = child_lambda - parent_lambda;
            if lambda_diff > F::zero() {
                // Propagate the child's stability plus its own excess of
                // mass up to the parent.
                let stability_delta = lambda_diff * F::from_usize(size).expect("Operation failed");
                let parent_stability = subtree_stability.entry(parent).or_insert(F::zero());
                *parent_stability = *parent_stability + child_stability + stability_delta;
            }
        }
    }
    // Top-down selection: keep a node when it is at least as stable as its
    // best child; otherwise descend into the children.
    let mut selected_clusters = std::collections::HashSet::new();
    let mut to_process = vec![root];
    while let Some(node) = to_process.pop() {
        let children: Vec<i32> = condensed_tree
            .parent
            .iter()
            .zip(condensed_tree.child.iter())
            .filter(|&(p_, _)| *p_ == node)
            .map(|(_, c)| *c)
            .collect();
        if children.is_empty() {
            if node >= 0 {
                selected_clusters.insert(node);
            }
            continue;
        }
        let node_stability = *subtree_stability.get(&node).unwrap_or(&F::zero());
        let max_child_stability = children
            .iter()
            .filter(|&&c| c >= 0)
            .map(|&c| *subtree_stability.get(&c).unwrap_or(&F::zero()))
            .fold(F::zero(), |max, s| max.max(s));
        // The root is never selected directly (only via the
        // allow_single_cluster fallback below).
        if max_child_stability > node_stability || node == root {
            to_process.extend(children.iter().filter(|&&c| c >= 0).cloned());
        } else {
            if node >= 0 {
                selected_clusters.insert(node);
            }
        }
    }
    let selected_clusters_vec: Vec<i32> = selected_clusters.into_iter().collect();
    if selected_clusters_vec.is_empty() && allow_single_cluster {
        // Fallback: pick the highest-numbered non-root cluster so at least
        // one cluster is produced.
        let highest_child = condensed_tree
            .child
            .iter()
            .filter(|&&c| c >= 0 && c != root)
            .cloned()
            .max()
            .unwrap_or(-1);
        if highest_child >= 0 {
            let (labels, probabilities) =
                assign_points_to_clusters(condensed_tree, &[highest_child], root, nsamples)?;
            return Ok((labels, probabilities));
        }
    }
    let (labels, probabilities) =
        assign_points_to_clusters(condensed_tree, &selected_clusters_vec, root, nsamples)?;
    Ok((labels, probabilities))
}
/// Assigns each sample to one of `selected_clusters` (or noise) and computes
/// a membership strength normalised to `[0, 1]`.
///
/// Samples appear in the condensed tree as negative child ids
/// (`-(point + 1)`); each point's edge is followed upwards until a selected
/// cluster — or the root — is reached. Final labels are remapped to a dense
/// `0..k` range.
#[allow(dead_code)]
fn assign_points_to_clusters<F>(
    condensed_tree: &CondensedTree<F>,
    selected_clusters: &[i32],
    root: i32,
    nsamples: usize,
) -> Result<(Array1<i32>, Array1<F>)>
where
    F: Float + FromPrimitive + Debug + PartialOrd,
{
    let mut labels = vec![-1; nsamples];
    let mut probabilities = vec![F::zero(); nsamples];
    let mut leaf_cluster_map = std::collections::HashMap::new();
    for &cluster_id in selected_clusters {
        leaf_cluster_map.insert(cluster_id, cluster_id);
    }
    for point_idx in 0..nsamples {
        // Encoded id of this sample in the condensed tree.
        let point_label = -(point_idx as i32 + 1);
        let point_edges: Vec<(i32, F)> = condensed_tree
            .child
            .iter()
            .zip(condensed_tree.parent.iter())
            .zip(condensed_tree.lambda_val.iter())
            .filter(|&((c, _), _)| *c == point_label)
            .map(|((&_, &p), &lambda)| (p, lambda))
            .collect();
        if point_edges.is_empty() {
            continue;
        }
        let mut max_lambda = F::zero();
        let mut cluster_label = -1;
        for (node, lambda) in point_edges {
            let mut current_node = node;
            let mut path_max_lambda = lambda;
            loop {
                // BUGFIX (encoding): `&current_node` had been corrupted to
                // `¤t_node`, which does not compile.
                if leaf_cluster_map.contains_key(&current_node) {
                    if path_max_lambda > max_lambda {
                        max_lambda = path_max_lambda;
                        cluster_label = *leaf_cluster_map
                            .get(&current_node)
                            .expect("Operation failed");
                    }
                    break;
                }
                // Climb to this node's incoming edge (if any).
                let parent_edges: Vec<(i32, F)> = condensed_tree
                    .child
                    .iter()
                    .zip(condensed_tree.parent.iter())
                    .zip(condensed_tree.lambda_val.iter())
                    .filter(|&((c, _), _)| *c == current_node)
                    .map(|((&_, &p), &lambda)| (p, lambda))
                    .collect();
                if parent_edges.is_empty() || current_node == root {
                    break;
                }
                let (parent, parent_lambda) = parent_edges[0];
                // Track the weakest link along the path.
                path_max_lambda = path_max_lambda.min(parent_lambda);
                current_node = parent;
            }
        }
        if cluster_label >= 0 {
            labels[point_idx] = cluster_label;
            probabilities[point_idx] = max_lambda;
        }
    }
    // Normalise membership strengths to [0, 1].
    let max_prob = probabilities.iter().fold(F::zero(), |max, &p| max.max(p));
    if max_prob > F::zero() {
        for prob in &mut probabilities {
            *prob = *prob / max_prob;
        }
    }
    // Remap surviving labels to a dense 0..k range. FIX: sort the unique
    // labels first so the mapping is deterministic (HashSet iteration order
    // is not) and output labels are stable across runs.
    let mut unique_labels: Vec<i32> = {
        let mut seen = std::collections::HashSet::new();
        for &label in &labels {
            if label >= 0 {
                seen.insert(label);
            }
        }
        seen.into_iter().collect()
    };
    unique_labels.sort_unstable();
    let remap: std::collections::HashMap<i32, i32> = unique_labels
        .iter()
        .enumerate()
        .map(|(i, &label)| (label, i as i32))
        .collect();
    let remapped_labels: Vec<i32> = labels
        .iter()
        .map(|&label| {
            if label >= 0 {
                *remap.get(&label).unwrap_or(&label)
            } else {
                label
            }
        })
        .collect();
    Ok((Array1::from(remapped_labels), Array1::from(probabilities)))
}
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    /// Smoke test: two well-separated 2-D blobs of three points each should
    /// cluster without error and yield one label/probability per sample.
    #[test]
    fn test_hdbscan_placeholder() {
        let data = Array2::from_shape_vec(
            (6, 2),
            vec![
                1.0, 2.0, 1.5, 1.8, 1.3, 2.2, 8.0, 9.0, 8.2, 8.8, 7.8, 9.1,
            ],
        )
        .expect("Operation failed");
        // Small min sizes so three-point blobs qualify as clusters.
        let options = HDBSCANOptions {
            minsamples: Some(2), min_cluster_size: 2, cluster_selection_epsilon: 0.0,
            allow_single_cluster: true, ..Default::default()
        };
        let result = hdbscan(data.view(), Some(options));
        assert!(result.is_ok(), "HDBSCAN failed: {:?}", result.err());
        let result = result.expect("Operation failed");
        assert_eq!(
            result.labels.len(),
            6,
            "Labels length should be 6, got: {:?}",
            result.labels
        );
        assert_eq!(
            result.probabilities.len(),
            6,
            "Probabilities length should be 6, got: {:?}",
            result.probabilities
        );
    }

    /// Mutual reachability is the max of the raw distance and both core
    /// distances.
    #[test]
    fn test_mutual_reachability() {
        let d = 2.0;
        let core1 = 1.0;
        let core2 = 3.0;
        let mrd = mutual_reachability_distance(d, core1, core2);
        assert_eq!(mrd, 3.0);
    }
}