mod sync;
mod util;
use super::{RangeStrategy, ThreadCount};
use crate::core::pipeline::{IterPipelineImpl, Pipeline, UpperBoundedPipelineImpl};
use crate::core::range::{
FixedRangeFactory, Range, RangeFactory, RangeOrchestrator, WorkStealingRangeFactory,
};
use crate::iter::{Accumulator, ExactSizeAccumulator, GenericThreadPool, SourceCleanup};
use crate::macros::{log_debug, log_error, log_warn};
use crossbeam_utils::CachePadded;
use sync::{make_lending_group, Borrower, Lender, WorkerState};
use util::LifetimeParameterized;
#[cfg(all(
not(miri),
any(
target_os = "android",
target_os = "dragonfly",
target_os = "freebsd",
target_os = "linux"
)
))]
use nix::{
sched::{sched_setaffinity, CpuSet},
unistd::Pid,
};
use std::marker::PhantomData;
use std::num::NonZeroUsize;
use std::ops::ControlFlow;
use std::sync::atomic::AtomicUsize;
use std::sync::{Arc, Mutex};
use std::thread::JoinHandle;
#[cfg(all(not(miri), target_os = "windows"))]
use windows_sys::Win32::{
Foundation::GetLastError,
System::Threading::{GetCurrentThread, SetThreadAffinityMask},
};
/// Policy controlling whether worker threads are pinned to CPUs.
#[derive(Clone, Copy)]
pub enum CpuPinningPolicy {
    /// Don't pin worker threads to CPUs.
    No,
    /// Pin each worker thread to a CPU if the platform supports it; on
    /// unsupported platforms (or on pinning failure) log a warning and
    /// continue unpinned.
    IfSupported,
    /// Pin each worker thread to a CPU, panicking if pinning fails or is not
    /// implemented on this platform.
    Always,
}
/// Configuration for building a [`ThreadPool`].
pub struct ThreadPoolBuilder {
    /// Number of worker threads to spawn.
    pub num_threads: ThreadCount,
    /// Strategy used to distribute input items among the worker threads.
    pub range_strategy: RangeStrategy,
    /// Whether and how worker threads are pinned to CPUs.
    pub cpu_pinning: CpuPinningPolicy,
}
impl ThreadPoolBuilder {
    /// Spawns a thread pool configured according to this builder.
    ///
    /// Panics if `cpu_pinning` is [`CpuPinningPolicy::Always`] on a platform
    /// where pinning is not implemented (see `ThreadPoolImpl::new`).
    pub fn build(&self) -> ThreadPool {
        ThreadPool::new(self)
    }
}
/// A pool of worker threads that parallel iterators are dispatched onto.
pub struct ThreadPool {
    // Strategy-specific implementation, selected at construction time.
    inner: ThreadPoolEnum,
}
impl ThreadPool {
    /// Creates a pool from the given builder configuration.
    fn new(builder: &ThreadPoolBuilder) -> Self {
        let inner = ThreadPoolEnum::new(builder);
        Self { inner }
    }

    /// Returns the number of worker threads owned by this pool.
    pub fn num_threads(&self) -> NonZeroUsize {
        self.inner.num_threads()
    }
}
// SAFETY note: `GenericThreadPool` is an unsafe trait declared in
// `crate::iter`; this impl only forwards to `ThreadPoolEnum`, which is
// assumed to uphold whatever contract the trait documents.
// NOTE(review): confirm against the trait's `# Safety` section.
unsafe impl GenericThreadPool for &mut ThreadPool {
    /// Forwards an upper-bounded pipeline run to the underlying pool
    /// implementation.
    fn upper_bounded_pipeline<Output: Send, Accum>(
        self,
        input_len: usize,
        init: impl Fn() -> Accum + Sync,
        process_item: impl Fn(Accum, usize) -> ControlFlow<Accum, Accum> + Sync,
        finalize: impl Fn(Accum) -> Output + Sync,
        reduce: impl Fn(Output, Output) -> Output,
        cleanup: &(impl SourceCleanup + Sync),
    ) -> Output {
        self.inner
            .upper_bounded_pipeline(input_len, init, process_item, finalize, reduce, cleanup)
    }

    /// Forwards an iterator pipeline run to the underlying pool
    /// implementation.
    fn iter_pipeline<Output, Accum: Send>(
        self,
        input_len: usize,
        accum: impl Accumulator<usize, Accum> + Sync,
        reduce: impl ExactSizeAccumulator<Accum, Output>,
        cleanup: &(impl SourceCleanup + Sync),
    ) -> Output {
        self.inner.iter_pipeline(input_len, accum, reduce, cleanup)
    }
}
/// Pool implementation specialized by range-distribution strategy, so that
/// the strategy is chosen once at construction rather than on every dispatch.
enum ThreadPoolEnum {
    /// Each worker owns a fixed slice of the input.
    Fixed(ThreadPoolImpl<FixedRangeFactory>),
    /// Workers can steal input items from each other.
    WorkStealing(ThreadPoolImpl<WorkStealingRangeFactory>),
}
impl ThreadPoolEnum {
    /// Builds the pool implementation matching the builder's range strategy.
    fn new(builder: &ThreadPoolBuilder) -> Self {
        let thread_count = usize::from(builder.num_threads.count());
        match builder.range_strategy {
            RangeStrategy::Fixed => {
                let factory = FixedRangeFactory::new(thread_count);
                ThreadPoolEnum::Fixed(ThreadPoolImpl::new(
                    thread_count,
                    factory,
                    builder.cpu_pinning,
                ))
            }
            RangeStrategy::WorkStealing => {
                let factory = WorkStealingRangeFactory::new(thread_count);
                ThreadPoolEnum::WorkStealing(ThreadPoolImpl::new(
                    thread_count,
                    factory,
                    builder.cpu_pinning,
                ))
            }
        }
    }

    /// Returns the number of worker threads in the underlying pool.
    fn num_threads(&self) -> NonZeroUsize {
        match self {
            ThreadPoolEnum::Fixed(pool) => pool.num_threads(),
            ThreadPoolEnum::WorkStealing(pool) => pool.num_threads(),
        }
    }

    /// Dispatches an upper-bounded pipeline run to the active implementation.
    fn upper_bounded_pipeline<Output: Send, Accum>(
        &mut self,
        input_len: usize,
        init: impl Fn() -> Accum + Sync,
        process_item: impl Fn(Accum, usize) -> ControlFlow<Accum, Accum> + Sync,
        finalize: impl Fn(Accum) -> Output + Sync,
        reduce: impl Fn(Output, Output) -> Output,
        cleanup: &(impl SourceCleanup + Sync),
    ) -> Output {
        match self {
            ThreadPoolEnum::Fixed(pool) => pool.upper_bounded_pipeline(
                input_len,
                init,
                process_item,
                finalize,
                reduce,
                cleanup,
            ),
            ThreadPoolEnum::WorkStealing(pool) => pool.upper_bounded_pipeline(
                input_len,
                init,
                process_item,
                finalize,
                reduce,
                cleanup,
            ),
        }
    }

    /// Dispatches an iterator pipeline run to the active implementation.
    fn iter_pipeline<Output, Accum: Send>(
        &mut self,
        input_len: usize,
        accum: impl Accumulator<usize, Accum> + Sync,
        reduce: impl ExactSizeAccumulator<Accum, Output>,
        cleanup: &(impl SourceCleanup + Sync),
    ) -> Output {
        match self {
            ThreadPoolEnum::Fixed(pool) => {
                pool.iter_pipeline(input_len, accum, reduce, cleanup)
            }
            ThreadPoolEnum::WorkStealing(pool) => {
                pool.iter_pipeline(input_len, accum, reduce, cleanup)
            }
        }
    }
}
/// Thread-pool state parameterized by the range-distribution factory.
struct ThreadPoolImpl<F: RangeFactory> {
    // Handles of the spawned worker threads, joined on drop.
    threads: Vec<WorkerThreadHandle>,
    // Main-thread side of the range distribution; used to reset ranges
    // before each pipeline run.
    range_orchestrator: F::Orchestrator,
    // Lender end of the lending protocol used to hand a pipeline to all
    // workers for the duration of a run (see the `sync` module).
    pipeline: Lender<DynLifetimeSyncPipeline<F::Range>>,
}
/// Handle to a spawned worker thread.
struct WorkerThreadHandle {
    // Join handle; the thread returns no value.
    handle: JoinHandle<()>,
}
impl<F: RangeFactory> ThreadPoolImpl<F> {
    /// Spawns `num_threads` worker threads and returns the pool.
    ///
    /// Each worker receives its own range from `range_factory` and the
    /// borrower end of the lending group created here; the pool keeps the
    /// lender end to hand pipelines to the workers. Depending on
    /// `cpu_pinning`, each worker tries to pin itself to the CPU whose index
    /// equals its thread id before entering its main loop.
    fn new(num_threads: usize, range_factory: F, cpu_pinning: CpuPinningPolicy) -> Self
    where
        F::Range: Send + 'static,
    {
        // One lender for the pool, one borrower per worker thread.
        let (lender, borrowers) = make_lending_group(num_threads);
        // On platforms with no pinning implementation (or under Miri), apply
        // the policy eagerly: warn for `IfSupported`, panic for `Always`.
        #[cfg(any(
            miri,
            not(any(
                target_os = "android",
                target_os = "dragonfly",
                target_os = "freebsd",
                target_os = "linux",
                target_os = "windows"
            ))
        ))]
        match cpu_pinning {
            CpuPinningPolicy::No => (),
            CpuPinningPolicy::IfSupported => {
                log_warn!("Pinning threads to CPUs is not implemented on this platform.")
            }
            CpuPinningPolicy::Always => {
                panic!("Pinning threads to CPUs is not implemented on this platform.")
            }
        }
        let threads = borrowers
            .into_iter()
            .enumerate()
            .map(|(id, borrower)| {
                // Per-worker state moved into the spawned thread below.
                let mut context = ThreadContext {
                    id,
                    range: range_factory.range(id),
                    pipeline: borrower,
                };
                WorkerThreadHandle {
                    handle: std::thread::spawn(move || {
                        // Unix-like targets: pin the calling thread (Pid 0 =
                        // current thread) to CPU `id` via sched_setaffinity.
                        #[cfg(all(
                            not(miri),
                            any(
                                target_os = "android",
                                target_os = "dragonfly",
                                target_os = "freebsd",
                                target_os = "linux"
                            )
                        ))]
                        match cpu_pinning {
                            CpuPinningPolicy::No => (),
                            CpuPinningPolicy::IfSupported => {
                                // Best effort: on failure, log a warning and
                                // keep running unpinned. The `_e` binding is
                                // only read by the log macro (which may
                                // compile to nothing).
                                let mut cpu_set = CpuSet::new();
                                if let Err(_e) = cpu_set.set(id) {
                                    log_warn!("Failed to set CPU affinity for thread #{id}: {_e}");
                                } else if let Err(_e) =
                                    sched_setaffinity(Pid::from_raw(0), &cpu_set)
                                {
                                    log_warn!("Failed to set CPU affinity for thread #{id}: {_e}");
                                } else {
                                    log_debug!("Pinned thread #{id} to CPU #{id}");
                                }
                            }
                            CpuPinningPolicy::Always => {
                                // Mandatory pinning: any failure is fatal.
                                let mut cpu_set = CpuSet::new();
                                if let Err(e) = cpu_set.set(id) {
                                    panic!("Failed to set CPU affinity for thread #{id}: {e}");
                                } else if let Err(e) =
                                    sched_setaffinity(Pid::from_raw(0), &cpu_set)
                                {
                                    panic!("Failed to set CPU affinity for thread #{id}: {e}");
                                } else {
                                    log_debug!("Pinned thread #{id} to CPU #{id}");
                                }
                            }
                        }
                        // Windows: pin the current thread via its affinity
                        // mask; SetThreadAffinityMask returns 0 on failure.
                        // NOTE(review): `1usize << id` overflows when `id`
                        // reaches the usize bit width (e.g. 64+ workers) —
                        // confirm the supported core-count range.
                        #[cfg(all(not(miri), target_os = "windows"))]
                        match cpu_pinning {
                            CpuPinningPolicy::No => (),
                            CpuPinningPolicy::IfSupported => {
                                let affinity_mask = 1usize << id;
                                let thread = unsafe { GetCurrentThread() };
                                let result =
                                    unsafe { SetThreadAffinityMask(thread, affinity_mask) };
                                if result == 0 {
                                    // Best effort: warn and continue unpinned.
                                    let _last_error = unsafe { GetLastError() };
                                    log_warn!("Failed to set CPU affinity for thread #{id}: error code {_last_error}");
                                } else {
                                    log_debug!("Pinned thread #{id} to CPU #{id}");
                                }
                            }
                            CpuPinningPolicy::Always => {
                                let affinity_mask = 1usize << id;
                                let thread = unsafe { GetCurrentThread() };
                                let result =
                                    unsafe { SetThreadAffinityMask(thread, affinity_mask) };
                                if result == 0 {
                                    // Mandatory pinning: any failure is fatal.
                                    let last_error = unsafe { GetLastError() };
                                    panic!("Failed to set CPU affinity for thread #{id}: error code {last_error}");
                                } else {
                                    log_debug!("Pinned thread #{id} to CPU #{id}");
                                }
                            }
                        }
                        // Enter the worker's main loop until the pool drops.
                        context.run()
                    }),
                }
            })
            .collect();
        log_debug!("[main thread] Spawned threads");
        Self {
            threads,
            range_orchestrator: range_factory.orchestrator(),
            pipeline: lender,
        }
    }

    /// Returns the number of worker threads in this pool.
    ///
    /// The `unwrap()` cannot fail in practice: the pool is always built from
    /// a `NonZeroUsize` count (see `ThreadPoolEnum::new`).
    fn num_threads(&self) -> NonZeroUsize {
        self.threads.len().try_into().unwrap()
    }

    /// Runs an upper-bounded pipeline over `input_len` items on the worker
    /// threads and reduces the per-thread outputs into one value.
    ///
    /// The shared `bound` starts at `usize::MAX`; presumably it is lowered
    /// when `process_item` short-circuits via `ControlFlow::Break` — confirm
    /// against `UpperBoundedPipelineImpl`.
    fn upper_bounded_pipeline<Output: Send, Accum>(
        &mut self,
        input_len: usize,
        init: impl Fn() -> Accum + Sync,
        process_item: impl Fn(Accum, usize) -> ControlFlow<Accum, Accum> + Sync,
        finalize: impl Fn(Accum) -> Output + Sync,
        reduce: impl Fn(Output, Output) -> Output,
        cleanup: &(impl SourceCleanup + Sync),
    ) -> Output {
        // Re-arm the per-thread ranges for this input length.
        self.range_orchestrator.reset_ranges(input_len);
        let num_threads = self.threads.len();
        // One output slot per worker, shared with the workers via Arc.
        let outputs = (0..num_threads)
            .map(|_| Mutex::new(None))
            .collect::<Arc<[_]>>();
        let bound = AtomicUsize::new(usize::MAX);
        // Lend the pipeline to all workers; each fills its own output slot.
        self.pipeline.lend(&UpperBoundedPipelineImpl {
            // CachePadded to avoid false sharing on the contended counter.
            bound: CachePadded::new(bound),
            outputs: outputs.clone(),
            init,
            process_item,
            finalize,
            cleanup,
        });
        // Every slot is expected to be filled once `lend` returns; a missing
        // output would be a bug in the lending protocol, hence `unwrap()`.
        outputs
            .iter()
            .map(move |output| output.lock().unwrap().take().unwrap())
            .reduce(reduce)
            .unwrap()
    }

    /// Runs an iterator pipeline over `input_len` items on the worker
    /// threads, then folds the per-thread accumulators with `reduce`.
    fn iter_pipeline<Output, Accum: Send>(
        &mut self,
        input_len: usize,
        accum: impl Accumulator<usize, Accum> + Sync,
        reduce: impl ExactSizeAccumulator<Accum, Output>,
        cleanup: &(impl SourceCleanup + Sync),
    ) -> Output {
        // Re-arm the per-thread ranges for this input length.
        self.range_orchestrator.reset_ranges(input_len);
        let num_threads = self.threads.len();
        // One output slot per worker, shared with the workers via Arc.
        let outputs = (0..num_threads)
            .map(|_| Mutex::new(None))
            .collect::<Arc<[_]>>();
        self.pipeline.lend(&IterPipelineImpl {
            outputs: outputs.clone(),
            accum,
            cleanup,
        });
        // Fold the (exactly num_threads) per-worker accumulators.
        reduce.accumulate_exact(
            outputs
                .iter()
                .map(move |output| output.lock().unwrap().take().unwrap()),
        )
    }
}
impl<F: RangeFactory> Drop for ThreadPoolImpl<F> {
    // Signals all workers to finish, then joins their threads.
    //
    // The allows presumably silence lints that fire when logging is compiled
    // out: `_i` and the `match` arms are then only used by the log macros.
    #[allow(clippy::single_match, clippy::unused_enumerate_index)]
    fn drop(&mut self) {
        // Tell the borrowers there will be no more pipelines to run.
        self.pipeline.finish_workers();
        log_debug!("[main thread] Joining threads in the pool...");
        for (_i, t) in self.threads.drain(..).enumerate() {
            let result = t.handle.join();
            match result {
                Ok(_) => log_debug!("[main thread] Thread {_i} joined with result: {result:?}"),
                Err(_) => log_error!("[main thread] Thread {_i} joined with result: {result:?}"),
            }
        }
        log_debug!("[main thread] Joined threads.");
        #[cfg(feature = "log_parallelism")]
        self.range_orchestrator.print_statistics();
    }
}
/// Marker type naming the `dyn Pipeline<R> + Sync` family of trait objects
/// with a late-bound lifetime, for use with the lending machinery (see
/// `util::LifetimeParameterized`). Never instantiated; `PhantomData` only
/// records the range type parameter.
struct DynLifetimeSyncPipeline<R: Range>(PhantomData<R>);

impl<R: Range> LifetimeParameterized for DynLifetimeSyncPipeline<R> {
    // For any lifetime 'a, the parameterized type is a pipeline trait object
    // valid for 'a.
    type T<'a> = dyn Pipeline<R> + Sync + 'a;
}
/// Per-worker state owned by each spawned thread.
struct ThreadContext<R: Range> {
    // Zero-based index of this worker thread.
    id: usize,
    // This worker's view of the input range, created by the range factory.
    range: R,
    // Borrower end used to receive pipelines lent by the main thread.
    pipeline: Borrower<DynLifetimeSyncPipeline<R>>,
}
impl<R: Range> ThreadContext<R> {
    /// Worker main loop: repeatedly borrows the currently lent pipeline and
    /// runs it over this worker's range, until the lender reports that the
    /// pool is finished.
    fn run(&mut self) {
        loop {
            let state = self
                .pipeline
                .borrow(|pipeline| pipeline.run(self.id, &self.range));
            match state {
                WorkerState::Finished => return,
                WorkerState::Ready => {}
            }
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use crate::iter::{ExactParallelSourceExt, IntoExactParallelRefSource, ParallelIteratorExt};

    // Each test builds a pool with a specific configuration and checks that a
    // parallel sum of 1..=10 yields 55 (= 5 * 11).

    #[test]
    fn test_build_thread_pool_available_parallelism() {
        let builder = ThreadPoolBuilder {
            num_threads: ThreadCount::AvailableParallelism,
            range_strategy: RangeStrategy::Fixed,
            cpu_pinning: CpuPinningPolicy::No,
        };
        let mut pool = builder.build();
        let values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
        let total = values.par_iter().with_thread_pool(&mut pool).sum::<i32>();
        assert_eq!(total, 5 * 11);
    }

    #[test]
    fn test_build_thread_pool_fixed_thread_count() {
        let builder = ThreadPoolBuilder {
            num_threads: ThreadCount::try_from(4).unwrap(),
            range_strategy: RangeStrategy::Fixed,
            cpu_pinning: CpuPinningPolicy::No,
        };
        let mut pool = builder.build();
        let values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
        let total = values.par_iter().with_thread_pool(&mut pool).sum::<i32>();
        assert_eq!(total, 5 * 11);
    }

    #[test]
    fn test_build_thread_pool_cpu_pinning_if_supported() {
        let builder = ThreadPoolBuilder {
            num_threads: ThreadCount::AvailableParallelism,
            range_strategy: RangeStrategy::Fixed,
            cpu_pinning: CpuPinningPolicy::IfSupported,
        };
        let mut pool = builder.build();
        let values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
        let total = values.par_iter().with_thread_pool(&mut pool).sum::<i32>();
        assert_eq!(total, 5 * 11);
    }

    // Only meaningful where pinning is actually implemented.
    #[cfg(all(
        not(miri),
        any(
            target_os = "android",
            target_os = "dragonfly",
            target_os = "freebsd",
            target_os = "linux",
            target_os = "windows"
        )
    ))]
    #[test]
    fn test_build_thread_pool_cpu_pinning_always() {
        let builder = ThreadPoolBuilder {
            num_threads: ThreadCount::AvailableParallelism,
            range_strategy: RangeStrategy::Fixed,
            cpu_pinning: CpuPinningPolicy::Always,
        };
        let mut pool = builder.build();
        let values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
        let total = values.par_iter().with_thread_pool(&mut pool).sum::<i32>();
        assert_eq!(total, 5 * 11);
    }

    // On unsupported platforms, mandatory pinning must panic at build time.
    #[cfg(any(
        miri,
        not(any(
            target_os = "android",
            target_os = "dragonfly",
            target_os = "freebsd",
            target_os = "linux",
            target_os = "windows"
        ))
    ))]
    #[test]
    #[should_panic = "Pinning threads to CPUs is not implemented on this platform."]
    fn test_build_thread_pool_cpu_pinning_always_not_supported() {
        let builder = ThreadPoolBuilder {
            num_threads: ThreadCount::AvailableParallelism,
            range_strategy: RangeStrategy::Fixed,
            cpu_pinning: CpuPinningPolicy::Always,
        };
        builder.build();
    }

    #[test]
    fn test_num_threads() {
        for strategy in [RangeStrategy::Fixed, RangeStrategy::WorkStealing] {
            let auto_pool = ThreadPoolBuilder {
                num_threads: ThreadCount::AvailableParallelism,
                range_strategy: strategy,
                cpu_pinning: CpuPinningPolicy::No,
            }
            .build();
            assert_eq!(
                auto_pool.num_threads(),
                std::thread::available_parallelism().unwrap()
            );
            let four_pool = ThreadPoolBuilder {
                num_threads: ThreadCount::try_from(4).unwrap(),
                range_strategy: strategy,
                cpu_pinning: CpuPinningPolicy::No,
            }
            .build();
            assert_eq!(four_pool.num_threads(), NonZeroUsize::try_from(4).unwrap());
        }
    }
}