1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
use std::sync::atomic::{AtomicBool, Ordering};

use anyhow::{anyhow, Result};

/// Process-wide flag consulted by [`Sas::init`]: while it is `true`, the
/// initialization body is skipped and `init` returns `Ok(())` immediately.
static IS_INITED: AtomicBool = AtomicBool::new(false);

/// Initializes SAS with the default [`Sas`] arguments.
///
/// This is the panicking convenience wrapper around [`try_init`]; use
/// [`try_init`] to handle the failure as a `Result` instead.
///
/// ## Panics
///
/// Panics when SAS fails to initialize.
/// Note that reinitialization is not supported.
#[inline]
pub fn init() {
    let outcome = try_init();
    outcome.unwrap()
}

/// Automatically collects system topology and optimizes key operations on the fly.
///
#[inline]
pub fn try_init() -> Result<()> {
    Sas::default()
        .init()
        .map_err(|error| anyhow!("failed to init SAS: {error}"))
}

/// SAS optimization arguments.
// NOTE: the `///` comments on this type and its fields are kept verbatim on
// purpose: with the `clap` feature the derive uses them as help text.
#[derive(Clone, Debug, Default)]
#[cfg_attr(feature = "clap", derive(::clap::Parser))]
#[cfg_attr(feature = "serde", derive(::serde::Serialize, ::serde::Deserialize))]
pub struct Sas {
    /// Runtime system type.
    // `default_value_t` takes a typed expression and renders it through
    // `Display`. The string form `default_value = "SystemType::default()"`
    // would hand clap the literal text `SystemType::default()` to parse,
    // which is not a valid `SystemType` value.
    #[cfg_attr(
        feature = "clap",
        arg(
            default_value_t = SystemType::default(),
            env = "SAS_SYSTEM_TYPE",
            long = "sas-system-type",
            value_name = "TYPE"
        )
    )]
    pub system_type: SystemType,
}

impl Sas {
    /// Optimizes key operations with given arguments.
    ///
    /// The first caller to flip the global `IS_INITED` flag performs the
    /// initialization. Calls that observe the flag already set return
    /// `Ok(())` without doing any work.
    ///
    /// ## Errors
    ///
    /// Returns the error produced by the initialization steps. On failure the
    /// flag is cleared again, so a later call re-attempts (and re-reports)
    /// initialization instead of silently returning `Ok(())`.
    pub fn init(self) -> Result<()> {
        // `swap` returns the previous value: `true` means the flag was
        // already claimed by an earlier (or concurrent) call.
        if IS_INITED.swap(true, Ordering::SeqCst) {
            return Ok(());
        }

        let outcome = self.init_unchecked();
        if outcome.is_err() {
            // Do not leave a failed initialization marked as done.
            IS_INITED.store(false, Ordering::SeqCst);
        }
        outcome
    }

    /// Runs the initialization steps without consulting `IS_INITED`.
    ///
    /// With the `rayon` feature enabled this builds the global rayon thread
    /// pool with one worker per entry returned by `prepare_threads`, then
    /// calls `bind_threads` with that list. Without the feature it does
    /// nothing and returns `Ok(())`.
    fn init_unchecked(self) -> Result<()> {
        #[cfg(feature = "rayon")]
        {
            use rayon::ThreadPoolBuilder;

            let threads = prepare_threads()?;

            let mut builder = ThreadPoolBuilder::new().num_threads(threads.len());
            if matches!(self.system_type, SystemType::Python) {
                // For `SystemType::Python` the calling thread is registered
                // as one of the pool's workers.
                builder = builder.use_current_thread();
            }
            builder.build_global()?;

            bind_threads(threads)?;
        }

        Ok(())
    }
}

/// Runtime system type.
// Selects how `Sas::init_unchecked` configures the global rayon pool.
// NOTE(review): the `///` comments below are left untouched because the
// optional derive macros (notably clap's) may surface them as user-facing
// help text — confirm before rewording them.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "clap", derive(::clap::Parser))]
#[cfg_attr(feature = "serde", derive(::serde::Serialize, ::serde::Deserialize))]
#[cfg_attr(feature = "strum", derive(::strum::Display, ::strum::EnumString))]
pub enum SystemType {
    /// Use all threads without the main thread
    // Default variant; the pool builder is used as-is.
    #[default]
    Generic,
    /// Use all threads even with the main thread
    // For this variant `Sas::init_unchecked` additionally calls
    // `use_current_thread()` on the rayon pool builder.
    Python,
}

/// Loads the hardware topology through `hwlocality`.
///
/// Any error from `Topology::new` is converted into the crate-wide
/// `anyhow` error type.
#[cfg(all(feature = "numa", feature = "rayon"))]
#[inline]
fn get_topology() -> Result<::hwlocality::Topology> {
    let topology = ::hwlocality::Topology::new()?;
    Ok(topology)
}

/// Lists the worker indices to use when NUMA support is disabled.
///
/// Returns `0..n`, where `n` is the value reported by
/// `std::thread::available_parallelism`, or `1` when that query fails.
/// This variant never returns an error.
#[cfg(all(not(feature = "numa"), feature = "rayon"))]
fn prepare_threads() -> Result<Vec<usize>> {
    let worker_count = match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        // Fall back to a single worker when the parallelism is unknown.
        Err(_) => 1,
    };
    Ok(Vec::from_iter(0..worker_count))
}

/// Picks one NUMA node at random and lists the CPU indices attributed to it.
///
/// The node and CPU counts are both taken as "highest set bit + 1" of the
/// topology's node set and CPU set. CPUs are then divided evenly between
/// nodes, and the returned range is the chosen node's slice of that division.
///
/// NOTE(review): this arithmetic assumes CPU indices are contiguous and laid
/// out node by node in equal-sized groups — confirm for the target machines.
///
/// ## Errors
///
/// Fails when the topology cannot be loaded.
#[cfg(all(feature = "numa", feature = "rayon"))]
fn prepare_threads() -> Result<Vec<usize>> {
    use rand::{
        distributions::{Distribution, Uniform},
        thread_rng,
    };

    // Load the topology and take its NUMA-node and CPU bitmaps.
    let topology = get_topology()?;
    let node_set = topology.nodeset();
    let cpu_set = topology.cpuset();

    // An empty bitmap has no last set bit and is treated as index 0, i.e. a
    // count of one.
    let node_count = node_set.last_set().map_or(0usize, |index| index.into()) + 1;
    let cpu_count = cpu_set.last_set().map_or(0usize, |index| index.into()) + 1;
    let cpus_per_node = cpu_count / node_count;

    // Choose the node uniformly from `0..node_count`.
    let chosen_node = Uniform::new(0usize, node_count).sample(&mut thread_rng());

    // The chosen node's CPUs are the `chosen_node`-th group of `cpus_per_node`.
    let first_cpu = chosen_node * cpus_per_node;
    Ok((first_cpu..first_cpu + cpus_per_node).collect())
}

/// Thread binding for builds without NUMA support: does nothing.
///
/// The thread list is accepted only so that the signature matches the
/// NUMA-enabled variant; it is ignored and `Ok(())` is always returned.
#[cfg(all(not(feature = "numa"), feature = "rayon"))]
#[inline]
fn bind_threads(_threads: Vec<usize>) -> Result<()> {
    Ok(())
}

/// Binds every worker of the current rayon pool to one CPU.
///
/// The worker with pool index `i` is bound to CPU `threads[i]`. The work is
/// broadcast to every thread of the pool, and each thread binds itself
/// (`CpuBindingFlags::THREAD`).
///
/// ## Errors
///
/// Returns the first failure reported by any worker:
/// - the worker's index has no entry in `threads`,
/// - the topology could not be loaded on that worker, or
/// - the CPU binding itself failed.
///
/// Failures are returned rather than raised as panics inside the pool, so
/// callers such as `try_init` can handle them.
#[cfg(all(feature = "numa", feature = "rayon"))]
fn bind_threads(threads: Vec<usize>) -> Result<()> {
    use hwlocality::cpu::{binding::CpuBindingFlags, cpuset::CpuSet};

    // `broadcast` runs the closure once on every pool thread and collects
    // each thread's return value.
    let outcomes = ::rayon::broadcast(|ctx| -> Result<()> {
        let index = ctx.index();
        // Checked lookup: the pool may hold more workers than `threads` has
        // entries (e.g. when `threads` is empty).
        let cpu = threads
            .get(index)
            .copied()
            .ok_or_else(|| anyhow!("no CPU assigned to rayon thread {index}"))?;

        // bind the given thread into the NUMA node
        let topology = get_topology()?;
        let mut cpus = CpuSet::new();
        cpus.set(cpu);
        topology
            .bind_cpu(&cpus, CpuBindingFlags::THREAD)
            .map_err(|error| anyhow!("failed to bind the rayon thread into CPU: {error:?}"))
    });

    // Surface the first per-thread failure, if any.
    for outcome in outcomes {
        outcome?;
    }
    Ok(())
}