//! This module provides functions that force an artificial data dependency
//! between two loads. In effect, the following code is executed:
//!
//! ```text
//! val = some_atomic.load(DepOrd);
//! val2_ref = &val2;
//! val2_ref ^= val;
//! val2_ref ^= val; // val2_ref is again equal to &val2, but now data-depends on val
//! loaded_val2 = *val2_ref; // Ordered after the load of val, as if by consume ordering
//! ```
//!
//! This can be far faster than fences on ARM and POWER architectures, since the
//! ordering comes from data dependencies in the pipeline rather than full fences.
//! It still isn't free, since the dependent access must wait for the previous
//! load to complete, but it is cheaper than a fence.
//!
//! # Example
//! ```
//! use std::sync::atomic::{AtomicUsize, Ordering};
//! use std::sync::{Arc, Barrier};
//! use std::thread;
//! use atomic_utilities::artificial_dep::{DepOrd, dependently};
//! let num_run = 1000000;
//! let atomic_val1 = Arc::new(AtomicUsize::new(0));
//! let atomic_val2 = Arc::new(AtomicUsize::new(0));
//! let start_bar = Arc::new(Barrier::new(2));
//! let atomic_valt1 = atomic_val1.clone();
//! let atomic_valt2 = atomic_val2.clone();
//! let start_bart = start_bar.clone();
//! let to_join = thread::spawn(move || {
//!     start_bart.wait();
//!     for i in 0..num_run {
//!         atomic_valt2.store(i, Ordering::Relaxed);
//!         atomic_valt1.store(i, Ordering::Release);
//!     }
//! });
//!
//! start_bar.wait();
//! for _ in 0..num_run {
//!     let val1_ld = atomic_val1.load(DepOrd);
//!     let val2_ld = dependently(val1_ld, &atomic_val2,
//!                              |dep_ref| dep_ref.load(Ordering::Relaxed));
//!     assert!(val2_ld >= val1_ld); // Could fail if the load of val2 were reordered before the load of val1
//! }
//! to_join.join().unwrap();
//! ```
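// Fallback for targets (or builds without the `use_asm` cfg) where no inline-asm
// trick is available: DepOrd degrades to Acquire, which is strictly stronger than
// a consume-style dependency, and the fake-dependency helpers simply return the
// reference unchanged.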
#[cfg(not(all(any(target_arch = "arm", target_arch = "aarch64"),
              use_asm)))]
mod artificial_dep_inner {
    use std::sync::atomic::Ordering;
    pub const DEPORD: Ordering = Ordering::Acquire;

    #[inline(always)]
    pub fn false_dep<T>(myref: &T, _v: usize) -> &T {
        myref
    }

    #[inline(always)]
    pub fn false_dep_mut<T>(myref: &mut T, _v: usize) -> &mut T {
        myref
    }
}

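// On ARM/AArch64 builds with the `use_asm` cfg, a pair of self-cancelling `eor`
// instructions ties the returned reference to the loaded value, so the hardware
// orders the dependent access after the load and a Relaxed ordering suffices.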
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"),
          use_asm))]
mod artificial_dep_inner {
    use std::sync::atomic::Ordering;
    pub const DEPORD: Ordering = Ordering::Relaxed;

    #[inline(always)]
    pub fn false_dep<T>(mut myref: &T, val: usize) -> &T {
        unsafe {
            asm!("eor $0, $0, $1
              eor $0, $0, $1"
              : "+r" (myref)
              : "r" (val));
            myref
        }
    }

    #[inline(always)]
    pub fn false_dep_mut<T>(mut myref: &mut T, val: usize) -> &mut T {
        unsafe {
            asm!("eor $0, $0, $1
              eor $0, $0, $1"
              : "+r" (myref)
              : "r" (val));
            myref
        }
    }
}

/* Once this can be tested on a power machine it's good to go
#[cfg(all(any(target_arch = "powerpc", target_arch = "powerpc64"),
          use_asm))]
mod artificial_dep_inner {
    use std::sync::atomic::Ordering;
    pub const DEPORD: Ordering = Ordering::Relaxed;

    #[inline(always)]
    pub fn false_dep<T>(myref: &T, val: usize) -> &T {
        asm!("xor $1 $0 $0
              xor $1 $0 $0"
              : "+r" (myref)
              : "r" (val));
        myref
    }
}*/

use std::sync::atomic::Ordering;

/// The ordering that must be used for any load that fake dependent operations will be ordered after
#[allow(non_upper_case_globals)]
pub const DepOrd: Ordering = artificial_dep_inner::DEPORD;

/// Ensures that loads through `myref` are ordered after the load of `val`.
/// `val` can be anything convertible to a `usize`, or any `usize` computed from
/// the base load.
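///
/// # Example
/// A minimal single-threaded sketch of the call shape; see the module-level
/// example for the cross-thread ordering this is actually meant for:
/// ```
/// use std::sync::atomic::{AtomicUsize, Ordering};
/// use atomic_utilities::artificial_dep::{DepOrd, dependently};
///
/// let flag = AtomicUsize::new(1);
/// let data = AtomicUsize::new(42);
/// let flag_ld = flag.load(DepOrd);
/// let data_ld = dependently(flag_ld, &data, |d| d.load(Ordering::Relaxed));
/// assert_eq!(data_ld, 42);
/// ```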
#[inline(always)]
pub fn dependently<T, R, F: FnOnce(&T) -> R>(val: usize, myref: &T, myfn: F) -> R {
    myfn(artificial_dep_inner::false_dep(myref, val))
}

/// Ensures that loads and stores through `myref` are ordered after the load of `val`.
/// `val` can be anything convertible to a `usize`, or any `usize` computed from
/// the base load.
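///
/// # Example
/// A minimal single-threaded sketch of the call shape:
/// ```
/// use std::sync::atomic::AtomicUsize;
/// use atomic_utilities::artificial_dep::{DepOrd, dependently_mut};
///
/// let flag = AtomicUsize::new(1);
/// let mut slot = 0usize;
/// let flag_ld = flag.load(DepOrd);
/// // The store through `slot_ref` is ordered after the load that produced `flag_ld`.
/// dependently_mut(flag_ld, &mut slot, |slot_ref| *slot_ref = 7);
/// assert_eq!(slot, 7);
/// ```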
#[inline(always)]
pub fn dependently_mut<T, R, F: FnOnce(&mut T) -> R>(val: usize, myref: &mut T, myfn: F) -> R {
    myfn(artificial_dep_inner::false_dep_mut(myref, val))
}
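
// A minimal smoke-test sketch: it only checks that the fake dependency leaves the
// referenced data intact; the cross-thread ordering itself is exercised by the
// module-level doc example.
#[cfg(test)]
mod tests {
    use super::{dependently, dependently_mut, DepOrd};
    use std::sync::atomic::{AtomicUsize, Ordering};

    #[test]
    fn fake_dependency_preserves_values() {
        let flag = AtomicUsize::new(3);
        let data = AtomicUsize::new(10);

        let flag_ld = flag.load(DepOrd);
        let data_ld = dependently(flag_ld, &data, |d| d.load(Ordering::Relaxed));
        assert_eq!(data_ld, 10);

        let mut plain = 0usize;
        dependently_mut(flag_ld, &mut plain, |p| *p = flag_ld);
        assert_eq!(plain, 3);
    }
}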