//! Binary operations trait.
//!
//! This trait defines element-wise binary operations on tensors.
use crate::error::Result;
use crate::runtime::Runtime;
use crate::tensor::Tensor;
/// Element-wise binary operations on tensors.
///
/// This trait defines operations that take two input tensors and produce one output tensor,
/// along with fused three-input variants (`fused_mul_add`, `fused_add_mul`).
/// All two-input operations support broadcasting; the fused operations require matching shapes.
///
/// # Broadcasting
///
/// Binary operations follow NumPy-style broadcasting rules:
/// - Dimensions are compared element-wise, from the trailing dimensions backward
/// - Two dimensions are compatible when they are equal, or when one of them is 1
/// - Dimensions of size 1 are stretched to match the other dimension
/// - The output has shape equal to the pairwise maximum of the input shapes
///
/// # Example
///
/// ```
/// use numr::prelude::*;
///
/// let device = CpuDevice::new();
/// let client = CpuRuntime::default_client(&device);
///
/// let a = Tensor::<CpuRuntime>::from_slice(&[1.0f32, 2.0, 3.0, 4.0], &[2, 2], &device);
/// let b = Tensor::<CpuRuntime>::from_slice(&[5.0f32, 6.0, 7.0, 8.0], &[2, 2], &device);
///
/// let c = client.add(&a, &b)?; // [6.0, 8.0, 10.0, 12.0]
/// # Ok::<(), numr::error::Error>(())
/// ```
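///
/// Broadcasting a vector against a matrix (a sketch of the rules above; the `[2]`
/// operand is stretched across the leading dimension of the `[2, 2]` operand):
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let a = Tensor::<CpuRuntime>::from_slice(&[1.0f32, 2.0, 3.0, 4.0], &[2, 2], &device);
/// let b = Tensor::<CpuRuntime>::from_slice(&[10.0f32, 20.0], &[2], &device);
/// let c = client.add(&a, &b)?; // [11.0, 22.0, 13.0, 24.0]
/// # Ok::<(), numr::error::Error>(())
/// ```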
pub trait BinaryOps<R: Runtime> {
/// Element-wise addition: a + b
///
/// Adds two tensors element-wise, supporting broadcasting.
///
/// # Arguments
/// * `a` - Left operand
/// * `b` - Right operand (shape must be broadcastable with `a`)
///
/// # Returns
/// A new tensor with the result of the addition.
///
/// # Errors
/// Returns an error if shapes are not broadcastable.
///
/// # Example
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let a = Tensor::<CpuRuntime>::from_slice(&[1.0f32, 2.0], &[2], &device);
/// let b = Tensor::<CpuRuntime>::from_slice(&[3.0f32, 4.0], &[2], &device);
/// let result = client.add(&a, &b)?; // [4.0, 6.0]
/// # Ok::<(), numr::error::Error>(())
/// ```
fn add(&self, a: &Tensor<R>, b: &Tensor<R>) -> Result<Tensor<R>>;
/// Element-wise subtraction: a - b
///
/// Subtracts two tensors element-wise, supporting broadcasting.
///
/// # Arguments
/// * `a` - Left operand (minuend)
/// * `b` - Right operand (subtrahend, shape must be broadcastable with `a`)
///
/// # Returns
/// A new tensor with the result of the subtraction.
///
/// # Errors
/// Returns an error if shapes are not broadcastable.
///
/// # Example
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let a = Tensor::<CpuRuntime>::from_slice(&[5.0f32, 8.0], &[2], &device);
/// let b = Tensor::<CpuRuntime>::from_slice(&[1.0f32, 3.0], &[2], &device);
/// let result = client.sub(&a, &b)?; // [4.0, 5.0]
/// # Ok::<(), numr::error::Error>(())
/// ```
fn sub(&self, a: &Tensor<R>, b: &Tensor<R>) -> Result<Tensor<R>>;
/// Element-wise multiplication: a * b
///
/// Multiplies two tensors element-wise, supporting broadcasting.
///
/// # Arguments
/// * `a` - Left operand
/// * `b` - Right operand (shape must be broadcastable with `a`)
///
/// # Returns
/// A new tensor with the result of the multiplication.
///
/// # Errors
/// Returns an error if shapes are not broadcastable.
///
/// # Example
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let a = Tensor::<CpuRuntime>::from_slice(&[2.0f32, 3.0], &[2], &device);
/// let b = Tensor::<CpuRuntime>::from_slice(&[4.0f32, 5.0], &[2], &device);
/// let result = client.mul(&a, &b)?; // [8.0, 15.0]
/// # Ok::<(), numr::error::Error>(())
/// ```
fn mul(&self, a: &Tensor<R>, b: &Tensor<R>) -> Result<Tensor<R>>;
/// Element-wise division: a / b
///
/// Divides two tensors element-wise, supporting broadcasting.
/// The result of division by zero is implementation-dependent
/// (floating-point types typically follow IEEE 754, yielding `inf` or `NaN`).
///
/// # Arguments
/// * `a` - Left operand (dividend/numerator)
/// * `b` - Right operand (divisor/denominator, shape must be broadcastable with `a`)
///
/// # Returns
/// A new tensor with the result of the division.
///
/// # Errors
/// Returns an error if shapes are not broadcastable.
///
/// # Example
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let a = Tensor::<CpuRuntime>::from_slice(&[10.0f32, 9.0], &[2], &device);
/// let b = Tensor::<CpuRuntime>::from_slice(&[2.0f32, 3.0], &[2], &device);
/// let result = client.div(&a, &b)?; // [5.0, 3.0]
/// # Ok::<(), numr::error::Error>(())
/// ```
fn div(&self, a: &Tensor<R>, b: &Tensor<R>) -> Result<Tensor<R>>;
/// Element-wise power: a^b
///
/// Raises the elements of the first tensor to the power of the elements
/// of the second tensor, element-wise, supporting broadcasting.
///
/// # Arguments
/// * `a` - Base tensor
/// * `b` - Exponent tensor (shape must be broadcastable with `a`)
///
/// # Returns
/// A new tensor with the result of the power operation.
///
/// # Errors
/// Returns an error if shapes are not broadcastable.
///
/// # Example
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let base = Tensor::<CpuRuntime>::from_slice(&[2.0f32, 3.0], &[2], &device);
/// let exponent = Tensor::<CpuRuntime>::from_slice(&[3.0f32, 2.0], &[2], &device);
/// let result = client.pow(&base, &exponent)?; // [8.0, 9.0]
/// # Ok::<(), numr::error::Error>(())
/// ```
fn pow(&self, a: &Tensor<R>, b: &Tensor<R>) -> Result<Tensor<R>>;
/// Element-wise maximum: max(a, b)
///
/// Computes the element-wise maximum of two tensors, supporting broadcasting.
///
/// # Arguments
/// * `a` - First tensor
/// * `b` - Second tensor (shape must be broadcastable with `a`)
///
/// # Returns
/// A new tensor containing the maximum of corresponding elements.
///
/// # Errors
/// Returns an error if shapes are not broadcastable.
///
/// # Example
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let a = Tensor::<CpuRuntime>::from_slice(&[1.0f32, 5.0], &[2], &device);
/// let b = Tensor::<CpuRuntime>::from_slice(&[3.0f32, 2.0], &[2], &device);
/// let result = client.maximum(&a, &b)?; // [3.0, 5.0]
/// # Ok::<(), numr::error::Error>(())
/// ```
fn maximum(&self, a: &Tensor<R>, b: &Tensor<R>) -> Result<Tensor<R>>;
/// Element-wise minimum: min(a, b)
///
/// Computes the element-wise minimum of two tensors, supporting broadcasting.
///
/// # Arguments
/// * `a` - First tensor
/// * `b` - Second tensor (shape must be broadcastable with `a`)
///
/// # Returns
/// A new tensor containing the minimum of corresponding elements.
///
/// # Errors
/// Returns an error if shapes are not broadcastable.
///
/// # Example
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let a = Tensor::<CpuRuntime>::from_slice(&[1.0f32, 5.0], &[2], &device);
/// let b = Tensor::<CpuRuntime>::from_slice(&[3.0f32, 2.0], &[2], &device);
/// let result = client.minimum(&a, &b)?; // [1.0, 2.0]
/// # Ok::<(), numr::error::Error>(())
/// ```
fn minimum(&self, a: &Tensor<R>, b: &Tensor<R>) -> Result<Tensor<R>>;
/// Two-argument arctangent: atan2(y, x)
///
/// Computes the angle in radians between the positive x-axis and the point (x, y),
/// element-wise, supporting broadcasting.
///
/// The result is in the range [-π, π]. This function is commonly used to convert
/// Cartesian coordinates to polar coordinates and in spatial algorithms.
///
/// # Arguments
/// * `y` - Y-coordinate tensor
/// * `x` - X-coordinate tensor (shape must be broadcastable with `y`)
///
/// # Returns
/// A new tensor with the angle in radians for each (y, x) pair.
///
/// # Errors
/// Returns an error if shapes are not broadcastable.
///
/// # Example
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let y = Tensor::<CpuRuntime>::from_slice(&[1.0f32, 0.0], &[2], &device);
/// let x = Tensor::<CpuRuntime>::from_slice(&[0.0f32, 1.0], &[2], &device);
/// let angles = client.atan2(&y, &x)?; // [π/2, 0.0]
/// # Ok::<(), numr::error::Error>(())
/// ```
fn atan2(&self, y: &Tensor<R>, x: &Tensor<R>) -> Result<Tensor<R>>;
/// Fused multiply-add: a * b + c
///
/// Computes the element-wise fused multiply-add of three tensors in a single pass,
/// reducing memory bandwidth compared to separate multiply and add operations.
/// Uses hardware FMA instructions where available (AVX2/AVX-512/NEON).
///
/// All three tensors must have the same shape (no broadcasting).
///
/// # Arguments
/// * `a` - First multiplicand
/// * `b` - Second multiplicand
/// * `c` - Addend
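///
/// # Example
///
/// A sketch following the same pattern as the other operations in this trait:
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let a = Tensor::<CpuRuntime>::from_slice(&[2.0f32, 3.0], &[2], &device);
/// let b = Tensor::<CpuRuntime>::from_slice(&[4.0f32, 5.0], &[2], &device);
/// let c = Tensor::<CpuRuntime>::from_slice(&[1.0f32, 1.0], &[2], &device);
/// let result = client.fused_mul_add(&a, &b, &c)?; // [2*4+1, 3*5+1] = [9.0, 16.0]
/// # Ok::<(), numr::error::Error>(())
/// ```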
fn fused_mul_add(&self, a: &Tensor<R>, b: &Tensor<R>, c: &Tensor<R>) -> Result<Tensor<R>>;
/// Fused add-multiply: (a + b) * c
///
/// Computes the element-wise fused add-multiply of three tensors in a single pass.
/// Common in residual + scaling patterns.
///
/// All three tensors must have the same shape (no broadcasting).
///
/// # Arguments
/// * `a` - First addend
/// * `b` - Second addend
/// * `c` - Multiplicand
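///
/// # Example
///
/// A sketch following the same pattern as the other operations in this trait:
///
/// ```
/// # use numr::prelude::*;
/// # let device = CpuDevice::new();
/// # let client = CpuRuntime::default_client(&device);
/// let a = Tensor::<CpuRuntime>::from_slice(&[1.0f32, 2.0], &[2], &device);
/// let b = Tensor::<CpuRuntime>::from_slice(&[3.0f32, 4.0], &[2], &device);
/// let c = Tensor::<CpuRuntime>::from_slice(&[2.0f32, 0.5], &[2], &device);
/// let result = client.fused_add_mul(&a, &b, &c)?; // [(1+3)*2, (2+4)*0.5] = [8.0, 3.0]
/// # Ok::<(), numr::error::Error>(())
/// ```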
fn fused_add_mul(&self, a: &Tensor<R>, b: &Tensor<R>, c: &Tensor<R>) -> Result<Tensor<R>>;
}