pub fn mean_py(data: &Bound<'_, PyArray1<f64>>) -> PyResult<f64>
Calculates the mean of a one-dimensional `f64` array; optimized with 8-way loop unrolling and multiple accumulators.