diffsl 0.11.2

A compiler for a domain-specific language for ordinary differential equations (ODE).
Documentation
// This should work on LLVM 7, 8, 9, however in CI the version of clang installed on Ubuntu 18.04 cannot load
// a clang plugin properly without segfaulting on exit. This is fine on Ubuntu 20.04 or later LLVM versions...
// RUN: if [ %llvmver -lt 18 ]; then %clang++ -fno-exceptions  -ffast-math -mllvm -enable-load-pre=0 -std=c++11 -O1 %s -S -emit-llvm -o - %loadClangEnzyme -mllvm -enzyme-auto-sparsity=1 | %lli - ; fi
// RUN: if [ %llvmver -lt 18 ]; then %clang++ -fno-exceptions  -ffast-math -mllvm -enable-load-pre=0 -std=c++11 -O2 %s -S -emit-llvm -o - %loadClangEnzyme -mllvm -enzyme-auto-sparsity=1  | %lli - ; fi
// RUN: if [ %llvmver -lt 18 ]; then %clang++ -fno-exceptions  -ffast-math -mllvm -enable-load-pre=0 -std=c++11 -O3 %s -S -emit-llvm -o - %loadClangEnzyme  -mllvm -enzyme-auto-sparsity=1 | %lli - ; fi
// TODO: if [ %llvmver -ge 12 ]; then %clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0  -std=c++11 -O1 %s -S -emit-llvm -o - %newLoadClangEnzyme -mllvm -enzyme-auto-sparsity=1 -S | %lli - ; fi
// TODO: if [ %llvmver -ge 12 ]; then %clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0  -std=c++11 -O2 %s -S -emit-llvm -o - %newLoadClangEnzyme -mllvm -enzyme-auto-sparsity=1 -S | %lli - ; fi
// TODO: if [ %llvmver -ge 12 ]; then %clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0  -std=c++11 -O3 %s -S -emit-llvm -o - %newLoadClangEnzyme -mllvm -enzyme-auto-sparsity=1 -S | %lli - ; fi

#include <vector>
#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include <string.h>
#include <time.h>

#include "matrix.h"


template<typename T>
__attribute__((always_inline))
static T face_load(unsigned long long offset, T* x, const int* faces) {
    offset /= sizeof(T);
    return x[faces[offset]];
}

template<typename T>
__attribute__((always_inline))
static void face_store(unsigned long long offset, T* x, const int* faces) {
    assert(0 && "store is not legal");
}


template<typename T>
__attribute__((always_inline))
static T area_load(unsigned long long offset, T* pos0, const int* faces) {
    offset /= sizeof(T);

    int idx = offset / 9;

    int inc = offset % 9;

    int i = faces[3*idx];
    int j = faces[3*idx+1];
    int k = faces[3*idx+2];

    /// pos_data[0:3] -> pos[3*faces[i]:3*faces[i]+3]
    /// pos_data[3:6] -> pos[3*faces[j]:3*faces[j]+3]
    /// pos_data[6:9] -> pos[3*faces[k]:3*faces[k]+3]

    if (inc < 3) {
        return pos0[3*i+inc];
    } else if (inc < 6) {
        return pos0[3*j+inc-3];
    } else {
        return pos0[3*k+inc-6];
    }
}

template<typename T>
__attribute__((always_inline))
static void area_store(unsigned long long offset, T* pos0, const int* faces) {
    assert(0 && "store is not legal");
}

template<typename T>
__attribute__((always_inline))
static T eigenstuffM(const T *__restrict__ pos0, size_t n, const int *__restrict__ faces, const T *__restrict__ x) {
    T sum = 0;
    __builtin_assume(n != 0);
    for (size_t idx=0; idx<n; idx++) {
        int i = faces[3*idx];
        int j = faces[3*idx+1];
        int k = faces[3*idx+2];

/*
        T xi = x[i];
        T xj = x[j];
        T xk = x[k];
        */

        T xi = x[3 * idx];  /// x[i] -> real_x[faces[i]] 
        T xj = x[3 * idx + 1];
        T xk = x[3 * idx + 2];

        const T* pos_data = &pos0[9 * idx];
        /// 
        /// pos_data[0:3] -> pos[3*faces[i]:3*faces[i]+3]
        /// pos_data[3:6] -> pos[3*faces[j]:3*faces[j]+3]
        /// pos_data[6:9] -> pos[3*faces[k]:3*faces[k]+3]

        T tri_area = area(&pos_data[0], &pos_data[3], &pos_data[6]);

        sum += (xi * xi + xj * xj + xk * xk) * (1.0 / 3.0) * tri_area;  // barycentric mass lumping
    }
    return sum;
}


// Calculate total energy for all faces in 3D
template<typename T>
__attribute__((always_inline))
static T eigenstuffL(const T *__restrict__ x, size_t num_faces, const int *__restrict__ faces, const T *__restrict__ pos0) {
    T sum = 0;
    __builtin_assume(num_faces != 0);
    for (size_t idx=0; idx<num_faces; idx++) {
        int i = faces[3*idx];
        int j = faces[3*idx+1];
        int k = faces[3*idx+2];

        T X[2][3] = {
            {   pos0[3*j+0] - pos0[3*i+0],
                pos0[3*j+1] - pos0[3*i+1],
                pos0[3*j+2] - pos0[3*i+2]},
            {pos0[3*k+0] - pos0[3*i+0], pos0[3*k+1] - pos0[3*i+1], pos0[3*k+2] - pos0[3*i+2]}
        };

        T pInvX[3][2];
        pseudo_inverse(pInvX, X);
       
        T diffs[] = {x[j] - x[i], x[k] - x[i]};

        T g[3];
        #pragma clang loop unroll(full)
        for (int i = 0; i < 3; ++i) {
            T sum = 0.0f;
            #pragma clang loop unroll(full)
            for (int j = 0; j < 2; ++j) {
                sum += pInvX[i][j] * diffs[j];
            }
            g[i] = sum;
        }

        sum += dot_product<T, 3>(g, g) * area(&pos0[3*i], &pos0[3*j], &pos0[3*k]);
    }

    return sum;
}


template<typename T>
__attribute__((always_inline))
static void gradient_ip(const T *__restrict__ pos0, const size_t num_faces, const int* faces, const T *__restrict__ x, T *__restrict__ out)
{
    __enzyme_autodiff<void>((void *)eigenstuffM<T>,
                            enzyme_const, pos0,
                            enzyme_const, num_faces,
                            enzyme_const, faces,
                            enzyme_dup, x, out);
}

template<typename T>
__attribute__((noinline))
std::vector<Triple<T>> hessian(const T*__restrict__ pos0, size_t num_faces, const int* faces, const T*__restrict__ x, size_t x_pts)
{    
    float* x2 = __enzyme_post_sparse_todense<float*>(face_load<float>, face_store<float>, x, faces);

    /*
    float* x3 = (float*)malloc(sizeof(float)*9*num_faces);
    for (size_t idx=0; idx<num_faces; idx++) {
        int i = faces[3*idx];
        int j = faces[3*idx+1];
        int k = faces[3*idx+2];
        x3[idx * 9 + 0] = pos[3*i+0];
        x3[idx * 9 + 1] = pos[3*i+1];
        x3[idx * 9 + 2] = pos[3*i+2];
        x3[idx * 9 + 3] = pos[3*j+0];
        x3[idx * 9 + 4] = pos[3*j+1];
        x3[idx * 9 + 5] = pos[3*j+2];
        x3[idx * 9 + 6] = pos[3*k+0];
        x3[idx * 9 + 7] = pos[3*k+1];
        x3[idx * 9 + 8] = pos[3*k+2];
    }
    */

    float* pos02 = __enzyme_post_sparse_todense<float*>(area_load<float>, area_store<float>, pos0, faces);
    std::vector<Triple<T>> hess;
    __builtin_assume(x_pts != 0);
    for (size_t i=0; i<3*x_pts; i++)
        __enzyme_fwddiff<void>((void *)gradient_ip<T>,
                                enzyme_const, pos02,
                                enzyme_const, num_faces,
                                enzyme_const, faces,
                                enzyme_dup, x2, __enzyme_todense<T*>(ident_load<T>, ident_store<T>, i),
                                enzyme_dupnoneed, nullptr, __enzyme_todense<T*>(sparse_load<T>, sparse_store<T>, i, &hess));
    return hess;
}

int main(int argc, char** argv) {
    size_t x_pts = 8;

    if (argc >= 2) {
         x_pts = atoi(argv[1]);
    }

    // TODO generate data for more inputs
    assert(x_pts == 8);
    const float x[] = {0.0, 1.0, 0.0};


    const size_t num_faces = 1;
    const int faces[] = {0, 1, 2};

    const float pos0[] = {1.0, 2.0, 3.0, 4.0, 3.0, 2.0, 3.0, 1.0, 3.0};

    // Call eigenstuffM_simple
    struct timeval start, end;
    gettimeofday(&start, NULL);
    const float resultM = eigenstuffM(pos0, num_faces, faces, x);
    gettimeofday(&end, NULL);
    printf("Result for eigenstuffM_simple: %f, runtime:%f\n", resultM, tdiff(&start, &end));

    // Call eigenstuffL_simple
    gettimeofday(&start, NULL);
    const float resultL = eigenstuffL(pos0, num_faces, faces, x);
    gettimeofday(&end, NULL);
    printf("Result for eigenstuffL_simple: %f, runtime:%f\n", resultL, tdiff(&start, &end));

    float dx[sizeof(x)/sizeof(x[0])];
    for (size_t i=0; i<sizeof(dx)/sizeof(x[0]); i++)
        dx[i] = 0;
    gradient_ip(pos0, num_faces, faces, x, dx);

    if (x_pts < 30) {
    for (size_t i=0; i<sizeof(dx)/sizeof(dx[0]); i++)
        printf("eigenstuffM grad_vert[%zu]=%f\n", i, dx[i]);
    }

    gettimeofday(&start, NULL);
    auto hess_x = hessian(pos0, num_faces, faces, x, x_pts);
    gettimeofday(&end, NULL);

    printf("Number of elements %ld\n", hess_x.size());
  
    printf("Runtime %0.6f\n", tdiff(&start, &end));

    if (x_pts <= 8)
    for (auto &hess : hess_x) {
        printf("i=%lu, j=%lu, val=%f\n", hess.row, hess.col, hess.val);
    }

    return 0;
}