stereokit-rust 0.4.0-alpha.22

High-Level Rust bindings around the StereoKitC library for XR
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
/*
Bounding Volume Hierarchy for accelerated ray-triangle intersection testing.

Based on my existing ray tracing code, but also heavily inspired by 
Jacco Bikker's excellent BVH series starting with
https://jacco.ompf2.com/2022/04/13/how-to-build-a-bvh-part-1-basics/

Possible optimizations:
- Use surface area heuristic for determining when to stop
  partitioning, instead of the fixed max leaf size currently used
- Add multiple split plane candidates, particularly binned SAH
- Use a custom float3 value to get rid of vec3 usage in boundingbox, 
  so vec3_field() isn't needed anymore
- SIMD operations for certain computations
- Multi-threaded construction
- Currently, the split step during recursive construction simply gives up 
  when it encounters a bunch of triangles for which none of the 3 split
  axes provide any way to split the triangles into two groups (i.e. all 
  triangles found to be on the left and/or all on the right). In those cases
  currently a leaf node holding all those triangles is created. But an 
  option could be to simply divide the triangles into two groups randomly
  and recurse into the two groups. This will still provide a speedup, as
  bboxes can be tested for each group and discarded early, instead of always
  having to test each triangle separately.

June, 2022
Paul Melis, SURF (paul.melis@surf.nl)
*/
#define NOMINMAX
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <float.h>
#include "bvh.h"
#include "bbox.h"
#include "../stereokit.h"
#include "../sk_memory.h"
#include "../sk_math.h"
#include "../asset_types/mesh.h"
#include "../libraries/sokol_time.h"

//#define VERBOSE_BUILD
//#define VERBOSE_INTERSECTION
//#define VERBOSE_STATS

namespace sk {

const int TRAVERSAL_STACK_SIZE = 128;

// One node in the Bounding Volume Hierarchy

struct bvh_node_t
{
    // The bounding box for this node
    boundingbox bbox;
    
    // If num_triangles == 0 then this is an inner node, 
    // otherwise a leaf node of num_triangles triangles.
    uint32_t    num_triangles;

    // If an inner node this is the index of the left child (with the right child
    // at leaf_first+1). If a leaf node this is the index in sorted_triangles[]
    // of the first triangle.
    uint32_t    leaf_first;

    bool is_leaf() const { return num_triangles > 0; }
};

// Statistics on a built BVH

static void
bvh_stats_clear(bvh_stats_t *stats)
{
    stats->depth = 0;
    stats->num_leafs = 0;
    stats->num_inner_nodes = 0;
    stats->max_leaf_size = 0;
    stats->num_forced_leafs = 0;
}

static void
gather_stats(const mesh_bvh_t *bvh, short depth, int current_node_index, bvh_stats_t *stats, int acc_leaf_size)
{
    const bvh_node_t& node = bvh->nodes[current_node_index];
    stats->depth = (short)maxi(stats->depth, depth);

    if (node.is_leaf())
    {
        stats->num_leafs++;
        stats->max_leaf_size = maxi(stats->max_leaf_size, node.num_triangles);
        if (node.num_triangles > (uint32_t)acc_leaf_size)
            stats->num_forced_leafs++;
    }
    else
    {
        stats->num_inner_nodes++;
        gather_stats(bvh, depth+1, node.leaf_first, stats, acc_leaf_size);
        gather_stats(bvh, depth+1, node.leaf_first+1, stats, acc_leaf_size);
    }
}

void        
mesh_bvh_statistics(const mesh_bvh_t *bvh, bvh_stats_t *stats, int acc_leaf_size)
{
    bvh_stats_clear(stats);
    gather_stats(bvh, 1, 0, stats, acc_leaf_size);
}

// Determine the bounding box that encloses the given triangles,
// which are specified as a sub-sequence of sorted_triangles, 
// i.e. sorted_triangles[first..first+count-1]
static void
bound_triangles(boundingbox& bbox, const uint32_t *sorted_triangles, const vec3 *triangle_vertices, int first, int count)
{
    bbox_clear(bbox);

    for (int t = first; t < first+count; t++)
    {
        const vec3 *p = &triangle_vertices[3*sorted_triangles[t]];
        bbox_update(bbox, p[0]);
        bbox_update(bbox, p[1]);
        bbox_update(bbox, p[2]);
    }

    // Safety margin
    bbox_grow(bbox, C_EPSILON);
}

// Convenience method for indexing a vec3 by coordinate index
inline float
vec3_field(vec3 v, int index)
{
    switch (index)
    {
    case 0:
        return v.x;
    case 1:
        return v.y;
    case 2:
        return v.z;
    default:
        return 0.0f;
    }
}

// Recursively subdivide the triangles in the current leaf node into two groups, 
// by determining a split plane based on the current bound, 
// and sorting the triangles into "left-of" and "right-of".
static void
mesh_bvh_build_recursive(int current_node_index, 
    bvh_node_t *nodes, uint32_t *next_node_index,
    uint32_t *sorted_triangles,
    int acceptable_leaf_size, 
    const vec3* triangle_vertices, const vec3* triangle_centroids, 
    const mesh_collision_t *collision_data)
{
    vec3        bbox_size, bbox_center;
    int         i, j, t;
    short       sorted_dimensions[3];
    short       split_axis;
    float       split_value;

    bvh_node_t& node = nodes[current_node_index];

#ifdef VERBOSE_BUILD
    printf("build_recursive(): current_node_index = %d\n", current_node_index);
    printf("... leaf_first = %d, num_triangles = %d\n", node.leaf_first, node.num_triangles);
    for (i = node.leaf_first; i < node.leaf_first+node.num_triangles; i++)
        printf("[%d] %d  ", i, sorted_triangles[i]);
    printf("\n");
#endif

    if (node.num_triangles <= (uint32_t)acceptable_leaf_size)
    {
#ifdef VERBOSE_BUILD
        printf("Creating a leaf node of %d triangles\n", node.num_triangles);
#endif

        return;
    }

    //
    // Split the triangles into two groups, using a split plane based
    // on largest bbox side
    //
    
    boundingbox& bbox = node.bbox;

#ifdef VERBOSE_BUILD
    printf("... bounding box:\n");
    printf("... (min) %.6f, %.6f, %.6f\n", bbox_min(bbox).x, bbox_min(bbox).y, bbox_min(bbox).z);
    printf("... (max) %.6f, %.6f, %.6f\n", bbox_max(bbox).x, bbox_max(bbox).y, bbox_max(bbox).z);
#endif

    bbox_size = bbox.bounds[1] - bbox.bounds[0];
    bbox_center = 0.5f * (bbox.bounds[0] + bbox.bounds[1]);

    // Sort bbox sides by size in descending order

    sorted_dimensions[0] = 0;
    sorted_dimensions[1] = 1;
    sorted_dimensions[2] = 2;

    // Good old bubble sort :)
    for (i = 0; i < 2; i++)
    {
        for (j = i+1; j < 3; j++)
        {
            if (vec3_field(bbox_size, sorted_dimensions[i])
                <
                vec3_field(bbox_size, sorted_dimensions[j]))
            {
                t = sorted_dimensions[i];
                sorted_dimensions[i] = sorted_dimensions[j];
                sorted_dimensions[j] = (short)t;
            }
        }
    }

    // Iterate over the three split dimensions in descending order of
    // bbox size on that dimension. When the triangles are unsplitable
    // (or we can't split in a favorable way) continue to the next dimension.
    // Create a leaf with all triangles in case none of the dimensions is a 
    // good split candidate.

    uint32_t l, r;
    boundingbox left_bbox, right_bbox;

    for (int d = 0; d < 3; d++)
    {
        // Partition the triangles on the chosen split axis, with the
        // split plane at the center of the bbox

        split_axis = sorted_dimensions[d];
        split_value = vec3_field(bbox_center, split_axis);

#ifdef VERBOSE_BUILD
        printf("Splitting on axis %d, value %.6f\n", split_axis, split_value);

        for (l = node.leaf_first; l < node.leaf_first+node.num_triangles; l++)
        {
            const vec3&p = triangle_centroids[sorted_triangles[l]];
            printf("[%d] tri %d, %6f, %6f, %6f\n", l, sorted_triangles[l], p.x, p.y, p.z);
        }
#endif

        l = node.leaf_first;
        r = l + node.num_triangles - 1;
        while (l <= r)
        {            
            if (vec3_field(triangle_centroids[sorted_triangles[l]], split_axis) < split_value)
                l++;
            else
            {
                uint32_t temp = sorted_triangles[l];
                sorted_triangles[l] = sorted_triangles[r];
                sorted_triangles[r--] = temp;
            }
        }        

        const int num_triangles_left = l - node.leaf_first;
        const int num_triangles_right = node.num_triangles - num_triangles_left;

#ifdef VERBOSE_BUILD
        printf("%d left, %d right\n", num_triangles_left, num_triangles_right);
#endif

        if (num_triangles_left == 0 || num_triangles_right == 0)
        {
            // All triangles in one of the half, discontinue this attempt
#ifdef VERBOSE_BUILD
            printf("Skipping dimension %d, all triangles on one side\n");
#endif
            continue;
        }

        // Triangles are now split into two groups, determine bboxes for each.    

        bound_triangles(left_bbox, sorted_triangles, triangle_vertices, node.leaf_first, num_triangles_left);
        bound_triangles(right_bbox, sorted_triangles, triangle_vertices, l, num_triangles_right);

#ifdef VERBOSE_BUILD
        printf("left bbox:  %.6f, %.6f, %.6f .. %.6f, %.6f, %.6f\n",
            left_bbox.bounds[0].x, left_bbox.bounds[0].y, left_bbox.bounds[0].z,
            left_bbox.bounds[1].x, left_bbox.bounds[1].y, left_bbox.bounds[1].z);
        printf("right bbox: %.6f, %.6f, %.6f .. %.6f, %.6f, %.6f\n",
            right_bbox.bounds[0].x, right_bbox.bounds[0].y, right_bbox.bounds[0].z,
            right_bbox.bounds[1].x, right_bbox.bounds[1].y, right_bbox.bounds[1].z);
#endif

        // Create two child nodes and recurse
        
        const uint32_t left_child_index = *next_node_index;
        const uint32_t right_child_index = left_child_index + 1;
        *next_node_index += 2;

        bvh_node_t& left_node = nodes[left_child_index];
        left_node.bbox = left_bbox;
        left_node.leaf_first = node.leaf_first;
        left_node.num_triangles = num_triangles_left;

        bvh_node_t& right_node = nodes[right_child_index];
        right_node.bbox = right_bbox;
        right_node.leaf_first = l;
        right_node.num_triangles = num_triangles_right;

        // Turn original leaf node into an inner node
        node.leaf_first = left_child_index;
        node.num_triangles = 0;
        
        // XXX Could check for leaf size here and only recurse when needed, instead of
        // doing the check in build_recursive()
        mesh_bvh_build_recursive(left_child_index, nodes, next_node_index, sorted_triangles, acceptable_leaf_size, 
            triangle_vertices, triangle_centroids, collision_data);
        mesh_bvh_build_recursive(right_child_index, nodes, next_node_index, sorted_triangles, acceptable_leaf_size, 
            triangle_vertices, triangle_centroids, collision_data);

        return;
    }

    // Split dimensions exhausted, forced to create a leaf

#ifdef VERBOSE_BUILD
    // XXX warn
    printf("Split options exhausted, creating a leaf of %d triangles\n", node.num_triangles);
#endif

#if 0
    // Save forced leafs to .obj files, for debugging
    char s[256];
    static int next_forced_leaf = 0;
    sprintf(s, "forced_leaf%04d.obj", next_forced_leaf++);
    printf("-> Writing %s\n", s);
    FILE *f = fopen(s, "wt");
    // XXX yuck, saves ALL scene vertices, not just the ones used by the triangles
    for (i = 0; i < the_mesh->vert_count; i++)
        fprintf(f, "v %g %g %g\n", the_mesh->verts[i].pos.x, the_mesh->verts[i].pos.y, the_mesh->verts[i].pos.z);
    // Note: 0-based indices converted to 1-based
    for (t = node.leaf_first; t < node.leaf_first+node.num_triangles; t++)
    {
        const uint32_t &tri = sorted_triangles[t];
        fprintf(f, "f %d %d %d\n", 1+the_mesh->inds[3*tri+0], 1+the_mesh->inds[3*tri+1], 1+the_mesh->inds[3*tri+2]);
    }
    fclose(f);
#endif
}

// Build a BVH over the triangles in the given mesh.
mesh_bvh_t*
mesh_bvh_create(const mesh_t mesh, int acc_leaf_size, bool show_stats)
{
    // XXX refuse if num triangles is zero?
#if defined(VERBOSE_STATS)
    const double t0 = time_get_raw();
#endif

    mesh_bvh_t *bvh = sk_malloc_zero_t(mesh_bvh_t, 1);

    // A restriction during BVH construction is that we don't want to touch the
    // underlying vertex and index arrays in the passed mesh. So we need to keep some local
    // array of triangle indices and sort that during BVH construction.
    // This array needs to be kept around after construction, as the BVH leaf nodes
    // will point into it to list the triangles in each node.
    //
    // Also, we need, either explicitly or implicitly, a bounding box
    // for each triangle during BVH construction. We could pre-compute those triangle 
    // bboxes and store them in a temporary array. But each bbox would take up 6 floats, 
    // i.e. 24 bytes. So for a 1M triangle model  the temporary bbox array uses 24MB. 
    // For typical (smallish) SK meshes this overhead should not be a problem, but for 
    // large meshes on memory-constrained devices it could be. Plus it would come
    // on top of the array mentioned above.
    //
    // Instead, we leverage the existing mesh collision data, which provides
    // an array of triangle vertices, to precompute triangle centroids, which
    // are then used during BVH construction. Whenever a bounding box of a 
    // group of triangles is needed this is computed on-the-fly.

    bvh->collision_data = mesh_get_collision_data(mesh);
    if (bvh->collision_data == nullptr)
    {
        log_err("mesh_bvh_t::build(): no mesh collision data available");
        return nullptr;
    }

    // Compute triangle centroids, used during construction to partition
    // triangles in two groups

    const uint32_t num_triangles = mesh->ind_count / 3;

    const vec3* triangle_vertices = bvh->collision_data->pts;
    vec3* triangle_centroids = sk_malloc_t(vec3, num_triangles);
    
    for (uint32_t t = 0; t < num_triangles; t++) {
        triangle_centroids[t] = 0.33333f * (
            triangle_vertices[3*t+0] + triangle_vertices[3*t+1] + triangle_vertices[3*t+2]
        );
    }

#ifdef VERBOSE_BUILD
    const vert_t* vertices = mesh->verts;
    const vind_t* indices  = mesh->inds;

    printf("%d triangles:\n", num_triangles);
    for (int t = 0; t < num_triangles; t++)
    {
        printf("[tri %d] %d %d %d\n", t, indices[3*t+0], indices[3*t+1], indices[3*t+2]);
        for (int i = 0; i < 3; i++)
            printf("... %.6f %.6f %.6f\n", vertices[indices[3*t+i]].pos.x, vertices[indices[3*t+i]].pos.y, vertices[indices[3*t+i]].pos.z);
        printf("... centroid %.6f, %.6f, %.6f\n", triangle_centroids[t].x, triangle_centroids[t].y, triangle_centroids[t].z);
    }
#endif

    // List of triangle indices, which will get reordered during construction
    uint32_t *sorted_triangles = bvh->sorted_triangles = sk_malloc_t(uint32_t, num_triangles);
    for (uint32_t i = 0; i < num_triangles; i++)
        sorted_triangles[i] = i;

    // Compute mesh bounding box (could reuse what's in mesh_t, but not sure it's accurate)

    boundingbox mesh_bbox;
    bound_triangles(mesh_bbox, sorted_triangles, triangle_vertices, 0, num_triangles-1);

#ifdef VERBOSE_BUILD
    printf("bvh_build():\n");
    printf("... mesh of %d triangles\n", num_triangles);
    printf("... bounding box:\n");
    printf("... (min) %.6f, %.6f, %.6f\n", bbox_min(mesh_bbox).x, bbox_min(mesh_bbox).y, bbox_min(mesh_bbox).z);
    printf("... (max) %.6f, %.6f, %.6f\n", bbox_max(mesh_bbox).x, bbox_max(mesh_bbox).y, bbox_max(mesh_bbox).z);
    printf("... acceptable leaf size %d\n", acc_leaf_size);
#endif

    bbox_grow(mesh_bbox, C_EPSILON);
    bvh->the_mesh = mesh;

    // We pre-allocate an array of BVH nodes, enough to always fit. 
    // XXX After construction is done, and the actual
    // number of nodes needed is known we should trim
    // the array.

    bvh_node_t *nodes = bvh->nodes = sk_malloc_t(bvh_node_t, num_triangles*2);

    // Bootstrap with a single leaf node holding all triangles
    
    bvh_node_t& root_node = nodes[0];
    root_node.leaf_first = 0;
    root_node.num_triangles = num_triangles;
    root_node.bbox = mesh_bbox;

    // XXX could return value from build function, instead of using a variable
    uint32_t next_node_index = 1;

    // Build the BVH

    mesh_bvh_build_recursive(0, nodes, &next_node_index, sorted_triangles,
        acc_leaf_size, triangle_vertices, triangle_centroids, bvh->collision_data);

#if defined(VERBOSE_STATS)
    const double t1 = time_get_raw();

    // XXX use log function
    printf("BVH construction done in %.1fms\n", 1000*(t1-t0));

    // Done!

    if (show_stats)
    {
        printf("BVH statistics:\n");
        printf("... %d triangles\n", bvh->the_mesh->ind_count/3);
        bvh_stats_t stats;
        mesh_bvh_statistics(bvh, &stats, acc_leaf_size);
        printf("... depth %d\n", stats.depth);
        printf("... %d leaf nodes, %d inner nodes\n",
            stats.num_leafs, stats.num_inner_nodes);
        printf("... maximum leaf size %d\n", stats.max_leaf_size);
        printf("... %d forced leafs (%.1f%%)\n", stats.num_forced_leafs, 100.0f * stats.num_forced_leafs / stats.num_leafs);
    }
#endif

    // Clean up

    sk_free(triangle_centroids);

    return bvh;
}

void
mesh_bvh_destroy(mesh_bvh_t *bvh)
{
    sk_free(bvh->nodes);
    sk_free(bvh->sorted_triangles);
    *bvh = {};
}

// Find closest triangle intersection for the given model-space ray
bool
mesh_bvh_intersect(const mesh_bvh_t *bvh, ray_t model_space_ray, ray_t *out_pt, uint32_t *out_start_inds, cull_ cull_mode)
{
    const bvh_node_t *nodes = bvh->nodes;
    const uint32_t *sorted_triangles = bvh->sorted_triangles;
    const mesh_collision_t *collision_data = bvh->collision_data;

    // Use local stack to avoid having to recurse
    uint32_t traversal_node_stack[TRAVERSAL_STACK_SIZE];
    float traversal_tmin_stack[TRAVERSAL_STACK_SIZE];

    uint32_t left_child;
    bool traverse_left_child, traverse_right_child;
    
    float t_left_min, t_right_min;
    float t_left_max, t_right_max;
    float t_hit, t_nearest_hit = FLT_MAX;
    float nearest_dist = FLT_MAX;
    vec3  pt = {};

    // Stack is initially empty
    short stack_top = -1;

    // Start at root node
    uint32_t current_node_index = 0;

    // Pre-compute a ray value that's quicker to test against the bboxes
    bbox_ray_t bbox_ray(model_space_ray);

    while (true)
    {        
#ifdef VERBOSE_INTERSECTION
        printf("t_nearest_hit = %.6f\n", t_nearest_hit);
        printf("current node = %d\n", current_node_index);
        printf("stack_top = %d\n", stack_top);

        printf("stack now:\n");
        for (int i = 0; i <= stack_top; i++)
            printf("%d (%.6f)\n", traversal_node_stack[i], traversal_tmin_stack[i]);
#endif

        const bvh_node_t& node = nodes[current_node_index];

        if (!node.is_leaf())
        {
            // Check for traversal down to the child nodes of this inner node
            left_child = node.leaf_first;

            traverse_left_child = bbox_intersect_full(nodes[left_child].bbox, t_left_min, t_left_max, bbox_ray, 0.0f, t_nearest_hit);
            traverse_right_child = bbox_intersect_full(nodes[left_child+1].bbox, t_right_min, t_right_max, bbox_ray, 0.0f, t_nearest_hit);

#ifdef VERBOSE_INTERSECTION
            if (traverse_left_child)
                printf("traverse left, node %d, t=%.6f..%.6f\n", left_child, t_left_min, t_left_max);
            if (traverse_right_child)
                printf("traverse right, node %d, t=%.6f..%.6f\n", left_child+1, t_right_min, t_right_max);
#endif

            if (traverse_left_child)
            {
                if (traverse_right_child)
                {
                    // Traverse both children, but traverse into closest one first

                    if (t_left_min < t_right_min)
                    {
                        // Left, possibly right

                        // Push right
                        stack_top++;
                        traversal_node_stack[stack_top] = left_child+1;
                        traversal_tmin_stack[stack_top] = t_right_min;

                        // Traverse to left
#ifdef VERBOSE_INTERSECTION
                        printf("traversing left, then right\n");
#endif
                        current_node_index = left_child;
                    }
                    else
                    {
                        // Right, possibly left

                        // Push left
                        stack_top++;
                        traversal_node_stack[stack_top] = left_child;
                        traversal_tmin_stack[stack_top] = t_left_min;

                        // Traverse to right
#ifdef VERBOSE_INTERSECTION
                        printf("traversing right, then left\n");
#endif
                        current_node_index = left_child+1;
                    }
                }
                else
                {
                    // Traverse left only
#ifdef VERBOSE_INTERSECTION
                    printf("traversing left only\n");
#endif                    
                    current_node_index = left_child;
                }

                continue;
            }
            else if (traverse_right_child)
            {
                // Traverse right only
#ifdef VERBOSE_INTERSECTION
                printf("traversing right only\n");
#endif
                current_node_index = left_child+1;
                continue;
            }
        }
        else
        {
            // Intersect ray with triangles in this leaf node

#ifdef VERBOSE_INTERSECTION
            printf("Checking %d triangles\n", node.num_triangles);
#endif
            for (uint32_t t = node.leaf_first; t < node.leaf_first+node.num_triangles; t++)
            {
                uint32_t triangle = sorted_triangles[t];
                const plane_t& plane = collision_data->planes[triangle];

                // Inline version of plane_ray_intersect(), as we need the t value
                // XXX use cull_mode value based on dot denom
                float denom = vec3_dot(model_space_ray.dir, plane.normal);

                if (fabsf(denom) < 1e-6f)
                {
                    // Ray direction (almost) perpendicular to plane, no intersection
                    continue;
                }

                if ((cull_mode == cull_front && denom < 0) || (cull_mode == cull_back && denom > 0))
                {
                    // Front/back-face culling
                    // XXX is there a smaller test?
                    continue;
                }

                t_hit = -(vec3_dot(model_space_ray.pos, plane.normal) + plane.d) / denom;

                if (t_hit >= t_nearest_hit)
                {
                    // Hit will not be closer than best one found so far, no need to check further
                    continue;
                }

                pt = model_space_ray.pos + model_space_ray.dir * t_hit;

                // point in triangle, implementation based on:
                // https://blackpawn.com/texts/pointinpoly/default.html

                // Compute vectors
                vec3 v0 = collision_data->pts[3*triangle+1] - collision_data->pts[3*triangle+0];
                vec3 v1 = collision_data->pts[3*triangle+2] - collision_data->pts[3*triangle+0];
                vec3 v2 = pt - collision_data->pts[3*triangle+0];

                // Compute dot products
                float dot00 = vec3_dot(v0, v0);
                float dot01 = vec3_dot(v0, v1);
                float dot02 = vec3_dot(v0, v2);
                float dot11 = vec3_dot(v1, v1);
                float dot12 = vec3_dot(v1, v2);

                // Compute barycentric coordinates
                float inv_denom = 1.0f / (dot00 * dot11 - dot01 * dot01);
                float u = (dot11 * dot02 - dot01 * dot12) * inv_denom;
                float v = (dot00 * dot12 - dot01 * dot02) * inv_denom;

                // Check if point is in triangle
                if ((u >= 0) && (v >= 0) && (u + v < 1)) {
                    float dist = vec3_magnitude_sq(pt - model_space_ray.pos);
                    if (t_hit > 0 && t_hit < t_nearest_hit) {
#ifdef VERBOSE_INTERSECTION
                        printf("Found new hit at t = %.6f, distance^2 = %.6f\n", t_hit, dist);
#endif                        
                        t_nearest_hit = t_hit;
                        nearest_dist = dist;
                        if (out_start_inds != nullptr) {
                            *out_start_inds = 3*triangle;
                        }
                        *out_pt = {pt, collision_data->planes[triangle].normal};
                    }
                }
            }
        }

        // Pop from stack

        if (stack_top == -1)
        {
            // Empty stack, done!
            return nearest_dist != FLT_MAX;
        }

        // Keep popping next node to visit until we find a node whose bbox
        // is intersected closer to the ray origin than the best hit
        // found so far (or the stack becomes empty)

#ifdef VERBOSE_INTERSECTION
        printf("popping from stack:\n");
        for (int i = 0; i <= stack_top; i++)
            printf("%d (%.6f)\n", traversal_node_stack[i], traversal_tmin_stack[i]);
#endif

        while (stack_top >= 0)
        {
            current_node_index = traversal_node_stack[stack_top];
            if (traversal_tmin_stack[stack_top--] < t_nearest_hit)
            {
                // Traverse to node
                break;
            }

            if (stack_top == -1)
            {
                // Empty stack, done!
                return nearest_dist != FLT_MAX;
            }
        }
    }
}

} // namespace sk