rustsim-crowd 0.0.1

Microscopic crowd and pedestrian locomotion for rustsim: 2-D and layered 3-D, with Social Force, Collision-Free Speed, Generalized Centrifugal Force, Optimal Steps, and Anticipation Velocity models
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
//! CUDA implementation of [Anticipation Velocity](crate::anticipation_velocity).
//!
//! Closes the AVM slot of the P0-1 "CUDA kernels for AVM, OSM" item
//! from `docs/rustsim-crowd.md`. Mirrors the CFS arm closely (AVM is
//! also a first-order kinematic model; the kernel writes `vel_out`
//! and `pos_out` directly without an acceleration column), with one
//! extra step:
//!
//! - Per pedestrian, sample `num_directions` candidate directions in
//!   a forward cone of half-angle `cone_half_angle` around the
//!   desired-direction angle. For each candidate, compute the
//!   *anticipated* headroom (every neighbour extrapolated with
//!   constant velocity over `anticipation_time` seconds; walls
//!   treated as static via the closest-point geometric padding from
//!   Xu, Chraibi & Seyfried 2021 Eq. 7).
//! - Pick the candidate maximising `headroom - deviation_weight *
//!   |t| * cone_half_angle`, where `t ∈ [-1, 1]` is the cone
//!   parameter.
//! - Speed is then clamped exactly as in CFS:
//!   `v = min(eff_v0, max(0, (s - radius) / time_gap))`.
//!
//! Internal `f32` arithmetic; NVRTC at runtime, no CUDA toolkit at
//! build time. `CudaState::step` is the per-tick entry point;
//! [`step_with_fallback`] is the production wrapper that retries on
//! CPU when CUDA init or the kernel launch fails.

use cudarc::driver::{
    CudaContext, CudaFunction, CudaSlice, CudaStream, LaunchConfig, PushKernelArg,
};
use cudarc::nvrtc::compile_ptx;
use std::sync::Arc;

use crate::anticipation_velocity::Params;
use crate::common::{Pedestrian, WallSegment};

/// CUDA source for the AVM step kernel.
///
/// Mirrors `anticipation_velocity::best_direction_and_headroom` +
/// `anticipated_headroom` exactly (modulo `f32` precision) and the
/// same kinematic speed cap as CFS.
const AVM_CUDA_SRC: &str = r#"
extern "C" __device__ float avm_anticipated_headroom(
    unsigned int i,
    float ex, float ey,
    float pxi, float pyi, float ri, float v0_i,
    const float* pos_x_in,
    const float* pos_y_in,
    const float* vel_x_in,
    const float* vel_y_in,
    const float* radius,
    const float* wall_ax,
    const float* wall_ay,
    const float* wall_bx,
    const float* wall_by,
    unsigned int n,
    unsigned int n_walls,
    float anticipation_time,
    float wall_range)
{
    float s = 1.0e30f;

    // Pedestrian neighbours: extrapolate by anticipation_time then test
    // forward + lateral.
    for (unsigned int j = 0; j < n; ++j) {
        if (j == i) continue;
        float qfx = pos_x_in[j] + vel_x_in[j] * anticipation_time;
        float qfy = pos_y_in[j] + vel_y_in[j] * anticipation_time;
        float relx = qfx - pxi;
        float rely = qfy - pyi;
        float forward = relx * ex + rely * ey;
        if (forward <= 0.0f) continue;
        float projx = ex * forward;
        float projy = ey * forward;
        float latx = relx - projx;
        float laty = rely - projy;
        float lat = sqrtf(latx * latx + laty * laty);
        float r_sum = ri + radius[j];
        if (lat > r_sum) continue;
        float d = sqrtf(relx * relx + rely * rely);
        if (d < s) s = d;
    }

    // Walls: Xu et al. 2021 Eq. 7 closest-point geometric form.
    float horizon = v0_i * anticipation_time;
    float probex = pxi + ex * horizon;
    float probey = pyi + ey * horizon;
    for (unsigned int k = 0; k < n_walls; ++k) {
        float wax = wall_ax[k];
        float way = wall_ay[k];
        float wbx = wall_bx[k];
        float wby = wall_by[k];
        float abx = wbx - wax;
        float aby = wby - way;
        float denom = abx * abx + aby * aby;
        float t = 0.0f;
        if (denom > 1.0e-18f) {
            t = ((probex - wax) * abx + (probey - way) * aby) / denom;
            if (t < 0.0f) t = 0.0f;
            if (t > 1.0f) t = 1.0f;
        }
        float cpx = wax + t * abx;
        float cpy = way + t * aby;
        float relx = cpx - pxi;
        float rely = cpy - pyi;
        float forward = relx * ex + rely * ey;
        if (forward <= 0.0f) continue;
        float projx = ex * forward;
        float projy = ey * forward;
        float latx = relx - projx;
        float laty = rely - projy;
        float lat = sqrtf(latx * latx + laty * laty);
        if (lat > ri + wall_range) continue;
        float d = sqrtf(relx * relx + rely * rely);
        if (d < s) s = d;
    }
    return s;
}

extern "C" __global__ void avm_step(
    const float* __restrict__ pos_x_in,
    const float* __restrict__ pos_y_in,
    const float* __restrict__ vel_x_in,
    const float* __restrict__ vel_y_in,
    const float* __restrict__ radius,
    const float* __restrict__ desired_speed,
    const float* __restrict__ dest_x,
    const float* __restrict__ dest_y,
    float* __restrict__ pos_x_out,
    float* __restrict__ pos_y_out,
    float* __restrict__ vel_x_out,
    float* __restrict__ vel_y_out,
    const float* __restrict__ wall_ax,
    const float* __restrict__ wall_ay,
    const float* __restrict__ wall_bx,
    const float* __restrict__ wall_by,
    unsigned int n,
    unsigned int n_walls,
    float time_gap,
    float anticipation_time,
    unsigned int num_directions,
    float cone_half_angle,
    float deviation_weight,
    float wall_range,
    float arrival_radius,
    float dt)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;

    float pxi = pos_x_in[i];
    float pyi = pos_y_in[i];
    float ri  = radius[i];
    float v0  = desired_speed[i];
    float dxi = dest_x[i];
    float dyi = dest_y[i];

    // Desired direction toward destination + arrival taper.
    float to_dx = dxi - pxi;
    float to_dy = dyi - pyi;
    float d_to_dest = sqrtf(to_dx * to_dx + to_dy * to_dy);
    float edx = 0.0f;
    float edy = 0.0f;
    if (d_to_dest > 1.0e-12f) {
        edx = to_dx / d_to_dest;
        edy = to_dy / d_to_dest;
    }
    float eff_v0 = v0;
    if (arrival_radius > 0.0f && d_to_dest < arrival_radius) {
        eff_v0 = v0 * (d_to_dest / arrival_radius);
    }

    // Already-arrived (zero desired direction): write zero velocity, keep position.
    if (edx == 0.0f && edy == 0.0f) {
        pos_x_out[i] = pxi;
        pos_y_out[i] = pyi;
        vel_x_out[i] = 0.0f;
        vel_y_out[i] = 0.0f;
        return;
    }

    float base_angle = atan2f(edy, edx);
    unsigned int m = num_directions;
    if (m < 1) m = 1;

    // Seed best with the desired direction itself (matches CPU).
    float best_dx = edx;
    float best_dy = edy;
    float best_headroom = avm_anticipated_headroom(
        i, edx, edy, pxi, pyi, ri, v0,
        pos_x_in, pos_y_in, vel_x_in, vel_y_in, radius,
        wall_ax, wall_ay, wall_bx, wall_by,
        n, n_walls, anticipation_time, wall_range);
    float best_score = best_headroom;

    for (unsigned int k = 0; k < m; ++k) {
        float t;
        if (m == 1) {
            t = 0.0f;
        } else {
            t = -1.0f + 2.0f * (float)k / (float)(m - 1);
        }
        float theta = base_angle + t * cone_half_angle;
        float cdx = cosf(theta);
        float cdy = sinf(theta);
        float headroom = avm_anticipated_headroom(
            i, cdx, cdy, pxi, pyi, ri, v0,
            pos_x_in, pos_y_in, vel_x_in, vel_y_in, radius,
            wall_ax, wall_ay, wall_bx, wall_by,
            n, n_walls, anticipation_time, wall_range);
        float t_abs = t < 0.0f ? -t : t;
        float deviation = deviation_weight * t_abs * cone_half_angle;
        float score = headroom - deviation;
        if (score > best_score) {
            best_score = score;
            best_headroom = headroom;
            best_dx = cdx;
            best_dy = cdy;
        }
    }

    // Speed cap from anticipated headroom.
    float cap = (best_headroom - ri) / time_gap;
    if (cap < 0.0f) cap = 0.0f;
    float v = eff_v0 < cap ? eff_v0 : cap;

    float vx_new = best_dx * v;
    float vy_new = best_dy * v;
    float px_new = pxi + vx_new * dt;
    float py_new = pyi + vy_new * dt;

    pos_x_out[i] = px_new;
    pos_y_out[i] = py_new;
    vel_x_out[i] = vx_new;
    vel_y_out[i] = vy_new;
}
"#;

/// Persistent CUDA state for the AVM kernel.
pub struct CudaState {
    ctx: Arc<CudaContext>,
    stream: Arc<CudaStream>,
    func: CudaFunction,
    block_size: u32,
}

impl CudaState {
    /// Initialise CUDA on device 0, compile the AVM kernel via NVRTC,
    /// and load it.
    pub fn new() -> Result<Self, String> {
        Self::with_block_size(256)
    }

    /// Like [`CudaState::new`] but with a configurable CUDA block size.
    pub fn with_block_size(block_size: u32) -> Result<Self, String> {
        if block_size == 0 {
            return Err("block_size must be positive".to_string());
        }
        let ctx = super::new_context(0)?;
        let stream = ctx.default_stream();
        let ptx = compile_ptx(AVM_CUDA_SRC).map_err(|e| format!("NVRTC compile failed: {e}"))?;
        let module = ctx
            .load_module(ptx)
            .map_err(|e| format!("module load failed: {e}"))?;
        let func = module
            .load_function("avm_step")
            .map_err(|e| format!("kernel lookup failed: {e}"))?;
        Ok(Self {
            ctx,
            stream,
            func,
            block_size,
        })
    }

    /// Advance `peds` by one tick of `dt` seconds on the GPU.
    ///
    /// Stateless w.r.t. device memory. Returns the kernel execution
    /// time in microseconds.
    pub fn step(
        &self,
        peds: &mut [Pedestrian],
        walls: &[WallSegment],
        params: &Params,
        dt: f64,
    ) -> Result<u128, String> {
        let n = peds.len();
        if n == 0 {
            return Ok(0);
        }

        let stream = &self.stream;
        let mut pos_x: Vec<f32> = Vec::with_capacity(n);
        let mut pos_y: Vec<f32> = Vec::with_capacity(n);
        let mut vel_x: Vec<f32> = Vec::with_capacity(n);
        let mut vel_y: Vec<f32> = Vec::with_capacity(n);
        let mut radius: Vec<f32> = Vec::with_capacity(n);
        let mut desired_speed: Vec<f32> = Vec::with_capacity(n);
        let mut dest_x: Vec<f32> = Vec::with_capacity(n);
        let mut dest_y: Vec<f32> = Vec::with_capacity(n);
        for p in peds.iter() {
            pos_x.push(p.pos[0] as f32);
            pos_y.push(p.pos[1] as f32);
            vel_x.push(p.vel[0] as f32);
            vel_y.push(p.vel[1] as f32);
            radius.push(p.radius as f32);
            desired_speed.push(p.desired_speed as f32);
            dest_x.push(p.destination[0] as f32);
            dest_y.push(p.destination[1] as f32);
        }

        let n_walls = walls.len();
        let mut wall_ax: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        let mut wall_ay: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        let mut wall_bx: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        let mut wall_by: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        if n_walls == 0 {
            wall_ax.push(0.0);
            wall_ay.push(0.0);
            wall_bx.push(0.0);
            wall_by.push(0.0);
        } else {
            for w in walls {
                wall_ax.push(w.a[0] as f32);
                wall_ay.push(w.a[1] as f32);
                wall_bx.push(w.b[0] as f32);
                wall_by.push(w.b[1] as f32);
            }
        }

        let d_pos_x: CudaSlice<f32> = stream
            .clone_htod(&pos_x)
            .map_err(|e| format!("htod pos_x failed: {e}"))?;
        let d_pos_y: CudaSlice<f32> = stream
            .clone_htod(&pos_y)
            .map_err(|e| format!("htod pos_y failed: {e}"))?;
        let d_vel_x: CudaSlice<f32> = stream
            .clone_htod(&vel_x)
            .map_err(|e| format!("htod vel_x failed: {e}"))?;
        let d_vel_y: CudaSlice<f32> = stream
            .clone_htod(&vel_y)
            .map_err(|e| format!("htod vel_y failed: {e}"))?;
        let d_radius: CudaSlice<f32> = stream
            .clone_htod(&radius)
            .map_err(|e| format!("htod radius failed: {e}"))?;
        let d_desired_speed: CudaSlice<f32> = stream
            .clone_htod(&desired_speed)
            .map_err(|e| format!("htod desired_speed failed: {e}"))?;
        let d_dest_x: CudaSlice<f32> = stream
            .clone_htod(&dest_x)
            .map_err(|e| format!("htod dest_x failed: {e}"))?;
        let d_dest_y: CudaSlice<f32> = stream
            .clone_htod(&dest_y)
            .map_err(|e| format!("htod dest_y failed: {e}"))?;
        let mut d_pos_x_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_x_out failed: {e}"))?;
        let mut d_pos_y_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_y_out failed: {e}"))?;
        let mut d_vel_x_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_x_out failed: {e}"))?;
        let mut d_vel_y_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_y_out failed: {e}"))?;
        let d_wall_ax: CudaSlice<f32> = stream
            .clone_htod(&wall_ax)
            .map_err(|e| format!("htod wall_ax failed: {e}"))?;
        let d_wall_ay: CudaSlice<f32> = stream
            .clone_htod(&wall_ay)
            .map_err(|e| format!("htod wall_ay failed: {e}"))?;
        let d_wall_bx: CudaSlice<f32> = stream
            .clone_htod(&wall_bx)
            .map_err(|e| format!("htod wall_bx failed: {e}"))?;
        let d_wall_by: CudaSlice<f32> = stream
            .clone_htod(&wall_by)
            .map_err(|e| format!("htod wall_by failed: {e}"))?;

        let n_u32 = n as u32;
        let n_walls_u32 = n_walls as u32;
        let time_gap = params.time_gap as f32;
        let anticipation_time = params.anticipation_time as f32;
        let num_directions = params.num_directions as u32;
        let cone_half_angle = params.cone_half_angle as f32;
        let deviation_weight = params.deviation_weight as f32;
        let wall_range = params.wall_range as f32;
        let arrival_radius = params.arrival_radius as f32;
        let dt_f32 = dt as f32;

        let grid = n.div_ceil(self.block_size as usize) as u32;
        let cfg = LaunchConfig {
            grid_dim: (grid.max(1), 1, 1),
            block_dim: (self.block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        let t0 = std::time::Instant::now();

        // SAFETY: see `cuda::collision_free_speed` for the contract.
        unsafe {
            let mut b = stream.launch_builder(&self.func);
            b.arg(&d_pos_x);
            b.arg(&d_pos_y);
            b.arg(&d_vel_x);
            b.arg(&d_vel_y);
            b.arg(&d_radius);
            b.arg(&d_desired_speed);
            b.arg(&d_dest_x);
            b.arg(&d_dest_y);
            b.arg(&mut d_pos_x_out);
            b.arg(&mut d_pos_y_out);
            b.arg(&mut d_vel_x_out);
            b.arg(&mut d_vel_y_out);
            b.arg(&d_wall_ax);
            b.arg(&d_wall_ay);
            b.arg(&d_wall_bx);
            b.arg(&d_wall_by);
            b.arg(&n_u32);
            b.arg(&n_walls_u32);
            b.arg(&time_gap);
            b.arg(&anticipation_time);
            b.arg(&num_directions);
            b.arg(&cone_half_angle);
            b.arg(&deviation_weight);
            b.arg(&wall_range);
            b.arg(&arrival_radius);
            b.arg(&dt_f32);
            b.launch(cfg)
                .map_err(|e| format!("kernel launch failed: {e}"))?;
        }

        stream
            .synchronize()
            .map_err(|e| format!("stream sync failed: {e}"))?;
        let kernel_us = t0.elapsed().as_micros();

        stream
            .memcpy_dtoh(&d_pos_x_out, &mut pos_x)
            .map_err(|e| format!("dtoh pos_x failed: {e}"))?;
        stream
            .memcpy_dtoh(&d_pos_y_out, &mut pos_y)
            .map_err(|e| format!("dtoh pos_y failed: {e}"))?;
        stream
            .memcpy_dtoh(&d_vel_x_out, &mut vel_x)
            .map_err(|e| format!("dtoh vel_x failed: {e}"))?;
        stream
            .memcpy_dtoh(&d_vel_y_out, &mut vel_y)
            .map_err(|e| format!("dtoh vel_y failed: {e}"))?;

        for (i, p) in peds.iter_mut().enumerate() {
            p.pos = [pos_x[i] as f64, pos_y[i] as f64];
            p.vel = [vel_x[i] as f64, vel_y[i] as f64];
        }

        let _ = &self.ctx;
        Ok(kernel_us)
    }
}

/// Device-resident AVM state for production tick loops.
///
/// Uploads the initial pedestrian/wall columns once, advances with zero
/// host-device traffic between ticks, and downloads only when the caller asks
/// for a host snapshot.
pub struct CudaResident {
    ctx: Arc<CudaContext>,
    stream: Arc<CudaStream>,
    func: CudaFunction,
    block_size: u32,
    n: usize,
    n_walls: usize,
    d_pos_x_a: CudaSlice<f32>,
    d_pos_y_a: CudaSlice<f32>,
    d_vel_x_a: CudaSlice<f32>,
    d_vel_y_a: CudaSlice<f32>,
    d_pos_x_b: CudaSlice<f32>,
    d_pos_y_b: CudaSlice<f32>,
    d_vel_x_b: CudaSlice<f32>,
    d_vel_y_b: CudaSlice<f32>,
    d_radius: CudaSlice<f32>,
    d_desired_speed: CudaSlice<f32>,
    d_dest_x: CudaSlice<f32>,
    d_dest_y: CudaSlice<f32>,
    d_wall_ax: CudaSlice<f32>,
    d_wall_ay: CudaSlice<f32>,
    d_wall_bx: CudaSlice<f32>,
    d_wall_by: CudaSlice<f32>,
}

impl CudaResident {
    /// Upload an initial pedestrian and wall configuration to the device.
    pub fn upload(peds: &[Pedestrian], walls: &[WallSegment]) -> Result<Self, String> {
        Self::upload_with_block_size(peds, walls, 256)
    }

    /// Like [`CudaResident::upload`] but with a configurable CUDA block size.
    pub fn upload_with_block_size(
        peds: &[Pedestrian],
        walls: &[WallSegment],
        block_size: u32,
    ) -> Result<Self, String> {
        if block_size == 0 {
            return Err("block_size must be positive".to_string());
        }
        let n = peds.len();
        if n == 0 {
            return Err("CudaResident::upload requires at least one pedestrian".to_string());
        }

        let state = CudaState::with_block_size(block_size)?;
        let stream = state.stream.clone();

        let mut pos_x = Vec::with_capacity(n);
        let mut pos_y = Vec::with_capacity(n);
        let mut vel_x = Vec::with_capacity(n);
        let mut vel_y = Vec::with_capacity(n);
        let mut radius = Vec::with_capacity(n);
        let mut desired_speed = Vec::with_capacity(n);
        let mut dest_x = Vec::with_capacity(n);
        let mut dest_y = Vec::with_capacity(n);
        for p in peds {
            pos_x.push(p.pos[0] as f32);
            pos_y.push(p.pos[1] as f32);
            vel_x.push(p.vel[0] as f32);
            vel_y.push(p.vel[1] as f32);
            radius.push(p.radius as f32);
            desired_speed.push(p.desired_speed as f32);
            dest_x.push(p.destination[0] as f32);
            dest_y.push(p.destination[1] as f32);
        }

        let n_walls = walls.len();
        let (wall_ax, wall_ay, wall_bx, wall_by) = wall_columns(walls);

        let d_pos_x_a = stream
            .clone_htod(&pos_x)
            .map_err(|e| format!("htod pos_x failed: {e}"))?;
        let d_pos_y_a = stream
            .clone_htod(&pos_y)
            .map_err(|e| format!("htod pos_y failed: {e}"))?;
        let d_vel_x_a = stream
            .clone_htod(&vel_x)
            .map_err(|e| format!("htod vel_x failed: {e}"))?;
        let d_vel_y_a = stream
            .clone_htod(&vel_y)
            .map_err(|e| format!("htod vel_y failed: {e}"))?;
        let d_pos_x_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_x_b failed: {e}"))?;
        let d_pos_y_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_y_b failed: {e}"))?;
        let d_vel_x_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_x_b failed: {e}"))?;
        let d_vel_y_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_y_b failed: {e}"))?;
        let d_radius = stream
            .clone_htod(&radius)
            .map_err(|e| format!("htod radius failed: {e}"))?;
        let d_desired_speed = stream
            .clone_htod(&desired_speed)
            .map_err(|e| format!("htod desired_speed failed: {e}"))?;
        let d_dest_x = stream
            .clone_htod(&dest_x)
            .map_err(|e| format!("htod dest_x failed: {e}"))?;
        let d_dest_y = stream
            .clone_htod(&dest_y)
            .map_err(|e| format!("htod dest_y failed: {e}"))?;
        let d_wall_ax = stream
            .clone_htod(&wall_ax)
            .map_err(|e| format!("htod wall_ax failed: {e}"))?;
        let d_wall_ay = stream
            .clone_htod(&wall_ay)
            .map_err(|e| format!("htod wall_ay failed: {e}"))?;
        let d_wall_bx = stream
            .clone_htod(&wall_bx)
            .map_err(|e| format!("htod wall_bx failed: {e}"))?;
        let d_wall_by = stream
            .clone_htod(&wall_by)
            .map_err(|e| format!("htod wall_by failed: {e}"))?;

        stream
            .synchronize()
            .map_err(|e| format!("initial sync failed: {e}"))?;

        Ok(Self {
            ctx: state.ctx,
            stream,
            func: state.func,
            block_size,
            n,
            n_walls,
            d_pos_x_a,
            d_pos_y_a,
            d_vel_x_a,
            d_vel_y_a,
            d_pos_x_b,
            d_pos_y_b,
            d_vel_x_b,
            d_vel_y_b,
            d_radius,
            d_desired_speed,
            d_dest_x,
            d_dest_y,
            d_wall_ax,
            d_wall_ay,
            d_wall_bx,
            d_wall_by,
        })
    }

    /// Number of resident pedestrians.
    #[inline]
    pub fn len(&self) -> usize {
        self.n
    }

    /// Whether any pedestrians are resident.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.n == 0
    }

    /// Advance one AVM tick fully on the GPU.
    pub fn step(&mut self, params: &Params, dt: f64) -> Result<u128, String> {
        let n_u32 = self.n as u32;
        let n_walls_u32 = self.n_walls as u32;
        let time_gap = params.time_gap as f32;
        let anticipation_time = params.anticipation_time as f32;
        let num_directions = params.num_directions as u32;
        let cone_half_angle = params.cone_half_angle as f32;
        let deviation_weight = params.deviation_weight as f32;
        let wall_range = params.wall_range as f32;
        let arrival_radius = params.arrival_radius as f32;
        let dt_f32 = dt as f32;

        let grid = self.n.div_ceil(self.block_size as usize) as u32;
        let cfg = LaunchConfig {
            grid_dim: (grid.max(1), 1, 1),
            block_dim: (self.block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        let t0 = std::time::Instant::now();
        unsafe {
            let mut b = self.stream.launch_builder(&self.func);
            b.arg(&self.d_pos_x_a);
            b.arg(&self.d_pos_y_a);
            b.arg(&self.d_vel_x_a);
            b.arg(&self.d_vel_y_a);
            b.arg(&self.d_radius);
            b.arg(&self.d_desired_speed);
            b.arg(&self.d_dest_x);
            b.arg(&self.d_dest_y);
            b.arg(&mut self.d_pos_x_b);
            b.arg(&mut self.d_pos_y_b);
            b.arg(&mut self.d_vel_x_b);
            b.arg(&mut self.d_vel_y_b);
            b.arg(&self.d_wall_ax);
            b.arg(&self.d_wall_ay);
            b.arg(&self.d_wall_bx);
            b.arg(&self.d_wall_by);
            b.arg(&n_u32);
            b.arg(&n_walls_u32);
            b.arg(&time_gap);
            b.arg(&anticipation_time);
            b.arg(&num_directions);
            b.arg(&cone_half_angle);
            b.arg(&deviation_weight);
            b.arg(&wall_range);
            b.arg(&arrival_radius);
            b.arg(&dt_f32);
            b.launch(cfg)
                .map_err(|e| format!("kernel launch failed: {e}"))?;
        }
        self.stream
            .synchronize()
            .map_err(|e| format!("stream sync failed: {e}"))?;
        let kernel_us = t0.elapsed().as_micros();

        std::mem::swap(&mut self.d_pos_x_a, &mut self.d_pos_x_b);
        std::mem::swap(&mut self.d_pos_y_a, &mut self.d_pos_y_b);
        std::mem::swap(&mut self.d_vel_x_a, &mut self.d_vel_x_b);
        std::mem::swap(&mut self.d_vel_y_a, &mut self.d_vel_y_b);

        let _ = &self.ctx;
        Ok(kernel_us)
    }

    /// Bulk download of the current resident state back into `peds`.
    pub fn download(&self, peds: &mut Vec<Pedestrian>) -> Result<(), String> {
        download_resident(
            &self.stream,
            self.n,
            &self.d_pos_x_a,
            &self.d_pos_y_a,
            &self.d_vel_x_a,
            &self.d_vel_y_a,
            &self.d_radius,
            &self.d_desired_speed,
            &self.d_dest_x,
            &self.d_dest_y,
            peds,
        )
    }
}

fn wall_columns(walls: &[WallSegment]) -> (Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) {
    if walls.is_empty() {
        return (vec![0.0], vec![0.0], vec![0.0], vec![0.0]);
    }
    let mut ax = Vec::with_capacity(walls.len());
    let mut ay = Vec::with_capacity(walls.len());
    let mut bx = Vec::with_capacity(walls.len());
    let mut by = Vec::with_capacity(walls.len());
    for w in walls {
        ax.push(w.a[0] as f32);
        ay.push(w.a[1] as f32);
        bx.push(w.b[0] as f32);
        by.push(w.b[1] as f32);
    }
    (ax, ay, bx, by)
}

#[allow(clippy::too_many_arguments)]
fn download_resident(
    stream: &Arc<CudaStream>,
    n: usize,
    d_pos_x: &CudaSlice<f32>,
    d_pos_y: &CudaSlice<f32>,
    d_vel_x: &CudaSlice<f32>,
    d_vel_y: &CudaSlice<f32>,
    d_radius: &CudaSlice<f32>,
    d_desired_speed: &CudaSlice<f32>,
    d_dest_x: &CudaSlice<f32>,
    d_dest_y: &CudaSlice<f32>,
    peds: &mut Vec<Pedestrian>,
) -> Result<(), String> {
    peds.resize(
        n,
        Pedestrian {
            pos: [0.0, 0.0],
            vel: [0.0, 0.0],
            radius: 0.0,
            desired_speed: 0.0,
            destination: [0.0, 0.0],
        },
    );

    let mut pos_x = vec![0.0f32; n];
    let mut pos_y = vec![0.0f32; n];
    let mut vel_x = vec![0.0f32; n];
    let mut vel_y = vec![0.0f32; n];
    let mut radius = vec![0.0f32; n];
    let mut desired_speed = vec![0.0f32; n];
    let mut dest_x = vec![0.0f32; n];
    let mut dest_y = vec![0.0f32; n];

    stream
        .memcpy_dtoh(d_pos_x, &mut pos_x)
        .map_err(|e| format!("dtoh pos_x failed: {e}"))?;
    stream
        .memcpy_dtoh(d_pos_y, &mut pos_y)
        .map_err(|e| format!("dtoh pos_y failed: {e}"))?;
    stream
        .memcpy_dtoh(d_vel_x, &mut vel_x)
        .map_err(|e| format!("dtoh vel_x failed: {e}"))?;
    stream
        .memcpy_dtoh(d_vel_y, &mut vel_y)
        .map_err(|e| format!("dtoh vel_y failed: {e}"))?;
    stream
        .memcpy_dtoh(d_radius, &mut radius)
        .map_err(|e| format!("dtoh radius failed: {e}"))?;
    stream
        .memcpy_dtoh(d_desired_speed, &mut desired_speed)
        .map_err(|e| format!("dtoh desired_speed failed: {e}"))?;
    stream
        .memcpy_dtoh(d_dest_x, &mut dest_x)
        .map_err(|e| format!("dtoh dest_x failed: {e}"))?;
    stream
        .memcpy_dtoh(d_dest_y, &mut dest_y)
        .map_err(|e| format!("dtoh dest_y failed: {e}"))?;

    for (i, p) in peds.iter_mut().enumerate().take(n) {
        p.pos = [pos_x[i] as f64, pos_y[i] as f64];
        p.vel = [vel_x[i] as f64, vel_y[i] as f64];
        p.radius = radius[i] as f64;
        p.desired_speed = desired_speed[i] as f64;
        p.destination = [dest_x[i] as f64, dest_y[i] as f64];
    }
    Ok(())
}

/// One-shot wrapper: initialise CUDA, run one AVM step, tear down.
pub fn step(
    peds: &mut [Pedestrian],
    walls: &[WallSegment],
    params: &Params,
    dt: f64,
) -> Result<u128, String> {
    let state = CudaState::new()?;
    state.step(peds, walls, params, dt)
}

/// CPU-fallback wrapper: try CUDA, fall back to
/// [`crate::anticipation_velocity::step`] on any error.
pub fn step_with_fallback(
    state: &mut Option<CudaState>,
    peds: &mut [Pedestrian],
    walls: &[WallSegment],
    params: &Params,
    dt: f64,
) -> bool {
    if state.is_none() {
        match CudaState::new() {
            Ok(s) => *state = Some(s),
            Err(e) => {
                eprintln!("rustsim-crowd CUDA init failed ({e}), using CPU AVM path");
                #[allow(deprecated)]
                crate::anticipation_velocity::step(peds, walls, params, dt);
                return false;
            }
        }
    }
    match state.as_ref().unwrap().step(peds, walls, params, dt) {
        Ok(_) => true,
        Err(e) => {
            eprintln!("rustsim-crowd CUDA AVM step failed ({e}), falling back to CPU");
            #[allow(deprecated)]
            crate::anticipation_velocity::step(peds, walls, params, dt);
            false
        }
    }
}