llama-cpp-sys-4 0.2.45

Low Level Bindings to llama.cpp
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#include <HAP_farf.h>
#include <HAP_perf.h>
#include <AEEStdErr.h>
#include <dspqueue.h>
#include <HAP_compute_res.h>
#include <HAP_etm_config.h>
#include <HAP_mem.h>
#include <HAP_power.h>
#include <HAP_ps.h>
#include <qurt.h>
#include <qurt_thread.h>
#include <qurt_memory.h>
#include <remote.h>
#include <string.h>

#include "hex-utils.h"
#include "hex-dma.h"
#include "hmx-queue.h"

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-ops.h"
#include "worker-pool.h"

// Open a new HTP session. Allocates the context that doubles as the FastRPC
// handle, enables FARF runtime logging, and votes for maximum performance:
// compute client class, DCVS v3 pinned to the MAX voltage corner with sleep
// disabled, and HVX/HMX power-up.
//
// Returns AEE_SUCCESS on success. On any HAP_power_set() failure the context
// is freed (it used to leak) and the error code is returned.
AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
    struct htp_context * ctx;
    int                  err = 0;

    ctx = calloc(1, sizeof(*ctx));
    if (ctx == NULL) {
        return AEE_ENOMEMORY;
    }

    // Use the context structure as the handle
    *handle = (remote_handle64) ctx;

    // Enable FARF logs
    HAP_setFARFRuntimeLoggingParams(0xffff, NULL, 0);

    // Set client class
    {
        HAP_power_request_t request;
        memset(&request, 0, sizeof(HAP_power_request_t));
        request.type    = HAP_power_set_apptype;
        request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;

        if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
            goto error;
        }
    }

    {
        HAP_power_request_t request;
        memset(&request, 0, sizeof(request));

        // Pin bus and core clocks to the max corner and keep the DSP awake
        request.type                              = HAP_power_set_DCVS_v3;
        request.dcvs_v3.set_dcvs_enable           = TRUE;
        request.dcvs_v3.dcvs_enable               = TRUE;
        request.dcvs_v3.dcvs_option               = HAP_DCVS_V2_PERFORMANCE_MODE;
        request.dcvs_v3.set_bus_params            = TRUE;
        request.dcvs_v3.bus_params.min_corner     = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.bus_params.max_corner     = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.bus_params.target_corner  = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.set_core_params           = TRUE;
        request.dcvs_v3.core_params.min_corner    = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.core_params.max_corner    = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.set_sleep_disable         = TRUE;
        request.dcvs_v3.sleep_disable             = TRUE;
        if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
            goto error;
        }

        memset(&request, 0, sizeof(request));
        request.type         = HAP_power_set_HVX;
        request.hvx.power_up = TRUE;
        if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
            goto error;
        }
    }

    {
        // Power on HMX
        HAP_power_request_t request;
        memset(&request, 0, sizeof(HAP_power_request_t));
        request.type         = HAP_power_set_HMX;
        request.hmx.power_up = TRUE;
        FARF(ALWAYS, "Powering HMX on\n");
        // BUGFIX: pass ctx — the same power-client id used by all other
        // requests — not &ctx, which was the address of a stack variable.
        err = HAP_power_set((void *) ctx, &request);
        if (err != AEE_SUCCESS) {
            FARF(ERROR, "Error powering on HMX.");
            goto error;
        }
    }

    return AEE_SUCCESS;

error:
    // BUGFIX: previously the context leaked on every power-vote failure.
    free(ctx);
    *handle = 0;
    return err;
}

// Close an HTP session previously created by htp_iface_open().
// Refuses to close while the DSP queue is still open (htp_iface_stop() must
// run first), releases any remaining mmaps, and frees the context.
AEEResult htp_iface_close(remote_handle64 handle) {
    struct htp_context * ctx = (struct htp_context *) handle;

    if (!ctx) {
        return AEE_EBADPARM;
    }

    if (ctx->queue) {
        FARF(ERROR, "Closing handle with queue still open");
        return AEE_EITEMBUSY;
    }

    // release the mmaps (if any)
    for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
        if (ctx->mmap[i].size) {
#if __HVX_ARCH__ > 73
            HAP_munmap2((void *) ctx->mmap[i].base, ctx->mmap[i].size);
#else
            HAP_munmap((void *) ctx->mmap[i].base, ctx->mmap[i].size);
#endif
            ctx->mmap[i].size = 0;
            ctx->mmap[i].base = 0;  // BUGFIX: base is a uint64_t, not a pointer; assign 0 not NULL
            ctx->mmap[i].fd   = -1;
        }
    }

    free(ctx);
    return AEE_SUCCESS;
}

// Enable ETM tracing for this process. Logs a diagnostic when the SDK on
// this target does not support the API, and returns the HAP error code.
AEEResult htp_iface_enable_etm(remote_handle64 handle) {
    const int err = HAP_user_etm_enable();
    if (err == 0) {
        return err;
    }

    if (err == AEE_EVERSIONNOTSUPPORT) {
        FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
    } else {
        FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
    }
    return err;
}

// Disable ETM tracing for this process. Mirrors htp_iface_enable_etm():
// logs a diagnostic on failure and returns the HAP error code.
AEEResult htp_iface_disable_etm(remote_handle64 handle) {
    const int err = HAP_user_etm_disable();
    if (err == 0) {
        return err;
    }

    if (err == AEE_EVERSIONNOTSUPPORT) {
        FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
    } else {
        FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
    }
    return err;
}

// Map a host buffer (identified by its FastRPC fd) into DSP virtual memory
// and record it in ctx->mmap. If the fd is already mapped only the pinned
// flag is updated. Pinned mappings survive the vmem-pressure eviction in
// prep_op_bufs()/drop_mmap().
//
// Returns AEE_SUCCESS, AEE_EFAILED if the map call fails, or AEE_ENOMEMORY
// when all HTP_MAX_MMAPS slots are occupied.
AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) {
    struct htp_context * ctx = (struct htp_context *) handle;
    if (!ctx) {
        return AEE_EBADPARM;
    }

    // See if we already have this mapping
    for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
        struct htp_mmap *m = &ctx->mmap[i];
        if (m->fd == fd) {
            m->pinned = pinned;
            return AEE_SUCCESS;
        }
    }

    // Add new mapping: take the first slot with size == 0 (slots are
    // considered free when their size is zero).
    for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
        struct htp_mmap *m = &ctx->mmap[i];
        if (!m->size) {
            FARF(HIGH, "mmap : fd %u size %u pinned %u", fd, size, pinned);
#if __HVX_ARCH__ > 73
            // v75+ targets: HAP_mmap2 supports mappings larger than 2GB
            void *va = HAP_mmap2(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
#else
            if (size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
                FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) size);
                abort(); // can't do much else at this point
            }

            void *va = HAP_mmap(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
#endif
            // HAP_mmap/HAP_mmap2 return (void*)-1 on failure, not NULL
            if (va == (void*)-1) {
                FARF(ERROR, "mmap failed : va %p fd %u size %u", va, fd, (uint32_t) size);
                return AEE_EFAILED;
            }

            m->base   = (uint64_t) va;
            m->fd     = fd;
            m->size   = size;
            m->pinned = pinned;

            return AEE_SUCCESS;
        }
    }

    return AEE_ENOMEMORY;
}

// Unmap a previously mapped buffer. fd >= 0 unmaps the matching entry;
// fd < 0 unmaps every active mapping. Always returns AEE_SUCCESS
// (a missing fd is not treated as an error).
AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
    struct htp_context * ctx = (struct htp_context *) handle;
    if (!ctx) {
        return AEE_EBADPARM;
    }

    for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
        struct htp_mmap *m = &ctx->mmap[i];

        // BUGFIX: skip free slots. Previously the fd < 0 ("unmap all") path
        // also called HAP_munmap() on unused entries with base 0 / size 0.
        if (!m->size) {
            continue;
        }

        if (fd < 0 || m->fd == fd) {
            FARF(HIGH, "unmmap : base %p fd %u size %u", (void*) m->base, m->fd, (uint32_t) m->size);
#if __HVX_ARCH__ > 73
            HAP_munmap2((void *) m->base, m->size);
#else
            HAP_munmap((void *) m->base, m->size);
#endif
            m->size   = 0;
            m->base   = 0;  // BUGFIX: base is a uint64_t, not a pointer; assign 0 not NULL
            m->fd     = -1;
            m->pinned = 0;
        }
    }

    return AEE_SUCCESS;
}

// (Re)acquire the cached VTCM reservation created by vtcm_alloc().
// Called at the top of every packet callback; a no-op when the reservation
// is already held. Aborts the process if the acquire times out, since no
// compute can proceed without VTCM.
static void vtcm_acquire(struct htp_context * ctx) {
    if (!ctx->vtcm_valid) {
        // 1-second timeout (usecs) on re-acquiring the cached reservation
        int err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000u);
        if (err != 0) {
            FARF(ERROR, "ggml-hex: failed to acquire VTCM: 0x%08x", (unsigned)err);
            abort();
        }

        ctx->vtcm_needs_release = false;
        ctx->vtcm_valid = true;

        // Drop the priority to make sure we get the release callback from other GGML-HTP and QNN-HTP sessions
        HAP_compute_res_update_priority(ctx->vtcm_rctx, ctx->thread_prio + 10);
    }
}

// Hand the cached VTCM reservation back to the resource manager.
// Safe to call when the reservation is not currently held (no-op).
static void vtcm_release(struct htp_context * ctx) {
    if (!ctx->vtcm_valid) {
        return;
    }

    ctx->vtcm_valid         = false;
    ctx->vtcm_needs_release = false;
    HAP_compute_res_release_cached(ctx->vtcm_rctx);
}

// Resource-manager callback invoked when another session wants our VTCM.
// Only sets a flag; the packet-callback loop polls vtcm_needs_release and
// releases the reservation at a safe point between op batches.
static int vtcm_release_callback(unsigned int rctx, void * state) {
    struct htp_context * ctx = (struct htp_context *) state;
    ctx->vtcm_needs_release = true;
    return 0;
}

static int vtcm_alloc(struct htp_context * ctx) {
    unsigned int vtcm_size = 8 * 1024 * 1024;  // 8MB default
    HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL);

    compute_res_attr_t attr;
    HAP_compute_res_attr_init(&attr);
    HAP_compute_res_attr_set_serialize(&attr, 0);
    HAP_compute_res_attr_set_cache_mode(&attr, 1);
    HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size); // single page
    HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
    HAP_compute_res_attr_set_hmx_param(&attr, 1);

    // Allocate VTCM for scratch pads
    uint32_t rctx = HAP_compute_res_acquire(&attr, 1000000 /* timeout */);
    if (!rctx) {
        FARF(ERROR, "failed to allocate %zu bytes VTCM\n", ctx->vtcm_size);
        return AEE_ENOMEMORY;
    }

    void * vtcm_ptr;
    if (HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &vtcm_ptr, &vtcm_size) != 0) {
        HAP_compute_res_release(rctx);
        FARF(ERROR, "failed to allocate %zu bytes VTCM (new)\n", ctx->vtcm_size);
        return AEE_ENOMEMORY;
    }

    ctx->vtcm_base          = (uint8_t *) vtcm_ptr;
    ctx->vtcm_size          = vtcm_size;
    ctx->vtcm_rctx          = rctx;
    ctx->vtcm_valid         = false;
    ctx->vtcm_needs_release = false;

    return 0;
}

// Release the VTCM reservation obtained by vtcm_alloc(). No-op when no
// reservation is held.
static void vtcm_free(struct htp_context * ctx) {
    if (!ctx->vtcm_rctx) {
        return;
    }

    HAP_compute_res_release(ctx->vtcm_rctx);
    ctx->vtcm_rctx = 0;
    ctx->vtcm_base = 0;
}

static void htp_packet_callback(dspqueue_t queue, int error, void * context);
static void htp_error_callback(dspqueue_t queue, int error, void * context);

// Start a compute session: import the CPU-created dspqueue, allocate VTCM,
// optionally create the HMX queue, size the worker pool to the requested /
// available HVX unit count, and create per-thread DMA queues.
//
// n_hvx == 0 means "use all HVX units"; the value is clamped to the number
// of hardware threads and HTP_MAX_NTHREADS.
//
// On failure all partially-acquired resources are released (previously the
// imported queue leaked on vtcm_alloc failure, and queue/DMA/HMX/VTCM all
// leaked on worker_pool_init failure).
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx) {
    struct htp_context * ctx = (struct htp_context *) handle;

    if (!ctx) {
        return AEE_EBADPARM;
    }

    if (ctx->queue) {
        FARF(ERROR, "Queue already open");
        return AEE_EITEMBUSY;
    }

    // Import queue created on the CPU
    int err = dspqueue_import(dsp_queue_id,         // Queue ID from dspqueue_export
                              htp_packet_callback,  // Packet callback
                              htp_error_callback,   // Error callback; no errors expected on the DSP
                              (void *) ctx,         // Callback context
                              &ctx->queue);

    if (err) {
        FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
        return err;
    }

    ctx->thread_id   = qurt_thread_get_id();
    ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);

    // allocate VTCM
    err = vtcm_alloc(ctx);
    if (err != AEE_SUCCESS) {
        FARF(ERROR, "Unable to allocate VTCM");
        err = AEE_ENOMEMORY;
        goto fail_queue;
    }

#ifdef HTP_HAS_HMX
    ctx->hmx_enabled = use_hmx;
    ctx->hmx_queue   = NULL;
    if (use_hmx) {
        ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
        if (!ctx->hmx_queue) {
            // HMX is optional: fall back to non-HMX operation
            FARF(ERROR, "hmx-queue-create failed");
            ctx->hmx_enabled = false;
        }
    }
    FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
#endif

    qurt_sysenv_max_hthreads_t hw_threads;
    qurt_sysenv_get_max_hw_threads(&hw_threads);
    // Number of HVX units is packed in bits 8..15 of qurt_hvx_get_units()
    uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;

    if (n_hvx == 0) {
        n_hvx = hw_nhvx;
    }
    if (n_hvx > hw_threads.max_hthreads) {
        n_hvx = hw_threads.max_hthreads;
    }
    if (n_hvx > HTP_MAX_NTHREADS) {
        n_hvx = HTP_MAX_NTHREADS;
    }

    ctx->n_threads = n_hvx;
    for (int i = 0; i < ctx->n_threads; i++) {
        // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
        ctx->dma[i] = dma_queue_create(128);
    }

    // init worker pool
    err = worker_pool_init(&ctx->worker_pool, n_hvx);
    if (err != AEE_SUCCESS) {
        FARF(ERROR, "Unable to create worker pool");
        goto fail_resources;
    }

    FARF(HIGH, "session %u started: n-hvx %u vtcm-size %zu vtcm-rctx %u n-threads %u thread-id %d thread-prio %d \n",
         sess_id, hw_nhvx, ctx->vtcm_size, ctx->vtcm_rctx, ctx->n_threads, ctx->thread_id, ctx->thread_prio);

    return AEE_SUCCESS;

fail_resources:
    // Close the queue first: dspqueue_close() waits for in-flight callbacks,
    // so nothing can touch the resources released below.
    dspqueue_close(ctx->queue);
    ctx->queue = NULL;

    for (int i = 0; i < ctx->n_threads; i++) {
        dma_queue_delete(ctx->dma[i]);
    }
    ctx->n_threads = 0;

#ifdef HTP_HAS_HMX
    if (ctx->hmx_queue) {
        hmx_queue_delete(ctx->hmx_queue);
        ctx->hmx_queue = NULL;
    }
    ctx->hmx_enabled = false;
#endif

    vtcm_free(ctx);
    return err;

fail_queue:
    // vtcm_alloc failed: only the imported queue needs to be torn down.
    // (Do not touch ctx->dma here — its entries may be stale pointers from
    // a previous start/stop cycle.)
    dspqueue_close(ctx->queue);
    ctx->queue = NULL;
    return err;
}

// Stop a compute session started by htp_iface_start(): close the dspqueue
// (which waits for in-flight callbacks), then release the worker pool,
// per-thread DMA queues, the HMX queue (if built with HMX support), and the
// VTCM reservation.
AEEResult htp_iface_stop(remote_handle64 handle) {
    struct htp_context * ctx = (struct htp_context *) handle;
    if (!ctx) {
        return AEE_EBADPARM;
    }

    if (!ctx->queue) {
        FARF(ERROR, "Queue not open");
        return AEE_EBADSTATE;
    }

    // Close queue. dspqueue_close() will also wait for callbacks to finish.
    int err    = dspqueue_close(ctx->queue);
    ctx->queue = NULL;
    if (err != 0) {
        FARF(ERROR, "Queue close failed with 0x%08x", (unsigned) err);
        return err;
    }

    if (ctx->worker_pool) {
        // Release worker pool
        worker_pool_release(&ctx->worker_pool);
    }

    // Delete the per-thread DMA queues created in htp_iface_start()
    for (int i = 0; i < ctx->n_threads; i++) {
        dma_queue_delete(ctx->dma[i]);
    }

#ifdef HTP_HAS_HMX
    if (ctx->hmx_queue) {
        hmx_queue_delete(ctx->hmx_queue);
        ctx->hmx_queue = NULL;
    }
    ctx->hmx_enabled = false;
#endif

    // Return the VTCM reservation to the resource manager
    vtcm_free(ctx);

    return AEE_SUCCESS;
}

// dspqueue error callback. The DSP side does not expect queue errors, so
// this just logs the code for post-mortem debugging.
static void htp_error_callback(dspqueue_t queue, int error, void * context) {
    (void) queue;
    (void) context;
    FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
}

// Per-op profiling sample. Between profile_start() and profile_stop() the
// fields hold raw counter snapshots; after profile_stop() they hold deltas.
struct profile_data {
    uint64_t usecs;   // elapsed wall time in microseconds (after stop)
    uint64_t cycles;  // elapsed core cycles (after stop)
    uint64_t pkts;    // executed packet count delta (after stop)
};

// Snapshot the hardware counters. Note: usecs temporarily stores the raw
// qtimer tick count; profile_stop() converts the delta to microseconds.
static inline void profile_start(struct profile_data * d) {
    d->usecs  = HAP_perf_get_qtimer_count();
    d->cycles = hex_get_cycles();
    d->pkts   = hex_get_pktcnt();
}

// Convert the snapshots taken by profile_start() into deltas:
// qtimer ticks -> microseconds, plus raw cycle and packet-count differences.
static inline void profile_stop(struct profile_data * d) {
    d->usecs  = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
    d->cycles = hex_get_cycles() - d->cycles;
    d->pkts   = hex_get_pktcnt() - d->pkts;
}

// Dispatch a prepared op context to its kernel implementation.
// Returns the kernel's status, or -1 for HTP_OP_INVALID / unknown opcodes.
static int execute_op(struct htp_ops_context * octx) {
    switch (octx->op) {
        case HTP_OP_MUL_MAT:
            return op_matmul(octx);

        case HTP_OP_MUL_MAT_ID:
            return op_matmul_id(octx);

        // element-wise binary ops share one kernel
        case HTP_OP_MUL:
        case HTP_OP_ADD:
        case HTP_OP_SUB:
        case HTP_OP_DIV:
        case HTP_OP_ADD_ID:
            return op_binary(octx);

        // element-wise unary ops share one kernel
        case HTP_OP_RMS_NORM:
        case HTP_OP_SCALE:
        case HTP_OP_SQR:
        case HTP_OP_SQRT:
        case HTP_OP_UNARY_SOFTPLUS:
        case HTP_OP_UNARY_SIGMOID:
        case HTP_OP_UNARY_NEG:
        case HTP_OP_UNARY_EXP:
            return op_unary(octx);

        // activation / gated-linear-unit family
        case HTP_OP_UNARY_SILU:
        case HTP_OP_UNARY_GELU:
        case HTP_OP_GLU_SWIGLU:
        case HTP_OP_GLU_SWIGLU_OAI:
        case HTP_OP_GLU_GEGLU:
            return op_activations(octx);

        case HTP_OP_SOFTMAX:
            return op_softmax(octx);

        case HTP_OP_ROPE:
            return op_rope(octx);

        case HTP_OP_FLASH_ATTN_EXT:
            return op_flash_attn_ext(octx);

        case HTP_OP_SET_ROWS:
            return op_set_rows(octx);

        case HTP_OP_GET_ROWS:
            return op_get_rows(octx);

        case HTP_OP_SUM_ROWS:
            return op_sum_rows(octx);

        case HTP_OP_CPY:
            return op_cpy(octx);

        case HTP_OP_REPEAT:
            return op_repeat(octx);

        case HTP_OP_ARGSORT:
            return op_argsort(octx);

        case HTP_OP_SSM_CONV:
            return op_ssm_conv(octx);

        case HTP_OP_CUMSUM:
            return op_cumsum(octx);

        case HTP_OP_INVALID:
            break;

        // No default to catch missing cases
    }

    FARF(ERROR, "Unknown Op %u", octx->op);
    return -1;
}

// Try to resolve a buffer descriptor against the existing mmap table.
// On a hit, sets b->base to the mapped address, marks the mapping's slot in
// the *m_reuse bitmask, and returns true. Otherwise leaves b->base NULL and
// returns false.
static inline bool reuse_buf(struct htp_context *ctx, uint32_t *m_reuse, struct htp_buf_desc *b) {
    b->base = NULL;

    for (uint32_t i = 0; i < HTP_MAX_MMAPS; i++) {
        struct htp_mmap *m = &ctx->mmap[i];
        if (!m->size || m->fd != b->fd) {
            continue;
        }
        b->base   = m->base;
        *m_reuse |= (1 << i);
        return true;
    }

    return false;
}

// Evict one mmap-table entry to reclaim DSP virtual memory, unless it is
// empty or pinned (pinned mappings were requested via htp_iface_mmap and
// must survive vmem pressure).
static inline void drop_mmap(struct htp_context *ctx, struct htp_mmap *m) {
    if (m->size && !m->pinned) {
        FARF(HIGH, "unmap : fd %u base %p size %u pinned %u", m->fd, (void*) m->base, (uint32_t) m->size, m->pinned);
#if __HVX_ARCH__ > 73
        HAP_munmap2((void *) m->base, m->size);
#else
        HAP_munmap((void *) m->base, m->size);
#endif
        m->size = 0;
        m->base = 0;
        m->fd   = -1;
    }
}

// Ensure a buffer descriptor has a DSP virtual-memory mapping, creating one
// in the first free mmap-table slot if needed. Newly created mappings are
// unpinned (eligible for eviction by drop_mmap()).
//
// NOTE(review): if all HTP_MAX_MMAPS slots are occupied this returns with
// b->base still NULL and no diagnostic; the caller appears to rely on
// prep_op_bufs() having evicted unused mappings first — confirm.
static inline void mmap_buf(struct htp_context *ctx, struct htp_buf_desc *b) {
    if (b->base) return; // already mapped

    // find unused mapping
    for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
        struct htp_mmap *m = &ctx->mmap[i];
        if (!m->size) {
#if __HVX_ARCH__ > 73
            // v75+ targets: HAP_mmap2 supports mappings larger than 2GB
            void *va = HAP_mmap2(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
#else
            if (b->size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
                FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) b->size);
                abort(); // can't do much else at this point
            }

            void *va = HAP_mmap(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
#endif
            // HAP_mmap/HAP_mmap2 return (void*)-1 on failure, not NULL
            if (va == (void*)-1) {
                FARF(ERROR, "mmap failed : va %p fd %u size %u", va, b->fd, (uint32_t) b->size);
                abort(); // can't do much else at this point
            }

            m->base   = b->base = (uint64_t) va;
            m->fd     = b->fd;
            m->size   = b->size;
            m->pinned = 0;

            FARF(HIGH, "mmap : fd %u base %p size %u pinned %u", m->fd, (void*) m->base, (uint32_t) m->size, m->pinned);
            return;
        }
    }
}

// Resolve every buffer in an op batch to a DSP virtual address.
// Pass 0 reuses existing mappings where the fd matches; if anything is left
// unmapped and the total (existing + needed) vmem would exceed
// HTP_OP_MAX_VMEM, unused and unpinned mappings are evicted before pass 1
// creates the missing ones.
static void prep_op_bufs(struct htp_context *ctx, struct htp_buf_desc *bufs, uint32_t n_bufs) {
    uint32_t m_reuse = 0; // mmap reuse mask (index from ctx->mmap array)
    uint32_t b_reuse = 0; // buf reuse count

    size_t   m_vmem  = 0; // mapped vmem
    size_t   e_vmem  = 0; // extra  vmem

    // See what we can reuse
    for (uint32_t i=0; i < n_bufs; i++) {
        struct htp_buf_desc *b = bufs + i;
        if (reuse_buf(ctx, &m_reuse, b)) { b_reuse++; } else { e_vmem += b->size; }
        FARF(HIGH, "prep-buf #%u : pass0 fd %u base %p size %u flags 0x%x", i, b->fd, (void*) b->base, (uint32_t) b->size, b->flags);
    }

    if (b_reuse == n_bufs) return; // all bufs reuse existing mappings

    // See how much vmem we have mmaped right now
    for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) { m_vmem += ctx->mmap[i].size; }

    FARF(HIGH, "prep-bufs : pass1 mmap-vmem %zu extra-vmem %zu n-bufs %u b-reuse %u", m_vmem, e_vmem, n_bufs, b_reuse);

    if ((m_vmem + e_vmem) > HTP_OP_MAX_VMEM) {
        // Drop unused mappings (drop_mmap() itself skips pinned entries)
        for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
            bool used = m_reuse & (1<<i);
            if (!used) { drop_mmap(ctx, ctx->mmap + i); }
        }
    }

    // Create missing mappings (no-op for bufs resolved in pass 0)
    for (uint32_t i=0; i < n_bufs; i++) {
        struct htp_buf_desc *b = bufs + i;
        mmap_buf(ctx, b);
        FARF(HIGH, "prep-buf #%u : pass1 fd %u base %p size %u flags 0x%x", i, b->fd, (void*) b->base, (uint32_t) b->size, b->flags);
    }
}

// Convert one tensor's buffer-relative offset (t->data on entry) into an
// absolute address using the buffer table resolved by prep_op_bufs().
static void prep_tensor(struct htp_context *ctx, struct htp_buf_desc *bufs, uint32_t idx, struct htp_tensor *t) {
    uint32_t offset = t->data;
    uint32_t size   = t->size;
    uint32_t bi     = t->bi;

    t->data = bufs[bi].base + offset; // update data to the actual pointer

    // BUGFIX: log ne[2], ne[3] — previously ne[3] was printed twice
    FARF(HIGH, "prep-tensor #%u: bi %u offset %u size %u data %p : %u:%u:%u:%u", idx, t->bi, offset, t->size, (void*) t->data,
        t->ne[0], t->ne[1], t->ne[2], t->ne[3]);
}

static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, struct htp_tensor *tens, uint32_t n_tens) {
    for (uint32_t i=0; i < n_tens; i++) {
        prep_tensor(ctx, bufs, i, tens + i);
    }
}

static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
    memcpy(octx->op_params, op->params, sizeof(octx->op_params));
    octx->flags = op->flags;
    octx->op    = op->opcode;

    FARF(HIGH, "proc-op #%u: opcode %u flags 0x%x", idx, octx->op, octx->flags);

    // Prep input tensors
    for (uint32_t i=0; i<HTP_OP_MAX_INPUTS; i++) {
        struct htp_tensor *src = op->src[i] == 0xffff ? NULL : tens + op->src[i];

        octx->src[i] = src;
        if (!src) continue;

        if (!(src->flags & HTP_TENSOR_FLUSHED) && (src->flags & HTP_TENSOR_COMPUTE)) {
            // flush compute buffers on input
            hex_l2flush((void *) src->data, src->size);
        }

        FARF(HIGH, "prep-src #%u: data %p size %u : %u:%u:%u:%u", op->src[i], (void*) src->data, src->size,
            src->ne[0], src->ne[1], src->ne[3], src->ne[3]);
    }

    // Prep output tensor
    struct htp_tensor *dst = tens + op->dst;

    octx->dst = dst;

    FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
        dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);

    (void) execute_op(octx);

    // flush buffers on output
    hex_l2flush((void *) dst->data, dst->size);
    dst->flags |= HTP_TENSOR_FLUSHED;

    FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
        dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
}

#define DSPQUEUE_POLL_TIMEOUT_USEC 100
#define DSPQUEUE_POLL_COUNT        100

// dspqueue packet callback: the main request loop of the DSP side.
//
// Holds the VTCM reservation while draining op-batch requests from the
// queue. Each request packet carries one dspqueue buffer laid out as
// [htp_buf_desc x n_bufs][htp_tensor x n_tens][htp_op_desc x n_ops]; ops
// are executed in order with per-op profiling written back into the op
// descriptors, then the buffer is returned with a response message.
//
// The loop exits when the queue stays empty for DSPQUEUE_POLL_COUNT polls,
// on a queue error, or when another session asks for the VTCM (the
// vtcm_needs_release flag set by vtcm_release_callback()).
static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
    struct htp_context * ctx = (struct htp_context *) context;

    int err;

    uint32_t poll_count = DSPQUEUE_POLL_COUNT;

    vtcm_acquire(ctx);

    while (!ctx->vtcm_needs_release) {
        struct htp_opbatch_req req;
        uint32_t r_size = sizeof(req);

        struct dspqueue_buffer dbuf;
        uint32_t n_dbufs = 1;
        uint32_t flags   = 0;

        // Non-blocking read: expect one buffer reference plus the request message
        err = dspqueue_read_noblock(queue, &flags, n_dbufs, &n_dbufs, &dbuf, r_size, &r_size, (uint8_t *) &req);
        if (err == AEE_EWOULDBLOCK) {
            // Queue empty: sleep briefly and retry until the poll budget runs out
            if (--poll_count) {
                qurt_sleep(DSPQUEUE_POLL_TIMEOUT_USEC);
                continue;
            }
            break;
        }

        if (err != 0) {
            FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err);
            break;
        }

        // Malformed packet: skip it but keep servicing the queue
        if (r_size < sizeof(req) || n_dbufs != 1) {
            FARF(ERROR, "invalid request : size %u n-dbufs %u", r_size, n_dbufs);
            continue;
        }

        const uint32_t n_bufs = req.n_bufs;
        const uint32_t n_tens = req.n_tensors;
        const uint32_t n_ops  = req.n_ops;

        const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
        const uint32_t t_size = sizeof(struct htp_tensor)   * n_tens;
        const uint32_t o_size = sizeof(struct htp_op_desc)  * n_ops;

        // Validate that the descriptor tables actually fit in the buffer
        if (dbuf.size < b_size + t_size + o_size) {
            FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
            break;
        }

        // Reset poll count for valid requests
        poll_count = DSPQUEUE_POLL_COUNT;

        // Carve the buffer into its three descriptor tables
        uint8_t * m_ptr = dbuf.ptr;
        struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
        struct htp_tensor*   tens = (struct htp_tensor*)   m_ptr; m_ptr += t_size;
        struct htp_op_desc*   ops = (struct htp_op_desc*)  m_ptr;

        FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u",
                n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);

        // Resolve buffers to DSP addresses, then tensors to absolute pointers
        prep_op_bufs(ctx, bufs, n_bufs);
        prep_tensors(ctx, bufs, tens, n_tens);

        struct htp_ops_context *octx = &ctx->octx;
        memset(octx, 0, sizeof(*octx));
        octx->n_threads = ctx->n_threads;
        octx->ctx       = ctx;

        // Execute the ops in order, recording per-op profiling data
        for (uint32_t i=0; i < n_ops; i++) {
            struct profile_data prof;
            profile_start(&prof);

            proc_op_req(octx, tens, i, &ops[i]);

            profile_stop(&prof);
            ops[i].prof_usecs  = prof.usecs;
            ops[i].prof_cycles = prof.cycles;
            ops[i].prof_pkts   = prof.pkts;
        }

        // dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);

        struct htp_opbatch_rsp rsp;
        rsp.status = HTP_STATUS_OK; // FIXME

        // Return the buffer with cache maintenance flags so the CPU sees the results
        dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
        err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
        if (err != 0) {
            FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
            break;
        }
    }

    vtcm_release(ctx);
}