System-level, highly unsafe bindings to llama.cpp. There’s a lot of nuance here; for a safe alternative, see llama_cpp.

You need cmake, a compatible libc, libcxx, libcxxabi, and libclang to build this project, along with a C/C++ compiler toolchain.

The code is automatically built for static and dynamic linking using the cmake crate, with the C FFI bindings generated by bindgen.
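As a quick orientation, here is a minimal sketch of driving the raw bindings: backend setup, model loading, context creation, and teardown. It assumes the crate is imported as llama_cpp_sys, that the signatures match the upstream llama.cpp API of this vintage (llama_backend_init taking no arguments, parameter structs passed by value), and uses a placeholder model path.

```rust
use std::ffi::CString;

use llama_cpp_sys::{
    llama_backend_free, llama_backend_init, llama_context_default_params,
    llama_free, llama_free_model, llama_load_model_from_file,
    llama_model_default_params, llama_new_context_with_model,
};

fn main() {
    unsafe {
        // Initialize the llama.cpp backend once per process.
        llama_backend_init();

        // Placeholder path: point this at a real GGUF model.
        let path = CString::new("/path/to/model.gguf").unwrap();

        let model_params = llama_model_default_params();
        let model = llama_load_model_from_file(path.as_ptr(), model_params);
        assert!(!model.is_null(), "failed to load model");

        let ctx_params = llama_context_default_params();
        let ctx = llama_new_context_with_model(model, ctx_params);
        assert!(!ctx.is_null(), "failed to create context");

        // ... tokenize, decode, and sample here ...

        // Teardown order matters: free the context before the model.
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
    }
}
```

Every call here is unsafe: the caller is responsible for pointer validity and for pairing each allocation with its corresponding free.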
Structs§
- _IO_FILE
- _IO_codecvt
- _IO_marker
- _IO_wide_data
- ggml_backend
- ggml_backend_buffer
- ggml_backend_buffer_type
- ggml_backend_event
- ggml_backend_graph_copy
- ggml_backend_sched
- ggml_cgraph
- ggml_compute_params
- ggml_context
- ggml_cplan
- ggml_gallocr
- ggml_hash_set
- ggml_init_params
- ggml_object
- ggml_opt_context
- ggml_opt_context__bindgen_ty_1
- ggml_opt_context__bindgen_ty_2
- ggml_opt_params
- ggml_opt_params__bindgen_ty_1
- ggml_opt_params__bindgen_ty_2
- ggml_scratch
- ggml_tallocr
- ggml_tensor
- ggml_type_traits_t
- llama_batch
- llama_beam_view
- llama_beams_state
- llama_chat_message
- llama_context
- llama_context_params
- llama_grammar
- llama_grammar_element
- llama_kv_cache_view
- llama_kv_cache_view_cell
- llama_model
- llama_model_kv_override
- llama_model_params
- llama_model_quantize_params
- llama_timings
- llama_token_data
- llama_token_data_array
Enums§
- ggml_backend_buffer_usage
- ggml_backend_type
- ggml_cgraph_eval_order
- ggml_ftype
- ggml_linesearch
- ggml_log_level
- ggml_numa_strategy
- ggml_object_type
- ggml_op
- ggml_op_pool
- ggml_opt_result
- ggml_opt_type
- ggml_prec
- ggml_sort_order
- ggml_status
- ggml_task_type
- ggml_tensor_flag
- ggml_type
- ggml_unary_op
- llama_ftype
- llama_model_kv_override_type
- llama_pooling_type
- llama_rope_scaling_type
- llama_rope_type
- llama_split_mode
- llama_token_type
- llama_vocab_type
Constants§
- llama_gretype_LLAMA_GRETYPE_ALT
- llama_gretype_LLAMA_GRETYPE_CHAR
- llama_gretype_LLAMA_GRETYPE_CHAR_ALT
- llama_gretype_LLAMA_GRETYPE_CHAR_NOT
- llama_gretype_LLAMA_GRETYPE_CHAR_RNG_UPPER
- llama_gretype_LLAMA_GRETYPE_END
- llama_gretype_LLAMA_GRETYPE_RULE_REF
Functions§
- ggml_abs ⚠
- ggml_abs_inplace ⚠
- ggml_acc ⚠
- ggml_acc_inplace ⚠
- ggml_add ⚠
- ggml_add1 ⚠
- ggml_add1_inplace ⚠
- ggml_add_cast ⚠
- ggml_add_inplace ⚠
- ggml_add_rel_pos ⚠
- ggml_add_rel_pos_inplace ⚠
- ggml_alibi ⚠
- ggml_arange ⚠
- ggml_are_same_shape ⚠
- ggml_argmax ⚠
- ggml_argsort ⚠
- ggml_backend_alloc_buffer ⚠
- ggml_backend_alloc_ctx_tensors ⚠
- ggml_backend_alloc_ctx_tensors_from_buft ⚠
- ggml_backend_buffer_clear ⚠
- ggml_backend_buffer_free ⚠
- ggml_backend_buffer_get_alignment ⚠
- ggml_backend_buffer_get_alloc_size ⚠
- ggml_backend_buffer_get_base ⚠
- ggml_backend_buffer_get_max_size ⚠
- ggml_backend_buffer_get_size ⚠
- ggml_backend_buffer_get_type ⚠
- ggml_backend_buffer_init_tensor ⚠
- ggml_backend_buffer_is_host ⚠
- ggml_backend_buffer_name ⚠
- ggml_backend_buffer_reset ⚠
- ggml_backend_buffer_set_usage ⚠
- ggml_backend_buft_alloc_buffer ⚠
- ggml_backend_buft_get_alignment ⚠
- ggml_backend_buft_get_alloc_size ⚠
- ggml_backend_buft_get_max_size ⚠
- ggml_backend_buft_is_host ⚠
- ggml_backend_buft_name ⚠
- ggml_backend_buft_supports_backend ⚠
- ggml_backend_compare_graph_backend ⚠
- ggml_backend_cpu_buffer_from_ptr ⚠
- ggml_backend_cpu_buffer_type ⚠
- ggml_backend_cpu_init ⚠
- ggml_backend_cpu_set_abort_callback ⚠
- ggml_backend_cpu_set_n_threads ⚠
- ggml_backend_event_free ⚠
- ggml_backend_event_new ⚠
- ggml_backend_event_record ⚠
- ggml_backend_event_synchronize ⚠
- ggml_backend_event_wait ⚠
- ggml_backend_free ⚠
- ggml_backend_get_alignment ⚠
- ggml_backend_get_default_buffer_type ⚠
- ggml_backend_get_max_size ⚠
- ggml_backend_graph_compute ⚠
- ggml_backend_graph_compute_async ⚠
- ggml_backend_graph_copy ⚠
- ggml_backend_graph_copy_free ⚠
- ggml_backend_graph_plan_compute ⚠
- ggml_backend_graph_plan_create ⚠
- ggml_backend_graph_plan_free ⚠
- ggml_backend_guid ⚠
- ggml_backend_is_cpu ⚠
- ggml_backend_name ⚠
- ggml_backend_offload_op ⚠
- ggml_backend_reg_alloc_buffer ⚠
- ggml_backend_reg_find_by_name ⚠
- ggml_backend_reg_get_count ⚠
- ggml_backend_reg_get_default_buffer_type ⚠
- ggml_backend_reg_get_name ⚠
- ggml_backend_reg_init_backend ⚠
- ggml_backend_reg_init_backend_from_str ⚠
- ggml_backend_sched_alloc_graph ⚠
- ggml_backend_sched_free ⚠
- ggml_backend_sched_get_buffer_size ⚠
- ggml_backend_sched_get_n_copies ⚠
- ggml_backend_sched_get_n_splits ⚠
- ggml_backend_sched_get_tensor_backend ⚠
- ggml_backend_sched_graph_compute ⚠
- ggml_backend_sched_graph_compute_async ⚠
- ggml_backend_sched_new ⚠
- ggml_backend_sched_reserve ⚠
- ggml_backend_sched_reset ⚠
- ggml_backend_sched_set_eval_callback ⚠
- ggml_backend_sched_set_tensor_backend ⚠
- ggml_backend_sched_synchronize ⚠
- ggml_backend_supports_op ⚠
- ggml_backend_synchronize ⚠
- ggml_backend_tensor_alloc ⚠
- ggml_backend_tensor_copy ⚠
- ggml_backend_tensor_copy_async ⚠
- ggml_backend_tensor_get ⚠
- ggml_backend_tensor_get_async ⚠
- ggml_backend_tensor_set ⚠
- ggml_backend_tensor_set_async ⚠
- ggml_backend_view_init ⚠
- ggml_blck_size ⚠
- ggml_build_backward_expand ⚠
- ggml_build_backward_gradient_checkpointing ⚠
- ggml_build_forward_expand ⚠
- ggml_cast ⚠
- ggml_clamp ⚠
- ggml_concat ⚠
- ggml_cont ⚠
- ggml_cont_1d ⚠
- ggml_cont_2d ⚠
- ggml_cont_3d ⚠
- ggml_cont_4d ⚠
- ggml_conv_1d ⚠
- ggml_conv_1d_ph ⚠
- ggml_conv_2d ⚠
- ggml_conv_2d_s1_ph ⚠
- ggml_conv_2d_sk_p0 ⚠
- ggml_conv_depthwise_2d ⚠
- ggml_conv_transpose_1d ⚠
- ggml_conv_transpose_2d_p0 ⚠
- ggml_cpu_has_arm_fma ⚠
- ggml_cpu_has_avx ⚠
- ggml_cpu_has_avx2 ⚠
- ggml_cpu_has_avx512 ⚠
- ggml_cpu_has_avx512_vbmi ⚠
- ggml_cpu_has_avx512_vnni ⚠
- ggml_cpu_has_avx_vnni ⚠
- ggml_cpu_has_blas ⚠
- ggml_cpu_has_clblast ⚠
- ggml_cpu_has_cuda ⚠
- ggml_cpu_has_f16c ⚠
- ggml_cpu_has_fma ⚠
- ggml_cpu_has_fp16_va ⚠
- ggml_cpu_has_gpublas ⚠
- ggml_cpu_has_kompute ⚠
- ggml_cpu_has_matmul_int8 ⚠
- ggml_cpu_has_metal ⚠
- ggml_cpu_has_neon ⚠
- ggml_cpu_has_sse3 ⚠
- ggml_cpu_has_ssse3 ⚠
- ggml_cpu_has_sycl ⚠
- ggml_cpu_has_vsx ⚠
- ggml_cpu_has_vulkan ⚠
- ggml_cpu_has_wasm_simd ⚠
- ggml_cpy ⚠
- ggml_cross_entropy_loss ⚠
- ggml_cross_entropy_loss_back ⚠
- ggml_cycles ⚠
- ggml_cycles_per_ms ⚠
- ggml_diag ⚠
- ggml_diag_mask_inf ⚠
- ggml_diag_mask_inf_inplace ⚠
- ggml_diag_mask_zero ⚠
- ggml_diag_mask_zero_inplace ⚠
- ggml_div ⚠
- ggml_div_inplace ⚠
- ggml_dup ⚠
- ggml_dup_inplace ⚠
- ggml_dup_tensor ⚠
- ggml_element_size ⚠
- ggml_elu ⚠
- ggml_elu_inplace ⚠
- ggml_flash_attn ⚠
- ggml_flash_attn_back ⚠
- ggml_flash_ff ⚠
- ggml_fopen ⚠
- ggml_format_name ⚠
- ggml_fp16_to_fp32 ⚠
- ggml_fp16_to_fp32_row ⚠
- ggml_fp32_to_fp16 ⚠
- ggml_fp32_to_fp16_row ⚠
- ggml_free ⚠
- ggml_ftype_to_ggml_type ⚠
- ggml_gallocr_alloc_graph ⚠
- ggml_gallocr_free ⚠
- ggml_gallocr_get_buffer_size ⚠
- ggml_gallocr_new ⚠
- ggml_gallocr_new_n ⚠
- ggml_gallocr_reserve ⚠
- ggml_gallocr_reserve_n ⚠
- ggml_gelu ⚠
- ggml_gelu_inplace ⚠
- ggml_gelu_quick ⚠
- ggml_gelu_quick_inplace ⚠
- ggml_get_data ⚠
- ggml_get_data_f32 ⚠
- ggml_get_f32_1d ⚠
- ggml_get_f32_nd ⚠
- ggml_get_first_tensor ⚠
- ggml_get_i32_1d ⚠
- ggml_get_i32_nd ⚠
- ggml_get_max_tensor_size ⚠
- ggml_get_mem_buffer ⚠
- ggml_get_mem_size ⚠
- ggml_get_name ⚠
- ggml_get_next_tensor ⚠
- ggml_get_no_alloc ⚠
- ggml_get_rel_pos ⚠
- ggml_get_rows ⚠
- ggml_get_rows_back ⚠
- ggml_get_tensor ⚠
- ggml_get_unary_op ⚠
- ggml_graph_clear ⚠
- ggml_graph_compute ⚠
- ggml_graph_compute_with_ctx ⚠
- ggml_graph_cpy ⚠
- ggml_graph_dump_dot ⚠
- ggml_graph_dup ⚠
- ggml_graph_export ⚠
- ggml_graph_get_tensor ⚠
- ggml_graph_import ⚠
- ggml_graph_overhead ⚠
- ggml_graph_overhead_custom ⚠
- ggml_graph_plan ⚠
- ggml_graph_print ⚠
- ggml_graph_reset ⚠
- ggml_graph_view ⚠
- ggml_group_norm ⚠
- ggml_group_norm_inplace ⚠
- ggml_guid_matches ⚠
- ggml_hardsigmoid ⚠
- ggml_hardswish ⚠
- ggml_im2col ⚠
- ggml_init ⚠
- ggml_internal_get_type_traits ⚠
- ggml_is_3d ⚠
- ggml_is_contiguous ⚠
- ggml_is_empty ⚠
- ggml_is_matrix ⚠
- ggml_is_numa ⚠
- ggml_is_permuted ⚠
- ggml_is_quantized ⚠
- ggml_is_scalar ⚠
- ggml_is_transposed ⚠
- ggml_is_vector ⚠
- ggml_leaky_relu ⚠
- ggml_log ⚠
- ggml_log_inplace ⚠
- ggml_map_binary_f32 ⚠
- ggml_map_binary_inplace_f32 ⚠
- ggml_map_custom1 ⚠
- ggml_map_custom2 ⚠
- ggml_map_custom3 ⚠
- ggml_map_custom1_f32 ⚠
- ggml_map_custom1_inplace ⚠
- ggml_map_custom1_inplace_f32 ⚠
- ggml_map_custom2_f32 ⚠
- ggml_map_custom2_inplace ⚠
- ggml_map_custom2_inplace_f32 ⚠
- ggml_map_custom3_f32 ⚠
- ggml_map_custom3_inplace ⚠
- ggml_map_custom3_inplace_f32 ⚠
- ggml_map_unary_f32 ⚠
- ggml_map_unary_inplace_f32 ⚠
- ggml_mean ⚠
- ggml_mul ⚠
- ggml_mul_inplace ⚠
- ggml_mul_mat ⚠
- ggml_mul_mat_id ⚠
- ggml_mul_mat_set_prec ⚠
- ggml_n_dims ⚠
- ggml_nbytes ⚠
- ggml_nbytes_pad ⚠
- ggml_neg ⚠
- ggml_neg_inplace ⚠
- ggml_nelements ⚠
- ggml_new_f32 ⚠
- ggml_new_graph ⚠
- ggml_new_graph_custom ⚠
- ggml_new_i32 ⚠
- ggml_new_tensor ⚠
- ggml_new_tensor_1d ⚠
- ggml_new_tensor_2d ⚠
- ggml_new_tensor_3d ⚠
- ggml_new_tensor_4d ⚠
- ggml_norm ⚠
- ggml_norm_inplace ⚠
- ggml_nrows ⚠
- ggml_numa_init ⚠
- ggml_op_desc ⚠
- ggml_op_name ⚠
- ggml_op_symbol ⚠
- ggml_opt ⚠
- ggml_opt_default_params ⚠
- ggml_opt_init ⚠
- ggml_opt_resume ⚠
- ggml_opt_resume_g ⚠
- ggml_out_prod ⚠
- ggml_pad ⚠
- ggml_permute ⚠
- ggml_pool_1d ⚠
- ggml_pool_2d ⚠
- ggml_print_backtrace ⚠
- ggml_print_object ⚠
- ggml_print_objects ⚠
- ggml_quantize_chunk ⚠
- ggml_quantize_free ⚠
- ggml_quantize_init ⚠
- ggml_quantize_requires_imatrix ⚠
- ggml_relu ⚠
- ggml_relu_inplace ⚠
- ggml_repeat ⚠
- ggml_repeat_back ⚠
- ggml_reshape ⚠
- ggml_reshape_1d ⚠
- ggml_reshape_2d ⚠
- ggml_reshape_3d ⚠
- ggml_reshape_4d ⚠
- ggml_rms_norm ⚠
- ggml_rms_norm_back ⚠
- ggml_rms_norm_inplace ⚠
- ggml_rope ⚠
- ggml_rope_back ⚠
- ggml_rope_custom ⚠
- ggml_rope_custom_inplace ⚠
- ggml_rope_inplace ⚠
- ggml_rope_xpos_inplace ⚠
- ggml_rope_yarn_corr_dims ⚠
- ggml_row_size ⚠
- ggml_scale ⚠
- ggml_scale_inplace ⚠
- ggml_set ⚠
- ggml_set_1d ⚠
- ggml_set_1d_inplace ⚠
- ggml_set_2d ⚠
- ggml_set_2d_inplace ⚠
- ggml_set_f32 ⚠
- ggml_set_f32_1d ⚠
- ggml_set_f32_nd ⚠
- ggml_set_i32 ⚠
- ggml_set_i32_1d ⚠
- ggml_set_i32_nd ⚠
- ggml_set_inplace ⚠
- ggml_set_input ⚠
- ggml_set_name ⚠
- ggml_set_no_alloc ⚠
- ggml_set_output ⚠
- ggml_set_param ⚠
- ggml_set_scratch ⚠
- ggml_set_zero ⚠
- ggml_sgn ⚠
- ggml_sgn_inplace ⚠
- ggml_silu ⚠
- ggml_silu_back ⚠
- ggml_silu_inplace ⚠
- ggml_soft_max ⚠
- ggml_soft_max_back ⚠
- ggml_soft_max_back_inplace ⚠
- ggml_soft_max_ext ⚠
- ggml_soft_max_inplace ⚠
- ggml_sqr ⚠
- ggml_sqr_inplace ⚠
- ggml_sqrt ⚠
- ggml_sqrt_inplace ⚠
- ggml_ssm_conv ⚠
- ggml_ssm_scan ⚠
- ggml_status_to_string ⚠
- ggml_step ⚠
- ggml_step_inplace ⚠
- ggml_sub ⚠
- ggml_sub_inplace ⚠
- ggml_sum ⚠
- ggml_sum_rows ⚠
- ggml_tallocr_alloc ⚠
- ggml_tallocr_new ⚠
- ggml_tanh ⚠
- ggml_tanh_inplace ⚠
- ggml_tensor_overhead ⚠
- ggml_time_init ⚠
- ggml_time_ms ⚠
- ggml_time_us ⚠
- ggml_timestep_embedding ⚠
- ggml_top_k ⚠
- ggml_transpose ⚠
- ggml_type_name ⚠
- ggml_type_size ⚠
- ggml_type_sizef ⚠
- ggml_unary ⚠
- ggml_unary_inplace ⚠
- ggml_unary_op_name ⚠
- ggml_unravel_index ⚠
- ggml_upscale ⚠
- ggml_used_mem ⚠
- ggml_view_1d ⚠
- ggml_view_2d ⚠
- ggml_view_3d ⚠
- ggml_view_4d ⚠
- ggml_view_tensor ⚠
- ggml_win_part ⚠
- ggml_win_unpart ⚠
- llama_add_bos_token ⚠
- llama_add_eos_token ⚠
- llama_backend_free ⚠
- llama_backend_init ⚠
- llama_batch_free ⚠
- llama_batch_get_one ⚠
- llama_batch_init ⚠
- llama_beam_search ⚠ @details Deterministically returns the entire sentence constructed by a beam search. @param ctx Pointer to the llama_context. @param callback Invoked for each iteration of the beam_search loop, passing in beams_state. @param callback_data A pointer that is simply passed back to the callback. @param n_beams Number of beams to use. @param n_past Number of tokens already evaluated. @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
- llama_chat_apply_template ⚠ Apply a chat template. Inspired by Hugging Face’s apply_chat_template() in Python. Both “model” and “custom_template” are optional, but at least one is required; “custom_template” has higher precedence than “model”. NOTE: This function does not use a Jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. @param chat Pointer to a list of multiple llama_chat_message. @param n_msg Number of llama_chat_message in this chat. @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message. @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages). @param length The size of the allocated buffer. @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template. (A usage sketch follows this list.)
- llama_context_default_params ⚠
- llama_control_vector_apply ⚠
- llama_copy_state_data ⚠
- llama_decode ⚠
- llama_dump_timing_info_yaml ⚠
- llama_free ⚠
- llama_free_model ⚠
- llama_get_embeddings ⚠
- llama_get_embeddings_ith ⚠
- llama_get_embeddings_seq ⚠
- llama_get_kv_cache_token_count ⚠
- llama_get_kv_cache_used_cells ⚠
- llama_get_logits ⚠
- llama_get_logits_ith ⚠
- llama_get_model ⚠
- llama_get_model_tensor ⚠
- llama_get_state_size ⚠
- llama_get_timings ⚠
- llama_grammar_accept_token ⚠ @details Accepts the sampled token into the grammar.
- llama_grammar_copy ⚠
- llama_grammar_free ⚠
- llama_grammar_init ⚠
- llama_kv_cache_clear ⚠
- llama_kv_cache_defrag ⚠
- llama_kv_cache_seq_add ⚠
- llama_kv_cache_seq_cp ⚠
- llama_kv_cache_seq_div ⚠
- llama_kv_cache_seq_keep ⚠
- llama_kv_cache_seq_pos_max ⚠
- llama_kv_cache_seq_rm ⚠
- llama_kv_cache_update ⚠
- llama_kv_cache_view_free ⚠
- llama_kv_cache_view_init ⚠
- llama_kv_cache_view_update ⚠
- llama_load_model_from_file ⚠
- llama_load_session_file ⚠
- llama_log_set ⚠
- llama_max_devices ⚠
- llama_model_apply_lora_from_file ⚠
- llama_model_default_params ⚠
- llama_model_desc ⚠
- llama_model_meta_count ⚠
- llama_model_meta_key_by_index ⚠
- llama_model_meta_val_str ⚠
- llama_model_meta_val_str_by_index ⚠
- llama_model_n_params ⚠
- llama_model_quantize ⚠
- llama_model_quantize_default_params ⚠
- llama_model_size ⚠
- llama_n_batch ⚠
- llama_n_ctx ⚠
- llama_n_ctx_train ⚠
- llama_n_embd ⚠
- llama_n_layer ⚠
- llama_n_seq_max ⚠
- llama_n_ubatch ⚠
- llama_n_vocab ⚠
- llama_new_context_with_model ⚠
- llama_numa_init ⚠
- llama_print_system_info ⚠
- llama_print_timings ⚠
- llama_reset_timings ⚠
- llama_rope_freq_scale_train ⚠
- llama_rope_type ⚠
- llama_sample_apply_guidance ⚠ @details Apply classifier-free guidance to the logits as described in the academic paper “Stay on topic with Classifier-Free Guidance” https://arxiv.org/abs/2306.17806 @param logits Logits extracted from the original generation context. @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
- llama_sample_entropy ⚠ @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
- llama_sample_grammar ⚠ @details Apply constraints from the grammar.
- llama_sample_min_p ⚠ @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
- llama_sample_repetition_penalties ⚠ @details Repetition penalty described in the CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. @details Frequency and presence penalties described in the OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
- llama_sample_softmax ⚠ @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on the logits.
- llama_sample_tail_free ⚠ @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
- llama_sample_temp ⚠
- llama_sample_token ⚠ @details Randomly selects a token from the candidates based on their probabilities. (See the sampling sketch after this list.)
- llama_sample_token_greedy ⚠ @details Selects the token with the highest probability. Does not compute the token probabilities. Use llama_sample_softmax() instead.
- llama_sample_token_mirostat ⚠ @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @param candidates A vector of llama_token_data containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. @param eta The learning rate used to update mu based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause mu to be updated more quickly, while a smaller learning rate will result in slower updates. @param m The number of tokens considered in the estimation of s_hat. This is an arbitrary value that is used to calculate s_hat, which in turn helps to calculate the value of k. In the paper, they use m = 100, but you can experiment with different values to see how it affects the performance of the algorithm. @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (2 * tau) and is updated in the algorithm based on the error between the target and observed surprisal.
- llama_sample_token_mirostat_v2 ⚠ @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @param candidates A vector of llama_token_data containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. @param eta The learning rate used to update mu based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause mu to be updated more quickly, while a smaller learning rate will result in slower updates. @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (2 * tau) and is updated in the algorithm based on the error between the target and observed surprisal.
- llama_sample_top_k ⚠ @details Top-K sampling described in the academic paper “The Curious Case of Neural Text Degeneration” https://arxiv.org/abs/1904.09751
- llama_sample_top_p ⚠ @details Nucleus sampling described in the academic paper “The Curious Case of Neural Text Degeneration” https://arxiv.org/abs/1904.09751
- llama_sample_typical ⚠ @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
- llama_save_session_file ⚠
- llama_set_abort_callback ⚠
- llama_set_causal_attn ⚠
- llama_set_n_threads ⚠
- llama_set_rng_seed ⚠
- llama_set_state_data ⚠
- llama_split_path ⚠ @details Build a split GGUF final path for this chunk. llama_split_path(split_path, sizeof(split_path), “/models/ggml-model-q4_0”, 2, 4) => split_path = “/models/ggml-model-q4_0-00002-of-00004.gguf”
- llama_split_prefix ⚠ @details Extract the path prefix from the split_path if and only if the split_no and split_count match. llama_split_prefix(split_prefix, 64, “/models/ggml-model-q4_0-00002-of-00004.gguf”, 2, 4) => split_prefix = “/models/ggml-model-q4_0”
- llama_supports_gpu_offload ⚠
- llama_supports_mlock ⚠
- llama_supports_mmap ⚠
- llama_synchronize ⚠
- llama_time_us ⚠
- llama_token_bos ⚠
- llama_token_eos ⚠
- llama_token_eot ⚠
- llama_token_get_score ⚠
- llama_token_get_text ⚠
- llama_token_get_type ⚠
- llama_token_middle ⚠
- llama_token_nl ⚠
- llama_token_prefix ⚠
- llama_token_suffix ⚠
- llama_token_to_piece ⚠
- llama_tokenize ⚠ @details Convert the provided text into tokens. @param tokens The tokens pointer must be large enough to hold the resulting tokens. @return Returns the number of tokens on success, no more than n_tokens_max. @return Returns a negative number on failure: the number of tokens that would have been returned. @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space. (See the tokenization sketch after this list.)
- llama_vocab_type ⚠
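Given the buffer protocol documented for llama_chat_apply_template above, here is a hedged usage sketch. It assumes the upstream parameter order (model, tmpl, chat, n_msg, add_ass, buf, length) and that llama_chat_message holds C-string role/content pointers; the single message is a placeholder.

```rust
use std::ffi::CString;
use std::os::raw::c_char;
use std::ptr;

use llama_cpp_sys::{llama_chat_apply_template, llama_chat_message, llama_model};

unsafe fn format_prompt(model: *const llama_model) -> String {
    // Placeholder single-message chat.
    let role = CString::new("user").unwrap();
    let content = CString::new("Hello!").unwrap();
    let chat = [llama_chat_message {
        role: role.as_ptr(),
        content: content.as_ptr(),
    }];

    // A null tmpl selects the model's built-in chat template.
    let mut buf = vec![0u8; 256];
    let mut n = llama_chat_apply_template(
        model,
        ptr::null(),
        chat.as_ptr(),
        chat.len(),
        true, // add_ass: end the prompt with the assistant-turn prefix
        buf.as_mut_ptr() as *mut c_char,
        buf.len() as i32,
    );
    // Per the docs above: a return value larger than the buffer means the
    // prompt did not fit, so re-alloc to the returned size and re-apply.
    if n > buf.len() as i32 {
        buf.resize(n as usize, 0);
        n = llama_chat_apply_template(
            model,
            ptr::null(),
            chat.as_ptr(),
            chat.len(),
            true,
            buf.as_mut_ptr() as *mut c_char,
            buf.len() as i32,
        );
    }
    assert!(n >= 0, "template application failed");
    String::from_utf8_lossy(&buf[..n as usize]).into_owned()
}
```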
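All of the llama_sample_* entry points operate on a llama_token_data_array built from the logits of the last decoded position. Below is a hedged sketch of the usual flow (top-k, then top-p, then temperature, then a random draw), assuming the bindgen-generated types match upstream llama.cpp of this vintage (size and min_keep parameters are usize here); the cutoff values are arbitrary.

```rust
use llama_cpp_sys::{
    llama_context, llama_get_logits_ith, llama_model, llama_n_vocab,
    llama_sample_temp, llama_sample_token, llama_sample_top_k,
    llama_sample_top_p, llama_token, llama_token_data, llama_token_data_array,
};

/// Sample the next token from the logits at batch position `idx`.
/// A sketch only: repetition penalties and grammar constraints are omitted.
unsafe fn sample_next(
    ctx: *mut llama_context,
    model: *const llama_model,
    idx: i32,
) -> llama_token {
    let n_vocab = llama_n_vocab(model);
    let logits = llama_get_logits_ith(ctx, idx);

    // One llama_token_data per vocabulary entry; p starts at zero and is
    // filled in by the sampling functions as they normalize the logits.
    let mut candidates: Vec<llama_token_data> = (0..n_vocab)
        .map(|id| llama_token_data {
            id,
            logit: *logits.offset(id as isize),
            p: 0.0,
        })
        .collect();

    let mut arr = llama_token_data_array {
        data: candidates.as_mut_ptr(),
        size: candidates.len(),
        sorted: false,
    };

    // Filter, then draw. For Mirostat v2, replace the chain below with
    // llama_sample_token_mirostat_v2(ctx, &mut arr, tau, eta, &mut mu),
    // keeping `mu` across calls and initializing it to 2.0 * tau as the
    // parameter docs above describe.
    llama_sample_top_k(ctx, &mut arr, 40, 1);
    llama_sample_top_p(ctx, &mut arr, 0.95, 1);
    llama_sample_temp(ctx, &mut arr, 0.8);
    llama_sample_token(ctx, &mut arr)
}
```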
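As described above, llama_tokenize reports a too-small buffer by returning, negated, the number of tokens it would have produced. Here is a hedged sketch of the resulting two-call pattern, assuming the upstream parameter order of this vintage (model, text, text_len, tokens, n_tokens_max, add_bos, special).

```rust
use std::os::raw::c_char;

use llama_cpp_sys::{llama_model, llama_token, llama_tokenize};

unsafe fn tokenize(model: *const llama_model, text: &str, add_bos: bool) -> Vec<llama_token> {
    // Initial guess; the retry below handles underestimates anyway.
    let mut tokens: Vec<llama_token> = vec![0; text.len() + 8];
    let mut n = llama_tokenize(
        model,
        text.as_ptr() as *const c_char,
        text.len() as i32,
        tokens.as_mut_ptr(),
        tokens.len() as i32,
        add_bos,
        false, // special: keep control tokens unexposed (treated as plaintext)
    );
    if n < 0 {
        // Negative return: the number of tokens that would have been written.
        tokens.resize((-n) as usize, 0);
        n = llama_tokenize(
            model,
            text.as_ptr() as *const c_char,
            text.len() as i32,
            tokens.as_mut_ptr(),
            tokens.len() as i32,
            add_bos,
            false,
        );
    }
    tokens.truncate(n as usize);
    tokens
}
```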
Type Aliases§
- FILE
- _IO_lock_t
- __off64_t
- __off_t
- ggml_abort_callback
- ggml_backend_buffer_t
- ggml_backend_buffer_type_t
- ggml_backend_eval_callback
- ggml_backend_event_t
- ggml_backend_graph_plan_t
- ggml_backend_sched_eval_callback
- ggml_backend_sched_t
- ggml_backend_t
- ggml_binary_op_f32_t
- ggml_custom1_op_f32_t
- ggml_custom1_op_t
- ggml_custom2_op_f32_t
- ggml_custom2_op_t
- ggml_custom3_op_f32_t
- ggml_custom3_op_t
- ggml_fp16_t
- ggml_from_float_t
- ggml_gallocr_t
- ggml_guid
- ggml_guid_t
- ggml_log_callback
- ggml_opt_callback
- ggml_to_float_t
- ggml_unary_op_f32_t
- ggml_vec_dot_t
- llama_beam_search_callback_fn_t
- llama_gretype
- llama_pos
- llama_progress_callback
- llama_seq_id
- llama_token