use crate::requests::*;
use serde::{Deserialize, Serialize};
#[derive(Clone, Serialize, Default, Debug, Deserialize, PartialEq)]
pub struct LlamaCppCompletionRequest {
    pub prompt: Vec<u32>,
    #[serde(skip)]
    pub prompt_string: Option<String>,
    /// A formatted "Grammar" as a string.
    /// See: https://github.com/richardanaya/gbnf/blob/main/gbnf/src/lib.rs
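    /// For example, a grammar such as `root ::= "yes" | "no"` (illustrative) constrains the output to one of those literals.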
    #[serde(skip_serializing_if = "Option::is_none")]
    pub grammar: Option<String>,
    /// Re-use the previously cached prompt from the last request if possible. This can avoid re-processing the prompt from scratch. Default: false
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cache_prompt: Option<bool>,
    /// Modify the likelihood of specified tokens appearing in the completion.
    ///
    /// Accepts an array of `[token_id, bias]` pairs, where the token is specified by its token ID in the tokenizer and the bias value ranges from -100 to 100.
    /// Mathematically, the bias is added to the logits generated by the model prior to sampling.
    /// The exact effect will vary per model, but values between -1 and 1 should decrease or increase the likelihood of selection;
    /// values like -100 or 100 should result in a ban or exclusive selection of the relevant token.
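    /// For example, `[[15043, 1.0], [12345, -100.0]]` (illustrative token IDs) boosts token 15043 and effectively bans token 12345.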
    #[serde(skip_serializing_if = "Option::is_none")]
    pub logit_bias: Option<Vec<Vec<serde_json::Value>>>, // default: null
    /// The maximum number of [tokens](https://platform.openai.com/tokenizer) to generate in the completion.
    ///
    /// The total length of input tokens and generated tokens is limited by the model's context length. [Example Python code](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) for counting tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub n_predict: Option<u64>,
    /// Specify a JSON array of stopping strings.
    /// These words will not be included in the completion,
    /// so make sure to add them to the prompt for the next iteration (default: []).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop: Option<Vec<String>>,
    /// Not currently used.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stream: Option<bool>,
    /// min: 0.0, max: 2.0, default: None
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,
    /// min: -2.0, max: 2.0, default: None
    #[serde(skip_serializing_if = "Option::is_none")]
    pub frequency_penalty: Option<f32>,
    /// min: -2.0, max: 2.0, default: None
    #[serde(skip_serializing_if = "Option::is_none")]
    pub presence_penalty: Option<f32>,
    /// min: 0.0, max: 1.0, default: None
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f32>,
}

impl LlamaCppCompletionRequest {
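    /// Builds the llama.cpp completion request body from a crate-level `CompletionRequest`,
    /// mapping any prompt-building failures to `CompletionError::RequestBuilderError`.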
    pub fn new(req: &CompletionRequest) -> crate::Result<Self, CompletionError> {
        let cache_prompt = if req.config.cache_prompt {
            Some(true)
        } else {
            None
        };
        Ok(Self {
            prompt: req
                .prompt
                .local_prompt()
                .map_err(|e| CompletionError::RequestBuilderError(e.to_string()))?
                .get_built_prompt_as_tokens()
                .map_err(|e| CompletionError::RequestBuilderError(e.to_string()))?,
            prompt_string: Some(
                req.prompt
                    .local_prompt()
                    .map_err(|e| CompletionError::RequestBuilderError(e.to_string()))?
                    .get_built_prompt()
                    .map_err(|e| CompletionError::RequestBuilderError(e.to_string()))?,
            ),
            grammar: req.grammar_string.clone(),
            cache_prompt,
            logit_bias: req.logit_bias.as_ref().and_then(|lb| lb.get_llama_cpp()),
            frequency_penalty: req.config.frequency_penalty,
            stream: None,
            n_predict: req.config.actual_request_tokens,
            presence_penalty: Some(req.config.presence_penalty),
            stop: Some(req.stop_sequences.to_vec()),
            temperature: Some(req.config.temperature),
            top_p: req.config.top_p,
        })
    }
}
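
// Illustrative sketch (not from the upstream source): a small test showing how the request
// serializes. Optional fields left as `None` are omitted from the JSON body sent to
// llama.cpp, and `prompt_string` is never serialized because of `#[serde(skip)]`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn none_fields_are_omitted_from_serialized_request() {
        let request = LlamaCppCompletionRequest {
            prompt: vec![1, 15043],
            temperature: Some(1.0),
            ..Default::default()
        };
        let json = serde_json::to_value(&request).unwrap();
        assert_eq!(json["prompt"], serde_json::json!([1, 15043]));
        assert_eq!(json["temperature"], serde_json::json!(1.0));
        // `cache_prompt` was `None`, so it is skipped entirely.
        assert!(json.get("cache_prompt").is_none());
        // `prompt_string` carries `#[serde(skip)]` and never reaches the wire.
        assert!(json.get("prompt_string").is_none());
    }
}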