1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
//! Basic example demonstrating LlamaCppServerAgent usage.
//!
//! This example shows how to use LlamaCppServerAgent for local LLM inference
//! with llama-server (llama.cpp HTTP server).
//!
//! # Prerequisites
//!
//! 1. Build llama.cpp: https://github.com/ggerganov/llama.cpp
//! 2. Download a GGUF model (e.g., from HuggingFace)
//! 3. Start the server:
//! ```bash
//! llama-server -m model.gguf --host 0.0.0.0 --port 8080
//! ```
//!
//! # Run
//!
//! ```bash
//! cargo run --example llama_cpp_server_basic --features llama-cpp-server
//! ```
use llm_toolkit::agent::Agent;
use llm_toolkit::agent::impls::{ChatTemplate, LlamaCppServerAgent};
#[tokio::main(flavor = "current_thread")]
async fn main() {
    println!("=== llama-server API Agent Example ===\n");

    // ------------------------------------------------------------------
    // 1. Default configuration (localhost:8080, Llama3 template)
    // ------------------------------------------------------------------
    println!("1. Default Configuration (localhost:8080, Llama3 template)");
    let default_agent = LlamaCppServerAgent::new();
    println!(" Endpoint: {}", default_agent.endpoint());

    // The rest of the demo needs a live server, so bail out early when the
    // health probe fails.
    if !default_agent.is_healthy().await {
        println!(" Status: llama-server is NOT running");
        println!(" Please start: llama-server -m model.gguf --port 8080\n");
        return;
    }
    println!(" Status: llama-server is healthy\n");

    // ------------------------------------------------------------------
    // 2. Concurrent request capacity reported by the server
    // ------------------------------------------------------------------
    println!("2. Server Slots (Concurrent Request Capacity)");
    match default_agent.available_slots().await {
        Ok(count) => println!(" Available slots: {}\n", count),
        Err(err) => println!(" Could not get slots: {}\n", err),
    }

    // ------------------------------------------------------------------
    // 3. How each chat template wraps the same prompt
    // ------------------------------------------------------------------
    println!("3. Chat Template Examples");
    for (label, tmpl) in [
        ("Llama3", ChatTemplate::Llama3),
        ("Qwen", ChatTemplate::Qwen),
        ("Mistral", ChatTemplate::Mistral),
        ("None (raw)", ChatTemplate::None),
    ] {
        // Escape newlines so each rendered template fits on one display line.
        let rendered = tmpl.format("Hello").replace('\n', "\\n");
        println!(" {}: {}", label, rendered);
    }
    println!();

    // ------------------------------------------------------------------
    // 4. Builder-style custom configuration
    // ------------------------------------------------------------------
    println!("4. Custom Configuration");
    let configured = LlamaCppServerAgent::new()
        .with_endpoint("http://localhost:8080")
        .with_chat_template(ChatTemplate::Llama3)
        .with_max_tokens(256)
        .with_temperature(0.7)
        .with_system_prompt("You are a helpful assistant. Be concise.");
    println!(" Max tokens: 256");
    println!(" Temperature: 0.7");
    println!(" System prompt: Set\n");

    // ------------------------------------------------------------------
    // 5. Run a single prompt through the configured agent
    // ------------------------------------------------------------------
    println!("5. Execute a Prompt");
    println!(" Prompt: 'What is Rust? Answer in one sentence.'\n");
    match configured
        .execute("What is Rust? Answer in one sentence.".into())
        .await
    {
        Ok(reply) => println!(" Response: {}\n", reply.trim()),
        Err(err) => println!(" Error: {}\n", err),
    }

    // ------------------------------------------------------------------
    // 6. JSON generation steered via the system prompt
    // ------------------------------------------------------------------
    println!("6. JSON Generation");
    let json_generator = LlamaCppServerAgent::new()
        .with_chat_template(ChatTemplate::Llama3)
        .with_system_prompt(
            "You are a JSON generator. Always respond with valid JSON only, no explanation.",
        )
        .with_temperature(0.3); // lower temperature for more deterministic output
    println!(" Prompt: 'Generate a user object with name and age'\n");
    match json_generator
        .execute("Generate a user object with name and age".into())
        .await
    {
        Ok(reply) => println!(" Response: {}\n", reply.trim()),
        Err(err) => println!(" Error: {}\n", err),
    }

    // ------------------------------------------------------------------
    // 7. Configuration picked up from environment variables
    // ------------------------------------------------------------------
    println!("7. Environment Variable Configuration");
    let agent_from_env = LlamaCppServerAgent::from_env();
    println!(
        " Endpoint (from LLAMA_SERVER_ENDPOINT or default): {}",
        agent_from_env.endpoint()
    );
    println!(
        " Variables: LLAMA_SERVER_ENDPOINT, LLAMA_SERVER_MAX_TOKENS, LLAMA_SERVER_TEMPERATURE\n"
    );
    println!("=== Example Complete ===");
}