llama_cpp_4/context/
perf.rs

//! Safe wrapper around `llama_perf_context_data`.
2use std::fmt::{Debug, Display, Formatter};
3
4use llama_cpp_sys_4::{ggml_time_us, llama_perf_context_print};
5
6use crate::context::LlamaContext;
7
8/// A wrapper around `llama_perf_context_data`.
9#[derive(Clone, Copy, Debug)]
10pub struct PerfContextData {
11    pub(crate) perf_context_data: llama_cpp_sys_4::llama_perf_context_data,
12}
13
14impl PerfContextData {
15    /// Create a new `PerfContextData`.
16    /// ```
17    /// # use llama_cpp_4::timing::PerfContextData;
18    /// let timings = PerfContextData::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7, 8, 9);
19    /// let timings_str = "load time = 3.00 ms
20    /// sample time = 4.00 ms / 7 runs (0.57 ms per token, 1750.00 tokens per second)
21    /// prompt eval time = 5.00 ms / 8 tokens (0.62 ms per token, 1600.00 tokens per second)
22    /// eval time = 6.00 ms / 9 runs (0.67 ms per token, 1500.00 tokens per second)
23    /// total time = 1.00 ms";
24    /// assert_eq!(timings_str, format!("{}", timings));
25    /// ```
26    #[allow(clippy::too_many_arguments)]
27    #[must_use]
28    pub fn new(
29        t_start_ms: f64,
30        // t_end_ms: f64,
31        t_load_ms: f64,
32        // t_sample_ms: f64,
33        t_p_eval_ms: f64,
34        t_eval_ms: f64,
35        // n_sample: i32,
36        n_p_eval: i32,
37        n_eval: i32,
38    ) -> Self {
39        Self {
40            perf_context_data: llama_cpp_sys_4::llama_perf_context_data {
41                t_start_ms,
42                // t_end_ms,
43                t_load_ms,
44                // t_sample_ms,
45                t_p_eval_ms,
46                t_eval_ms,
47                // n_sample,
48                n_p_eval,
49                n_eval,
50                n_reused: 0,
51            },
52        }
53    }
54
55    /// print llama context performance data
56    /// load time
57    /// prompt eval time
58    /// eval time
59    /// total time
60    pub fn print(ctx: &LlamaContext<'_>) {
61        unsafe {
62            llama_perf_context_print(ctx.context.as_ptr());
63        };
64    }
65
66    /// Get the start time in milliseconds.
67    #[must_use]
68    pub fn t_start_ms(&self) -> f64 {
69        self.perf_context_data.t_start_ms
70    }
71
72    /// Get the end time in milliseconds.
73    #[must_use]
74    pub fn t_end_ms(&self) -> f64 {
75        // self.perf_context_data.t_end_ms
76
77        #[allow(clippy::cast_precision_loss)]
78        {
79            1e-3 * (unsafe { ggml_time_us() }) as f64
80        }
81    }
82
83    /// Get the load time in milliseconds.
84    #[must_use]
85    pub fn t_load_ms(&self) -> f64 {
86        self.perf_context_data.t_load_ms
87    }
88
89    // /// Get the sample time in milliseconds.
90    // #[must_use]
91    // pub fn t_sample_ms(&self) -> f64 {
92    //     self.perf_context_data.t_sample_ms
93    // }
94
95    /// Get the prompt evaluation time in milliseconds.
96    #[must_use]
97    pub fn t_p_eval_ms(&self) -> f64 {
98        self.perf_context_data.t_p_eval_ms
99    }
100
101    /// Get the evaluation time in milliseconds.
102    #[must_use]
103    pub fn t_eval_ms(&self) -> f64 {
104        self.perf_context_data.t_eval_ms
105    }
106
107    // /// Get the number of samples.
108    // #[must_use]
109    // pub fn n_sample(&self) -> i32 {
110    //     self.perf_context_data.n_sample
111    // }
112
113    /// Get the number of prompt evaluations.
114    #[must_use]
115    pub fn n_p_eval(&self) -> i32 {
116        self.perf_context_data.n_p_eval
117    }
118
119    /// Get the number of evaluations.
120    #[must_use]
121    pub fn n_eval(&self) -> i32 {
122        self.perf_context_data.n_eval
123    }
124
125    /// Set the start time in milliseconds.
126    pub fn set_t_start_ms(&mut self, t_start_ms: f64) {
127        self.perf_context_data.t_start_ms = t_start_ms;
128    }
129
130    // /// Set the end time in milliseconds.
131    // pub fn set_t_end_ms(&mut self, t_end_ms: f64) {
132    //     self.perf_context_data.t_end_ms = t_end_ms;
133    // }
134
135    /// Set the load time in milliseconds.
136    pub fn set_t_load_ms(&mut self, t_load_ms: f64) {
137        self.perf_context_data.t_load_ms = t_load_ms;
138    }
139
140    // /// Set the sample time in milliseconds.
141    // pub fn set_t_sample_ms(&mut self, t_sample_ms: f64) {
142    //     self.perf_context_data.t_sample_ms = t_sample_ms;
143    // }
144
145    /// Set the prompt evaluation time in milliseconds.
146    pub fn set_t_p_eval_ms(&mut self, t_p_eval_ms: f64) {
147        self.perf_context_data.t_p_eval_ms = t_p_eval_ms;
148    }
149
150    /// Set the evaluation time in milliseconds.
151    pub fn set_t_eval_ms(&mut self, t_eval_ms: f64) {
152        self.perf_context_data.t_eval_ms = t_eval_ms;
153    }
154
155    // /// Set the number of samples.
156    // pub fn set_n_sample(&mut self, n_sample: i32) {
157    //     self.perf_context_data.n_sample = n_sample;
158    // }
159
160    /// Set the number of prompt evaluations.
161    pub fn set_n_p_eval(&mut self, n_p_eval: i32) {
162        self.perf_context_data.n_p_eval = n_p_eval;
163    }
164
165    /// Set the number of evaluations.
166    pub fn set_n_eval(&mut self, n_eval: i32) {
167        self.perf_context_data.n_eval = n_eval;
168    }
169}
170
171impl Display for PerfContextData {
172    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
173        writeln!(f, "load time = {:.2} ms", self.t_load_ms())?;
174        // writeln!(
175        //     f,
176        //     "sample time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)",
177        //     self.t_sample_ms(),
178        //     self.n_sample(),
179        //     self.t_sample_ms() / f64::from(self.n_sample()),
180        //     1e3 / self.t_sample_ms() * f64::from(self.n_sample())
181        // )?;
182        writeln!(
183            f,
184            "prompt eval time = {:.2} ms / {} tokens ({:.2} ms per token, {:.2} tokens per second)",
185            self.t_p_eval_ms(),
186            self.n_p_eval(),
187            self.t_p_eval_ms() / f64::from(self.n_p_eval()),
188            1e3 / self.t_p_eval_ms() * f64::from(self.n_p_eval())
189        )?;
190        writeln!(
191            f,
192            "eval time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)",
193            self.t_eval_ms(),
194            self.n_eval(),
195            self.t_eval_ms() / f64::from(self.n_eval()),
196            1e3 / self.t_eval_ms() * f64::from(self.n_eval())
197        )?;
198        write!(
199            f,
200            "total time = {:.2} ms",
201            self.t_end_ms() - self.t_start_ms()
202        )
203    }
204}
llama_cpp_4/context/perf.rs

llama_cpp_4/context/
perf.rs