Skip to main content

llama_cpp_4/context/
perf.rs

1//! Safe wrapper around `llama_perf_context_data`.
2use std::fmt::{Debug, Display, Formatter};
3
4use llama_cpp_sys_4::{ggml_time_us, llama_perf_context_print};
5
6use crate::context::LlamaContext;
7
/// A wrapper around `llama_perf_context_data`.
///
/// The wrapper owns a plain copy of the raw struct (it is `Clone` + `Copy`);
/// individual timings and counters are read and written through the accessor
/// methods on this type.
#[derive(Clone, Copy, Debug)]
pub struct PerfContextData {
    /// The raw FFI struct that every getter and setter reads from or writes to.
    pub(crate) perf_context_data: llama_cpp_sys_4::llama_perf_context_data,
}
13
14impl PerfContextData {
15    /// Create a new `PerfContextData`.
16    /// ```
17    /// # use llama_cpp_4::context::perf::PerfContextData;
18    /// let timings = PerfContextData::new(1.0, 2.0, 3.0, 4.0, 5, 6);
19    /// assert_eq!(timings.t_load_ms(), 2.0);
20    /// assert_eq!(timings.t_p_eval_ms(), 3.0);
21    /// assert_eq!(timings.t_eval_ms(), 4.0);
22    /// assert_eq!(timings.n_p_eval(), 5);
23    /// assert_eq!(timings.n_eval(), 6);
24    /// ```
25    #[allow(clippy::too_many_arguments)]
26    #[must_use]
27    pub fn new(
28        t_start_ms: f64,
29        // t_end_ms: f64,
30        t_load_ms: f64,
31        // t_sample_ms: f64,
32        t_p_eval_ms: f64,
33        t_eval_ms: f64,
34        // n_sample: i32,
35        n_p_eval: i32,
36        n_eval: i32,
37    ) -> Self {
38        Self {
39            perf_context_data: llama_cpp_sys_4::llama_perf_context_data {
40                t_start_ms,
41                // t_end_ms,
42                t_load_ms,
43                // t_sample_ms,
44                t_p_eval_ms,
45                t_eval_ms,
46                // n_sample,
47                n_p_eval,
48                n_eval,
49                n_reused: 0,
50            },
51        }
52    }
53
54    /// print llama context performance data
55    /// load time
56    /// prompt eval time
57    /// eval time
58    /// total time
59    pub fn print(ctx: &LlamaContext<'_>) {
60        unsafe {
61            llama_perf_context_print(ctx.context.as_ptr());
62        };
63    }
64
65    /// Get the start time in milliseconds.
66    #[must_use]
67    pub fn t_start_ms(&self) -> f64 {
68        self.perf_context_data.t_start_ms
69    }
70
71    /// Get the end time in milliseconds.
72    #[must_use]
73    pub fn t_end_ms(&self) -> f64 {
74        // self.perf_context_data.t_end_ms
75
76        #[allow(clippy::cast_precision_loss)]
77        {
78            1e-3 * (unsafe { ggml_time_us() }) as f64
79        }
80    }
81
82    /// Get the load time in milliseconds.
83    #[must_use]
84    pub fn t_load_ms(&self) -> f64 {
85        self.perf_context_data.t_load_ms
86    }
87
88    // /// Get the sample time in milliseconds.
89    // #[must_use]
90    // pub fn t_sample_ms(&self) -> f64 {
91    //     self.perf_context_data.t_sample_ms
92    // }
93
94    /// Get the prompt evaluation time in milliseconds.
95    #[must_use]
96    pub fn t_p_eval_ms(&self) -> f64 {
97        self.perf_context_data.t_p_eval_ms
98    }
99
100    /// Get the evaluation time in milliseconds.
101    #[must_use]
102    pub fn t_eval_ms(&self) -> f64 {
103        self.perf_context_data.t_eval_ms
104    }
105
106    // /// Get the number of samples.
107    // #[must_use]
108    // pub fn n_sample(&self) -> i32 {
109    //     self.perf_context_data.n_sample
110    // }
111
112    /// Get the number of prompt evaluations.
113    #[must_use]
114    pub fn n_p_eval(&self) -> i32 {
115        self.perf_context_data.n_p_eval
116    }
117
118    /// Get the number of evaluations.
119    #[must_use]
120    pub fn n_eval(&self) -> i32 {
121        self.perf_context_data.n_eval
122    }
123
124    /// Set the start time in milliseconds.
125    pub fn set_t_start_ms(&mut self, t_start_ms: f64) {
126        self.perf_context_data.t_start_ms = t_start_ms;
127    }
128
129    // /// Set the end time in milliseconds.
130    // pub fn set_t_end_ms(&mut self, t_end_ms: f64) {
131    //     self.perf_context_data.t_end_ms = t_end_ms;
132    // }
133
134    /// Set the load time in milliseconds.
135    pub fn set_t_load_ms(&mut self, t_load_ms: f64) {
136        self.perf_context_data.t_load_ms = t_load_ms;
137    }
138
139    // /// Set the sample time in milliseconds.
140    // pub fn set_t_sample_ms(&mut self, t_sample_ms: f64) {
141    //     self.perf_context_data.t_sample_ms = t_sample_ms;
142    // }
143
144    /// Set the prompt evaluation time in milliseconds.
145    pub fn set_t_p_eval_ms(&mut self, t_p_eval_ms: f64) {
146        self.perf_context_data.t_p_eval_ms = t_p_eval_ms;
147    }
148
149    /// Set the evaluation time in milliseconds.
150    pub fn set_t_eval_ms(&mut self, t_eval_ms: f64) {
151        self.perf_context_data.t_eval_ms = t_eval_ms;
152    }
153
154    // /// Set the number of samples.
155    // pub fn set_n_sample(&mut self, n_sample: i32) {
156    //     self.perf_context_data.n_sample = n_sample;
157    // }
158
159    /// Set the number of prompt evaluations.
160    pub fn set_n_p_eval(&mut self, n_p_eval: i32) {
161        self.perf_context_data.n_p_eval = n_p_eval;
162    }
163
164    /// Set the number of evaluations.
165    pub fn set_n_eval(&mut self, n_eval: i32) {
166        self.perf_context_data.n_eval = n_eval;
167    }
168}
169
170impl Display for PerfContextData {
171    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
172        writeln!(f, "load time = {:.2} ms", self.t_load_ms())?;
173        // writeln!(
174        //     f,
175        //     "sample time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)",
176        //     self.t_sample_ms(),
177        //     self.n_sample(),
178        //     self.t_sample_ms() / f64::from(self.n_sample()),
179        //     1e3 / self.t_sample_ms() * f64::from(self.n_sample())
180        // )?;
181        writeln!(
182            f,
183            "prompt eval time = {:.2} ms / {} tokens ({:.2} ms per token, {:.2} tokens per second)",
184            self.t_p_eval_ms(),
185            self.n_p_eval(),
186            self.t_p_eval_ms() / f64::from(self.n_p_eval()),
187            1e3 / self.t_p_eval_ms() * f64::from(self.n_p_eval())
188        )?;
189        writeln!(
190            f,
191            "eval time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)",
192            self.t_eval_ms(),
193            self.n_eval(),
194            self.t_eval_ms() / f64::from(self.n_eval()),
195            1e3 / self.t_eval_ms() * f64::from(self.n_eval())
196        )?;
197        write!(
198            f,
199            "total time = {:.2} ms",
200            self.t_end_ms() - self.t_start_ms()
201        )
202    }
203}