tract_libcli/
profile.rs

1use tract_core::internal::*;
2use tract_core::num_traits::Zero;
3use tract_core::ops::scan::State;
4use tract_core::ops::submodel::TypedModelOpState;
5
6use crate::annotations::*;
7use crate::model::Model;
8use crate::tensor::make_inputs_for_model;
9use std::any::TypeId;
10use std::time::Duration;
11
12pub struct BenchLimits {
13    pub warmup_loops: usize,
14    pub warmup_time: std::time::Duration,
15    pub max_loops: usize,
16    pub max_time: std::time::Duration,
17}
18
19impl Default for BenchLimits {
20    fn default() -> Self {
21        BenchLimits {
22            warmup_loops: 0,
23            warmup_time: Duration::default(),
24            max_loops: 100_000,
25            max_time: std::time::Duration::from_secs(5),
26        }
27    }
28}
29
30impl BenchLimits {
31    pub fn warmup(&self, model: &TypedModel, inputs: &TVec<TValue>) -> TractResult<()> {
32        if self.warmup_time.is_zero() && self.warmup_loops.is_zero() {
33            return Ok(());
34        }
35        let plan = TypedSimplePlan::new(model.clone())?;
36        let mut state = TypedSimpleState::new(Arc::new(plan))?;
37        let mut iters = 0;
38        let max_loops = if self.warmup_loops.is_zero() { usize::MAX } else { self.warmup_loops };
39        let max_time = if self.warmup_time.is_zero() { Duration::MAX } else { self.warmup_time };
40
41        let start_warmup = crate::time::now();
42        debug!("Warming up before profiling...");
43        while iters < max_loops && start_warmup.elapsed() < max_time {
44            state.run(inputs.clone())?;
45            iters += 1;
46        }
47        debug!("Done warming up.");
48        Ok(())
49    }
50}
51
52pub fn profile(
53    model: &TypedModel,
54    bench_limits: &BenchLimits,
55    dg: &mut Annotations,
56    plan_options: &PlanOptions,
57    inputs: &TVec<TValue>,
58    custom_profiler: Option<HashMap<TypeId, Profiler>>,
59    folded: bool,
60) -> TractResult<()> {
61    info!("Running entire network");
62    let mut iters = 0usize;
63    let prefix = tvec!();
64
65    bench_limits.warmup(model, inputs)?;
66
67    let plan = TypedSimplePlan::new_with_options(model.clone(), plan_options)?;
68    let mut state = TypedSimpleState::new(Arc::new(plan))?;
69
70    let start = crate::time::now();
71    let mut time_accounted_by_inner_nodes = Duration::default();
72    while iters < bench_limits.max_loops && start.elapsed() < bench_limits.max_time {
73        rec_profiler(
74            &mut state,
75            dg,
76            inputs,
77            custom_profiler.as_ref(),
78            &prefix,
79            None,
80            &mut time_accounted_by_inner_nodes,
81            folded,
82        )?;
83
84        iters += 1;
85    }
86
87    let entire = start.elapsed() - time_accounted_by_inner_nodes;
88
89    info!("Running {} iterations max. for each node.", bench_limits.max_loops);
90    info!("Running for {} ms max. for each node.", bench_limits.max_time.as_millis());
91
92    let denum = (iters as f32).recip();
93    let entire = entire.mul_f32(denum);
94    for d in dg.tags.values_mut() {
95        if let Some(d) = d.profile.as_mut() {
96            *d = d.mul_f32(denum);
97        }
98
99        if let Some(d) = d.accelerator_profile.as_mut() {
100            *d = d.mul_f32(denum);
101        }
102    }
103    let max = dg.tags.values().filter_map(|t| t.profile).max().unwrap();
104    let sum = dg.tags.values().filter_map(|t| t.profile).sum::<Duration>();
105    let accel_sum = dg.tags.values().filter_map(|t| t.accelerator_profile).sum::<Duration>();
106    dg.profile_summary = Some(ProfileSummary { max, sum, accel_sum, entire, iters });
107    Ok(())
108}
109
110#[cfg(any(target_os = "macos", target_os = "ios"))]
111pub fn profile_metal(
112    model: &TypedModel,
113    bench_limits: &BenchLimits,
114    dg: &mut Annotations,
115    plan_options: &PlanOptions,
116    inputs: &TVec<TValue>,
117) -> TractResult<()> {
118    info!("Running entire network");
119    let mut iters = 0usize;
120    let prefix = tvec!();
121
122    bench_limits.warmup(model, inputs)?;
123
124    let mut plan = TypedSimplePlan::new_with_options(model.clone(), plan_options)?;
125    let state = TypedSimpleState::new_from_inputs(&plan, inputs.clone())?;
126
127    let session_handler =
128        tract_metal::MetalSessionHandler::from_plan(&plan, &state.session_state.resolved_symbols)?;
129
130    plan = plan.with_session_handler(session_handler);
131
132    let mut state = TypedSimpleState::new(Arc::new(plan))?;
133
134    let mut entire = Duration::default();
135    while iters < bench_limits.max_loops && entire < bench_limits.max_time {
136        entire += rec_profiler_metal(&mut state, dg, inputs, &prefix)?.1;
137
138        iters += 1;
139    }
140
141    info!("Running {} iterations max. for each node.", bench_limits.max_loops);
142    info!("Running for {} ms max. for each node.", bench_limits.max_time.as_millis());
143
144    let denum = (iters as f32).recip();
145    let entire = entire.mul_f32(denum);
146    for d in dg.tags.values_mut() {
147        if let Some(d) = d.profile.as_mut() {
148            *d = d.mul_f32(denum);
149        }
150
151        if let Some(d) = d.accelerator_profile.as_mut() {
152            *d = d.mul_f32(denum);
153        }
154    }
155    let max = dg.tags.values().filter_map(|t| t.profile).max().unwrap();
156    let sum = dg.tags.values().filter_map(|t| t.profile).sum::<Duration>();
157    let accel_sum = dg.tags.values().filter_map(|t| t.accelerator_profile).sum::<Duration>();
158    dg.profile_summary = Some(ProfileSummary { max, sum, accel_sum, entire, iters });
159    Ok(())
160}
161
162#[cfg(any(target_os = "macos", target_os = "ios"))]
163pub fn rec_profiler_metal(
164    state: &mut TypedSimpleState<TypedModel, Arc<TypedSimplePlan<TypedModel>>>,
165    dg: &mut Annotations,
166    inputs: &TVec<TValue>,
167    prefix: &[(usize, String)],
168) -> TractResult<(TVec<TValue>, Duration)> {
169    // Metal profiler is broken, only profile CPU
170
171    //tract_metal::METAL_CONTEXT.with_borrow(|ctxt| {
172    //    let (mut cpu_start, mut gpu_start): (u64, u64) = (0, 0);
173    //    ctxt.device().sample_timestamps(&mut cpu_start, &mut gpu_start);
174//
175    //    let n_nodes = state.plan().model().nodes_len();
176    //    let (result, eval_dur, profiler) = ctxt.profile(n_nodes, || {
177            let profile_start = crate::time::now();
178            let r = state.run_plan_with_eval(
179                inputs.clone(),
180                |session_state, mut node_state, node, input| {
181                    // Profile node
182                    let start = crate::time::now();
183                    let res = tract_core::plan::eval(
184                        session_state,
185                        node_state.as_deref_mut(),
186                        node,
187                        input.clone(),
188                    );
189                    let elapsed = start.elapsed();
190                    let node_id = NodeQId(prefix.into(), node.id);
191                    *dg.node_mut(node_id).profile.get_or_insert(Duration::default()) += elapsed;
192
193                    res
194                },
195            )?;
196
197            Ok((r, profile_start.elapsed()))
198    //    })?;
199//
200    //    let (mut cpu_end, mut gpu_end): (u64, u64) = (0, 0);
201    //    ctxt.device().sample_timestamps(&mut cpu_end, &mut gpu_end);
202//
203    //    profiler.iter().enumerate().for_each(|(node_id, duration)| {
204    //        let node_id = NodeQId(prefix.into(), node_id);
205    //        *dg.node_mut(node_id).accelerator_profile.get_or_insert(Duration::default()) +=
206    //            Duration::from_nanos(tract_metal::utils::rescale_gpu_duration(
207    //                *duration, cpu_start, cpu_end, gpu_start, gpu_end,
208    //            ));
209    //    });
210//
211    //    Ok((result, eval_dur))
212    //})
213}
214
215#[allow(clippy::too_many_arguments)]
216pub fn rec_profiler(
217    state: &mut TypedSimpleState<TypedModel, Arc<TypedSimplePlan<TypedModel>>>,
218    dg: &mut Annotations,
219    inputs: &TVec<TValue>,
220    profilers: Option<&HashMap<TypeId, Profiler>>,
221    prefix: &[(usize, String)],
222    multiplier: Option<usize>,
223    time_accounted_by_inner_nodes: &mut Duration,
224    folded: bool,
225) -> TractResult<TVec<TValue>> {
226    let r = state.run_plan_with_eval(
227        inputs.clone(),
228        |session_state, mut node_state, node, input| {
229            // Profile node
230            let start = crate::time::now();
231            let res = tract_core::plan::eval(
232                session_state,
233                node_state.as_deref_mut(),
234                node,
235                input.clone(),
236            );
237            let elapsed = start.elapsed().mul_f32(multiplier.unwrap_or(1) as _);
238            let node_id = NodeQId(prefix.into(), node.id);
239            *dg.node_mut(node_id).profile.get_or_insert(Duration::default()) += elapsed;
240
241            if !folded {
242                let start = crate::time::now();
243                profile_submodel(
244                    node,
245                    node_state,
246                    input,
247                    dg,
248                    profilers,
249                    prefix,
250                    time_accounted_by_inner_nodes,
251                )?;
252                *time_accounted_by_inner_nodes += start.elapsed();
253            }
254
255            // Update parent nodes if any (childs timings are deducted from parents)
256            let prefix_vec = prefix.to_vec();
257            if !prefix_vec.is_empty() {
258                (1..prefix_vec.len() + 1).map(|idx| prefix_vec[..idx].to_vec()).for_each(
259                    |parent_path| {
260                        let parent_node = parent_path.last().map(|it| it.0).unwrap();
261                        let parent = dg
262                            .node_mut(NodeQId(
263                                parent_path[..parent_path.len() - 1].into(),
264                                parent_node,
265                            ))
266                            .profile
267                            .get_or_insert(Duration::default());
268                        *parent -= elapsed.min(*parent);
269                    },
270                );
271            }
272            res
273        },
274    )?;
275    Ok(r)
276}
277
278fn profile_submodel(
279    node: &TypedNode,
280    mut node_state: Option<&mut dyn OpState>,
281    input: TVec<TValue>,
282    dg: &mut Annotations,
283    profilers: Option<&HashMap<TypeId, Profiler>>,
284    prefix: &[(usize, String)],
285    time_accounted_by_inner_nodes: &mut Duration,
286) -> TractResult<()> {
287    if let Some(ref mut op_state) = node_state {
288        if let Some(profiler) = profilers.and_then(|it| it.get(&op_state.type_id())) {
289            let mut new_prefix: TVec<_> = prefix.into();
290            new_prefix.push((node.id, "submodel".to_string()));
291
292            let (_, _) =
293                (profiler.func)(*op_state, input, dg, &new_prefix, time_accounted_by_inner_nodes)?;
294        } else if let Some(scan_state) = op_state.downcast_mut::<State>() {
295            let mut new_prefix: TVec<_> = prefix.into();
296            new_prefix.push((node.id, "loop".to_string()));
297
298            let scan_inputs = make_inputs_for_model(scan_state.model_state.model())?;
299            let multi = scan_state.iteration_count(&input);
300
301            rec_profiler(
302                &mut scan_state.model_state,
303                dg,
304                &scan_inputs,
305                None,
306                &new_prefix,
307                Some(multi),
308                time_accounted_by_inner_nodes,
309                false,
310            )?;
311        } else if let Some(typed_model_state) = op_state.downcast_mut::<TypedModelOpState>() {
312            let mut new_prefix: TVec<_> = prefix.into();
313            new_prefix.push((node.id, "submodel".to_string()));
314
315            rec_profiler(
316                typed_model_state,
317                dg,
318                &input,
319                None,
320                &new_prefix,
321                None,
322                time_accounted_by_inner_nodes,
323                false,
324            )?;
325        }
326    }
327
328    Ok(())
329}
330
331type ProfilerFn = fn(
332    &mut dyn OpState,
333    TVec<TValue>,
334    &mut Annotations,
335    &[(usize, String)],
336    &mut Duration,
337) -> TractResult<(TractResult<TVec<TValue>>, Duration)>;
338
339#[derive(Clone)]
340pub struct Profiler {
341    pub func: ProfilerFn,
342    pub name: &'static str,
343}
344
345impl Hash for Profiler {
346    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
347        self.name.hash(state)
348    }
349}
350
351pub fn extract_costs(
352    annotations: &mut Annotations,
353    model: &dyn Model,
354    extra_symbols: &SymbolValues,
355) -> TractResult<()> {
356    fn extract_costs_rec(
357        annotations: &mut Annotations,
358        model: &dyn Model,
359        prefix: &[(usize, String)],
360        multiplier: TDim,
361        extra_symbols: &SymbolValues,
362    ) -> TractResult<()> {
363        if let Some(model) = model.downcast_ref::<TypedModel>() {
364            for node_id in 0..model.nodes().len() {
365                let inputs = model.node_input_facts(node_id)?;
366                let cost = model
367                    .node(node_id)
368                    .op
369                    .cost(&inputs)
370                    .with_context(|| format!("costing node {}", model.node(node_id)))?;
371                annotations.node_mut(NodeQId(prefix.into(), node_id)).cost = cost
372                    .into_iter()
373                    .map(|(k, v)| {
374                        let cost = if k.is_compute() { v * &multiplier } else { v };
375                        (k, cost.eval(extra_symbols))
376                    })
377                    .collect();
378
379                let nested_subs = model.nested_models(node_id);
380                let nested_multis = (model as &dyn Model).nested_models_iters(node_id, &inputs);
381                for (name, sub) in nested_subs {
382                    let mut prefix: TVec<_> = prefix.into();
383                    prefix.push((node_id, name.to_string()));
384                    extract_costs_rec(
385                        annotations,
386                        sub,
387                        &prefix,
388                        nested_multis.clone().unwrap_or_else(|| 1.into()) * &multiplier,
389                        extra_symbols,
390                    )?;
391                }
392            }
393        }
394        Ok(())
395    }
396    extract_costs_rec(annotations, model, &[], 1.into(), extra_symbols)
397}