luwen_ref/
detect.rs

1// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::convert::Infallible;
5
6use indicatif::ProgressBar;
7use ttkmd_if::PciDevice;
8use luwen_if::{
9    chip::{
10        Chip, ChipDetectState, CommsStatus, ComponentStatusInfo, HlCommsInterface, InitError,
11        InitStatus,
12    },
13    CallbackStorage, ChipDetectOptions, UninitChip,
14};
15
16use crate::{comms_callback, error::LuwenError, ExtendedPciDevice};
17
18pub fn detect_chips_fallible() -> Result<Vec<UninitChip>, LuwenError> {
19    let mut chips = Vec::new();
20    let mut failed_chips = Vec::new();
21
22    let device_ids = PciDevice::scan();
23    for device_id in device_ids {
24        let ud = ExtendedPciDevice::open(device_id)?;
25
26        let arch = ud.borrow().device.arch;
27
28        let chip = Chip::open(arch, CallbackStorage::new(comms_callback, ud.clone()))?;
29
30        // First let's test basic pcie communication we may be in a hang state so it's
31        // important that we let the detect function know
32        let result = chip.axi_sread32("ARC_RESET.SCRATCH[0]");
33        if let Err(err) = result {
34            // Basic comms have failed... we should output a nice error message on the console
35            failed_chips.push((device_id, chip, err));
36        } else {
37            chips.push(chip);
38        }
39    }
40
41    let chip_detect_bar = indicatif::ProgressBar::new_spinner().with_style(
42        indicatif::ProgressStyle::default_spinner()
43            .template("{spinner:.green} Detecting chips (found {pos})")
44            .unwrap(),
45    );
46
47    let mut chip_init_bar = None;
48    let mut arc_init_bar = None;
49    let mut dram_init_bar = None;
50    let mut eth_init_bar = None;
51    let mut cpu_init_bar = None;
52
53    fn add_bar(bars: &indicatif::MultiProgress) -> ProgressBar {
54        let new_bar = bars.add(
55            indicatif::ProgressBar::new_spinner().with_style(
56                indicatif::ProgressStyle::default_spinner()
57                    .template("{spinner:.green} {msg}")
58                    .unwrap(),
59            ),
60        );
61        new_bar.set_message("Initializing Chip");
62        new_bar.enable_steady_tick(std::time::Duration::from_secs_f32(1.0 / 30.0));
63
64        new_bar
65    }
66
67    fn update_bar_with_status<P: std::fmt::Display, E: std::fmt::Display>(
68        bars: &indicatif::MultiProgress,
69        bar: &mut Option<ProgressBar>,
70        status: &ComponentStatusInfo<P, E>,
71    ) {
72        if bar.is_none() && status.is_present() {
73            *bar = Some(add_bar(bars));
74        }
75
76        if let Some(bar) = bar {
77            if status.is_waiting() && status.is_present() {
78                bar.set_message(status.to_string());
79            }
80        }
81    }
82
83    fn maybe_remove_bar<P, E>(
84        bars: &indicatif::MultiProgress,
85        bar: &mut Option<ProgressBar>,
86        status: &ComponentStatusInfo<P, E>,
87    ) {
88        if let Some(bar) = bar.take() {
89            if status.has_error() {
90                bar.finish();
91            } else {
92                bar.finish_and_clear();
93                bars.remove(&bar);
94            }
95        }
96    }
97
98    let bars = indicatif::MultiProgress::new();
99    let chip_detect_bar = bars.add(chip_detect_bar);
100    chip_detect_bar.enable_steady_tick(std::time::Duration::from_secs_f32(1.0 / 30.0));
101
102    // First we will output errors for the chips we alraedy know have failed
103    for (id, _, err) in &failed_chips {
104        chip_detect_bar.inc(1);
105        let bar = add_bar(&bars);
106        bar.finish_with_message(format!(
107            "Failed to communicate over pcie with chip {id}: {err}"
108        ));
109    }
110
111    let mut init_callback = |status: ChipDetectState| {
112        match status.call {
113            luwen_if::chip::CallReason::NotNew => {
114                chip_detect_bar.set_position(chip_detect_bar.position().saturating_sub(1));
115            }
116            luwen_if::chip::CallReason::NewChip => {
117                chip_detect_bar.inc(1);
118                chip_init_bar = Some(add_bar(&bars));
119            }
120            luwen_if::chip::CallReason::InitWait(status) => {
121                update_bar_with_status(&bars, &mut arc_init_bar, &status.arc_status);
122                update_bar_with_status(&bars, &mut dram_init_bar, &status.dram_status);
123                update_bar_with_status(&bars, &mut eth_init_bar, &status.eth_status);
124                update_bar_with_status(&bars, &mut cpu_init_bar, &status.cpu_status);
125
126                if let Some(bar) = chip_init_bar.as_ref() {
127                    bar.set_message(format!("Waiting chip to initialize"));
128                }
129            }
130            luwen_if::chip::CallReason::ChipInitCompleted(status) => {
131                chip_detect_bar.set_message("Chip initialization complete (found {pos})");
132
133                maybe_remove_bar(&bars, &mut arc_init_bar, &status.arc_status);
134                maybe_remove_bar(&bars, &mut dram_init_bar, &status.dram_status);
135                maybe_remove_bar(&bars, &mut eth_init_bar, &status.eth_status);
136                maybe_remove_bar(&bars, &mut cpu_init_bar, &status.cpu_status);
137
138                if let Some(bar) = chip_init_bar.take() {
139                    if status.has_error() {
140                        bar.finish_with_message("Chip initialization failed");
141                    } else {
142                        bar.finish_and_clear();
143                        bars.remove(&bar);
144                    }
145                }
146            }
147        };
148
149        Ok::<(), Infallible>(())
150    };
151
152    let options = ChipDetectOptions::default();
153    let mut chips = match luwen_if::detect_chips(chips, &mut init_callback, options) {
154        Err(InitError::CallbackError(err)) => {
155            chip_detect_bar
156                .finish_with_message(format!("Ran into error from status callback;\n{}", err));
157            return Err(luwen_if::error::PlatformError::Generic(
158                "Hit error from status callback".to_string(),
159                luwen_if::error::BtWrapper::capture(),
160            ))?;
161        }
162        Err(InitError::PlatformError(err)) => {
163            return Err(err)?;
164        }
165
166        Ok(chips) => chips,
167    };
168
169    chip_detect_bar.finish_with_message("Chip detection complete (found {pos})");
170
171    for (id, chip, err) in failed_chips.into_iter() {
172        let mut status = InitStatus::new_unknown();
173        status.comms_status = CommsStatus::CommunicationError(err.to_string());
174        status.unknown_state = false;
175        chips.insert(
176            id,
177            UninitChip::Partially {
178                status,
179                underlying: chip,
180            },
181        );
182    }
183
184    println!("");
185
186    Ok(chips)
187}
188
189pub fn detect_chips() -> Result<Vec<Chip>, LuwenError> {
190    let chips = detect_chips_fallible()?;
191
192    let mut output = Vec::with_capacity(chips.len());
193    for chip in chips {
194        output.push(chip.init(&mut |_| Ok::<(), Infallible>(())).map_err(Into::<luwen_if::error::PlatformError>::into)?);
195    }
196
197    Ok(output)
198}