luwen_if/
detect_chips.rs

1// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::collections::HashSet;
5
6use luwen_core::Arch;
7
8use crate::{
9    chip::{wait_for_init, Chip, InitError, InitStatus},
10    error::{BtWrapper, PlatformError},
11    ChipImpl, EthAddr,
12};
13
14#[derive(PartialEq, Eq, Hash, Debug, Clone)]
15enum InterfaceIdOrCoord {
16    Id(u32),
17    Coord(EthAddr),
18}
19
20/// Represents a chip object which may or may not be initialized.
21pub enum UninitChip {
22    /// A partially initialized chip, it may be unsafe (0xffffffff errors) to interact with this chip.
23    Partially {
24        /// Contains the init status
25        status: InitStatus,
26        /// Returned when the chip is explicitly upgraded.
27        /// Or init is rerun.
28        underlying: Chip,
29    },
30    /// The chip is fine and can be safely upgraded.
31    Initialized(Chip),
32}
33
34// HACK(drosen): Probably should just implement clone on Chip...
35fn clone_chip(chip: &Chip) -> Chip {
36    if let Some(wh) = chip.as_wh() {
37        Chip::from(Box::new(wh.clone()) as Box<dyn ChipImpl>)
38    } else if let Some(gs) = chip.as_gs() {
39        Chip::from(Box::new(gs.clone()) as Box<dyn ChipImpl>)
40    } else {
41        unimplemented!(
42            "Don't have a clone handler for chip with arch {:?}.",
43            chip.get_arch()
44        )
45    }
46}
47
48impl Clone for UninitChip {
49    fn clone(&self) -> Self {
50        match self {
51            Self::Partially { status, underlying } => Self::Partially {
52                status: status.clone(),
53                underlying: clone_chip(underlying),
54            },
55            Self::Initialized(chip) => Self::Initialized(clone_chip(chip)),
56        }
57    }
58}
59
60impl UninitChip {
61    pub fn new(status: InitStatus, chip: &Chip) -> Self {
62        let chip = clone_chip(chip);
63        if status.init_complete() && !status.has_error() {
64            UninitChip::Initialized(chip)
65        } else {
66            UninitChip::Partially {
67                status,
68                underlying: chip,
69            }
70        }
71    }
72
73    pub fn status(&self) -> Option<&InitStatus> {
74        match self {
75            UninitChip::Partially { status, .. } => Some(status),
76            UninitChip::Initialized(_) => None,
77        }
78    }
79
80    /// Initialize the chip, if init fails at this point then we return a result
81    /// instead of an UninitChip.
82    pub fn init<E>(
83        self,
84        init_callback: &mut impl FnMut(crate::chip::ChipDetectState) -> Result<(), E>,
85    ) -> Result<Chip, InitError<E>> {
86        match self {
87            UninitChip::Partially { mut underlying, .. } => {
88                wait_for_init(&mut underlying, init_callback, false, false)?;
89
90                Ok(underlying)
91            }
92            UninitChip::Initialized(chip) => Ok(chip),
93        }
94    }
95
96    pub fn upgrade(self) -> Chip {
97        match self {
98            UninitChip::Partially { underlying, .. } => underlying,
99            UninitChip::Initialized(chip) => chip,
100        }
101    }
102
103    pub fn try_upgrade(&self) -> Option<&Chip> {
104        match self {
105            UninitChip::Partially { status, underlying } => {
106                if status.init_complete() && !status.has_error() {
107                    Some(underlying)
108                } else {
109                    None
110                }
111            }
112            UninitChip::Initialized(chip) => Some(chip),
113        }
114    }
115
116    pub fn is_initialized(&self) -> bool {
117        match self {
118            UninitChip::Partially { status, .. } => status.init_complete(),
119            UninitChip::Initialized(_) => true,
120        }
121    }
122
123    pub fn is_healthy(&self) -> Option<bool> {
124        match self {
125            UninitChip::Partially { status, .. } => {
126                if status.init_complete() {
127                    Some(status.has_error())
128                } else {
129                    None
130                }
131            }
132            UninitChip::Initialized(_) => Some(true),
133        }
134    }
135
136    pub fn arc_alive(&self) -> bool {
137        match self {
138            UninitChip::Partially { status, .. } => {
139                !status.arc_status.is_waiting() && !status.arc_status.has_error()
140            }
141            UninitChip::Initialized(_) => true,
142        }
143    }
144
145    pub fn dram_safe(&self) -> bool {
146        match self {
147            UninitChip::Partially { status, .. } => {
148                !status.dram_status.is_waiting() && !status.dram_status.has_error()
149            }
150            UninitChip::Initialized(_) => true,
151        }
152    }
153
154    pub fn eth_safe(&self) -> bool {
155        match self {
156            UninitChip::Partially { status, .. } => {
157                !status.eth_status.is_waiting() && !status.eth_status.has_error()
158            }
159            UninitChip::Initialized(_) => true,
160        }
161    }
162
163    pub fn cpu_safe(&self) -> bool {
164        match self {
165            UninitChip::Partially { status, .. } => {
166                !status.cpu_status.is_waiting() && !status.cpu_status.has_error()
167            }
168            UninitChip::Initialized(_) => true,
169        }
170    }
171}
172
173pub struct ChipDetectOptions {
174    /// If true, we will continue searching for chips even if we encounter a *recoverable* error.
175    /// If false, detection errors will be raised as an Err(..).
176    pub continue_on_failure: bool,
177    /// If true, then we will search for chips directly available over a physical interface (pci, jtag, i2c, etc...)
178    /// If false, we will search for chips directly available and via ethernet.
179    pub local_only: bool,
180    /// If len > 0 then only chips with the given archs will be returned.
181    pub chip_filter: Vec<Arch>,
182    /// If true, then we will not initialize anything that might cause a problem (i.e. a noc hang).
183    pub noc_safe: bool,
184}
185
186impl Default for ChipDetectOptions {
187    fn default() -> Self {
188        Self {
189            continue_on_failure: true,
190            local_only: false,
191            chip_filter: Vec::new(),
192            noc_safe: false,
193        }
194    }
195}
196
197impl ChipDetectOptions {
198    pub fn new() -> Self {
199        Self::default()
200    }
201
202    pub fn continue_on_failure(mut self, continue_on_failure: bool) -> Self {
203        self.continue_on_failure = continue_on_failure;
204        self
205    }
206
207    pub fn local_only(mut self, local_only: bool) -> Self {
208        self.local_only = local_only;
209        self
210    }
211
212    pub fn noc_safe(mut self, noc_safe: bool) -> Self {
213        self.noc_safe = noc_safe;
214        self
215    }
216}
217
218/// Find all chips accessible from the given set of root chips.
219/// For the most part this should be a set of chips found via a PCI scan, but it doens't have to be.
220///
221/// The most important part of this algorithm is determining which chips are duplicates of other chips.
222/// In general two boards can be differentiated by their board id, but this is not always the case.
223/// For example the gs or wh X2, in that case we must fallback on the interface id for grayskull or ethernet address for wh.
224/// However this does not cover all cases, if there is a wh X2 that is not in the root_chips list (which could be because it is in a neighbouring hose)
225/// and both chips are in two seperate meshes with the same ethernet address. We will incorrectly detect them as being one chip.
226///
227/// Search steps:
228/// 1. Add all given chips to output list removing duplicates this will ensure that if list indexes are used to
229/// assign a chip id pci chips will always be output instead of the remote equivalent.
230/// 2. To a depth first search for each root chip, adding all new chips found to the output list.
231///
232/// When continue on failure is true, we report errors, but continue searching for chips.
233/// We pass all chips that did not complete initializations as UninitChip, the user will see the status and can
234/// decide for themselves if they want to upgrade the chip to a full Chip.
235/// Error Cases:
236/// 1. ARC fw is hung, this usually means that there is a noc hang as well.
237///     a. Not catastrophic, we can recover from the hang by resetting the chip.
238/// 2. DRAM is not trained
239///     a. Not catastrophic, but we should not pass this over as a good chip as we may get a noc hang when accessing DRAM.
240/// 3. ARC did not complete initialization
241///     a. Not catastrophic, but for gs we will have no thermal control.
242/// 3. Ethernet fw is corrupted, we check this by looking for a known fw version.
243///     a. Not catastrophic, we need to report this, but can continue exploring other chips in the mesh.
244/// 4. Ethernet fw is hung, this usually means that the ethernet is in a bad state.
245///     a. Not catastrophic, we need to report this, but can continue exploring other chips in the mesh.
246/// 5. 0xffffffff error, this means that the underlying transport is hung.
247///     a. This is catastrophic, we cannot continue searching for chips, because some of the chips in the mesh may no longer be accesible
248///     b. We could recover from this by rerunning the search, but this is not implemented.
249pub fn detect_chips<E>(
250    mut root_chips: Vec<Chip>,
251    init_callback: &mut impl FnMut(crate::chip::ChipDetectState) -> Result<(), E>,
252    options: ChipDetectOptions,
253) -> Result<Vec<UninitChip>, InitError<E>> {
254    let ChipDetectOptions {
255        continue_on_failure,
256        local_only,
257        chip_filter,
258        noc_safe,
259    } = options;
260
261    let mut remotes_to_investigate = Vec::new();
262    let mut seen_chips = HashSet::new();
263
264    let mut output = Vec::new();
265    for (root_index, root_chip) in root_chips.iter_mut().enumerate() {
266        if !chip_filter.is_empty() && !chip_filter.contains(&root_chip.get_arch()) {
267            return Err(PlatformError::WrongChipArchs {
268                actual: root_chip.get_arch(),
269                expected: chip_filter.clone(),
270                backtrace: BtWrapper::capture(),
271            })?;
272        }
273
274        let status = wait_for_init(root_chip, init_callback, continue_on_failure, noc_safe)?;
275
276        // We now want to convert to the uninitialized chip type.
277        let chip = UninitChip::new(status, root_chip);
278
279        // At this point we may not be able to talk to the chip over ethernet, there should have been an error output to the terminal,
280        // so we will just not perform remote chip detection.
281        let remote_ready = chip.eth_safe();
282        let arc_ready = chip.arc_alive();
283
284        output.push(chip);
285
286        let ident = if let Some(wh) = root_chip.as_wh() {
287            if arc_ready {
288                if let Ok(telem) = root_chip.get_telemetry() {
289                    if !local_only && remote_ready {
290                        remotes_to_investigate.push(root_index);
291                    }
292
293                    (
294                        Some(telem.board_id),
295                        Some(InterfaceIdOrCoord::Coord(wh.get_local_chip_coord()?)),
296                    )
297                } else {
298                    continue;
299                }
300            } else {
301                continue;
302            }
303        } else {
304            (
305                // Can't fetch board id from old gs chips
306                // this shouldn't matter anyway because we can only access them
307                // via pci
308                None,
309                root_chip
310                    .get_device_info()?
311                    .map(|v| InterfaceIdOrCoord::Id(v.interface_id)),
312            )
313        };
314
315        if !seen_chips.insert(ident) {
316            continue;
317        }
318    }
319
320    for root_chip in remotes_to_investigate.into_iter().map(|v| &root_chips[v]) {
321        let mut to_check = root_chip.get_neighbouring_chips()?;
322
323        let mut seen_coords = HashSet::new();
324        while let Some(nchip) = to_check.pop() {
325            if !seen_coords.insert(nchip.eth_addr) {
326                continue;
327            }
328
329            if !chip_filter.is_empty() && !chip_filter.contains(&root_chip.get_arch()) {
330                continue;
331            }
332
333            if let Some(wh) = root_chip.as_wh() {
334                let mut wh = wh.open_remote(nchip.eth_addr)?;
335
336                let status = wait_for_init(&mut wh, init_callback, continue_on_failure, noc_safe)?;
337
338                let local_coord = wh.get_local_chip_coord()?;
339
340                if local_coord != nchip.eth_addr {
341                    return Err(PlatformError::Generic(
342                        format!("When detecting chips in mesh found a mismatch between the expected chip coordinate {} and the actual {}", nchip.eth_addr, local_coord),
343                        crate::error::BtWrapper::capture(),
344                    ))?;
345                }
346
347                // If we cannot talk to the ARC then we cannot get the ident information so we
348                // will just return the chip and not continue to search.
349                if !status.arc_status.has_error() {
350                    let telem = wh.get_telemetry()?;
351
352                    let ident = (
353                        Some(telem.board_id),
354                        Some(InterfaceIdOrCoord::Coord(local_coord)),
355                    );
356
357                    if !seen_chips.insert(ident) {
358                        init_callback(crate::chip::ChipDetectState {
359                            chip: root_chip,
360                            call: crate::chip::CallReason::NotNew,
361                        })
362                        .map_err(InitError::CallbackError)?;
363                        continue;
364                    }
365
366                    for nchip in wh.get_neighbouring_chips()? {
367                        to_check.push(nchip);
368                    }
369                }
370
371                let chip = Chip::from(Box::new(wh) as Box<dyn ChipImpl>);
372                output.push(UninitChip::new(status, &chip));
373            } else {
374                unimplemented!("Don't have a handler for non-WH chips with ethernet support yet.")
375            }
376        }
377    }
378
379    Ok(output)
380}
381
382pub fn detect_initialized_chips<E>(
383    root_chips: Vec<Chip>,
384    init_callback: &mut impl FnMut(crate::chip::ChipDetectState) -> Result<(), E>,
385    options: ChipDetectOptions,
386) -> Result<Vec<Chip>, InitError<E>> {
387    let chips = detect_chips(root_chips, init_callback, options)?;
388
389    let mut output = Vec::with_capacity(chips.len());
390    for chip in chips {
391        if chip.is_initialized() {
392            output.push(chip.upgrade());
393        } else {
394            output.push(chip.init(&mut |_| Ok(()))?);
395        }
396    }
397
398    Ok(output)
399}
400
401pub fn detect_chips_silent(
402    root_chips: Vec<Chip>,
403    options: ChipDetectOptions,
404) -> Result<Vec<Chip>, PlatformError> {
405    detect_initialized_chips::<std::convert::Infallible>(root_chips, &mut |_| Ok(()), options)
406        .map_err(|v| match v {
407            InitError::PlatformError(err) => err,
408            InitError::CallbackError(_) => unreachable!(),
409        })
410}