Skip to main content

luwen_api/
detect_chips.rs

1// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::collections::HashSet;
5
6use luwen_def::Arch;
7
8use crate::{
9    chip::{wait_for_init, Chip, InitError, InitStatus},
10    error::{BtWrapper, PlatformError},
11    ChipImpl, EthAddr,
12};
13
/// Transport-level identity of a chip.
///
/// `detect_chips` pairs this with an optional board id to form the key it stores
/// in its set of already-seen chips.
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
enum InterfaceIdOrCoord {
    /// Interface id taken from the chip's device info (built for chips that are not `as_wh()`).
    Id(u32),
    /// Ethernet coordinate the chip reports for itself (built for chips that are `as_wh()`).
    Coord(EthAddr),
}
19
/// Represents a chip object which may or may not be initialized.
///
/// Values are built by [`UninitChip::new`], which selects the variant from the
/// init status it is given.
pub enum UninitChip {
    /// A partially initialized chip; it may be unsafe (0xffffffff errors) to interact with this chip.
    Partially {
        /// The init status that was recorded when this value was created.
        status: Box<InitStatus>,
        /// The wrapped chip. Returned when the chip is explicitly upgraded,
        /// or when init is rerun.
        underlying: Chip,
    },
    /// The chip is fine and can be safely upgraded.
    Initialized(Chip),
}
33
34// HACK(drosen): Probably should just implement clone on Chip...
35fn clone_chip(chip: &Chip) -> Chip {
36    if let Some(wh) = chip.as_wh() {
37        Chip::from(Box::new(wh.clone()) as Box<dyn ChipImpl>)
38    } else if let Some(bh) = chip.as_bh() {
39        Chip::from(Box::new(bh.clone()) as Box<dyn ChipImpl>)
40    } else {
41        unimplemented!(
42            "Don't have a clone handler for chip with arch {:?}.",
43            chip.get_arch()
44        )
45    }
46}
47
48impl Clone for UninitChip {
49    fn clone(&self) -> Self {
50        match self {
51            Self::Partially { status, underlying } => Self::Partially {
52                status: status.clone(),
53                underlying: clone_chip(underlying),
54            },
55            Self::Initialized(chip) => Self::Initialized(clone_chip(chip)),
56        }
57    }
58}
59
60impl UninitChip {
61    pub fn new(status: InitStatus, chip: &Chip) -> Self {
62        let chip = clone_chip(chip);
63        if status.init_complete() && !status.has_error() {
64            UninitChip::Initialized(chip)
65        } else {
66            UninitChip::Partially {
67                status: Box::new(status),
68                underlying: chip,
69            }
70        }
71    }
72
73    pub fn status(&self) -> Option<&InitStatus> {
74        match self {
75            UninitChip::Partially { status, .. } => Some(status),
76            UninitChip::Initialized(_) => None,
77        }
78    }
79
80    /// Initialize the chip, if init fails at this point then we return a result
81    /// instead of an UninitChip.
82    pub fn init<E>(
83        self,
84        init_callback: &mut impl FnMut(crate::chip::ChipDetectState) -> Result<(), E>,
85    ) -> Result<Chip, InitError<E>> {
86        match self {
87            UninitChip::Partially { mut underlying, .. } => {
88                wait_for_init(&mut underlying, init_callback, false, false)?;
89
90                Ok(underlying)
91            }
92            UninitChip::Initialized(chip) => Ok(chip),
93        }
94    }
95
96    pub fn upgrade(self) -> Chip {
97        match self {
98            UninitChip::Partially { underlying, .. } => underlying,
99            UninitChip::Initialized(chip) => chip,
100        }
101    }
102
103    pub fn try_upgrade(&self) -> Option<&Chip> {
104        match self {
105            UninitChip::Partially { status, underlying } => {
106                if status.init_complete() && !status.has_error() {
107                    Some(underlying)
108                } else {
109                    None
110                }
111            }
112            UninitChip::Initialized(chip) => Some(chip),
113        }
114    }
115
116    pub fn is_initialized(&self) -> bool {
117        match self {
118            UninitChip::Partially { status, .. } => status.init_complete(),
119            UninitChip::Initialized(_) => true,
120        }
121    }
122
123    pub fn is_healthy(&self) -> Option<bool> {
124        match self {
125            UninitChip::Partially { status, .. } => {
126                if status.init_complete() {
127                    Some(status.has_error())
128                } else {
129                    None
130                }
131            }
132            UninitChip::Initialized(_) => Some(true),
133        }
134    }
135
136    pub fn arc_alive(&self) -> bool {
137        match self {
138            UninitChip::Partially { status, .. } => {
139                !status.arc_status.is_waiting() && !status.arc_status.has_error()
140            }
141            UninitChip::Initialized(_) => true,
142        }
143    }
144
145    pub fn dram_safe(&self) -> bool {
146        match self {
147            UninitChip::Partially { status, .. } => {
148                !status.dram_status.is_waiting() && !status.dram_status.has_error()
149            }
150            UninitChip::Initialized(_) => true,
151        }
152    }
153
154    pub fn eth_safe(&self) -> bool {
155        match self {
156            UninitChip::Partially { status, .. } => {
157                !status.eth_status.is_waiting() && !status.eth_status.has_error()
158            }
159            UninitChip::Initialized(_) => true,
160        }
161    }
162
163    pub fn cpu_safe(&self) -> bool {
164        match self {
165            UninitChip::Partially { status, .. } => {
166                !status.cpu_status.is_waiting() && !status.cpu_status.has_error()
167            }
168            UninitChip::Initialized(_) => true,
169        }
170    }
171}
172
/// Options controlling how `detect_chips` searches for and initializes chips.
pub struct ChipDetectOptions {
    /// If true, we will continue searching for chips even if we encounter a *recoverable* error.
    /// If false, detection errors will be raised as an Err(..).
    pub continue_on_failure: bool,
    /// If true, then we will only search for chips directly available over a physical interface (pci, jtag, i2c, etc...)
    /// If false, we will search for chips directly available and via ethernet.
    pub local_only: bool,
    /// If len > 0 then only chips with the given archs are accepted: `detect_chips` returns
    /// `PlatformError::WrongChipArchs` when a root chip's arch is not in this list.
    pub chip_filter: Vec<Arch>,
    /// If true, then we will not initialize anything that might cause a problem (i.e. a noc hang).
    pub noc_safe: bool,
}
185
186impl Default for ChipDetectOptions {
187    fn default() -> Self {
188        Self {
189            continue_on_failure: true,
190            local_only: false,
191            chip_filter: Vec::new(),
192            noc_safe: false,
193        }
194    }
195}
196
197impl ChipDetectOptions {
198    pub fn new() -> Self {
199        Self::default()
200    }
201
202    pub fn continue_on_failure(mut self, continue_on_failure: bool) -> Self {
203        self.continue_on_failure = continue_on_failure;
204        self
205    }
206
207    pub fn local_only(mut self, local_only: bool) -> Self {
208        self.local_only = local_only;
209        self
210    }
211
212    pub fn noc_safe(mut self, noc_safe: bool) -> Self {
213        self.noc_safe = noc_safe;
214        self
215    }
216}
217
/// Find all chips accessible from the given set of root chips.
/// For the most part this should be a set of chips found via a PCI scan, but it doesn't have to be.
///
/// The most important part of this algorithm is determining which chips are duplicates of other chips.
/// In general two boards can be differentiated by their board id, but this is not always the case.
/// For example the gs or wh X2, in that case we must fall back on the interface id for grayskull or ethernet address for wh.
/// However this does not cover all cases, if there is a wh X2 that is not in the root_chips list (which could be because it is in a neighbouring host)
/// and both chips are in two separate meshes with the same ethernet address. We will incorrectly detect them as being one chip.
///
/// Search steps:
/// 1. Add all given chips to output list removing duplicates this will ensure that if list indexes are used to
///    assign a chip id pci chips will always be output instead of the remote equivalent.
/// 2. Do a depth-first search for each root chip, adding all new chips found to the output list.
///
/// When continue on failure is true, we report errors, but continue searching for chips.
/// We pass all chips that did not complete initializations as UninitChip, the user will see the status and can
/// decide for themselves if they want to upgrade the chip to a full Chip.
/// Error Cases:
/// 1. ARC fw is hung, this usually means that there is a noc hang as well.
///    a. Not catastrophic, we can recover from the hang by resetting the chip.
/// 2. DRAM is not trained
///    a. Not catastrophic, but we should not pass this over as a good chip as we may get a noc hang when accessing DRAM.
/// 3. ARC did not complete initialization
///    a. Not catastrophic, but for gs we will have no thermal control.
/// 4. Ethernet fw is corrupted, we check this by looking for a known fw version.
///    a. Not catastrophic, we need to report this, but can continue exploring other chips in the mesh.
/// 5. Ethernet fw is hung, this usually means that the ethernet is in a bad state.
///    a. Not catastrophic, we need to report this, but can continue exploring other chips in the mesh.
/// 6. 0xffffffff error, this means that the underlying transport is hung.
///    a. This is catastrophic, we cannot continue searching for chips, because some of the chips in the mesh may no longer be accessible
///    b. We could recover from this by rerunning the search, but this is not implemented.
///
/// # Errors
/// Returns `PlatformError::WrongChipArchs` when `options.chip_filter` is non-empty and a root chip's
/// arch is not in it, a `PlatformError::Generic` when a remote chip reports a coordinate different from
/// the one it was opened with, `InitError::CallbackError` when `init_callback` fails, and otherwise
/// propagates any error from the chip accessors used below.
pub fn detect_chips<E>(
    mut root_chips: Vec<Chip>,
    init_callback: &mut impl FnMut(crate::chip::ChipDetectState) -> Result<(), E>,
    options: ChipDetectOptions,
) -> Result<Vec<UninitChip>, InitError<E>> {
    let ChipDetectOptions {
        continue_on_failure,
        local_only,
        chip_filter,
        noc_safe,
    } = options;

    // Indexes into `root_chips` of the roots whose ethernet neighbours should be explored.
    let mut remotes_to_investigate = Vec::new();
    // Identities (board id, interface id or coordinate) of every chip recorded so far.
    let mut seen_chips = HashSet::new();

    let mut output = Vec::new();
    for (root_index, root_chip) in root_chips.iter_mut().enumerate() {
        // A root chip outside the requested arch filter is a hard error, not a skip.
        if !chip_filter.is_empty() && !chip_filter.contains(&root_chip.get_arch()) {
            Err(PlatformError::WrongChipArchs {
                actual: root_chip.get_arch(),
                expected: chip_filter.clone(),
                backtrace: BtWrapper::capture(),
            })?;
        }

        let status = wait_for_init(root_chip, init_callback, continue_on_failure, noc_safe)?;

        // We now want to convert to the uninitialized chip type.
        let chip = UninitChip::new(status, root_chip);

        // At this point we may not be able to talk to the chip over ethernet, there should have been an error output to the terminal,
        // so we will just not perform remote chip detection.
        let remote_ready = chip.eth_safe();
        let arc_ready = chip.arc_alive();

        // NOTE(review): the chip is pushed before its identity is checked against `seen_chips`,
        // so a duplicate root chip still ends up in `output` — confirm this is intended given
        // the "removing duplicates" wording in the function docs.
        output.push(chip);

        let ident = if let Some(wh) = root_chip.as_wh() {
            if arc_ready {
                if let Ok(telem) = root_chip.get_telemetry() {
                    // If WH UBB - skip ethernet exploration
                    let board_type: u64 =
                        telem.board_id_low as u64 | ((telem.board_id_high as u64) << 32);
                    let board_upi: u64 = (board_type >> 36) & 0xFFFFF;
                    const WH_6U_GLX_UPI: u64 = 0x35;

                    // Only investigate remotes when remote detection is enabled, ethernet is usable,
                    // and this is not a UBB board.
                    if !local_only && remote_ready && board_upi != WH_6U_GLX_UPI {
                        remotes_to_investigate.push(root_index);
                    }

                    (
                        Some(telem.board_id),
                        Some(InterfaceIdOrCoord::Coord(wh.get_local_chip_coord()?)),
                    )
                } else {
                    // Telemetry is unavailable, so the chip cannot be identified; it stays in
                    // `output` but is neither recorded as seen nor explored further.
                    continue;
                }
            } else {
                // ARC is not usable, so the chip cannot be identified (same handling as above).
                continue;
            }
        } else {
            (
                // Can't fetch board id from old gs chips
                // this shouldn't matter anyway because we can only access them
                // via pci
                None,
                root_chip
                    .get_device_info()?
                    .map(|v| InterfaceIdOrCoord::Id(v.interface_id)),
            )
        };

        // Record the identity so the remote search below can recognise this chip.
        // The `continue` is the last statement of the loop body, so it has no further effect here.
        if !seen_chips.insert(ident) {
            continue;
        }
    }

    for root_chip in remotes_to_investigate.into_iter().map(|v| &root_chips[v]) {
        // Work list of neighbours still to visit; popped from the back, so the walk is depth first.
        let mut to_check = root_chip.get_neighbouring_chips()?;

        // Coordinates already visited while walking from this particular root.
        let mut seen_coords = HashSet::new();
        while let Some(nchip) = to_check.pop() {
            if !nchip.routing_enabled {
                continue;
            }

            if !seen_coords.insert(nchip.eth_addr) {
                continue;
            }

            // NOTE(review): this tests the root chip's arch, which already passed the same filter
            // in the loop above, so it looks like it can never skip a neighbour — confirm intent.
            if !chip_filter.is_empty() && !chip_filter.contains(&root_chip.get_arch()) {
                continue;
            }

            if let Some(wh) = root_chip.as_wh() {
                let mut wh = wh.open_remote(nchip.eth_addr)?;

                let status = wait_for_init(&mut wh, init_callback, continue_on_failure, noc_safe)?;

                let local_coord = wh.get_local_chip_coord()?;

                // The remote must report the same coordinate we used to reach it.
                if local_coord != nchip.eth_addr {
                    Err(PlatformError::Generic(
                        format!("When detecting chips in mesh found a mismatch between the expected chip coordinate {} and the actual {}", nchip.eth_addr, local_coord),
                        crate::error::BtWrapper::capture(),
                    ))?;
                }

                // If we cannot talk to the ARC then we cannot get the ident information so we
                // will just return the chip and not continue to search.
                if !status.arc_status.has_error() {
                    let telem = wh.get_telemetry()?;

                    let ident = (
                        Some(telem.board_id),
                        Some(InterfaceIdOrCoord::Coord(local_coord)),
                    );

                    // Already recorded (as a root or via another path): notify the caller and skip it.
                    if !seen_chips.insert(ident) {
                        // NOTE(review): the callback receives `root_chip`, not the remote chip that
                        // was found to be a duplicate — confirm this is what callers expect.
                        init_callback(crate::chip::ChipDetectState {
                            chip: root_chip,
                            call: crate::chip::CallReason::NotNew,
                        })
                        .map_err(InitError::CallbackError)?;
                        continue;
                    }

                    // New chip: queue its neighbours so the search continues through it.
                    for nchip in wh.get_neighbouring_chips()? {
                        to_check.push(nchip);
                    }
                }

                let chip = Chip::from(Box::new(wh) as Box<dyn ChipImpl>);
                output.push(UninitChip::new(status, &chip));
            } else {
                unimplemented!("Don't have a handler for non-WH chips with ethernet support yet.")
            }
        }
    }

    Ok(output)
}
392
393pub fn detect_initialized_chips<E>(
394    root_chips: Vec<Chip>,
395    init_callback: &mut impl FnMut(crate::chip::ChipDetectState) -> Result<(), E>,
396    options: ChipDetectOptions,
397) -> Result<Vec<Chip>, InitError<E>> {
398    let chips = detect_chips(root_chips, init_callback, options)?;
399
400    let mut output = Vec::with_capacity(chips.len());
401    for chip in chips {
402        if chip.is_initialized() {
403            output.push(chip.upgrade());
404        } else {
405            output.push(chip.init(&mut |_| Ok(()))?);
406        }
407    }
408
409    Ok(output)
410}
411
412pub fn detect_chips_silent(
413    root_chips: Vec<Chip>,
414    options: ChipDetectOptions,
415) -> Result<Vec<Chip>, PlatformError> {
416    detect_initialized_chips::<std::convert::Infallible>(root_chips, &mut |_| Ok(()), options)
417        .map_err(|v| match v {
418            InitError::PlatformError(err) => err,
419            InitError::CallbackError(_) => unreachable!(),
420        })
421}