Skip to main content

luwen_api/chip/init/
status.rs

1// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::{convert::Infallible, fmt};
5
6use thiserror::Error;
7
8use crate::error::ArcReadyError;
9
/// Errors that can be recorded as the initialization result of an ethernet core.
#[derive(Clone, Debug)]
pub enum EthernetInitError {
    /// The ethernet firmware is corrupted.
    FwCorrupted,
    /// The ethernet link is not trained.
    NotTrained,
}

impl fmt::Display for EthernetInitError {
    /// Writes the fixed, human-readable description of the variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let description = match self {
            Self::FwCorrupted => "Ethernet firmware is corrupted",
            Self::NotTrained => "Ethernet is not trained",
        };
        f.write_str(description)
    }
}
24
/// Conditions under which an ethernet core is reported as only partially
/// initialized.
#[derive(Clone, Debug)]
pub enum EthernetPartialInitError {
    /// The firmware version has an invalid format, so the firmware is assumed
    /// to have been overwritten.
    FwOverwritten,
}

impl fmt::Display for EthernetPartialInitError {
    /// Writes the fixed, human-readable description of the variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let description = match self {
            Self::FwOverwritten => {
                "Ethernet firmware version has an invalid format and is assumed to have been overwritten"
            }
        };
        f.write_str(description)
    }
}
39
40#[derive(Clone, Debug)]
41pub enum ArcInitError {
42    FwCorrupted,
43    NoAccess,
44    WaitingForInit(ArcReadyError),
45    FwVersionTooOld { version: Option<u32>, required: u32 },
46    Hung,
47}
48
49impl fmt::Display for ArcInitError {
50    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
51        match self {
52            ArcInitError::NoAccess => f.write_str("Could not access ARC"),
53            ArcInitError::FwCorrupted => f.write_str("ARC firmware is corrupted"),
54            ArcInitError::WaitingForInit(err) => {
55                write!(f, "ARC is waiting for initialization; {err}")
56            }
57            ArcInitError::FwVersionTooOld { version, required } => {
58                let version = if let Some(version) = version {
59                    format!("{version:x}")
60                } else {
61                    "<unknown version>".to_string()
62                };
63                write!(
64                    f,
65                    "ARC FW is older than the minimum supported version; {version} < {required:x}"
66                )
67            }
68            ArcInitError::Hung => f.write_str("ARC is hung"),
69        }
70    }
71}
72
/// Status of a single DRAM channel, decoded from a raw `u8` code (0 through 7).
#[derive(Copy, Clone, Debug)]
pub enum DramChannelStatus {
    /// Code 0.
    TrainingNone,
    /// Code 1.
    TrainingFail,
    /// Code 2.
    TrainingPass,
    /// Code 3.
    TrainingSkip,
    /// Code 4.
    PhyOff,
    /// Code 5.
    ReadEye,
    /// Code 6.
    BistEye,
    /// Code 7.
    CaDebug,
}

impl fmt::Display for DramChannelStatus {
    /// Writes a short lowercase description of the channel status.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let description = match self {
            Self::TrainingNone => "in pre-training",
            Self::TrainingFail => "failed to train",
            Self::TrainingPass => "passed training",
            Self::TrainingSkip => "skipped training",
            Self::PhyOff => "phy is off",
            Self::ReadEye => "read eye",
            Self::BistEye => "bist eye",
            Self::CaDebug => "ca debug",
        };
        f.write_str(description)
    }
}

impl TryFrom<u8> for DramChannelStatus {
    type Error = ();

    /// Decodes a raw status code; any value above 7 yields `Err(())`.
    fn try_from(value: u8) -> Result<Self, ()> {
        // Indexed by raw code, so the position in the table is the code itself.
        const BY_CODE: [DramChannelStatus; 8] = [
            DramChannelStatus::TrainingNone,
            DramChannelStatus::TrainingFail,
            DramChannelStatus::TrainingPass,
            DramChannelStatus::TrainingSkip,
            DramChannelStatus::PhyOff,
            DramChannelStatus::ReadEye,
            DramChannelStatus::BistEye,
            DramChannelStatus::CaDebug,
        ];

        BY_CODE.get(usize::from(value)).copied().ok_or(())
    }
}
117
118#[derive(Clone, Debug)]
119pub enum DramInitError {
120    NotTrained(DramChannelStatus),
121}
122
123impl fmt::Display for DramInitError {
124    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
125        match self {
126            DramInitError::NotTrained(_) => f.write_str("DRAM was not able to train"),
127        }
128    }
129}
130
131#[derive(Clone, Debug, Error)]
132pub enum CpuInitError {
133    // NOTE: Mockup for BH prep
134}
135
/// The final initialization status for a component within a chip.
///
/// This status is not intended to drive the initialization state machine;
/// instead it gives a single high-level view of the current status of a single
/// component. The `NotInitialized` and `Error` variants are specialized per
/// component through the `P` and `E` type parameters, so the caller only has
/// to match against the component type if absolutely necessary.
#[derive(Debug, Clone)]
pub enum WaitStatus<P, E> {
    NotPresent,
    /// Still waiting, with an optional progress message.
    Waiting(Option<String>),

    JustFinished,

    Done,
    /// This is used in the case where the user has specified that we shouldn't
    /// check to see if the component has actually been initialized.
    /// See noc_safe for an example of this enumeration being used.
    NoCheck,

    Timeout(std::time::Duration),
    /// Component-specific partial-initialization payload.
    NotInitialized(P),
    /// Component-specific error payload.
    Error(E),
}

impl<P, E> WaitStatus<P, E> {
    /// Returns `true` only for [`WaitStatus::Done`]; every other variant,
    /// including `JustFinished` and `NoCheck`, returns `false`.
    pub fn is_done(&self) -> bool {
        match self {
            Self::Done => true,
            Self::NotPresent
            | Self::Waiting(_)
            | Self::JustFinished
            | Self::NoCheck
            | Self::Timeout(_)
            | Self::NotInitialized(_)
            | Self::Error(_) => false,
        }
    }
}
164
165/// A generic structure which contains the status information for each component.
166/// There is enough information here to determine the
167#[derive(Debug, Clone)]
168pub struct ComponentStatusInfo<P, E> {
169    pub wait_status: Box<[WaitStatus<P, E>]>,
170    pub timeout: std::time::Duration,
171    pub start_time: std::time::Instant,
172    pub name: String,
173}
174
175impl<P: fmt::Display, E: fmt::Display> fmt::Display for ComponentStatusInfo<P, E> {
176    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
177        let mut waiting_count = 0;
178        let mut completed_count = 0;
179        for status in self.wait_status.iter() {
180            if let WaitStatus::Waiting { .. } = status {
181                waiting_count += 1;
182            } else if let WaitStatus::NoCheck
183            | WaitStatus::JustFinished
184            | WaitStatus::Done
185            | WaitStatus::NotPresent = status
186            {
187                completed_count += 1;
188            }
189        }
190
191        let completed_init = waiting_count == 0;
192
193        let message = if !completed_init {
194            format!(
195                "({}/{})",
196                self.start_time.elapsed().as_secs(),
197                self.timeout.as_secs()
198            )
199        } else {
200            String::new()
201        };
202
203        let message = if self.wait_status.len() > 1 {
204            format!("{message} [{}/{}]", completed_count, self.wait_status.len(),)
205        } else {
206            message
207        };
208
209        let message = format!("{message} {}", self.name);
210
211        let mut message_options: Vec<(Vec<_>, String)> = Vec::with_capacity(self.wait_status.len());
212        let mut force_oneline = true;
213        for (index, status) in self.wait_status.iter().enumerate() {
214            if let WaitStatus::Waiting(Some(status)) = status {
215                if let Some(value) = message_options.iter_mut().find(|(_, v)| v == status) {
216                    value.0.push(index);
217                } else {
218                    message_options.push((vec![index], status.clone()));
219                }
220            } else if let WaitStatus::Error(e) = status {
221                let e = e.to_string();
222                if let Some(value) = message_options.iter_mut().find(|v| v.1 == e) {
223                    value.0.push(index);
224                } else {
225                    message_options.push((vec![index], e));
226                }
227            } else if let WaitStatus::NotInitialized(e) = status {
228                let e = e.to_string();
229                if let Some(value) = message_options.iter_mut().find(|v| v.1 == e) {
230                    value.0.push(index);
231                } else {
232                    message_options.push((vec![index], e));
233                }
234            } else {
235                force_oneline = false;
236            }
237        }
238
239        let message = if message_options.len() == 1 && force_oneline {
240            format!("{message}: {}", message_options[0].1)
241        } else {
242            let mut message = format!("{message}\n");
243            for (indexes, option) in message_options {
244                message = format!("\t{message}[");
245                for index in indexes[..indexes.len().saturating_sub(1)].iter() {
246                    message = format!("{message}{index};");
247                }
248                if let Some(index) = indexes.last() {
249                    message = format!("{message}{index}");
250                }
251                message = format!("{message}]: {option}\n");
252            }
253
254            message
255        };
256
257        f.write_str(message.as_str())
258    }
259}
260
261impl<P, E> ComponentStatusInfo<P, E> {
262    pub fn not_present(name: String) -> Self {
263        Self {
264            name,
265            wait_status: Box::new([]),
266            timeout: std::time::Duration::default(),
267            start_time: std::time::Instant::now(),
268        }
269    }
270
271    pub fn init_waiting(name: String, timeout: std::time::Duration, count: usize) -> Self {
272        let wait_status = (0..count).map(|_| WaitStatus::Waiting(None)).collect();
273        Self {
274            name,
275            wait_status,
276
277            start_time: std::time::Instant::now(),
278            timeout,
279        }
280    }
281
282    pub fn is_waiting(&self) -> bool {
283        for status in self.wait_status.iter() {
284            match status {
285                WaitStatus::Waiting(_) => {
286                    return true;
287                }
288
289                WaitStatus::NotPresent
290                | WaitStatus::JustFinished
291                | WaitStatus::Done
292                | WaitStatus::NotInitialized(_)
293                | WaitStatus::NoCheck
294                | WaitStatus::Timeout(_)
295                | WaitStatus::Error(_) => {}
296            }
297        }
298
299        false
300    }
301
302    pub fn is_present(&self) -> bool {
303        for status in self.wait_status.iter() {
304            match status {
305                WaitStatus::Waiting(_)
306                | WaitStatus::JustFinished
307                | WaitStatus::Done
308                | WaitStatus::NotInitialized(_)
309                | WaitStatus::NoCheck
310                | WaitStatus::Timeout(_)
311                | WaitStatus::Error(_) => {
312                    return true;
313                }
314
315                WaitStatus::NotPresent => {}
316            }
317        }
318
319        false
320    }
321
322    pub fn has_error(&self) -> bool {
323        for status in self.wait_status.iter() {
324            match status {
325                WaitStatus::Error(_) | WaitStatus::Timeout(_) | WaitStatus::NoCheck => {
326                    return true;
327                }
328
329                WaitStatus::NotPresent
330                | WaitStatus::JustFinished
331                | WaitStatus::Done
332                | WaitStatus::Waiting { .. }
333                | WaitStatus::NotInitialized(_) => {}
334            }
335        }
336
337        false
338    }
339}
340
/// Options carried alongside the initialization status.
///
/// The derived `Default` leaves `noc_safe` as `false`.
#[derive(Clone, Debug, Default)]
pub struct InitOptions {
    /// If false, then we will not try to initialize anything that would
    /// require talking on the NOC.
    pub noc_safe: bool,
}
346
/// Whether communication with the chip is possible.
#[derive(Clone, Debug)]
pub enum CommsStatus {
    /// Communication works.
    CanCommunicate,
    /// Communication failed; the payload is a description of the failure.
    CommunicationError(String),
}

impl CommsStatus {
    /// Returns `true` for `CanCommunicate` and `false` for `CommunicationError`.
    pub fn ok(&self) -> bool {
        matches!(self, Self::CanCommunicate)
    }
}

impl fmt::Display for CommsStatus {
    /// Writes `Success` or `Error`; the description carried by
    /// `CommunicationError` is not included in the output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = if self.ok() { "Success" } else { "Error" };
        f.write_str(label)
    }
}
370
371#[derive(Clone, Debug)]
372pub struct InitStatus {
373    pub comms_status: CommsStatus,
374    pub dram_status: ComponentStatusInfo<Infallible, DramInitError>,
375    pub cpu_status: ComponentStatusInfo<Infallible, CpuInitError>,
376    pub arc_status: ComponentStatusInfo<Infallible, ArcInitError>,
377    pub eth_status: ComponentStatusInfo<EthernetPartialInitError, EthernetInitError>,
378
379    pub init_options: InitOptions,
380
381    /// We cannot communicate with the chip prior to the initialization process. Therefore we start
382    /// with the chip in an unknown state (all status is marked as not present).
383    pub unknown_state: bool,
384}
385
386impl fmt::Display for InitStatus {
387    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
388        fn write_component_status<P, E>(status: &ComponentStatusInfo<P, E>) -> String {
389            let mut init_status = String::new();
390            if status.start_time.elapsed() > status.timeout {
391                init_status.push_str("Timeout");
392            } else {
393                init_status.push_str("In Progress");
394            }
395            let mut completed_count = 0;
396            for status in status.wait_status.iter() {
397                if let WaitStatus::NoCheck
398                | WaitStatus::JustFinished
399                | WaitStatus::Done
400                | WaitStatus::NotPresent = status
401                {
402                    completed_count += 1;
403                }
404            }
405            if !status.wait_status.is_empty() {
406                init_status.push_str(
407                    format!(
408                        ", {} out of {} initialized",
409                        completed_count,
410                        status.wait_status.len()
411                    )
412                    .as_str(),
413                );
414            }
415            init_status
416        }
417        writeln!(f, "   Communication Status: {}", self.comms_status)?;
418        writeln!(
419            f,
420            "   DRAM Status: {}",
421            write_component_status(&self.dram_status)
422        )?;
423        writeln!(
424            f,
425            "   CPU Status: {}",
426            write_component_status(&self.cpu_status)
427        )?;
428        writeln!(
429            f,
430            "   ARC Status: {}",
431            write_component_status(&self.arc_status)
432        )?;
433        writeln!(
434            f,
435            "   Ethernet Status: {}",
436            write_component_status(&self.eth_status)
437        )?;
438        writeln!(f, "   Noc Safe: {:?}", self.init_options.noc_safe)?;
439        writeln!(f, "   Unknown State: {}", self.unknown_state)
440    }
441}
442
443impl InitStatus {
444    pub fn new_unknown() -> Self {
445        InitStatus {
446            comms_status: CommsStatus::CommunicationError("Haven't checked".to_string()),
447            dram_status: ComponentStatusInfo::not_present("DRAM".to_string()),
448            cpu_status: ComponentStatusInfo::not_present("CPU".to_string()),
449            arc_status: ComponentStatusInfo::not_present("ARC".to_string()),
450            eth_status: ComponentStatusInfo::not_present("ETH".to_string()),
451            init_options: InitOptions::default(),
452            unknown_state: true,
453        }
454    }
455
456    pub fn can_communicate(&self) -> bool {
457        self.comms_status.ok()
458    }
459
460    pub fn is_waiting(&self) -> bool {
461        self.arc_status.is_waiting()
462            || self.dram_status.is_waiting()
463            || self.eth_status.is_waiting()
464            || self.cpu_status.is_waiting()
465    }
466
467    pub fn init_complete(&self) -> bool {
468        !self.is_waiting()
469    }
470
471    pub fn has_error(&self) -> bool {
472        !self.comms_status.ok()
473            || self.arc_status.has_error()
474            || self.dram_status.has_error()
475            || self.eth_status.has_error()
476            || self.cpu_status.has_error()
477    }
478}