Skip to main content

luwen_api/chip/
mod.rs

1// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4mod blackhole;
5pub mod communication;
6mod creation;
7pub mod eth_addr;
8mod hl_comms;
9mod init;
10mod remote;
11mod spi;
12pub mod wh_ubb;
13mod wormhole;
14
15pub use blackhole::spirom_tables; // Added for proto file write unit testing
16pub use blackhole::{message::MessageError, Blackhole};
17pub use communication::chip_comms::{
18    axi_translate, ArcIf, AxiData, AxiError, ChipComms, MemorySlice, MemorySlices,
19};
20pub use communication::chip_interface::{ChipInterface, NocInterface};
21pub use hl_comms::{HlComms, HlCommsInterface};
22pub use init::status::InitStatus;
23pub use init::{
24    status::{CommsStatus, ComponentStatusInfo},
25    wait_for_init, CallReason, ChipDetectState, InitError,
26};
27use luwen_def::Arch;
28pub use wormhole::Wormhole;
29
30use crate::arc_msg::TypedArcMsg;
31pub use crate::arc_msg::{ArcMsg, ArcMsgOk};
32use crate::{arc_msg::ArcMsgAddr, error::PlatformError, DeviceInfo};
33pub use wh_ubb::{ubb_wait_for_driver_load, wh_ubb_ipmi_reset};
34
/// Arc message interface
///
/// Bundles an ARC message with the options used when it is handed to
/// `ChipImpl::arc_msg`. The `Default` impl produces a `Nop` message with
/// `wait_for_done` set, a one second `timeout`, `use_second_mailbox` cleared
/// and no `addrs` override.
#[derive(Debug)]
pub struct ArcMsgOptions {
    /// The message to send.
    pub msg: ArcMsg,
    /// NOTE(review): presumably makes the call block until the ARC reports the
    /// message as done — confirm against the chip implementations.
    pub wait_for_done: bool,
    /// NOTE(review): presumably the upper bound on how long the send may
    /// take — confirm where the timeout is enforced.
    pub timeout: std::time::Duration,
    /// NOTE(review): presumably selects an alternate mailbox for the message —
    /// confirm which architectures honour it.
    pub use_second_mailbox: bool,
    /// Optional `ArcMsgAddr` value; `None` by default.
    /// NOTE(review): presumably overrides the addresses used for the
    /// exchange — confirm against `ArcMsgAddr`.
    pub addrs: Option<ArcMsgAddr>,
}
44
45impl Default for ArcMsgOptions {
46    fn default() -> Self {
47        Self {
48            msg: ArcMsg::Typed(TypedArcMsg::Nop),
49            wait_for_done: true,
50            timeout: std::time::Duration::from_secs(1),
51            use_second_mailbox: false,
52            addrs: None,
53        }
54    }
55}
56
/// One entry of the list returned by `ChipImpl::get_neighbouring_chips`.
///
/// Equality and hashing are derived, so two entries compare equal only when
/// every field matches.
#[derive(Debug, Hash, PartialEq, Eq)]
pub struct NeighbouringChip {
    /// NOTE(review): presumably reports whether routing over this link is
    /// enabled — confirm against the code that fills this struct.
    pub routing_enabled: bool,
    /// NOTE(review): presumably the NOC coordinate of the link endpoint on the
    /// local chip — confirm the (x, y) ordering.
    pub local_noc_addr: (u8, u8),
    /// NOTE(review): presumably the NOC coordinate of the link endpoint on the
    /// neighbouring chip — confirm the (x, y) ordering.
    pub remote_noc_addr: (u8, u8),
    /// Ethernet address value associated with this neighbour entry.
    pub eth_addr: crate::EthAddr,
}
64
/// Telemetry values for a chip, as returned by `ChipImpl::get_telemetry`.
///
/// Several fields are packed words; the accessor methods on this type decode
/// the ones noted below. `Default` yields an all-zero / `None` / `false`
/// record with the default `Arch`.
#[derive(Default, Debug)]
pub struct Telemetry {
    /// Architecture the values belong to; selects the decoding used by
    /// `asic_temperature()` and `telemetry_heartbeat()`.
    pub arch: Arch,
    pub board_id: u64,
    pub enum_version: u32,
    pub entry_count: u32,
    pub device_id: u32,
    pub asic_id: u32,
    pub asic_ro: u32,
    pub asic_idd: u32,
    /// Upper 32 bits of the value returned by `board_serial_number()`.
    pub board_id_high: u32,
    /// Lower 32 bits of the value returned by `board_serial_number()`.
    pub board_id_low: u32,
    pub harvesting_state: u32,
    pub update_telem_speed: u32,
    /// Packed version word decoded by `arc_fw_version()`.
    pub arc0_fw_version: u32,
    pub arc1_fw_version: u32,
    pub arc2_fw_version: u32,
    pub arc3_fw_version: u32,
    pub spibootrom_fw_version: u32,
    /// Packed version word decoded by `eth_fw_version()`.
    pub eth_fw_version: u32,
    pub ddr_fw_version: u32,
    pub l2cpu_fw_version: u32,
    pub m3_bl_fw_version: u32,
    pub m3_app_fw_version: u32,
    pub ddr_speed: Option<u32>,
    pub ddr_status: u32,
    pub eth_status0: u32,
    pub eth_status1: u32,
    pub pcie_status: u32,
    pub faults: u32,
    /// Returned by `telemetry_heartbeat()` on non-Blackhole architectures.
    pub arc0_health: u32,
    pub arc1_health: u32,
    pub arc2_health: u32,
    pub arc3_health: u32,
    pub fan_speed: u32,
    /// `ai_clk()` reports the low 16 bits of this word.
    pub aiclk: u32,
    /// Returned unchanged by `axi_clk()`.
    pub axiclk: u32,
    /// Returned unchanged by `arc_clk()`.
    pub arcclk: u32,
    pub l2cpuclk0: u32,
    pub l2cpuclk1: u32,
    pub l2cpuclk2: u32,
    pub l2cpuclk3: u32,
    pub throttler: u32,
    /// `voltage()` reports this value divided by 1000.
    pub vcore: u32,
    /// Raw temperature word; `asic_temperature()` decodes it per architecture.
    pub asic_temperature: u32,
    /// `vreg_temperature()` reports the low 16 bits of this word.
    pub vreg_temperature: u32,
    /// Packed word: `inlet_temperature()` reads bits 23..16,
    /// `outlet_temperature1()` bits 15..8 and `outlet_temperature2()` bits 7..0.
    pub board_temperature: u32,
    /// `power()` reports the low 16 bits of this word.
    pub tdp: u32,
    /// `current()` reports the low 16 bits of this word.
    pub tdc: u32,
    pub vdd_limits: u32,
    pub thm_limits: u32,
    /// Packed date word decoded by `firmware_date()`.
    pub wh_fw_date: u32,
    pub asic_tmon0: u32,
    pub asic_tmon1: u32,
    pub mvddq_power: u32,
    pub gddr_train_temp0: u32,
    pub gddr_train_temp1: u32,
    pub asic_power: Option<u32>,
    pub aux_status: Option<u32>,
    pub boot_date: u32,
    pub rt_seconds: u32,
    pub eth_debug_status0: u32,
    pub eth_debug_status1: u32,
    pub tt_flash_version: u32,
    pub fw_bundle_version: u32,
    /// Returned by `telemetry_heartbeat()` on Blackhole.
    pub timer_heartbeat: u32,
    pub noc_translation_enabled: bool,
    pub tensix_enabled_col: u32,
    pub enabled_eth: u32,
    pub enabled_gddr: u32,
    pub enabled_l2cpu: u32,
    pub enabled_pcie: u32,
    pub fan_rpm: u32,
    pub gddr01_temp: u32,
    pub gddr23_temp: u32,
    pub gddr45_temp: u32,
    pub gddr67_temp: u32,
    pub gddr01_corr_errs: u32,
    pub gddr23_corr_errs: u32,
    pub gddr45_corr_errs: u32,
    pub gddr67_corr_errs: u32,
    pub gddr_uncorr_errs: u32,
    pub max_gddr_temp: u32,
    pub asic_location: u32,
    pub board_power_limit: u32,
    pub input_power: u32,
    pub tdc_limit_max: u32,
    pub thm_limit_throttle: u32,
    pub therm_trip_count: u32,
    pub asic_id_high: u32,
    pub asic_id_low: u32,
    pub aiclk_limit_max: u32,
    pub tdp_limit_max: u32,
}
159
160impl Telemetry {
161    /// Return firmware date in YYYY-MM-DD format.
162    pub fn firmware_date(&self) -> String {
163        let year = ((self.wh_fw_date >> 28) & 0xF) + 2020;
164        let month = (self.wh_fw_date >> 24) & 0xF;
165        let day = (self.wh_fw_date >> 16) & 0xFF;
166        let _hour = (self.wh_fw_date >> 8) & 0xFF;
167        let _minute = self.wh_fw_date & 0xFF;
168        format!("{year:04}-{month:02}-{day:02}")
169    }
170
171    /// Return ARC firmware version in MAJOR.MINOR.PATCH format.
172    pub fn arc_fw_version(&self) -> String {
173        let major = (self.arc0_fw_version >> 16) & 0xFF;
174        let minor = (self.arc0_fw_version >> 8) & 0xFF;
175        let patch = self.arc0_fw_version & 0xFF;
176        format!("{major}.{minor}.{patch}")
177    }
178
179    /// Return Ethernet firmware version in MAJOR.MINOR.PATCH format.
180    pub fn eth_fw_version(&self) -> String {
181        let major = (self.eth_fw_version >> 16) & 0x0FF;
182        let minor = (self.eth_fw_version >> 12) & 0x00F;
183        let patch = self.eth_fw_version & 0xFFF;
184        format!("{major}.{minor}.{patch}")
185    }
186
187    /// Return the board serial number as an integer.
188    pub fn board_serial_number(&self) -> u64 {
189        ((self.board_id_high as u64) << 32) | self.board_id_low as u64
190    }
191
192    /// Return the board serial number as a hex-formatted string.
193    pub fn board_serial_number_hex(&self) -> String {
194        format!("{:016x}", self.board_serial_number())
195    }
196
197    /// Return the board type or None if unknown
198    pub fn try_board_type(&self) -> Option<&'static str> {
199        let serial_num = self.board_serial_number();
200        let output = match (serial_num >> 36) & 0xFFFFF {
201            0x1 => match (serial_num >> 32) & 0xF {
202                0x2 => "E300_R2",
203                0x3 | 0x4 => "E300_R3",
204                _ => return None,
205            },
206            0x3 => "e150",
207            0x7 => "e75",
208            0x8 => "NEBULA_CB",
209            0xA => "e300",
210            0xB => "GALAXY",
211            0x14 => "n300",
212            0x18 => "n150",
213            0x35 => "galaxy-wormhole",
214            0x36 => "p100",
215            0x40 => "p150a",
216            0x41 => "p150b",
217            0x42 => "p150c",
218            0x43 => "p100a",
219            0x44 => "p300b",
220            0x45 => "p300a",
221            0x46 => "p300c",
222            0x47 => "galaxy-blackhole",
223            _ => return None,
224        };
225
226        Some(output)
227    }
228
229    /// Return the board type of UNSUPPORTED
230    pub fn board_type(&self) -> &'static str {
231        self.try_board_type().unwrap_or("UNSUPPORTED")
232    }
233
234    /// Return the AI clock speed in MHz.
235    pub fn ai_clk(&self) -> u32 {
236        self.aiclk & 0xffff
237    }
238
239    /// Return the AXI clock speed in MHz.
240    pub fn axi_clk(&self) -> u32 {
241        self.axiclk
242    }
243
244    /// Return the ARC clock speed in MHz.
245    pub fn arc_clk(&self) -> u32 {
246        self.arcclk
247    }
248
249    /// Return the core voltage in volts.
250    pub fn voltage(&self) -> f64 {
251        self.vcore as f64 / 1000.0
252    }
253
254    /// Return the ASIC temperature in degrees celsius.
255    pub fn asic_temperature(&self) -> f64 {
256        if self.arch.is_blackhole() {
257            let frac: f64 = (self.asic_temperature & 0xFFFF).into();
258            let frac = frac / 65536.0;
259
260            let int: f64 = ((self.asic_temperature >> 16) as i16).into();
261
262            int + frac
263        } else {
264            ((self.asic_temperature & 0xffff) >> 4) as f64
265        }
266    }
267
268    /// Return the voltage regulator temperature in degrees celsius.
269    pub fn vreg_temperature(&self) -> f64 {
270        (self.vreg_temperature & 0xffff) as f64
271    }
272
273    /// Return the inlet temperature in degrees celsius.
274    pub fn inlet_temperature(&self) -> f64 {
275        ((self.board_temperature >> 0x10) & 0xff) as f64
276    }
277
278    /// Return the first outlet temperature in degrees celsius.
279    pub fn outlet_temperature1(&self) -> f64 {
280        ((self.board_temperature >> 0x08) & 0xff) as f64
281    }
282
283    /// Return the second outlet temperature in degrees celsius.
284    pub fn outlet_temperature2(&self) -> f64 {
285        (self.board_temperature & 0xff) as f64
286    }
287
288    /// Return the power consumption in watts.
289    pub fn power(&self) -> f64 {
290        (self.tdp & 0xffff) as f64
291    }
292
293    /// Return the current consumption in amperes.
294    pub fn current(&self) -> f64 {
295        (self.tdc & 0xffff) as f64
296    }
297
298    pub fn telemetry_heartbeat(&self) -> u32 {
299        if self.arch.is_blackhole() {
300            self.timer_heartbeat
301        } else {
302            self.arc0_health
303        }
304    }
305}
306
/// Outcome of a single `ChipImpl::update_init_state` step.
///
/// Both error variants carry a human readable message and the backtrace
/// captured where the problem was detected.
#[derive(Debug)]
pub enum ChipInitResult {
    /// Everything is good, can continue with init
    NoError,
    /// We hit an error, but we can continue with init
    /// this is for things like arc or ethernet training timeout.
    /// If this is returned then there shouldn't be a chip returned to the user,
    /// but we are okay to find out more information.
    ErrorContinue(String, std::backtrace::Backtrace),
    /// We hit an error that indicates that it would be unsafe to continue with init.
    ErrorAbort(String, std::backtrace::Backtrace),
}
318
/// Defines common functionality for all chips.
/// This is a convenience interface that allows chip type agnostic code to be written.
///
/// As a general rule the chip should not be accessed without an explicit request from the user.
/// This means that chip initialization must be explicitly called and for example if the user has not
/// explicitly stated that they want to enumerate remote chips, then we won't even start looking at remote readiness.
/// This is to avoid situations where a problematic state is reached and causes an abort even if that capability is not needed.
///
/// Implementors must also provide `HlComms` and be `Send + Sync + 'static`.
pub trait ChipImpl: HlComms + Send + Sync + 'static {
    /// Update the initialization state of the chip.
    /// The primary purpose of this function is to tell the caller when it is safe to start interacting with the chip.
    ///
    /// However the secondary purpose is to provide information about what chip functions are currently available for use.
    /// For example if the arc is not ready, then we should not try to send an arc message.
    /// Or in a more complex example, if the arc is ready, but the ethernet is not (for example the ethernet fw is hung)
    /// then we will be able to access the local arc, but won't be able to access any remote chips.
    ///
    /// `status` is updated in place; the returned `ChipInitResult` tells the
    /// caller whether init may continue.
    fn update_init_state(
        &mut self,
        status: &mut InitStatus,
    ) -> Result<ChipInitResult, PlatformError>;

    /// Returns the current arch of the chip, can be used to avoid
    /// needing to ducktype when downcasting.
    fn get_arch(&self) -> Arch;

    /// Get telemetry information from the chip.
    /// The information is not cached, so should not be called repeatedly.
    fn get_telemetry(&self) -> Result<Telemetry, PlatformError>;

    /// Send an arc_msg to the underlying chip.
    fn arc_msg(&self, msg: ArcMsgOptions) -> Result<ArcMsgOk, PlatformError>;

    /// Get a list of neighbouring chips.
    /// Will return an empty list for gs and up to four chips for wh.
    fn get_neighbouring_chips(&self) -> Result<Vec<NeighbouringChip>, PlatformError>;

    /// Convenience function to downcast to a concrete type.
    /// `Chip::as_wh` and `Chip::as_bh` go through this to recover the concrete chip.
    fn as_any(&self) -> &dyn std::any::Any;

    /// Get information about the underlying chip transport.
    /// This is a hack to get the physical id of the chip.
    fn get_device_info(&self) -> Result<Option<DeviceInfo>, PlatformError>;
}
361
/// A wrapper around a chip that implements `ChipImpl`.
/// This allows us to create and use chips without knowing their type,
/// but we can still downcast to the concrete type if we need to.
pub struct Chip {
    /// The type-erased chip implementation; the `HlComms` and `ChipImpl`
    /// impls on `Chip` forward every call to it.
    pub inner: Box<dyn ChipImpl>,
}
368
369impl From<Box<dyn ChipImpl>> for Chip {
370    fn from(inner: Box<dyn ChipImpl>) -> Self {
371        Self { inner }
372    }
373}
374
375impl Chip {
376    /// Downcast to a wormhole chip
377    pub fn as_wh(&self) -> Option<&Wormhole> {
378        self.inner.as_any().downcast_ref::<Wormhole>()
379    }
380
381    /// Downcast to a blackhole chip
382    pub fn as_bh(&self) -> Option<&Blackhole> {
383        self.inner.as_any().downcast_ref::<Blackhole>()
384    }
385}
386
387impl HlComms for Chip {
388    fn comms_obj(&self) -> (&dyn ChipComms, &dyn ChipInterface) {
389        self.inner.comms_obj()
390    }
391}
392
393impl ChipImpl for Chip {
394    fn update_init_state(
395        &mut self,
396        status: &mut InitStatus,
397    ) -> Result<ChipInitResult, PlatformError> {
398        self.inner.update_init_state(status)
399    }
400
401    fn get_arch(&self) -> Arch {
402        self.inner.get_arch()
403    }
404
405    fn arc_msg(&self, msg: ArcMsgOptions) -> Result<ArcMsgOk, PlatformError> {
406        self.inner.arc_msg(msg)
407    }
408
409    fn get_neighbouring_chips(&self) -> Result<Vec<NeighbouringChip>, PlatformError> {
410        self.inner.get_neighbouring_chips()
411    }
412
413    fn as_any(&self) -> &dyn std::any::Any {
414        self.inner.as_any()
415    }
416
417    fn get_telemetry(&self) -> Result<Telemetry, PlatformError> {
418        self.inner.get_telemetry()
419    }
420
421    fn get_device_info(&self) -> Result<Option<DeviceInfo>, PlatformError> {
422        self.inner.get_device_info()
423    }
424}