1mod blackhole;
5pub mod communication;
6mod creation;
7pub mod eth_addr;
8mod hl_comms;
9mod init;
10mod remote;
11mod spi;
12pub mod wh_ubb;
13mod wormhole;
14
15pub use blackhole::spirom_tables; pub use blackhole::{message::MessageError, Blackhole};
17pub use communication::chip_comms::{
18 axi_translate, ArcIf, AxiData, AxiError, ChipComms, MemorySlice, MemorySlices,
19};
20pub use communication::chip_interface::{ChipInterface, NocInterface};
21pub use hl_comms::{HlComms, HlCommsInterface};
22pub use init::status::InitStatus;
23pub use init::{
24 status::{CommsStatus, ComponentStatusInfo},
25 wait_for_init, CallReason, ChipDetectState, InitError,
26};
27use luwen_def::Arch;
28pub use wormhole::Wormhole;
29
30use crate::arc_msg::TypedArcMsg;
31pub use crate::arc_msg::{ArcMsg, ArcMsgOk};
32use crate::{arc_msg::ArcMsgAddr, error::PlatformError, DeviceInfo};
33pub use wh_ubb::{ubb_wait_for_driver_load, wh_ubb_ipmi_reset};
34
/// Options describing a single ARC message request, consumed by
/// `ChipImpl::arc_msg`.
///
/// The `Default` implementation produces a typed `Nop` message with
/// `wait_for_done = true`, a one-second `timeout`, `use_second_mailbox = false`
/// and no explicit `addrs`.
#[derive(Debug)]
pub struct ArcMsgOptions {
    /// The message to deliver.
    pub msg: ArcMsg,
    /// NOTE(review): presumably "block until the firmware reports completion" —
    /// confirm against the `arc_msg` implementations.
    pub wait_for_done: bool,
    /// Time budget for the request (defaults to one second).
    pub timeout: std::time::Duration,
    /// NOTE(review): presumably selects an alternate mailbox for the request —
    /// confirm against the `arc_msg` implementations.
    pub use_second_mailbox: bool,
    /// Optional address override; `None` by default.
    /// NOTE(review): presumably `None` means "use the chip's standard
    /// addresses" — confirm.
    pub addrs: Option<ArcMsgAddr>,
}
44
45impl Default for ArcMsgOptions {
46 fn default() -> Self {
47 Self {
48 msg: ArcMsg::Typed(TypedArcMsg::Nop),
49 wait_for_done: true,
50 timeout: std::time::Duration::from_secs(1),
51 use_second_mailbox: false,
52 addrs: None,
53 }
54 }
55}
56
/// Description of one chip adjacent to the current one, as returned by
/// `ChipImpl::get_neighbouring_chips`.
///
/// Derives `Hash`/`Eq`, so values can be used as set members or map keys.
#[derive(Debug, Hash, PartialEq, Eq)]
pub struct NeighbouringChip {
    /// NOTE(review): presumably whether routing over this link is enabled —
    /// confirm against the code that fills this in.
    pub routing_enabled: bool,
    /// NOC coordinate pair on the local side of the link.
    /// NOTE(review): assumed to be `(x, y)` — confirm.
    pub local_noc_addr: (u8, u8),
    /// NOC coordinate pair on the remote side of the link.
    /// NOTE(review): assumed to be `(x, y)` — confirm.
    pub remote_noc_addr: (u8, u8),
    /// Ethernet address associated with this neighbour entry.
    /// NOTE(review): presumably the neighbour's own address — confirm.
    pub eth_addr: crate::EthAddr,
}
64
/// Flat telemetry snapshot for one chip, as returned by
/// `ChipImpl::get_telemetry`.
///
/// Almost every field is a raw 32-bit word. Several of them pack more than
/// one value; use the accessor methods on `Telemetry` (for example
/// `firmware_date`, `arc_fw_version`, `asic_temperature`) to obtain decoded
/// values instead of reading those fields directly. Fields typed as `Option`
/// may be absent.
/// NOTE(review): which architectures populate which fields is not visible
/// here — confirm against the per-chip `get_telemetry` implementations.
#[derive(Default, Debug)]
pub struct Telemetry {
    /// Architecture of the reporting chip; selects the decoding used by
    /// `asic_temperature` and `telemetry_heartbeat`.
    pub arch: Arch,
    pub board_id: u64,
    pub enum_version: u32,
    pub entry_count: u32,
    pub device_id: u32,
    pub asic_id: u32,
    pub asic_ro: u32,
    pub asic_idd: u32,
    /// Upper 32 bits of the board serial number (see `board_serial_number`).
    pub board_id_high: u32,
    /// Lower 32 bits of the board serial number (see `board_serial_number`).
    pub board_id_low: u32,
    pub harvesting_state: u32,
    pub update_telem_speed: u32,
    /// Packed version word; `arc_fw_version` reads major/minor/patch from
    /// bits 23..16, 15..8 and 7..0.
    pub arc0_fw_version: u32,
    pub arc1_fw_version: u32,
    pub arc2_fw_version: u32,
    pub arc3_fw_version: u32,
    pub spibootrom_fw_version: u32,
    /// Packed version word; `eth_fw_version` reads major/minor/patch from
    /// bits 23..16, 15..12 and 11..0.
    pub eth_fw_version: u32,
    pub ddr_fw_version: u32,
    pub l2cpu_fw_version: u32,
    pub m3_bl_fw_version: u32,
    pub m3_app_fw_version: u32,
    pub ddr_speed: Option<u32>,
    pub ddr_status: u32,
    pub eth_status0: u32,
    pub eth_status1: u32,
    pub pcie_status: u32,
    pub faults: u32,
    /// Returned by `telemetry_heartbeat` on non-Blackhole chips.
    pub arc0_health: u32,
    pub arc1_health: u32,
    pub arc2_health: u32,
    pub arc3_health: u32,
    pub fan_speed: u32,
    /// Only the low 16 bits are reported by `ai_clk`.
    pub aiclk: u32,
    pub axiclk: u32,
    pub arcclk: u32,
    pub l2cpuclk0: u32,
    pub l2cpuclk1: u32,
    pub l2cpuclk2: u32,
    pub l2cpuclk3: u32,
    pub throttler: u32,
    /// Divided by 1000 in `voltage`.
    pub vcore: u32,
    /// Raw temperature word; the layout depends on `arch`
    /// (see `asic_temperature`).
    pub asic_temperature: u32,
    /// Only the low 16 bits are reported by `vreg_temperature`.
    pub vreg_temperature: u32,
    /// Three packed bytes: bits 23..16 inlet, 15..8 outlet 1, 7..0 outlet 2
    /// (see `inlet_temperature`, `outlet_temperature1`, `outlet_temperature2`).
    pub board_temperature: u32,
    /// Only the low 16 bits are reported by `power`.
    pub tdp: u32,
    /// Only the low 16 bits are reported by `current`.
    pub tdc: u32,
    pub vdd_limits: u32,
    pub thm_limits: u32,
    /// Packed firmware date decoded by `firmware_date`
    /// (bits 31..28 year offset from 2020, 27..24 month, 23..16 day).
    pub wh_fw_date: u32,
    pub asic_tmon0: u32,
    pub asic_tmon1: u32,
    pub mvddq_power: u32,
    pub gddr_train_temp0: u32,
    pub gddr_train_temp1: u32,
    pub asic_power: Option<u32>,
    pub aux_status: Option<u32>,
    pub boot_date: u32,
    pub rt_seconds: u32,
    pub eth_debug_status0: u32,
    pub eth_debug_status1: u32,
    pub tt_flash_version: u32,
    pub fw_bundle_version: u32,
    /// Returned by `telemetry_heartbeat` on Blackhole chips.
    pub timer_heartbeat: u32,
    pub noc_translation_enabled: bool,
    pub tensix_enabled_col: u32,
    pub enabled_eth: u32,
    pub enabled_gddr: u32,
    pub enabled_l2cpu: u32,
    pub enabled_pcie: u32,
    pub fan_rpm: u32,
    pub gddr01_temp: u32,
    pub gddr23_temp: u32,
    pub gddr45_temp: u32,
    pub gddr67_temp: u32,
    pub gddr01_corr_errs: u32,
    pub gddr23_corr_errs: u32,
    pub gddr45_corr_errs: u32,
    pub gddr67_corr_errs: u32,
    pub gddr_uncorr_errs: u32,
    pub max_gddr_temp: u32,
    pub asic_location: u32,
    pub board_power_limit: u32,
    pub input_power: u32,
    pub tdc_limit_max: u32,
    pub thm_limit_throttle: u32,
    pub therm_trip_count: u32,
    pub asic_id_high: u32,
    pub asic_id_low: u32,
    pub aiclk_limit_max: u32,
    pub tdp_limit_max: u32,
}
159
160impl Telemetry {
161 pub fn firmware_date(&self) -> String {
163 let year = ((self.wh_fw_date >> 28) & 0xF) + 2020;
164 let month = (self.wh_fw_date >> 24) & 0xF;
165 let day = (self.wh_fw_date >> 16) & 0xFF;
166 let _hour = (self.wh_fw_date >> 8) & 0xFF;
167 let _minute = self.wh_fw_date & 0xFF;
168 format!("{year:04}-{month:02}-{day:02}")
169 }
170
171 pub fn arc_fw_version(&self) -> String {
173 let major = (self.arc0_fw_version >> 16) & 0xFF;
174 let minor = (self.arc0_fw_version >> 8) & 0xFF;
175 let patch = self.arc0_fw_version & 0xFF;
176 format!("{major}.{minor}.{patch}")
177 }
178
179 pub fn eth_fw_version(&self) -> String {
181 let major = (self.eth_fw_version >> 16) & 0x0FF;
182 let minor = (self.eth_fw_version >> 12) & 0x00F;
183 let patch = self.eth_fw_version & 0xFFF;
184 format!("{major}.{minor}.{patch}")
185 }
186
187 pub fn board_serial_number(&self) -> u64 {
189 ((self.board_id_high as u64) << 32) | self.board_id_low as u64
190 }
191
192 pub fn board_serial_number_hex(&self) -> String {
194 format!("{:016x}", self.board_serial_number())
195 }
196
197 pub fn try_board_type(&self) -> Option<&'static str> {
199 let serial_num = self.board_serial_number();
200 let output = match (serial_num >> 36) & 0xFFFFF {
201 0x1 => match (serial_num >> 32) & 0xF {
202 0x2 => "E300_R2",
203 0x3 | 0x4 => "E300_R3",
204 _ => return None,
205 },
206 0x3 => "e150",
207 0x7 => "e75",
208 0x8 => "NEBULA_CB",
209 0xA => "e300",
210 0xB => "GALAXY",
211 0x14 => "n300",
212 0x18 => "n150",
213 0x35 => "galaxy-wormhole",
214 0x36 => "p100",
215 0x40 => "p150a",
216 0x41 => "p150b",
217 0x42 => "p150c",
218 0x43 => "p100a",
219 0x44 => "p300b",
220 0x45 => "p300a",
221 0x46 => "p300c",
222 0x47 => "galaxy-blackhole",
223 _ => return None,
224 };
225
226 Some(output)
227 }
228
229 pub fn board_type(&self) -> &'static str {
231 self.try_board_type().unwrap_or("UNSUPPORTED")
232 }
233
234 pub fn ai_clk(&self) -> u32 {
236 self.aiclk & 0xffff
237 }
238
239 pub fn axi_clk(&self) -> u32 {
241 self.axiclk
242 }
243
244 pub fn arc_clk(&self) -> u32 {
246 self.arcclk
247 }
248
249 pub fn voltage(&self) -> f64 {
251 self.vcore as f64 / 1000.0
252 }
253
254 pub fn asic_temperature(&self) -> f64 {
256 if self.arch.is_blackhole() {
257 let frac: f64 = (self.asic_temperature & 0xFFFF).into();
258 let frac = frac / 65536.0;
259
260 let int: f64 = ((self.asic_temperature >> 16) as i16).into();
261
262 int + frac
263 } else {
264 ((self.asic_temperature & 0xffff) >> 4) as f64
265 }
266 }
267
268 pub fn vreg_temperature(&self) -> f64 {
270 (self.vreg_temperature & 0xffff) as f64
271 }
272
273 pub fn inlet_temperature(&self) -> f64 {
275 ((self.board_temperature >> 0x10) & 0xff) as f64
276 }
277
278 pub fn outlet_temperature1(&self) -> f64 {
280 ((self.board_temperature >> 0x08) & 0xff) as f64
281 }
282
283 pub fn outlet_temperature2(&self) -> f64 {
285 (self.board_temperature & 0xff) as f64
286 }
287
288 pub fn power(&self) -> f64 {
290 (self.tdp & 0xffff) as f64
291 }
292
293 pub fn current(&self) -> f64 {
295 (self.tdc & 0xffff) as f64
296 }
297
298 pub fn telemetry_heartbeat(&self) -> u32 {
299 if self.arch.is_blackhole() {
300 self.timer_heartbeat
301 } else {
302 self.arc0_health
303 }
304 }
305}
306
/// Outcome of one `ChipImpl::update_init_state` step.
///
/// Both error variants carry a human-readable message and the backtrace
/// captured where the problem was detected.
pub enum ChipInitResult {
    /// The step completed without a problem.
    NoError,
    /// A problem occurred.
    /// NOTE(review): presumably initialization may still proceed after this
    /// variant — confirm against `wait_for_init`.
    ErrorContinue(String, std::backtrace::Backtrace),
    /// A problem occurred.
    /// NOTE(review): presumably initialization must stop after this variant —
    /// confirm against `wait_for_init`.
    ErrorAbort(String, std::backtrace::Backtrace),
}
318
/// Architecture-independent interface to a single chip.
///
/// Implementors must also provide the low-level accessors of `HlComms` and be
/// `Send + Sync + 'static`, which allows them to be stored as
/// `Box<dyn ChipImpl>` (see `Chip`).
pub trait ChipImpl: HlComms + Send + Sync + 'static {
    /// Advances the chip's initialization, recording progress in `status`.
    ///
    /// The returned `ChipInitResult` reports whether the step hit a problem;
    /// the outer `Err` carries a `PlatformError`.
    /// NOTE(review): presumably called repeatedly by `wait_for_init` until
    /// initialization completes — confirm.
    fn update_init_state(
        &mut self,
        status: &mut InitStatus,
    ) -> Result<ChipInitResult, PlatformError>;

    /// Returns the architecture of this chip.
    fn get_arch(&self) -> Arch;

    /// Returns a telemetry snapshot for this chip.
    fn get_telemetry(&self) -> Result<Telemetry, PlatformError>;

    /// Sends the ARC message described by `msg` and returns the outcome.
    fn arc_msg(&self, msg: ArcMsgOptions) -> Result<ArcMsgOk, PlatformError>;

    /// Returns the chips adjacent to this one.
    /// NOTE(review): presumably those reachable over ethernet links — confirm.
    fn get_neighbouring_chips(&self) -> Result<Vec<NeighbouringChip>, PlatformError>;

    /// Exposes the implementor as `Any` so callers can downcast to the
    /// concrete chip type (used by `Chip::as_wh` and `Chip::as_bh`).
    fn as_any(&self) -> &dyn std::any::Any;

    /// Returns device information for this chip, or `Ok(None)` when none is
    /// available.
    /// NOTE(review): presumably `None` for chips that are not directly
    /// attached to the host — confirm.
    fn get_device_info(&self) -> Result<Option<DeviceInfo>, PlatformError>;
}
361
/// Type-erased chip handle that owns a boxed `ChipImpl`.
///
/// `Chip` itself implements `HlComms` and `ChipImpl` by forwarding to `inner`,
/// and offers `as_wh` / `as_bh` to recover the concrete chip type.
pub struct Chip {
    /// The concrete chip implementation every call is forwarded to.
    pub inner: Box<dyn ChipImpl>,
}
368
369impl From<Box<dyn ChipImpl>> for Chip {
370 fn from(inner: Box<dyn ChipImpl>) -> Self {
371 Self { inner }
372 }
373}
374
375impl Chip {
376 pub fn as_wh(&self) -> Option<&Wormhole> {
378 self.inner.as_any().downcast_ref::<Wormhole>()
379 }
380
381 pub fn as_bh(&self) -> Option<&Blackhole> {
383 self.inner.as_any().downcast_ref::<Blackhole>()
384 }
385}
386
387impl HlComms for Chip {
388 fn comms_obj(&self) -> (&dyn ChipComms, &dyn ChipInterface) {
389 self.inner.comms_obj()
390 }
391}
392
393impl ChipImpl for Chip {
394 fn update_init_state(
395 &mut self,
396 status: &mut InitStatus,
397 ) -> Result<ChipInitResult, PlatformError> {
398 self.inner.update_init_state(status)
399 }
400
401 fn get_arch(&self) -> Arch {
402 self.inner.get_arch()
403 }
404
405 fn arc_msg(&self, msg: ArcMsgOptions) -> Result<ArcMsgOk, PlatformError> {
406 self.inner.arc_msg(msg)
407 }
408
409 fn get_neighbouring_chips(&self) -> Result<Vec<NeighbouringChip>, PlatformError> {
410 self.inner.get_neighbouring_chips()
411 }
412
413 fn as_any(&self) -> &dyn std::any::Any {
414 self.inner.as_any()
415 }
416
417 fn get_telemetry(&self) -> Result<Telemetry, PlatformError> {
418 self.inner.get_telemetry()
419 }
420
421 fn get_device_info(&self) -> Result<Option<DeviceInfo>, PlatformError> {
422 self.inner.get_device_info()
423 }
424}