Skip to main content

luwen_api/chip/
wormhole.rs

1// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::{backtrace, sync::Arc};
5
6use crate::{
7    arc_msg::{ArcMsgAddr, ArcMsgOk, TypedArcMsg},
8    chip::{
9        communication::{
10            chip_comms::{load_axi_table, ChipComms},
11            chip_interface::ChipInterface,
12        },
13        hl_comms::HlCommsInterface,
14    },
15    error::{BtWrapper, PlatformError},
16    ArcMsg, ChipImpl, IntoChip,
17};
18
19use super::{
20    eth_addr::EthAddr,
21    hl_comms::HlComms,
22    init::status::{ComponentStatusInfo, EthernetPartialInitError, InitOptions, WaitStatus},
23    remote::{EthAddresses, RemoteArcIf},
24    ArcMsgOptions, ChipInitResult, CommsStatus, InitStatus, NeighbouringChip,
25};
26
27/// Implementation of the interface for a Wormhole
28/// both the local and remote Wormhole chips are represented by this struct
29#[derive(Clone)]
30pub struct Wormhole {
31    pub chip_if: Arc<dyn ChipInterface + Send + Sync>,
32    pub arc_if: Arc<dyn ChipComms + Send + Sync>,
33
34    pub is_remote: bool,
35    pub use_arc_for_spi: bool,
36
37    pub arc_addrs: ArcMsgAddr,
38    pub eth_locations: [EthCore; 16],
39    pub eth_addrs: EthAddresses,
40    telemetry_addr: Arc<once_cell::sync::OnceCell<u32>>,
41}
42
43impl HlComms for Wormhole {
44    fn comms_obj(&self) -> (&dyn ChipComms, &dyn ChipInterface) {
45        (self.arc_if.as_ref(), self.chip_if.as_ref())
46    }
47}
48
49impl HlComms for &Wormhole {
50    fn comms_obj(&self) -> (&dyn ChipComms, &dyn ChipInterface) {
51        (self.arc_if.as_ref(), self.chip_if.as_ref())
52    }
53}
54
/// Grid position of one ethernet core together with its enable flag.
#[derive(Debug, Clone, Copy)]
pub struct EthCore {
    /// Column of the core.
    pub x: u8,
    /// Row of the core.
    pub y: u8,
    /// Whether the core is considered usable.
    pub enabled: bool,
}
61
62impl Default for EthCore {
63    fn default() -> Self {
64        Self {
65            x: 0,
66            y: 0,
67            enabled: true,
68        }
69    }
70}
71
72impl Wormhole {
73    pub(crate) fn init<
74        CC: ChipComms + Send + Sync + 'static,
75        CI: ChipInterface + Send + Sync + 'static,
76    >(
77        is_remote: bool,
78        use_arc_for_spi: bool,
79        arc_if: CC,
80        chip_if: CI,
81    ) -> Result<Self, PlatformError> {
82        // let mut version = [0; 4];
83        // arc_if.axi_read(&chip_if, 0x0, &mut version);
84        // let version = u32::from_le_bytes(version);
85        let _version = 0x0;
86
87        let output = Wormhole {
88            chip_if: Arc::new(chip_if),
89
90            is_remote,
91            use_arc_for_spi,
92
93            arc_addrs: ArcMsgAddr {
94                scratch_base: arc_if.axi_translate("ARC_RESET.SCRATCH[0]")?.addr,
95                arc_misc_cntl: arc_if.axi_translate("ARC_RESET.ARC_MISC_CNTL")?.addr,
96            },
97
98            arc_if: Arc::new(arc_if),
99            eth_addrs: EthAddresses::default(),
100
101            telemetry_addr: Arc::new(once_cell::sync::OnceCell::new()),
102
103            eth_locations: [
104                EthCore {
105                    x: 9,
106                    y: 0,
107                    ..Default::default()
108                },
109                EthCore {
110                    x: 1,
111                    y: 0,
112                    ..Default::default()
113                },
114                EthCore {
115                    x: 8,
116                    y: 0,
117                    ..Default::default()
118                },
119                EthCore {
120                    x: 2,
121                    y: 0,
122                    ..Default::default()
123                },
124                EthCore {
125                    x: 7,
126                    y: 0,
127                    ..Default::default()
128                },
129                EthCore {
130                    x: 3,
131                    y: 0,
132                    ..Default::default()
133                },
134                EthCore {
135                    x: 6,
136                    y: 0,
137                    ..Default::default()
138                },
139                EthCore {
140                    x: 4,
141                    y: 0,
142                    ..Default::default()
143                },
144                EthCore {
145                    x: 9,
146                    y: 6,
147                    ..Default::default()
148                },
149                EthCore {
150                    x: 1,
151                    y: 6,
152                    ..Default::default()
153                },
154                EthCore {
155                    x: 8,
156                    y: 6,
157                    ..Default::default()
158                },
159                EthCore {
160                    x: 2,
161                    y: 6,
162                    ..Default::default()
163                },
164                EthCore {
165                    x: 7,
166                    y: 6,
167                    ..Default::default()
168                },
169                EthCore {
170                    x: 3,
171                    y: 6,
172                    ..Default::default()
173                },
174                EthCore {
175                    x: 6,
176                    y: 6,
177                    ..Default::default()
178                },
179                EthCore {
180                    x: 4,
181                    y: 6,
182                    ..Default::default()
183                },
184            ],
185        };
186
187        Ok(output)
188    }
189
190    pub fn init_eth_addrs(&mut self) -> Result<(), PlatformError> {
191        if self.eth_addrs.masked_version == 0 {
192            let telemetry = self.get_telemetry()?;
193
194            self.eth_addrs = EthAddresses::new(telemetry.eth_fw_version);
195        }
196
197        Ok(())
198    }
199
200    pub fn get_if<T: ChipInterface>(&self) -> Option<&T> {
201        self.chip_if.as_any().downcast_ref::<T>()
202    }
203
204    pub fn open_remote(&self, addr: impl IntoChip<EthAddr>) -> Result<Wormhole, PlatformError> {
205        let arc_if = RemoteArcIf {
206            addr: addr.cinto(&self.arc_if, &self.chip_if).unwrap(),
207            axi_data: Some(load_axi_table("wormhole-axi-noc.bin", 0)),
208        };
209
210        Self::init(true, true, arc_if, self.chip_if.clone())
211    }
212
213    // fn check_dram_trained(&self) {
214    //     let pc = self.axi_sread32("ARC_RESET.POST_CODE")?;
215
216    //     0x29
217    // }
218
219    fn check_arc_msg_safe(&self, msg_reg: u64, _return_reg: u64) -> Result<(), PlatformError> {
220        const POST_CODE_INIT_DONE: u32 = 0xC0DE0001;
221        const _POST_CODE_ARC_MSG_HANDLE_START: u32 = 0xC0DE0030;
222        const POST_CODE_ARC_MSG_HANDLE_DONE: u32 = 0xC0DE003F;
223        const POST_CODE_ARC_TIME_LAST: u32 = 0xC0DE007F;
224
225        let s5 = self.axi_sread32(format!("ARC_RESET.SCRATCH[{msg_reg}]"))?;
226        let pc = self.axi_sread32("ARC_RESET.POST_CODE")?;
227        let dma = self.axi_sread32("ARC_CSM.ARC_PCIE_DMA_REQUEST.trigger")?;
228
229        if pc == 0xFFFFFFFF {
230            return Err(PlatformError::ArcNotReady(
231                crate::error::ArcReadyError::NoAccess,
232                BtWrapper::capture(),
233            ))?;
234        }
235
236        if s5 == 0xDEADC0DE {
237            return Err(PlatformError::ArcNotReady(
238                crate::error::ArcReadyError::WatchdogTriggered,
239                BtWrapper::capture(),
240            ))?;
241        }
242
243        // Still booting and it will later wipe SCRATCH[5/2].
244        if s5 == 0x00000060 || pc == 0x11110000 {
245            return Err(PlatformError::ArcNotReady(
246                crate::error::ArcReadyError::BootIncomplete,
247                BtWrapper::capture(),
248            ))?;
249        }
250
251        if s5 == 0x0000AA00 || s5 == TypedArcMsg::ArcGoToSleep.msg_code() as u32 {
252            return Err(PlatformError::ArcNotReady(
253                crate::error::ArcReadyError::Asleep,
254                BtWrapper::capture(),
255            ))?;
256        }
257
258        // PCIE DMA writes SCRATCH[5] on exit, so it's not safe.
259        // Also we assume FW is hung if we see this state.
260        // (The former is only relevant when msg_reg==5, but the latter is always relevant.)
261        if dma != 0 {
262            return Err(PlatformError::ArcNotReady(
263                crate::error::ArcReadyError::OutstandingPcieDMA,
264                BtWrapper::capture(),
265            ))?;
266        }
267
268        if s5 & 0xFFFFFF00 == 0x0000AA00 {
269            let message_id = s5 & 0xFF;
270            return Err(PlatformError::ArcNotReady(
271                crate::error::ArcReadyError::MessageQueued(message_id),
272                BtWrapper::capture(),
273            ))?;
274        }
275
276        if s5 & 0xFF00FFFF == 0xAA000000 {
277            let message_id = (s5 >> 16) & 0xFF;
278            return Err(PlatformError::ArcNotReady(
279                crate::error::ArcReadyError::HandlingMessage(message_id),
280                BtWrapper::capture(),
281            ))?;
282        }
283
284        // Boot complete (new FW only), message not recognized,
285        // pcie_dma_{chip_to_host,host_to_chip}_transfer failed to acquire PCIE mutex
286        if let 0x00000001 | 0xFFFFFFFF | 0xFFFFDEAD = s5 {
287            return Ok(());
288        }
289
290        if s5 & 0x0000FFFF > 0x00000001 {
291            // YYYY00XX for XX != 0, 1
292            // Message complete, response written into s5. Post code might not be set back to idle yet, but it will happen.
293            return Ok(());
294        }
295
296        if s5 == 0 {
297            // not yet booted or L2 init or old FW finished boot, or old FW processing message
298            // or pcie_dma_{chip_to_host,host_to_chip}_transfer completed
299
300            // Some of these also represent short-term busy, but it's safe to write SCRATCH[5].
301            let pc_idle = pc == POST_CODE_INIT_DONE
302                || (POST_CODE_ARC_MSG_HANDLE_DONE..=POST_CODE_ARC_TIME_LAST).contains(&pc);
303            if pc_idle {
304                return Ok(());
305            } else {
306                return Err(PlatformError::ArcNotReady(
307                    crate::error::ArcReadyError::OldPostCode(pc),
308                    BtWrapper::capture(),
309                ))?;
310            }
311        }
312
313        // We should never get here, every case should be handled above.
314        Ok(())
315    }
316
317    pub fn spi_write(&self, addr: u32, value: &[u8]) -> Result<(), Box<dyn std::error::Error>> {
318        let spi = super::spi::ActiveSpi::new(self, self.use_arc_for_spi)?;
319
320        spi.write(self, addr, value)?;
321
322        Ok(())
323    }
324
325    pub fn spi_read(&self, addr: u32, value: &mut [u8]) -> Result<(), Box<dyn std::error::Error>> {
326        let spi = super::spi::ActiveSpi::new(self, self.use_arc_for_spi)?;
327
328        spi.read(self, addr, value)?;
329
330        Ok(())
331    }
332}
333
334fn default_status() -> InitStatus {
335    InitStatus {
336        comms_status: super::CommsStatus::CanCommunicate,
337        arc_status: ComponentStatusInfo {
338            name: "ARC".to_string(),
339            wait_status: Box::new([WaitStatus::Waiting(None)]),
340
341            start_time: std::time::Instant::now(),
342            timeout: std::time::Duration::from_secs(300),
343        },
344        dram_status: ComponentStatusInfo::init_waiting(
345            "DRAM".to_string(),
346            std::time::Duration::from_secs(300),
347            4,
348        ),
349        eth_status: ComponentStatusInfo::init_waiting(
350            "ETH".to_string(),
351            std::time::Duration::from_secs(15 * 60),
352            16,
353        ),
354        cpu_status: ComponentStatusInfo::not_present("CPU".to_string()),
355
356        init_options: InitOptions { noc_safe: false },
357
358        unknown_state: false,
359    }
360}
361
362impl ChipImpl for Wormhole {
    /// Advances the initialization state of this chip by one step.
    ///
    /// Each call inspects the ARC, DRAM and ethernet entries of `status`:
    /// - ARC: while an entry is waiting, `check_arc_msg_safe(5, 3)` is polled.
    ///   Success marks the entry `JustFinished` (and `Done` on the next call);
    ///   failures are either recorded as an error on the entry, kept as a waiting
    ///   message until the timeout expires, or returned as a `ChipInitResult` error.
    /// - DRAM: once ARC is no longer waiting, waiting entries are marked `Done`
    ///   without any check when ARC has no error, and every entry is marked
    ///   `NoCheck` when ARC has an error.
    /// - ETH: once ARC is no longer waiting and has no error, ethernet training is
    ///   checked unless the board is a WH 6U GLX (UBB) board or `noc_safe` is set;
    ///   in those two cases every entry is marked `Done`. When ARC has an error,
    ///   every entry is marked `NoCheck`.
    ///
    /// If `status.unknown_state` is set, `status` is first reset to
    /// `default_status()` while keeping the caller's `init_options`.
    ///
    /// # Errors
    ///
    /// Propagates failures of `get_telemetry` and `get_local_chip_coord`; all other
    /// failures are reported through the returned `ChipInitResult`.
363    fn update_init_state(
364        &mut self,
365        status: &mut InitStatus,
366    ) -> Result<ChipInitResult, PlatformError> {
367        if status.unknown_state {
368            let init_options = std::mem::take(&mut status.init_options);
369            *status = default_status();
370            status.init_options = init_options;
371        }
372
373        let comms = &mut status.comms_status;
374
375        {
376            let status = &mut status.arc_status;
377            for arc_status in status.wait_status.iter_mut() {
378                match arc_status {
379                    WaitStatus::Waiting(status_string) => {
380                        match self.check_arc_msg_safe(5, 3) {
381                            Ok(_) => *arc_status = WaitStatus::JustFinished,
382                            Err(err) => {
383                                match err {
384                                    PlatformError::ArcNotReady(reason, _) => {
385                                        // There are three possibilities when trying to get a response
386                                        // here. 1. 0xffffffff in this case we want to assume this is some
387                                        // sort of AxiError and abort the init. 2. An error we may
388                                        // eventually recover from, i.e. arc booting... 3. we have hit an
389                                        // error that won't resolve but isn't indicative of further
390                                        // problems. For example watchdog triggered.
391                                        match reason {
392                                            // Reasons treated as non-recoverable (NoAccess is reported when
393                                            // the post code reads back 0xffffffff): record the error right
394                                            // away instead of waiting for the timeout.
395                                            crate::error::ArcReadyError::NoAccess
396                                            | crate::error::ArcReadyError::BootError
397                                            | crate::error::ArcReadyError::WatchdogTriggered
398                                            | crate::error::ArcReadyError::Asleep
399                                            | crate::error::ArcReadyError::OldPostCode(_) => {
400                                                *arc_status = WaitStatus::Error(super::init::status::ArcInitError::WaitingForInit(reason));
401                                            }
                                            // Transient reasons: keep waiting and surface the reason, but
                                            // turn it into an error once the timeout has elapsed.
402                                            crate::error::ArcReadyError::BootIncomplete
403                                            | crate::error::ArcReadyError::OutstandingPcieDMA
404                                            | crate::error::ArcReadyError::MessageQueued(_)
405                                            | crate::error::ArcReadyError::HandlingMessage(_) => {
406                                                *status_string = Some(reason.to_string());
407                                                if status.start_time.elapsed() > status.timeout {
408                                                    *arc_status = WaitStatus::Error(super::init::status::ArcInitError::WaitingForInit(reason));
409                                                }
410                                            }
411                                        }
412                                    }
413
414                                    PlatformError::UnsupportedFwVersion { version, required } => {
415                                        *arc_status = WaitStatus::Error(
416                                            super::init::status::ArcInitError::FwVersionTooOld {
417                                                version,
418                                                required,
419                                            },
420                                        );
421                                    }
422
423                                    // The fact that this is here means that our result is too generic, for now we just ignore it.
424                                    PlatformError::ArcMsgError(error) => {
425                                        return Ok(ChipInitResult::ErrorContinue(
426                                            error.to_string(),
427                                            backtrace::Backtrace::capture(),
428                                        ));
429                                    }
430
431                                    PlatformError::MessageError(error) => {
432                                        return Ok(ChipInitResult::ErrorContinue(
433                                            error.to_string(),
434                                            backtrace::Backtrace::capture(),
435                                        ));
436                                    }
437
438                                    // This is fine to hit at this stage (though it should have been already verified to not be the case).
439                                    // For now we just ignore it and hope that it will be resolved by the time the timeout expires...
440                                    PlatformError::EthernetTrainingNotComplete(_) => {
441                                        if let WaitStatus::Waiting(status_string) = arc_status {
442                                            if status.start_time.elapsed() > status.timeout {
443                                                *arc_status = WaitStatus::Timeout(status.timeout);
444                                            } else {
445                                                *status_string = Some("Waiting on arc/ethernet; this is unexpected but we'll assume that things will clear up if we wait.".to_string());
446                                            }
447                                        }
448                                    }
449
450                                    // This is an "expected error" but we probably can't recover from it, so we should abort the init.
451                                    PlatformError::AxiError(error) => {
452                                        *comms = CommsStatus::CommunicationError(error.to_string());
453                                        return Ok(ChipInitResult::ErrorAbort(
454                                            format!("ARC AXI error: {error}"),
455                                            backtrace::Backtrace::capture(),
456                                        ));
457                                    }
458
459                                    // We don't expect to hit these cases so if we do, we should assume that something went terribly
460                                    // wrong and abort the init.
461                                    PlatformError::WrongChipArch {
462                                        actual,
463                                        expected,
464                                        backtrace,
465                                    } => {
466                                        return Ok(ChipInitResult::ErrorAbort(
467                                            format!(
468                                                "expected chip: {expected}, actual detected chip: {actual}"
469                                            ),
470                                            backtrace.0,
471                                        ))
472                                    }
473
474                                    PlatformError::WrongChipArchs {
475                                        actual,
476                                        expected,
477                                        backtrace,
478                                    } => {
479                                        let expected_chips = expected
480                                            .iter()
481                                            .map(|arch| arch.to_string())
482                                            .collect::<Vec<_>>()
483                                            .join(", ");
484                                        return Ok(ChipInitResult::ErrorAbort(
485                                            format!(
486                                                "expected chip: {expected_chips}, actual detected chips: {actual}"
487                                            ),
488                                            backtrace.0,
489                                        ));
490                                    }
491
492                                    PlatformError::Generic(error, backtrace) => {
493                                        return Ok(ChipInitResult::ErrorAbort(error, backtrace.0));
494                                    }
495
496                                    PlatformError::GenericError(error, backtrace) => {
497                                        let err_msg = error.to_string();
498                                        return Ok(ChipInitResult::ErrorAbort(
499                                            err_msg,
500                                            backtrace.0,
501                                        ));
502                                    }
503                                }
504                            }
505                        }
506                    }
507                    WaitStatus::JustFinished => {
508                        *arc_status = WaitStatus::Done;
509                    }
510                    _ => {}
511                }
512            }
513        }
514
515        // If ARC has not finished initialization then we shouldn't init eth or dram.
516        if !status.arc_status.is_waiting() {
517            // If something went wrong with ARC then we probably don't have DRAM
518            if !status.arc_status.has_error() {
519                // GDDR starts powered off at boot; DRAM training only happens after GDDR
520                // power-on, which occurs after Luwen's init sequence. Skip the telemetry
521                // check and mark DRAM done immediately.
522                for dram_status in status.dram_status.wait_status.iter_mut() {
523                    if let WaitStatus::Waiting(_) = dram_status {
524                        *dram_status = WaitStatus::Done;
525                    }
526                }
527            } else {
528                for dram_status in status.dram_status.wait_status.iter_mut() {
529                    *dram_status = WaitStatus::NoCheck;
530                }
531            }
532        } else {
533            for dram_status in status.dram_status.wait_status.iter_mut() {
534                if let WaitStatus::Waiting(status_string) = dram_status {
535                    *status_string = Some("Waiting for ARC".to_string());
536                }
537            }
538        }
539
540        // If ARC has not finished initialization then we shouldn't init eth.
541        if !status.arc_status.is_waiting() {
542            // We need arc to be alive so that we can check which cores are enabled
543            if !status.arc_status.has_error() {
544                // Only do eth training if board type is not UBB
545                // By this point arc should be alive so we can safely access telem
546                let telem = self.get_telemetry()?;
547                let board_type: u64 =
548                    telem.board_id_low as u64 | ((telem.board_id_high as u64) << 32);
549                let board_upi: u64 = (board_type >> 36) & 0xFFFFF;
550                const WH_6U_GLX_UPI: u64 = 0x35;
551
552                if board_upi != WH_6U_GLX_UPI {
553                    // Only try to initialize the ethernet if we are not in noc_safe mode.
554                    if !status.init_options.noc_safe {
555                        let status = &mut status.eth_status;
556
557                        // Only query the ethernet training status while the ethernet component
558                        // is still waiting.
559                        if status.is_waiting() {
560                            let eth_training_status = match self.check_ethernet_training_complete() {
561                                Ok(eth_status) => eth_status,
562                                Err(err) => match err {
563                                    // ARC should be initialized at this point, hitting an error here means
564                                    // that we can no longer progress in the init.
565                                    PlatformError::ArcMsgError(error) => {
566                                        return Ok(ChipInitResult::ErrorContinue(
567                                            error.to_string(),
568                                            backtrace::Backtrace::capture(),
569                                        ));
570                                    }
571
572                                    PlatformError::MessageError(error) => {
573                                        return Ok(ChipInitResult::ErrorContinue(
574                                            error.to_string(),
575                                            backtrace::Backtrace::capture(),
576                                        ));
577                                    }
578
579                                    PlatformError::ArcNotReady(error, backtrace) => {
580                                        return Ok(ChipInitResult::ErrorContinue(
581                                            error.to_string(),
582                                            backtrace.0,
583                                        ));
584                                    }
585
586                                    // We are checking for ethernet training to complete... if we hit this then
587                                    // something has gone terribly wrong
588                                    PlatformError::EthernetTrainingNotComplete(eth_cores) => {
589                                        let false_count = eth_cores.iter().filter(|&&x| !x).count();
590                                        return Ok(ChipInitResult::ErrorContinue(
591                                            format!(
592                                                "Ethernet training not complete on [{false_count}/16] ports"
593                                            ),
594                                            backtrace::Backtrace::capture(),
595                                        ));
596                                    }
597
598                                    // This is an "expected error" but we probably can't recover from it, so we should abort the init.
599                                    PlatformError::AxiError(error) => {
600                                        return Ok(ChipInitResult::ErrorAbort(
601                                            error.to_string(),
602                                            backtrace::Backtrace::capture(),
603                                        ));
604                                    }
605
606                                    // We don't expect to hit these cases so if we do, we should assume that something went terribly
607                                    // wrong and abort the init.
608                                    PlatformError::UnsupportedFwVersion { version, required } => {
609                                        return Ok(ChipInitResult::ErrorAbort(format!("Required Ethernet Firmware Version: {required}, current version: {version:?}"), backtrace::Backtrace::capture()));
610                                    }
611                                    PlatformError::WrongChipArch {
612                                        actual,
613                                        expected,
614                                        backtrace,
615                                    } => {
616                                        return Ok(ChipInitResult::ErrorAbort(
617                                            format!(
618                                            "expected chip: {expected}, actual detected chip: {actual}"
619                                        ),
620                                            backtrace.0,
621                                        ))
622                                    }
623
624                                    PlatformError::WrongChipArchs {
625                                        actual,
626                                        expected,
627                                        backtrace,
628                                    } => {
629                                        let expected_chips = expected
630                                            .iter()
631                                            .map(|arch| arch.to_string())
632                                            .collect::<Vec<_>>()
633                                            .join(", ");
634                                        return Ok(ChipInitResult::ErrorAbort(
635                                            format!(
636                                                "expected chip: {expected_chips}, actual detected chips: {actual}"
637                                            ),
638                                            backtrace.0,
639                                        ));
640                                    }
641
642                                    PlatformError::Generic(error, backtrace) => {
643                                        return Ok(ChipInitResult::ErrorAbort(error, backtrace.0));
644                                    }
645
646                                    PlatformError::GenericError(error, backtrace) => {
647                                        let err_msg = error.to_string();
648                                        return Ok(ChipInitResult::ErrorAbort(err_msg, backtrace.0));
649                                    }
650                                },
651                            };
                            // Pair each port's wait entry with its training result.
652                            for (eth_status, training_complete) in
653                                status.wait_status.iter_mut().zip(eth_training_status)
654                            {
655                                match eth_status {
656                                    WaitStatus::Waiting(status_string) => {
657                                        if training_complete {
658                                            if let Err(_err) = self.check_ethernet_fw_version() {
659                                                *eth_status = WaitStatus::NotInitialized(
660                                                    EthernetPartialInitError::FwOverwritten,
661                                                );
662                                            } else {
663                                                *eth_status = WaitStatus::JustFinished;
664                                            }
665                                        } else if status.start_time.elapsed() > status.timeout {
666                                            *eth_status = WaitStatus::Timeout(status.timeout);
667                                        } else {
668                                            *status_string = Some(format!(
669                                                "{}: Waiting for initial training to complete",
670                                                self.get_local_chip_coord()?
671                                            ));
672                                        }
673                                    }
674                                    WaitStatus::JustFinished => {
675                                        *eth_status = WaitStatus::Done;
676                                    }
677                                    _ => {}
678                                }
679                            }
680                        }
681                    } else {
682                        let status = &mut status.eth_status;
683                        for eth_status in status.wait_status.iter_mut() {
684                            *eth_status = WaitStatus::Done;
685                        }
686                    }
687                } else {
688                    // If WH UBB - skip ethernet training check
689                    let status = &mut status.eth_status;
690                    for eth_status in status.wait_status.iter_mut() {
691                        *eth_status = WaitStatus::Done;
692                    }
693                }
694            } else {
695                let status = &mut status.eth_status;
696                for eth_status in status.wait_status.iter_mut() {
697                    *eth_status = WaitStatus::NoCheck;
698                }
699            }
700        } else {
701            for eth_status in status.eth_status.wait_status.iter_mut() {
702                if let WaitStatus::Waiting(status_string) = eth_status {
703                    *status_string = Some("Waiting for ARC".to_string());
704                }
705            }
706        }
707
708        {
709            // This is not present in wormhole.
710            let _status = &mut status.cpu_status;
711        }
712
713        Ok(ChipInitResult::NoError)
714    }
715
716    fn get_arch(&self) -> luwen_def::Arch {
717        luwen_def::Arch::Wormhole
718    }
719
720    fn arc_msg(&self, msg: ArcMsgOptions) -> Result<ArcMsgOk, PlatformError> {
721        let (msg_reg, return_reg) = if msg.use_second_mailbox {
722            (2, 4)
723        } else {
724            (5, 3)
725        };
726
727        self.check_arc_msg_safe(msg_reg, return_reg)?;
728
729        crate::arc_msg::arc_msg(
730            self,
731            &msg.msg,
732            msg.wait_for_done,
733            msg.timeout,
734            msg_reg,
735            return_reg,
736            msg.addrs.as_ref().unwrap_or(&self.arc_addrs),
737        )
738    }
739
740    fn get_neighbouring_chips(&self) -> Result<Vec<NeighbouringChip>, crate::error::PlatformError> {
741        const ETH_UNKNOWN: u32 = 0;
742        const ETH_UNCONNECTED: u32 = 1;
743        const ETH_NO_ROUTING: u32 = 2;
744
745        const SHELF_OFFSET: u64 = 9;
746        const RACK_OFFSET: u64 = 10;
747
748        let mut output = Vec::with_capacity(self.eth_locations.len());
749
750        for (
751            eth_id,
752            EthCore {
753                x: eth_x, y: eth_y, ..
754            },
755        ) in self.eth_locations.iter().copied().enumerate()
756        {
757            let port_status = self.arc_if.noc_read32(
758                &self.chip_if,
759                0,
760                eth_x,
761                eth_y,
762                self.eth_addrs.eth_conn_info + (eth_id as u64 * 4),
763            )?;
764
765            if port_status == ETH_UNCONNECTED || port_status == ETH_UNKNOWN {
766                continue;
767            }
768
769            // HACK(drosen): It's not currently possible to route galaxy->nb...
770            // This is a limitation of the current ethernet firmware routing scheme,
771            // but fixing it would require a large-ish firmware update and lots of testing so
772            // for now we are just ignoring those routes.
773
774            // Get the neighbour's board type
775            let next_board_type = self.noc_read32(
776                0,
777                eth_x,
778                eth_y,
779                0x1ec0 + (self.eth_addrs.erisc_remote_board_type_offset * 4),
780            )?;
781
782            // Get the our board type
783            let our_board_type = self.noc_read32(
784                0,
785                eth_x,
786                eth_y,
787                0x1ec0 + (self.eth_addrs.erisc_local_board_type_offset * 4),
788            )?;
789
790            // Check if it's possible to have routing disabled
791            let erisc_routing_disabled =
792                self.noc_read32(0, eth_x, eth_y, self.eth_addrs.boot_params + (19 * 4))? == 1;
793
794            // The board type value will be 0 if galaxy and non-zero if nb
795            // It's currently not possible to go from GALAXY->NB
796            let routing_disabled = (our_board_type == 0 && next_board_type != 0)
797                || (erisc_routing_disabled && port_status == ETH_NO_ROUTING);
798
799            // Decode the remote eth_addr for our erisc core
800            // This can be used to build a map of the full mesh
801            let remote_id = self.noc_read32(
802                0,
803                eth_x,
804                eth_y,
805                self.eth_addrs.node_info + (4 * RACK_OFFSET),
806            )?;
807            let remote_rack_x = remote_id & 0xFF;
808            let remote_rack_y = (remote_id >> 8) & 0xFF;
809
810            let remote_id = self.noc_read32(
811                0,
812                eth_x,
813                eth_y,
814                self.eth_addrs.node_info + (4 * SHELF_OFFSET),
815            )?;
816            let remote_shelf_x = (remote_id >> 16) & 0x3F;
817            let remote_shelf_y = (remote_id >> 22) & 0x3F;
818
819            let remote_noc_x = (remote_id >> 4) & 0x3F;
820            let remote_noc_y = (remote_id >> 10) & 0x3F;
821
822            output.push(NeighbouringChip {
823                routing_enabled: !routing_disabled,
824                local_noc_addr: (eth_x, eth_y),
825                remote_noc_addr: (remote_noc_x as u8, remote_noc_y as u8),
826                eth_addr: EthAddr {
827                    shelf_x: remote_shelf_x as u8,
828                    shelf_y: remote_shelf_y as u8,
829                    rack_x: remote_rack_x as u8,
830                    rack_y: remote_rack_y as u8,
831                },
832            });
833        }
834
835        Ok(output)
836    }
837
838    fn as_any(&self) -> &dyn std::any::Any {
839        self
840    }
841
842    fn get_telemetry(&self) -> Result<super::Telemetry, PlatformError> {
843        let offset: Result<u32, PlatformError> = self
844            .telemetry_addr
845            .get_or_try_init(|| {
846                let result = self.arc_msg(ArcMsgOptions {
847                    msg: ArcMsg::Typed(TypedArcMsg::GetSmbusTelemetryAddr),
848                    ..Default::default()
849                })?;
850
851                let offset = match result {
852                    ArcMsgOk::Ok { arg, .. } => arg,
853                    ArcMsgOk::OkBuf([_, arg, ..]) => arg,
854                    ArcMsgOk::OkNoWait => todo!(),
855                };
856
857                Ok(offset)
858            })
859            .copied();
860
861        let offset = offset?;
862
863        let csm_offset = self.arc_if.axi_translate("ARC_CSM.DATA[0]")?;
864
865        let telemetry_struct_offset = csm_offset.addr + (offset - 0x10000000) as u64;
866        let enum_version = self
867            .arc_if
868            .axi_read32(&self.chip_if, telemetry_struct_offset)?;
869        let device_id = self
870            .arc_if
871            .axi_read32(&self.chip_if, telemetry_struct_offset + 4)?;
872        let asic_ro = self
873            .arc_if
874            .axi_read32(&self.chip_if, telemetry_struct_offset + (2 * 4))?;
875        let asic_idd = self
876            .arc_if
877            .axi_read32(&self.chip_if, telemetry_struct_offset + (3 * 4))?;
878
879        let board_id_high = self
880            .arc_if
881            .axi_read32(&self.chip_if, telemetry_struct_offset + (4 * 4))?;
882        let board_id_low = self
883            .arc_if
884            .axi_read32(&self.chip_if, telemetry_struct_offset + (5 * 4))?;
885        let arc0_fw_version = self
886            .arc_if
887            .axi_read32(&self.chip_if, telemetry_struct_offset + (6 * 4))?;
888        let arc1_fw_version = self
889            .arc_if
890            .axi_read32(&self.chip_if, telemetry_struct_offset + (7 * 4))?;
891        let arc2_fw_version = self
892            .arc_if
893            .axi_read32(&self.chip_if, telemetry_struct_offset + (8 * 4))?;
894        let arc3_fw_version = self
895            .arc_if
896            .axi_read32(&self.chip_if, telemetry_struct_offset + (9 * 4))?;
897        let spibootrom_fw_version = self
898            .arc_if
899            .axi_read32(&self.chip_if, telemetry_struct_offset + (10 * 4))?;
900        let eth_fw_version = self
901            .arc_if
902            .axi_read32(&self.chip_if, telemetry_struct_offset + (11 * 4))?;
903        let m3_bl_fw_version = self
904            .arc_if
905            .axi_read32(&self.chip_if, telemetry_struct_offset + (12 * 4))?;
906        let m3_app_fw_version = self
907            .arc_if
908            .axi_read32(&self.chip_if, telemetry_struct_offset + (13 * 4))?;
909        let ddr_status = self
910            .arc_if
911            .axi_read32(&self.chip_if, telemetry_struct_offset + (14 * 4))?;
912        let eth_status0 = self
913            .arc_if
914            .axi_read32(&self.chip_if, telemetry_struct_offset + (15 * 4))?;
915        let eth_status1 = self
916            .arc_if
917            .axi_read32(&self.chip_if, telemetry_struct_offset + (16 * 4))?;
918        let pcie_status = self
919            .arc_if
920            .axi_read32(&self.chip_if, telemetry_struct_offset + (17 * 4))?;
921        let faults = self
922            .arc_if
923            .axi_read32(&self.chip_if, telemetry_struct_offset + (18 * 4))?;
924        let arc0_health = self
925            .arc_if
926            .axi_read32(&self.chip_if, telemetry_struct_offset + (19 * 4))?;
927        let arc1_health = self
928            .arc_if
929            .axi_read32(&self.chip_if, telemetry_struct_offset + (20 * 4))?;
930        let arc2_health = self
931            .arc_if
932            .axi_read32(&self.chip_if, telemetry_struct_offset + (21 * 4))?;
933        let arc3_health = self
934            .arc_if
935            .axi_read32(&self.chip_if, telemetry_struct_offset + (22 * 4))?;
936        let fan_speed = self
937            .arc_if
938            .axi_read32(&self.chip_if, telemetry_struct_offset + (23 * 4))?;
939        let aiclk = self
940            .arc_if
941            .axi_read32(&self.chip_if, telemetry_struct_offset + (24 * 4))?;
942        let axiclk = self
943            .arc_if
944            .axi_read32(&self.chip_if, telemetry_struct_offset + (25 * 4))?;
945        let arcclk = self
946            .arc_if
947            .axi_read32(&self.chip_if, telemetry_struct_offset + (26 * 4))?;
948        let throttler = self
949            .arc_if
950            .axi_read32(&self.chip_if, telemetry_struct_offset + (27 * 4))?;
951        let vcore = self
952            .arc_if
953            .axi_read32(&self.chip_if, telemetry_struct_offset + (28 * 4))?;
954        let asic_temperature = self
955            .arc_if
956            .axi_read32(&self.chip_if, telemetry_struct_offset + (29 * 4))?;
957        let vreg_temperature = self
958            .arc_if
959            .axi_read32(&self.chip_if, telemetry_struct_offset + (30 * 4))?;
960        let board_temperature = self
961            .arc_if
962            .axi_read32(&self.chip_if, telemetry_struct_offset + (31 * 4))?;
963        let tdp = self
964            .arc_if
965            .axi_read32(&self.chip_if, telemetry_struct_offset + (32 * 4))?;
966        let tdc = self
967            .arc_if
968            .axi_read32(&self.chip_if, telemetry_struct_offset + (33 * 4))?;
969        let vdd_limits = self
970            .arc_if
971            .axi_read32(&self.chip_if, telemetry_struct_offset + (34 * 4))?;
972        let thm_limits = self
973            .arc_if
974            .axi_read32(&self.chip_if, telemetry_struct_offset + (35 * 4))?;
975        let wh_fw_date = self
976            .arc_if
977            .axi_read32(&self.chip_if, telemetry_struct_offset + (36 * 4))?;
978        let asic_tmon0 = self
979            .arc_if
980            .axi_read32(&self.chip_if, telemetry_struct_offset + (37 * 4))?;
981        let asic_tmon1 = self
982            .arc_if
983            .axi_read32(&self.chip_if, telemetry_struct_offset + (38 * 4))?;
984        let mvddq_power = self
985            .arc_if
986            .axi_read32(&self.chip_if, telemetry_struct_offset + (39 * 4))?;
987        let gddr_train_temp0 = self
988            .arc_if
989            .axi_read32(&self.chip_if, telemetry_struct_offset + (40 * 4))?;
990        let gddr_train_temp1 = self
991            .arc_if
992            .axi_read32(&self.chip_if, telemetry_struct_offset + (41 * 4))?;
993        let boot_date = self
994            .arc_if
995            .axi_read32(&self.chip_if, telemetry_struct_offset + (42 * 4))?;
996        let rt_seconds = self
997            .arc_if
998            .axi_read32(&self.chip_if, telemetry_struct_offset + (43 * 4))?;
999        let eth_debug_status0 = self
1000            .arc_if
1001            .axi_read32(&self.chip_if, telemetry_struct_offset + (44 * 4))?;
1002        let eth_debug_status1 = self
1003            .arc_if
1004            .axi_read32(&self.chip_if, telemetry_struct_offset + (45 * 4))?;
1005        let tt_flash_version = self
1006            .arc_if
1007            .axi_read32(&self.chip_if, telemetry_struct_offset + (46 * 4))?;
1008
1009        let threshold: u32 = 0x02190000; // arc fw 2.25.0.0
1010        let fw_bundle_version: u32 = if arc0_fw_version >= threshold {
1011            self.arc_if
1012                .axi_read32(&self.chip_if, telemetry_struct_offset + (49 * 4))?
1013        } else {
1014            0
1015        };
1016
1017        Ok(super::Telemetry {
1018            arch: self.get_arch(),
1019            board_id: ((board_id_high as u64) << 32) | (board_id_low as u64),
1020            enum_version,
1021            device_id,
1022            asic_ro,
1023            asic_idd,
1024            board_id_high,
1025            board_id_low,
1026            arc0_fw_version,
1027            arc1_fw_version,
1028            arc2_fw_version,
1029            arc3_fw_version,
1030            spibootrom_fw_version,
1031            eth_fw_version,
1032            m3_bl_fw_version,
1033            m3_app_fw_version,
1034            ddr_status,
1035            eth_status0,
1036            eth_status1,
1037            pcie_status,
1038            faults,
1039            arc0_health,
1040            arc1_health,
1041            arc2_health,
1042            arc3_health,
1043            fan_speed,
1044            aiclk,
1045            axiclk,
1046            arcclk,
1047            throttler,
1048            vcore,
1049            asic_temperature,
1050            vreg_temperature,
1051            board_temperature,
1052            tdp,
1053            tdc,
1054            vdd_limits,
1055            thm_limits,
1056            wh_fw_date,
1057            asic_tmon0,
1058            asic_tmon1,
1059            mvddq_power,
1060            gddr_train_temp0,
1061            gddr_train_temp1,
1062            boot_date,
1063            rt_seconds,
1064            eth_debug_status0,
1065            eth_debug_status1,
1066            tt_flash_version,
1067            fw_bundle_version,
1068            timer_heartbeat: arc0_health,
1069            ..Default::default()
1070        })
1071    }
1072
1073    fn get_device_info(&self) -> Result<Option<crate::DeviceInfo>, PlatformError> {
1074        if self.is_remote {
1075            Ok(None)
1076        } else {
1077            Ok(self.chip_if.get_device_info()?)
1078        }
1079    }
1080}