Skip to main content

ic_safe_upgrades/
lib.rs

1use candid::Principal;
2use ic_call_retry::{
3    call_idempotent_method_with_retry, call_nonidempotent_method_with_retry,
4    when_out_of_time_or_stopping, Deadline, ErrorCause, RetryError,
5};
6use ic_cdk::api::canister_self;
7use ic_cdk::call::CallErrorExt;
8use ic_cdk::management_canister::InstallChunkedCodeArgs;
9use ic_cdk::management_canister::{
10    CanisterInfoArgs, CanisterInfoResult, CanisterInstallMode, ChunkHash, ClearChunkStoreArgs,
11    InstallCodeArgs, UploadChunkArgs,
12};
13use ic_management_canister_types::{
14    ChangeDetails, ChangeOrigin, StartCanisterArgs, StopCanisterArgs,
15};
16use sha2::{Digest, Sha256};
17
18#[cfg(feature = "use_call_chaos")]
19use ic_call_chaos::Call;
20#[cfg(not(feature = "use_call_chaos"))]
21use ic_cdk::call::Call;
22
23/// Represents a canister's principal ID on the IC.
24pub type CanisterId = Principal;
25
/// Identifies the step of [`upgrade_canister`] during which an error occurred,
/// or after which the outcome could no longer be confirmed.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UpgradeStage {
    /// Stopping the target canister.
    Stopping,
    /// Reading the target's change count via `canister_info` before installing.
    ObtainingInfo,
    /// Installing the new WASM, including the follow-up canister-history check.
    Installing,
    /// Starting the target canister again after the installation.
    Starting,
}
35
36#[derive(Debug, Clone)]
37pub enum UpgradeErrorReason {
38    RetryError(RetryError),
39    ConcurrentChangeDetected,
40}
41
42/// Errors returned by `upgrade_canister`.
43#[derive(Debug, Clone)]
44pub struct UpgradeError {
45    pub stage: UpgradeStage,
46    pub reason: UpgradeErrorReason,
47}
48
49/// Holds the meta-information needed for a chunked WASM install.
50#[derive(Debug, Clone)]
51pub struct ChunkedModule {
52    /// SHA-256 hash of the entire WASM to be installed.
53    pub wasm_module_hash: Vec<u8>,
54
55    /// The canister storing the chunks (must be on the same subnet).
56    pub store_canister_id: CanisterId,
57
58    /// The list of chunk hashes that compose the WASM.
59    pub chunk_hashes_list: Vec<Vec<u8>>,
60}
61
62/// The WASM to be installed.
63#[derive(Debug, Clone)]
64pub enum WasmModule {
65    /// A module < 2MB that can be installed in a single message
66    Bytes(Vec<u8>),
67    /// A module > 2MB that must be installed in chunks. Chunks are assumed to already have been uploaded.
68    ChunkedModule(ChunkedModule),
69}
70
/// Outcome of comparing the target's canister history against the state
/// recorded before an install attempt.
enum VersionChangeCheck {
    /// Nothing changed: the install did not go through and may be retried.
    NoChange,
    /// Exactly the expected change was recorded: the install went through.
    UpgradeSucceeded,
    /// Something other than the expected change was recorded; the install
    /// must not be retried.
    ConcurrentChangeDetected,
}
79
80async fn version_change_check(
81    target_id: CanisterId,
82    wasm_module: &WasmModule,
83    old_version: u64,
84    stop_trying: &mut impl FnMut() -> bool,
85) -> Result<VersionChangeCheck, RetryError> {
86    let (new_version, mut recent_changes) =
87        bounded_wait_canister_info(target_id, Some(1), stop_trying)
88            .await
89            .map(|info| (info.total_num_changes, info.recent_changes))?;
90    let last_change = if let Some(change) = recent_changes.pop() {
91        change
92    } else {
93        // We asked for one recent change, and there really should be at least one,
94        // since we're in the process of upgrading the canister. So there not being
95        // a change should be unreachable, but possibly some very weird concurrent
96        // changes are going on, so we can report that.
97        return Ok(VersionChangeCheck::ConcurrentChangeDetected);
98    };
99    match (
100        new_version - old_version,
101        last_change.details,
102        last_change.origin,
103    ) {
104        (0, _, _) => Ok(VersionChangeCheck::NoChange),
105        (1, ChangeDetails::CodeDeployment(dep), ChangeOrigin::FromCanister(rec))
106            if rec.canister_id == canister_self() =>
107        {
108            let expected_hash: Vec<u8> = match wasm_module {
109                WasmModule::Bytes(ref wasm_bytes) => Sha256::digest(wasm_bytes).to_vec(),
110                WasmModule::ChunkedModule(ref chunked) => chunked.wasm_module_hash.clone(),
111            };
112            if dep.module_hash != expected_hash {
113                Ok(VersionChangeCheck::ConcurrentChangeDetected)
114            } else {
115                Ok(VersionChangeCheck::UpgradeSucceeded)
116            }
117        }
118        (_, _, _) => Ok(VersionChangeCheck::ConcurrentChangeDetected),
119    }
120}
121
122/// Safely upgrade a canister to a new version, without blocking the caller from
123/// being upgraded itself.
124///
125/// Stops, installs, and then restarts the target canister.
126/// Uses bounded-wait calls under the hood, ensuring that the caller isn't blocked
127/// from upgrading itself due to open call contexts.
128/// It retries any failed calls until the `stop_trying` function returns true.
129/// See the `ic-call-retry` crate for sample functions.
130///
131/// In corner cases, it may be unknown whether the upgrade succeeded (as indicated by the
132/// `StatusUnknown` return variant).
133///
134/// Note that this function cannot protect against concurrent upgrades of the target canister.
135/// While it can detect concurrent updates in some cases (and return an error), the detection
136/// is not bulletproof. It's the caller's responsibility to ensure that they are the sole
137/// initiator of target canister upgrades, and that this function is not called multiple times in
138/// parallel.
139///
140/// # Procedure
141///
142/// 1. **Stop** the canister C via a bounded-wait call (`SysUnknown` => retry).
143///    - Because `stop_canister` is idempotent, we can safely retry until definite success.
144/// 2. **Obtain** the current version (`canister_info`) to record the old WASM hash and canister
145///    version.
146/// 3. **Upgrade** the canister. If `SysUnknown` is returned, call `canister_info` again:
147///    - If the canister's version changed by 1 and the hash is the expected one, we know the upgrade went through.
148///    - If not, we retry or eventually give up as `StatusUnknown`.
149/// 4. **Start** the canister again, also with bounded-wait calls.
150///
151/// # Returns
152/// * `Ok(())` if we can confirm a successful upgrade.
153/// * `Err(UpgradeError::UpgradeFailed(...))` if the upgrade failed definitively.
154/// * `Err(UpgradeError::StatusUnknown(...))` if we cannot confirm success or failure.
155pub async fn upgrade_canister<P>(
156    target_id: CanisterId,
157    wasm_module: WasmModule,
158    arg: Vec<u8>,
159    stop_trying: &mut P,
160) -> Result<(), UpgradeError>
161where
162    P: FnMut() -> bool,
163{
164    // Converts a `BestEffortError` into an `UpgradeError` at a given stage.
165    let add_stage = |stage: UpgradeStage| {
166        move |error: RetryError| UpgradeError {
167            stage,
168            reason: UpgradeErrorReason::RetryError(error),
169        }
170    };
171
172    // 1) Stop the canister (bounded-wait).
173    bounded_wait_stop(target_id, stop_trying)
174        .await
175        .map_err(add_stage(UpgradeStage::Stopping))?;
176
177    // 2) Query the current canister version for reference.
178    let version = bounded_wait_canister_info(target_id, None, stop_trying)
179        .await
180        .map(|info| info.total_num_changes)
181        .map_err(add_stage(UpgradeStage::ObtainingInfo))?;
182
183    // 3) Install (upgrade) the new WASM. Loop until success or timeout. We can't retry directly
184    // here if we don't know what happened, since installation isn't idempotent. Instead, use the
185    // version number to determine if the upgrade went through.
186    loop {
187        let install_result = match wasm_module {
188            WasmModule::Bytes(ref wasm_bytes) => {
189                bounded_wait_install_single_chunk(target_id, wasm_bytes, &arg, stop_trying).await
190            }
191            WasmModule::ChunkedModule(ref chunked) => {
192                bounded_wait_install_chunked(target_id, chunked, &arg, stop_trying).await
193            }
194        };
195
196        match install_result {
197            Ok(()) => break,
198            // Note that for installation, unretriable errors include `SysUnknown`
199            // Try to figure out what happened using the version and retry if the version
200            // hasn't moved
201            Err(RetryError::StatusUnknown(ErrorCause::CallFailed(rejection)))
202                if !rejection.is_clean_reject() =>
203            {
204                let version_check_result =
205                    version_change_check(target_id, &wasm_module, version, stop_trying)
206                        .await
207                        .map_err(add_stage(UpgradeStage::Installing))?;
208
209                match version_check_result {
210                    VersionChangeCheck::NoChange => {
211                        ic_cdk::println!(
212                            "Failed to upgrade {:?} and the version hasn't moved, retrying",
213                            target_id
214                        );
215                        continue;
216                    }
217                    VersionChangeCheck::UpgradeSucceeded => {
218                        break;
219                    }
220                    VersionChangeCheck::ConcurrentChangeDetected => {
221                        return Err(UpgradeError {
222                            stage: UpgradeStage::Installing,
223                            reason: UpgradeErrorReason::ConcurrentChangeDetected,
224                        });
225                    }
226                }
227            }
228            Err(error) => return Err(add_stage(UpgradeStage::Installing)(error)),
229        }
230    }
231
232    bounded_wait_start(target_id, stop_trying)
233        .await
234        .map_err(add_stage(UpgradeStage::Starting))
235}
236
237/// Stop a canister with best-effort calls until success or timeout.
238async fn bounded_wait_stop<P>(target_id: Principal, stop_trying: &mut P) -> Result<(), RetryError>
239where
240    P: FnMut() -> bool,
241{
242    let args = StopCanisterArgs {
243        canister_id: target_id,
244    };
245    Ok(call_idempotent_method_with_retry(
246        Call::bounded_wait(Principal::management_canister(), "stop_canister").with_arg(&args),
247        stop_trying,
248    )
249    .await?
250    .candid()
251    .unwrap())
252}
253
254/// Start a canister with best-effort calls until success or timeout.
255async fn bounded_wait_start<P>(target_id: CanisterId, stop_trying: &mut P) -> Result<(), RetryError>
256where
257    P: FnMut() -> bool,
258{
259    let args = StartCanisterArgs {
260        canister_id: target_id,
261    };
262    Ok(call_idempotent_method_with_retry(
263        Call::bounded_wait(Principal::management_canister(), "start_canister").with_arg(&args),
264        stop_trying,
265    )
266    .await?
267    .candid()
268    .unwrap())
269}
270
271/// Retrieve canister info (including module hash) with best-effort calls.
272async fn bounded_wait_canister_info<P>(
273    target_id: CanisterId,
274    num_requested_changes: Option<u64>,
275    stop_trying: &mut P,
276) -> Result<CanisterInfoResult, RetryError>
277where
278    P: FnMut() -> bool,
279{
280    let arg = CanisterInfoArgs {
281        canister_id: target_id,
282        num_requested_changes,
283    };
284
285    Ok(call_idempotent_method_with_retry(
286        Call::bounded_wait(Principal::management_canister(), "canister_info").with_arg(&arg),
287        stop_trying,
288    )
289    .await?
290    .candid()
291    .unwrap())
292}
293
294/// Install a small (<2MB) WASM in a single call via `install_code`.
295/// Since code installation isn't idempotent, we don't just retry on `SysUnknown`.
296/// Rather, we leave it up to the caller to handle.
297async fn bounded_wait_install_single_chunk<P>(
298    target_id: CanisterId,
299    wasm_bytes: &[u8],
300    arg: &[u8],
301    stop_trying: &mut P,
302) -> Result<(), RetryError>
303where
304    P: FnMut() -> bool,
305{
306    let install_args = InstallCodeArgs {
307        mode: CanisterInstallMode::Upgrade(None),
308        canister_id: target_id,
309        wasm_module: wasm_bytes.to_vec(),
310        arg: arg.to_vec(),
311    };
312
313    Ok(call_nonidempotent_method_with_retry(
314        Call::bounded_wait(Principal::management_canister(), "install_code")
315            .with_arg(&install_args),
316        stop_trying,
317    )
318    .await?
319    .candid()
320    .expect("Candid decoding failed"))
321}
322
323#[allow(dead_code)]
324async fn upload_chunks(
325    store_canister_id: CanisterId,
326    chunks: Vec<Vec<u8>>,
327    deadline: &Deadline,
328) -> Result<(), RetryError> {
329    let call = Call::bounded_wait(Principal::management_canister(), "clear_chunk_store").with_arg(
330        &ClearChunkStoreArgs {
331            canister_id: store_canister_id,
332        },
333    );
334
335    let mut retry_fn = when_out_of_time_or_stopping(deadline);
336    let _: () = call_idempotent_method_with_retry(call, &mut retry_fn)
337        .await?
338        .candid()
339        .unwrap();
340
341    for chunk in chunks {
342        let chunk_install_args = UploadChunkArgs {
343            canister_id: store_canister_id,
344            chunk,
345        };
346
347        let call = Call::bounded_wait(Principal::management_canister(), "upload_chunk")
348            .with_arg(&chunk_install_args);
349        let mut retry_fn = when_out_of_time_or_stopping(deadline);
350        let _: () = call_idempotent_method_with_retry(call, &mut retry_fn)
351            .await?
352            .candid()
353            .unwrap();
354    }
355    Ok(())
356}
357
358/// Install a large (>2MB) WASM by referencing pre-uploaded chunks, via `install_chunked_code`.
359/// Chunks are assumed to already have been uploaded
360async fn bounded_wait_install_chunked<P>(
361    target_id: CanisterId,
362    chunked: &ChunkedModule,
363    arg: &[u8],
364    stop_trying: &mut P,
365) -> Result<(), RetryError>
366where
367    P: FnMut() -> bool,
368{
369    let install_args = InstallChunkedCodeArgs {
370        mode: CanisterInstallMode::Upgrade(None),
371        target_canister: target_id,
372        store_canister: Some(chunked.store_canister_id),
373        chunk_hashes_list: chunked
374            .chunk_hashes_list
375            .iter()
376            .map(|hash| ChunkHash { hash: hash.clone() })
377            .collect(),
378        wasm_module_hash: chunked.wasm_module_hash.clone(),
379        arg: arg.to_vec(),
380    };
381
382    let install_call = Call::bounded_wait(Principal::management_canister(), "install_chunked_code")
383        .with_arg(&install_args);
384    let res = call_nonidempotent_method_with_retry(install_call, stop_trying).await?;
385    Ok(res.candid().unwrap())
386}