Skip to main content

ic_call_chaos/
lib.rs

1//! Chaos testing library for Internet Computer inter-canister calls
2//!
3//! `ic_call_chaos` is a small library to enable testing the resilience of Internet Computer canisters to inter-canister call failures.
4//! It allows you to simulate various failure scenarios, such as dropped, timed out, or rejected calls, to ensure that your canisters
5//! can handle these situations gracefully.
6//!
7//! It is designed to be used in conjunction with the `ic-cdk` library, which provides the necessary tools for building canisters on
8//! the Internet Computer.
9//!
10//! ## Usage
11//!
12//! 1. Import `Call` and friends from `ic_call_chaos` instead of `ic_cdk::call`. The provided interface is the
13//!    same as `ic_cdk::call`, but with additional functionality to simulate failures. The default policy is
14//!    `AllowAll`, which means that all calls will be passed to the underlying `ic_cdk` library.
15//!    You likely want to make the replacement import conditional on a feature flag, so that you don't inherit
16//!    the overhead of (or any bugs in) the wrapper in production.
17//! 1. Provide a way to change the failure policy from tests.
18//! 1. In your tests, apply the desired policy.
19//!
20//! For examples, look at the source of this library, and in particular `canister/src/lib.rs` for an example of how to
21//! add `ic_call_chaos` to your canister code, and `pocket_ic_test/tests/integration_test.rs` for an example of how to
22//! use it in your tests.
23
24use candid::utils::ArgumentEncoder;
25use candid::{CandidType, Principal};
26use ic_cdk::call::{
27    Call as CdkCall, CallFailed, CallFuture as CdkCallFuture, CallPerformFailed, CallRejected,
28    OnewayError, RejectCode, Response,
29};
30use lazy_static::lazy_static;
31use rand::{Rng, SeedableRng};
32use rand_chacha::ChaCha8Rng;
33use std::future::IntoFuture;
34use std::mem;
35use std::pin::Pin;
36use std::sync::Mutex;
37use std::task::Poll;
38
/// A policy that decides, call by call, whether an inter-canister call is allowed to proceed
/// or should be failed artificially.
pub trait Policy: Send + Sync {
    /// Decides whether to allow a call whose response is awaited.
    ///
    /// If `Ok(())` is returned, the underlying `ic_cdk` call is executed. This still doesn't
    /// mean that the call will succeed, as it might actually fail for an arbitrary reason
    /// (e.g., not having enough cycles, system being under load, etc).
    ///
    /// If an error is returned, the chaos library does not execute the `ic_cdk` call and the
    /// error is reported to the caller instead. The policy can, however, execute the call
    /// itself under the hood and still return an error. For example, for bounded-wait calls,
    /// you may return `RejectCode::SysUnknown` and still issue the call, simulating a timeout
    /// in production.
    ///
    /// Returning a `CallFailed::CallRejected` causes the chaos library to delay producing the
    /// rejection by first issuing a dummy call to the management canister. This is done in
    /// order to simulate the time it would take for the call to be rejected in production.
    /// Any other `CallFailed` variant is returned to the caller immediately.
    ///
    /// Note that this takes a mutable reference to the policy, so it can be used to maintain
    /// state if needed (e.g., drop the first `N` calls, and then allow all calls to go through).
    fn allow(&mut self, call: &Call) -> Result<(), CallFailed>;

    /// Decides whether to allow a one-way call.
    ///
    /// If `Ok(())` is returned, the underlying `ic_cdk` call is executed. This still doesn't
    /// mean that the call will succeed, as it might fail for an arbitrary reason (e.g., not
    /// having enough cycles, system being under load, etc).
    ///
    /// If an error is returned, the `ic_cdk` call is not executed. To allow simulating the
    /// call failing silently, the error is an `Option<OnewayError>`:
    ///
    /// * `Err(None)` means that the call shouldn't be executed, but no error should be
    ///   returned to the caller either.
    /// * `Err(Some(error))` means that the call shouldn't be executed and `error` should be
    ///   returned to the caller.
    ///
    /// Note that this takes a mutable reference to the policy, so it can be used to maintain
    /// state if needed (e.g., drop the first `N` calls, and then allow all calls to go through).
    fn allow_oneway(&mut self, call: &Call) -> Result<(), Option<OnewayError>>;
}
75
76/// A simple policy that allows all calls.
77#[derive(Default)]
78pub struct AllowAll {}
79
80impl Policy for AllowAll {
81    fn allow(&mut self, _call: &Call) -> Result<(), CallFailed> {
82        Ok(())
83    }
84
85    fn allow_oneway(&mut self, _call: &Call) -> Result<(), Option<OnewayError>> {
86        Ok(())
87    }
88}
89
90/// A simple policy that denies all calls, returning a `SysTransient` reject code.
91#[derive(Default)]
92pub struct DenyAll {}
93
94impl Policy for DenyAll {
95    fn allow(&mut self, _call: &Call) -> Result<(), CallFailed> {
96        Err(CallFailed::CallRejected(CallRejected::with_rejection(
97            RejectCode::SysTransient as u32,
98            "Chaos testing: call rejected".to_string(),
99        )))
100    }
101
102    fn allow_oneway(&mut self, _call: &Call) -> Result<(), Option<OnewayError>> {
103        Err(Some(CallPerformFailed.into()))
104    }
105}
106
107/// A simple policy that fails every other call with a `SysTransient` reject code.
108#[derive(Default)]
109pub struct AllowEveryOther {
110    pub allow_next: bool,
111}
112
113impl Policy for AllowEveryOther {
114    fn allow(&mut self, _call: &Call) -> Result<(), CallFailed> {
115        self.allow_next = !self.allow_next;
116        if !self.allow_next {
117            Ok(())
118        } else {
119            Err(CallFailed::CallRejected(CallRejected::with_rejection(
120                RejectCode::SysTransient as u32,
121                "Chaos testing: call rejected".to_string(),
122            )))
123        }
124    }
125
126    fn allow_oneway(&mut self, _call: &Call) -> Result<(), Option<OnewayError>> {
127        self.allow_next = !self.allow_next;
128        if !self.allow_next {
129            Ok(())
130        } else {
131            Err(Some(CallPerformFailed.into()))
132        }
133    }
134}
135
136/// A policy that allows calls with a given probability. The probability is a float between 0 and 1.
137/// For bounded wait calls, if `silently_perform_bounded_wait_calls` is set to true, bounded-wat calls
138/// A will be executed though a reject code (`SysUnknown`) will be returned. This is useful for
139/// simulating timeouts in production.
140pub struct WithProbability {
141    probability: f32,
142    silently_perform_bounded_wait_calls: bool,
143    rng: ChaCha8Rng,
144}
145
146impl WithProbability {
147    /// Create a new `WithProbability` policy with the given probability and seed.
148    /// 
149    /// # Arguments
150    /// 
151    /// * `probability` - A float between 0 and 1 representing the probability of allowing calls.
152    /// * `seed` - A u64 seed for the random number generator.
153    /// * `silently_perform_bounded_wait_calls` - A boolean indicating whether to silently perform bounded-wait calls even if they are reported as rejected (with a `SysUnknown` reject code).
154    pub fn new(probability: f32, seed: u64, silently_perform_bounded_wait_calls: bool) -> Self {
155        assert!(probability >= 0.0, "Probability should be >= 0");
156        assert!(probability <= 1.0, "Probability should be <= 1");
157        Self {
158            probability,
159            silently_perform_bounded_wait_calls,
160            rng: ChaCha8Rng::seed_from_u64(seed),
161        }
162    }
163}
164
165impl Policy for WithProbability {
166    fn allow(&mut self, call: &Call) -> Result<(), CallFailed> {
167        let allow = self.rng.random::<f32>() < self.probability;
168        if allow {
169            Ok(())
170        } else if call.call_type == CallType::BoundedWait
171            && self.silently_perform_bounded_wait_calls
172        {
173            let _res = call.call.oneway();
174            Err(CallFailed::CallRejected(CallRejected::with_rejection(
175                RejectCode::SysUnknown as u32,
176                "Chaos testing: timing call out".to_string(),
177            )))
178        } else {
179            Err(CallFailed::CallRejected(CallRejected::with_rejection(
180                RejectCode::SysTransient as u32,
181                "Chaos testing: call rejected".to_string(),
182            )))
183        }
184    }
185
186    fn allow_oneway(&mut self, call: &Call) -> Result<(), Option<OnewayError>> {
187        let allow = self.rng.random::<f32>() < self.probability;
188        if allow {
189            Ok(())
190        } else if call.call_type == CallType::BoundedWait
191            && self.silently_perform_bounded_wait_calls
192        {
193            Err(None)
194        } else {
195            Err(Some(CallPerformFailed.into()))
196        }
197    }
198}
199
// The failure policy consulted by every `Call` before it is sent. It starts out as
// `AllowAll` and is replaced through `set_policy`.
lazy_static! {
    static ref POLICY: Mutex<Box<dyn Policy>> = Mutex::new(Box::new(AllowAll::default()));
}
203
204pub fn set_policy<P: Policy + 'static>(policy: P) -> () {
205    let mut guard = POLICY
206        .lock()
207        .expect("Couldn't lock the policy mutex when setting the policy");
208    *guard = Box::new(policy);
209}
210
/// The kind of inter-canister call a [`Call`] represents, as determined by the constructor
/// that created it. Policies can inspect it to treat the two kinds differently.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CallType {
    /// The call was created with `Call::bounded_wait`.
    BoundedWait,
    /// The call was created with `Call::unbounded_wait`.
    UnboundedWait,
}
216
/// A wrapper around `ic_cdk::call::Call` that enables "chaos testing" by failing calls
/// according to a policy set by `set_policy`. It's implemented as a drop-in replacement
/// for `ic_cdk::call::Call`, so it can be used in lieu of it by simply changing imports.
/// See the documentation on `ic_cdk::call::Call` for more details on the individual methods.
#[derive(Clone, Debug)]
pub struct Call<'m, 'a> {
    /// The canister the call is addressed to.
    pub canister_id: Principal,
    /// The name of the method being called.
    pub method: &'m str,
    /// Whether this is a bounded-wait or an unbounded-wait call.
    pub call_type: CallType,
    // The wrapped `ic_cdk` call that actually gets sent when the policy allows it.
    call: CdkCall<'m, 'a>,
}
228
229impl<'m> Call<'m, '_> {
230    pub fn bounded_wait(canister_id: Principal, method: &'m str) -> Self {
231        Call {
232            canister_id,
233            method,
234            call_type: CallType::BoundedWait,
235            call: CdkCall::bounded_wait(canister_id, method),
236        }
237    }
238
239    pub fn unbounded_wait(canister_id: Principal, method: &'m str) -> Self {
240        Call {
241            canister_id,
242            method,
243            call_type: CallType::UnboundedWait,
244            call: CdkCall::unbounded_wait(canister_id, method),
245        }
246    }
247}
248
249impl<'a> Call<'_, 'a> {
250    pub fn with_arg<T: CandidType>(self, arg: &T) -> Self {
251        Self {
252            call: self.call.with_arg(arg),
253            ..self
254        }
255    }
256
257    pub fn with_args<A: ArgumentEncoder>(self, args: &A) -> Self {
258        Self {
259            call: self.call.with_args(args),
260            ..self
261        }
262    }
263
264    pub fn with_raw_args(self, raw_args: &'a [u8]) -> Self {
265        Self {
266            call: self.call.with_raw_args(raw_args),
267            ..self
268        }
269    }
270
271    pub fn with_cycles(mut self, cycles: u128) -> Self {
272        self.call = self.call.with_cycles(cycles);
273        self
274    }
275
276    pub fn change_timeout(mut self, timeout_seconds: u32) -> Self {
277        self.call = self.call.change_timeout(timeout_seconds);
278        self
279    }
280
281    pub fn get_cost(&self) -> u128 {
282        self.call.get_cost()
283    }
284}
285
286impl Call<'_, '_> {
287    /// Sends the call and ignores the reply.
288    pub fn oneway(&self) -> Result<(), OnewayError> {
289        let mut policy = POLICY
290            .lock()
291            .expect("Couldn't lock the policy mutex when sending a one-way call");
292        match policy.allow_oneway(self) {
293            Ok(_) => self.call.oneway(),
294            Err(None) =>
295            // Don't execute the call, but don't return an error either
296            {
297                Ok(())
298            }
299            Err(Some(err)) => Err(err),
300        }
301    }
302}
303
// The states a `CallFuture` moves through while it is being polled.
enum CallFutureState<'m, 'a> {
    // The policy rejected the call and a dummy management canister call has been issued to
    // simulate the passage of time that would happen when an asynchronous reject happens in
    // reality. The stored error is what gets returned on the next poll.
    Rejected(CallFailed),
    // The call has been allowed, and we're waiting for the result of the underlying
    // `ic_cdk` future.
    Allowed(CdkCallFuture<'m, 'a>),
    // The policy hasn't been applied yet, so this is before awaiting (i.e., before the first
    // poll).
    Outstanding(Call<'m, 'a>),
    // We've already returned a `Poll::Ready`. We shouldn't get polled again.
    Completed,
}
316
/// The future obtained by awaiting a [`Call`]. It applies the installed policy on its first
/// poll and resolves to `Result<Response, CallFailed>`.
pub struct CallFuture<'m, 'a> {
    // Where the future currently is in its lifecycle.
    state: CallFutureState<'m, 'a>,
}
320
321impl<'m, 'a> IntoFuture for Call<'m, 'a> {
322    type IntoFuture = CallFuture<'m, 'a>;
323    type Output = Result<Response, CallFailed>;
324
325    fn into_future(self) -> Self::IntoFuture {
326        CallFuture {
327            state: CallFutureState::Outstanding(self),
328        }
329    }
330}
331
332impl std::future::Future for CallFuture<'_, '_> {
333    type Output = Result<Response, CallFailed>;
334
335    fn poll(
336        self: Pin<&mut Self>,
337        context: &mut std::task::Context<'_>,
338    ) -> std::task::Poll<Self::Output> {
339        let fut = Pin::into_inner(self);
340        let (mut cdk_fut, opt_err) = match fut.state {
341            CallFutureState::Completed => {
342                panic!("CallFuture is already completed, it shouldn't be polled again")
343            }
344            CallFutureState::Outstanding(ref mut call) => {
345                let mut policy = POLICY
346                    .lock()
347                    .expect("Couldn't lock the policy mutex when sending a call");
348                match policy.allow(&call) {
349                    Ok(()) => {
350                        let call = call.clone();
351                        (call.call.into_future(), None)
352                    }
353                    Err(call_failed) => {
354                        match call_failed {
355                            CallFailed::CallRejected(_) => {
356                                // If the call was rejected, we need to wait for a dummy management canister call
357                                // to finish, in order to simulate the passage of time in the current call context.
358                                let err = call_failed.clone();
359                                let cdk_fut = CdkCall::bounded_wait(
360                                    Principal::management_canister(),
361                                    "canister_info",
362                                )
363                                .with_arg(ic_cdk::management_canister::CanisterInfoArgs {
364                                    canister_id: ic_cdk::api::canister_self(),
365                                    num_requested_changes: None,
366                                })
367                                .into_future();
368                                (cdk_fut, Some(err))
369                            }
370                            _ => {
371                                // The policy failed the call synchronously, just return the result
372                                let err = call_failed.clone();
373                                fut.state = CallFutureState::Completed;
374                                return Poll::Ready(Err(err));
375                            }
376                        }
377                    }
378                }
379            }
380            CallFutureState::Allowed(ref mut cdk_fut) => {
381                // Replace with something dummy to take ownership
382                let mut cdk_fut = mem::replace(
383                    cdk_fut,
384                    CdkCall::bounded_wait(Principal::anonymous(), "nothing").into_future(),
385                );
386                fut.state = CallFutureState::Completed;
387                return Pin::new(&mut cdk_fut).poll(context);
388            }
389            CallFutureState::Rejected(ref call_failed) => {
390                let err = call_failed.clone();
391                fut.state = CallFutureState::Completed;
392                return Poll::Ready(Err(err));
393            }
394        };
395        let res = Pin::new(&mut cdk_fut).poll(context);
396        match opt_err {
397            Some(err) => fut.state = CallFutureState::Rejected(err),
398            None => fut.state = CallFutureState::Allowed(cdk_fut),
399        }
400        res
401    }
402}