ic_call_chaos/lib.rs
1//! Chaos testing library for Internet Computer inter-canister calls
2//!
3//! `ic_call_chaos` is a small library to enable testing the resilience of Internet Computer canisters to inter-canister call failures.
4//! It allows you to simulate various failure scenarios, such as dropped, timed out, or rejected calls, to ensure that your canisters
5//! can handle these situations gracefully.
6//!
7//! It is designed to be used in conjunction with the `ic-cdk` library, which provides the necessary tools for building canisters on
8//! the Internet Computer.
9//!
10//! ## Usage
11//!
12//! 1. Import `Call` and friends from `ic_call_chaos` instead of `ic_cdk::call`. The provided interface is the
13//! same as `ic_cdk::call`, but with additional functionality to simulate failures. The default policy is
14//! `AllowAll`, which means that all calls will be passed to the underlying `ic_cdk` library.
15//! You likely want to make the replacement import conditional on a feature flag, so that you don't inherit
16//! the overhead of (or any bugs in) the wrapper in production.
17//! 1. Provide a way to change the failure policy from tests.
18//! 1. In your tests, apply the desired policy.
19//!
20//! For examples, look at the source of this library, and in particular `canister/src/lib.rs` for an example of how to
21//! add `ic_call_chaos` to your canister code, and `pocket_ic_test/tests/integration_test.rs` for an example of how to
22//! use it in your tests.
23
24use candid::utils::ArgumentEncoder;
25use candid::{CandidType, Principal};
26use ic_cdk::call::{
27 Call as CdkCall, CallFailed, CallFuture as CdkCallFuture, CallPerformFailed, CallRejected,
28 OnewayError, RejectCode, Response,
29};
30use lazy_static::lazy_static;
31use rand::{Rng, SeedableRng};
32use rand_chacha::ChaCha8Rng;
33use std::future::IntoFuture;
34use std::mem;
35use std::pin::Pin;
36use std::sync::Mutex;
37use std::task::Poll;
38
39/// A trait that defines a policy for allowing or rejecting calls.
40pub trait Policy: Send + Sync {
41 /// Whether to allow a call.
42 ///
43 /// If the call is allowed, the ic_cdk call will be executed. This still doesn't mean that the
44 /// call will succeed, as it might actually fail for an arbitrary reason (e.g., not having enough cycles,
45 /// system being under load, etc).
46 ///
47 /// If the call is not allowed, the ic_cdk call will not be executed. The policy can, however,
48 /// execute the call under the hood and still return an error. For example, for bounded-wait
49 /// calls, you may return `RejectCode::SysUnknown` and still issue the call, simulating a
50 /// timeout in production. Returning a `CallFailed::CallRejected` will cause the chaos library
51 /// to actually delay producing the rejection, by calling a no-op on the management canister.
52 /// This is done in order to simulate the time it would take for the call to be rejected in
53 /// production.
54 ///
55 /// Note that this takes a mutable reference to the policy, so it can be used to maintain state
56 /// if needed (e.g., drop the first `N` calls, and then allow all calls to go through)
57 fn allow(&mut self, call: &Call) -> Result<(), CallFailed>;
58
59 /// Whether to allow a one-way call.
60 ///
61 /// If the call is allowed, the ic_cdk call will be executed. This still doesn't mean that the
62 /// call will succeed, as it might fail for an arbitrary reason (e.g., not having enough cycles,
63 /// system being under load, etc).
64 ///
65 /// If the call is not allowed, the ic_cdk call will not be executed. To allow simulating the
66 /// call failing silently, the error is returned as an `Option<OnewayError>`. An error of
67 /// `None` means that the call shouldn't be executed but no error should be returned either. An
68 /// error of `Some(OnewayError)` means that the call shouldn't be executed and the error should
69 /// be returned to the caller.
70 ///
71 /// Note that this takes a mutable reference to the policy, so it can be used to maintain state
72 /// if needed (e.g., drop the first `N` calls, and then allow all calls to go through)
73 fn allow_oneway(&mut self, call: &Call) -> Result<(), Option<OnewayError>>;
74}
75
76/// A simple policy that allows all calls.
77#[derive(Default)]
78pub struct AllowAll {}
79
80impl Policy for AllowAll {
81 fn allow(&mut self, _call: &Call) -> Result<(), CallFailed> {
82 Ok(())
83 }
84
85 fn allow_oneway(&mut self, _call: &Call) -> Result<(), Option<OnewayError>> {
86 Ok(())
87 }
88}
89
90/// A simple policy that denies all calls, returning a `SysTransient` reject code.
91#[derive(Default)]
92pub struct DenyAll {}
93
94impl Policy for DenyAll {
95 fn allow(&mut self, _call: &Call) -> Result<(), CallFailed> {
96 Err(CallFailed::CallRejected(CallRejected::with_rejection(
97 RejectCode::SysTransient as u32,
98 "Chaos testing: call rejected".to_string(),
99 )))
100 }
101
102 fn allow_oneway(&mut self, _call: &Call) -> Result<(), Option<OnewayError>> {
103 Err(Some(CallPerformFailed.into()))
104 }
105}
106
107/// A simple policy that fails every other call with a `SysTransient` reject code.
108#[derive(Default)]
109pub struct AllowEveryOther {
110 pub allow_next: bool,
111}
112
113impl Policy for AllowEveryOther {
114 fn allow(&mut self, _call: &Call) -> Result<(), CallFailed> {
115 self.allow_next = !self.allow_next;
116 if !self.allow_next {
117 Ok(())
118 } else {
119 Err(CallFailed::CallRejected(CallRejected::with_rejection(
120 RejectCode::SysTransient as u32,
121 "Chaos testing: call rejected".to_string(),
122 )))
123 }
124 }
125
126 fn allow_oneway(&mut self, _call: &Call) -> Result<(), Option<OnewayError>> {
127 self.allow_next = !self.allow_next;
128 if !self.allow_next {
129 Ok(())
130 } else {
131 Err(Some(CallPerformFailed.into()))
132 }
133 }
134}
135
136/// A policy that allows calls with a given probability. The probability is a float between 0 and 1.
137/// For bounded wait calls, if `silently_perform_bounded_wait_calls` is set to true, bounded-wat calls
138/// A will be executed though a reject code (`SysUnknown`) will be returned. This is useful for
139/// simulating timeouts in production.
140pub struct WithProbability {
141 probability: f32,
142 silently_perform_bounded_wait_calls: bool,
143 rng: ChaCha8Rng,
144}
145
146impl WithProbability {
147 /// Create a new `WithProbability` policy with the given probability and seed.
148 ///
149 /// # Arguments
150 ///
151 /// * `probability` - A float between 0 and 1 representing the probability of allowing calls.
152 /// * `seed` - A u64 seed for the random number generator.
153 /// * `silently_perform_bounded_wait_calls` - A boolean indicating whether to silently perform bounded-wait calls even if they are reported as rejected (with a `SysUnknown` reject code).
154 pub fn new(probability: f32, seed: u64, silently_perform_bounded_wait_calls: bool) -> Self {
155 assert!(probability >= 0.0, "Probability should be >= 0");
156 assert!(probability <= 1.0, "Probability should be <= 1");
157 Self {
158 probability,
159 silently_perform_bounded_wait_calls,
160 rng: ChaCha8Rng::seed_from_u64(seed),
161 }
162 }
163}
164
165impl Policy for WithProbability {
166 fn allow(&mut self, call: &Call) -> Result<(), CallFailed> {
167 let allow = self.rng.random::<f32>() < self.probability;
168 if allow {
169 Ok(())
170 } else if call.call_type == CallType::BoundedWait
171 && self.silently_perform_bounded_wait_calls
172 {
173 let _res = call.call.oneway();
174 Err(CallFailed::CallRejected(CallRejected::with_rejection(
175 RejectCode::SysUnknown as u32,
176 "Chaos testing: timing call out".to_string(),
177 )))
178 } else {
179 Err(CallFailed::CallRejected(CallRejected::with_rejection(
180 RejectCode::SysTransient as u32,
181 "Chaos testing: call rejected".to_string(),
182 )))
183 }
184 }
185
186 fn allow_oneway(&mut self, call: &Call) -> Result<(), Option<OnewayError>> {
187 let allow = self.rng.random::<f32>() < self.probability;
188 if allow {
189 Ok(())
190 } else if call.call_type == CallType::BoundedWait
191 && self.silently_perform_bounded_wait_calls
192 {
193 Err(None)
194 } else {
195 Err(Some(CallPerformFailed.into()))
196 }
197 }
198}
199
200lazy_static! {
201 static ref POLICY: Mutex<Box<dyn Policy>> = Mutex::new(Box::new(AllowAll::default()));
202}
203
204pub fn set_policy<P: Policy + 'static>(policy: P) -> () {
205 let mut guard = POLICY
206 .lock()
207 .expect("Couldn't lock the policy mutex when setting the policy");
208 *guard = Box::new(policy);
209}
210
/// The waiting mode of a call, as selected by the constructor used to create it.
///
/// Policies can inspect it to treat bounded-wait and unbounded-wait calls differently.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CallType {
    /// The call was created as a bounded-wait call.
    BoundedWait,
    /// The call was created as an unbounded-wait call.
    UnboundedWait,
}
216
217/// A wrapper around `ic_cdk::call::Call` that enables "chaos testing" by failing calls
218/// according to a policy set by `set_policy`. It's implemented as a drop-in replacement
219/// for `ic_cdk::call::Call`, so it can be used in lieu of it by simple changing imports.
220/// See the documentation on `ic_cdk::call::Call` for more details on the individual methods.
221#[derive(Clone, Debug)]
222pub struct Call<'m, 'a> {
223 pub canister_id: Principal,
224 pub method: &'m str,
225 pub call_type: CallType,
226 call: CdkCall<'m, 'a>,
227}
228
229impl<'m> Call<'m, '_> {
230 pub fn bounded_wait(canister_id: Principal, method: &'m str) -> Self {
231 Call {
232 canister_id,
233 method,
234 call_type: CallType::BoundedWait,
235 call: CdkCall::bounded_wait(canister_id, method),
236 }
237 }
238
239 pub fn unbounded_wait(canister_id: Principal, method: &'m str) -> Self {
240 Call {
241 canister_id,
242 method,
243 call_type: CallType::UnboundedWait,
244 call: CdkCall::unbounded_wait(canister_id, method),
245 }
246 }
247}
248
249impl<'a> Call<'_, 'a> {
250 pub fn with_arg<T: CandidType>(self, arg: &T) -> Self {
251 Self {
252 call: self.call.with_arg(arg),
253 ..self
254 }
255 }
256
257 pub fn with_args<A: ArgumentEncoder>(self, args: &A) -> Self {
258 Self {
259 call: self.call.with_args(args),
260 ..self
261 }
262 }
263
264 pub fn with_raw_args(self, raw_args: &'a [u8]) -> Self {
265 Self {
266 call: self.call.with_raw_args(raw_args),
267 ..self
268 }
269 }
270
271 pub fn with_cycles(mut self, cycles: u128) -> Self {
272 self.call = self.call.with_cycles(cycles);
273 self
274 }
275
276 pub fn change_timeout(mut self, timeout_seconds: u32) -> Self {
277 self.call = self.call.change_timeout(timeout_seconds);
278 self
279 }
280
281 pub fn get_cost(&self) -> u128 {
282 self.call.get_cost()
283 }
284}
285
286impl Call<'_, '_> {
287 /// Sends the call and ignores the reply.
288 pub fn oneway(&self) -> Result<(), OnewayError> {
289 let mut policy = POLICY
290 .lock()
291 .expect("Couldn't lock the policy mutex when sending a one-way call");
292 match policy.allow_oneway(self) {
293 Ok(_) => self.call.oneway(),
294 Err(None) =>
295 // Don't execute the call, but don't return an error either
296 {
297 Ok(())
298 }
299 Err(Some(err)) => Err(err),
300 }
301 }
302}
303
304enum CallFutureState<'m, 'a> {
305 // The call has been rejected, however, we're waiting for a dummy management canister call
306 // to finish, in order to simulate the passage of time that would happen when an asynchronous
307 // reject happens in reality.
308 Rejected(CallFailed),
309 // The call has been allowed, and we're waiting for the result.
310 Allowed(CdkCallFuture<'m, 'a>),
311 // The policy hasn't been applied yet, so this is before awaiting
312 Outstanding(Call<'m, 'a>),
313 // We've already returned a `Poll::Ready`. We shouldn't get polled again.
314 Completed,
315}
316
317pub struct CallFuture<'m, 'a> {
318 state: CallFutureState<'m, 'a>,
319}
320
321impl<'m, 'a> IntoFuture for Call<'m, 'a> {
322 type IntoFuture = CallFuture<'m, 'a>;
323 type Output = Result<Response, CallFailed>;
324
325 fn into_future(self) -> Self::IntoFuture {
326 CallFuture {
327 state: CallFutureState::Outstanding(self),
328 }
329 }
330}
331
332impl std::future::Future for CallFuture<'_, '_> {
333 type Output = Result<Response, CallFailed>;
334
335 fn poll(
336 self: Pin<&mut Self>,
337 context: &mut std::task::Context<'_>,
338 ) -> std::task::Poll<Self::Output> {
339 let fut = Pin::into_inner(self);
340 let (mut cdk_fut, opt_err) = match fut.state {
341 CallFutureState::Completed => {
342 panic!("CallFuture is already completed, it shouldn't be polled again")
343 }
344 CallFutureState::Outstanding(ref mut call) => {
345 let mut policy = POLICY
346 .lock()
347 .expect("Couldn't lock the policy mutex when sending a call");
348 match policy.allow(&call) {
349 Ok(()) => {
350 let call = call.clone();
351 (call.call.into_future(), None)
352 }
353 Err(call_failed) => {
354 match call_failed {
355 CallFailed::CallRejected(_) => {
356 // If the call was rejected, we need to wait for a dummy management canister call
357 // to finish, in order to simulate the passage of time in the current call context.
358 let err = call_failed.clone();
359 let cdk_fut = CdkCall::bounded_wait(
360 Principal::management_canister(),
361 "canister_info",
362 )
363 .with_arg(ic_cdk::management_canister::CanisterInfoArgs {
364 canister_id: ic_cdk::api::canister_self(),
365 num_requested_changes: None,
366 })
367 .into_future();
368 (cdk_fut, Some(err))
369 }
370 _ => {
371 // The policy failed the call synchronously, just return the result
372 let err = call_failed.clone();
373 fut.state = CallFutureState::Completed;
374 return Poll::Ready(Err(err));
375 }
376 }
377 }
378 }
379 }
380 CallFutureState::Allowed(ref mut cdk_fut) => {
381 // Replace with something dummy to take ownership
382 let mut cdk_fut = mem::replace(
383 cdk_fut,
384 CdkCall::bounded_wait(Principal::anonymous(), "nothing").into_future(),
385 );
386 fut.state = CallFutureState::Completed;
387 return Pin::new(&mut cdk_fut).poll(context);
388 }
389 CallFutureState::Rejected(ref call_failed) => {
390 let err = call_failed.clone();
391 fut.state = CallFutureState::Completed;
392 return Poll::Ready(Err(err));
393 }
394 };
395 let res = Pin::new(&mut cdk_fut).poll(context);
396 match opt_err {
397 Some(err) => fut.state = CallFutureState::Rejected(err),
398 None => fut.state = CallFutureState::Allowed(cdk_fut),
399 }
400 res
401 }
402}