mbarc-map 0.11.0

Implementation of a Minimally-blocking, Atomic Reference Counted Map
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, BTreeSet};

struct DataBlock<T, const BLOCK_SIZE: usize> {
	free_space: usize,
	data: [Option<T>; BLOCK_SIZE],
}

impl<T, const BLOCK_SIZE: usize> DataBlock<T, BLOCK_SIZE> {
	const EMPTY_ELEMENT: Option<T> = None;

	fn new() -> Self {
		Self {
			free_space: BLOCK_SIZE,
			data: [Self::EMPTY_ELEMENT; BLOCK_SIZE],
		}
	}

	fn insert(&mut self, val: T) -> usize {
		assert!(self.free_space > 0);

		//the above assert means this MUST find a value, assuming free_space is accurate
		let mut stored_offset = 0;
		//TODO: improve this loop further.  maybe iters like find?
		for (i, item) in self.data.iter_mut().enumerate() {
			if item.is_none() {
				let _ = item.insert(val);
				stored_offset = i;
				break;
			}
			//TODO: if we get here and i is last element, we should maybe panic with std::unreachable
		}
		self.free_space -= 1;

		stored_offset
	}
}

//#[derive(Copy, Clone)]
pub(crate) struct FaVecIndex<const BLOCK_SIZE: usize> {
	absolute_index: usize,
	//TODO: can store pieces individually, determine if this is worth doing
	//block_index:usize,
	//offset:usize,
}

impl<const BLOCK_SIZE: usize> FaVecIndex<BLOCK_SIZE> {
	fn index_from_block_offset(block_index: usize, offset: usize) -> Self {
		FaVecIndex {
			absolute_index: block_index * BLOCK_SIZE + offset,
		}
	}

	fn index_to_block_offset(&self) -> (usize, usize) {
		(
			self.absolute_index / BLOCK_SIZE,
			self.absolute_index % BLOCK_SIZE,
		)
	}

	fn as_absolute_index(&self) -> usize {
		self.absolute_index
	}
	pub(crate) fn from_absolute_index(absolute_index: usize) -> Self {
		FaVecIndex { absolute_index }
	}
}

pub(crate) struct FaVec<T, const BLOCK_SIZE: usize> {
	//TODO: we use Box to keep data_blocks small and allow efficient resize, however we also only add to/remove from data_blocks at the end index.
	//TODO: Determine if it's most efficient to a) keep as-is, b) remove the Box, or c) remove empty blocks from the middle as well, rather than just the end
	//TODO: keep in mind that, if Box is removed here, we still must ensure that individual elements are pointer safe, ie never moved in memory
	data_blocks: Vec<Box<DataBlock<T, BLOCK_SIZE>>>,

	//maps free space to a set of indexes (in data)
	free_space_map: BTreeMap<usize, BTreeSet<usize>>,
}

impl<T, const BLOCK_SIZE: usize> FaVec<T, BLOCK_SIZE> {
	//need to ensure block*blocksize+offset<=usize::MAX -1
	//max for offset is blocksize-1
	//block*blocksize+blocksize-1 <= usize::max - 1
	//block*(blocksize+1) <= usize::max
	//block <= usize::max / (blocksize+1)
	//if block = usize::max/(blocksize+1) and we are full, push should panic instead of adding new data
	const MAX_BLOCK_COUNT: usize = usize::MAX / (BLOCK_SIZE + 1);

	pub fn new() -> Self {
		let mut initial_set = BTreeSet::new();
		initial_set.insert(0usize);
		let mut initial_map = BTreeMap::new();
		initial_map.insert(BLOCK_SIZE, initial_set);

		Self {
			data_blocks: vec![Box::new(DataBlock::new())],
			free_space_map: initial_map,
		}
	}

	fn update_block_free_spaces(&mut self, block_index: usize, new_free_space: usize) {
		assert!(block_index < self.data_blocks.len());
		assert!(new_free_space <= BLOCK_SIZE);

		//println!("updating block {} free space to {}", block_index, new_free_space);

		assert!(block_index < self.data_blocks.len());
		let block = self.data_blocks[block_index].as_mut();
		let old_free_space = block.free_space;

		//println!("block {} previously had {} free indices", block_index, old_free_space);

		assert!(self.free_space_map.contains_key(&old_free_space));
		let old_free_blocks = self.free_space_map.get_mut(&old_free_space).unwrap();
		assert!(!old_free_blocks.is_empty());
		assert!(old_free_blocks.contains(&block_index));
		old_free_blocks.remove(&block_index);

		if old_free_blocks.is_empty() {
			self.free_space_map.remove(&old_free_space);

			//println!("block {} was the last block with {} free indices, removing", block_index, old_free_space);
		}

		match self.free_space_map.entry(new_free_space) {
			Entry::Occupied(mut entry) => {
				entry.get_mut().insert(block_index);

				//println!("block {} was added to set with {} free indices", block_index, new_free_space);
			}
			Entry::Vacant(entry) => {
				let mut new_set = BTreeSet::new();
				new_set.insert(block_index);
				entry.insert(new_set);

				//println!("block {} was added to new set with {} free indices", block_index, new_free_space);
			}
		}

		block.free_space = new_free_space;
	}

	fn get_or_create_block_for_insert(&mut self) -> usize {
		//the current strategy here is to first, find the block(s) with the least amount of non-zero free space remaining
		//we want to concentrate data to "hot" blocks, where there's already lots of other data, so we have fewer gaps
		//from these blocks, we then select the lowest-index block, so that high index blocks are more likely to empty, and thus be dropped later during remove()

		let lowest_free_space = self.free_space_map.keys().find(|key| **key > 0);

		//if no blocks with more than 0 free space, need to make a new block
		let lowest_free_space = match lowest_free_space {
			Some(lowest_free_space) => {
				//println!("lowest has {} free blocks", lowest_free_space);
				*lowest_free_space
			}
			None => {
				if self.data_blocks.len() == Self::MAX_BLOCK_COUNT {
					//TODO: technically we can avoid this
					//consider instead using a struct that wraps the block index + block number, rather than packing them together at the end
					//this, however, comes at a cost of having a more complex type for hashing purposes
					std::process::abort(); //TODO: is abort the right exit here?
				}

				//add new data block
				let new_index = self.data_blocks.len();
				self.data_blocks.push(Box::new(DataBlock::new()));

				//track the new block's free space in our map
				let mut new_set = BTreeSet::new();
				new_set.insert(new_index);
				self.free_space_map.insert(BLOCK_SIZE, new_set);

				//we now have a block that's completely empty
				BLOCK_SIZE
			}
		};

		//pick the lowest-indexed block (with the least free space)
		//println!("{:?}", self.free_space_map);
		let possible_blocks = self.free_space_map.get_mut(&lowest_free_space).unwrap();
		assert!(!possible_blocks.is_empty());
		let block_index = possible_blocks.pop_first().unwrap();

		//if no more blocks have lowest_free_space left, then remove the index
		if possible_blocks.is_empty() {
			self.free_space_map.remove(&lowest_free_space);
		}

		//println!("{:?}", self.free_space_map);
		assert!(block_index < self.data_blocks.len());
		block_index
	}

	//TODO: insert?
	pub fn push(&mut self, val: T) -> FaVecIndex<BLOCK_SIZE> {
		let block_index = self.get_or_create_block_for_insert();

		//find an index in our block to insert the new data
		let data_block = self.data_blocks.get_mut(block_index).unwrap();
		let stored_offset = data_block.insert(val);

		//store the block's index back in its proper place in the free space map
		match self.free_space_map.get_mut(&data_block.free_space) {
			Some(block_indexes) => {
				block_indexes.insert(block_index);
			}
			None => {
				let mut new_indexes = BTreeSet::new();
				new_indexes.insert(block_index);

				self.free_space_map
					.insert(data_block.free_space, new_indexes);
			}
		}

		//println!("added item, free space map is now: {:?}", self.free_space_map);

		FaVecIndex::index_from_block_offset(block_index, stored_offset)
	}

	pub fn get(&self, index: &FaVecIndex<BLOCK_SIZE>) -> Option<&T> {
		let (block_index, offset) = FaVecIndex::index_to_block_offset(index);
		//let (block_index, offset) = (index.block_index,index.offset);

		if block_index >= self.data_blocks.len() || offset >= BLOCK_SIZE {
			return None;
		}

		match self.data_blocks.get(block_index) {
			Some(block) => block.data[offset].as_ref(),
			None => None,
		}
	}

	//todo: evaluate necessity of mut
	pub fn get_mut(&mut self, index: &FaVecIndex<BLOCK_SIZE>) -> Option<&mut T> {
		let (block_index, offset) = FaVecIndex::index_to_block_offset(index);

		if block_index >= self.data_blocks.len() || offset >= BLOCK_SIZE {
			return None;
		}

		match self.data_blocks.get_mut(block_index) {
			Some(block) => block.data[offset].as_mut(),
			None => None,
		}
	}

	fn prune_end_blocks(&mut self) {
		if let Some(empty_blocks) = self.free_space_map.get_mut(&BLOCK_SIZE) {
			//keep at least one block allocation, but otherwise we can remove empty blocks from the end
			while self.data_blocks.len() > 1
				&& self.data_blocks.last().unwrap().free_space == BLOCK_SIZE
			{
				self.data_blocks.pop();

				//new length is the index of the popped element
				//equivalent to using len()-1 before the above pop as long as len>0, len cannot be zero due to loop conditions
				let old_end_index = self.data_blocks.len();

				assert!(!empty_blocks.is_empty());
				empty_blocks.remove(&old_end_index);
			}

			if empty_blocks.is_empty() {
				self.free_space_map.remove(&BLOCK_SIZE);
			}
		};
	}

	//todo: evaluate necessity of mut
	pub fn remove(&mut self, index: &FaVecIndex<BLOCK_SIZE>) -> Option<T> {
		let (block_index, offset) = FaVecIndex::index_to_block_offset(index);

		assert!(block_index < self.data_blocks.len());
		assert!(offset < BLOCK_SIZE);

		if block_index >= self.data_blocks.len() || offset >= BLOCK_SIZE {
			//TODO: consider panic instead
			return None;
		}

		let removed_item = match self.data_blocks.get_mut(block_index) {
			Some(block) => {
				let removed = block.data.get_mut(offset).unwrap().take();

				if removed.is_some() {
					let _old_free_space = block.free_space; //for debugging

					let new_free_space = block.free_space + 1;
					self.update_block_free_spaces(block_index, new_free_space);

					assert!(self.free_space_map.contains_key(&new_free_space));
					assert!(!self.free_space_map[&new_free_space].is_empty());

					//TODO: while this should keep the map more compact, it also may cause more allocation/deallocation, which can have a cost
					//testing seems to indicate within-margin runtime, but keep a close eye on this one
					self.prune_end_blocks();
				}

				removed
			}
			None => None,
		};

		//println!("removed item, free space map is now: {:?}", self.free_space_map);
		removed_item
	}

	pub fn capacity(&self) -> usize {
		self.data_blocks.len() * BLOCK_SIZE
	}

	pub fn len(&self) -> usize {
		self.free_space_map
			.iter()
			.fold(0, |acc, (free_space, items)| {
				let used_space = BLOCK_SIZE - free_space;
				let block_count = items.len();

				acc + used_space * block_count
			})
	}

	pub fn iter(&self) -> FaVecIter<T, BLOCK_SIZE> {
		FaVecIter {
			block_index: 0,
			offset: 0,
			real_count: 0,
			fa_vec: self,
		}
	}
}

pub struct FaVecIter<'a, T, const BLOCK_SIZE: usize> {
	block_index: usize,
	offset: usize,
	real_count: usize,

	fa_vec: &'a FaVec<T, BLOCK_SIZE>,
}

impl<'a, T, const BLOCK_SIZE: usize> FaVecIter<'a, T, BLOCK_SIZE> {
	fn increment(&mut self) {
		self.offset += 1;
		if self.offset >= BLOCK_SIZE {
			self.offset = 0;
			self.block_index += 1;
		}
	}
}

impl<'a, T, const BLOCK_SIZE: usize> Iterator for FaVecIter<'a, T, BLOCK_SIZE> {
	type Item = &'a T;

	fn next(&mut self) -> Option<Self::Item> {
		while self.block_index < self.fa_vec.data_blocks.len() {
			let block_ref = self.fa_vec.data_blocks.get(self.block_index).unwrap();
			let item = &block_ref.data[self.offset];

			self.increment();

			if item.is_some() {
				self.real_count += 1;
				return item.as_ref();
			}
		}
		None
	}

	fn size_hint(&self) -> (usize, Option<usize>) {
		//TODO: cache len?
		let size = self.fa_vec.len() - self.real_count;
		(size, Some(size))
	}
}

#[cfg(test)]
mod tests {
	use std::collections::BTreeMap;

	use rand::prelude::*;
	use rand_chacha::ChaCha8Rng;

	use crate::fixed_address_continuous_allocation::{FaVec, FaVecIndex};

	const TEST_BLOCK_SIZE: usize = 512;

	#[test]
	fn fa_vec_properties_test_random_io() {
		//randomly add and remove elements in a 2:1 ratio until data reaches a certain count

		const ITEM_COUNT: usize = 100000;

		let mut rng = ChaCha8Rng::seed_from_u64(0xDEADBEEF);
		let mut vec = FaVec::<i64, TEST_BLOCK_SIZE>::new();
		let mut keys = Vec::<FaVecIndex<TEST_BLOCK_SIZE>>::new();

		//note, this access pattern is pretty basic, as it is very unlikely to leave holes in the vec
		while keys.len() < ITEM_COUNT {
			let remove_element = rng.random_ratio(1, 3);

			if remove_element && !keys.is_empty() {
				let key_index = rng.random_range(0..keys.len());
				let key = keys.swap_remove(key_index);
				vec.remove(&key);
			} else {
				let new_value = rng.random::<i64>();
				let new_key = vec.push(new_value);
				keys.push(new_key);
			}
		}

		//this output should visually prove there's a random-ish (But generally increasing) order to inserted indexes
		//it's noisy, but worth looking at if there's an issue
		//println!("indices inserted: {:?}",keys);

		assert_eq!(vec.len(), keys.len());
		validate_vec_properties(&mut vec);
	}

	#[test]
	fn fa_vec_properties_test_add_remove_add() {
		const ITEM_COUNT: usize = 100000;

		let mut rng = ChaCha8Rng::seed_from_u64(0xDEADBEEF);
		let mut vec = FaVec::<i64, TEST_BLOCK_SIZE>::new();
		let mut keys = Vec::<FaVecIndex<TEST_BLOCK_SIZE>>::new();

		const INITIAL_ITEM_COUNT: usize = ITEM_COUNT * 2;
		for total_items in 0..INITIAL_ITEM_COUNT {
			let new_value = rng.random::<i64>();
			let new_key = vec.push(new_value);
			keys.push(new_key);

			assert_eq!(
				vec.capacity(),
				TEST_BLOCK_SIZE * (1 + total_items / TEST_BLOCK_SIZE)
			);
		}

		const HALF_ITEM_COUNT: usize = ITEM_COUNT / 2;
		for _ in 0..(ITEM_COUNT + HALF_ITEM_COUNT) {
			let key_index = rng.random_range(0..keys.len());
			let key = keys.swap_remove(key_index);
			vec.remove(&key);

			//TODO: it's impossible to assess capacity in-loop, due to the random remove order.  Adjust/implement test(s) to account for this
		}

		for _ in 0..HALF_ITEM_COUNT {
			let new_value = rng.random::<i64>();
			let new_key = vec.push(new_value);
			keys.push(new_key);
		}

		//this output should visually prove there's a random-ish (But generally increasing) order to inserted indexes
		//it's noisy, but worth looking at if there's an issue
		//println!("indices inserted: {:?}",keys);

		assert_eq!(vec.len(), keys.len());
		validate_vec_properties(&mut vec);
	}

	#[test]
	fn fa_vec_properties_test_add_remove_repeat() {
		const ITEM_COUNT: usize = 100000;

		let mut rng = ChaCha8Rng::seed_from_u64(0xDEADBEEF);
		let mut vec = FaVec::<i64, TEST_BLOCK_SIZE>::new();
		let mut keys = Vec::<FaVecIndex<TEST_BLOCK_SIZE>>::new();

		for _ in 0..2 * ITEM_COUNT {
			let new_value = rng.random::<i64>();
			let new_key = vec.push(new_value);
			keys.push(new_key);
		}

		for _ in 0..10 {
			for _ in 0..ITEM_COUNT {
				let key_index = rng.random_range(0..keys.len());
				let key = keys.swap_remove(key_index);
				vec.remove(&key);
			}

			for _ in 0..ITEM_COUNT {
				let new_value = rng.random::<i64>();
				let new_key = vec.push(new_value);
				keys.push(new_key);
			}
		}

		/*
		//Using this final remove step to end with N values causes a bell curve distribution
		//This is valid as a result, but useless from a testing perspective
		//this is because I cannot choose what address to remove from, only which one I pick when pushing new data
		for i in 0..ITEM_COUNT{
			let key_index=rng.gen_range(0..keys.len());
			let key= keys.swap_remove(key_index);
			vec.remove(&key);
		}
		// */

		//this output should visually prove there's a random-ish (But generally increasing) order to inserted indexes
		//it's noisy, but worth looking at if there's an issue
		//println!("indices inserted: {:?}",keys);

		assert_eq!(vec.len(), keys.len());
		validate_vec_properties(&mut vec);
	}

	#[test]
	fn iterator_basic() {
		let mut vec = FaVec::<i64, TEST_BLOCK_SIZE>::new();

		let _first = vec.push(5);
		let _second = vec.push(7);
		let _third = vec.push(2);

		vec.remove(&_second);

		let vals = vec.iter().copied().collect::<Vec<i64>>();

		assert_eq!(vals, vec![5, 2]);
		assert_eq!(vec.len(), 2);
		validate_vec_properties(&mut vec);
	}

	#[test]
	fn ensure_correct_prune_on_block_edge() {
		let mut vec = FaVec::<i64, TEST_BLOCK_SIZE>::new();

		let mut last_block = FaVecIndex::from_absolute_index(0);
		for i in 0..(TEST_BLOCK_SIZE + 1) {
			last_block = vec.push(i as i64);
		}

		//previously, pruning was not correctly occurring when the end block became empty and no other blocks were vacant
		vec.remove(&last_block);
		//the free space map index tracking "completely vacant" blocks was being left as an empty set, instead of removed entirely, by the prune pass
		//the application would then panic when trying to insert, due to not expecting a state where the set existed but was empty when later trying to insert
		//the logic for finding where to insert expects that all tracked vacancy counts have at least one associated block, and becomes more complicated if it has to handle empty sets
		//TODO: consider explicit test for auditing this assumption in the validate function below, though not sure it's relevant to other cases
		vec.push((TEST_BLOCK_SIZE + 1) as i64);

		assert_eq!(vec.len(), TEST_BLOCK_SIZE + 1);
		validate_vec_properties(&mut vec);
	}

	fn validate_vec_properties(vec: &mut FaVec<i64, TEST_BLOCK_SIZE>) {
		//assert that actual data matches free_space
		//assert that free space map matches free_space
		//statistically, most blocks should be pretty full, or pretty empty
		//statistically, full blocks should be mostly low-index blocks

		let mut free_space_buckets = BTreeMap::new();
		for (block_index, block) in vec.data_blocks.iter().enumerate() {
			let total_free_blocks = block
				.data
				.iter()
				.filter(|item| -> bool { item.is_none() })
				.count();
			assert_eq!(total_free_blocks, block.free_space);

			assert!(vec.free_space_map.contains_key(&block.free_space));
			assert!(vec.free_space_map[&block.free_space].contains(&block_index));

			match free_space_buckets.get_mut(&block.free_space) {
				Some(count) => {
					*count += 1;
				}
				None => {
					free_space_buckets.insert(block.free_space, 1usize);
				}
			}
		}

		//there should be lots of free or empty blocks, but not as many in between
		//this vec should have a bathtub curve
		//TODO: this is just a basic guess approach, but we should ideally investigate how to determine this more rigorously
		//actually, it seems like rather than forming a bathtub, we do aggressively push towards keeping blocks full
		//barring pathologic bad remove behavior, we see a decreasing trend of free space vs count
		//TODO: reassess metrics
		let free_space_counts: Vec<usize> = free_space_buckets.values().copied().collect();
		let least_free_block_count = *free_space_counts.first().unwrap();
		let most_free_block_count = *free_space_counts.last().unwrap();
		let middle_free_block_count = free_space_counts[free_space_counts.len() / 2];

		println!("free space map: {:?}", vec.free_space_map);
		println!("free count vec: {:?}", free_space_counts);
		println!(
			"least/middle/most free: {} - {} - {}",
			least_free_block_count, middle_free_block_count, most_free_block_count
		);
		assert!(least_free_block_count >= middle_free_block_count);
		//assert!(most_free_block_count >= middle_free_block_count);

		//also, we should have more full blocks than empty ones
		//TODO: linear regression over the whole list should give a negative slope, instead of this basic comparison
		assert!(least_free_block_count >= most_free_block_count);
	}
}