// dag/idmap.rs
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8//! # idmap
9//!
10//! See [`IdMap`] for the main structure.
11
12use std::borrow::Cow;
13
14use crate::errors::bug;
15use crate::id::Group;
16use crate::id::Id;
17use crate::id::VertexName;
18use crate::ops::IdConvert;
19use crate::ops::Parents;
20use crate::segment::PreparedFlatSegments;
21use crate::types_ext::PreparedFlatSegmentsExt;
22use crate::Error;
23use crate::IdSet;
24use crate::Result;
25
26#[cfg(any(test, feature = "indexedlog-backend"))]
27mod indexedlog_idmap;
28mod mem_idmap;
29
30#[cfg(any(test, feature = "indexedlog-backend"))]
31pub use indexedlog_idmap::IdMap;
32pub(crate) use mem_idmap::CoreMemIdMap;
33pub use mem_idmap::MemIdMap;
34
/// DAG-aware write operations.
#[async_trait::async_trait]
pub trait IdMapAssignHead: IdConvert + IdMapWrite {
    /// Assign an id for a head in a DAG. This implies ancestors of the
    /// head will also have ids assigned.
    ///
    /// This function is incremental. If the head or any of its ancestors
    /// already have an id stored in this map, the existing ids will be
    /// reused.
    ///
    /// This function needs roughly `O(N)` heap memory. `N` is the number of
    /// ids to assign. When `N` is very large, try assigning ids to a known
    /// ancestor first.
    ///
    /// New `id`s inserted by this function will have the specified `group`.
    /// Existing `id`s that are ancestors of `head` will get re-assigned
    /// if they have a higher `group`.
    ///
    /// `covered_ids` specifies what ranges of `Id`s are already covered.
    /// This is usually obtained from `IdDag::all_ids_in_groups(&Group::ALL)`.
    /// `IdMap` itself might not be able to provide that information
    /// efficiently because it might be lazy. `covered_ids` will be updated
    /// to cover newly inserted `Id`s.
    ///
    /// `reserved_ids` specifies what ranges are reserved for future growth
    /// of other important heads (usually a couple of mainline branches that
    /// are long-lived, growing, and used by many people). This is useful
    /// to reduce fragmentation.
    ///
    /// Returns the prepared flat segments (the newly assigned `Id`s together
    /// with their parent edges, recorded via `push_edge`) so the caller can
    /// feed them into the `IdDag`.
    async fn assign_head(
        &mut self,
        head: VertexName,
        parents_by_name: &dyn Parents,
        group: Group,
        covered_ids: &mut IdSet,
        reserved_ids: &IdSet,
    ) -> Result<PreparedFlatSegments> {
        // There are some interesting cases to optimize the numbers:
        //
        //     C     For a merge C, it has choice to assign numbers to A or B
        //     |\    first (A and B are abstract branches that have many nodes).
        //     A B   Suppose branch A is linear and B have merges, and D is
        //     |/    (::A & ::B). Then:
        //     D
        //
        // - If `D` is empty or already assigned, it's better to assign A last.
        //   This is because (A+C) can then always form a segment regardless of
        //   the complexity of B:
        //
        //      B   A  C        vs.        A  B   C
        //     ~~~  ^^^^^                  ~~ ~~~ ~
        //     xxxxxx                      *****
        //                                 xxxxx
        //
        //   [~]: Might be complex (ex. many segments)
        //   [^]: Can always form a segment. (better)
        //   [*]: Can only be a segment if segment size is large enough.
        //   [x]: Cannot form a segment.
        //
        // - If `D` is not empty (and not assigned), it _might_ be better to
        //   assign D and A first. This provides benefits for A and D to be
        //   continuous, with the downside that A and C are not continuous.
        //
        //   A typical pattern is one branch continuously merges into the other
        //   (see also segmented-changelog.pdf, page 19):
        //
        //       B---D---F
        //        \   \   \
        //     A---C---E---G
        //
        // The code below is optimized for cases where p1 branch is linear,
        // but p2 branch is not.
        //
        // However, the above visit order (first parent last) is not optimal
        // for incremental build case with pushrebase branches. Because
        // pushrebase uses the first parent as the mainline. For example:
        //
        //     A---B---C-...-D---M   (parents(M) = [D, F])
        //                      /
        //               E-...-F
        //
        // The A ... M branch is the mainline. The head of the mainline
        // was A ... D, then M. An incremental build job might have built up
        // A, B, ..., D before it sees M. In this case it's better to make
        // the incremental build finish the A ... D part before jumping to
        // E ... F.
        //
        // We choose first parent last order if `covered` is empty, or when
        // visiting ancestors of non-first parents.
        let mut outcome = PreparedFlatSegments::default();

        #[derive(Copy, Clone, Debug)]
        enum VisitOrder {
            /// Visit the first parent first.
            FirstFirst,
            /// Visit the first parent last.
            FirstLast,
        }

        // Emulate the stack in heap to avoid overflow.
        #[derive(Debug)]
        enum Todo {
            /// Visit parents. Finally assign self. This will eventually turn into AssignedId.
            Visit {
                head: VertexName,

                /// The `Id` in `IdMap` that is known assigned to the `head`.
                /// This can be non-`None` if `IdMap` has more entries than `IdDag`.
                known_id: Option<Id>,

                order: VisitOrder,
            },

            /// Assign an `Id` if not assigned. Their parents are prepared in the
            /// `parent_ids` stack. `Assign` `head` and `Visit` `head`'s parents
            /// are pushed together so the `Visit` entries can turn into `Id`s in
            /// the `parent_ids` stack.
            Assign {
                /// The vertex to assign. Its parents are already visited and assigned.
                head: VertexName,

                /// The `Id` in `IdMap` that is known assigned to the `head`.
                /// This can be non-`None` if `IdMap` has more entries than `IdDag`.
                known_id: Option<Id>,

                /// The number of parents, at the end of the `parent_ids`.
                parent_len: usize,

                /// The order of parents if extracted from `parent_ids`.
                order: VisitOrder,
            },

            /// Assigned Id. Will be picked by and pushed to the current `parent_ids` stack.
            AssignedId { id: Id },
        }
        use Todo::Assign;
        use Todo::AssignedId;
        use Todo::Visit;

        // Stack of already-assigned parent `Id`s. Each `Assign` entry consumes
        // its own `parent_len` ids from the tail of this stack.
        let mut parent_ids: Vec<Id> = Vec::new();

        let mut todo_stack: Vec<Todo> = {
            let order = if covered_ids.is_empty() {
                // Assume re-building from scratch.
                VisitOrder::FirstLast
            } else {
                // Assume incremental updates with pushrebase.
                VisitOrder::FirstFirst
            };
            vec![Visit {
                head: head.clone(),
                known_id: None,
                order,
            }]
        };
        while let Some(todo) = todo_stack.pop() {
            tracing::trace!(target: "dag::assign", "todo: {:?}", &todo);
            match todo {
                Visit {
                    head,
                    known_id,
                    order,
                } => {
                    // If the id was not assigned, or was assigned to a higher group,
                    // (re-)assign it to this group.
                    //
                    // PERF: This might trigger remote fetch too frequently.
                    let known_id = match known_id {
                        Some(id) => Some(id),
                        None => self.vertex_id_with_max_group(&head, group).await?,
                    };
                    match known_id {
                        // Already assigned and covered: nothing to do but report the id.
                        Some(id) if covered_ids.contains(id) => todo_stack.push(AssignedId { id }),
                        _ => {
                            let parents = parents_by_name.parent_names(head.clone()).await?;
                            tracing::trace!(target: "dag::assign", "visit {:?} ({:?}) with parents {:?}", &head, known_id, &parents);
                            // Push `Assign` first so it runs *after* all the
                            // parent `Visit`s pushed below (LIFO order).
                            todo_stack.push(Assign {
                                head,
                                known_id,
                                parent_len: parents.len(),
                                order,
                            });
                            let mut visit = parents;
                            match order {
                                VisitOrder::FirstFirst => {}
                                VisitOrder::FirstLast => visit.reverse(),
                            }
                            for (i, p) in visit.into_iter().enumerate() {
                                // If the parent was not assigned, or was assigned to a higher group,
                                // (re-)assign the parent to this group.
                                match self.vertex_id_with_max_group(&p, group).await {
                                    Ok(Some(id)) if covered_ids.contains(id) => {
                                        todo_stack.push(AssignedId { id })
                                    }
                                    // Go deeper if IdMap has the entry but IdDag misses it.
                                    Ok(Some(id)) => todo_stack.push(Visit {
                                        head: p,
                                        known_id: Some(id),
                                        order,
                                    }),
                                    Ok(None) => {
                                        // Only the first parent (in FirstFirst mode)
                                        // keeps FirstFirst; other parents' ancestors
                                        // use FirstLast (see the comment at the top).
                                        let parent_order = match (order, i) {
                                            (VisitOrder::FirstFirst, 0) => VisitOrder::FirstFirst,
                                            _ => VisitOrder::FirstLast,
                                        };
                                        todo_stack.push(Visit {
                                            head: p,
                                            known_id: None,
                                            order: parent_order,
                                        })
                                    }
                                    Err(e) => return Err(e),
                                }
                            }
                        }
                    }
                }
                Assign {
                    head,
                    known_id,
                    parent_len,
                    order,
                } => {
                    // This vertex's parent ids occupy the tail of `parent_ids`.
                    let parent_start = parent_ids.len() - parent_len;
                    let known_id = match known_id {
                        Some(id) => Some(id),
                        None => self.vertex_id_with_max_group(&head, group).await?,
                    };
                    let id = match known_id {
                        Some(id) if covered_ids.contains(id) => id,
                        _ => {
                            // Recover parents in p1-first order; how they landed in
                            // `parent_ids` depends on the visit order used above.
                            let parents = match order {
                                VisitOrder::FirstLast => Cow::Borrowed(&parent_ids[parent_start..]),
                                VisitOrder::FirstFirst => Cow::Owned(
                                    parent_ids[parent_start..]
                                        .iter()
                                        .cloned()
                                        .rev()
                                        .collect::<Vec<_>>(),
                                ),
                            };
                            let id = match known_id {
                                Some(id) => id,
                                None => {
                                    // Smallest candidate that keeps ids topo-sorted:
                                    // one past the max parent id, clamped into `group`.
                                    let candidate_id = match parents.iter().max() {
                                        Some(&max_parent_id) => {
                                            (max_parent_id + 1).max(group.min_id())
                                        }
                                        None => group.min_id(),
                                    };
                                    adjust_candidate_id(
                                        self,
                                        covered_ids,
                                        reserved_ids,
                                        candidate_id,
                                    )
                                    .await?
                                }
                            };
                            if id.group() != group {
                                return Err(Error::IdOverflow(group));
                            }
                            // Mark the id as covered so later checks treat it as assigned.
                            covered_ids.push(id);
                            if known_id.is_none() {
                                tracing::trace!(target: "dag::assign", "assign {:?} = {:?}", &head, id);
                                self.insert(id, head.as_ref()).await?;
                            } else {
                                tracing::trace!(target: "dag::assign", "assign {:?} = {:?} (known)", &head, id);
                            }
                            if parents.iter().any(|&p| p >= id) {
                                return bug(format!(
                                    "IdMap Ids are not topo-sorted: {:?} ({:?}) has parent ids {:?}",
                                    id, head, &parents,
                                ));
                            }
                            outcome.push_edge(id, &parents);
                            id
                        }
                    };
                    // Consume this vertex's parent ids from the stack.
                    parent_ids.truncate(parent_start);
                    todo_stack.push(AssignedId { id });
                }
                AssignedId { id } => {
                    if !covered_ids.contains(id) {
                        return bug(format!(
                            concat!(
                                "attempted to assign id with wrong order: {:?} ",
                                "is being pushed as parent id but it cannot be used ",
                                "because it is not yet covered by IdDag",
                            ),
                            &id
                        ));
                    }
                    parent_ids.push(id);
                }
            }
        }

        Ok(outcome)
    }
}
334
335/// Pick a minimal `n`, so `candidate_id + n` is an `Id` that is not "covered",
336/// not "reserved", and not in the "map". Return the picked `Id`.
337async fn adjust_candidate_id(
338 map: &(impl IdConvert + ?Sized),
339 covered_ids: &IdSet,
340 reserved_ids: &IdSet,
341 mut candidate_id: Id,
342) -> Result<Id> {
343 loop {
344 // (Fast) test using covered_ids + reserved_ids.
345 loop {
346 if let Some(span) = covered_ids.span_contains(candidate_id) {
347 candidate_id = span.high + 1;
348 continue;
349 }
350 if let Some(span) = reserved_ids.span_contains(candidate_id) {
351 candidate_id = span.high + 1;
352 continue;
353 }
354 break;
355 }
356 // (Slow) test using the IdMap.
357 let new_candidate_id = ensure_id_not_exist_in_map(map, candidate_id).await?;
358 if new_candidate_id == candidate_id {
359 break;
360 } else {
361 // Check the covered_ids + reserved_ids.
362 candidate_id = new_candidate_id;
363 }
364 }
365 Ok(candidate_id)
366}
367
368/// Pick a minimal `n`, so `candidate_id + n` is an `Id` that is not in the
369/// "map". Return the picked `Id`.
370async fn ensure_id_not_exist_in_map(
371 map: &(impl IdConvert + ?Sized),
372 mut candidate_id: Id,
373) -> Result<Id> {
374 // PERF: This lacks of batching if it forms a loop. But it
375 // is also expected to be rare - only when the server
376 // tailer (assuming only one tailer is writing globally) is
377 // killed abnormally, *and* the branch being assigned has
378 // non-fast-forward move, this code path becomes useful.
379 //
380 // Technically, not using `locally` is more correct in a
381 // lazy `IdMap`. However, lazy `IdMap` is only used by
382 // client (local) dag, which ensures `IdMap` and `IdDag`
383 // are in-sync, meaning that the above `covered_ids` check
384 // is sufficient. So this is really only protecting the
385 // server's out-of-sync `IdMap` use-case, where the
386 // `locally` variant is the same as the non-`locally`,
387 // since the server has a non-lazy `IdMap`.
388 while let [true] = &map.contains_vertex_id_locally(&[candidate_id]).await?[..] {
389 candidate_id = candidate_id + 1;
390 }
391 Ok(candidate_id)
392}
393
// Blanket implementation: any type that can both look up ids/names
// (`IdConvert`) and write new entries (`IdMapWrite`) gets `assign_head`
// for free via the trait's default method.
impl<T> IdMapAssignHead for T where T: IdConvert + IdMapWrite {}
395
/// Write operations for IdMap.
#[async_trait::async_trait]
pub trait IdMapWrite {
    /// Insert a new `(id, name)` pair to the map.
    ///
    /// The `id` and `name` mapping should be unique, it's an error to map an id
    /// to multiple names, or map a name to multiple ids. Note: older versions
    /// of `IdMap` allowed mapping a name to a non-master Id, then a master Id
    /// (in this order), and the master Id is used for lookups. This is no
    /// longer permitted.
    async fn insert(&mut self, id: Id, name: &[u8]) -> Result<()>;

    /// Remove ids in the range `low..=high` and their associated names.
    /// Return removed names.
    async fn remove_range(&mut self, low: Id, high: Id) -> Result<Vec<VertexName>>;
}
411
412#[cfg(test)]
413mod tests {
414 use nonblocking::non_blocking_result as r;
415 #[cfg(feature = "indexedlog-backend")]
416 use tempfile::tempdir;
417
418 use super::*;
419 #[cfg(feature = "indexedlog-backend")]
420 use crate::ops::Persist;
421 #[cfg(feature = "indexedlog-backend")]
422 use crate::ops::PrefixLookup;
423
    /// End-to-end check of insert / lookup / hex-prefix / Debug behavior of
    /// the indexedlog-backed `IdMap`, including uniqueness errors and the
    /// master vs. non-master group rules.
    #[cfg(feature = "indexedlog-backend")]
    #[test]
    fn test_basic_operations() {
        let dir = tempdir().unwrap();
        let mut map = IdMap::open(dir.path()).unwrap();
        let lock = map.lock().unwrap();
        map.reload(&lock).unwrap();
        map.insert(Id(1), b"abc").unwrap();
        map.insert(Id(2), b"def").unwrap();
        map.insert(Id(10), b"ghi").unwrap();
        map.insert(Id(11), b"ghi").unwrap_err(); // ghi maps to 10
        map.insert(Id(10), b"ghi2").unwrap_err(); // 10 maps to ghi

        // Test another group.
        let id = Group::NON_MASTER.min_id();
        map.insert(id, b"jkl").unwrap();
        // Re-inserting the identical pair is not an error.
        map.insert(id, b"jkl").unwrap();
        map.insert(id, b"jkl2").unwrap_err(); // id maps to jkl
        map.insert(id + 1, b"jkl2").unwrap();
        map.insert(id + 2, b"jkl2").unwrap_err(); // jkl2 maps to id + 1
        map.insert(Id(15), b"jkl2").unwrap_err(); // reassign jkl2 to master group - error.
        map.insert(id + 3, b"abc").unwrap_err(); // reassign abc to non-master group - error.

        // Test hex lookup. 0x6a is the hex of b'j', so prefix "6a" should
        // match both "jkl" and "jkl2".
        assert_eq!(0x6a, b'j');
        assert_eq!(
            r(map.vertexes_by_hex_prefix(b"6a", 3)).unwrap(),
            [
                VertexName::from(&b"jkl"[..]),
                VertexName::from(&b"jkl2"[..])
            ]
        );
        // The `limit` parameter caps the number of results.
        assert_eq!(
            r(map.vertexes_by_hex_prefix(b"6a", 1)).unwrap(),
            [VertexName::from(&b"jkl"[..])]
        );
        assert!(r(map.vertexes_by_hex_prefix(b"6b", 1)).unwrap().is_empty());

        // Run the lookups twice to check results are stable across repeated
        // queries (second pass presumably hits any lookup caches — the
        // backend internals are not visible here).
        for _ in 0..=1 {
            assert_eq!(map.find_name_by_id(Id(1)).unwrap().unwrap(), b"abc");
            assert_eq!(map.find_name_by_id(Id(2)).unwrap().unwrap(), b"def");
            assert!(map.find_name_by_id(Id(3)).unwrap().is_none());
            assert_eq!(map.find_name_by_id(Id(10)).unwrap().unwrap(), b"ghi");

            assert_eq!(map.find_id_by_name(b"abc").unwrap().unwrap().0, 1);
            assert_eq!(map.find_id_by_name(b"def").unwrap().unwrap().0, 2);
            assert_eq!(map.find_id_by_name(b"ghi").unwrap().unwrap().0, 10);
            assert_eq!(map.find_id_by_name(b"jkl").unwrap().unwrap(), id);
            assert_eq!(
                format!("{:?}", map.find_id_by_name(b"jkl2").unwrap().unwrap()),
                "N1"
            );
            assert!(map.find_id_by_name(b"jkl3").unwrap().is_none());
        }

        // Test Debug
        assert_eq!(
            format!("{:?}", &map),
            r#"IdMap {
  abc: 1,
  def: 2,
  ghi: 10,
  jkl: N0,
  jkl2: N1,
}
"#
        );
    }
492
493 #[test]
494 fn test_remove_range() {
495 let map = MemIdMap::new();
496 check_remove_range(map);
497
498 #[cfg(feature = "indexedlog-backend")]
499 {
500 let dir = tempdir().unwrap();
501 let path = dir.path();
502 let map = IdMap::open(path).unwrap();
503 check_remove_range(map);
504 }
505 }
506
    /// Exercise `remove_range` against any `IdMapWrite` implementation:
    /// populate both master and non-master groups, remove several id ranges,
    /// and verify that the id->name and name->id directions stay consistent
    /// after each removal.
    fn check_remove_range(mut map: impl IdConvert + IdMapWrite) {
        // Fixture entries. Ids are deliberately non-contiguous (gaps such as
        // 6..=8, 10, 12 and large gaps in the non-master group) so removal
        // ranges cross both present and absent ids.
        let items: &[(Id, &[u8])] = &[
            (Id(0), b"z"),
            (Id(1), b"a"),
            (Id(2), b"bbb"),
            (Id(3), b"bb"),
            (Id(4), b"cc"),
            (Id(5), b"ccc"),
            (Id(9), b"ddd"),
            (Id(11), b"e"),
            (Id(13), b"ff"),
            (nid(0), b"n"),
            (nid(1), b"n1"),
            (nid(2), b"n2"),
            (nid(3), b"n3"),
            (nid(4), b"n4"),
            (nid(5), b"n5"),
            (nid(12), b"n12"),
            (nid(20), b"n20"),
        ];
        for (id, name) in items {
            r(map.insert(*id, name)).unwrap();
        }

        // deleted ids in a string, with extra consistency checks.
        // For every fixture entry, both lookup directions must agree:
        // either both are gone (deleted) or both still resolve to the
        // original pair. A one-sided deletion is a bug.
        let deleted = |map: &dyn IdConvert| -> String {
            let mut deleted_ids = Vec::new();
            for (id, name) in items {
                let name = VertexName::copy_from(name);
                let id = *id;
                let has_id = r(map.contains_vertex_id_locally(&[id])).unwrap()[0];
                let lookup_id = r(map.vertex_id_optional(&name)).unwrap();
                // Only resolve id->name when the id is present, to avoid
                // erroring on deleted ids.
                let lookup_name = if has_id {
                    Some(r(map.vertex_name(id)).unwrap())
                } else {
                    None
                };

                match (lookup_id, lookup_name) {
                    (None, None) => deleted_ids.push(id),
                    (None, Some(_)) => {
                        panic!("name->id deleted but not id->name: ({:?} {:?})", id, name)
                    }
                    (Some(_), None) => {
                        panic!("id->name deleted but not name->id: ({:?} {:?})", id, name)
                    }
                    (Some(lid), Some(lname)) => {
                        assert_eq!(lid, id);
                        assert_eq!(lname, name);
                    }
                }
            }
            format!("{:?}", deleted_ids)
        };

        // Render removed names sorted, for a stable string comparison
        // (remove_range's return order is not asserted here).
        let f = |vs: Vec<VertexName>| -> String {
            let mut vs = vs;
            vs.sort_unstable();
            format!("{:?}", vs)
        };

        // Remove a fully-populated master-group range.
        let removed = r(map.remove_range(Id(1), Id(3))).unwrap();
        assert_eq!(f(removed), "[a, bb, bbb]");
        assert_eq!(deleted(&map), "[1, 2, 3]");

        // Remove a range with holes (8, 10, 12 were never inserted).
        let removed = r(map.remove_range(Id(8), Id(12))).unwrap();
        assert_eq!(f(removed), "[ddd, e]");
        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11]");

        // Remove a non-master-group range.
        let removed = r(map.remove_range(nid(2), nid(4))).unwrap();
        assert_eq!(f(removed), "[n2, n3, n4]");
        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11, N2, N3, N4]");

        // Range extending far beyond the last inserted id.
        let removed = r(map.remove_range(nid(20), nid(10000))).unwrap();
        assert_eq!(f(removed), "[n20]");
        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11, N2, N3, N4, N20]");
    }
584
585 fn nid(i: u64) -> Id {
586 Group::NON_MASTER.min_id() + i
587 }
588}