// dag/idmap.rs
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8//! # idmap
9//!
10//! See [`IdMap`] for the main structure.
11
12use std::borrow::Cow;
13
14use crate::errors::bug;
15use crate::id::Group;
16use crate::id::Id;
17use crate::id::Vertex;
18use crate::ops::IdConvert;
19use crate::ops::Parents;
20use crate::segment::PreparedFlatSegments;
21use crate::types_ext::PreparedFlatSegmentsExt;
22use crate::Error;
23use crate::IdSet;
24use crate::Result;
25
26#[cfg(any(test, feature = "indexedlog-backend"))]
27mod indexedlog_idmap;
28mod mem_idmap;
29
30#[cfg(any(test, feature = "indexedlog-backend"))]
31pub use indexedlog_idmap::IdMap;
32pub(crate) use mem_idmap::CoreMemIdMap;
33pub use mem_idmap::MemIdMap;
34
/// DAG-aware write operations.
#[async_trait::async_trait]
pub trait IdMapAssignHead: IdConvert + IdMapWrite {
    /// Assign an id for a head in a DAG. This implies ancestors of the
    /// head will also have ids assigned.
    ///
    /// This function is incremental. If the head or any of its ancestors
    /// already have an id stored in this map, the existing ids will be
    /// reused.
    ///
    /// This function needs roughly `O(N)` heap memory. `N` is the number of
    /// ids to assign. When `N` is very large, try assigning ids to a known
    /// ancestor first.
    ///
    /// New `id`s inserted by this function will have the specified `group`.
    /// Existing `id`s that are ancestors of `head` will get re-assigned
    /// if they have a higher `group`.
    ///
    /// `covered_ids` specifies what ranges of `Id`s are already covered.
    /// This is usually obtained from `IdDag::all_ids_in_groups(&Group::ALL)`.
    /// `IdMap` itself might not be able to provide that information
    /// efficiently because it might be lazy. `covered_ids` will be updated
    /// to cover newly inserted `Id`s.
    ///
    /// `reserved_ids` specifies what ranges are reserved for future growth
    /// of other important heads (usually a couple of mainline branches that
    /// are long-lived, growing, and used by many people). This is useful
    /// to reduce fragmentation.
    ///
    /// Returns the `PreparedFlatSegments` describing the `(id, parents)`
    /// edges recorded for the newly assigned ids.
    async fn assign_head(
        &mut self,
        head: Vertex,
        parents_by_name: &dyn Parents,
        group: Group,
        covered_ids: &mut IdSet,
        reserved_ids: &IdSet,
    ) -> Result<PreparedFlatSegments> {
        // There are some interesting cases to optimize the numbers:
        //
        // C     For a merge C, it has choice to assign numbers to A or B
        // |\    first (A and B are abstract branches that have many nodes).
        // A B   Suppose branch A is linear and B have merges, and D is
        // |/    (::A & ::B). Then:
        // D
        //
        // - If `D` is empty or already assigned, it's better to assign A last.
        //   This is because (A+C) can then always form a segment regardless of
        //   the complexity of B:
        //
        //      B  A C      vs.      A  B C
        //     ~~~ ^^^^^            ~~~
        //                          xxxxxx *****
        //                           xxxxx
        //
        //   [~]: Might be complex (ex. many segments)
        //   [^]: Can always form a segment. (better)
        //   [*]: Can only be a segment if segment size is large enough.
        //   [x]: Cannot form a segment.
        //
        // - If `D` is not empty (and not assigned), it _might_ be better to
        //   assign D and A first. This provides benefits for A and D to be
        //   continuous, with the downside that A and C are not continuous.
        //
        //   A typical pattern is one branch continuously merges into the other
        //   (see also segmented-changelog.pdf, page 19):
        //
        //       B---D---F
        //        \   \   \
        //     A---C---E---G
        //
        // The code below is optimized for cases where p1 branch is linear,
        // but p2 branch is not.
        //
        // However, the above visit order (first parent last) is not optimal
        // for incremental build case with pushrebase branches. Because
        // pushrebase uses the first parent as the mainline. For example:
        //
        //     A---B---C-...-D---M (parents(M) = [D, F])
        //                      /
        //               E-...-F
        //
        // The A ... M branch is the mainline. The head of the mainline
        // was A ... D, then M. An incremental build job might have built up
        // A, B, ..., D before it sees M. In this case it's better to make
        // the incremental build finish the A ... D part before jumping to
        // E ... F.
        //
        // We choose first parent last order if `covered` is empty, or when
        // visiting ancestors of non-first parents.
        let mut outcome = PreparedFlatSegments::default();

        #[derive(Copy, Clone, Debug)]
        enum VisitOrder {
            /// Visit the first parent first.
            FirstFirst,
            /// Visit the first parent last.
            FirstLast,
        }

        // Emulate the stack in heap to avoid overflow.
        #[derive(Debug)]
        enum Todo {
            /// Visit parents. Finally assign self. This will eventually turn into AssignedId.
            Visit {
                head: Vertex,

                /// The `Id` in `IdMap` that is known assigned to the `head`.
                /// This can be non-`None` if `IdMap` has more entries than `IdDag`.
                known_id: Option<Id>,

                order: VisitOrder,
            },

            /// Assign an `Id` if not assigned. Their parents are prepared in the
            /// `parent_ids` stack. `Assign` `head` and `Visit` `head`'s parents
            /// are pushed together so the `Visit` entries can turn into `Id`s in
            /// the `parent_ids` stack.
            Assign {
                /// The vertex to assign. Its parents are already visited and assigned.
                head: Vertex,

                /// The `Id` in `IdMap` that is known assigned to the `head`.
                /// This can be non-`None` if `IdMap` has more entries than `IdDag`.
                known_id: Option<Id>,

                /// The number of parents, at the end of the `parent_ids`.
                parent_len: usize,

                /// The order of parents if extracted from `parent_ids`.
                order: VisitOrder,
            },

            /// Assigned Id. Will be picked by and pushed to the current `parent_ids` stack.
            AssignedId { id: Id },
        }
        use Todo::Assign;
        use Todo::AssignedId;
        use Todo::Visit;
        // Stack of resolved parent `Id`s. `Assign` entries consume their
        // `parent_len` parents from the tail of this stack.
        let mut parent_ids: Vec<Id> = Vec::new();

        let mut todo_stack: Vec<Todo> = {
            let order = if covered_ids.is_empty() {
                // Assume re-building from scratch.
                VisitOrder::FirstLast
            } else {
                // Assume incremental updates with pushrebase.
                VisitOrder::FirstFirst
            };
            vec![Visit {
                head: head.clone(),
                known_id: None,
                order,
            }]
        };
        while let Some(todo) = todo_stack.pop() {
            tracing::trace!(target: "dag::assign", "todo: {:?}", &todo);
            match todo {
                Visit {
                    head,
                    known_id,
                    order,
                } => {
                    // If the id was not assigned, or was assigned to a higher group,
                    // (re-)assign it to this group.
                    //
                    // PERF: This might trigger remote fetch too frequently.
                    let known_id = match known_id {
                        Some(id) => Some(id),
                        None => self.vertex_id_with_max_group(&head, group).await?,
                    };
                    match known_id {
                        Some(id) if covered_ids.contains(id) => todo_stack.push(AssignedId { id }),
                        _ => {
                            let parents = parents_by_name.parent_names(head.clone()).await?;
                            tracing::trace!(target: "dag::assign", "visit {:?} ({:?}) with parents {:?}", &head, known_id, &parents);
                            // Push `Assign` below the parents' `Visit` entries:
                            // the parents resolve to `AssignedId`s (appended to
                            // `parent_ids`) before this `Assign` is popped.
                            todo_stack.push(Assign {
                                head,
                                known_id,
                                parent_len: parents.len(),
                                order,
                            });
                            let mut visit = parents;
                            match order {
                                VisitOrder::FirstFirst => {}
                                VisitOrder::FirstLast => visit.reverse(),
                            }
                            for (i, p) in visit.into_iter().enumerate() {
                                // If the parent was not assigned, or was assigned to a higher group,
                                // (re-)assign the parent to this group.
                                match self.vertex_id_with_max_group(&p, group).await {
                                    Ok(Some(id)) if covered_ids.contains(id) => {
                                        todo_stack.push(AssignedId { id })
                                    }
                                    // Go deeper if IdMap has the entry but IdDag misses it.
                                    Ok(Some(id)) => todo_stack.push(Visit {
                                        head: p,
                                        known_id: Some(id),
                                        order,
                                    }),
                                    Ok(None) => {
                                        // Only the first parent under `FirstFirst`
                                        // keeps that order; all other ancestor
                                        // walks use first-parent-last (see the
                                        // visit-order discussion above).
                                        let parent_order = match (order, i) {
                                            (VisitOrder::FirstFirst, 0) => VisitOrder::FirstFirst,
                                            _ => VisitOrder::FirstLast,
                                        };
                                        todo_stack.push(Visit {
                                            head: p,
                                            known_id: None,
                                            order: parent_order,
                                        })
                                    }
                                    Err(e) => return Err(e),
                                }
                            }
                        }
                    }
                }
                Assign {
                    head,
                    known_id,
                    parent_len,
                    order,
                } => {
                    // This vertex's parents occupy the tail of `parent_ids`.
                    let parent_start = parent_ids.len() - parent_len;
                    let known_id = match known_id {
                        Some(id) => Some(id),
                        None => self.vertex_id_with_max_group(&head, group).await?,
                    };
                    let id = match known_id {
                        Some(id) if covered_ids.contains(id) => id,
                        _ => {
                            // Recover the original parent order. `FirstFirst`
                            // pushed `Visit`s in parent order, so the LIFO stack
                            // appended their ids reversed; undo that here.
                            let parents = match order {
                                VisitOrder::FirstLast => Cow::Borrowed(&parent_ids[parent_start..]),
                                VisitOrder::FirstFirst => Cow::Owned(
                                    parent_ids[parent_start..]
                                        .iter()
                                        .cloned()
                                        .rev()
                                        .collect::<Vec<_>>(),
                                ),
                            };
                            let id = match known_id {
                                Some(id) => id,
                                None => {
                                    // Candidate: one past the max parent id,
                                    // clamped up to the group's minimum id.
                                    let candidate_id = match parents.iter().max() {
                                        Some(&max_parent_id) => {
                                            (max_parent_id + 1).max(group.min_id())
                                        }
                                        None => group.min_id(),
                                    };
                                    // Skip over covered/reserved/already-mapped ids.
                                    adjust_candidate_id(
                                        self,
                                        covered_ids,
                                        reserved_ids,
                                        candidate_id,
                                    )
                                    .await?
                                }
                            };
                            // The adjusted id must stay within the requested group.
                            if id.group() != group {
                                return Err(Error::IdOverflow(group));
                            }
                            covered_ids.push(id);
                            if known_id.is_none() {
                                tracing::trace!(target: "dag::assign", "assign {:?} = {:?}", &head, id);
                                self.insert(id, head.as_ref()).await?;
                            } else {
                                tracing::trace!(target: "dag::assign", "assign {:?} = {:?} (known)", &head, id);
                            }
                            // Sanity check: an id must be greater than all of its
                            // parents' ids (topological order invariant).
                            if parents.iter().any(|&p| p >= id) {
                                return bug(format!(
                                    "IdMap Ids are not topo-sorted: {:?} ({:?}) has parent ids {:?}",
                                    id, head, &parents,
                                ));
                            }
                            outcome.push_edge(id, &parents);
                            id
                        }
                    };
                    // Pop this vertex's parent ids; they are fully consumed.
                    parent_ids.truncate(parent_start);
                    todo_stack.push(AssignedId { id });
                }
                AssignedId { id } => {
                    if !covered_ids.contains(id) {
                        return bug(format!(
                            concat!(
                                "attempted to assign id with wrong order: {:?} ",
                                "is being pushed as parent id but it cannot be used ",
                                "because it is not yet covered by IdDag",
                            ),
                            &id
                        ));
                    }
                    parent_ids.push(id);
                }
            }
        }

        Ok(outcome)
    }
}
334
335/// Pick a minimal `n`, so `candidate_id + n` is an `Id` that is not "covered",
336/// not "reserved", and not in the "map". Return the picked `Id`.
337async fn adjust_candidate_id(
338 map: &(impl IdConvert + ?Sized),
339 covered_ids: &IdSet,
340 reserved_ids: &IdSet,
341 mut candidate_id: Id,
342) -> Result<Id> {
343 loop {
344 // (Fast) test using covered_ids + reserved_ids.
345 loop {
346 if let Some(span) = covered_ids.span_contains(candidate_id) {
347 candidate_id = span.high + 1;
348 continue;
349 }
350 if let Some(span) = reserved_ids.span_contains(candidate_id) {
351 candidate_id = span.high + 1;
352 continue;
353 }
354 break;
355 }
356 // (Slow) test using the IdMap.
357 // PERF: This lacks of batching if it forms a loop. But it
358 // is also expected to be rare - only when the server
359 // tailer (assuming only one tailer is writing globally) is
360 // killed abnormally, *and* the branch being assigned has
361 // non-fast-forward move, this code path becomes useful.
362 //
363 // Technically, not using `locally` is more correct in a
364 // lazy `IdMap`. However, lazy `IdMap` is only used by
365 // client (local) dag, which ensures `IdMap` and `IdDag`
366 // are in-sync, meaning that the above `covered_ids` check
367 // is sufficient. So this is really only protecting the
368 // server's out-of-sync `IdMap` use-case, where the
369 // `locally` variant is the same as the non-`locally`,
370 // since the server has a non-lazy `IdMap`.
371 if let [true] = &map.contains_vertex_id_locally(&[candidate_id]).await?[..] {
372 candidate_id = candidate_id + 1;
373 } else {
374 break;
375 }
376 }
377 Ok(candidate_id)
378}
379
// Blanket implementation: any type that can convert between ids and names
// (`IdConvert`) and write map entries (`IdMapWrite`) gets `assign_head` via
// the trait's default method.
impl<T> IdMapAssignHead for T where T: IdConvert + IdMapWrite {}
381
/// Write operations for IdMap.
#[async_trait::async_trait]
pub trait IdMapWrite {
    /// Insert a new `(id, name)` pair to the map.
    ///
    /// The `id` and `name` mapping should be unique, it's an error to map an id
    /// to multiple names, or map a name to multiple ids. Note: older versions
    /// of `IdMap` allowed mapping a name to a non-master Id, then a master Id
    /// (in this order), and the master Id is used for lookups. This is no
    /// longer permitted.
    async fn insert(&mut self, id: Id, name: &[u8]) -> Result<()>;
    /// Remove ids in the range `low..=high` and their associated names.
    /// Return removed names.
    ///
    /// NOTE(review): the order of the returned names appears unspecified —
    /// the tests in this file sort before comparing.
    async fn remove_range(&mut self, low: Id, high: Id) -> Result<Vec<Vertex>>;
}
397
398#[cfg(test)]
399mod tests {
400 use nonblocking::non_blocking_result as r;
401 #[cfg(feature = "indexedlog-backend")]
402 use tempfile::tempdir;
403
404 use super::*;
405 #[cfg(feature = "indexedlog-backend")]
406 use crate::ops::Persist;
407 #[cfg(feature = "indexedlog-backend")]
408 use crate::ops::PrefixLookup;
409 use crate::tests::dbg;
410 use crate::tests::nid;
411 use crate::tests::vid;
412
    #[cfg(feature = "indexedlog-backend")]
    #[test]
    fn test_basic_operations() {
        let dir = tempdir().unwrap();
        let mut map = IdMap::open(dir.path()).unwrap();
        // Take the lock and reload before writing (Persist protocol).
        let lock = map.lock().unwrap();
        map.reload(&lock).unwrap();
        map.insert(Id(1), b"abc").unwrap();
        map.insert(Id(2), b"def").unwrap();
        map.insert(Id(10), b"ghi").unwrap();
        map.insert(Id(11), b"ghi").unwrap_err(); // ghi maps to 10
        map.insert(Id(10), b"ghi2").unwrap_err(); // 10 maps to ghi

        // Test another group.
        let id = Group::NON_MASTER.min_id();
        map.insert(id, b"jkl").unwrap();
        map.insert(id, b"jkl").unwrap();
        map.insert(id, b"jkl2").unwrap_err(); // id maps to jkl
        map.insert(id + 1, b"jkl2").unwrap();
        map.insert(id + 2, b"jkl2").unwrap_err(); // jkl2 maps to id + 1
        map.insert(Id(15), b"jkl2").unwrap_err(); // reassign jkl2 to master group - error.
        map.insert(id + 3, b"abc").unwrap_err(); // reassign abc to non-master group - error.

        // The 3rd group (virtual; see `vid`). Re-inserting the same pair is ok.
        map.insert(vid(0), b"xy").unwrap();
        map.insert(vid(0), b"xy").unwrap();

        map.insert(vid(0), b"xyz").unwrap_err();
        map.insert(vid(1), b"xy").unwrap_err();
        map.insert(nid(1), b"xy").unwrap_err();

        map.insert(vid(1), b"xyz").unwrap();
        map.insert(vid(2), b"jkl3").unwrap();

        // Test hex lookup.
        assert_eq!(0x6a, b'j');
        assert_eq!(
            r(map.vertexes_by_hex_prefix(b"6a", 3)).unwrap(),
            [
                // Virtual "jkl3" requires full hex match so not here.
                Vertex::from(&b"jkl"[..]),
                Vertex::from(&b"jkl2"[..]),
            ]
        );
        assert_eq!(
            // Virtual "jkl3" matches when the hex is full.
            r(map.vertexes_by_hex_prefix(b"6a6b6c33", 3)).unwrap(),
            [Vertex::from(&b"jkl3"[..])]
        );
        assert_eq!(
            r(map.vertexes_by_hex_prefix(b"6a", 1)).unwrap(),
            [Vertex::from(&b"jkl"[..])]
        );
        assert!(r(map.vertexes_by_hex_prefix(b"6b", 1)).unwrap().is_empty());

        // 'xy' and 'xyz' are virtual, requires full hex to match
        assert_eq!(0x78, b'x');
        assert_eq!(
            r(map.vertexes_by_hex_prefix(b"78", 3)).unwrap(),
            &[] as &[Vertex],
        );
        assert_eq!(
            // Virtual "xy" matches when the hex is full.
            r(map.vertexes_by_hex_prefix(b"7879", 3)).unwrap(),
            [Vertex::from(&b"xy"[..])]
        );
        assert_eq!(
            // Virtual "xyz" matches when the hex is full.
            r(map.vertexes_by_hex_prefix(b"78797a", 3)).unwrap(),
            [Vertex::from(&b"xyz"[..])]
        );

        // Run the lookups twice; results must be identical both times.
        for _ in 0..=1 {
            assert_eq!(map.find_name_by_id(Id(1)).unwrap().unwrap(), b"abc");
            assert_eq!(map.find_name_by_id(Id(2)).unwrap().unwrap(), b"def");
            assert!(map.find_name_by_id(Id(3)).unwrap().is_none());
            assert_eq!(map.find_name_by_id(Id(10)).unwrap().unwrap(), b"ghi");
            assert_eq!(map.find_name_by_id(vid(1)).unwrap().unwrap(), b"xyz");

            assert_eq!(map.find_id_by_name(b"abc").unwrap().unwrap().0, 1);
            assert_eq!(map.find_id_by_name(b"def").unwrap().unwrap().0, 2);
            assert_eq!(map.find_id_by_name(b"ghi").unwrap().unwrap().0, 10);
            assert_eq!(map.find_id_by_name(b"jkl").unwrap().unwrap(), id);
            assert_eq!(map.find_id_by_name(b"jkl2").unwrap().unwrap(), nid(1));
            assert_eq!(map.find_id_by_name(b"jkl3").unwrap().unwrap(), vid(2));

            assert!(map.find_id_by_name(b"jkl4").unwrap().is_none());
        }

        // Test Debug
        assert_eq!(
            dbg(&map),
            r#"IdMap {
  abc: 1,
  def: 2,
  ghi: 10,
  jkl: N0,
  jkl2: N1,
  xy: V0,
  xyz: V1,
  jkl3: V2,
}
"#
        );
    }
518
519 #[test]
520 fn test_remove_range() {
521 let map = MemIdMap::new();
522 check_remove_range(map);
523
524 #[cfg(feature = "indexedlog-backend")]
525 {
526 let dir = tempdir().unwrap();
527 let path = dir.path();
528 let map = IdMap::open(path).unwrap();
529 check_remove_range(map);
530 }
531 }
532
    /// Exercise `remove_range` on any `IdMapWrite` implementation, verifying
    /// that the id->name and name->id indexes stay in sync after deletions.
    fn check_remove_range(mut map: impl IdConvert + IdMapWrite) {
        // Fixture entries spanning the plain (master), `nid` (non-master) and
        // `vid` (virtual) groups, with gaps to exercise range boundaries.
        let items: &[(Id, &[u8])] = &[
            (Id(0), b"z"),
            (Id(1), b"a"),
            (Id(2), b"bbb"),
            (Id(3), b"bb"),
            (Id(4), b"cc"),
            (Id(5), b"ccc"),
            (Id(9), b"ddd"),
            (Id(11), b"e"),
            (Id(13), b"ff"),
            (nid(0), b"n"),
            (nid(1), b"n1"),
            (nid(2), b"n2"),
            (nid(3), b"n3"),
            (nid(4), b"n4"),
            (nid(5), b"n5"),
            (nid(12), b"n12"),
            (nid(20), b"n20"),
            (vid(0), b"v0"),
            (vid(1), b"v1"),
            (vid(10), b"v10"),
        ];
        for (id, name) in items {
            r(map.insert(*id, name)).unwrap();
        }

        // deleted ids in a string, with extra consistency checks.
        // An entry counts as deleted only when BOTH directions are gone;
        // a one-sided deletion is an index inconsistency and panics.
        let deleted = |map: &dyn IdConvert| -> String {
            let mut deleted_ids = Vec::new();
            for (id, name) in items {
                let name = Vertex::copy_from(name);
                let id = *id;
                let has_id = r(map.contains_vertex_id_locally(&[id])).unwrap()[0];
                let lookup_id = r(map.vertex_id_optional(&name)).unwrap();
                // Only attempt id->name lookup when the id is still present.
                let lookup_name = if has_id {
                    Some(r(map.vertex_name(id)).unwrap())
                } else {
                    None
                };

                match (lookup_id, lookup_name) {
                    (None, None) => deleted_ids.push(id),
                    (None, Some(_)) => {
                        panic!("name->id deleted but not id->name: ({:?} {:?})", id, name)
                    }
                    (Some(_), None) => {
                        panic!("id->name deleted but not name->id: ({:?} {:?})", id, name)
                    }
                    // Still present: both lookups must round-trip exactly.
                    (Some(lid), Some(lname)) => {
                        assert_eq!(lid, id);
                        assert_eq!(lname, name);
                    }
                }
            }
            dbg(deleted_ids)
        };

        // Sort removed names before formatting; removal order is not asserted.
        let f = |vs: Vec<Vertex>| -> String {
            let mut vs = vs;
            vs.sort_unstable();
            dbg(vs)
        };

        let removed = r(map.remove_range(Id(1), Id(3))).unwrap();
        assert_eq!(f(removed), "[a, bb, bbb]");
        assert_eq!(deleted(&map), "[1, 2, 3]");

        // Range with gaps - only existing ids (9, 11) are removed.
        let removed = r(map.remove_range(Id(8), Id(12))).unwrap();
        assert_eq!(f(removed), "[ddd, e]");
        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11]");

        let removed = r(map.remove_range(nid(2), nid(4))).unwrap();
        assert_eq!(f(removed), "[n2, n3, n4]");
        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11, N2, N3, N4]");

        // Open-ended style removal: high end far beyond existing ids.
        let removed = r(map.remove_range(nid(20), nid(10000))).unwrap();
        assert_eq!(f(removed), "[n20]");
        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11, N2, N3, N4, N20]");

        let removed = r(map.remove_range(vid(0), vid(1))).unwrap();
        assert_eq!(f(removed), "[v0, v1]");
        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11, N2, N3, N4, N20, V0, V1]");

        // Removing an already-removed range only drops the remaining id.
        let removed = r(map.remove_range(vid(0), vid(10000))).unwrap();
        assert_eq!(f(removed), "[v10]");
        assert_eq!(
            deleted(&map),
            "[1, 2, 3, 9, 11, N2, N3, N4, N20, V0, V1, V10]"
        );
    }
624}