dag/
idmap.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8//! # idmap
9//!
10//! See [`IdMap`] for the main structure.
11
12use std::borrow::Cow;
13
14use crate::errors::bug;
15use crate::id::Group;
16use crate::id::Id;
17use crate::id::VertexName;
18use crate::ops::IdConvert;
19use crate::ops::Parents;
20use crate::segment::PreparedFlatSegments;
21use crate::types_ext::PreparedFlatSegmentsExt;
22use crate::Error;
23use crate::IdSet;
24use crate::Result;
25
26#[cfg(any(test, feature = "indexedlog-backend"))]
27mod indexedlog_idmap;
28mod mem_idmap;
29
30#[cfg(any(test, feature = "indexedlog-backend"))]
31pub use indexedlog_idmap::IdMap;
32pub(crate) use mem_idmap::CoreMemIdMap;
33pub use mem_idmap::MemIdMap;
34
35/// DAG-aware write operations.
36#[async_trait::async_trait]
37pub trait IdMapAssignHead: IdConvert + IdMapWrite {
38    /// Assign an id for a head in a DAG. This implies ancestors of the
39    /// head will also have ids assigned.
40    ///
41    /// This function is incremental. If the head or any of its ancestors
42    /// already have an id stored in this map, the existing ids will be
43    /// reused.
44    ///
45    /// This function needs roughly `O(N)` heap memory. `N` is the number of
46    /// ids to assign. When `N` is very large, try assigning ids to a known
47    /// ancestor first.
48    ///
49    /// New `id`s inserted by this function will have the specified `group`.
50    /// Existing `id`s that are ancestors of `head` will get re-assigned
51    /// if they have a higher `group`.
52    ///
53    /// `covered_ids` specifies what ranges of `Id`s are already covered.
54    /// This is usually obtained from `IdDag::all_ids_in_groups(&Group::ALL)`.
55    /// `IdMap` itself might not be able to provide that information
56    /// efficiently because it might be lazy. `covered_ids` will be updated
57    /// to cover newly inserted `Id`s.
58    ///
59    /// `reserved_ids` specifies what ranges are reserved for future growth
60    /// of other important heads (usually a couple of mainline branches that
61    /// are long-lived, growing, and used by many people). This is useful
62    /// to reduce fragmentation.
63    async fn assign_head(
64        &mut self,
65        head: VertexName,
66        parents_by_name: &dyn Parents,
67        group: Group,
68        covered_ids: &mut IdSet,
69        reserved_ids: &IdSet,
70    ) -> Result<PreparedFlatSegments> {
71        // There are some interesting cases to optimize the numbers:
72        //
73        // C     For a merge C, it has choice to assign numbers to A or B
74        // |\    first (A and B are abstract branches that have many nodes).
75        // A B   Suppose branch A is linear and B have merges, and D is
76        // |/    (::A & ::B). Then:
77        // D
78        //
79        // - If `D` is empty or already assigned, it's better to assign A last.
80        //   This is because (A+C) can then always form a segment regardless of
81        //   the complexity of B:
82        //
83        //      B   A   C       vs.        A   B   C
84        //     ~~~  ^^^^^                     ~~~
85        //     xxxxxx                          *****
86        //                                 xxxxx
87        //
88        //   [~]: Might be complex (ex. many segments)
89        //   [^]: Can always form a segment. (better)
90        //   [*]: Can only be a segment if segment size is large enough.
91        //   [x]: Cannot form a segment.
92        //
93        // - If `D` is not empty (and not assigned), it _might_ be better to
94        //   assign D and A first. This provides benefits for A and D to be
95        //   continuous, with the downside that A and C are not continuous.
96        //
97        //   A typical pattern is one branch continuously merges into the other
98        //   (see also segmented-changelog.pdf, page 19):
99        //
100        //        B---D---F
101        //         \   \   \
102        //      A---C---E---G
103        //
104        // The code below is optimized for cases where p1 branch is linear,
105        // but p2 branch is not.
106        //
107        // However, the above visit order (first parent last) is not optimal
108        // for incremental build case with pushrebase branches. Because
109        // pushrebase uses the first parent as the mainline. For example:
110        //
111        //    A---B---C-...-D---M  (parents(M) = [D, F])
112        //                     /
113        //              E-...-F
114        //
115        // The A ... M branch is the mainline. The head of the mainline
116        // was A ... D, then M. An incremental build job might have built up
117        // A, B, ..., D before it sees M. In this case it's better to make
118        // the incremental build finish the A ... D part before jumping to
119        // E ... F.
120        //
121        // We choose first parent last order if `covered` is empty, or when
122        // visiting ancestors of non-first parents.
123        let mut outcome = PreparedFlatSegments::default();
124
125        #[derive(Copy, Clone, Debug)]
126        enum VisitOrder {
127            /// Visit the first parent first.
128            FirstFirst,
129            /// Visit the first parent last.
130            FirstLast,
131        }
132
133        // Emulate the stack in heap to avoid overflow.
134        #[derive(Debug)]
135        enum Todo {
136            /// Visit parents. Finally assign self. This will eventually turn into AssignedId.
137            Visit {
138                head: VertexName,
139
140                /// The `Id` in `IdMap` that is known assigned to the `head`.
141                /// This can be non-`None` if `IdMap` has more entries than `IdDag`.
142                known_id: Option<Id>,
143
144                order: VisitOrder,
145            },
146
147            /// Assign an `Id` if not assigned. Their parents are prepared in the
148            /// `parent_ids` stack. `Assign` `head` and `Visit` `head`'s parents
149            /// are pushed together so the `Visit` entries can turn into `Id`s in
150            /// the `parent_ids` stack.
151            Assign {
152                /// The vertex to assign. Its parents are already visited and assigned.
153                head: VertexName,
154
155                /// The `Id` in `IdMap` that is known assigned to the `head`.
156                /// This can be non-`None` if `IdMap` has more entries than `IdDag`.
157                known_id: Option<Id>,
158
159                /// The number of parents, at the end of the `parent_ids`.
160                parent_len: usize,
161
162                /// The order of parents if extracted from `parent_ids`.
163                order: VisitOrder,
164            },
165
166            /// Assigned Id. Will be picked by and pushed to the current `parent_ids` stack.
167            AssignedId { id: Id },
168        }
169        use Todo::Assign;
170        use Todo::AssignedId;
171        use Todo::Visit;
172        let mut parent_ids: Vec<Id> = Vec::new();
173
174        let mut todo_stack: Vec<Todo> = {
175            let order = if covered_ids.is_empty() {
176                // Assume re-building from scratch.
177                VisitOrder::FirstLast
178            } else {
179                // Assume incremental updates with pushrebase.
180                VisitOrder::FirstFirst
181            };
182            vec![Visit {
183                head: head.clone(),
184                known_id: None,
185                order,
186            }]
187        };
188        while let Some(todo) = todo_stack.pop() {
189            tracing::trace!(target: "dag::assign", "todo: {:?}", &todo);
190            match todo {
191                Visit {
192                    head,
193                    known_id,
194                    order,
195                } => {
196                    // If the id was not assigned, or was assigned to a higher group,
197                    // (re-)assign it to this group.
198                    //
199                    // PERF: This might trigger remote fetch too frequently.
200                    let known_id = match known_id {
201                        Some(id) => Some(id),
202                        None => self.vertex_id_with_max_group(&head, group).await?,
203                    };
204                    match known_id {
205                        Some(id) if covered_ids.contains(id) => todo_stack.push(AssignedId { id }),
206                        _ => {
207                            let parents = parents_by_name.parent_names(head.clone()).await?;
208                            tracing::trace!(target: "dag::assign", "visit {:?} ({:?}) with parents {:?}", &head, known_id, &parents);
209                            todo_stack.push(Assign {
210                                head,
211                                known_id,
212                                parent_len: parents.len(),
213                                order,
214                            });
215                            let mut visit = parents;
216                            match order {
217                                VisitOrder::FirstFirst => {}
218                                VisitOrder::FirstLast => visit.reverse(),
219                            }
220                            for (i, p) in visit.into_iter().enumerate() {
221                                // If the parent was not assigned, or was assigned to a higher group,
222                                // (re-)assign the parent to this group.
223                                match self.vertex_id_with_max_group(&p, group).await {
224                                    Ok(Some(id)) if covered_ids.contains(id) => {
225                                        todo_stack.push(AssignedId { id })
226                                    }
227                                    // Go deeper if IdMap has the entry but IdDag misses it.
228                                    Ok(Some(id)) => todo_stack.push(Visit {
229                                        head: p,
230                                        known_id: Some(id),
231                                        order,
232                                    }),
233                                    Ok(None) => {
234                                        let parent_order = match (order, i) {
235                                            (VisitOrder::FirstFirst, 0) => VisitOrder::FirstFirst,
236                                            _ => VisitOrder::FirstLast,
237                                        };
238                                        todo_stack.push(Visit {
239                                            head: p,
240                                            known_id: None,
241                                            order: parent_order,
242                                        })
243                                    }
244                                    Err(e) => return Err(e),
245                                }
246                            }
247                        }
248                    }
249                }
250                Assign {
251                    head,
252                    known_id,
253                    parent_len,
254                    order,
255                } => {
256                    let parent_start = parent_ids.len() - parent_len;
257                    let known_id = match known_id {
258                        Some(id) => Some(id),
259                        None => self.vertex_id_with_max_group(&head, group).await?,
260                    };
261                    let id = match known_id {
262                        Some(id) if covered_ids.contains(id) => id,
263                        _ => {
264                            let parents = match order {
265                                VisitOrder::FirstLast => Cow::Borrowed(&parent_ids[parent_start..]),
266                                VisitOrder::FirstFirst => Cow::Owned(
267                                    parent_ids[parent_start..]
268                                        .iter()
269                                        .cloned()
270                                        .rev()
271                                        .collect::<Vec<_>>(),
272                                ),
273                            };
274                            let id = match known_id {
275                                Some(id) => id,
276                                None => {
277                                    let candidate_id = match parents.iter().max() {
278                                        Some(&max_parent_id) => {
279                                            (max_parent_id + 1).max(group.min_id())
280                                        }
281                                        None => group.min_id(),
282                                    };
283                                    adjust_candidate_id(
284                                        self,
285                                        covered_ids,
286                                        reserved_ids,
287                                        candidate_id,
288                                    )
289                                    .await?
290                                }
291                            };
292                            if id.group() != group {
293                                return Err(Error::IdOverflow(group));
294                            }
295                            covered_ids.push(id);
296                            if known_id.is_none() {
297                                tracing::trace!(target: "dag::assign", "assign {:?} = {:?}", &head, id);
298                                self.insert(id, head.as_ref()).await?;
299                            } else {
300                                tracing::trace!(target: "dag::assign", "assign {:?} = {:?} (known)", &head, id);
301                            }
302                            if parents.iter().any(|&p| p >= id) {
303                                return bug(format!(
304                                    "IdMap Ids are not topo-sorted: {:?} ({:?}) has parent ids {:?}",
305                                    id, head, &parents,
306                                ));
307                            }
308                            outcome.push_edge(id, &parents);
309                            id
310                        }
311                    };
312                    parent_ids.truncate(parent_start);
313                    todo_stack.push(AssignedId { id });
314                }
315                AssignedId { id } => {
316                    if !covered_ids.contains(id) {
317                        return bug(format!(
318                            concat!(
319                                "attempted to assign id with wrong order: {:?} ",
320                                "is being pushed as parent id but it cannot be used ",
321                                "because it is not yet covered by IdDag",
322                            ),
323                            &id
324                        ));
325                    }
326                    parent_ids.push(id);
327                }
328            }
329        }
330
331        Ok(outcome)
332    }
333}
334
335/// Pick a minimal `n`, so `candidate_id + n` is an `Id` that is not "covered",
336/// not "reserved", and not in the "map".  Return the picked `Id`.
337async fn adjust_candidate_id(
338    map: &(impl IdConvert + ?Sized),
339    covered_ids: &IdSet,
340    reserved_ids: &IdSet,
341    mut candidate_id: Id,
342) -> Result<Id> {
343    loop {
344        // (Fast) test using covered_ids + reserved_ids.
345        loop {
346            if let Some(span) = covered_ids.span_contains(candidate_id) {
347                candidate_id = span.high + 1;
348                continue;
349            }
350            if let Some(span) = reserved_ids.span_contains(candidate_id) {
351                candidate_id = span.high + 1;
352                continue;
353            }
354            break;
355        }
356        // (Slow) test using the IdMap.
357        let new_candidate_id = ensure_id_not_exist_in_map(map, candidate_id).await?;
358        if new_candidate_id == candidate_id {
359            break;
360        } else {
361            // Check the covered_ids + reserved_ids.
362            candidate_id = new_candidate_id;
363        }
364    }
365    Ok(candidate_id)
366}
367
368/// Pick a minimal `n`, so `candidate_id + n` is an `Id` that is not in the
369/// "map". Return the picked `Id`.
370async fn ensure_id_not_exist_in_map(
371    map: &(impl IdConvert + ?Sized),
372    mut candidate_id: Id,
373) -> Result<Id> {
374    // PERF: This lacks of batching if it forms a loop. But it
375    // is also expected to be rare - only when the server
376    // tailer (assuming only one tailer is writing globally) is
377    // killed abnormally, *and* the branch being assigned has
378    // non-fast-forward move, this code path becomes useful.
379    //
380    // Technically, not using `locally` is more correct in a
381    // lazy `IdMap`. However, lazy `IdMap` is only used by
382    // client (local) dag, which ensures `IdMap` and `IdDag`
383    // are in-sync, meaning that the above `covered_ids` check
384    // is sufficient. So this is really only protecting the
385    // server's out-of-sync `IdMap` use-case, where the
386    // `locally` variant is the same as the non-`locally`,
387    // since the server has a non-lazy `IdMap`.
388    while let [true] = &map.contains_vertex_id_locally(&[candidate_id]).await?[..] {
389        candidate_id = candidate_id + 1;
390    }
391    Ok(candidate_id)
392}
393
394impl<T> IdMapAssignHead for T where T: IdConvert + IdMapWrite {}
395
396/// Write operations for IdMap.
397#[async_trait::async_trait]
398pub trait IdMapWrite {
399    /// Insert a new `(id, name)` pair to the map.
400    ///
401    /// The `id` and `name` mapping should be unique, it's an error to map an id
402    /// to multiple names, or map a name to multiple ids. Note: older versions
403    /// of `IdMap` allowed mapping a name to a non-master Id, then a master Id
404    /// (in this order), and the master Id is used for lookups. This is no
405    /// longer permitted.
406    async fn insert(&mut self, id: Id, name: &[u8]) -> Result<()>;
407    /// Remove ids in the range `low..=high` and their associated names.
408    /// Return removed names.
409    async fn remove_range(&mut self, low: Id, high: Id) -> Result<Vec<VertexName>>;
410}
411
412#[cfg(test)]
413mod tests {
414    use nonblocking::non_blocking_result as r;
415    #[cfg(feature = "indexedlog-backend")]
416    use tempfile::tempdir;
417
418    use super::*;
419    #[cfg(feature = "indexedlog-backend")]
420    use crate::ops::Persist;
421    #[cfg(feature = "indexedlog-backend")]
422    use crate::ops::PrefixLookup;
423
424    #[cfg(feature = "indexedlog-backend")]
425    #[test]
426    fn test_basic_operations() {
427        let dir = tempdir().unwrap();
428        let mut map = IdMap::open(dir.path()).unwrap();
429        let lock = map.lock().unwrap();
430        map.reload(&lock).unwrap();
431        map.insert(Id(1), b"abc").unwrap();
432        map.insert(Id(2), b"def").unwrap();
433        map.insert(Id(10), b"ghi").unwrap();
434        map.insert(Id(11), b"ghi").unwrap_err(); // ghi maps to 10
435        map.insert(Id(10), b"ghi2").unwrap_err(); // 10 maps to ghi
436
437        // Test another group.
438        let id = Group::NON_MASTER.min_id();
439        map.insert(id, b"jkl").unwrap();
440        map.insert(id, b"jkl").unwrap();
441        map.insert(id, b"jkl2").unwrap_err(); // id maps to jkl
442        map.insert(id + 1, b"jkl2").unwrap();
443        map.insert(id + 2, b"jkl2").unwrap_err(); // jkl2 maps to id + 1
444        map.insert(Id(15), b"jkl2").unwrap_err(); // reassign jkl2 to master group - error.
445        map.insert(id + 3, b"abc").unwrap_err(); // reassign abc to non-master group - error.
446
447        // Test hex lookup.
448        assert_eq!(0x6a, b'j');
449        assert_eq!(
450            r(map.vertexes_by_hex_prefix(b"6a", 3)).unwrap(),
451            [
452                VertexName::from(&b"jkl"[..]),
453                VertexName::from(&b"jkl2"[..])
454            ]
455        );
456        assert_eq!(
457            r(map.vertexes_by_hex_prefix(b"6a", 1)).unwrap(),
458            [VertexName::from(&b"jkl"[..])]
459        );
460        assert!(r(map.vertexes_by_hex_prefix(b"6b", 1)).unwrap().is_empty());
461
462        for _ in 0..=1 {
463            assert_eq!(map.find_name_by_id(Id(1)).unwrap().unwrap(), b"abc");
464            assert_eq!(map.find_name_by_id(Id(2)).unwrap().unwrap(), b"def");
465            assert!(map.find_name_by_id(Id(3)).unwrap().is_none());
466            assert_eq!(map.find_name_by_id(Id(10)).unwrap().unwrap(), b"ghi");
467
468            assert_eq!(map.find_id_by_name(b"abc").unwrap().unwrap().0, 1);
469            assert_eq!(map.find_id_by_name(b"def").unwrap().unwrap().0, 2);
470            assert_eq!(map.find_id_by_name(b"ghi").unwrap().unwrap().0, 10);
471            assert_eq!(map.find_id_by_name(b"jkl").unwrap().unwrap(), id);
472            assert_eq!(
473                format!("{:?}", map.find_id_by_name(b"jkl2").unwrap().unwrap()),
474                "N1"
475            );
476            assert!(map.find_id_by_name(b"jkl3").unwrap().is_none());
477        }
478
479        // Test Debug
480        assert_eq!(
481            format!("{:?}", &map),
482            r#"IdMap {
483  abc: 1,
484  def: 2,
485  ghi: 10,
486  jkl: N0,
487  jkl2: N1,
488}
489"#
490        );
491    }
492
493    #[test]
494    fn test_remove_range() {
495        let map = MemIdMap::new();
496        check_remove_range(map);
497
498        #[cfg(feature = "indexedlog-backend")]
499        {
500            let dir = tempdir().unwrap();
501            let path = dir.path();
502            let map = IdMap::open(path).unwrap();
503            check_remove_range(map);
504        }
505    }
506
507    fn check_remove_range(mut map: impl IdConvert + IdMapWrite) {
508        let items: &[(Id, &[u8])] = &[
509            (Id(0), b"z"),
510            (Id(1), b"a"),
511            (Id(2), b"bbb"),
512            (Id(3), b"bb"),
513            (Id(4), b"cc"),
514            (Id(5), b"ccc"),
515            (Id(9), b"ddd"),
516            (Id(11), b"e"),
517            (Id(13), b"ff"),
518            (nid(0), b"n"),
519            (nid(1), b"n1"),
520            (nid(2), b"n2"),
521            (nid(3), b"n3"),
522            (nid(4), b"n4"),
523            (nid(5), b"n5"),
524            (nid(12), b"n12"),
525            (nid(20), b"n20"),
526        ];
527        for (id, name) in items {
528            r(map.insert(*id, name)).unwrap();
529        }
530
531        // deleted ids in a string, with extra consistency checks.
532        let deleted = |map: &dyn IdConvert| -> String {
533            let mut deleted_ids = Vec::new();
534            for (id, name) in items {
535                let name = VertexName::copy_from(name);
536                let id = *id;
537                let has_id = r(map.contains_vertex_id_locally(&[id])).unwrap()[0];
538                let lookup_id = r(map.vertex_id_optional(&name)).unwrap();
539                let lookup_name = if has_id {
540                    Some(r(map.vertex_name(id)).unwrap())
541                } else {
542                    None
543                };
544
545                match (lookup_id, lookup_name) {
546                    (None, None) => deleted_ids.push(id),
547                    (None, Some(_)) => {
548                        panic!("name->id deleted but not id->name: ({:?} {:?})", id, name)
549                    }
550                    (Some(_), None) => {
551                        panic!("id->name deleted but not name->id: ({:?} {:?})", id, name)
552                    }
553                    (Some(lid), Some(lname)) => {
554                        assert_eq!(lid, id);
555                        assert_eq!(lname, name);
556                    }
557                }
558            }
559            format!("{:?}", deleted_ids)
560        };
561
562        let f = |vs: Vec<VertexName>| -> String {
563            let mut vs = vs;
564            vs.sort_unstable();
565            format!("{:?}", vs)
566        };
567
568        let removed = r(map.remove_range(Id(1), Id(3))).unwrap();
569        assert_eq!(f(removed), "[a, bb, bbb]");
570        assert_eq!(deleted(&map), "[1, 2, 3]");
571
572        let removed = r(map.remove_range(Id(8), Id(12))).unwrap();
573        assert_eq!(f(removed), "[ddd, e]");
574        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11]");
575
576        let removed = r(map.remove_range(nid(2), nid(4))).unwrap();
577        assert_eq!(f(removed), "[n2, n3, n4]");
578        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11, N2, N3, N4]");
579
580        let removed = r(map.remove_range(nid(20), nid(10000))).unwrap();
581        assert_eq!(f(removed), "[n20]");
582        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11, N2, N3, N4, N20]");
583    }
584
585    fn nid(i: u64) -> Id {
586        Group::NON_MASTER.min_id() + i
587    }
588}