Skip to main content

dag/
idmap.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8//! # idmap
9//!
10//! See [`IdMap`] for the main structure.
11
12use std::borrow::Cow;
13
14use crate::errors::bug;
15use crate::id::Group;
16use crate::id::Id;
17use crate::id::Vertex;
18use crate::ops::IdConvert;
19use crate::ops::Parents;
20use crate::segment::PreparedFlatSegments;
21use crate::types_ext::PreparedFlatSegmentsExt;
22use crate::Error;
23use crate::IdSet;
24use crate::Result;
25
26#[cfg(any(test, feature = "indexedlog-backend"))]
27mod indexedlog_idmap;
28mod mem_idmap;
29
30#[cfg(any(test, feature = "indexedlog-backend"))]
31pub use indexedlog_idmap::IdMap;
32pub(crate) use mem_idmap::CoreMemIdMap;
33pub use mem_idmap::MemIdMap;
34
35/// DAG-aware write operations.
36#[async_trait::async_trait]
37pub trait IdMapAssignHead: IdConvert + IdMapWrite {
38    /// Assign an id for a head in a DAG. This implies ancestors of the
39    /// head will also have ids assigned.
40    ///
41    /// This function is incremental. If the head or any of its ancestors
42    /// already have an id stored in this map, the existing ids will be
43    /// reused.
44    ///
45    /// This function needs roughly `O(N)` heap memory. `N` is the number of
46    /// ids to assign. When `N` is very large, try assigning ids to a known
47    /// ancestor first.
48    ///
49    /// New `id`s inserted by this function will have the specified `group`.
50    /// Existing `id`s that are ancestors of `head` will get re-assigned
51    /// if they have a higher `group`.
52    ///
53    /// `covered_ids` specifies what ranges of `Id`s are already covered.
54    /// This is usually obtained from `IdDag::all_ids_in_groups(&Group::ALL)`.
55    /// `IdMap` itself might not be able to provide that information
56    /// efficiently because it might be lazy. `covered_ids` will be updated
57    /// to cover newly inserted `Id`s.
58    ///
59    /// `reserved_ids` specifies what ranges are reserved for future growth
60    /// of other important heads (usually a couple of mainline branches that
61    /// are long-lived, growing, and used by many people). This is useful
62    /// to reduce fragmentation.
63    async fn assign_head(
64        &mut self,
65        head: Vertex,
66        parents_by_name: &dyn Parents,
67        group: Group,
68        covered_ids: &mut IdSet,
69        reserved_ids: &IdSet,
70    ) -> Result<PreparedFlatSegments> {
71        // There are some interesting cases to optimize the numbers:
72        //
73        // C     For a merge C, it has choice to assign numbers to A or B
74        // |\    first (A and B are abstract branches that have many nodes).
75        // A B   Suppose branch A is linear and B have merges, and D is
76        // |/    (::A & ::B). Then:
77        // D
78        //
79        // - If `D` is empty or already assigned, it's better to assign A last.
80        //   This is because (A+C) can then always form a segment regardless of
81        //   the complexity of B:
82        //
83        //      B   A   C       vs.        A   B   C
84        //     ~~~  ^^^^^                     ~~~
85        //     xxxxxx                          *****
86        //                                 xxxxx
87        //
88        //   [~]: Might be complex (ex. many segments)
89        //   [^]: Can always form a segment. (better)
90        //   [*]: Can only be a segment if segment size is large enough.
91        //   [x]: Cannot form a segment.
92        //
93        // - If `D` is not empty (and not assigned), it _might_ be better to
94        //   assign D and A first. This provides benefits for A and D to be
95        //   continuous, with the downside that A and C are not continuous.
96        //
97        //   A typical pattern is one branch continuously merges into the other
98        //   (see also segmented-changelog.pdf, page 19):
99        //
100        //        B---D---F
101        //         \   \   \
102        //      A---C---E---G
103        //
104        // The code below is optimized for cases where p1 branch is linear,
105        // but p2 branch is not.
106        //
107        // However, the above visit order (first parent last) is not optimal
108        // for incremental build case with pushrebase branches. Because
109        // pushrebase uses the first parent as the mainline. For example:
110        //
111        //    A---B---C-...-D---M  (parents(M) = [D, F])
112        //                     /
113        //              E-...-F
114        //
115        // The A ... M branch is the mainline. The head of the mainline
116        // was A ... D, then M. An incremental build job might have built up
117        // A, B, ..., D before it sees M. In this case it's better to make
118        // the incremental build finish the A ... D part before jumping to
119        // E ... F.
120        //
121        // We choose first parent last order if `covered` is empty, or when
122        // visiting ancestors of non-first parents.
123        let mut outcome = PreparedFlatSegments::default();
124
125        #[derive(Copy, Clone, Debug)]
126        enum VisitOrder {
127            /// Visit the first parent first.
128            FirstFirst,
129            /// Visit the first parent last.
130            FirstLast,
131        }
132
133        // Emulate the stack in heap to avoid overflow.
134        #[derive(Debug)]
135        enum Todo {
136            /// Visit parents. Finally assign self. This will eventually turn into AssignedId.
137            Visit {
138                head: Vertex,
139
140                /// The `Id` in `IdMap` that is known assigned to the `head`.
141                /// This can be non-`None` if `IdMap` has more entries than `IdDag`.
142                known_id: Option<Id>,
143
144                order: VisitOrder,
145            },
146
147            /// Assign an `Id` if not assigned. Their parents are prepared in the
148            /// `parent_ids` stack. `Assign` `head` and `Visit` `head`'s parents
149            /// are pushed together so the `Visit` entries can turn into `Id`s in
150            /// the `parent_ids` stack.
151            Assign {
152                /// The vertex to assign. Its parents are already visited and assigned.
153                head: Vertex,
154
155                /// The `Id` in `IdMap` that is known assigned to the `head`.
156                /// This can be non-`None` if `IdMap` has more entries than `IdDag`.
157                known_id: Option<Id>,
158
159                /// The number of parents, at the end of the `parent_ids`.
160                parent_len: usize,
161
162                /// The order of parents if extracted from `parent_ids`.
163                order: VisitOrder,
164            },
165
166            /// Assigned Id. Will be picked by and pushed to the current `parent_ids` stack.
167            AssignedId { id: Id },
168        }
169        use Todo::Assign;
170        use Todo::AssignedId;
171        use Todo::Visit;
172        let mut parent_ids: Vec<Id> = Vec::new();
173
174        let mut todo_stack: Vec<Todo> = {
175            let order = if covered_ids.is_empty() {
176                // Assume re-building from scratch.
177                VisitOrder::FirstLast
178            } else {
179                // Assume incremental updates with pushrebase.
180                VisitOrder::FirstFirst
181            };
182            vec![Visit {
183                head: head.clone(),
184                known_id: None,
185                order,
186            }]
187        };
188        while let Some(todo) = todo_stack.pop() {
189            tracing::trace!(target: "dag::assign", "todo: {:?}", &todo);
190            match todo {
191                Visit {
192                    head,
193                    known_id,
194                    order,
195                } => {
196                    // If the id was not assigned, or was assigned to a higher group,
197                    // (re-)assign it to this group.
198                    //
199                    // PERF: This might trigger remote fetch too frequently.
200                    let known_id = match known_id {
201                        Some(id) => Some(id),
202                        None => self.vertex_id_with_max_group(&head, group).await?,
203                    };
204                    match known_id {
205                        Some(id) if covered_ids.contains(id) => todo_stack.push(AssignedId { id }),
206                        _ => {
207                            let parents = parents_by_name.parent_names(head.clone()).await?;
208                            tracing::trace!(target: "dag::assign", "visit {:?} ({:?}) with parents {:?}", &head, known_id, &parents);
209                            todo_stack.push(Assign {
210                                head,
211                                known_id,
212                                parent_len: parents.len(),
213                                order,
214                            });
215                            let mut visit = parents;
216                            match order {
217                                VisitOrder::FirstFirst => {}
218                                VisitOrder::FirstLast => visit.reverse(),
219                            }
220                            for (i, p) in visit.into_iter().enumerate() {
221                                // If the parent was not assigned, or was assigned to a higher group,
222                                // (re-)assign the parent to this group.
223                                match self.vertex_id_with_max_group(&p, group).await {
224                                    Ok(Some(id)) if covered_ids.contains(id) => {
225                                        todo_stack.push(AssignedId { id })
226                                    }
227                                    // Go deeper if IdMap has the entry but IdDag misses it.
228                                    Ok(Some(id)) => todo_stack.push(Visit {
229                                        head: p,
230                                        known_id: Some(id),
231                                        order,
232                                    }),
233                                    Ok(None) => {
234                                        let parent_order = match (order, i) {
235                                            (VisitOrder::FirstFirst, 0) => VisitOrder::FirstFirst,
236                                            _ => VisitOrder::FirstLast,
237                                        };
238                                        todo_stack.push(Visit {
239                                            head: p,
240                                            known_id: None,
241                                            order: parent_order,
242                                        })
243                                    }
244                                    Err(e) => return Err(e),
245                                }
246                            }
247                        }
248                    }
249                }
250                Assign {
251                    head,
252                    known_id,
253                    parent_len,
254                    order,
255                } => {
256                    let parent_start = parent_ids.len() - parent_len;
257                    let known_id = match known_id {
258                        Some(id) => Some(id),
259                        None => self.vertex_id_with_max_group(&head, group).await?,
260                    };
261                    let id = match known_id {
262                        Some(id) if covered_ids.contains(id) => id,
263                        _ => {
264                            let parents = match order {
265                                VisitOrder::FirstLast => Cow::Borrowed(&parent_ids[parent_start..]),
266                                VisitOrder::FirstFirst => Cow::Owned(
267                                    parent_ids[parent_start..]
268                                        .iter()
269                                        .cloned()
270                                        .rev()
271                                        .collect::<Vec<_>>(),
272                                ),
273                            };
274                            let id = match known_id {
275                                Some(id) => id,
276                                None => {
277                                    let candidate_id = match parents.iter().max() {
278                                        Some(&max_parent_id) => {
279                                            (max_parent_id + 1).max(group.min_id())
280                                        }
281                                        None => group.min_id(),
282                                    };
283                                    adjust_candidate_id(
284                                        self,
285                                        covered_ids,
286                                        reserved_ids,
287                                        candidate_id,
288                                    )
289                                    .await?
290                                }
291                            };
292                            if id.group() != group {
293                                return Err(Error::IdOverflow(group));
294                            }
295                            covered_ids.push(id);
296                            if known_id.is_none() {
297                                tracing::trace!(target: "dag::assign", "assign {:?} = {:?}", &head, id);
298                                self.insert(id, head.as_ref()).await?;
299                            } else {
300                                tracing::trace!(target: "dag::assign", "assign {:?} = {:?} (known)", &head, id);
301                            }
302                            if parents.iter().any(|&p| p >= id) {
303                                return bug(format!(
304                                    "IdMap Ids are not topo-sorted: {:?} ({:?}) has parent ids {:?}",
305                                    id, head, &parents,
306                                ));
307                            }
308                            outcome.push_edge(id, &parents);
309                            id
310                        }
311                    };
312                    parent_ids.truncate(parent_start);
313                    todo_stack.push(AssignedId { id });
314                }
315                AssignedId { id } => {
316                    if !covered_ids.contains(id) {
317                        return bug(format!(
318                            concat!(
319                                "attempted to assign id with wrong order: {:?} ",
320                                "is being pushed as parent id but it cannot be used ",
321                                "because it is not yet covered by IdDag",
322                            ),
323                            &id
324                        ));
325                    }
326                    parent_ids.push(id);
327                }
328            }
329        }
330
331        Ok(outcome)
332    }
333}
334
335/// Pick a minimal `n`, so `candidate_id + n` is an `Id` that is not "covered",
336/// not "reserved", and not in the "map".  Return the picked `Id`.
337async fn adjust_candidate_id(
338    map: &(impl IdConvert + ?Sized),
339    covered_ids: &IdSet,
340    reserved_ids: &IdSet,
341    mut candidate_id: Id,
342) -> Result<Id> {
343    loop {
344        // (Fast) test using covered_ids + reserved_ids.
345        loop {
346            if let Some(span) = covered_ids.span_contains(candidate_id) {
347                candidate_id = span.high + 1;
348                continue;
349            }
350            if let Some(span) = reserved_ids.span_contains(candidate_id) {
351                candidate_id = span.high + 1;
352                continue;
353            }
354            break;
355        }
356        // (Slow) test using the IdMap.
357        // PERF: This lacks of batching if it forms a loop. But it
358        // is also expected to be rare - only when the server
359        // tailer (assuming only one tailer is writing globally) is
360        // killed abnormally, *and* the branch being assigned has
361        // non-fast-forward move, this code path becomes useful.
362        //
363        // Technically, not using `locally` is more correct in a
364        // lazy `IdMap`. However, lazy `IdMap` is only used by
365        // client (local) dag, which ensures `IdMap` and `IdDag`
366        // are in-sync, meaning that the above `covered_ids` check
367        // is sufficient. So this is really only protecting the
368        // server's out-of-sync `IdMap` use-case, where the
369        // `locally` variant is the same as the non-`locally`,
370        // since the server has a non-lazy `IdMap`.
371        if let [true] = &map.contains_vertex_id_locally(&[candidate_id]).await?[..] {
372            candidate_id = candidate_id + 1;
373        } else {
374            break;
375        }
376    }
377    Ok(candidate_id)
378}
379
380impl<T> IdMapAssignHead for T where T: IdConvert + IdMapWrite {}
381
382/// Write operations for IdMap.
383#[async_trait::async_trait]
384pub trait IdMapWrite {
385    /// Insert a new `(id, name)` pair to the map.
386    ///
387    /// The `id` and `name` mapping should be unique, it's an error to map an id
388    /// to multiple names, or map a name to multiple ids. Note: older versions
389    /// of `IdMap` allowed mapping a name to a non-master Id, then a master Id
390    /// (in this order), and the master Id is used for lookups. This is no
391    /// longer permitted.
392    async fn insert(&mut self, id: Id, name: &[u8]) -> Result<()>;
393    /// Remove ids in the range `low..=high` and their associated names.
394    /// Return removed names.
395    async fn remove_range(&mut self, low: Id, high: Id) -> Result<Vec<Vertex>>;
396}
397
398#[cfg(test)]
399mod tests {
400    use nonblocking::non_blocking_result as r;
401    #[cfg(feature = "indexedlog-backend")]
402    use tempfile::tempdir;
403
404    use super::*;
405    #[cfg(feature = "indexedlog-backend")]
406    use crate::ops::Persist;
407    #[cfg(feature = "indexedlog-backend")]
408    use crate::ops::PrefixLookup;
409    use crate::tests::dbg;
410    use crate::tests::nid;
411    use crate::tests::vid;
412
413    #[cfg(feature = "indexedlog-backend")]
414    #[test]
415    fn test_basic_operations() {
416        let dir = tempdir().unwrap();
417        let mut map = IdMap::open(dir.path()).unwrap();
418        let lock = map.lock().unwrap();
419        map.reload(&lock).unwrap();
420        map.insert(Id(1), b"abc").unwrap();
421        map.insert(Id(2), b"def").unwrap();
422        map.insert(Id(10), b"ghi").unwrap();
423        map.insert(Id(11), b"ghi").unwrap_err(); // ghi maps to 10
424        map.insert(Id(10), b"ghi2").unwrap_err(); // 10 maps to ghi
425
426        // Test another group.
427        let id = Group::NON_MASTER.min_id();
428        map.insert(id, b"jkl").unwrap();
429        map.insert(id, b"jkl").unwrap();
430        map.insert(id, b"jkl2").unwrap_err(); // id maps to jkl
431        map.insert(id + 1, b"jkl2").unwrap();
432        map.insert(id + 2, b"jkl2").unwrap_err(); // jkl2 maps to id + 1
433        map.insert(Id(15), b"jkl2").unwrap_err(); // reassign jkl2 to master group - error.
434        map.insert(id + 3, b"abc").unwrap_err(); // reassign abc to non-master group - error.
435
436        // The 3rd group.
437        map.insert(vid(0), b"xy").unwrap();
438        map.insert(vid(0), b"xy").unwrap();
439
440        map.insert(vid(0), b"xyz").unwrap_err();
441        map.insert(vid(1), b"xy").unwrap_err();
442        map.insert(nid(1), b"xy").unwrap_err();
443
444        map.insert(vid(1), b"xyz").unwrap();
445        map.insert(vid(2), b"jkl3").unwrap();
446
447        // Test hex lookup.
448        assert_eq!(0x6a, b'j');
449        assert_eq!(
450            r(map.vertexes_by_hex_prefix(b"6a", 3)).unwrap(),
451            [
452                // Virutal "jkl3" requires full hex match so not here.
453                Vertex::from(&b"jkl"[..]),
454                Vertex::from(&b"jkl2"[..]),
455            ]
456        );
457        assert_eq!(
458            // Virutal "jkl3" matches when the hex is full.
459            r(map.vertexes_by_hex_prefix(b"6a6b6c33", 3)).unwrap(),
460            [Vertex::from(&b"jkl3"[..])]
461        );
462        assert_eq!(
463            r(map.vertexes_by_hex_prefix(b"6a", 1)).unwrap(),
464            [Vertex::from(&b"jkl"[..])]
465        );
466        assert!(r(map.vertexes_by_hex_prefix(b"6b", 1)).unwrap().is_empty());
467
468        // 'xy' and 'xyz' are virutal, requires full hex to match
469        assert_eq!(0x78, b'x');
470        assert_eq!(
471            r(map.vertexes_by_hex_prefix(b"78", 3)).unwrap(),
472            &[] as &[Vertex],
473        );
474        assert_eq!(
475            // Virutal "xy" matches when the hex is full.
476            r(map.vertexes_by_hex_prefix(b"7879", 3)).unwrap(),
477            [Vertex::from(&b"xy"[..])]
478        );
479        assert_eq!(
480            // Virutal "xyz" matches when the hex is full.
481            r(map.vertexes_by_hex_prefix(b"78797a", 3)).unwrap(),
482            [Vertex::from(&b"xyz"[..])]
483        );
484
485        for _ in 0..=1 {
486            assert_eq!(map.find_name_by_id(Id(1)).unwrap().unwrap(), b"abc");
487            assert_eq!(map.find_name_by_id(Id(2)).unwrap().unwrap(), b"def");
488            assert!(map.find_name_by_id(Id(3)).unwrap().is_none());
489            assert_eq!(map.find_name_by_id(Id(10)).unwrap().unwrap(), b"ghi");
490            assert_eq!(map.find_name_by_id(vid(1)).unwrap().unwrap(), b"xyz");
491
492            assert_eq!(map.find_id_by_name(b"abc").unwrap().unwrap().0, 1);
493            assert_eq!(map.find_id_by_name(b"def").unwrap().unwrap().0, 2);
494            assert_eq!(map.find_id_by_name(b"ghi").unwrap().unwrap().0, 10);
495            assert_eq!(map.find_id_by_name(b"jkl").unwrap().unwrap(), id);
496            assert_eq!(map.find_id_by_name(b"jkl2").unwrap().unwrap(), nid(1));
497            assert_eq!(map.find_id_by_name(b"jkl3").unwrap().unwrap(), vid(2));
498
499            assert!(map.find_id_by_name(b"jkl4").unwrap().is_none());
500        }
501
502        // Test Debug
503        assert_eq!(
504            dbg(&map),
505            r#"IdMap {
506  abc: 1,
507  def: 2,
508  ghi: 10,
509  jkl: N0,
510  jkl2: N1,
511  xy: V0,
512  xyz: V1,
513  jkl3: V2,
514}
515"#
516        );
517    }
518
519    #[test]
520    fn test_remove_range() {
521        let map = MemIdMap::new();
522        check_remove_range(map);
523
524        #[cfg(feature = "indexedlog-backend")]
525        {
526            let dir = tempdir().unwrap();
527            let path = dir.path();
528            let map = IdMap::open(path).unwrap();
529            check_remove_range(map);
530        }
531    }
532
533    fn check_remove_range(mut map: impl IdConvert + IdMapWrite) {
534        let items: &[(Id, &[u8])] = &[
535            (Id(0), b"z"),
536            (Id(1), b"a"),
537            (Id(2), b"bbb"),
538            (Id(3), b"bb"),
539            (Id(4), b"cc"),
540            (Id(5), b"ccc"),
541            (Id(9), b"ddd"),
542            (Id(11), b"e"),
543            (Id(13), b"ff"),
544            (nid(0), b"n"),
545            (nid(1), b"n1"),
546            (nid(2), b"n2"),
547            (nid(3), b"n3"),
548            (nid(4), b"n4"),
549            (nid(5), b"n5"),
550            (nid(12), b"n12"),
551            (nid(20), b"n20"),
552            (vid(0), b"v0"),
553            (vid(1), b"v1"),
554            (vid(10), b"v10"),
555        ];
556        for (id, name) in items {
557            r(map.insert(*id, name)).unwrap();
558        }
559
560        // deleted ids in a string, with extra consistency checks.
561        let deleted = |map: &dyn IdConvert| -> String {
562            let mut deleted_ids = Vec::new();
563            for (id, name) in items {
564                let name = Vertex::copy_from(name);
565                let id = *id;
566                let has_id = r(map.contains_vertex_id_locally(&[id])).unwrap()[0];
567                let lookup_id = r(map.vertex_id_optional(&name)).unwrap();
568                let lookup_name = if has_id {
569                    Some(r(map.vertex_name(id)).unwrap())
570                } else {
571                    None
572                };
573
574                match (lookup_id, lookup_name) {
575                    (None, None) => deleted_ids.push(id),
576                    (None, Some(_)) => {
577                        panic!("name->id deleted but not id->name: ({:?} {:?})", id, name)
578                    }
579                    (Some(_), None) => {
580                        panic!("id->name deleted but not name->id: ({:?} {:?})", id, name)
581                    }
582                    (Some(lid), Some(lname)) => {
583                        assert_eq!(lid, id);
584                        assert_eq!(lname, name);
585                    }
586                }
587            }
588            dbg(deleted_ids)
589        };
590
591        let f = |vs: Vec<Vertex>| -> String {
592            let mut vs = vs;
593            vs.sort_unstable();
594            dbg(vs)
595        };
596
597        let removed = r(map.remove_range(Id(1), Id(3))).unwrap();
598        assert_eq!(f(removed), "[a, bb, bbb]");
599        assert_eq!(deleted(&map), "[1, 2, 3]");
600
601        let removed = r(map.remove_range(Id(8), Id(12))).unwrap();
602        assert_eq!(f(removed), "[ddd, e]");
603        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11]");
604
605        let removed = r(map.remove_range(nid(2), nid(4))).unwrap();
606        assert_eq!(f(removed), "[n2, n3, n4]");
607        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11, N2, N3, N4]");
608
609        let removed = r(map.remove_range(nid(20), nid(10000))).unwrap();
610        assert_eq!(f(removed), "[n20]");
611        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11, N2, N3, N4, N20]");
612
613        let removed = r(map.remove_range(vid(0), vid(1))).unwrap();
614        assert_eq!(f(removed), "[v0, v1]");
615        assert_eq!(deleted(&map), "[1, 2, 3, 9, 11, N2, N3, N4, N20, V0, V1]");
616
617        let removed = r(map.remove_range(vid(0), vid(10000))).unwrap();
618        assert_eq!(f(removed), "[v10]");
619        assert_eq!(
620            deleted(&map),
621            "[1, 2, 3, 9, 11, N2, N3, N4, N20, V0, V1, V10]"
622        );
623    }
624}