swh_graph/
graph_builder.rs

1/*
2 * Copyright (C) 2024-2025  The Software Heritage developers
3 * See the AUTHORS file at the top-level directory of this distribution
4 * License: GNU General Public License version 3, or any later version
5 * See top-level LICENSE file for more information
6 */
7
8//! Utility to dynamically build a small graph in memory
9
10use std::collections::HashMap;
11use std::io::Write;
12
13use anyhow::{ensure, Context, Result};
14use itertools::Itertools;
15use webgraph::graphs::vec_graph::LabeledVecGraph;
16
17use crate::graph::*;
18use crate::labels::{
19    Branch, DirEntry, EdgeLabel, LabelNameId, Permission, UntypedEdgeLabel, Visit, VisitStatus,
20};
21use crate::properties;
22use crate::SwhGraphProperties;
23use crate::{NodeType, SWHID};
24
25/// How to sort label names in the produced graph. Defaults to `Lexicographic`.
26#[derive(Default, Clone, Copy, Debug)]
27pub enum LabelNamesOrder {
28    #[default]
29    /// Sort label names in their lexicographic order, as for graphs compressed with Rust
30    /// (2023-09-06 and newer)
31    Lexicographic,
32    /// Sort label names in the lexicographic order of their base64-encoding, as for graphs
33    /// compressed with Java (2022-12-07 and older)
34    LexicographicBase64,
35}
36
37// Type (alias) of the graph built by the graph builder
38pub type BuiltGraph = SwhBidirectionalGraph<
39    SwhGraphProperties<
40        properties::VecMaps,
41        properties::VecTimestamps,
42        properties::VecPersons,
43        properties::VecContents,
44        properties::VecStrings,
45        properties::VecLabelNames,
46    >,
47    LabeledVecGraph<Vec<u64>>,
48    LabeledVecGraph<Vec<u64>>,
49>;
50
51/// Dynamically builds a small graph in memory
52///
53/// # Examples
54///
55/// ## Directories and contents
56///
57/// ```
58/// use swh_graph::swhid;
59/// use swh_graph::graph_builder::GraphBuilder;
60/// use swh_graph::labels::Permission;
61///
62/// let mut builder = GraphBuilder::default();
63/// let a = builder
64///     .node(swhid!(swh:1:dir:0000000000000000000000000000000000000010)).unwrap()
65///     .done();
66/// let b = builder
67///     .node(swhid!(swh:1:dir:0000000000000000000000000000000000000020)).unwrap()
68///     .done();
69/// let c = builder
70///     .node(swhid!(swh:1:cnt:0000000000000000000000000000000000000030)).unwrap()
71///     .done();
72/// builder.dir_arc(a, b, Permission::Directory, b"tests");
73/// builder.dir_arc(a, c, Permission::ExecutableContent, b"run.sh");
74/// builder.dir_arc(b, c, Permission::Content, b"test.c");
75/// let _ = builder.done().unwrap();
76/// ```
77///
78/// # Revisions and releases
79///
80/// ```
81/// use swh_graph::swhid;
82///
83/// use swh_graph::graph_builder::GraphBuilder;
84/// let mut builder = GraphBuilder::default();
85/// let a = builder
86///     .node(swhid!(swh:1:rev:0000000000000000000000000000000000000010)).unwrap()
87///     .author_timestamp(1708950743, 60)
88///     .done();
89/// let b = builder
90///     .node(swhid!(swh:1:rev:0000000000000000000000000000000000000020)).unwrap()
91///     .committer_timestamp(1708950821, 120)
92///     .done();
93/// builder.arc(a, b);
94/// let _ = builder.done().unwrap();
95/// ```
96#[derive(Clone, Debug, Default)]
97pub struct GraphBuilder {
98    pub label_names_order: LabelNamesOrder,
99
100    name_to_id: HashMap<Vec<u8>, u64>,
101    persons: HashMap<Vec<u8>, u32>,
102
103    arcs: Vec<(NodeId, NodeId, Option<EdgeLabel>)>,
104
105    swhids: Vec<SWHID>,
106    is_skipped_content: Vec<bool>,
107    content_lengths: Vec<Option<u64>>,
108    author_ids: Vec<Option<u32>>,
109    committer_ids: Vec<Option<u32>>,
110    messages: Vec<Option<Vec<u8>>>,
111    tag_names: Vec<Option<Vec<u8>>>,
112    author_timestamps: Vec<Option<i64>>,
113    author_timestamp_offsets: Vec<Option<i16>>,
114    committer_timestamps: Vec<Option<i64>>,
115    committer_timestamp_offsets: Vec<Option<i16>>,
116    label_names: Vec<Vec<u8>>,
117}
118
119impl GraphBuilder {
120    /// Adds a node to the graph.
121    ///
122    /// Returns `Err` if there is already a node with this SWHID.
123    pub fn node(&mut self, swhid: SWHID) -> Result<NodeBuilder<'_>> {
124        ensure!(!self.swhids.contains(&swhid), "Duplicate SWHID {swhid}");
125        let node_id = self.swhids.len();
126        self.swhids.push(swhid);
127        self.is_skipped_content.push(false);
128        self.content_lengths.push(None);
129        self.author_ids.push(None);
130        self.committer_ids.push(None);
131        self.messages.push(None);
132        self.tag_names.push(None);
133        self.author_timestamps.push(None);
134        self.author_timestamp_offsets.push(None);
135        self.committer_timestamps.push(None);
136        self.committer_timestamp_offsets.push(None);
137        Ok(NodeBuilder {
138            node_id,
139            graph_builder: self,
140        })
141    }
142
143    /// Adds an unlabeled arc to the graph
144    pub fn arc(&mut self, src: NodeId, dst: NodeId) {
145        self.arcs.push((src, dst, None));
146    }
147
148    /// Adds a labeled dir->{cnt,dir,rev} arc to the graph
149    pub fn dir_arc<P: Into<Permission>, N: Into<Vec<u8>>>(
150        &mut self,
151        src: NodeId,
152        dst: NodeId,
153        permission: P,
154        name: N,
155    ) {
156        let permission = permission.into();
157        let name = name.into();
158        let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
159            self.label_names.push(name);
160            (self.label_names.len() - 1)
161                .try_into()
162                .expect("label_names length overflowed u64")
163        });
164        self.l_arc(
165            src,
166            dst,
167            DirEntry::new(permission, LabelNameId(name_id))
168                .expect("label_names is larger than 2^61 items"),
169        );
170    }
171
172    /// Adds a labeled snp->{cnt,dir,rev,rel} arc to the graph
173    pub fn snp_arc<N: Into<Vec<u8>>>(&mut self, src: NodeId, dst: NodeId, name: N) {
174        let name = name.into();
175        let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
176            self.label_names.push(name);
177            (self.label_names.len() - 1)
178                .try_into()
179                .expect("label_names length overflowed u64")
180        });
181        self.l_arc(
182            src,
183            dst,
184            Branch::new(LabelNameId(name_id)).expect("label_names is larger than 2^61 items"),
185        );
186    }
187
188    /// Adds a labeled ori->snp arc to the graph
189    pub fn ori_arc(&mut self, src: NodeId, dst: NodeId, status: VisitStatus, timestamp: u64) {
190        self.l_arc(
191            src,
192            dst,
193            Visit::new(status, timestamp).expect("invalid timestamp"),
194        );
195    }
196
197    /// Adds a labeled arc to the graph
198    pub fn l_arc<L: Into<EdgeLabel>>(&mut self, src: NodeId, dst: NodeId, label: L) {
199        self.arcs.push((src, dst, Some(label.into())));
200    }
201
202    #[allow(clippy::type_complexity)]
203    pub fn done(&self) -> Result<BuiltGraph> {
204        let num_nodes = self.swhids.len();
205        let mut seen = sux::prelude::BitVec::new(num_nodes);
206        for (src, dst, _) in self.arcs.iter() {
207            seen.set(*src, true);
208            seen.set(*dst, true);
209        }
210        for node in 0..num_nodes {
211            ensure!(
212                seen.get(node),
213                "VecGraph requires every node to have at least one arc, and {} does not have any",
214                node
215            );
216        }
217
218        let mut label_names_with_index: Vec<_> =
219            self.label_names.iter().cloned().enumerate().collect();
220        match self.label_names_order {
221            LabelNamesOrder::Lexicographic => label_names_with_index
222                .sort_unstable_by_key(|(_index, label_name)| label_name.clone()),
223            LabelNamesOrder::LexicographicBase64 => {
224                let base64 = base64_simd::STANDARD;
225                label_names_with_index.sort_unstable_by_key(|(_index, label_name)| {
226                    base64.encode_to_string(label_name).into_bytes()
227                });
228            }
229        }
230        let mut label_permutation = vec![0; label_names_with_index.len()];
231        for (new_index, (old_index, _)) in label_names_with_index.iter().enumerate() {
232            label_permutation[*old_index] = new_index;
233        }
234        let label_names: Vec<_> = label_names_with_index
235            .into_iter()
236            .map(|(_index, label_name)| label_name)
237            .collect();
238
239        let mut arcs = self.arcs.clone();
240        arcs.sort_by_key(|(src, dst, _label)| (*src, *dst)); // stable sort, it makes tests easier to write
241
242        let arcs: Vec<(NodeId, NodeId, Vec<u64>)> = arcs
243            .into_iter()
244            .group_by(|(src, dst, _label)| (*src, *dst))
245            .into_iter()
246            .map(|((src, dst), arcs)| -> (NodeId, NodeId, Vec<u64>) {
247                let labels = arcs
248                    .flat_map(|(_src, _dst, labels)| {
249                        labels.map(|label| {
250                            UntypedEdgeLabel::from(match label {
251                                EdgeLabel::Branch(branch) => EdgeLabel::Branch(
252                                    Branch::new(LabelNameId(
253                                        label_permutation[branch.label_name_id().0 as usize] as u64,
254                                    ))
255                                    .expect("Label name permutation overflowed"),
256                                ),
257                                EdgeLabel::DirEntry(entry) => EdgeLabel::DirEntry(
258                                    DirEntry::new(
259                                        entry.permission().expect("invalid permission"),
260                                        LabelNameId(
261                                            label_permutation[entry.label_name_id().0 as usize]
262                                                as u64,
263                                        ),
264                                    )
265                                    .expect("Label name permutation overflowed"),
266                                ),
267                                EdgeLabel::Visit(visit) => EdgeLabel::Visit(visit),
268                            })
269                            .0
270                        })
271                    })
272                    .collect();
273                (src, dst, labels)
274            })
275            .collect();
276
277        let backward_arcs: Vec<(NodeId, NodeId, Vec<u64>)> = arcs
278            .iter()
279            .map(|(src, dst, labels)| (*dst, *src, labels.clone()))
280            .collect();
281
282        SwhBidirectionalGraph::from_underlying_graphs(
283            std::path::PathBuf::default(),
284            LabeledVecGraph::from_arcs(arcs),
285            LabeledVecGraph::from_arcs(backward_arcs),
286        )
287        .init_properties()
288        .load_properties(|properties| {
289            properties
290                .with_maps(properties::VecMaps::new(self.swhids.clone()))
291                .context("Could not join maps")?
292                .with_contents(
293                    properties::VecContents::new(
294                        self.is_skipped_content
295                            .iter()
296                            .copied()
297                            .zip(self.content_lengths.iter().copied())
298                            .collect(),
299                    )
300                    .context("Could not build VecContents")?,
301                )
302                .context("Could not join VecContents")?
303                .with_label_names(
304                    properties::VecLabelNames::new(label_names.clone())
305                        .context("Could not build VecLabelNames")?,
306                )
307                .context("Could not join maps")?
308                .with_persons(
309                    properties::VecPersons::new(
310                        self.author_ids
311                            .iter()
312                            .copied()
313                            .zip(self.committer_ids.iter().copied())
314                            .collect(),
315                    )
316                    .context("Could not build VecPersons")?,
317                )
318                .context("Could not join persons")?
319                .with_strings(
320                    properties::VecStrings::new(
321                        self.messages
322                            .iter()
323                            .cloned()
324                            .zip(self.tag_names.iter().cloned())
325                            .collect(),
326                    )
327                    .context("Could not build VecStrings")?,
328                )
329                .context("Could not join strings")?
330                .with_timestamps(
331                    properties::VecTimestamps::new(
332                        self.author_timestamps
333                            .iter()
334                            .copied()
335                            .zip(self.author_timestamp_offsets.iter().copied())
336                            .zip(self.committer_timestamps.iter().copied())
337                            .zip(self.committer_timestamp_offsets.iter().copied())
338                            .map(|(((a_ts, a_ts_o), c_ts), c_ts_o)| (a_ts, a_ts_o, c_ts, c_ts_o))
339                            .collect(),
340                    )
341                    .context("Could not build VecTimestamps")?,
342                )
343                .context("Could not join timestamps")
344        })
345    }
346}
347
348pub struct NodeBuilder<'builder> {
349    node_id: NodeId,
350    graph_builder: &'builder mut GraphBuilder,
351}
352
353impl NodeBuilder<'_> {
354    pub fn done(&self) -> NodeId {
355        self.node_id
356    }
357
358    pub fn content_length(&mut self, content_length: u64) -> &mut Self {
359        self.graph_builder.content_lengths[self.node_id] = Some(content_length);
360        self
361    }
362
363    pub fn is_skipped_content(&mut self, is_skipped_content: bool) -> &mut Self {
364        self.graph_builder.is_skipped_content[self.node_id] = is_skipped_content;
365        self
366    }
367
368    pub fn author(&mut self, author: Vec<u8>) -> &mut Self {
369        let next_author_id = self
370            .graph_builder
371            .persons
372            .len()
373            .try_into()
374            .expect("person names overflowed u32");
375        let author_id = self
376            .graph_builder
377            .persons
378            .entry(author)
379            .or_insert(next_author_id);
380        self.graph_builder.author_ids[self.node_id] = Some(*author_id);
381        self
382    }
383
384    pub fn committer(&mut self, committer: Vec<u8>) -> &mut Self {
385        let next_committer_id = self
386            .graph_builder
387            .persons
388            .len()
389            .try_into()
390            .expect("person names overflowed u32");
391        let committer_id = self
392            .graph_builder
393            .persons
394            .entry(committer)
395            .or_insert(next_committer_id);
396        self.graph_builder.committer_ids[self.node_id] = Some(*committer_id);
397        self
398    }
399
400    pub fn message(&mut self, message: Vec<u8>) -> &mut Self {
401        self.graph_builder.messages[self.node_id] = Some(message);
402        self
403    }
404
405    pub fn tag_name(&mut self, tag_name: Vec<u8>) -> &mut Self {
406        self.graph_builder.tag_names[self.node_id] = Some(tag_name);
407        self
408    }
409
410    pub fn author_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
411        self.graph_builder.author_timestamps[self.node_id] = Some(ts);
412        self.graph_builder.author_timestamp_offsets[self.node_id] = Some(offset);
413        self
414    }
415
416    pub fn committer_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
417        self.graph_builder.committer_timestamps[self.node_id] = Some(ts);
418        self.graph_builder.committer_timestamp_offsets[self.node_id] = Some(offset);
419        self
420    }
421}
422
423pub fn codegen_from_full_graph<
424    G: SwhLabeledForwardGraph
425        + SwhGraphWithProperties<
426            Maps: properties::Maps,
427            Timestamps: properties::Timestamps,
428            Persons: properties::Persons,
429            Contents: properties::Contents,
430            Strings: properties::Strings,
431            LabelNames: properties::LabelNames,
432        >,
433    W: Write,
434>(
435    graph: &G,
436    mut writer: W,
437) -> Result<(), std::io::Error> {
438    // Turns a Vec<u8> into an escaped bytestring
439    let bytestring = |b: Vec<u8>| b.into_iter().map(std::ascii::escape_default).join("");
440
441    writer.write_all(b"use swh_graph::swhid;\nuse swh_graph::graph_builder::GraphBuilder;\n")?;
442    writer.write_all(b"use swh_graph::labels::{Permission, VisitStatus};\n")?;
443    writer.write_all(b"\n")?;
444    writer.write_all(b"let mut builder = GraphBuilder::default();\n")?;
445    for node in 0..graph.num_nodes() {
446        writer.write_all(
447            format!(
448                "builder\n    .node(swhid!({}))\n    .unwrap()\n",
449                graph.properties().swhid(node)
450            )
451            .as_bytes(),
452        )?;
453        let mut write_line = |s: String| writer.write_all(format!("    {s}\n").as_bytes());
454        match graph.properties().node_type(node) {
455            NodeType::Content => {
456                write_line(format!(
457                    ".is_skipped_content({})",
458                    graph.properties().is_skipped_content(node),
459                ))?;
460            }
461            NodeType::Revision
462            | NodeType::Release
463            | NodeType::Directory
464            | NodeType::Snapshot
465            | NodeType::Origin => {}
466        }
467        if let Some(v) = graph.properties().content_length(node) {
468            write_line(format!(".content_length({v})"))?;
469        }
470        if let Some(v) = graph.properties().message(node) {
471            write_line(format!(".message(b\"{}\".to_vec())", bytestring(v)))?;
472        }
473        if let Some(v) = graph.properties().tag_name(node) {
474            write_line(format!(".tag_name(b\"{}\".to_vec())", bytestring(v)))?;
475        }
476        if let Some(v) = graph.properties().author_id(node) {
477            write_line(format!(".author(b\"{v}\".to_vec())"))?;
478        }
479        if let Some(ts) = graph.properties().author_timestamp(node) {
480            let offset = graph
481                .properties()
482                .author_timestamp_offset(node)
483                .expect("Node has author_timestamp but no author_timestamp_offset");
484            write_line(format!(".author_timestamp({ts}, {offset})"))?;
485        }
486        if let Some(v) = graph.properties().committer_id(node) {
487            write_line(format!(".committer(b\"{v}\".to_vec())"))?;
488        }
489        if let Some(ts) = graph.properties().committer_timestamp(node) {
490            let offset = graph
491                .properties()
492                .committer_timestamp_offset(node)
493                .expect("Node has committer_timestamp but no committer_timestamp_offset");
494            write_line(format!(".committer_timestamp({ts}, {offset})"))?;
495        }
496        writer.write_all(b"    .done();\n")?;
497    }
498
499    writer.write_all(b"\n")?;
500
501    for node in 0..graph.num_nodes() {
502        for (succ, labels) in graph.labeled_successors(node) {
503            let mut has_labels = false;
504            for label in labels {
505                writer.write_all(
506                    match label {
507                        EdgeLabel::DirEntry(label) => format!(
508                            "builder.dir_arc({}, {}, Permission::{:?}, b\"{}\".to_vec());\n",
509                            node,
510                            succ,
511                            label.permission().expect("Invalid permission"),
512                            bytestring(graph.properties().label_name(label.label_name_id())),
513                        ),
514                        EdgeLabel::Branch(label) => format!(
515                            "builder.snp_arc({}, {}, b\"{}\".to_vec());\n",
516                            node,
517                            succ,
518                            bytestring(graph.properties().label_name(label.label_name_id())),
519                        ),
520                        EdgeLabel::Visit(label) => format!(
521                            "builder.ori_arc({}, {}, VisitStatus::{:?}, {});\n",
522                            node,
523                            succ,
524                            label.status(),
525                            label.timestamp(),
526                        ),
527                    }
528                    .as_bytes(),
529                )?;
530                has_labels = true;
531            }
532            if !has_labels {
533                writer.write_all(format!("builder.arc({node}, {succ});\n").as_bytes())?;
534            }
535        }
536    }
537
538    writer.write_all(b"\nbuilder.done().expect(\"Could not build graph\")\n")?;
539
540    Ok(())
541}