swh_graph/
graph_builder.rs

1/*
2 * Copyright (C) 2024-2025  The Software Heritage developers
3 * See the AUTHORS file at the top-level directory of this distribution
4 * License: GNU General Public License version 3, or any later version
5 * See top-level LICENSE file for more information
6 */
7
8//! Utility to dynamically build a small graph in memory
9
10use std::collections::HashMap;
11use std::io::Write;
12
13use anyhow::{ensure, Context, Result};
14use itertools::Itertools;
15use webgraph::graphs::vec_graph::LabeledVecGraph;
16
17use crate::graph::*;
18use crate::labels::{
19    Branch, DirEntry, EdgeLabel, LabelNameId, Permission, UntypedEdgeLabel, Visit, VisitStatus,
20};
21use crate::properties;
22use crate::SwhGraphProperties;
23use crate::{NodeType, SWHID};
24
25/// How to sort label names in the produced graph. Defaults to `Lexicographic`.
26#[derive(Default, Clone, Copy, Debug)]
27pub enum LabelNamesOrder {
28    #[default]
29    /// Sort label names in their lexicographic order, as for graphs compressed with Rust
30    /// (2023-09-06 and newer)
31    Lexicographic,
32    /// Sort label names in the lexicographic order of their base64-encoding, as for graphs
33    /// compressed with Java (2022-12-07 and older)
34    LexicographicBase64,
35}
36
37// Type (alias) of the graph built by the graph builder
38pub type BuiltGraph = SwhBidirectionalGraph<
39    SwhGraphProperties<
40        properties::VecMaps,
41        properties::VecTimestamps,
42        properties::VecPersons,
43        properties::VecContents,
44        properties::VecStrings,
45        properties::VecLabelNames,
46    >,
47    LabeledVecGraph<Vec<u64>>,
48    LabeledVecGraph<Vec<u64>>,
49>;
50
51/// Dynamically builds a small graph in memory
52///
53/// # Examples
54///
55/// ## Directories and contents
56///
57/// ```
58/// use swh_graph::swhid;
59/// use swh_graph::graph_builder::GraphBuilder;
60/// use swh_graph::labels::Permission;
61///
62/// let mut builder = GraphBuilder::default();
63/// let a = builder
64///     .node(swhid!(swh:1:dir:0000000000000000000000000000000000000010)).unwrap()
65///     .done();
66/// let b = builder
67///     .node(swhid!(swh:1:dir:0000000000000000000000000000000000000020)).unwrap()
68///     .done();
69/// let c = builder
70///     .node(swhid!(swh:1:cnt:0000000000000000000000000000000000000030)).unwrap()
71///     .done();
72/// builder.dir_arc(a, b, Permission::Directory, b"tests");
73/// builder.dir_arc(a, c, Permission::ExecutableContent, b"run.sh");
74/// builder.dir_arc(b, c, Permission::Content, b"test.c");
75/// let _ = builder.done().unwrap();
76/// ```
77///
78/// # Revisions and releases
79///
80/// ```
81/// use swh_graph::swhid;
82///
83/// use swh_graph::graph_builder::GraphBuilder;
84/// let mut builder = GraphBuilder::default();
85/// let a = builder
86///     .node(swhid!(swh:1:rev:0000000000000000000000000000000000000010)).unwrap()
87///     .author_timestamp(1708950743, 60)
88///     .done();
89/// let b = builder
90///     .node(swhid!(swh:1:rev:0000000000000000000000000000000000000020)).unwrap()
91///     .committer_timestamp(1708950821, 120)
92///     .done();
93/// builder.arc(a, b);
94/// let _ = builder.done().unwrap();
95/// ```
96#[derive(Clone, Debug, Default)]
97pub struct GraphBuilder {
98    pub label_names_order: LabelNamesOrder,
99
100    name_to_id: HashMap<Vec<u8>, u64>,
101    persons: HashMap<Vec<u8>, u32>,
102
103    arcs: Vec<(NodeId, NodeId, Option<EdgeLabel>)>,
104
105    swhids: Vec<SWHID>,
106    is_skipped_content: Vec<bool>,
107    content_lengths: Vec<Option<u64>>,
108    author_ids: Vec<Option<u32>>,
109    committer_ids: Vec<Option<u32>>,
110    messages: Vec<Option<Vec<u8>>>,
111    tag_names: Vec<Option<Vec<u8>>>,
112    author_timestamps: Vec<Option<i64>>,
113    author_timestamp_offsets: Vec<Option<i16>>,
114    committer_timestamps: Vec<Option<i64>>,
115    committer_timestamp_offsets: Vec<Option<i16>>,
116    label_names: Vec<Vec<u8>>,
117}
118
119impl GraphBuilder {
120    /// Adds a node to the graph.
121    ///
122    /// Returns `Err` if there is already a node with this SWHID.
123    pub fn node(&mut self, swhid: SWHID) -> Result<NodeBuilder<'_>> {
124        ensure!(!self.swhids.contains(&swhid), "Duplicate SWHID {swhid}");
125        let node_id = self.swhids.len();
126        self.swhids.push(swhid);
127        self.is_skipped_content.push(false);
128        self.content_lengths.push(None);
129        self.author_ids.push(None);
130        self.committer_ids.push(None);
131        self.messages.push(None);
132        self.tag_names.push(None);
133        self.author_timestamps.push(None);
134        self.author_timestamp_offsets.push(None);
135        self.committer_timestamps.push(None);
136        self.committer_timestamp_offsets.push(None);
137        Ok(NodeBuilder {
138            node_id,
139            graph_builder: self,
140        })
141    }
142
143    /// Returns `NodeId` that represents this SWHID in the graph, if it exists
144    pub fn node_id(&self, swhid: SWHID) -> Option<NodeId> {
145        self.swhids.iter().position(|x| *x == swhid)
146    }
147
148    /// Adds an unlabeled arc to the graph
149    pub fn arc(&mut self, src: NodeId, dst: NodeId) {
150        self.arcs.push((src, dst, None));
151    }
152
153    /// Adds a labeled dir->{cnt,dir,rev} arc to the graph
154    pub fn dir_arc<P: Into<Permission>, N: Into<Vec<u8>>>(
155        &mut self,
156        src: NodeId,
157        dst: NodeId,
158        permission: P,
159        name: N,
160    ) {
161        let permission = permission.into();
162        let name = name.into();
163        let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
164            self.label_names.push(name);
165            (self.label_names.len() - 1)
166                .try_into()
167                .expect("label_names length overflowed u64")
168        });
169        self.l_arc(
170            src,
171            dst,
172            DirEntry::new(permission, LabelNameId(name_id))
173                .expect("label_names is larger than 2^61 items"),
174        );
175    }
176
177    /// Adds a labeled snp->{cnt,dir,rev,rel} arc to the graph
178    pub fn snp_arc<N: Into<Vec<u8>>>(&mut self, src: NodeId, dst: NodeId, name: N) {
179        let name = name.into();
180        let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
181            self.label_names.push(name);
182            (self.label_names.len() - 1)
183                .try_into()
184                .expect("label_names length overflowed u64")
185        });
186        self.l_arc(
187            src,
188            dst,
189            Branch::new(LabelNameId(name_id)).expect("label_names is larger than 2^61 items"),
190        );
191    }
192
193    /// Adds a labeled ori->snp arc to the graph
194    pub fn ori_arc(&mut self, src: NodeId, dst: NodeId, status: VisitStatus, timestamp: u64) {
195        self.l_arc(
196            src,
197            dst,
198            Visit::new(status, timestamp).expect("invalid timestamp"),
199        );
200    }
201
202    /// Adds a labeled arc to the graph
203    pub fn l_arc<L: Into<EdgeLabel>>(&mut self, src: NodeId, dst: NodeId, label: L) {
204        self.arcs.push((src, dst, Some(label.into())));
205    }
206
207    #[allow(clippy::type_complexity)]
208    pub fn done(&self) -> Result<BuiltGraph> {
209        let num_nodes = self.swhids.len();
210        let mut seen = sux::prelude::BitVec::new(num_nodes);
211        for (src, dst, _) in self.arcs.iter() {
212            seen.set(*src, true);
213            seen.set(*dst, true);
214        }
215        for node in 0..num_nodes {
216            ensure!(
217                seen.get(node),
218                "VecGraph requires every node to have at least one arc, and {} does not have any",
219                node
220            );
221        }
222
223        let mut label_names_with_index: Vec<_> =
224            self.label_names.iter().cloned().enumerate().collect();
225        match self.label_names_order {
226            LabelNamesOrder::Lexicographic => label_names_with_index
227                .sort_unstable_by_key(|(_index, label_name)| label_name.clone()),
228            LabelNamesOrder::LexicographicBase64 => {
229                let base64 = base64_simd::STANDARD;
230                label_names_with_index.sort_unstable_by_key(|(_index, label_name)| {
231                    base64.encode_to_string(label_name).into_bytes()
232                });
233            }
234        }
235        let mut label_permutation = vec![0; label_names_with_index.len()];
236        for (new_index, (old_index, _)) in label_names_with_index.iter().enumerate() {
237            label_permutation[*old_index] = new_index;
238        }
239        let label_names: Vec<_> = label_names_with_index
240            .into_iter()
241            .map(|(_index, label_name)| label_name)
242            .collect();
243
244        let mut arcs = self.arcs.clone();
245        arcs.sort_by_key(|(src, dst, _label)| (*src, *dst)); // stable sort, it makes tests easier to write
246
247        let arcs: Vec<(NodeId, NodeId, Vec<u64>)> = arcs
248            .into_iter()
249            .group_by(|(src, dst, _label)| (*src, *dst))
250            .into_iter()
251            .map(|((src, dst), arcs)| -> (NodeId, NodeId, Vec<u64>) {
252                let labels = arcs
253                    .flat_map(|(_src, _dst, labels)| {
254                        labels.map(|label| {
255                            UntypedEdgeLabel::from(match label {
256                                EdgeLabel::Branch(branch) => EdgeLabel::Branch(
257                                    Branch::new(LabelNameId(
258                                        label_permutation[branch.label_name_id().0 as usize] as u64,
259                                    ))
260                                    .expect("Label name permutation overflowed"),
261                                ),
262                                EdgeLabel::DirEntry(entry) => EdgeLabel::DirEntry(
263                                    DirEntry::new(
264                                        entry.permission().expect("invalid permission"),
265                                        LabelNameId(
266                                            label_permutation[entry.label_name_id().0 as usize]
267                                                as u64,
268                                        ),
269                                    )
270                                    .expect("Label name permutation overflowed"),
271                                ),
272                                EdgeLabel::Visit(visit) => EdgeLabel::Visit(visit),
273                            })
274                            .0
275                        })
276                    })
277                    .collect();
278                (src, dst, labels)
279            })
280            .collect();
281
282        let backward_arcs: Vec<(NodeId, NodeId, Vec<u64>)> = arcs
283            .iter()
284            .map(|(src, dst, labels)| (*dst, *src, labels.clone()))
285            .collect();
286
287        SwhBidirectionalGraph::from_underlying_graphs(
288            std::path::PathBuf::default(),
289            LabeledVecGraph::from_arcs(arcs),
290            LabeledVecGraph::from_arcs(backward_arcs),
291        )
292        .init_properties()
293        .load_properties(|properties| {
294            properties
295                .with_maps(properties::VecMaps::new(self.swhids.clone()))
296                .context("Could not join maps")?
297                .with_contents(
298                    properties::VecContents::new(
299                        self.is_skipped_content
300                            .iter()
301                            .copied()
302                            .zip(self.content_lengths.iter().copied())
303                            .collect(),
304                    )
305                    .context("Could not build VecContents")?,
306                )
307                .context("Could not join VecContents")?
308                .with_label_names(
309                    properties::VecLabelNames::new(label_names.clone())
310                        .context("Could not build VecLabelNames")?,
311                )
312                .context("Could not join maps")?
313                .with_persons(
314                    properties::VecPersons::new(
315                        self.author_ids
316                            .iter()
317                            .copied()
318                            .zip(self.committer_ids.iter().copied())
319                            .collect(),
320                    )
321                    .context("Could not build VecPersons")?,
322                )
323                .context("Could not join persons")?
324                .with_strings(
325                    properties::VecStrings::new(
326                        self.messages
327                            .iter()
328                            .cloned()
329                            .zip(self.tag_names.iter().cloned())
330                            .collect(),
331                    )
332                    .context("Could not build VecStrings")?,
333                )
334                .context("Could not join strings")?
335                .with_timestamps(
336                    properties::VecTimestamps::new(
337                        self.author_timestamps
338                            .iter()
339                            .copied()
340                            .zip(self.author_timestamp_offsets.iter().copied())
341                            .zip(self.committer_timestamps.iter().copied())
342                            .zip(self.committer_timestamp_offsets.iter().copied())
343                            .map(|(((a_ts, a_ts_o), c_ts), c_ts_o)| (a_ts, a_ts_o, c_ts, c_ts_o))
344                            .collect(),
345                    )
346                    .context("Could not build VecTimestamps")?,
347                )
348                .context("Could not join timestamps")
349        })
350    }
351}
352
353pub struct NodeBuilder<'builder> {
354    node_id: NodeId,
355    graph_builder: &'builder mut GraphBuilder,
356}
357
358impl NodeBuilder<'_> {
359    pub fn done(&self) -> NodeId {
360        self.node_id
361    }
362
363    pub fn content_length(&mut self, content_length: u64) -> &mut Self {
364        self.graph_builder.content_lengths[self.node_id] = Some(content_length);
365        self
366    }
367
368    pub fn is_skipped_content(&mut self, is_skipped_content: bool) -> &mut Self {
369        self.graph_builder.is_skipped_content[self.node_id] = is_skipped_content;
370        self
371    }
372
373    pub fn author(&mut self, author: Vec<u8>) -> &mut Self {
374        let next_author_id = self
375            .graph_builder
376            .persons
377            .len()
378            .try_into()
379            .expect("person names overflowed u32");
380        let author_id = self
381            .graph_builder
382            .persons
383            .entry(author)
384            .or_insert(next_author_id);
385        self.graph_builder.author_ids[self.node_id] = Some(*author_id);
386        self
387    }
388
389    pub fn committer(&mut self, committer: Vec<u8>) -> &mut Self {
390        let next_committer_id = self
391            .graph_builder
392            .persons
393            .len()
394            .try_into()
395            .expect("person names overflowed u32");
396        let committer_id = self
397            .graph_builder
398            .persons
399            .entry(committer)
400            .or_insert(next_committer_id);
401        self.graph_builder.committer_ids[self.node_id] = Some(*committer_id);
402        self
403    }
404
405    pub fn message(&mut self, message: Vec<u8>) -> &mut Self {
406        self.graph_builder.messages[self.node_id] = Some(message);
407        self
408    }
409
410    pub fn tag_name(&mut self, tag_name: Vec<u8>) -> &mut Self {
411        self.graph_builder.tag_names[self.node_id] = Some(tag_name);
412        self
413    }
414
415    pub fn author_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
416        self.graph_builder.author_timestamps[self.node_id] = Some(ts);
417        self.graph_builder.author_timestamp_offsets[self.node_id] = Some(offset);
418        self
419    }
420
421    pub fn committer_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
422        self.graph_builder.committer_timestamps[self.node_id] = Some(ts);
423        self.graph_builder.committer_timestamp_offsets[self.node_id] = Some(offset);
424        self
425    }
426}
427
428pub fn codegen_from_full_graph<
429    G: SwhLabeledForwardGraph
430        + SwhGraphWithProperties<
431            Maps: properties::Maps,
432            Timestamps: properties::Timestamps,
433            Persons: properties::Persons,
434            Contents: properties::Contents,
435            Strings: properties::Strings,
436            LabelNames: properties::LabelNames,
437        >,
438    W: Write,
439>(
440    graph: &G,
441    mut writer: W,
442) -> Result<(), std::io::Error> {
443    // Turns a Vec<u8> into an escaped bytestring
444    let bytestring = |b: Vec<u8>| b.into_iter().map(std::ascii::escape_default).join("");
445
446    writer.write_all(b"use swh_graph::swhid;\nuse swh_graph::graph_builder::GraphBuilder;\n")?;
447    writer.write_all(b"use swh_graph::labels::{Permission, VisitStatus};\n")?;
448    writer.write_all(b"\n")?;
449    writer.write_all(b"let mut builder = GraphBuilder::default();\n")?;
450    for node in 0..graph.num_nodes() {
451        writer.write_all(
452            format!(
453                "builder\n    .node(swhid!({}))\n    .unwrap()\n",
454                graph.properties().swhid(node)
455            )
456            .as_bytes(),
457        )?;
458        let mut write_line = |s: String| writer.write_all(format!("    {s}\n").as_bytes());
459        match graph.properties().node_type(node) {
460            NodeType::Content => {
461                write_line(format!(
462                    ".is_skipped_content({})",
463                    graph.properties().is_skipped_content(node),
464                ))?;
465            }
466            NodeType::Revision
467            | NodeType::Release
468            | NodeType::Directory
469            | NodeType::Snapshot
470            | NodeType::Origin => {}
471        }
472        if let Some(v) = graph.properties().content_length(node) {
473            write_line(format!(".content_length({v})"))?;
474        }
475        if let Some(v) = graph.properties().message(node) {
476            write_line(format!(".message(b\"{}\".to_vec())", bytestring(v)))?;
477        }
478        if let Some(v) = graph.properties().tag_name(node) {
479            write_line(format!(".tag_name(b\"{}\".to_vec())", bytestring(v)))?;
480        }
481        if let Some(v) = graph.properties().author_id(node) {
482            write_line(format!(".author(b\"{v}\".to_vec())"))?;
483        }
484        if let Some(ts) = graph.properties().author_timestamp(node) {
485            let offset = graph
486                .properties()
487                .author_timestamp_offset(node)
488                .expect("Node has author_timestamp but no author_timestamp_offset");
489            write_line(format!(".author_timestamp({ts}, {offset})"))?;
490        }
491        if let Some(v) = graph.properties().committer_id(node) {
492            write_line(format!(".committer(b\"{v}\".to_vec())"))?;
493        }
494        if let Some(ts) = graph.properties().committer_timestamp(node) {
495            let offset = graph
496                .properties()
497                .committer_timestamp_offset(node)
498                .expect("Node has committer_timestamp but no committer_timestamp_offset");
499            write_line(format!(".committer_timestamp({ts}, {offset})"))?;
500        }
501        writer.write_all(b"    .done();\n")?;
502    }
503
504    writer.write_all(b"\n")?;
505
506    for node in 0..graph.num_nodes() {
507        for (succ, labels) in graph.labeled_successors(node) {
508            let mut has_labels = false;
509            for label in labels {
510                writer.write_all(
511                    match label {
512                        EdgeLabel::DirEntry(label) => format!(
513                            "builder.dir_arc({}, {}, Permission::{:?}, b\"{}\".to_vec());\n",
514                            node,
515                            succ,
516                            label.permission().expect("Invalid permission"),
517                            bytestring(graph.properties().label_name(label.label_name_id())),
518                        ),
519                        EdgeLabel::Branch(label) => format!(
520                            "builder.snp_arc({}, {}, b\"{}\".to_vec());\n",
521                            node,
522                            succ,
523                            bytestring(graph.properties().label_name(label.label_name_id())),
524                        ),
525                        EdgeLabel::Visit(label) => format!(
526                            "builder.ori_arc({}, {}, VisitStatus::{:?}, {});\n",
527                            node,
528                            succ,
529                            label.status(),
530                            label.timestamp(),
531                        ),
532                    }
533                    .as_bytes(),
534                )?;
535                has_labels = true;
536            }
537            if !has_labels {
538                writer.write_all(format!("builder.arc({node}, {succ});\n").as_bytes())?;
539            }
540        }
541    }
542
543    writer.write_all(b"\nbuilder.done().expect(\"Could not build graph\")\n")?;
544
545    Ok(())
546}