swh_graph/
graph_builder.rs

1/*
2 * Copyright (C) 2024-2025  The Software Heritage developers
3 * See the AUTHORS file at the top-level directory of this distribution
4 * License: GNU General Public License version 3, or any later version
5 * See top-level LICENSE file for more information
6 */
7
8//! Utility to dynamically build a small graph in memory
9
10use std::collections::HashMap;
11use std::io::Write;
12
13use anyhow::{ensure, Context, Result};
14use itertools::Itertools;
15use sux::traits::bit_vec_ops::{BitVecOps, BitVecOpsMut};
16use webgraph::graphs::vec_graph::LabeledVecGraph;
17
18use crate::graph::*;
19use crate::labels::{
20    Branch, DirEntry, EdgeLabel, LabelNameId, Permission, UntypedEdgeLabel, Visit, VisitStatus,
21};
22use crate::properties;
23use crate::SwhGraphProperties;
24use crate::{NodeType, SWHID};
25
26/// How to sort label names in the produced graph. Defaults to `Lexicographic`.
27#[derive(Default, Clone, Copy, Debug)]
28pub enum LabelNamesOrder {
29    #[default]
30    /// Sort label names in their lexicographic order, as for graphs compressed with Rust
31    /// (2023-09-06 and newer)
32    Lexicographic,
33    /// Sort label names in the lexicographic order of their base64-encoding, as for graphs
34    /// compressed with Java (2022-12-07 and older)
35    LexicographicBase64,
36}
37
38// Type (alias) of the graph built by the graph builder
39pub type BuiltGraph = SwhBidirectionalGraph<
40    SwhGraphProperties<
41        properties::VecMaps,
42        properties::VecTimestamps,
43        properties::VecPersons,
44        properties::VecContents,
45        properties::VecStrings,
46        properties::VecLabelNames,
47    >,
48    LabeledVecGraph<Vec<u64>>,
49    LabeledVecGraph<Vec<u64>>,
50>;
51
52/// Dynamically builds a small graph in memory
53///
54/// # Examples
55///
56/// ## Directories and contents
57///
58/// ```
59/// use swh_graph::swhid;
60/// use swh_graph::graph_builder::GraphBuilder;
61/// use swh_graph::labels::Permission;
62///
63/// let mut builder = GraphBuilder::default();
64/// let a = builder
65///     .node(swhid!(swh:1:dir:0000000000000000000000000000000000000010)).unwrap()
66///     .done();
67/// let b = builder
68///     .node(swhid!(swh:1:dir:0000000000000000000000000000000000000020)).unwrap()
69///     .done();
70/// let c = builder
71///     .node(swhid!(swh:1:cnt:0000000000000000000000000000000000000030)).unwrap()
72///     .done();
73/// builder.dir_arc(a, b, Permission::Directory, b"tests");
74/// builder.dir_arc(a, c, Permission::ExecutableContent, b"run.sh");
75/// builder.dir_arc(b, c, Permission::Content, b"test.c");
76/// let _ = builder.done().unwrap();
77/// ```
78///
79/// # Revisions and releases
80///
81/// ```
82/// use swh_graph::swhid;
83///
84/// use swh_graph::graph_builder::GraphBuilder;
85/// let mut builder = GraphBuilder::default();
86/// let a = builder
87///     .node(swhid!(swh:1:rev:0000000000000000000000000000000000000010)).unwrap()
88///     .author_timestamp(1708950743, 60)
89///     .done();
90/// let b = builder
91///     .node(swhid!(swh:1:rev:0000000000000000000000000000000000000020)).unwrap()
92///     .committer_timestamp(1708950821, 120)
93///     .done();
94/// builder.arc(a, b);
95/// let _ = builder.done().unwrap();
96/// ```
97#[derive(Clone, Debug, Default)]
98pub struct GraphBuilder {
99    pub label_names_order: LabelNamesOrder,
100
101    name_to_id: HashMap<Vec<u8>, u64>,
102    persons: HashMap<Vec<u8>, u32>,
103
104    arcs: Vec<((NodeId, NodeId), Option<EdgeLabel>)>,
105
106    swhids: Vec<SWHID>,
107    is_skipped_content: Vec<bool>,
108    content_lengths: Vec<Option<u64>>,
109    author_ids: Vec<Option<u32>>,
110    committer_ids: Vec<Option<u32>>,
111    messages: Vec<Option<Vec<u8>>>,
112    tag_names: Vec<Option<Vec<u8>>>,
113    author_timestamps: Vec<Option<i64>>,
114    author_timestamp_offsets: Vec<Option<i16>>,
115    committer_timestamps: Vec<Option<i64>>,
116    committer_timestamp_offsets: Vec<Option<i16>>,
117    label_names: Vec<Vec<u8>>,
118}
119
120impl GraphBuilder {
121    /// Adds a node to the graph.
122    ///
123    /// Returns `Err` if there is already a node with this SWHID.
124    pub fn node(&mut self, swhid: SWHID) -> Result<NodeBuilder<'_>> {
125        ensure!(!self.swhids.contains(&swhid), "Duplicate SWHID {swhid}");
126        let node_id = self.swhids.len();
127        self.swhids.push(swhid);
128        self.is_skipped_content.push(false);
129        self.content_lengths.push(None);
130        self.author_ids.push(None);
131        self.committer_ids.push(None);
132        self.messages.push(None);
133        self.tag_names.push(None);
134        self.author_timestamps.push(None);
135        self.author_timestamp_offsets.push(None);
136        self.committer_timestamps.push(None);
137        self.committer_timestamp_offsets.push(None);
138        Ok(NodeBuilder {
139            node_id,
140            graph_builder: self,
141        })
142    }
143
144    /// Returns `NodeId` that represents this SWHID in the graph, if it exists
145    pub fn node_id(&self, swhid: SWHID) -> Option<NodeId> {
146        self.swhids.iter().position(|x| *x == swhid)
147    }
148
149    /// Adds an unlabeled arc to the graph
150    pub fn arc(&mut self, src: NodeId, dst: NodeId) {
151        self.arcs.push(((src, dst), None));
152    }
153
154    /// Adds a labeled dir->{cnt,dir,rev} arc to the graph
155    pub fn dir_arc<P: Into<Permission>, N: Into<Vec<u8>>>(
156        &mut self,
157        src: NodeId,
158        dst: NodeId,
159        permission: P,
160        name: N,
161    ) {
162        let permission = permission.into();
163        let name = name.into();
164        let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
165            self.label_names.push(name);
166            (self.label_names.len() - 1)
167                .try_into()
168                .expect("label_names length overflowed u64")
169        });
170        self.l_arc(
171            src,
172            dst,
173            DirEntry::new(permission, LabelNameId(name_id))
174                .expect("label_names is larger than 2^61 items"),
175        );
176    }
177
178    /// Adds a labeled snp->{cnt,dir,rev,rel} arc to the graph
179    pub fn snp_arc<N: Into<Vec<u8>>>(&mut self, src: NodeId, dst: NodeId, name: N) {
180        let name = name.into();
181        let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
182            self.label_names.push(name);
183            (self.label_names.len() - 1)
184                .try_into()
185                .expect("label_names length overflowed u64")
186        });
187        self.l_arc(
188            src,
189            dst,
190            Branch::new(LabelNameId(name_id)).expect("label_names is larger than 2^61 items"),
191        );
192    }
193
194    /// Adds a labeled ori->snp arc to the graph
195    pub fn ori_arc(&mut self, src: NodeId, dst: NodeId, status: VisitStatus, timestamp: u64) {
196        self.l_arc(
197            src,
198            dst,
199            Visit::new(status, timestamp).expect("invalid timestamp"),
200        );
201    }
202
203    /// Adds a labeled arc to the graph
204    pub fn l_arc<L: Into<EdgeLabel>>(&mut self, src: NodeId, dst: NodeId, label: L) {
205        self.arcs.push(((src, dst), Some(label.into())));
206    }
207
208    #[allow(clippy::type_complexity)]
209    pub fn done(&self) -> Result<BuiltGraph> {
210        let num_nodes = self.swhids.len();
211        let mut seen = sux::prelude::BitVec::new(num_nodes);
212        for ((src, dst), _) in self.arcs.iter() {
213            seen.set(*src, true);
214            seen.set(*dst, true);
215        }
216        for node in 0..num_nodes {
217            ensure!(
218                seen.get(node),
219                "VecGraph requires every node to have at least one arc, and {} does not have any",
220                node
221            );
222        }
223
224        let mut label_names_with_index: Vec<_> =
225            self.label_names.iter().cloned().enumerate().collect();
226        match self.label_names_order {
227            LabelNamesOrder::Lexicographic => label_names_with_index
228                .sort_unstable_by_key(|(_index, label_name)| label_name.clone()),
229            LabelNamesOrder::LexicographicBase64 => {
230                let base64 = base64_simd::STANDARD;
231                label_names_with_index.sort_unstable_by_key(|(_index, label_name)| {
232                    base64.encode_to_string(label_name).into_bytes()
233                });
234            }
235        }
236        let mut label_permutation = vec![0; label_names_with_index.len()];
237        for (new_index, (old_index, _)) in label_names_with_index.iter().enumerate() {
238            label_permutation[*old_index] = new_index;
239        }
240        let label_names: Vec<_> = label_names_with_index
241            .into_iter()
242            .map(|(_index, label_name)| label_name)
243            .collect();
244
245        let mut arcs = self.arcs.clone();
246        arcs.sort_by_key(|((src, dst), _label)| (*src, *dst)); // stable sort, it makes tests easier to write
247
248        let arcs: Vec<((NodeId, NodeId), Vec<u64>)> = arcs
249            .into_iter()
250            .group_by(|((src, dst), _label)| (*src, *dst))
251            .into_iter()
252            .map(|((src, dst), arcs)| -> ((NodeId, NodeId), Vec<u64>) {
253                let labels = arcs
254                    .flat_map(|((_src, _dst), labels)| {
255                        labels.map(|label| {
256                            UntypedEdgeLabel::from(match label {
257                                EdgeLabel::Branch(branch) => EdgeLabel::Branch(
258                                    Branch::new(LabelNameId(
259                                        label_permutation[branch.label_name_id().0 as usize] as u64,
260                                    ))
261                                    .expect("Label name permutation overflowed"),
262                                ),
263                                EdgeLabel::DirEntry(entry) => EdgeLabel::DirEntry(
264                                    DirEntry::new(
265                                        entry.permission().expect("invalid permission"),
266                                        LabelNameId(
267                                            label_permutation[entry.label_name_id().0 as usize]
268                                                as u64,
269                                        ),
270                                    )
271                                    .expect("Label name permutation overflowed"),
272                                ),
273                                EdgeLabel::Visit(visit) => EdgeLabel::Visit(visit),
274                            })
275                            .0
276                        })
277                    })
278                    .collect();
279                ((src, dst), labels)
280            })
281            .collect();
282
283        let backward_arcs: Vec<((NodeId, NodeId), Vec<u64>)> = arcs
284            .iter()
285            .map(|((src, dst), labels)| ((*dst, *src), labels.clone()))
286            .collect();
287
288        SwhBidirectionalGraph::from_underlying_graphs(
289            std::path::PathBuf::default(),
290            LabeledVecGraph::from_arcs(arcs),
291            LabeledVecGraph::from_arcs(backward_arcs),
292        )
293        .init_properties()
294        .load_properties(|properties| {
295            properties
296                .with_maps(properties::VecMaps::new(self.swhids.clone()))
297                .context("Could not join maps")?
298                .with_contents(
299                    properties::VecContents::new(
300                        self.is_skipped_content
301                            .iter()
302                            .copied()
303                            .zip(self.content_lengths.iter().copied())
304                            .collect(),
305                    )
306                    .context("Could not build VecContents")?,
307                )
308                .context("Could not join VecContents")?
309                .with_label_names(
310                    properties::VecLabelNames::new(label_names.clone())
311                        .context("Could not build VecLabelNames")?,
312                )
313                .context("Could not join maps")?
314                .with_persons(
315                    properties::VecPersons::new(
316                        self.author_ids
317                            .iter()
318                            .copied()
319                            .zip(self.committer_ids.iter().copied())
320                            .collect(),
321                    )
322                    .context("Could not build VecPersons")?,
323                )
324                .context("Could not join persons")?
325                .with_strings(
326                    properties::VecStrings::new(
327                        self.messages
328                            .iter()
329                            .cloned()
330                            .zip(self.tag_names.iter().cloned())
331                            .collect(),
332                    )
333                    .context("Could not build VecStrings")?,
334                )
335                .context("Could not join strings")?
336                .with_timestamps(
337                    properties::VecTimestamps::new(
338                        self.author_timestamps
339                            .iter()
340                            .copied()
341                            .zip(self.author_timestamp_offsets.iter().copied())
342                            .zip(self.committer_timestamps.iter().copied())
343                            .zip(self.committer_timestamp_offsets.iter().copied())
344                            .map(|(((a_ts, a_ts_o), c_ts), c_ts_o)| (a_ts, a_ts_o, c_ts, c_ts_o))
345                            .collect(),
346                    )
347                    .context("Could not build VecTimestamps")?,
348                )
349                .context("Could not join timestamps")
350        })
351    }
352}
353
354pub struct NodeBuilder<'builder> {
355    node_id: NodeId,
356    graph_builder: &'builder mut GraphBuilder,
357}
358
359impl NodeBuilder<'_> {
360    pub fn done(&self) -> NodeId {
361        self.node_id
362    }
363
364    pub fn content_length(&mut self, content_length: u64) -> &mut Self {
365        self.graph_builder.content_lengths[self.node_id] = Some(content_length);
366        self
367    }
368
369    pub fn is_skipped_content(&mut self, is_skipped_content: bool) -> &mut Self {
370        self.graph_builder.is_skipped_content[self.node_id] = is_skipped_content;
371        self
372    }
373
374    pub fn author(&mut self, author: Vec<u8>) -> &mut Self {
375        let next_author_id = self
376            .graph_builder
377            .persons
378            .len()
379            .try_into()
380            .expect("person names overflowed u32");
381        let author_id = self
382            .graph_builder
383            .persons
384            .entry(author)
385            .or_insert(next_author_id);
386        self.graph_builder.author_ids[self.node_id] = Some(*author_id);
387        self
388    }
389
390    pub fn committer(&mut self, committer: Vec<u8>) -> &mut Self {
391        let next_committer_id = self
392            .graph_builder
393            .persons
394            .len()
395            .try_into()
396            .expect("person names overflowed u32");
397        let committer_id = self
398            .graph_builder
399            .persons
400            .entry(committer)
401            .or_insert(next_committer_id);
402        self.graph_builder.committer_ids[self.node_id] = Some(*committer_id);
403        self
404    }
405
406    pub fn message(&mut self, message: Vec<u8>) -> &mut Self {
407        self.graph_builder.messages[self.node_id] = Some(message);
408        self
409    }
410
411    pub fn tag_name(&mut self, tag_name: Vec<u8>) -> &mut Self {
412        self.graph_builder.tag_names[self.node_id] = Some(tag_name);
413        self
414    }
415
416    pub fn author_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
417        self.graph_builder.author_timestamps[self.node_id] = Some(ts);
418        self.graph_builder.author_timestamp_offsets[self.node_id] = Some(offset);
419        self
420    }
421
422    pub fn committer_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
423        self.graph_builder.committer_timestamps[self.node_id] = Some(ts);
424        self.graph_builder.committer_timestamp_offsets[self.node_id] = Some(offset);
425        self
426    }
427}
428
429pub fn codegen_from_full_graph<
430    G: SwhLabeledForwardGraph
431        + SwhGraphWithProperties<
432            Maps: properties::Maps,
433            Timestamps: properties::Timestamps,
434            Persons: properties::Persons,
435            Contents: properties::Contents,
436            Strings: properties::Strings,
437            LabelNames: properties::LabelNames,
438        >,
439    W: Write,
440>(
441    graph: &G,
442    mut writer: W,
443) -> Result<(), std::io::Error> {
444    // Turns a Vec<u8> into an escaped bytestring
445    let bytestring = |b: Vec<u8>| b.into_iter().map(std::ascii::escape_default).join("");
446
447    writer.write_all(b"use swh_graph::swhid;\nuse swh_graph::graph_builder::GraphBuilder;\n")?;
448    writer.write_all(b"use swh_graph::labels::{Permission, VisitStatus};\n")?;
449    writer.write_all(b"\n")?;
450    writer.write_all(b"let mut builder = GraphBuilder::default();\n")?;
451    for node in 0..graph.num_nodes() {
452        writer.write_all(
453            format!(
454                "builder\n    .node(swhid!({}))\n    .unwrap()\n",
455                graph.properties().swhid(node)
456            )
457            .as_bytes(),
458        )?;
459        let mut write_line = |s: String| writer.write_all(format!("    {s}\n").as_bytes());
460        match graph.properties().node_type(node) {
461            NodeType::Content => {
462                write_line(format!(
463                    ".is_skipped_content({})",
464                    graph.properties().is_skipped_content(node),
465                ))?;
466            }
467            NodeType::Revision
468            | NodeType::Release
469            | NodeType::Directory
470            | NodeType::Snapshot
471            | NodeType::Origin => {}
472        }
473        if let Some(v) = graph.properties().content_length(node) {
474            write_line(format!(".content_length({v})"))?;
475        }
476        if let Some(v) = graph.properties().message(node) {
477            write_line(format!(".message(b\"{}\".to_vec())", bytestring(v)))?;
478        }
479        if let Some(v) = graph.properties().tag_name(node) {
480            write_line(format!(".tag_name(b\"{}\".to_vec())", bytestring(v)))?;
481        }
482        if let Some(v) = graph.properties().author_id(node) {
483            write_line(format!(".author(b\"{v}\".to_vec())"))?;
484        }
485        if let Some(ts) = graph.properties().author_timestamp(node) {
486            let offset = graph
487                .properties()
488                .author_timestamp_offset(node)
489                .expect("Node has author_timestamp but no author_timestamp_offset");
490            write_line(format!(".author_timestamp({ts}, {offset})"))?;
491        }
492        if let Some(v) = graph.properties().committer_id(node) {
493            write_line(format!(".committer(b\"{v}\".to_vec())"))?;
494        }
495        if let Some(ts) = graph.properties().committer_timestamp(node) {
496            let offset = graph
497                .properties()
498                .committer_timestamp_offset(node)
499                .expect("Node has committer_timestamp but no committer_timestamp_offset");
500            write_line(format!(".committer_timestamp({ts}, {offset})"))?;
501        }
502        writer.write_all(b"    .done();\n")?;
503    }
504
505    writer.write_all(b"\n")?;
506
507    for node in 0..graph.num_nodes() {
508        for (succ, labels) in graph.labeled_successors(node) {
509            let mut has_labels = false;
510            for label in labels {
511                writer.write_all(
512                    match label {
513                        EdgeLabel::DirEntry(label) => format!(
514                            "builder.dir_arc({}, {}, Permission::{:?}, b\"{}\".to_vec());\n",
515                            node,
516                            succ,
517                            label.permission().expect("Invalid permission"),
518                            bytestring(graph.properties().label_name(label.label_name_id())),
519                        ),
520                        EdgeLabel::Branch(label) => format!(
521                            "builder.snp_arc({}, {}, b\"{}\".to_vec());\n",
522                            node,
523                            succ,
524                            bytestring(graph.properties().label_name(label.label_name_id())),
525                        ),
526                        EdgeLabel::Visit(label) => format!(
527                            "builder.ori_arc({}, {}, VisitStatus::{:?}, {});\n",
528                            node,
529                            succ,
530                            label.status(),
531                            label.timestamp(),
532                        ),
533                    }
534                    .as_bytes(),
535                )?;
536                has_labels = true;
537            }
538            if !has_labels {
539                writer.write_all(format!("builder.arc({node}, {succ});\n").as_bytes())?;
540            }
541        }
542    }
543
544    writer.write_all(b"\nbuilder.done().expect(\"Could not build graph\")\n")?;
545
546    Ok(())
547}