Skip to main content

swh_graph/
graph_builder.rs

1/*
2 * Copyright (C) 2024-2026  The Software Heritage developers
3 * See the AUTHORS file at the top-level directory of this distribution
4 * License: GNU General Public License version 3, or any later version
5 * See top-level LICENSE file for more information
6 */
7
8//! Utility to dynamically build a small graph in memory
9
10use std::collections::HashMap;
11use std::io::Write;
12
13use anyhow::{ensure, Context, Result};
14use itertools::Itertools;
15use sux::traits::bit_vec_ops::{BitVecOps, BitVecOpsMut};
16use webgraph::graphs::vec_graph::LabeledVecGraph;
17
18use crate::graph::*;
19use crate::labels::{
20    Branch, DirEntry, EdgeLabel, LabelNameId, Permission, UntypedEdgeLabel, Visit, VisitStatus,
21};
22use crate::properties;
23use crate::SwhGraphProperties;
24use crate::{NodeType, SWHID};
25
26/// How to sort label names in the produced graph. Defaults to `Lexicographic`.
27#[derive(Default, Clone, Copy, Debug)]
28pub enum LabelNamesOrder {
29    #[default]
30    /// Sort label names in their lexicographic order, as for graphs compressed with Rust
31    /// (2023-09-06 and newer)
32    Lexicographic,
33    /// Sort label names in the lexicographic order of their base64-encoding, as for graphs
34    /// compressed with Java (2022-12-07 and older)
35    LexicographicBase64,
36}
37
38// Type (alias) of the graph built by the graph builder
39pub type BuiltGraph = SwhBidirectionalGraph<
40    SwhGraphProperties<
41        properties::VecMaps,
42        properties::VecTimestamps,
43        properties::VecPersons,
44        properties::VecContents,
45        properties::VecStrings,
46        properties::VecLabelNames,
47    >,
48    LabeledVecGraph<Vec<u64>>,
49    LabeledVecGraph<Vec<u64>>,
50>;
51
52/// Dynamically builds a small graph in memory
53///
54/// # Examples
55///
56/// ## Directories and contents
57///
58/// ```
59/// use swh_graph::swhid;
60/// use swh_graph::graph_builder::GraphBuilder;
61/// use swh_graph::labels::Permission;
62///
63/// let mut builder = GraphBuilder::default();
64/// let a = builder
65///     .node(swhid!(swh:1:dir:0000000000000000000000000000000000000010)).unwrap()
66///     .done();
67/// let b = builder
68///     .node(swhid!(swh:1:dir:0000000000000000000000000000000000000020)).unwrap()
69///     .done();
70/// let c = builder
71///     .node(swhid!(swh:1:cnt:0000000000000000000000000000000000000030)).unwrap()
72///     .done();
73/// builder.dir_arc(a, b, Permission::Directory, b"tests");
74/// builder.dir_arc(a, c, Permission::ExecutableContent, b"run.sh");
75/// builder.dir_arc(b, c, Permission::Content, b"test.c");
76/// let _ = builder.done().unwrap();
77/// ```
78///
79/// # Revisions and releases
80///
81/// ```
82/// use swh_graph::swhid;
83///
84/// use swh_graph::graph_builder::GraphBuilder;
85/// let mut builder = GraphBuilder::default();
86/// let a = builder
87///     .node(swhid!(swh:1:rev:0000000000000000000000000000000000000010)).unwrap()
88///     .author_timestamp(1708950743, 60)
89///     .done();
90/// let b = builder
91///     .node(swhid!(swh:1:rev:0000000000000000000000000000000000000020)).unwrap()
92///     .committer_timestamp(1708950821, 120)
93///     .done();
94/// builder.arc(a, b);
95/// let _ = builder.done().unwrap();
96/// ```
97#[derive(Clone, Debug, Default)]
98pub struct GraphBuilder {
99    pub label_names_order: LabelNamesOrder,
100
101    name_to_id: HashMap<Vec<u8>, u64>,
102    persons: HashMap<Vec<u8>, u32>,
103
104    arcs: Vec<((NodeId, NodeId), Option<EdgeLabel>)>,
105
106    swhids: Vec<SWHID>,
107    is_skipped_content: Vec<bool>,
108    content_lengths: Vec<Option<u64>>,
109    author_ids: Vec<Option<u32>>,
110    committer_ids: Vec<Option<u32>>,
111    messages: Vec<Option<Vec<u8>>>,
112    tag_names: Vec<Option<Vec<u8>>>,
113    author_timestamps: Vec<Option<i64>>,
114    author_timestamp_offsets: Vec<Option<i16>>,
115    committer_timestamps: Vec<Option<i64>>,
116    committer_timestamp_offsets: Vec<Option<i16>>,
117    label_names: Vec<Vec<u8>>,
118}
119
120impl GraphBuilder {
121    /// Adds a node to the graph.
122    ///
123    /// Returns `Err` if there is already a node with this SWHID.
124    pub fn node(&mut self, swhid: SWHID) -> Result<NodeBuilder<'_>> {
125        ensure!(!self.swhids.contains(&swhid), "Duplicate SWHID {swhid}");
126        let node_id = self.swhids.len();
127        self.swhids.push(swhid);
128        self.is_skipped_content.push(false);
129        self.content_lengths.push(None);
130        self.author_ids.push(None);
131        self.committer_ids.push(None);
132        self.messages.push(None);
133        self.tag_names.push(None);
134        self.author_timestamps.push(None);
135        self.author_timestamp_offsets.push(None);
136        self.committer_timestamps.push(None);
137        self.committer_timestamp_offsets.push(None);
138        Ok(NodeBuilder {
139            node_id,
140            graph_builder: self,
141        })
142    }
143
144    /// Returns `NodeId` that represents this SWHID in the graph, if it exists
145    #[inline(always)]
146    pub fn node_id(&self, swhid: SWHID) -> Option<NodeId> {
147        self.swhids.iter().position(|x| *x == swhid)
148    }
149
150    /// Adds an unlabeled arc to the graph
151    pub fn arc(&mut self, src: NodeId, dst: NodeId) {
152        self.arcs.push(((src, dst), None));
153    }
154
155    /// Adds a labeled dir->{cnt,dir,rev} arc to the graph
156    pub fn dir_arc<P: Into<Permission>, N: Into<Vec<u8>>>(
157        &mut self,
158        src: NodeId,
159        dst: NodeId,
160        permission: P,
161        name: N,
162    ) {
163        let permission = permission.into();
164        let name = name.into();
165        let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
166            self.label_names.push(name);
167            (self.label_names.len() - 1)
168                .try_into()
169                .expect("label_names length overflowed u64")
170        });
171        self.l_arc(
172            src,
173            dst,
174            DirEntry::new(permission, LabelNameId(name_id))
175                .expect("label_names is larger than 2^61 items"),
176        );
177    }
178
179    /// Adds a labeled snp->{cnt,dir,rev,rel} arc to the graph
180    pub fn snp_arc<N: Into<Vec<u8>>>(&mut self, src: NodeId, dst: NodeId, name: N) {
181        let name = name.into();
182        let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
183            self.label_names.push(name);
184            (self.label_names.len() - 1)
185                .try_into()
186                .expect("label_names length overflowed u64")
187        });
188        self.l_arc(
189            src,
190            dst,
191            Branch::new(LabelNameId(name_id)).expect("label_names is larger than 2^61 items"),
192        );
193    }
194
195    /// Adds a labeled ori->snp arc to the graph
196    pub fn ori_arc(&mut self, src: NodeId, dst: NodeId, status: VisitStatus, timestamp: u64) {
197        self.l_arc(
198            src,
199            dst,
200            Visit::new(status, timestamp).expect("invalid timestamp"),
201        );
202    }
203
204    /// Adds a labeled arc to the graph
205    pub fn l_arc<L: Into<EdgeLabel>>(&mut self, src: NodeId, dst: NodeId, label: L) {
206        self.arcs.push(((src, dst), Some(label.into())));
207    }
208
209    #[allow(clippy::type_complexity)]
210    pub fn done(&self) -> Result<BuiltGraph> {
211        let num_nodes = self.swhids.len();
212        let mut seen = sux::prelude::BitVec::new(num_nodes);
213        for ((src, dst), _) in self.arcs.iter() {
214            seen.set(*src, true);
215            seen.set(*dst, true);
216        }
217        for node in 0..num_nodes {
218            ensure!(
219                seen.get(node),
220                "VecGraph requires every node to have at least one arc, and node {} does not have any",
221                node
222            );
223        }
224
225        let mut label_names_with_index: Vec<_> =
226            self.label_names.iter().cloned().enumerate().collect();
227        match self.label_names_order {
228            LabelNamesOrder::Lexicographic => label_names_with_index
229                .sort_unstable_by_key(|(_index, label_name)| label_name.clone()),
230            LabelNamesOrder::LexicographicBase64 => {
231                let base64 = base64_simd::STANDARD;
232                label_names_with_index.sort_unstable_by_key(|(_index, label_name)| {
233                    base64.encode_to_string(label_name).into_bytes()
234                });
235            }
236        }
237        let mut label_permutation = vec![0; label_names_with_index.len()];
238        for (new_index, (old_index, _)) in label_names_with_index.iter().enumerate() {
239            label_permutation[*old_index] = new_index;
240        }
241        let label_names: Vec<_> = label_names_with_index
242            .into_iter()
243            .map(|(_index, label_name)| label_name)
244            .collect();
245
246        let mut arcs = self.arcs.clone();
247        arcs.sort_by_key(|((src, dst), _label)| (*src, *dst)); // stable sort, it makes tests easier to write
248
249        let arcs: Vec<((NodeId, NodeId), Vec<u64>)> = arcs
250            .into_iter()
251            .group_by(|((src, dst), _label)| (*src, *dst))
252            .into_iter()
253            .map(|((src, dst), arcs)| -> ((NodeId, NodeId), Vec<u64>) {
254                let labels = arcs
255                    .flat_map(|((_src, _dst), labels)| {
256                        labels.map(|label| {
257                            UntypedEdgeLabel::from(match label {
258                                EdgeLabel::Branch(branch) => EdgeLabel::Branch(
259                                    Branch::new(LabelNameId(
260                                        label_permutation[branch.label_name_id().0 as usize] as u64,
261                                    ))
262                                    .expect("Label name permutation overflowed"),
263                                ),
264                                EdgeLabel::DirEntry(entry) => EdgeLabel::DirEntry(
265                                    DirEntry::new(
266                                        entry.permission().expect("invalid permission"),
267                                        LabelNameId(
268                                            label_permutation[entry.label_name_id().0 as usize]
269                                                as u64,
270                                        ),
271                                    )
272                                    .expect("Label name permutation overflowed"),
273                                ),
274                                EdgeLabel::Visit(visit) => EdgeLabel::Visit(visit),
275                            })
276                            .0
277                        })
278                    })
279                    .collect();
280                ((src, dst), labels)
281            })
282            .collect();
283
284        let backward_arcs: Vec<((NodeId, NodeId), Vec<u64>)> = arcs
285            .iter()
286            .map(|((src, dst), labels)| ((*dst, *src), labels.clone()))
287            .collect();
288
289        SwhBidirectionalGraph::from_underlying_graphs(
290            std::path::PathBuf::default(),
291            LabeledVecGraph::from_arcs(arcs),
292            LabeledVecGraph::from_arcs(backward_arcs),
293        )
294        .init_properties()
295        .load_properties(|properties| {
296            properties
297                .with_maps(properties::VecMaps::new(self.swhids.clone()))
298                .context("Could not join maps")?
299                .with_contents(
300                    properties::VecContents::new(
301                        self.is_skipped_content
302                            .iter()
303                            .copied()
304                            .zip(self.content_lengths.iter().copied())
305                            .collect(),
306                    )
307                    .context("Could not build VecContents")?,
308                )
309                .context("Could not join VecContents")?
310                .with_label_names(
311                    properties::VecLabelNames::new(label_names.clone())
312                        .context("Could not build VecLabelNames")?,
313                )
314                .context("Could not join maps")?
315                .with_persons(
316                    properties::VecPersons::new(
317                        self.author_ids
318                            .iter()
319                            .copied()
320                            .zip(self.committer_ids.iter().copied())
321                            .collect(),
322                    )
323                    .context("Could not build VecPersons")?,
324                )
325                .context("Could not join persons")?
326                .with_strings(
327                    properties::VecStrings::new(
328                        self.messages
329                            .iter()
330                            .cloned()
331                            .zip(self.tag_names.iter().cloned())
332                            .collect(),
333                    )
334                    .context("Could not build VecStrings")?,
335                )
336                .context("Could not join strings")?
337                .with_timestamps(
338                    properties::VecTimestamps::new(
339                        self.author_timestamps
340                            .iter()
341                            .copied()
342                            .zip(self.author_timestamp_offsets.iter().copied())
343                            .zip(self.committer_timestamps.iter().copied())
344                            .zip(self.committer_timestamp_offsets.iter().copied())
345                            .map(|(((a_ts, a_ts_o), c_ts), c_ts_o)| (a_ts, a_ts_o, c_ts, c_ts_o))
346                            .collect(),
347                    )
348                    .context("Could not build VecTimestamps")?,
349                )
350                .context("Could not join timestamps")
351        })
352    }
353}
354
355pub struct NodeBuilder<'builder> {
356    node_id: NodeId,
357    graph_builder: &'builder mut GraphBuilder,
358}
359
360impl NodeBuilder<'_> {
361    #[inline(always)]
362    pub fn done(&self) -> NodeId {
363        self.node_id
364    }
365
366    pub fn content_length(&mut self, content_length: u64) -> &mut Self {
367        self.graph_builder.content_lengths[self.node_id] = Some(content_length);
368        self
369    }
370
371    pub fn is_skipped_content(&mut self, is_skipped_content: bool) -> &mut Self {
372        self.graph_builder.is_skipped_content[self.node_id] = is_skipped_content;
373        self
374    }
375
376    pub fn author(&mut self, author: Vec<u8>) -> &mut Self {
377        let next_author_id = self
378            .graph_builder
379            .persons
380            .len()
381            .try_into()
382            .expect("person names overflowed u32");
383        let author_id = self
384            .graph_builder
385            .persons
386            .entry(author)
387            .or_insert(next_author_id);
388        self.graph_builder.author_ids[self.node_id] = Some(*author_id);
389        self
390    }
391
392    pub fn committer(&mut self, committer: Vec<u8>) -> &mut Self {
393        let next_committer_id = self
394            .graph_builder
395            .persons
396            .len()
397            .try_into()
398            .expect("person names overflowed u32");
399        let committer_id = self
400            .graph_builder
401            .persons
402            .entry(committer)
403            .or_insert(next_committer_id);
404        self.graph_builder.committer_ids[self.node_id] = Some(*committer_id);
405        self
406    }
407
408    pub fn message(&mut self, message: Vec<u8>) -> &mut Self {
409        self.graph_builder.messages[self.node_id] = Some(message);
410        self
411    }
412
413    pub fn tag_name(&mut self, tag_name: Vec<u8>) -> &mut Self {
414        self.graph_builder.tag_names[self.node_id] = Some(tag_name);
415        self
416    }
417
418    pub fn author_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
419        self.graph_builder.author_timestamps[self.node_id] = Some(ts);
420        self.graph_builder.author_timestamp_offsets[self.node_id] = Some(offset);
421        self
422    }
423
424    pub fn committer_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
425        self.graph_builder.committer_timestamps[self.node_id] = Some(ts);
426        self.graph_builder.committer_timestamp_offsets[self.node_id] = Some(offset);
427        self
428    }
429}
430
431pub fn codegen_from_full_graph<
432    G: SwhLabeledForwardGraph
433        + SwhGraphWithProperties<
434            Maps: properties::Maps,
435            Timestamps: properties::Timestamps,
436            Persons: properties::Persons,
437            Contents: properties::Contents,
438            Strings: properties::Strings,
439            LabelNames: properties::LabelNames,
440        >,
441    W: Write,
442>(
443    graph: &G,
444    mut writer: W,
445) -> Result<(), std::io::Error> {
446    // Turns a Vec<u8> into an escaped bytestring
447    let bytestring = |b: Vec<u8>| b.into_iter().map(std::ascii::escape_default).join("");
448
449    writer.write_all(b"use swh_graph::swhid;\nuse swh_graph::graph_builder::GraphBuilder;\n")?;
450    writer.write_all(b"use swh_graph::labels::{Permission, VisitStatus};\n")?;
451    writer.write_all(b"\n")?;
452    writer.write_all(b"let mut builder = GraphBuilder::default();\n")?;
453    for node in 0..graph.num_nodes() {
454        writer.write_all(
455            format!(
456                "builder\n    .node(swhid!({}))\n    .unwrap()\n",
457                graph.properties().swhid(node)
458            )
459            .as_bytes(),
460        )?;
461        let mut write_line = |s: String| writer.write_all(format!("    {s}\n").as_bytes());
462        match graph.properties().node_type(node) {
463            NodeType::Content => {
464                write_line(format!(
465                    ".is_skipped_content({})",
466                    graph.properties().is_skipped_content(node),
467                ))?;
468            }
469            NodeType::Revision
470            | NodeType::Release
471            | NodeType::Directory
472            | NodeType::Snapshot
473            | NodeType::Origin => {}
474        }
475        if let Some(v) = graph.properties().content_length(node) {
476            write_line(format!(".content_length({v})"))?;
477        }
478        if let Some(v) = graph.properties().message(node) {
479            write_line(format!(".message(b\"{}\".to_vec())", bytestring(v)))?;
480        }
481        if let Some(v) = graph.properties().tag_name(node) {
482            write_line(format!(".tag_name(b\"{}\".to_vec())", bytestring(v)))?;
483        }
484        if let Some(v) = graph.properties().author_id(node) {
485            write_line(format!(".author(b\"{v}\".to_vec())"))?;
486        }
487        if let Some(ts) = graph.properties().author_timestamp(node) {
488            let offset = graph
489                .properties()
490                .author_timestamp_offset(node)
491                .expect("Node has author_timestamp but no author_timestamp_offset");
492            write_line(format!(".author_timestamp({ts}, {offset})"))?;
493        }
494        if let Some(v) = graph.properties().committer_id(node) {
495            write_line(format!(".committer(b\"{v}\".to_vec())"))?;
496        }
497        if let Some(ts) = graph.properties().committer_timestamp(node) {
498            let offset = graph
499                .properties()
500                .committer_timestamp_offset(node)
501                .expect("Node has committer_timestamp but no committer_timestamp_offset");
502            write_line(format!(".committer_timestamp({ts}, {offset})"))?;
503        }
504        writer.write_all(b"    .done();\n")?;
505    }
506
507    writer.write_all(b"\n")?;
508
509    for node in 0..graph.num_nodes() {
510        for (succ, labels) in graph.labeled_successors(node) {
511            let mut has_labels = false;
512            for label in labels {
513                writer.write_all(
514                    match label {
515                        EdgeLabel::DirEntry(label) => format!(
516                            "builder.dir_arc({}, {}, Permission::{:?}, b\"{}\".to_vec());\n",
517                            node,
518                            succ,
519                            label.permission().expect("Invalid permission"),
520                            bytestring(graph.properties().label_name(label.label_name_id())),
521                        ),
522                        EdgeLabel::Branch(label) => format!(
523                            "builder.snp_arc({}, {}, b\"{}\".to_vec());\n",
524                            node,
525                            succ,
526                            bytestring(graph.properties().label_name(label.label_name_id())),
527                        ),
528                        EdgeLabel::Visit(label) => format!(
529                            "builder.ori_arc({}, {}, VisitStatus::{:?}, {});\n",
530                            node,
531                            succ,
532                            label.status(),
533                            label.timestamp(),
534                        ),
535                    }
536                    .as_bytes(),
537                )?;
538                has_labels = true;
539            }
540            if !has_labels {
541                writer.write_all(format!("builder.arc({node}, {succ});\n").as_bytes())?;
542            }
543        }
544    }
545
546    writer.write_all(b"\nbuilder.done().expect(\"Could not build graph\")\n")?;
547
548    Ok(())
549}