1use std::collections::HashMap;
11use std::io::Write;
12
13use anyhow::{ensure, Context, Result};
14use itertools::Itertools;
15use sux::traits::bit_vec_ops::{BitVecOps, BitVecOpsMut};
16use webgraph::graphs::vec_graph::LabeledVecGraph;
17
18use crate::graph::*;
19use crate::labels::{
20 Branch, DirEntry, EdgeLabel, LabelNameId, Permission, UntypedEdgeLabel, Visit, VisitStatus,
21};
22use crate::properties;
23use crate::SwhGraphProperties;
24use crate::{NodeType, SWHID};
25
26#[derive(Default, Clone, Copy, Debug)]
28pub enum LabelNamesOrder {
29 #[default]
30 Lexicographic,
33 LexicographicBase64,
36}
37
38pub type BuiltGraph = SwhBidirectionalGraph<
40 SwhGraphProperties<
41 properties::VecMaps,
42 properties::VecTimestamps,
43 properties::VecPersons,
44 properties::VecContents,
45 properties::VecStrings,
46 properties::VecLabelNames,
47 >,
48 LabeledVecGraph<Vec<u64>>,
49 LabeledVecGraph<Vec<u64>>,
50>;
51
52#[derive(Clone, Debug, Default)]
98pub struct GraphBuilder {
99 pub label_names_order: LabelNamesOrder,
100
101 name_to_id: HashMap<Vec<u8>, u64>,
102 persons: HashMap<Vec<u8>, u32>,
103
104 arcs: Vec<((NodeId, NodeId), Option<EdgeLabel>)>,
105
106 swhids: Vec<SWHID>,
107 is_skipped_content: Vec<bool>,
108 content_lengths: Vec<Option<u64>>,
109 author_ids: Vec<Option<u32>>,
110 committer_ids: Vec<Option<u32>>,
111 messages: Vec<Option<Vec<u8>>>,
112 tag_names: Vec<Option<Vec<u8>>>,
113 author_timestamps: Vec<Option<i64>>,
114 author_timestamp_offsets: Vec<Option<i16>>,
115 committer_timestamps: Vec<Option<i64>>,
116 committer_timestamp_offsets: Vec<Option<i16>>,
117 label_names: Vec<Vec<u8>>,
118}
119
120impl GraphBuilder {
121 pub fn node(&mut self, swhid: SWHID) -> Result<NodeBuilder<'_>> {
125 ensure!(!self.swhids.contains(&swhid), "Duplicate SWHID {swhid}");
126 let node_id = self.swhids.len();
127 self.swhids.push(swhid);
128 self.is_skipped_content.push(false);
129 self.content_lengths.push(None);
130 self.author_ids.push(None);
131 self.committer_ids.push(None);
132 self.messages.push(None);
133 self.tag_names.push(None);
134 self.author_timestamps.push(None);
135 self.author_timestamp_offsets.push(None);
136 self.committer_timestamps.push(None);
137 self.committer_timestamp_offsets.push(None);
138 Ok(NodeBuilder {
139 node_id,
140 graph_builder: self,
141 })
142 }
143
144 #[inline(always)]
146 pub fn node_id(&self, swhid: SWHID) -> Option<NodeId> {
147 self.swhids.iter().position(|x| *x == swhid)
148 }
149
150 pub fn arc(&mut self, src: NodeId, dst: NodeId) {
152 self.arcs.push(((src, dst), None));
153 }
154
155 pub fn dir_arc<P: Into<Permission>, N: Into<Vec<u8>>>(
157 &mut self,
158 src: NodeId,
159 dst: NodeId,
160 permission: P,
161 name: N,
162 ) {
163 let permission = permission.into();
164 let name = name.into();
165 let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
166 self.label_names.push(name);
167 (self.label_names.len() - 1)
168 .try_into()
169 .expect("label_names length overflowed u64")
170 });
171 self.l_arc(
172 src,
173 dst,
174 DirEntry::new(permission, LabelNameId(name_id))
175 .expect("label_names is larger than 2^61 items"),
176 );
177 }
178
179 pub fn snp_arc<N: Into<Vec<u8>>>(&mut self, src: NodeId, dst: NodeId, name: N) {
181 let name = name.into();
182 let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
183 self.label_names.push(name);
184 (self.label_names.len() - 1)
185 .try_into()
186 .expect("label_names length overflowed u64")
187 });
188 self.l_arc(
189 src,
190 dst,
191 Branch::new(LabelNameId(name_id)).expect("label_names is larger than 2^61 items"),
192 );
193 }
194
195 pub fn ori_arc(&mut self, src: NodeId, dst: NodeId, status: VisitStatus, timestamp: u64) {
197 self.l_arc(
198 src,
199 dst,
200 Visit::new(status, timestamp).expect("invalid timestamp"),
201 );
202 }
203
204 pub fn l_arc<L: Into<EdgeLabel>>(&mut self, src: NodeId, dst: NodeId, label: L) {
206 self.arcs.push(((src, dst), Some(label.into())));
207 }
208
209 #[allow(clippy::type_complexity)]
210 pub fn done(&self) -> Result<BuiltGraph> {
211 let num_nodes = self.swhids.len();
212 let mut seen = sux::prelude::BitVec::new(num_nodes);
213 for ((src, dst), _) in self.arcs.iter() {
214 seen.set(*src, true);
215 seen.set(*dst, true);
216 }
217 for node in 0..num_nodes {
218 ensure!(
219 seen.get(node),
220 "VecGraph requires every node to have at least one arc, and node {} does not have any",
221 node
222 );
223 }
224
225 let mut label_names_with_index: Vec<_> =
226 self.label_names.iter().cloned().enumerate().collect();
227 match self.label_names_order {
228 LabelNamesOrder::Lexicographic => label_names_with_index
229 .sort_unstable_by_key(|(_index, label_name)| label_name.clone()),
230 LabelNamesOrder::LexicographicBase64 => {
231 let base64 = base64_simd::STANDARD;
232 label_names_with_index.sort_unstable_by_key(|(_index, label_name)| {
233 base64.encode_to_string(label_name).into_bytes()
234 });
235 }
236 }
237 let mut label_permutation = vec![0; label_names_with_index.len()];
238 for (new_index, (old_index, _)) in label_names_with_index.iter().enumerate() {
239 label_permutation[*old_index] = new_index;
240 }
241 let label_names: Vec<_> = label_names_with_index
242 .into_iter()
243 .map(|(_index, label_name)| label_name)
244 .collect();
245
246 let mut arcs = self.arcs.clone();
247 arcs.sort_by_key(|((src, dst), _label)| (*src, *dst)); let arcs: Vec<((NodeId, NodeId), Vec<u64>)> = arcs
250 .into_iter()
251 .group_by(|((src, dst), _label)| (*src, *dst))
252 .into_iter()
253 .map(|((src, dst), arcs)| -> ((NodeId, NodeId), Vec<u64>) {
254 let labels = arcs
255 .flat_map(|((_src, _dst), labels)| {
256 labels.map(|label| {
257 UntypedEdgeLabel::from(match label {
258 EdgeLabel::Branch(branch) => EdgeLabel::Branch(
259 Branch::new(LabelNameId(
260 label_permutation[branch.label_name_id().0 as usize] as u64,
261 ))
262 .expect("Label name permutation overflowed"),
263 ),
264 EdgeLabel::DirEntry(entry) => EdgeLabel::DirEntry(
265 DirEntry::new(
266 entry.permission().expect("invalid permission"),
267 LabelNameId(
268 label_permutation[entry.label_name_id().0 as usize]
269 as u64,
270 ),
271 )
272 .expect("Label name permutation overflowed"),
273 ),
274 EdgeLabel::Visit(visit) => EdgeLabel::Visit(visit),
275 })
276 .0
277 })
278 })
279 .collect();
280 ((src, dst), labels)
281 })
282 .collect();
283
284 let backward_arcs: Vec<((NodeId, NodeId), Vec<u64>)> = arcs
285 .iter()
286 .map(|((src, dst), labels)| ((*dst, *src), labels.clone()))
287 .collect();
288
289 SwhBidirectionalGraph::from_underlying_graphs(
290 std::path::PathBuf::default(),
291 LabeledVecGraph::from_arcs(arcs),
292 LabeledVecGraph::from_arcs(backward_arcs),
293 )
294 .init_properties()
295 .load_properties(|properties| {
296 properties
297 .with_maps(properties::VecMaps::new(self.swhids.clone()))
298 .context("Could not join maps")?
299 .with_contents(
300 properties::VecContents::new(
301 self.is_skipped_content
302 .iter()
303 .copied()
304 .zip(self.content_lengths.iter().copied())
305 .collect(),
306 )
307 .context("Could not build VecContents")?,
308 )
309 .context("Could not join VecContents")?
310 .with_label_names(
311 properties::VecLabelNames::new(label_names.clone())
312 .context("Could not build VecLabelNames")?,
313 )
314 .context("Could not join maps")?
315 .with_persons(
316 properties::VecPersons::new(
317 self.author_ids
318 .iter()
319 .copied()
320 .zip(self.committer_ids.iter().copied())
321 .collect(),
322 )
323 .context("Could not build VecPersons")?,
324 )
325 .context("Could not join persons")?
326 .with_strings(
327 properties::VecStrings::new(
328 self.messages
329 .iter()
330 .cloned()
331 .zip(self.tag_names.iter().cloned())
332 .collect(),
333 )
334 .context("Could not build VecStrings")?,
335 )
336 .context("Could not join strings")?
337 .with_timestamps(
338 properties::VecTimestamps::new(
339 self.author_timestamps
340 .iter()
341 .copied()
342 .zip(self.author_timestamp_offsets.iter().copied())
343 .zip(self.committer_timestamps.iter().copied())
344 .zip(self.committer_timestamp_offsets.iter().copied())
345 .map(|(((a_ts, a_ts_o), c_ts), c_ts_o)| (a_ts, a_ts_o, c_ts, c_ts_o))
346 .collect(),
347 )
348 .context("Could not build VecTimestamps")?,
349 )
350 .context("Could not join timestamps")
351 })
352 }
353}
354
355pub struct NodeBuilder<'builder> {
356 node_id: NodeId,
357 graph_builder: &'builder mut GraphBuilder,
358}
359
360impl NodeBuilder<'_> {
361 #[inline(always)]
362 pub fn done(&self) -> NodeId {
363 self.node_id
364 }
365
366 pub fn content_length(&mut self, content_length: u64) -> &mut Self {
367 self.graph_builder.content_lengths[self.node_id] = Some(content_length);
368 self
369 }
370
371 pub fn is_skipped_content(&mut self, is_skipped_content: bool) -> &mut Self {
372 self.graph_builder.is_skipped_content[self.node_id] = is_skipped_content;
373 self
374 }
375
376 pub fn author(&mut self, author: Vec<u8>) -> &mut Self {
377 let next_author_id = self
378 .graph_builder
379 .persons
380 .len()
381 .try_into()
382 .expect("person names overflowed u32");
383 let author_id = self
384 .graph_builder
385 .persons
386 .entry(author)
387 .or_insert(next_author_id);
388 self.graph_builder.author_ids[self.node_id] = Some(*author_id);
389 self
390 }
391
392 pub fn committer(&mut self, committer: Vec<u8>) -> &mut Self {
393 let next_committer_id = self
394 .graph_builder
395 .persons
396 .len()
397 .try_into()
398 .expect("person names overflowed u32");
399 let committer_id = self
400 .graph_builder
401 .persons
402 .entry(committer)
403 .or_insert(next_committer_id);
404 self.graph_builder.committer_ids[self.node_id] = Some(*committer_id);
405 self
406 }
407
408 pub fn message(&mut self, message: Vec<u8>) -> &mut Self {
409 self.graph_builder.messages[self.node_id] = Some(message);
410 self
411 }
412
413 pub fn tag_name(&mut self, tag_name: Vec<u8>) -> &mut Self {
414 self.graph_builder.tag_names[self.node_id] = Some(tag_name);
415 self
416 }
417
418 pub fn author_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
419 self.graph_builder.author_timestamps[self.node_id] = Some(ts);
420 self.graph_builder.author_timestamp_offsets[self.node_id] = Some(offset);
421 self
422 }
423
424 pub fn committer_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
425 self.graph_builder.committer_timestamps[self.node_id] = Some(ts);
426 self.graph_builder.committer_timestamp_offsets[self.node_id] = Some(offset);
427 self
428 }
429}
430
431pub fn codegen_from_full_graph<
432 G: SwhLabeledForwardGraph
433 + SwhGraphWithProperties<
434 Maps: properties::Maps,
435 Timestamps: properties::Timestamps,
436 Persons: properties::Persons,
437 Contents: properties::Contents,
438 Strings: properties::Strings,
439 LabelNames: properties::LabelNames,
440 >,
441 W: Write,
442>(
443 graph: &G,
444 mut writer: W,
445) -> Result<(), std::io::Error> {
446 let bytestring = |b: Vec<u8>| b.into_iter().map(std::ascii::escape_default).join("");
448
449 writer.write_all(b"use swh_graph::swhid;\nuse swh_graph::graph_builder::GraphBuilder;\n")?;
450 writer.write_all(b"use swh_graph::labels::{Permission, VisitStatus};\n")?;
451 writer.write_all(b"\n")?;
452 writer.write_all(b"let mut builder = GraphBuilder::default();\n")?;
453 for node in 0..graph.num_nodes() {
454 writer.write_all(
455 format!(
456 "builder\n .node(swhid!({}))\n .unwrap()\n",
457 graph.properties().swhid(node)
458 )
459 .as_bytes(),
460 )?;
461 let mut write_line = |s: String| writer.write_all(format!(" {s}\n").as_bytes());
462 match graph.properties().node_type(node) {
463 NodeType::Content => {
464 write_line(format!(
465 ".is_skipped_content({})",
466 graph.properties().is_skipped_content(node),
467 ))?;
468 }
469 NodeType::Revision
470 | NodeType::Release
471 | NodeType::Directory
472 | NodeType::Snapshot
473 | NodeType::Origin => {}
474 }
475 if let Some(v) = graph.properties().content_length(node) {
476 write_line(format!(".content_length({v})"))?;
477 }
478 if let Some(v) = graph.properties().message(node) {
479 write_line(format!(".message(b\"{}\".to_vec())", bytestring(v)))?;
480 }
481 if let Some(v) = graph.properties().tag_name(node) {
482 write_line(format!(".tag_name(b\"{}\".to_vec())", bytestring(v)))?;
483 }
484 if let Some(v) = graph.properties().author_id(node) {
485 write_line(format!(".author(b\"{v}\".to_vec())"))?;
486 }
487 if let Some(ts) = graph.properties().author_timestamp(node) {
488 let offset = graph
489 .properties()
490 .author_timestamp_offset(node)
491 .expect("Node has author_timestamp but no author_timestamp_offset");
492 write_line(format!(".author_timestamp({ts}, {offset})"))?;
493 }
494 if let Some(v) = graph.properties().committer_id(node) {
495 write_line(format!(".committer(b\"{v}\".to_vec())"))?;
496 }
497 if let Some(ts) = graph.properties().committer_timestamp(node) {
498 let offset = graph
499 .properties()
500 .committer_timestamp_offset(node)
501 .expect("Node has committer_timestamp but no committer_timestamp_offset");
502 write_line(format!(".committer_timestamp({ts}, {offset})"))?;
503 }
504 writer.write_all(b" .done();\n")?;
505 }
506
507 writer.write_all(b"\n")?;
508
509 for node in 0..graph.num_nodes() {
510 for (succ, labels) in graph.labeled_successors(node) {
511 let mut has_labels = false;
512 for label in labels {
513 writer.write_all(
514 match label {
515 EdgeLabel::DirEntry(label) => format!(
516 "builder.dir_arc({}, {}, Permission::{:?}, b\"{}\".to_vec());\n",
517 node,
518 succ,
519 label.permission().expect("Invalid permission"),
520 bytestring(graph.properties().label_name(label.label_name_id())),
521 ),
522 EdgeLabel::Branch(label) => format!(
523 "builder.snp_arc({}, {}, b\"{}\".to_vec());\n",
524 node,
525 succ,
526 bytestring(graph.properties().label_name(label.label_name_id())),
527 ),
528 EdgeLabel::Visit(label) => format!(
529 "builder.ori_arc({}, {}, VisitStatus::{:?}, {});\n",
530 node,
531 succ,
532 label.status(),
533 label.timestamp(),
534 ),
535 }
536 .as_bytes(),
537 )?;
538 has_labels = true;
539 }
540 if !has_labels {
541 writer.write_all(format!("builder.arc({node}, {succ});\n").as_bytes())?;
542 }
543 }
544 }
545
546 writer.write_all(b"\nbuilder.done().expect(\"Could not build graph\")\n")?;
547
548 Ok(())
549}