1use std::collections::HashMap;
11use std::io::Write;
12
13use anyhow::{ensure, Context, Result};
14use itertools::Itertools;
15use sux::traits::bit_vec_ops::{BitVecOps, BitVecOpsMut};
16use webgraph::graphs::vec_graph::LabeledVecGraph;
17
18use crate::graph::*;
19use crate::labels::{
20 Branch, DirEntry, EdgeLabel, LabelNameId, Permission, UntypedEdgeLabel, Visit, VisitStatus,
21};
22use crate::properties;
23use crate::SwhGraphProperties;
24use crate::{NodeType, SWHID};
25
26#[derive(Default, Clone, Copy, Debug)]
28pub enum LabelNamesOrder {
29 #[default]
30 Lexicographic,
33 LexicographicBase64,
36}
37
38pub type BuiltGraph = SwhBidirectionalGraph<
40 SwhGraphProperties<
41 properties::VecMaps,
42 properties::VecTimestamps,
43 properties::VecPersons,
44 properties::VecContents,
45 properties::VecStrings,
46 properties::VecLabelNames,
47 >,
48 LabeledVecGraph<Vec<u64>>,
49 LabeledVecGraph<Vec<u64>>,
50>;
51
52#[derive(Clone, Debug, Default)]
98pub struct GraphBuilder {
99 pub label_names_order: LabelNamesOrder,
100
101 name_to_id: HashMap<Vec<u8>, u64>,
102 persons: HashMap<Vec<u8>, u32>,
103
104 arcs: Vec<((NodeId, NodeId), Option<EdgeLabel>)>,
105
106 swhids: Vec<SWHID>,
107 is_skipped_content: Vec<bool>,
108 content_lengths: Vec<Option<u64>>,
109 author_ids: Vec<Option<u32>>,
110 committer_ids: Vec<Option<u32>>,
111 messages: Vec<Option<Vec<u8>>>,
112 tag_names: Vec<Option<Vec<u8>>>,
113 author_timestamps: Vec<Option<i64>>,
114 author_timestamp_offsets: Vec<Option<i16>>,
115 committer_timestamps: Vec<Option<i64>>,
116 committer_timestamp_offsets: Vec<Option<i16>>,
117 label_names: Vec<Vec<u8>>,
118}
119
120impl GraphBuilder {
121 pub fn node(&mut self, swhid: SWHID) -> Result<NodeBuilder<'_>> {
125 ensure!(!self.swhids.contains(&swhid), "Duplicate SWHID {swhid}");
126 let node_id = self.swhids.len();
127 self.swhids.push(swhid);
128 self.is_skipped_content.push(false);
129 self.content_lengths.push(None);
130 self.author_ids.push(None);
131 self.committer_ids.push(None);
132 self.messages.push(None);
133 self.tag_names.push(None);
134 self.author_timestamps.push(None);
135 self.author_timestamp_offsets.push(None);
136 self.committer_timestamps.push(None);
137 self.committer_timestamp_offsets.push(None);
138 Ok(NodeBuilder {
139 node_id,
140 graph_builder: self,
141 })
142 }
143
144 pub fn node_id(&self, swhid: SWHID) -> Option<NodeId> {
146 self.swhids.iter().position(|x| *x == swhid)
147 }
148
149 pub fn arc(&mut self, src: NodeId, dst: NodeId) {
151 self.arcs.push(((src, dst), None));
152 }
153
154 pub fn dir_arc<P: Into<Permission>, N: Into<Vec<u8>>>(
156 &mut self,
157 src: NodeId,
158 dst: NodeId,
159 permission: P,
160 name: N,
161 ) {
162 let permission = permission.into();
163 let name = name.into();
164 let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
165 self.label_names.push(name);
166 (self.label_names.len() - 1)
167 .try_into()
168 .expect("label_names length overflowed u64")
169 });
170 self.l_arc(
171 src,
172 dst,
173 DirEntry::new(permission, LabelNameId(name_id))
174 .expect("label_names is larger than 2^61 items"),
175 );
176 }
177
178 pub fn snp_arc<N: Into<Vec<u8>>>(&mut self, src: NodeId, dst: NodeId, name: N) {
180 let name = name.into();
181 let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
182 self.label_names.push(name);
183 (self.label_names.len() - 1)
184 .try_into()
185 .expect("label_names length overflowed u64")
186 });
187 self.l_arc(
188 src,
189 dst,
190 Branch::new(LabelNameId(name_id)).expect("label_names is larger than 2^61 items"),
191 );
192 }
193
194 pub fn ori_arc(&mut self, src: NodeId, dst: NodeId, status: VisitStatus, timestamp: u64) {
196 self.l_arc(
197 src,
198 dst,
199 Visit::new(status, timestamp).expect("invalid timestamp"),
200 );
201 }
202
203 pub fn l_arc<L: Into<EdgeLabel>>(&mut self, src: NodeId, dst: NodeId, label: L) {
205 self.arcs.push(((src, dst), Some(label.into())));
206 }
207
208 #[allow(clippy::type_complexity)]
209 pub fn done(&self) -> Result<BuiltGraph> {
210 let num_nodes = self.swhids.len();
211 let mut seen = sux::prelude::BitVec::new(num_nodes);
212 for ((src, dst), _) in self.arcs.iter() {
213 seen.set(*src, true);
214 seen.set(*dst, true);
215 }
216 for node in 0..num_nodes {
217 ensure!(
218 seen.get(node),
219 "VecGraph requires every node to have at least one arc, and {} does not have any",
220 node
221 );
222 }
223
224 let mut label_names_with_index: Vec<_> =
225 self.label_names.iter().cloned().enumerate().collect();
226 match self.label_names_order {
227 LabelNamesOrder::Lexicographic => label_names_with_index
228 .sort_unstable_by_key(|(_index, label_name)| label_name.clone()),
229 LabelNamesOrder::LexicographicBase64 => {
230 let base64 = base64_simd::STANDARD;
231 label_names_with_index.sort_unstable_by_key(|(_index, label_name)| {
232 base64.encode_to_string(label_name).into_bytes()
233 });
234 }
235 }
236 let mut label_permutation = vec![0; label_names_with_index.len()];
237 for (new_index, (old_index, _)) in label_names_with_index.iter().enumerate() {
238 label_permutation[*old_index] = new_index;
239 }
240 let label_names: Vec<_> = label_names_with_index
241 .into_iter()
242 .map(|(_index, label_name)| label_name)
243 .collect();
244
245 let mut arcs = self.arcs.clone();
246 arcs.sort_by_key(|((src, dst), _label)| (*src, *dst)); let arcs: Vec<((NodeId, NodeId), Vec<u64>)> = arcs
249 .into_iter()
250 .group_by(|((src, dst), _label)| (*src, *dst))
251 .into_iter()
252 .map(|((src, dst), arcs)| -> ((NodeId, NodeId), Vec<u64>) {
253 let labels = arcs
254 .flat_map(|((_src, _dst), labels)| {
255 labels.map(|label| {
256 UntypedEdgeLabel::from(match label {
257 EdgeLabel::Branch(branch) => EdgeLabel::Branch(
258 Branch::new(LabelNameId(
259 label_permutation[branch.label_name_id().0 as usize] as u64,
260 ))
261 .expect("Label name permutation overflowed"),
262 ),
263 EdgeLabel::DirEntry(entry) => EdgeLabel::DirEntry(
264 DirEntry::new(
265 entry.permission().expect("invalid permission"),
266 LabelNameId(
267 label_permutation[entry.label_name_id().0 as usize]
268 as u64,
269 ),
270 )
271 .expect("Label name permutation overflowed"),
272 ),
273 EdgeLabel::Visit(visit) => EdgeLabel::Visit(visit),
274 })
275 .0
276 })
277 })
278 .collect();
279 ((src, dst), labels)
280 })
281 .collect();
282
283 let backward_arcs: Vec<((NodeId, NodeId), Vec<u64>)> = arcs
284 .iter()
285 .map(|((src, dst), labels)| ((*dst, *src), labels.clone()))
286 .collect();
287
288 SwhBidirectionalGraph::from_underlying_graphs(
289 std::path::PathBuf::default(),
290 LabeledVecGraph::from_arcs(arcs),
291 LabeledVecGraph::from_arcs(backward_arcs),
292 )
293 .init_properties()
294 .load_properties(|properties| {
295 properties
296 .with_maps(properties::VecMaps::new(self.swhids.clone()))
297 .context("Could not join maps")?
298 .with_contents(
299 properties::VecContents::new(
300 self.is_skipped_content
301 .iter()
302 .copied()
303 .zip(self.content_lengths.iter().copied())
304 .collect(),
305 )
306 .context("Could not build VecContents")?,
307 )
308 .context("Could not join VecContents")?
309 .with_label_names(
310 properties::VecLabelNames::new(label_names.clone())
311 .context("Could not build VecLabelNames")?,
312 )
313 .context("Could not join maps")?
314 .with_persons(
315 properties::VecPersons::new(
316 self.author_ids
317 .iter()
318 .copied()
319 .zip(self.committer_ids.iter().copied())
320 .collect(),
321 )
322 .context("Could not build VecPersons")?,
323 )
324 .context("Could not join persons")?
325 .with_strings(
326 properties::VecStrings::new(
327 self.messages
328 .iter()
329 .cloned()
330 .zip(self.tag_names.iter().cloned())
331 .collect(),
332 )
333 .context("Could not build VecStrings")?,
334 )
335 .context("Could not join strings")?
336 .with_timestamps(
337 properties::VecTimestamps::new(
338 self.author_timestamps
339 .iter()
340 .copied()
341 .zip(self.author_timestamp_offsets.iter().copied())
342 .zip(self.committer_timestamps.iter().copied())
343 .zip(self.committer_timestamp_offsets.iter().copied())
344 .map(|(((a_ts, a_ts_o), c_ts), c_ts_o)| (a_ts, a_ts_o, c_ts, c_ts_o))
345 .collect(),
346 )
347 .context("Could not build VecTimestamps")?,
348 )
349 .context("Could not join timestamps")
350 })
351 }
352}
353
354pub struct NodeBuilder<'builder> {
355 node_id: NodeId,
356 graph_builder: &'builder mut GraphBuilder,
357}
358
359impl NodeBuilder<'_> {
360 pub fn done(&self) -> NodeId {
361 self.node_id
362 }
363
364 pub fn content_length(&mut self, content_length: u64) -> &mut Self {
365 self.graph_builder.content_lengths[self.node_id] = Some(content_length);
366 self
367 }
368
369 pub fn is_skipped_content(&mut self, is_skipped_content: bool) -> &mut Self {
370 self.graph_builder.is_skipped_content[self.node_id] = is_skipped_content;
371 self
372 }
373
374 pub fn author(&mut self, author: Vec<u8>) -> &mut Self {
375 let next_author_id = self
376 .graph_builder
377 .persons
378 .len()
379 .try_into()
380 .expect("person names overflowed u32");
381 let author_id = self
382 .graph_builder
383 .persons
384 .entry(author)
385 .or_insert(next_author_id);
386 self.graph_builder.author_ids[self.node_id] = Some(*author_id);
387 self
388 }
389
390 pub fn committer(&mut self, committer: Vec<u8>) -> &mut Self {
391 let next_committer_id = self
392 .graph_builder
393 .persons
394 .len()
395 .try_into()
396 .expect("person names overflowed u32");
397 let committer_id = self
398 .graph_builder
399 .persons
400 .entry(committer)
401 .or_insert(next_committer_id);
402 self.graph_builder.committer_ids[self.node_id] = Some(*committer_id);
403 self
404 }
405
406 pub fn message(&mut self, message: Vec<u8>) -> &mut Self {
407 self.graph_builder.messages[self.node_id] = Some(message);
408 self
409 }
410
411 pub fn tag_name(&mut self, tag_name: Vec<u8>) -> &mut Self {
412 self.graph_builder.tag_names[self.node_id] = Some(tag_name);
413 self
414 }
415
416 pub fn author_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
417 self.graph_builder.author_timestamps[self.node_id] = Some(ts);
418 self.graph_builder.author_timestamp_offsets[self.node_id] = Some(offset);
419 self
420 }
421
422 pub fn committer_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
423 self.graph_builder.committer_timestamps[self.node_id] = Some(ts);
424 self.graph_builder.committer_timestamp_offsets[self.node_id] = Some(offset);
425 self
426 }
427}
428
429pub fn codegen_from_full_graph<
430 G: SwhLabeledForwardGraph
431 + SwhGraphWithProperties<
432 Maps: properties::Maps,
433 Timestamps: properties::Timestamps,
434 Persons: properties::Persons,
435 Contents: properties::Contents,
436 Strings: properties::Strings,
437 LabelNames: properties::LabelNames,
438 >,
439 W: Write,
440>(
441 graph: &G,
442 mut writer: W,
443) -> Result<(), std::io::Error> {
444 let bytestring = |b: Vec<u8>| b.into_iter().map(std::ascii::escape_default).join("");
446
447 writer.write_all(b"use swh_graph::swhid;\nuse swh_graph::graph_builder::GraphBuilder;\n")?;
448 writer.write_all(b"use swh_graph::labels::{Permission, VisitStatus};\n")?;
449 writer.write_all(b"\n")?;
450 writer.write_all(b"let mut builder = GraphBuilder::default();\n")?;
451 for node in 0..graph.num_nodes() {
452 writer.write_all(
453 format!(
454 "builder\n .node(swhid!({}))\n .unwrap()\n",
455 graph.properties().swhid(node)
456 )
457 .as_bytes(),
458 )?;
459 let mut write_line = |s: String| writer.write_all(format!(" {s}\n").as_bytes());
460 match graph.properties().node_type(node) {
461 NodeType::Content => {
462 write_line(format!(
463 ".is_skipped_content({})",
464 graph.properties().is_skipped_content(node),
465 ))?;
466 }
467 NodeType::Revision
468 | NodeType::Release
469 | NodeType::Directory
470 | NodeType::Snapshot
471 | NodeType::Origin => {}
472 }
473 if let Some(v) = graph.properties().content_length(node) {
474 write_line(format!(".content_length({v})"))?;
475 }
476 if let Some(v) = graph.properties().message(node) {
477 write_line(format!(".message(b\"{}\".to_vec())", bytestring(v)))?;
478 }
479 if let Some(v) = graph.properties().tag_name(node) {
480 write_line(format!(".tag_name(b\"{}\".to_vec())", bytestring(v)))?;
481 }
482 if let Some(v) = graph.properties().author_id(node) {
483 write_line(format!(".author(b\"{v}\".to_vec())"))?;
484 }
485 if let Some(ts) = graph.properties().author_timestamp(node) {
486 let offset = graph
487 .properties()
488 .author_timestamp_offset(node)
489 .expect("Node has author_timestamp but no author_timestamp_offset");
490 write_line(format!(".author_timestamp({ts}, {offset})"))?;
491 }
492 if let Some(v) = graph.properties().committer_id(node) {
493 write_line(format!(".committer(b\"{v}\".to_vec())"))?;
494 }
495 if let Some(ts) = graph.properties().committer_timestamp(node) {
496 let offset = graph
497 .properties()
498 .committer_timestamp_offset(node)
499 .expect("Node has committer_timestamp but no committer_timestamp_offset");
500 write_line(format!(".committer_timestamp({ts}, {offset})"))?;
501 }
502 writer.write_all(b" .done();\n")?;
503 }
504
505 writer.write_all(b"\n")?;
506
507 for node in 0..graph.num_nodes() {
508 for (succ, labels) in graph.labeled_successors(node) {
509 let mut has_labels = false;
510 for label in labels {
511 writer.write_all(
512 match label {
513 EdgeLabel::DirEntry(label) => format!(
514 "builder.dir_arc({}, {}, Permission::{:?}, b\"{}\".to_vec());\n",
515 node,
516 succ,
517 label.permission().expect("Invalid permission"),
518 bytestring(graph.properties().label_name(label.label_name_id())),
519 ),
520 EdgeLabel::Branch(label) => format!(
521 "builder.snp_arc({}, {}, b\"{}\".to_vec());\n",
522 node,
523 succ,
524 bytestring(graph.properties().label_name(label.label_name_id())),
525 ),
526 EdgeLabel::Visit(label) => format!(
527 "builder.ori_arc({}, {}, VisitStatus::{:?}, {});\n",
528 node,
529 succ,
530 label.status(),
531 label.timestamp(),
532 ),
533 }
534 .as_bytes(),
535 )?;
536 has_labels = true;
537 }
538 if !has_labels {
539 writer.write_all(format!("builder.arc({node}, {succ});\n").as_bytes())?;
540 }
541 }
542 }
543
544 writer.write_all(b"\nbuilder.done().expect(\"Could not build graph\")\n")?;
545
546 Ok(())
547}