1use std::collections::HashMap;
11use std::io::Write;
12
13use anyhow::{ensure, Context, Result};
14use itertools::Itertools;
15use webgraph::graphs::vec_graph::LabeledVecGraph;
16
17use crate::graph::*;
18use crate::labels::{
19 Branch, DirEntry, EdgeLabel, LabelNameId, Permission, UntypedEdgeLabel, Visit, VisitStatus,
20};
21use crate::properties;
22use crate::SwhGraphProperties;
23use crate::{NodeType, SWHID};
24
25#[derive(Default, Clone, Copy, Debug)]
27pub enum LabelNamesOrder {
28 #[default]
29 Lexicographic,
32 LexicographicBase64,
35}
36
37pub type BuiltGraph = SwhBidirectionalGraph<
39 SwhGraphProperties<
40 properties::VecMaps,
41 properties::VecTimestamps,
42 properties::VecPersons,
43 properties::VecContents,
44 properties::VecStrings,
45 properties::VecLabelNames,
46 >,
47 LabeledVecGraph<Vec<u64>>,
48 LabeledVecGraph<Vec<u64>>,
49>;
50
51#[derive(Clone, Debug, Default)]
97pub struct GraphBuilder {
98 pub label_names_order: LabelNamesOrder,
99
100 name_to_id: HashMap<Vec<u8>, u64>,
101 persons: HashMap<Vec<u8>, u32>,
102
103 arcs: Vec<(NodeId, NodeId, Option<EdgeLabel>)>,
104
105 swhids: Vec<SWHID>,
106 is_skipped_content: Vec<bool>,
107 content_lengths: Vec<Option<u64>>,
108 author_ids: Vec<Option<u32>>,
109 committer_ids: Vec<Option<u32>>,
110 messages: Vec<Option<Vec<u8>>>,
111 tag_names: Vec<Option<Vec<u8>>>,
112 author_timestamps: Vec<Option<i64>>,
113 author_timestamp_offsets: Vec<Option<i16>>,
114 committer_timestamps: Vec<Option<i64>>,
115 committer_timestamp_offsets: Vec<Option<i16>>,
116 label_names: Vec<Vec<u8>>,
117}
118
119impl GraphBuilder {
120 pub fn node(&mut self, swhid: SWHID) -> Result<NodeBuilder<'_>> {
124 ensure!(!self.swhids.contains(&swhid), "Duplicate SWHID {swhid}");
125 let node_id = self.swhids.len();
126 self.swhids.push(swhid);
127 self.is_skipped_content.push(false);
128 self.content_lengths.push(None);
129 self.author_ids.push(None);
130 self.committer_ids.push(None);
131 self.messages.push(None);
132 self.tag_names.push(None);
133 self.author_timestamps.push(None);
134 self.author_timestamp_offsets.push(None);
135 self.committer_timestamps.push(None);
136 self.committer_timestamp_offsets.push(None);
137 Ok(NodeBuilder {
138 node_id,
139 graph_builder: self,
140 })
141 }
142
143 pub fn arc(&mut self, src: NodeId, dst: NodeId) {
145 self.arcs.push((src, dst, None));
146 }
147
148 pub fn dir_arc<P: Into<Permission>, N: Into<Vec<u8>>>(
150 &mut self,
151 src: NodeId,
152 dst: NodeId,
153 permission: P,
154 name: N,
155 ) {
156 let permission = permission.into();
157 let name = name.into();
158 let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
159 self.label_names.push(name);
160 (self.label_names.len() - 1)
161 .try_into()
162 .expect("label_names length overflowed u64")
163 });
164 self.l_arc(
165 src,
166 dst,
167 DirEntry::new(permission, LabelNameId(name_id))
168 .expect("label_names is larger than 2^61 items"),
169 );
170 }
171
172 pub fn snp_arc<N: Into<Vec<u8>>>(&mut self, src: NodeId, dst: NodeId, name: N) {
174 let name = name.into();
175 let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
176 self.label_names.push(name);
177 (self.label_names.len() - 1)
178 .try_into()
179 .expect("label_names length overflowed u64")
180 });
181 self.l_arc(
182 src,
183 dst,
184 Branch::new(LabelNameId(name_id)).expect("label_names is larger than 2^61 items"),
185 );
186 }
187
188 pub fn ori_arc(&mut self, src: NodeId, dst: NodeId, status: VisitStatus, timestamp: u64) {
190 self.l_arc(
191 src,
192 dst,
193 Visit::new(status, timestamp).expect("invalid timestamp"),
194 );
195 }
196
197 pub fn l_arc<L: Into<EdgeLabel>>(&mut self, src: NodeId, dst: NodeId, label: L) {
199 self.arcs.push((src, dst, Some(label.into())));
200 }
201
202 #[allow(clippy::type_complexity)]
203 pub fn done(&self) -> Result<BuiltGraph> {
204 let num_nodes = self.swhids.len();
205 let mut seen = sux::prelude::BitVec::new(num_nodes);
206 for (src, dst, _) in self.arcs.iter() {
207 seen.set(*src, true);
208 seen.set(*dst, true);
209 }
210 for node in 0..num_nodes {
211 ensure!(
212 seen.get(node),
213 "VecGraph requires every node to have at least one arc, and {} does not have any",
214 node
215 );
216 }
217
218 let mut label_names_with_index: Vec<_> =
219 self.label_names.iter().cloned().enumerate().collect();
220 match self.label_names_order {
221 LabelNamesOrder::Lexicographic => label_names_with_index
222 .sort_unstable_by_key(|(_index, label_name)| label_name.clone()),
223 LabelNamesOrder::LexicographicBase64 => {
224 let base64 = base64_simd::STANDARD;
225 label_names_with_index.sort_unstable_by_key(|(_index, label_name)| {
226 base64.encode_to_string(label_name).into_bytes()
227 });
228 }
229 }
230 let mut label_permutation = vec![0; label_names_with_index.len()];
231 for (new_index, (old_index, _)) in label_names_with_index.iter().enumerate() {
232 label_permutation[*old_index] = new_index;
233 }
234 let label_names: Vec<_> = label_names_with_index
235 .into_iter()
236 .map(|(_index, label_name)| label_name)
237 .collect();
238
239 let mut arcs = self.arcs.clone();
240 arcs.sort_by_key(|(src, dst, _label)| (*src, *dst)); let arcs: Vec<(NodeId, NodeId, Vec<u64>)> = arcs
243 .into_iter()
244 .group_by(|(src, dst, _label)| (*src, *dst))
245 .into_iter()
246 .map(|((src, dst), arcs)| -> (NodeId, NodeId, Vec<u64>) {
247 let labels = arcs
248 .flat_map(|(_src, _dst, labels)| {
249 labels.map(|label| {
250 UntypedEdgeLabel::from(match label {
251 EdgeLabel::Branch(branch) => EdgeLabel::Branch(
252 Branch::new(LabelNameId(
253 label_permutation[branch.label_name_id().0 as usize] as u64,
254 ))
255 .expect("Label name permutation overflowed"),
256 ),
257 EdgeLabel::DirEntry(entry) => EdgeLabel::DirEntry(
258 DirEntry::new(
259 entry.permission().expect("invalid permission"),
260 LabelNameId(
261 label_permutation[entry.label_name_id().0 as usize]
262 as u64,
263 ),
264 )
265 .expect("Label name permutation overflowed"),
266 ),
267 EdgeLabel::Visit(visit) => EdgeLabel::Visit(visit),
268 })
269 .0
270 })
271 })
272 .collect();
273 (src, dst, labels)
274 })
275 .collect();
276
277 let backward_arcs: Vec<(NodeId, NodeId, Vec<u64>)> = arcs
278 .iter()
279 .map(|(src, dst, labels)| (*dst, *src, labels.clone()))
280 .collect();
281
282 SwhBidirectionalGraph::from_underlying_graphs(
283 std::path::PathBuf::default(),
284 LabeledVecGraph::from_arcs(arcs),
285 LabeledVecGraph::from_arcs(backward_arcs),
286 )
287 .init_properties()
288 .load_properties(|properties| {
289 properties
290 .with_maps(properties::VecMaps::new(self.swhids.clone()))
291 .context("Could not join maps")?
292 .with_contents(
293 properties::VecContents::new(
294 self.is_skipped_content
295 .iter()
296 .copied()
297 .zip(self.content_lengths.iter().copied())
298 .collect(),
299 )
300 .context("Could not build VecContents")?,
301 )
302 .context("Could not join VecContents")?
303 .with_label_names(
304 properties::VecLabelNames::new(label_names.clone())
305 .context("Could not build VecLabelNames")?,
306 )
307 .context("Could not join maps")?
308 .with_persons(
309 properties::VecPersons::new(
310 self.author_ids
311 .iter()
312 .copied()
313 .zip(self.committer_ids.iter().copied())
314 .collect(),
315 )
316 .context("Could not build VecPersons")?,
317 )
318 .context("Could not join persons")?
319 .with_strings(
320 properties::VecStrings::new(
321 self.messages
322 .iter()
323 .cloned()
324 .zip(self.tag_names.iter().cloned())
325 .collect(),
326 )
327 .context("Could not build VecStrings")?,
328 )
329 .context("Could not join strings")?
330 .with_timestamps(
331 properties::VecTimestamps::new(
332 self.author_timestamps
333 .iter()
334 .copied()
335 .zip(self.author_timestamp_offsets.iter().copied())
336 .zip(self.committer_timestamps.iter().copied())
337 .zip(self.committer_timestamp_offsets.iter().copied())
338 .map(|(((a_ts, a_ts_o), c_ts), c_ts_o)| (a_ts, a_ts_o, c_ts, c_ts_o))
339 .collect(),
340 )
341 .context("Could not build VecTimestamps")?,
342 )
343 .context("Could not join timestamps")
344 })
345 }
346}
347
348pub struct NodeBuilder<'builder> {
349 node_id: NodeId,
350 graph_builder: &'builder mut GraphBuilder,
351}
352
353impl NodeBuilder<'_> {
354 pub fn done(&self) -> NodeId {
355 self.node_id
356 }
357
358 pub fn content_length(&mut self, content_length: u64) -> &mut Self {
359 self.graph_builder.content_lengths[self.node_id] = Some(content_length);
360 self
361 }
362
363 pub fn is_skipped_content(&mut self, is_skipped_content: bool) -> &mut Self {
364 self.graph_builder.is_skipped_content[self.node_id] = is_skipped_content;
365 self
366 }
367
368 pub fn author(&mut self, author: Vec<u8>) -> &mut Self {
369 let next_author_id = self
370 .graph_builder
371 .persons
372 .len()
373 .try_into()
374 .expect("person names overflowed u32");
375 let author_id = self
376 .graph_builder
377 .persons
378 .entry(author)
379 .or_insert(next_author_id);
380 self.graph_builder.author_ids[self.node_id] = Some(*author_id);
381 self
382 }
383
384 pub fn committer(&mut self, committer: Vec<u8>) -> &mut Self {
385 let next_committer_id = self
386 .graph_builder
387 .persons
388 .len()
389 .try_into()
390 .expect("person names overflowed u32");
391 let committer_id = self
392 .graph_builder
393 .persons
394 .entry(committer)
395 .or_insert(next_committer_id);
396 self.graph_builder.committer_ids[self.node_id] = Some(*committer_id);
397 self
398 }
399
400 pub fn message(&mut self, message: Vec<u8>) -> &mut Self {
401 self.graph_builder.messages[self.node_id] = Some(message);
402 self
403 }
404
405 pub fn tag_name(&mut self, tag_name: Vec<u8>) -> &mut Self {
406 self.graph_builder.tag_names[self.node_id] = Some(tag_name);
407 self
408 }
409
410 pub fn author_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
411 self.graph_builder.author_timestamps[self.node_id] = Some(ts);
412 self.graph_builder.author_timestamp_offsets[self.node_id] = Some(offset);
413 self
414 }
415
416 pub fn committer_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
417 self.graph_builder.committer_timestamps[self.node_id] = Some(ts);
418 self.graph_builder.committer_timestamp_offsets[self.node_id] = Some(offset);
419 self
420 }
421}
422
423pub fn codegen_from_full_graph<
424 G: SwhLabeledForwardGraph
425 + SwhGraphWithProperties<
426 Maps: properties::Maps,
427 Timestamps: properties::Timestamps,
428 Persons: properties::Persons,
429 Contents: properties::Contents,
430 Strings: properties::Strings,
431 LabelNames: properties::LabelNames,
432 >,
433 W: Write,
434>(
435 graph: &G,
436 mut writer: W,
437) -> Result<(), std::io::Error> {
438 let bytestring = |b: Vec<u8>| b.into_iter().map(std::ascii::escape_default).join("");
440
441 writer.write_all(b"use swh_graph::swhid;\nuse swh_graph::graph_builder::GraphBuilder;\n")?;
442 writer.write_all(b"use swh_graph::labels::{Permission, VisitStatus};\n")?;
443 writer.write_all(b"\n")?;
444 writer.write_all(b"let mut builder = GraphBuilder::default();\n")?;
445 for node in 0..graph.num_nodes() {
446 writer.write_all(
447 format!(
448 "builder\n .node(swhid!({}))\n .unwrap()\n",
449 graph.properties().swhid(node)
450 )
451 .as_bytes(),
452 )?;
453 let mut write_line = |s: String| writer.write_all(format!(" {s}\n").as_bytes());
454 match graph.properties().node_type(node) {
455 NodeType::Content => {
456 write_line(format!(
457 ".is_skipped_content({})",
458 graph.properties().is_skipped_content(node),
459 ))?;
460 }
461 NodeType::Revision
462 | NodeType::Release
463 | NodeType::Directory
464 | NodeType::Snapshot
465 | NodeType::Origin => {}
466 }
467 if let Some(v) = graph.properties().content_length(node) {
468 write_line(format!(".content_length({v})"))?;
469 }
470 if let Some(v) = graph.properties().message(node) {
471 write_line(format!(".message(b\"{}\".to_vec())", bytestring(v)))?;
472 }
473 if let Some(v) = graph.properties().tag_name(node) {
474 write_line(format!(".tag_name(b\"{}\".to_vec())", bytestring(v)))?;
475 }
476 if let Some(v) = graph.properties().author_id(node) {
477 write_line(format!(".author(b\"{v}\".to_vec())"))?;
478 }
479 if let Some(ts) = graph.properties().author_timestamp(node) {
480 let offset = graph
481 .properties()
482 .author_timestamp_offset(node)
483 .expect("Node has author_timestamp but no author_timestamp_offset");
484 write_line(format!(".author_timestamp({ts}, {offset})"))?;
485 }
486 if let Some(v) = graph.properties().committer_id(node) {
487 write_line(format!(".committer(b\"{v}\".to_vec())"))?;
488 }
489 if let Some(ts) = graph.properties().committer_timestamp(node) {
490 let offset = graph
491 .properties()
492 .committer_timestamp_offset(node)
493 .expect("Node has committer_timestamp but no committer_timestamp_offset");
494 write_line(format!(".committer_timestamp({ts}, {offset})"))?;
495 }
496 writer.write_all(b" .done();\n")?;
497 }
498
499 writer.write_all(b"\n")?;
500
501 for node in 0..graph.num_nodes() {
502 for (succ, labels) in graph.labeled_successors(node) {
503 let mut has_labels = false;
504 for label in labels {
505 writer.write_all(
506 match label {
507 EdgeLabel::DirEntry(label) => format!(
508 "builder.dir_arc({}, {}, Permission::{:?}, b\"{}\".to_vec());\n",
509 node,
510 succ,
511 label.permission().expect("Invalid permission"),
512 bytestring(graph.properties().label_name(label.label_name_id())),
513 ),
514 EdgeLabel::Branch(label) => format!(
515 "builder.snp_arc({}, {}, b\"{}\".to_vec());\n",
516 node,
517 succ,
518 bytestring(graph.properties().label_name(label.label_name_id())),
519 ),
520 EdgeLabel::Visit(label) => format!(
521 "builder.ori_arc({}, {}, VisitStatus::{:?}, {});\n",
522 node,
523 succ,
524 label.status(),
525 label.timestamp(),
526 ),
527 }
528 .as_bytes(),
529 )?;
530 has_labels = true;
531 }
532 if !has_labels {
533 writer.write_all(format!("builder.arc({node}, {succ});\n").as_bytes())?;
534 }
535 }
536 }
537
538 writer.write_all(b"\nbuilder.done().expect(\"Could not build graph\")\n")?;
539
540 Ok(())
541}