1use std::collections::HashMap;
11use std::io::Write;
12
13use anyhow::{ensure, Context, Result};
14use itertools::Itertools;
15use webgraph::graphs::vec_graph::LabeledVecGraph;
16
17use crate::graph::*;
18use crate::labels::{
19 Branch, DirEntry, EdgeLabel, LabelNameId, Permission, UntypedEdgeLabel, Visit, VisitStatus,
20};
21use crate::properties;
22use crate::SwhGraphProperties;
23use crate::{NodeType, SWHID};
24
25#[derive(Default, Clone, Copy, Debug)]
27pub enum LabelNamesOrder {
28 #[default]
29 Lexicographic,
32 LexicographicBase64,
35}
36
37pub type BuiltGraph = SwhBidirectionalGraph<
39 SwhGraphProperties<
40 properties::VecMaps,
41 properties::VecTimestamps,
42 properties::VecPersons,
43 properties::VecContents,
44 properties::VecStrings,
45 properties::VecLabelNames,
46 >,
47 LabeledVecGraph<Vec<u64>>,
48 LabeledVecGraph<Vec<u64>>,
49>;
50
51#[derive(Clone, Debug, Default)]
97pub struct GraphBuilder {
98 pub label_names_order: LabelNamesOrder,
99
100 name_to_id: HashMap<Vec<u8>, u64>,
101 persons: HashMap<Vec<u8>, u32>,
102
103 arcs: Vec<(NodeId, NodeId, Option<EdgeLabel>)>,
104
105 swhids: Vec<SWHID>,
106 is_skipped_content: Vec<bool>,
107 content_lengths: Vec<Option<u64>>,
108 author_ids: Vec<Option<u32>>,
109 committer_ids: Vec<Option<u32>>,
110 messages: Vec<Option<Vec<u8>>>,
111 tag_names: Vec<Option<Vec<u8>>>,
112 author_timestamps: Vec<Option<i64>>,
113 author_timestamp_offsets: Vec<Option<i16>>,
114 committer_timestamps: Vec<Option<i64>>,
115 committer_timestamp_offsets: Vec<Option<i16>>,
116 label_names: Vec<Vec<u8>>,
117}
118
119impl GraphBuilder {
120 pub fn node(&mut self, swhid: SWHID) -> Result<NodeBuilder<'_>> {
124 ensure!(!self.swhids.contains(&swhid), "Duplicate SWHID {swhid}");
125 let node_id = self.swhids.len();
126 self.swhids.push(swhid);
127 self.is_skipped_content.push(false);
128 self.content_lengths.push(None);
129 self.author_ids.push(None);
130 self.committer_ids.push(None);
131 self.messages.push(None);
132 self.tag_names.push(None);
133 self.author_timestamps.push(None);
134 self.author_timestamp_offsets.push(None);
135 self.committer_timestamps.push(None);
136 self.committer_timestamp_offsets.push(None);
137 Ok(NodeBuilder {
138 node_id,
139 graph_builder: self,
140 })
141 }
142
143 pub fn node_id(&self, swhid: SWHID) -> Option<NodeId> {
145 self.swhids.iter().position(|x| *x == swhid)
146 }
147
148 pub fn arc(&mut self, src: NodeId, dst: NodeId) {
150 self.arcs.push((src, dst, None));
151 }
152
153 pub fn dir_arc<P: Into<Permission>, N: Into<Vec<u8>>>(
155 &mut self,
156 src: NodeId,
157 dst: NodeId,
158 permission: P,
159 name: N,
160 ) {
161 let permission = permission.into();
162 let name = name.into();
163 let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
164 self.label_names.push(name);
165 (self.label_names.len() - 1)
166 .try_into()
167 .expect("label_names length overflowed u64")
168 });
169 self.l_arc(
170 src,
171 dst,
172 DirEntry::new(permission, LabelNameId(name_id))
173 .expect("label_names is larger than 2^61 items"),
174 );
175 }
176
177 pub fn snp_arc<N: Into<Vec<u8>>>(&mut self, src: NodeId, dst: NodeId, name: N) {
179 let name = name.into();
180 let name_id = *self.name_to_id.entry(name.clone()).or_insert_with(|| {
181 self.label_names.push(name);
182 (self.label_names.len() - 1)
183 .try_into()
184 .expect("label_names length overflowed u64")
185 });
186 self.l_arc(
187 src,
188 dst,
189 Branch::new(LabelNameId(name_id)).expect("label_names is larger than 2^61 items"),
190 );
191 }
192
193 pub fn ori_arc(&mut self, src: NodeId, dst: NodeId, status: VisitStatus, timestamp: u64) {
195 self.l_arc(
196 src,
197 dst,
198 Visit::new(status, timestamp).expect("invalid timestamp"),
199 );
200 }
201
202 pub fn l_arc<L: Into<EdgeLabel>>(&mut self, src: NodeId, dst: NodeId, label: L) {
204 self.arcs.push((src, dst, Some(label.into())));
205 }
206
207 #[allow(clippy::type_complexity)]
208 pub fn done(&self) -> Result<BuiltGraph> {
209 let num_nodes = self.swhids.len();
210 let mut seen = sux::prelude::BitVec::new(num_nodes);
211 for (src, dst, _) in self.arcs.iter() {
212 seen.set(*src, true);
213 seen.set(*dst, true);
214 }
215 for node in 0..num_nodes {
216 ensure!(
217 seen.get(node),
218 "VecGraph requires every node to have at least one arc, and {} does not have any",
219 node
220 );
221 }
222
223 let mut label_names_with_index: Vec<_> =
224 self.label_names.iter().cloned().enumerate().collect();
225 match self.label_names_order {
226 LabelNamesOrder::Lexicographic => label_names_with_index
227 .sort_unstable_by_key(|(_index, label_name)| label_name.clone()),
228 LabelNamesOrder::LexicographicBase64 => {
229 let base64 = base64_simd::STANDARD;
230 label_names_with_index.sort_unstable_by_key(|(_index, label_name)| {
231 base64.encode_to_string(label_name).into_bytes()
232 });
233 }
234 }
235 let mut label_permutation = vec![0; label_names_with_index.len()];
236 for (new_index, (old_index, _)) in label_names_with_index.iter().enumerate() {
237 label_permutation[*old_index] = new_index;
238 }
239 let label_names: Vec<_> = label_names_with_index
240 .into_iter()
241 .map(|(_index, label_name)| label_name)
242 .collect();
243
244 let mut arcs = self.arcs.clone();
245 arcs.sort_by_key(|(src, dst, _label)| (*src, *dst)); let arcs: Vec<(NodeId, NodeId, Vec<u64>)> = arcs
248 .into_iter()
249 .group_by(|(src, dst, _label)| (*src, *dst))
250 .into_iter()
251 .map(|((src, dst), arcs)| -> (NodeId, NodeId, Vec<u64>) {
252 let labels = arcs
253 .flat_map(|(_src, _dst, labels)| {
254 labels.map(|label| {
255 UntypedEdgeLabel::from(match label {
256 EdgeLabel::Branch(branch) => EdgeLabel::Branch(
257 Branch::new(LabelNameId(
258 label_permutation[branch.label_name_id().0 as usize] as u64,
259 ))
260 .expect("Label name permutation overflowed"),
261 ),
262 EdgeLabel::DirEntry(entry) => EdgeLabel::DirEntry(
263 DirEntry::new(
264 entry.permission().expect("invalid permission"),
265 LabelNameId(
266 label_permutation[entry.label_name_id().0 as usize]
267 as u64,
268 ),
269 )
270 .expect("Label name permutation overflowed"),
271 ),
272 EdgeLabel::Visit(visit) => EdgeLabel::Visit(visit),
273 })
274 .0
275 })
276 })
277 .collect();
278 (src, dst, labels)
279 })
280 .collect();
281
282 let backward_arcs: Vec<(NodeId, NodeId, Vec<u64>)> = arcs
283 .iter()
284 .map(|(src, dst, labels)| (*dst, *src, labels.clone()))
285 .collect();
286
287 SwhBidirectionalGraph::from_underlying_graphs(
288 std::path::PathBuf::default(),
289 LabeledVecGraph::from_arcs(arcs),
290 LabeledVecGraph::from_arcs(backward_arcs),
291 )
292 .init_properties()
293 .load_properties(|properties| {
294 properties
295 .with_maps(properties::VecMaps::new(self.swhids.clone()))
296 .context("Could not join maps")?
297 .with_contents(
298 properties::VecContents::new(
299 self.is_skipped_content
300 .iter()
301 .copied()
302 .zip(self.content_lengths.iter().copied())
303 .collect(),
304 )
305 .context("Could not build VecContents")?,
306 )
307 .context("Could not join VecContents")?
308 .with_label_names(
309 properties::VecLabelNames::new(label_names.clone())
310 .context("Could not build VecLabelNames")?,
311 )
312 .context("Could not join maps")?
313 .with_persons(
314 properties::VecPersons::new(
315 self.author_ids
316 .iter()
317 .copied()
318 .zip(self.committer_ids.iter().copied())
319 .collect(),
320 )
321 .context("Could not build VecPersons")?,
322 )
323 .context("Could not join persons")?
324 .with_strings(
325 properties::VecStrings::new(
326 self.messages
327 .iter()
328 .cloned()
329 .zip(self.tag_names.iter().cloned())
330 .collect(),
331 )
332 .context("Could not build VecStrings")?,
333 )
334 .context("Could not join strings")?
335 .with_timestamps(
336 properties::VecTimestamps::new(
337 self.author_timestamps
338 .iter()
339 .copied()
340 .zip(self.author_timestamp_offsets.iter().copied())
341 .zip(self.committer_timestamps.iter().copied())
342 .zip(self.committer_timestamp_offsets.iter().copied())
343 .map(|(((a_ts, a_ts_o), c_ts), c_ts_o)| (a_ts, a_ts_o, c_ts, c_ts_o))
344 .collect(),
345 )
346 .context("Could not build VecTimestamps")?,
347 )
348 .context("Could not join timestamps")
349 })
350 }
351}
352
353pub struct NodeBuilder<'builder> {
354 node_id: NodeId,
355 graph_builder: &'builder mut GraphBuilder,
356}
357
358impl NodeBuilder<'_> {
359 pub fn done(&self) -> NodeId {
360 self.node_id
361 }
362
363 pub fn content_length(&mut self, content_length: u64) -> &mut Self {
364 self.graph_builder.content_lengths[self.node_id] = Some(content_length);
365 self
366 }
367
368 pub fn is_skipped_content(&mut self, is_skipped_content: bool) -> &mut Self {
369 self.graph_builder.is_skipped_content[self.node_id] = is_skipped_content;
370 self
371 }
372
373 pub fn author(&mut self, author: Vec<u8>) -> &mut Self {
374 let next_author_id = self
375 .graph_builder
376 .persons
377 .len()
378 .try_into()
379 .expect("person names overflowed u32");
380 let author_id = self
381 .graph_builder
382 .persons
383 .entry(author)
384 .or_insert(next_author_id);
385 self.graph_builder.author_ids[self.node_id] = Some(*author_id);
386 self
387 }
388
389 pub fn committer(&mut self, committer: Vec<u8>) -> &mut Self {
390 let next_committer_id = self
391 .graph_builder
392 .persons
393 .len()
394 .try_into()
395 .expect("person names overflowed u32");
396 let committer_id = self
397 .graph_builder
398 .persons
399 .entry(committer)
400 .or_insert(next_committer_id);
401 self.graph_builder.committer_ids[self.node_id] = Some(*committer_id);
402 self
403 }
404
405 pub fn message(&mut self, message: Vec<u8>) -> &mut Self {
406 self.graph_builder.messages[self.node_id] = Some(message);
407 self
408 }
409
410 pub fn tag_name(&mut self, tag_name: Vec<u8>) -> &mut Self {
411 self.graph_builder.tag_names[self.node_id] = Some(tag_name);
412 self
413 }
414
415 pub fn author_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
416 self.graph_builder.author_timestamps[self.node_id] = Some(ts);
417 self.graph_builder.author_timestamp_offsets[self.node_id] = Some(offset);
418 self
419 }
420
421 pub fn committer_timestamp(&mut self, ts: i64, offset: i16) -> &mut Self {
422 self.graph_builder.committer_timestamps[self.node_id] = Some(ts);
423 self.graph_builder.committer_timestamp_offsets[self.node_id] = Some(offset);
424 self
425 }
426}
427
428pub fn codegen_from_full_graph<
429 G: SwhLabeledForwardGraph
430 + SwhGraphWithProperties<
431 Maps: properties::Maps,
432 Timestamps: properties::Timestamps,
433 Persons: properties::Persons,
434 Contents: properties::Contents,
435 Strings: properties::Strings,
436 LabelNames: properties::LabelNames,
437 >,
438 W: Write,
439>(
440 graph: &G,
441 mut writer: W,
442) -> Result<(), std::io::Error> {
443 let bytestring = |b: Vec<u8>| b.into_iter().map(std::ascii::escape_default).join("");
445
446 writer.write_all(b"use swh_graph::swhid;\nuse swh_graph::graph_builder::GraphBuilder;\n")?;
447 writer.write_all(b"use swh_graph::labels::{Permission, VisitStatus};\n")?;
448 writer.write_all(b"\n")?;
449 writer.write_all(b"let mut builder = GraphBuilder::default();\n")?;
450 for node in 0..graph.num_nodes() {
451 writer.write_all(
452 format!(
453 "builder\n .node(swhid!({}))\n .unwrap()\n",
454 graph.properties().swhid(node)
455 )
456 .as_bytes(),
457 )?;
458 let mut write_line = |s: String| writer.write_all(format!(" {s}\n").as_bytes());
459 match graph.properties().node_type(node) {
460 NodeType::Content => {
461 write_line(format!(
462 ".is_skipped_content({})",
463 graph.properties().is_skipped_content(node),
464 ))?;
465 }
466 NodeType::Revision
467 | NodeType::Release
468 | NodeType::Directory
469 | NodeType::Snapshot
470 | NodeType::Origin => {}
471 }
472 if let Some(v) = graph.properties().content_length(node) {
473 write_line(format!(".content_length({v})"))?;
474 }
475 if let Some(v) = graph.properties().message(node) {
476 write_line(format!(".message(b\"{}\".to_vec())", bytestring(v)))?;
477 }
478 if let Some(v) = graph.properties().tag_name(node) {
479 write_line(format!(".tag_name(b\"{}\".to_vec())", bytestring(v)))?;
480 }
481 if let Some(v) = graph.properties().author_id(node) {
482 write_line(format!(".author(b\"{v}\".to_vec())"))?;
483 }
484 if let Some(ts) = graph.properties().author_timestamp(node) {
485 let offset = graph
486 .properties()
487 .author_timestamp_offset(node)
488 .expect("Node has author_timestamp but no author_timestamp_offset");
489 write_line(format!(".author_timestamp({ts}, {offset})"))?;
490 }
491 if let Some(v) = graph.properties().committer_id(node) {
492 write_line(format!(".committer(b\"{v}\".to_vec())"))?;
493 }
494 if let Some(ts) = graph.properties().committer_timestamp(node) {
495 let offset = graph
496 .properties()
497 .committer_timestamp_offset(node)
498 .expect("Node has committer_timestamp but no committer_timestamp_offset");
499 write_line(format!(".committer_timestamp({ts}, {offset})"))?;
500 }
501 writer.write_all(b" .done();\n")?;
502 }
503
504 writer.write_all(b"\n")?;
505
506 for node in 0..graph.num_nodes() {
507 for (succ, labels) in graph.labeled_successors(node) {
508 let mut has_labels = false;
509 for label in labels {
510 writer.write_all(
511 match label {
512 EdgeLabel::DirEntry(label) => format!(
513 "builder.dir_arc({}, {}, Permission::{:?}, b\"{}\".to_vec());\n",
514 node,
515 succ,
516 label.permission().expect("Invalid permission"),
517 bytestring(graph.properties().label_name(label.label_name_id())),
518 ),
519 EdgeLabel::Branch(label) => format!(
520 "builder.snp_arc({}, {}, b\"{}\".to_vec());\n",
521 node,
522 succ,
523 bytestring(graph.properties().label_name(label.label_name_id())),
524 ),
525 EdgeLabel::Visit(label) => format!(
526 "builder.ori_arc({}, {}, VisitStatus::{:?}, {});\n",
527 node,
528 succ,
529 label.status(),
530 label.timestamp(),
531 ),
532 }
533 .as_bytes(),
534 )?;
535 has_labels = true;
536 }
537 if !has_labels {
538 writer.write_all(format!("builder.arc({node}, {succ});\n").as_bytes())?;
539 }
540 }
541 }
542
543 writer.write_all(b"\nbuilder.done().expect(\"Could not build graph\")\n")?;
544
545 Ok(())
546}