webgraph/utils/batch_codec/mod.rs
1/*
2 * SPDX-FileCopyrightText: 2025 Tommaso Fontana
3 *
4 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
5 */
6
7//! Traits and implementations to encode and decode batches of sorted triples
8//! to/from disk.
9//!
10//! The traits and implementations in this module are used to customize the
11//! encoding of batches of sorted triples to/from disk. They are used by
12//! [`SortPairs`](crate::utils::sort_pairs::SortPairs) and other utilities built
13//! on that (e.g.,
14//! [`ParSortPairs`](crate::utils::par_sort_pairs::ParSortPairs)).
15//!
16//! They usually do not need to be accessed or modified by the end users, albeit
17//! in some specific cases where performance or on-disk occupation is critical
18//! they can be customized.
19
20use anyhow::Result;
21
22use super::ArcMmapHelper;
23use core::fmt::Display;
24use dsi_bitstream::prelude::*;
25use rdst::*;
26use std::fs::File;
27use std::io::BufWriter;
28use std::path::Path;
29
30pub mod gaps;
31pub mod grouped_gaps;
32
33/// The recommended default batch codec for unlabelled batches.
34pub type DefaultBatchCodec = grouped_gaps::GroupedGapsCodec;
35
36pub type BitWriter<E> = BufBitWriter<E, WordAdapter<usize, BufWriter<File>>>;
37pub type BitReader<E> = BufBitReader<E, MemWordReader<u32, ArcMmapHelper<u32>>>;
38
39/// A trait for encoding and decoding batches of sorted triples.
40pub trait BatchCodec: Send + Sync {
41 /// The label type of the triples to encode and decode.
42 /// While the bounds are not really necessary, in all the practical cases
43 /// we need them.
44 type Label: Copy + Send + Sync + 'static;
45 //// The type returned by `decode_batch`, the iterator of which yields the
46 //// decoded triples in sorted order.
47 ///
48 /// The type `IntoIter` has to be `Send + Sync + Clone` because most often we want
49 /// to use them in [`SortPairs`](crate::utils::sort_pairs::SortPairs) and
50 /// then in [`ArcListGraph`](crate::graphs::arc_list_graph::ArcListGraph)
51 /// which require them.
52 type DecodedBatch: IntoIterator<Item = ((usize, usize), Self::Label), IntoIter: Send + Sync + Clone>;
53
54 /// A type representing statistics about the encoded batch.
55 /// This type has to implement `Display` so that we can log it.
56 type EncodedBatchStats: Display;
57
58 /// Given a batch of sorted triples, encodes them to disk and returns the
59 /// number of bits written.
60 ///
61 /// Note that the input batch must be already sorted. Use
62 /// [`encode_batch`](Self::encode_batch) otherwise.
63 fn encode_sorted_batch(
64 &self,
65 path: impl AsRef<Path>,
66 batch: &[((usize, usize), Self::Label)],
67 ) -> Result<(usize, Self::EncodedBatchStats)>;
68
69 /// Given a batch of triples, sort them, encodes them to disk, and returns
70 /// the number of bits written.
71 fn encode_batch(
72 &self,
73 path: impl AsRef<Path>,
74 batch: &mut [((usize, usize), Self::Label)],
75 ) -> Result<(usize, Self::EncodedBatchStats)>;
76
77 /// Decodes a batch of triples from disk.
78 ///
79 /// The returned type's iterator yields the serialized triples in sorted order.
80 fn decode_batch(&self, path: impl AsRef<Path>) -> Result<Self::DecodedBatch>;
81}
82
83/// Convenience alias to extract the iterator type of the decoded batch from a
84/// [`BatchCodec`].
85pub type CodecIter<C> = <<C as BatchCodec>::DecodedBatch as IntoIterator>::IntoIter;
86
87/// An arc expressed as a pair of nodes and the associated label.
88///
89/// Equality and order are defined only (lexicographically) on the pair of
90/// nodes.
91///
92/// Since we use this to sort a batch of `(usize, usize, L)` triples, in order
93/// to safely transmute between the two types, Triple has to be
94/// `repr(transparent)` of the same tuple type.
95///
96/// We use this to implement `RadixKey` for sorting batches of triples
97/// using the [`rdst`](https://crates.io/crates/rdst) crate.
98#[derive(Clone, Copy, Debug)]
99#[repr(transparent)]
100pub struct Triple<L>(((usize, usize), L));
101
102impl<L> Triple<L> {
103 /// slice of `Triple<L>`.
104 ///
105 /// The conversion is safe because `Triple` is `repr(transparent)` of the
106 /// same tuple type.
107 pub fn cast_batch(batch: &[((usize, usize), L)]) -> &[Triple<L>] {
108 // SAFETY: `Triple` is `repr(transparent)` of the same tuple type.
109 unsafe { std::mem::transmute(batch) }
110 }
111
112 /// Converts a mutable reference to a slice of `((usize, usize), L)` triples
113 /// into a mutable reference to a slice of `Triple<L>`.
114 ///
115 /// The conversion is safe because `Triple` is `repr(transparent)` of the
116 /// same tuple type.
117 pub fn cast_batch_mut(batch: &mut [((usize, usize), L)]) -> &mut [Triple<L>] {
118 // SAFETY: `Triple` is `repr(transparent)` of the same tuple type.
119 unsafe { std::mem::transmute(batch) }
120 }
121}
122
123impl<L> RadixKey for Triple<L> {
124 const LEVELS: usize = 16;
125
126 fn get_level(&self, level: usize) -> u8 {
127 (if level < 8 {
128 self.0.0.1 >> ((level % 8) * 8)
129 } else {
130 self.0.0.0 >> ((level % 8) * 8)
131 }) as u8
132 }
133}
134
135impl<L> PartialEq for Triple<L> {
136 fn eq(&self, other: &Self) -> bool {
137 self.0.0 == other.0.0
138 }
139}
140
141impl<L> Eq for Triple<L> {}
142
143impl<L> PartialOrd for Triple<L> {
144 fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
145 Some(self.cmp(other))
146 }
147}
148
149impl<L> Ord for Triple<L> {
150 fn cmp(&self, other: &Self) -> core::cmp::Ordering {
151 self.0.0.cmp(&other.0.0)
152 }
153}