webgraph/utils/batch_codec/
mod.rs

1/*
2 * SPDX-FileCopyrightText: 2025 Tommaso Fontana
3 *
4 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
5 */
6
7//! Traits and implementations to encode and decode batches of sorted triples
8//! to/from disk.
9//!
10//! The traits and implementations in this module are used to customize the
11//! encoding of batches of sorted triples to/from disk. They are used by
12//! [`SortPairs`](crate::utils::sort_pairs::SortPairs) and other utilities built
13//! on that (e.g.,
14//! [`ParSortPairs`](crate::utils::par_sort_pairs::ParSortPairs)).
15//!
16//! They usually do not need to be accessed or modified by the end users, albeit
17//! in some specific cases where performance or on-disk occupation is critical
18//! they can be customized.
19
20use anyhow::Result;
21
22use super::ArcMmapHelper;
23use core::fmt::Display;
24use dsi_bitstream::prelude::*;
25use rdst::*;
26use std::fs::File;
27use std::io::BufWriter;
28use std::path::Path;
29
30pub mod gaps;
31pub mod grouped_gaps;
32
33/// The recommended default batch codec for unlabelled batches.
34pub type DefaultBatchCodec = grouped_gaps::GroupedGapsCodec;
35
36pub type BitWriter<E> = BufBitWriter<E, WordAdapter<usize, BufWriter<File>>>;
37pub type BitReader<E> = BufBitReader<E, MemWordReader<u32, ArcMmapHelper<u32>>>;
38
39/// A trait for encoding and decoding batches of sorted triples.
40pub trait BatchCodec: Send + Sync {
41    /// The label type of the triples to encode and decode.
42    /// While the bounds are not really necessary, in all the practical cases
43    /// we need them.
44    type Label: Copy + Send + Sync + 'static;
45    //// The type returned by `decode_batch`, the iterator of which yields the
46    //// decoded triples in sorted order.
47    ///
48    /// The type `IntoIter` has to be `Send + Sync + Clone` because most often we want
49    /// to use them in [`SortPairs`](crate::utils::sort_pairs::SortPairs) and
50    /// then in [`ArcListGraph`](crate::graphs::arc_list_graph::ArcListGraph)
51    /// which require them.
52    type DecodedBatch: IntoIterator<Item = ((usize, usize), Self::Label), IntoIter: Send + Sync + Clone>;
53
54    /// A type representing statistics about the encoded batch.
55    /// This type has to implement `Display` so that we can log it.
56    type EncodedBatchStats: Display;
57
58    /// Given a batch of sorted triples, encodes them to disk and returns the
59    /// number of bits written.
60    ///
61    /// Note that the input batch must be already sorted. Use
62    /// [`encode_batch`](Self::encode_batch) otherwise.
63    fn encode_sorted_batch(
64        &self,
65        path: impl AsRef<Path>,
66        batch: &[((usize, usize), Self::Label)],
67    ) -> Result<(usize, Self::EncodedBatchStats)>;
68
69    /// Given a batch of triples, sort them, encodes them to disk, and returns
70    /// the number of bits written.
71    fn encode_batch(
72        &self,
73        path: impl AsRef<Path>,
74        batch: &mut [((usize, usize), Self::Label)],
75    ) -> Result<(usize, Self::EncodedBatchStats)>;
76
77    /// Decodes a batch of triples from disk.
78    ///
79    /// The returned type's iterator yields the serialized triples in sorted order.
80    fn decode_batch(&self, path: impl AsRef<Path>) -> Result<Self::DecodedBatch>;
81}
82
83/// Convenience alias to extract the iterator type of the decoded batch from a
84/// [`BatchCodec`].
85pub type CodecIter<C> = <<C as BatchCodec>::DecodedBatch as IntoIterator>::IntoIter;
86
87/// An arc expressed as a pair of nodes and the associated label.
88///
89/// Equality and order are defined only (lexicographically) on the pair of
90/// nodes.
91///
92/// Since we use this to sort a batch of `(usize, usize, L)` triples, in order
93/// to safely transmute between the two types, Triple has to be
94/// `repr(transparent)` of the same tuple type.
95///
96/// We use this to implement `RadixKey` for sorting batches of triples
97/// using the [`rdst`](https://crates.io/crates/rdst) crate.
98#[derive(Clone, Copy, Debug)]
99#[repr(transparent)]
100pub struct Triple<L>(((usize, usize), L));
101
102impl<L> Triple<L> {
103    /// slice of `Triple<L>`.
104    ///
105    /// The conversion is safe because `Triple` is `repr(transparent)` of the
106    /// same tuple type.
107    pub fn cast_batch(batch: &[((usize, usize), L)]) -> &[Triple<L>] {
108        // SAFETY: `Triple` is `repr(transparent)` of the same tuple type.
109        unsafe { std::mem::transmute(batch) }
110    }
111
112    /// Converts a mutable reference to a slice of `((usize, usize), L)` triples
113    /// into a mutable reference to a slice of `Triple<L>`.
114    ///
115    /// The conversion is safe because `Triple` is `repr(transparent)` of the
116    /// same tuple type.
117    pub fn cast_batch_mut(batch: &mut [((usize, usize), L)]) -> &mut [Triple<L>] {
118        // SAFETY: `Triple` is `repr(transparent)` of the same tuple type.
119        unsafe { std::mem::transmute(batch) }
120    }
121}
122
123impl<L> RadixKey for Triple<L> {
124    const LEVELS: usize = 16;
125
126    fn get_level(&self, level: usize) -> u8 {
127        (if level < 8 {
128            self.0.0.1 >> ((level % 8) * 8)
129        } else {
130            self.0.0.0 >> ((level % 8) * 8)
131        }) as u8
132    }
133}
134
135impl<L> PartialEq for Triple<L> {
136    fn eq(&self, other: &Self) -> bool {
137        self.0.0 == other.0.0
138    }
139}
140
141impl<L> Eq for Triple<L> {}
142
143impl<L> PartialOrd for Triple<L> {
144    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
145        Some(self.cmp(other))
146    }
147}
148
149impl<L> Ord for Triple<L> {
150    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
151        self.0.0.cmp(&other.0.0)
152    }
153}