Skip to main content

webgraph/graphs/bvgraph/codecs/
factories.rs

1/*
2 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
3 * SPDX-FileCopyrightText: 2023 Inria
4 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
5 *
6 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
7 */
8
9/*!
10
11Factories for bit readers.
12
13Implementations of the [`CodesReaderFactory`] trait can be used to create
14bit readers accessing a graph data using different techniques.
15- [`FileFactory`] uses a [std::fs::File] to create a bit reader.
16- [`MemoryFactory`] creates bit readers from a slice of memory,
17  either [allocated](MemoryFactory::new_mem) or [mapped](MemoryFactory::new_mmap).
18- [`MmapHelper`] can be used to create a bit reader from a memory-mapped file.
19
20Any factory can be plugged either into a
21[`SequentialDecoderFactory`](super::SequentialDecoderFactory)
22or a [`RandomAccessDecoderFactory`](`super::RandomAccessDecoderFactory`),
23decoupling the choice of encoder from the underlying support.
24
25*/
26use crate::{
27    prelude::{FileBufReader, MemBufReader},
28    utils::MmapHelper,
29};
30use anyhow::{Context, ensure};
31use bitflags::bitflags;
32use common_traits::UnsignedInt;
33use dsi_bitstream::{
34    impls::{BufBitReader, MemWordReader, WordAdapter, buf_bit_reader},
35    prelude::{CodesRead, CodesReaderFactory},
36    traits::{BitRead, Endianness},
37};
38use epserde::Epserde;
39use std::{
40    fs::File,
41    io::{BufReader, Read},
42    marker::PhantomData,
43    mem::MaybeUninit,
44    path::Path,
45};
46use sux::traits::{IndexedSeq, Types};
47
48#[derive(Debug, Clone)]
49pub struct FileFactory<E: Endianness> {
50    path: Box<Path>,
51    _marker: core::marker::PhantomData<E>,
52}
53
54impl<E: Endianness> FileFactory<E> {
55    pub fn new(path: impl AsRef<Path>) -> anyhow::Result<Self> {
56        let path: Box<Path> = path.as_ref().into();
57        let metadata = std::fs::metadata(&path)
58            .with_context(|| format!("Could not stat {}", path.display()))?;
59        ensure!(metadata.is_file(), "File {} is not a file", path.display());
60
61        Ok(Self {
62            path,
63            _marker: core::marker::PhantomData,
64        })
65    }
66}
67
68impl<E: Endianness> CodesReaderFactory<E> for FileFactory<E>
69where
70    FileBufReader<E>: BitRead<E> + CodesRead<E>,
71{
72    type CodesReader<'a>
73        = BufBitReader<E, WordAdapter<u32, BufReader<File>>>
74    where
75        Self: 'a;
76
77    fn new_reader(&self) -> Self::CodesReader<'_> {
78        buf_bit_reader::from_path::<E, u32>(&self.path).unwrap()
79    }
80}
81
82bitflags! {
83    /// Flags for [`MemoryFactory`] and [`MmapHelper`].
84    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
85    pub struct MemoryFlags: u32 {
86        /// Suggest to map a region using transparent huge pages.
87        ///
88        /// This flag is only a suggestion, and it is ignored if the kernel does not
89        /// support transparent huge pages. It is mainly useful to support
90        /// `madvise()`-based huge pages on Linux. Note that at the time
91        /// of this writing Linux does not support transparent huge pages
92        /// in file-based memory mappings.
93        const TRANSPARENT_HUGE_PAGES = 1 << 0;
94        /// Suggest that the mapped region will be accessed sequentially.
95        ///
96        /// This flag is only a suggestion, and it is ignored if the kernel does
97        /// not support it. It is mainly useful to support `madvise()` on Linux.
98        const SEQUENTIAL = 1 << 1;
99        /// Suggest that the mapped region will be accessed randomly.
100        ///
101        /// This flag is only a suggestion, and it is ignored if the kernel does
102        /// not support it. It is mainly useful to support `madvise()` on Linux.
103        const RANDOM_ACCESS = 1 << 2;
104    }
105}
106
107/// Empty flags.
108impl core::default::Default for MemoryFlags {
109    fn default() -> Self {
110        MemoryFlags::empty()
111    }
112}
113
114impl From<MemoryFlags> for mmap_rs::MmapFlags {
115    fn from(flags: MemoryFlags) -> Self {
116        let mut mmap_flags = mmap_rs::MmapFlags::empty();
117        if flags.contains(MemoryFlags::SEQUENTIAL) {
118            mmap_flags |= mmap_rs::MmapFlags::SEQUENTIAL;
119        }
120        if flags.contains(MemoryFlags::RANDOM_ACCESS) {
121            mmap_flags |= mmap_rs::MmapFlags::RANDOM_ACCESS;
122        }
123        if flags.contains(MemoryFlags::TRANSPARENT_HUGE_PAGES) {
124            mmap_flags |= mmap_rs::MmapFlags::TRANSPARENT_HUGE_PAGES;
125        }
126
127        mmap_flags
128    }
129}
130
131impl From<MemoryFlags> for epserde::deser::Flags {
132    fn from(flags: MemoryFlags) -> Self {
133        let mut deser_flags = epserde::deser::Flags::empty();
134        if flags.contains(MemoryFlags::SEQUENTIAL) {
135            deser_flags |= epserde::deser::Flags::SEQUENTIAL;
136        }
137        if flags.contains(MemoryFlags::RANDOM_ACCESS) {
138            deser_flags |= epserde::deser::Flags::RANDOM_ACCESS;
139        }
140        if flags.contains(MemoryFlags::TRANSPARENT_HUGE_PAGES) {
141            deser_flags |= epserde::deser::Flags::TRANSPARENT_HUGE_PAGES;
142        }
143
144        deser_flags
145    }
146}
147
148#[derive(Debug, Clone)]
149pub struct MemoryFactory<E: Endianness, M: AsRef<[u32]>> {
150    data: M,
151    _marker: core::marker::PhantomData<E>,
152}
153
154impl<E: Endianness, T: AsRef<[u32]>> MemoryFactory<E, T> {
155    pub fn from_data(data: T) -> Self {
156        Self {
157            data,
158            _marker: core::marker::PhantomData,
159        }
160    }
161}
162
163impl<E: Endianness> MemoryFactory<E, Box<[u32]>> {
164    pub fn new_mem(path: impl AsRef<Path>) -> anyhow::Result<Self> {
165        let path = path.as_ref();
166        let file_len = path
167            .metadata()
168            .with_context(|| format!("Could not stat {}", path.display()))?
169            .len() as usize;
170        let mut file = std::fs::File::open(path)
171            .with_context(|| format!("Could not open {}", path.display()))?;
172        let vec_len = file_len.div_ceil(usize::try_from(u32::BITS / 8).unwrap());
173
174        let mut data = Vec::<u32>::with_capacity(vec_len);
175        *data.spare_capacity_mut().last_mut().unwrap() = MaybeUninit::zeroed();
176
177        // SAFETY: it's necessarily aligned (because it's a slice of bytes), and not out of bound
178        // by construction
179        let bytes =
180            unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr() as *mut u8, file_len) };
181
182        file.read_exact(bytes)
183            .with_context(|| format!("Could not read {}", path.display()))?;
184
185        // SAFETY: the entire vector but the last item was filled with bytes from disk;
186        // and the last item was pre-initialized with zeros.
187        unsafe { data.set_len(vec_len) };
188
189        Ok(Self {
190            data: data.into_boxed_slice(),
191            _marker: core::marker::PhantomData,
192        })
193    }
194}
195
196impl<E: Endianness> MemoryFactory<E, MmapHelper<u32>> {
197    pub fn new_mmap(path: impl AsRef<Path>, flags: MemoryFlags) -> anyhow::Result<Self> {
198        let path = path.as_ref();
199        let file_len = path
200            .metadata()
201            .with_context(|| format!("Could not stat {}", path.display()))?
202            .len() as usize;
203        let mut file = std::fs::File::open(path)
204            .with_context(|| format!("Could not open {}", path.display()))?;
205        let capacity = file_len.align_to(16);
206
207        let mut mmap = mmap_rs::MmapOptions::new(capacity)?
208            .with_flags(flags.into())
209            .map_mut()
210            .context("Could not create anonymous mmap")?;
211        file.read_exact(&mut mmap[..file_len])
212            .with_context(|| format!("Could not read {}", path.display()))?;
213        // Fixes the last few bytes to guarantee zero-extension semantics
214        // for bit vectors.
215        mmap[file_len..].fill(0);
216
217        Ok(Self {
218            // Safety: the length is a multiple of 16.
219            data: MmapHelper::try_from(
220                mmap.make_read_only()
221                    .map_err(|(_, err)| err)
222                    .context("Could not make memory read-only")?,
223            )
224            .context("Could not create mmap backend")?,
225            _marker: core::marker::PhantomData,
226        })
227    }
228}
229
230impl<E: Endianness, M: AsRef<[u32]>> CodesReaderFactory<E> for MemoryFactory<E, M>
231where
232    for<'a> MemBufReader<'a, E>: CodesRead<E>,
233{
234    type CodesReader<'a>
235        = MemBufReader<'a, E>
236    where
237        Self: 'a;
238
239    fn new_reader(&self) -> Self::CodesReader<'_> {
240        BufBitReader::<E, _>::new(MemWordReader::new(self.data.as_ref()))
241    }
242}
243
244#[derive(Epserde, Debug, Clone)]
245pub struct EmptyDict<I, O> {
246    _marker: core::marker::PhantomData<(I, O)>,
247}
248
249impl<I, O> Types for EmptyDict<I, O> {
250    type Input = usize;
251    type Output<'a> = usize;
252}
253
254impl<I, O> IndexedSeq for EmptyDict<I, O> {
255    fn get(&self, _key: Self::Input) -> Self::Output<'_> {
256        panic!();
257    }
258
259    unsafe fn get_unchecked(&self, _index: usize) -> Self::Output<'_> {
260        panic!();
261    }
262
263    fn len(&self) -> usize {
264        0
265    }
266}
267
268impl<I, O> Default for EmptyDict<I, O> {
269    fn default() -> Self {
270        Self {
271            _marker: PhantomData,
272        }
273    }
274}
275
276impl<E: Endianness> CodesReaderFactory<E> for MmapHelper<u32>
277where
278    for<'a> MemBufReader<'a, E>: CodesRead<E>,
279{
280    type CodesReader<'a>
281        = MemBufReader<'a, E>
282    where
283        Self: 'a;
284
285    fn new_reader(&self) -> Self::CodesReader<'_> {
286        BufBitReader::<E, _>::new(MemWordReader::new(self.as_ref()))
287    }
288}