winsfs_core/io/shuffle.rs
1//! Read and write pseudo-shuffled SAF format.
2//!
3//! The aim of the format is not to give a true, random shuffle of the sites in the input SAF
4//! file(s), but to move them around enough to break blocks of linkage disequilibrium. However,
5//! this must be done in constant memory, i.e. streaming through the input.
6//!
7//! In overview, the strategy is to pre-allocate a file of the correct size, which must be known up
8//! front, and then "split" this file into `B` blocks. Then consecutive writes go into consecutive
9//! blocks to be spread across the file.
10//!
11//! # Examples
12//!
13//! ## Write
14//!
15//! ```no_run
16//! # fn main() -> ::std::io::Result<()> {
17//! use winsfs_core::io::shuffle::{Header, Writer};
18//!
19//! // This must be known up front; for a single SAF, this can be gotten from the index.
20//! let sites = 100_000; // Number of sites in the input.
21//! let shape = vec![9]; // Number of values per site (per population, just one here).
22//! let blocks = 20; // Number of blocks to use for pseudo-shuffle.
23//! let header = Header::new(sites, shape, blocks);
24//!
25//! let mut writer = Writer::create("/path/to/saf.shuf", header)?;
26//!
27//! // Get sites from somewhere and write. The number of values must match the shape.
28//! let site = vec![0.; 9];
29//! for _ in 0..sites {
30//! writer.write_site(site.as_slice())?;
31//! }
32//!
33//! // Writer expects exactly as many sites as provided in the header:
34//! // any more will throw error in [`write_site`], any less will panic
35//! // in the drop check; use try_finish to check for this error.
36//! writer.try_finish()?;
37//!
38//! # Ok(()) }
39//! ```
40
41mod header;
42pub use header::{Header, MAGIC_NUMBER};
43
44mod reader;
45pub use reader::Reader;
46
47mod writer;
48pub use writer::Writer;
49
50/// Create checked conversion function.
51macro_rules! impl_convert_to_fn {
52 ($to:ty, $name:ident) => {
53 /// Converts a value into a usize with checking.
54 ///
55 /// Panics if the conversion cannot be made exactly.
56 pub(self) fn $name<T>(x: T) -> $to
57 where
58 $to: TryFrom<T>,
59 T: Copy + std::fmt::Display,
60 {
61 match x.try_into() {
62 Ok(v) => v,
63 Err(_) => {
64 panic!(
65 "cannot convert {x} ({ty}) into {to}",
66 ty = std::any::type_name::<T>(),
67 to = stringify!($to),
68 )
69 }
70 }
71 }
72 };
73}
74
75impl_convert_to_fn!(u16, to_u16);
76impl_convert_to_fn!(u32, to_u32);
77impl_convert_to_fn!(u64, to_u64);
78impl_convert_to_fn!(usize, to_usize);
79
80#[cfg(test)]
81mod tests {
82 use super::*;
83
84 use std::io;
85
86 use tempfile::NamedTempFile;
87
88 use crate::{io::ReadSite, saf::Site};
89
90 #[test]
91 fn test_write_then_read() -> io::Result<()> {
92 let file = NamedTempFile::new()?;
93 let path = file.path();
94
95 let header = Header::new(9, vec![1, 4], 4);
96 let mut writer = Writer::create(path, header)?;
97
98 let sites = vec![
99 &[0., 0., 0., 0., 0.],
100 &[1., 1., 1., 1., 1.],
101 &[2., 2., 2., 2., 2.],
102 &[3., 3., 3., 3., 3.],
103 &[4., 4., 4., 4., 4.],
104 &[5., 5., 5., 5., 5.],
105 &[6., 6., 6., 6., 6.],
106 &[7., 7., 7., 7., 7.],
107 &[8., 8., 8., 8., 8.],
108 ];
109
110 for &site in sites.iter() {
111 writer.write_site(site)?;
112 }
113
114 // Drop the writer to flush
115 writer.try_finish().unwrap();
116
117 let mut reader = Reader::try_from_path(path)?;
118
119 let expected_order = &[0, 4, 8, 1, 5, 2, 6, 3, 7];
120
121 let mut site = Site::new(vec![0.; 5], [1, 4]).unwrap();
122 for &i in expected_order {
123 reader.read_site(&mut site)?;
124
125 // Should be normalised, e.g. exp'd
126 let expected_site = sites[i].iter().map(|x| x.exp()).collect::<Vec<_>>();
127 assert_eq!(site.as_slice(), expected_site.as_slice());
128 }
129
130 assert!(reader.read_site(&mut site)?.is_done());
131
132 file.close()
133 }
134}