impl CorpusProvenance {
    /// Creates an empty provenance record: no sources, a final size of zero
    /// and zero duplicates removed.
    #[must_use]
    pub fn new() -> Self {
        let sources = HashMap::new();
        Self {
            sources,
            final_size: 0,
            duplicates_removed: 0,
        }
    }

    /// Records the sample counts for the source called `name`.
    ///
    /// `original` and `effective` are stored together as a pair, keyed by an
    /// owned copy of `name`. Recording the same name a second time replaces
    /// the pair stored earlier.
    pub fn add_source(&mut self, name: &str, original: usize, effective: usize) {
        let counts = (original, effective);
        self.sources.insert(name.to_owned(), counts);
    }

    /// Stores `size` as the size of the final corpus.
    pub fn set_final_size(&mut self, size: usize) {
        self.final_size = size;
    }
}
impl Default for CorpusProvenance {
fn default() -> Self {
Self::new()
}
}
/// Builder that collects [`CorpusSource`]s and merges them into one corpus
/// buffer via `merge`.
#[derive(Debug)]
pub struct CorpusMerger {
/// Sources queued by `add_source`, in the order they were added.
sources: Vec<CorpusSource>,
/// Passed through as the `deduplicate` flag of the buffer configuration
/// built during a merge.
deduplicate: bool,
/// Seed for the post-merge shuffle; when `None`, the merged buffer is not
/// shuffled. Also passed through as the buffer configuration's `seed`.
shuffle_seed: Option<u64>,
}
impl CorpusMerger {
    /// Creates a merger with no sources, deduplication enabled and shuffling
    /// disabled.
    #[must_use]
    pub fn new() -> Self {
        Self {
            sources: Vec::new(),
            deduplicate: true,
            shuffle_seed: None,
        }
    }

    /// Queues `source` for the next merge.
    ///
    /// Returns `&mut Self` so several calls can be chained.
    pub fn add_source(&mut self, source: CorpusSource) -> &mut Self {
        self.sources.push(source);
        self
    }

    /// Sets the `deduplicate` flag that is handed to the merged buffer's
    /// configuration.
    #[must_use]
    pub fn deduplicate(mut self, enable: bool) -> Self {
        self.deduplicate = enable;
        self
    }

    /// Enables shuffling of the merged buffer, driven by `seed`.
    #[must_use]
    pub fn shuffle_seed(mut self, seed: u64) -> Self {
        self.shuffle_seed = Some(seed);
        self
    }

    /// Merges every queued source into a single buffer.
    ///
    /// Steps, in order:
    /// 1. each source contributes its samples through
    ///    `collect_source_samples`, which also records it in the provenance;
    /// 2. the collected samples are ordered by their `u8` tag, highest first
    ///    (entries with equal tags keep their collection order);
    /// 3. the samples are fed into a fresh [`CorpusBuffer`]; every sample the
    ///    buffer declines is counted in `duplicates_removed`;
    /// 4. the buffer length is recorded as the final size;
    /// 5. if a shuffle seed was configured, the buffer is shuffled with it.
    ///
    /// # Errors
    ///
    /// The body shown here has no failing path and always returns `Ok`.
    pub fn merge(&self) -> Result<(CorpusBuffer, CorpusProvenance)> {
        let mut provenance = CorpusProvenance::new();
        let mut all_samples: Vec<(Sample, u8)> = Vec::new();
        for source in &self.sources {
            collect_source_samples(source, &mut all_samples, &mut provenance);
        }

        // `sort_by_key` is stable, so samples sharing a tag stay in the order
        // in which they were collected.
        all_samples.sort_by_key(|entry| std::cmp::Reverse(entry.1));

        // The vector is not needed afterwards: hand it over so the samples
        // are moved into the buffer instead of being cloned one by one.
        let (mut buffer, duplicates) = self.build_buffer(all_samples);
        provenance.duplicates_removed = duplicates;
        provenance.set_final_size(buffer.len());

        if let Some(seed) = self.shuffle_seed {
            shuffle_buffer(&mut buffer, seed);
        }
        Ok((buffer, provenance))
    }

    /// Moves `all_samples` into a new buffer sized to hold all of them.
    ///
    /// Returns the buffer together with the number of samples for which
    /// `CorpusBuffer::add` returned `false`.
    fn build_buffer(&self, all_samples: Vec<(Sample, u8)>) -> (CorpusBuffer, usize) {
        let config = CorpusBufferConfig {
            // Capacity equals the number of candidates.
            max_size: all_samples.len(),
            deduplicate: self.deduplicate,
            policy: EvictionPolicy::FIFO,
            seed: self.shuffle_seed,
        };
        let mut buffer = CorpusBuffer::with_config(config);
        let mut duplicates = 0;
        for (sample, _) in all_samples {
            if !buffer.add(sample) {
                duplicates += 1;
            }
        }
        (buffer, duplicates)
    }
}
/// Appends the contribution of one `source` to `all_samples` and records it
/// in `provenance`.
///
/// A source whose weight is at least `1.0` goes through
/// `expand_oversampled_source`; any other weight goes through
/// `subsample_source`. The provenance entry stores the source's own sample
/// count and that count multiplied by the weight, rounded to the nearest
/// integer.
fn collect_source_samples(
    source: &CorpusSource,
    all_samples: &mut Vec<(Sample, u8)>,
    provenance: &mut CorpusProvenance,
) {
    let weight = source.weight;
    let available = source.samples.len();

    // Keep the comparison as `>=`: a NaN weight compares false and therefore
    // takes the subsampling branch.
    if weight >= 1.0 {
        expand_oversampled_source(source, all_samples);
    } else {
        subsample_source(source, all_samples);
    }

    let scaled = (available as f64 * weight).round() as usize;
    provenance.add_source(&source.name, available, scaled);
}
/// Appends an oversampled copy of `source` to `all_samples`.
///
/// Every sample is emitted `floor(weight)` times in a row. Afterwards the
/// first `round(len * fract(weight))` samples are emitted once more to cover
/// the fractional part of the weight. Each emitted copy has its own `weight`
/// multiplied by the source weight and is tagged with the source priority.
fn expand_oversampled_source(source: &CorpusSource, all_samples: &mut Vec<(Sample, u8)>) {
    let factor = source.weight;
    let whole_copies = factor.floor() as usize;

    // One emitted entry: a clone with its weight scaled by the source weight.
    let scaled = |sample: &Sample| {
        let mut copy = sample.clone();
        copy.weight *= factor;
        (copy, source.priority)
    };

    for sample in &source.samples {
        for _ in 0..whole_copies {
            all_samples.push(scaled(sample));
        }
    }

    let partial = (source.samples.len() as f64 * factor.fract()).round() as usize;
    for sample in source.samples.iter().take(partial) {
        all_samples.push(scaled(sample));
    }
}
/// Appends a leading slice of `source` to `all_samples`.
///
/// The first `round(len * weight)` samples are cloned unchanged and tagged
/// with the source priority; the rest of the source is left out.
fn subsample_source(source: &CorpusSource, all_samples: &mut Vec<(Sample, u8)>) {
    let keep = (source.samples.len() as f64 * source.weight).round() as usize;
    let kept = source
        .samples
        .iter()
        .take(keep)
        .map(|sample| (sample.clone(), source.priority));
    all_samples.extend(kept);
}
/// Reorders `buffer.samples` in place using the buffer's own random source.
///
/// The buffer's `rng_state` is first overwritten with `seed`. Then, walking
/// from the last position down to position 1, each element is swapped with
/// the element at an index obtained from `next_random()` reduced modulo the
/// number of positions not yet fixed.
fn shuffle_buffer(buffer: &mut CorpusBuffer, seed: u64) {
    buffer.rng_state = seed;

    let mut cursor = buffer.samples.len();
    while cursor > 1 {
        cursor -= 1;
        let pick = (buffer.next_random() as usize) % (cursor + 1);
        buffer.samples.swap(cursor, pick);
    }
}
impl Default for CorpusMerger {
fn default() -> Self {
Self::new()
}
}