use crate::model::{Atom, Comp, Delims, Error, Field, Message, Rep, Segment};
use crate::synthetic::values::generate_value;
use crate::writer::write;
use rand::{RngExt, SeedableRng, rngs::StdRng};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::HashMap;
pub use crate::synthetic::corpus::{
CorpusConfig, CorpusError, CorpusManifest, CorpusSplits, MessageInfo, ProfileInfo,
TemplateInfo, compute_message_hash, compute_sha256, extract_message_type,
};
pub use crate::synthetic::values::ValueSource;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Template {
pub name: String,
pub delims: String,
pub segments: Vec<String>,
#[serde(default)]
pub values: std::collections::HashMap<String, Vec<ValueSource>>,
}
pub fn generate(template: &Template, seed: u64, count: usize) -> Result<Vec<Message>, Error> {
let mut rng = StdRng::seed_from_u64(seed);
let mut messages = Vec::with_capacity(count);
for i in 0..count {
let message = generate_single_message(template, &mut rng, i)?;
messages.push(message);
}
Ok(messages)
}
/// Builds a single message by expanding every segment template in order.
///
/// The delimiters are parsed from `template.delims` on each call. `_index` is
/// the position of the message within the run and is currently unused.
///
/// # Errors
///
/// Propagates delimiter-parsing and segment-generation errors.
fn generate_single_message(
    template: &Template,
    rng: &mut StdRng,
    _index: usize,
) -> Result<Message, Error> {
    let delims = parse_delimiters(&template.delims)?;
    let segments = template
        .segments
        .iter()
        .map(|segment_template| {
            generate_segment(segment_template, &template.values, &delims, rng)
        })
        .collect::<Result<Vec<_>, _>>()?;
    Ok(Message {
        delims,
        segments,
        charsets: vec![],
    })
}
/// Parses the four encoding characters of a template into a [`Delims`].
///
/// `delims_str` must hold exactly four characters occupying four bytes, in the
/// order component, repetition, escape, subcomponent. The field separator is
/// fixed to `|` because segment templates are split on that character.
///
/// # Errors
///
/// * [`Error::BadDelimLength`] if the string is not exactly four one-byte
///   characters.
/// * [`Error::DuplicateDelims`] if any two delimiters are equal, including a
///   clash between an encoding character and the `|` field separator.
fn parse_delimiters(delims_str: &str) -> Result<Delims, Error> {
    const FIELD_SEPARATOR: char = '|';

    if delims_str.len() != 4 {
        return Err(Error::BadDelimLength);
    }
    let mut chars = delims_str.chars();
    let (Some(comp), Some(rep), Some(esc), Some(sub), None) = (
        chars.next(),
        chars.next(),
        chars.next(),
        chars.next(),
        chars.next(),
    ) else {
        return Err(Error::BadDelimLength);
    };

    // All five delimiters must be pairwise distinct. An encoding character equal
    // to the field separator would make the written message ambiguous.
    let all = [FIELD_SEPARATOR, comp, rep, esc, sub];
    let has_duplicate = all
        .iter()
        .enumerate()
        .any(|(position, delim)| all.iter().skip(position + 1).any(|other| other == delim));
    if has_duplicate {
        return Err(Error::DuplicateDelims);
    }

    Ok(Delims {
        field: FIELD_SEPARATOR,
        comp,
        rep,
        esc,
        sub,
    })
}
/// Expands one segment template (`"PID|1|..."`) into a [`Segment`].
///
/// The template is split on `|`. The first part is the segment ID, which must be
/// exactly three ASCII uppercase letters or digits. Each remaining part is a
/// field template expanded via `generate_field`, with a field path of the form
/// `"<ID>.<n>"` used to look up overrides in `values`.
///
/// Field numbering: for `MSH` the first part after the ID is `MSH.2` (the
/// encoding characters), since `MSH.1` is the field separator itself. For every
/// other segment the first part after the ID is field 1.
///
/// # Errors
///
/// * [`Error::InvalidSegmentId`] if the ID is not three uppercase/digit ASCII bytes.
/// * [`Error::InvalidFieldFormat`] if a field number overflows `usize`.
/// * Any error propagated from field generation.
fn generate_segment(
    segment_template: &str,
    values: &HashMap<String, Vec<ValueSource>>,
    delims: &Delims,
    rng: &mut StdRng,
) -> Result<Segment, Error> {
    let mut parts = segment_template.split('|');
    let id_str = parts.next().ok_or(Error::InvalidSegmentId)?;
    // The slice-to-array conversion fails unless the ID is exactly three bytes.
    let id: [u8; 3] = id_str
        .as_bytes()
        .try_into()
        .map_err(|_err| Error::InvalidSegmentId)?;
    if !id
        .iter()
        .all(|byte| byte.is_ascii_uppercase() || byte.is_ascii_digit())
    {
        return Err(Error::InvalidSegmentId);
    }

    // MSH.1 is the field separator, so the first split part of MSH is MSH.2.
    // For all other segments the first split part is field 1.
    let first_field_number: usize = if id_str == "MSH" { 2 } else { 1 };

    let mut fields = Vec::new();
    for (offset, field_template) in parts.enumerate() {
        let field_number =
            offset
                .checked_add(first_field_number)
                .ok_or_else(|| Error::InvalidFieldFormat {
                    details: "field number overflow".to_string(),
                })?;
        let field_path = format!("{id_str}.{field_number}");
        fields.push(generate_field(
            field_template,
            values,
            &field_path,
            delims,
            rng,
        )?);
    }
    Ok(Segment { id, fields })
}
/// Expands a field template into a [`Field`], one repetition per piece obtained
/// by splitting on the repetition delimiter.
///
/// # Errors
///
/// Propagates the first error from repetition generation.
fn generate_field(
    field_template: &str,
    values: &HashMap<String, Vec<ValueSource>>,
    field_path: &str,
    delims: &Delims,
    rng: &mut StdRng,
) -> Result<Field, Error> {
    let reps = field_template
        .split(delims.rep)
        .map(|rep_template| generate_rep(rep_template, values, field_path, delims, rng))
        .collect::<Result<Vec<_>, _>>()?;
    Ok(Field { reps })
}
/// Expands a repetition template into a [`Rep`], one component per piece
/// obtained by splitting on the component delimiter.
///
/// # Errors
///
/// Propagates the first error from component generation.
fn generate_rep(
    rep_template: &str,
    values: &HashMap<String, Vec<ValueSource>>,
    field_path: &str,
    delims: &Delims,
    rng: &mut StdRng,
) -> Result<Rep, Error> {
    let comps = rep_template
        .split(delims.comp)
        .map(|comp_template| generate_comp(comp_template, values, field_path, delims, rng))
        .collect::<Result<Vec<_>, _>>()?;
    Ok(Rep { comps })
}
/// Expands a component template into a [`Comp`], one atom per piece obtained by
/// splitting on the subcomponent delimiter.
///
/// # Errors
///
/// Propagates the first error from atom generation.
fn generate_comp(
    comp_template: &str,
    values: &HashMap<String, Vec<ValueSource>>,
    field_path: &str,
    delims: &Delims,
    rng: &mut StdRng,
) -> Result<Comp, Error> {
    let subs = comp_template
        .split(delims.sub)
        .map(|sub_template| generate_atom(sub_template, values, field_path, rng))
        .collect::<Result<Vec<_>, _>>()?;
    Ok(Comp { subs })
}
/// Produces the text atom for one subcomponent position.
///
/// If `values` has a non-empty list of sources for `field_path`, the first
/// source generates the text (consuming randomness from `rng`). Otherwise the
/// literal `atom_template` is used and `rng` is untouched.
///
/// # Errors
///
/// Propagates any error from `generate_value`.
fn generate_atom(
    atom_template: &str,
    values: &HashMap<String, Vec<ValueSource>>,
    field_path: &str,
    rng: &mut StdRng,
) -> Result<Atom, Error> {
    let override_source = values
        .get(field_path)
        .and_then(|sources| sources.first());
    let text = match override_source {
        Some(source) => generate_value(source, rng)?,
        None => atom_template.to_string(),
    };
    Ok(Atom::Text(text))
}
/// Generates `count` messages from `template`, working through them in batches.
///
/// `batch_size` only controls how the internal loop is grouped; a value of zero
/// is treated as one. All messages share one [`StdRng`] seeded with `seed` and
/// are returned together in generation order.
///
/// # Errors
///
/// Returns the first error raised while building a message.
pub fn generate_corpus(
    template: &Template,
    seed: u64,
    count: usize,
    batch_size: usize,
) -> Result<Vec<Message>, Error> {
    let mut rng = StdRng::seed_from_u64(seed);
    let mut messages = Vec::with_capacity(count);
    let step = batch_size.max(1);
    let mut produced: usize = 0;
    while produced < count {
        // Clamp the batch end to `count` so the final batch may be shorter.
        let batch_end = produced.saturating_add(step).min(count);
        for index in produced..batch_end {
            messages.push(generate_single_message(template, &mut rng, index)?);
        }
        produced = batch_end;
    }
    Ok(messages)
}
pub fn generate_diverse_corpus(
templates: &[Template],
seed: u64,
count: usize,
) -> Result<Vec<Message>, Error> {
let mut rng = StdRng::seed_from_u64(seed);
let mut messages = Vec::with_capacity(count);
if templates.is_empty() {
return Err(Error::InvalidFieldFormat {
details: "at least one template is required".to_string(),
});
}
for i in 0..count {
let template_index = rng.random_range(0..templates.len());
let Some(template) = templates.get(template_index) else {
return Err(Error::InvalidFieldFormat {
details: "template index out of range".to_string(),
});
};
let message = generate_single_message(template, &mut rng, i)?;
messages.push(message);
}
Ok(messages)
}
/// Generates `count` messages, choosing a template for each according to the
/// given relative weights.
///
/// The weights in `template_distributions` do not need to sum to any particular
/// value; they are normalized by their total. For each message a value is drawn
/// from `[0, 1)` and the first template whose cumulative normalized weight is at
/// least that value is used. If floating-point rounding leaves the draw above
/// every cumulative weight, the last template is used.
///
/// # Errors
///
/// * [`Error::InvalidFieldFormat`] if `template_distributions` is empty, or if
///   the sum of weights is not a finite positive number.
/// * The first error raised while building a message.
pub fn generate_distributed_corpus(
    template_distributions: &[(Template, f64)],
    seed: u64,
    count: usize,
) -> Result<Vec<Message>, Error> {
    if template_distributions.is_empty() {
        return Err(Error::InvalidFieldFormat {
            details: "at least one template distribution is required".to_string(),
        });
    }
    let total_weight: f64 = template_distributions
        .iter()
        .map(|(_, weight)| *weight)
        .sum();
    if !total_weight.is_finite() || total_weight <= 0.0 {
        return Err(Error::InvalidFieldFormat {
            details: "template distribution total must be positive".to_string(),
        });
    }

    // Build the cumulative distribution over *borrowed* templates; cloning each
    // template is unnecessary because the input slice outlives this function.
    let mut running_total = 0.0;
    let cumulative_distribution: Vec<(&Template, f64)> = template_distributions
        .iter()
        .map(|(template, weight)| {
            running_total += weight / total_weight;
            (template, running_total)
        })
        .collect();

    let mut rng = StdRng::seed_from_u64(seed);
    let mut messages = Vec::with_capacity(count);
    for index in 0..count {
        let random_value: f64 = rng.random_range(0.0..1.0);
        let template = cumulative_distribution
            .iter()
            .find(|(_, cumulative)| random_value <= *cumulative)
            .map(|(template, _)| *template)
            // Rounding can leave the final cumulative weight slightly below the
            // draw; fall back to the last template in that case.
            .or_else(|| template_distributions.last().map(|(template, _)| template))
            .ok_or_else(|| Error::InvalidFieldFormat {
                details: "no template distribution selected".to_string(),
            })?;
        messages.push(generate_single_message(template, &mut rng, index)?);
    }
    Ok(messages)
}
/// Generates `count` messages and returns the lowercase hex SHA-256 digest of
/// each message's written form, in generation order.
///
/// # Errors
///
/// Propagates any error from [`generate`].
pub fn generate_golden_hashes(
    template: &Template,
    seed: u64,
    count: usize,
) -> Result<Vec<String>, Error> {
    let messages = generate(template, seed, count)?;
    let hashes = messages
        .iter()
        .map(|message| {
            let encoded = write(message);
            format!("{:x}", Sha256::digest(&encoded))
        })
        .collect();
    Ok(hashes)
}
/// Regenerates `count` messages and compares each one's lowercase hex SHA-256
/// digest with the entry at the same position in `expected_hashes`.
///
/// The result has one flag per generated message. A flag is `false` when the
/// digest differs or when `expected_hashes` has no entry at that position.
/// Surplus entries in `expected_hashes` are ignored. Comparison is an exact
/// string match.
///
/// # Errors
///
/// Propagates any error from [`generate`].
pub fn verify_golden_hashes(
    template: &Template,
    seed: u64,
    count: usize,
    expected_hashes: &[String],
) -> Result<Vec<bool>, Error> {
    let messages = generate(template, seed, count)?;
    let results = messages
        .iter()
        .enumerate()
        .map(|(position, message)| {
            let encoded = write(message);
            let actual = format!("{:x}", Sha256::digest(&encoded));
            expected_hashes
                .get(position)
                .is_some_and(|expected| actual == *expected)
        })
        .collect();
    Ok(results)
}
/// Builds a [`CorpusManifest`] describing the given templates and messages.
///
/// Each template is registered under its path together with its JSON
/// serialization; if serialization fails, an empty string is registered
/// instead. Each message is registered under
/// `"{base_path}/message_{NNNNNN}.hl7"`, where `NNNNNN` is its 1-based position
/// zero-padded to six digits, along with its written content (lossily decoded
/// as UTF-8) and its extracted message type.
pub fn create_manifest(
    seed: u64,
    templates: &[(String, Template)],
    messages: &[Message],
    base_path: &str,
) -> CorpusManifest {
    let mut manifest = CorpusManifest::new(seed);

    for (template_path, template) in templates {
        let serialized = serde_json::to_string(template).unwrap_or_default();
        manifest.add_template(template_path, &serialized);
    }

    for (index, message) in messages.iter().enumerate() {
        let encoded = write(message);
        let message_number = index.saturating_add(1);
        let message_path = format!("{base_path}/message_{message_number:06}.hl7");
        let message_type = extract_message_type(message);
        // NOTE(review): the meaning of the trailing `0` argument is defined by
        // `CorpusManifest::add_message`, which is not visible here.
        manifest.add_message(
            &message_path,
            &String::from_utf8_lossy(&encoded),
            &message_type,
            0,
        );
    }

    manifest
}