use crate::model::{Atom, Field, Message};
use crate::parser::{parse, parse_mllp};
use crate::transport::mllp::is_mllp_framed;
use crate::writer::write;
use chrono::{DateTime, Utc};
use rand::{RngExt, SeedableRng};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::cmp::Ordering;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::fs;
use std::path::{Path, PathBuf};
/// Settings for producing a corpus.
///
/// NOTE(review): nothing in this part of the file reads these fields, so the
/// per-field notes below are inferred from names and defaults — confirm
/// against the code that consumes this configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CorpusConfig {
/// Seed value; the default is 42.
pub seed: u64,
/// Presumably the number of messages to produce; the default is 100.
pub count: usize,
/// Presumably the number of messages handled per batch; the default is 50.
pub batch_size: usize,
/// Optional output directory; the default is `None`.
pub output_dir: Option<String>,
/// Whether splits should be created; the default is `false`.
pub create_splits: bool,
/// Optional three-way split ratios; the default is `(0.7, 0.15, 0.15)`.
pub split_ratios: Option<(f64, f64, f64)>,
}
impl Default for CorpusConfig {
fn default() -> Self {
Self {
seed: 42,
count: 100,
batch_size: 50,
output_dir: None,
create_splits: false,
split_ratios: Some((0.7, 0.15, 0.15)),
}
}
}
/// Manifest entry for one template, as recorded by `CorpusManifest::add_template`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemplateInfo {
/// Template path exactly as passed by the caller.
pub path: String,
/// Lowercase hex SHA-256 of the template content.
pub sha256: String,
}
/// Manifest entry for one profile, as recorded by `CorpusManifest::add_profile`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfileInfo {
/// Profile path exactly as passed by the caller.
pub path: String,
/// Lowercase hex SHA-256 of the profile content.
pub sha256: String,
}
/// Manifest entry for one message, as recorded by `CorpusManifest::add_message`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MessageInfo {
/// Message path exactly as passed by the caller; also the value listed in the splits.
pub path: String,
/// Lowercase hex SHA-256 of the message content.
pub sha256: String,
/// Message type label supplied by the caller.
pub message_type: String,
/// Stored as given; presumably an index into the manifest's `templates` — TODO confirm.
pub template_index: usize,
}
/// A labelled tally, e.g. how often one message type or segment id was seen.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusCount {
/// The label being counted.
pub value: String,
/// How many times the label was seen.
pub count: usize,
}
/// Presence statistics for one field path (`SEGMENT.index`) across a corpus.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusFieldPresence {
/// Field path such as `PID.3`.
pub path: String,
/// Number of parsed messages containing the field at least once.
pub message_count: usize,
/// Total number of times the field was present, across all parsed messages.
pub occurrence_count: usize,
}
/// A corpus file that could not be parsed.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusParseFailure {
/// File path relative to the corpus root, using `/` separators.
pub path: String,
/// The parser error rendered with `to_string()`.
pub error: String,
}
/// Aggregate statistics for a corpus, as produced by `summarize_corpus_path`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusSummary {
/// The corpus root path, lossily converted to a string.
pub root: String,
/// Number of files found under the root (parsed or not).
pub file_count: usize,
/// Number of files that parsed successfully.
pub message_count: usize,
/// Number of files that failed to parse; equals `parse_errors.len()`.
pub parse_error_count: usize,
/// Sum of the sizes of all files read (saturating).
pub total_bytes: usize,
/// Tally of message types, ordered by type string.
pub message_types: Vec<CorpusCount>,
/// Tally of segment ids, ordered by id.
pub segments: Vec<CorpusCount>,
/// Per-field presence, ordered by segment and then numeric field index.
pub field_presence: Vec<CorpusFieldPresence>,
/// One entry per unparseable file, in sorted file order.
pub parse_errors: Vec<CorpusParseFailure>,
}
/// Before/after comparison of a single count.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusTotalDiff {
/// Count in the "before" corpus.
pub before: usize,
/// Count in the "after" corpus.
pub after: usize,
/// `after - before` as a signed value.
pub delta: i128,
}
/// Before/after comparison of one labelled tally.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusCountDiff {
/// The label being compared.
pub value: String,
/// Count in the "before" corpus (0 when the label is absent there).
pub before: usize,
/// Count in the "after" corpus (0 when the label is absent there).
pub after: usize,
/// `after - before` as a signed value.
pub delta: i128,
}
/// Before/after comparison of the presence statistics for one field path.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusFieldPresenceDiff {
/// Field path such as `PID.3`.
pub path: String,
/// Messages containing the field in the "before" corpus (0 when absent).
pub before_message_count: usize,
/// Messages containing the field in the "after" corpus (0 when absent).
pub after_message_count: usize,
/// `after_message_count - before_message_count`.
pub message_count_delta: i128,
/// Total occurrences in the "before" corpus (0 when absent).
pub before_occurrence_count: usize,
/// Total occurrences in the "after" corpus (0 when absent).
pub after_occurrence_count: usize,
/// `after_occurrence_count - before_occurrence_count`.
pub occurrence_count_delta: i128,
}
/// Before/after comparison of the cardinality statistics for one field path.
///
/// Every `before_*` / `after_*` value is 0 when the path is missing from that side,
/// and every `*_delta` is the signed `after - before` difference.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusFieldCardinalityDiff {
/// Field path such as `OBX.5`.
pub path: String,
/// Minimum occurrences per message, "before" side.
pub before_min_per_message: usize,
/// Minimum occurrences per message, "after" side.
pub after_min_per_message: usize,
/// Change in the per-message minimum.
pub min_per_message_delta: i128,
/// Maximum occurrences per message, "before" side.
pub before_max_per_message: usize,
/// Maximum occurrences per message, "after" side.
pub after_max_per_message: usize,
/// Change in the per-message maximum.
pub max_per_message_delta: i128,
/// Total occurrences, "before" side.
pub before_total_occurrences: usize,
/// Total occurrences, "after" side.
pub after_total_occurrences: usize,
/// Change in total occurrences.
pub total_occurrences_delta: i128,
/// Messages containing the field, "before" side.
pub before_message_count: usize,
/// Messages containing the field, "after" side.
pub after_message_count: usize,
/// Change in the number of messages containing the field.
pub message_count_delta: i128,
}
/// Before/after comparison of the value-shape tallies for one field path.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusValueShapeStatsDiff {
/// Field path such as `OBX.5`.
pub path: String,
/// Change in repetitions classified as coded (multi-component).
pub coded_count: CorpusTotalDiff,
/// Change in repetitions classified as timestamps.
pub timestamp_count: CorpusTotalDiff,
/// Change in repetitions classified as numeric.
pub numeric_count: CorpusTotalDiff,
/// Change in repetitions containing an explicit null atom.
pub null_count: CorpusTotalDiff,
/// Change in repetitions classified as free text.
pub text_count: CorpusTotalDiff,
}
/// Full comparison of two corpus fingerprints, as built by `diff_corpus_fingerprints`.
///
/// The list-valued diff fields contain only entries that actually changed.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusDiffReport {
/// Report format version; `diff_corpus_fingerprints` writes `"1"`.
pub diff_version: String,
/// Crate version (`CARGO_PKG_VERSION`) of the tool that built the report.
pub tool_version: String,
/// Root of the "before" fingerprint.
pub before_root: String,
/// Root of the "after" fingerprint.
pub after_root: String,
/// Profile of the "before" fingerprint, falling back to the "after" one.
pub profile: Option<CorpusFingerprintProfile>,
/// File count comparison.
pub file_count: CorpusTotalDiff,
/// Parsed message count comparison.
pub message_count: CorpusTotalDiff,
/// Parse failure count comparison.
pub parse_error_count: CorpusTotalDiff,
/// Message types seen only in the "after" corpus.
pub new_message_types: Vec<String>,
/// Message types seen only in the "before" corpus.
pub removed_message_types: Vec<String>,
/// Segment ids seen only in the "after" corpus.
pub new_segments: Vec<String>,
/// Segment ids seen only in the "before" corpus.
pub removed_segments: Vec<String>,
/// Changed message type tallies.
pub message_type_counts: Vec<CorpusCountDiff>,
/// Changed segment tallies.
pub segment_counts: Vec<CorpusCountDiff>,
/// Changed field presence statistics.
pub field_presence: Vec<CorpusFieldPresenceDiff>,
/// Changed field cardinality statistics.
pub field_cardinality: Vec<CorpusFieldCardinalityDiff>,
/// Changed value-shape tallies.
pub value_shape_stats: Vec<CorpusValueShapeStatsDiff>,
/// Changed validation issue code tallies.
pub validation_issue_code_counts: Vec<CorpusCountDiff>,
}
/// How often one field path occurs per message across the parsed corpus.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusFieldCardinality {
/// Field path such as `OBX.5`.
pub path: String,
/// Smallest per-message occurrence count; 0 when some parsed message lacks the field.
pub min_per_message: usize,
/// Largest occurrence count seen in a single message.
pub max_per_message: usize,
/// Occurrences summed over all parsed messages.
pub total_occurrences: usize,
/// Number of parsed messages containing the field.
pub message_count: usize,
}
/// Tally of the value shapes seen in the repetitions of one field path.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusValueShapeStats {
/// Field path such as `OBX.5`.
pub path: String,
/// Repetitions with more than one component.
pub coded_count: usize,
/// Repetitions whose text is 8, 12 or 14 ASCII digits.
pub timestamp_count: usize,
/// Repetitions whose text parses as a floating-point number.
pub numeric_count: usize,
/// Repetitions containing an explicit null atom.
pub null_count: usize,
/// All other non-empty repetitions.
pub text_count: usize,
}
/// Identifies a profile associated with a fingerprint.
///
/// NOTE(review): this part of the file never constructs this type
/// (`fingerprint_corpus_path` sets `profile` to `None`); the field notes
/// below are inferred from names — confirm against the producer.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusFingerprintProfile {
/// Presumably the profile file path.
pub path: String,
/// Presumably the SHA-256 of the profile content.
pub sha256: String,
/// Presumably the profile's version string.
pub version: String,
/// Presumably the message structure the profile targets.
pub message_structure: String,
}
/// Structural fingerprint of a corpus, as produced by `fingerprint_corpus_path`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CorpusFingerprint {
/// Fingerprint format version; `fingerprint_corpus_path` writes `"1"`.
pub fingerprint_version: String,
/// Crate version (`CARGO_PKG_VERSION`) of the tool that built the fingerprint.
pub tool_version: String,
/// The corpus root path, lossily converted to a string.
pub root: String,
/// Associated profile; `fingerprint_corpus_path` always leaves this `None`.
pub profile: Option<CorpusFingerprintProfile>,
/// Number of files found under the root.
pub file_count: usize,
/// Number of files that parsed successfully.
pub message_count: usize,
/// Number of files that failed to parse.
pub parse_error_count: usize,
/// Tally of message types.
pub message_type_counts: Vec<CorpusCount>,
/// Tally of segment ids.
pub segment_counts: Vec<CorpusCount>,
/// Per-field presence statistics.
pub field_presence: Vec<CorpusFieldPresence>,
/// Per-field cardinality statistics.
pub field_cardinality: Vec<CorpusFieldCardinality>,
/// Per-field value-shape tallies.
pub value_shape_stats: Vec<CorpusValueShapeStats>,
/// Tally of validation issue codes; `fingerprint_corpus_path` leaves this empty.
pub validation_issue_code_counts: Vec<CorpusCount>,
}
/// Train/validation/test partition of a manifest's messages.
///
/// Each list holds message paths; all three are empty by default and are
/// filled by `CorpusManifest::create_splits`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct CorpusSplits {
/// Paths assigned to the training split.
pub train: Vec<String>,
/// Paths assigned to the validation split.
pub validation: Vec<String>,
/// Paths assigned to the test split.
pub test: Vec<String>,
}
/// Manifest describing a corpus: its seed, inputs, messages and splits.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CorpusManifest {
/// Manifest format version; `CorpusManifest::new` writes `"1.0.0"`.
pub version: String,
/// Crate version (`CARGO_PKG_VERSION`) of the tool that created the manifest.
pub tool_version: String,
/// Seed stored with the manifest; also seeds the shuffle in `create_splits`.
pub seed: u64,
/// Templates registered through `add_template`.
pub templates: Vec<TemplateInfo>,
/// Profiles registered through `add_profile`; empty when absent from the JSON.
#[serde(default)]
pub profiles: Vec<ProfileInfo>,
/// Messages registered through `add_message`.
pub messages: Vec<MessageInfo>,
/// Creation time; `CorpusManifest::new` records the current UTC time.
pub generated_at: DateTime<Utc>,
/// Train/validation/test assignment; empty lists when absent from the JSON.
#[serde(default)]
pub splits: CorpusSplits,
}
impl CorpusManifest {
pub fn new(seed: u64) -> Self {
Self {
version: "1.0.0".to_string(),
tool_version: env!("CARGO_PKG_VERSION").to_string(),
seed,
templates: Vec::new(),
profiles: Vec::new(),
messages: Vec::new(),
generated_at: Utc::now(),
splits: CorpusSplits::default(),
}
}
pub fn add_template(&mut self, path: &str, content: &str) {
let sha256 = compute_sha256(content);
self.templates.push(TemplateInfo {
path: path.to_string(),
sha256,
});
}
pub fn add_profile(&mut self, path: &str, content: &str) {
let sha256 = compute_sha256(content);
self.profiles.push(ProfileInfo {
path: path.to_string(),
sha256,
});
}
pub fn add_message(
&mut self,
path: &str,
content: &str,
message_type: &str,
template_index: usize,
) {
let sha256 = compute_sha256(content);
self.messages.push(MessageInfo {
path: path.to_string(),
sha256,
message_type: message_type.to_string(),
template_index,
});
}
pub fn to_json(&self) -> Result<String, CorpusError> {
serde_json::to_string_pretty(self)
.map_err(|e| CorpusError::SerializationError(e.to_string()))
}
pub fn from_json(json: &str) -> Result<Self, CorpusError> {
serde_json::from_str(json).map_err(|e| CorpusError::SerializationError(e.to_string()))
}
pub fn message_count(&self) -> usize {
self.messages.len()
}
pub fn message_type_counts(&self) -> HashMap<String, usize> {
let mut counts = HashMap::new();
for msg in &self.messages {
let count = counts.entry(msg.message_type.clone()).or_insert(0usize);
*count = count.saturating_add(1);
}
counts
}
pub fn create_splits(&mut self, ratios: (f64, f64, f64)) {
let total = self.messages.len();
if total == 0 {
return;
}
let train_count = rounded_ratio_count(total, ratios.0);
let remaining_after_train = total.saturating_sub(train_count);
let val_count = rounded_ratio_count(total, ratios.1).min(remaining_after_train);
let validation_end = train_count.saturating_add(val_count);
let mut rng = rand::rngs::StdRng::seed_from_u64(self.seed);
let mut indices: Vec<usize> = (0..total).collect();
for i in (1..total).rev() {
let j = rng.random_range(0..=i);
indices.swap(i, j);
}
self.splits.train = indices
.get(..train_count)
.unwrap_or_default()
.iter()
.filter_map(|&i| self.messages.get(i).map(|message| message.path.clone()))
.collect();
self.splits.validation = indices
.get(train_count..validation_end)
.unwrap_or_default()
.iter()
.filter_map(|&i| self.messages.get(i).map(|message| message.path.clone()))
.collect();
self.splits.test = indices
.get(validation_end..)
.unwrap_or_default()
.iter()
.filter_map(|&i| self.messages.get(i).map(|message| message.path.clone()))
.collect();
}
}
/// Returns `total * ratio` rounded to the nearest integer and clamped to `0..=total`.
///
/// A ratio that is NaN, infinite, zero or negative yields 0.
#[expect(
    clippy::cast_possible_truncation,
    clippy::cast_precision_loss,
    clippy::cast_sign_loss,
    reason = "split ratios are configured as f64 percentages by the public API"
)]
fn rounded_ratio_count(total: usize, ratio: f64) -> usize {
    let usable = ratio.is_finite() && ratio > 0.0;
    if !usable {
        return 0;
    }
    let ceiling = total as f64;
    let scaled = (ceiling * ratio).round();
    match scaled {
        value if value <= 0.0 => 0,
        value if value >= ceiling => total,
        // Strictly between 0 and `total`, so the cast cannot lose the value.
        value => value as usize,
    }
}
/// Returns the SHA-256 digest of `content`'s UTF-8 bytes as lowercase hex.
pub fn compute_sha256(content: &str) -> String {
    let digest = Sha256::digest(content.as_bytes());
    format!("{digest:x}")
}
/// Serializes `message` with the writer and returns the SHA-256 of the result.
///
/// The serialized bytes are converted to text lossily before hashing, so any
/// invalid UTF-8 sequences are replaced rather than hashed as raw bytes.
pub fn compute_message_hash(message: &Message) -> String {
    let serialized = write(message);
    compute_sha256(&String::from_utf8_lossy(&serialized))
}
/// Errors reported by the corpus helpers in this module.
#[derive(Debug, Clone, thiserror::Error)]
pub enum CorpusError {
/// JSON (de)serialization of a manifest failed; carries the serde message.
#[error("Serialization error: {0}")]
SerializationError(String),
/// A file or directory could not be read; carries a description of the failure.
#[error("IO error: {0}")]
IoError(String),
/// An input was unusable, e.g. a corpus path that is neither a file nor a directory.
#[error("Invalid configuration: {0}")]
InvalidConfig(String),
/// Split ratios were rejected. NOTE(review): not constructed in this part of the file.
#[error("Invalid split ratios: must sum to 1.0")]
InvalidSplitRatios,
}
/// Reads every file under `path` and summarizes the messages found.
///
/// Files are visited in sorted order. Each one is parsed as MLLP-framed or
/// plain depending on its content; parse failures are collected in the
/// summary instead of aborting the run.
///
/// # Errors
/// Fails when the path is neither a file nor a directory, when a directory
/// cannot be listed, or when a file cannot be read.
pub fn summarize_corpus_path(path: impl AsRef<Path>) -> Result<CorpusSummary, CorpusError> {
    let root = path.as_ref();
    let mut files = Vec::new();
    collect_corpus_files(root, &mut files)?;
    files.sort();

    let mut type_tally: BTreeMap<String, usize> = BTreeMap::new();
    let mut segment_tally: BTreeMap<String, usize> = BTreeMap::new();
    let mut messages_per_field: BTreeMap<String, usize> = BTreeMap::new();
    let mut occurrences_per_field: BTreeMap<String, usize> = BTreeMap::new();
    let mut parse_errors = Vec::new();
    let mut total_bytes = 0usize;
    let mut message_count = 0usize;

    for file in &files {
        let relative_path = relative_corpus_path(root, file);
        let bytes = match fs::read(file) {
            Ok(bytes) => bytes,
            Err(e) => return Err(CorpusError::IoError(format!("{relative_path}: {e}"))),
        };
        total_bytes = total_bytes.saturating_add(bytes.len());

        let outcome = if is_mllp_framed(&bytes) {
            parse_mllp(&bytes)
        } else {
            parse(&bytes)
        };
        let message = match outcome {
            Ok(message) => message,
            Err(error) => {
                parse_errors.push(CorpusParseFailure {
                    path: relative_path,
                    error: error.to_string(),
                });
                continue;
            }
        };

        message_count = message_count.saturating_add(1);
        increment_count(&mut type_tally, extract_message_type(&message));
        record_message_shape(
            &message,
            &mut segment_tally,
            &mut messages_per_field,
            &mut occurrences_per_field,
        );
    }

    let mut field_presence: Vec<CorpusFieldPresence> =
        Vec::with_capacity(occurrences_per_field.len());
    for (path, occurrence_count) in occurrences_per_field {
        let messages_with_field = messages_per_field.get(&path).copied().unwrap_or_default();
        field_presence.push(CorpusFieldPresence {
            path,
            message_count: messages_with_field,
            occurrence_count,
        });
    }
    // The map iterates in string order; re-sort so `PID.10` follows `PID.9`.
    field_presence.sort_by(|left, right| compare_field_paths(&left.path, &right.path));

    let parse_error_count = parse_errors.len();
    Ok(CorpusSummary {
        root: root.to_string_lossy().into_owned(),
        file_count: files.len(),
        message_count,
        parse_error_count,
        total_bytes,
        message_types: counts_to_vec(type_tally),
        segments: counts_to_vec(segment_tally),
        field_presence,
        parse_errors,
    })
}
/// Fingerprints the corpora at `before` and `after` and reports their differences.
///
/// # Errors
/// Propagates the first fingerprinting failure; `before` is processed first.
pub fn diff_corpus_paths(
    before: impl AsRef<Path>,
    after: impl AsRef<Path>,
) -> Result<CorpusDiffReport, CorpusError> {
    let baseline = fingerprint_corpus_path(before)?;
    let candidate = fingerprint_corpus_path(after)?;
    let report = diff_corpus_fingerprints(&baseline, &candidate);
    Ok(report)
}
/// Compares two fingerprints and reports everything that changed.
///
/// Totals are always reported; the per-label and per-field lists contain only
/// entries whose values differ. The report's profile is the `before` profile
/// when present, otherwise the `after` profile.
pub fn diff_corpus_fingerprints(
    before: &CorpusFingerprint,
    after: &CorpusFingerprint,
) -> CorpusDiffReport {
    let message_type_counts = diff_counts(&before.message_type_counts, &after.message_type_counts);
    let segment_counts = diff_counts(&before.segment_counts, &after.segment_counts);

    // Derive the new/removed lists while the diffs are still borrowed.
    let new_message_types = new_values(&message_type_counts);
    let removed_message_types = removed_values(&message_type_counts);
    let new_segments = new_values(&segment_counts);
    let removed_segments = removed_values(&segment_counts);

    let profile = match &before.profile {
        Some(profile) => Some(profile.clone()),
        None => after.profile.clone(),
    };
    let field_presence = diff_field_presence(&before.field_presence, &after.field_presence);
    let field_cardinality =
        diff_field_cardinality(&before.field_cardinality, &after.field_cardinality);
    let value_shape_stats =
        diff_value_shape_stats(&before.value_shape_stats, &after.value_shape_stats);
    let validation_issue_code_counts = diff_counts(
        &before.validation_issue_code_counts,
        &after.validation_issue_code_counts,
    );

    CorpusDiffReport {
        diff_version: String::from("1"),
        tool_version: String::from(env!("CARGO_PKG_VERSION")),
        before_root: before.root.clone(),
        after_root: after.root.clone(),
        profile,
        file_count: total_diff(before.file_count, after.file_count),
        message_count: total_diff(before.message_count, after.message_count),
        parse_error_count: total_diff(before.parse_error_count, after.parse_error_count),
        new_message_types,
        removed_message_types,
        new_segments,
        removed_segments,
        message_type_counts,
        segment_counts,
        field_presence,
        field_cardinality,
        value_shape_stats,
        validation_issue_code_counts,
    }
}
pub fn fingerprint_corpus_path(path: impl AsRef<Path>) -> Result<CorpusFingerprint, CorpusError> {
let root = path.as_ref();
let summary = summarize_corpus_path(root)?;
let (field_cardinality, value_shape_stats) =
collect_fingerprint_details(root, summary.message_count)?;
Ok(CorpusFingerprint {
fingerprint_version: "1".to_string(),
tool_version: env!("CARGO_PKG_VERSION").to_string(),
root: summary.root,
profile: None,
file_count: summary.file_count,
message_count: summary.message_count,
parse_error_count: summary.parse_error_count,
message_type_counts: summary.message_types,
segment_counts: summary.segments,
field_presence: summary.field_presence,
field_cardinality,
value_shape_stats,
validation_issue_code_counts: Vec::new(),
})
}
/// Scans the corpus under `root` and gathers per-field cardinality and
/// value-shape statistics.
///
/// `message_count` is the number of successfully parsed messages; a field
/// seen in fewer messages than that gets a per-message minimum of 0. Files
/// that fail to parse are skipped silently.
///
/// # Errors
/// Fails when the corpus files cannot be listed or a file cannot be read.
fn collect_fingerprint_details(
    root: &Path,
    message_count: usize,
) -> Result<(Vec<CorpusFieldCardinality>, Vec<CorpusValueShapeStats>), CorpusError> {
    let mut files = Vec::new();
    collect_corpus_files(root, &mut files)?;
    files.sort();

    let mut accumulators: BTreeMap<String, FieldCardinalityAccumulator> = BTreeMap::new();
    let mut value_shapes: BTreeMap<String, CorpusValueShapeStats> = BTreeMap::new();

    for file in &files {
        let relative_path = relative_corpus_path(root, file);
        let bytes = match fs::read(file) {
            Ok(bytes) => bytes,
            Err(e) => return Err(CorpusError::IoError(format!("{relative_path}: {e}"))),
        };
        let outcome = if is_mllp_framed(&bytes) {
            parse_mllp(&bytes)
        } else {
            parse(&bytes)
        };
        let message = match outcome {
            Ok(message) => message,
            Err(_) => continue,
        };

        let mut per_message: BTreeMap<String, usize> = BTreeMap::new();
        record_fingerprint_message_shape(&message, &mut per_message, &mut value_shapes);
        for (path, occurrences) in per_message {
            let stats = accumulators.entry(path).or_default();
            stats.present_message_count = stats.present_message_count.saturating_add(1);
            stats.total_occurrences = stats.total_occurrences.saturating_add(occurrences);
            stats.max_per_message = stats.max_per_message.max(occurrences);
            stats.min_present_per_message = Some(
                stats
                    .min_present_per_message
                    .map_or(occurrences, |current| current.min(occurrences)),
            );
        }
    }

    let mut field_cardinality: Vec<CorpusFieldCardinality> =
        Vec::with_capacity(accumulators.len());
    for (path, stats) in accumulators {
        // A field missing from any parsed message has a true minimum of zero.
        let seen_in_every_message = stats.present_message_count >= message_count;
        let min_per_message = if seen_in_every_message {
            stats.min_present_per_message.unwrap_or_default()
        } else {
            0
        };
        field_cardinality.push(CorpusFieldCardinality {
            path,
            min_per_message,
            max_per_message: stats.max_per_message,
            total_occurrences: stats.total_occurrences,
            message_count: stats.present_message_count,
        });
    }
    field_cardinality.sort_by(|left, right| compare_field_paths(&left.path, &right.path));

    let mut value_shape_stats: Vec<CorpusValueShapeStats> = value_shapes.into_values().collect();
    value_shape_stats.sort_by(|left, right| compare_field_paths(&left.path, &right.path));

    Ok((field_cardinality, value_shape_stats))
}
fn collect_corpus_files(path: &Path, files: &mut Vec<PathBuf>) -> Result<(), CorpusError> {
if path.is_file() {
files.push(path.to_path_buf());
return Ok(());
}
if !path.is_dir() {
return Err(CorpusError::InvalidConfig(format!(
"{} is not a file or directory",
path.display()
)));
}
for entry in fs::read_dir(path).map_err(|e| CorpusError::IoError(e.to_string()))? {
let entry = entry.map_err(|e| CorpusError::IoError(e.to_string()))?;
let child = entry.path();
if child.is_dir() {
collect_corpus_files(&child, files)?;
} else if child.is_file() {
files.push(child);
}
}
Ok(())
}
/// Renders `file` relative to the corpus `root`, using `/` as the separator.
///
/// When `root` is a directory its prefix is stripped from `file` (the full
/// path is kept if `file` is not under it); otherwise only the file name is
/// used. Backslashes in the result are replaced with forward slashes.
fn relative_corpus_path(root: &Path, file: &Path) -> String {
    let shown: &Path = if root.is_dir() {
        match file.strip_prefix(root) {
            Ok(stripped) => stripped,
            Err(_) => file,
        }
    } else {
        match file.file_name() {
            Some(name) => Path::new(name),
            None => file,
        }
    };
    shown.to_string_lossy().replace('\\', "/")
}
/// Folds one message into the corpus-wide segment and field tallies.
///
/// Every segment bumps `segment_counts`. Every present field bumps
/// `field_occurrence_counts` under its `SEGMENT.index` path, and each
/// distinct path bumps `field_message_counts` once per message. Field
/// indices are 1-based, except in `MSH` where the first stored field is
/// numbered 2.
fn record_message_shape(
    message: &Message,
    segment_counts: &mut BTreeMap<String, usize>,
    field_message_counts: &mut BTreeMap<String, usize>,
    field_occurrence_counts: &mut BTreeMap<String, usize>,
) {
    let mut seen_paths = BTreeSet::new();
    for segment in &message.segments {
        let segment_id = segment.id_str().to_string();
        increment_count(segment_counts, segment_id.clone());

        let offset: usize = if segment_id == "MSH" { 2 } else { 1 };
        let present_fields = segment
            .fields
            .iter()
            .enumerate()
            .filter(|(_, field)| field_is_present(field));
        for (position, _) in present_fields {
            let display_index = position.saturating_add(offset);
            let path = format!("{segment_id}.{display_index}");
            increment_count(field_occurrence_counts, path.clone());
            seen_paths.insert(path);
        }
    }
    seen_paths
        .into_iter()
        .for_each(|path| increment_count(field_message_counts, path));
}
/// Running per-field totals gathered while scanning a corpus for cardinality.
#[derive(Default)]
struct FieldCardinalityAccumulator {
/// Number of messages in which the field appeared at least once.
present_message_count: usize,
/// Occurrences summed over all messages.
total_occurrences: usize,
/// Largest occurrence count seen in a single message.
max_per_message: usize,
/// Smallest occurrence count among messages containing the field; `None` until first seen.
min_present_per_message: Option<usize>,
}
/// Records the fingerprint-level shape of one message.
///
/// Each present field bumps its `SEGMENT.index` entry in
/// `message_occurrences` and has the shapes of its repetitions added to
/// `value_shapes`. Field indices are 1-based, except in `MSH` where the first
/// stored field is numbered 2.
fn record_fingerprint_message_shape(
    message: &Message,
    message_occurrences: &mut BTreeMap<String, usize>,
    value_shapes: &mut BTreeMap<String, CorpusValueShapeStats>,
) {
    for segment in &message.segments {
        let segment_id = segment.id_str().to_string();
        let offset: usize = if segment_id == "MSH" { 2 } else { 1 };
        let present_fields = segment
            .fields
            .iter()
            .enumerate()
            .filter(|(_, field)| field_is_present(field));
        for (position, field) in present_fields {
            let display_index = position.saturating_add(offset);
            let path = format!("{segment_id}.{display_index}");
            increment_count(message_occurrences, path.clone());
            record_value_shape(value_shapes, &path, field);
        }
    }
}
/// Reports whether `field` carries any value.
///
/// A field counts as present when any of its atoms is an explicit null or a
/// non-empty text.
fn field_is_present(field: &Field) -> bool {
    field
        .reps
        .iter()
        .flat_map(|rep| rep.comps.iter())
        .flat_map(|comp| comp.subs.iter())
        .any(|atom| match atom {
            Atom::Null => true,
            Atom::Text(text) => !text.is_empty(),
        })
}
/// Coarse classification assigned to one field repetition.
#[derive(Clone, Copy)]
enum ValueShape {
/// The repetition has more than one component.
Coded,
/// The text is 8, 12 or 14 ASCII digits.
Timestamp,
/// The text parses as a floating-point number.
Numeric,
/// The repetition contains an explicit null atom.
Null,
/// Any other non-empty text.
Text,
}
/// Adds the shapes of every repetition in `field` to the tally for `path`,
/// creating a zeroed entry the first time the path is seen.
fn record_value_shape(
    value_shapes: &mut BTreeMap<String, CorpusValueShapeStats>,
    path: &str,
    field: &Field,
) {
    let stats = value_shapes
        .entry(path.to_string())
        .or_insert_with(|| empty_value_shape_stats(path));
    for shape in field_value_shapes(field) {
        let counter = match shape {
            ValueShape::Coded => &mut stats.coded_count,
            ValueShape::Timestamp => &mut stats.timestamp_count,
            ValueShape::Numeric => &mut stats.numeric_count,
            ValueShape::Null => &mut stats.null_count,
            ValueShape::Text => &mut stats.text_count,
        };
        *counter = counter.saturating_add(1);
    }
}
/// Creates a value-shape tally for `path` with every counter at zero.
fn empty_value_shape_stats(path: &str) -> CorpusValueShapeStats {
    let path = path.to_owned();
    CorpusValueShapeStats {
        path,
        coded_count: 0,
        timestamp_count: 0,
        numeric_count: 0,
        null_count: 0,
        text_count: 0,
    }
}
fn field_value_shapes(field: &Field) -> Vec<ValueShape> {
field
.reps
.iter()
.filter_map(repetition_value_shape)
.collect()
}
/// Classifies one field repetition, or returns `None` when it has no text.
///
/// Checks run in this order:
/// 1. any explicit null atom makes the repetition `Null`;
/// 2. more than one component makes it `Coded`;
/// 3. text of 8, 12 or 14 ASCII digits is a `Timestamp`;
/// 4. text that contains a digit and parses as a float is `Numeric`;
/// 5. anything else non-empty is `Text`.
fn repetition_value_shape(rep: &crate::model::Rep) -> Option<ValueShape> {
    let has_null = rep
        .comps
        .iter()
        .flat_map(|component| component.subs.iter())
        .any(|atom| matches!(atom, Atom::Null));
    if has_null {
        return Some(ValueShape::Null);
    }
    if rep.comps.len() > 1 {
        return Some(ValueShape::Coded);
    }
    let text = rep.first_text()?;
    if text.is_empty() {
        return None;
    }
    if is_hl7_timestamp_shape(text) {
        return Some(ValueShape::Timestamp);
    }
    // `str::parse::<f64>` also accepts the words "nan", "inf" and "infinity"
    // in any letter case; requiring a digit keeps such text out of `Numeric`.
    let has_digit = text.bytes().any(|byte| byte.is_ascii_digit());
    if has_digit && text.parse::<f64>().is_ok() {
        Some(ValueShape::Numeric)
    } else {
        Some(ValueShape::Text)
    }
}
/// Reports whether `text` consists solely of ASCII digits and is 8, 12 or 14 bytes long.
fn is_hl7_timestamp_shape(text: &str) -> bool {
    let digits_only = text.bytes().all(|byte| byte.is_ascii_digit());
    digits_only && [8, 12, 14].contains(&text.len())
}
/// Orders field paths by segment id, then numeric field index, then the raw string.
///
/// This places `PID.9` before `PID.10`, which plain string ordering would not.
fn compare_field_paths(left: &str, right: &str) -> Ordering {
    let left_key = split_field_path(left);
    let right_key = split_field_path(right);
    match left_key.cmp(&right_key) {
        Ordering::Equal => left.cmp(right),
        decided => decided,
    }
}
/// Pairs two counts with their signed difference (`after - before`).
fn total_diff(before: usize, after: usize) -> CorpusTotalDiff {
    let delta = signed_delta(before, after);
    CorpusTotalDiff {
        before,
        after,
        delta,
    }
}
/// Compares two tallies label by label and returns the entries that changed.
///
/// A label missing from one side counts as 0 there. The result is ordered by
/// label.
fn diff_counts(before: &[CorpusCount], after: &[CorpusCount]) -> Vec<CorpusCountDiff> {
    let before_counts = count_map(before);
    let after_counts = count_map(after);
    let labels: BTreeSet<&str> = before_counts
        .keys()
        .chain(after_counts.keys())
        .map(String::as_str)
        .collect();

    let mut diffs = Vec::new();
    for label in labels {
        let old = before_counts.get(label).copied().unwrap_or_default();
        let new = after_counts.get(label).copied().unwrap_or_default();
        let delta = signed_delta(old, new);
        if delta == 0 {
            continue;
        }
        diffs.push(CorpusCountDiff {
            value: label.to_string(),
            before: old,
            after: new,
            delta,
        });
    }
    diffs
}
/// Indexes a tally list by label; a repeated label keeps its last count.
fn count_map(counts: &[CorpusCount]) -> BTreeMap<String, usize> {
    let mut by_label = BTreeMap::new();
    for entry in counts {
        by_label.insert(entry.value.clone(), entry.count);
    }
    by_label
}
/// Lists the labels that were absent before (count 0) and are present after.
fn new_values(counts: &[CorpusCountDiff]) -> Vec<String> {
    let mut added = Vec::new();
    for diff in counts {
        if diff.before == 0 && diff.after > 0 {
            added.push(diff.value.clone());
        }
    }
    added
}
/// Lists the labels that were present before and are absent (count 0) after.
fn removed_values(counts: &[CorpusCountDiff]) -> Vec<String> {
    let mut dropped = Vec::new();
    for diff in counts {
        if diff.before > 0 && diff.after == 0 {
            dropped.push(diff.value.clone());
        }
    }
    dropped
}
/// Compares field presence statistics path by path and returns the changed entries.
///
/// A path missing from one side contributes zeros there. The result is
/// ordered by segment and numeric field index.
fn diff_field_presence(
    before: &[CorpusFieldPresence],
    after: &[CorpusFieldPresence],
) -> Vec<CorpusFieldPresenceDiff> {
    let before_fields = field_presence_map(before);
    let after_fields = field_presence_map(after);
    let paths: BTreeSet<&str> = before_fields
        .keys()
        .chain(after_fields.keys())
        .map(String::as_str)
        .collect();

    let mut diffs: Vec<CorpusFieldPresenceDiff> = Vec::new();
    for path in paths {
        let (before_message_count, before_occurrence_count) = before_fields
            .get(path)
            .map_or((0, 0), |field| (field.message_count, field.occurrence_count));
        let (after_message_count, after_occurrence_count) = after_fields
            .get(path)
            .map_or((0, 0), |field| (field.message_count, field.occurrence_count));

        let message_count_delta = signed_delta(before_message_count, after_message_count);
        let occurrence_count_delta =
            signed_delta(before_occurrence_count, after_occurrence_count);
        if message_count_delta == 0 && occurrence_count_delta == 0 {
            continue;
        }
        diffs.push(CorpusFieldPresenceDiff {
            path: path.to_string(),
            before_message_count,
            after_message_count,
            message_count_delta,
            before_occurrence_count,
            after_occurrence_count,
            occurrence_count_delta,
        });
    }
    diffs.sort_by(|left, right| compare_field_paths(&left.path, &right.path));
    diffs
}
/// Indexes presence entries by path; a repeated path keeps its last entry.
fn field_presence_map(fields: &[CorpusFieldPresence]) -> BTreeMap<String, &CorpusFieldPresence> {
    let mut by_path = BTreeMap::new();
    for entry in fields {
        by_path.insert(entry.path.clone(), entry);
    }
    by_path
}
/// Compares field cardinality statistics path by path and returns the changed entries.
///
/// A path missing from one side contributes zeros there. The result is
/// ordered by segment and numeric field index.
fn diff_field_cardinality(
    before: &[CorpusFieldCardinality],
    after: &[CorpusFieldCardinality],
) -> Vec<CorpusFieldCardinalityDiff> {
    let before_fields = field_cardinality_map(before);
    let after_fields = field_cardinality_map(after);
    let paths: BTreeSet<&str> = before_fields
        .keys()
        .chain(after_fields.keys())
        .map(String::as_str)
        .collect();

    // (min, max, total, message count) for one side, all zero when the path is absent.
    let read = |side: Option<&&CorpusFieldCardinality>| {
        side.map_or((0, 0, 0, 0), |field| {
            (
                field.min_per_message,
                field.max_per_message,
                field.total_occurrences,
                field.message_count,
            )
        })
    };

    let mut diffs: Vec<CorpusFieldCardinalityDiff> = Vec::new();
    for path in paths {
        let (
            before_min_per_message,
            before_max_per_message,
            before_total_occurrences,
            before_message_count,
        ) = read(before_fields.get(path));
        let (
            after_min_per_message,
            after_max_per_message,
            after_total_occurrences,
            after_message_count,
        ) = read(after_fields.get(path));

        let min_per_message_delta = signed_delta(before_min_per_message, after_min_per_message);
        let max_per_message_delta = signed_delta(before_max_per_message, after_max_per_message);
        let total_occurrences_delta =
            signed_delta(before_total_occurrences, after_total_occurrences);
        let message_count_delta = signed_delta(before_message_count, after_message_count);

        let unchanged = min_per_message_delta == 0
            && max_per_message_delta == 0
            && total_occurrences_delta == 0
            && message_count_delta == 0;
        if unchanged {
            continue;
        }
        diffs.push(CorpusFieldCardinalityDiff {
            path: path.to_string(),
            before_min_per_message,
            after_min_per_message,
            min_per_message_delta,
            before_max_per_message,
            after_max_per_message,
            max_per_message_delta,
            before_total_occurrences,
            after_total_occurrences,
            total_occurrences_delta,
            before_message_count,
            after_message_count,
            message_count_delta,
        });
    }
    diffs.sort_by(|left, right| compare_field_paths(&left.path, &right.path));
    diffs
}
/// Indexes cardinality entries by path; a repeated path keeps its last entry.
fn field_cardinality_map(
    fields: &[CorpusFieldCardinality],
) -> BTreeMap<String, &CorpusFieldCardinality> {
    let mut by_path = BTreeMap::new();
    for entry in fields {
        by_path.insert(entry.path.clone(), entry);
    }
    by_path
}
fn diff_value_shape_stats(
before: &[CorpusValueShapeStats],
after: &[CorpusValueShapeStats],
) -> Vec<CorpusValueShapeStatsDiff> {
let before_shapes = value_shape_stats_map(before);
let after_shapes = value_shape_stats_map(after);
let mut paths = BTreeSet::new();
for path in before_shapes.keys() {
paths.insert(path.as_str());
}
for path in after_shapes.keys() {
paths.insert(path.as_str());
}
let mut diffs: Vec<CorpusValueShapeStatsDiff> = paths
.into_iter()
.map(|path| {
let before = before_shapes.get(path);
let after = after_shapes.get(path);
CorpusValueShapeStatsDiff {
path: path.to_string(),
coded_count: total_diff(
before.map_or(0, |shape| shape.coded_count),
after.map_or(0, |shape| shape.coded_count),
),
timestamp_count: total_diff(
before.map_or(0, |shape| shape.timestamp_count),
after.map_or(0, |shape| shape.timestamp_count),
),
numeric_count: total_diff(
before.map_or(0, |shape| shape.numeric_count),
after.map_or(0, |shape| shape.numeric_count),
),
null_count: total_diff(
before.map_or(0, |shape| shape.null_count),
after.map_or(0, |shape| shape.null_count),
),
text_count: total_diff(
before.map_or(0, |shape| shape.text_count),
after.map_or(0, |shape| shape.text_count),
),
}
})
.filter(|shape| {
shape.coded_count.delta != 0
|| shape.timestamp_count.delta != 0
|| shape.numeric_count.delta != 0
|| shape.null_count.delta != 0
|| shape.text_count.delta != 0
})
.collect();
diffs.sort_by(|left, right| compare_field_paths(&left.path, &right.path));
diffs
}
/// Indexes value-shape entries by path; a repeated path keeps its last entry.
fn value_shape_stats_map(
    stats: &[CorpusValueShapeStats],
) -> BTreeMap<String, &CorpusValueShapeStats> {
    let mut by_path = BTreeMap::new();
    for entry in stats {
        by_path.insert(entry.path.clone(), entry);
    }
    by_path
}
/// Returns `after - before` as a signed 128-bit value.
fn signed_delta(before: usize, after: usize) -> i128 {
    let widen = |count: usize| i128::try_from(count).unwrap_or(i128::MAX);
    widen(after).saturating_sub(widen(before))
}
/// Splits a `SEGMENT.index` path at its first dot.
///
/// The index is `usize::MAX` when there is no dot or when the part after it
/// is not an unsigned integer, so such paths sort last within their segment.
fn split_field_path(path: &str) -> (&str, usize) {
    match path.split_once('.') {
        Some((segment, field)) => (segment, field.parse().unwrap_or(usize::MAX)),
        None => (path, usize::MAX),
    }
}
/// Adds one to the tally for `value`, starting at 1 for a new key and
/// saturating at `usize::MAX`.
fn increment_count(counts: &mut BTreeMap<String, usize>, value: String) {
    counts
        .entry(value)
        .and_modify(|count| *count = count.saturating_add(1))
        .or_insert(1);
}
/// Converts a tally map into a list of `CorpusCount`, preserving key order.
fn counts_to_vec(counts: BTreeMap<String, usize>) -> Vec<CorpusCount> {
    let mut listed = Vec::with_capacity(counts.len());
    for (value, count) in counts {
        listed.push(CorpusCount { value, count });
    }
    listed
}
/// Returns the message type from the first `MSH` segment that has one.
///
/// The value is read from the stored field at index 7 (displayed as
/// `MSH.9`): the leading text atom of each component in its first
/// repetition, joined with `^`. Components whose first atom is not text are
/// skipped. Returns `"UNKNOWN"` when no `MSH` segment provides that field
/// with at least one component.
pub fn extract_message_type(message: &Message) -> String {
    message
        .segments
        .iter()
        .filter(|segment| &segment.id == b"MSH")
        .find_map(|segment| {
            let rep = segment.fields.get(7)?.reps.first()?;
            if rep.comps.is_empty() {
                return None;
            }
            let parts: Vec<String> = rep
                .comps
                .iter()
                .filter_map(|component| match component.subs.first() {
                    Some(Atom::Text(text)) => Some(text.clone()),
                    _ => None,
                })
                .collect();
            Some(parts.join("^"))
        })
        .unwrap_or_else(|| "UNKNOWN".to_string())
}
// Tests for corpus summarizing, fingerprinting and diffing, run against
// small HL7 fixtures written into temporary directories.
#[cfg(test)]
mod summary_tests {
#![expect(
clippy::panic,
reason = "Corpus summary tests fail explicitly on test setup errors."
)]
use super::*;
// Fixture: an ADT^A01 message with an MSH and a PID segment.
const ADT_A01: &str = "MSH|^~\\&|SENDAPP|SENDFAC|RECVAPP|RECVFAC|202605080101||ADT^A01|CTRL123|P|2.5\rPID|1||123456^^^HOSP^MR||Doe^John||19700101|M";
// Fixture: an ORU^R01 message with MSH, PID, OBR and one OBX whose OBX.5 is "13.2".
const ORU_R01: &str = "MSH|^~\\&|LAB|LAB|EHR|HOSP|202605080101||ORU^R01|CTRL456|P|2.5\rPID|1||123456^^^HOSP^MR||Doe^John||19700101|M\rOBR|1|ORD1|FILL1|CBC^Complete Blood Count\rOBX|1|NM|718-7^Hemoglobin||13.2|g/dL";
// Writes `contents` to `path`, failing the test if the write fails.
fn write_message(path: &Path, contents: &str) {
let result = fs::write(path, contents);
assert!(result.is_ok(), "test message should be written: {result:?}");
}
// Two valid files: checks file/message totals, per-type and per-segment
// tallies, and that PID.3 is present in both messages.
#[test]
fn summarize_corpus_path_counts_messages_segments_and_fields() {
let Ok(dir) = tempfile::tempdir() else {
panic!("test temp dir should be created");
};
write_message(&dir.path().join("adt.hl7"), ADT_A01);
write_message(&dir.path().join("oru.hl7"), ORU_R01);
let Ok(summary) = summarize_corpus_path(dir.path()) else {
panic!("corpus should summarize");
};
assert_eq!(summary.file_count, 2);
assert_eq!(summary.message_count, 2);
assert_eq!(summary.parse_error_count, 0);
assert!(
summary
.message_types
.iter()
.any(|count| count.value == "ADT^A01" && count.count == 1)
);
assert!(
summary
.message_types
.iter()
.any(|count| count.value == "ORU^R01" && count.count == 1)
);
assert!(
summary
.segments
.iter()
.any(|count| count.value == "PID" && count.count == 2)
);
assert!(
summary
.field_presence
.iter()
.any(|field| field.path == "PID.3" && field.message_count == 2)
);
}
// One valid and one unparseable file: the failure is recorded with its
// root-relative path instead of aborting the summary.
#[test]
fn summarize_corpus_path_records_parse_failures() {
let Ok(dir) = tempfile::tempdir() else {
panic!("test temp dir should be created");
};
write_message(&dir.path().join("valid.hl7"), ADT_A01);
write_message(&dir.path().join("invalid.hl7"), "not an hl7 message");
let Ok(summary) = summarize_corpus_path(dir.path()) else {
panic!("corpus should summarize");
};
assert_eq!(summary.file_count, 2);
assert_eq!(summary.message_count, 1);
assert_eq!(summary.parse_error_count, 1);
assert_eq!(
summary
.parse_errors
.first()
.map(|failure| failure.path.as_str()),
Some("invalid.hl7")
);
}
// "before" holds only the ADT message, "after" adds the ORU message: the diff
// must report the new type, the new OBX segment and the OBX.5 deltas.
#[test]
fn diff_corpus_paths_reports_count_deltas() {
let Ok(before) = tempfile::tempdir() else {
panic!("before temp dir should be created");
};
let Ok(after) = tempfile::tempdir() else {
panic!("after temp dir should be created");
};
write_message(&before.path().join("adt.hl7"), ADT_A01);
write_message(&after.path().join("adt.hl7"), ADT_A01);
write_message(&after.path().join("oru.hl7"), ORU_R01);
let Ok(diff) = diff_corpus_paths(before.path(), after.path()) else {
panic!("corpus diff should be created");
};
assert_eq!(diff.file_count.before, 1);
assert_eq!(diff.file_count.after, 2);
assert_eq!(diff.file_count.delta, 1);
assert_eq!(diff.message_count.delta, 1);
assert!(
diff.message_type_counts
.iter()
.any(|count| count.value == "ORU^R01" && count.before == 0 && count.after == 1)
);
assert_eq!(diff.new_message_types, vec!["ORU^R01".to_string()]);
assert!(
diff.segment_counts
.iter()
.any(|count| count.value == "OBX" && count.before == 0 && count.after == 1)
);
assert!(diff.new_segments.iter().any(|segment| segment == "OBX"));
assert!(
diff.field_presence
.iter()
.any(|field| field.path == "OBX.5" && field.message_count_delta == 1)
);
assert!(
diff.field_cardinality
.iter()
.any(|field| field.path == "OBX.5"
&& field.max_per_message_delta == 1
&& field.total_occurrences_delta == 1)
);
assert!(
diff.value_shape_stats
.iter()
.any(|shape| shape.path == "OBX.5" && shape.numeric_count.delta == 1)
);
}
// Two valid files plus one unparseable file: OBX.5 appears in only one of the
// two parsed messages, so its per-message minimum is 0 and its maximum is 1.
#[test]
fn fingerprint_corpus_path_reports_shape_and_cardinality() {
let Ok(dir) = tempfile::tempdir() else {
panic!("test temp dir should be created");
};
write_message(&dir.path().join("adt.hl7"), ADT_A01);
write_message(&dir.path().join("oru.hl7"), ORU_R01);
write_message(&dir.path().join("invalid.hl7"), "not an hl7 message");
let Ok(fingerprint) = fingerprint_corpus_path(dir.path()) else {
panic!("corpus should fingerprint");
};
assert_eq!(fingerprint.fingerprint_version, "1");
assert_eq!(fingerprint.file_count, 3);
assert_eq!(fingerprint.message_count, 2);
assert_eq!(fingerprint.parse_error_count, 1);
assert!(
fingerprint
.message_type_counts
.iter()
.any(|count| count.value == "ORU^R01" && count.count == 1)
);
assert!(
fingerprint
.field_cardinality
.iter()
.any(|field| field.path == "OBX.5"
&& field.min_per_message == 0
&& field.max_per_message == 1)
);
assert!(
fingerprint
.value_shape_stats
.iter()
.any(|shape| shape.path == "OBX.5" && shape.numeric_count == 1)
);
}
}