import subprocess
import pandas as pd
import sys
import json
import os
from pathlib import Path
def run_cgdist(args):
    """Invoke the release build of cgdist and collect its output.

    Args:
        args: List of command-line arguments appended after the binary path.

    Returns:
        A ``(stdout, stderr, returncode)`` tuple; both streams are captured
        as text.
    """
    completed = subprocess.run(
        ['../target/release/cgdist'] + args,
        capture_output=True,
        text=True,
    )
    streams_and_status = (
        completed.stdout,
        completed.stderr,
        completed.returncode,
    )
    return streams_and_status
def load_distance_matrix(filepath):
    """Read a tab-separated distance matrix into a DataFrame.

    Lines beginning with ``#`` are dropped before parsing; the first column
    of the remaining table is used as the index.

    Args:
        filepath: Path of the TSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    import io

    with open(filepath, 'r') as handle:
        # Only lines whose very first character is '#' count as comments.
        payload = ''.join(
            row for row in handle if not row.startswith('#')
        )
    return pd.read_csv(io.StringIO(payload), sep='\t', index_col=0)
def validate_cache_consistency():
    """Check that cgdist output is identical with and without the cache.

    For every distance mode the tool is run twice -- once with
    ``--cache-file`` and once with ``--force-recompute`` -- and the two
    resulting TSV matrices are loaded and compared.

    Returns:
        A ``(all_passed, results)`` tuple. ``results`` maps each attempted
        mode name to ``True``/``False``. When the cache file is missing the
        function returns ``(False, {})`` so that callers can always unpack
        two values.
    """
    print("=" * 80)
    print("CACHE CONSISTENCY VALIDATION")
    print("=" * 80)
    cache_file = "results/validation_cache.lz4"
    if not os.path.exists(cache_file):
        print("❌ Cache file not found. Run with --cache-file first.")
        # Same shape as the normal return: the caller unpacks two values.
        return False, {}
    schema_dir = "schema_crc32"
    profiles = "profiles/test_profiles_crc32.tsv"
    distance_modes = [
        ("hamming", "Hamming distance"),
        ("snps", "SNPs only"),
        ("snps-indel-contiguous", "SNPs + InDel-contiguous"),
        ("snps-indel-bases", "SNPs + InDel bases"),
    ]
    results = {}
    all_passed = True
    for mode, description in distance_modes:
        print(f"\n📊 Testing {description} mode...")
        output_with_cache = f"results/cache_test_{mode}_with.tsv"
        output_without_cache = f"results/cache_test_{mode}_without.tsv"
        try:
            _, stderr, rc = run_cgdist([
                '--schema', schema_dir,
                '--profiles', profiles,
                '--output', output_with_cache,
                '--mode', mode,
                '--hasher-type', 'crc32',
                '--cache-file', cache_file,
            ])
            if rc != 0:
                print(f"❌ Failed with cache: {stderr}")
                results[mode] = False
                all_passed = False
                continue
            _, stderr, rc = run_cgdist([
                '--schema', schema_dir,
                '--profiles', profiles,
                '--output', output_without_cache,
                '--mode', mode,
                '--hasher-type', 'crc32',
                '--force-recompute',
            ])
            if rc != 0:
                print(f"❌ Failed without cache: {stderr}")
                results[mode] = False
                all_passed = False
                continue
            try:
                matrix_with = load_distance_matrix(output_with_cache)
                matrix_without = load_distance_matrix(output_without_cache)
                if matrix_with.equals(matrix_without):
                    print(" ✅ PASS - Matrices identical with/without cache")
                    results[mode] = True
                else:
                    print(" ❌ FAIL - Matrices differ with/without cache")
                    diff_mask = matrix_with != matrix_without
                    if diff_mask.any().any():
                        print(" Differences found:")
                        for i in matrix_with.index:
                            for j in matrix_with.columns:
                                if diff_mask.loc[i, j]:
                                    print(f" {i} vs {j}: with_cache={matrix_with.loc[i, j]}, without_cache={matrix_without.loc[i, j]}")
                    results[mode] = False
                    all_passed = False
            except Exception as e:
                # Any load/compare problem counts as a failed mode rather
                # than aborting the remaining modes.
                print(f" ❌ Error comparing matrices: {e}")
                results[mode] = False
                all_passed = False
        finally:
            # Best-effort cleanup on every path (including the early
            # `continue`s). Each file is removed independently so that a
            # missing first file does not leave the second one behind.
            for leftover in (output_with_cache, output_without_cache):
                try:
                    os.remove(leftover)
                except OSError:
                    pass
    return all_passed, results
def validate_cache_metadata():
    """Inspect the validation cache and verify its reported metadata.

    Runs cgdist with ``--inspector`` on the cache file, echoes the report,
    and checks that each expected substring appears in it.

    Returns:
        ``True`` when the inspection succeeds and every expected string is
        present in its output, ``False`` otherwise.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("CACHE METADATA VALIDATION")
    print(banner)
    report, errors, exit_code = run_cgdist(
        ['--inspector', "results/validation_cache.lz4"]
    )
    if exit_code != 0:
        print(f"❌ Failed to inspect cache: {errors}")
        return False
    print("📋 Cache inspection results:")
    print(report)
    # Expected substring of the inspector report -> message printed on success.
    expectations = {
        "Version: 0.1.0": "✅ Version check",
        "Hasher type: crc32": "✅ Hasher type check",
        "Distance mode: snps-indel-bases": "✅ Distance mode check",
        "Total entries: 70": "✅ Entry count check",
        "Unique loci: 3": "✅ Loci count check",
        "Validation test cache": "✅ User note check",
    }
    every_found = True
    for needle, ok_message in expectations.items():
        if needle not in report:
            print(f" ❌ Missing: {needle}")
            every_found = False
            continue
        print(f" {ok_message}")
    return every_found
def validate_cache_performance():
    """Verify that a cached run reports a 100% cache hit rate.

    Runs cgdist in ``snps-indel-bases`` mode against the validation cache
    and looks for the ``Already in cache: 70 (100.0%)`` line in stdout.

    Returns:
        ``True`` when the run succeeds and the hit-rate line is present,
        ``False`` otherwise.
    """
    print("\n" + "=" * 80)
    print("CACHE PERFORMANCE VALIDATION")
    print("=" * 80)
    cache_file = "results/validation_cache.lz4"
    schema_dir = "schema_crc32"
    profiles = "profiles/test_profiles_crc32.tsv"
    output_file = 'results/perf_test_cached.tsv'
    print("🚀 Testing with cache (should be fast)...")
    try:
        stdout, stderr, rc = run_cgdist([
            '--schema', schema_dir,
            '--profiles', profiles,
            '--output', output_file,
            '--mode', 'snps-indel-bases',
            '--hasher-type', 'crc32',
            '--cache-file', cache_file,
        ])
        if rc != 0:
            print(f"❌ Failed with cache: {stderr}")
            return False
        if "Already in cache: 70 (100.0%)" in stdout:
            print(" ✅ PASS - 100% cache hit rate achieved")
            cache_performance = True
        else:
            print(" ❌ FAIL - Expected 100% cache hit rate")
            cache_performance = False
        return cache_performance
    finally:
        # Best-effort cleanup on every exit path. Only filesystem errors
        # are ignored, so KeyboardInterrupt/SystemExit are not swallowed.
        try:
            os.remove(output_file)
        except OSError:
            pass
def main():
    """Run the consistency, metadata and performance validations in order.

    An exception raised by a stage is reported and counted as a failure;
    the remaining stages still run.

    Returns:
        ``True`` only if every stage passed.
    """
    rule = "=" * 80
    print("🔍 cgDist Cache Validation Suite")
    print(rule)
    print("Validating cache consistency, metadata, and performance...")
    print()

    def consistency_stage():
        # The consistency check yields (passed, per-mode results); only the
        # overall flag matters here.
        passed, _per_mode = validate_cache_consistency()
        return passed

    stages = (
        ("consistency", consistency_stage),
        ("metadata", validate_cache_metadata),
        ("performance", validate_cache_performance),
    )
    all_tests_passed = True
    for label, stage in stages:
        try:
            stage_ok = stage()
        except Exception as exc:
            print(f"❌ Cache {label} test failed: {exc}")
            stage_ok = False
        if not stage_ok:
            all_tests_passed = False

    print("\n" + rule)
    if not all_tests_passed:
        print("⚠️ SOME CACHE VALIDATION TESTS FAILED")
        print("Review the detailed output above for specific issues")
    else:
        print("🎉 ALL CACHE VALIDATION TESTS PASSED!")
        print("✅ Cache is consistent across all distance modes")
        print("✅ Cache metadata is correct and complete")
        print("✅ Cache provides expected performance benefits")
        print("✅ Scientific community can trust cache integrity")
    print(rule)
    return all_tests_passed
if __name__ == '__main__':
    # Exit status 0 when every validation passed, 1 otherwise.
    success = main()
    exit_status = 1 if not success else 0
    sys.exit(exit_status)