cgdist 0.1.1

Ultra-fast SNP/indel-level distance calculator for core genome MLST analysis
Documentation
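
A minimal invocation reads an allelic profile table with --profiles, the matching allele schema with --schema, and writes a pairwise distance matrix to --output. The flag names are taken from the argument handling in main.rs below; the binary name and file paths are illustrative placeholders:

    cgdist --profiles profiles.tsv --schema /path/to/schema --output distances.tsv --threads 4
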
// main.rs - CLI entry point

#![allow(unknown_lints, clippy::manual_is_multiple_of)]

use std::collections::HashSet;
use std::time::Instant;
// Removed unused import
use cgdist::cli::Config;
use cgdist::core::distance::ModernCache;
use cgdist::data::SequenceDatabase;
use cgdist::prelude::*;

// Type alias for complex cache key/value types
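// Presumed layout (not verified against the legacy cache writer): key is roughly
// (locus, crc1, crc2, alignment-config hash) and the value roughly (snps, indel_events, indel_bases).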
type LegacyCacheData = std::collections::HashMap<(String, u32, u32, u64), (usize, usize, usize)>;

#[derive(serde::Deserialize)]
struct LegacyCache {
    #[allow(dead_code)]
    data: LegacyCacheData,
    #[allow(dead_code)]
    version: Option<String>,
    #[allow(dead_code)]
    created: Option<String>,
    #[allow(dead_code)]
    alignment_config: AlignmentConfig,
}

fn main() {
    if let Err(e) = run_main() {
        eprintln!("❌ ERROR: {e}");
        std::process::exit(1);
    }
}

fn run_main() -> Result<(), String> {
    let mut args: Args = argh::from_env();

    // Handle --version / -V before any other processing.
    if args.version {
        println!("cgdist {}", env!("CARGO_PKG_VERSION"));
        return Ok(());
    }

    let command_line = std::env::args().collect::<Vec<String>>().join(" ");

    // ---- Deprecation warnings & alias normalization ----
    if args.mode.eq_ignore_ascii_case("snps-indel-events")
        || args.mode.eq_ignore_ascii_case("snps+indel-events")
    {
        eprintln!(
            "⚠️  Note: --mode {} is a deprecated alias for --mode snps-indel-contiguous. \
             The legacy name is still accepted; please update your scripts.",
            args.mode
        );
    }
    if args.recombination_log.is_some() && args.candidate_recombination_log.is_none() {
        eprintln!(
            "⚠️  Note: --recombination-log is a deprecated alias for \
             --candidate-recombination-log. The legacy name is still accepted; \
             please update your scripts."
        );
        args.candidate_recombination_log = args.recombination_log.clone();
    }
    if args.recombination_threshold.is_some() && args.candidate_recombination_threshold.is_none() {
        eprintln!(
            "⚠️  Note: --recombination-threshold is a deprecated alias for \
             --candidate-recombination-threshold. The legacy name is still accepted; \
             please update your scripts."
        );
        args.candidate_recombination_threshold = args.recombination_threshold;
    }
    if args.no_hamming_fallback {
        eprintln!(
            "⚠️  Note: --no-hamming-fallback is now a no-op. The Hamming fallback is opt-in \
             via --hamming-fallback (default disabled), so explicitly disabling it is \
             redundant. The flag is accepted for backward compatibility."
        );
    }
    if args.hamming_fallback {
        eprintln!(
            "⚠️  Note: --hamming-fallback is active. In SNPs-only mode, allele pairs with 0 SNPs \
             but different hashes (i.e. InDel-only differences) will contribute +1 instead of 0. \
             The +1 is a design-preserving placeholder, not an inferred SNP count."
        );
    }

    // Handle generate config first
    if args.generate_config {
        let sample_config = Config::generate_sample();
        println!("{sample_config}");
        println!("\n💡 Save this content to a .toml file and use --config /path/to/config.toml");
        return Ok(());
    }

    // Load configuration file if specified
    if let Some(config_path) = args.config.clone() {
        args = args.with_config_file(&config_path)?;
    }

    // Handle inspector mode
    if let Some(cache_path) = &args.inspector {
        run_inspector(cache_path);
        return Ok(());
    }

    // Validate required parameters when not using inspector
    let profiles = args.profiles.as_ref().ok_or("--profiles is required")?;

    let output = if args.stats_only || args.benchmark || args.cache_only {
        None
    } else {
        Some(args.output.as_ref().ok_or("--output is required")?)
    };

    // Schema is required unless using the hamming hasher or stats-only mode
    // (cache-only runs still need it, since alignments require sequences)
    let schema = if args.hasher_type == "hamming" || args.stats_only {
        None
    } else {
        Some(
            args.schema
                .as_ref()
                .ok_or("--schema is required (not needed for hamming hasher or stats-only mode)")?,
        )
    };

    println!("🚀 cgDist v{}", env!("CARGO_PKG_VERSION"));
    println!("⚡ Strategy: Pre-compute unique pairs → Batch Parasail → Fast assembly");

    // Configure thread pool. Default is 1 thread: on shared systems, auto-detection can
    // interfere with other processes. Pass `--threads 0` for auto-detect, or `--threads N`
    // to request a specific number.
    let requested_threads = args.threads.unwrap_or(1);
    if requested_threads == 0 {
        let num_threads = rayon::current_num_threads();
        println!("🧵 Threads: {num_threads} (auto-detected via --threads 0)");
    } else {
        rayon::ThreadPoolBuilder::new()
            .num_threads(requested_threads)
            .build_global()
            .expect("Failed to configure thread pool");
        println!(
            "🧵 Threads: {}{}",
            requested_threads,
            if args.threads.is_none() {
                " (default; pass --threads N or 0 for auto-detect)"
            } else {
                ""
            }
        );
    }

    // Initialize hasher registry and validate hasher type
    let registry = HasherRegistry::new();
    println!("🔨 Hasher: {}", args.hasher_type);

    if !registry.has_hasher(&args.hasher_type) {
        eprintln!("❌ Error: Invalid hasher type '{}'", args.hasher_type);
        eprintln!("Available hashers:");
        for (name, desc) in registry.list_hashers() {
            eprintln!("  - {name}: {desc}");
        }
        std::process::exit(1);
    }

    // Handle benchmark mode
    if args.benchmark {
        return run_benchmark(&args);
    }

    // Validate all arguments
    let validation_result = validate_args(&args)?;

    let total_start = Instant::now();

    // Load allelic profiles
    let matrix = match AllelicMatrix::from_file_with_hasher(
        std::path::Path::new(profiles), // Now always required
        &args.missing_char,
        &args.hasher_type,
        args.sample_threshold,
        args.locus_threshold,
        validation_result.sample_include_regex.as_ref(),
        validation_result.sample_exclude_regex.as_ref(),
        validation_result.samples_include_set.as_ref(),
        validation_result.samples_exclude_set.as_ref(),
        validation_result.loci_include_regex.as_ref(),
        validation_result.loci_exclude_regex.as_ref(),
        validation_result.loci_include_set.as_ref(),
        validation_result.loci_exclude_set.as_ref(),
    ) {
        Ok(m) => m,
        Err(e) => {
            eprintln!("❌ ERROR loading profiles: {e}");
            std::process::exit(1);
        }
    };

    if args.dry_run {
        println!("✅ Dry run completed successfully");
        println!(
            "📊 Final matrix: {} samples × {} loci",
            matrix.samples.len(),
            matrix.loci_names.len()
        );
        return Ok(());
    }

    // Handle stats-only mode
    if args.stats_only {
        println!("\n📈 === GENETIC DIVERSITY ANALYSIS ===");
        let diversity_stats = matrix.calculate_diversity_metrics();

        println!("🧬 Allelic Diversity Metrics:");
        println!(
            "  • Average unique alleles per locus: {:.1}",
            diversity_stats.avg_unique_alleles
        );
        println!(
            "  • Diversity index: {:.3} ({})",
            diversity_stats.diversity_index, diversity_stats.diversity_category
        );
        println!(
            "  • Total unique CRC pairs: {}",
            diversity_stats.total_unique_pairs
        );

        if diversity_stats.diversity_index < 0.3 {
            println!("  🟢 Population appears clonal with low genetic diversity");
        } else if diversity_stats.diversity_index < 0.6 {
            println!("  🟡 Population shows moderate genetic diversity");
        } else {
            println!("  🔴 Population is highly diverse and genetically heterogeneous");
        }

        println!("\n✅ Statistics analysis completed");
        return Ok(());
    }

    if args.hasher_type == "hamming" {
        println!("\n🎯 Distance calculation: Hamming hasher (allelic level)");
    } else {
        println!(
            "\n🎯 Distance calculation mode: {} ({})",
            validation_result.distance_mode.description(),
            if args.hamming_fallback {
                "Hamming fallback enabled (opt-in)"
            } else {
                "no Hamming fallback (default)"
            }
        );
    }

    if args.min_loci > 0 {
        println!("📏 Minimum shared loci: {}", args.min_loci);
    }

    // Pre-compute unique CRC pairs from the matrix
    println!("🔍 Pre-computing unique allele pairs from matrix...");
    let unique_pairs = extract_unique_crc_pairs(&matrix);
    if args.hasher_type == "hamming" {
        println!(
            "✅ Found {} unique CRC pairs for Hamming distance",
            unique_pairs.len()
        );
    } else {
        println!("✅ Found {} unique CRC pairs to align", unique_pairs.len());
    }

    // Load only required sequences from schema (skip for hamming hasher)
    let sequence_db = if args.hasher_type == "hamming" {
        SequenceDatabase::empty()
    } else {
        match SequenceDatabase::from_schema_selective(
            std::path::Path::new(schema.unwrap()),
            &unique_pairs,
            &matrix.loci_names,
        ) {
            Ok(db) => db,
            Err(e) => {
                eprintln!("❌ ERROR loading schema: {e}");
                std::process::exit(1);
            }
        }
    };

    // Create distance engine with hasher type
    let mut engine = DistanceEngine::with_sequences(
        validation_result.alignment_config,
        sequence_db,
        args.hasher_type.clone(),
    );

    // Set cache note if provided
    if let Some(note) = &args.cache_note {
        engine.set_cache_note(note.clone());
    }

    // Set alignment saving if requested
    if let Some(ref save_path) = args.save_alignments {
        engine.set_save_alignments(save_path.clone());
        println!("💾 Alignment details will be saved to: {save_path}");
    }

    // Try to load existing cache first (skip for hamming hasher)
    if args.hasher_type != "hamming" {
        if let Some(ref cache_path) = args.cache_file {
            if std::path::Path::new(cache_path).exists() && !args.force_recompute {
                // Compatibility already checked early, proceed with full cache load
                match engine.load_cache(cache_path, validation_result.distance_mode) {
                    Ok(()) => {
                        let (loaded_entries, _) = engine.cache_stats();
                        println!("🎯 Cache loaded successfully with {loaded_entries} entries");

                        // Check for automatic or explicit enrichment with sequence lengths
                        let should_enrich = if args.enrich_lengths {
                            // User explicitly requested enrichment
                            true
                        } else if loaded_entries == 0 {
                            // Empty cache - enrich automatically if schema is available
                            schema.is_some()
                        } else {
                            // Non-empty cache - no automatic enrichment unless explicitly requested
                            false
                        };

                        if should_enrich {
                            if let Some(schema_path) = schema {
                                let enrich_reason = if args.enrich_lengths {
                                    "explicitly requested"
                                } else {
                                    "empty cache (automatic)"
                                };
                                println!(
                                    "🔍 Enriching cache with sequence lengths from schema ({enrich_reason})..."
                                );
                                let output_cache_path =
                                    args.enrich_output.as_ref().unwrap_or(cache_path);
                                match engine.enrich_cache_with_lengths_from_input(
                                    schema_path,
                                    cache_path,
                                    output_cache_path,
                                ) {
                                    Ok(enriched_count) => {
                                        if args.enrich_output.is_some() {
                                            println!("✅ Enriched {enriched_count} entries with sequence lengths, saved to: {output_cache_path}");
                                        } else {
                                            println!("✅ Enriched {enriched_count} entries with sequence lengths (cache updated in-place)");
                                        }
                                    }
                                    Err(e) => {
                                        println!("⚠️  Failed to enrich cache: {e}");
                                    }
                                }
                            } else {
                                println!("⚠️  Cannot enrich cache: schema path required");
                            }
                        }
                    }
                    Err(e) => {
                        eprintln!("❌ FATAL ERROR loading cache: {e}");
                        eprintln!("💡 Solutions:");
                        eprintln!("   - Use --force-recompute to ignore existing cache");
                        eprintln!("   - Delete incompatible cache file");
                        eprintln!("   - Use consistent alignment parameters");
                        std::process::exit(1);
                    }
                }
            }

            // Always run precompute to handle missing pairs and show stats
            println!("🚀 Pre-computing missing alignments...");
            engine.precompute_alignments(&unique_pairs, validation_result.distance_mode);
        } else {
            println!("🚀 Pre-computing all alignments in batch...");
            engine.precompute_alignments(&unique_pairs, validation_result.distance_mode);
        }
    } else {
        println!("🔨 Using Hamming hasher - no alignment precomputation needed");
    }

    // Check if cache-only mode
    if args.cache_only {
        println!("\n✅ Cache-only mode: Alignments computed and cached successfully");
        if let Some(ref cache_path) = args.cache_file {
            let had_new_entries = engine.has_new_entries();
            if had_new_entries {
                if let Err(e) = engine.save_cache(cache_path, validation_result.distance_mode) {
                    eprintln!("⚠️  Warning: Failed to save cache: {e}");
                } else {
                    println!("📁 Cache saved to: {cache_path}");

                    // Check if we should automatically enrich the newly saved cache
                    let should_auto_enrich = !args.enrich_lengths && schema.is_some();

                    if should_auto_enrich {
                        if let Some(schema_path) = schema {
                            println!("🔍 Enriching cache with sequence lengths (automatic)...");
                            let output_cache_path =
                                args.enrich_output.as_ref().unwrap_or(cache_path);
                            match engine.enrich_cache_with_lengths_from_input(
                                schema_path,
                                cache_path,
                                output_cache_path,
                            ) {
                                Ok(enriched_count) => {
                                    if args.enrich_output.is_some() {
                                        println!("✅ Enriched {enriched_count} entries with sequence lengths, saved to: {output_cache_path}");
                                    } else {
                                        println!("✅ Enriched {enriched_count} entries with sequence lengths (cache updated in-place)");
                                    }
                                }
                                Err(e) => {
                                    println!("⚠️  Warning: Failed to enrich cache: {e}");
                                }
                            }
                        }
                    }
                }
            } else {
                println!("📌 Cache unchanged - skipping save");
            }
        } else {
            println!("⚠️  Warning: No cache file specified - computed alignments not saved");
        }
        let total_elapsed = total_start.elapsed();
        println!(
            "\n⏱️  Total execution time: {:.2}s",
            total_elapsed.as_secs_f64()
        );
        return Ok(());
    }

    // Set candidate-region recombination flagging if requested (TODO: implement)
    if let Some(ref _log_path) = args.candidate_recombination_log {
        // TODO: Implement candidate-region flagging
        println!(
            "🧬 Candidate-region flagging requested (threshold: {} bp) - not implemented yet",
            args.candidate_recombination_threshold.unwrap_or(20)
        );
    }

    // Calculate distance matrix.
    // The internal `no_hamming_fallback` parameter is the inverse of the opt-in flag:
    // pass true when the user has NOT requested fallback (the default).
    println!("\n🔄 Computing distance matrix...");
    let distance_matrix = calculate_distance_matrix(
        &matrix.samples,
        &matrix.loci_names,
        &engine,
        validation_result.distance_mode,
        args.min_loci,
        !args.hamming_fallback,
    );

    // Write output
    let output_path = output.unwrap(); // Safe: stats-only, benchmark, and cache-only modes all return early above
    if let Err(e) = write_matrix(
        output_path,
        &args.format,
        &matrix.samples,
        &distance_matrix,
        &command_line,
    ) {
        eprintln!("❌ ERROR writing output: {e}");
        std::process::exit(1);
    }

    // Save cache if specified (skip for hamming hasher) and only if there are new entries
    if args.hasher_type != "hamming" {
        if let Some(ref cache_path) = args.cache_file {
            let had_new_entries = engine.has_new_entries();
            if had_new_entries {
                if let Err(e) = engine.save_cache(cache_path, validation_result.distance_mode) {
                    eprintln!("⚠️  Warning: Failed to save cache: {e}");
                } else {
                    // Check if we should automatically enrich the newly saved cache
                    // This covers the case where cache was created for the first time
                    let should_auto_enrich = !args.enrich_lengths && schema.is_some();

                    if should_auto_enrich {
                        if let Some(schema_path) = schema {
                            println!("🔍 Enriching newly created cache with sequence lengths (automatic)...");
                            let output_cache_path =
                                args.enrich_output.as_ref().unwrap_or(cache_path);
                            match engine.enrich_cache_with_lengths_from_input(
                                schema_path,
                                cache_path,
                                output_cache_path,
                            ) {
                                Ok(enriched_count) => {
                                    if args.enrich_output.is_some() {
                                        println!("✅ Enriched {enriched_count} entries with sequence lengths, saved to: {output_cache_path}");
                                    } else {
                                        println!("✅ Enriched {enriched_count} entries with sequence lengths (cache updated in-place)");
                                    }
                                }
                                Err(e) => {
                                    println!(
                                        "⚠️  Warning: Failed to enrich newly created cache: {e}"
                                    );
                                }
                            }
                        }
                    }
                }
            } else {
                println!("📌 Cache unchanged - skipping save");
            }
        }
    }

    // Save alignment details if requested
    if let Err(e) = engine.save_alignments() {
        eprintln!("⚠️  Warning: Failed to save alignment details: {e}");
    }

    // Save candidate-region flagging log if specified (TODO: implement).
    // candidate_recombination_log was already aliased from the deprecated --recombination-log
    // earlier in run_main(), so checking the canonical field is sufficient here.
    if let Some(ref _log_path) = args.candidate_recombination_log {
        println!("⚠️  Candidate-region flagging log saving not implemented yet");
    }

    // Print summary
    let total_elapsed = total_start.elapsed();
    println!("\n🎉 === CGDIST COMPLETED SUCCESSFULLY ===");
    println!(
        "⏱️  Total execution time: {:.2}s",
        total_elapsed.as_secs_f64()
    );
    println!(
        "📊 Final matrix: {} samples × {} loci",
        matrix.samples.len(),
        matrix.loci_names.len()
    );
    println!("📁 Output written to: {output_path}");
    println!("🔧 Command: {command_line}");

    Ok(())
}

/// Run benchmark mode to measure alignment processing speed with real data
fn run_benchmark(args: &Args) -> Result<(), String> {
    println!("\n🏁 === BENCHMARK MODE ===");
    println!(
        "📊 Testing alignment speed with {} threads",
        rayon::current_num_threads()
    );

    // Get initial memory
    let _initial_memory = get_memory_usage();
    let _start_total = Instant::now();

    // Validate arguments
    let validation_result = validate_args(args)?;

    let profiles = args.profiles.as_ref().unwrap();

    // Quick setup - minimal data loading
    println!("\n🚀 Quick setup...");
    let setup_start = Instant::now();

    // Load just enough data to get unique pairs
    let matrix = match AllelicMatrix::from_file_with_hasher(
        std::path::Path::new(profiles),
        &args.missing_char,
        &args.hasher_type,
        args.sample_threshold,
        args.locus_threshold,
        validation_result.sample_include_regex.as_ref(),
        validation_result.sample_exclude_regex.as_ref(),
        validation_result.samples_include_set.as_ref(),
        validation_result.samples_exclude_set.as_ref(),
        validation_result.loci_include_regex.as_ref(),
        validation_result.loci_exclude_regex.as_ref(),
        validation_result.loci_include_set.as_ref(),
        validation_result.loci_exclude_set.as_ref(),
    ) {
        Ok(m) => m,
        Err(e) => return Err(format!("Failed to load profiles: {e}")),
    };

    let unique_pairs = extract_unique_crc_pairs(&matrix);
    println!(
        "   ✅ Setup complete: {} samples, {} loci, {} unique pairs",
        matrix.samples.len(),
        matrix.loci_names.len(),
        unique_pairs.len()
    );

    // Load all necessary sequences for realistic benchmark
    let sequence_db = if args.hasher_type != "hamming" {
        let schema = args
            .schema
            .as_ref()
            .ok_or("--schema is required for benchmark mode")?;

        match SequenceDatabase::from_schema_selective(
            std::path::Path::new(schema),
            &unique_pairs,
            &matrix.loci_names,
        ) {
            Ok(db) => {
                println!("   📂 Loaded {} sequences from schema", db.total_sequences);
                Some(db)
            }
            Err(e) => {
                println!("   ⚠️  Failed to load schema: {e}");
                None
            }
        }
    } else {
        None
    };

    let setup_elapsed = setup_start.elapsed();
    println!("   ⏱️  Setup time: {:.2}s", setup_elapsed.as_secs_f64());

    // Build an engine once up front. It is not used directly: the benchmark loop below
    // creates a fresh engine per chunk so cached alignments do not inflate throughput.
    let _engine = if args.hasher_type != "hamming" {
        if let Some(seq_db) = sequence_db.as_ref() {
            DistanceEngine::with_sequences(
                validation_result.alignment_config.clone(),
                seq_db.clone(),
                args.hasher_type.clone(),
            )
        } else {
            DistanceEngine::new(
                validation_result.alignment_config.clone(),
                args.hasher_type.clone(),
            )
        }
    } else {
        DistanceEngine::new(
            validation_result.alignment_config.clone(),
            args.hasher_type.clone(),
        )
    };

    // Benchmark with configurable timeout
    println!(
        "\n⚡ Benchmarking alignment speed for {} seconds...",
        args.benchmark_duration
    );
    println!("📊 Mode: {:?}", validation_result.distance_mode);
    println!("🔧 Hasher: {}", args.hasher_type);

    // Analyze sequence lengths
    let (total_sequences, min_len, max_len, avg_len) = if let Some(ref seq_db) = sequence_db {
        let mut lengths = Vec::new();
        for locus_map in seq_db.loci.values() {
            for seq_info in locus_map.values() {
                lengths.push(seq_info.sequence.len());
            }
        }

        if !lengths.is_empty() {
            let min = *lengths.iter().min().unwrap();
            let max = *lengths.iter().max().unwrap();
            let avg = lengths.iter().sum::<usize>() as f64 / lengths.len() as f64;
            (lengths.len(), min, max, avg)
        } else {
            (0, 0, 0, 0.0)
        }
    } else {
        (0, 0, 0, 0.0)
    };

    println!("🧬 Loaded sequences: {total_sequences}");
    if total_sequences > 0 {
        println!("📏 Sequence lengths: min={min_len}bp, max={max_len}bp, avg={avg_len:.0}bp");
    }

    let benchmark_start = Instant::now();
    let timeout = std::time::Duration::from_secs(args.benchmark_duration);
    let mut max_pairs_per_second = 0.0;
    let mut total_pairs_processed = 0;

    // Memory tracking
    let initial_memory = get_memory_usage();
    let peak_memory = std::sync::Arc::new(std::sync::atomic::AtomicU64::new(initial_memory));
    let peak_memory_clone = peak_memory.clone();

    // Spawn memory monitor thread
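    // The monitor polls RSS every 100 ms and raises the recorded peak via a CAS loop.
    // Writing 0 into `peak_memory` (done after the benchmark loop) is the shutdown signal.
    // On non-Linux platforms get_memory_usage() returns 0, so the monitor exits
    // immediately and the memory figures reported below read as zero.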
    let monitor_handle = std::thread::spawn(move || {
        while peak_memory_clone.load(std::sync::atomic::Ordering::Relaxed) != 0 {
            let current_mem = get_memory_usage();
            let mut peak = peak_memory_clone.load(std::sync::atomic::Ordering::Relaxed);
            while current_mem > peak
                && peak_memory_clone
                    .compare_exchange_weak(
                        peak,
                        current_mem,
                        std::sync::atomic::Ordering::Relaxed,
                        std::sync::atomic::Ordering::Relaxed,
                    )
                    .is_err()
            {
                peak = peak_memory_clone.load(std::sync::atomic::Ordering::Relaxed);
            }
            std::thread::sleep(std::time::Duration::from_millis(100));
        }
    });

    // Convert to vector for chunked processing
    let pairs_vec: Vec<_> = unique_pairs.iter().cloned().collect();
    let chunk_size = 1000;
    let mut chunk_idx = 0;

    println!("⏱️  Processing chunks of {chunk_size} pairs until timeout...");

    while benchmark_start.elapsed() < timeout && chunk_idx * chunk_size < pairs_vec.len() {
        let start_idx = chunk_idx * chunk_size;
        let end_idx = std::cmp::min(start_idx + chunk_size, pairs_vec.len());
        let chunk_pairs: HashSet<_> = pairs_vec[start_idx..end_idx].iter().cloned().collect();

        // Create fresh engine for this chunk to avoid cache
        let mut fresh_engine = if let Some(ref seq_db) = sequence_db {
            DistanceEngine::with_sequences(
                validation_result.alignment_config.clone(),
                seq_db.clone(),
                args.hasher_type.clone(),
            )
        } else {
            DistanceEngine::new(
                validation_result.alignment_config.clone(),
                args.hasher_type.clone(),
            )
        };

        let chunk_start = Instant::now();

        // Debug: count how many pairs actually have sequences
        let mut pairs_with_sequences = 0;
        let mut pairs_without_sequences = 0;

        if let Some(ref seq_db) = sequence_db {
            for (locus, crc1, crc2) in &chunk_pairs {
                let has_seq1 = seq_db.get_sequence(locus, *crc1).is_some();
                let has_seq2 = seq_db.get_sequence(locus, *crc2).is_some();
                if has_seq1 && has_seq2 {
                    pairs_with_sequences += 1;
                } else {
                    pairs_without_sequences += 1;
                }
            }
        }

        fresh_engine.precompute_alignments(&chunk_pairs, validation_result.distance_mode);
        let chunk_elapsed = chunk_start.elapsed();

        // Show debug info for first chunk
        if chunk_idx == 0 {
            println!("   🔍 Debug chunk analysis:");
            println!("     • Pairs with both sequences: {pairs_with_sequences}");
            println!("     • Pairs missing sequences: {pairs_without_sequences}");
            println!(
                "     • Alignment rate: {:.0} pairs/sec",
                chunk_pairs.len() as f64 / chunk_elapsed.as_secs_f64()
            );
        }

        let chunk_pairs_per_second = chunk_pairs.len() as f64 / chunk_elapsed.as_secs_f64();
        if chunk_pairs_per_second > max_pairs_per_second {
            max_pairs_per_second = chunk_pairs_per_second;
        }

        total_pairs_processed += chunk_pairs.len();
        chunk_idx += 1;

        // Progress update every 5 seconds
        if benchmark_start.elapsed().as_secs() % 5 == 0 && chunk_idx % 3 == 0 {
            let elapsed = benchmark_start.elapsed().as_secs();
            let remaining = args.benchmark_duration.saturating_sub(elapsed);
            println!(
                "   ⏱️  {elapsed}s elapsed, {remaining}s remaining, processed {total_pairs_processed} pairs, peak: {max_pairs_per_second:.0} pairs/sec"
            );
        }
    }

    // Stop memory monitor
    let final_peak_memory = peak_memory.load(std::sync::atomic::Ordering::Relaxed);
    peak_memory.store(0, std::sync::atomic::Ordering::Relaxed);
    let _ = monitor_handle.join();

    let benchmark_elapsed = benchmark_start.elapsed();
    let total_elapsed = benchmark_elapsed.as_secs_f64();

    // Calculate ETA for full dataset
    let total_pairs = unique_pairs.len();
    let eta_seconds = total_pairs as f64 / max_pairs_per_second;
    let eta_minutes = eta_seconds / 60.0;
    let eta_hours = eta_minutes / 60.0;

    // Calculate average performance
    let avg_pairs_per_second = total_pairs_processed as f64 / total_elapsed;

    // Calculate completion percentage
    let completion_percentage = (total_pairs_processed as f64 / total_pairs as f64) * 100.0;

    // Results
    println!("\n📈 === BENCHMARK RESULTS ===");
    println!("⏱️  Test duration: {total_elapsed:.1} seconds");
    println!(
        "🔢 Pairs processed: {total_pairs_processed} / {total_pairs} ({completion_percentage:.2}%)"
    );
    println!("⚡ Peak performance: {max_pairs_per_second:.0} pairs/second");
    println!("📊 Average performance: {avg_pairs_per_second:.0} pairs/second");
    println!("🧵 Using {} threads", rayon::current_num_threads());
    println!(
        "📊 Per-thread peak: {:.0} pairs/sec/thread",
        max_pairs_per_second / rayon::current_num_threads() as f64
    );
    println!(
        "📊 Per-thread average: {:.0} pairs/sec/thread",
        avg_pairs_per_second / rayon::current_num_threads() as f64
    );

    println!("\n💾 Memory usage:");
    println!("   • Initial: {} MB", initial_memory / 1024 / 1024);
    println!(
        "   • Peak: {} MB (+{} MB)",
        final_peak_memory / 1024 / 1024,
        (final_peak_memory - initial_memory) / 1024 / 1024
    );

    // Estimate memory for full dataset using diversity-based approach
    let memory_per_pair = if total_pairs_processed > 0 {
        (final_peak_memory - initial_memory) as f64 / total_pairs_processed as f64
    } else {
        0.0
    };

    // Get diversity metrics for smarter estimation
    let diversity_stats = matrix.calculate_diversity_metrics();

    // Diversity-based memory estimation (empirical formula based on real-world data)
    // Base memory: setup + sequences
    // Cache memory: depends on diversity index and number of loci
    let base_memory_mb = initial_memory as f64 / 1024.0 / 1024.0;
    let cache_memory_mb = if diversity_stats.diversity_index < 0.05 {
        // Very low diversity (clonal) - small cache
        1000.0 + (matrix.loci_names.len() as f64 * 0.3)
    } else if diversity_stats.diversity_index < 0.15 {
        // Low-moderate diversity
        2000.0 + (matrix.loci_names.len() as f64 * 0.5)
    } else if diversity_stats.diversity_index < 0.3 {
        // Moderate diversity
        3000.0 + (matrix.loci_names.len() as f64 * 0.8)
    } else {
        // High diversity - larger cache needed
        4000.0 + (matrix.loci_names.len() as f64 * 1.2)
    };

    let diversity_based_estimate_mb = base_memory_mb + cache_memory_mb;
    let linear_estimate_mb =
        (initial_memory as f64 + (memory_per_pair * total_pairs as f64)) / 1024.0 / 1024.0;

    println!(
        "   • Memory per pair: {:.1} KB/pair",
        memory_per_pair / 1024.0
    );
    println!(
        "   • Linear estimate: {:.0} MB ({:.1} GB) ⚠️  Likely overestimated",
        linear_estimate_mb,
        linear_estimate_mb / 1024.0
    );
    println!(
        "   • Diversity-based estimate: {:.0} MB ({:.1} GB) 🎯 More realistic",
        diversity_based_estimate_mb,
        diversity_based_estimate_mb / 1024.0
    );
    println!(
        "     └─ Based on diversity index {:.3} ({})",
        diversity_stats.diversity_index, diversity_stats.diversity_category
    );
    println!(
        "   • Per sample estimate: {:.1} MB/sample",
        (final_peak_memory - initial_memory) as f64 / 1024.0 / 1024.0 / matrix.samples.len() as f64
    );
    println!("   📊 Diversity-based estimate accounts for cache stabilization patterns");

    println!("\n🎯 === FULL DATASET ETA ===");
    println!("🔢 Total unique pairs: {total_pairs}");

    if eta_hours >= 1.0 {
        println!("⏱️  ETA at peak speed: {eta_hours:.1} hours");
    } else if eta_minutes >= 1.0 {
        println!("⏱️  ETA at peak speed: {eta_minutes:.1} minutes");
    } else {
        println!("⏱️  ETA at peak speed: {eta_seconds:.0} seconds");
    }

    // Save benchmark results to JSON
    let timestamp = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_secs();
    let json_filename = format!("cgdist_benchmark_{timestamp}.json");

    // Reconstruct the command line
    let command_line = std::env::args().collect::<Vec<String>>().join(" ");

    let benchmark_results = serde_json::json!({
        "timestamp": timestamp,
        "command_line": command_line,
        "test_info": {
            "duration_seconds": total_elapsed,
            "timeout_seconds": args.benchmark_duration,
            "threads": rayon::current_num_threads(),
            "hasher_type": args.hasher_type,
            "distance_mode": format!("{:?}", validation_result.distance_mode)
        },
        "dataset_info": {
            "samples": matrix.samples.len(),
            "loci": matrix.loci_names.len(),
            "total_unique_pairs": total_pairs,
            "pairs_processed": total_pairs_processed,
            "completion_percentage": completion_percentage
        },
        "performance": {
            "peak_pairs_per_second": max_pairs_per_second,
            "average_pairs_per_second": avg_pairs_per_second,
            "per_thread_peak": max_pairs_per_second / rayon::current_num_threads() as f64,
            "per_thread_average": avg_pairs_per_second / rayon::current_num_threads() as f64
        },
        "memory_usage": {
            "initial_mb": initial_memory / 1024 / 1024,
            "peak_mb": final_peak_memory / 1024 / 1024,
            "peak_increase_mb": (final_peak_memory - initial_memory) / 1024 / 1024,
            "memory_per_pair_kb": memory_per_pair / 1024.0,
            "linear_estimate_mb": linear_estimate_mb,
            "linear_estimate_gb": linear_estimate_mb / 1024.0,
            "diversity_estimate_mb": diversity_based_estimate_mb,
            "diversity_estimate_gb": diversity_based_estimate_mb / 1024.0,
            "diversity_index": diversity_stats.diversity_index,
            "diversity_category": diversity_stats.diversity_category,
            "per_sample_mb": (final_peak_memory - initial_memory) as f64 / 1024.0 / 1024.0 / matrix.samples.len() as f64
        },
        "eta": {
            "eta_seconds": eta_seconds,
            "eta_minutes": eta_minutes,
            "eta_hours": eta_hours,
            "eta_formatted": if eta_hours >= 1.0 {
                format!("{eta_hours:.1} hours")
            } else if eta_minutes >= 1.0 {
                format!("{eta_minutes:.1} minutes")
            } else {
                format!("{eta_seconds:.0} seconds")
            }
        },
        "sequence_info": {
            "total_sequences": total_sequences,
            "min_length_bp": min_len,
            "max_length_bp": max_len,
            "avg_length_bp": avg_len
        },
        "notes": {
            "linear_estimate_warning": format!("Linear estimate extrapolated from {:.1}s test - likely overestimated", total_elapsed),
            "diversity_estimate_note": "Diversity-based estimate uses genetic diversity patterns for more realistic prediction",
            "recommendation": "Use diversity-based estimate for memory planning"
        }
    });

    if let Err(e) = std::fs::write(
        &json_filename,
        serde_json::to_string_pretty(&benchmark_results).unwrap(),
    ) {
        println!("⚠️  Failed to save benchmark results to {json_filename}: {e}");
    } else {
        println!("💾 Benchmark results saved to: {json_filename}");
    }

    Ok(())
}

/// Get current memory usage in bytes
fn get_memory_usage() -> u64 {
    // Read from /proc/self/status on Linux
    #[cfg(target_os = "linux")]
    {
        use std::fs::read_to_string;

        if let Ok(status) = read_to_string("/proc/self/status") {
            for line in status.lines() {
                if line.starts_with("VmRSS:") {
                    let parts: Vec<&str> = line.split_whitespace().collect();
                    if parts.len() >= 2 {
                        if let Ok(kb) = parts[1].parse::<u64>() {
                            return kb * 1024; // Convert KB to bytes
                        }
                    }
                }
            }
        }
    }

    // Fallback for non-Linux or if reading fails
    0
}

/// Extract unique CRC pairs from allelic matrix
fn extract_unique_crc_pairs(matrix: &AllelicMatrix) -> HashSet<(String, u32, u32)> {
    let mut unique_pairs = HashSet::new();

    for locus in &matrix.loci_names {
        let mut locus_crcs = HashSet::new();

        // Collect all CRCs for this locus
        for sample in &matrix.samples {
            if let Some(hash) = sample.loci_hashes.get(locus) {
                if let Some(crc) = hash.as_crc32() {
                    if crc != u32::MAX {
                        // Skip missing values
                        locus_crcs.insert(crc);
                    }
                }
            }
        }

        // Generate all unique pairs for this locus
        let crcs: Vec<u32> = locus_crcs.into_iter().collect();
        for i in 0..crcs.len() {
            for j in i + 1..crcs.len() {
                let crc1 = crcs[i];
                let crc2 = crcs[j];
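                // Store in canonical (min, max) order so (a, b) and (b, a) dedupe to one key.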
                unique_pairs.insert((locus.clone(), crc1.min(crc2), crc1.max(crc2)));
            }
        }
    }

    unique_pairs
}

/// Run integrated inspector functionality
fn run_inspector(cache_path: &str) {
    println!("🔍 cgDist Cache Inspector");
    println!("==========================");

    // Read and decompress cache file
    let compressed = match std::fs::read(cache_path) {
        Ok(data) => data,
        Err(e) => {
            eprintln!("❌ ERROR reading cache file: {e}");
            std::process::exit(1);
        }
    };

    let decompressed = match lz4_flex::decompress_size_prepended(&compressed) {
        Ok(data) => data,
        Err(e) => {
            eprintln!("❌ ERROR decompressing cache: {e}");
            std::process::exit(1);
        }
    };

    // Try to load as modern format first
    if let Ok(modern_cache) = serde_json::from_slice::<ModernCache>(&decompressed) {
        print_modern_cache_info(&modern_cache, &compressed);
    } else {
        // Try legacy format
        println!("🔄 Attempting to load as legacy format...");

        match bincode::deserialize::<LegacyCache>(&decompressed) {
            Ok(legacy_cache) => {
                print_legacy_cache_info(&legacy_cache, &compressed);
            }
            Err(e) => {
                eprintln!("❌ ERROR: Unable to parse cache file (tried both modern and legacy formats): {e}");
                std::process::exit(1);
            }
        }
    }
}

fn print_modern_cache_info(cache: &ModernCache, compressed: &[u8]) {
    let metadata = &cache.metadata;

    println!("✅ Modern cache format (v{})", metadata.format_version);
    println!();

    println!("=== METADATA ===");
    println!("Version: {}", metadata.version);
    println!("Created: {}", metadata.created);
    println!("Last modified: {}", metadata.last_modified);
    println!("Hasher type: {}", metadata.hasher_type);
    println!("Distance mode: {}", metadata.distance_mode);
    println!("Total entries: {}", metadata.total_entries);
    println!("Unique loci: {}", metadata.unique_loci);

    if let Some(note) = &metadata.user_note {
        println!("User note: {note}");
    }

    println!();
    println!("=== ALIGNMENT CONFIG ===");
    println!("Match score: {}", metadata.alignment_config.match_score);
    println!(
        "Mismatch penalty: {}",
        metadata.alignment_config.mismatch_penalty
    );
    println!("Gap open: {}", metadata.alignment_config.gap_open);
    println!("Gap extend: {}", metadata.alignment_config.gap_extend);

    println!();
    println!("=== STORAGE INFO ===");
    println!("Compressed size: {} KB", compressed.len() / 1024);
    println!(
        "Compression ratio: {:.2}:1",
        cache.data.len() as f64 * 50.0 / compressed.len() as f64
    );

    // Show top loci by entry count
    let mut locus_counts = std::collections::HashMap::new();
    for string_key in cache.data.keys() {
        // Parse locus from string key "locus:hash1:hash2"
        if let Some(locus) = string_key.split(':').next() {
            *locus_counts.entry(locus.to_string()).or_insert(0) += 1;
        }
    }

    let mut loci_sorted: Vec<_> = locus_counts.iter().collect();
    loci_sorted.sort_by(|a, b| b.1.cmp(a.1));

    println!();
    println!("=== TOP LOCI BY ENTRY COUNT ===");
    for (locus, count) in loci_sorted.iter().take(10) {
        println!("{locus:<20} {count:>8} entries");
    }

    if loci_sorted.len() > 10 {
        println!("... and {} more loci", loci_sorted.len() - 10);
    }

    // Show sample cache values to verify content
    println!();
    println!("=== SAMPLE CACHE VALUES (first 5 entries) ===");
    for (count, (string_key, cache_value)) in cache.data.iter().enumerate() {
        if count >= 5 {
            break;
        }
        println!("Key: {string_key}");
        println!(
            "  └─ SNPs: {}, indel_events: {}, indel_bases: {}, computed: {}",
            cache_value.snps,
            cache_value.indel_events,
            cache_value.indel_bases,
            cache_value.computed_at
        );
    }

    // Show statistics of the values
    println!();
    println!("=== CACHE VALUES ANALYSIS ===");
    let mut snps_sum = 0;
    let mut indel_events_sum = 0;
    let mut indel_bases_sum = 0;
    let mut non_zero_snps = 0;
    let mut non_zero_indel_events = 0;
    let mut non_zero_indel_bases = 0;

    for cache_value in cache.data.values() {
        snps_sum += cache_value.snps;
        indel_events_sum += cache_value.indel_events;
        indel_bases_sum += cache_value.indel_bases;

        if cache_value.snps > 0 {
            non_zero_snps += 1;
        }
        if cache_value.indel_events > 0 {
            non_zero_indel_events += 1;
        }
        if cache_value.indel_bases > 0 {
            non_zero_indel_bases += 1;
        }
    }

    let total_entries = cache.data.len();
    println!("📊 Values distribution:");
    println!(
        "  SNPs: sum={}, non-zero entries={}/{} ({:.1}%)",
        snps_sum,
        non_zero_snps,
        total_entries,
        (non_zero_snps as f64 / total_entries as f64) * 100.0
    );
    println!(
        "  Indel events: sum={}, non-zero entries={}/{} ({:.1}%)",
        indel_events_sum,
        non_zero_indel_events,
        total_entries,
        (non_zero_indel_events as f64 / total_entries as f64) * 100.0
    );
    println!(
        "  Indel bases: sum={}, non-zero entries={}/{} ({:.1}%)",
        indel_bases_sum,
        non_zero_indel_bases,
        total_entries,
        (non_zero_indel_bases as f64 / total_entries as f64) * 100.0
    );

    // Determine if this is a legacy cache with all zeros for indels
    if indel_events_sum == 0 && indel_bases_sum == 0 {
        println!();
        println!("⚠️  WARNING: This cache contains ONLY SNPs data!");
        println!("   All indel_events and indel_bases values are 0.");
        println!("   This cache was generated with old code or SNPs-only mode.");
        println!("   💡 To use with indel modes, regenerate cache with new code.");
    } else {
        println!();
        println!("✅ GOOD: This cache contains complete alignment data!");
        println!(
            "   Can be used with any distance mode (SNPs, SNPs+indel_events, SNPs+indel_bases)."
        );
    }
}

fn print_legacy_cache_info(_cache: &LegacyCache, compressed: &[u8]) {
    println!("⚠️  Legacy cache format detected");
    println!();

    // We can't easily introspect the legacy format without defining all structures
    // Just show basic info
    println!("=== STORAGE INFO ===");
    println!("Compressed size: {} KB", compressed.len() / 1024);
    println!();
    println!("💡 Recommendation: Regenerate cache with modern format for better features");
    println!("   - Hasher type support");
    println!("   - User notes");
    println!("   - Enhanced metadata");
    println!("   - Better inspection capabilities");
}