Skip to main content

ref_solver/parsing/
ncbi_report.rs

1//! Parser for NCBI assembly report files.
2//!
3//! NCBI assembly reports contain rich metadata about contigs including multiple
4//! naming conventions. The key columns are:
5//!
6//! - Sequence-Name: The primary name (e.g., "1", "X", "MT")
7//! - GenBank-Accn: `GenBank` accession (e.g., "CM000663.2")
8//! - RefSeq-Accn: `RefSeq` accession (e.g., "`NC_000001.11`")
9//! - UCSC-style-name: UCSC-style name (e.g., "chr1")
10//! - Sequence-Length: Length in base pairs
11//!
12//! All non-empty names from these columns become aliases for matching.
13//!
14//! ## UCSC-style Name Generation for Patches
15//!
16//! For `GRCh38` assembly reports prior to p13, the UCSC-style-name column shows "na"
17//! for fix-patches and novel-patches. However, UCSC does assign names to these
18//! patches following a specific convention:
19//!
20//! - **Format**: `chr{chromosome}_{accession}v{version}_{suffix}`
//! - **Suffix**: `fix` for fix-patches, `alt` for novel-patches (the format above supplies the leading `_`)
22//! - **Example**: `GenBank` accession `KN196472.1` on chromosome 1 as a fix-patch
23//!   becomes `chr1_KN196472v1_fix`
24//!
25//! This module can optionally generate these UCSC-style names when they are missing
26//! from the assembly report. This is controlled by the `generate_ucsc_names` parameter.
27//!
28//! ### Sources and References
29//!
30//! - UCSC FAQ on chromosome naming: <https://genome.ucsc.edu/FAQ/FAQdownloads.html>
31//! - UCSC Patches blog post: <https://genome-blog.soe.ucsc.edu/blog/2019/02/22/patches/>
32//! - UCSC hg38.p12 chrom.sizes: <https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p12/>
33//! - GRC Patches documentation: <https://www.ncbi.nlm.nih.gov/grc/help/patches/>
34//!
35//! ### Verification
36//!
37//! The naming convention has been verified against UCSC's official chromosome size
38//! files for GRCh38.p12 and cross-referenced with NCBI assembly reports for p12-p14.
39
40use std::collections::HashMap;
41
42use crate::core::contig::{Contig, SequenceRole};
43use crate::parsing::sam::ParseError;
44use crate::utils::validation::check_contig_limit;
45
/// Kind of patch scaffold listed in an NCBI assembly report.
///
/// The variant decides which suffix is used when a UCSC-style name is
/// generated for the patch.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PatchType {
    /// Fix-patch: corrects errors in the primary assembly (UCSC suffix `fix`).
    Fix,
    /// Novel-patch: adds new alternate sequence (labeled `_alt` by UCSC).
    Novel,
}
54
55/// A parsed contig from an NCBI assembly report with all naming variants
56#[derive(Debug, Clone)]
57pub struct NcbiContigEntry {
58    /// Primary sequence name (from Sequence-Name column)
59    pub sequence_name: String,
60    /// Sequence length
61    pub length: u64,
62    /// `GenBank` accession
63    pub genbank_accn: Option<String>,
64    /// `RefSeq` accession
65    pub refseq_accn: Option<String>,
66    /// UCSC-style name
67    pub ucsc_name: Option<String>,
68    /// Sequence role (e.g., "assembled-molecule", "unlocalized-scaffold")
69    pub role: Option<String>,
70    /// Assigned molecule (chromosome number, e.g., "1", "X", "Y", "MT")
71    pub assigned_molecule: Option<String>,
72    /// Patch type if this is a patch contig
73    pub patch_type: Option<PatchType>,
74}
75
76/// Generate a UCSC-style name for a patch contig.
77///
78/// This function implements the UCSC naming convention for fix-patches and novel-patches:
79/// - Format: `chr{chromosome}_{accession}v{version}_{suffix}`
80/// - Where `suffix` is `fix` for fix-patches and `alt` for novel-patches
81///
82/// # Arguments
83///
84/// * `genbank_accession` - The `GenBank` accession (e.g., "KN196472.1")
85/// * `chromosome` - The assigned chromosome (e.g., "1", "X", "Y")
86/// * `patch_type` - Whether this is a fix-patch or novel-patch
87///
88/// # Returns
89///
90/// The generated UCSC-style name, or `None` if the accession format is invalid.
91///
92/// # Examples
93///
94/// ```
95/// use ref_solver::parsing::ncbi_report::{generate_ucsc_patch_name, PatchType};
96///
97/// // Fix-patch example
98/// let name = generate_ucsc_patch_name("KN196472.1", "1", PatchType::Fix);
99/// assert_eq!(name, Some("chr1_KN196472v1_fix".to_string()));
100///
101/// // Novel-patch (alt) example
102/// let name = generate_ucsc_patch_name("KQ458382.1", "1", PatchType::Novel);
103/// assert_eq!(name, Some("chr1_KQ458382v1_alt".to_string()));
104///
105/// // Y chromosome fix-patch
106/// let name = generate_ucsc_patch_name("KN196487.1", "Y", PatchType::Fix);
107/// assert_eq!(name, Some("chrY_KN196487v1_fix".to_string()));
108/// ```
109#[must_use]
110pub fn generate_ucsc_patch_name(
111    genbank_accession: &str,
112    chromosome: &str,
113    patch_type: PatchType,
114) -> Option<String> {
115    // Parse the accession: expect format like "KN196472.1" -> ("KN196472", "1")
116    let parts: Vec<&str> = genbank_accession.split('.').collect();
117    if parts.len() != 2 {
118        return None;
119    }
120
121    let accession_base = parts[0];
122    let version = parts[1];
123
124    // Validate that version is numeric
125    if !version.chars().all(|c| c.is_ascii_digit()) {
126        return None;
127    }
128
129    // Validate accession base is alphanumeric
130    if !accession_base
131        .chars()
132        .all(|c| c.is_ascii_alphanumeric() || c == '_')
133    {
134        return None;
135    }
136
137    // Determine suffix based on patch type
138    let suffix = match patch_type {
139        PatchType::Fix => "fix",
140        PatchType::Novel => "alt",
141    };
142
143    // Generate UCSC-style name: chr{chromosome}_{accession}v{version}_{suffix}
144    Some(format!(
145        "chr{chromosome}_{accession_base}v{version}_{suffix}"
146    ))
147}
148
149impl NcbiContigEntry {
150    /// Get all unique, non-empty names for this contig as aliases.
151    ///
152    /// # Arguments
153    ///
154    /// * `generate_ucsc_names` - If `true` and this is a patch contig without a
155    ///   UCSC-style name in the assembly report, generate one using the UCSC
156    ///   naming convention. This is useful for assembly reports prior to p13
157    ///   where patch UCSC names were not included.
158    ///
159    /// # UCSC Name Generation
160    ///
161    /// When `generate_ucsc_names` is `true` and:
162    /// - This contig is a fix-patch or novel-patch (determined from sequence role)
163    /// - The UCSC-style-name column is "na" or missing
164    /// - The `GenBank` accession and assigned molecule are available
165    ///
166    /// Then a UCSC-style name will be generated following the convention:
167    /// `chr{chromosome}_{accession}v{version}_{suffix}`
168    ///
169    /// Where `suffix` is `_fix` for fix-patches and `_alt` for novel-patches.
170    ///
171    /// ## Sources
172    ///
173    /// - UCSC FAQ: <https://genome.ucsc.edu/FAQ/FAQdownloads.html>
174    /// - UCSC Patches blog: <https://genome-blog.soe.ucsc.edu/blog/2019/02/22/patches/>
175    /// - Verified against: <https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p12/>
176    #[must_use]
177    pub fn all_names_with_options(&self, generate_ucsc_names: bool) -> Vec<String> {
178        let mut names = vec![self.sequence_name.clone()];
179
180        // Add other naming variants as aliases
181        if let Some(ref name) = self.genbank_accn {
182            if !name.is_empty() && name != "na" && !names.contains(name) {
183                names.push(name.clone());
184            }
185        }
186        if let Some(ref name) = self.refseq_accn {
187            if !name.is_empty() && name != "na" && !names.contains(name) {
188                names.push(name.clone());
189            }
190        }
191
192        // Handle UCSC name - either from assembly report or generated
193        let effective_ucsc_name = if let Some(ref name) = self.ucsc_name {
194            if !name.is_empty() && name != "na" {
195                Some(name.clone())
196            } else {
197                None
198            }
199        } else {
200            None
201        };
202
203        if let Some(ucsc_name) = effective_ucsc_name {
204            if !names.contains(&ucsc_name) {
205                names.push(ucsc_name);
206            }
207        } else if generate_ucsc_names {
208            // Try to generate UCSC name for patches
209            if let (Some(ref genbank), Some(ref chromosome), Some(patch_type)) =
210                (&self.genbank_accn, &self.assigned_molecule, self.patch_type)
211            {
212                if !genbank.is_empty() && genbank != "na" && !chromosome.is_empty() {
213                    if let Some(generated_name) =
214                        generate_ucsc_patch_name(genbank, chromosome, patch_type)
215                    {
216                        if !names.contains(&generated_name) {
217                            names.push(generated_name);
218                        }
219                    }
220                }
221            }
222        }
223
224        names
225    }
226
227    /// Convert to a Contig with aliases populated.
228    ///
229    /// This is equivalent to calling `to_contig_with_options(true)` - UCSC name
230    /// generation is enabled by default.
231    #[must_use]
232    pub fn to_contig(&self) -> Contig {
233        self.to_contig_with_options(true)
234    }
235
236    /// Convert to a Contig with aliases populated.
237    ///
238    /// # Arguments
239    ///
240    /// * `generate_ucsc_names` - If `true`, generate UCSC-style names for patches
241    ///   that don't have them in the assembly report. See [`Self::all_names_with_options`]
242    ///   for details on the naming convention.
243    #[must_use]
244    pub fn to_contig_with_options(&self, generate_ucsc_names: bool) -> Contig {
245        let mut contig = Contig::new(&self.sequence_name, self.length);
246
247        // All names except the primary become aliases
248        let all = self.all_names_with_options(generate_ucsc_names);
249        if all.len() > 1 {
250            contig.aliases = all.into_iter().skip(1).collect();
251        }
252
253        // Set sequence role if available
254        if let Some(ref role_str) = self.role {
255            contig.sequence_role = SequenceRole::parse(role_str);
256        }
257
258        contig
259    }
260}
261
262/// Parse NCBI assembly report from text
263///
264/// # Errors
265///
266/// Returns `ParseError::InvalidFormat` if the header is missing, required columns
267/// are not found, or field values cannot be parsed.
268#[allow(clippy::too_many_lines)]
269pub fn parse_ncbi_report_text(text: &str) -> Result<Vec<NcbiContigEntry>, ParseError> {
270    let mut entries = Vec::new();
271    // Use lowercase keys for case-insensitive matching
272    let mut header_map: HashMap<String, usize> = HashMap::new();
273    let mut found_header = false;
274
275    for line in text.lines() {
276        // Skip comment lines except the header
277        if line.starts_with('#') {
278            // The header line starts with "# " and contains column names
279            // Use case-insensitive detection
280            let line_lower = line.to_lowercase();
281            if line_lower.contains("sequence-name") {
282                let header_line = line.trim_start_matches('#').trim();
283                for (idx, col) in header_line.split('\t').enumerate() {
284                    // Store column names in lowercase for case-insensitive lookup
285                    header_map.insert(col.trim().to_lowercase(), idx);
286                }
287                found_header = true;
288            }
289            continue;
290        }
291
292        if line.trim().is_empty() {
293            continue;
294        }
295
296        if !found_header {
297            return Err(ParseError::InvalidFormat(
298                "NCBI assembly report header not found".to_string(),
299            ));
300        }
301
302        let fields: Vec<&str> = line.split('\t').collect();
303
304        // Get required fields (case-insensitive)
305        let seq_name_idx = header_map
306            .get("sequence-name")
307            .ok_or_else(|| ParseError::InvalidFormat("Missing Sequence-Name column".to_string()))?;
308        let length_idx = header_map.get("sequence-length").ok_or_else(|| {
309            ParseError::InvalidFormat("Missing Sequence-Length column".to_string())
310        })?;
311
312        if fields.len() <= *seq_name_idx || fields.len() <= *length_idx {
313            continue; // Skip malformed lines
314        }
315
316        let sequence_name = fields[*seq_name_idx].trim().to_string();
317        let length: u64 = fields[*length_idx].trim().parse().map_err(|_| {
318            ParseError::InvalidFormat(format!(
319                "Invalid length for '{}': {}",
320                sequence_name, fields[*length_idx]
321            ))
322        })?;
323
324        // Get optional fields (returns None for empty or "na")
325        let get_optional = |name: &str| -> Option<String> {
326            header_map
327                .get(name)
328                .and_then(|&idx| {
329                    fields.get(idx).map(|s| {
330                        let s = s.trim();
331                        if s.is_empty() || s == "na" {
332                            None
333                        } else {
334                            Some(s.to_string())
335                        }
336                    })
337                })
338                .flatten()
339        };
340
341        // Get raw optional fields (keeps "na" and empty values)
342        let get_raw_optional = |name: &str| -> Option<String> {
343            header_map
344                .get(name)
345                .and_then(|&idx| {
346                    fields.get(idx).map(|s| {
347                        let s = s.trim();
348                        if s.is_empty() {
349                            None
350                        } else {
351                            Some(s.to_string())
352                        }
353                    })
354                })
355                .flatten()
356        };
357
358        // Get sequence role to determine patch type
359        let role = get_raw_optional("sequence-role");
360        let patch_type = role.as_ref().and_then(|r| {
361            let r_lower = r.to_lowercase();
362            if r_lower == "fix-patch" {
363                Some(PatchType::Fix)
364            } else if r_lower == "novel-patch" {
365                Some(PatchType::Novel)
366            } else {
367                None
368            }
369        });
370
371        // Get assigned molecule (chromosome number)
372        let assigned_molecule = get_optional("assigned-molecule");
373
374        // Check contig limit for DOS protection
375        if check_contig_limit(entries.len()).is_some() {
376            return Err(ParseError::TooManyContigs(entries.len()));
377        }
378
379        entries.push(NcbiContigEntry {
380            sequence_name,
381            length,
382            // Column names are lowercase in header_map for case-insensitive matching
383            genbank_accn: get_optional("genbank-accn"),
384            refseq_accn: get_optional("refseq-accn"),
385            ucsc_name: get_raw_optional("ucsc-style-name"), // Keep "na" to detect missing names
386            role,
387            assigned_molecule,
388            patch_type,
389        });
390    }
391
392    if entries.is_empty() {
393        return Err(ParseError::InvalidFormat(
394            "No contigs found in NCBI assembly report".to_string(),
395        ));
396    }
397
398    Ok(entries)
399}
400
#[cfg(test)]
mod tests {
    use super::*;

    /// Column header line shared by the report fixtures (tab-separated).
    const COLUMNS: &str = "# Sequence-Name\tSequence-Role\tAssigned-Molecule\tAssigned-Molecule-Location/Type\tGenBank-Accn\tRelationship\tRefSeq-Accn\tAssembly-Unit\tSequence-Length\tUCSC-style-name";

    /// Data row for chromosome 1, used by more than one fixture.
    const CHR1_ROW: &str = "1\tassembled-molecule\t1\tChromosome\tCM000663.2\t=\tNC_000001.11\tPrimary Assembly\t248956422\tchr1";

    /// Assemble a simplified assembly report: two metadata comment lines, the
    /// column header, then one line per row, each terminated by a newline.
    fn report(assembly: &str, rows: &[&str]) -> String {
        let mut text = String::new();
        text.push_str("# Assembly name:  ");
        text.push_str(assembly);
        text.push_str("\n# Organism name:  Homo sapiens\n");
        text.push_str(COLUMNS);
        text.push('\n');
        for row in rows {
            text.push_str(row);
            text.push('\n');
        }
        text
    }

    /// Shorthand for building owned strings in fixtures and assertions.
    fn owned(text: &str) -> String {
        text.to_string()
    }

    /// Chromosome 1 as an assembled molecule that already has a UCSC name.
    fn chr1_entry() -> NcbiContigEntry {
        NcbiContigEntry {
            sequence_name: owned("1"),
            length: 248_956_422,
            genbank_accn: Some(owned("CM000663.2")),
            refseq_accn: Some(owned("NC_000001.11")),
            ucsc_name: Some(owned("chr1")),
            role: Some(owned("assembled-molecule")),
            assigned_molecule: Some(owned("1")),
            patch_type: None,
        }
    }

    /// Fix-patch on chromosome 1 whose UCSC name column holds "na".
    fn hg986_fix_patch() -> NcbiContigEntry {
        NcbiContigEntry {
            sequence_name: owned("HG986_PATCH"),
            length: 186_494,
            genbank_accn: Some(owned("KN196472.1")),
            refseq_accn: Some(owned("NW_009646194.1")),
            ucsc_name: Some(owned("na")),
            role: Some(owned("fix-patch")),
            assigned_molecule: Some(owned("1")),
            patch_type: Some(PatchType::Fix),
        }
    }

    #[test]
    fn test_parse_ncbi_report() {
        let text = report(
            "GRCh38.p14",
            &[
                CHR1_ROW,
                "2\tassembled-molecule\t2\tChromosome\tCM000664.2\t=\tNC_000002.12\tPrimary Assembly\t242193529\tchr2",
                "MT\tassembled-molecule\tMT\tMitochondrion\tJ01415.2\t=\tNC_012920.1\tnon-nuclear\t16569\tchrM",
            ],
        );

        let entries = parse_ncbi_report_text(&text).unwrap();
        assert_eq!(entries.len(), 3);

        // Chromosome 1: every column is populated and it is not a patch.
        let first = &entries[0];
        assert_eq!(first.sequence_name, "1");
        assert_eq!(first.length, 248_956_422);
        assert_eq!(first.genbank_accn, Some(owned("CM000663.2")));
        assert_eq!(first.refseq_accn, Some(owned("NC_000001.11")));
        assert_eq!(first.ucsc_name, Some(owned("chr1")));
        assert_eq!(first.assigned_molecule, Some(owned("1")));
        assert!(first.patch_type.is_none());

        // Expected names: 1, CM000663.2, NC_000001.11, chr1
        let names = first.all_names_with_options(true);
        assert_eq!(names.len(), 4);
        for expected in ["1", "chr1", "NC_000001.11"].iter() {
            assert!(names.contains(&owned(expected)));
        }

        // Mitochondrion
        let last = &entries[2];
        assert_eq!(last.sequence_name, "MT");
        assert_eq!(last.length, 16569);
        assert_eq!(last.ucsc_name, Some(owned("chrM")));
    }

    #[test]
    fn test_ncbi_entry_to_contig() {
        let contig = chr1_entry().to_contig();

        assert_eq!(contig.name, "1");
        assert_eq!(contig.length, 248_956_422);
        // Aliases: CM000663.2, NC_000001.11, chr1
        assert_eq!(contig.aliases.len(), 3);
        assert!(contig.aliases.contains(&owned("chr1")));
        assert!(contig.aliases.contains(&owned("NC_000001.11")));
    }

    #[test]
    fn test_parse_ncbi_report_no_header() {
        let outcome = parse_ncbi_report_text("1\tassembled-molecule\t1\t248956422\n");
        assert!(outcome.is_err());
    }

    // ========================================================================
    // UCSC Name Generation Tests
    // ========================================================================

    #[test]
    fn test_generate_ucsc_patch_name_fix() {
        // (accession, chromosome, expected name) for fix-patches
        let cases = [
            ("KN196472.1", "1", "chr1_KN196472v1_fix"),
            ("KZ208915.1", "8", "chr8_KZ208915v1_fix"),
            ("KN196487.1", "Y", "chrY_KN196487v1_fix"),
        ];
        for (accession, chromosome, expected) in cases.iter() {
            let name = generate_ucsc_patch_name(accession, chromosome, PatchType::Fix);
            assert_eq!(name, Some(owned(expected)));
        }
    }

    #[test]
    fn test_generate_ucsc_patch_name_novel() {
        // (accession, chromosome, expected name) for novel-patches
        let cases = [
            ("KQ458382.1", "1", "chr1_KQ458382v1_alt"),
            ("KV766199.1", "X", "chrX_KV766199v1_alt"),
        ];
        for (accession, chromosome, expected) in cases.iter() {
            let name = generate_ucsc_patch_name(accession, chromosome, PatchType::Novel);
            assert_eq!(name, Some(owned(expected)));
        }
    }

    #[test]
    fn test_generate_ucsc_patch_name_version_2() {
        // The version number is carried into the name
        assert_eq!(
            generate_ucsc_patch_name("GL000256.2", "6", PatchType::Novel),
            Some(owned("chr6_GL000256v2_alt"))
        );
    }

    #[test]
    fn test_generate_ucsc_patch_name_invalid_format() {
        // Missing version, too many dots, non-numeric version, empty accession
        let malformed = ["KN196472", "KN196472.1.2", "KN196472.a", ""];
        for accession in malformed.iter() {
            assert_eq!(
                generate_ucsc_patch_name(accession, "1", PatchType::Fix),
                None
            );
        }
    }

    #[test]
    fn test_parse_ncbi_report_with_patches() {
        // Report containing one fix-patch and one novel-patch without UCSC names
        let text = report(
            "GRCh38.p12",
            &[
                CHR1_ROW,
                "HG986_PATCH\tfix-patch\t1\tChromosome\tKN196472.1\t=\tNW_009646194.1\tPATCHES\t186494\tna",
                "HSCHR1_3_CTG3\tnovel-patch\t1\tChromosome\tKQ458382.1\t=\tNW_014040925.1\tPATCHES\t141019\tna",
            ],
        );

        let entries = parse_ncbi_report_text(&text).unwrap();
        assert_eq!(entries.len(), 3);

        let fix = &entries[1];
        assert_eq!(fix.sequence_name, "HG986_PATCH");
        assert_eq!(fix.patch_type, Some(PatchType::Fix));
        assert_eq!(fix.assigned_molecule, Some(owned("1")));
        assert_eq!(fix.genbank_accn, Some(owned("KN196472.1")));
        // The "na" placeholder is preserved for the UCSC name
        assert_eq!(fix.ucsc_name, Some(owned("na")));

        let novel = &entries[2];
        assert_eq!(novel.sequence_name, "HSCHR1_3_CTG3");
        assert_eq!(novel.patch_type, Some(PatchType::Novel));
        assert_eq!(novel.assigned_molecule, Some(owned("1")));
    }

    #[test]
    fn test_all_names_with_ucsc_generation_enabled() {
        // Generation enabled (the default used by `to_contig`)
        let names = hg986_fix_patch().all_names_with_options(true);

        assert!(
            names.contains(&owned("chr1_KN196472v1_fix")),
            "Generated UCSC name should be present: {names:?}"
        );
        assert!(names.contains(&owned("HG986_PATCH")));
        assert!(names.contains(&owned("KN196472.1")));
        assert!(names.contains(&owned("NW_009646194.1")));
    }

    #[test]
    fn test_all_names_with_ucsc_generation_disabled() {
        let names = hg986_fix_patch().all_names_with_options(false);

        assert!(
            !names.contains(&owned("chr1_KN196472v1_fix")),
            "Generated UCSC name should NOT be present: {names:?}"
        );
        assert!(names.contains(&owned("HG986_PATCH")));
        assert!(names.contains(&owned("KN196472.1")));
    }

    #[test]
    fn test_all_names_with_existing_ucsc_name() {
        // An entry with a UCSC name from the report gives the same names
        // whether or not generation is enabled.
        let entry = chr1_entry();
        let with_generation = entry.all_names_with_options(true);
        let without_generation = entry.all_names_with_options(false);

        assert!(with_generation.contains(&owned("chr1")));
        assert!(without_generation.contains(&owned("chr1")));
        assert_eq!(with_generation.len(), without_generation.len());
    }

    #[test]
    fn test_to_contig_with_ucsc_generation() {
        let entry = hg986_fix_patch();
        let generated = owned("chr1_KN196472v1_fix");

        // With UCSC generation enabled
        let contig = entry.to_contig_with_options(true);
        assert!(
            contig.aliases.contains(&generated),
            "Contig aliases should include generated UCSC name: {:?}",
            contig.aliases
        );

        // With UCSC generation disabled
        let contig = entry.to_contig_with_options(false);
        assert!(
            !contig.aliases.contains(&generated),
            "Contig aliases should NOT include generated UCSC name: {:?}",
            contig.aliases
        );
    }

    #[test]
    fn test_novel_patch_ucsc_generation() {
        // A novel-patch gets the "_alt" suffix
        let entry = NcbiContigEntry {
            sequence_name: owned("HSCHR1_3_CTG3"),
            length: 141_019,
            genbank_accn: Some(owned("KQ458382.1")),
            refseq_accn: Some(owned("NW_014040925.1")),
            ucsc_name: Some(owned("na")),
            role: Some(owned("novel-patch")),
            assigned_molecule: Some(owned("1")),
            patch_type: Some(PatchType::Novel),
        };

        let names = entry.all_names_with_options(true);
        assert!(
            names.contains(&owned("chr1_KQ458382v1_alt")),
            "Novel patch should have _alt suffix: {names:?}"
        );
    }

    #[test]
    fn test_ucsc_generation_for_different_chromosomes() {
        // (chromosome, accession, patch type, expected generated name)
        let cases = [
            ("1", "KN196472.1", PatchType::Fix, "chr1_KN196472v1_fix"),
            ("X", "KV766199.1", PatchType::Novel, "chrX_KV766199v1_alt"),
            ("Y", "KN196487.1", PatchType::Fix, "chrY_KN196487v1_fix"),
            ("22", "KZ208920.1", PatchType::Fix, "chr22_KZ208920v1_fix"),
        ];

        for &(chrom, accession, patch_type, expected) in cases.iter() {
            let entry = NcbiContigEntry {
                sequence_name: owned("TEST_PATCH"),
                length: 1000,
                genbank_accn: Some(owned(accession)),
                refseq_accn: None,
                ucsc_name: Some(owned("na")),
                role: None,
                assigned_molecule: Some(owned(chrom)),
                patch_type: Some(patch_type),
            };

            let names = entry.all_names_with_options(true);
            assert!(
                names.contains(&owned(expected)),
                "Expected {expected} for chromosome {chrom}, accession {accession}: {names:?}"
            );
        }
    }
}