Skip to main content

cyanea_seq/
protein_properties.rs

1//! Protein sequence property analysis.
2//!
3//! Computes physicochemical and structural properties from amino acid sequences:
4//!
5//! - **Amino acid composition** — residue counts and fractions
6//! - **Hydrophobicity profiles** — Kyte-Doolittle and Hopp-Woods sliding window
7//! - **GRAVY** — grand average of hydropathicity
8//! - **Isoelectric point** — pI via Henderson-Hasselbalch bisection
9//! - **Extinction coefficient** — molar absorptivity at 280 nm
10//! - **Secondary structure prediction** — Chou-Fasman nucleation/extension, GOR
11//! - **Intrinsic disorder prediction** — windowed propensity with logistic smoothing
12
13use cyanea_core::{CyaneaError, Result};
14
15// ── Amino acid indexing ─────────────────────────────────────────
16
17/// Map amino acid byte to index 0–19. Returns None for non-standard residues.
18fn aa_index(aa: u8) -> Option<usize> {
19    match aa {
20        b'A' => Some(0),
21        b'C' => Some(1),
22        b'D' => Some(2),
23        b'E' => Some(3),
24        b'F' => Some(4),
25        b'G' => Some(5),
26        b'H' => Some(6),
27        b'I' => Some(7),
28        b'K' => Some(8),
29        b'L' => Some(9),
30        b'M' => Some(10),
31        b'N' => Some(11),
32        b'P' => Some(12),
33        b'Q' => Some(13),
34        b'R' => Some(14),
35        b'S' => Some(15),
36        b'T' => Some(16),
37        b'V' => Some(17),
38        b'W' => Some(18),
39        b'Y' => Some(19),
40        _ => None,
41    }
42}
43
44/// Normalize a protein sequence: uppercase and validate standard 20 AAs.
45fn normalize_protein(seq: &[u8]) -> Result<Vec<u8>> {
46    if seq.is_empty() {
47        return Err(CyaneaError::InvalidInput(
48            "empty protein sequence".to_string(),
49        ));
50    }
51    seq.iter()
52        .map(|&b| {
53            let upper = b.to_ascii_uppercase();
54            if aa_index(upper).is_some() {
55                Ok(upper)
56            } else {
57                Err(CyaneaError::InvalidInput(format!(
58                    "invalid amino acid '{}' in protein sequence",
59                    b as char
60                )))
61            }
62        })
63        .collect()
64}
65
66// ── Hydrophobicity scales ───────────────────────────────────────
67
68/// Kyte-Doolittle (1982) hydropathy values, indexed by aa_index.
69const KYTE_DOOLITTLE: [f64; 20] = [
70    1.8,  // A
71    2.5,  // C
72    -3.5, // D
73    -3.5, // E
74    2.8,  // F
75    -0.4, // G
76    -3.2, // H
77    4.5,  // I
78    -3.9, // K
79    3.8,  // L
80    1.9,  // M
81    -3.5, // N
82    -1.6, // P
83    -3.5, // Q
84    -4.5, // R
85    -0.8, // S
86    -0.7, // T
87    4.2,  // V
88    -0.9, // W
89    -1.3, // Y
90];
91
92/// Hopp-Woods (1981) hydrophilicity values, indexed by aa_index.
93const HOPP_WOODS: [f64; 20] = [
94    -0.5, // A
95    -1.0, // C
96    3.0,  // D
97    3.0,  // E
98    -2.5, // F
99    0.0,  // G
100    -0.5, // H
101    -1.8, // I
102    3.0,  // K
103    -1.8, // L
104    -1.3, // M
105    0.2,  // N
106    0.0,  // P
107    0.2,  // Q
108    3.0,  // R
109    0.3,  // S
110    -0.4, // T
111    -1.5, // V
112    -3.4, // W
113    -2.3, // Y
114];
115
116// ── pKa values (EMBOSS) ────────────────────────────────────────
117
118const PKA_NTERM: f64 = 9.69;
119const PKA_CTERM: f64 = 2.34;
120const PKA_D: f64 = 3.65;
121const PKA_E: f64 = 4.25;
122const PKA_C: f64 = 8.18;
123const PKA_Y: f64 = 10.07;
124const PKA_H: f64 = 6.00;
125const PKA_K: f64 = 10.53;
126const PKA_R: f64 = 12.48;
127
128// ── Extinction coefficient constants (280 nm) ──────────────────
129
130const EXT_TRP: f64 = 5500.0;
131const EXT_TYR: f64 = 1490.0;
132const EXT_CYSTINE: f64 = 125.0;
133
134// ── Chou-Fasman propensities (1978) ────────────────────────────
135
136/// (P_alpha, P_beta, P_turn) for each amino acid, indexed by aa_index.
137const CHOU_FASMAN: [(f64, f64, f64); 20] = [
138    (1.42, 0.83, 0.66), // A — strong helix former
139    (0.70, 1.19, 1.19), // C
140    (1.01, 0.54, 1.46), // D
141    (1.51, 0.37, 0.74), // E — strong helix former
142    (1.13, 1.38, 0.60), // F
143    (0.57, 0.75, 1.56), // G — strong turn former
144    (1.00, 0.87, 0.95), // H
145    (1.08, 1.60, 0.47), // I
146    (1.16, 0.74, 1.01), // K
147    (1.21, 1.30, 0.59), // L
148    (1.45, 1.05, 0.60), // M — strong helix former
149    (0.67, 0.89, 1.56), // N — strong turn former
150    (0.57, 0.55, 1.52), // P — strong turn former, helix breaker
151    (1.11, 1.10, 0.98), // Q
152    (0.98, 0.93, 0.95), // R
153    (0.77, 0.75, 1.43), // S
154    (0.83, 1.19, 0.96), // T
155    (1.06, 1.70, 0.50), // V — strong strand former
156    (1.08, 1.37, 0.96), // W
157    (0.69, 1.47, 1.14), // Y
158];
159
160// ── GOR information values ─────────────────────────────────────
161
162/// Simplified GOR single-residue information values (I_helix, I_strand, I_coil),
163/// indexed by aa_index. Derived from published GOR I statistics.
164const GOR_INFO: [(f64, f64, f64); 20] = [
165    (0.36, -0.23, -0.13),  // A
166    (-0.20, 0.17, 0.03),   // C
167    (0.07, -0.42, 0.35),   // D
168    (0.42, -0.37, -0.05),  // E
169    (-0.09, 0.32, -0.23),  // F
170    (-0.43, -0.18, 0.61),  // G
171    (0.04, -0.09, 0.05),   // H
172    (-0.06, 0.42, -0.36),  // I
173    (0.13, -0.25, 0.12),   // K
174    (0.21, 0.22, -0.43),   // L
175    (0.36, 0.03, -0.39),   // M
176    (-0.29, -0.18, 0.47),  // N
177    (-0.42, -0.37, 0.79),  // P
178    (0.18, -0.10, -0.08),  // Q
179    (-0.01, -0.15, 0.16),  // R
180    (-0.15, -0.07, 0.22),  // S
181    (-0.11, 0.16, -0.05),  // T
182    (-0.06, 0.52, -0.46),  // V
183    (-0.02, 0.27, -0.25),  // W
184    (-0.17, 0.31, -0.14),  // Y
185];
186
187/// GOR window half-width.
188const GOR_HALF_WIDTH: usize = 8;
189
190// ── Disorder propensity scale ──────────────────────────────────
191
192/// Per-residue disorder propensity (order→disorder frequency ratio, mean-centered).
193/// Positive values indicate disorder tendency. Indexed by aa_index.
194const DISORDER_PROPENSITY: [f64; 20] = [
195    -0.26, // A — slightly ordered
196    -0.20, // C — ordered (disulfides)
197    0.70,  // D — disordered (charged)
198    0.55,  // E — disordered (charged)
199    -0.60, // F — ordered (hydrophobic)
200    0.16,  // G — slightly disordered (flexible)
201    0.06,  // H — neutral
202    -0.70, // I — ordered (hydrophobic)
203    0.60,  // K — disordered (charged)
204    -0.50, // L — ordered (hydrophobic)
205    -0.14, // M — slightly ordered
206    0.38,  // N — disordered (polar)
207    0.55,  // P — disordered (rigid kink)
208    0.45,  // Q — disordered (polar)
209    0.30,  // R — disordered (charged)
210    0.20,  // S — slightly disordered (polar)
211    0.12,  // T — slightly disordered (polar)
212    -0.60, // V — ordered (hydrophobic)
213    -0.55, // W — ordered (hydrophobic, bulky)
214    -0.45, // Y — ordered (hydrophobic)
215];
216
217// ── Public types ────────────────────────────────────────────────
218
219/// Choice of hydrophobicity scale.
220#[derive(Debug, Clone, Copy, PartialEq, Eq)]
221pub enum HydrophobicityScale {
222    /// Kyte-Doolittle (1982) — positive = hydrophobic.
223    KyteDoolittle,
224    /// Hopp-Woods (1981) — positive = hydrophilic.
225    HoppWoods,
226}
227
228/// Amino acid composition of a protein sequence.
229#[derive(Debug, Clone)]
230pub struct AminoAcidComposition {
231    /// Absolute count for each of the 20 standard amino acids (indexed by `aa_index`).
232    pub counts: [usize; 20],
233    /// Fraction (0.0–1.0) for each amino acid.
234    pub fractions: [f64; 20],
235    /// Sequence length.
236    pub length: usize,
237}
238
239/// Molar extinction coefficient at 280 nm.
240#[derive(Debug, Clone)]
241pub struct ExtinctionCoefficient {
242    /// Extinction coefficient assuming all cysteines are reduced (M⁻¹ cm⁻¹).
243    pub reduced: f64,
244    /// Extinction coefficient assuming all cysteines form cystines (M⁻¹ cm⁻¹).
245    pub cystine: f64,
246    /// Absorbance at 0.1% (1 mg/mL), reduced cysteines.
247    pub abs_01_reduced: f64,
248    /// Absorbance at 0.1% (1 mg/mL), cystine bridges.
249    pub abs_01_cystine: f64,
250}
251
252/// Predicted secondary structure state.
253#[derive(Debug, Clone, Copy, PartialEq, Eq)]
254pub enum SecondaryStructure {
255    Helix,
256    Strand,
257    Coil,
258}
259
260/// Result of secondary structure prediction.
261#[derive(Debug, Clone)]
262pub struct SecondaryStructurePrediction {
263    /// Per-residue predicted state.
264    pub states: Vec<SecondaryStructure>,
265    /// Per-residue helix score.
266    pub helix_scores: Vec<f64>,
267    /// Per-residue strand score.
268    pub strand_scores: Vec<f64>,
269    /// Per-residue coil score.
270    pub coil_scores: Vec<f64>,
271    /// Fraction of residues predicted as helix.
272    pub helix_fraction: f64,
273    /// Fraction of residues predicted as strand.
274    pub strand_fraction: f64,
275    /// Fraction of residues predicted as coil.
276    pub coil_fraction: f64,
277}
278
279/// Result of intrinsic disorder prediction.
280#[derive(Debug, Clone)]
281pub struct DisorderPrediction {
282    /// Per-residue disorder score (0.0–1.0).
283    pub scores: Vec<f64>,
284    /// Per-residue disorder call (true if score > 0.5).
285    pub disordered: Vec<bool>,
286    /// Fraction of residues predicted as disordered.
287    pub disorder_fraction: f64,
288}
289
290// ── Amino acid composition ──────────────────────────────────────
291
292/// Compute amino acid composition of a protein sequence.
293///
294/// # Errors
295///
296/// Returns an error if the sequence is empty or contains non-standard residues.
297///
298/// # Example
299///
300/// ```
301/// use cyanea_seq::protein_properties::amino_acid_composition;
302///
303/// let comp = amino_acid_composition(b"ACDEFGHIKLMNPQRSTVWY").unwrap();
304/// assert_eq!(comp.length, 20);
305/// // Each AA appears once
306/// for &c in comp.counts.iter() {
307///     assert_eq!(c, 1);
308/// }
309/// ```
310pub fn amino_acid_composition(seq: &[u8]) -> Result<AminoAcidComposition> {
311    let norm = normalize_protein(seq)?;
312    let mut counts = [0usize; 20];
313    for &aa in &norm {
314        counts[aa_index(aa).unwrap()] += 1;
315    }
316    let len = norm.len() as f64;
317    let mut fractions = [0.0f64; 20];
318    for i in 0..20 {
319        fractions[i] = counts[i] as f64 / len;
320    }
321    Ok(AminoAcidComposition {
322        counts,
323        fractions,
324        length: norm.len(),
325    })
326}
327
328// ── Hydrophobicity ──────────────────────────────────────────────
329
330/// Compute a hydrophobicity profile using a sliding window average.
331///
332/// # Arguments
333///
334/// * `seq` — Protein sequence (standard 20 AAs)
335/// * `window` — Window size (must be odd and ≥ 1)
336/// * `scale` — Hydrophobicity scale to use
337///
338/// # Errors
339///
340/// Returns an error if the sequence is empty, contains invalid residues,
341/// or window is even or larger than the sequence.
342///
343/// # Example
344///
345/// ```
346/// use cyanea_seq::protein_properties::{hydrophobicity_profile, HydrophobicityScale};
347///
348/// let profile = hydrophobicity_profile(b"IIIIIIIII", 3, HydrophobicityScale::KyteDoolittle).unwrap();
349/// assert!((profile[1] - 4.5).abs() < 1e-10); // I = 4.5 on KD scale
350/// ```
351pub fn hydrophobicity_profile(
352    seq: &[u8],
353    window: usize,
354    scale: HydrophobicityScale,
355) -> Result<Vec<f64>> {
356    let norm = normalize_protein(seq)?;
357    if window == 0 || window % 2 == 0 {
358        return Err(CyaneaError::InvalidInput(
359            "window size must be odd and >= 1".to_string(),
360        ));
361    }
362    if window > norm.len() {
363        return Err(CyaneaError::InvalidInput(format!(
364            "window size {} exceeds sequence length {}",
365            window,
366            norm.len()
367        )));
368    }
369
370    let table = match scale {
371        HydrophobicityScale::KyteDoolittle => &KYTE_DOOLITTLE,
372        HydrophobicityScale::HoppWoods => &HOPP_WOODS,
373    };
374
375    let values: Vec<f64> = norm
376        .iter()
377        .map(|&aa| table[aa_index(aa).unwrap()])
378        .collect();
379
380    let n = norm.len();
381    let mut profile = Vec::with_capacity(n - window + 1);
382
383    // Initial window sum
384    let mut sum: f64 = values[..window].iter().sum();
385    profile.push(sum / window as f64);
386
387    // Slide
388    for i in 1..=(n - window) {
389        sum += values[i + window - 1] - values[i - 1];
390        profile.push(sum / window as f64);
391    }
392
393    Ok(profile)
394}
395
396/// Compute the GRAVY (grand average of hydropathicity) score.
397///
398/// GRAVY is the mean Kyte-Doolittle hydropathy over the entire sequence.
399/// Positive values indicate overall hydrophobic character.
400///
401/// # Example
402///
403/// ```
404/// use cyanea_seq::protein_properties::gravy;
405///
406/// let g = gravy(b"IIIII").unwrap();
407/// assert!((g - 4.5).abs() < 1e-10);
408/// ```
409pub fn gravy(seq: &[u8]) -> Result<f64> {
410    let norm = normalize_protein(seq)?;
411    let sum: f64 = norm
412        .iter()
413        .map(|&aa| KYTE_DOOLITTLE[aa_index(aa).unwrap()])
414        .sum();
415    Ok(sum / norm.len() as f64)
416}
417
418// ── Isoelectric point ───────────────────────────────────────────
419
420/// Net charge at a given pH via Henderson-Hasselbalch.
421fn net_charge(seq: &[u8], ph: f64) -> f64 {
422    let mut charge = 0.0;
423
424    // N-terminus (positive)
425    charge += 1.0 / (1.0 + 10_f64.powf(ph - PKA_NTERM));
426    // C-terminus (negative)
427    charge -= 1.0 / (1.0 + 10_f64.powf(PKA_CTERM - ph));
428
429    for &aa in seq {
430        match aa {
431            b'D' => charge -= 1.0 / (1.0 + 10_f64.powf(PKA_D - ph)),
432            b'E' => charge -= 1.0 / (1.0 + 10_f64.powf(PKA_E - ph)),
433            b'C' => charge -= 1.0 / (1.0 + 10_f64.powf(PKA_C - ph)),
434            b'Y' => charge -= 1.0 / (1.0 + 10_f64.powf(PKA_Y - ph)),
435            b'H' => charge += 1.0 / (1.0 + 10_f64.powf(ph - PKA_H)),
436            b'K' => charge += 1.0 / (1.0 + 10_f64.powf(ph - PKA_K)),
437            b'R' => charge += 1.0 / (1.0 + 10_f64.powf(ph - PKA_R)),
438            _ => {}
439        }
440    }
441    charge
442}
443
444/// Compute the isoelectric point (pI) of a protein sequence.
445///
446/// Uses bisection on the Henderson-Hasselbalch charge equation with
447/// EMBOSS pKa values. Converges to |charge| < 0.001 (~47 iterations).
448///
449/// # Example
450///
451/// ```
452/// use cyanea_seq::protein_properties::isoelectric_point;
453///
454/// let pi = isoelectric_point(b"DDDDD").unwrap();
455/// assert!(pi < 4.0); // highly acidic
456/// ```
457pub fn isoelectric_point(seq: &[u8]) -> Result<f64> {
458    let norm = normalize_protein(seq)?;
459    let mut lo = 0.0_f64;
460    let mut hi = 14.0_f64;
461
462    for _ in 0..100 {
463        let mid = (lo + hi) / 2.0;
464        let charge = net_charge(&norm, mid);
465        if charge.abs() < 0.001 {
466            return Ok(mid);
467        }
468        if charge > 0.0 {
469            lo = mid;
470        } else {
471            hi = mid;
472        }
473    }
474    Ok((lo + hi) / 2.0)
475}
476
477// ── Extinction coefficient ──────────────────────────────────────
478
479/// Estimate the molar extinction coefficient at 280 nm.
480///
481/// Uses the Pace et al. method based on Trp, Tyr, and Cys content.
482/// Returns both reduced (all Cys free) and cystine (all Cys paired) estimates.
483///
484/// # Example
485///
486/// ```
487/// use cyanea_seq::protein_properties::extinction_coefficient;
488///
489/// let ec = extinction_coefficient(b"W").unwrap();
490/// assert!((ec.reduced - 5500.0).abs() < 1e-10);
491/// ```
492pub fn extinction_coefficient(seq: &[u8]) -> Result<ExtinctionCoefficient> {
493    let norm = normalize_protein(seq)?;
494    let comp = amino_acid_composition(&norm)?;
495
496    let n_trp = comp.counts[aa_index(b'W').unwrap()] as f64;
497    let n_tyr = comp.counts[aa_index(b'Y').unwrap()] as f64;
498    let n_cys = comp.counts[aa_index(b'C').unwrap()] as f64;
499
500    let reduced = n_trp * EXT_TRP + n_tyr * EXT_TYR;
501    let n_cystine = (n_cys / 2.0).floor();
502    let cystine = reduced + n_cystine * EXT_CYSTINE;
503
504    // Molecular weight for A280 0.1% calculation
505    let mw = molecular_weight_from_seq(&norm);
506
507    let abs_01_reduced = if mw > 0.0 { reduced / mw } else { 0.0 };
508    let abs_01_cystine = if mw > 0.0 { cystine / mw } else { 0.0 };
509
510    Ok(ExtinctionCoefficient {
511        reduced,
512        cystine,
513        abs_01_reduced,
514        abs_01_cystine,
515    })
516}
517
518/// Molecular weight from raw normalized sequence bytes.
519fn molecular_weight_from_seq(seq: &[u8]) -> f64 {
520    // Average monoisotopic weights
521    const WEIGHTS: [f64; 20] = [
522        89.09,  // A
523        121.16, // C
524        133.10, // D
525        147.13, // E
526        165.19, // F
527        75.03,  // G
528        155.16, // H
529        131.17, // I
530        146.19, // K
531        131.17, // L
532        149.21, // M
533        132.12, // N
534        115.13, // P
535        146.15, // Q
536        174.20, // R
537        105.09, // S
538        119.12, // T
539        117.15, // V
540        204.23, // W
541        181.19, // Y
542    ];
543    if seq.is_empty() {
544        return 0.0;
545    }
546    let sum: f64 = seq
547        .iter()
548        .map(|&aa| WEIGHTS[aa_index(aa).unwrap()])
549        .sum();
550    sum - (seq.len() as f64 - 1.0) * 18.015
551}
552
553// ── Secondary structure: Chou-Fasman ────────────────────────────
554
555/// Predict secondary structure using the Chou-Fasman algorithm (1978).
556///
557/// Nucleation/extension method:
558/// 1. Find helix nuclei: 6-residue windows with ≥4 having P_alpha ≥ 1.03
559/// 2. Find strand nuclei: 5-residue windows with ≥3 having P_beta ≥ 1.05
560/// 3. Extend nuclei while tetrapeptide average ≥ 1.00
561/// 4. Resolve overlaps: higher average propensity wins
562///
563/// # Errors
564///
565/// Returns an error if the sequence is shorter than 6 residues.
566///
567/// # Example
568///
569/// ```
570/// use cyanea_seq::protein_properties::{chou_fasman, SecondaryStructure};
571///
572/// // Poly-alanine is a strong helix former
573/// let pred = chou_fasman(b"AAAAAAAAAAAAAAAAAAAA").unwrap();
574/// assert!(pred.helix_fraction > 0.5);
575/// ```
576pub fn chou_fasman(seq: &[u8]) -> Result<SecondaryStructurePrediction> {
577    let norm = normalize_protein(seq)?;
578    let n = norm.len();
579    if n < 6 {
580        return Err(CyaneaError::InvalidInput(
581            "sequence must be at least 6 residues for Chou-Fasman prediction".to_string(),
582        ));
583    }
584
585    let props: Vec<(f64, f64, f64)> = norm
586        .iter()
587        .map(|&aa| CHOU_FASMAN[aa_index(aa).unwrap()])
588        .collect();
589
590    // State assignment: 0 = unassigned, 1 = helix, 2 = strand
591    let mut assignment = vec![0u8; n];
592
593    // Step 1: Find helix nuclei (6-residue windows, ≥4 with P_alpha ≥ 1.03)
594    let mut helix_regions: Vec<(usize, usize)> = Vec::new();
595    for i in 0..=n.saturating_sub(6) {
596        let count = (0..6).filter(|&j| props[i + j].0 >= 1.03).count();
597        if count >= 4 {
598            helix_regions.push((i, i + 5));
599        }
600    }
601
602    // Merge overlapping helix nuclei
603    let helix_regions = merge_regions(helix_regions);
604
605    // Extend helix nuclei while tetrapeptide average P_alpha ≥ 1.00
606    let helix_regions: Vec<(usize, usize)> = helix_regions
607        .into_iter()
608        .map(|(start, end)| extend_region(&props, start, end, n, |p| p.0, 1.00))
609        .collect();
610
611    // Step 2: Find strand nuclei (5-residue windows, ≥3 with P_beta ≥ 1.05)
612    let mut strand_regions: Vec<(usize, usize)> = Vec::new();
613    for i in 0..=n.saturating_sub(5) {
614        let count = (0..5).filter(|&j| props[i + j].1 >= 1.05).count();
615        if count >= 3 {
616            strand_regions.push((i, i + 4));
617        }
618    }
619
620    let strand_regions = merge_regions(strand_regions);
621    let strand_regions: Vec<(usize, usize)> = strand_regions
622        .into_iter()
623        .map(|(start, end)| extend_region(&props, start, end, n, |p| p.1, 1.00))
624        .collect();
625
626    // Assign helices
627    for &(start, end) in &helix_regions {
628        for i in start..=end {
629            assignment[i] = 1;
630        }
631    }
632
633    // Assign strands, resolving overlaps
634    for &(start, end) in &strand_regions {
635        for i in start..=end {
636            if assignment[i] == 1 {
637                // Overlap: compare average propensities
638                let alpha_avg: f64 = props[i].0;
639                let beta_avg: f64 = props[i].1;
640                if beta_avg > alpha_avg {
641                    assignment[i] = 2;
642                }
643                // else keep helix
644            } else {
645                assignment[i] = 2;
646            }
647        }
648    }
649
650    // Build scores and states
651    let mut helix_scores = Vec::with_capacity(n);
652    let mut strand_scores = Vec::with_capacity(n);
653    let mut coil_scores = Vec::with_capacity(n);
654    let mut states = Vec::with_capacity(n);
655
656    for i in 0..n {
657        let (pa, pb, pt) = props[i];
658        helix_scores.push(pa);
659        strand_scores.push(pb);
660        coil_scores.push(pt);
661        states.push(match assignment[i] {
662            1 => SecondaryStructure::Helix,
663            2 => SecondaryStructure::Strand,
664            _ => SecondaryStructure::Coil,
665        });
666    }
667
668    let helix_count = states.iter().filter(|&&s| s == SecondaryStructure::Helix).count();
669    let strand_count = states.iter().filter(|&&s| s == SecondaryStructure::Strand).count();
670    let coil_count = n - helix_count - strand_count;
671
672    Ok(SecondaryStructurePrediction {
673        states,
674        helix_scores,
675        strand_scores,
676        coil_scores,
677        helix_fraction: helix_count as f64 / n as f64,
678        strand_fraction: strand_count as f64 / n as f64,
679        coil_fraction: coil_count as f64 / n as f64,
680    })
681}
682
683/// Merge overlapping (start, end) regions into non-overlapping regions.
684fn merge_regions(mut regions: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
685    if regions.is_empty() {
686        return regions;
687    }
688    regions.sort_by_key(|r| r.0);
689    let mut merged = vec![regions[0]];
690    for &(start, end) in &regions[1..] {
691        let last = merged.last_mut().unwrap();
692        if start <= last.1 + 1 {
693            last.1 = last.1.max(end);
694        } else {
695            merged.push((start, end));
696        }
697    }
698    merged
699}
700
701/// Extend a nucleated region while tetrapeptide average propensity ≥ threshold.
702fn extend_region(
703    props: &[(f64, f64, f64)],
704    start: usize,
705    end: usize,
706    n: usize,
707    accessor: fn(&(f64, f64, f64)) -> f64,
708    threshold: f64,
709) -> (usize, usize) {
710    let mut s = start;
711    let mut e = end;
712
713    // Extend left
714    while s >= 4 {
715        let avg: f64 = (0..4).map(|j| accessor(&props[s - 4 + j])).sum::<f64>() / 4.0;
716        if avg >= threshold {
717            s -= 1;
718        } else {
719            break;
720        }
721    }
722
723    // Extend right
724    while e + 4 < n {
725        let avg: f64 = (0..4).map(|j| accessor(&props[e + 1 + j])).sum::<f64>() / 4.0;
726        if avg >= threshold {
727            e += 1;
728        } else {
729            break;
730        }
731    }
732
733    (s, e)
734}
735
736// ── Secondary structure: GOR ────────────────────────────────────
737
738/// Predict secondary structure using the GOR (Garnier-Osguthorpe-Robson) method.
739///
740/// Applies single-residue GOR information values with a triangular window
741/// (half-width = 8). Isolated single-residue predictions surrounded by
742/// a different state are smoothed to the surrounding state.
743///
744/// # Errors
745///
746/// Returns an error if the sequence is empty or contains invalid residues.
747///
748/// # Example
749///
750/// ```
751/// use cyanea_seq::protein_properties::gor;
752///
753/// let pred = gor(b"EEEEEEEEEEEEEEEEEEEE").unwrap();
754/// // E (glutamate) has high helix propensity in GOR
755/// assert!(pred.helix_fraction > pred.strand_fraction);
756/// ```
757pub fn gor(seq: &[u8]) -> Result<SecondaryStructurePrediction> {
758    let norm = normalize_protein(seq)?;
759    let n = norm.len();
760
761    let indices: Vec<usize> = norm
762        .iter()
763        .map(|&aa| aa_index(aa).unwrap())
764        .collect();
765
766    let mut helix_scores = vec![0.0f64; n];
767    let mut strand_scores = vec![0.0f64; n];
768    let mut coil_scores = vec![0.0f64; n];
769
770    // Compute windowed GOR scores with triangular weighting
771    for i in 0..n {
772        let mut h = 0.0;
773        let mut e = 0.0;
774        let mut c = 0.0;
775        let mut weight_sum = 0.0;
776
777        let w_start = if i >= GOR_HALF_WIDTH { i - GOR_HALF_WIDTH } else { 0 };
778        let w_end = (i + GOR_HALF_WIDTH).min(n - 1);
779
780        for j in w_start..=w_end {
781            let dist = if j >= i { j - i } else { i - j };
782            let weight = 1.0 - (dist as f64 / (GOR_HALF_WIDTH as f64 + 1.0));
783            let (ih, ie, ic) = GOR_INFO[indices[j]];
784            h += ih * weight;
785            e += ie * weight;
786            c += ic * weight;
787            weight_sum += weight;
788        }
789
790        if weight_sum > 0.0 {
791            helix_scores[i] = h / weight_sum;
792            strand_scores[i] = e / weight_sum;
793            coil_scores[i] = c / weight_sum;
794        }
795    }
796
797    // Assign states by argmax
798    let mut states: Vec<SecondaryStructure> = (0..n)
799        .map(|i| {
800            let h = helix_scores[i];
801            let e = strand_scores[i];
802            let c = coil_scores[i];
803            if h >= e && h >= c {
804                SecondaryStructure::Helix
805            } else if e >= h && e >= c {
806                SecondaryStructure::Strand
807            } else {
808                SecondaryStructure::Coil
809            }
810        })
811        .collect();
812
813    // Smooth: flip isolated single-residue H/E surrounded by a different state
814    if n >= 3 {
815        for i in 1..n - 1 {
816            let prev = states[i - 1];
817            let next = states[i + 1];
818            if prev == next && states[i] != prev {
819                states[i] = prev;
820            }
821        }
822    }
823
824    let helix_count = states.iter().filter(|&&s| s == SecondaryStructure::Helix).count();
825    let strand_count = states.iter().filter(|&&s| s == SecondaryStructure::Strand).count();
826    let coil_count = n - helix_count - strand_count;
827
828    Ok(SecondaryStructurePrediction {
829        states,
830        helix_scores,
831        strand_scores,
832        coil_scores,
833        helix_fraction: helix_count as f64 / n as f64,
834        strand_fraction: strand_count as f64 / n as f64,
835        coil_fraction: coil_count as f64 / n as f64,
836    })
837}
838
839// ── Intrinsic disorder ─────────────────────────────────────────
840
841/// Predict intrinsic disorder using windowed amino acid propensities.
842///
843/// Averages per-residue disorder propensities in a centered window and
844/// applies a logistic sigmoid (k=5.0) to produce a 0–1 score.
845/// Residues with score > 0.5 are predicted disordered.
846///
847/// # Arguments
848///
849/// * `seq` — Protein sequence
850/// * `window` — Window size (must be odd and ≥ 1)
851///
852/// # Example
853///
854/// ```
855/// use cyanea_seq::protein_properties::predict_disorder;
856///
857/// // Charged residues → high disorder
858/// let pred = predict_disorder(b"KKKKKKKKDDDDDDDDD", 7).unwrap();
859/// assert!(pred.disorder_fraction > 0.5);
860/// ```
861pub fn predict_disorder(seq: &[u8], window: usize) -> Result<DisorderPrediction> {
862    let norm = normalize_protein(seq)?;
863    let n = norm.len();
864
865    if window == 0 || window % 2 == 0 {
866        return Err(CyaneaError::InvalidInput(
867            "window size must be odd and >= 1".to_string(),
868        ));
869    }
870    if window > n {
871        return Err(CyaneaError::InvalidInput(format!(
872            "window size {} exceeds sequence length {}",
873            window, n
874        )));
875    }
876
877    let raw_props: Vec<f64> = norm
878        .iter()
879        .map(|&aa| DISORDER_PROPENSITY[aa_index(aa).unwrap()])
880        .collect();
881
882    let half = window / 2;
883    let mut scores = Vec::with_capacity(n);
884
885    for i in 0..n {
886        let w_start = if i >= half { i - half } else { 0 };
887        let w_end = (i + half).min(n - 1);
888        let avg: f64 = raw_props[w_start..=w_end].iter().sum::<f64>()
889            / (w_end - w_start + 1) as f64;
890
891        // Logistic sigmoid: 1 / (1 + exp(-k * x))
892        let score = 1.0 / (1.0 + (-5.0 * avg).exp());
893        scores.push(score);
894    }
895
896    let disordered: Vec<bool> = scores.iter().map(|&s| s > 0.5).collect();
897    let n_disordered = disordered.iter().filter(|&&d| d).count();
898
899    Ok(DisorderPrediction {
900        scores,
901        disordered,
902        disorder_fraction: n_disordered as f64 / n as f64,
903    })
904}
905
906// ── Tests ───────────────────────────────────────────────────────
907
908#[cfg(test)]
909mod tests {
910    use super::*;
911
912    // ── normalize_protein ──
913
914    #[test]
915    fn normalize_protein_uppercase() {
916        let r = normalize_protein(b"ACDEFGHIKLMNPQRSTVWY").unwrap();
917        assert_eq!(r, b"ACDEFGHIKLMNPQRSTVWY");
918    }
919
920    #[test]
921    fn normalize_protein_lowercase() {
922        let r = normalize_protein(b"acdefghiklmnpqrstvwy").unwrap();
923        assert_eq!(r, b"ACDEFGHIKLMNPQRSTVWY");
924    }
925
926    #[test]
927    fn normalize_protein_invalid() {
928        assert!(normalize_protein(b"ABCDE").is_err()); // B is not standard
929    }
930
931    // ── amino_acid_composition ──
932
933    #[test]
934    fn composition_all_alanine() {
935        let comp = amino_acid_composition(b"AAAAAAAAAA").unwrap();
936        assert_eq!(comp.counts[0], 10); // A at index 0
937        assert!((comp.fractions[0] - 1.0).abs() < 1e-10);
938        assert_eq!(comp.length, 10);
939        // All others are 0
940        for i in 1..20 {
941            assert_eq!(comp.counts[i], 0);
942        }
943    }
944
945    #[test]
946    fn composition_each_aa_once() {
947        let comp = amino_acid_composition(b"ACDEFGHIKLMNPQRSTVWY").unwrap();
948        assert_eq!(comp.length, 20);
949        for i in 0..20 {
950            assert_eq!(comp.counts[i], 1);
951            assert!((comp.fractions[i] - 0.05).abs() < 1e-10);
952        }
953    }
954
955    #[test]
956    fn composition_empty_error() {
957        assert!(amino_acid_composition(b"").is_err());
958    }
959
960    // ── hydrophobicity_profile ──
961
962    #[test]
963    fn hydro_poly_i_kd() {
964        // I = 4.5 on Kyte-Doolittle
965        let profile =
966            hydrophobicity_profile(b"IIIIIIIII", 3, HydrophobicityScale::KyteDoolittle).unwrap();
967        for &v in &profile {
968            assert!((v - 4.5).abs() < 1e-10);
969        }
970    }
971
972    #[test]
973    fn hydro_poly_r_kd() {
974        // R = -4.5 on Kyte-Doolittle
975        let profile =
976            hydrophobicity_profile(b"RRRRRRRRR", 3, HydrophobicityScale::KyteDoolittle).unwrap();
977        for &v in &profile {
978            assert!((v - (-4.5)).abs() < 1e-10);
979        }
980    }
981
982    #[test]
983    fn hydro_hopp_woods_sign() {
984        // D = 3.0 on Hopp-Woods (hydrophilic, positive)
985        let profile =
986            hydrophobicity_profile(b"DDDDDDDDD", 3, HydrophobicityScale::HoppWoods).unwrap();
987        for &v in &profile {
988            assert!(v > 0.0);
989        }
990    }
991
992    #[test]
993    fn hydro_even_window_error() {
994        assert!(
995            hydrophobicity_profile(b"AAAAAA", 4, HydrophobicityScale::KyteDoolittle).is_err()
996        );
997    }
998
999    #[test]
1000    fn hydro_window_1_raw() {
1001        // window=1 should return raw per-residue values
1002        let profile =
1003            hydrophobicity_profile(b"AIV", 1, HydrophobicityScale::KyteDoolittle).unwrap();
1004        assert_eq!(profile.len(), 3);
1005        assert!((profile[0] - 1.8).abs() < 1e-10); // A
1006        assert!((profile[1] - 4.5).abs() < 1e-10); // I
1007        assert!((profile[2] - 4.2).abs() < 1e-10); // V
1008    }
1009
1010    // ── gravy ──
1011
1012    #[test]
1013    fn gravy_poly_i() {
1014        let g = gravy(b"IIIII").unwrap();
1015        assert!((g - 4.5).abs() < 1e-10);
1016    }
1017
1018    #[test]
1019    fn gravy_mixed() {
1020        // A=1.8, R=-4.5 → mean = (1.8 + -4.5) / 2 = -1.35
1021        let g = gravy(b"AR").unwrap();
1022        assert!((g - (-1.35)).abs() < 1e-10);
1023    }
1024
1025    #[test]
1026    fn gravy_empty_error() {
1027        assert!(gravy(b"").is_err());
1028    }
1029
1030    // ── isoelectric_point ──
1031
1032    #[test]
1033    fn pi_poly_d_acidic() {
1034        let pi = isoelectric_point(b"DDDDD").unwrap();
1035        assert!(pi < 3.5, "poly-D pI should be < 3.5, got {}", pi);
1036    }
1037
1038    #[test]
1039    fn pi_poly_k_basic() {
1040        let pi = isoelectric_point(b"KKKKK").unwrap();
1041        assert!(pi > 10.0, "poly-K pI should be > 10.0, got {}", pi);
1042    }
1043
1044    #[test]
1045    fn pi_poly_g_neutral() {
1046        let pi = isoelectric_point(b"GGGGG").unwrap();
1047        // Glycine has no charged side chains; pI near (9.69 + 2.34)/2 ≈ 6.0
1048        assert!(pi > 5.0 && pi < 7.0, "poly-G pI should be ~6.0, got {}", pi);
1049    }
1050
1051    #[test]
1052    fn pi_single_residue() {
1053        // Should not panic
1054        let pi = isoelectric_point(b"A").unwrap();
1055        assert!(pi > 0.0 && pi < 14.0);
1056    }
1057
1058    #[test]
1059    fn pi_known_protein() {
1060        // Insulin B chain: FVNQHLCGSHLVEALYLVCGERGFFYTPKT
1061        // Expected pI ≈ 6.9 (literature value varies 6.5-7.2)
1062        let pi = isoelectric_point(b"FVNQHLCGSHLVEALYLVCGERGFFYTPKT").unwrap();
1063        assert!(
1064            pi > 5.5 && pi < 8.5,
1065            "insulin B chain pI should be ~6.9, got {}",
1066            pi
1067        );
1068    }
1069
1070    // ── extinction_coefficient ──
1071
1072    #[test]
1073    fn ext_trp_only() {
1074        let ec = extinction_coefficient(b"W").unwrap();
1075        assert!((ec.reduced - 5500.0).abs() < 1e-10);
1076        assert!((ec.cystine - 5500.0).abs() < 1e-10); // no Cys
1077    }
1078
1079    #[test]
1080    fn ext_tyr_only() {
1081        let ec = extinction_coefficient(b"Y").unwrap();
1082        assert!((ec.reduced - 1490.0).abs() < 1e-10);
1083    }
1084
1085    #[test]
1086    fn ext_cystine_contribution() {
1087        // Two cysteines can form one cystine bond
1088        let ec = extinction_coefficient(b"CC").unwrap();
1089        assert!((ec.reduced - 0.0).abs() < 1e-10); // no Trp/Tyr
1090        assert!((ec.cystine - 125.0).abs() < 1e-10); // one cystine
1091    }
1092
1093    #[test]
1094    fn ext_no_absorbers() {
1095        // No Trp, Tyr, or Cys
1096        let ec = extinction_coefficient(b"AAAAAA").unwrap();
1097        assert!((ec.reduced - 0.0).abs() < 1e-10);
1098        assert!((ec.cystine - 0.0).abs() < 1e-10);
1099    }
1100
1101    // ── chou_fasman ──
1102
1103    #[test]
1104    fn cf_poly_a_helix() {
1105        // Alanine has high P_alpha (1.42) — strong helix former
1106        let pred = chou_fasman(b"AAAAAAAAAAAAAAAAAAA").unwrap();
1107        assert!(
1108            pred.helix_fraction > 0.5,
1109            "poly-A should be mostly helix, got helix_fraction={}",
1110            pred.helix_fraction
1111        );
1112    }
1113
1114    #[test]
1115    fn cf_poly_v_strand() {
1116        // Valine has high P_beta (1.70) — strong strand former
1117        let pred = chou_fasman(b"VVVVVVVVVVVVVVVVVVV").unwrap();
1118        assert!(
1119            pred.strand_fraction > 0.5,
1120            "poly-V should be mostly strand, got strand_fraction={}",
1121            pred.strand_fraction
1122        );
1123    }
1124
1125    #[test]
1126    fn cf_poly_g_coil() {
1127        // Glycine has low P_alpha (0.57) and P_beta (0.75), high P_turn (1.56)
1128        let pred = chou_fasman(b"GGGGGGGGGGGGGGGGGGG").unwrap();
1129        assert!(
1130            pred.coil_fraction > 0.5,
1131            "poly-G should be mostly coil, got coil_fraction={}",
1132            pred.coil_fraction
1133        );
1134    }
1135
1136    #[test]
1137    fn cf_short_error() {
1138        assert!(chou_fasman(b"AAAA").is_err());
1139    }
1140
1141    // ── gor ──
1142
1143    #[test]
1144    fn gor_helix_rich() {
1145        // E (glutamate) has highest GOR helix info (0.42)
1146        let pred = gor(b"EEEEEEEEEEEEEEEEEEEE").unwrap();
1147        assert!(
1148            pred.helix_fraction > pred.strand_fraction,
1149            "E-rich should be mostly helix: H={}, E={}",
1150            pred.helix_fraction,
1151            pred.strand_fraction
1152        );
1153    }
1154
1155    #[test]
1156    fn gor_strand_rich() {
1157        // V (valine) has highest GOR strand info (0.52)
1158        let pred = gor(b"VVVVVVVVVVVVVVVVVVVV").unwrap();
1159        assert!(
1160            pred.strand_fraction > pred.helix_fraction,
1161            "V-rich should be mostly strand: E={}, H={}",
1162            pred.strand_fraction,
1163            pred.helix_fraction
1164        );
1165    }
1166
1167    #[test]
1168    fn gor_output_length() {
1169        let pred = gor(b"ACDEFGHIKLMNPQRSTVWY").unwrap();
1170        assert_eq!(pred.states.len(), 20);
1171        assert_eq!(pred.helix_scores.len(), 20);
1172        assert_eq!(pred.strand_scores.len(), 20);
1173        assert_eq!(pred.coil_scores.len(), 20);
1174    }
1175
1176    #[test]
1177    fn gor_fractions_sum() {
1178        let pred = gor(b"ACDEFGHIKLMNPQRSTVWY").unwrap();
1179        let sum = pred.helix_fraction + pred.strand_fraction + pred.coil_fraction;
1180        assert!(
1181            (sum - 1.0).abs() < 1e-10,
1182            "fractions should sum to 1.0, got {}",
1183            sum
1184        );
1185    }
1186
1187    // ── predict_disorder ──
1188
1189    #[test]
1190    fn disorder_charged_stretch() {
1191        // Highly charged (K, D, E) → disordered
1192        let pred = predict_disorder(b"KKKDDDEEEKKKDDDEEEK", 7).unwrap();
1193        assert!(
1194            pred.disorder_fraction > 0.5,
1195            "charged stretch should be mostly disordered, got {}",
1196            pred.disorder_fraction
1197        );
1198    }
1199
1200    #[test]
1201    fn disorder_hydrophobic_stretch() {
1202        // Hydrophobic (I, V, L, F, W) → ordered
1203        let pred = predict_disorder(b"IVLFWIVLFWIVLFWIVLFW", 7).unwrap();
1204        assert!(
1205            pred.disorder_fraction < 0.5,
1206            "hydrophobic stretch should be mostly ordered, got {}",
1207            pred.disorder_fraction
1208        );
1209    }
1210
1211    #[test]
1212    fn disorder_scores_bounded() {
1213        let pred = predict_disorder(b"ACDEFGHIKLMNPQRSTVWY", 5).unwrap();
1214        for &s in &pred.scores {
1215            assert!(s >= 0.0 && s <= 1.0, "score {} out of [0,1]", s);
1216        }
1217    }
1218
1219    #[test]
1220    fn disorder_window_zero_error() {
1221        assert!(predict_disorder(b"AAAA", 0).is_err());
1222    }
1223}