pub type CrossModalInput = (Array2<f64>, Array2<f64>);
Input for cross-modal learning: (modality1, modality2)