1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
//! Embedding from GraphK
//!
//! The embedding is based on the graph (see [KGraph]) extracted from the Hnsw structure.
//! Edges out a given point are given an exponential weight scaled related to the distance their neighbour.
//! This weight is modulated locally by a scale parameter computed by the mean of the distance of a point to
//! its nearest neighbour observed locally around each point.
//!
//! **A more complete description of the model used can be found in module embedparams with hints to
//! initialize parameters**.
//!
//! To go through the entropy optimization the type F defining the probability of edges must satisfy:
//!
//! F: Float + NumAssign + std::iter::Sum + num_traits::cast::FromPrimitive + Send + Sync + ndarray::ScalarOperand
//!
//! in fact it is f32 or f64.
use num_traits::{Float, NumAssign};
use ndarray::{Array1, Array2, ArrayView1};
// use ndarray_linalg::{Lapack, Scalar};
use lax::Lapack;
use crate::fromhnsw::hubness::Hubness;
use crate::tools::io::write_csv_labeled_array2;
use csv::Writer;
use quantiles::ckms::CKMS; // we could use also greenwald_khanna
// threading needs
use parking_lot::RwLock;
use rayon::prelude::*;
use std::sync::Arc;
use rand::distr::Uniform;
use rand::{Rng, rng};
use rand_distr::weighted::WeightedAliasIndex;
use rand_distr::{Distribution, Normal};
use indexmap::set::*;
use cpu_time::ProcessTime;
use std::time::{Duration, SystemTime};
use crate::diffmaps::*;
use crate::embedparams::*;
use crate::fromhnsw::{kgproj::*, kgraph::KGraph, kgraph::kgraph_from_hnsw_all};
use crate::tools::{clip::clip, dichotomy::*, kdumap::*, nodeparam::*};
use hnsw_rs::prelude::*;
/// do not consider probabilities under PROBA_MIN, thresolded!!
pub(crate) const PROBA_MIN: f32 = 1.0E-5;
// to be used in emdedded space so small dimension. no need for simd and
#[inline]
fn distl2<F>(a: &[F], b: &[F]) -> F
where
F: Float + ndarray::ScalarOperand + Send + Sync + std::iter::Sum,
{
assert_eq!(a.len(), b.len());
let norm: F = a
.iter()
.zip(b.iter())
.map(|t| (*t.0 - *t.1) * (*t.0 - *t.1))
.sum();
num_traits::Float::sqrt(norm)
}
struct DistL2F;
impl<F> Distance<F> for DistL2F
where
F: Float + ndarray::ScalarOperand + std::iter::Sum + Send + Sync,
{
fn eval(&self, va: &[F], vb: &[F]) -> f32 {
distl2::<F>(va, vb).to_f32().unwrap()
} // end of compute
} // end of impl block
//=====================================================================================
/// The structure corresponding to the embedding process.
/// It must be initialized by the graph extracted from Hnsw according to the choosen strategy
/// and the asked dimension for embedding.
/// We get as a result the embedding and potentially the hubness of the grap if hubness node sampling was asked for in EmbedderParams
pub struct Embedder<'a, F> {
/// graph constrcuted with fromhnsw module
kgraph: Option<&'a KGraph<F>>,
/// projection
hkgraph: Option<&'a KGraphProjection<F>>,
/// parameters
parameters: EmbedderParams,
/// contains edge probabilities according to the probabilized graph constructed before laplacian symetrization
/// It is this representation that is used for cross entropy optimization!
initial_space: Option<NodeParams>,
/// initial embedding (option for degugging analyzing)
initial_embedding: Option<Array2<F>>,
/// final embedding
embedding: Option<Array2<F>>,
/// hubness can be an added result if EmbedderParams.hubness_weighting is asked
hubness: Option<Hubness<'a, F>>,
} // end of Embedder
impl<'a, F> Embedder<'a, F>
where
F: Float + Lapack + ndarray::ScalarOperand + Send + Sync + Into<f64>,
{
/// constructor from a graph and asked embedding dimension
pub fn new(kgraph: &'a KGraph<F>, parameters: EmbedderParams) -> Self {
Embedder::<F> {
kgraph: Some(kgraph),
hkgraph: None,
parameters,
initial_space: None,
initial_embedding: None,
embedding: None,
hubness: None,
}
} // end of new
/// construction from a hierarchical graph
pub fn from_hkgraph(
graph_projection: &'a KGraphProjection<F>,
parameters: EmbedderParams,
) -> Self {
Embedder::<F> {
kgraph: None,
hkgraph: Some(graph_projection),
parameters,
initial_space: None,
initial_embedding: None,
embedding: None,
hubness: None,
}
} // end of from_hkgraph
pub fn get_asked_dimension(&self) -> usize {
self.parameters.asked_dim
}
pub fn get_scale_rho(&self) -> f64 {
self.parameters.scale_rho
}
pub fn get_b(&self) -> f64 {
self.parameters.b
}
pub fn get_grad_step(&self) -> f64 {
self.parameters.grad_step
}
pub fn get_nb_grad_batch(&self) -> usize {
self.parameters.nb_grad_batch
}
/// return hubness if obtained as a by product of entropy optimization
pub fn get_hubness(&self) -> Option<&Hubness<'a, F>> {
self.hubness.as_ref()
}
/// return the underlying KGraph possibly via the GrapProjection
pub fn get_kgraph(&self) -> Option<&KGraph<F>> {
if self.kgraph.is_some() {
self.kgraph
} else if self.hkgraph.is_some() {
Some(self.hkgraph.as_ref().unwrap().get_large_graph())
} else {
None
}
}
/// get neighbourhood size used in embedding
pub fn get_kgraph_nbng(&self) -> usize {
if self.kgraph.is_some() {
self.kgraph.as_ref().unwrap().get_max_nbng()
} else if self.hkgraph.is_some() {
self.hkgraph.as_ref().unwrap().get_nbng()
} else {
0
}
}
/// dispatch to one_step embed or hierarchical embedding
pub fn embed(&mut self) -> Result<usize, usize> {
if self.kgraph.is_some() {
log::info!("doing one step embedding");
self.one_step_embed()
} else {
log::info!("doing 2 step embedding");
self.h_embed()
}
} // end of embed
/// do hierarchical embedding on GraphPrrojection
pub fn h_embed(&mut self) -> Result<usize, usize> {
if self.hkgraph.is_none() {
log::error!("Embedder::h_embed , graph projection is none");
return Err(1);
}
log::debug!("in h_embed");
// one_step embed of the small graph.
let graph_projection = self.hkgraph.as_ref().unwrap();
log::info!(" embedding first (small) graph");
let mut first_step_parameters = self.parameters;
first_step_parameters.nb_grad_batch =
self.parameters.grad_factor * self.parameters.nb_grad_batch;
log::info!("nb initial batch : {}", first_step_parameters.nb_grad_batch);
first_step_parameters.grad_step = 1.;
first_step_parameters.hierarchy_layer = 0;
let mut embedder_first_step =
Embedder::new(graph_projection.get_small_graph(), first_step_parameters);
let cpu_start = ProcessTime::now();
let sys_start = SystemTime::now();
let res_first = embedder_first_step.one_step_embed();
if res_first.is_err() {
log::error!("Embedder::h_embed first step failed");
return res_first;
}
log::info!(
" first step embedding sys time(ms) {:.2e} cpu time(ms) {:.2e}",
sys_start.elapsed().unwrap().as_millis(),
cpu_start.elapsed().as_millis()
);
// get initial embedding
let large_graph = graph_projection.get_large_graph();
log::info!("computing proba edges for large graph ...");
self.initial_space = Some(to_proba_edges(
large_graph,
self.parameters.scale_rho as f32,
self.parameters.beta as f32,
));
let nb_nodes_large = large_graph.get_nb_nodes();
let first_embedding = embedder_first_step.get_embedded().unwrap();
// use projection to initialize large graph
let quant = graph_projection.get_projection_distance_quant();
if quant.count() > 0 {
log::debug!(
" projection distance quantile at 0.05 : {:.2e} , 0.5 : {:.2e}, 0.95 : {:.2e}, 0.99 : {:.2e}",
quant.query(0.05).unwrap().1,
quant.query(0.5).unwrap().1,
quant.query(0.95).unwrap().1,
quant.query(0.99).unwrap().1
);
};
let dim = self.parameters.get_dimension();
let mut second_step_init = Array2::<F>::zeros((nb_nodes_large, dim));
log::info!("doing projection");
let (nb_nodes_small, _) = first_embedding.dim();
// we were cautious on indexation so we can do:
let mut rng = rand::rng();
for i in 0..nb_nodes_small {
for j in 0..dim {
second_step_init[[i, j]] = first_embedding[[i, j]];
}
}
let median_dist = quant.query(0.5).unwrap().1;
// we sample a random position around first embedding position
// TODO: amount of clipping
let normal = Normal::<f32>::new(0., 1.0).unwrap();
for i in nb_nodes_small..nb_nodes_large {
let projected_edge = graph_projection.get_projection_by_nodeidx(&i);
// we get looseness around projected point depend on where we are in quantiles on distance
let ratio = projected_edge.weight.to_f32().unwrap() / median_dist;
let correction = (ratio / dim as f32).sqrt();
for j in 0..dim {
let clipped_correction = clip(correction * normal.sample(&mut rng), 2.);
second_step_init[[i, j]] = first_embedding[[projected_edge.node, j]]
+ F::from(clipped_correction).unwrap();
}
}
log::debug!("projection done");
//
self.initial_embedding = Some(second_step_init);
// cross entropy optimize
log::info!("optimizing second step");
let embedding_res =
self.entropy_optimize(&self.parameters, self.initial_embedding.as_ref().unwrap());
//
log::info!(
" first + second step embedding sys time(s) {:.2e} cpu time(s) {:.2e}",
sys_start.elapsed().unwrap().as_secs(),
cpu_start.elapsed().as_secs()
);
//
match embedding_res {
Ok((embedding, hubness)) => {
self.embedding = Some(embedding);
self.hubness = hubness;
Ok(1)
}
_ => {
log::error!("Embedder::embed : embedding optimization failed");
Err(1)
}
}
} // end of h_embed
/// do the embedding
pub fn one_step_embed(&mut self) -> Result<usize, usize> {
//
log::info!("doing 1 step embedding");
self.parameters.log();
let graph_to_embed = self.kgraph.unwrap();
// construction of initial neighbourhood, scales and proba of edges from distances.
// we will need initial_space representation for graph laplacian and in cross entropy optimization
// we can initialize embedding with diffusion maps or pure random.
let mut initial_embedding;
if self.parameters.dmap_init {
// initial embedding via diffusion maps, in this case we have to have a coherent box normalization with random case
//
let cpu_start = ProcessTime::now();
let sys_start = SystemTime::now();
let dmapnew = false;
//
if dmapnew {
log::info!("using new dmaps");
let dtime = 5.;
let gnbn = 16;
let mut dparams: DiffusionParams =
DiffusionParams::new(10, Some(dtime), Some(gnbn));
dparams.set_alfa(1.);
dparams.set_beta(-1.);
let mut diffusion_map = DiffusionMaps::new(dparams);
initial_embedding = diffusion_map
.embed_from_kgraph::<F>(graph_to_embed, &dparams)
.unwrap();
} else {
log::info!("using old dmaps");
self.initial_space = Some(to_proba_edges(
graph_to_embed,
self.parameters.scale_rho as f32,
self.parameters.beta as f32,
));
initial_embedding = get_dmap_embedding(
self.initial_space.as_ref().unwrap(),
self.parameters.get_dimension(),
None,
);
}
log::info!(
" dmap initialization sys time(ms) {:.2e} cpu time(ms) {:.2e}",
sys_start.elapsed().unwrap().as_millis(),
cpu_start.elapsed().as_millis()
);
set_data_box(&mut initial_embedding, F::from(10.).unwrap());
} else {
// if we use random initialization we must have a box size coherent with renormalizes scales, so box size is 1.
initial_embedding = self.get_random_init(1.);
}
//
self.initial_space = Some(to_proba_edges(
graph_to_embed,
self.parameters.scale_rho as f32,
self.parameters.beta as f32,
));
let embedding_res = self.entropy_optimize(&self.parameters, &initial_embedding);
// optional store dump initial embedding
self.initial_embedding = Some(initial_embedding);
//
match embedding_res {
Ok((embedding, hubness)) => {
self.embedding = Some(embedding);
self.hubness = hubness;
Ok(1)
}
_ => {
log::error!("Embedder::embed : embedding optimization failed");
Err(1)
}
}
} // end embed
/// At the end returns the embedded data as Matrix.
/// The row of the matrix corresponds to the embedded dat vectors but after reindexation of DataId
/// to ensure a contiguous indexation.
/// To get a matrix with row corresponding to DataId if they were already contiguous for 0 to nbdata use
/// function get_embedded_reindexed to get the permutation/reindexation unrolled!
pub fn get_embedded(&self) -> Option<&Array2<F>> {
self.embedding.as_ref()
}
/// returns embedded data reindexed by DataId. This requires the DataId to be contiguous from 0 to nbdata.
/// See [crate::fromhnsw::kgraph::KGraph::get_idx_from_dataid]
pub fn get_embedded_reindexed(&self) -> Array2<F> {
let emmbedded = self.embedding.as_ref().unwrap();
let (nbrow, dim) = emmbedded.dim();
let mut reindexed = Array2::<F>::zeros((nbrow, dim));
//
let kgraph = if self.hkgraph.is_some() {
self.hkgraph.as_ref().unwrap().get_large_graph()
} else {
self.kgraph.as_ref().unwrap()
};
// TODO version 0.15 provides move_into and push_row
// Here we must not forget that to interpret results we must go
// back from indexset to original points (One week bug!)
for i in 0..nbrow {
let row = emmbedded.row(i);
let origin_id = kgraph.get_data_id_from_idx(i).unwrap();
for j in 0..dim {
reindexed[[*origin_id, j]] = row[j];
}
}
reindexed
}
/// **return the embedded vector corresponding to original data vector corresponding to data_id**
/// This methods fails if data_id do not exist. Use KGraph.get_data_id_from_idx to check before if necessary.
pub fn get_embedded_by_dataid(&self, data_id: &DataId) -> ArrayView1<'_, F> {
// we must get data index as stored in IndexSet
let kgraph = if self.hkgraph.is_some() {
self.hkgraph.as_ref().unwrap().get_large_graph()
} else {
self.kgraph.as_ref().unwrap()
};
let data_idx = kgraph.get_idx_from_dataid(data_id).unwrap();
self.embedding.as_ref().unwrap().row(data_idx)
} // end of get_data_embedding
/// **get embedding of a given node index after reindexation by the embedding to index in [0..nb_nodes]**
pub fn get_embedded_by_nodeid(&self, node: NodeIdx) -> ArrayView1<'_, F> {
self.embedding.as_ref().unwrap().row(node)
}
/// returns the initial embedding. Same remark as for method get_embedded. Storage is optional TODO
pub fn get_initial_embedding(&self) -> Option<&Array2<F>> {
self.initial_embedding.as_ref()
}
pub fn get_initial_embedding_reindexed(&self) -> Array2<F> {
//
let emmbedded = self.initial_embedding.as_ref().unwrap();
let (nbrow, dim) = emmbedded.dim();
let mut reindexed = Array2::<F>::zeros((nbrow, dim));
//
let kgraph = if self.hkgraph.is_some() {
self.hkgraph.as_ref().unwrap().get_large_graph()
} else {
self.kgraph.as_ref().unwrap()
};
//
// TODO version 0.15 provides move_into and push_row
// Here we must not forget that to interpret results we must go
// back from indexset to original points (One week bug!)
for i in 0..nbrow {
let row = emmbedded.row(i);
let origin_id = kgraph.get_data_id_from_idx(i).unwrap();
for j in 0..dim {
reindexed[[*origin_id, j]] = row[j];
}
}
reindexed
} // end of get_initial_embedding_reindexed
/// get random initialization in a square of side size
fn get_random_init(&self, size: f32) -> Array2<F> {
log::trace!("Embedder get_random_init with size {:.2e}", size);
//
let nb_nodes = self.initial_space.as_ref().unwrap().get_nb_nodes();
let mut initial_embedding = Array2::<F>::zeros((nb_nodes, self.get_asked_dimension()));
let unif = Uniform::<f32>::new(-size / 2., size / 2.).unwrap();
let mut rng = rand::rng();
for i in 0..nb_nodes {
for j in 0..self.get_asked_dimension() {
initial_embedding[[i, j]] = F::from(rng.sample(unif)).unwrap();
}
}
//
initial_embedding
} // end of get_random_init
/// For each node we in the orginal kgraph compute the transformed neighbourhood info in the embedded space.
/// Precisely: for each node n1 in initial space, for each neighbour n2 of n1, we compute l2 distance of
/// embedded points corresponding to n1, n2.
/// So we have an embedded edge which is not always an edge in kgraph computed from embedded data.
/// The conservation of edges through the embedding is a measure of neighborhood conservation.
/// Function returns for each node , increasing sorted distances (L2) in embedded space to its neighbours in original space.
fn get_transformed_kgraph(&self) -> Option<Vec<(usize, Vec<OutEdge<F>>)>> {
// we check we have kgraph
let kgraph: &KGraph<F>;
if self.kgraph.is_some() {
kgraph = self.kgraph.unwrap();
log::info!("found kgraph");
} else if self.hkgraph.is_some() {
kgraph = self.hkgraph.as_ref().unwrap().get_large_graph();
log::info!("found kgraph from projection");
} else {
log::info!("could not find kgraph");
std::process::exit(1);
}
// we loop on kgraph nodes, loop on edges of node, get extremity id , converts to index, compute embedded distance and sum
let neighbours = kgraph.get_neighbours();
//
let transformed_neighbours: Vec<(usize, Vec<OutEdge<F>>)> = (0..neighbours.len())
.into_par_iter()
.map(|n| -> (usize, Vec<OutEdge<F>>) {
let node_embedded = self.get_embedded_by_nodeid(n);
let mut transformed_neighborhood =
Vec::<OutEdge<F>>::with_capacity(neighbours[n].len());
let mut node_edge_length = F::max_value();
for edge in &neighbours[n] {
let ext_embedded = self.get_embedded_by_nodeid(edge.node);
// now we can compute distance for corresponding edge in embedded space. We must use L2
node_edge_length = distl2(
node_embedded.as_slice().unwrap(),
ext_embedded.as_slice().unwrap(),
)
.min(node_edge_length);
transformed_neighborhood.push(OutEdge::<F>::new(edge.node, node_edge_length));
}
// sort transformed_neighborhood
transformed_neighborhood
.sort_unstable_by(|a, b| a.weight.partial_cmp(&b.weight).unwrap());
(n, transformed_neighborhood)
})
.collect();
// We need to ensure that parallel iter did not permute data.
for (i, item) in transformed_neighbours.iter().enumerate() {
assert_eq!(i, item.0);
}
Some(transformed_neighbours)
} // end of get_transformed_kgraph
/// compute hnsw and kgraph from embedded data and get maximal edge length by node
/// Returns for each node, distance of nbng'th neighbours to node.
/// This function can require much memory for large number of nodes with large nbng
fn get_max_edge_length_embedded_kgraph(&self, nbng: usize) -> Option<Vec<(usize, f64)>> {
let embedding = self.embedding.as_ref().unwrap();
// TODO use the same parameters as the hnsw given to kgraph, and adjust ef_c accordingly
let max_nb_connection = nbng;
let ef_c = 64;
// compute hnsw
let nb_nodes = embedding.nrows();
let nb_layer = 16.min((nb_nodes as f32).ln().trunc() as usize);
let mut hnsw =
Hnsw::<F, DistL2F>::new(max_nb_connection, nb_nodes, nb_layer, ef_c, DistL2F {});
hnsw.set_keeping_pruned(true);
// need to store arrayviews to get a sufficient lifetime to call as_slice later
let vectors: Vec<ArrayView1<F>> = (0..nb_nodes).map(|i| embedding.row(i)).collect();
let mut data_with_id = Vec::<(&[F], usize)>::with_capacity(nb_nodes);
for (i, v) in vectors.iter().enumerate().take(nb_nodes) {
data_with_id.push((v.as_slice().unwrap(), i));
}
hnsw.parallel_insert_slice(&data_with_id);
// compute kgraph from hnsw and sum edge length
let optimal_graph: anyhow::Result<KGraph<F>> = kgraph_from_hnsw_all(&hnsw, nbng);
if optimal_graph.is_err() {
log::error!("could not compute optimal graph");
return None;
}
let optimal_graph = optimal_graph.unwrap();
let optimal_max_edge = optimal_graph.compute_max_edge();
Some(optimal_max_edge)
} // end of get_max_edge_length_embedded_kgraph
/// This function is an attempt to quantify the quality of the embedding using the graph projection defined in [KGraph].
///
/// The graph projection defines a natural neighborhood of a point as the set of edges around a point. The size of this neighbourhood is related to
/// the parameter *max_nb_connection* in Hnsw and roughly correspond to a number of neighbours between 2 and 3 times this parameter.
/// The argument *nbng* to this function should be set accordingly.
///
/// It tries to assess how neighbourhood of points in original space and neighbourhood in embedded space match.
/// The size of neighbourhood considered is *nbng*.
///
/// In each neighbourhood of a point, taken as center in the initial space we:
///
/// - count the number of its neighbours for which the distance to the center is less than the radius of the neighbourhood (of size *nbng*) in embedded space.
/// For neighbourhood that have a match , we give the mean number of matches.
/// This quantify the conservation of neighborhoods through the embedding. The lower the number of neighbourhoods without a match and the higher
/// the mean number of matches, the better is the embedding.
///
/// - compute the length of embedded edges joining original neighbours to a node and provide quantiles summary.
///
/// - compute the ratio of these edge length to the radius of the ball in embedded space corresponding to nbng 'th neighbours.
/// (question is: how much do we need to dilate the neighborhood in embedded space to retrieve the neighbours in original space?)
///
/// The quantiles on ratio these distance are then dumped. The lower the median (or the mean), the better is the embedding.
///
/// It gives a rough idea of the continuity of the embedding. The lesser the ratio the tighter the embedding.
///
/// Moreover the continuity ratio around each node is computed and dumped in a file name *continuity_ratio.csv".
/// This file can be displayed by the function *plotCsvContinuity* in visu.jl. see associated doc.
///
///
/// For example for the **fashion mnist** in the hierarchical case we get consistently (see examples):
///
/// - With an embedding dimension of 2 and a target neighbourhood of 50:
/// ```text
/// a guess at quality
/// neighbourhood size used in embedding : 6
/// nb neighbourhoods without a match : 20260, mean number of neighbours conserved when match : 5.069e0
/// embedded radii quantiles at 0.05 : 3.15e-2 , 0.25 : 4.52e-2, 0.5 : 5.68e-2, 0.75 : 7.80e-2, 0.85 : 9.32e-2, 0.95 : 1.36e-1
///
/// statistics on conservation of neighborhood (of size nbng)
/// neighbourhood size used in target space : 50
/// quantiles on ratio : distance in embedded space of neighbours of origin space / distance of last neighbour in embedded space
///
/// quantiles at 0.05 : 5.40e-2 , 0.25 : 3.28e-1, 0.5 : 7.46e-1, 0.75 : 1.57e0, 0.85 : 2.38e0, 0.95 : 4.50e0
/// ```
///
/// So 20000 out of 70000 data points have no conserved neighbours, for the others points 5 out of 6 neighbours are retrived.
/// 75% of neighbourhood is conserved within a radius increased by a factor 1.57.
///
/// - With an embedding dimension of 15 and a target neighbourhood of 50:
///
/// ```text
/// a guess at quality
/// neighbourhood size used in embedding : 6
/// nb neighbourhoods without a match : 9124, mean number of neighbours conserved when match : 5.585e0
/// embedded radii quantiles at 0.05 : 5.55e-2 , 0.25 : 8.66e-2, 0.5 : 1.15e-1, 0.75 : 1.53e-1, 0.85 : 1.80e-1, 0.95 : 2.41e-1
///
/// statistics on conservation of neighborhood (of size nbng)
/// neighbourhood size used in target space : 50
/// quantiles on ratio : distance in embedded space of neighbours of origin space / distance of last neighbour in embedded space
/// quantiles at 0.05 : 5.03e-2 , 0.25 : 2.24e-1, 0.5 : 4.36e-1, 0.75 : 8.06e-1, 0.85 : 1.13e0, 0.95 : 1.92e0
/// ```
/// So 9000 out of 70000 data points have no conserved neighbours, for the others points 5.6 out of 6 neighbours are retrieved.
/// 75% of neighbourhood is conserved within a radius increased by a factor 0.8.
// called by external binary (see examples)
pub fn get_quality_estimate_from_edge_length(&self, nbng: usize) -> Option<f64> {
//
log::info!("doing quality estimation ...");
let cpu_start = ProcessTime::now();
let sys_now = SystemTime::now();
//
let quality = 0f64;
// compute min edge length from initial kgraph
let transformed_kgraph = self.get_transformed_kgraph();
if transformed_kgraph.is_none() {
log::error!("cannot ask for embedded quality before embedding");
std::process::exit(1);
}
let transformed_kgraph = transformed_kgraph.unwrap();
// compute max edge length from kgraph constructed from embedded points corresponding to nbng neighbours
let max_edges_embedded = self.get_max_edge_length_embedded_kgraph(nbng);
if max_edges_embedded.is_none() {
log::error!(
"get_quality_estimate_from_edge_length : cannot compute mean edge length from embedded data"
);
return None;
}
let max_edges_embedded = max_edges_embedded.unwrap();
// now we can for each node see if best of propagated initial edges encounter ball in reconstructed kgraph from embedded data
assert_eq!(max_edges_embedded.len(), transformed_kgraph.len());
let mut embedded_radii = CKMS::<f64>::new(0.01);
let mut ratio_dist_q = CKMS::<f64>::new(0.01);
let nb_nodes = max_edges_embedded.len();
let mut nodes_match = Vec::with_capacity(nb_nodes);
let mut first_dist = Vec::with_capacity(nb_nodes);
let mut ratio_by_node = vec![0.; nb_nodes]; // the lower the better
let mut mean_ratio = (0., 0usize);
for i in 0..nb_nodes {
// check we speak of same node
assert_eq!(i, max_edges_embedded[i].0);
assert_eq!(i, transformed_kgraph[i].0);
nodes_match.push(0);
embedded_radii.insert(max_edges_embedded[i].1);
// how many transformed edges are in maximal neighborhood of size nbng?
let neighbours = &transformed_kgraph[i].1;
let mut node_ratio: f64 = 0.;
for e in neighbours {
if e.weight.to_f64().unwrap() <= max_edges_embedded[i].1 {
nodes_match[i] += 1;
}
ratio_dist_q.insert(e.weight.to_f64().unwrap() / max_edges_embedded[i].1);
mean_ratio.0 += e.weight.to_f64().unwrap() / max_edges_embedded[i].1;
node_ratio += e.weight.to_f64().unwrap() / max_edges_embedded[i].1;
}
mean_ratio.1 += neighbours.len();
node_ratio /= 1.0.max(neighbours.len() as f64);
first_dist.push(neighbours[0].weight.to_f64().unwrap());
ratio_by_node[i] = node_ratio;
}
// some stats
let nb_without_match = nodes_match
.iter()
.fold(0, |acc, x| if *x == 0 { acc + 1 } else { acc });
let mean_nbmatch: f64 = nodes_match.iter().sum::<usize>() as f64
/ (nodes_match.len() - nb_without_match) as f64;
log::info!("\n\n a guess at quality ");
log::info!(
" neighbourhood size used in embedding : {}",
self.get_kgraph_nbng()
);
log::info!(
" nb neighbourhoods without a match : {}, mean number of neighbours conserved when match : {:.3e}",
nb_without_match,
mean_nbmatch
);
log::info!(
" embedded radii quantiles at 0.05 : {:.2e} , 0.25 : {:.2e}, 0.5 : {:.2e}, 0.75 : {:.2e}, 0.85 : {:.2e}, 0.95 : {:.2e} \n",
embedded_radii.query(0.05).unwrap().1,
embedded_radii.query(0.25).unwrap().1,
embedded_radii.query(0.5).unwrap().1,
embedded_radii.query(0.75).unwrap().1,
embedded_radii.query(0.85).unwrap().1,
embedded_radii.query(0.95).unwrap().1
);
//
// The smaller the better!
// we give quantiles on ratio : distance of neighbours in origin space / distance of last neighbour in embedded space
println!("\n statistics on conservation of neighborhood (of size nbng)");
println!(" neighbourhood size used in target space : {}", nbng);
println!(
" quantiles on ratio : distance in embedded space of neighbours of origin space / distance of last neighbour in embedded space"
);
println!(
" quantiles at 0.05 : {:.2e} , 0.25 : {:.2e}, 0.5 : {:.2e}, 0.75 : {:.2e}, 0.85 : {:.2e}, 0.95 : {:.2e} \n",
ratio_dist_q.query(0.05).unwrap().1,
ratio_dist_q.query(0.25).unwrap().1,
ratio_dist_q.query(0.5).unwrap().1,
ratio_dist_q.query(0.75).unwrap().1,
ratio_dist_q.query(0.85).unwrap().1,
ratio_dist_q.query(0.95).unwrap().1
);
let median_ratio = ratio_dist_q.query(0.5).unwrap().1;
println!(
"\n quality index: ratio of distance to neighbours in origin space / distance to last neighbour in embedded space"
);
println!(
" neighborhood are conserved in radius multiplied by median : {:.2e}, mean {:.2e} ",
median_ratio,
mean_ratio.0 / mean_ratio.1 as f64
);
log::info!("");
// dump first dist
let mut csv_dist = Writer::from_path("first_dist.csv").unwrap();
let _res = write_csv_labeled_array2(
&mut csv_dist,
first_dist.as_slice(),
&self.get_embedded_reindexed(),
);
csv_dist.flush().unwrap();
// dump continuity ratio
let mut cont_ratio_csv = Writer::from_path("continuity_ratio.csv").unwrap();
let _res = write_csv_labeled_array2(
&mut cont_ratio_csv,
ratio_by_node.as_slice(),
&self.get_embedded_reindexed(),
);
cont_ratio_csv.flush().unwrap();
//
let cpu_time: Duration = cpu_start.elapsed();
log::info!(
" quality estimation, sys time(s) {:?} cpu time {:?}",
sys_now.elapsed().unwrap().as_secs(),
cpu_time.as_secs()
);
//
Some(quality)
} // end of get_quality_estimate_from_edge_length
// given neighbours of a node we choose scale to satisfy a normalization constraint.
// p_i = exp[- beta * (d(x,y_i) - d(x, y_1)/ local_scale ]
// We return beta/local_scale
// as function is monotonic with respect to scale, we use dichotomy.
#[allow(unused)]
fn get_scale_from_umap(&self, norm: f64, neighbours: &[OutEdge<F>]) -> (f32, Vec<f32>) {
// p_i = exp[- beta * (d(x,y_i)/ local_scale) ]
let nbgh = neighbours.len();
let rho_x = neighbours[0].weight.to_f32().unwrap();
let mut dist = neighbours
.iter()
.map(|n| n.weight.to_f32().unwrap())
.collect::<Vec<f32>>();
//
let f = |beta: f32| {
dist.iter()
.map(|d| (-(d - rho_x) * beta).exp())
.sum::<f32>()
};
// f is decreasing
// TODO we could also normalize as usual?
let beta = dichotomy_solver(false, f, 0f32, f32::MAX, norm as f32).unwrap();
// reuse rho_y_s to return proba of edge
for d in dist.iter_mut() {
*d = (-(*d - rho_x) * beta).exp();
}
// in this state neither sum of proba adds up to 1 neither is any entropy (Shannon or Renyi) normailed.
(1. / beta, dist)
} // end of get_scale_from_umap
pub fn get_nb_nodes(&self) -> usize {
self.initial_space.as_ref().unwrap().get_nb_nodes()
}
// minimize divergence between embedded and initial distribution probability
// We use cross entropy as in Umap. The edge weight function must take into acccount an initial density estimate and a scale.
// The initial density makes the embedded graph asymetric as the initial graph.
// The optimization function thus should try to restore asymetry and local scale as far as possible.
// returns the embedded data after restauration of the original indexation/identification of datas! (time consuming bug)
fn entropy_optimize(
&self,
params: &EmbedderParams,
initial_embedding: &Array2<F>,
) -> Result<(Array2<F>, Option<Hubness<'a, F>>), String> {
//
log::debug!("in Embedder::entropy_optimize");
//
if self.initial_space.is_none() {
log::error!("Embedder::entropy_optimize : initial_space not constructed, exiting");
return Err(String::from(
" initial_space not constructed, no NodeParams",
));
}
// do we use hubness to sample negative nodes?
let hubness_opt: Option<Hubness<'a, F>>;
let neg_node_sampler = if params.hubness_weighting {
log::info!("using hubness for negative node sampling");
// compute node hubness from KGraph
let kgraph;
if let Some(graph) = self.kgraph {
kgraph = graph;
} else if let Some(projection) = self.hkgraph {
log::debug!("using large graph from projection");
kgraph = projection.get_large_graph();
} else {
log::error!("cannot find kgraph");
return Err(String::from("entropy_optimize: cannot find kraph "));
}
let hubness = Hubness::new(kgraph);
// as we have info we get/dump info!
let _ = hubness.get_hubness_histogram();
let counts = hubness.get_counts();
let upper_bound = counts.len() as f32;
let f_counts: Vec<f32> = counts
.iter()
.map(|n| (*n as f32).max(1.).min(upper_bound))
.collect();
hubness_opt = Some(hubness);
Some(NodeSampler::new(f_counts))
} else {
hubness_opt = None;
None
};
let ce_optimization = EntropyOptim::new(
self.initial_space.as_ref().unwrap(),
params,
neg_node_sampler,
initial_embedding,
);
// compute initial value of objective function
let start = ProcessTime::now();
let initial_ce = ce_optimization.ce_compute_threaded();
let cpu_time: Duration = start.elapsed();
log::info!(
" initial cross entropy value {:.2e}, in time {:?}",
initial_ce,
cpu_time
);
// We manage some iterations on gradient computing
let grad_step_init = params.grad_step;
log::info!("grad_step_init : {:.2e}", grad_step_init);
//
log::debug!("in Embedder::entropy_optimize ... gradient iterations");
let nb_sample_by_iter = params.nb_sampling_by_edge * ce_optimization.get_nb_edges();
//
log::info!("\n optimizing embedding");
log::info!(
" nb edges {} , number of edge sampling by grad iteration {}",
ce_optimization.get_nb_edges(),
nb_sample_by_iter
);
log::info!(
" nb iteration : {} sampling size {} ",
self.get_nb_grad_batch(),
nb_sample_by_iter
);
let cpu_start = ProcessTime::now();
let sys_start = SystemTime::now();
for iter in 1..=self.get_nb_grad_batch() {
// loop on edges
let grad_step = grad_step_init * (1. - iter as f64 / self.get_nb_grad_batch() as f64);
ce_optimization.gradient_iteration_threaded(nb_sample_by_iter, grad_step);
// let cpu_time: Duration = start.elapsed();
// log::debug!("ce after grad iteration time(ms) {:.2e} grad iter {:.2e}", cpu_time.as_millis(), ce_optimization.ce_compute_threaded());
}
log::info!(
" gradient iterations sys time(s) {:.2e} , cpu_time(s) {:.2e}",
sys_start.elapsed().unwrap().as_secs(),
cpu_start.elapsed().as_secs()
);
let final_ce = ce_optimization.ce_compute_threaded();
log::info!(" final cross entropy value {:.2e}", final_ce);
// return reindexed data (if possible)
let dim = self.get_asked_dimension();
let nbrow = self.get_nb_nodes();
let mut reindexed = Array2::<F>::zeros((nbrow, dim));
// TODO version 0.15 provides move_into and push_row
// Here we must not forget that to interpret results we must go
// back from indexset to original points (One week bug!)
for i in 0..nbrow {
let row = ce_optimization.get_embedded_data(i);
for j in 0..dim {
reindexed[[i, j]] = row.read()[j];
}
}
//
Ok((reindexed, hubness_opt))
//
} // end of entropy_optimize
} // end of impl Embedder
//==================================================================================================================
struct NodeSampler {
neg_node_distribution: WeightedAliasIndex<f32>,
weights: Vec<f32>,
}
impl NodeSampler {
pub(crate) fn new(weight_arg: Vec<f32>) -> Self {
let mean: f32 = weight_arg.iter().sum::<f32>() / weight_arg.len() as f32;
// normalize so that mesure is normalized to sample size! (better precision)
let normalized: Vec<f32> = weight_arg.iter().map(|w| *w / mean).collect();
let neg_node_distribution = WeightedAliasIndex::new(normalized.clone()).unwrap();
NodeSampler {
neg_node_distribution,
weights: normalized,
}
}
/// returns node sampled and associated proba change (ratio to uniform distribution normalized to sample size)
pub(crate) fn sample(&self) -> (usize, f32) {
let node = self.neg_node_distribution.sample(&mut rng());
(node, self.weights[node])
}
} // end of NodeSampler
/// All we need to optimize entropy discrepancy
/// A list of edge with its weight, an array of scale for each origin node of an edge, proba (weight) of each edge
/// and coordinates in embedded_space with lock protection for //
struct EntropyOptim<'a, F> {
/// initial space by neighbourhood of each node
node_params: &'a NodeParams,
/// for each edge , initial node, end node, proba (weight of edge) 24 bytes
edges: Vec<(NodeIdx, OutEdge<f32>)>,
/// embedded coordinates of each node, under RwLock to // optimization nbnodes * (embedded dim * f32 + lock size))
embedded: Vec<Arc<RwLock<Array1<F>>>>,
/// embedded_scales
embedded_scales: Vec<f32>,
/// weighted array for sampling positive edges
pos_edge_distribution: WeightedAliasIndex<f32>,
/// possible use of hubness to samples negative node
neg_node_sampler: Option<NodeSampler>,
/// embedding parameters
params: &'a EmbedderParams,
} // end of EntropyOptim
impl<'a, F> EntropyOptim<'a, F>
where
F: Float
+ NumAssign
+ std::iter::Sum
+ num_traits::cast::FromPrimitive
+ Send
+ Sync
+ ndarray::ScalarOperand,
{
//
pub fn new(
node_params: &'a NodeParams,
params: &'a EmbedderParams,
neg_node_sampler: Option<NodeSampler>,
initial_embed: &Array2<F>,
) -> Self {
log::debug!("entering EntropyOptim::new");
// TODO what if not the same number of neighbours!!
let nbng = node_params.params[0].edges.len();
let nbnodes = node_params.get_nb_nodes();
let mut edges = Vec::<(NodeIdx, OutEdge<f32>)>::with_capacity(nbnodes * nbng);
let mut edges_weight = Vec::<f32>::with_capacity(nbnodes * nbng);
let mut initial_scales = Vec::<f32>::with_capacity(nbnodes);
// construct field edges
for i in 0..nbnodes {
initial_scales.push(node_params.params[i].scale);
for j in 0..node_params.params[i].edges.len() {
edges.push((i, node_params.params[i].edges[j]));
edges_weight.push(node_params.params[i].edges[j].weight);
}
}
log::debug!("construction alias table for sampling edges..");
let start = ProcessTime::now();
let pos_edge_sampler = WeightedAliasIndex::new(edges_weight).unwrap();
let cpu_time: Duration = start.elapsed();
log::debug!(
"constructied alias table for sampling edges.. , time : {:?}",
cpu_time
);
// construct embedded, initial embed can be droped now
let mut embedded = Vec::<Arc<RwLock<Array1<F>>>>::new();
let nbrow = initial_embed.nrows();
for i in 0..nbrow {
embedded.push(Arc::new(RwLock::new(initial_embed.row(i).to_owned())));
}
// compute embedded scales
let embedded_scales = estimate_embedded_scales_from_initial_scales(&initial_scales);
// get qunatile on embedded scales
let mut scales_q = CKMS::<f32>::new(0.001);
for s in &embedded_scales {
scales_q.insert(*s);
}
log::debug!(
"\n\n embedded scales quantiles at 0.05 : {:.2e} , 0.5 : {:.2e}, 0.95 : {:.2e}, 0.99 : {:.2e}",
scales_q.query(0.05).unwrap().1,
scales_q.query(0.5).unwrap().1,
scales_q.query(0.95).unwrap().1,
scales_q.query(0.99).unwrap().1
);
log::debug!("");
//
EntropyOptim {
node_params,
edges,
embedded,
embedded_scales,
pos_edge_distribution: pos_edge_sampler,
neg_node_sampler,
params,
}
// construct field embedded
} // end of new
pub fn get_nb_edges(&self) -> usize {
self.edges.len()
} // end of get_nb_edges
// return result as an Array2<F> cloning data to result to struct Embedder
// We return data in rows as (re)indexed in graph construction after hnsw!!
#[allow(unused)]
fn get_embedded_raw(&self) -> Array2<F> {
let nbrow = self.embedded.len();
let nbcol = self.params.asked_dim;
let mut embedding_res = Array2::<F>::zeros((nbrow, nbcol));
// TODO version 0.15 provides move_into and push_row
//
for i in 0..nbrow {
let row = self.embedded[i].read();
for j in 0..nbcol {
embedding_res[[i, j]] = row[j];
}
}
embedding_res
}
// return result as an Array2<F> cloning data to return result to struct Embedder
// Here we reindex data according to their original order. DataId must be contiguous and fill [0..nb_data]
// for this method not to panic!
#[allow(unused)]
pub fn get_embedded_reindexed(&self, indexset: &IndexSet<NodeIdx>) -> Array2<F> {
let nbrow = self.embedded.len();
let nbcol = self.params.asked_dim;
let mut embedding_res = Array2::<F>::zeros((nbrow, nbcol));
// TODO version 0.15 provides move_into and push_row
// Here we must not forget that to interpret results we must go
// back from indexset to original points (use One week bug!
for i in 0..nbrow {
let row = self.embedded[i].read();
let origin_id = indexset.get_index(i).unwrap();
for j in 0..nbcol {
embedding_res[[*origin_id, j]] = row[j];
}
}
embedding_res
}
/// row is here a NodeIdx (after reindexation of DataId), hence the not public interface
fn get_embedded_data(&self, row: usize) -> Arc<RwLock<Array1<F>>> {
Arc::clone(&self.embedded[row])
}
// computes croos entropy between initial space and embedded distribution.
// necessary to monitor optimization
#[allow(unused)]
fn ce_compute(&self) -> f64 {
log::trace!("\n entering EntropyOptim::ce_compute");
//
let mut ce_entropy = 0.;
let b: f64 = self.params.b;
for edge in self.edges.iter() {
let node_i = edge.0;
let node_j = edge.1.node;
assert!(node_i != node_j);
let weight_ij = edge.1.weight as f64;
let weight_ij_embed = cauchy_edge_weight(
&self.embedded[node_i].read(),
self.embedded_scales[node_i] as f64,
b,
&self.embedded[node_j].read(),
)
.to_f64()
.unwrap();
if weight_ij_embed > 0. {
ce_entropy += -weight_ij * weight_ij_embed.ln();
}
if weight_ij_embed < 1. {
ce_entropy += -(1. - weight_ij) * (1. - weight_ij_embed).ln();
}
if !ce_entropy.is_finite() {
log::debug!(
"weight_ij {} weight_ij_embed {}",
weight_ij,
weight_ij_embed
);
std::panic!();
}
}
//
ce_entropy
} // end of ce_compute
// use natural sampling or biased sampling, return selection proba to be used in unbiasing
fn sample_neg_node(&self) -> (usize, f32) {
if let Some(node_sampler) = &self.neg_node_sampler {
node_sampler.sample()
} else {
(rng().random_range(0..self.embedded_scales.len()), 1.)
}
}
// threaded version for computing cross entropy between initial distribution and embedded distribution with Cauchy law.
fn ce_compute_threaded(&self) -> f64 {
log::trace!("\n entering EntropyOptim::ce_compute_threaded");
//
let b: f64 = self.params.b;
let ce_entropy = self
.edges
.par_iter()
.fold(
|| 0.0f64,
|entropy: f64, edge| {
entropy + {
let node_i = edge.0;
let node_j = edge.1.node;
let weight_ij = edge.1.weight as f64;
let weight_ij_embed = cauchy_edge_weight(
&self.embedded[node_i].read(),
self.embedded_scales[node_i] as f64,
b,
&self.embedded[node_j].read(),
)
.to_f64()
.unwrap();
let mut term = 0.;
if weight_ij_embed > 0. {
term += -weight_ij * weight_ij_embed.ln();
}
if weight_ij_embed < 1. {
term += -(1. - weight_ij) * (1. - weight_ij_embed).ln();
}
term
}
},
)
.sum::<f64>();
//
ce_entropy
} // end of ce_compute_threaded
// TODO : pass functions corresponding to edge_weight and grad_edge_weight as arguments to test others weight function
/// This function optimize cross entropy for Shannon cross entropy
fn ce_optim_edge_shannon(&self, threaded: bool, grad_step: f64)
where
F: Float
+ NumAssign
+ std::iter::Sum
+ num_traits::cast::FromPrimitive
+ ndarray::ScalarOperand,
{
//
let edge_idx_sampled: usize;
let mut y_i;
let mut y_j;
let node_j;
let node_i;
if threaded {
edge_idx_sampled = rand::rng().sample(&self.pos_edge_distribution);
node_i = self.edges[edge_idx_sampled].0;
node_j = self.edges[edge_idx_sampled].1.node;
y_i = self.get_embedded_data(node_i).read().to_owned();
y_j = self.get_embedded_data(node_j).read().to_owned();
}
// end threaded
else {
edge_idx_sampled = rand::rng().sample(&self.pos_edge_distribution);
node_i = self.edges[edge_idx_sampled].0;
y_i = self.get_embedded_data(node_i).write().to_owned();
node_j = self.edges[edge_idx_sampled].1.node;
y_j = self.get_embedded_data(node_j).write().to_owned();
};
// get coordinate of node
// we locks once and directly a write lock as conflicts should be small, many edges, some threads. see Recht Hogwild!
let dim = self.params.asked_dim;
let mut gradient = Array1::<F>::zeros(dim);
//
assert!(node_i != node_j);
let weight = self.edges[edge_idx_sampled].1.weight as f64;
assert!(weight <= 1.);
let scale = self.embedded_scales[node_i] as f64;
let b: f64 = self.params.b;
// compute l2 norm of y_j - y_i
let d_ij: f64 = y_i
.iter()
.zip(y_j.iter())
.map(|(vi, vj)| (*vi - *vj) * (*vi - *vj))
.sum::<F>()
.to_f64()
.unwrap();
let d_ij_scaled = d_ij / (scale * scale);
// this coeff is common for P and 1.-P part
let coeff: f64 = if b != 1. {
let cauchy_weight = 1. / (1. + d_ij_scaled.powf(b));
2. * b * cauchy_weight * d_ij_scaled.powf(b - 1.) / (scale * scale)
} else {
let cauchy_weight = 1. / (1. + d_ij_scaled);
2. * b * cauchy_weight / (scale * scale)
};
if d_ij_scaled > 0. {
// repulsion annhinilate attraction if P<= 1. / (alfa + 1). choose 0.1/ PROBA_MIN
let alfa = (1. / PROBA_MIN) as f64;
let coeff_repulsion = 1. / (d_ij_scaled * d_ij_scaled).max(alfa);
// clipping makes each point i or j making at most half way to the other in case of attraction
let coeff_ij =
(grad_step * coeff * (-weight + (1. - weight) * coeff_repulsion)).max(-0.49);
gradient = (&y_j - &y_i) * F::from(coeff_ij).unwrap();
log::trace!(
"norm attracting coeff {:.2e} gradient {:.2e}",
coeff_ij,
l2_norm(&gradient.view()).to_f64().unwrap()
);
}
y_i -= &gradient;
y_j += &gradient;
*(self.get_embedded_data(node_j).write()) = y_j;
// now we loop on negative sampling filtering out nodes that are either node_i or are in node_i neighbours.
let asked_nb_neg = 5;
let mut got_nb_neg = 0;
let mut _nb_failed = 0;
while got_nb_neg < asked_nb_neg {
let (neg_node, _weight) = self.sample_neg_node();
if neg_node != node_i
&& neg_node != node_j
&& self
.node_params
.get_node_param(node_i)
.get_edge(neg_node)
.is_none()
{
// get a read lock, as neg_node is not the locked nodes node_i and node_j
let neg_data = self.get_embedded_data(neg_node);
let y_k;
if let Some(lock_read) = neg_data.try_read() {
y_k = lock_read.to_owned();
got_nb_neg += 1;
} else {
// to cover contention case... not seen
log::trace!("neg lock failed");
_nb_failed += 1;
continue;
}
// compute the common part of coeff as in function grad_common_coeff
let d_ik: f64 = y_i
.iter()
.zip(y_k.iter())
.map(|(vi, vj)| (*vi - *vj) * (*vi - *vj))
.sum::<F>()
.to_f64()
.unwrap();
let d_ik_scaled = d_ik / (scale * scale);
let coeff = if b != 1. {
let cauchy_weight = 1. / (1. + d_ik_scaled.powf(b));
2. * b * cauchy_weight * d_ik_scaled.powf(b - 1.) / (scale * scale)
} else {
let cauchy_weight = 1. / (1. + d_ik_scaled);
2. * b * cauchy_weight / (scale * scale)
};
// we know node_j is not in neighbour, we smooth repulsion for point with dist less than scale/4
// the part of repulsion comming from coeff is less than 1/(scale * scale)
//
let alfa = 1. / 16.;
if d_ik > 0. {
let coeff_repulsion = 1. / (d_ik_scaled * d_ik_scaled).max(alfa); // !!
let coeff_ik = (grad_step * coeff * coeff_repulsion).min(2.);
gradient = (&y_k - &y_i) * F::from_f64(coeff_ik).unwrap();
log::trace!(
"norm repulsive coeff gradient {:.2e} {:.2e}",
coeff_ik,
l2_norm(&gradient.view()).to_f64().unwrap()
);
}
// do not renormalize by proba, we sample according neg_node_sampler law
y_i -= &gradient;
} // end node_neg is accepted
} // end of loop on neg sampling
// final update of node_i
*(self.get_embedded_data(node_i).write()) = y_i;
} // end of ce_optim_from_point
#[allow(unused)]
fn gradient_iteration(&self, nb_sample: usize, grad_step: f64) {
for _ in 0..nb_sample {
self.ce_optim_edge_shannon(false, grad_step);
}
} // end of gradient_iteration
fn gradient_iteration_threaded(&self, nb_sample: usize, grad_step: f64) {
(0..nb_sample)
.into_par_iter()
.for_each(|_| self.ce_optim_edge_shannon(true, grad_step));
} // end of gradient_iteration_threaded
} // end of impl EntropyOptim
//===============================================================================================================
/// computes the weight of an embedded edge.
/// scale correspond at density observed at initial point in original graph (hence the asymetry)
fn cauchy_edge_weight<F>(initial_point: &Array1<F>, scale: f64, b: f64, other: &Array1<F>) -> F
where
F: Float + std::iter::Sum + num_traits::FromPrimitive,
{
let dist = initial_point
.iter()
.zip(other.iter())
.map(|(i, f)| (*i - *f) * (*i - *f))
.sum::<F>();
let mut dist_f64 = dist.to_f64().unwrap() / (scale * scale);
//
dist_f64 = dist_f64.powf(b);
// assert!(dist_f64 > 0.);
//
let weight = 1. / (1. + dist_f64);
let mut weight_f = F::from_f64(weight).unwrap();
if !(weight_f < F::one()) {
weight_f = F::one() - F::epsilon();
log::trace!("cauchy_edge_weight fixing dist_f64 {:2.e}", dist_f64);
}
assert!(weight_f < F::one());
assert!(weight_f.is_normal());
weight_f
} // end of cauchy_edge_weight
fn l2_norm<F>(v: &ArrayView1<'_, F>) -> F
where
F: Float + std::iter::Sum + num_traits::cast::FromPrimitive,
{
//
v.iter().map(|x| (*x) * (*x)).sum::<F>().sqrt()
} // end of l2_norm
// in embedded space (in unit ball or unit box) the scale is chosen as the scale at corresponding point / divided by mean initial scales.
fn estimate_embedded_scales_from_initial_scales(initial_scales: &[f32]) -> Vec<f32> {
log::trace!("estimate_embedded_scale_from_initial_scales");
let mean_scale: f32 = initial_scales.iter().sum::<f32>() / (initial_scales.len() as f32);
let scale_sup = 4.0; // CAVEAT seems we can go up to 4.
let scale_inf = 1. / scale_sup;
let width = 0.2;
// We want embedded scae impact between 0.5 and 2 (amplitude 4) , we take into account the square in cauchy weight
let embedded_scale: Vec<f32> = initial_scales
.iter()
.map(|&x| width * (x / mean_scale).min(scale_sup).max(scale_inf))
.collect();
//
for (i, scale) in embedded_scale.iter().enumerate() {
log::trace!("embedded scale for node {} : {:.2e}", i, scale);
}
//
embedded_scale
} // end of estimate_embedded_scale_from_initial_scales
// renormalize data (center and enclose in a box of a given box size) before optimization of cross entropy
fn set_data_box<F>(data: &mut Array2<F>, box_size: F)
where
F: Float
+ NumAssign
+ std::iter::Sum<F>
+ num_traits::cast::FromPrimitive
+ ndarray::ScalarOperand,
{
let nbdata = data.nrows();
let dim = data.ncols();
//
let mut means = Array1::<F>::zeros(dim);
let mut max_max = F::zero();
//
for j in 0..dim {
for i in 0..nbdata {
means[j] += data[[i, j]];
}
means[j] /= F::from(nbdata).unwrap();
}
for j in 0..dim {
for i in 0..nbdata {
data[[i, j]] -= means[j];
max_max = max_max.max(data[[i, j]].abs());
}
}
//
max_max /= box_size / F::from(2.).unwrap();
for f in data.iter_mut() {
*f /= max_max;
assert!((*f).abs() <= box_size);
}
} // end of set_data_box
#[cfg(test)]
#[allow(clippy::range_zip_with_len)]
mod tests {
// cargo test embedder -- --nocapture
use super::*;
fn log_init_test() {
let _ = env_logger::builder().is_test(true).try_init();
}
#[cfg(test)]
fn gen_rand_data_f32(nb_elem: usize, dim: usize) -> Vec<Vec<f32>> {
let mut data = Vec::<Vec<f32>>::with_capacity(nb_elem);
let mut rng = rng();
let unif = Uniform::<f32>::new(0., 1.).unwrap();
for i in 0..nb_elem {
let val = 2. * i as f32 * rng.sample(unif);
let v: Vec<f32> = (0..dim).map(|_| val * rng.sample(unif)).collect();
data.push(v);
}
data
} // end of gen_rand_data_f32
#[test]
fn mini_embed_full() {
log_init_test();
// generate datz
let nb_elem = 500;
let embed_dim = 20;
let data = gen_rand_data_f32(nb_elem, embed_dim);
let data_with_id = data
.iter()
.zip(0..data.len())
.collect::<Vec<(&Vec<f32>, usize)>>();
// hnsw construction
let ef_c = 50;
let max_nb_connection = 50;
let nb_layer = 16.min((nb_elem as f32).ln().trunc() as usize);
let mut hns =
Hnsw::<f32, DistL1>::new(max_nb_connection, nb_elem, nb_layer, ef_c, DistL1 {});
// to enforce the asked number of neighbour
hns.set_keeping_pruned(true);
hns.parallel_insert(&data_with_id);
hns.dump_layer_info();
// go to kgraph
let knbn = 10;
log::info!("calling kgraph.init_from_hnsw_all");
let kgraph: KGraph<f32> = kgraph_from_hnsw_all(&hns, knbn).unwrap();
log::info!("minimum number of neighbours {}", kgraph.get_max_nbng());
let _kgraph_stats = kgraph.get_kraph_stats();
let mut embed_params = EmbedderParams::default();
embed_params.asked_dim = 5;
let mut embedder = Embedder::new(&kgraph, embed_params);
let embed_res = embedder.embed();
assert!(embed_res.is_ok());
} // end of mini_embed_full
} // end of tests