phantompurger_rs/
phantompurger.rs

1use crate::disjoint::DisjointSubsets;
2use crate::utils::ec_mapper_dict_from_busfolders;
3use crate::binomialreg::phantom_binomial_regression;
4use bustools_core::consistent_genes::Ec2GeneMapper;
5use bustools_core::io::{BusFolder, BusRecord};
6use bustools_core::merger::MultiIterator;
7use bustools_core::utils::get_spinner;
8use bustools_core::{
9    consistent_genes::{GeneId, EC},
10    iterators::CbUmiGroupIterator,
11};
12use itertools::izip;
13use polars::io::SerWriter;
14use polars::prelude::Column;
15use serde::{Deserialize, Serialize};
16use std::{
17    collections::{HashMap, HashSet},
18    fs::File,
19    hash::Hash,
20    io::{BufRead, BufReader, Write},
21    time::Instant,
22};
23
24/// given a collection of busfiles, FingerPrintHistogram keeps track of the
25/// number of times we see a molecule (CB/UMI/gene) in a particular distribution
26/// across the busfiles. Instead recording individual CUGs, we aggregate all the ones
27/// having the same fingerprint across samples, i.e. we create a histogram of fingerprints
28#[derive(Debug)]
29pub struct FingerprintHistogram {
30    // Names of the samples across which we calcualte the fingerprints
31    pub(crate) samplenames: Vec<String>,
32    // Histogram FingerPrint -> Frequency
33    pub(crate) histogram: HashMap<Vec<u32>, usize>,
34}
35
36#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Clone, Debug, Copy, Deserialize, Serialize)]
37pub struct AmpFactor(u32);
38
39impl FingerprintHistogram {
40
41    pub fn get_samplenames(&self) -> Vec<String> {
42        self.samplenames.clone()
43    }
44
45    pub fn get_histogram(&self) -> HashMap<Vec<u32>, usize> {
46        self.histogram.clone()
47    }
48        
49    /// load histogram from disk
50    pub fn from_csv(filename: &str) -> Self {
51        let fh = File::open(filename).unwrap();
52        let mut samplenames: Vec<String> = Vec::new();
53        let mut histogram: HashMap<Vec<u32>, usize> = HashMap::new();
54        for (i, line) in BufReader::new(fh).lines().flatten().enumerate() {
55            if i == 0 {
56                for s in line.split(',') {
57                    samplenames.push(s.to_string());
58                }
59                // last one is frequency, remove that
60                let fstring = samplenames.pop().unwrap();
61                assert_eq!(fstring, "frequency");
62            } else {
63                let mut fingerprint: Vec<u32> =
64                    line.split(',').map(|e| e.parse::<u32>().unwrap()).collect();
65                // last element is the frequency of the fingerprint
66                let freq = fingerprint.pop().unwrap();
67                histogram.insert(fingerprint, freq as usize);
68            }
69        }
70        FingerprintHistogram {
71            samplenames,
72            histogram,
73        }
74    }
75
76    pub fn new(sample_order: &[String]) -> Self {
77        let hist = HashMap::new();
78        FingerprintHistogram {
79            samplenames: sample_order.to_owned(), // not sure why, linter is suggesting it
80            histogram: hist,
81        }
82    }
83
84    /// adds a CUG (observed over several busfiles) into the histogram
85    pub fn add(
86        &mut self,
87        record_dict: HashMap<String, Vec<BusRecord>>,
88        ecmapper_dict: &HashMap<String, &Ec2GeneMapper>,
89    ) {
90        // creates a fingerprint of the record_dict, updates the counts in the histogram
91
92        let grouped_record_dicts = groupby_gene_across_samples(&record_dict, ecmapper_dict);
93        let mut fingerprints: Vec<Vec<u32>> = Vec::new();
94        for rd in grouped_record_dicts {
95            let fp_hash = make_fingerprint_simple(&rd);
96            // turn the hashmap into a vector, sorted acc to order
97            let fp: Vec<_> = self
98                .samplenames
99                .iter()
100                .map(|s| fp_hash.get(s).unwrap_or(&0))
101                .cloned()
102                .collect();
103            fingerprints.push(fp);
104        }
105
106        for fp in fingerprints {
107            // update frequency
108            let v = self.histogram.entry(fp).or_insert(0);
109            *v += 1;
110        }
111    }
112
113    /// turn the fingerprint histgram into a polars Dataframe
114    pub fn to_dataframe(&self) -> polars::frame::DataFrame {
115        // each column is a different experiment
116        // each row is a differnet fingerprint
117        let mut columns:HashMap<String, Vec<u32>> = HashMap::new();
118
119        // we're more or less transposing, working through the rows, building the columns
120        for (fingerprint, freq) in self.histogram.iter() {
121            assert_eq!(fingerprint.len(), self.samplenames.len());
122            for (s, fp) in izip!(&self.samplenames, fingerprint) {
123                let v = columns.entry(s.to_string()).or_default();
124                v.push(*fp)
125            }
126            let v= columns.entry("frequency".to_owned()).or_default();
127            v.push(*freq as u32)
128        }   
129
130        let mut series = Vec::new();
131        for (name, values) in columns {
132            series.push(Column::new(name.into(), values))
133        }
134        polars::frame::DataFrame::new(series).unwrap()
135    }
136
137    /// save the histogram to disk
138    pub fn to_csv(&self, outfile: &str) {
139        let mut fh = File::create(outfile).expect("Cant create file: {outfile}");
140        let mut header = self.samplenames.join(",");
141        header.push_str(",frequency");
142        writeln!(fh, "{}", header).unwrap();
143
144        for (fingerprint, freq) in self.histogram.iter() {
145            // concat with commas
146            let mut s = fingerprint
147                .iter()
148                .map(|i| i.to_string())
149                .collect::<Vec<String>>()
150                .join(",");
151            s.push_str(&format!(",{}", freq));
152            writeln!(fh, "{}", s).unwrap();
153        }
154    }
155
156    /// save the histogram to disk
157    pub fn to_csv2(&self, outfile: &str) {
158        let mut df = self.to_dataframe();
159        let mut file = std::fs::File::create(outfile).unwrap();
160        polars::prelude::CsvWriter::new(&mut file).finish(&mut df).unwrap();
161    }
162
163    /// get the number of non-chimeric molecules as a function of r
164    fn get_z_r(&self) -> HashMap<AmpFactor, usize>{
165        let mut z_r: HashMap<AmpFactor, usize> = HashMap::new();
166
167        for (fingerprint, freq) in self.histogram.iter() {
168            let r: AmpFactor = AmpFactor(fingerprint.iter().sum());
169            let n_experiments = fingerprint.iter().filter(|x| **x > 0).count();
170            if n_experiments == 1 {
171                let v = z_r.entry(r).or_insert(0);
172                *v += freq;
173            }
174        }      
175        z_r  
176    }
177
178    /// get the number of total molecules as a function of r
179    fn get_m_r(&self) -> HashMap<AmpFactor, usize>{
180
181        let mut m_r: HashMap<AmpFactor, usize> = HashMap::new();
182        for (fingerprint, freq) in self.histogram.iter() {
183            let r: AmpFactor = AmpFactor(fingerprint.iter().sum());
184            let v = m_r.entry(r).or_insert(0);
185            *v += freq;
186        }
187        m_r
188    }
189    
190    /// Estimate SIHR from the histogram via binomial regression
191    /// TODO: the paper suggests to filter some values:
192    /// - remove samples (rows over r) with less than 10 observations (m_r <10)
193    /// - remove samples with r>25
194    pub fn estimate_sihr(&self) -> f64 {
195        // number of non-chimeric molecules of amp r
196
197        let z_r = self.get_z_r();
198        let m_r = self.get_m_r();
199
200        let mut r: Vec<AmpFactor> = m_r.keys().map(|k| k.to_owned()).collect();
201        r.sort();
202
203        let z: Vec<usize> = r.iter().map(|x| *z_r.get(x).unwrap_or(&0)).collect(); // in case there's not a single non-chimeric molecule at amplification r, return 0
204        let m: Vec<usize> = r.iter().map(|x| *m_r.get(x).unwrap()).collect();
205
206        let r_usize: Vec<_> = r.iter().map(|x| x.0 as usize).collect(); // convert from AmpFactor -> usize
207        let (pmax, _prange, _loglike_range) =
208            phantom_binomial_regression(&z, &m, &r_usize, self.samplenames.len());
209
210        // let mut fh = File::create("/tmp/phat.csv").unwrap();
211        // writeln!(fh, "p,logp").unwrap();
212        // for (p, logp) in izip!(_prange, _loglike_range) {
213        //     writeln!(fh, "{},{}", p, logp).unwrap();
214        // }
215        pmax
216    }
217}
218
219pub fn groupby_gene_across_samples(
220    record_dict: &HashMap<String, Vec<BusRecord>>,
221    ecmapper_dict: &HashMap<String, &Ec2GeneMapper>,
222) -> Vec<HashMap<String, BusRecord>> {
223    // - build the disjoint set using samplename-> genes, i.e. the disjoint sets elements are samplenames
224    // - iterate over the disjoint_set elements (samplenames) and build the Busrecords
225
226    // check if the genes are consistent across samples
227    // if so yield the record as is
228    // otherwise split into multiple records
229    if record_dict.len() == 1 {
230        // return vec![record_dict];
231    };
232
233    // give each element in record_dict a unique name ( the key)
234    // and store i) the element itself, ii) its genes iii) its samplename
235    let mut big_hash: HashMap<String, (&BusRecord, String, &HashSet<GeneId>)> =
236        HashMap::with_capacity(record_dict.len());
237    let mut id_counter: i32 = 0;
238    for (sname, records) in record_dict.iter() {
239        let ecmapper = ecmapper_dict.get(sname).unwrap();
240        for r in records {
241            let g = ecmapper.get_genes(EC(r.EC));
242            let id_str = id_counter.to_string();
243            big_hash.insert(id_str, (r, sname.clone(), g));
244            id_counter += 1;
245        }
246    }
247
248    // build disjoint set based on samplename and genes
249    let mut disjoint_set = DisjointSubsets::new();
250    for (id, (_r, _sname, gset)) in big_hash.iter() {
251        disjoint_set.add(id.clone(), (*gset).clone());
252    }
253
254    // build the emitted dict
255    let mut emit_vector: Vec<HashMap<String, BusRecord>> = Vec::new();
256    for (ids_of_set_elements, _geneset) in disjoint_set.get_disjoint_set_ids_and_set() {
257        // these are now all records across the samples that map to the same gene
258        // evne if there's multiple records per sample, they can be aggregated into
259        // a single record
260
261        // group the records by sample, aggregate
262
263        // deprecated:
264        // let the_gene = *geneset.iter().next().unwrap(); // rbitrarily choose the first consissnt gene to mark in busrecords
265
266        let mut sample_grouped: HashMap<String, Vec<BusRecord>> = HashMap::new();
267        for el_id in ids_of_set_elements {
268            // pop out the element. not needed, but shrinks the map
269            let (record, samplename, _genes) = big_hash.remove(&el_id).unwrap();
270
271            let rlist = sample_grouped.entry(samplename).or_default();
272            rlist.push(record.clone());
273        }
274
275        let mut sample_grouped_aggr: HashMap<String, BusRecord> = HashMap::new();
276        for (s, recordlist) in sample_grouped.iter() {
277            let freq = recordlist.iter().map(|r| r.COUNT).sum();
278            let mut rnew = recordlist.first().unwrap().clone();
279            rnew.COUNT = freq;
280
281            // bad idea here: this turns the BusRecord into something not quantifiable by bustools!!
282            // rnew.EC = the_gene;
283            // rather just leave the EC as is. It's guaranteed to be consistent eith the_gene
284            // but might not be unique to the_gene
285            // TODO: figure out a EC that maps ONLY to the_gene
286            sample_grouped_aggr.insert(s.to_string(), rnew);
287        }
288        emit_vector.push(sample_grouped_aggr);
289    }
290    emit_vector
291}
292
293
294
295/// main function here: takes a dict of busfolders, creates fingerprints for each molecule
296/// returns the fingerprints histogram (how often each fingerprint was observed)
297/// and the ordering of the fingerprint (i.e. which element corresponds to which experiment)
298pub fn make_fingerprint_histogram(busfolders: HashMap<String, BusFolder>, t2g_file: &str) -> FingerprintHistogram {
299
300    // create the EC2gene mappers
301    // silly, cant create one where ECMapper is a reference
302    // need to instantiate/own it, then create ref
303
304    println!("Making EC mappers");
305    let ec_tmp = ec_mapper_dict_from_busfolders(&busfolders, t2g_file );
306    let ecmapper_dict: HashMap<String, &Ec2GeneMapper> = ec_tmp.iter().map(|(name, d)| (name.clone(), d )).collect();
307
308    let now = Instant::now();
309    let result = _make_fingerprint_histogram(&busfolders, &ecmapper_dict);
310    let elapsed_time = now.elapsed();
311    println!(
312        "Ran _make_fingerprint_histogram, took {} seconds.",
313        elapsed_time.as_secs()
314    );
315    result
316}
317
318fn _make_fingerprint_histogram(
319    busfolders: &HashMap<String, BusFolder>,
320    ecmapper_dict: &HashMap<String, &Ec2GeneMapper>,
321) -> FingerprintHistogram {
322
323    // the actual workhorse, make_fingerprint_histogram is just a convenient wrapper
324    let iterators = busfolders.iter().map(|(k,v)| (k.clone(), v.get_iterator().groupby_cbumi())).collect();
325    let multi_iter = MultiIterator::new(iterators);
326
327    let bar = get_spinner();
328    
329    let mut order: Vec<_> = busfolders.keys().cloned().collect();
330    order.sort();
331
332    let mut fp_histo = FingerprintHistogram::new(&order);
333
334    for (i, ((_cb, _umi), record_dict)) in multi_iter.enumerate() {
335        fp_histo.add(record_dict, ecmapper_dict);
336
337        if i % 1000000 == 0 {
338            bar.inc(1000000);
339        }
340    }
341    fp_histo
342}
343
344pub type Fingerprint = HashMap<String, u32>;
345
346//#[inline(never)]
347pub fn make_fingerprint_simple(record_dict: &HashMap<String, BusRecord>) -> Fingerprint {
348    // for a molecule detected over several experiments,
349    // get is fingerprint, i.e. freq across the experiments
350    let fingerprint: Fingerprint = record_dict
351        .iter()
352        .map(|(k, v)| (k.clone(), v.COUNT))
353        .collect();
354    fingerprint
355}
356
357#[cfg(test)]
358pub mod tests {
359    use bustools_core::{
360        consistent_genes::{Ec2GeneMapper, Genename},
361        io::{BusFolder, BusRecord},
362    };
363    use polars::io::SerWriter;
364    use statrs::assert_almost_eq;
365    use std::collections::HashMap;
366    use super::*;
367
368    fn create_dummy_ec() -> Ec2GeneMapper {
369        let ec0: HashSet<Genename> = vec![Genename("A".to_string())].into_iter().collect();
370        let ec1: HashSet<Genename> = vec![Genename("B".to_string())].into_iter().collect();
371        let ec2: HashSet<Genename> = vec![Genename("A".to_string()), Genename("B".to_string())].into_iter().collect();
372        let ec3: HashSet<Genename> = vec![Genename("C".to_string()), Genename("D".to_string())].into_iter().collect();
373    
374        let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
375            (EC(0), ec0), // A
376            (EC(1), ec1), // B
377            (EC(2), ec2), // A,B
378            (EC(3), ec3), // C,D
379        ]);
380        Ec2GeneMapper::new(ec_dict)
381    }
382
383    #[test]
384    fn test_groupby_gene_across_samples() {
385        let es1 = create_dummy_ec();
386        let es2 = create_dummy_ec();
387        let es_dict: HashMap<String, &Ec2GeneMapper> =
388            vec![("s1".to_string(), &es1), ("s2".to_string(), &es2)]
389                .into_iter()
390                .collect();
391
392        // first sample: two records, with consistent gene A
393        let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
394        let r2 =BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 2, FLAG: 0};
395
396        // second sample: two records, with consistent gene A,B
397        let s1 = BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 3, FLAG: 0};
398        let s2 = BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 3, FLAG: 0};
399
400        let record_dict = vec![
401            ("s1".to_string(), vec![r1, r2]),
402            ("s2".to_string(), vec![s1, s2]),
403        ]
404        .into_iter()
405        .collect();
406
407        let res = groupby_gene_across_samples(&record_dict, &es_dict);
408
409        // println!("{:?}", res);
410        assert_eq!(res.len(), 1);
411        // make sure it aggregtes correctly
412        let r = res[0].clone();
413        assert_eq!(r.get("s1").unwrap().COUNT, 4);
414        assert_eq!(r.get("s2").unwrap().COUNT, 6);
415
416        /*
417        more complicated example where the ECs dont fully agree
418        in sample things indicate A, in sampel B i'ts inconsistent
419        looking at all samples together A,B could be an option
420         */
421        // first sample: two records, with consistent gene A
422        let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0}; //A
423        let r2 =BusRecord{CB: 0, UMI: 1, EC: 2, COUNT: 2, FLAG: 0}; //A,B
424
425        // second sample: two records, with consistent gene A the other consistent with gene B
426        let s1 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0}; // A
427        let s2 = BusRecord{CB: 0, UMI: 1, EC: 1, COUNT: 3, FLAG: 0}; //B
428
429        let record_dict = vec![
430            ("s1".to_string(), vec![r1, r2]),
431            ("s2".to_string(), vec![s1, s2]),
432        ]
433        .into_iter()
434        .collect();
435        let res = groupby_gene_across_samples(&record_dict, &es_dict);
436        println!("{:?}", res);
437    }
438
439    #[test]
440    fn test_make_fingerprint_histogram() {
441        use bustools_core::io::setup_busfile;
442
443        // a pair, same EC
444        let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
445        let s1 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0};
446
447        // singleton in sample1
448        let r2 =BusRecord{CB: 1, UMI: 2, EC: 0, COUNT: 12, FLAG: 0}; 
449
450        // singleton in sample2
451        let s2 = BusRecord{CB: 1, UMI: 2, EC: 1, COUNT: 10, FLAG: 0};
452
453
454        // pair but only gene overlap
455        let r3 =BusRecord{CB: 1, UMI: 3, EC: 0, COUNT:  2, FLAG: 0}; 
456        let s3 = BusRecord{CB: 1, UMI: 3, EC: 2, COUNT:  3, FLAG: 0}; 
457
458        //overall we should get
459        // [12, 0] = 1
460        // [0, 10] = 1
461        // [2, 3] = 2
462
463        let v1 = vec![r1.clone(), r2.clone(), r3.clone()];
464        let v2 = vec![s1.clone(), s2.clone(), s3.clone()];
465
466        // write the records to file
467        let (busname1, _dir1) = setup_busfile(&v1);
468        let (busname2, _dir2) = setup_busfile(&v2);
469
470        let hashmap = HashMap::from([
471            ("s1".to_string(), BusFolder::new(&busname1)),
472            ("s2".to_string(), BusFolder::new(&busname2)),
473        ]);
474
475        let es1 = create_dummy_ec();
476        let es2 = create_dummy_ec();
477        let es_dict: HashMap<String, &Ec2GeneMapper> =
478            vec![("s1".to_string(), &es1), ("s2".to_string(), &es2)]
479                .into_iter()
480                .collect();
481
482        let s = _make_fingerprint_histogram(&hashmap, &es_dict);
483        println!("{:?}", s);
484
485        let e = *s.histogram.get(&vec![12, 0]).unwrap();
486        assert_eq!(e, 1);
487
488        let e = *s.histogram.get(&vec![0, 10]).unwrap();
489        assert_eq!(e, 1);
490
491        let e = *s.histogram.get(&vec![2, 3]).unwrap();
492        assert_eq!(e, 2);
493
494        s.to_csv("/tmp/finger.csv")
495
496        // let s = detect_overlap(hashmap);
497    }
498
499    #[test]
500    fn test_make_fingerprint_simple() {
501
502        let r1 =BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0};
503        let s1 = BusRecord{CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0};
504
505        let record_dict = vec![("s1".to_string(), r1), ("s2".to_string(), s1)]
506            .into_iter()
507            .collect();
508        let res = make_fingerprint_simple(&record_dict);
509        println!("{:?}", res);
510
511        let exp: HashMap<_, _> = vec![("s1".to_string(), 2), ("s2".to_string(), 3)]
512            .into_iter()
513            .collect();
514        assert_eq!(res, exp);
515    }
516
517    // #[test]
518    pub fn testing2() {
519        let t2g = "/home/michi/bus_testing/transcripts_to_genes.txt";
520        let b1 = BusFolder::new("/home/michi/bus_testing/bus_output_short/");
521        let b2 = BusFolder::new("/home/michi/bus_testing/bus_output_short/");
522        let hashmap = HashMap::from([("full".to_string(), b1), ("short".to_string(), b2)]);
523        let s = make_fingerprint_histogram(hashmap, t2g);
524
525        s.to_csv("/tmp/testing2.csv");
526        println!("{:?}", s)
527    }
528
529    #[test]
530    pub fn test_csv_read_write() {
531        let order = vec!["s1", "s2"]
532            .into_iter()
533            .map(|x| x.to_string())
534            .collect();
535        let mut histogram: HashMap<Vec<u32>, usize> = HashMap::new();
536
537        histogram.insert(vec![1, 0], 1);
538        histogram.insert(vec![0, 1], 1);
539        histogram.insert(vec![1, 1], 1);
540
541        let fph = FingerprintHistogram {
542            samplenames: order,
543            histogram,
544        };
545
546        // fph.to_csv("/tmp/finger.csv");
547
548        println!("{:?}", fph);
549        let t = FingerprintHistogram::from_csv("/tmp/finger.csv");
550        println!("{:?}", t);
551
552        assert_eq!(fph.histogram, t.histogram);
553
554        let p = t.estimate_sihr();
555        println!("{}", p);
556    }
557
558    #[test]
559    pub fn test_estimate_sihr() {
560        let order = vec!["s1", "s2"]
561            .into_iter()
562            .map(|x| x.to_string())
563            .collect();
564        let mut histogram: HashMap<Vec<u32>, usize> = HashMap::new();
565
566        histogram.insert(vec![1, 0], 1);
567        histogram.insert(vec![0, 1], 1);
568        histogram.insert(vec![1, 1], 1);
569
570        let fph = FingerprintHistogram {
571            samplenames: order,
572            histogram,
573        };
574
575        let p = fph.estimate_sihr();
576        println!("{}", p);
577
578        assert_almost_eq!(p, 0.5, 0.001);
579    }
580
581    #[test]
582    fn test_sihr_real(){
583        let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
584        let sihr = fph.estimate_sihr();
585        // check that the SIHR is still the same
586        insta::assert_yaml_snapshot!(sihr, @r###"
587        ---
588        0.0016278754068794856
589        "###);
590    }
591
592    #[test]
593    fn test_zr_real(){
594        let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
595        // nubmer of non-chimeric reads
596        let z_r = fph.get_z_r();
597        
598        // need to enforce hashmap sorting! otherwise the resulting Hashmap (Even if same)
599        // will differ from the snapshot on disk
600        insta::with_settings!({sort_maps => true}, {
601            insta::assert_yaml_snapshot!(z_r);
602        });        
603    }
604
605    #[test]
606    fn test_mr_real(){
607        let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
608        // nubmer of total reads
609        let m_r = fph.get_m_r();
610        
611        // need to enforce hashmap sorting! otherwise the resulting Hashmap (Even if same)
612        // will differ from the snapshot on disk
613        insta::with_settings!({sort_maps => true}, {
614            insta::assert_yaml_snapshot!(m_r);
615        });
616    }
617
618    #[test]
619    fn test_to_dataframe() {
620        let fph = FingerprintHistogram::from_csv("/home/michi/Dropbox/rustphantompurger/IR56_57_phantom.csv");
621        let mut df = fph.to_dataframe();
622
623        let mut file = std::fs::File::create("/tmp/path.csv").unwrap();
624        polars::prelude::CsvWriter::new(&mut file).finish(&mut df).unwrap();
625    }
626}