1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
//! Document SMILES
//!
//! A standardized database pairing each id with its SMILES string,
//! converted from other datasets.

use serde::{Serialize, Deserialize}; 

/// A set of ids and their SMILES strings, stored as two parallel vectors.
///
/// `ids[i]` and `smiles[i]` describe the same entry. `DocSMILES::new` reorders
/// both vectors by the permutation that sorts the ids, which is what lets
/// `get_smiles` look an id up with a binary search.
#[derive(Serialize, Deserialize)]
pub struct DocSMILES {
    // Entry ids; sorted by `DocSMILES::new`.
    ids: Vec<chiral_common::DatasetID>,
    // SMILES strings, kept in the same order as `ids`.
    smiles: Vec<chiral_common::SMILES>,
}

impl DocSMILES {
    /// Builds a document from parallel vectors of ids and SMILES strings.
    ///
    /// `ids_in[i]` is paired with `smiles_in[i]`. Both vectors are reordered
    /// with the permutation that sorts the ids, so the pairs stay aligned and
    /// `get_smiles` can binary-search the ids.
    ///
    /// # Panics
    ///
    /// Panics if `ids_in` and `smiles_in` have different lengths.
    pub fn new(ids_in: Vec<chiral_common::DatasetID>, smiles_in: Vec<chiral_common::SMILES>) -> Self {
        // State the precondition explicitly: a permutation computed from the
        // ids can only be applied to a SMILES vector of the same length.
        assert_eq!(
            ids_in.len(),
            smiles_in.len(),
            "DocSMILES::new: ids and smiles must have the same length"
        );

        let mut perm = permutation::sort(&ids_in);
        let mut ids = ids_in;
        perm.apply_slice_in_place(&mut ids);
        let mut smiles = smiles_in;
        perm.apply_slice_in_place(&mut smiles);

        Self { ids, smiles }
    }

    /// Serializes the document to its JSON string form.
    ///
    /// # Panics
    ///
    /// Panics if `serde_json` fails to serialize the document.
    pub fn serialize(&self) -> chiral_common::SerializedFormat {
        serde_json::to_string(self).expect("DocSMILES::serialize: JSON serialization failed")
    }

    /// Rebuilds a document from the JSON produced by [`DocSMILES::serialize`].
    ///
    /// The fields are taken from `content` as-is; the ids are not re-sorted.
    ///
    /// # Panics
    ///
    /// Panics if `content` is not valid JSON for a `DocSMILES`.
    pub fn deserialize(content: &chiral_common::SerializedFormat) -> Self {
        serde_json::from_str(content)
            .expect("DocSMILES::deserialize: content is not a valid serialized DocSMILES")
    }

    /// Returns the SMILES stored for `id`, or `None` when the id is absent.
    ///
    /// The lookup is a binary search over the ids; if the same id occurs more
    /// than once, any one of the matching entries may be returned.
    pub fn get_smiles(&self, id: &chiral_common::DatasetID) -> Option<&chiral_common::SMILES> {
        self.ids
            .binary_search(id)
            .ok()
            .and_then(|index| self.smiles.get(index))
    }

    /// Returns all SMILES strings, in the same order as [`DocSMILES::get_ids`].
    pub fn get_smiles_vec(&self) -> &Vec<chiral_common::SMILES> { &self.smiles }

    /// Returns all ids in stored order.
    pub fn get_ids(&self) -> &Vec<chiral_common::DatasetID> { &self.ids }

    /// Returns the number of entries (the number of ids).
    pub fn len(&self) -> usize { self.ids.len() }

    /// Returns `true` when the document holds no entries.
    pub fn is_empty(&self) -> bool { self.ids.is_empty() }
}

impl crate::Dummy for DocSMILES {
    /// Builds a small four-entry document for tests and examples.
    ///
    /// The ids are listed out of sorted order (`label_3` before `label_2`),
    /// so the result also exercises the reordering done by `DocSMILES::new`.
    fn dummy() -> Self {
        // (id, SMILES) pairs, split into the two parallel vectors `new` expects.
        let entries = [
            ("label_1", "O=C(C)Oc1ccccc1C(=O)O"),
            ("label_3", "N1=C(c3c(Sc2c1cccc2)cccc3)N4CCN(CCOCCO)CC4"),
            ("label_2", "O=C(O)C[C@H](O)C[C@H](O)CCn2c(c(c(c2c1ccc(F)cc1)c3ccccc3)C(=O)Nc4ccccc4)C(C)C"),
            ("label_4", "CC(=O)Nc1ccc(O)cc1"),
        ];

        let (ids, smiles): (Vec<_>, Vec<_>) = entries
            .iter()
            .map(|&(label, structure)| (label.to_string(), String::from(structure)))
            .unzip();

        Self::new(ids, smiles)
    }
}

impl From<crate::SourceChembl> for DocSMILES {
    fn from(sc: crate::SourceChembl) -> Self {
        let (smiles, ids) = sc.get_smiles_id_pairs();
        DocSMILES::new(ids, smiles)
    }
}

/// Datastore for `DocSMILES`: a hash map from a `chiral_common::kinds::Dataset`
/// key to the `DocSMILES` document stored under it.
pub type DocStoreSMILES = std::collections::HashMap<chiral_common::kinds::Dataset, DocSMILES>;

impl crate::Info for DocStoreSMILES {
    /// Renders the store as a plain-text table.
    ///
    /// The output is a `name` / `entries` header, a rule of 31 `=` characters,
    /// then one row per dataset giving its key and its `DocSMILES::len`.
    /// Rows are sorted so the report is the same from run to run.
    fn info(&self) -> String {
        let mut rows: Vec<String> = self
            .iter()
            .map(|(k, v)| format!("{:15} {:15}", k, v.len()))
            .collect();
        // `HashMap` iteration order is arbitrary; sorting the formatted rows
        // (which start with the dataset name) gives a stable listing.
        rows.sort_unstable();

        format!(
            "{:15} {:>15}\n{}\n{}",
            "name",
            "entries",
            "=".repeat(31),
            rows.join("\n")
        )
    }
}


#[cfg(test)]
mod tests {
    use super::*;
    use crate::{Dummy, Info};

    /// Loads the ChEMBL file at `path` and converts it into a `DocSMILES`.
    fn doc_from_chembl(path: &str) -> DocSMILES {
        let mut source = crate::SourceChembl::new();
        source.set_path(&std::ffi::OsString::from(path));
        source.load_all();
        DocSMILES::from(source)
    }

    /// The dummy document has four entries and resolves ids to their SMILES.
    #[test]
    fn test_doc_smiles() {
        let dummy_doc = DocSMILES::dummy();
        assert_eq!(dummy_doc.ids.len(), 4);
        assert_eq!(dummy_doc.smiles.len(), 4);

        let known_id = "label_3".to_string();
        let expected = String::from("N1=C(c3c(Sc2c1cccc2)cccc3)N4CCN(CCOCCO)CC4");
        assert_eq!(dummy_doc.get_smiles(&known_id), Some(&expected));

        let unknown_id = "label_5".to_string();
        assert_eq!(dummy_doc.get_smiles(&unknown_id), None); 
    }

    /// Converting the 100-record ChEMBL sample keeps every record.
    #[test]
    fn test_chembl() {
        let chembl_doc = doc_from_chembl("../../../chiral-db-example-data/ChEMBL/chembl_30_chemreps_100.txt");
        assert_eq!(chembl_doc.ids.len(), 100);
        assert_eq!(chembl_doc.smiles.len(), 100);
    }

    /// The store report mentions both datasets and their entry counts.
    #[test]
    fn test_store() {
        let mut store = DocStoreSMILES::new();
        store.insert(chiral_common::kinds::Dataset::Dummy, DocSMILES::dummy());
        store.insert(
            chiral_common::kinds::Dataset::TestChembl,
            doc_from_chembl("../../../chiral-db-example-data/ChEMBL/chembl_30_chemreps_10k.txt"),
        );

        let report = store.info();
        println!("{}", report);
        for needle in ["test_chembl", "dummy", "4", "100"].iter() {
            assert!(report.contains(needle));
        }
    }

    /// A lookup in the loaded test dataset returns the SMILES belonging to that id.
    #[test]
    fn test_permutation() {
        let dataset_kind = chiral_common::kinds::Dataset::TestChembl;
        let loaded = crate::load::<DocSMILES>(&dataset_kind);
        let wanted = "CHEMBL10030".to_string();
        let found = loaded.get_smiles(&wanted).unwrap();
        assert_eq!(found.to_string(), "O=C(c1ccc(OCCN2CCCC2)cc1)c1c(-c2ccc(O)cc2)sc2cc(O)ccc12".to_string());
    }
}