galaxy_rs/
lib.rs

1//! # Galaxy
2//!
3//! `galaxy-rs` is a library providing bindings to the Python `galaxy` project, found [here](https://github.com/quinntyx/galaxy). 
4//! Galaxy is a node-based, extensible, multimedia database system. 
5//!
6//! This crate provides a way to interact with a Galaxy database through Rust using PyO3,
7//! preserving compatibility with Galaxy's extensibility through its Python API while allowing use
8//! with Rust programs. 
9
10use pyo3::prelude::*;
11use pyo3::types::{PyString, PyBool, PyIterator};
12use std::collections::HashMap;
13pub use pyo3::exceptions::*;
14
15/// A struct representing a Galaxy database. 
16/// Because this interfaces with files on disk, ever having >1 instance of a Galaxy database
17/// pointing to the same data is an error and will cause UB. 
18///
19/// A Galaxy DB can be created with `galaxy_rs::Galaxy::new(module_path: &str, api_path: &str, silent: bool)`. 
20///
21/// Because every method in Galaxy requires acquiring the Python Global Interpreter Lock (GIL),
22/// Galaxy is technically threadsafe for shared memory (although running multiple instances of
23/// Galaxy is not recommended as the database is lazy and may become out of sync with disk until
24/// flushed, causing likely data races). 
25pub struct Galaxy {
26    db: Py<PyAny>
27}
28
29impl Galaxy {
30    /// Creates a new Galaxy database pointing to the given data. 
31    ///
32    /// `module_path: &str` is the path to the galaxy database's extension module folder, relative to the 
33    /// location the Binary is being executed from. 
34    ///
35    /// `api_path: &str` is the path to the Galaxy API, for use by modules as an imported module. This can
36    /// be left as a blank string if Galaxy is installed to the system interpreter or as a venv in
37    /// the current environment (as is the default setup when using Cargo).
38    ///
39    /// `silent: bool` is a boolean that will suppress prints from Python while initializing the
40    /// database through a flag on the Python-internal Galaxy.post() method. Note that this is not
41    /// guaranteed to suppress all output, depending on the implementation of added modules.
42    ///
43    /// All data is stored in the `data` folder relative to the execution location of the final
44    /// binary, and when instancing a Galaxy database it automatically loads all data from that
45    /// directory. 
46    pub fn new (module_path: &str, api_path: &str, silent: bool) -> Self {
47        match Python::with_gil(|py| -> PyResult<Self> {
48            let db: Py<PyAny> = PyModule::import(py, "galaxy")?
49                .getattr("galaxy")?
50                .getattr("Galaxy")?
51                .call1((silent, module_path, api_path))?
52                .into();
53            Ok(Galaxy { db })
54        }) {
55            Ok(x) => x,
56            Err(_) => todo!(),
57        }
58    }
59
60    /// Creates a new Galaxy database pointing to the given data inside of the given directory. 
61    ///
62    /// `module_path: &str` is the path to the galaxy database's extension module folder, relative to the 
63    /// location the Binary is being executed from. 
64    ///
65    /// `api_path: &str` is the path to the Galaxy API, for use by modules as an imported module. This can
66    /// be left as a blank string if Galaxy is installed to the system interpreter or as a venv in
67    /// the current environment (as is the default setup when using Cargo).
68    ///
69    /// `silent: bool` is a boolean that will suppress prints from Python while initializing the
70    /// database through a flag on the Python-internal Galaxy.post() method. Note that this is not
71    /// guaranteed to suppress all output, depending on the implementation of added modules.
72    ///
73    /// All data is stored in the `data` folder relative to the passed directory. 
74    pub fn with_dir (dir: &str, module_path: &str, api_path: &str, silent: bool) -> Self {
75        Python::with_gil(|py| {
76            PyModule::import(py, "os").unwrap()
77                .getattr("chdir").unwrap()
78                .call1((dir,)).unwrap();
79        });
80        Self::new(module_path, api_path, silent)
81    }
82
83    /// Gets the Registry Handler. 
84    pub fn registry_handler (&self) -> RegistryHandler {
85        let res: PyResult<RegistryHandler> = Python::with_gil(|py| {
86            let handler: Py<PyAny> = PyModule::import(py, "galaxy")?
87                .getattr("registry")?
88                .into();
89            Ok(RegistryHandler::new(handler))
90        });
91        match res {
92            Ok(x) => x,
93            Err(_) => todo!()
94        }
95    }
96
97
98    /// Loads a node to the Galaxy database and returns a `Result<(), PyErr>` holding any resulting
99    /// Python errors that may have arisen from attempting to load that node. 
100    ///
101    /// `loc: &str` is the location of the node, in the format `source:name`. This is expanded to
102    /// `data/source:name.json` and `data/source:name.match` to instance the (lazy) Galaxy node and
103    /// add it to the nodes in the Galaxy db. 
104    pub fn load_node (&mut self, loc: &str) -> Result<(), PyErr> {
105       Python::with_gil(|py| {
106            self.db.as_ref(py)
107                .getattr("load_node")
108                .expect("Galaxy object should have attribute load_node")
109                .call1((loc,))?;
110            
111            Ok(())
112        })
113    }
114
115    /// Calling this makes the Galaxy database aware that a new node has been fully initialized and
116    /// added to the database and that it should now begin making connections using the match
117    /// files stored in that node. 
118    ///
119    /// `srcnode: &str` is the ID of the node that was just added, which is the name of the node
120    /// (without the source). This is subject to change, and may become `source:name` in the future
121    /// to disambiguate nodes with the same name (e.g. Nuclear Energy from Britannica and Nuclear
122    /// Energy from Wikipedia). For now, it is impossible to have two nodes with identical names. 
123    pub fn process_new_match (&mut self, srcnode: &str) -> Result<(), PyErr> {
124        Python::with_gil(|py| {
125            self.db.as_ref(py)
126                .getattr("process_new_match")
127                .expect("Galaxy object should have attribute process_new_match")
128                .call1((srcnode,))?;
129
130            Ok(())
131        })
132    }
133
134    /// Flushes the data of the nodes to disk. Because the implementation of Galaxy is as lazy as
135    /// possible in the API, minimal disk writes are made to make it faster.
136    ///
137    /// Therefore, the `flush` method is provided to tell each node to dump its contents to disk.
138    /// There is however the caveat that while nodes provided in modules are required to implement
139    /// this method as part of their API, it may be a no-op in certain cases where nodes need to
140    /// flush immediately after writes for one reason or another and therefore are already in sync
141    /// with the version on disk. 
142    ///
143    /// This may be optimized later for larger numbers of nodes using timestamps, though at the
144    /// moment this is not the case. 
145    pub fn flush (&mut self) -> Result<(), PyErr> {
146        Python::with_gil(|py| {
147            self.db.as_ref(py)
148                .getattr("flush")
149                .expect("Galaxy object should have attribute flush")
150                .call0()?;
151
152            Ok(())
153        })
154    }
155
156    /// Returns a `Result<Node, PyErr>` enum of the node name given. 
157    ///
158    /// This name is unqualified with the source, meaning it cannot disambiguate between two nodes
159    /// with identical names. This is subject to change in future updates.
160    ///
161    /// `node: &str` is a string name of the node to be retrieved.
162    pub fn get (&self, node: &str) -> Result<Node, PyErr> {
163        Python::with_gil(|py| {
164            let output = self.db.as_ref(py)
165                .getattr("get")
166                .expect("Galaxy object should have attribute get")
167                .call1((node,))?;
168
169            Ok(Node::new(output.into()))
170        })
171    }
172
173    /// Returns a `Result<HashMap<String, Node>, PyErr>` representing all of the nodes registered
174    /// in the Galaxy db. 
175    pub fn nodes (&self) -> Result<HashMap<String, Node>, PyErr> {
176        Python::with_gil(|py| {
177            let db = self.db.as_ref(py);
178            let nodes = db.getattr("nodes").expect("Galaxy object should have nodes");
179            
180
181            let mut output: HashMap<String, Node> = HashMap::new();
182
183            for i in nodes.iter()? {
184                let key = i?.downcast::<PyString>()?.to_str()?;
185                let node: Node = Node::new(nodes.get_item(key)?.into());
186
187                output.insert(String::from(key), node);
188            }
189
190            Ok(output)
191        })
192    }
193
194    /// Returns a `Result<Vec<String>, PyErr>` containing all of the names of the registered
195    /// Galaxy modules. 
196    ///
197    /// In the future I plan to add a way to actually access functions inside modules for more
198    /// advanced behavior, but it's hard to manage because modules are not required to present any
199    /// API, really. 
200    pub fn modules (&self) -> Result<Vec<String>, PyErr> {
201        Python::with_gil(|py| {
202            Ok(self.db.as_ref(py)
203                .getattr("modules").expect("Galaxy object should have modules")
204                .getattr("keys").expect("Python dict object should have keys method")
205                .call0().expect("Python dict.keys() should not error")
206                .iter().expect("Python dict.keys() should be iterable")
207                .map(|x| String::from(x.unwrap().downcast::<PyString>().unwrap().to_str().unwrap()))
208                .collect())
209        })
210    }
211    
212    /// Returns `Result<bool, PyErr>` containing whether or not the Galaxy db is currently
213    /// `silent`. This refers to the boolean the Galaxy db was initialized with; for more
214    /// information, see the constructor `galaxy_rs::Galaxy::new`. 
215    pub fn silent (&self) -> Result<bool, PyErr> {
216        Python::with_gil(|py| {
217            Ok(self.db.as_ref(py)
218               .getattr("silent").expect("Galaxy object should have bool silent")
219               .downcast::<PyBool>().expect("silent flag should be bool")
220               .extract().unwrap())
221        })
222    }
223}
224
225
226/// A struct representing a Galaxy db data node. 
227pub struct Node {
228    py_obj: Py<PyAny>,
229}
230
231impl Node {
232    fn new (py_obj: Py<PyAny>) -> Self {
233        Node {
234            py_obj
235        }
236    }
237
238    /// Gets the content of this `Node` as a `Result<String, PyErr>`.
239    ///
240    /// Galaxy encourages module implementations of Nodes to be lazy, so this may cause an IO
241    /// operation. 
242    pub fn content (&self) -> Result<String, PyErr> {
243        Python::with_gil(|py| {
244            let result = self.py_obj.as_ref(py)
245                .getattr("content")?
246                .downcast::<PyString>()?
247                .to_str()?;
248
249            Ok(String::from(result))
250        })
251    }
252
253    /// Gets the match data of this `Node` as a `Result<String, PyErr>`. 
254    ///
255    /// Galaxy encourages module implementations of Nodes to be lazy, so this may cause an IO
256    /// operation.
257    pub fn match_data (&self) -> Result<String, PyErr> {
258        Python::with_gil(|py| {
259            let result = self.py_obj.as_ref(py)
260                .getattr("match_data")?
261                .downcast::<PyString>()?
262                .to_str()?;
263
264            Ok(String::from(result))
265        })
266    }
267
268    /// Gets the parsed data of this `Node` as a `Result<NodeData, PyErr>`. 
269    ///
270    /// Galaxy encourages module implementations of Nodes to be lazy, so this may cause an IO
271    /// operation. 
272    pub fn parsed_data (&self) -> Result<NodeData, PyErr> {
273        Python::with_gil(|py| {
274            let result = self.py_obj.as_ref(py)
275                .getattr("parsed_data")?
276                .into();
277
278            Ok(NodeData::new(result))
279        })
280    }
281}
282
283/// A representation of the internal data of the Node. 
284/// Cannot be constructed, but is returned by `galaxy_rs::Node::parsed_data`. 
285pub struct NodeData {
286    py_obj: Py<PyAny>,
287}
288
289impl NodeData {
290    fn new (py_obj: Py<PyAny>) -> Self {
291        Self {
292            py_obj
293        }
294    }
295
296    /// Gets the title stored in the `NodeData` as a `Result<String, PyErr>`.
297    pub fn title (&self) -> Result<String, PyErr> {
298        Python::with_gil(|py| {
299            let result = self.py_obj.as_ref(py)
300                .get_item("title")?
301                .downcast::<PyString>()?
302                .to_str()?;
303
304            Ok(String::from(result))
305        })
306    }
307
308    /// Gets the data type stored in the `NodeData` as a `Result<String, PyErr>`.
309    ///
310    /// By default, Galaxy databases support a "txt" datatype, which is used to ingest all 
311    /// input data. Modules may implement more using the extensible Galaxy Python API. 
312    pub fn data_type (&self) -> Result<String, PyErr> {
313        Python::with_gil(|py| {
314            let result = self.py_obj.as_ref(py)
315                .get_item("type")?
316                .downcast::<PyString>()?
317                .to_str()?;
318
319            Ok(String::from(result))
320        })
321    }
322
323    /// Returns the source the data was retrieved from as a `Result<String, PyErr>`.
324    pub fn source (&self) -> Result<String, PyErr> {
325        Python::with_gil(|py| {
326            let result = self.py_obj.as_ref(py)
327                .get_item("source")?
328                .downcast::<PyString>()?
329                .to_str()?;
330
331            Ok(String::from(result))
332        })
333    }
334
335    /// Returns the links stored in this `NodeData` as a `Result<Vec<Link>, PyErr>`. 
336    pub fn links (&self) -> Result<Vec<Link>, PyErr> {
337        Python::with_gil(|py| {
338            let result = self.py_obj.as_ref(py)
339                .get_item("links")?
340                .iter()?
341                .map(|i| Link::new((i.unwrap()).into()))
342                .collect();
343
344            Ok(result)
345        })
346    }
347
348    /// Flushes an individual node to disk. Returns a `Result<(), PyErr>` for error handling.
349    ///
350    /// Galaxy demands that all module-added node types implement this method, but dependent on the
351    /// implementation this may or may not actually flush the node's cache to disk. See
352    /// `galaxy_rs::Galaxy::flush` for more details. 
353    pub fn flush (&self) -> Result<(), PyErr> {
354        Python::with_gil(|py| {
355            self.py_obj.as_ref(py)
356                .getattr("flush")?
357                .call0()?;
358            Ok(())
359        })
360    }
361                
362}
363
364/// Represents a single link between two nodes in the Galaxy db. 
365///
366/// Individual links don't know what node they point from, they merely have a weight and point to
367/// another node. Links should never be stored separate from the `Node` (or `NodeData`) that they
368/// are tied to for this reason. 
369pub struct Link {
370    py_obj: Py<PyAny>
371}
372
373impl Link {
374    fn new (py_obj: Py<PyAny>) -> Self {
375        Self {
376            py_obj
377        }
378    }
379
380    /// Returns the node that is being targeted by this link, as a `Result<String, PyErr>`. 
381    pub fn target (&self) -> Result<String, PyErr> {
382        Python::with_gil(|py| {
383            let result = self.py_obj.as_ref(py)
384                .get_item("target")?
385                .downcast::<PyString>()?
386                .to_str()?;
387
388            Ok(String::from(result))
389        })
390    }
391
392    /// Returns the strength (or "weight") of this `Link`, as a `Result<i64, PyErr>`. 
393    pub fn strength (&self) -> Result<i64, PyErr> {
394        Python::with_gil(|py| {
395            let result = self.py_obj.as_ref(py)
396                .get_item("strength")?
397                .extract()?;
398            
399            Ok(result)
400        })
401    }
402
403}
404
405fn from_pystring_unchecked (x: PyResult<&PyAny>) -> String {
406    String::from(x.unwrap().downcast::<PyString>().unwrap().to_str().unwrap())
407}
408
409/// Wraps the registries in the Galaxy db. 
410pub struct RegistryHandler {
411    py_obj: Py<PyAny>
412}
413
414impl RegistryHandler {
415    fn new (py_obj: Py<PyAny>) -> Self {
416        Self {
417            py_obj
418        }
419    }
420
421    fn node_registry (&self) -> Py<PyAny> {
422        Python::with_gil(|py| {
423            self.py_obj.as_ref(py)
424                .getattr("NODE_REGISTRY").expect("Node Registry should exist")
425                .into()
426        })
427    }
428
429    fn ingest_manager_registry (&self) -> Py<PyAny> {
430        Python::with_gil(|py| {
431            self.py_obj.as_ref(py)
432                .getattr("INGEST_MANAGER_REGISTRY").expect("Ingest Manager Registry should exist")
433                .into()
434        })
435    }
436
437    fn keys (x: &PyAny) -> &PyIterator {
438        x.getattr("keys").expect("Registry object should have keys")
439            .iter().expect("Python list should be iterable")
440    }
441        
442
443    /// Gets a list of the IDs of all registered node types as a `Vec<String>`. Can sometimes
444    /// be more reliable than getting the loaded modules, as there is no guarantee that each module
445    /// registers a node, or that each module registers only one node. Each node is targeted by a
446    /// type field in the JSON data on disk, so there is a 1:1 correlation of this list to all of
447    /// the currently supported data types in the Galaxy db. 
448    ///
449    /// Identifiers are namespaced as `module::nodetype`, as enforced by the Registry in Python;
450    /// however, Identifiers default to the `core` namespace when no namespace is provided, meaning
451    /// that some nodes in badly written, non-idiomatic ways may be registered under `core`.
452    /// Looking at the registry namespaces should illustrate that modules with the same namespace
453    /// may register multiple, or no, nodes. 
454    pub fn registered_nodes (&self) -> Vec<String> {
455        Python::with_gil(|py| {
456            Self::keys(self.node_registry().as_ref(py))
457                .map(from_pystring_unchecked)
458                .collect()
459        })
460    }
461
462    /// Gets a list of the IDs of all the registered ingest helpers as a `Vec<String>`. Can
463    /// sometimes be more reliable than getting all the loaded modules, as there is no guarantee
464    /// that each module registers an ingest manager, or that each module registers only one ingest
465    /// manager. Each ingest manager represents a single source of data. This may not necessary map
466    /// onto all possible nominal sources in node metadata (ex. a WebIngestManager may generate the 
467    /// source from the scraped web data) but it represents all possible ways to obtain data. 
468    ///
469    /// Identifiers are namespaced as `module::nodetype`, as enforced by the Registry in Python;
470    /// however, Identifiers default to the `core` namespace when no namespace is provided, meaning
471    /// that some nodes in badly written, non-idiomatic ways may be registered under `core`. 
472    /// Looking at the registry namespaces should illustrate that modules with the same namespace
473    /// may register multiple, or no, ingest managers. 
474    pub fn registered_ingest_managers (&self) -> Vec<String> {
475        Python::with_gil(|py| {
476            Self::keys(self.ingest_manager_registry().as_ref(py))
477                .map(from_pystring_unchecked)
478                .collect()
479        })
480    }
481
482}