galaxy_rs/lib.rs
1//! # Galaxy
2//!
3//! `galaxy-rs` is a library providing bindings to the Python `galaxy` project, found [here](https://github.com/quinntyx/galaxy).
4//! Galaxy is a node-based, extensible, multimedia database system.
5//!
6//! This crate provides a way to interact with a Galaxy database through Rust using PyO3,
7//! preserving compatibility with Galaxy's extensibility through its Python API while allowing use
8//! with Rust programs.
9
10use pyo3::prelude::*;
11use pyo3::types::{PyString, PyBool, PyIterator};
12use std::collections::HashMap;
13pub use pyo3::exceptions::*;
14
15/// A struct representing a Galaxy database.
16/// Because this interfaces with files on disk, ever having >1 instance of a Galaxy database
17/// pointing to the same data is an error and will cause UB.
18///
19/// A Galaxy DB can be created with `galaxy_rs::Galaxy::new(module_path: &str, api_path: &str, silent: bool)`.
20///
21/// Because every method in Galaxy requires acquiring the Python Global Interpreter Lock (GIL),
22/// Galaxy is technically threadsafe for shared memory (although running multiple instances of
23/// Galaxy is not recommended as the database is lazy and may become out of sync with disk until
24/// flushed, causing likely data races).
25pub struct Galaxy {
26 db: Py<PyAny>
27}
28
29impl Galaxy {
30 /// Creates a new Galaxy database pointing to the given data.
31 ///
32 /// `module_path: &str` is the path to the galaxy database's extension module folder, relative to the
33 /// location the Binary is being executed from.
34 ///
35 /// `api_path: &str` is the path to the Galaxy API, for use by modules as an imported module. This can
36 /// be left as a blank string if Galaxy is installed to the system interpreter or as a venv in
37 /// the current environment (as is the default setup when using Cargo).
38 ///
39 /// `silent: bool` is a boolean that will suppress prints from Python while initializing the
40 /// database through a flag on the Python-internal Galaxy.post() method. Note that this is not
41 /// guaranteed to suppress all output, depending on the implementation of added modules.
42 ///
43 /// All data is stored in the `data` folder relative to the execution location of the final
44 /// binary, and when instancing a Galaxy database it automatically loads all data from that
45 /// directory.
46 pub fn new (module_path: &str, api_path: &str, silent: bool) -> Self {
47 match Python::with_gil(|py| -> PyResult<Self> {
48 let db: Py<PyAny> = PyModule::import(py, "galaxy")?
49 .getattr("galaxy")?
50 .getattr("Galaxy")?
51 .call1((silent, module_path, api_path))?
52 .into();
53 Ok(Galaxy { db })
54 }) {
55 Ok(x) => x,
56 Err(_) => todo!(),
57 }
58 }
59
60 /// Creates a new Galaxy database pointing to the given data inside of the given directory.
61 ///
62 /// `module_path: &str` is the path to the galaxy database's extension module folder, relative to the
63 /// location the Binary is being executed from.
64 ///
65 /// `api_path: &str` is the path to the Galaxy API, for use by modules as an imported module. This can
66 /// be left as a blank string if Galaxy is installed to the system interpreter or as a venv in
67 /// the current environment (as is the default setup when using Cargo).
68 ///
69 /// `silent: bool` is a boolean that will suppress prints from Python while initializing the
70 /// database through a flag on the Python-internal Galaxy.post() method. Note that this is not
71 /// guaranteed to suppress all output, depending on the implementation of added modules.
72 ///
73 /// All data is stored in the `data` folder relative to the passed directory.
74 pub fn with_dir (dir: &str, module_path: &str, api_path: &str, silent: bool) -> Self {
75 Python::with_gil(|py| {
76 PyModule::import(py, "os").unwrap()
77 .getattr("chdir").unwrap()
78 .call1((dir,)).unwrap();
79 });
80 Self::new(module_path, api_path, silent)
81 }
82
83 /// Gets the Registry Handler.
84 pub fn registry_handler (&self) -> RegistryHandler {
85 let res: PyResult<RegistryHandler> = Python::with_gil(|py| {
86 let handler: Py<PyAny> = PyModule::import(py, "galaxy")?
87 .getattr("registry")?
88 .into();
89 Ok(RegistryHandler::new(handler))
90 });
91 match res {
92 Ok(x) => x,
93 Err(_) => todo!()
94 }
95 }
96
97
98 /// Loads a node to the Galaxy database and returns a `Result<(), PyErr>` holding any resulting
99 /// Python errors that may have arisen from attempting to load that node.
100 ///
101 /// `loc: &str` is the location of the node, in the format `source:name`. This is expanded to
102 /// `data/source:name.json` and `data/source:name.match` to instance the (lazy) Galaxy node and
103 /// add it to the nodes in the Galaxy db.
104 pub fn load_node (&mut self, loc: &str) -> Result<(), PyErr> {
105 Python::with_gil(|py| {
106 self.db.as_ref(py)
107 .getattr("load_node")
108 .expect("Galaxy object should have attribute load_node")
109 .call1((loc,))?;
110
111 Ok(())
112 })
113 }
114
115 /// Calling this makes the Galaxy database aware that a new node has been fully initialized and
116 /// added to the database and that it should now begin making connections using the match
117 /// files stored in that node.
118 ///
119 /// `srcnode: &str` is the ID of the node that was just added, which is the name of the node
120 /// (without the source). This is subject to change, and may become `source:name` in the future
121 /// to disambiguate nodes with the same name (e.g. Nuclear Energy from Britannica and Nuclear
122 /// Energy from Wikipedia). For now, it is impossible to have two nodes with identical names.
123 pub fn process_new_match (&mut self, srcnode: &str) -> Result<(), PyErr> {
124 Python::with_gil(|py| {
125 self.db.as_ref(py)
126 .getattr("process_new_match")
127 .expect("Galaxy object should have attribute process_new_match")
128 .call1((srcnode,))?;
129
130 Ok(())
131 })
132 }
133
134 /// Flushes the data of the nodes to disk. Because the implementation of Galaxy is as lazy as
135 /// possible in the API, minimal disk writes are made to make it faster.
136 ///
137 /// Therefore, the `flush` method is provided to tell each node to dump its contents to disk.
138 /// There is however the caveat that while nodes provided in modules are required to implement
139 /// this method as part of their API, it may be a no-op in certain cases where nodes need to
140 /// flush immediately after writes for one reason or another and therefore are already in sync
141 /// with the version on disk.
142 ///
143 /// This may be optimized later for larger numbers of nodes using timestamps, though at the
144 /// moment this is not the case.
145 pub fn flush (&mut self) -> Result<(), PyErr> {
146 Python::with_gil(|py| {
147 self.db.as_ref(py)
148 .getattr("flush")
149 .expect("Galaxy object should have attribute flush")
150 .call0()?;
151
152 Ok(())
153 })
154 }
155
156 /// Returns a `Result<Node, PyErr>` enum of the node name given.
157 ///
158 /// This name is unqualified with the source, meaning it cannot disambiguate between two nodes
159 /// with identical names. This is subject to change in future updates.
160 ///
161 /// `node: &str` is a string name of the node to be retrieved.
162 pub fn get (&self, node: &str) -> Result<Node, PyErr> {
163 Python::with_gil(|py| {
164 let output = self.db.as_ref(py)
165 .getattr("get")
166 .expect("Galaxy object should have attribute get")
167 .call1((node,))?;
168
169 Ok(Node::new(output.into()))
170 })
171 }
172
173 /// Returns a `Result<HashMap<String, Node>, PyErr>` representing all of the nodes registered
174 /// in the Galaxy db.
175 pub fn nodes (&self) -> Result<HashMap<String, Node>, PyErr> {
176 Python::with_gil(|py| {
177 let db = self.db.as_ref(py);
178 let nodes = db.getattr("nodes").expect("Galaxy object should have nodes");
179
180
181 let mut output: HashMap<String, Node> = HashMap::new();
182
183 for i in nodes.iter()? {
184 let key = i?.downcast::<PyString>()?.to_str()?;
185 let node: Node = Node::new(nodes.get_item(key)?.into());
186
187 output.insert(String::from(key), node);
188 }
189
190 Ok(output)
191 })
192 }
193
194 /// Returns a `Result<Vec<String>, PyErr>` containing all of the names of the registered
195 /// Galaxy modules.
196 ///
197 /// In the future I plan to add a way to actually access functions inside modules for more
198 /// advanced behavior, but it's hard to manage because modules are not required to present any
199 /// API, really.
200 pub fn modules (&self) -> Result<Vec<String>, PyErr> {
201 Python::with_gil(|py| {
202 Ok(self.db.as_ref(py)
203 .getattr("modules").expect("Galaxy object should have modules")
204 .getattr("keys").expect("Python dict object should have keys method")
205 .call0().expect("Python dict.keys() should not error")
206 .iter().expect("Python dict.keys() should be iterable")
207 .map(|x| String::from(x.unwrap().downcast::<PyString>().unwrap().to_str().unwrap()))
208 .collect())
209 })
210 }
211
212 /// Returns `Result<bool, PyErr>` containing whether or not the Galaxy db is currently
213 /// `silent`. This refers to the boolean the Galaxy db was initialized with; for more
214 /// information, see the constructor `galaxy_rs::Galaxy::new`.
215 pub fn silent (&self) -> Result<bool, PyErr> {
216 Python::with_gil(|py| {
217 Ok(self.db.as_ref(py)
218 .getattr("silent").expect("Galaxy object should have bool silent")
219 .downcast::<PyBool>().expect("silent flag should be bool")
220 .extract().unwrap())
221 })
222 }
223}
224
225
226/// A struct representing a Galaxy db data node.
227pub struct Node {
228 py_obj: Py<PyAny>,
229}
230
231impl Node {
232 fn new (py_obj: Py<PyAny>) -> Self {
233 Node {
234 py_obj
235 }
236 }
237
238 /// Gets the content of this `Node` as a `Result<String, PyErr>`.
239 ///
240 /// Galaxy encourages module implementations of Nodes to be lazy, so this may cause an IO
241 /// operation.
242 pub fn content (&self) -> Result<String, PyErr> {
243 Python::with_gil(|py| {
244 let result = self.py_obj.as_ref(py)
245 .getattr("content")?
246 .downcast::<PyString>()?
247 .to_str()?;
248
249 Ok(String::from(result))
250 })
251 }
252
253 /// Gets the match data of this `Node` as a `Result<String, PyErr>`.
254 ///
255 /// Galaxy encourages module implementations of Nodes to be lazy, so this may cause an IO
256 /// operation.
257 pub fn match_data (&self) -> Result<String, PyErr> {
258 Python::with_gil(|py| {
259 let result = self.py_obj.as_ref(py)
260 .getattr("match_data")?
261 .downcast::<PyString>()?
262 .to_str()?;
263
264 Ok(String::from(result))
265 })
266 }
267
268 /// Gets the parsed data of this `Node` as a `Result<NodeData, PyErr>`.
269 ///
270 /// Galaxy encourages module implementations of Nodes to be lazy, so this may cause an IO
271 /// operation.
272 pub fn parsed_data (&self) -> Result<NodeData, PyErr> {
273 Python::with_gil(|py| {
274 let result = self.py_obj.as_ref(py)
275 .getattr("parsed_data")?
276 .into();
277
278 Ok(NodeData::new(result))
279 })
280 }
281}
282
283/// A representation of the internal data of the Node.
284/// Cannot be constructed, but is returned by `galaxy_rs::Node::parsed_data`.
285pub struct NodeData {
286 py_obj: Py<PyAny>,
287}
288
289impl NodeData {
290 fn new (py_obj: Py<PyAny>) -> Self {
291 Self {
292 py_obj
293 }
294 }
295
296 /// Gets the title stored in the `NodeData` as a `Result<String, PyErr>`.
297 pub fn title (&self) -> Result<String, PyErr> {
298 Python::with_gil(|py| {
299 let result = self.py_obj.as_ref(py)
300 .get_item("title")?
301 .downcast::<PyString>()?
302 .to_str()?;
303
304 Ok(String::from(result))
305 })
306 }
307
308 /// Gets the data type stored in the `NodeData` as a `Result<String, PyErr>`.
309 ///
310 /// By default, Galaxy databases support a "txt" datatype, which is used to ingest all
311 /// input data. Modules may implement more using the extensible Galaxy Python API.
312 pub fn data_type (&self) -> Result<String, PyErr> {
313 Python::with_gil(|py| {
314 let result = self.py_obj.as_ref(py)
315 .get_item("type")?
316 .downcast::<PyString>()?
317 .to_str()?;
318
319 Ok(String::from(result))
320 })
321 }
322
323 /// Returns the source the data was retrieved from as a `Result<String, PyErr>`.
324 pub fn source (&self) -> Result<String, PyErr> {
325 Python::with_gil(|py| {
326 let result = self.py_obj.as_ref(py)
327 .get_item("source")?
328 .downcast::<PyString>()?
329 .to_str()?;
330
331 Ok(String::from(result))
332 })
333 }
334
335 /// Returns the links stored in this `NodeData` as a `Result<Vec<Link>, PyErr>`.
336 pub fn links (&self) -> Result<Vec<Link>, PyErr> {
337 Python::with_gil(|py| {
338 let result = self.py_obj.as_ref(py)
339 .get_item("links")?
340 .iter()?
341 .map(|i| Link::new((i.unwrap()).into()))
342 .collect();
343
344 Ok(result)
345 })
346 }
347
348 /// Flushes an individual node to disk. Returns a `Result<(), PyErr>` for error handling.
349 ///
350 /// Galaxy demands that all module-added node types implement this method, but dependent on the
351 /// implementation this may or may not actually flush the node's cache to disk. See
352 /// `galaxy_rs::Galaxy::flush` for more details.
353 pub fn flush (&self) -> Result<(), PyErr> {
354 Python::with_gil(|py| {
355 self.py_obj.as_ref(py)
356 .getattr("flush")?
357 .call0()?;
358 Ok(())
359 })
360 }
361
362}
363
364/// Represents a single link between two nodes in the Galaxy db.
365///
366/// Individual links don't know what node they point from, they merely have a weight and point to
367/// another node. Links should never be stored separate from the `Node` (or `NodeData`) that they
368/// are tied to for this reason.
369pub struct Link {
370 py_obj: Py<PyAny>
371}
372
373impl Link {
374 fn new (py_obj: Py<PyAny>) -> Self {
375 Self {
376 py_obj
377 }
378 }
379
380 /// Returns the node that is being targeted by this link, as a `Result<String, PyErr>`.
381 pub fn target (&self) -> Result<String, PyErr> {
382 Python::with_gil(|py| {
383 let result = self.py_obj.as_ref(py)
384 .get_item("target")?
385 .downcast::<PyString>()?
386 .to_str()?;
387
388 Ok(String::from(result))
389 })
390 }
391
392 /// Returns the strength (or "weight") of this `Link`, as a `Result<i64, PyErr>`.
393 pub fn strength (&self) -> Result<i64, PyErr> {
394 Python::with_gil(|py| {
395 let result = self.py_obj.as_ref(py)
396 .get_item("strength")?
397 .extract()?;
398
399 Ok(result)
400 })
401 }
402
403}
404
405fn from_pystring_unchecked (x: PyResult<&PyAny>) -> String {
406 String::from(x.unwrap().downcast::<PyString>().unwrap().to_str().unwrap())
407}
408
409/// Wraps the registries in the Galaxy db.
410pub struct RegistryHandler {
411 py_obj: Py<PyAny>
412}
413
414impl RegistryHandler {
415 fn new (py_obj: Py<PyAny>) -> Self {
416 Self {
417 py_obj
418 }
419 }
420
421 fn node_registry (&self) -> Py<PyAny> {
422 Python::with_gil(|py| {
423 self.py_obj.as_ref(py)
424 .getattr("NODE_REGISTRY").expect("Node Registry should exist")
425 .into()
426 })
427 }
428
429 fn ingest_manager_registry (&self) -> Py<PyAny> {
430 Python::with_gil(|py| {
431 self.py_obj.as_ref(py)
432 .getattr("INGEST_MANAGER_REGISTRY").expect("Ingest Manager Registry should exist")
433 .into()
434 })
435 }
436
437 fn keys (x: &PyAny) -> &PyIterator {
438 x.getattr("keys").expect("Registry object should have keys")
439 .iter().expect("Python list should be iterable")
440 }
441
442
443 /// Gets a list of the IDs of all registered node types as a `Vec<String>`. Can sometimes
444 /// be more reliable than getting the loaded modules, as there is no guarantee that each module
445 /// registers a node, or that each module registers only one node. Each node is targeted by a
446 /// type field in the JSON data on disk, so there is a 1:1 correlation of this list to all of
447 /// the currently supported data types in the Galaxy db.
448 ///
449 /// Identifiers are namespaced as `module::nodetype`, as enforced by the Registry in Python;
450 /// however, Identifiers default to the `core` namespace when no namespace is provided, meaning
451 /// that some nodes in badly written, non-idiomatic ways may be registered under `core`.
452 /// Looking at the registry namespaces should illustrate that modules with the same namespace
453 /// may register multiple, or no, nodes.
454 pub fn registered_nodes (&self) -> Vec<String> {
455 Python::with_gil(|py| {
456 Self::keys(self.node_registry().as_ref(py))
457 .map(from_pystring_unchecked)
458 .collect()
459 })
460 }
461
462 /// Gets a list of the IDs of all the registered ingest helpers as a `Vec<String>`. Can
463 /// sometimes be more reliable than getting all the loaded modules, as there is no guarantee
464 /// that each module registers an ingest manager, or that each module registers only one ingest
465 /// manager. Each ingest manager represents a single source of data. This may not necessary map
466 /// onto all possible nominal sources in node metadata (ex. a WebIngestManager may generate the
467 /// source from the scraped web data) but it represents all possible ways to obtain data.
468 ///
469 /// Identifiers are namespaced as `module::nodetype`, as enforced by the Registry in Python;
470 /// however, Identifiers default to the `core` namespace when no namespace is provided, meaning
471 /// that some nodes in badly written, non-idiomatic ways may be registered under `core`.
472 /// Looking at the registry namespaces should illustrate that modules with the same namespace
473 /// may register multiple, or no, ingest managers.
474 pub fn registered_ingest_managers (&self) -> Vec<String> {
475 Python::with_gil(|py| {
476 Self::keys(self.ingest_manager_registry().as_ref(py))
477 .map(from_pystring_unchecked)
478 .collect()
479 })
480 }
481
482}