use std::collections::HashSet;
use std::sync::Mutex;
use serde::{Deserialize, Serialize};
use crate::error::Result;
use super::types::Document;
/// A `(subject, predicate, object)` statement held as three owned strings.
///
/// Equality and hashing are derived, so two triples compare equal only when
/// all three fields are identical strings. [`Triple::new`] trims and
/// normalises its inputs; the derived `Deserialize` impl does not go through
/// that constructor, so deserialised values are stored exactly as received.
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Triple {
/// Subject of the statement.
pub subject: String,
/// Relation linking `subject` to `object`.
pub predicate: String,
/// Object of the statement.
pub object: String,
}
impl Triple {
    /// Builds a triple from anything convertible into `String`.
    ///
    /// The subject and object have surrounding whitespace trimmed. The
    /// predicate is passed through `normalise_predicate`, so differently
    /// spaced or cased spellings of the same relation end up identical.
    pub fn new<S, P, O>(subject: S, predicate: P, object: O) -> Self
    where
        S: Into<String>,
        P: Into<String>,
        O: Into<String>,
    {
        // The predicate is converted and normalised first, as before.
        let raw_predicate: String = predicate.into();
        let predicate = normalise_predicate(&raw_predicate);

        let raw_subject: String = subject.into();
        let subject = String::from(raw_subject.trim());

        let raw_object: String = object.into();
        let object = String::from(raw_object.trim());

        Self {
            subject,
            predicate,
            object,
        }
    }
}
/// Canonicalises a predicate string.
///
/// * Leading and trailing whitespace is dropped.
/// * Every interior run of whitespace becomes a single `_`.
/// * Each character is lowercased individually (no context-sensitive rules).
/// * Any `_` left at the very end of the result is removed.
///
/// Underscores that were already present elsewhere in the input are kept.
fn normalise_predicate(raw: &str) -> String {
    let mut normalised = String::with_capacity(raw.len());

    // `split_whitespace` skips leading/trailing whitespace and yields the
    // non-empty words between whitespace runs, so joining with '_' collapses
    // each run into exactly one separator.
    for (position, word) in raw.split_whitespace().enumerate() {
        if position > 0 {
            normalised.push('_');
        }
        normalised.extend(word.chars().flat_map(char::to_lowercase));
    }

    // Only literal underscores can be trailing at this point; strip them.
    let kept = normalised.trim_end_matches('_').len();
    normalised.truncate(kept);
    normalised
}
/// Produces [`Triple`]s for a [`Document`].
///
/// Implementors must be `Send + Sync`, and the future returned by `extract`
/// must be `Send`.
pub trait TripleExtractor: Send + Sync {
/// Returns a future resolving to the triples produced for `doc`, or to an
/// error.
fn extract(
&self,
doc: &Document,
) -> impl std::future::Future<Output = Result<Vec<Triple>>> + Send;
}
/// Membership query over a collection of [`Triple`]s.
///
/// Implementors must be `Send + Sync`, and the future returned by `contains`
/// must be `Send`.
pub trait GraphBaseline: Send + Sync {
/// Returns a future resolving to whether `triple` is present, or to an error.
fn contains(&self, triple: &Triple) -> impl std::future::Future<Output = Result<bool>> + Send;
}
/// [`TripleExtractor`] that holds a fixed list of triples and returns a copy
/// of that list from every `extract` call, regardless of the document passed.
#[derive(Debug, Clone)]
pub struct StubTripleExtractor {
// Triples handed back by `extract`, in the order they were supplied.
triples: Vec<Triple>,
}
impl StubTripleExtractor {
    /// Builds a stub from `(subject, predicate, object)` tuples.
    ///
    /// Every tuple is passed through [`Triple::new`]; the input order is kept.
    pub fn new<I, S, P, O>(triples: I) -> Self
    where
        I: IntoIterator<Item = (S, P, O)>,
        S: Into<String>,
        P: Into<String>,
        O: Into<String>,
    {
        let to_triple =
            |(subject, predicate, object): (S, P, O)| Triple::new(subject, predicate, object);
        Self {
            triples: triples.into_iter().map(to_triple).collect(),
        }
    }

    /// Borrows the stored triples.
    pub fn triples(&self) -> &[Triple] {
        self.triples.as_slice()
    }
}
impl TripleExtractor for StubTripleExtractor {
    /// Returns a copy of the stored triples; the document is not inspected.
    async fn extract(&self, _doc: &Document) -> Result<Vec<Triple>> {
        let canned = self.triples.to_vec();
        Ok(canned)
    }
}
/// [`GraphBaseline`] backed by a `HashSet<Triple>` behind a `std::sync::Mutex`,
/// which lets the set be modified through a shared reference.
#[derive(Debug, Default)]
pub struct InMemoryGraphBaseline {
// Set of triples currently in the baseline; locked for every read and write.
edges: Mutex<HashSet<Triple>>,
}
impl InMemoryGraphBaseline {
    /// Creates a baseline containing no triples.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a baseline pre-populated from `(subject, predicate, object)`
    /// tuples.
    ///
    /// Each tuple is passed through [`Triple::new`]; duplicates collapse into
    /// one entry. The set is built before the mutex exists, so seeding cannot
    /// fail and never silently drops edges.
    pub fn with_edges<I, S, P, O>(edges: I) -> Self
    where
        I: IntoIterator<Item = (S, P, O)>,
        S: Into<String>,
        P: Into<String>,
        O: Into<String>,
    {
        let seeded: HashSet<Triple> = edges
            .into_iter()
            .map(|(s, p, o)| Triple::new(s, p, o))
            .collect();
        Self {
            edges: Mutex::new(seeded),
        }
    }

    /// Locks the edge set, mapping a poisoned mutex to an ingestion error.
    fn lock_edges(&self) -> Result<std::sync::MutexGuard<'_, HashSet<Triple>>> {
        let guard = self
            .edges
            .lock()
            .map_err(|_| crate::error::Error::Ingestion("graph baseline mutex poisoned".into()))?;
        Ok(guard)
    }

    /// Adds `triple` to the baseline.
    ///
    /// Returns `Ok(true)` when the triple was not present before and
    /// `Ok(false)` when it already was.
    ///
    /// # Errors
    ///
    /// Returns an ingestion error if the internal mutex is poisoned.
    pub fn insert(&self, triple: Triple) -> Result<bool> {
        let mut guard = self.lock_edges()?;
        Ok(guard.insert(triple))
    }

    /// Number of distinct triples currently stored.
    ///
    /// # Errors
    ///
    /// Returns an ingestion error if the internal mutex is poisoned.
    pub fn len(&self) -> Result<usize> {
        let guard = self.lock_edges()?;
        Ok(guard.len())
    }

    /// Whether the baseline holds no triples.
    ///
    /// # Errors
    ///
    /// Returns an ingestion error if the internal mutex is poisoned.
    pub fn is_empty(&self) -> Result<bool> {
        Ok(self.len()? == 0)
    }
}
impl GraphBaseline for InMemoryGraphBaseline {
    /// Reports whether `triple` is in the set.
    ///
    /// Fails with an ingestion error if the internal mutex is poisoned.
    async fn contains(&self, triple: &Triple) -> Result<bool> {
        let present = self
            .edges
            .lock()
            .map(|known| known.contains(triple))
            .map_err(|_| crate::error::Error::Ingestion("graph baseline mutex poisoned".into()))?;
        Ok(present)
    }
}
#[cfg(feature = "ingestion-graph")]
pub use self::pet::PetgraphBaseline;
/// petgraph-backed [`GraphBaseline`], compiled only when the
/// `ingestion-graph` feature is enabled.
#[cfg(feature = "ingestion-graph")]
mod pet {
    use std::collections::{HashMap, HashSet};
    use std::sync::{Mutex, MutexGuard};

    use petgraph::Graph;
    use petgraph::graph::NodeIndex;

    use super::{GraphBaseline, Triple};
    use crate::error::{Error, Result};

    /// Baseline that records every distinct triple both in a hash set (used
    /// for membership checks) and as an edge in a petgraph `Graph` whose nodes
    /// are subject/object strings and whose edge weights are predicates.
    #[derive(Debug)]
    pub struct PetgraphBaseline {
        inner: Mutex<Inner>,
    }

    /// State guarded by the baseline's mutex.
    #[derive(Debug, Default)]
    struct Inner {
        /// Distinct triples inserted so far.
        edges: HashSet<Triple>,
        /// One node per distinct subject/object string, one edge per triple.
        graph: Graph<String, String>,
        /// Maps a node's string value to its index in `graph`.
        node_index: HashMap<String, NodeIndex>,
    }

    impl Inner {
        /// Returns the node index for `value`, adding a node on first use.
        fn node_for(&mut self, value: &str) -> NodeIndex {
            if let Some(&existing) = self.node_index.get(value) {
                return existing;
            }
            let created = self.graph.add_node(value.to_owned());
            self.node_index.insert(value.to_owned(), created);
            created
        }

        /// Records `triple` in the set and the graph.
        ///
        /// Returns `true` when the triple was new, `false` when it was
        /// already present (in which case nothing changes).
        fn insert(&mut self, triple: Triple) -> bool {
            if self.edges.contains(&triple) {
                return false;
            }
            let subject_idx = self.node_for(&triple.subject);
            let object_idx = self.node_for(&triple.object);
            // Only the predicate is cloned; the triple itself moves into the set.
            self.graph
                .add_edge(subject_idx, object_idx, triple.predicate.clone());
            self.edges.insert(triple);
            true
        }
    }

    impl PetgraphBaseline {
        /// Creates a baseline containing no triples.
        pub fn new() -> Self {
            Self {
                inner: Mutex::new(Inner::default()),
            }
        }

        /// Creates a baseline pre-populated from
        /// `(subject, predicate, object)` tuples.
        ///
        /// Each tuple is passed through [`Triple::new`]; duplicates are
        /// stored once. The state is filled before it is wrapped in the
        /// mutex, so seeding takes no lock and cannot lose edges to an
        /// ignored error.
        pub fn with_edges<I, S, P, O>(edges: I) -> Self
        where
            I: IntoIterator<Item = (S, P, O)>,
            S: Into<String>,
            P: Into<String>,
            O: Into<String>,
        {
            let mut inner = Inner::default();
            for (s, p, o) in edges {
                inner.insert(Triple::new(s, p, o));
            }
            Self {
                inner: Mutex::new(inner),
            }
        }

        /// Locks the inner state, mapping a poisoned mutex to an ingestion
        /// error.
        fn lock_inner(&self) -> Result<MutexGuard<'_, Inner>> {
            let guard = self
                .inner
                .lock()
                .map_err(|_| Error::Ingestion("petgraph baseline mutex poisoned".into()))?;
            Ok(guard)
        }

        /// Adds `triple` to the baseline.
        ///
        /// Returns `Ok(true)` when the triple was new and `Ok(false)` when
        /// it was already present.
        ///
        /// # Errors
        ///
        /// Returns an ingestion error if the internal mutex is poisoned.
        pub fn insert(&self, triple: Triple) -> Result<bool> {
            let mut guard = self.lock_inner()?;
            Ok(guard.insert(triple))
        }

        /// Number of distinct triples currently stored.
        ///
        /// # Errors
        ///
        /// Returns an ingestion error if the internal mutex is poisoned.
        pub fn len(&self) -> Result<usize> {
            let guard = self.lock_inner()?;
            Ok(guard.edges.len())
        }

        /// Whether the baseline holds no triples.
        ///
        /// # Errors
        ///
        /// Returns an ingestion error if the internal mutex is poisoned.
        pub fn is_empty(&self) -> Result<bool> {
            Ok(self.len()? == 0)
        }
    }

    impl Default for PetgraphBaseline {
        fn default() -> Self {
            Self::new()
        }
    }

    impl GraphBaseline for PetgraphBaseline {
        /// Reports whether `triple` has been inserted.
        ///
        /// Fails with an ingestion error if the internal mutex is poisoned.
        async fn contains(&self, triple: &Triple) -> Result<bool> {
            let guard = self.lock_inner()?;
            Ok(guard.edges.contains(triple))
        }
    }
}