use std::collections::HashMap;
use serde::{Deserialize, Serialize};
/// A graph of documents with weighted edges between them and a keyword lookup index.
///
/// All fields are private; the graph is read and mutated through its methods.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentGraph {
/// Document nodes keyed by their `doc_id`.
nodes: HashMap<String, DocumentGraphNode>,
/// Outgoing edges keyed by the source document id.
edges: HashMap<String, Vec<GraphEdge>>,
/// Keyword -> entries for the documents that list that keyword in `top_keywords`.
keyword_index: HashMap<String, Vec<KeywordDocEntry>>,
/// Cached document and edge counts, stored (and serialized) alongside the graph.
metadata: GraphMetadata,
}
/// Crate-internal helpers for bulk edge replacement and keyword-index snapshots.
impl DocumentGraph {
    /// Moves the entire edge map out of the graph, leaving the graph with no edges.
    ///
    /// The cached `metadata.edge_count` is reset to zero at the same time so that
    /// [`DocumentGraph::metadata`] never reports edges the graph no longer holds.
    /// Hand the (possibly modified) map back with [`DocumentGraph::set_edges`].
    pub(crate) fn take_edges(&mut self) -> HashMap<String, Vec<GraphEdge>> {
        // Keep the cached count in step with the map we are about to empty.
        self.metadata.edge_count = 0;
        std::mem::take(&mut self.edges)
    }

    /// Replaces the entire edge map and refreshes the cached edge count.
    ///
    /// `edges` maps a source document id to its outgoing edges; the cached count
    /// becomes the total number of edges across all sources.
    pub(crate) fn set_edges(&mut self, edges: HashMap<String, Vec<GraphEdge>>) {
        self.metadata.edge_count = edges.values().map(Vec::len).sum();
        self.edges = edges;
    }

    /// Returns an owned deep copy of the keyword index.
    ///
    /// This clones every key and entry, so the cost grows with the index size.
    pub(crate) fn keyword_index_clone(&self) -> HashMap<String, Vec<KeywordDocEntry>> {
        self.keyword_index.clone()
    }
}
/// Public construction, mutation and query API of the document graph.
impl DocumentGraph {
    /// Creates an empty graph with no nodes, edges or keyword entries.
    pub fn new() -> Self {
        Self {
            nodes: HashMap::new(),
            edges: HashMap::new(),
            keyword_index: HashMap::new(),
            metadata: GraphMetadata {
                document_count: 0,
                edge_count: 0,
            },
        }
    }

    /// Inserts `node`, indexing each of its `top_keywords` in the keyword index.
    ///
    /// If a node with the same `doc_id` already exists it is replaced, and the
    /// keyword-index entries contributed by the previous version are removed
    /// first so lookups never return stale or duplicated hits for that document.
    pub fn add_node(&mut self, node: DocumentGraphNode) {
        // Replacement path: purge what the previous version of this document
        // contributed to the index before indexing the new keywords.
        if let Some(previous) = self.nodes.remove(&node.doc_id) {
            for kw in &previous.top_keywords {
                let now_empty = match self.keyword_index.get_mut(&kw.keyword) {
                    Some(entries) => {
                        entries.retain(|entry| entry.doc_id != node.doc_id);
                        entries.is_empty()
                    }
                    None => false,
                };
                // Drop keywords no document references any more.
                if now_empty {
                    self.keyword_index.remove(&kw.keyword);
                }
            }
        }

        for kw in &node.top_keywords {
            self.keyword_index
                .entry(kw.keyword.clone())
                .or_default()
                .push(KeywordDocEntry {
                    doc_id: node.doc_id.clone(),
                    weight: kw.weight,
                });
        }

        let doc_id = node.doc_id.clone();
        self.nodes.insert(doc_id, node);
        self.metadata.document_count = self.nodes.len();
    }

    /// Appends `edge` to the outgoing edges of `source` and refreshes the cached
    /// edge count.
    ///
    /// Neither `source` nor the edge target is checked against the node set.
    pub fn add_edge(&mut self, source: &str, edge: GraphEdge) {
        self.edges.entry(source.to_string()).or_default().push(edge);
        // Recomputed from the map (rather than incremented) so the cached value
        // is corrected even if it was out of date beforehand.
        self.metadata.edge_count = self.edge_count();
    }

    /// Returns the node stored under `doc_id`, or `None` if there is none.
    pub fn get_node(&self, doc_id: &str) -> Option<&DocumentGraphNode> {
        self.nodes.get(doc_id)
    }

    /// Returns the outgoing edges of `doc_id`; empty when it has none.
    pub fn get_neighbors(&self, doc_id: &str) -> &[GraphEdge] {
        self.edges.get(doc_id).map_or(&[], Vec::as_slice)
    }

    /// Returns the index entries for `keyword` (exact match); empty when the
    /// keyword is not indexed.
    pub fn find_by_keyword(&self, keyword: &str) -> &[KeywordDocEntry] {
        self.keyword_index.get(keyword).map_or(&[], Vec::as_slice)
    }

    /// Number of document nodes in the graph.
    pub fn node_count(&self) -> usize {
        self.nodes.len()
    }

    /// Total number of edges, computed by summing every source's edge list.
    pub fn edge_count(&self) -> usize {
        self.edges.values().map(Vec::len).sum()
    }

    /// Iterates over the ids of all documents, in the map's (unspecified) order.
    pub fn doc_ids(&self) -> impl Iterator<Item = &str> {
        self.nodes.keys().map(String::as_str)
    }

    /// Returns the cached graph metadata.
    pub fn metadata(&self) -> &GraphMetadata {
        &self.metadata
    }

    /// Returns `true` when the graph contains no document nodes.
    pub fn is_empty(&self) -> bool {
        self.nodes.is_empty()
    }
}
impl Default for DocumentGraph {
fn default() -> Self {
Self::new()
}
}
/// A single document stored as a node in the graph.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentGraphNode {
/// Identifier of the document; used as the key in the graph's node map.
pub doc_id: String,
/// Title of the document.
pub title: String,
/// Document format label (the tests in this file use `"md"`).
pub format: String,
/// Weighted keywords of the document; each one is added to the graph's keyword index.
pub top_keywords: Vec<WeightedKeyword>,
/// NOTE(review): presumably the number of nodes inside the document's own
/// structure, not in this graph — the graph never reads it; confirm with the producer.
pub node_count: usize,
}
/// A keyword paired with a weight, as listed in a node's `top_keywords`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WeightedKeyword {
/// The keyword text; used verbatim as the keyword-index key.
pub keyword: String,
/// Weight of the keyword for its document (scale is not defined in this file).
pub weight: f32,
}
/// A directed edge to another document; the source id is the key under which
/// the edge is stored in the graph's edge map.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GraphEdge {
/// Identifier of the document this edge points to.
pub target_doc_id: String,
/// Weight of the edge (how it is computed is not shown in this file).
pub weight: f32,
/// Supporting data describing why the two documents are connected.
pub evidence: EdgeEvidence,
}
/// Keyword-overlap data attached to a [`GraphEdge`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EdgeEvidence {
/// Keywords common to the source and target documents, with both weights.
pub shared_keywords: Vec<SharedKeyword>,
/// NOTE(review): presumably the number of shared keywords; nothing in this
/// file enforces that it equals `shared_keywords.len()` — confirm with the producer.
pub shared_keyword_count: usize,
/// NOTE(review): presumably the Jaccard similarity of the two keyword sets;
/// it is computed outside this file — confirm.
pub keyword_jaccard: f32,
}
/// One keyword shared by both endpoints of an edge.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SharedKeyword {
/// The shared keyword text.
pub keyword: String,
/// Weight of the keyword on the source side of the edge.
pub source_weight: f32,
/// Weight of the keyword on the target side of the edge.
pub target_weight: f32,
}
/// A keyword-index entry: one document that lists a given keyword.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KeywordDocEntry {
/// Identifier of the document that lists the keyword.
pub doc_id: String,
/// The keyword's weight in that document, copied from its `WeightedKeyword`.
pub weight: f32,
}
/// Cached summary counts for a `DocumentGraph`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GraphMetadata {
/// Number of document nodes; refreshed from the node map whenever a node is added.
pub document_count: usize,
/// Total number of edges; refreshed when an edge is added or the edge map is replaced.
pub edge_count: usize,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `"md"`-format node from plain `(keyword, weight)` pairs.
    fn make_node(
        doc_id: &str,
        title: &str,
        keywords: &[(&str, f32)],
        node_count: usize,
    ) -> DocumentGraphNode {
        let top_keywords = keywords
            .iter()
            .map(|&(keyword, weight)| WeightedKeyword {
                keyword: keyword.to_string(),
                weight,
            })
            .collect();
        DocumentGraphNode {
            doc_id: doc_id.to_string(),
            title: title.to_string(),
            format: "md".to_string(),
            top_keywords,
            node_count,
        }
    }

    /// Builds the single "rust"-sharing edge used by the edge test.
    fn rust_edge_to(target: &str) -> GraphEdge {
        let shared = SharedKeyword {
            keyword: "rust".to_string(),
            source_weight: 0.9,
            target_weight: 0.8,
        };
        GraphEdge {
            target_doc_id: target.to_string(),
            weight: 0.5,
            evidence: EdgeEvidence {
                shared_keywords: vec![shared],
                shared_keyword_count: 1,
                keyword_jaccard: 0.3,
            },
        }
    }

    /// A freshly constructed graph has no nodes and no edges.
    #[test]
    fn test_empty_graph() {
        let graph = DocumentGraph::new();
        assert!(graph.is_empty());
        assert_eq!(graph.node_count(), 0);
        assert_eq!(graph.edge_count(), 0);
    }

    /// Adding a node stores it and indexes each of its keywords.
    #[test]
    fn test_add_node() {
        let mut graph = DocumentGraph::new();
        graph.add_node(make_node(
            "doc1",
            "Test Doc",
            &[("rust", 0.9), ("async", 0.7)],
            10,
        ));

        assert_eq!(graph.node_count(), 1);
        assert!(graph.get_node("doc1").is_some());

        let expected_hits = [("rust", 1), ("async", 1), ("missing", 0)];
        for (keyword, hits) in expected_hits {
            assert_eq!(graph.find_by_keyword(keyword).len(), hits);
        }
    }

    /// Edges are directed: only the source document gains a neighbor.
    #[test]
    fn test_add_edge() {
        let mut graph = DocumentGraph::new();
        graph.add_node(make_node("doc1", "A", &[], 5));
        graph.add_node(make_node("doc2", "B", &[], 8));
        graph.add_edge("doc1", rust_edge_to("doc2"));

        assert_eq!(graph.edge_count(), 1);

        let from_doc1 = graph.get_neighbors("doc1");
        assert_eq!(from_doc1.len(), 1);
        assert_eq!(from_doc1[0].target_doc_id, "doc2");

        assert_eq!(graph.get_neighbors("doc2").len(), 0);
    }

    /// A graph survives a JSON round trip with its node intact.
    #[test]
    fn test_serialization_roundtrip() {
        let mut graph = DocumentGraph::new();
        graph.add_node(make_node("doc1", "Test", &[("test", 1.0)], 3));

        let json = serde_json::to_string(&graph).unwrap();
        let deserialized: DocumentGraph = serde_json::from_str(&json).unwrap();

        assert_eq!(deserialized.node_count(), 1);
        let restored = deserialized.get_node("doc1").unwrap();
        assert_eq!(restored.title, "Test");
    }
}