use std::collections::HashSet;
use std::io::Write;
use itertools::Itertools;
use rayon::prelude::*;
use serde::Serialize;
use crate::graph::basic_iter::GraphNodePointer;
use crate::graph::{Graph, GraphContext};
use crate::model::is_ref_url;
use crate::model::node::{Node, NodeIter, NodePointer};
use crate::model::Key;
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct BrokenLink {
#[serde(serialize_with = "serialize_key")]
pub source_key: Key,
#[serde(serialize_with = "serialize_key")]
pub target_key: Key,
}
fn serialize_key<S: serde::Serializer>(key: &Key, serializer: S) -> Result<S::Ok, S::Error> {
serializer.serialize_str(&key.to_string())
}
fn broken_links(graph: &Graph) -> Vec<BrokenLink> {
let existing_keys: HashSet<Key> = graph.keys().into_iter().collect();
let mut seen = HashSet::new();
let mut broken = Vec::new();
for target_key in graph.inclusion_edge_target_keys() {
if !existing_keys.contains(&target_key) {
for node_id in graph.get_inclusion_edges_to(&target_key) {
let source_key = graph.key_of(node_id);
if seen.insert((source_key.clone(), target_key.clone())) {
broken.push(BrokenLink {
source_key,
target_key: target_key.clone(),
});
}
}
}
}
for target_key in graph.reference_edge_target_keys() {
if !existing_keys.contains(&target_key) && is_ref_url(&target_key.to_string()) {
for node_id in graph.get_reference_edges_to(&target_key) {
let source_key = graph.key_of(node_id);
if seen.insert((source_key.clone(), target_key.clone())) {
broken.push(BrokenLink {
source_key,
target_key: target_key.clone(),
});
}
}
}
}
broken.sort_by(|a, b| (&a.source_key, &a.target_key).cmp(&(&b.source_key, &b.target_key)));
broken
}
#[derive(Debug, Clone, Serialize, Default)]
#[serde(rename_all = "camelCase")]
pub struct KeyStatistics {
pub key: String,
pub title: String,
pub sections: usize,
pub paragraphs: usize,
pub lines: usize,
pub words: usize,
pub included_by_count: usize,
pub referenced_by_count: usize,
pub incoming_edges_count: usize,
pub includes_count: usize,
pub references_count: usize,
pub total_edges_count: usize,
pub bullet_lists: usize,
pub ordered_lists: usize,
pub code_blocks: usize,
pub tables: usize,
pub quotes: usize,
}
impl KeyStatistics {
pub fn new(key: String, title: String) -> Self {
Self {
key,
title,
..Default::default()
}
}
pub fn count_node(&mut self, node: &Node) {
match node {
Node::Section(_) => self.sections += 1,
Node::Leaf(_) => self.paragraphs += 1,
Node::BulletList() => self.bullet_lists += 1,
Node::OrderedList() => self.ordered_lists += 1,
Node::Raw(_, _) => self.code_blocks += 1,
Node::Table(_) => self.tables += 1,
Node::Quote() => self.quotes += 1,
_ => {}
}
}
pub fn merge(mut self, other: Self) -> Self {
self.sections += other.sections;
self.paragraphs += other.paragraphs;
self.lines += other.lines;
self.words += other.words;
self.included_by_count += other.included_by_count;
self.referenced_by_count += other.referenced_by_count;
self.incoming_edges_count += other.incoming_edges_count;
self.includes_count += other.includes_count;
self.references_count += other.references_count;
self.total_edges_count += other.total_edges_count;
self.bullet_lists += other.bullet_lists;
self.ordered_lists += other.ordered_lists;
self.code_blocks += other.code_blocks;
self.tables += other.tables;
self.quotes += other.quotes;
self
}
fn count_nodes_recursive<'a, T: NodeIter<'a> + NodePointer<'a>>(
iter: &T,
stats: &mut KeyStatistics,
graph: &'a Graph,
) {
if let Some(node) = iter.node() {
stats.count_node(&node);
if let Some(id) = iter.id() {
if let Some(line_id) = graph.graph_node(id).line_id() {
stats.references_count += graph.get_line(line_id).ref_keys().len();
}
}
}
if let Some(child) = iter.child() {
Self::count_nodes_recursive(&child, stats, graph);
let mut current = child;
while let Some(next) = current.next() {
Self::count_nodes_recursive(&next, stats, graph);
current = next;
}
}
}
pub fn from_graph(graph: &Graph) -> Vec<KeyStatistics> {
let mut key_stats: Vec<KeyStatistics> = graph
.keys()
.par_iter()
.map(|key| {
let key_str = key.to_string();
let title = graph.get_ref_text(key).unwrap_or_else(|| key.to_string());
let mut stats = KeyStatistics {
key: key_str,
title,
..Default::default()
};
if let Some(node_id) = graph.get_node_id(key) {
let root = GraphNodePointer::new(graph, node_id);
Self::count_nodes_recursive(&root, &mut stats, graph);
}
if let Some(content) = graph.get_document(key) {
stats.lines = content.lines().count();
stats.words = content.split_whitespace().count();
}
stats.included_by_count = graph.get_inclusion_edges_to(key).len();
stats.referenced_by_count = graph.get_reference_edges_to(key).len();
stats.incoming_edges_count = stats.included_by_count + stats.referenced_by_count;
stats.includes_count = graph.get_inclusion_edges_in(key).len();
stats.total_edges_count = stats.included_by_count
+ stats.referenced_by_count
+ stats.includes_count
+ stats.references_count;
stats
})
.collect();
key_stats.par_sort_by(|a, b| a.key.cmp(&b.key));
key_stats
}
}
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct GraphStatistics {
pub total_documents: usize,
pub total_nodes: usize,
pub total_paths: usize,
pub total_sections: usize,
pub avg_sections_per_doc: f64,
pub top_docs_by_sections: Vec<KeyStatistics>,
pub total_paragraphs: usize,
pub avg_paragraphs_per_doc: f64,
pub inclusion_edges: usize,
pub reference_edges: usize,
pub total_references: usize,
pub orphaned_documents: usize,
pub orphaned_percentage: f64,
pub leaf_documents: usize,
pub leaf_percentage: f64,
pub top_referenced: Vec<KeyStatistics>,
pub total_lines: usize,
pub avg_lines_per_doc: f64,
pub top_docs_by_lines: Vec<KeyStatistics>,
pub total_words: usize,
pub avg_words_per_doc: f64,
pub top_docs_by_words: Vec<KeyStatistics>,
pub root_sections: usize,
pub max_path_depth: usize,
pub avg_path_depth: f64,
pub bullet_lists: usize,
pub ordered_lists: usize,
pub code_blocks: usize,
pub tables: usize,
pub quotes: usize,
pub avg_refs_per_doc: f64,
pub most_connected: Vec<KeyStatistics>,
pub broken_link_count: usize,
pub broken_links: Vec<BrokenLink>,
}
impl GraphStatistics {
pub fn export_csv<W: Write>(graph: &Graph, writer: W) -> Result<(), csv::Error> {
let mut csv_writer = csv::Writer::from_writer(writer);
let key_stats = KeyStatistics::from_graph(graph);
for stat in key_stats {
csv_writer.serialize(stat)?;
}
csv_writer.flush()?;
Ok(())
}
pub fn from_graph(graph: &Graph) -> Self {
let key_stats = KeyStatistics::from_graph(graph);
let broken_links = broken_links(graph);
let all_nodes = graph.nodes();
let paths = graph.paths();
let total_nodes = all_nodes.len();
let total_paths = paths.len();
let root_sections = paths.iter().filter(|p| p.ids().len() == 1).count();
let max_path_depth = paths.iter().map(|p| p.ids().len()).max().unwrap_or(0);
let avg_path_depth = if total_paths > 0 {
paths.iter().map(|p| p.ids().len()).sum::<usize>() as f64 / total_paths as f64
} else {
0.0
};
Self::aggregate_statistics(
key_stats,
total_nodes,
total_paths,
root_sections,
max_path_depth,
avg_path_depth,
broken_links,
)
}
fn aggregate_statistics(
key_stats: Vec<KeyStatistics>,
total_nodes: usize,
total_paths: usize,
root_sections: usize,
max_path_depth: usize,
avg_path_depth: f64,
broken_links: Vec<BrokenLink>,
) -> Self {
let total_documents = key_stats.len();
let total_sections: usize = key_stats.iter().map(|ks| ks.sections).sum();
let avg_sections_per_doc = if total_documents > 0 {
total_sections as f64 / total_documents as f64
} else {
0.0
};
let top_docs_by_sections: Vec<KeyStatistics> = key_stats
.iter()
.sorted_by(|a, b| b.sections.cmp(&a.sections))
.take(10)
.cloned()
.collect();
let total_paragraphs: usize = key_stats.iter().map(|ks| ks.paragraphs).sum();
let avg_paragraphs_per_doc = if total_documents > 0 {
total_paragraphs as f64 / total_documents as f64
} else {
0.0
};
let total_incoming_block: usize = key_stats.iter().map(|ks| ks.included_by_count).sum();
let total_incoming_inline: usize = key_stats.iter().map(|ks| ks.referenced_by_count).sum();
let inclusion_edges = total_incoming_block;
let reference_edges = total_incoming_inline;
let orphaned_documents = key_stats
.iter()
.filter(|ks| ks.incoming_edges_count == 0)
.count();
let orphaned_percentage = if total_documents > 0 {
(orphaned_documents as f64 / total_documents as f64) * 100.0
} else {
0.0
};
let leaf_documents = key_stats
.iter()
.filter(|ks| ks.includes_count == 0)
.count();
let leaf_percentage = if total_documents > 0 {
(leaf_documents as f64 / total_documents as f64) * 100.0
} else {
0.0
};
let top_referenced: Vec<KeyStatistics> = key_stats
.iter()
.filter(|ks| ks.incoming_edges_count > 0)
.sorted_by(|a, b| b.incoming_edges_count.cmp(&a.incoming_edges_count))
.take(10)
.cloned()
.collect();
let total_lines: usize = key_stats.iter().map(|ks| ks.lines).sum();
let total_words: usize = key_stats.iter().map(|ks| ks.words).sum();
let avg_lines_per_doc = if total_documents > 0 {
total_lines as f64 / total_documents as f64
} else {
0.0
};
let avg_words_per_doc = if total_documents > 0 {
total_words as f64 / total_documents as f64
} else {
0.0
};
let top_docs_by_lines: Vec<KeyStatistics> = key_stats
.iter()
.sorted_by(|a, b| b.lines.cmp(&a.lines))
.take(10)
.cloned()
.collect();
let top_docs_by_words: Vec<KeyStatistics> = key_stats
.iter()
.sorted_by(|a, b| b.words.cmp(&a.words))
.take(10)
.cloned()
.collect();
let bullet_lists: usize = key_stats.iter().map(|ks| ks.bullet_lists).sum();
let ordered_lists: usize = key_stats.iter().map(|ks| ks.ordered_lists).sum();
let code_blocks: usize = key_stats.iter().map(|ks| ks.code_blocks).sum();
let tables: usize = key_stats.iter().map(|ks| ks.tables).sum();
let quotes: usize = key_stats.iter().map(|ks| ks.quotes).sum();
let most_connected: Vec<KeyStatistics> = key_stats
.iter()
.filter(|ks| ks.total_edges_count > 0)
.sorted_by(|a, b| b.total_edges_count.cmp(&a.total_edges_count))
.take(10)
.cloned()
.collect();
let avg_refs_per_doc = if total_documents > 0 {
(inclusion_edges + reference_edges) as f64 / total_documents as f64
} else {
0.0
};
GraphStatistics {
total_documents,
total_nodes,
total_paths,
total_sections,
avg_sections_per_doc,
top_docs_by_sections,
total_paragraphs,
avg_paragraphs_per_doc,
inclusion_edges,
reference_edges,
total_references: inclusion_edges + reference_edges,
orphaned_documents,
orphaned_percentage,
leaf_documents,
leaf_percentage,
top_referenced,
total_lines,
avg_lines_per_doc,
top_docs_by_lines,
total_words,
avg_words_per_doc,
top_docs_by_words,
root_sections,
max_path_depth,
avg_path_depth,
bullet_lists,
ordered_lists,
code_blocks,
tables,
quotes,
avg_refs_per_doc,
most_connected,
broken_link_count: broken_links.len(),
broken_links,
}
}
}