use std::time::Instant;
/// Arguments for one invocation of `StatsCommand::execute`.
#[derive(Debug, Clone)]
pub struct StatsArgs {
/// Path of the input file. `execute` returns `StatsError::FileNotFound` when it does not exist.
pub file: String,
/// Optional format name. When set, `execute` lowercases it and rejects any name
/// outside its fixed list of supported formats.
pub format: Option<String>,
/// Requested output format.
/// NOTE(review): `execute` does not read this field in this file; presumably the
/// caller passes it on to `StatsCommand::format_output` — confirm.
pub output: StatsOutputFormat,
/// Whether the per-predicate breakdown should be produced (forwarded to `simulate_stats`).
pub include_predicates: bool,
/// Maximum number of entries in the per-predicate breakdown (forwarded to `simulate_stats`).
pub top_k: usize,
}
/// Output renderings selectable through `StatsCommand::format_output`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StatsOutputFormat {
/// Human-readable report (rendered by `format_text`).
Text,
/// Single JSON object (rendered by `format_json`).
Json,
/// `metric,value` rows, optionally followed by a predicate table (rendered by `format_csv`).
Csv,
}
/// Usage figures for a single predicate.
#[derive(Debug, Clone)]
pub struct PredicateStats {
/// Predicate name; the simulated data uses `http://example.org/prop/<hex>` strings.
pub predicate: String,
/// Number of triples attributed to this predicate.
pub count: usize,
/// Share of the dataset on a 0–100 percentage scale; `simulated_predicates`
/// computes it as `count / total_triples * 100`.
pub pct: f64,
}
/// Summary statistics for one dataset, as returned by `StatsCommand::execute`.
#[derive(Debug, Clone)]
pub struct DatasetStats {
/// Total number of triples.
pub total_triples: usize,
/// Number of distinct subjects.
pub unique_subjects: usize,
/// Number of distinct predicates.
pub unique_predicates: usize,
/// Number of distinct objects.
pub unique_objects: usize,
/// Number of graphs (labelled "Named graphs" in the text report).
pub graphs: usize,
/// Per-predicate breakdown; empty when predicates were not requested.
pub top_predicates: Vec<PredicateStats>,
/// Number of literal objects.
pub literal_count: usize,
/// Number of IRI objects; `simulate_stats` derives it as the total minus the
/// literal and blank-node counts (saturating at zero).
pub iri_object_count: usize,
/// Number of blank-node objects.
pub blank_node_count: usize,
/// Time spent producing the statistics, in milliseconds. `simulate_stats`
/// leaves this at 0; `execute` overwrites it with the measured duration.
pub elapsed_ms: u64,
}
/// Failures reported by the stats command.
///
/// Every variant carries the offending value as a `String`, which is echoed
/// after a fixed label in the `Display` output.
#[derive(Debug)]
pub enum StatsError {
    /// The input path does not exist; holds the path as supplied.
    FileNotFound(String),
    /// Parsing failed; holds a description. Not constructed anywhere in this file.
    ParseError(String),
    /// The requested format name is not supported; holds the name as supplied.
    UnsupportedFormat(String),
}

impl std::fmt::Display for StatsError {
    /// Writes `"<label>: <payload>"` for the variant at hand.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (label, detail) = match self {
            Self::FileNotFound(path) => ("File not found", path),
            Self::ParseError(message) => ("Parse error", message),
            Self::UnsupportedFormat(name) => ("Unsupported format", name),
        };
        write!(f, "{label}: {detail}")
    }
}

impl std::error::Error for StatsError {}
/// Stateless handle for the stats command.
///
/// `Default` is derived; for a unit struct this yields the same value that
/// `StatsCommand::new()` returns.
#[derive(Default)]
pub struct StatsCommand;
impl StatsCommand {
/// Creates the command. `StatsCommand` is a unit struct, so there is no state to set up.
pub fn new() -> Self {
Self
}
pub fn execute(&self, args: &StatsArgs) -> Result<DatasetStats, StatsError> {
if let Some(ref fmt) = args.format {
let supported = [
"turtle", "ttl", "ntriples", "nt", "nquads", "nq", "trig", "jsonld", "rdfxml",
"xml",
];
if !supported.contains(&fmt.to_lowercase().as_str()) {
return Err(StatsError::UnsupportedFormat(fmt.clone()));
}
}
if !std::path::Path::new(&args.file).exists() {
return Err(StatsError::FileNotFound(args.file.clone()));
}
let start = Instant::now();
let stats = Self::simulate_stats(&args.file, args.include_predicates, args.top_k);
let elapsed_ms = start.elapsed().as_millis() as u64;
Ok(DatasetStats {
elapsed_ms,
..stats
})
}
/// Renders `stats` in the requested `format`.
///
/// Dispatches to the text, JSON or CSV renderer and returns its output unchanged.
pub fn format_output(&self, stats: &DatasetStats, format: &StatsOutputFormat) -> String {
    // Pick the renderer first, then invoke it once.
    let render: fn(&DatasetStats) -> String = match *format {
        StatsOutputFormat::Text => Self::format_text,
        StatsOutputFormat::Json => Self::format_json,
        StatsOutputFormat::Csv => Self::format_csv,
    };
    render(stats)
}
/// Produces deterministic, made-up statistics derived solely from `filename`.
///
/// A 64-bit seed is folded from the bytes of `filename`; each figure is then
/// taken from a differently shifted view of that seed, reduced into a fixed
/// range. The same file name therefore always yields the same figures.
///
/// * `include_predicates` — when `false`, `top_predicates` is left empty.
/// * `top_k` — maximum number of entries in `top_predicates`.
///
/// `elapsed_ms` is always 0 in the returned value.
pub fn simulate_stats(filename: &str, include_predicates: bool, top_k: usize) -> DatasetStats {
    // Multiply-then-add fold over the bytes; the byte's position is mixed in as well.
    let mut seed: u64 = 0xcbf29ce484222325;
    for (position, byte) in filename.bytes().enumerate() {
        seed = seed
            .wrapping_mul(0x100000001b3)
            .wrapping_add(u64::from(byte))
            .wrapping_add(position as u64);
    }

    // Reads the seed from bit `shift` upwards and reduces it modulo `modulus`.
    let window = |shift: u32, modulus: u64| (seed >> shift) % modulus;

    // total_triples lies in 10_000..100_000, so the divisors below are never zero.
    let total_triples = (window(2, 90_000) + 10_000) as usize;
    let total = total_triples as u64;
    let unique_subjects = (window(5, total / 3) + 100) as usize;
    let unique_predicates = (window(8, 200) + 5) as usize;
    let unique_objects = (window(11, total / 2) + 200) as usize;
    let graphs = (window(14, 8) + 1) as usize;
    let literal_count = window(17, total / 2) as usize;
    let blank_node_count = window(20, total / 10) as usize;
    // Whatever is neither a literal nor a blank node is counted as an IRI object.
    let iri_object_count = total_triples
        .saturating_sub(literal_count)
        .saturating_sub(blank_node_count);

    let top_predicates = include_predicates
        .then(|| Self::simulated_predicates(seed, unique_predicates, total_triples, top_k))
        .unwrap_or_default();

    DatasetStats {
        total_triples,
        unique_subjects,
        unique_predicates,
        unique_objects,
        graphs,
        top_predicates,
        literal_count,
        iri_object_count,
        blank_node_count,
        elapsed_ms: 0,
    }
}
/// Returns the fraction of the dataset covered by `stats.top_predicates`.
///
/// Adds up the `pct` values (percentages) of the listed predicates and
/// rescales the sum from the 0–100 scale to a 0–1 fraction. An empty list
/// gives zero.
pub fn predicate_coverage(stats: &DatasetStats) -> f64 {
    let total_pct = stats
        .top_predicates
        .iter()
        .map(|entry| entry.pct)
        .sum::<f64>();
    total_pct / 100.0
}
/// Renders `stats` as a human-readable report.
///
/// The report starts with a banner line, lists one figure per line, and —
/// only when `top_predicates` is non-empty — appends a "Top Predicates"
/// section with count, percentage (one decimal) and name per predicate.
fn format_text(stats: &DatasetStats) -> String {
    // Each piece already ends in a newline; the report is their concatenation.
    let mut pieces: Vec<String> = vec![
        String::from("=== RDF Dataset Statistics ===\n"),
        format!(" Total triples : {}\n", stats.total_triples),
        format!(" Unique subjects : {}\n", stats.unique_subjects),
        format!(" Unique predicates : {}\n", stats.unique_predicates),
        format!(" Unique objects : {}\n", stats.unique_objects),
        format!(" Named graphs : {}\n", stats.graphs),
        format!(" Literal objects : {}\n", stats.literal_count),
        format!(" IRI objects : {}\n", stats.iri_object_count),
        format!(" Blank node objects : {}\n", stats.blank_node_count),
        format!(" Elapsed : {} ms\n", stats.elapsed_ms),
    ];

    if !stats.top_predicates.is_empty() {
        pieces.push(String::from("\n--- Top Predicates ---\n"));
        pieces.extend(stats.top_predicates.iter().map(|entry| {
            format!(
                " {:6} ({:5.1}%) {}\n",
                entry.count, entry.pct, entry.predicate
            )
        }));
    }

    pieces.concat()
}
/// Renders `stats` as a single compact JSON object.
///
/// Scalar figures are emitted as JSON numbers under keys named after the
/// struct fields; `top_predicates` becomes an array of
/// `{"predicate", "count", "pct"}` objects with `pct` rounded to two decimals.
///
/// Predicate names are escaped as JSON strings, so names containing quotes,
/// backslashes or control characters still yield valid JSON. A non-finite
/// `pct` (NaN or infinity), which JSON cannot represent as a number, is
/// emitted as `null`.
fn format_json(stats: &DatasetStats) -> String {
    /// Escapes `raw` for placement between double quotes in a JSON document.
    fn escape_json(raw: &str) -> String {
        let mut escaped = String::with_capacity(raw.len());
        for ch in raw.chars() {
            match ch {
                '"' => escaped.push_str("\\\""),
                '\\' => escaped.push_str("\\\\"),
                '\n' => escaped.push_str("\\n"),
                '\r' => escaped.push_str("\\r"),
                '\t' => escaped.push_str("\\t"),
                // Remaining control characters must use the \uXXXX form.
                c if (c as u32) < 0x20 => {
                    escaped.push_str(&format!("\\u{:04x}", c as u32));
                }
                c => escaped.push(c),
            }
        }
        escaped
    }

    let preds_json: Vec<String> = stats
        .top_predicates
        .iter()
        .map(|p| {
            // `{:.2}` would print "NaN"/"inf", neither of which is a JSON number.
            let pct = if p.pct.is_finite() {
                format!("{:.2}", p.pct)
            } else {
                String::from("null")
            };
            format!(
                "{{\"predicate\":\"{}\",\"count\":{},\"pct\":{}}}",
                escape_json(&p.predicate),
                p.count,
                pct
            )
        })
        .collect();

    format!(
        "{{\
         \"total_triples\":{},\
         \"unique_subjects\":{},\
         \"unique_predicates\":{},\
         \"unique_objects\":{},\
         \"graphs\":{},\
         \"literal_count\":{},\
         \"iri_object_count\":{},\
         \"blank_node_count\":{},\
         \"elapsed_ms\":{},\
         \"top_predicates\":[{}]\
         }}",
        stats.total_triples,
        stats.unique_subjects,
        stats.unique_predicates,
        stats.unique_objects,
        stats.graphs,
        stats.literal_count,
        stats.iri_object_count,
        stats.blank_node_count,
        stats.elapsed_ms,
        preds_json.join(",")
    )
}
/// Renders `stats` as CSV.
///
/// The output starts with a `metric,value` table holding one row per scalar
/// figure. When `top_predicates` is non-empty, a blank line and a second
/// table with the header `predicate,count,pct` follow; `pct` is rounded to
/// two decimals.
///
/// Predicate names containing a comma, double quote, CR or LF are wrapped in
/// double quotes with embedded quotes doubled, so such names (commas are
/// legal in IRIs) cannot break the row structure.
fn format_csv(stats: &DatasetStats) -> String {
    /// Quotes `raw` when it contains a character that is special in CSV.
    fn csv_field(raw: &str) -> String {
        let needs_quoting = raw.chars().any(|c| matches!(c, ',' | '"' | '\n' | '\r'));
        if needs_quoting {
            format!("\"{}\"", raw.replace('"', "\"\""))
        } else {
            raw.to_string()
        }
    }

    let metrics: [(&str, String); 9] = [
        ("total_triples", stats.total_triples.to_string()),
        ("unique_subjects", stats.unique_subjects.to_string()),
        ("unique_predicates", stats.unique_predicates.to_string()),
        ("unique_objects", stats.unique_objects.to_string()),
        ("graphs", stats.graphs.to_string()),
        ("literal_count", stats.literal_count.to_string()),
        ("iri_object_count", stats.iri_object_count.to_string()),
        ("blank_node_count", stats.blank_node_count.to_string()),
        ("elapsed_ms", stats.elapsed_ms.to_string()),
    ];

    let mut out = String::from("metric,value\n");
    for (name, value) in &metrics {
        out.push_str(name);
        out.push(',');
        out.push_str(value);
        out.push('\n');
    }

    if !stats.top_predicates.is_empty() {
        // The blank line separates the two tables.
        out.push_str("\npredicate,count,pct\n");
        for ps in &stats.top_predicates {
            out.push_str(&format!(
                "{},{},{:.2}\n",
                csv_field(&ps.predicate),
                ps.count,
                ps.pct
            ));
        }
    }
    out
}
/// Builds the made-up per-predicate breakdown used by `simulate_stats`.
///
/// At most 200 candidate predicates are generated (fewer when
/// `unique_predicates` is smaller). Each candidate gets a name of the form
/// `http://example.org/prop/<8 hex digits>` and a count between 1 and
/// `total_triples / candidates` (at least 1). Candidates are ordered by
/// count, highest first, with ties kept in generation order, and the first
/// `top_k` are returned with `pct = count / total_triples * 100`.
fn simulated_predicates(
    seed: u64,
    unique_predicates: usize,
    total_triples: usize,
    top_k: usize,
) -> Vec<PredicateStats> {
    let candidate_count = unique_predicates.min(200);
    if candidate_count == 0 {
        // Nothing to generate; also keeps the division below well-defined.
        return Vec::new();
    }

    let base = seed.wrapping_mul(0x9e3779b97f4a7c15);
    let bucket = (total_triples as u64 / candidate_count as u64).max(1);

    let mut ranked: Vec<(String, usize)> = Vec::with_capacity(candidate_count);
    for index in 0..candidate_count {
        let hash = base.wrapping_add(index as u64);
        let count = (hash % bucket + 1) as usize;
        // Only the low 32 bits of the hash go into the name.
        let name = format!("http://example.org/prop/{:08x}", hash & 0xffff_ffff);
        ranked.push((name, count));
    }

    // Stable sort: equal counts stay in generation order.
    ranked.sort_by(|left, right| right.1.cmp(&left.1));
    ranked.truncate(top_k);

    let denom = total_triples.max(1) as f64;
    let mut result = Vec::with_capacity(ranked.len());
    for (predicate, count) in ranked {
        result.push(PredicateStats {
            predicate,
            count,
            pct: (count as f64 / denom) * 100.0,
        });
    }
    result
}
/// Reports whether `preds` is ordered by `count` from highest to lowest.
///
/// Equal neighbouring counts are accepted; empty and single-element slices
/// are trivially ordered.
// Only referenced from the unit tests, hence the lint exemption.
#[allow(dead_code)]
fn predicates_sorted_descending(preds: &[PredicateStats]) -> bool {
    preds
        .iter()
        .zip(preds.iter().skip(1))
        .all(|(earlier, later)| earlier.count >= later.count)
}
}
// Unit tests for the stats command: `execute` validation, the deterministic
// simulated figures, `predicate_coverage`, and the three output renderers.
#[cfg(test)]
mod tests {
use super::*;
// Writes a small placeholder `.nt` file into the system temp directory and
// returns its path as a string.
// NOTE(review): `unwrap_or_default()` discards a failed write, so a test that
// relies on the file would later fail with `FileNotFound` instead of an I/O error.
fn tmp_file(name: &str) -> String {
let dir = std::env::temp_dir();
let path = dir.join(format!("oxirs_stats_test_{name}.nt"));
std::fs::write(&path, "# temp RDF file for tests\n").unwrap_or_default();
path.to_string_lossy().to_string()
}
// Builds default arguments for `file`: no explicit format, text output,
// predicates enabled, top_k = 10.
fn make_args(file: &str) -> StatsArgs {
StatsArgs {
file: file.to_string(),
format: None,
output: StatsOutputFormat::Text,
include_predicates: true,
top_k: 10,
}
}
// --- execute: validation and success paths ---
#[test]
fn test_execute_file_not_found() {
let cmd = StatsCommand::new();
let args = make_args("/nonexistent/path/file.nt");
let err = cmd.execute(&args).expect_err("should fail");
assert!(matches!(err, StatsError::FileNotFound(_)));
}
#[test]
fn test_execute_unsupported_format() {
let path = tmp_file("unsupported");
let cmd = StatsCommand::new();
let args = StatsArgs {
file: path,
format: Some("binary-rdf".to_string()),
output: StatsOutputFormat::Text,
include_predicates: false,
top_k: 5,
};
let err = cmd.execute(&args).expect_err("unsupported format");
assert!(matches!(err, StatsError::UnsupportedFormat(_)));
}
#[test]
fn test_execute_valid_file_succeeds() {
let path = tmp_file("valid");
let cmd = StatsCommand::new();
let args = make_args(&path);
let stats = cmd.execute(&args).expect("should succeed");
assert!(stats.total_triples > 0);
}
#[test]
fn test_execute_supported_format_turtle() {
let path = tmp_file("turtle");
let cmd = StatsCommand::new();
let args = StatsArgs {
file: path,
format: Some("turtle".to_string()),
output: StatsOutputFormat::Text,
include_predicates: false,
top_k: 5,
};
cmd.execute(&args).expect("turtle is a supported format");
}
#[test]
fn test_execute_supported_format_ntriples() {
let path = tmp_file("nt");
let cmd = StatsCommand::new();
let args = StatsArgs {
file: path,
format: Some("ntriples".to_string()),
output: StatsOutputFormat::Text,
include_predicates: false,
top_k: 5,
};
cmd.execute(&args).expect("ntriples is a supported format");
}
// --- simulate_stats: determinism and value ranges ---
#[test]
fn test_simulate_stats_deterministic() {
let s1 = StatsCommand::simulate_stats("test.nt", true, 10);
let s2 = StatsCommand::simulate_stats("test.nt", true, 10);
assert_eq!(
s1.total_triples, s2.total_triples,
"simulate_stats is deterministic"
);
assert_eq!(s1.unique_subjects, s2.unique_subjects);
}
// NOTE(review): relies on these two particular names hashing to different
// totals; the figures are deterministic, so the outcome is stable.
#[test]
fn test_simulate_stats_different_filenames() {
let s1 = StatsCommand::simulate_stats("file_a.nt", true, 10);
let s2 = StatsCommand::simulate_stats("file_b.nt", true, 10);
assert_ne!(
s1.total_triples, s2.total_triples,
"different filenames should typically produce different stats"
);
}
#[test]
fn test_simulate_stats_total_triples_positive() {
let s = StatsCommand::simulate_stats("any.nt", false, 5);
assert!(s.total_triples > 0);
}
#[test]
fn test_simulate_stats_unique_subjects_positive() {
let s = StatsCommand::simulate_stats("any.nt", false, 5);
assert!(s.unique_subjects > 0);
}
#[test]
fn test_simulate_stats_unique_predicates_positive() {
let s = StatsCommand::simulate_stats("any.nt", false, 5);
assert!(s.unique_predicates > 0);
}
#[test]
fn test_simulate_stats_unique_objects_positive() {
let s = StatsCommand::simulate_stats("any.nt", false, 5);
assert!(s.unique_objects > 0);
}
#[test]
fn test_simulate_stats_no_predicates_when_disabled() {
let s = StatsCommand::simulate_stats("any.nt", false, 10);
assert!(
s.top_predicates.is_empty(),
"include_predicates=false → empty list"
);
}
#[test]
fn test_simulate_stats_top_k_limits_predicates() {
let s = StatsCommand::simulate_stats("any.nt", true, 3);
assert!(
s.top_predicates.len() <= 3,
"top_k=3 must limit to at most 3 predicates"
);
}
#[test]
fn test_simulate_stats_top_predicates_sorted_descending() {
let s = StatsCommand::simulate_stats("any.nt", true, 10);
assert!(
StatsCommand::predicates_sorted_descending(&s.top_predicates),
"top_predicates must be sorted descending by count"
);
}
#[test]
fn test_simulate_stats_predicate_pct_positive() {
let s = StatsCommand::simulate_stats("any.nt", true, 10);
for p in &s.top_predicates {
assert!(p.pct > 0.0, "pct must be positive");
}
}
#[test]
fn test_simulate_stats_graphs_at_least_one() {
let s = StatsCommand::simulate_stats("any.nt", false, 5);
assert!(s.graphs >= 1);
}
// --- predicate_coverage on hand-built statistics ---
#[test]
fn test_predicate_coverage_empty_is_zero() {
let s = DatasetStats {
total_triples: 100,
unique_subjects: 10,
unique_predicates: 5,
unique_objects: 20,
graphs: 1,
top_predicates: vec![],
literal_count: 10,
iri_object_count: 80,
blank_node_count: 10,
elapsed_ms: 0,
};
assert_eq!(StatsCommand::predicate_coverage(&s), 0.0);
}
#[test]
fn test_predicate_coverage_sums_correctly() {
let s = DatasetStats {
total_triples: 100,
unique_subjects: 10,
unique_predicates: 2,
unique_objects: 20,
graphs: 1,
top_predicates: vec![
PredicateStats {
predicate: "p1".to_string(),
count: 60,
pct: 60.0,
},
PredicateStats {
predicate: "p2".to_string(),
count: 40,
pct: 40.0,
},
],
literal_count: 10,
iri_object_count: 80,
blank_node_count: 10,
elapsed_ms: 0,
};
let coverage = StatsCommand::predicate_coverage(&s);
assert!(
(coverage - 1.0).abs() < 1e-9,
"coverage should be 1.0 (100%)"
);
}
#[test]
fn test_predicate_coverage_partial() {
let s = DatasetStats {
total_triples: 100,
unique_subjects: 10,
unique_predicates: 5,
unique_objects: 20,
graphs: 1,
top_predicates: vec![PredicateStats {
predicate: "p".to_string(),
count: 50,
pct: 50.0,
}],
literal_count: 10,
iri_object_count: 80,
blank_node_count: 10,
elapsed_ms: 0,
};
let coverage = StatsCommand::predicate_coverage(&s);
assert!((coverage - 0.5).abs() < 1e-9, "coverage 50/100 = 0.5");
}
// --- format_output: text, JSON and CSV renderings ---
#[test]
fn test_format_text_contains_total_triples() {
let s = StatsCommand::simulate_stats("f.nt", false, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Text);
assert!(
out.to_lowercase().contains("total"),
"text output should mention total (case-insensitive)"
);
}
#[test]
fn test_format_text_contains_subjects() {
let s = StatsCommand::simulate_stats("f.nt", false, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Text);
assert!(out.contains("subject"));
}
#[test]
fn test_format_text_contains_predicates_section_when_present() {
let s = StatsCommand::simulate_stats("f.nt", true, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Text);
if !s.top_predicates.is_empty() {
assert!(out.contains("Predicate") || out.contains("predicate"));
}
}
#[test]
fn test_format_json_is_json_like() {
let s = StatsCommand::simulate_stats("f.nt", false, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Json);
assert!(out.starts_with('{') && out.ends_with('}'));
}
#[test]
fn test_format_json_contains_total_triples() {
let s = StatsCommand::simulate_stats("f.nt", false, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Json);
assert!(out.contains("\"total_triples\""));
}
#[test]
fn test_format_json_contains_top_predicates_key() {
let s = StatsCommand::simulate_stats("f.nt", true, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Json);
assert!(out.contains("\"top_predicates\""));
}
#[test]
fn test_format_csv_header() {
let s = StatsCommand::simulate_stats("f.nt", false, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Csv);
assert!(out.starts_with("metric,value"));
}
#[test]
fn test_format_csv_contains_total_triples() {
let s = StatsCommand::simulate_stats("f.nt", false, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Csv);
assert!(out.contains("total_triples"));
}
#[test]
fn test_format_csv_predicate_rows_when_present() {
let s = StatsCommand::simulate_stats("f.nt", true, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Csv);
if !s.top_predicates.is_empty() {
assert!(out.contains("predicate,count,pct"));
}
}
// --- StatsOutputFormat equality ---
#[test]
fn test_stats_output_format_text_variant() {
let f = StatsOutputFormat::Text;
assert_eq!(f, StatsOutputFormat::Text);
}
#[test]
fn test_stats_output_format_json_variant() {
let f = StatsOutputFormat::Json;
assert_eq!(f, StatsOutputFormat::Json);
}
#[test]
fn test_stats_output_format_csv_variant() {
let f = StatsOutputFormat::Csv;
assert_eq!(f, StatsOutputFormat::Csv);
}
// --- edge cases and additional checks ---
#[test]
fn test_top_k_zero_empty_predicates() {
let s = StatsCommand::simulate_stats("f.nt", true, 0);
assert!(s.top_predicates.is_empty(), "top_k=0 → empty predicates");
}
// Only checks that the field is present and readable; no value is asserted.
#[test]
fn test_execute_elapsed_ms_is_a_number() {
let path = tmp_file("elapsed");
let cmd = StatsCommand::new();
let args = make_args(&path);
let stats = cmd.execute(&args).expect("should succeed");
let _ = stats.elapsed_ms;
}
// NOTE(review): the next four tests repeat the earlier `test_simulate_stats_*`
// checks with "test.nt" instead of "any.nt" as the file name.
#[test]
fn test_simulate_unique_subjects_positive() {
let s = StatsCommand::simulate_stats("test.nt", false, 5);
assert!(s.unique_subjects > 0);
}
#[test]
fn test_simulate_unique_predicates_positive() {
let s = StatsCommand::simulate_stats("test.nt", false, 5);
assert!(s.unique_predicates > 0);
}
#[test]
fn test_simulate_unique_objects_positive() {
let s = StatsCommand::simulate_stats("test.nt", false, 5);
assert!(s.unique_objects > 0);
}
#[test]
fn test_simulate_graphs_at_least_one() {
let s = StatsCommand::simulate_stats("test.nt", false, 5);
assert!(s.graphs >= 1);
}
#[test]
fn test_simulate_top_predicates_capped_at_top_k() {
let top_k = 3;
let s = StatsCommand::simulate_stats("test.nt", true, top_k);
assert!(
s.top_predicates.len() <= top_k,
"top_predicates.len() {} should be <= top_k {}",
s.top_predicates.len(),
top_k
);
}
#[test]
fn test_simulate_predicate_pct_in_range() {
let s = StatsCommand::simulate_stats("test.nt", true, 10);
for ps in &s.top_predicates {
assert!(
ps.pct >= 0.0 && ps.pct <= 100.0,
"pct {} must be in [0, 100]",
ps.pct
);
}
}
// Counts braces only; this does not validate the JSON.
#[test]
fn test_format_json_curly_braces() {
let s = StatsCommand::simulate_stats("f.nt", true, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Json);
let open = out.chars().filter(|&c| c == '{').count();
let close = out.chars().filter(|&c| c == '}').count();
assert_eq!(open, close, "JSON braces must be balanced");
}
#[test]
fn test_format_text_contains_predicates_word() {
let s = StatsCommand::simulate_stats("f.nt", false, 5);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Text);
assert!(
out.to_lowercase().contains("predicate"),
"text output should mention predicates"
);
}
#[test]
fn test_format_csv_lines_count() {
let s = StatsCommand::simulate_stats("f.nt", false, 0);
let out = StatsCommand::new().format_output(&s, &StatsOutputFormat::Csv);
let lines: Vec<&str> = out.lines().collect();
assert!(lines.len() >= 9, "CSV should have header + stats rows");
}
#[test]
fn test_simulate_different_files_different_stats() {
let s1 = StatsCommand::simulate_stats("alpha.nt", false, 5);
let s2 = StatsCommand::simulate_stats("beta.nt", false, 5);
let some_differ = s1.total_triples != s2.total_triples
|| s1.unique_subjects != s2.unique_subjects
|| s1.unique_objects != s2.unique_objects;
assert!(
some_differ,
"different filenames should produce different stats"
);
}
#[test]
fn test_predicate_stats_fields() {
let ps = PredicateStats {
predicate: "http://example.org/prop".to_string(),
count: 42,
pct: 4.2,
};
assert_eq!(ps.predicate, "http://example.org/prop");
assert_eq!(ps.count, 42);
assert!((ps.pct - 4.2).abs() < 1e-9);
}
#[test]
fn test_stats_error_display_file_not_found() {
let err = StatsError::FileNotFound("/some/path.nt".to_string());
let msg = format!("{err}");
assert!(
msg.contains("/some/path.nt"),
"error message should contain path"
);
}
}