use crate::client::apis::configuration::Configuration;
use crate::client::apis::default_api;
use crate::models::{FileModel, JobModel, RoCrateEntityModel};
use chrono::{DateTime, Utc};
use log::{debug, warn};
use sha2::{Digest, Sha256};
use std::fs::File;
use std::io::{BufReader, Read as IoRead};
use std::path::Path;
/// Computes the SHA-256 digest of the file at `path`.
///
/// The file is streamed through the hasher in fixed-size chunks, so the whole
/// file is never held in memory at once.
///
/// Returns the digest as a lowercase hexadecimal string, or `None` when the
/// file cannot be opened or a read fails. Failures are logged at debug level
/// instead of being propagated to the caller.
pub fn compute_file_sha256(path: &str) -> Option<String> {
    let file = match File::open(path) {
        Ok(f) => f,
        Err(e) => {
            debug!("Cannot open file for SHA256 computation '{}': {}", path, e);
            return None;
        }
    };
    let mut reader = BufReader::new(file);
    let mut hasher = Sha256::new();
    let mut buffer = [0u8; 8192];
    loop {
        match reader.read(&mut buffer) {
            // A zero-length read signals end of file.
            Ok(0) => break,
            Ok(n) => hasher.update(&buffer[..n]),
            // An interrupted read is transient and transferred no data, so it
            // is retried rather than discarding the hash.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => {
                debug!("Error reading file for SHA256 '{}': {}", path, e);
                return None;
            }
        }
    }
    Some(format!("{:x}", hasher.finalize()))
}
/// Builds the RO-Crate `File` entity describing `file`.
///
/// The metadata always contains `@id` (the file path), `@type`, `name` (the
/// final path component, or the whole path when there is none),
/// `encodingFormat` (MIME type guessed from the path, falling back to
/// `application/octet-stream`) and `torc:run_id`. `contentSize`, `sha256` and
/// `dateModified` are added only when the corresponding value is available.
///
/// The returned model carries no `id`, and its `metadata` field holds the
/// JSON document serialized to a string.
pub fn build_file_entity(
    workflow_id: i64,
    run_id: i64,
    file: &FileModel,
    content_size: Option<u64>,
    sha256: Option<String>,
) -> RoCrateEntityModel {
    let path = &file.path;

    let display_name = match Path::new(path).file_name() {
        Some(component) => component.to_string_lossy().into_owned(),
        None => path.clone(),
    };
    let encoding_format = match mime_guess::from_path(path).first() {
        Some(guess) => guess.to_string(),
        None => "application/octet-stream".to_string(),
    };

    let mut doc = serde_json::json!({
        "@id": path,
        "@type": "File",
        "name": display_name,
        "encodingFormat": encoding_format,
        "torc:run_id": run_id
    });

    if let Some(byte_count) = content_size {
        doc["contentSize"] = serde_json::Value::from(byte_count);
    }
    if let Some(digest) = sha256 {
        doc["sha256"] = serde_json::Value::from(digest);
    }
    if let Some(mtime) = file.st_mtime {
        // The timestamp is cut down to whole seconds; a value that cannot be
        // represented falls back to the current time.
        let modified = match DateTime::<Utc>::from_timestamp(mtime as i64, 0) {
            Some(timestamp) => timestamp,
            None => Utc::now(),
        };
        doc["dateModified"] = serde_json::Value::from(modified.to_rfc3339());
    }

    RoCrateEntityModel {
        id: None,
        workflow_id,
        file_id: file.id,
        entity_id: path.clone(),
        entity_type: "File".to_string(),
        metadata: doc.to_string(),
    }
}
/// Builds the RO-Crate `File` entity for `file` and links it to the job
/// attempt that produced it.
///
/// The metadata has the same fields as a plain file entity (`@id`, `@type`,
/// `name`, `encodingFormat`, `torc:run_id`, plus the optional `contentSize`,
/// `sha256` and `dateModified`) and additionally a `wasGeneratedBy` reference
/// whose `@id` is `#job-{job_id}-attempt-{attempt_id}`.
///
/// The returned model carries no `id`, and its `metadata` field holds the
/// JSON document serialized to a string.
pub fn build_file_entity_with_provenance(
    workflow_id: i64,
    run_id: i64,
    file: &FileModel,
    content_size: Option<u64>,
    sha256: Option<String>,
    job_id: i64,
    attempt_id: i64,
) -> RoCrateEntityModel {
    let path = &file.path;
    let producer_ref = format!("#job-{}-attempt-{}", job_id, attempt_id);

    let display_name = match Path::new(path).file_name() {
        Some(component) => component.to_string_lossy().into_owned(),
        None => path.clone(),
    };
    let encoding_format = match mime_guess::from_path(path).first() {
        Some(guess) => guess.to_string(),
        None => "application/octet-stream".to_string(),
    };

    let mut doc = serde_json::json!({
        "@id": path,
        "@type": "File",
        "name": display_name,
        "encodingFormat": encoding_format,
        "wasGeneratedBy": { "@id": producer_ref },
        "torc:run_id": run_id
    });

    if let Some(byte_count) = content_size {
        doc["contentSize"] = serde_json::Value::from(byte_count);
    }
    if let Some(digest) = sha256 {
        doc["sha256"] = serde_json::Value::from(digest);
    }
    if let Some(mtime) = file.st_mtime {
        // The timestamp is cut down to whole seconds; a value that cannot be
        // represented falls back to the current time.
        let modified = match DateTime::<Utc>::from_timestamp(mtime as i64, 0) {
            Some(timestamp) => timestamp,
            None => Utc::now(),
        };
        doc["dateModified"] = serde_json::Value::from(modified.to_rfc3339());
    }

    RoCrateEntityModel {
        id: None,
        workflow_id,
        file_id: file.id,
        entity_id: path.clone(),
        entity_type: "File".to_string(),
        metadata: doc.to_string(),
    }
}
/// Builds the RO-Crate `CreateAction` entity for one attempt of `job`.
///
/// The entity id is `#job-{id}-attempt-{attempt_id}`, where a job without an
/// id is rendered as `0`. The metadata names the job, points `instrument` at
/// `#workflow-{workflow_id}`, and lists every path in `output_file_paths` as
/// an `@id` reference under `result`.
pub fn build_create_action_entity(
    workflow_id: i64,
    run_id: i64,
    job: &JobModel,
    attempt_id: i64,
    output_file_paths: &[String],
) -> RoCrateEntityModel {
    let job_id = job.id.unwrap_or(0);
    let action_id = format!("#job-{}-attempt-{}", job_id, attempt_id);
    let workflow_ref = format!("#workflow-{}", workflow_id);

    let mut produced: Vec<serde_json::Value> = Vec::with_capacity(output_file_paths.len());
    for output_path in output_file_paths {
        produced.push(serde_json::json!({ "@id": output_path }));
    }

    let doc = serde_json::json!({
        "@id": action_id,
        "@type": "CreateAction",
        "name": job.name,
        "instrument": { "@id": workflow_ref },
        "result": produced,
        "torc:run_id": run_id
    });

    RoCrateEntityModel {
        id: None,
        workflow_id,
        file_id: None,
        entity_id: action_id,
        entity_type: "CreateAction".to_string(),
        metadata: doc.to_string(),
    }
}
/// Looks up the RO-Crate entity attached to `file_id` within a workflow.
///
/// Lists the workflow's entities and returns the first one whose `file_id`
/// matches. Returns `None` when nothing matches, when the response has no
/// items, or when the listing call fails (the failure is logged as a warning).
///
/// NOTE(review): the listing is requested with `None, None` for its optional
/// arguments — presumably offset/limit. If the server applies a default page
/// size, matches beyond the first page would be missed; confirm against the
/// API.
pub fn find_entity_for_file(
    config: &Configuration,
    workflow_id: i64,
    file_id: i64,
) -> Option<RoCrateEntityModel> {
    let response = match default_api::list_ro_crate_entities(config, workflow_id, None, None) {
        Ok(listing) => listing,
        Err(e) => {
            warn!("Failed to check for existing RO-Crate entities: {}", e);
            return None;
        }
    };
    response
        .items?
        .into_iter()
        .find(|candidate| candidate.file_id == Some(file_id))
}
/// Reports whether the workflow already has an RO-Crate entity for `file_id`.
///
/// This is a boolean view of [`find_entity_for_file`], so a failed lookup is
/// reported as `false`.
pub fn entity_exists_for_file(config: &Configuration, workflow_id: i64, file_id: i64) -> bool {
    let existing = find_entity_for_file(config, workflow_id, file_id);
    existing.is_some()
}
/// Creates or updates the RO-Crate `File` entity for `file` on the server.
///
/// The file's SHA-256 is computed from `file.path` and the entity is built
/// first; the server is then asked whether an entity for this file already
/// exists. An existing entity is updated in place, otherwise a new one is
/// created.
///
/// This is best-effort: a file without an id, an existing entity without an
/// id, and API failures are all logged as warnings and otherwise ignored.
pub fn create_ro_crate_entity_for_file(
    config: &Configuration,
    workflow_id: i64,
    run_id: i64,
    file: &FileModel,
    content_size: Option<u64>,
) {
    let file_id = if let Some(id) = file.id {
        id
    } else {
        warn!("Cannot create RO-Crate entity: file has no ID");
        return;
    };

    let digest = compute_file_sha256(&file.path);
    let mut candidate = build_file_entity(workflow_id, run_id, file, content_size, digest);

    match find_entity_for_file(config, workflow_id, file_id) {
        Some(existing) => {
            let entity_id = match existing.id {
                Some(id) => id,
                None => {
                    warn!("Existing entity has no ID, cannot update");
                    return;
                }
            };
            // Reuse the freshly built entity, stamped with the stored id.
            candidate.id = Some(entity_id);
            match default_api::update_ro_crate_entity(config, entity_id, candidate) {
                Ok(_) => {
                    debug!(
                        "Updated RO-Crate entity for file '{}' (entity_id={})",
                        file.path, entity_id
                    );
                }
                Err(e) => {
                    warn!(
                        "Failed to update RO-Crate entity for file '{}': {}",
                        file.path, e
                    );
                }
            }
        }
        None => match default_api::create_ro_crate_entity(config, candidate) {
            Ok(created) => {
                debug!(
                    "Created RO-Crate entity for file '{}' (entity_id={})",
                    file.path,
                    created.id.unwrap_or(0)
                );
            }
            Err(e) => {
                warn!(
                    "Failed to create RO-Crate entity for file '{}': {}",
                    file.path, e
                );
            }
        },
    }
}
/// Creates or updates the RO-Crate `File` entity for a job output file,
/// including a provenance link to the job attempt that produced it.
///
/// The file's SHA-256 is computed from `file.path` and the entity is built
/// with a `wasGeneratedBy` reference for `job_id` / `attempt_id`; the server
/// is then asked whether an entity for this file already exists. An existing
/// entity is updated in place, otherwise a new one is created.
///
/// This is best-effort: a file without an id, an existing entity without an
/// id, and API failures are all logged as warnings and otherwise ignored.
pub fn create_ro_crate_entity_for_output_file(
    config: &Configuration,
    workflow_id: i64,
    run_id: i64,
    file: &FileModel,
    content_size: Option<u64>,
    job_id: i64,
    attempt_id: i64,
) {
    let file_id = if let Some(id) = file.id {
        id
    } else {
        warn!("Cannot create RO-Crate entity: file has no ID");
        return;
    };

    let digest = compute_file_sha256(&file.path);
    let mut candidate = build_file_entity_with_provenance(
        workflow_id,
        run_id,
        file,
        content_size,
        digest,
        job_id,
        attempt_id,
    );

    match find_entity_for_file(config, workflow_id, file_id) {
        Some(existing) => {
            let entity_id = match existing.id {
                Some(id) => id,
                None => {
                    warn!("Existing entity has no ID, cannot update");
                    return;
                }
            };
            // Reuse the freshly built entity, stamped with the stored id.
            candidate.id = Some(entity_id);
            match default_api::update_ro_crate_entity(config, entity_id, candidate) {
                Ok(_) => {
                    debug!(
                        "Updated RO-Crate entity for output file '{}' with provenance (entity_id={})",
                        file.path, entity_id
                    );
                }
                Err(e) => {
                    warn!(
                        "Failed to update RO-Crate entity for output file '{}': {}",
                        file.path, e
                    );
                }
            }
        }
        None => match default_api::create_ro_crate_entity(config, candidate) {
            Ok(created) => {
                debug!(
                    "Created RO-Crate entity for output file '{}' (entity_id={})",
                    file.path,
                    created.id.unwrap_or(0)
                );
            }
            Err(e) => {
                warn!(
                    "Failed to create RO-Crate entity for output file '{}': {}",
                    file.path, e
                );
            }
        },
    }
}
/// Builds the `CreateAction` entity for one attempt of `job` and submits it
/// to the server.
///
/// This is best-effort: a failed request is logged as a warning and nothing is
/// returned to the caller either way.
pub fn create_create_action_entity(
    config: &Configuration,
    workflow_id: i64,
    run_id: i64,
    job: &JobModel,
    attempt_id: i64,
    output_file_paths: &[String],
) {
    let action = build_create_action_entity(workflow_id, run_id, job, attempt_id, output_file_paths);
    let outcome = default_api::create_ro_crate_entity(config, action);
    match outcome {
        Err(e) => {
            warn!(
                "Failed to create RO-Crate CreateAction entity for job '{}': {}",
                job.name, e
            );
        }
        Ok(created) => {
            debug!(
                "Created RO-Crate CreateAction entity for job '{}' (entity_id={})",
                job.name,
                created.id.unwrap_or(0)
            );
        }
    }
}
/// Creates or updates RO-Crate `File` entities for a set of input files.
///
/// Only files that have an `st_mtime` recorded are processed; the rest are
/// skipped. The content size is read from the filesystem and omitted when the
/// file's metadata cannot be read.
pub fn create_entities_for_input_files(
    config: &Configuration,
    workflow_id: i64,
    run_id: i64,
    files: &[FileModel],
) {
    let with_mtime = files.iter().filter(|candidate| candidate.st_mtime.is_some());
    for file in with_mtime {
        let content_size = match std::fs::metadata(&file.path) {
            Ok(meta) => Some(meta.len()),
            Err(_) => None,
        };
        create_ro_crate_entity_for_file(config, workflow_id, run_id, file, content_size);
    }
}
/// Details gathered about one executable on the local machine, used to
/// describe it as a `SoftwareApplication` RO-Crate entity.
struct BinaryInfo {
/// Name the binary was looked up by; also used in the entity id.
name: &'static str,
/// Location where the binary was found, rendered as a display string.
path: String,
/// First trimmed line of the binary's `--version` stdout; `"unknown"` when
/// the command could not be run.
version: String,
/// SHA-256 of the binary as a hex string, if it could be computed.
sha256: Option<String>,
/// Size of the binary in bytes, if its filesystem metadata could be read.
file_size: Option<u64>,
}
/// Locates the executable `name` and gathers identifying details about it.
///
/// The directory of the currently running executable is checked first, then
/// each directory listed in `PATH`. The platform's executable suffix (`.exe`
/// on Windows, empty elsewhere) is appended to `name` for the lookup unless
/// `name` already ends with it, so the same logical name resolves on every
/// platform.
///
/// Once found, the binary is run with `--version` and the first line of its
/// stdout becomes the version. If the command cannot be run, or prints
/// nothing on stdout, the version is recorded as `"unknown"`. The SHA-256 and
/// file size are best-effort and left as `None` when unavailable.
///
/// Returns `None` when no matching file is found.
fn detect_binary(name: &'static str) -> Option<BinaryInfo> {
    // On platforms with an empty suffix `ends_with("")` is always true, so the
    // name is used unchanged there.
    let suffix = std::env::consts::EXE_SUFFIX;
    let file_name = if name.ends_with(suffix) {
        name.to_string()
    } else {
        format!("{}{}", name, suffix)
    };

    let path = std::env::current_exe()
        .ok()
        .and_then(|exe| exe.parent().map(|dir| dir.join(&file_name)))
        .filter(|p| p.is_file())
        .or_else(|| {
            std::env::var_os("PATH").and_then(|paths| {
                std::env::split_paths(&paths)
                    .map(|dir| dir.join(&file_name))
                    .find(|p| p.is_file())
            })
        });
    let path = match path {
        Some(p) => p,
        None => {
            debug!("Binary '{}' not found", name);
            return None;
        }
    };
    let path_str = path.display().to_string();

    let version = match std::process::Command::new(&path).arg("--version").output() {
        Ok(output) => {
            let stdout = String::from_utf8_lossy(&output.stdout);
            let first_line = stdout.lines().next().unwrap_or("").trim();
            if first_line.is_empty() {
                // Keep the "no version available" marker consistent with the
                // spawn-failure branch instead of recording an empty string.
                debug!(
                    "Binary '{}' printed no version on stdout ({})",
                    name, output.status
                );
                "unknown".to_string()
            } else {
                first_line.to_string()
            }
        }
        Err(e) => {
            debug!("Could not get version for '{}': {}", name, e);
            "unknown".to_string()
        }
    };

    let sha256 = compute_file_sha256(&path_str);
    let file_size = std::fs::metadata(&path).ok().map(|m| m.len());
    Some(BinaryInfo {
        name,
        path: path_str,
        version,
        sha256,
        file_size,
    })
}
/// Builds the RO-Crate `SoftwareApplication` entity for a detected binary.
///
/// The entity id is `#software-{name}-run-{run_id}`. The metadata records the
/// binary's name, version and path (as `url`) together with `torc:run_id`;
/// `contentSize` and `sha256` are included only when known.
fn build_software_entity(workflow_id: i64, run_id: i64, info: &BinaryInfo) -> RoCrateEntityModel {
    let entity_id = format!("#software-{}-run-{}", info.name, run_id);

    let mut doc = serde_json::json!({
        "@id": entity_id,
        "@type": "SoftwareApplication",
        "name": info.name,
        "version": info.version,
        "url": info.path,
        "torc:run_id": run_id,
    });

    if let Some(byte_count) = info.file_size {
        doc["contentSize"] = serde_json::Value::from(byte_count);
    }
    if let Some(digest) = info.sha256.as_deref() {
        doc["sha256"] = serde_json::Value::from(digest);
    }

    RoCrateEntityModel {
        id: None,
        workflow_id,
        file_id: None,
        entity_id,
        entity_type: "SoftwareApplication".to_string(),
        metadata: doc.to_string(),
    }
}
/// Records the torc binaries used for a run as `SoftwareApplication` entities.
///
/// `torc` is always considered; `torc-slurm-job-runner` is added on Linux.
/// A binary whose entity id (`#software-{name}-run-{run_id}`) is already in
/// the workflow's entity listing is skipped, as is one that cannot be found
/// on this machine.
///
/// This is best-effort: if the listing call fails, a warning is logged and
/// creation proceeds as though no entities exist; a failed create request is
/// logged as a warning.
pub fn create_software_entities(config: &Configuration, workflow_id: i64, run_id: i64) {
    let mut candidates: Vec<&'static str> = Vec::with_capacity(2);
    candidates.push("torc");
    if cfg!(target_os = "linux") {
        candidates.push("torc-slurm-job-runner");
    }

    let mut known_ids: std::collections::HashSet<String> = std::collections::HashSet::new();
    match default_api::list_ro_crate_entities(config, workflow_id, None, None) {
        Ok(response) => {
            if let Some(items) = response.items {
                known_ids.extend(items.into_iter().map(|entity| entity.entity_id));
            }
        }
        Err(e) => {
            warn!(
                "Failed to list existing RO-Crate entities for workflow {}: {}",
                workflow_id, e
            );
        }
    }

    for name in candidates {
        let entity_id = format!("#software-{}-run-{}", name, run_id);
        if known_ids.contains(&entity_id) {
            debug!(
                "SoftwareApplication entity '{}' already exists, skipping",
                entity_id
            );
            continue;
        }

        let info = match detect_binary(name) {
            Some(found) => found,
            None => continue,
        };
        let entity = build_software_entity(workflow_id, run_id, &info);
        match default_api::create_ro_crate_entity(config, entity) {
            Err(e) => {
                warn!(
                    "Failed to create SoftwareApplication entity for '{}': {}",
                    name, e
                );
            }
            Ok(created) => {
                debug!(
                    "Created SoftwareApplication entity for '{}' version='{}' (entity_id={})",
                    name,
                    info.version,
                    created.id.unwrap_or(0)
                );
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain file entity carries the identifying fields, the guessed MIME
    /// type and the supplied content size.
    #[test]
    fn test_build_file_entity_basic() {
        let file = FileModel {
            id: Some(1),
            workflow_id: 100,
            name: "output.csv".to_string(),
            path: "data/output.csv".to_string(),
            st_mtime: Some(1704067200.0),
        };
        let entity = build_file_entity(100, 1, &file, Some(1024), None);
        assert_eq!(entity.workflow_id, 100);
        assert_eq!(entity.file_id, Some(1));
        assert_eq!(entity.entity_id, "data/output.csv");
        assert_eq!(entity.entity_type, "File");
        let metadata: serde_json::Value = serde_json::from_str(&entity.metadata).unwrap();
        assert_eq!(metadata["@id"], "data/output.csv");
        assert_eq!(metadata["@type"], "File");
        assert_eq!(metadata["name"], "output.csv");
        assert_eq!(metadata["encodingFormat"], "text/csv");
        assert_eq!(metadata["contentSize"], 1024);
        assert_eq!(metadata["torc:run_id"], 1);
    }

    /// The provenance variant links the file to its job attempt.
    #[test]
    fn test_build_file_entity_with_provenance() {
        let file = FileModel {
            id: Some(2),
            workflow_id: 100,
            name: "result.json".to_string(),
            path: "output/result.json".to_string(),
            st_mtime: Some(1704067200.0),
        };
        let entity = build_file_entity_with_provenance(100, 1, &file, None, None, 42, 1);
        let metadata: serde_json::Value = serde_json::from_str(&entity.metadata).unwrap();
        assert_eq!(metadata["wasGeneratedBy"]["@id"], "#job-42-attempt-1");
        assert_eq!(metadata["torc:run_id"], 1);
    }

    /// A CreateAction entity is keyed by job id and attempt, and lists the
    /// output files as results.
    #[test]
    fn test_build_create_action_entity() {
        let mut job = JobModel::new(
            100,
            "process_data".to_string(),
            "python process.py".to_string(),
        );
        job.id = Some(42);
        let output_files = vec![
            "output/result1.json".to_string(),
            "output/result2.json".to_string(),
        ];
        let entity = build_create_action_entity(100, 1, &job, 1, &output_files);
        assert_eq!(entity.entity_id, "#job-42-attempt-1");
        assert_eq!(entity.entity_type, "CreateAction");
        let metadata: serde_json::Value = serde_json::from_str(&entity.metadata).unwrap();
        assert_eq!(metadata["@type"], "CreateAction");
        assert_eq!(metadata["name"], "process_data");
        assert_eq!(metadata["instrument"]["@id"], "#workflow-100");
        assert!(metadata["result"].is_array());
        assert_eq!(metadata["result"][0]["@id"], "output/result1.json");
        assert_eq!(metadata["torc:run_id"], 1);
    }

    /// Known extensions get a specific MIME type; unknown ones fall back to
    /// `application/octet-stream`.
    #[test]
    fn test_mime_type_inference() {
        let known_types = ["file.json", "file.csv", "file.txt", "file.py", "file.rs"];
        for path in known_types {
            let file = FileModel {
                id: Some(1),
                workflow_id: 1,
                name: path.to_string(),
                path: path.to_string(),
                st_mtime: None,
            };
            let entity = build_file_entity(1, 1, &file, None, None);
            let metadata: serde_json::Value = serde_json::from_str(&entity.metadata).unwrap();
            let mime = metadata["encodingFormat"].as_str().unwrap();
            assert_ne!(
                mime, "application/octet-stream",
                "Expected known file type '{}' to have a specific MIME type, not the default",
                path
            );
        }
        let unknown_types = ["file", "file.xyz123"];
        for path in unknown_types {
            let file = FileModel {
                id: Some(1),
                workflow_id: 1,
                name: path.to_string(),
                path: path.to_string(),
                st_mtime: None,
            };
            let entity = build_file_entity(1, 1, &file, None, None);
            let metadata: serde_json::Value = serde_json::from_str(&entity.metadata).unwrap();
            let mime = metadata["encodingFormat"].as_str().unwrap();
            assert_eq!(
                mime, "application/octet-stream",
                "Expected unknown file type '{}' to have the default MIME type",
                path
            );
        }
    }

    /// The model deserializes from JSON that omits the optional fields.
    #[test]
    fn test_serde_json_deserialize_ro_crate() {
        let json =
            r#"{"workflow_id":1,"entity_id":"test.txt","entity_type":"File","metadata":"{}"}"#;
        let model: crate::models::RoCrateEntityModel = serde_json::from_str(json).unwrap();
        assert_eq!(model.workflow_id, 1);
        assert_eq!(model.entity_id, "test.txt");
        assert_eq!(model.entity_type, "File");
    }

    /// Serializing and then deserializing the model preserves its fields.
    #[test]
    fn test_ro_crate_entity_model_roundtrip() {
        let model = crate::models::RoCrateEntityModel {
            id: None,
            workflow_id: 1,
            file_id: None,
            entity_id: "data/output.parquet".to_string(),
            entity_type: "File".to_string(),
            metadata: r#"{"name":"Test"}"#.to_string(),
        };
        let json = serde_json::to_string(&model).unwrap();
        println!("Serialized JSON: {}", json);
        let parsed: crate::models::RoCrateEntityModel = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.workflow_id, 1);
        assert_eq!(parsed.entity_id, "data/output.parquet");
        assert_eq!(parsed.entity_type, "File");
    }

    /// Hashing a file with known content yields the expected digest.
    #[test]
    fn test_compute_file_sha256() {
        // The process id keeps the name unique across concurrent test runs
        // that share the same temp directory.
        let temp_file =
            std::env::temp_dir().join(format!("torc_test_sha256_{}.txt", std::process::id()));
        std::fs::write(&temp_file, b"hello world").unwrap();
        let hash = compute_file_sha256(temp_file.to_str().unwrap());
        // Clean up before asserting so a failed assertion does not leave the
        // file behind.
        std::fs::remove_file(&temp_file).unwrap();
        assert_eq!(
            hash.as_deref(),
            Some("b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9")
        );
    }

    /// A file that cannot be opened yields `None` rather than an error.
    #[test]
    fn test_compute_file_sha256_nonexistent() {
        let hash = compute_file_sha256("/nonexistent/path/to/file.txt");
        assert!(hash.is_none());
    }

    /// A supplied hash is written into the metadata unchanged.
    #[test]
    fn test_build_file_entity_with_sha256() {
        let file = FileModel {
            id: Some(1),
            workflow_id: 100,
            name: "output.csv".to_string(),
            path: "data/output.csv".to_string(),
            st_mtime: Some(1704067200.0),
        };
        let sha256 = Some("abc123def456".to_string());
        let entity = build_file_entity(100, 1, &file, Some(1024), sha256);
        let metadata: serde_json::Value = serde_json::from_str(&entity.metadata).unwrap();
        assert_eq!(metadata["sha256"], "abc123def456");
    }

    /// The provenance variant carries both the hash and the provenance link.
    #[test]
    fn test_build_file_entity_with_provenance_and_sha256() {
        let file = FileModel {
            id: Some(2),
            workflow_id: 100,
            name: "result.json".to_string(),
            path: "output/result.json".to_string(),
            st_mtime: Some(1704067200.0),
        };
        let sha256 = Some("deadbeef".to_string());
        let entity = build_file_entity_with_provenance(100, 1, &file, None, sha256, 42, 1);
        let metadata: serde_json::Value = serde_json::from_str(&entity.metadata).unwrap();
        assert_eq!(metadata["sha256"], "deadbeef");
        assert_eq!(metadata["wasGeneratedBy"]["@id"], "#job-42-attempt-1");
    }

    /// A fully populated `BinaryInfo` maps onto every metadata field.
    #[test]
    fn test_build_software_entity() {
        let info = BinaryInfo {
            name: "torc",
            path: "/usr/local/bin/torc".to_string(),
            version: "torc 0.19.2".to_string(),
            sha256: Some("abc123".to_string()),
            file_size: Some(50_000_000),
        };
        let entity = build_software_entity(100, 3, &info);
        assert_eq!(entity.workflow_id, 100);
        assert_eq!(entity.file_id, None);
        assert_eq!(entity.entity_id, "#software-torc-run-3");
        assert_eq!(entity.entity_type, "SoftwareApplication");
        let metadata: serde_json::Value = serde_json::from_str(&entity.metadata).unwrap();
        assert_eq!(metadata["@id"], "#software-torc-run-3");
        assert_eq!(metadata["@type"], "SoftwareApplication");
        assert_eq!(metadata["name"], "torc");
        assert_eq!(metadata["version"], "torc 0.19.2");
        assert_eq!(metadata["url"], "/usr/local/bin/torc");
        assert_eq!(metadata["contentSize"], 50_000_000);
        assert_eq!(metadata["sha256"], "abc123");
        assert_eq!(metadata["torc:run_id"], 3);
    }

    /// Missing size and hash are left out of the metadata entirely.
    #[test]
    fn test_build_software_entity_without_optional_fields() {
        let info = BinaryInfo {
            name: "torc-server",
            path: "/usr/local/bin/torc-server".to_string(),
            version: "unknown".to_string(),
            sha256: None,
            file_size: None,
        };
        let entity = build_software_entity(42, 1, &info);
        assert_eq!(entity.entity_id, "#software-torc-server-run-1");
        assert_eq!(entity.entity_type, "SoftwareApplication");
        let metadata: serde_json::Value = serde_json::from_str(&entity.metadata).unwrap();
        assert_eq!(metadata["name"], "torc-server");
        assert_eq!(metadata["torc:run_id"], 1);
        assert!(metadata.get("contentSize").is_none());
        assert!(metadata.get("sha256").is_none());
    }

    /// Looking up the running test binary by its own file name should not
    /// panic, and any result must carry a non-empty path.
    #[test]
    fn test_detect_binary_finds_current_exe() {
        let exe = std::env::current_exe().unwrap();
        let exe_name = exe.file_name().unwrap().to_str().unwrap();
        // `detect_binary` requires a `&'static str`; leaking one short string
        // in a test is the simplest way to obtain that lifetime.
        let info = detect_binary(Box::leak(exe_name.to_string().into_boxed_str()));
        if let Some(info) = info {
            assert!(!info.path.is_empty());
        }
    }
}