use clap::{Parser, Subcommand};
use colored::Colorize;
use log::error;
use mdmodels_core::{
datamodel::DataModel,
error::DataModelError,
exporters::{render_jinja_template, Templates},
json::validation::validate_json,
linkml::export::serialize_linkml,
llm::extraction::query_openai,
pipeline::process_pipeline,
};
use serde::{Deserialize, Serialize};
use std::{
collections::HashMap,
error::Error,
fmt::Display,
fs,
io::Write,
path::{Path, PathBuf},
str::FromStr,
};
// Top-level command-line interface. clap derives the argument parser from
// this struct; `main` calls `Cli::parse()` and dispatches on `cmd`.
// NOTE: plain `//` comments are used on the clap-derived items on purpose,
// because clap's derive macros turn `///` doc comments into help text.
#[derive(Parser)]
#[command(name = "MD-Models CLI", version = "0.1.0")]
#[command(about = "Validate and convert Markdown Data Models", long_about = None)]
struct Cli {
// The subcommand chosen on the command line.
#[command(subcommand)]
cmd: Commands,
}
// Subcommands of the CLI. `main` matches on this enum and forwards the
// arguments of each variant to its handler function.
#[derive(Subcommand)]
enum Commands {
// Handled by `convert`.
Convert(ConvertArgs),
// Handled by `validate`.
Validate(ValidateArgs),
// Handled by `process_pipeline` (from `mdmodels_core::pipeline`).
Pipeline(PipelineArgs),
// Handled by `query_llm`.
Extract(ExtractArgs),
// Nested subcommand group; see `DatasetCommands`.
Dataset(DatasetArgs),
}
// Arguments of the `validate` subcommand (consumed by `validate`).
#[derive(Parser, Debug)]
struct ValidateArgs {
// Model location; parsed into an `InputType` through its `FromStr` impl.
#[arg(short, long, help = "Path or URL to the markdown file")]
input: InputType,
// Optional GitHub repository. When set, `validate` loads the model through
// `load_markdown_model`, which rejects a URL `input` in that mode.
#[arg(long = "git", help = "GitHub repository in the form owner/repo[@ref]")]
git: Option<String>,
}
// Arguments of the `convert` subcommand (consumed by `convert`).
#[derive(Parser, Debug)]
struct ConvertArgs {
// Model location; parsed into an `InputType` through its `FromStr` impl.
#[arg(short, long, help = "Path or URL to the markdown file")]
input: InputType,
// Optional GitHub repository; when set the model is loaded via
// `load_markdown_model` instead of from a resolved local path.
#[arg(long = "git", help = "GitHub repository in the form owner/repo[@ref]")]
git: Option<String>,
// Output destination. `convert` prints to stdout when this is absent; for
// `Templates::JsonSchemaAll` it is passed to `render_all_json_schemes` as the
// output directory.
#[arg(short, long, help = "Path to the output file")]
output: Option<PathBuf>,
// Which exporter/template `convert` should run.
#[arg(short, long, help = "Template to use for rendering")]
template: Templates,
// Root object name; forwarded by `convert` to the JSON Schema and JSON-LD
// exporters.
#[arg(
short,
long,
help = "Root object to start rendering from (required for JSON Schema)"
)]
root: Option<String>,
// One or more comma-separated option names (`-O a,b`). `convert` maps each
// name to the string "true" for the template config and additionally checks
// for the "openai" option when rendering JSON Schema.
#[arg(
short = 'O',
long,
value_parser,
num_args = 1.., value_delimiter = ',',
help = "Options to pass to the template"
)]
options: Vec<String>,
}
// Arguments of the `pipeline` subcommand; `main` passes `input` straight to
// `process_pipeline`.
#[derive(Parser, Debug)]
struct PipelineArgs {
// Path to the pipeline configuration file.
// NOTE(review): the help text says "YAML", but the tests in this file pass
// `.toml` files to this option — confirm the expected format and adjust the
// help string if needed.
#[arg(short, long, help = "Path to the pipeline configuration YAML file")]
input: PathBuf,
}
// Arguments of the `extract` subcommand (consumed by `query_llm`).
#[derive(Parser, Debug)]
struct ExtractArgs {
// Location of the data model that describes the extraction target.
#[arg(short, long, help = "Path or URL to the markdown model")]
model: InputType,
// Optional GitHub repository to load the model from.
#[arg(long = "git", help = "GitHub repository in the form owner/repo[@ref]")]
git: Option<String>,
// File whose full text content `query_llm` reads and sends as the prompt.
#[arg(short, long, help = "Path to the file to parse")]
input: PathBuf,
// Pre-prompt passed to `query_openai` alongside the prompt.
#[arg(
short,
long,
default_value = "You are a helpful assistant that extracts data from text input.",
help = "Pre-prompt to use for extraction"
)]
pre_prompt: String,
// Name of the LLM to query; forwarded unchanged to `query_openai`.
#[arg(
short,
long,
default_value = "gpt-4o",
help = "OpenAI model to use for extraction. Defaults to 'gpt-4o'."
)]
llm_model: String,
// Root object name; `query_llm` falls back to the name of the first object in
// the model when this is absent.
#[arg(
short,
long,
help = "Root object to parse into. Defaults to the first entity in the model."
)]
root: Option<String>,
// Where to write the pretty-printed JSON response; stdout when absent.
#[arg(short, long, help = "Output file to write the extracted data to")]
output: Option<PathBuf>,
// Flag forwarded to `query_openai`; only available in long form (`--multiple`).
#[arg(long, help = "Whether to extract multiple objects")]
multiple: bool,
}
// Arguments of the `dataset` subcommand, which only groups nested
// dataset-related subcommands.
#[derive(Parser, Debug)]
struct DatasetArgs {
// The nested dataset subcommand chosen on the command line.
#[command(subcommand)]
command: DatasetCommands,
}
// Subcommands available under `dataset`.
#[derive(Subcommand, Debug)]
enum DatasetCommands {
// Handled by `validate_ds`.
Validate(ValidateDatasetArgs),
}
// Arguments of `dataset validate` (consumed by `validate_ds`).
#[derive(Parser, Debug)]
struct ValidateDatasetArgs {
// Dataset location; resolved to a local path with `resolve_input_path`.
#[arg(short, long, help = "Path to the dataset file")]
input: InputType,
// Location of the data model the dataset is validated against.
#[arg(short, long, help = "Path to the markdown model")]
model: InputType,
// Optional GitHub repository to load the model from.
#[arg(long = "git", help = "GitHub repository in the form owner/repo[@ref]")]
git: Option<String>,
}
/// Location of an input given on the command line.
///
/// Values are created from the raw argument string by the `FromStr`
/// implementation and turned into a local file path by `resolve_input_path`.
#[derive(Deserialize, Serialize, Clone, Debug)]
enum InputType {
/// A URL; `resolve_input_path` downloads it into a temporary file.
Remote(String),
/// A path string that is used as given.
Local(String),
}
impl FromStr for InputType {
    type Err = String;

    /// Classifies a raw command-line value as a remote URL or a local path.
    ///
    /// Only values that start with an explicit `http://` or `https://` scheme
    /// are treated as remote. Anything else — including local paths whose name
    /// merely begins with the letters "http" (e.g. `httpfiles/model.md`) — is
    /// treated as a local path.
    ///
    /// # Errors
    ///
    /// Never fails; the `Result` is required by the `FromStr` trait.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let has_http_scheme = s.starts_with("http://") || s.starts_with("https://");
        let parsed = if has_http_scheme {
            InputType::Remote(s.to_string())
        } else {
            InputType::Local(s.to_string())
        };
        Ok(parsed)
    }
}
impl Display for InputType {
    /// Writes the wrapped URL or path exactly as it was supplied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Both variants carry a single string, so one or-pattern covers them.
        let (InputType::Remote(location) | InputType::Local(location)) = self;
        write!(f, "{location}")
    }
}
fn main() -> Result<(), Box<dyn Error>> {
pretty_env_logger::init();
let args = Cli::parse();
match args.cmd {
Commands::Validate(args) => validate(args),
Commands::Convert(args) => convert(args),
Commands::Pipeline(args) => process_pipeline(&args.input),
Commands::Extract(args) => query_llm(args),
Commands::Dataset(args) => match args.command {
DatasetCommands::Validate(args) => validate_ds(args),
},
}
}
/// Handler of the `validate` subcommand.
///
/// Prints a header naming the input, then reports validity through
/// `print_validation_result`:
///
/// * with `--git`, the model is loaded through `load_markdown_model` and any
///   load error is returned as-is;
/// * otherwise the input is resolved to a local path; a file that parses as a
///   JSON object is delegated to `validate_from_json_schema`, everything else
///   is parsed as markdown.
///
/// # Errors
///
/// Returns the loader error in `--git` mode, an I/O error if the resolved
/// file cannot be read, or `"Model is invalid"` when markdown parsing fails
/// (the details are logged via `log_result`).
fn validate(args: ValidateArgs) -> Result<(), Box<dyn Error>> {
    println!("\n Validating model {} ...", args.input.to_string().bold());

    if let Some(repo) = args.git.as_deref() {
        let outcome = load_markdown_model(&args.input, Some(repo));
        print_validation_result(outcome.is_ok());
        return outcome.map(|_| ());
    }

    let model_path = resolve_input_path(&args.input);
    if is_json_schema(&model_path)? {
        return validate_from_json_schema(&model_path);
    }

    if let Err(report) = DataModel::from_markdown(&model_path) {
        report.log_result();
        print_validation_result(false);
        return Err("Model is invalid".into());
    }

    print_validation_result(true);
    Ok(())
}
/// Validates a model given as a JSON Schema file and prints the outcome.
///
/// The outcome line is printed through `print_validation_result` on success
/// *and* on failure, so the output matches the markdown and `--git` paths of
/// `validate`, which always print it.
///
/// # Errors
///
/// * `"Model is invalid"` when `DataModel::from_json_schema` reports a
///   `DataModelError::ValidationError` (the details are logged via
///   `log_result` first);
/// * any other `DataModelError`, converted into a boxed error.
fn validate_from_json_schema(path: &Path) -> Result<(), Box<dyn Error>> {
    match DataModel::from_json_schema(path) {
        Ok(_) => {
            print_validation_result(true);
            Ok(())
        }
        Err(err) => {
            let failure: Box<dyn Error> = match err {
                DataModelError::ValidationError(validator) => {
                    // Log the individual findings before the summary line.
                    validator.log_result();
                    "Model is invalid".into()
                }
                other => other.into(),
            };
            print_validation_result(false);
            Err(failure)
        }
    }
}
/// Prints the coloured one-line verdict that closes a validation run:
/// bold green "Model is valid" for `true`, bold red "Model is invalid" for
/// `false`.
fn print_validation_result(result: bool) {
    let verdict = match result {
        true => "Model is valid".green(),
        false => "Model is invalid".red(),
    };
    let message = verdict.bold().to_string();
    println!(" └── {message}\n");
}
fn query_llm(args: ExtractArgs) -> Result<(), Box<dyn Error>> {
let model = load_markdown_model(&args.model, args.git.as_deref())?;
let prompt = std::fs::read_to_string(&args.input)?;
let pre_prompt = args.pre_prompt;
let llm_model = args.llm_model;
let root = match args.root {
Some(root) => root,
None => model
.objects
.first()
.ok_or("No objects found in model".to_string())?
.name
.clone(),
};
let response = tokio::runtime::Runtime::new()?.block_on(query_openai(
&prompt,
&pre_prompt,
&model,
&root,
&llm_model,
args.multiple,
None,
))?;
match args.output {
Some(ref output) => {
let json_string = serde_json::to_string_pretty(&response)?;
std::fs::write(output, json_string).expect("Failed to write output");
}
None => {
let json_string = serde_json::to_string_pretty(&response)?;
println!("{json_string}");
}
}
Ok(())
}
fn convert(args: ConvertArgs) -> Result<(), Box<dyn Error>> {
let mut model = if let Some(repo) = args.git.as_deref() {
load_markdown_model(&args.input, Some(repo))?
} else {
let path = resolve_input_path(&args.input);
if is_json_schema(&path)? {
DataModel::from_json_schema(&path)?
} else {
DataModel::from_markdown(&path)?
}
};
if let Templates::JsonSchemaAll = args.template {
render_all_json_schemes(&model, &args.output)?;
return Ok(()); }
let config: HashMap<String, String> = args
.options
.iter()
.map(|s| (s.clone(), "true".to_string()))
.collect();
let rendered = match args.template {
Templates::JsonSchema => {
model.json_schema(args.root, args.options.contains(&"openai".to_string()))?
}
Templates::Linkml => serialize_linkml(model, args.output.as_ref())?,
Templates::Internal => render_internal_schema(&model)?,
Templates::JsonLd => {
let root = args.root;
serde_json::to_string_pretty(&model.json_ld_header(root.as_deref())?).unwrap()
}
_ => render_jinja_template(&args.template, &mut model, Some(&config))?,
};
match args.output {
Some(ref output) => {
std::fs::write(output, rendered.trim()).expect("Failed to write output");
}
None => {
println!("{}", rendered.trim());
}
}
Ok(())
}
fn is_json_schema(path: &PathBuf) -> Result<bool, Box<dyn Error>> {
let content = std::fs::read_to_string(path)?;
let parsed = serde_json::from_str::<serde_json::Value>(&content);
match parsed {
Ok(value) => Ok(value.is_object()),
Err(_) => Ok(false),
}
}
/// Turns an `InputType` into a path on the local file system.
///
/// * `Local` values are returned as a `PathBuf` without touching the disk.
/// * `Remote` values are downloaded and stored as `markdown.md` inside the
///   system temporary directory; that path is returned. The file name is
///   fixed, so each download overwrites the previous one.
///
/// The download happens *before* the temporary file is created, so a failed
/// request no longer leaves an empty file behind, and HTTP error statuses
/// (4xx/5xx) are treated as fetch failures instead of storing the error page
/// as if it were the requested document.
///
/// # Panics
///
/// Panics when the URL cannot be fetched (including HTTP error statuses), the
/// response body cannot be read, or the temporary file cannot be created or
/// written. The signature has no error channel, so these stay panics.
fn resolve_input_path(input: &InputType) -> PathBuf {
    match input {
        InputType::Remote(url) => {
            let content = reqwest::blocking::get(url)
                .and_then(|response| response.error_for_status())
                .expect("Failed to fetch URL")
                .text()
                .expect("Failed to read response");

            let mut path = std::env::temp_dir();
            path.push("markdown.md");
            let mut file = std::fs::File::create(&path).expect("Failed to create file");
            file.write_all(content.as_bytes())
                .expect("Failed to write to file");
            path
        }
        InputType::Local(path) => PathBuf::from(path),
    }
}
/// Loads a `DataModel` either from a GitHub repository or from a markdown
/// file.
///
/// * `git == Some(repo)`: `input` must be `InputType::Local`; its string is
///   passed to `DataModel::from_github` together with `repo`.
/// * `git == None`: `input` is resolved with `resolve_input_path` and parsed
///   with `DataModel::from_markdown`.
///
/// # Errors
///
/// Returns an error when `--git` is combined with a URL input, or when the
/// underlying loader fails.
fn load_markdown_model(input: &InputType, git: Option<&str>) -> Result<DataModel, Box<dyn Error>> {
    match (git, input) {
        (Some(repo), InputType::Local(repo_path)) => {
            DataModel::from_github(repo, repo_path.as_str())
        }
        (Some(_), InputType::Remote(_)) => {
            Err("When using --git, provide --input as a repository-root path, not a URL".into())
        }
        (None, _) => {
            let local_path = resolve_input_path(input);
            Ok(DataModel::from_markdown(&local_path)?)
        }
    }
}
/// Renders the model into `outdir` through `DataModel::json_schema_all`.
///
/// The directory (including missing parents) is created when it does not
/// exist yet.
///
/// # Errors
///
/// Returns an error when no output directory was given, when the output path
/// exists but is not a directory, when the directory cannot be created, or
/// when rendering fails. The first two conditions are user-input mistakes and
/// used to panic; they are now reported as regular errors with the same
/// messages.
fn render_all_json_schemes(
    model: &DataModel,
    outdir: &Option<PathBuf>,
) -> Result<(), Box<dyn Error>> {
    let outdir = outdir
        .as_ref()
        .ok_or("Output directory is required for JSON Schema all")?;

    if outdir.exists() && !outdir.is_dir() {
        return Err("Output must be a directory".into());
    }

    fs::create_dir_all(outdir)?;
    model.json_schema_all(outdir.to_path_buf(), false)?;
    Ok(())
}
/// Serialises the model itself as pretty-printed JSON.
///
/// # Errors
///
/// Returns the `serde_json` error, boxed, when serialisation fails.
fn render_internal_schema(model: &DataModel) -> Result<String, Box<dyn Error>> {
    let pretty = serde_json::to_string_pretty(&model)?;
    Ok(pretty)
}
fn validate_ds(args: ValidateDatasetArgs) -> Result<(), Box<dyn Error>> {
let model = load_markdown_model(&args.model, args.git.as_deref())?;
let dataset_path = resolve_input_path(&args.input);
let result = validate_json(dataset_path, &model, None)?;
for error in result {
error!("{}", error);
}
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use assert_cmd::Command;
    use pretty_assertions::assert_eq;

    /// Runs the `md-models` binary with `cli_args` and returns the handle used
    /// to assert on its outcome.
    fn run_cli(cli_args: &[&str]) -> assert_cmd::assert::Assert {
        Command::cargo_bin("md-models")
            .unwrap()
            .args(cli_args)
            .assert()
    }

    /// A local input is passed through unchanged.
    #[test]
    fn test_resolve_input_path() {
        let input = InputType::Local("tests/data/markdown.md".to_string());
        let resolved = resolve_input_path(&input);
        assert_eq!(resolved.to_str().unwrap(), "tests/data/markdown.md");
    }

    /// `Display` prints the wrapped string for both variants.
    #[test]
    fn test_display_input_type() {
        let remote = InputType::Remote("https://example.com".to_string());
        let local = InputType::Local("tests/data/markdown.md".to_string());
        assert_eq!(remote.to_string(), "https://example.com");
        assert_eq!(local.to_string(), "tests/data/markdown.md");
    }

    /// `validate` exits successfully for this model file.
    #[test]
    fn test_successful_validation_result() {
        run_cli(&["validate", "-i", "tests/data/model.md"]).success();
    }

    /// `validate` exits with a failure for this model file.
    #[test]
    fn test_failed_validation_result() {
        run_cli(&["validate", "-i", "tests/data/model_missing_types.md"]).failure();
    }

    /// `convert` with the markdown template exits successfully.
    #[test]
    fn test_successful_conversion() {
        run_cli(&["convert", "-i", "tests/data/model.md", "-t", "markdown"]).success();
    }

    /// `convert` to JSON Schema exits successfully without an explicit root.
    #[test]
    fn test_json_schema_no_root() {
        run_cli(&["convert", "-i", "tests/data/model.md", "-t", "json-schema"]).success();
    }

    /// `pipeline` exits successfully for this configuration.
    #[test]
    fn test_pipeline_single_model() {
        run_cli(&["pipeline", "-i", "tests/test_pipeline.toml"]).success();
    }

    /// `pipeline` exits successfully for this configuration.
    #[test]
    fn test_pipeline_multiple_models() {
        run_cli(&["pipeline", "-i", "tests/test_pipeline_per_spec.toml"]).success();
    }

    /// `pipeline` exits with a failure for this configuration.
    #[test]
    fn test_pipeline_multiple_models_invalid() {
        run_cli(&["pipeline", "-i", "tests/test_pipeline_per_spec_invalid.toml"]).failure();
    }
}