use crate::cache::Cache;
use crate::client::{add_form_field, build_form_with_file, DatalabClient};
use crate::error::{DatalabError, Result};
use crate::output::Progress;
use clap::Args;
use reqwest::multipart::Form;
use serde_json::json;
use std::fs;
use std::path::PathBuf;
// Command-line arguments for the `segment` subcommand.
//
// NOTE: plain `//` comments are used on purpose. With clap's derive API,
// `///` doc comments become user-visible help text, so adding them here
// would change the program's `--help` output.
#[derive(Args, Debug)]
pub struct SegmentArgs {
// Document to segment. Values starting with `http://` or `https://` are
// treated as URLs by `execute`; anything else is treated as a local path.
#[arg(value_name = "FILE|URL")]
pub input: String,
// Segmentation schema: either a path to an existing file or an inline JSON
// string (resolved by `get_schema`).
#[arg(long, value_name = "SCHEMA")]
pub schema: String,
// Optional checkpoint identifier; sent as the `checkpoint_id` form field
// when present.
#[arg(long, value_name = "ID", help_heading = "Processing Options")]
pub checkpoint_id: Option<String>,
// Processing mode, sent as the `mode` form field. Defaults to "fast".
// The set of accepted values is not visible here — presumably validated
// by the server; TODO confirm.
#[arg(
long,
default_value = "fast",
value_name = "MODE",
help_heading = "Processing Options"
)]
pub mode: String,
// Optional page limit; sent as the `max_pages` form field when present.
#[arg(long, value_name = "N", help_heading = "Processing Options")]
pub max_pages: Option<u32>,
// When set, `save_checkpoint=true` is added to the request form.
#[arg(long, help_heading = "Processing Options")]
pub save_checkpoint: bool,
// When set, the cache lookup is skipped. The fresh result is still written
// to the cache afterwards.
#[arg(long, help_heading = "Cache Options")]
pub skip_cache: bool,
// Optional file to write the JSON result to; stdout is used when absent.
#[arg(long, short, value_name = "FILE", help_heading = "Output Options")]
pub output: Option<PathBuf>,
// Timeout handed to `DatalabClient::new`; the value name suggests seconds.
#[arg(
long,
default_value = "300",
value_name = "SECS",
help_heading = "Advanced Options"
)]
pub timeout: u64,
}
impl SegmentArgs {
    /// Builds the JSON object of request options that feeds the cache key.
    ///
    /// Only options that are sent to the API are included; `skip_cache`,
    /// `output` and `timeout` are deliberately left out. Note that `schema`
    /// here is the raw `--schema` argument (a path or inline JSON), not the
    /// resolved file contents.
    fn to_cache_params(&self) -> serde_json::Value {
        json!({
            "schema": self.schema,
            "checkpoint_id": self.checkpoint_id,
            "mode": self.mode,
            "max_pages": self.max_pages,
            "save_checkpoint": self.save_checkpoint,
        })
    }

    /// Resolves `--schema` to the schema text that is sent to the API.
    ///
    /// If the argument names an existing path, the file's contents are used;
    /// otherwise the argument itself is taken as inline JSON. In both cases
    /// the resulting text must parse as JSON, so a malformed schema file is
    /// rejected locally instead of being uploaded unchecked.
    ///
    /// # Errors
    ///
    /// Returns the converted I/O error if the schema file cannot be read, and
    /// [`DatalabError::InvalidInput`] if the resolved text is not valid JSON.
    fn get_schema(&self) -> Result<String> {
        let schema_path = PathBuf::from(&self.schema);
        let schema = if schema_path.exists() {
            fs::read_to_string(&schema_path)?
        } else {
            self.schema.clone()
        };

        // Parse only to validate; the original text is what gets sent.
        serde_json::from_str::<serde_json::Value>(&schema).map_err(|_| {
            DatalabError::InvalidInput(
                "Schema must be valid JSON or a path to a JSON file".to_string(),
            )
        })?;

        Ok(schema)
    }

    /// Appends the segmentation options to `form` and returns the extended form.
    ///
    /// `segmentation_schema` and `mode` are always added. `checkpoint_id` and
    /// `max_pages` are added only when set, and `save_checkpoint` is added
    /// (as `"true"`) only when the flag was passed.
    fn add_to_form(&self, mut form: Form, schema: &str) -> Form {
        form = add_form_field(form, "segmentation_schema", schema);
        form = add_form_field(form, "mode", &self.mode);

        if let Some(checkpoint_id) = &self.checkpoint_id {
            form = add_form_field(form, "checkpoint_id", checkpoint_id);
        }
        if let Some(max_pages) = self.max_pages {
            form = add_form_field(form, "max_pages", &max_pages.to_string());
        }
        if self.save_checkpoint {
            form = add_form_field(form, "save_checkpoint", "true");
        }

        form
    }
}
/// Runs the `segment` command for a local file or a URL.
///
/// The flow is: resolve the schema, compute a cache key from the input and
/// the request options, return the cached result if one exists (unless
/// `--skip-cache` was given), otherwise build the multipart form, hand it to
/// `DatalabClient::submit_and_poll`, store the result in the cache and emit it.
///
/// The cache key uses the *resolved* schema text rather than the raw
/// `--schema` argument, so editing a schema file invalidates earlier cache
/// entries instead of returning a stale result.
///
/// # Errors
///
/// Returns [`DatalabError::FileNotFound`] when the input is a local path that
/// does not exist, and propagates any error from schema resolution, hashing,
/// the API client, the cache, or writing the output.
pub async fn execute(args: SegmentArgs, progress: &Progress) -> Result<()> {
    let client = DatalabClient::new(Some(args.timeout))?;
    let cache = Cache::new()?;
    let schema = args.get_schema()?;

    // Anything that does not look like an http(s) URL is treated as a local path.
    let is_url = args.input.starts_with("http://") || args.input.starts_with("https://");
    let file_path = if is_url {
        None
    } else {
        Some(PathBuf::from(&args.input))
    };
    let file_str = file_path.as_ref().map(|p| p.to_string_lossy().to_string());
    progress.start("segment", file_str.as_deref());

    let file_hash = if let Some(ref path) = file_path {
        if !path.exists() {
            return Err(DatalabError::FileNotFound(path.clone()));
        }
        Some(Cache::hash_file(path)?)
    } else {
        None
    };

    // Key on the schema content: when `--schema` is a path, the raw argument
    // stays the same even if the file behind it changes.
    let mut cache_params = args.to_cache_params();
    cache_params["schema"] = schema.as_str().into();
    let cache_key = Cache::generate_key(
        file_hash.as_deref(),
        if is_url { Some(&args.input) } else { None },
        "segment",
        &cache_params,
    );

    if !args.skip_cache {
        if let Some(cached) = cache.get(&cache_key) {
            progress.cache_hit(&cache_key);
            output_result(&cached, args.output.as_ref())?;
            return Ok(());
        }
    }

    // Local files are uploaded; URLs are passed by reference via `file_url`.
    let base_form = match file_path {
        Some(ref path) => {
            let (form, _) = build_form_with_file(path)?;
            form
        }
        None => Form::new().text("file_url", args.input.clone()),
    };
    let form = args.add_to_form(base_form, &schema);

    let result = client.submit_and_poll("segment", form, progress).await?;

    cache.set(
        &cache_key,
        &result,
        "segment",
        file_hash.as_deref(),
        file_str.as_deref(),
    )?;
    output_result(&result, args.output.as_ref())?;
    Ok(())
}
/// Pretty-prints `result` as JSON, either into `output_file` or to stdout.
///
/// # Errors
///
/// Propagates serialization failures and, when a file is given, any error
/// from writing it.
fn output_result(result: &serde_json::Value, output_file: Option<&PathBuf>) -> Result<()> {
    let rendered = serde_json::to_string_pretty(result)?;
    match output_file {
        Some(destination) => fs::write(destination, &rendered)?,
        None => println!("{}", rendered),
    }
    Ok(())
}