bookforge-cli 1.8.0

CLI-first EPUB translation engine with deterministic structure rebuild and review loop.
use std::{collections::BTreeSet, fs, path::PathBuf};

use anyhow::Result;
use bookforge_core::{GlossaryCategory, GlossaryScopeKind, GlossaryStatus, GlossaryTerm};
use bookforge_store::{JobRecord, JobStore, NewSegmentFlag};
use clap::Args;
use serde::Deserialize;

// Command-line arguments consumed by `run` to ingest a reviewer flags file
// into an existing job.
//
// NOTE(review): plain `//` comments are used here on purpose. clap's derive
// turns `///` doc comments into help text, so switching comment style would
// change the `--help` output.
#[derive(Debug, Args)]
pub struct IngestFlagsArgs {
    // Id of the job the flags belong to. `run` fails if the store has no such
    // job, or if the flags file names a different job.
    pub job_id: String,

    // Path of the flags JSON file to read (`--flags`).
    #[arg(long)]
    pub flags: PathBuf,

    // Scope applied to glossary terms seeded from flags (`--default-scope`).
    // When omitted, `glossary_terms_from_flags` falls back to book scope.
    #[arg(long, value_enum)]
    pub default_scope: Option<GlossaryScopeKind>,
}

/// Top-level shape of the flags JSON document parsed by `run`.
#[derive(Debug, Deserialize)]
struct FlagsFile {
    /// Format version of the document; `validate_flags` accepts only `1`.
    schema_version: u32,
    /// Job the flags were produced for; `validate_flags` requires it to equal
    /// the job id requested on the command line.
    job_id: String,
    // Accepted in the input but never read by this module, hence the lint
    // suppression.
    #[allow(dead_code)]
    exported_at: Option<String>,
    /// The individual segment flags, in file order.
    flags: Vec<FlagEntry>,
}

/// One reviewer flag attached to a single segment of the job.
#[derive(Debug, Deserialize)]
struct FlagEntry {
    /// Id of the flagged segment; must be non-blank and must belong to the job.
    segment_id: String,
    /// Flag category; must be one of the strings accepted by `valid_kind`.
    kind: String,
    /// Optional free-form reviewer note. Stored with the flag and copied into
    /// the `notes` of any glossary term seeded from it.
    note: Option<String>,
    /// Optional source-side text; used as the glossary term's source text for
    /// `name` and `register` flags when non-blank.
    suggested_source: Option<String>,
    /// Optional target-side text; a `name` flag only seeds a glossary term
    /// when this is present and non-blank.
    suggested_target: Option<String>,
}

pub async fn run(args: IngestFlagsArgs) -> Result<()> {
    let store = JobStore::open_default()?;
    let Some(job) = store.get_job(&args.job_id)? else {
        anyhow::bail!("job '{}' was not found", args.job_id);
    };

    let parsed: FlagsFile = serde_json::from_str(&fs::read_to_string(&args.flags)?)
        .map_err(|err| anyhow::anyhow!("invalid flags JSON: {err}"))?;
    validate_flags(&args.job_id, &parsed)?;

    let known_segments = store
        .segment_records(&args.job_id)?
        .into_iter()
        .map(|record| record.id)
        .collect::<BTreeSet<_>>();
    for flag in &parsed.flags {
        if !known_segments.contains(&flag.segment_id) {
            anyhow::bail!(
                "flags file references unknown segment '{}' for job '{}'",
                flag.segment_id,
                args.job_id
            );
        }
    }

    let wrong_translation_ids = parsed
        .flags
        .iter()
        .filter(|flag| flag.kind == "wrong_translation")
        .map(|flag| flag.segment_id.clone())
        .collect::<BTreeSet<_>>();
    let glossary_terms = glossary_terms_from_flags(&job, args.default_scope, &parsed.flags)?;

    let new_flags = parsed
        .flags
        .iter()
        .map(|flag| NewSegmentFlag {
            job_id: &args.job_id,
            segment_id: &flag.segment_id,
            kind: &flag.kind,
            note: flag.note.as_deref(),
            suggested_source: flag.suggested_source.as_deref(),
            suggested_target: flag.suggested_target.as_deref(),
            consumed: flag.kind == "wrong_translation"
                || (flag.kind == "name" && flag.suggested_target.is_some())
                || flag.kind == "register",
        })
        .collect::<Vec<_>>();

    let inserted = store.insert_segment_flags(&new_flags)?;
    let glossary_added = store.upsert_glossary_terms(&glossary_terms)?;
    let wrong_translation_ids = wrong_translation_ids.into_iter().collect::<Vec<_>>();
    let marked = store.mark_segments_needs_review(
        &args.job_id,
        &wrong_translation_ids,
        "flagged wrong_translation via ingest-flags",
    )?;

    println!(
        "Ingested {inserted} flags. {marked} segments marked needs-review. {glossary_added} glossary terms saved."
    );

    Ok(())
}

/// Builds the glossary terms that `name` and `register` flags should seed.
///
/// * `job` supplies the scope id (book or series), the source language
///   (falling back to `"auto"`), and the target language.
/// * `default_scope` selects the scope of every produced term; `None` means
///   book scope.
/// * `flags` are scanned in order; flags of any other kind produce nothing.
///
/// A `name` flag yields a `Person` term only when it has a non-blank
/// `suggested_target`; its source text falls back to the target when no
/// non-blank `suggested_source` is given. A `register` flag always yields a
/// `Style` term; blank values are treated as absent, so the target falls back
/// from `suggested_target` to `note` to the literal `"register"`, and the
/// source falls back to `__register:<segment_id>`.
///
/// # Errors
///
/// Fails when series scope is requested but the job has no `series_id`.
fn glossary_terms_from_flags(
    job: &JobRecord,
    default_scope: Option<GlossaryScopeKind>,
    flags: &[FlagEntry],
) -> Result<Vec<GlossaryTerm>> {
    // Owned copy of `value` (untrimmed), or `None` if absent or whitespace-only.
    fn non_blank(value: Option<&str>) -> Option<String> {
        value
            .filter(|text| !text.trim().is_empty())
            .map(str::to_owned)
    }

    let scope_kind = default_scope.unwrap_or(GlossaryScopeKind::Book);
    let scope_id = match scope_kind {
        GlossaryScopeKind::Global => None,
        // Jobs without an explicit book id are scoped to themselves.
        GlossaryScopeKind::Book => Some(job.book_id.clone().unwrap_or_else(|| job.id.clone())),
        GlossaryScopeKind::Series => Some(job.series_id.clone().ok_or_else(|| {
            anyhow::anyhow!(
                "--default-scope series requires job '{}' to have a series_id",
                job.id
            )
        })?),
    };
    let source_language = job
        .source_lang
        .clone()
        .unwrap_or_else(|| "auto".to_string());

    let mut terms = Vec::new();
    for flag in flags {
        match flag.kind.as_str() {
            "name" => {
                // Without a usable target there is nothing to seed.
                let Some(target) = non_blank(flag.suggested_target.as_deref()) else {
                    continue;
                };
                let source = non_blank(flag.suggested_source.as_deref())
                    .unwrap_or_else(|| target.clone());
                terms.push(GlossaryTerm {
                    id: None,
                    scope_kind,
                    scope_id: scope_id.clone(),
                    source_text: source,
                    target_text: target,
                    category: GlossaryCategory::Person,
                    notes: flag.note.clone(),
                    case_sensitive: true,
                    always_active: false,
                    status: GlossaryStatus::UserSeeded,
                    source_language: source_language.clone(),
                    target_language: job.target_lang.clone(),
                    source_count: 0,
                });
            }
            "register" => {
                let source = non_blank(flag.suggested_source.as_deref())
                    .unwrap_or_else(|| format!("__register:{}", flag.segment_id));
                // Blank values must not win over a later fallback; otherwise
                // the stored term would carry empty target text.
                let target = non_blank(flag.suggested_target.as_deref())
                    .or_else(|| non_blank(flag.note.as_deref()))
                    .unwrap_or_else(|| "register".to_string());
                terms.push(GlossaryTerm {
                    id: None,
                    scope_kind,
                    scope_id: scope_id.clone(),
                    source_text: source,
                    target_text: target,
                    category: GlossaryCategory::Style,
                    notes: flag.note.clone(),
                    case_sensitive: false,
                    always_active: true,
                    status: GlossaryStatus::UserSeeded,
                    source_language: source_language.clone(),
                    target_language: job.target_lang.clone(),
                    source_count: 0,
                });
            }
            _ => {}
        }
    }
    Ok(terms)
}

/// Checks a parsed flags file against the job it is being ingested for.
///
/// Verifies, in order: the schema version is `1`, the file's `job_id` equals
/// `job_id`, and then for each flag in file order that its `segment_id` is
/// non-blank and its `kind` is accepted by `valid_kind`.
///
/// # Errors
///
/// Returns the first violation found; per-flag messages name the offending
/// index within `flags`.
fn validate_flags(job_id: &str, parsed: &FlagsFile) -> Result<()> {
    anyhow::ensure!(
        parsed.schema_version == 1,
        "unsupported flags schema_version {}; expected 1",
        parsed.schema_version
    );
    anyhow::ensure!(
        parsed.job_id == job_id,
        "flags job_id '{}' does not match requested job '{}'",
        parsed.job_id,
        job_id
    );

    parsed
        .flags
        .iter()
        .enumerate()
        .try_for_each(|(index, entry)| {
            anyhow::ensure!(
                !entry.segment_id.trim().is_empty(),
                "flags[{index}].segment_id is required"
            );
            anyhow::ensure!(
                valid_kind(&entry.kind),
                "flags[{index}].kind '{}' is invalid; expected one of name, register, wrong_translation, formatting, tone, other",
                entry.kind
            );
            Ok(())
        })
}

/// Reports whether `kind` is one of the flag kinds this command accepts.
///
/// The comparison is exact: case and surrounding whitespace matter.
fn valid_kind(kind: &str) -> bool {
    const ACCEPTED_KINDS: [&str; 6] = [
        "name",
        "register",
        "wrong_translation",
        "formatting",
        "tone",
        "other",
    ];
    ACCEPTED_KINDS.contains(&kind)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A flag on segment `seg_1` of the given kind with no optional fields set.
    fn bare_flag(kind: &str) -> FlagEntry {
        FlagEntry {
            segment_id: "seg_1".to_string(),
            kind: kind.to_string(),
            note: None,
            suggested_source: None,
            suggested_target: None,
        }
    }

    /// A schema-version-1 flags file for `job_id` holding `flags`.
    fn flags_file(job_id: &str, flags: Vec<FlagEntry>) -> FlagsFile {
        FlagsFile {
            schema_version: 1,
            job_id: job_id.to_string(),
            exported_at: None,
            flags,
        }
    }

    /// A finished English-to-Italian job that has both a book id and a series id.
    fn sample_job() -> JobRecord {
        JobRecord {
            id: "job_1".to_string(),
            input_path: "input.epub".into(),
            input_snapshot_path: None,
            input_sha256: None,
            output_path: "out.epub".into(),
            input_hash: "hash".to_string(),
            source_lang: Some("English".to_string()),
            target_lang: "Italian".to_string(),
            provider: "mock".to_string(),
            model: "mock".to_string(),
            base_url: None,
            api_key_env: None,
            status: "succeeded".to_string(),
            events_path: None,
            report_json_path: None,
            report_markdown_path: None,
            book_id: Some("fellowship".to_string()),
            series_id: Some("lotr".to_string()),
        }
    }

    #[test]
    fn validate_flags_rejects_wrong_job() {
        let file = flags_file("job_other", Vec::new());
        let message = validate_flags("job_expected", &file)
            .expect_err("should reject job")
            .to_string();
        assert!(message.contains("does not match"));
    }

    #[test]
    fn validate_flags_rejects_unknown_kind() {
        let file = flags_file("job_1", vec![bare_flag("bad")]);
        let message = validate_flags("job_1", &file)
            .expect_err("should reject kind")
            .to_string();
        assert!(message.contains("invalid"));
    }

    #[test]
    fn name_flags_seed_book_scoped_glossary_terms() {
        let job = sample_job();
        let name_flag = FlagEntry {
            note: Some("preserve".to_string()),
            suggested_source: Some("Aragorn".to_string()),
            suggested_target: Some("Aragorn".to_string()),
            ..bare_flag("name")
        };

        // No explicit scope is passed, so the book scope default applies.
        let terms =
            glossary_terms_from_flags(&job, None, &[name_flag]).expect("terms should build");

        assert_eq!(terms.len(), 1);
        let term = &terms[0];
        assert_eq!(term.scope_kind, GlossaryScopeKind::Book);
        assert_eq!(term.scope_id.as_deref(), Some("fellowship"));
        assert_eq!(term.category, GlossaryCategory::Person);
    }
}