psyche-subtitle-toolkit 0.3.1

use std::collections::{BTreeMap, HashMap, HashSet};

use crate::error::{Result, SubtitleToolkitError};

use super::model::{SubtitleCue, SubtitleDocument};

/// Render a subtitle document as numbered plain text for LLM translation.
///
/// Every cue is written as one `<id> text` line; lines are separated by a
/// single newline and the result has no trailing newline. Newlines inside a
/// cue's text are rewritten to the two-character sequence `\N` (the ASS hard
/// line break) so that each cue stays on a single line.
pub fn to_numbered_text(document: &SubtitleDocument) -> String {
    let mut rendered = String::new();

    for (position, cue) in document.cues.iter().enumerate() {
        // Separator goes *between* lines only, never after the last one.
        if position > 0 {
            rendered.push('\n');
        }
        rendered.push('<');
        rendered.push_str(&cue.id.to_string());
        rendered.push_str("> ");
        rendered.push_str(&cue.text.replace('\n', "\\N"));
    }

    rendered
}

/// Parse numbered text from an LLM response back into a cue ID → text map.
///
/// A line of the form `<id> text` starts a new entry. Any other line that
/// follows an entry is a continuation of that entry and is joined to it with
/// `\n`. Blank lines *inside* an entry (followed by more continuation text)
/// are preserved, but blank lines that merely trail an entry — the separators
/// LLMs commonly put between items, or at the end of the response — are
/// dropped instead of being appended to the cue as trailing newlines. Blank
/// lines before the first entry are ignored.
///
/// # Errors
///
/// Returns [`SubtitleToolkitError::InvalidTranslation`] when:
/// - a line starting with `<` is malformed or has a non-numeric id,
/// - an id is not listed in `expected_ids`,
/// - an id appears more than once,
/// - non-blank text appears before the first numbered line,
/// - an id from `expected_ids` is missing from `text`.
pub fn parse_numbered_text(text: &str, expected_ids: &[usize]) -> Result<BTreeMap<usize, String>> {
    let expected: HashSet<usize> = expected_ids.iter().copied().collect();
    let mut parsed = BTreeMap::new();
    let mut current_id = None;
    // Blank continuation lines seen since the last non-blank line of the
    // current entry. They are committed only if more text follows, so that
    // separator lines between entries never leak into the cue text.
    let mut pending_blank: Vec<&str> = Vec::new();

    for line in text.lines() {
        if let Some((id, value)) = parse_numbered_line(line)? {
            if !expected.contains(&id) {
                return Err(SubtitleToolkitError::InvalidTranslation {
                    message: format!("unexpected id <{id}>"),
                });
            }

            if parsed.insert(id, value.to_string()).is_some() {
                return Err(SubtitleToolkitError::InvalidTranslation {
                    message: format!("duplicate id <{id}>"),
                });
            }
            current_id = Some(id);
            // Whatever blank lines trailed the previous entry were separators.
            pending_blank.clear();
        } else if let Some(id) = current_id {
            if line.trim().is_empty() {
                pending_blank.push(line);
                continue;
            }

            let value = parsed.get_mut(&id).expect("current id must exist");
            // Flush held-back blank lines first so interior blank lines keep
            // their exact position, then append the current line.
            for part in pending_blank.drain(..).chain(std::iter::once(line)) {
                if !value.is_empty() {
                    value.push('\n');
                }
                value.push_str(part);
            }
        } else if !line.trim().is_empty() {
            return Err(SubtitleToolkitError::InvalidTranslation {
                message: format!("text before first id: {line}"),
            });
        }
    }

    // Report the first missing id in the caller's order.
    for id in expected_ids {
        if !parsed.contains_key(id) {
            return Err(SubtitleToolkitError::InvalidTranslation {
                message: format!("missing id <{id}>"),
            });
        }
    }

    Ok(parsed)
}

/// Try to read a single `<id> text` line.
///
/// Leading whitespace before `<` is ignored. Returns `Ok(None)` when the line
/// does not start with `<` (i.e. it is not a numbered line at all), and
/// `Ok(Some((id, text)))` with whitespace after `>` removed otherwise.
///
/// Fails with `InvalidTranslation` when the line starts with `<` but has no
/// closing `>`, or when the text between the brackets is not a valid `usize`.
/// Error messages quote the original, untrimmed line.
fn parse_numbered_line(line: &str) -> Result<Option<(usize, &str)>> {
    let after_bracket = match line.trim_start().strip_prefix('<') {
        Some(rest) => rest,
        None => return Ok(None),
    };

    let (raw_id, payload) =
        after_bracket
            .split_once('>')
            .ok_or_else(|| SubtitleToolkitError::InvalidTranslation {
                message: format!("malformed numbered line: {line}"),
            })?;

    match raw_id.parse::<usize>() {
        Ok(id) => Ok(Some((id, payload.trim_start()))),
        Err(_) => Err(SubtitleToolkitError::InvalidTranslation {
            message: format!("invalid id in line: {line}"),
        }),
    }
}

/// Write translated text back into `document`.
///
/// Consumes `translated` and hands every `(id, text)` pair, in ascending id
/// order (the `BTreeMap` iteration order), to `SubtitleDocument::replace_text`.
/// How an id that is not present in the document is treated is decided by
/// `replace_text`; this function does not check for it.
pub fn apply_translation(document: &mut SubtitleDocument, translated: BTreeMap<usize, String>) {
    translated.into_iter().for_each(|(id, text)| {
        document.replace_text(id, text);
    });
}

/// Return the byte length of the brace-delimited block at the start of `text`.
///
/// The block must begin with `{`; nested braces are balanced, and the
/// returned offset points just past the `}` that closes the first `{`.
/// Returns `0` when `text` does not start with `{` or when the opening brace
/// is never closed.
fn find_tag_end(text: &str) -> usize {
    if !text.starts_with('{') {
        return 0;
    }

    // `{` and `}` are ASCII, so scanning bytes yields the same offsets as
    // scanning chars: bytes of multi-byte sequences never match either brace.
    let mut open_braces = 0usize;
    for (offset, byte) in text.bytes().enumerate() {
        if byte == b'{' {
            open_braces += 1;
        } else if byte == b'}' {
            // Cannot underflow: the text starts with `{` and we return as
            // soon as the count drops back to zero.
            open_braces -= 1;
            if open_braces == 0 {
                return offset + 1;
            }
        }
    }

    0
}

/// Split `text` into its body and its run of leading brace blocks.
///
/// Consecutive `{...}` blocks at the very start of `text` are collected
/// verbatim. Returns `(body, tags)`: the text that follows the last leading
/// block, and the concatenated blocks themselves (empty when there are none).
fn strip_override_tags(text: &str) -> (String, String) {
    // Byte offset just past the last leading block found so far.
    let mut consumed = 0;

    loop {
        match find_tag_end(&text[consumed..]) {
            0 => break,
            block_len => consumed += block_len,
        }
    }

    let (tags, body) = text.split_at(consumed);
    (body.to_string(), tags.to_string())
}

/// Remove leading ASS override blocks (`{\pos(...)}`, `{\an7}`, etc.) from every cue.
///
/// Returns a new document with the same cue IDs and the tag-free text, plus a
/// map from cue ID to the exact tag prefix that was removed. Cues that had no
/// leading block get no entry in the map. Pass the map to [`reinject_tags`]
/// to put the prefixes back after translation.
pub fn strip_tags(document: &SubtitleDocument) -> (SubtitleDocument, HashMap<usize, String>) {
    let mut removed_by_id = HashMap::new();

    let cues = document
        .cues
        .iter()
        .map(|cue| {
            let (text, prefix) = strip_override_tags(&cue.text);
            if !prefix.is_empty() {
                removed_by_id.insert(cue.id, prefix);
            }
            SubtitleCue { id: cue.id, text }
        })
        .collect();

    (SubtitleDocument { cues }, removed_by_id)
}

/// Put back the ASS override tags that [`strip_tags`] removed.
///
/// For every cue whose ID has an entry in `tag_map`, the stored tag string is
/// prepended to the cue's current text. Cues without an entry are left
/// untouched, and entries whose ID matches no cue are ignored.
pub fn reinject_tags(document: &mut SubtitleDocument, tag_map: &HashMap<usize, String>) {
    for cue in document.cues.iter_mut() {
        let Some(prefix) = tag_map.get(&cue.id) else {
            continue;
        };
        cue.text.insert_str(0, prefix);
    }
}

/// Split a subtitle document into chunks for batched LLM translation.
///
/// Cues are packed greedily, in order, so that the numbered-text rendering of
/// each chunk (`<id> text` lines joined by a newline, with in-cue newlines
/// written as `\N`) is at most `max_chars` characters long. Length is
/// measured in Unicode characters, not UTF-8 bytes, so non-ASCII subtitles
/// are not chunked more aggressively than ASCII ones.
///
/// Cue IDs are preserved across chunks. A single cue whose line is longer
/// than `max_chars` gets a chunk of its own (cues are never split). An empty
/// document yields no chunks.
pub fn chunk_document(document: &SubtitleDocument, max_chars: usize) -> Vec<SubtitleDocument> {
    let mut chunks = Vec::new();
    let mut current_cues = Vec::new();
    let mut current_chars = 0usize;

    for cue in &document.cues {
        // Same line shape as the numbered-text serialization; count chars
        // rather than bytes to honour the documented `max_chars` contract.
        let line_chars = format!("<{}> {}", cue.id, cue.text.replace('\n', "\\N"))
            .chars()
            .count();

        // The extra 1 is the newline that would join this line to the chunk.
        if !current_cues.is_empty() && current_chars + 1 + line_chars > max_chars {
            chunks.push(SubtitleDocument {
                cues: std::mem::take(&mut current_cues),
            });
            current_chars = 0;
        }

        current_chars += if current_cues.is_empty() {
            line_chars
        } else {
            1 + line_chars
        };
        current_cues.push(cue.clone());
    }

    if !current_cues.is_empty() {
        chunks.push(SubtitleDocument { cues: current_cues });
    }

    chunks
}

/// Split a document into consecutive chunks of at most `max_lines` cues.
///
/// An alternative to [`chunk_document`] that limits the number of cues per
/// chunk instead of the number of characters. Cue order and IDs are
/// preserved; only the last chunk may hold fewer than `max_lines` cues.
///
/// Returns an empty vector when the document has no cues or when
/// `max_lines` is `0`.
pub fn chunk_document_by_lines(document: &SubtitleDocument, max_lines: usize) -> Vec<SubtitleDocument> {
    let mut chunks = Vec::new();

    // `slice::chunks` panics on a chunk size of zero, so bail out first.
    if max_lines == 0 {
        return chunks;
    }

    for group in document.cues.chunks(max_lines) {
        chunks.push(SubtitleDocument {
            cues: group.to_vec(),
        });
    }

    chunks
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::subtitles::model::{SubtitleCue, SubtitleDocument};

    /// Build a document from `(id, text)` pairs to keep fixtures short.
    fn document_of(cues: &[(usize, &str)]) -> SubtitleDocument {
        SubtitleDocument {
            cues: cues
                .iter()
                .map(|&(id, text)| SubtitleCue {
                    id,
                    text: text.into(),
                })
                .collect(),
        }
    }

    /// Build a document with ids `1..=count`, each with the text `line <id>`.
    fn numbered_lines(count: usize) -> SubtitleDocument {
        SubtitleDocument {
            cues: (1..=count)
                .map(|i| SubtitleCue {
                    id: i,
                    text: format!("line {i}"),
                })
                .collect(),
        }
    }

    /// Flatten the cue ids of all chunks, in chunk order.
    fn all_ids(chunks: &[SubtitleDocument]) -> Vec<usize> {
        chunks
            .iter()
            .flat_map(|c| c.cues.iter().map(|cue| cue.id))
            .collect()
    }

    // --- to_numbered_text -------------------------------------------------

    #[test]
    fn formats_numbered_text() {
        let document = document_of(&[(1, "hello"), (2, "world")]);

        assert_eq!(to_numbered_text(&document), "<1> hello\n<2> world");
    }

    #[test]
    fn encodes_internal_newlines_as_hard_breaks() {
        let document = document_of(&[(1, "two\nlines")]);

        assert_eq!(to_numbered_text(&document), "<1> two\\Nlines");
    }

    // --- parse_numbered_text ----------------------------------------------

    #[test]
    fn parses_multiline_numbered_text() {
        let parsed = parse_numbered_text("<1> olá\ncontinua\n<2> mundo", &[1, 2]).unwrap();

        assert_eq!(parsed.get(&1).unwrap(), "olá\ncontinua");
        assert_eq!(parsed.get(&2).unwrap(), "mundo");
    }

    #[test]
    fn ignores_blank_lines_before_first_id() {
        let parsed = parse_numbered_text("\n  \n<1> olá", &[1]).unwrap();

        assert_eq!(parsed.get(&1).unwrap(), "olá");
    }

    #[test]
    fn rejects_missing_ids() {
        let error = parse_numbered_text("<1> olá", &[1, 2]).unwrap_err();

        assert!(error.to_string().contains("missing id <2>"));
    }

    #[test]
    fn rejects_unexpected_ids() {
        let error = parse_numbered_text("<1> olá\n<3> mundo", &[1, 2]).unwrap_err();

        assert!(error.to_string().contains("unexpected id <3>"));
    }

    #[test]
    fn rejects_duplicate_ids() {
        let error = parse_numbered_text("<1> olá\n<1> mundo", &[1]).unwrap_err();

        assert!(error.to_string().contains("duplicate id <1>"));
    }

    #[test]
    fn rejects_text_before_first_id() {
        let error = parse_numbered_text("intro\n<1> olá", &[1]).unwrap_err();

        assert!(error.to_string().contains("text before first id: intro"));
    }

    #[test]
    fn rejects_non_numeric_ids() {
        let error = parse_numbered_text("<x> olá", &[1]).unwrap_err();

        assert!(error.to_string().contains("invalid id in line"));
    }

    // --- strip_override_tags / strip_tags / reinject_tags -----------------

    #[test]
    fn strip_tags_extracts_leading_tags() {
        let (clean, tags) = strip_override_tags(r"{\pos(857.6,122.4)}{\an7}STATUS");
        assert_eq!(clean, "STATUS");
        assert_eq!(tags, r"{\pos(857.6,122.4)}{\an7}");
    }

    #[test]
    fn strip_tags_no_tags() {
        let (clean, tags) = strip_override_tags("Hello world");
        assert_eq!(clean, "Hello world");
        assert!(tags.is_empty());
    }

    #[test]
    fn strip_tags_only_tags() {
        let (clean, tags) = strip_override_tags(r"{\pos(1,2)}{\an7}");
        assert!(clean.is_empty());
        assert_eq!(tags, r"{\pos(1,2)}{\an7}");
    }

    #[test]
    fn strip_tags_single_tag() {
        let (clean, tags) = strip_override_tags(r"{\b1}Bold text");
        assert_eq!(clean, "Bold text");
        assert_eq!(tags, r"{\b1}");
    }

    #[test]
    fn strip_tags_inner_braces_not_confused() {
        let (clean, tags) = strip_override_tags(r"{\pos(1.0,2.0)}Hello");
        assert_eq!(clean, "Hello");
        assert_eq!(tags, r"{\pos(1.0,2.0)}");
    }

    #[test]
    fn reinject_tags_roundtrip() {
        let document = document_of(&[(1, r"{\pos(1,2)}{\an7}STATUS"), (2, "No tags here")]);

        let (clean_doc, tag_map) = strip_tags(&document);
        assert_eq!(clean_doc.cues[0].text, "STATUS");
        assert_eq!(clean_doc.cues[1].text, "No tags here");

        let mut restored = clean_doc;
        reinject_tags(&mut restored, &tag_map);
        assert_eq!(restored.cues[0].text, r"{\pos(1,2)}{\an7}STATUS");
        assert_eq!(restored.cues[1].text, "No tags here");
    }

    #[test]
    fn reinject_tags_empty() {
        let mut doc = document_of(&[(1, "clean")]);
        let tag_map = HashMap::new();
        reinject_tags(&mut doc, &tag_map);
        assert_eq!(doc.cues[0].text, "clean");
    }

    #[test]
    fn to_numbered_text_after_stripping() {
        let document = document_of(&[(1, r"{\pos(1,2)}{\an7}STATUS"), (2, "Hello")]);

        let (clean, _) = strip_tags(&document);
        let numbered = to_numbered_text(&clean);
        assert_eq!(numbered, "<1> STATUS\n<2> Hello");
    }

    // --- chunk_document ---------------------------------------------------

    #[test]
    fn chunk_document_splits_when_needed() {
        let document = numbered_lines(10);

        // Each line is 10-12 chars ("<N> line N"), ~110 chars in total.
        // With a limit of 50 the document must split into multiple chunks.
        let chunks = chunk_document(&document, 50);
        assert!(chunks.len() > 1);

        // All cues preserved across chunks
        let total_cues: usize = chunks.iter().map(|c| c.cues.len()).sum();
        assert_eq!(total_cues, 10);

        // IDs preserved, in order
        assert_eq!(all_ids(&chunks), (1..=10).collect::<Vec<_>>());
    }

    #[test]
    fn chunk_document_single_chunk_when_fits() {
        let document = document_of(&[(1, "hello"), (2, "world")]);

        let chunks = chunk_document(&document, 10000);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].cues.len(), 2);
    }

    #[test]
    fn chunk_document_oversized_cue_gets_own_chunk() {
        let big_text = "x".repeat(200);
        let document = document_of(&[(1, big_text.as_str()), (2, "small")]);

        let chunks = chunk_document(&document, 50);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].cues[0].id, 1);
        assert_eq!(chunks[0].cues[0].text, big_text);
        assert_eq!(chunks[1].cues[0].id, 2);
    }

    #[test]
    fn chunk_document_empty() {
        let document = SubtitleDocument { cues: vec![] };
        let chunks = chunk_document(&document, 5000);
        assert!(chunks.is_empty());
    }

    // --- chunk_document_by_lines ------------------------------------------

    #[test]
    fn chunk_document_by_lines_splits_by_cue_count() {
        let chunks = chunk_document_by_lines(&numbered_lines(5), 2);

        let sizes: Vec<usize> = chunks.iter().map(|c| c.cues.len()).collect();
        assert_eq!(sizes, vec![2, 2, 1]);
        assert_eq!(all_ids(&chunks), (1..=5).collect::<Vec<_>>());
    }

    #[test]
    fn chunk_document_by_lines_zero_limit_yields_no_chunks() {
        let chunks = chunk_document_by_lines(&numbered_lines(3), 0);
        assert!(chunks.is_empty());
    }

    #[test]
    fn chunk_document_by_lines_empty() {
        let document = SubtitleDocument { cues: vec![] };
        let chunks = chunk_document_by_lines(&document, 3);
        assert!(chunks.is_empty());
    }
}