// rssume 0.3.5
//
// RSS middleware with AI-powered translation and summarization
use super::{StreamResult, chat_stream};
use crate::config::LlmProviderConfig;
use crate::monitor::LogStatus;

/// System prompt passed to `chat_stream` for every translate-and-summarize request.
///
/// It asks the model to wrap each part of its reply in `|||NAME|||` /
/// `|||END_NAME|||` marker lines. These are the exact marker strings that
/// `parse_llm_output` matches, so the prompt and the parser must be changed together.
const SYSTEM_PROMPT: &str = r#"Translate the article to the target language. Then write a one-sentence summary — just what the article is about, no filler.

Output format (each tag on its own line):

|||TITLE|||
<translated title>
|||END_TITLE|||

|||CONTENT|||
<translated content>
|||END_CONTENT|||

|||SUMMARY|||
<summary, one sentence, under 30 words>
|||END_SUMMARY|||

Keep HTML tags and code blocks intact."#;

/// Sections recovered from a tagged LLM reply by `parse_llm_output`.
///
/// All fields start as `None` (via `Default`); a field stays `None` if the reply
/// never supplied any lines for that section.
#[derive(Debug, Default)]
pub struct ParsedArticle {
    /// Text captured from the `|||TITLE|||` section.
    pub title: Option<String>,
    /// Text captured from the `|||CONTENT|||` section.
    pub content: Option<String>,
    /// Text captured from the `|||SUMMARY|||` section.
    pub summary: Option<String>,
}

/// The section of the tagged reply that the parser is currently collecting lines for.
enum Section {
    /// Lines following a `|||TITLE|||` marker.
    Title,
    /// Lines following a `|||CONTENT|||` marker.
    Content,
    /// Lines following a `|||SUMMARY|||` marker.
    Summary,
}

/// Translates an article into `target_lang` and produces a short summary via the LLM.
///
/// The user prompt is built from the target language, title and content, with
/// `config.prompt_append` added on its own trailing line when it is non-empty. The
/// completion is streamed through `chat_stream` and the tagged reply is parsed with
/// `parse_llm_output`. Each streamed chunk is appended to the current log entry from
/// a spawned task whose handle is discarded, so this function never awaits those
/// log updates.
///
/// A failed request and an unparseable reply are handled the same way: the error is
/// recorded on `retry_ctx`, and the loop runs again after `retry_ctx.wait()` unless
/// `retry_ctx.should_retry()` returns `false`.
///
/// # Errors
///
/// Returns the error taken from `retry_ctx.take_last_error()` once `should_retry()`
/// is `false`.
///
/// # Panics
///
/// Panics if `retry_ctx.current_log_id()` is `None` after `prepare_retry()`, or if
/// `take_last_error()` is `None` right after a failure was recorded.
pub async fn translate_and_summarize(
    config: &LlmProviderConfig,
    title: &str,
    content: &str,
    target_lang: &str,
    retry_ctx: &mut super::retry::RetryContext,
) -> Result<(StreamResult, ParsedArticle), crate::error::AppError> {
    // Base prompt first; the optional provider-specific suffix goes on a new line.
    let mut full = format!(
        "Target language: {}\n\nTitle: {}\n\nContent:\n{}",
        target_lang, title, content
    );
    let append = config.prompt_append.clone().unwrap_or_default();
    if !append.is_empty() {
        full = format!("{}\n{}", full, append);
    }

    loop {
        retry_ctx.prepare_retry().await;
        let log_id = retry_ctx.current_log_id().unwrap().to_string();
        let monitor = retry_ctx.monitor.clone();
        let feed_name = retry_ctx.feed_name.clone();

        // Called for every streamed chunk. The callback itself is synchronous, so the
        // log update (which needs the async write lock) is pushed onto its own task.
        let on_token = move |chunk: &str| {
            let monitor = monitor.clone();
            let feed_name = feed_name.clone();
            let log_id = log_id.clone();
            let chunk = chunk.to_string();
            tokio::task::spawn(async move {
                monitor
                    .write()
                    .await
                    .update_log(&feed_name, &log_id, |log| {
                        log.streamed_text.push_str(&chunk);
                        log.status = LogStatus::Streaming {
                            tokens: log.streamed_text.clone(),
                        };
                    });
            });
        };

        // Success returns from inside the match; both failure kinds only record the
        // error here and then share the retry decision below.
        match chat_stream(config, SYSTEM_PROMPT, &full, on_token).await {
            Ok(result) => match parse_llm_output(&result.text) {
                Ok(parsed) => {
                    retry_ctx.mark_success(&result.usage).await;
                    return Ok((result, parsed));
                }
                Err(parse_err) => {
                    retry_ctx.record_failure(parse_err).await;
                }
            },
            Err(stream_err) => {
                retry_ctx.record_failure(stream_err).await;
            }
        }

        if !retry_ctx.should_retry() {
            return Err(retry_ctx.take_last_error().unwrap());
        }
        retry_ctx.wait().await;
    }
}

fn parse_llm_output(text: &str) -> Result<ParsedArticle, crate::error::AppError> {
    let mut result = ParsedArticle::default();
    let mut current_section: Option<Section> = None;
    let mut buffer = Vec::new();

    for line in text.lines() {
        match line.trim() {
            "|||TITLE|||" => {
                flush_buffer(&mut buffer, &mut current_section, &mut result);
                current_section = Some(Section::Title);
            }
            "|||END_TITLE|||" => {
                flush_buffer(&mut buffer, &mut current_section, &mut result);
                current_section = None;
            }
            "|||CONTENT|||" => {
                flush_buffer(&mut buffer, &mut current_section, &mut result);
                current_section = Some(Section::Content);
            }
            "|||END_CONTENT|||" => {
                flush_buffer(&mut buffer, &mut current_section, &mut result);
                current_section = None;
            }
            "|||SUMMARY|||" => {
                flush_buffer(&mut buffer, &mut current_section, &mut result);
                current_section = Some(Section::Summary);
            }
            "|||END_SUMMARY|||" => {
                flush_buffer(&mut buffer, &mut current_section, &mut result);
                current_section = None;
            }
            _ => {
                if current_section.is_some() {
                    buffer.push(line.to_string());
                }
            }
        }
    }
    flush_buffer(&mut buffer, &mut current_section, &mut result);

    if result.title.is_none() && result.content.is_none() {
        return Err(crate::error::AppError::Llm(
            "Failed to parse LLM output: no title or content found".into(),
        ));
    }

    Ok(result)
}

/// Moves the lines gathered for the open section into the matching field of `result`.
///
/// The lines are joined with `\n` and surrounding whitespace is trimmed. When the
/// trimmed text is empty (only blank lines were collected) the field is left
/// untouched, so a section the model left blank stays `None`, exactly like a section
/// with no lines at all, instead of becoming `Some("")` and slipping past the
/// "no title or content" check in `parse_llm_output`.
///
/// `buffer` is always empty on return. `section` is only read; when it is `None` the
/// collected lines are discarded.
fn flush_buffer(
    buffer: &mut Vec<String>,
    section: &mut Option<Section>,
    result: &mut ParsedArticle,
) {
    if buffer.is_empty() {
        return;
    }
    let joined = buffer.join("\n");
    buffer.clear();

    let text = joined.trim();
    if text.is_empty() {
        // Blank-only section: treat it as absent rather than storing an empty string.
        return;
    }

    let slot = match section {
        Some(Section::Title) => &mut result.title,
        Some(Section::Content) => &mut result.content,
        Some(Section::Summary) => &mut result.summary,
        None => return,
    };
    *slot = Some(text.to_string());
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a reply from individual lines, joined with `\n` and no trailing newline.
    fn reply(lines: &[&str]) -> String {
        lines.join("\n")
    }

    /// Parses a reply that is expected to be accepted.
    fn parse_ok(lines: &[&str]) -> ParsedArticle {
        parse_llm_output(&reply(lines)).unwrap()
    }

    // All three sections present, with a blank line inside the content.
    #[test]
    fn parse_all_sections() {
        let article = parse_ok(&[
            "|||TITLE|||",
            "Translated Title",
            "|||END_TITLE|||",
            "",
            "|||CONTENT|||",
            "First paragraph.",
            "",
            "Second paragraph.",
            "|||END_CONTENT|||",
            "",
            "|||SUMMARY|||",
            "A short summary.",
            "|||END_SUMMARY|||",
        ]);
        assert_eq!(article.title.as_deref(), Some("Translated Title"));
        assert_eq!(
            article.content.as_deref(),
            Some("First paragraph.\n\nSecond paragraph.")
        );
        assert_eq!(article.summary.as_deref(), Some("A short summary."));
    }

    // The summary section is optional.
    #[test]
    fn parse_title_and_content_only() {
        let article = parse_ok(&[
            "|||TITLE|||",
            "Just a Title",
            "|||END_TITLE|||",
            "",
            "|||CONTENT|||",
            "Just content.",
            "|||END_CONTENT|||",
        ]);
        assert_eq!(article.title.as_deref(), Some("Just a Title"));
        assert_eq!(article.content.as_deref(), Some("Just content."));
        assert!(article.summary.is_none());
    }

    // A summary alone is not enough for a successful parse.
    #[test]
    fn missing_title_and_content_is_error() {
        let input = reply(&["|||SUMMARY|||", "A summary only.", "|||END_SUMMARY|||"]);
        assert!(parse_llm_output(&input).is_err());
    }

    #[test]
    fn empty_input_is_error() {
        assert!(parse_llm_output("").is_err());
    }

    // Consecutive content lines are joined with single newlines.
    #[test]
    fn multiline_content() {
        let article = parse_ok(&[
            "|||TITLE|||",
            "T",
            "|||END_TITLE|||",
            "",
            "|||CONTENT|||",
            "Line 1",
            "Line 2",
            "Line 3",
            "|||END_CONTENT|||",
        ]);
        assert_eq!(article.content.as_deref(), Some("Line 1\nLine 2\nLine 3"));
    }

    // Leading indentation of a section's text is stripped.
    #[test]
    fn whitespace_is_trimmed() {
        let article = parse_ok(&[
            "|||TITLE|||",
            "  Padded Title",
            "|||END_TITLE|||",
            "",
            "|||CONTENT|||",
            "  Padded content.",
            "|||END_CONTENT|||",
        ]);
        assert_eq!(article.title.as_deref(), Some("Padded Title"));
        assert_eq!(article.content.as_deref(), Some("Padded content."));
    }

    // Text before the first marker does not leak into any section.
    #[test]
    fn leading_junk_before_tags_is_ignored() {
        let article = parse_ok(&[
            "Here is the translation:",
            "",
            "|||TITLE|||",
            "Real Title",
            "|||END_TITLE|||",
            "",
            "|||CONTENT|||",
            "Real content.",
            "|||END_CONTENT|||",
        ]);
        assert_eq!(article.title.as_deref(), Some("Real Title"));
        assert_eq!(article.content.as_deref(), Some("Real content."));
    }

    // HTML markup passes through the parser unchanged.
    #[test]
    fn html_tags_preserved_in_content() {
        let article = parse_ok(&[
            "|||TITLE|||",
            "T",
            "|||END_TITLE|||",
            "",
            "|||CONTENT|||",
            "<p>Hello <b>world</b></p>",
            "|||END_CONTENT|||",
        ]);
        assert_eq!(
            article.content.as_deref(),
            Some("<p>Hello <b>world</b></p>")
        );
    }

    // Section order in the reply does not matter.
    #[test]
    fn out_of_order_sections() {
        let article = parse_ok(&[
            "|||SUMMARY|||",
            "Sum.",
            "|||END_SUMMARY|||",
            "",
            "|||CONTENT|||",
            "Body.",
            "|||END_CONTENT|||",
            "",
            "|||TITLE|||",
            "Title.",
            "|||END_TITLE|||",
        ]);
        assert_eq!(article.title.as_deref(), Some("Title."));
        assert_eq!(article.content.as_deref(), Some("Body."));
        assert_eq!(article.summary.as_deref(), Some("Sum."));
    }

    // Content without a title is still accepted.
    #[test]
    fn content_only_is_valid() {
        let article = parse_ok(&[
            "|||CONTENT|||",
            "Just content, no title tag.",
            "|||END_CONTENT|||",
        ]);
        assert!(article.title.is_none());
        assert_eq!(
            article.content.as_deref(),
            Some("Just content, no title tag.")
        );
    }
}