tail-fin-gemini 0.6.5

//! HTML / response parsers. Pure functions, no network — easy to unit-test.

use std::sync::OnceLock;

use regex::Regex;
use serde_json::Value;

use tail_fin_common::TailFinError;

/// Session-bound tokens pulled from `gemini.google.com/app`.
///
/// Built by [`extract_session_tokens`]. Every field holds the text captured
/// from the page HTML by the matching extractor, copied verbatim (no
/// unescaping or normalisation is applied).
#[derive(Debug, Clone)]
pub struct SessionTokens {
    /// CSRF-style token passed as `at=` on StreamGenerate.
    /// Captured from the `"SNlM0e":"…"` entry of the page.
    pub snlm0e: String,
    /// Pre-registered upload identifier — required as `Push-ID` on
    /// push.clients6.google.com. Shape: `feeds/<id>`.
    /// Captured from the `"qKIAYe":"feeds/…"` entry of the page.
    pub push_id: String,
    /// Build label passed as `?bl=` on StreamGenerate. Rotates on
    /// server releases — self-updating keeps us from going stale.
    /// Shape: `boq_assistant-bard-web-server_<YYYYMMDD>.<NN>_pN`.
    /// Captured from the `"cfb2h":"…"` entry of the page.
    pub build_label: String,
}

/// Extract every session-bound token from the `/app` HTML.
/// Gemini embeds these in a `window.WIZ_global_data` JSON blob.
///
/// The three extractors run one after another (SNlM0e, then Push-ID, then
/// build label), each scanning `html` independently.
///
/// # Errors
///
/// Returns the error of the first extractor that fails; later extractors
/// are not run in that case.
pub fn extract_session_tokens(html: &str) -> Result<SessionTokens, TailFinError> {
    let snlm0e = extract_snlm0e(html)?;
    let push_id = extract_push_id(html)?;
    let build_label = extract_build_label(html)?;
    Ok(SessionTokens {
        snlm0e,
        push_id,
        build_label,
    })
}

/// Extract only the SNlM0e CSRF token.
///
/// Looks for the first `"SNlM0e":"<value>"` pair in `html` and returns
/// `<value>` (one or more characters up to the next double quote).
///
/// # Errors
///
/// Returns `TailFinError::Api` when no such pair is present.
pub fn extract_snlm0e(html: &str) -> Result<String, TailFinError> {
    // Compiled once per process; the pattern is a constant, so `unwrap`
    // can only fail if the literal itself is edited into something invalid.
    static SNLM0E_RE: OnceLock<Regex> = OnceLock::new();
    let pattern = SNLM0E_RE.get_or_init(|| Regex::new(r#""SNlM0e":"([^"]+)""#).unwrap());

    match pattern.captures(html).and_then(|caps| caps.get(1)) {
        Some(token) => Ok(token.as_str().to_owned()),
        None => Err(TailFinError::Api(
            "could not extract SNlM0e token from /app HTML — cookies may be stale".into(),
        )),
    }
}

/// Extract the upload Push-ID. Gemini's /app page stores it under the
/// mangled key `qKIAYe` in WIZ_global_data — value shape `feeds/<id>`.
///
/// Only values that start with `feeds/` are accepted; the returned string
/// includes that prefix.
///
/// # Errors
///
/// Returns `TailFinError::Api` when no `"qKIAYe":"feeds/…"` pair is found.
pub fn extract_push_id(html: &str) -> Result<String, TailFinError> {
    // Compiled lazily on first use and shared afterwards.
    static PUSH_ID_RE: OnceLock<Regex> = OnceLock::new();
    let pattern = PUSH_ID_RE.get_or_init(|| Regex::new(r#""qKIAYe":"(feeds/[^"]+)""#).unwrap());

    let Some(hit) = pattern.captures(html).and_then(|caps| caps.get(1)) else {
        return Err(TailFinError::Api(
            "could not extract upload Push-ID from /app HTML — Gemini may have changed".into(),
        ));
    };
    Ok(hit.as_str().to_owned())
}

/// Extract the `bl` build label (stored in WIZ_global_data under `cfb2h`).
/// Hardcoding this goes stale in weeks; self-extracting tracks upstream.
///
/// Only values starting with `boq_assistant-bard-web-server_` are accepted;
/// the returned string includes that prefix.
///
/// # Errors
///
/// Returns `TailFinError::Api` (mentioning `cfb2h`) when no matching pair is
/// found.
pub fn extract_build_label(html: &str) -> Result<String, TailFinError> {
    // Compiled lazily on first use and shared afterwards.
    static BUILD_LABEL_RE: OnceLock<Regex> = OnceLock::new();
    let pattern = BUILD_LABEL_RE
        .get_or_init(|| Regex::new(r#""cfb2h":"(boq_assistant-bard-web-server_[^"]+)""#).unwrap());

    if let Some(label) = pattern.captures(html).and_then(|caps| caps.get(1)) {
        return Ok(label.as_str().to_owned());
    }
    Err(TailFinError::Api(
        "could not extract build label (cfb2h) from /app HTML — key may have been renamed"
            .into(),
    ))
}

/// Pull the model's reply text out of a StreamGenerate response body.
///
/// The body is shaped like:
///
/// ```text
/// )]}'
///
/// 4567
/// [[["wrb.fr","method","[[\"... response ...\"]]"], ...]]
/// …
/// ```
///
/// The `)]}'` prefix guards against JSON hijacking; the lines after it
/// alternate `<length>` / `<json array>` chunks. A chunk may hold several
/// `wrb.fr` frames — metadata, conversation IDs, and the model response
/// arrive as separate frames. Every frame is probed at a fixed list of
/// known-good JSON pointers; empty strings are ignored and the longest
/// remaining candidate is returned (the earliest one on a tie).
///
/// **No silent fallback**: when no pointer hits, an error is returned
/// instead of guessing with "longest string anywhere in the tree" — that
/// heuristic happily returns prompt echoes, telemetry blobs, or
/// "Gemini is thinking…" as real answers.
///
/// # Errors
///
/// Returns `TailFinError::Parse` when no frame yields a non-empty string at
/// any of the known pointers (including when the body has no frames at all).
pub fn extract_response_text(body: &str) -> Result<String, TailFinError> {
    // Known-good paths as of 2026-04. `inner[4][0][1][0]` is where
    // text-only and attachment responses have landed; `/4/0/1/1/0`
    // and `/4/0/2/0` are safety nets against shape drift.
    const REPLY_PATHS: [&str; 3] = ["/4/0/1/0", "/4/0/1/1/0", "/4/0/2/0"];

    let stream = body.trim_start_matches(")]}'").trim_start();

    // Track the longest candidate seen so far so multi-chunk streams
    // (partial answers superseded by final ones) land on the full reply.
    let mut longest: Option<String> = None;
    for frame in iter_wrb_fr_inners(stream) {
        for path in REPLY_PATHS {
            let Some(text) = frame.pointer(path).and_then(Value::as_str) else {
                continue;
            };
            if text.is_empty() {
                continue;
            }
            // Strictly longer only: on equal length the earlier candidate stays.
            let is_longer = longest
                .as_ref()
                .map_or(true, |best| text.len() > best.len());
            if is_longer {
                longest = Some(text.to_owned());
            }
        }
    }

    longest.ok_or_else(|| {
        TailFinError::Parse(
            "response shape drifted — none of /4/0/1/0, /4/0/1/1/0, /4/0/2/0 hit. \
             Dump a /app StreamGenerate response and add the new path in parsing.rs."
                .into(),
        )
    })
}

/// Scan the parsed stream for a committed conversation id (`c_<hex>`).
/// The id lives in a metadata `wrb.fr` frame at `inner[1][0]` — NOT in
/// arbitrary string positions throughout the body, which is why a raw
/// regex-on-body approach catches request ids and other transient
/// `c_<hex>` tokens that look the same.
pub fn extract_conversation_id(body: &str) -> Option<String> {
    extract_turn_ids(body).conversation_id
}

/// The three id strings Gemini returns alongside each turn. All three
/// are required to continue a conversation in a follow-up request.
///
/// Filled in by [`extract_turn_ids`]. A field is `None` when no frame held
/// a string with the expected prefix at the documented position; the
/// `Default` value has all three fields `None`.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct TurnIds {
    /// `c_<hex>` — lives at `inner[1][0]` in any wrb.fr frame.
    pub conversation_id: Option<String>,
    /// `r_<hex>` — lives at `inner[1][1]` alongside the conversation id.
    pub response_id: Option<String>,
    /// `rc_<hex>` — lives at `inner[4][0][0]` in the reply frame.
    pub choice_id: Option<String>,
}

/// Walk every wrb.fr frame and collect the conversation/response/choice
/// ids. Multiple frames may carry partial info — the streamed response
/// typically has one frame with `inner[1]` populated and a different
/// frame with the `rc_` choice id at `inner[4][0][0]`.
///
/// When `TAIL_FIN_GEMINI_TRACE=1` is set and we saw wrb.fr frames but
/// none of the expected id slots matched, writes a one-line breadcrumb
/// to stderr. This is the "Gemini silently reshuffled the payload
/// shape" early-warning signal — without it the user just sees
/// `can_continue() == false` with no hint the upstream changed.
pub fn extract_turn_ids(body: &str) -> TurnIds {
    let body = body.trim_start_matches(")]}'").trim_start();
    let mut out = TurnIds::default();
    let mut frames_seen = 0usize;
    for inner in iter_wrb_fr_inners(body) {
        frames_seen += 1;
        if out.conversation_id.is_none() {
            if let Some(s) = inner.pointer("/1/0").and_then(Value::as_str) {
                if s.starts_with("c_") {
                    out.conversation_id = Some(s.to_string());
                }
            }
        }
        if out.response_id.is_none() {
            if let Some(s) = inner.pointer("/1/1").and_then(Value::as_str) {
                if s.starts_with("r_") {
                    out.response_id = Some(s.to_string());
                }
            }
        }
        if out.choice_id.is_none() {
            if let Some(s) = inner.pointer("/4/0/0").and_then(Value::as_str) {
                if s.starts_with("rc_") {
                    out.choice_id = Some(s.to_string());
                }
            }
        }
    }
    if frames_seen > 0
        && out.conversation_id.is_none()
        && out.response_id.is_none()
        && out.choice_id.is_none()
        && trace_enabled()
    {
        eprintln!(
            "[tail-fin-gemini] extract_turn_ids: saw {frames_seen} wrb.fr frames but no id \
             slots matched. Gemini may have reshuffled the payload shape; \
             check inner[1][0]/[1][1]/[4][0][0] in a fresh response."
        );
    }
    out
}

/// Report whether tracing was requested via `TAIL_FIN_GEMINI_TRACE`.
///
/// The variable counts as truthy when, after trimming surrounding
/// whitespace, it equals `1`, `true`, `yes` or `on` (ASCII
/// case-insensitive). Any other value — or a variable that is unset or
/// cannot be read as Unicode — is falsy.
fn trace_enabled() -> bool {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];

    let Ok(raw) = std::env::var("TAIL_FIN_GEMINI_TRACE") else {
        return false;
    };
    let flag = raw.trim();
    TRUTHY
        .iter()
        .any(|accepted| flag.eq_ignore_ascii_case(accepted))
}

/// Yield the parsed inner payload of every `wrb.fr` frame, chunk by chunk.
///
/// Each balanced array found in `body` is parsed as JSON; within it, every
/// `wrb.fr` frame's payload string is parsed as JSON a second time (the
/// payload is itself JSON-encoded text). Chunks or payloads that fail to
/// parse are skipped silently rather than aborting the scan.
fn iter_wrb_fr_inners(body: &str) -> impl Iterator<Item = Value> + '_ {
    iter_json_arrays(body)
        .filter_map(|chunk| serde_json::from_str::<Value>(chunk).ok())
        .flat_map(|outer| {
            collect_wrb_fr_payloads(&outer)
                .into_iter()
                .filter_map(|payload| serde_json::from_str::<Value>(&payload).ok())
                .collect::<Vec<_>>()
        })
}

/// Return the balanced `[...]` slice of `s` that begins at byte offset
/// `start`.
///
/// Brackets inside double-quoted strings are ignored, and a backslash
/// inside a string escapes the following byte (so `\"` does not end the
/// string). Returns `None` when `start` is out of range, when the byte at
/// `start` is not `[`, or when the opening bracket is never closed.
fn first_json_array_at(s: &str, start: usize) -> Option<&str> {
    let tail = s.as_bytes().get(start..)?;
    if tail.first() != Some(&b'[') {
        return None;
    }

    let mut open_brackets = 0i32;
    let mut inside_string = false;
    let mut escaped = false;

    for (offset, byte) in tail.iter().copied().enumerate() {
        // The byte right after a backslash (within a string) is consumed
        // without interpretation.
        if escaped {
            escaped = false;
            continue;
        }

        if inside_string {
            match byte {
                b'\\' => escaped = true,
                b'"' => inside_string = false,
                _ => {}
            }
            continue;
        }

        match byte {
            b'"' => inside_string = true,
            b'[' => open_brackets += 1,
            b']' => {
                open_brackets -= 1;
                if open_brackets == 0 {
                    // Both ends are ASCII brackets, so the slice bounds
                    // fall on char boundaries.
                    return Some(&s[start..=start + offset]);
                }
            }
            _ => {}
        }
    }
    None
}

/// Iterate over every top-level balanced JSON array in `body`. Gemini's
/// stream wraps each chunk in a length prefix followed by a `[…]` — the
/// prefixes are skipped and the arrays are yielded in order.
///
/// After yielding an array the scan resumes right after its closing
/// bracket, so arrays nested inside a yielded one are not reported again.
/// A `[` that never balances is stepped over and the search continues from
/// the next byte.
fn iter_json_arrays(body: &str) -> impl Iterator<Item = &str> {
    let mut cursor = 0;
    std::iter::from_fn(move || loop {
        // `cursor` always sits just after an ASCII bracket (or at 0), so
        // this slice is on a char boundary; an exhausted tail ends the
        // iterator via `?`.
        let open = cursor + body[cursor..].find('[')?;
        match first_json_array_at(body, open) {
            Some(array) => {
                cursor = open + array.len();
                return Some(array);
            }
            None => cursor = open + 1,
        }
    })
}

/// Gather the payload string of every `wrb.fr` frame found anywhere inside
/// `v`, in the order the tree walk encounters them.
fn collect_wrb_fr_payloads(v: &Value) -> Vec<String> {
    let mut payloads: Vec<String> = Vec::new();
    walk_wrb_fr(v, &mut payloads);
    payloads
}

/// Append to `out` the payload of every `wrb.fr` frame nested in `v`.
///
/// A frame is an array whose first element is the string `"wrb.fr"` and
/// whose third element is a string (the payload). Arrays are visited
/// depth-first in document order, a frame's own payload before anything
/// nested inside it; non-array values are ignored.
fn walk_wrb_fr(v: &Value, out: &mut Vec<String>) {
    // Explicit stack instead of recursion; children are pushed in reverse
    // so they pop in their original left-to-right order.
    let mut pending: Vec<&Value> = vec![v];
    while let Some(node) = pending.pop() {
        let Value::Array(items) = node else {
            continue;
        };
        if let [Value::String(tag), _, Value::String(payload), ..] = items.as_slice() {
            if tag == "wrb.fr" {
                out.push(payload.clone());
            }
        }
        pending.extend(items.iter().rev());
    }
}

// Unit tests for the parsers above. Stream fixtures are assembled by
// JSON-encoding an inner payload with `serde_json::to_string` and embedding
// it as the third element of a `["wrb.fr", <method>, <payload>]` frame,
// then prepending the `)]}'` guard prefix.
#[cfg(test)]
mod tests {
    use super::*;

    // The token value is captured from a WIZ_global_data-style snippet.
    #[test]
    fn snlm0e_regex_pulls_token_from_wiz_global_data() {
        let html =
            r#"<script>window.WIZ_global_data={"SNlM0e":"abc123token","other":"x"};</script>"#;
        assert_eq!(extract_snlm0e(html).unwrap(), "abc123token");
    }

    // HTML without the key yields an error rather than an empty token.
    #[test]
    fn snlm0e_missing_returns_error() {
        assert!(extract_snlm0e("<html>no token</html>").is_err());
    }

    // The returned Push-ID keeps its `feeds/` prefix.
    #[test]
    fn push_id_regex_matches_feeds_shape() {
        let html = r#"{"qKIAYe":"feeds/mcudyrk2a4khkz","qwAQke":"BardChatUi"}"#;
        assert_eq!(extract_push_id(html).unwrap(), "feeds/mcudyrk2a4khkz");
    }

    // The full build label, prefix included, is returned.
    #[test]
    fn build_label_regex_matches_versioned_shape() {
        let html = r#"{"cfb2h":"boq_assistant-bard-web-server_20260415.04_p1"}"#;
        assert_eq!(
            extract_build_label(html).unwrap(),
            "boq_assistant-bard-web-server_20260415.04_p1"
        );
    }

    // The error text names the `cfb2h` key so the failure is actionable.
    #[test]
    fn build_label_missing_errors_with_hint() {
        let err = extract_build_label("<html/>").unwrap_err();
        assert!(format!("{err}").contains("cfb2h"));
    }

    // All three tokens are extracted from a single blob.
    #[test]
    fn extract_session_tokens_pulls_all_three() {
        let html = r#"{"SNlM0e":"tok","qKIAYe":"feeds/abc","cfb2h":"boq_assistant-bard-web-server_20260415.04_p1"}"#;
        let t = extract_session_tokens(html).unwrap();
        assert_eq!(t.snlm0e, "tok");
        assert_eq!(t.push_id, "feeds/abc");
        assert_eq!(
            t.build_label,
            "boq_assistant-bard-web-server_20260415.04_p1"
        );
    }

    // A `]` inside a string literal must not close the array early, and
    // the nested array is not yielded separately.
    #[test]
    fn iter_json_arrays_finds_balanced_bounds() {
        let s = r#"prefix [1,[2,3],"]"] trailing"#;
        let arrays: Vec<&str> = iter_json_arrays(s).collect();
        assert_eq!(arrays, vec![r#"[1,[2,3],"]"]"#]);
    }

    // Text between arrays is skipped; each top-level array is yielded once.
    #[test]
    fn iter_json_arrays_yields_multiple_chunks() {
        let arrays: Vec<&str> = iter_json_arrays("prefix [1,2] middle [3,[4]] tail").collect();
        assert_eq!(arrays, vec!["[1,2]", "[3,[4]]"]);
    }

    // Single frame with the reply at `inner[4][0][1][0]`.
    #[test]
    fn extract_response_text_happy_path() {
        let inner = r#"[null,null,null,null,[[null,["hello world"]]]]"#;
        let inner_escaped = serde_json::to_string(inner).unwrap();
        let outer = format!("[[[\"wrb.fr\",\"XkjvGc\",{}]]]", inner_escaped);
        let body = format!(")]}}'\n\n{}\n", outer);
        assert_eq!(extract_response_text(&body).unwrap(), "hello world");
    }

    #[test]
    fn extract_response_text_scans_all_chunks() {
        // Two chunks, second has the actual reply.
        let metadata = r#"[null,"c_abc","r_xyz"]"#;
        let reply_inner = r#"[null,null,null,null,[[null,["the real reply"]]]]"#;
        let c1 = format!(
            "[[[\"wrb.fr\",\"m1\",{}]]]",
            serde_json::to_string(metadata).unwrap()
        );
        let c2 = format!(
            "[[[\"wrb.fr\",\"m2\",{}]]]",
            serde_json::to_string(reply_inner).unwrap()
        );
        let body = format!(")]}}'\n\n{c1}\n\n{c2}\n");
        assert_eq!(extract_response_text(&body).unwrap(), "the real reply");
    }

    #[test]
    fn extract_response_text_errors_when_shape_drifts() {
        // When none of the known paths hit, we surface a Parse error
        // with an actionable hint — NOT a confident wrong answer.
        let inner = r#"["completely","different","shape","but contains the reply here"]"#;
        let inner_escaped = serde_json::to_string(inner).unwrap();
        let outer = format!("[[[\"wrb.fr\",\"XkjvGc\",{}]]]", inner_escaped);
        let body = format!(")]}}'\n\n{}\n", outer);
        let err = extract_response_text(&body).unwrap_err();
        let msg = format!("{err}");
        assert!(msg.contains("shape drifted"), "got: {msg}");
        assert!(msg.contains("parsing.rs"), "got: {msg}");
    }

    // A body holding only the guard prefix has no frames, hence no reply.
    #[test]
    fn extract_response_text_errors_on_empty_body() {
        assert!(extract_response_text(")]}'\n\n").is_err());
    }

    #[test]
    fn extract_conversation_id_picks_metadata_frame() {
        // `inner[1][0]` = "c_..." — that's the one we want.
        let meta = r#"[null,["c_bc8e3e4c1f26c53a","r_542aec39e18192e8"]]"#;
        let reply = r#"[null,null,null,null,[[null,["hello"]]]]"#;
        let c1 = format!(
            "[[[\"wrb.fr\",\"m1\",{}]]]",
            serde_json::to_string(meta).unwrap()
        );
        let c2 = format!(
            "[[[\"wrb.fr\",\"m2\",{}]]]",
            serde_json::to_string(reply).unwrap()
        );
        let body = format!(")]}}'\n\n{c1}\n\n{c2}\n");
        assert_eq!(
            extract_conversation_id(&body).as_deref(),
            Some("c_bc8e3e4c1f26c53a")
        );
    }

    #[test]
    fn extract_turn_ids_collects_all_three_across_frames() {
        // Metadata frame has cid + rid; reply frame has rcid. Combined
        // extraction must yield all three.
        let meta = r#"[null,["c_aaa","r_bbb"]]"#;
        let reply = r#"[null,null,null,null,[["rc_ccc",["hello"]]]]"#;
        let c1 = format!(
            "[[[\"wrb.fr\",\"m1\",{}]]]",
            serde_json::to_string(meta).unwrap()
        );
        let c2 = format!(
            "[[[\"wrb.fr\",\"m2\",{}]]]",
            serde_json::to_string(reply).unwrap()
        );
        let body = format!(")]}}'\n\n{c1}\n\n{c2}\n");
        let ids = extract_turn_ids(&body);
        assert_eq!(ids.conversation_id.as_deref(), Some("c_aaa"));
        assert_eq!(ids.response_id.as_deref(), Some("r_bbb"));
        assert_eq!(ids.choice_id.as_deref(), Some("rc_ccc"));
    }

    #[test]
    fn extract_turn_ids_refuses_mismatched_prefixes() {
        // A wrb.fr frame where inner[1][0] starts with "r_" (not "c_")
        // must not be accepted as a conversation id.
        let frame = r#"[null,["r_wrong","c_alsowrong"]]"#;
        let outer = format!(
            "[[[\"wrb.fr\",\"m\",{}]]]",
            serde_json::to_string(frame).unwrap()
        );
        let body = format!(")]}}'\n\n{outer}\n");
        let ids = extract_turn_ids(&body);
        assert!(ids.conversation_id.is_none());
        assert!(ids.response_id.is_none());
    }

    #[test]
    fn extract_conversation_id_ignores_c_hex_elsewhere() {
        // A `c_<hex>` string buried in a non-metadata position (e.g. a
        // request id in a deeper array) must NOT be picked — that's the
        // false-positive the raw-body regex would suffer from.
        let reply = r#"[null,null,null,null,[[null,["c_00000000decoy"]]]]"#;
        let outer = format!(
            "[[[\"wrb.fr\",\"m\",{}]]]",
            serde_json::to_string(reply).unwrap()
        );
        let body = format!(")]}}'\n\n{outer}\n");
        assert_eq!(extract_conversation_id(&body), None);
    }
}