tail-fin-common 0.7.7

//! Library-style helpers for endpoint discovery / probing scripts.
//!
//! Extracted from duplicated logic in:
//! - `crates/tail-fin-cli/examples/discover_nansen_api.rs`
//! - `crates/tail-fin-cli/examples/discover_sa_api.rs`
//! - PR #1's `crates/tail-fin-sa/tests/sa_explore_browser.rs` (PerimeterX trick)
//!
//! Each function is a pure utility — no enforced flow. Discoverer scripts
//! pick what they need and assemble themselves.
//!
//! Top-level helpers (`summarize_json`, `FixtureWriter`) have no extra
//! dependencies. Browser-mode helpers (`browser::delete_perimeterx_cookies`,
//! `browser::drain_captured_responses_as_map`) are gated on the existing
//! `browser` feature.

use serde_json::Value;
use std::collections::BTreeMap;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};

/// Condenses a JSON response body into a single log-friendly line.
///
/// Intended for probe loops, where only the rough shape of a payload
/// matters. For an object the summary lists up to eight top-level keys,
/// followed by the shape of `data` (array length or `{obj}`) and the length
/// of `included` when that is an array. Non-object roots are reported by
/// kind only.
///
/// Sample outputs:
/// - `keys=["data", "meta"] data:[5]` (JSON:API list response)
/// - `keys=["data", "included"] data:{obj} included:[3]` (JSON:API single)
/// - `array[12]` (raw array)
/// - `scalar` / `null`
pub fn summarize_json(body: &Value) -> String {
    // Everything that is not an object has a fixed, one-word summary.
    let fields = match body {
        Value::Null => return "null".to_owned(),
        Value::Bool(_) | Value::Number(_) | Value::String(_) => return "scalar".to_owned(),
        Value::Array(items) => return format!("array[{}]", items.len()),
        Value::Object(fields) => fields,
    };

    // Cap the key list so wide objects still fit on one log line.
    let shown_keys: Vec<&str> = fields.keys().take(8).map(String::as_str).collect();
    let mut summary = format!("keys={:?}", shown_keys);

    match fields.get("data") {
        Some(Value::Array(rows)) => summary.push_str(&format!(" data:[{}]", rows.len())),
        Some(Value::Object(_)) => summary.push_str(" data:{obj}"),
        _ => {}
    }

    if let Some(Value::Array(extras)) = fields.get("included") {
        summary.push_str(&format!(" included:[{}]", extras.len()));
    }

    summary
}

/// Writes captured response bodies to a directory with safe filenames
/// and a final `_index.json` manifest mapping each fixture back to its
/// source URL key, capturing page, row count, and size.
///
/// Use:
/// ```ignore
/// use tail_fin_common::explore::FixtureWriter;
///
/// let mut fw = FixtureWriter::new("crates/tail-fin-foo/tests/fixtures/discovered")?;
/// fw.write("/api/v3/symbols/AAPL/ratings", &body, "symbol_root")?;
/// fw.finalize()?; // writes _index.json
/// ```
#[derive(Debug)]
pub struct FixtureWriter {
    /// Directory that receives the fixture files and the manifest.
    dir: PathBuf,
    /// Manifest rows keyed on the fixture filename, so there is at most one
    /// row per file on disk. Writing the same key again replaces its row
    /// (mirroring `fs::write`, which overwrites the file), which avoids the
    /// "two index rows pointing at one file with different sizes" hazard a
    /// `Vec<IndexEntry>` would have. The `BTreeMap` also keeps the manifest
    /// sorted by filename.
    index: BTreeMap<String, IndexEntry>,
}

/// One manifest row describing a single fixture file written by
/// `FixtureWriter::write`; the rows are serialized into `_index.json`.
#[derive(Debug, serde::Serialize)]
struct IndexEntry {
    /// Original URL/path key exactly as passed to `write` (the `?query=...`
    /// part is preserved for traceability).
    key: String,
    /// Caller-supplied tag naming the page/trigger the response was captured
    /// under. When a key is written more than once, this is the tag of the
    /// most recent write.
    source_page: String,
    /// Filename within the fixture directory.
    fixture_file: String,
    /// Length of the top-level `data` array; 0 when `data` is absent or is
    /// not an array.
    data_array_len: usize,
    /// Size in bytes of the compact (non-pretty) serialization of the body.
    /// The file on disk is pretty-printed, so it is usually larger.
    body_size_bytes: usize,
}

impl FixtureWriter {
    /// Open (or create) the target directory.
    pub fn new(dir: impl AsRef<Path>) -> io::Result<Self> {
        let dir = dir.as_ref().to_path_buf();
        fs::create_dir_all(&dir)?;
        Ok(Self {
            dir,
            index: BTreeMap::new(),
        })
    }

    /// Write one fixture. Returns the absolute path written.
    ///
    /// The `key` is typically a URL path with optional `?query`; it gets
    /// slugified into a filesystem-safe filename. Different query strings
    /// on the same path produce different files (disambiguated via short
    /// hash of the query) so we don't overwrite e.g.
    /// `/estimates?period_type=quarterly` with `?period_type=annual`.
    ///
    /// **Duplicate keys:** if two `write` calls use the same key, the second
    /// overwrites the first on disk AND in the index. If distinct keys collide
    /// after slugification, the later key gets a deterministic hash suffix so
    /// both fixtures are preserved.
    pub fn write(&mut self, key: &str, body: &Value, source_page: &str) -> io::Result<PathBuf> {
        let (path_part, query_part) = key.split_once('?').unwrap_or((key, ""));
        let safe = slugify_path(path_part);
        let qhash = if query_part.is_empty() {
            String::new()
        } else {
            format!("__q{}", short_hash(query_part))
        };
        let base = format!("{safe}{qhash}");
        let mut fname = format!("{base}.json");
        if self.index.get(&fname).is_some_and(|entry| entry.key != key) {
            let key_hash = short_hash(key);
            fname = format!("{base}__k{key_hash}.json");
            let mut n = 2usize;
            while self.index.get(&fname).is_some_and(|entry| entry.key != key) {
                fname = format!("{base}__k{key_hash}_{n}.json");
                n += 1;
            }
        }
        let dst = self.dir.join(&fname);
        // serde_json::Value is always serializable — `to_string_pretty`
        // can only fail on numeric edge cases (NaN/Inf) that Value can't
        // even hold. Panic on the impossible rather than silently writing
        // an empty file.
        let pretty =
            serde_json::to_string_pretty(body).expect("serde_json::Value is always serializable");
        fs::write(&dst, &pretty)?;

        let data_array_len = body
            .get("data")
            .and_then(|d| d.as_array())
            .map(|a| a.len())
            .unwrap_or(0);
        let body_size_bytes = serde_json::to_string(body)
            .expect("serde_json::Value is always serializable")
            .len();

        // Last-write-wins on duplicate slugified filename — mirrors the
        // filesystem reality (we just overwrote whatever was on disk).
        self.index.insert(
            fname.clone(),
            IndexEntry {
                key: key.to_string(),
                source_page: source_page.to_string(),
                fixture_file: fname,
                data_array_len,
                body_size_bytes,
            },
        );
        Ok(dst)
    }

    /// Number of fixtures written so far (excluding `_index.json`).
    pub fn len(&self) -> usize {
        self.index.len()
    }

    pub fn is_empty(&self) -> bool {
        self.index.is_empty()
    }

    /// Write the `_index.json` manifest and return its path.
    /// Consumes the writer — no more writes after this.
    ///
    /// Manifest is a JSON array of entries sorted by fixture filename
    /// (deterministic — friendly to git diffs).
    pub fn finalize(self) -> io::Result<PathBuf> {
        let path = self.dir.join("_index.json");
        let entries: Vec<&IndexEntry> = self.index.values().collect();
        let json =
            serde_json::to_string_pretty(&entries).expect("IndexEntry is always serializable");
        fs::write(&path, json)?;
        Ok(path)
    }
}

/// Turns a URL path into a single filesystem-safe filename component.
///
/// Leading and trailing `/` and `\` are dropped first. In what remains,
/// path-like separators (`/`, `\`, `:`) become `-`, while the other
/// characters that are illegal on common filesystems
/// (`[`, `]`, `<`, `>`, `"`, `|`, `?`, `*`) and ASCII control characters
/// become `_`. An empty result is replaced by `_`, and a result that is a
/// Windows reserved device basename (`CON`, `PRN`, `COM1`, `LPT1`, ...) gets
/// a `_` prefix.
fn slugify_path(path: &str) -> String {
    let trimmed = path.trim_matches(['/', '\\']);
    if trimmed.is_empty() {
        // Nothing left to name the file after (e.g. the root path `/`).
        return "_".to_owned();
    }

    // Replacement is strictly one char for one char.
    let mut slug = String::with_capacity(trimmed.len() + 1);
    for ch in trimmed.chars() {
        let safe_ch = match ch {
            '/' | '\\' | ':' => '-',
            '[' | ']' | '<' | '>' | '"' | '|' | '?' | '*' => '_',
            _ if ch.is_ascii_control() => '_',
            _ => ch,
        };
        slug.push(safe_ch);
    }

    if is_windows_reserved_basename(&slug) {
        format!("_{slug}")
    } else {
        slug
    }
}

/// Reports whether `name` would be treated as a reserved device name on
/// Windows.
///
/// Only the part before the first `.` is considered, with trailing spaces
/// ignored and ASCII case folded, so `con`, `CON.txt` and `nul .json` are all
/// reserved. Recognized names are `CON`, `PRN`, `AUX`, `NUL`, `CONIN$`,
/// `CONOUT$`, and `COM`/`LPT` followed by exactly one ASCII digit or one of
/// the superscript digits `¹`, `²`, `³`.
///
/// The port suffix is matched as a single character rather than parsed as a
/// number: integer parsing also accepts forms such as `+1` or `01`, which
/// are ordinary filenames.
fn is_windows_reserved_basename(name: &str) -> bool {
    let stem = name
        .split('.')
        .next()
        .unwrap_or(name)
        .trim_end_matches(' ')
        .to_ascii_uppercase();

    if matches!(
        stem.as_str(),
        "CON" | "PRN" | "AUX" | "NUL" | "CONIN$" | "CONOUT$"
    ) {
        return true;
    }

    let Some(port) = stem
        .strip_prefix("COM")
        .or_else(|| stem.strip_prefix("LPT"))
    else {
        return false;
    };

    // Exactly one port character: an ASCII digit or superscript 1, 2 or 3.
    let mut port_chars = port.chars();
    matches!(
        (port_chars.next(), port_chars.next()),
        (
            Some('0'..='9' | '\u{00B9}' | '\u{00B2}' | '\u{00B3}'),
            None
        )
    )
}

/// Eight-hex-digit digest used to tell fixture filenames apart.
///
/// Computes 64-bit FNV-1a over the UTF-8 bytes of `s` and keeps the low
/// 32 bits. FNV-1a is used instead of Rust's `DefaultHasher` because the
/// digest ends up in checked-in fixture filenames, so it must stay the same
/// across Rust toolchain upgrades.
fn short_hash(s: &str) -> String {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;
    const LOW_32_BITS: u64 = 0xFFFF_FFFF;

    // FNV-1a: xor the byte in first, then multiply by the prime.
    let digest = s.bytes().fold(OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(PRIME)
    });

    format!("{:08x}", digest & LOW_32_BITS)
}

#[cfg(feature = "browser")]
pub mod browser {
    //! Helpers for discovery scripts that drive a live browser session.

    use crate::TailFinError;
    use night_fury_core::BrowserSession;
    use serde_json::Value;
    use std::collections::BTreeMap;

    /// Names of the PerimeterX cookies. They are tied to the TLS fingerprint
    /// that issued them, so replaying them from another browser installation
    /// triggers a captcha.
    pub const PX_COOKIE_NAMES: &[&str] = &["_px3", "_pxvid", "pxcts", "_pxde"];

    /// Removes the PerimeterX cookies for `site_domain` through
    /// `BrowserSession::delete_cookie` (CDP `Network.deleteCookies`).
    ///
    /// Every name in [`PX_COOKIE_NAMES`] is deleted under two domain forms,
    /// `.{site_domain}` and `{site_domain}`, i.e. up to 8 attempts. The
    /// return value counts only the attempts the CDP layer accepted; failed
    /// attempts are ignored and never turn into an error.
    ///
    /// **When to call:** AFTER navigating to a page that loaded successfully
    /// with stale PX cookies present (so PX could set up whatever session
    /// state it needs), and BEFORE issuing in-page `fetch()` calls to data
    /// APIs. On a freshly launched browser without PX history it will not
    /// unblock anything — PX captchas the first request anyway.
    ///
    /// **Why this works:** found during the PR #1 SA exploration
    /// (2026-04-28). After one successful page navigation PX tolerates
    /// "no PX cookies" better than "stale, fingerprint-mismatched PX
    /// cookies", so dropping the stale tokens lets later XHRs through.
    ///
    /// **Why CDP and not `document.cookie`:** PX sometimes marks `_px3` as
    /// `HttpOnly`, which hides it from JS, so `document.cookie` silently
    /// leaves it in the jar. CDP `Network.deleteCookies` reaches HttpOnly
    /// cookies and inspects each cookie's actual domain attribute (so there
    /// is no need to fan out across path combinations).
    pub async fn delete_perimeterx_cookies(
        session: &BrowserSession,
        site_domain: &str,
    ) -> Result<usize, TailFinError> {
        let domain_forms = [format!(".{site_domain}"), site_domain.to_string()];
        let mut accepted = 0usize;
        for cookie_name in PX_COOKIE_NAMES {
            for domain in &domain_forms {
                // Best effort: a rejected delete simply does not count.
                let outcome = session.delete_cookie(cookie_name, domain).await;
                accepted += usize::from(outcome.is_ok());
            }
        }
        Ok(accepted)
    }

    /// Collects the responses currently buffered by the session (registered
    /// via `BrowserSession::capture_responses(...)`) into a de-duplicated map.
    ///
    /// `key_fn(url, body)` yields the dedup key, or `None` to drop the
    /// response (wrong URL pattern, irrelevant body shape, ...). `seen` is
    /// filled with first-write-wins semantics: a key that is already present
    /// keeps the source page and body it was first captured under, so traces
    /// read as "this page first triggered this XHR".
    ///
    /// Returns how many entries were newly inserted, so the caller can log
    /// "+ 4 new" per page.
    ///
    /// Bodies that do not parse as JSON (binary, HTML 404, captcha pages) are
    /// skipped silently.
    ///
    /// # Errors
    /// Returns `TailFinError::Api` when the session fails to hand over its
    /// captured responses.
    pub async fn drain_captured_responses_as_map<K, F>(
        session: &BrowserSession,
        source_page: &str,
        seen: &mut BTreeMap<K, (String, Value)>,
        key_fn: F,
    ) -> Result<usize, TailFinError>
    where
        K: Ord,
        F: Fn(&str, &Value) -> Option<K>,
    {
        use std::collections::btree_map::Entry;

        let responses = session
            .get_captured_responses()
            .await
            .map_err(|e| TailFinError::Api(format!("get_captured_responses: {e}")))?;

        let mut inserted = 0;
        for response in responses {
            // Not JSON: nothing to index.
            let Ok(body) = serde_json::from_str::<Value>(&response.body) else {
                continue;
            };
            // The caller decides whether the response is relevant at all.
            let Some(key) = key_fn(&response.url, &body) else {
                continue;
            };
            // Only a key that has not been seen yet is recorded.
            if let Entry::Vacant(slot) = seen.entry(key) {
                slot.insert((source_page.to_string(), body));
                inserted += 1;
            }
        }
        Ok(inserted)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Characters that must never show up in a slug or fixture filename.
    const FORBIDDEN_CHARS: [char; 12] = [
        '/', '\\', ':', '[', ']', '<', '>', '"', '|', '?', '*', '\u{0007}',
    ];

    // --- shared helpers ---

    /// A writer rooted in a fresh temp dir. The `TempDir` guard is returned
    /// too, because dropping it removes the directory.
    fn temp_writer() -> (tempfile::TempDir, FixtureWriter) {
        let tmp = tempfile::tempdir().unwrap();
        let fw = FixtureWriter::new(tmp.path()).unwrap();
        (tmp, fw)
    }

    /// Final path component as an owned string.
    fn name_of(path: &std::path::Path) -> String {
        path.file_name().unwrap().to_str().unwrap().to_owned()
    }

    /// Finalizes the writer and parses the manifest it produced.
    fn finalize_and_read(fw: FixtureWriter) -> serde_json::Value {
        let idx_path = fw.finalize().unwrap();
        let idx_raw = std::fs::read_to_string(&idx_path).unwrap();
        serde_json::from_str(&idx_raw).unwrap()
    }

    /// Manifest row whose `key` equals `key`; panics when there is none.
    fn row_for<'a>(rows: &'a [serde_json::Value], key: &str) -> &'a serde_json::Value {
        rows.iter()
            .find(|row| row["key"].as_str() == Some(key))
            .unwrap()
    }

    /// Asserts that `text` is free of every forbidden character; `what`
    /// labels the failure message.
    fn assert_no_forbidden_chars(what: &str, text: &str) {
        for forbidden in FORBIDDEN_CHARS {
            assert!(
                !text.contains(forbidden),
                "{what} contains forbidden char {forbidden:?}: {text}"
            );
        }
    }

    // --- summarize_json ---

    #[test]
    fn summarize_json_object_with_data_array() {
        let summary = summarize_json(&json!({"data": [1, 2, 3], "meta": {}}));
        for expected in ["\"data\"", "\"meta\"", "data:[3]"] {
            assert!(summary.contains(expected), "got {summary}");
        }
    }

    #[test]
    fn summarize_json_object_with_included() {
        let body = json!({
            "data": [{"id": 1}],
            "included": [{"a": 1}, {"b": 2}, {"c": 3}],
        });
        let summary = summarize_json(&body);
        for expected in ["data:[1]", "included:[3]"] {
            assert!(summary.contains(expected), "got {summary}");
        }
    }

    #[test]
    fn summarize_json_object_with_data_object_not_array() {
        let summary = summarize_json(&json!({"data": {"id": "146"}, "meta": {}}));
        assert!(summary.contains("data:{obj}"), "got {summary}");
    }

    #[test]
    fn summarize_json_array_root() {
        assert_eq!(summarize_json(&json!([1, 2, 3, 4, 5])), "array[5]");
    }

    #[test]
    fn summarize_json_scalars_and_null() {
        for scalar in [json!(42), json!("hi"), json!(true)] {
            assert_eq!(summarize_json(&scalar), "scalar");
        }
        assert_eq!(summarize_json(&json!(null)), "null");
    }

    #[test]
    fn summarize_json_keys_capped_at_eight() {
        let ten_keys = json!({
            "a": 1, "b": 2, "c": 3, "d": 4,
            "e": 5, "f": 6, "g": 7, "h": 8,
            "i": 9, "j": 10,
        });
        let summary = summarize_json(&ten_keys);
        // Only eight keys are listed: `a` is among them, `j` is cut off.
        assert!(summary.contains("\"a\""), "got {summary}");
        assert!(!summary.contains("\"j\""), "got {summary}");
    }

    // --- FixtureWriter ---

    #[test]
    fn fixture_writer_writes_files_and_index() {
        let (_tmp, mut fw) = temp_writer();

        let ratings_path = fw
            .write(
                "/api/v3/symbols/AAPL/ratings",
                &json!({"data": [{"id": 1}]}),
                "symbol_root",
            )
            .unwrap();
        let dividends_path = fw
            .write(
                "/api/v3/symbols/AAPL/dividend_history",
                &json!({"data": [{"a": 1}, {"b": 2}]}),
                "symbol_root",
            )
            .unwrap();

        assert!(ratings_path.exists());
        assert!(dividends_path.exists());
        assert_eq!(fw.len(), 2);

        let manifest = finalize_and_read(fw);
        let rows = manifest.as_array().unwrap();
        assert_eq!(rows.len(), 2);
        // Rows are looked up by key, so the check does not depend on order.
        assert_eq!(
            row_for(rows, "/api/v3/symbols/AAPL/ratings")["data_array_len"],
            1
        );
        assert_eq!(
            row_for(rows, "/api/v3/symbols/AAPL/dividend_history")["data_array_len"],
            2
        );
        assert_eq!(rows[0]["source_page"], "symbol_root");
    }

    #[test]
    fn fixture_writer_dedupes_index_on_duplicate_filename() {
        // The same key written twice lands in the same file: the second write
        // must replace the first both on disk AND in the manifest, so there
        // are never two rows for one file with conflicting body_size_bytes.
        let (_tmp, mut fw) = temp_writer();
        let writes = [
            (json!({"first": true}), "src1"),
            (json!({"second": true}), "src2"),
        ];
        for (body, source_page) in &writes {
            fw.write("/api/v3/symbols/AAPL/ratings", body, source_page)
                .unwrap();
        }

        let manifest = finalize_and_read(fw);
        assert_eq!(
            manifest.as_array().unwrap().len(),
            1,
            "dedup'd to single row"
        );
        assert_eq!(
            manifest[0]["source_page"], "src2",
            "last write wins (matches the body now on disk)"
        );
    }

    #[test]
    fn slugify_replaces_colons() {
        // Regression: bare keys containing `:` (e.g. Nansen's `typesense:foo`
        // routing prefix) used to produce filenames with a literal `:`, which
        // is not portable.
        let (_tmp, mut fw) = temp_writer();
        let written = fw.write("typesense:portfolio", &json!({}), "p").unwrap();
        let fname = name_of(&written);
        assert!(!fname.contains(':'), "no colons in filename: {fname}");
    }

    #[test]
    fn fixture_writer_disambiguates_query_strings() {
        let (_tmp, mut fw) = temp_writer();
        let quarterly = fw
            .write(
                "/api/v3/symbols/AAPL/estimates?period_type=quarterly",
                &json!({"x": 1}),
                "p",
            )
            .unwrap();
        let annual = fw
            .write(
                "/api/v3/symbols/AAPL/estimates?period_type=annual",
                &json!({"x": 2}),
                "p",
            )
            .unwrap();
        assert_ne!(
            quarterly.file_name(),
            annual.file_name(),
            "different queries must yield different files"
        );
    }

    #[test]
    fn fixture_writer_disambiguates_distinct_keys_with_same_slug() {
        let (_tmp, mut fw) = temp_writer();
        let with_colon = fw
            .write("/api/foo:bar", &json!({"colon": true}), "p1")
            .unwrap();
        let with_slash = fw
            .write("/api/foo/bar", &json!({"slash": true}), "p2")
            .unwrap();

        assert_ne!(
            with_colon.file_name(),
            with_slash.file_name(),
            "distinct keys that slugify identically must not overwrite each other"
        );

        let manifest = finalize_and_read(fw);
        assert_eq!(manifest.as_array().unwrap().len(), 2);
    }

    #[test]
    fn fixture_writer_disambiguates_fallback_filename_collision() {
        // Synthetic setup: the second key is shaped exactly like the
        // `__k<hash>` fallback name the third key would normally receive, so
        // the third write is pushed onto the `_2` counter path. Genuine
        // collisions of the truncated 32-bit FNV-1a hash are rare; this is
        // the cheapest deterministic way to reach the loop body.
        let (_tmp, mut fw) = temp_writer();

        let colliding_key = "/api/foo/bar";
        let hash = short_hash(colliding_key);
        let squatter_key = format!("/api/foo-bar__k{hash}");

        fw.write("/api/foo:bar", &json!({"base": true}), "p1")
            .unwrap();
        fw.write(&squatter_key, &json!({"natural_suffix": true}), "p2")
            .unwrap();
        let fallback = fw
            .write(colliding_key, &json!({"fallback": true}), "p3")
            .unwrap();

        assert_eq!(name_of(&fallback), format!("api-foo-bar__k{hash}_2.json"));

        let manifest = finalize_and_read(fw);
        assert_eq!(manifest.as_array().unwrap().len(), 3);
    }

    #[test]
    fn short_hash_is_stable() {
        assert_eq!(short_hash("period_type=quarterly"), "265d2857");
    }

    #[test]
    fn slugify_replaces_all_forbidden_filename_chars() {
        let slug = slugify_path("/a\\b:c[d]e<f>g\"h|i?j*k\u{0007}/");
        assert_no_forbidden_chars("slug", &slug);
    }

    #[test]
    fn fixture_writer_filename_is_filesystem_safe() {
        let (_tmp, mut fw) = temp_writer();
        let written = fw
            .write(
                "/api/v3/symbols/AAPL/sec:filings\\bad<name>|x?y*\u{0007}?include=form_type&page[size]=10",
                &json!({}),
                "p",
            )
            .unwrap();
        assert_no_forbidden_chars("filename", &name_of(&written));
    }

    #[test]
    fn fixture_writer_handles_root_and_windows_reserved_names() {
        let (_tmp, mut fw) = temp_writer();

        // (key, source page, expected filename). Beyond the root path this
        // covers the literal device names and both the COM* and LPT* branches.
        let cases = [
            ("/", "root", "_.json"),
            ("/CON", "reserved", "_CON.json"),
            ("/com1", "reserved", "_com1.json"),
            ("/PRN", "reserved", "_PRN.json"),
            ("/lpt3", "reserved", "_lpt3.json"),
        ];
        for (key, source_page, expected) in cases {
            let written = fw.write(key, &json!({}), source_page).unwrap();
            assert_eq!(name_of(&written), expected);
        }
    }

    #[test]
    fn fixture_writer_create_dir_is_idempotent() {
        let tmp = tempfile::tempdir().unwrap();
        let nested = tmp.path().join("a/b/c/discovered");

        let mut first = FixtureWriter::new(&nested).unwrap();
        first.write("/api/x", &json!({"data": []}), "src").unwrap();

        // Opening the same, now existing, directory again must not fail.
        let mut second = FixtureWriter::new(&nested).unwrap();
        second.write("/api/y", &json!({"data": []}), "src").unwrap();
        second.finalize().unwrap();
    }
}