Skip to main content

disney_loop/
humanize.rs

1//! # Humanization Loop Stages
2//!
3//! Implements the Disney Loop for text humanization:
4//! ρ(t) → ∂(¬σ⁻¹) → ∃(ν) → ρ(t+1)
5//!
6//! 1. ρ(t): Current state (robotic text)
7//! 2. ∂(¬σ⁻¹): Anti-regression gate — threshold filter on `prob_generated`
8//! 3. ∃(ν): Phrasing discovery — group-by-id aggregation (STUB: LLM integration pending)
9//! 4. ρ(t+1): New state (humanized text)
10
11use crate::Result;
12use nexcore_dataframe::{Agg, DataFrame};
13
14/// Stage 2: ∂(¬σ⁻¹) — Humanization Gate
15///
16/// Filters rows where `prob_generated` exceeds `threshold`, retaining
17/// only text that reads as sufficiently human. `threshold` is clamped
18/// to `[0.0, 1.0]`.
19///
20/// When a real antitransformer scorer is integrated, it will feed the
21/// `prob_generated` column that this gate consumes.
22pub fn transform_humanization_gate(df: DataFrame, threshold: f64) -> Result<DataFrame> {
23    let threshold = threshold.clamp(0.0, 1.0);
24    tracing::info!(
25        stage = "humanization-gate",
26        threshold = threshold,
27        "Applying anti-regression filter for AI-generated text"
28    );
29
30    // Filter: keep if probability_generated < threshold
31    let filtered = df.filter_by("prob_generated", |v| {
32        v.as_f64().is_some_and(|p| p < threshold)
33    })?;
34
35    Ok(filtered)
36}
37
38/// Stage 3: ∃(ν) — Phrasing Discovery (STUB)
39///
40/// Groups rows by `id`, keeping the first `text` value and the minimum
41/// `prob_generated` score per id. Output columns: `id`, `text_first`,
42/// `prob_generated_min`.
43///
44/// STUB: when nexcore-transform integration lands, this stage will use
45/// concept identification and phrasing suggestion rather than a bare
46/// first-value aggregation.
47pub fn transform_phrasing_discovery(df: DataFrame) -> Result<DataFrame> {
48    tracing::info!(
49        stage = "phrasing-discovery",
50        "Searching for natural phrasing alternatives"
51    );
52
53    // Aggregate by id — first text, min prob_generated
54    let aggregated = df
55        .group_by(&["id"])?
56        .agg(&[Agg::First("text".into()), Agg::Min("prob_generated".into())])?;
57
58    Ok(aggregated)
59}
60
/// Run a single humanization pass on text.
///
/// Placeholder for the ∃(ν) phase: a real implementation would ask an LLM
/// to rephrase `text`. For now the input is returned unchanged behind a
/// fixed `"Refactored: "` prefix.
#[allow(dead_code, reason = "placeholder for LLM integration")]
pub fn humanize_text(text: &str) -> String {
    const PREFIX: &str = "Refactored: ";

    // Size the buffer once so the two appends never reallocate.
    let mut rewritten = String::with_capacity(PREFIX.len() + text.len());
    rewritten.push_str(PREFIX);
    rewritten.push_str(text);
    rewritten
}
68
#[cfg(test)]
mod tests {
    use super::*;
    use nexcore_dataframe::Column;

    /// Builds the shared fixture: four rows, two per id (`a`, `b`), with
    /// `prob_generated` scores 0.3, 0.8, 0.1 and 0.9 in row order.
    fn humanize_frame() -> crate::Result<DataFrame> {
        Ok(DataFrame::new(vec![
            Column::from_strs("id", &["a", "a", "b", "b"]),
            Column::from_strs(
                "text",
                &["hello world", "greetings earth", "foo bar", "baz qux"],
            ),
            Column::from_f64s("prob_generated", vec![0.3, 0.8, 0.1, 0.9]),
        ])?)
    }

    /// The gate keeps only rows scoring strictly below the threshold.
    #[test]
    fn humanization_gate_filters_above_threshold() -> crate::Result<()> {
        let df = transform_humanization_gate(humanize_frame()?, 0.5)?;
        // prob_generated: 0.3, 0.8, 0.1, 0.9 — only 0.3 and 0.1 < 0.5
        assert_eq!(df.height(), 2);
        Ok(())
    }

    /// Out-of-range thresholds are clamped to `[0.0, 1.0]` before filtering.
    #[test]
    fn humanization_gate_clamps_threshold() -> crate::Result<()> {
        // threshold > 1.0 gets clamped to 1.0 — all pass
        let df = transform_humanization_gate(humanize_frame()?, 2.0)?;
        assert_eq!(df.height(), 4);

        // threshold < 0.0 gets clamped to 0.0 — none pass
        let df = transform_humanization_gate(humanize_frame()?, -1.0)?;
        assert_eq!(df.height(), 0);
        Ok(())
    }

    /// Phrasing discovery yields one row per id with the minimum score.
    #[test]
    fn phrasing_discovery_aggregates_by_id() -> crate::Result<()> {
        use nexcore_dataframe::Scalar;
        let df = transform_phrasing_discovery(humanize_frame()?)?;
        // 2 unique ids: "a" and "b"
        assert_eq!(df.height(), 2);

        let ids = df.column("id")?;
        let mins = df.column("prob_generated_min")?;
        // text_first must exist
        let _text = df.column("text_first")?;

        let mut found_a = false;
        let mut found_b = false;
        for i in 0..df.height() {
            match ids.get(i).as_ref().map(|s| s.to_string()).as_deref() {
                Some("a") => {
                    // A missing or non-float scalar must fail the test rather
                    // than silently skip the numeric assertion.
                    let Some(Scalar::Float64(v)) = mins.get(i) else {
                        panic!("id=a prob_generated_min must be a Float64 scalar");
                    };
                    // min(0.3, 0.8) = 0.3
                    assert!(
                        (v - 0.3_f64).abs() < 1e-9,
                        "id=a min prob_generated must be 0.3, got {v}"
                    );
                    found_a = true;
                }
                Some("b") => {
                    let Some(Scalar::Float64(v)) = mins.get(i) else {
                        panic!("id=b prob_generated_min must be a Float64 scalar");
                    };
                    // min(0.1, 0.9) = 0.1
                    assert!(
                        (v - 0.1_f64).abs() < 1e-9,
                        "id=b min prob_generated must be 0.1, got {v}"
                    );
                    found_b = true;
                }
                _ => {}
            }
        }
        assert!(found_a, "id=a must appear in result");
        assert!(found_b, "id=b must appear in result");
        Ok(())
    }

    #[test]
    fn humanize_text_stub_is_non_empty_for_non_empty_input() {
        // humanize_text() is a placeholder for LLM integration.
        // This test documents the STUB CONTRACT only: any non-empty input
        // produces a non-empty, non-identical output string.
        // When real LLM integration lands, this test must be replaced with
        // assertions against actual humanization quality metrics.
        let output = humanize_text("test input");
        assert!(!output.is_empty(), "stub must return non-empty string");
        assert_ne!(
            output, "test input",
            "stub must transform input, not echo it"
        );
    }
}