//! Due Diligence suggestor implementations.
//!
//! Generic, reusable suggestors for the DD convergence loop. Apps inject
//! their search and LLM backends via the [`DdSearch`] and [`DdLlm`] traits.
//! Organism owns the prompts, parsing, and convergence patterns.
//!
//! # Layer responsibilities
//!
//! | Layer | Owns |
//! |-------|------|
//! | **Organism** | DD suggestors, prompts, fact parsing, convergence patterns |
//! | **App** | `DdSearch` + `DdLlm` implementations |
//! | **Converge** | Engine, context, axioms, promotion gates |

use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::sync::{Arc, Mutex};

use converge_pack::{AgentEffect, Context, ContextKey, Fact, FactId, ProposedFact, Suggestor};
use serde::{Deserialize, Serialize};
use uuid::Uuid;

use crate::suggestor::SharedBudget;

26// ── Error types ───────────────────────────────────────────────────
27
28/// Typed errors from DD backends. Apps classify raw provider errors
29/// into these variants. Organism uses the variant to decide whether
30/// to retry, abort, or record a constraint.
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub enum DdError {
33    /// Account/billing problem — stop the entire run.
34    CreditsExhausted { provider: String, detail: String },
35    /// Throttled — could retry after backoff, but suggestor won't.
36    RateLimited {
37        provider: String,
38        retry_after_ms: Option<u64>,
39    },
40    /// Provider is down or unreachable.
41    ProviderUnavailable { provider: String, detail: String },
42    /// Provider returned something we couldn't use.
43    BadResponse { provider: String, detail: String },
44    /// The input was too large for the provider.
45    PromptTooLarge {
46        provider: String,
47        tokens: Option<usize>,
48    },
49    /// JSON parsing failed on provider output.
50    ParseFailed { provider: String, detail: String },
51}
52
53impl fmt::Display for DdError {
54    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
55        match self {
56            Self::CreditsExhausted { provider, detail } => {
57                write!(f, "[{provider}] credits exhausted: {detail}")
58            }
59            Self::RateLimited { provider, .. } => write!(f, "[{provider}] rate limited"),
60            Self::ProviderUnavailable { provider, detail } => {
61                write!(f, "[{provider}] unavailable: {detail}")
62            }
63            Self::BadResponse { provider, detail } => {
64                write!(f, "[{provider}] bad response: {detail}")
65            }
66            Self::PromptTooLarge { provider, tokens } => write!(
67                f,
68                "[{provider}] prompt too large ({})",
69                tokens.map_or("unknown".into(), |t| format!("{t} tokens"))
70            ),
71            Self::ParseFailed { provider, detail } => {
72                write!(f, "[{provider}] parse failed: {detail}")
73            }
74        }
75    }
76}
77
78impl DdError {
79    /// Is this an infrastructure failure that should NOT calibrate learning priors?
80    pub fn is_infra_failure(&self) -> bool {
81        matches!(
82            self,
83            Self::CreditsExhausted { .. }
84                | Self::RateLimited { .. }
85                | Self::ProviderUnavailable { .. }
86        )
87    }
88
89    /// Should the entire DD run abort on this error?
90    pub fn is_fatal(&self) -> bool {
91        matches!(self, Self::CreditsExhausted { .. })
92    }
93
94    /// The constraint fact ID for this error kind.
95    fn constraint_id(&self, suggestor: &str) -> String {
96        let kind = match self {
97            Self::CreditsExhausted { .. } => "credits-exhausted",
98            Self::RateLimited { .. } => "rate-limited",
99            Self::ProviderUnavailable { .. } => "provider-unavailable",
100            Self::BadResponse { .. } => "bad-response",
101            Self::PromptTooLarge { .. } => "prompt-too-large",
102            Self::ParseFailed { .. } => "parse-failed",
103        };
104        format!("dd:constraint:{suggestor}:{kind}")
105    }
106}
107
108/// Build a constraint fact from a DD error so it's visible in context.
109fn error_to_constraint(error: &DdError, suggestor: &str) -> ProposedFact {
110    let id = error.constraint_id(suggestor);
111    let content = serde_json::json!({
112        "type": "error",
113        "error": serde_json::to_value(error).unwrap_or_default(),
114        "is_infra_failure": error.is_infra_failure(),
115        "is_fatal": error.is_fatal(),
116        "message": error.to_string(),
117    })
118    .to_string();
119    ProposedFact::new(ContextKey::Constraints, id, content, suggestor).with_confidence(1.0)
120}
121
122// ── Backend traits ───────────────────────────────────────────────
123
124/// Async search backend. Apps implement this by wrapping their
125/// search providers (Brave, Tavily, etc.).
126///
127/// Apps are responsible for classifying raw HTTP/provider errors into
128/// [`DdError`] variants so organism can make informed decisions.
129#[async_trait::async_trait]
130pub trait DdSearch: Send + Sync {
131    async fn search(&self, query: &str) -> Result<Vec<SearchHit>, DdError>;
132}
133
134/// Async LLM backend. Apps implement this by wrapping their
135/// LLM providers (Anthropic, OpenAI, etc.).
136///
137/// Apps are responsible for classifying raw HTTP/provider errors into
138/// [`DdError`] variants so organism can make informed decisions.
139#[async_trait::async_trait]
140pub trait DdLlm: Send + Sync {
141    async fn complete(&self, prompt: &str) -> Result<String, DdError>;
142}
143
144/// A search hit returned by a [`DdSearch`] implementation.
145#[derive(Debug, Clone)]
146pub struct SearchHit {
147    pub title: String,
148    pub url: String,
149    pub content: String,
150    pub provider: String,
151}
152
153// ── Failover wrappers ─────────────────────────────────────────────
154
155/// Tries LLM backends in order. On retryable errors (credits exhausted,
156/// rate limited, provider unavailable), moves to the next backend.
157/// On non-retryable errors (parse failed, bad response), returns
158/// immediately — a different provider won't fix bad output.
159pub struct FailoverDdLlm {
160    backends: Vec<Arc<dyn DdLlm>>,
161}
162
163impl FailoverDdLlm {
164    pub fn new(backends: Vec<Arc<dyn DdLlm>>) -> Self {
165        Self { backends }
166    }
167}
168
169#[async_trait::async_trait]
170impl DdLlm for FailoverDdLlm {
171    async fn complete(&self, prompt: &str) -> Result<String, DdError> {
172        let mut last_error = None;
173        for backend in &self.backends {
174            match backend.complete(prompt).await {
175                Ok(result) => return Ok(result),
176                Err(e) => {
177                    let should_failover = e.is_infra_failure();
178                    eprintln!(
179                        "[failover] {} — {}",
180                        e,
181                        if should_failover {
182                            "trying next"
183                        } else {
184                            "not retryable"
185                        }
186                    );
187                    if !should_failover {
188                        return Err(e);
189                    }
190                    last_error = Some(e);
191                }
192            }
193        }
194        Err(last_error.unwrap_or_else(|| DdError::ProviderUnavailable {
195            provider: "failover".into(),
196            detail: "no backends configured".into(),
197        }))
198    }
199}
200
201/// Tries search backends in order with the same failover logic.
202pub struct FailoverDdSearch {
203    backends: Vec<Arc<dyn DdSearch>>,
204}
205
206impl FailoverDdSearch {
207    pub fn new(backends: Vec<Arc<dyn DdSearch>>) -> Self {
208        Self { backends }
209    }
210}
211
212#[async_trait::async_trait]
213impl DdSearch for FailoverDdSearch {
214    async fn search(&self, query: &str) -> Result<Vec<SearchHit>, DdError> {
215        let mut last_error = None;
216        for backend in &self.backends {
217            match backend.search(query).await {
218                Ok(result) => return Ok(result),
219                Err(e) => {
220                    let should_failover = e.is_infra_failure();
221                    eprintln!(
222                        "[failover] {} — {}",
223                        e,
224                        if should_failover {
225                            "trying next"
226                        } else {
227                            "not retryable"
228                        }
229                    );
230                    if !should_failover {
231                        return Err(e);
232                    }
233                    last_error = Some(e);
234                }
235            }
236        }
237        Err(last_error.unwrap_or_else(|| DdError::ProviderUnavailable {
238            provider: "failover".into(),
239            detail: "no backends configured".into(),
240        }))
241    }
242}
243
244// ── Breadth Research ──────────────────────────────────────────────
245
246/// Reacts to strategies tagged with a breadth marker.
247/// Searches wide and emits signal facts.
248pub struct BreadthResearchSuggestor {
249    subject: String,
250    budget: Arc<SharedBudget>,
251    search: Arc<dyn DdSearch>,
252    tag: String,
253    processed: Mutex<HashSet<FactId>>,
254}
255
256impl BreadthResearchSuggestor {
257    pub fn new(
258        subject: impl Into<String>,
259        budget: Arc<SharedBudget>,
260        search: Arc<dyn DdSearch>,
261    ) -> Self {
262        Self {
263            subject: subject.into(),
264            budget,
265            search,
266            tag: "breadth".into(),
267            processed: Mutex::new(HashSet::new()),
268        }
269    }
270
271    pub fn with_tag(mut self, tag: impl Into<String>) -> Self {
272        self.tag = tag.into();
273        self
274    }
275
276    fn unprocessed_strategies(&self, ctx: &dyn Context) -> Vec<String> {
277        let processed = self.processed.lock().unwrap();
278        ctx.get(ContextKey::Strategies)
279            .iter()
280            .filter(|f| f.content.contains(&self.tag))
281            .filter(|f| !processed.contains(&f.id))
282            .map(|f| f.content.clone())
283            .collect()
284    }
285}
286
287#[async_trait::async_trait]
288impl Suggestor for BreadthResearchSuggestor {
289    fn name(&self) -> &'static str {
290        "dd-breadth-research"
291    }
292
293    fn dependencies(&self) -> &[ContextKey] {
294        &[ContextKey::Strategies]
295    }
296
297    fn accepts(&self, ctx: &dyn Context) -> bool {
298        self.budget.remaining("searches") > 0 && !self.unprocessed_strategies(ctx).is_empty()
299    }
300
301    async fn execute(&self, ctx: &dyn Context) -> AgentEffect {
302        let strategies = self.unprocessed_strategies(ctx);
303        let mut proposals = Vec::new();
304
305        for strategy in strategies {
306            if !self.budget.try_use("searches") {
307                break;
308            }
309
310            let query = format!("{} {strategy}", self.subject);
311            match self.search.search(&query).await {
312                Ok(hits) => {
313                    for hit in &hits {
314                        if !is_relevant(&hit.title, &hit.content, &hit.url, &self.subject) {
315                            continue;
316                        }
317                        let id = format!("signal-breadth-{}", Uuid::new_v4());
318                        let content = serde_json::json!({
319                            "title": hit.title,
320                            "url": hit.url,
321                            "content": hit.content,
322                            "provider": hit.provider,
323                            "query": query,
324                        })
325                        .to_string();
326                        proposals.push(
327                            ProposedFact::new(
328                                ContextKey::Signals,
329                                id,
330                                content,
331                                "dd-breadth-research",
332                            )
333                            .with_confidence(1.0),
334                        );
335                    }
336                }
337                Err(e) => {
338                    proposals.push(error_to_constraint(&e, "dd-breadth-research"));
339                    if e.is_fatal() {
340                        break;
341                    }
342                }
343            }
344
345            self.processed.lock().unwrap().insert(
346                ctx.get(ContextKey::Strategies)
347                    .iter()
348                    .find(|f| f.content == strategy)
349                    .map_or_else(|| FactId::new(""), |f| f.id.clone()),
350            );
351        }
352
353        AgentEffect::with_proposals(proposals)
354    }
355}
356
357// ── Depth Research ────────────────────────────────────────────────
358
359/// Reacts to strategies tagged with a depth marker.
360/// Searches deep and emits signal facts.
361pub struct DepthResearchSuggestor {
362    subject: String,
363    budget: Arc<SharedBudget>,
364    search: Arc<dyn DdSearch>,
365    tag: String,
366    processed: Mutex<HashSet<FactId>>,
367}
368
369impl DepthResearchSuggestor {
370    pub fn new(
371        subject: impl Into<String>,
372        budget: Arc<SharedBudget>,
373        search: Arc<dyn DdSearch>,
374    ) -> Self {
375        Self {
376            subject: subject.into(),
377            budget,
378            search,
379            tag: "depth".into(),
380            processed: Mutex::new(HashSet::new()),
381        }
382    }
383
384    pub fn with_tag(mut self, tag: impl Into<String>) -> Self {
385        self.tag = tag.into();
386        self
387    }
388
389    fn unprocessed_strategies(&self, ctx: &dyn Context) -> Vec<String> {
390        let processed = self.processed.lock().unwrap();
391        ctx.get(ContextKey::Strategies)
392            .iter()
393            .filter(|f| f.content.contains(&self.tag))
394            .filter(|f| !processed.contains(&f.id))
395            .map(|f| f.content.clone())
396            .collect()
397    }
398}
399
400#[async_trait::async_trait]
401impl Suggestor for DepthResearchSuggestor {
402    fn name(&self) -> &'static str {
403        "dd-depth-research"
404    }
405
406    fn dependencies(&self) -> &[ContextKey] {
407        &[ContextKey::Strategies]
408    }
409
410    fn accepts(&self, ctx: &dyn Context) -> bool {
411        self.budget.remaining("searches") > 0 && !self.unprocessed_strategies(ctx).is_empty()
412    }
413
414    async fn execute(&self, ctx: &dyn Context) -> AgentEffect {
415        let strategies = self.unprocessed_strategies(ctx);
416        let mut proposals = Vec::new();
417
418        for strategy in strategies {
419            if !self.budget.try_use("searches") {
420                break;
421            }
422
423            let query = format!("{} {strategy}", self.subject);
424            match self.search.search(&query).await {
425                Ok(hits) => {
426                    for hit in &hits {
427                        if !is_relevant(&hit.title, &hit.content, &hit.url, &self.subject) {
428                            continue;
429                        }
430                        let id = format!("signal-depth-{}", Uuid::new_v4());
431                        let content = serde_json::json!({
432                            "title": hit.title,
433                            "url": hit.url,
434                            "content": hit.content,
435                            "provider": hit.provider,
436                            "query": query,
437                        })
438                        .to_string();
439                        proposals.push(
440                            ProposedFact::new(
441                                ContextKey::Signals,
442                                id,
443                                content,
444                                "dd-depth-research",
445                            )
446                            .with_confidence(1.0),
447                        );
448                    }
449                }
450                Err(e) => {
451                    proposals.push(error_to_constraint(&e, "dd-depth-research"));
452                    if e.is_fatal() {
453                        break;
454                    }
455                }
456            }
457
458            self.processed.lock().unwrap().insert(
459                ctx.get(ContextKey::Strategies)
460                    .iter()
461                    .find(|f| f.content == strategy)
462                    .map_or_else(|| FactId::new(""), |f| f.id.clone()),
463            );
464        }
465
466        AgentEffect::with_proposals(proposals)
467    }
468}
469
470// ── Fact Extractor ────────────────────────────────────────────────
471
472/// Reads raw signals and extracts tagged factual claims via LLM.
473/// Organism owns the extraction prompt and parsing.
474pub struct FactExtractorSuggestor {
475    subject: String,
476    budget: Arc<SharedBudget>,
477    llm: Arc<dyn DdLlm>,
478    processed_signal_count: Mutex<usize>,
479}
480
481impl FactExtractorSuggestor {
482    pub fn new(subject: impl Into<String>, budget: Arc<SharedBudget>, llm: Arc<dyn DdLlm>) -> Self {
483        Self {
484            subject: subject.into(),
485            budget,
486            llm,
487            processed_signal_count: Mutex::new(0),
488        }
489    }
490}
491
492#[async_trait::async_trait]
493impl Suggestor for FactExtractorSuggestor {
494    fn name(&self) -> &'static str {
495        "dd-fact-extractor"
496    }
497
498    fn dependencies(&self) -> &[ContextKey] {
499        &[ContextKey::Signals]
500    }
501
502    fn accepts(&self, ctx: &dyn Context) -> bool {
503        let current = ctx.count(ContextKey::Signals);
504        let processed = *self.processed_signal_count.lock().unwrap();
505        self.budget.remaining("llm") > 0 && current > processed
506    }
507
508    async fn execute(&self, ctx: &dyn Context) -> AgentEffect {
509        let all_signals = ctx.get(ContextKey::Signals);
510        let (start, end) = next_batch_bounds(
511            all_signals.len(),
512            *self.processed_signal_count.lock().unwrap(),
513            15,
514        );
515        let signals: Vec<_> = all_signals
516            .iter()
517            .skip(start)
518            .take(end - start)
519            .cloned()
520            .collect();
521
522        if signals.is_empty() || !self.budget.try_use("llm") {
523            return AgentEffect::empty();
524        }
525
526        *self.processed_signal_count.lock().unwrap() = end;
527        let prompt = prompts::fact_extraction(&self.subject, &signals);
528        let mut seen_fact_keys: HashSet<String> = ctx
529            .get(ContextKey::Hypotheses)
530            .iter()
531            .filter_map(|fact| existing_fact_signature(&fact.content))
532            .collect();
533
534        let mut proposals = Vec::new();
535        match self.llm.complete(&prompt).await {
536            Ok(raw) => match parse_json_array_response(&raw, "facts") {
537                Ok(facts) => {
538                    for (i, fact) in facts.iter().enumerate() {
539                        let Some(normalized_fact) = normalize_dd_fact(fact) else {
540                            continue;
541                        };
542                        let signature = dd_fact_signature(&normalized_fact);
543                        if !seen_fact_keys.insert(signature) {
544                            continue;
545                        }
546                        let id = format!("hypothesis-{}-{i}", Uuid::new_v4());
547                        proposals.push(
548                            ProposedFact::new(
549                                ContextKey::Hypotheses,
550                                id,
551                                normalized_fact.to_string(),
552                                "dd-fact-extractor",
553                            )
554                            .with_confidence(normalized_fact["confidence"].as_f64().unwrap_or(0.5)),
555                        );
556                    }
557                }
558                Err(detail) => {
559                    let parse_err = DdError::ParseFailed {
560                        provider: "llm".into(),
561                        detail: format!(
562                            "{detail} (first 200 chars: {})",
563                            &raw[..raw.len().min(200)]
564                        ),
565                    };
566                    proposals.push(error_to_constraint(&parse_err, "dd-fact-extractor"));
567                }
568            },
569            Err(e) => {
570                proposals.push(error_to_constraint(&e, "dd-fact-extractor"));
571            }
572        }
573
574        AgentEffect::with_proposals(proposals)
575    }
576}
577
578// ── Gap Detector ──────────────────────────────────────────────────
579
580/// Reviews hypotheses, identifies critical gaps, proposes follow-up strategies.
581/// Organism owns the gap-detection prompt and strategy parsing.
582pub struct GapDetectorSuggestor {
583    subject: String,
584    budget: Arc<SharedBudget>,
585    llm: Arc<dyn DdLlm>,
586    last_hypothesis_count: Mutex<usize>,
587    generation_count: Mutex<usize>,
588    max_generations: usize,
589    min_hypotheses: usize,
590}
591
592impl GapDetectorSuggestor {
593    pub fn new(subject: impl Into<String>, budget: Arc<SharedBudget>, llm: Arc<dyn DdLlm>) -> Self {
594        Self {
595            subject: subject.into(),
596            budget,
597            llm,
598            last_hypothesis_count: Mutex::new(0),
599            generation_count: Mutex::new(0),
600            max_generations: 3,
601            min_hypotheses: 5,
602        }
603    }
604
605    pub fn with_max_generations(mut self, max: usize) -> Self {
606        self.max_generations = max;
607        self
608    }
609
610    pub fn with_min_hypotheses(mut self, min: usize) -> Self {
611        self.min_hypotheses = min;
612        self
613    }
614}
615
616#[async_trait::async_trait]
617impl Suggestor for GapDetectorSuggestor {
618    fn name(&self) -> &'static str {
619        "dd-gap-detector"
620    }
621
622    fn dependencies(&self) -> &[ContextKey] {
623        &[ContextKey::Hypotheses]
624    }
625
626    fn accepts(&self, ctx: &dyn Context) -> bool {
627        let current = ctx.count(ContextKey::Hypotheses);
628        let last = *self.last_hypothesis_count.lock().unwrap();
629        let gens = *self.generation_count.lock().unwrap();
630
631        current >= self.min_hypotheses
632            && current > last
633            && gens < self.max_generations
634            && self.budget.remaining("llm") > 0
635    }
636
637    async fn execute(&self, ctx: &dyn Context) -> AgentEffect {
638        if !self.budget.try_use("llm") {
639            return AgentEffect::empty();
640        }
641
642        let hypotheses = ctx.get(ContextKey::Hypotheses);
643        *self.last_hypothesis_count.lock().unwrap() = hypotheses.len();
644        let generation = {
645            let mut g = self.generation_count.lock().unwrap();
646            *g += 1;
647            *g
648        };
649
650        let prompt =
651            prompts::gap_detection(&self.subject, hypotheses, generation, self.max_generations);
652        let mut proposals = Vec::new();
653        let mut seen_strategy_contents: HashSet<String> = ctx
654            .get(ContextKey::Strategies)
655            .iter()
656            .map(|fact| fact.content.clone())
657            .collect();
658
659        match self.llm.complete(&prompt).await {
660            Ok(raw) => match parse_json_array_response(&raw, "strategies") {
661                Ok(strategies) => {
662                    for (i, s) in strategies.iter().enumerate() {
663                        let mode = s["mode"].as_str().unwrap_or("breadth");
664                        let query = s["query"].as_str().unwrap_or("");
665                        let reason = s["reason"].as_str().unwrap_or("");
666                        let content = format!("[{mode}] {query} -- {reason}");
667                        if query.trim().is_empty()
668                            || !seen_strategy_contents.insert(content.clone())
669                        {
670                            continue;
671                        }
672                        let id = format!("strategy-gap-{i}-{}", Uuid::new_v4());
673
674                        proposals.push(ProposedFact::new(
675                            ContextKey::Strategies,
676                            id,
677                            content,
678                            "dd-gap-detector",
679                        ));
680                    }
681                }
682                Err(detail) => {
683                    let parse_err = DdError::ParseFailed {
684                        provider: "llm".into(),
685                        detail: format!(
686                            "{detail} (first 200 chars: {})",
687                            &raw[..raw.len().min(200)]
688                        ),
689                    };
690                    proposals.push(error_to_constraint(&parse_err, "dd-gap-detector"));
691                }
692            },
693            Err(e) => {
694                proposals.push(error_to_constraint(&e, "dd-gap-detector"));
695            }
696        }
697
698        AgentEffect::with_proposals(proposals)
699    }
700}
701
702// ── Contradiction Finder ──────────────────────────────────────────
703
704/// Detects conflicting claims across hypotheses on the same topic.
705/// Pure data analysis — no LLM needed.
706pub struct ContradictionFinderSuggestor {
707    last_hypothesis_count: Mutex<usize>,
708}
709
710impl ContradictionFinderSuggestor {
711    pub fn new() -> Self {
712        Self {
713            last_hypothesis_count: Mutex::new(0),
714        }
715    }
716}
717
718#[async_trait::async_trait]
719impl Suggestor for ContradictionFinderSuggestor {
720    fn name(&self) -> &'static str {
721        "dd-contradiction-finder"
722    }
723
724    fn dependencies(&self) -> &[ContextKey] {
725        &[ContextKey::Hypotheses]
726    }
727
728    fn accepts(&self, ctx: &dyn Context) -> bool {
729        let current = ctx.count(ContextKey::Hypotheses);
730        let last = *self.last_hypothesis_count.lock().unwrap();
731        current > last && current >= 3
732    }
733
734    async fn execute(&self, ctx: &dyn Context) -> AgentEffect {
735        let hypotheses = ctx.get(ContextKey::Hypotheses);
736        *self.last_hypothesis_count.lock().unwrap() = hypotheses.len();
737
738        // Group hypotheses by topic (from JSON "category" field)
739        let mut by_category: HashMap<String, Vec<(FactId, String)>> = HashMap::new();
740        for fact in hypotheses {
741            if let Ok(v) = serde_json::from_str::<serde_json::Value>(&fact.content) {
742                let category = v["category"].as_str().unwrap_or("unknown").to_string();
743                let claim = v["claim"].as_str().unwrap_or("").to_string();
744                if !claim.is_empty() {
745                    by_category
746                        .entry(category)
747                        .or_default()
748                        .push((fact.id.clone(), claim));
749                }
750            }
751        }
752
753        let mut proposals = Vec::new();
754        let existing_evaluations: HashSet<FactId> = ctx
755            .get(ContextKey::Evaluations)
756            .iter()
757            .map(|f| f.id.clone())
758            .collect();
759
760        // Flag categories where claims contain contradictory signals
761        for (category, claims) in &by_category {
762            if claims.len() < 2 {
763                continue;
764            }
765
766            // Look for numeric disagreements or explicit contradiction markers
767            let has_contradiction = claims.iter().any(|(_, c)| {
768                c.to_lowercase().contains("contradiction")
769                    || c.to_lowercase().contains("disagree")
770                    || c.to_lowercase().contains("conflict")
771            });
772
773            if has_contradiction {
774                let id = format!("contradiction-{category}-{}", Uuid::new_v4());
775                if existing_evaluations.contains(id.as_str()) {
776                    continue;
777                }
778
779                let claim_ids: Vec<&str> = claims.iter().map(|(id, _)| id.as_str()).collect();
780                let content = serde_json::json!({
781                    "category": category,
782                    "type": "contradiction",
783                    "claim_count": claims.len(),
784                    "claim_ids": claim_ids,
785                    "description": format!("Contradictory claims detected in {category} — sources disagree"),
786                    "needs_human_review": true,
787                })
788                .to_string();
789
790                proposals.push(
791                    ProposedFact::new(
792                        ContextKey::Evaluations,
793                        id,
794                        content,
795                        "dd-contradiction-finder",
796                    )
797                    .with_confidence(0.9),
798                );
799            }
800        }
801
802        AgentEffect::with_proposals(proposals)
803    }
804}
805
806// ── Synthesis ─────────────────────────────────────────────────────
807
808/// Produces final analysis when hypotheses stabilize.
809/// Organism owns the synthesis prompt.
810pub struct SynthesisSuggestor {
811    subject: String,
812    budget: Arc<SharedBudget>,
813    llm: Arc<dyn DdLlm>,
814    last_hypothesis_count: Mutex<usize>,
815    stable_cycles: Mutex<usize>,
816    required_stable_cycles: usize,
817}
818
819impl SynthesisSuggestor {
820    pub fn new(subject: impl Into<String>, budget: Arc<SharedBudget>, llm: Arc<dyn DdLlm>) -> Self {
821        Self {
822            subject: subject.into(),
823            budget,
824            llm,
825            last_hypothesis_count: Mutex::new(0),
826            stable_cycles: Mutex::new(0),
827            required_stable_cycles: 2,
828        }
829    }
830
831    pub fn with_required_stable_cycles(mut self, n: usize) -> Self {
832        self.required_stable_cycles = n;
833        self
834    }
835}
836
#[async_trait::async_trait]
impl Suggestor for SynthesisSuggestor {
    fn name(&self) -> &'static str {
        "dd-synthesis"
    }

    fn dependencies(&self) -> &[ContextKey] {
        // Synthesis is stability-driven, not dirty-key driven.
        // It must stay schedulable even on cycles where hypotheses stop changing.
        &[]
    }

    // NOTE(review): this predicate mutates the stability counters, so it
    // assumes the engine calls `accepts` exactly once per cycle — confirm.
    fn accepts(&self, ctx: &dyn Context) -> bool {
        let current = ctx.count(ContextKey::Hypotheses);
        let mut last = self.last_hypothesis_count.lock().unwrap();
        let mut stable = self.stable_cycles.lock().unwrap();

        // Count consecutive checks where a non-zero hypothesis count is
        // unchanged; any change (or an empty set) resets the streak.
        if current == *last && current > 0 {
            *stable += 1;
        } else {
            *stable = 0;
            *last = current;
        }

        // Fire only once stable long enough, no proposal already exists,
        // and there is LLM budget left to spend.
        *stable >= self.required_stable_cycles
            && !ctx.has(ContextKey::Proposals)
            && self.budget.remaining("llm") > 0
    }

    async fn execute(&self, ctx: &dyn Context) -> AgentEffect {
        // Re-check the budget atomically; `accepts` only peeked at it.
        if !self.budget.try_use("llm") {
            return AgentEffect::empty();
        }

        let hypotheses = ctx.get(ContextKey::Hypotheses);
        let consolidated = consolidate_dd_hypotheses(hypotheses);
        let prompt = prompts::synthesis(&self.subject, &consolidated);

        match self.llm.complete(&prompt).await {
            Ok(raw) => {
                // Raw LLM output is proposed as-is with fixed confidence;
                // downstream promotion gates decide its fate.
                let id = format!("synthesis-{}", Uuid::new_v4());
                AgentEffect::with_proposal(
                    ProposedFact::new(ContextKey::Proposals, id, raw, "dd-synthesis")
                        .with_confidence(0.8),
                )
            }
            // Backend errors become constraint facts instead of panics.
            Err(e) => AgentEffect::with_proposal(error_to_constraint(&e, "dd-synthesis")),
        }
    }
}
887
/// A consolidated, deduplicated DD fact produced by
/// [`consolidate_dd_hypotheses`] and rendered into the synthesis prompt.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DdFactSummary {
    /// Normalized DD category (e.g. "product", "technology", "financials").
    pub category: String,
    /// Claim text; merging keeps the higher-confidence (then longer) wording.
    pub claim: String,
    /// Confidence clamped to [0.0, 1.0]; max across merged duplicates.
    pub confidence: f64,
    /// Number of raw facts merged into this summary.
    pub support_count: usize,
    /// Total source citations across all merged duplicates (at least 1 each).
    pub evidence_count: usize,
}
896
/// Internal scoring wrapper around a [`DdFactSummary`] used during
/// near-duplicate suppression and per-category capping.
#[derive(Debug, Clone)]
struct ConsolidationCandidate {
    /// The summary being ranked.
    summary: DdFactSummary,
    /// Claim tokens that are rare across the corpus (duplicate detection).
    distinctive_tokens: HashSet<String>,
    /// Non-numeric claim tokens (topic overlap check).
    topic_tokens: HashSet<String>,
    /// Tokens containing ASCII digits, in claim order.
    numeric_tokens: Vec<String>,
    /// Whether the claim uses hedging language ("approximately", "about ", …).
    approximate: bool,
    /// Ranking score from confidence, support, evidence, and exactness.
    priority_score: f64,
}
906
907// ── Prompts (organism-owned DD intelligence) ──────────────────────
908
/// Prompt builders for the DD loop. All DD "intelligence" (instructions,
/// category taxonomy, output schemas) lives here so apps only supply
/// search/LLM transport.
pub mod prompts {
    use super::{DdFactSummary, covered_dd_categories, missing_expected_dd_categories};
    use converge_pack::Fact;

    /// Builds the fact-extraction prompt from raw search-signal facts.
    ///
    /// Each signal's `content` is expected to be JSON with `provider`,
    /// `title`, `url`, and `content` fields; signals whose content fails
    /// to parse as JSON are silently skipped.
    pub fn fact_extraction(subject: &str, signals: &[Fact]) -> String {
        let sources_text: String = signals
            .iter()
            .enumerate()
            .filter_map(|(i, f)| {
                // Non-JSON signal content is dropped rather than erroring.
                let v: serde_json::Value = serde_json::from_str(&f.content).ok()?;
                Some(format!(
                    "[Source {i}] ({}) {}\n  URL: {}\n  {}",
                    v["provider"].as_str().unwrap_or("?"),
                    v["title"].as_str().unwrap_or(""),
                    v["url"].as_str().unwrap_or(""),
                    v["content"].as_str().unwrap_or("")
                ))
            })
            .collect::<Vec<_>>()
            .join("\n\n");

        format!(
            r#"You are an analyst extracting facts about {subject} from research sources.

{sources_text}

Extract key facts as JSON array. ONLY valid JSON, no fences:
{{
  "facts": [
    {{
      "claim": "specific factual claim",
      "category": "product|customers|technology|competition|market|financials|team|risk|governance",
      "source_indices": [0, 3],
      "confidence": 0.9
    }}
  ]
}}

Rules:
- Return an object with a top-level "facts" array
- Return at most 20 facts, prioritized by investment relevance
- Return DISTINCT facts only. Do not restate the same claim with cosmetic wording changes.
- Aim to cover these DD categories when evidence exists: product, customers, technology, competition, market, financials
- Use category "technology" for platform, architecture, integrations, attack-surface management mechanics, threat-intelligence infrastructure, APIs, or technical moat
- Do NOT label a clearly technical platform claim as "product" just because it mentions a product name
- Every fact MUST cite source_indices
- 0.9+ for primary sources, 0.7 for secondary, 0.5 for inferred
- Flag contradictions between sources as separate facts with category "risk"
- If no reliable facts can be extracted, return {{"facts":[]}}"#
        )
    }

    /// Builds the gap-detection prompt that asks the LLM for follow-up
    /// search strategies. Coverage hints are computed from the normalized
    /// categories of the current hypotheses.
    pub fn gap_detection(
        subject: &str,
        hypotheses: &[Fact],
        generation: usize,
        max_generations: usize,
    ) -> String {
        let facts_text: String = hypotheses
            .iter()
            .map(|f| f.content.as_str())
            .collect::<Vec<_>>()
            .join("\n");
        let covered_categories = covered_dd_categories(hypotheses);
        let missing_categories = missing_expected_dd_categories(hypotheses);
        let covered_text = if covered_categories.is_empty() {
            "none yet".to_string()
        } else {
            covered_categories.join(", ")
        };
        let missing_text = if missing_categories.is_empty() {
            "none".to_string()
        } else {
            missing_categories.join(", ")
        };

        format!(
            r#"You are a PE analyst reviewing extracted facts about {subject}.

Current facts:
{facts_text}

Covered categories:
{covered_text}

Missing expected categories:
{missing_text}

What critical gaps remain? Focus on:
- Missing financials (ARR, growth, margins)
- Unknown ownership/investors
- Unclear competitive positioning
- Missing tech stack details
- Unknown customer concentration

Return JSON object:
{{
  "strategies": [
    {{"query": "search terms", "mode": "breadth|depth", "reason": "why this matters"}}
  ]
}}

This is research pass {generation} of {max_generations}. Only propose searches for gaps that are CRITICAL for investment decision-making.
Pass 1: broad gaps (max 4). Pass 2+: only truly unresolved items (max 2).
ONLY valid JSON, no markdown fences. If no critical gaps remain, return {{"strategies":[]}}."#
        )
    }

    /// Builds the final synthesis prompt from consolidated fact summaries.
    pub fn synthesis(subject: &str, hypotheses: &[DdFactSummary]) -> String {
        let facts_text = if hypotheses.is_empty() {
            "No consolidated facts were available.".to_string()
        } else {
            hypotheses
                .iter()
                .map(|fact| {
                    format!(
                        "- [{} | confidence {:.2} | support {} | evidence {}] {}",
                        fact.category,
                        fact.confidence,
                        fact.support_count,
                        fact.evidence_count,
                        fact.claim
                    )
                })
                .collect::<Vec<_>>()
                .join("\n")
        };

        format!(
            r#"You are a senior PE analyst producing a final due diligence synthesis for {subject}.

Consolidated facts:
{facts_text}

Produce a final analysis as JSON:
{{
  "summary": "2-3 paragraph executive summary",
  "market_analysis": "market analysis",
  "competitive_landscape": "competitive analysis",
  "technology_assessment": "tech assessment",
  "risk_factors": ["risk 1", "risk 2"],
  "growth_opportunities": ["opp 1", "opp 2"],
  "recommendation": "investment recommendation"
}}

ONLY valid JSON, no markdown fences. All values plain strings."#
        )
    }
}
1058
1059// ── Helpers ───────────────────────────────────────────────────────
1060
/// Removes a surrounding markdown code fence (```json or bare ```) from an
/// LLM response, returning the trimmed inner text as a borrowed slice.
fn strip_fences(raw: &str) -> &str {
    let mut body = raw.trim();
    if let Some(rest) = body.strip_prefix("```json") {
        body = rest;
    } else if let Some(rest) = body.strip_prefix("```") {
        body = rest;
    }
    if let Some(rest) = body.strip_suffix("```") {
        body = rest;
    }
    body.trim()
}
1069
1070pub fn consolidate_dd_hypotheses(hypotheses: &[Fact]) -> Vec<DdFactSummary> {
1071    consolidate_dd_fact_values(
1072        hypotheses
1073            .iter()
1074            .filter_map(|fact| serde_json::from_str::<serde_json::Value>(&fact.content).ok())
1075            .collect::<Vec<_>>(),
1076    )
1077}
1078
1079// ── Hook Extraction ──────────────────────────────────────────────
1080
/// Entities extracted from DD facts for knowledge graph connections.
#[derive(Debug, Clone, Default)]
pub struct DdHooks {
    /// Names found in "financials" claims via entity-trigger phrases.
    pub investors: Vec<String>,
    /// Business-area labels matched by substring against claim text.
    pub business_areas: Vec<String>,
    /// Canonical region labels matched by substring against claim text.
    pub regions: Vec<String>,
    /// Names found in "competition"/"competitors" claims via trigger phrases.
    pub competitors: Vec<String>,
}
1089
/// Configurable patterns for hook extraction.
#[derive(Debug, Clone)]
pub struct HookPatterns {
    /// (lowercase needle, canonical label) pairs for business areas.
    pub business_areas: Vec<(String, String)>,
    /// (lowercase needle, canonical label) pairs for regions.
    pub regions: Vec<(String, String)>,
    /// Lowercase phrases whose following text names an entity
    /// (e.g. "acquired by ").
    pub entity_triggers: Vec<String>,
}
1097
impl Default for HookPatterns {
    fn default() -> Self {
        Self {
            // (lowercase needle, canonical label). Needles with a leading
            // space or comma (" grc", ",grc", " esg") avoid matching inside
            // longer words.
            business_areas: vec![
                ("saas".into(), "SaaS".into()),
                (" grc".into(), "Governance, Risk & Compliance (GRC)".into()),
                (",grc".into(), "Governance, Risk & Compliance (GRC)".into()),
                (
                    "governance, risk".into(),
                    "Governance, Risk & Compliance (GRC)".into(),
                ),
                (" esg".into(), "ESG Reporting".into()),
                ("sustainability".into(), "ESG Reporting".into()),
                ("compliance".into(), "Compliance Management".into()),
                ("strategic planning".into(), "Strategic Planning".into()),
                ("quality management".into(), "Quality Management".into()),
                ("risk management".into(), "Risk Management".into()),
                ("edc".into(), "Electronic Data Capture (EDC)".into()),
                ("clinical trial".into(), "Clinical Trial Management".into()),
                ("eclinical".into(), "eClinical Solutions".into()),
                ("cybersecurity".into(), "Cybersecurity".into()),
                ("vulnerability".into(), "Vulnerability Management".into()),
                ("threat intelligence".into(), "Threat Intelligence".into()),
                ("penetration testing".into(), "Penetration Testing".into()),
                ("attack surface".into(), "Attack Surface Management".into()),
                ("workforce management".into(), "Workforce Management".into()),
                (
                    "scheduling".into(),
                    "Scheduling & Resource Management".into(),
                ),
                ("timetabling".into(), "Timetabling".into()),
                (
                    "higher education".into(),
                    "Higher Education Software".into(),
                ),
                ("edtech".into(), "EdTech".into()),
                (
                    "business intelligence".into(),
                    "Business Intelligence".into(),
                ),
                ("analytics".into(), "Analytics".into()),
                ("fintech".into(), "FinTech".into()),
                ("payment".into(), "Payment Solutions".into()),
                ("information security".into(), "Information Security".into()),
                ("regulatory".into(), "Regulatory Technology".into()),
                ("crm".into(), "CRM".into()),
                ("marketing automation".into(), "Marketing Automation".into()),
                ("sales automation".into(), "Sales Automation".into()),
            ],
            // " us " / " uk " are space-padded to avoid substring false
            // positives (e.g. "status", "ukulele").
            // NOTE(review): "germany" maps to "Europe" while "france" maps
            // to "France" — confirm the asymmetry is intended.
            regions: vec![
                ("nordic".into(), "Nordics".into()),
                ("scandinav".into(), "Nordics".into()),
                ("sweden".into(), "Nordics".into()),
                ("norway".into(), "Nordics".into()),
                ("denmark".into(), "Nordics".into()),
                ("finland".into(), "Nordics".into()),
                ("europe".into(), "Europe".into()),
                ("north america".into(), "North America".into()),
                ("united states".into(), "North America".into()),
                (" us ".into(), "North America".into()),
                ("apac".into(), "APAC".into()),
                ("asia".into(), "APAC".into()),
                ("japan".into(), "Japan".into()),
                ("united kingdom".into(), "United Kingdom".into()),
                (" uk ".into(), "United Kingdom".into()),
                ("germany".into(), "Europe".into()),
                ("france".into(), "France".into()),
                ("global".into(), "Global".into()),
            ],
            // Trailing spaces keep the entity slice starting at the name.
            entity_triggers: vec![
                "acquired by ".into(),
                "acquired ".into(),
                "investment from ".into(),
                "invested by ".into(),
                "backed by ".into(),
                "funded by ".into(),
                "partnership with ".into(),
                "partner ".into(),
                "competes with ".into(),
                "competitor ".into(),
                "competitors like ".into(),
                "competitors such as ".into(),
                "alternatives include ".into(),
                "compared to ".into(),
                "compared against ".into(),
                "compared against competitors like ".into(),
            ],
        }
    }
}
1188
1189/// Extract graph hooks (investors, business areas, regions, competitors) from consolidated facts.
1190pub fn extract_hooks_from_facts(
1191    subject: &str,
1192    facts: &[DdFactSummary],
1193    patterns: &HookPatterns,
1194) -> DdHooks {
1195    let mut business_areas = std::collections::BTreeSet::new();
1196    let mut competitors = std::collections::BTreeSet::new();
1197    let mut investors = std::collections::BTreeSet::new();
1198    let mut regions = std::collections::BTreeSet::new();
1199
1200    let subject_lower = subject.to_lowercase();
1201
1202    for fact in facts {
1203        let claim = &fact.claim;
1204        let claim_lower = claim.to_lowercase();
1205
1206        // Business areas from any claim
1207        for (pattern, label) in &patterns.business_areas {
1208            if claim_lower.contains(pattern.as_str()) {
1209                business_areas.insert(label.clone());
1210            }
1211        }
1212
1213        // Competitors from competition claims
1214        match fact.category.as_str() {
1215            "competition" | "competitors" => {
1216                for name in extract_named_entities(claim, &subject_lower, &patterns.entity_triggers)
1217                {
1218                    competitors.insert(name);
1219                }
1220            }
1221            "financials" => {
1222                for name in extract_named_entities(claim, &subject_lower, &patterns.entity_triggers)
1223                {
1224                    investors.insert(name);
1225                }
1226            }
1227            _ => {}
1228        }
1229
1230        // Regions from any claim
1231        let mut seen_regions = std::collections::HashSet::new();
1232        for (pattern, label) in &patterns.regions {
1233            if claim_lower.contains(pattern.as_str()) && seen_regions.insert(label.clone()) {
1234                regions.insert(label.clone());
1235            }
1236        }
1237    }
1238
1239    DdHooks {
1240        investors: investors.into_iter().collect(),
1241        business_areas: business_areas.into_iter().collect(),
1242        regions: regions.into_iter().collect(),
1243        competitors: competitors.into_iter().collect(),
1244    }
1245}
1246
/// Pulls candidate entity names that follow a trigger phrase
/// (e.g. "acquired by ") out of a claim.
///
/// Triggers are matched against the lowercased claim, but the entity text
/// is sliced from the original claim to preserve casing. Entities that are
/// empty, 60+ bytes long, or equal to `exclude_lower` (the subject) are
/// dropped. Only the first occurrence of each trigger is considered.
fn extract_named_entities(claim: &str, exclude_lower: &str, triggers: &[String]) -> Vec<String> {
    let mut entities = Vec::new();
    let claim_lower = claim.to_lowercase();

    for trigger in triggers {
        let Some(pos) = claim_lower.find(trigger.as_str()) else {
            continue;
        };
        // `pos` is a byte offset into the LOWERCASED string. Unicode case
        // mapping can change byte lengths (e.g. 'İ' lowercases to a longer
        // sequence), so indexing the original with `claim[pos + len..]`
        // could panic on a non-char boundary. A checked slice skips such
        // misaligned matches instead of panicking.
        let Some(after) = claim.get(pos + trigger.len()..) else {
            continue;
        };
        let entity = after
            .split([',', '.', ';', '(', ')'])
            .next()
            .unwrap_or("")
            .trim();
        if !entity.is_empty() && entity.len() < 60 && entity.to_lowercase() != exclude_lower {
            entities.push(entity.to_string());
        }
    }

    entities
}
1267
/// Computes half-open `[start, end)` bounds for the next processing batch,
/// clamping both ends so they never exceed `total_items`.
fn next_batch_bounds(
    total_items: usize,
    processed_items: usize,
    max_batch: usize,
) -> (usize, usize) {
    let start = if processed_items > total_items {
        total_items
    } else {
        processed_items
    };
    let end = total_items.min(start + max_batch);
    (start, end)
}
1277
/// Core consolidation pipeline over parsed fact JSON values:
/// 1. normalize each fact and merge exact duplicates by signature,
/// 2. score candidates and sort (category rank, then priority),
/// 3. greedily keep facts, skipping near-duplicates and capped categories.
fn consolidate_dd_fact_values<I>(values: I) -> Vec<DdFactSummary>
where
    I: IntoIterator<Item = serde_json::Value>,
{
    // Phase 1: exact dedup — facts sharing a signature are merged.
    let mut by_signature: HashMap<String, DdFactSummary> = HashMap::new();
    for value in values {
        let Some(normalized) = normalize_dd_fact(&value) else {
            continue;
        };
        let Some(summary) = summary_from_normalized_fact(&normalized) else {
            continue;
        };
        let signature = dd_fact_signature(&normalized);
        if let Some(existing) = by_signature.get_mut(&signature) {
            merge_exact_summary(existing, summary);
        } else {
            by_signature.insert(signature, summary);
        }
    }

    let summaries: Vec<DdFactSummary> = by_signature.into_values().collect();
    if summaries.is_empty() {
        return Vec::new();
    }

    // Phase 2: build scored candidates; document frequency feeds the
    // "distinctive token" selection used for near-duplicate detection.
    let token_frequencies = token_document_frequency(&summaries);
    let total_summaries = summaries.len();
    let mut candidates: Vec<ConsolidationCandidate> = summaries
        .into_iter()
        .map(|summary| build_consolidation_candidate(summary, &token_frequencies, total_summaries))
        .collect();
    candidates.sort_by(compare_candidates);

    // Phase 3: greedy selection in priority order — skip near-duplicates
    // of already-kept facts and stop once a category hits its cap.
    let mut kept = Vec::new();
    let mut counts_by_category: HashMap<String, usize> = HashMap::new();
    for candidate in candidates {
        if kept
            .iter()
            .any(|existing| should_skip_candidate(&candidate, existing))
        {
            continue;
        }

        let count = counts_by_category
            .get(candidate.summary.category.as_str())
            .copied()
            .unwrap_or(0);
        if count >= category_fact_cap(candidate.summary.category.as_str()) {
            continue;
        }

        counts_by_category
            .entry(candidate.summary.category.clone())
            .and_modify(|value| *value += 1)
            .or_insert(1);
        kept.push(candidate);
    }

    // Re-sort: skips may have disturbed nothing, but keeps output order
    // deterministic by the same comparator used for selection.
    kept.sort_by(compare_candidates);
    kept.into_iter()
        .map(|candidate| candidate.summary)
        .collect()
}
1341
1342fn summary_from_normalized_fact(fact: &serde_json::Value) -> Option<DdFactSummary> {
1343    let category = fact.get("category")?.as_str()?.to_string();
1344    let claim = fact.get("claim")?.as_str()?.trim().to_string();
1345    if claim.is_empty() {
1346        return None;
1347    }
1348
1349    let confidence = fact
1350        .get("confidence")
1351        .and_then(serde_json::Value::as_f64)
1352        .unwrap_or(0.5)
1353        .clamp(0.0, 1.0);
1354    let evidence_count = fact
1355        .get("source_indices")
1356        .and_then(serde_json::Value::as_array)
1357        .map_or(1, |values| values.len().max(1));
1358
1359    Some(DdFactSummary {
1360        category,
1361        claim,
1362        confidence,
1363        support_count: 1,
1364        evidence_count,
1365    })
1366}
1367
/// Merges a duplicate summary (same fact signature) into `existing`.
#[allow(clippy::float_cmp)]
fn merge_exact_summary(existing: &mut DdFactSummary, candidate: DdFactSummary) {
    // Exact duplicates pool their corroboration.
    existing.support_count += candidate.support_count;
    existing.evidence_count += candidate.evidence_count;
    // Prefer the higher-confidence wording; on an exact confidence tie
    // (hence the float_cmp allow) prefer the longer claim text.
    if candidate.confidence > existing.confidence
        || (candidate.confidence == existing.confidence
            && candidate.claim.len() > existing.claim.len())
    {
        existing.claim = candidate.claim;
    }
    // Merged confidence is the max of the two.
    existing.confidence = existing.confidence.max(candidate.confidence);
}
1380
/// Wraps a summary with the derived token sets and ranking score used by
/// the near-duplicate and capping passes.
fn build_consolidation_candidate(
    summary: DdFactSummary,
    token_frequencies: &HashMap<String, usize>,
    total_summaries: usize,
) -> ConsolidationCandidate {
    let claim_tokens = informative_claim_tokens(&summary.claim);
    // Topic tokens: claim tokens without any ASCII digit.
    let topic_tokens: HashSet<String> = claim_tokens
        .iter()
        .filter(|token| !token.chars().any(|ch| ch.is_ascii_digit()))
        .cloned()
        .collect();
    // Distinctive tokens: appear in at most roughly half of all summaries
    // (df * 2 <= n + 1), i.e. tokens that actually discriminate claims.
    let distinctive_tokens: HashSet<String> = claim_tokens
        .iter()
        .filter(|token| {
            token_frequencies.get(*token).copied().unwrap_or_default() * 2 <= total_summaries + 1
        })
        .cloned()
        .collect();
    let approximate = claim_is_approximate(&summary.claim);
    let numeric_tokens = numeric_claim_tokens(&summary.claim);
    let priority_score = fact_priority_score(&summary, approximate, numeric_tokens.len());

    ConsolidationCandidate {
        summary,
        // If a filter left nothing, fall back to all claim tokens so the
        // similarity checks never compare against an empty set.
        distinctive_tokens: if distinctive_tokens.is_empty() {
            claim_tokens.iter().cloned().collect()
        } else {
            distinctive_tokens
        },
        topic_tokens: if topic_tokens.is_empty() {
            claim_tokens.iter().cloned().collect()
        } else {
            topic_tokens
        },
        numeric_tokens,
        approximate,
        priority_score,
    }
}
1420
1421fn token_document_frequency(summaries: &[DdFactSummary]) -> HashMap<String, usize> {
1422    let mut frequencies = HashMap::new();
1423    for summary in summaries {
1424        let mut seen = HashSet::new();
1425        for token in informative_claim_tokens(&summary.claim) {
1426            if seen.insert(token.clone()) {
1427                frequencies
1428                    .entry(token)
1429                    .and_modify(|count| *count += 1)
1430                    .or_insert(1);
1431            }
1432        }
1433    }
1434    frequencies
1435}
1436
1437fn informative_claim_tokens(claim: &str) -> Vec<String> {
1438    canonicalize_claim(claim)
1439        .split_whitespace()
1440        .filter(|token| token.len() > 2 && !dd_stopwords().contains(token))
1441        .map(ToOwned::to_owned)
1442        .collect()
1443}
1444
1445fn numeric_claim_tokens(claim: &str) -> Vec<String> {
1446    canonicalize_claim(claim)
1447        .split_whitespace()
1448        .filter(|token| token.chars().any(|ch| ch.is_ascii_digit()))
1449        .map(ToOwned::to_owned)
1450        .collect()
1451}
1452
/// Heuristic: does the claim hedge its figures ("approximately", "about",
/// ranges, temporal qualifiers)? Hedged claims score lower and are easier
/// to suppress as near-duplicates of exact ones.
fn claim_is_approximate(claim: &str) -> bool {
    // Trailing spaces on short needles ("about ", "over ", …) avoid
    // matching inside longer words.
    const HEDGES: [&str; 15] = [
        "estimated",
        "estimate",
        "approximately",
        "approx",
        "about ",
        "over ",
        "under ",
        "close to",
        "around ",
        "range of",
        "between ",
        "currently",
        "historically",
        "more than",
        "less than",
    ];
    let lowered = claim.to_ascii_lowercase();
    HEDGES.iter().any(|needle| lowered.contains(needle))
}
1475
1476#[allow(clippy::cast_precision_loss)]
1477fn fact_priority_score(
1478    summary: &DdFactSummary,
1479    approximate: bool,
1480    numeric_token_count: usize,
1481) -> f64 {
1482    let confidence_score = summary.confidence * 100.0;
1483    let support_bonus = summary.support_count as f64 * 6.0;
1484    let evidence_bonus = summary.evidence_count as f64 * 2.0;
1485    let exactness_bonus = if approximate { 0.0 } else { 5.0 };
1486    let numeric_bonus = if numeric_token_count > 0 { 2.0 } else { 0.0 };
1487    confidence_score + support_bonus + evidence_bonus + exactness_bonus + numeric_bonus
1488}
1489
/// Ordering for candidates: category rank first, then higher priority
/// score, then longer claim text as the final tiebreaker.
fn compare_candidates(left: &ConsolidationCandidate, right: &ConsolidationCandidate) -> Ordering {
    category_sort_order(left.summary.category.as_str())
        .cmp(&category_sort_order(right.summary.category.as_str()))
        .then_with(|| {
            // `right` vs `left` so higher scores sort first (descending);
            // partial_cmp falls back to Equal for NaN rather than panicking.
            right
                .priority_score
                .partial_cmp(&left.priority_score)
                .unwrap_or(Ordering::Equal)
        })
        .then_with(|| right.summary.claim.len().cmp(&left.summary.claim.len()))
}
1501
/// Report/priority rank for a DD category; unrecognized categories sort
/// last (rank 9).
fn category_sort_order(category: &str) -> usize {
    const RANKED: [&str; 9] = [
        "product",
        "customers",
        "technology",
        "competition",
        "market",
        "financials",
        "team",
        "governance",
        "risk",
    ];
    RANKED
        .iter()
        .position(|known| *known == category)
        .unwrap_or(RANKED.len())
}
1516
/// Maximum number of consolidated facts to keep per category.
fn category_fact_cap(category: &str) -> usize {
    if category == "technology" {
        5
    } else if category == "financials" {
        4
    } else if category == "customers" || category == "competition" {
        3
    } else {
        2
    }
}
1525
/// Decides whether `candidate` is a near-duplicate of an already-kept
/// fact. Only facts in the same category can suppress each other.
fn should_skip_candidate(
    candidate: &ConsolidationCandidate,
    existing: &ConsolidationCandidate,
) -> bool {
    if candidate.summary.category != existing.summary.category {
        return false;
    }

    let similarity = token_similarity(&candidate.distinctive_tokens, &existing.distinctive_tokens);
    let topic_similarity = token_similarity(&candidate.topic_tokens, &existing.topic_tokens);
    // Rule 1: near-identical distinctive wording.
    if similarity >= 0.86 {
        return true;
    }

    // Rule 2: identical figures plus moderately similar wording — a
    // restatement of the same numeric fact.
    if !candidate.numeric_tokens.is_empty()
        && candidate.numeric_tokens == existing.numeric_tokens
        && similarity >= 0.55
    {
        return true;
    }

    // Rule 3: a hedged claim on the same topic is dropped in favor of an
    // exact kept claim, or of an equally hedged claim with the same figures.
    candidate.approximate
        && topic_similarity >= 0.5
        && (!existing.approximate || candidate.numeric_tokens == existing.numeric_tokens)
}
1551
/// Jaccard similarity of two token sets: |intersection| / |union|.
/// Returns 0.0 when either set is empty; otherwise the result is in (0, 1].
#[allow(clippy::cast_precision_loss)]
fn token_similarity(left: &HashSet<String>, right: &HashSet<String>) -> f64 {
    if left.is_empty() || right.is_empty() {
        return 0.0;
    }

    let intersection = left.intersection(right).count() as f64;
    // Both sets are non-empty here, so the union can never be empty; the
    // original `union == 0.0` guard was unreachable dead code.
    let union = left.union(right).count() as f64;
    intersection / union
}
1566
/// Low-signal words stripped from claims before token comparison; filler
/// verbs and generic nouns that would otherwise inflate similarity.
fn dd_stopwords() -> &'static [&'static str] {
    const STOPWORDS: &[&str] = &[
        "and", "for", "the", "with", "into", "that", "from", "their", "this",
        "those", "these", "across", "through", "using", "used", "helps",
        "help", "offer", "offers", "provides", "provide", "company",
        "companies", "solution", "solutions",
    ];
    STOPWORDS
}
1596
1597fn existing_fact_signature(content: &str) -> Option<String> {
1598    let value = serde_json::from_str::<serde_json::Value>(content).ok()?;
1599    let normalized = normalize_dd_fact(&value)?;
1600    Some(dd_fact_signature(&normalized))
1601}
1602
/// Normalizes a raw LLM fact object into the canonical shape
/// `{claim, category, source_indices, confidence}`.
/// Returns `None` when the claim is missing or empty.
fn normalize_dd_fact(fact: &serde_json::Value) -> Option<serde_json::Value> {
    let claim = fact.get("claim")?.as_str()?.trim();
    if claim.is_empty() {
        return None;
    }

    // Category is coerced onto the known taxonomy, inferring from the
    // claim text when the raw label is missing or unrecognized.
    let category = normalize_dd_category(
        fact.get("category").and_then(serde_json::Value::as_str),
        claim,
    );
    // Missing/invalid confidence defaults to 0.5, clamped to [0, 1].
    let confidence = fact
        .get("confidence")
        .and_then(serde_json::Value::as_f64)
        .unwrap_or(0.5)
        .clamp(0.0, 1.0);
    // Keep only integer source indices; anything else is discarded.
    let source_indices = fact
        .get("source_indices")
        .and_then(serde_json::Value::as_array)
        .map(|values| {
            values
                .iter()
                .filter(|value| value.is_i64() || value.is_u64())
                .cloned()
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    Some(serde_json::json!({
        "claim": claim,
        "category": category,
        "source_indices": source_indices,
        "confidence": confidence,
    }))
}
1637
/// Maps a raw (possibly messy) category label onto the canonical DD
/// taxonomy. "product" claims that read as technical are reclassified to
/// "technology", mirroring the extraction prompt's rule; unknown labels
/// fall back to claim-text inference.
fn normalize_dd_category(raw_category: Option<&str>, claim: &str) -> &'static str {
    match raw_category
        .unwrap_or_default()
        .trim()
        .to_ascii_lowercase()
        .as_str()
    {
        "product" => {
            if claim_looks_technical(claim) {
                "technology"
            } else {
                "product"
            }
        }
        "customer" | "customers" => "customers",
        "technology" | "tech" | "platform" | "architecture" | "engineering" | "integrations"
        | "integration" | "stack" => "technology",
        "competition" | "competitor" | "competitors" => "competition",
        "market" | "positioning" => "market",
        "financial" | "financials" | "finance" | "funding" | "ownership" | "investors" => {
            "financials"
        }
        "team" | "leadership" | "management" => "team",
        "risk" => "risk",
        "governance" => "governance",
        _ => infer_dd_category_from_claim(claim),
    }
}
1666
/// Infers a DD category from claim text when no usable label was given.
/// Checks run in priority order: technical > customers > financials >
/// competition > market > team, with "product" as the final fallback.
fn infer_dd_category_from_claim(claim: &str) -> &'static str {
    let claim = claim.to_ascii_lowercase();
    if claim_looks_technical(&claim) {
        "technology"
    } else if claim.contains("customer")
        || claim.contains("clients")
        || claim.contains("serves ")
        || claim.contains("countries")
    {
        "customers"
    } else if claim.contains("funding")
        || claim.contains("raised")
        || claim.contains("investor")
        || claim.contains("acquired")
        || claim.contains("revenue")
        // NOTE(review): the bare substring "arr" also matches words like
        // "array" or "barrier" — confirm this over-match is acceptable.
        || claim.contains("arr")
    {
        "financials"
    } else if claim.contains("competitor") || claim.contains("competes") {
        "competition"
    } else if claim.contains("market") || claim.contains("major player") || claim.contains("idc") {
        "market"
    } else if claim.contains("chief ")
        || claim.contains("officer")
        || claim.contains("executive")
        || claim.contains("leadership")
    {
        "team"
    } else {
        "product"
    }
}
1699
/// Heuristic: does the claim read as a technology/platform claim?
/// Substring-based, so short needles like "api" can over-match (e.g.
/// inside "capital"); behavior kept as-is for parity with the taxonomy.
fn claim_looks_technical(claim: &str) -> bool {
    const TECH_NEEDLES: [&str; 19] = [
        "technology",
        "architecture",
        "platform",
        "integration",
        "integrations",
        "api",
        "apis",
        "cloud",
        "threat intelligence",
        "attack surface",
        "monitor",
        "monitoring",
        "ctem",
        "exposure management",
        "internet-facing",
        "dark web",
        "open web",
        "deep web",
        "technical moat",
    ];
    let lowered = claim.to_ascii_lowercase();
    TECH_NEEDLES.iter().any(|needle| lowered.contains(needle))
}
1726
1727fn dd_fact_signature(fact: &serde_json::Value) -> String {
1728    let category = fact
1729        .get("category")
1730        .and_then(serde_json::Value::as_str)
1731        .unwrap_or("product");
1732    let claim = fact
1733        .get("claim")
1734        .and_then(serde_json::Value::as_str)
1735        .unwrap_or_default();
1736    format!("{category}:{}", canonicalize_claim(claim))
1737}
1738
/// Canonical text form for comparison: ASCII-alphanumeric chars are
/// lowercased, everything else becomes whitespace, and runs of whitespace
/// collapse to single spaces.
fn canonicalize_claim(claim: &str) -> String {
    let mut normalized = String::with_capacity(claim.len());
    for ch in claim.chars() {
        if ch.is_ascii_alphanumeric() {
            normalized.push(ch.to_ascii_lowercase());
        } else {
            normalized.push(' ');
        }
    }
    normalized.split_whitespace().collect::<Vec<_>>().join(" ")
}
1754
1755fn covered_dd_categories(hypotheses: &[Fact]) -> Vec<String> {
1756    let mut categories: Vec<String> = hypotheses
1757        .iter()
1758        .filter_map(|fact| {
1759            let value = serde_json::from_str::<serde_json::Value>(&fact.content).ok()?;
1760            let normalized = normalize_dd_fact(&value)?;
1761            normalized["category"].as_str().map(ToOwned::to_owned)
1762        })
1763        .collect::<HashSet<_>>()
1764        .into_iter()
1765        .collect();
1766    categories.sort();
1767    categories
1768}
1769
1770fn missing_expected_dd_categories(hypotheses: &[Fact]) -> Vec<&'static str> {
1771    let covered: HashSet<String> = covered_dd_categories(hypotheses).into_iter().collect();
1772    expected_dd_categories()
1773        .into_iter()
1774        .filter(|category| !covered.contains(*category))
1775        .collect()
1776}
1777
/// The category axes a complete DD fact set is expected to cover.
fn expected_dd_categories() -> [&'static str; 6] {
    ["product", "customers", "technology", "competition", "market", "financials"]
}
1788
1789fn parse_json_array_response(
1790    raw: &str,
1791    field_name: &str,
1792) -> Result<Vec<serde_json::Value>, String> {
1793    let cleaned = strip_fences(raw);
1794    try_parse_json_array(cleaned, field_name).or_else(|first_error| {
1795        extract_first_json_value(cleaned)
1796            .filter(|candidate| *candidate != cleaned)
1797            .ok_or(first_error.clone())
1798            .and_then(|candidate| {
1799                try_parse_json_array(candidate, field_name).map_err(|second_error| {
1800                    format!("{first_error}; recovered JSON failed: {second_error}")
1801                })
1802            })
1803    })
1804}
1805
1806fn try_parse_json_array(raw: &str, field_name: &str) -> Result<Vec<serde_json::Value>, String> {
1807    match serde_json::from_str::<serde_json::Value>(raw) {
1808        Ok(serde_json::Value::Array(values)) => Ok(values),
1809        Ok(serde_json::Value::Object(map)) => map
1810            .get(field_name)
1811            .and_then(serde_json::Value::as_array)
1812            .cloned()
1813            .ok_or_else(|| format!("expected object field `{field_name}` containing an array")),
1814        Ok(_) => Err(format!(
1815            "expected top-level JSON array or object with `{field_name}`"
1816        )),
1817        Err(error) => Err(error.to_string()),
1818    }
1819}
1820
/// Find the first balanced JSON object or array embedded in `raw` and
/// return it as a subslice. String literals are skipped (including escaped
/// quotes), so braces inside strings do not affect bracket matching.
/// Returns `None` when no opener exists or brackets are mismatched/unclosed.
fn extract_first_json_value(raw: &str) -> Option<&str> {
    let start = raw.find(|c| c == '{' || c == '[')?;
    // Stack of the closing characters we still expect to see.
    let mut expected_closers: Vec<char> = Vec::new();
    let mut inside_string = false;
    let mut after_backslash = false;

    for (offset, ch) in raw[start..].char_indices() {
        if inside_string {
            if after_backslash {
                after_backslash = false;
            } else if ch == '\\' {
                after_backslash = true;
            } else if ch == '"' {
                inside_string = false;
            }
            continue;
        }

        match ch {
            '"' => inside_string = true,
            '{' => expected_closers.push('}'),
            '[' => expected_closers.push(']'),
            '}' | ']' => {
                if expected_closers.pop() != Some(ch) {
                    // Mismatched closer — not valid JSON nesting.
                    return None;
                }
                if expected_closers.is_empty() {
                    let end = start + offset + ch.len_utf8();
                    return Some(&raw[start..end]);
                }
            }
            _ => {}
        }
    }

    None
}
1860
/// Case-insensitive relevance filter: a search hit is relevant when the
/// subject appears in its title or body, or when the URL contains the
/// subject with spaces removed or replaced by hyphens (slug forms).
fn is_relevant(title: &str, content: &str, url: &str, subject: &str) -> bool {
    let needle = subject.to_lowercase();
    if title.to_lowercase().contains(&needle) || content.to_lowercase().contains(&needle) {
        return true;
    }
    let url = url.to_lowercase();
    url.contains(&needle.replace(' ', "")) || url.contains(&needle.replace(' ', "-"))
}
1871
// Unit, negative, and property tests for the DD parsing, consolidation,
// and suggestor-gating helpers above.
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use converge_pack::{Context, ContextKey, Fact, ProposedFact, Suggestor};

    use super::{
        DdError, DdLlm, SharedBudget, SynthesisSuggestor, canonicalize_claim,
        consolidate_dd_fact_values, extract_first_json_value, next_batch_bounds, normalize_dd_fact,
        parse_json_array_response,
    };

    // LLM stub: always succeeds with an empty JSON object so suggestor
    // construction and scheduling can run without a real backend.
    struct StubLlm;

    #[async_trait::async_trait]
    impl DdLlm for StubLlm {
        async fn complete(&self, prompt: &str) -> Result<String, DdError> {
            let _ = prompt;
            Ok("{}".to_string())
        }
    }

    // Context stub exposing only what the gating tests below inspect:
    // a hypothesis count and whether any proposals are pending.
    struct StubContext {
        hypothesis_count: usize,
        has_proposals: bool,
    }

    impl Context for StubContext {
        fn has(&self, key: ContextKey) -> bool {
            match key {
                ContextKey::Hypotheses => self.hypothesis_count > 0,
                ContextKey::Proposals => self.has_proposals,
                _ => false,
            }
        }

        fn get(&self, _key: ContextKey) -> &[Fact] {
            &[]
        }

        fn get_proposals(&self, _key: ContextKey) -> &[ProposedFact] {
            &[]
        }

        fn count(&self, key: ContextKey) -> usize {
            match key {
                ContextKey::Hypotheses => self.hypothesis_count,
                _ => 0,
            }
        }
    }

    #[test]
    fn synthesis_suggestor_is_always_schedulable() {
        let budget = Arc::new(SharedBudget::new().with_limit("llm", 1));
        let suggestor = SynthesisSuggestor::new("Acme", budget, Arc::new(StubLlm));

        assert!(suggestor.dependencies().is_empty());
    }

    // Synthesis should only fire after the hypothesis count has been stable
    // for the configured number of consecutive cycles.
    #[test]
    fn synthesis_accepts_after_hypotheses_stabilize() {
        let budget = Arc::new(SharedBudget::new().with_limit("llm", 1));
        let suggestor = SynthesisSuggestor::new("Acme", budget, Arc::new(StubLlm))
            .with_required_stable_cycles(2);

        let first_fact_wave = StubContext {
            hypothesis_count: 5,
            has_proposals: false,
        };
        let first_stable_cycle = StubContext {
            hypothesis_count: 5,
            has_proposals: false,
        };
        let second_stable_cycle = StubContext {
            hypothesis_count: 5,
            has_proposals: false,
        };

        assert!(!suggestor.accepts(&first_fact_wave));
        assert!(!suggestor.accepts(&first_stable_cycle));
        assert!(suggestor.accepts(&second_stable_cycle));
    }

    #[test]
    fn parse_json_array_response_accepts_wrapped_object() {
        let parsed = parse_json_array_response(
            r#"{"facts":[{"claim":"Acme sells software","confidence":0.9}]}"#,
            "facts",
        )
        .expect("wrapped array should parse");

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0]["claim"], "Acme sells software");
    }

    #[test]
    fn parse_json_array_response_accepts_legacy_array_shape() {
        let parsed = parse_json_array_response(
            r#"[{"query":"Acme competitors","mode":"breadth","reason":"market"}]"#,
            "strategies",
        )
        .expect("legacy array should parse");

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0]["query"], "Acme competitors");
    }

    // Exercises the prose-recovery path: JSON embedded in fenced markdown.
    #[test]
    fn parse_json_array_response_recovers_json_from_prose() {
        let parsed = parse_json_array_response(
            "Here is the JSON you requested:\n```json\n{\"facts\":[{\"claim\":\"Acme grows\",\"confidence\":0.7}]}\n```\nThanks.",
            "facts",
        )
        .expect("embedded JSON should parse");

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0]["claim"], "Acme grows");
    }

    #[test]
    fn extract_first_json_value_handles_nested_arrays_and_objects() {
        let extracted = extract_first_json_value(
            "prefix {\"facts\":[{\"claim\":\"A\",\"source_indices\":[0,1]}]} suffix",
        )
        .expect("should find first JSON value");

        assert_eq!(
            extracted,
            r#"{"facts":[{"claim":"A","source_indices":[0,1]}]}"#
        );
    }

    // Batching walks forward in max-sized steps and clamps the final
    // (possibly partial) window to the total.
    #[test]
    fn next_batch_bounds_advances_through_unprocessed_signals() {
        assert_eq!(next_batch_bounds(37, 0, 15), (0, 15));
        assert_eq!(next_batch_bounds(37, 15, 15), (15, 30));
        assert_eq!(next_batch_bounds(37, 30, 15), (30, 37));
        assert_eq!(next_batch_bounds(37, 37, 15), (37, 37));
    }

    #[test]
    fn normalize_dd_fact_reclassifies_technical_product_claims() {
        let normalized = normalize_dd_fact(&serde_json::json!({
            "claim": "Outpost24's Sweepatic Platform monitors internet-facing assets for attack surface management.",
            "category": "product",
            "source_indices": [0],
            "confidence": 0.9,
        }))
        .expect("fact should normalize");

        assert_eq!(normalized["category"], "technology");
    }

    #[test]
    fn canonicalize_claim_ignores_case_and_punctuation() {
        assert_eq!(
            canonicalize_claim("Outpost24 raised $23.8M!"),
            canonicalize_claim("outpost24 raised 23 8m")
        );
    }

    #[test]
    fn consolidate_dd_fact_values_merges_exact_duplicates() {
        let summaries = consolidate_dd_fact_values(vec![
            serde_json::json!({
                "claim": "Outpost24 offers a 100% open API for easy integration into security operations.",
                "category": "technology",
                "source_indices": [0],
                "confidence": 0.9,
            }),
            serde_json::json!({
                "claim": "Outpost24 offers a 100% open API for easy integration into security operations.",
                "category": "technology",
                "source_indices": [1],
                "confidence": 0.8,
            }),
        ]);

        assert_eq!(summaries.len(), 1);
        assert_eq!(summaries[0].support_count, 2);
        assert_eq!(summaries[0].evidence_count, 2);
    }

    // Near-duplicate same-topic claims collapse to the more specific one.
    #[test]
    fn consolidate_dd_fact_values_drops_vague_same_topic_repeats() {
        let summaries = consolidate_dd_fact_values(vec![
            serde_json::json!({
                "claim": "Outpost24 has 195 employees.",
                "category": "team",
                "source_indices": [0],
                "confidence": 0.9,
            }),
            serde_json::json!({
                "claim": "Outpost24 has over 200 employees.",
                "category": "team",
                "source_indices": [1],
                "confidence": 0.7,
            }),
            serde_json::json!({
                "claim": "Outpost24 offers a 100% open API for easy integration into security operations.",
                "category": "technology",
                "source_indices": [2],
                "confidence": 0.9,
            }),
        ]);

        let team_facts: Vec<_> = summaries
            .iter()
            .filter(|summary| summary.category == "team")
            .collect();
        assert_eq!(team_facts.len(), 1);
        assert_eq!(team_facts[0].claim, "Outpost24 has 195 employees.");
    }

    // Distinct exact figures must not be merged — conflicting financials
    // should surface as separate facts for downstream adjudication.
    #[test]
    fn consolidate_dd_fact_values_preserves_conflicting_exact_financials() {
        let summaries = consolidate_dd_fact_values(vec![
            serde_json::json!({
                "claim": "Outpost24's 2023 revenue was $42.19M.",
                "category": "financials",
                "source_indices": [0],
                "confidence": 0.9,
            }),
            serde_json::json!({
                "claim": "Outpost24 generates $67.5 million in revenue.",
                "category": "financials",
                "source_indices": [1],
                "confidence": 0.9,
            }),
        ]);

        assert_eq!(summaries.len(), 2);
    }

    // ── Negative tests ────────────────────────────────────────────

    #[test]
    fn normalize_dd_fact_rejects_empty_claim() {
        assert!(
            normalize_dd_fact(&serde_json::json!({
                "claim": "",
                "category": "product",
            }))
            .is_none()
        );
    }

    #[test]
    fn normalize_dd_fact_rejects_whitespace_only_claim() {
        assert!(
            normalize_dd_fact(&serde_json::json!({
                "claim": "   ",
                "category": "product",
            }))
            .is_none()
        );
    }

    #[test]
    fn normalize_dd_fact_rejects_missing_claim() {
        assert!(
            normalize_dd_fact(&serde_json::json!({
                "category": "product",
            }))
            .is_none()
        );
    }

    // Confidence is clamped into [0.0, 1.0] rather than rejected.
    #[test]
    fn normalize_dd_fact_clamps_confidence() {
        let normalized = normalize_dd_fact(&serde_json::json!({
            "claim": "test",
            "category": "product",
            "confidence": 5.0,
        }))
        .unwrap();
        assert_eq!(normalized["confidence"], 1.0);

        let normalized = normalize_dd_fact(&serde_json::json!({
            "claim": "test",
            "category": "product",
            "confidence": -1.0,
        }))
        .unwrap();
        assert_eq!(normalized["confidence"], 0.0);
    }

    #[test]
    fn normalize_dd_fact_defaults_missing_confidence() {
        let normalized = normalize_dd_fact(&serde_json::json!({
            "claim": "test claim",
            "category": "product",
        }))
        .unwrap();
        assert_eq!(normalized["confidence"], 0.5);
    }

    #[test]
    fn normalize_dd_fact_filters_non_integer_source_indices() {
        let normalized = normalize_dd_fact(&serde_json::json!({
            "claim": "test",
            "category": "product",
            "source_indices": [0, "bad", 2, null, 3],
        }))
        .unwrap();
        let indices = normalized["source_indices"].as_array().unwrap();
        assert_eq!(indices.len(), 3);
    }

    #[test]
    fn parse_json_array_response_rejects_plain_text() {
        assert!(parse_json_array_response("just some text", "facts").is_err());
    }

    #[test]
    fn parse_json_array_response_rejects_object_with_wrong_field() {
        assert!(parse_json_array_response(r#"{"results":[{"claim":"X"}]}"#, "facts").is_err());
    }

    #[test]
    fn parse_json_array_response_rejects_scalar() {
        assert!(parse_json_array_response("42", "facts").is_err());
        assert!(parse_json_array_response("true", "facts").is_err());
        assert!(parse_json_array_response(r#""string""#, "facts").is_err());
    }

    #[test]
    fn extract_first_json_value_returns_none_for_no_json() {
        assert!(extract_first_json_value("no json here").is_none());
    }

    #[test]
    fn extract_first_json_value_returns_none_for_mismatched_braces() {
        assert!(extract_first_json_value("{unclosed").is_none());
        assert!(extract_first_json_value("[}").is_none());
    }

    // Escaped quotes inside JSON strings must not terminate the string scan.
    #[test]
    fn extract_first_json_value_handles_escaped_quotes_in_strings() {
        let result = extract_first_json_value(r#"prefix {"key":"val\"ue"} suffix"#);
        assert!(result.is_some());
        let parsed: serde_json::Value = serde_json::from_str(result.unwrap()).unwrap();
        assert_eq!(parsed["key"], r#"val"ue"#);
    }

    #[test]
    fn consolidate_dd_fact_values_handles_empty_input() {
        assert!(consolidate_dd_fact_values(vec![]).is_empty());
    }

    #[test]
    fn consolidate_dd_fact_values_handles_all_invalid_facts() {
        let summaries = consolidate_dd_fact_values(vec![
            serde_json::json!({"claim": "", "category": "product"}),
            serde_json::json!({"no_claim": true}),
            serde_json::json!(null),
        ]);
        assert!(summaries.is_empty());
    }

    #[test]
    fn next_batch_bounds_zero_total() {
        assert_eq!(next_batch_bounds(0, 0, 15), (0, 0));
    }

    // A processed count beyond the total must clamp, not underflow.
    #[test]
    fn next_batch_bounds_processed_exceeds_total() {
        assert_eq!(next_batch_bounds(5, 100, 15), (5, 5));
    }

    #[test]
    fn canonicalize_claim_handles_empty_string() {
        assert_eq!(canonicalize_claim(""), "");
    }

    #[test]
    fn canonicalize_claim_handles_only_punctuation() {
        assert_eq!(canonicalize_claim("!!!...???"), "");
    }

    // Pending proposals block synthesis even with stable hypotheses.
    #[test]
    fn synthesis_does_not_accept_when_proposals_exist() {
        let budget = Arc::new(SharedBudget::new().with_limit("llm", 1));
        let suggestor = SynthesisSuggestor::new("Acme", budget, Arc::new(StubLlm));

        let ctx_with_proposals = StubContext {
            hypothesis_count: 10,
            has_proposals: true,
        };
        assert!(!suggestor.accepts(&ctx_with_proposals));
    }

    #[test]
    fn synthesis_does_not_accept_without_hypotheses() {
        let budget = Arc::new(SharedBudget::new().with_limit("llm", 1));
        let suggestor = SynthesisSuggestor::new("Acme", budget, Arc::new(StubLlm));

        let empty_ctx = StubContext {
            hypothesis_count: 0,
            has_proposals: false,
        };
        assert!(!suggestor.accepts(&empty_ctx));
    }

    // Pin the infra/non-infra classification of every DdError variant so a
    // new variant or a changed classification fails loudly here.
    #[test]
    fn dd_error_infra_vs_non_infra() {
        assert!(
            DdError::CreditsExhausted {
                provider: "x".into(),
                detail: "y".into()
            }
            .is_infra_failure()
        );
        assert!(
            DdError::RateLimited {
                provider: "x".into(),
                retry_after_ms: None
            }
            .is_infra_failure()
        );
        assert!(
            DdError::ProviderUnavailable {
                provider: "x".into(),
                detail: "y".into()
            }
            .is_infra_failure()
        );

        assert!(
            !DdError::BadResponse {
                provider: "x".into(),
                detail: "y".into()
            }
            .is_infra_failure()
        );
        assert!(
            !DdError::ParseFailed {
                provider: "x".into(),
                detail: "y".into()
            }
            .is_infra_failure()
        );
        assert!(
            !DdError::PromptTooLarge {
                provider: "x".into(),
                tokens: None
            }
            .is_infra_failure()
        );
    }

    #[test]
    fn dd_error_only_credits_exhausted_is_fatal() {
        assert!(
            DdError::CreditsExhausted {
                provider: "x".into(),
                detail: "y".into()
            }
            .is_fatal()
        );
        assert!(
            !DdError::RateLimited {
                provider: "x".into(),
                retry_after_ms: None
            }
            .is_fatal()
        );
        assert!(
            !DdError::ProviderUnavailable {
                provider: "x".into(),
                detail: "y".into()
            }
            .is_fatal()
        );
    }

    // ── Proptests ─────────────────────────────────────────────────

    // cast_precision_loss: test data casts small usize loop indices to f64,
    // which is lossless for these magnitudes.
    #[allow(clippy::cast_precision_loss)]
    mod proptests {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            #[test]
            fn canonicalize_is_idempotent(claim in ".*") {
                let first = canonicalize_claim(&claim);
                let second = canonicalize_claim(&first);
                prop_assert_eq!(first, second);
            }

            #[test]
            fn canonicalize_is_case_insensitive(claim in "[a-zA-Z0-9 ]{1,100}") {
                prop_assert_eq!(
                    canonicalize_claim(&claim),
                    canonicalize_claim(&claim.to_uppercase())
                );
            }

            #[test]
            fn normalize_dd_fact_never_panics(
                claim in ".*",
                category in ".*",
                confidence in proptest::num::f64::ANY,
            ) {
                let _ = normalize_dd_fact(&serde_json::json!({
                    "claim": claim,
                    "category": category,
                    "confidence": confidence,
                }));
            }

            #[test]
            fn normalize_preserves_non_empty_claims(
                claim in "[a-zA-Z]{1,50}",
                category in prop_oneof![
                    Just("product"), Just("technology"), Just("financials"),
                    Just("customers"), Just("competition"), Just("market"),
                ],
            ) {
                let normalized = normalize_dd_fact(&serde_json::json!({
                    "claim": claim,
                    "category": category,
                    "confidence": 0.8,
                }));
                prop_assert!(normalized.is_some());
                let n = normalized.unwrap();
                prop_assert!(!n["claim"].as_str().unwrap().is_empty());
            }

            #[test]
            fn consolidate_never_panics(
                n in 0_usize..20,
            ) {
                let categories = ["product", "technology", "financials"];
                let facts: Vec<serde_json::Value> = (0..n).map(|i| {
                    serde_json::json!({
                        "claim": format!("Fact number {i} about the company"),
                        "category": categories[i % 3],
                        "source_indices": [i],
                        "confidence": 0.5 + (i as f64 * 0.02),
                    })
                }).collect();
                let result = consolidate_dd_fact_values(facts);
                prop_assert!(result.len() <= n);
            }

            #[test]
            fn next_batch_bounds_always_valid(
                total in 0_usize..1000,
                processed in 0_usize..1000,
                max_batch in 1_usize..100,
            ) {
                let (start, end) = next_batch_bounds(total, processed, max_batch);
                prop_assert!(start <= total);
                prop_assert!(end <= total);
                prop_assert!(start <= end);
                prop_assert!(end - start <= max_batch);
            }

            #[test]
            fn extract_first_json_value_never_panics(input in ".*") {
                let _ = extract_first_json_value(&input);
            }

            #[test]
            fn parse_json_array_response_never_panics(
                input in ".*",
                field in "[a-z]{1,10}",
            ) {
                let _ = parse_json_array_response(&input, &field);
            }
        }
    }
}