reasonkit-core 0.1.8

The Reasoning Engine — Auditable Reasoning for Production AI | Rust-Native | Turn Prompts into Protocols
//! Simple synthetic data generator example for ReasonKit workflows.

use rand::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Configuration for synthetic data generation.
///
/// Consumed by `SyntheticGenerator::new`, which stores it and reads it on
/// every `generate_*` call.
#[derive(Debug, Clone)]
pub struct GeneratorConfig {
    /// Number of items each `generate_*` method produces.
    ///
    /// `SyntheticGenerator::new` also derives the RNG seed from this value:
    /// `0` seeds from entropy, any other value is used as the seed itself.
    pub dataset_size: usize,
    /// Controls how many steps each reasoning chain gets and is copied onto
    /// every generated document.
    pub complexity: ComplexityLevel,
    /// Selects which query templates and concept list the generator draws from.
    pub domain: Domain,
    /// NOTE(review): no generator method in this file reads this field —
    /// confirm whether it is reserved for future use.
    pub noise_level: f64,
}

/// How elaborate the generated reasoning chains should be.
///
/// The generator maps each level to a fixed number of reasoning steps and
/// records the level on every generated document.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ComplexityLevel {
    /// Reasoning chains get 2 steps.
    Simple,
    /// Reasoning chains get 3 steps.
    Medium,
    /// Reasoning chains get 5 steps.
    Complex,
}

/// Subject area that selects the query templates and concept vocabulary used
/// by the generator.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Domain {
    /// Engineering-oriented templates and concepts (e.g. "microservices").
    Technical,
    /// Business-oriented templates and concepts (e.g. "market expansion").
    Business,
    /// General-interest templates and concepts (e.g. "quantum computing").
    General,
}

/// Synthetic data generator.
///
/// Holds the configuration together with the random number generator that
/// every `generate_*` method draws from, so successive calls on the same
/// instance continue the same random sequence.
#[derive(Debug)]
pub struct SyntheticGenerator {
    /// Settings supplied at construction time.
    config: GeneratorConfig,
    /// Random source shared by all generation methods.
    rng: StdRng,
}

impl SyntheticGenerator {
    pub fn new(config: GeneratorConfig) -> Self {
        let rng = match config.dataset_size {
            0 => StdRng::from_entropy(),
            seed => StdRng::seed_from_u64(seed as u64),
        };

        Self { config, rng }
    }

    /// Generate synthetic RAG queries.
    pub fn generate_rag_queries(&mut self) -> Vec<String> {
        let templates = self.get_query_templates();
        let mut queries = Vec::new();

        for _ in 0..self.config.dataset_size {
            let template = templates.choose(&mut self.rng).unwrap();
            let query = self.fill_template(template);
            queries.push(query);
        }

        queries
    }

    /// Generate synthetic reasoning chains.
    pub fn generate_reasoning_chains(&mut self) -> Vec<ReasoningChain> {
        let mut chains = Vec::new();

        for _ in 0..self.config.dataset_size {
            let chain = self.generate_single_chain();
            chains.push(chain);
        }

        chains
    }

    /// Generate synthetic knowledge base documents.
    pub fn generate_kb_documents(&mut self) -> Vec<KnowledgeDocument> {
        let mut documents = Vec::new();

        for _ in 0..self.config.dataset_size {
            let doc = self.generate_single_document();
            documents.push(doc);
        }

        documents
    }

    fn get_query_templates(&self) -> Vec<&'static str> {
        match self.config.domain {
            Domain::Technical => vec![
                "How to implement {concept} in {language}?",
                "What are the best practices for {concept}?",
                "Compare {concept1} vs {concept2} performance",
                "Troubleshooting {concept} issues in production",
            ],
            Domain::Business => vec![
                "What is the ROI of implementing {concept}?",
                "How to scale {concept} for enterprise use?",
                "Market analysis for {concept} adoption",
                "Cost-benefit analysis of {concept} implementation",
            ],
            Domain::General => vec![
                "What is {concept} and how does it work?",
                "Explain {concept} in simple terms",
                "Benefits and drawbacks of {concept}",
                "Future trends in {concept} development",
            ],
        }
    }

    fn fill_template(&mut self, template: &str) -> String {
        let concepts = self.get_domain_concepts();
        let languages = ["Rust", "Python", "JavaScript", "Go", "Java"];

        template
            .replace("{concept}", concepts.choose(&mut self.rng).unwrap())
            .replace("{concept1}", concepts.choose(&mut self.rng).unwrap())
            .replace("{concept2}", concepts.choose(&mut self.rng).unwrap())
            .replace("{language}", languages.choose(&mut self.rng).unwrap())
    }

    fn get_domain_concepts(&self) -> Vec<&'static str> {
        match self.config.domain {
            Domain::Technical => vec![
                "machine learning",
                "microservices",
                "API design",
                "database optimization",
                "container orchestration",
                "serverless computing",
                "graph databases",
            ],
            Domain::Business => vec![
                "customer acquisition",
                "market expansion",
                "product development",
                "financial modeling",
                "team management",
                "strategic planning",
            ],
            Domain::General => vec![
                "artificial intelligence",
                "blockchain technology",
                "quantum computing",
                "sustainable energy",
                "remote work",
                "digital transformation",
            ],
        }
    }

    fn generate_single_chain(&mut self) -> ReasoningChain {
        let input = self.generate_input_query();
        let steps = self.generate_reasoning_steps();
        let output = self.generate_final_output();

        ReasoningChain {
            input,
            steps,
            output,
            quality_score: self.rng.gen_range(0.7..0.95),
        }
    }

    fn generate_input_query(&mut self) -> String {
        let templates = self.get_query_templates();
        let template = templates.choose(&mut self.rng).unwrap();
        self.fill_template(template)
    }

    fn generate_reasoning_steps(&mut self) -> Vec<ReasoningStep> {
        let thinktools = [
            "GigaThink",
            "LaserLogic",
            "BedRock",
            "ProofGuard",
            "BrutalHonesty",
        ];
        let step_count = match self.config.complexity {
            ComplexityLevel::Simple => 2,
            ComplexityLevel::Medium => 3,
            ComplexityLevel::Complex => 5,
        };

        (0..step_count)
            .map(|i| ReasoningStep {
                tool: thinktools[i % thinktools.len()].to_string(),
                input: format!("Step {} input", i + 1),
                output: format!(
                    "Analysis result from {} step {}",
                    thinktools[i % thinktools.len()],
                    i + 1
                ),
                confidence: self.rng.gen_range(0.75..0.95),
            })
            .collect()
    }

    fn generate_final_output(&mut self) -> String {
        "Final conclusion based on comprehensive analysis".to_string()
    }

    fn generate_single_document(&mut self) -> KnowledgeDocument {
        let concepts = self.get_domain_concepts();
        let concept = concepts.choose(&mut self.rng).unwrap();

        let title = format!("Understanding {}", concept);
        let content = self.generate_document_content(concept);

        KnowledgeDocument {
            title,
            content,
            metadata: HashMap::new(),
            quality_score: self.rng.gen_range(0.8..0.95),
            complexity: self.config.complexity,
            domain: self.config.domain,
        }
    }

    fn generate_document_content(&mut self, concept: &str) -> String {
        format!(
            "# {}\n\n{} is a fundamental concept in modern technology.\n\n## Key Benefits\n\n- Improved efficiency\n- Better scalability\n- Enhanced reliability\n\n## Implementation\n\nTo implement {}, follow these steps:\n\n1. Plan your approach\n2. Gather requirements\n3. Execute implementation\n4. Test thoroughly\n\n## Conclusion\n\n{} offers significant advantages for organizations looking to innovate.",
            concept, concept, concept, concept
        )
    }
}

/// A synthetic reasoning chain.
///
/// Serializable record pairing a query with the steps taken and the
/// resulting conclusion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReasoningChain {
    /// The query the chain starts from.
    pub input: String,
    /// The reasoning steps, in the order they were generated.
    pub steps: Vec<ReasoningStep>,
    /// The concluding text of the chain.
    pub output: String,
    /// Quality score; the generator in this file draws it from `0.7..0.95`.
    pub quality_score: f64,
}

/// A single step in a reasoning chain.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReasoningStep {
    /// Name of the tool credited with this step (e.g. "GigaThink").
    pub tool: String,
    /// Text describing what was fed into the step.
    pub input: String,
    /// Text describing what the step produced.
    pub output: String,
    /// Confidence score; the generator in this file draws it from `0.75..0.95`.
    pub confidence: f64,
}

/// A synthetic knowledge base document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KnowledgeDocument {
    /// Document title; the generator in this file uses "Understanding <concept>".
    pub title: String,
    /// Document body; the generator in this file emits Markdown text.
    pub content: String,
    /// Free-form key/value annotations; left empty by the generator in this file.
    pub metadata: HashMap<String, String>,
    /// Quality score; the generator in this file draws it from `0.8..0.95`.
    pub quality_score: f64,
    /// Complexity level copied from the generator configuration.
    pub complexity: ComplexityLevel,
    /// Domain copied from the generator configuration.
    pub domain: Domain,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a generator with the noise level shared by every test here.
    fn generator_for(
        dataset_size: usize,
        complexity: ComplexityLevel,
        domain: Domain,
    ) -> SyntheticGenerator {
        SyntheticGenerator::new(GeneratorConfig {
            dataset_size,
            complexity,
            domain,
            noise_level: 0.1,
        })
    }

    #[test]
    fn test_generate_rag_queries() {
        let queries =
            generator_for(5, ComplexityLevel::Medium, Domain::Technical).generate_rag_queries();

        assert_eq!(queries.len(), 5);
        for text in queries.iter() {
            assert!(!text.is_empty());
            // Only a loose length floor is checked, not the exact wording.
            assert!(text.len() > 10);
        }
    }

    #[test]
    fn test_generate_reasoning_chains() {
        let chains =
            generator_for(3, ComplexityLevel::Complex, Domain::Business).generate_reasoning_chains();

        assert_eq!(chains.len(), 3);
        for item in chains.iter() {
            assert!(!item.input.is_empty());
            // The Complex level is expected to yield five steps per chain.
            assert_eq!(item.steps.len(), 5);
            assert!(!item.output.is_empty());
            assert!((0.7..=0.95).contains(&item.quality_score));
        }
    }

    #[test]
    fn test_generate_kb_documents() {
        let documents =
            generator_for(2, ComplexityLevel::Simple, Domain::General).generate_kb_documents();

        assert_eq!(documents.len(), 2);
        for item in documents.iter() {
            assert!(!item.title.is_empty());
            assert!(!item.content.is_empty());
            // Generated bodies are expected to carry at least one heading marker.
            assert!(item.content.contains('#'));
            assert!((0.8..=0.95).contains(&item.quality_score));
        }
    }
}