use rand::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Settings that control what a [`SyntheticGenerator`] produces.
#[derive(Debug, Clone)]
pub struct GeneratorConfig {
    /// Number of items returned by each `generate_*` call.
    ///
    /// `SyntheticGenerator::new` also uses this value as the RNG seed; a value
    /// of `0` makes it seed from entropy instead.
    pub dataset_size: usize,
    /// Selects how many reasoning steps each generated chain contains and is
    /// copied onto generated knowledge-base documents.
    pub complexity: ComplexityLevel,
    /// Selects which query templates and concept vocabulary are used.
    pub domain: Domain,
    /// NOTE(review): not read by any code visible in this file; the intended
    /// meaning and valid range should be confirmed with the author.
    pub noise_level: f64,
}
/// How elaborate the generated artefacts should be.
///
/// The generator maps each level to a fixed number of reasoning steps per
/// chain: `Simple` → 2, `Medium` → 3, `Complex` → 5.
///
/// `PartialEq`, `Eq` and `Hash` are derived so values can be compared with
/// `==`, used in `assert_eq!`, and used as map or set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ComplexityLevel {
    /// Two reasoning steps per chain.
    Simple,
    /// Three reasoning steps per chain.
    Medium,
    /// Five reasoning steps per chain.
    Complex,
}
/// Subject area that selects the query templates and concept vocabulary.
///
/// `PartialEq`, `Eq` and `Hash` are derived so values can be compared with
/// `==`, used in `assert_eq!`, and used as map or set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Domain {
    /// Engineering-oriented templates and concepts (e.g. "microservices").
    Technical,
    /// Business-oriented templates and concepts (e.g. "market expansion").
    Business,
    /// General-interest templates and concepts (e.g. "quantum computing").
    General,
}
/// Produces synthetic queries, reasoning chains and knowledge-base documents
/// according to a [`GeneratorConfig`].
pub struct SyntheticGenerator {
    /// Settings supplied at construction time.
    config: GeneratorConfig,
    /// Source of randomness for every random choice the generator makes.
    rng: StdRng,
}
impl SyntheticGenerator {
    /// Creates a generator for `config`.
    ///
    /// The random number generator is seeded from `config.dataset_size`, so two
    /// generators built with the same non-zero size make the same sequence of
    /// random draws. A size of `0` seeds from entropy
    /// (`StdRng::from_entropy`) instead; every `generate_*` method returns an
    /// empty vector in that case because each produces `dataset_size` items.
    pub fn new(config: GeneratorConfig) -> Self {
        let rng = match config.dataset_size {
            0 => StdRng::from_entropy(),
            // `usize` is at most 64 bits wide, so this widening cast is lossless.
            size => StdRng::seed_from_u64(size as u64),
        };
        Self { config, rng }
    }

    /// Generates `dataset_size` retrieval-style queries for the configured domain.
    ///
    /// Each query is a randomly chosen domain template with its placeholders
    /// filled by randomly chosen concepts and programming languages.
    pub fn generate_rag_queries(&mut self) -> Vec<String> {
        let templates = self.get_query_templates();
        (0..self.config.dataset_size)
            .map(|_| {
                let template = templates
                    .choose(&mut self.rng)
                    .expect("every domain defines at least one query template");
                self.fill_template(template)
            })
            .collect()
    }

    /// Generates `dataset_size` reasoning chains.
    ///
    /// The number of steps per chain is determined by the configured
    /// [`ComplexityLevel`].
    pub fn generate_reasoning_chains(&mut self) -> Vec<ReasoningChain> {
        (0..self.config.dataset_size)
            .map(|_| self.generate_single_chain())
            .collect()
    }

    /// Generates `dataset_size` knowledge-base documents for the configured domain.
    pub fn generate_kb_documents(&mut self) -> Vec<KnowledgeDocument> {
        (0..self.config.dataset_size)
            .map(|_| self.generate_single_document())
            .collect()
    }

    /// Returns the query templates for the configured domain.
    ///
    /// Templates may contain the placeholders `{concept}`, `{concept1}`,
    /// `{concept2}` and `{language}`. The lists are static, so no allocation
    /// happens per call.
    fn get_query_templates(&self) -> &'static [&'static str] {
        match self.config.domain {
            Domain::Technical => &[
                "How to implement {concept} in {language}?",
                "What are the best practices for {concept}?",
                "Compare {concept1} vs {concept2} performance",
                "Troubleshooting {concept} issues in production",
            ],
            Domain::Business => &[
                "What is the ROI of implementing {concept}?",
                "How to scale {concept} for enterprise use?",
                "Market analysis for {concept} adoption",
                "Cost-benefit analysis of {concept} implementation",
            ],
            Domain::General => &[
                "What is {concept} and how does it work?",
                "Explain {concept} in simple terms",
                "Benefits and drawbacks of {concept}",
                "Future trends in {concept} development",
            ],
        }
    }

    /// Substitutes every placeholder in `template` with a random value.
    ///
    /// `{concept1}` and `{concept2}` are guaranteed to differ whenever the
    /// domain has more than one concept, so comparison queries never compare a
    /// concept with itself.
    fn fill_template(&mut self, template: &str) -> String {
        let concepts = self.get_domain_concepts();
        let languages = ["Rust", "Python", "JavaScript", "Go", "Java"];

        // Always make exactly four draws, in this order, whether or not the
        // template uses the placeholder: the seeded output stream depends on it.
        let concept = *concepts
            .choose(&mut self.rng)
            .expect("every domain defines at least one concept");
        let first = *concepts
            .choose(&mut self.rng)
            .expect("every domain defines at least one concept");
        let mut second = *concepts
            .choose(&mut self.rng)
            .expect("every domain defines at least one concept");
        let language = *languages
            .choose(&mut self.rng)
            .expect("the language list is a non-empty literal");

        // On a collision, fall back to the concept following `first` instead of
        // re-drawing, so no additional randomness is consumed.
        if second == first && concepts.len() > 1 {
            let position = concepts.iter().position(|&c| c == first).unwrap_or(0);
            second = concepts[(position + 1) % concepts.len()];
        }

        template
            .replace("{concept}", concept)
            .replace("{concept1}", first)
            .replace("{concept2}", second)
            .replace("{language}", language)
    }

    /// Returns the concept vocabulary for the configured domain.
    ///
    /// The lists are static, so no allocation happens per call.
    fn get_domain_concepts(&self) -> &'static [&'static str] {
        match self.config.domain {
            Domain::Technical => &[
                "machine learning",
                "microservices",
                "API design",
                "database optimization",
                "container orchestration",
                "serverless computing",
                "graph databases",
            ],
            Domain::Business => &[
                "customer acquisition",
                "market expansion",
                "product development",
                "financial modeling",
                "team management",
                "strategic planning",
            ],
            Domain::General => &[
                "artificial intelligence",
                "blockchain technology",
                "quantum computing",
                "sustainable energy",
                "remote work",
                "digital transformation",
            ],
        }
    }

    /// Builds one reasoning chain: an input query, its steps, a final output
    /// and a quality score sampled from `0.7..0.95`.
    fn generate_single_chain(&mut self) -> ReasoningChain {
        // The order of these calls fixes the order of RNG draws.
        let input = self.generate_input_query();
        let steps = self.generate_reasoning_steps();
        let output = self.generate_final_output();
        let quality_score = self.rng.gen_range(0.7..0.95);
        ReasoningChain {
            input,
            steps,
            output,
            quality_score,
        }
    }

    /// Produces a single filled-in query for the configured domain.
    fn generate_input_query(&mut self) -> String {
        let templates = self.get_query_templates();
        let template = *templates
            .choose(&mut self.rng)
            .expect("every domain defines at least one query template");
        self.fill_template(template)
    }

    /// Produces the steps of one chain.
    ///
    /// The step count follows the configured complexity (2, 3 or 5). Tools are
    /// assigned round-robin from a fixed list, and each step gets a confidence
    /// sampled from `0.75..0.95`.
    fn generate_reasoning_steps(&mut self) -> Vec<ReasoningStep> {
        let thinktools = [
            "GigaThink",
            "LaserLogic",
            "BedRock",
            "ProofGuard",
            "BrutalHonesty",
        ];
        let step_count = match self.config.complexity {
            ComplexityLevel::Simple => 2,
            ComplexityLevel::Medium => 3,
            ComplexityLevel::Complex => 5,
        };
        thinktools
            .iter()
            .cycle()
            .take(step_count)
            .enumerate()
            .map(|(index, &tool)| {
                // Steps are numbered from 1 in the generated text.
                let step_number = index + 1;
                ReasoningStep {
                    tool: tool.to_string(),
                    input: format!("Step {} input", step_number),
                    output: format!("Analysis result from {} step {}", tool, step_number),
                    confidence: self.rng.gen_range(0.75..0.95),
                }
            })
            .collect()
    }

    /// Returns the fixed concluding sentence used for every chain.
    fn generate_final_output(&self) -> String {
        "Final conclusion based on comprehensive analysis".to_string()
    }

    /// Builds one knowledge-base document about a randomly chosen concept,
    /// with a quality score sampled from `0.8..0.95` and empty metadata.
    fn generate_single_document(&mut self) -> KnowledgeDocument {
        let concepts = self.get_domain_concepts();
        let concept = *concepts
            .choose(&mut self.rng)
            .expect("every domain defines at least one concept");
        let title = format!("Understanding {}", concept);
        let content = self.generate_document_content(concept);
        let quality_score = self.rng.gen_range(0.8..0.95);
        KnowledgeDocument {
            title,
            content,
            metadata: HashMap::new(),
            quality_score,
            complexity: self.config.complexity,
            domain: self.config.domain,
        }
    }

    /// Renders the fixed Markdown body for a document about `concept`.
    ///
    /// The same boilerplate text is used for every domain.
    fn generate_document_content(&self, concept: &str) -> String {
        format!(
            "# {0}\n\n{0} is a fundamental concept in modern technology.\n\n## Key Benefits\n\n- Improved efficiency\n- Better scalability\n- Enhanced reliability\n\n## Implementation\n\nTo implement {0}, follow these steps:\n\n1. Plan your approach\n2. Gather requirements\n3. Execute implementation\n4. Test thoroughly\n\n## Conclusion\n\n{0} offers significant advantages for organizations looking to innovate.",
            concept
        )
    }
}
/// A synthetic multi-step reasoning trace from an input query to a conclusion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReasoningChain {
    /// The query the chain starts from.
    pub input: String,
    /// The intermediate steps, in execution order.
    pub steps: Vec<ReasoningStep>,
    /// The final conclusion text.
    pub output: String,
    /// Quality rating of the chain; the generator in this file assigns values
    /// sampled from `0.7..0.95`.
    pub quality_score: f64,
}
/// One step inside a [`ReasoningChain`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReasoningStep {
    /// Name of the tool that performed the step.
    pub tool: String,
    /// Text fed into the step.
    pub input: String,
    /// Text produced by the step.
    pub output: String,
    /// Confidence attached to the step; the generator in this file assigns
    /// values sampled from `0.75..0.95`.
    pub confidence: f64,
}
/// A synthetic knowledge-base document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KnowledgeDocument {
    /// Document title.
    pub title: String,
    /// Document body; the generator in this file emits Markdown.
    pub content: String,
    /// Free-form key/value metadata; the generator in this file leaves it empty.
    pub metadata: HashMap<String, String>,
    /// Quality rating of the document; the generator in this file assigns
    /// values sampled from `0.8..0.95`.
    pub quality_score: f64,
    /// Complexity level the document was generated for.
    pub complexity: ComplexityLevel,
    /// Domain the document belongs to.
    pub domain: Domain,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a generator with the noise level shared by every test here.
    fn generator_for(
        dataset_size: usize,
        complexity: ComplexityLevel,
        domain: Domain,
    ) -> SyntheticGenerator {
        SyntheticGenerator::new(GeneratorConfig {
            dataset_size,
            complexity,
            domain,
            noise_level: 0.1,
        })
    }

    #[test]
    fn test_generate_rag_queries() {
        let mut generator = generator_for(5, ComplexityLevel::Medium, Domain::Technical);
        let queries = generator.generate_rag_queries();
        assert_eq!(queries.len(), 5);
        for query in &queries {
            assert!(query.len() > 10, "query is suspiciously short: {:?}", query);
            // Every placeholder must have been substituted.
            assert!(
                !query.contains('{') && !query.contains('}'),
                "query still contains a placeholder: {:?}",
                query
            );
        }
    }

    #[test]
    fn test_generate_reasoning_chains() {
        let mut generator = generator_for(3, ComplexityLevel::Complex, Domain::Business);
        let chains = generator.generate_reasoning_chains();
        assert_eq!(chains.len(), 3);
        for chain in &chains {
            assert!(!chain.input.is_empty());
            assert_eq!(chain.steps.len(), 5);
            assert!(!chain.output.is_empty());
            assert!((0.7..=0.95).contains(&chain.quality_score));
            for step in &chain.steps {
                assert!(!step.tool.is_empty());
                assert!(!step.input.is_empty());
                assert!(!step.output.is_empty());
                assert!((0.75..=0.95).contains(&step.confidence));
            }
        }
    }

    #[test]
    fn test_step_count_tracks_complexity() {
        let expectations = [
            (ComplexityLevel::Simple, 2),
            (ComplexityLevel::Medium, 3),
            (ComplexityLevel::Complex, 5),
        ];
        for (complexity, expected_steps) in expectations {
            let mut generator = generator_for(1, complexity, Domain::General);
            let chains = generator.generate_reasoning_chains();
            assert_eq!(chains.len(), 1);
            assert_eq!(chains[0].steps.len(), expected_steps);
        }
    }

    #[test]
    fn test_generate_kb_documents() {
        let mut generator = generator_for(2, ComplexityLevel::Simple, Domain::General);
        let documents = generator.generate_kb_documents();
        assert_eq!(documents.len(), 2);
        for doc in &documents {
            assert!(doc.title.starts_with("Understanding "));
            assert!(!doc.content.is_empty());
            assert!(doc.content.contains('#'));
            assert!(doc.metadata.is_empty());
            assert!((0.8..=0.95).contains(&doc.quality_score));
            assert!(matches!(doc.complexity, ComplexityLevel::Simple));
            assert!(matches!(doc.domain, Domain::General));
        }
    }

    #[test]
    fn test_same_config_is_reproducible() {
        // A non-zero dataset size seeds the RNG, so output must repeat exactly.
        let mut first = generator_for(4, ComplexityLevel::Medium, Domain::Technical);
        let mut second = generator_for(4, ComplexityLevel::Medium, Domain::Technical);
        assert_eq!(first.generate_rag_queries(), second.generate_rag_queries());
    }

    #[test]
    fn test_zero_dataset_size_yields_nothing() {
        let mut generator = generator_for(0, ComplexityLevel::Medium, Domain::Business);
        assert!(generator.generate_rag_queries().is_empty());
        assert!(generator.generate_reasoning_chains().is_empty());
        assert!(generator.generate_kb_documents().is_empty());
    }
}