use super::{Difficulty, EvalResult, Example, ExampleResult, ModelComparison};
use serde::{Deserialize, Serialize};
use std::fmt::Write;
use std::time::Duration;
/// The ten rungs of the py2rs (Python → Rust) benchmark ladder.
///
/// Each variant's explicit discriminant is its 1-based level number, which is
/// what `number()` returns and what `LevelResult::level` stores.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Py2RsLevel {
/// Level 1: a single `print` call.
Hello = 1,
/// Level 2: variable assignment, arithmetic and an f-string.
Variables = 2,
/// Level 3: a recursive function definition.
Functions = 3,
/// Level 4: list and dict comprehensions.
Collections = 4,
/// Level 5: loop plus branching (binary search sample).
ControlFlow = 5,
/// Level 6: `try`/`except` with re-raising.
ErrorHandling = 6,
/// Level 7: class inheritance with an overridden method.
OopTraits = 7,
/// Level 8: `asyncio`-based concurrent code.
Concurrency = 8,
/// Level 9: foreign-function calls through `ctypes`.
FfiUnsafe = 9,
/// Level 10: decorator-driven code (`@dataclass`).
Metaprogramming = 10,
}
impl Py2RsLevel {
    /// Returns all ten levels in ascending order (`Hello` first,
    /// `Metaprogramming` last), so index `i` holds level number `i + 1`.
    #[must_use]
    pub const fn all() -> [Self; 10] {
        [
            Self::Hello,
            Self::Variables,
            Self::Functions,
            Self::Collections,
            Self::ControlFlow,
            Self::ErrorHandling,
            Self::OopTraits,
            Self::Concurrency,
            Self::FfiUnsafe,
            Self::Metaprogramming,
        ]
    }

    /// Returns the 1-based level number, i.e. the enum discriminant.
    #[must_use]
    pub const fn number(&self) -> u8 {
        *self as u8
    }

    /// Returns the human-readable label used in reports and example tags.
    #[must_use]
    pub const fn name(&self) -> &'static str {
        match *self {
            Self::Hello => "Hello",
            Self::Variables => "Variables",
            Self::Functions => "Functions",
            Self::Collections => "Collections",
            Self::ControlFlow => "ControlFlow",
            Self::ErrorHandling => "ErrorHandling",
            Self::OopTraits => "OOP→Traits",
            Self::Concurrency => "Concurrency",
            Self::FfiUnsafe => "FFI/Unsafe",
            Self::Metaprogramming => "Metaprogramming",
        }
    }

    /// Maps the level onto a [`Difficulty`] bucket, two consecutive levels
    /// per bucket.
    #[must_use]
    pub const fn difficulty(&self) -> Difficulty {
        match self.number() {
            1 | 2 => Difficulty::Trivial,
            3 | 4 => Difficulty::Easy,
            5 | 6 => Difficulty::Medium,
            7 | 8 => Difficulty::Hard,
            // Only levels 9 and 10 remain.
            _ => Difficulty::Expert,
        }
    }

    /// Returns the scoring weight of the level; weights grow with the level
    /// number, from 1.0 for level 1 up to 20.0 for level 10.
    #[must_use]
    pub const fn weight(&self) -> f32 {
        // One entry per level, stored at index `number - 1`.
        const WEIGHTS: [f32; 10] = [1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 7.0, 10.0, 15.0, 20.0];
        WEIGHTS[*self as usize - 1]
    }

    /// Returns the canonical Python snippet a model is asked to translate
    /// for this level.
    #[must_use]
    pub fn python_example(&self) -> &'static str {
        match *self {
            Self::Hello => r#"print("hello world")"#,
            Self::Variables => r#"x = 42
y = x * 2 + 1
print(f"Result: {y}")"#,
            Self::Functions => r"def factorial(n):
if n <= 1:
return 1
return n * factorial(n - 1)",
            Self::Collections => r#"squares = [x**2 for x in range(10) if x % 2 == 0]
counts = {word: len(word) for word in ["hello", "world"]}"#,
            Self::ControlFlow => r"def binary_search(arr, target):
left, right = 0, len(arr) - 1
while left <= right:
mid = (left + right) // 2
if arr[mid] == target:
return mid
elif arr[mid] < target:
left = mid + 1
else:
right = mid - 1
return -1",
            Self::ErrorHandling => r#"def read_config(path):
try:
with open(path) as f:
return json.load(f)
except FileNotFoundError:
return {}
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON: {e}")"#,
            Self::OopTraits => r"class Shape:
def area(self): raise NotImplementedError
class Circle(Shape):
def __init__(self, radius): self.radius = radius
def area(self): return 3.14159 * self.radius ** 2",
            Self::Concurrency => r"import asyncio
async def fetch_all(urls):
async with aiohttp.ClientSession() as session:
return await asyncio.gather(*[fetch(session, u) for u in urls])",
            Self::FfiUnsafe => r#"import ctypes
lib = ctypes.CDLL("libcrypto.so")
lib.SHA256_Init.argtypes = [ctypes.POINTER(SHA256_CTX)]"#,
            Self::Metaprogramming => r"@dataclass
class Point:
x: float
y: float
def distance(self, other):
return ((self.x - other.x)**2 + (self.y - other.y)**2)**0.5",
        }
    }
}
/// Aggregated py2rs benchmark outcome for a single model.
///
/// Filled incrementally through `add_level`; `composite` is only meaningful
/// after `finalize` has been called.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Py2RsScore {
/// Identifier of the evaluated model.
pub model_id: String,
/// Highest level number among the passed results recorded so far (0 when none passed).
pub max_level: u8,
/// Level numbers that were passed on the first turn.
pub single_shot_levels: Vec<u8>,
/// Per-level turn figure, stored at index `level - 1`; maintained by `add_level`.
pub avg_turns_by_level: [f32; 10],
/// Weighted score as a percentage; stays 0.0 until `finalize` runs.
pub composite: f32,
/// Every recorded result, in insertion order.
pub level_results: Vec<LevelResult>,
}
impl Py2RsScore {
#[must_use]
pub fn new(model_id: impl Into<String>) -> Self {
Self {
model_id: model_id.into(),
max_level: 0,
single_shot_levels: Vec::new(),
avg_turns_by_level: [0.0; 10],
composite: 0.0,
level_results: Vec::new(),
}
}
pub fn add_level(&mut self, result: LevelResult) {
if result.passed {
if result.level > self.max_level {
self.max_level = result.level;
}
if result.turn == 1 {
self.single_shot_levels.push(result.level);
}
}
self.avg_turns_by_level[(result.level - 1) as usize] = result.turn as f32;
self.level_results.push(result);
}
pub fn finalize(&mut self) {
self.composite = self.compute_composite_score();
}
fn compute_composite_score(&self) -> f32 {
let weights = [1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 7.0, 10.0, 15.0, 20.0];
let max_possible: f32 = weights.iter().sum();
let earned: f32 = self
.single_shot_levels
.iter()
.map(|&l| weights[(l - 1) as usize])
.sum();
(earned / max_possible) * 100.0
}
#[must_use]
pub fn level_symbol(&self, level: u8) -> char {
if let Some(result) = self.level_results.iter().find(|r| r.level == level) {
if result.passed {
if result.turn == 1 {
'●' } else {
'◐' }
} else {
'○' }
} else {
'○'
}
}
#[must_use]
pub fn visual_summary(&self) -> String {
(1..=10).map(|l| self.level_symbol(l)).collect()
}
}
/// Outcome of attempting one benchmark level.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LevelResult {
/// 1-based level number (the constructors take it from `Py2RsLevel::number`).
pub level: u8,
/// Display name of the level (the constructors take it from `Py2RsLevel::name`).
pub name: String,
/// Whether the level was passed.
pub passed: bool,
/// Turn on which the level passed, or the number of attempts made for a failure.
pub turn: u32,
/// Failure description; `None` for results built with `passed`.
pub error: Option<String>,
/// Latency value supplied by the caller for this level.
pub latency: Duration,
}
impl LevelResult {
    /// Builds a successful result: `level` was passed on turn `turn`, taking
    /// `latency`. No error is attached.
    #[must_use]
    pub fn passed(level: Py2RsLevel, turn: u32, latency: Duration) -> Self {
        Self::outcome(level, true, turn, None, latency)
    }

    /// Builds a failed result: `level` was not passed after `attempts`
    /// attempts; `error` describes the failure.
    #[must_use]
    pub fn failed(
        level: Py2RsLevel,
        attempts: u32,
        error: impl Into<String>,
        latency: Duration,
    ) -> Self {
        Self::outcome(level, false, attempts, Some(error.into()), latency)
    }

    /// Shared builder: copies the level's number and name and stores the
    /// remaining fields as given.
    fn outcome(
        level: Py2RsLevel,
        passed: bool,
        turn: u32,
        error: Option<String>,
        latency: Duration,
    ) -> Self {
        Self {
            level: level.number(),
            name: level.name().to_string(),
            passed,
            turn,
            error,
            latency,
        }
    }
}
/// Builds one [`Example`] per benchmark level, in level order.
///
/// Each example has the id `py2rs-L<n>`, the level's Python snippet as its
/// input, the level's difficulty, and the tags `py2rs`, `level-<n>` and the
/// level name.
#[must_use]
pub fn generate_canonical_examples() -> Vec<Example> {
    let levels = Py2RsLevel::all();
    let mut examples = Vec::with_capacity(levels.len());
    for level in &levels {
        let number = level.number();
        let tags = vec![
            "py2rs".to_string(),
            format!("level-{}", number),
            level.name().to_string(),
        ];
        let example = Example::new(
            format!("py2rs-L{}", number),
            level.python_example(),
            format!("Compile to valid Rust (Level {})", number),
        )
        .with_difficulty(level.difficulty())
        .with_tags(tags);
        examples.push(example);
    }
    examples
}
/// Runs the ten-level ladder for `model_id` and returns the finalized score.
///
/// Outcomes come from `mock_model_result`, not from a real model. Latencies
/// are synthetic: 100 ms times the level number for a pass, additionally
/// multiplied by `max_turns` for a failure.
///
/// `max_turns` is the turn budget. A mocked pass that would need more turns
/// than the budget allows is recorded as a failure after `max_turns`
/// attempts, so a pass is never reported on a turn above `max_turns`.
#[must_use]
pub fn run_benchmark(model_id: &str, max_turns: u32) -> Py2RsScore {
    let mut score = Py2RsScore::new(model_id);
    for level in Py2RsLevel::all() {
        let base_latency_ms = 100 * u64::from(level.number());
        let result = match mock_model_result(level, model_id) {
            (true, turn) if turn <= max_turns => {
                LevelResult::passed(level, turn, Duration::from_millis(base_latency_ms))
            }
            // Either the mock failed outright or it passed outside the budget.
            _ => LevelResult::failed(
                level,
                max_turns,
                "Compile error",
                Duration::from_millis(base_latency_ms * u64::from(max_turns)),
            ),
        };
        score.add_level(result);
    }
    score.finalize();
    score
}
/// Deterministic stand-in for a real model run.
///
/// The model id is mapped to a capability level by substring: `16b`/`large`
/// → 9, `6b`/`medium` → 7, `2b`/`small` → 5, anything else → 6. The checks
/// run in that order, so an id containing `16b` is never classified by its
/// `6b` suffix.
///
/// Returns `(passed, turn)`: levels above the capability fail with turn 5;
/// levels at least two below it pass on turn 1; the remaining levels pass on
/// turn 2.
fn mock_model_result(level: Py2RsLevel, model_id: &str) -> (bool, u32) {
    let mentions = |needles: [&str; 2]| needles.iter().any(|needle| model_id.contains(*needle));

    let model_capability: u8 = if mentions(["16b", "large"]) {
        9
    } else if mentions(["6b", "medium"]) {
        7
    } else if mentions(["2b", "small"]) {
        5
    } else {
        6
    };

    let level_num = level.number();
    if level_num > model_capability {
        return (false, 5);
    }

    // Comfortably within capability → first try; borderline → second try.
    let comfortable = level_num <= model_capability - 2;
    (true, if comfortable { 1 } else { 2 })
}
/// Runs the benchmark for every `(model_id, size)` pair and collects the
/// outcomes into a [`ModelComparison`] named `py2rs-canonical`.
///
/// Every level result becomes an [`ExampleResult`] labelled `L<n>` with
/// placeholder per-turn figures (the value 100 and a 50 ms duration per
/// turn). After all models are added, the Pareto frontier and the
/// recommendations are computed on the comparison.
#[must_use]
pub fn compare_models(model_ids: &[(&str, u64)], max_turns: u32) -> ModelComparison {
    let levels = Py2RsLevel::all();
    let per_turn_latency = Duration::from_millis(50);
    let budget = max_turns as usize;

    let mut comparison = ModelComparison::new("py2rs-canonical");
    for &(model_id, size) in model_ids {
        let score = run_benchmark(model_id, max_turns);
        let mut result = EvalResult::new(model_id, "py2rs-canonical", size);

        for level_result in &score.level_results {
            let label = format!("L{}", level_result.level);
            let difficulty = levels[(level_result.level - 1) as usize].difficulty();

            let example_result = if level_result.passed {
                let turns = level_result.turn as usize;
                ExampleResult::solved(
                    label,
                    difficulty,
                    level_result.turn,
                    vec![100; turns],
                    vec![per_turn_latency; turns],
                )
            } else {
                ExampleResult::failed(
                    label,
                    difficulty,
                    max_turns,
                    // A failure without a message is reported with an empty string.
                    level_result.error.clone().unwrap_or_default(),
                    vec![100; budget],
                    vec![per_turn_latency; budget],
                )
            };
            result.add_example(example_result);
        }

        result.finalize(max_turns);
        comparison.add_result(result);
    }

    comparison.compute_pareto_frontier();
    comparison.generate_recommendations();
    comparison
}
/// Renders `scores` as a box-drawing table, one row per model.
///
/// Each row shows the model id (cut to at most 12 characters), the composite
/// score with one decimal, the ten level symbols and the highest level
/// passed. `_comparison` is accepted for interface compatibility and is not
/// read.
///
/// The id is cut on a character boundary, so ids containing multi-byte UTF-8
/// characters are handled without panicking.
#[must_use]
pub fn format_comparison_table(_comparison: &ModelComparison, scores: &[Py2RsScore]) -> String {
    // Width of the model column, in characters; matches `{:12}` below.
    const MODEL_COLUMN_CHARS: usize = 12;

    let mut output = String::new();
    output.push_str("┌────────────────────────────────────────────────────────────────┐\n");
    output.push_str("│ Benchmark: py2rs-canonical (10 levels) │\n");
    output.push_str("├──────────────┬───────┬────────────────────────────────┬────────┤\n");
    output.push_str("│ Model │ Score │ Levels (● = T1, ◐ = T2+, ○ = fail) │ Max │\n");
    output.push_str("├──────────────┼───────┼────────────────────────────────┼────────┤\n");
    for score in scores {
        let id = score.model_id.as_str();
        // Byte offset of the 13th character, or the full length for shorter
        // ids. Slicing there cannot split a multi-byte character.
        let cut = id
            .char_indices()
            .nth(MODEL_COLUMN_CHARS)
            .map_or(id.len(), |(offset, _)| offset);
        let visual = score.visual_summary();
        // Writing into a `String` cannot fail, so the result is discarded.
        let _ = writeln!(
            output,
            "│ {:12} │ {:5.1} │ {:30} │ L{:<4} │",
            &id[..cut],
            score.composite,
            visual,
            score.max_level
        );
    }
    output.push_str("├──────────────┴───────┴────────────────────────────────┴────────┤\n");
    output.push_str("│ Legend: ● Pass Turn 1 | ◐ Pass Turn 2+ | ○ Failed │\n");
    output.push_str("└────────────────────────────────────────────────────────────────┘\n");
    output
}
// Unit tests are compiled only for test builds and are loaded from the file
// named by the `#[path]` attribute.
#[cfg(test)]
#[path = "py2rs_tests.rs"]
mod tests;