use std::collections::HashMap;
use std::path::PathBuf;
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use super::{CaseResult, TaskQualityAdapter, TaskQualityResult};
/// Configuration for the BFCL adapter: which dataset file to read and how to
/// filter/limit the entries loaded from it.
///
/// Field names are (de)serialized in camelCase.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct BfclConfig {
/// Path of the dataset file; it is read as text and parsed one JSON entry per line.
pub dataset_path: PathBuf,
/// Category allow-list. When empty, entries of every category are kept.
pub categories: Vec<String>,
/// Optional cap on the number of entries loaded; `None` means no cap.
pub max_cases: Option<usize>,
}
impl Default for BfclConfig {
    /// Default configuration: read `bfcl_dataset.jsonl` (relative path), keep
    /// every category, and apply no cap on the number of cases.
    fn default() -> Self {
        let dataset_path = PathBuf::from("bfcl_dataset.jsonl");
        Self { dataset_path, categories: vec![], max_cases: None }
    }
}
/// One dataset record: a question, the functions offered for answering it, and
/// the tool calls that count as the correct answer.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct BfclEntry {
/// Record identifier; reused as the `case_id` of the resulting case.
pub id: String,
/// Category label used for filtering and per-category accuracy.
/// Falls back to the empty string when the key is absent from the JSON.
#[serde(default)]
pub category: String,
/// Question text of the record.
pub question: String,
/// Functions available for this record; stored under the JSON key `function`.
#[serde(rename = "function")]
pub functions: Vec<BfclFunction>,
/// Expected tool calls. The explicit rename keeps the JSON key as
/// `expected_output` instead of the struct-level camelCase form.
#[serde(rename = "expected_output")]
pub expected_output: Vec<BfclExpectedOutput>,
}
/// Description of a callable function as it appears in a dataset record.
/// All three fields are copied verbatim into the generated tool schema.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct BfclFunction {
/// Function name.
pub name: String,
/// Human-readable description of the function.
pub description: String,
/// Parameter specification kept as raw JSON (presumably a JSON-Schema
/// object, as in the test fixtures — not validated here).
pub parameters: serde_json::Value,
}
/// A single expected tool call: the function name plus its exact argument map.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct BfclExpectedOutput {
/// Name of the function that should be called.
pub name: String,
/// Expected arguments, keyed by argument name.
pub arguments: HashMap<String, serde_json::Value>,
}
/// A tool call actually produced for a case; compared against
/// `BfclExpectedOutput` during scoring.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct BfclToolCallOutput {
/// Name of the function that was called.
pub name: String,
/// Arguments supplied to the call, keyed by argument name.
pub arguments: HashMap<String, serde_json::Value>,
}
/// Aggregated results for one model: overall accuracy plus a per-category
/// breakdown. Field names are (de)serialized in camelCase.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct BfclLeaderboardResult {
/// Name of the evaluated model.
pub model: String,
/// `passed_cases / total_cases`, or `0.0` when there are no cases.
pub overall_accuracy: f64,
/// Accuracy statistics keyed by entry category.
pub category_accuracy: HashMap<String, CategoryAccuracy>,
/// Number of scored cases.
pub total_cases: usize,
/// Number of cases marked as passed.
pub passed_cases: usize,
}
/// Pass statistics for a single category.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct CategoryAccuracy {
/// `passed / total`, or `0.0` when `total` is zero.
pub accuracy: f64,
/// Number of cases counted in this category.
pub total: usize,
/// Number of those cases that passed.
pub passed: usize,
}
/// Task-quality adapter that loads BFCL dataset entries and scores tool calls
/// against their expected outputs.
pub struct BfclAdapter {
// Dataset location and filtering options; exposed read-only via `config()`.
config: BfclConfig,
}
impl BfclAdapter {
    /// Creates an adapter that evaluates against the dataset described by `config`.
    pub fn new(config: BfclConfig) -> Self {
        Self { config }
    }

    /// Returns the configuration this adapter was constructed with.
    pub fn config(&self) -> &BfclConfig {
        &self.config
    }

    /// Reads the configured JSONL dataset and returns the entries that survive
    /// the category filter, capped at `max_cases`.
    ///
    /// Blank lines are skipped. Lines are parsed in file order and parsing
    /// stops as soon as the cap is reached, so malformed lines after that
    /// point are never inspected.
    ///
    /// # Errors
    ///
    /// * `BenchError::WorkloadNotFound` when the dataset file does not exist.
    /// * `BenchError::Io` for any other failure while reading the file.
    /// * `BenchError::WorkloadValidation` (carrying the 1-based line number)
    ///   when a non-blank line is not a valid `BfclEntry`.
    fn load_entries(&self) -> crate::Result<Vec<BfclEntry>> {
        let path = &self.config.dataset_path;
        // Classify the read error instead of probing with `exists()` first:
        // that avoids a check-then-use race and keeps genuine I/O problems
        // (e.g. permission errors) from being reported as a missing file.
        let content = match std::fs::read_to_string(path) {
            Ok(content) => content,
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
                return Err(crate::BenchError::WorkloadNotFound {
                    path: path.display().to_string(),
                });
            }
            Err(err) => return Err(crate::BenchError::Io(err)),
        };

        let wanted = &self.config.categories;
        let limit = self.config.max_cases;
        let mut entries: Vec<BfclEntry> = Vec::new();
        for (line_num, line) in content.lines().enumerate() {
            // Check the cap before doing any work so that `max_cases == Some(0)`
            // yields an empty result rather than one entry.
            if limit.is_some_and(|max| entries.len() >= max) {
                break;
            }
            let line = line.trim();
            if line.is_empty() {
                continue;
            }
            let entry: BfclEntry =
                serde_json::from_str(line).map_err(|e| crate::BenchError::WorkloadValidation {
                    field: format!("line {}", line_num + 1),
                    reason: format!("failed to parse BFCL entry: {e}"),
                })?;
            // An empty allow-list means "keep every category".
            if !wanted.is_empty() && !wanted.contains(&entry.category) {
                continue;
            }
            entries.push(entry);
        }
        Ok(entries)
    }

    /// Converts dataset function descriptions into JSON tool schemas with the
    /// keys `name`, `description` and `parameters`, preserving input order.
    fn translate_functions_to_tool_schemas(functions: &[BfclFunction]) -> Vec<serde_json::Value> {
        functions
            .iter()
            .map(|f| {
                serde_json::json!({
                    "name": f.name,
                    "description": f.description,
                    "parameters": f.parameters,
                })
            })
            .collect()
    }

    /// Scores one case by comparing the actual tool calls with the expected
    /// ones, position by position.
    ///
    /// Returns `(1.0, None)` only when both lists have the same length and
    /// every call matches in function name and arguments. Otherwise returns
    /// `(0.0, Some(reason))` describing the first mismatch found.
    fn score_case(
        expected: &[BfclExpectedOutput],
        actual: &[BfclToolCallOutput],
    ) -> (f64, Option<String>) {
        if expected.len() != actual.len() {
            return (
                0.0,
                Some(format!("expected {} tool call(s), got {}", expected.len(), actual.len())),
            );
        }
        for (i, (exp, act)) in expected.iter().zip(actual.iter()).enumerate() {
            if exp.name != act.name {
                return (
                    0.0,
                    Some(format!("call {i}: expected function '{}', got '{}'", exp.name, act.name)),
                );
            }
            if let Err(reason) = Self::arguments_match(&exp.arguments, &act.arguments) {
                return (0.0, Some(format!("call {i}, function '{}': {reason}", exp.name)));
            }
        }
        (1.0, None)
    }

    /// Checks that `actual` has exactly the keys of `expected` with equal
    /// values (per `json_values_equal`).
    ///
    /// Checks run in a fixed order — missing keys, then extra keys, then value
    /// mismatches — and keys are visited in sorted order within each phase, so
    /// the reported reason is deterministic even though the inputs are hash
    /// maps.
    fn arguments_match(
        expected: &HashMap<String, serde_json::Value>,
        actual: &HashMap<String, serde_json::Value>,
    ) -> Result<(), String> {
        let mut expected_keys: Vec<&String> = expected.keys().collect();
        expected_keys.sort_unstable();
        for key in &expected_keys {
            if !actual.contains_key(*key) {
                return Err(format!("missing argument '{key}'"));
            }
        }

        let mut actual_keys: Vec<&String> = actual.keys().collect();
        actual_keys.sort_unstable();
        for key in &actual_keys {
            if !expected.contains_key(*key) {
                return Err(format!("unexpected extra argument '{key}'"));
            }
        }

        for key in expected_keys {
            // Both lookups succeed: the key comes from `expected`, and the
            // first loop proved it is present in `actual`.
            let expected_val = &expected[key];
            let actual_val = &actual[key];
            if !json_values_equal(expected_val, actual_val) {
                return Err(format!("argument '{key}': expected {expected_val}, got {actual_val}"));
            }
        }
        Ok(())
    }

    /// Aggregates case results into overall and per-category accuracy.
    ///
    /// `entries` and `cases` are paired positionally to attribute each case to
    /// its entry's category; if the slices differ in length, the unpaired tail
    /// is left out of the per-category statistics. The overall totals are
    /// computed from `cases` alone. Accuracies are `0.0` when there is nothing
    /// to count.
    fn generate_leaderboard_result(
        &self,
        model: &str,
        entries: &[BfclEntry],
        cases: &[CaseResult],
    ) -> BfclLeaderboardResult {
        let total_cases = cases.len();
        let passed_cases = cases.iter().filter(|c| c.passed).count();
        let overall_accuracy =
            if total_cases > 0 { passed_cases as f64 / total_cases as f64 } else { 0.0 };

        let mut category_accuracy: HashMap<String, CategoryAccuracy> = HashMap::new();
        for (entry, case) in entries.iter().zip(cases.iter()) {
            let cat = category_accuracy
                .entry(entry.category.clone())
                .or_insert(CategoryAccuracy { accuracy: 0.0, total: 0, passed: 0 });
            cat.total += 1;
            if case.passed {
                cat.passed += 1;
            }
        }
        // Ratios are filled in once all counts are final.
        for cat in category_accuracy.values_mut() {
            cat.accuracy = if cat.total > 0 { cat.passed as f64 / cat.total as f64 } else { 0.0 };
        }

        BfclLeaderboardResult {
            model: model.to_string(),
            overall_accuracy,
            category_accuracy,
            total_cases,
            passed_cases,
        }
    }
}
impl Default for BfclAdapter {
    /// Builds an adapter around `BfclConfig::default()`.
    fn default() -> Self {
        let config = BfclConfig::default();
        Self::new(config)
    }
}
#[async_trait]
impl TaskQualityAdapter for BfclAdapter {
    /// Identifier of this adapter; also used as `adapter_name` in results.
    fn name(&self) -> &str {
        "bfcl"
    }

    /// Loads the dataset, scores every entry, and returns the aggregate result
    /// for `model`.
    ///
    /// NOTE(review): the list of actual tool calls is always empty here — no
    /// model invocation happens in this function — so only entries whose
    /// expected output is empty can pass. The tool schemas and the leaderboard
    /// summary are computed but not used in the returned value.
    ///
    /// # Errors
    ///
    /// Propagates any error from loading the dataset.
    async fn run(&self, model: &str) -> crate::Result<TaskQualityResult> {
        let entries = self.load_entries()?;
        let adapter_name = self.name().to_string();

        // Nothing to score: report an empty, zero-accuracy result right away.
        if entries.is_empty() {
            return Ok(TaskQualityResult {
                adapter_name,
                model: model.to_string(),
                total_cases: 0,
                passed_cases: 0,
                accuracy: 0.0,
                cases: Vec::new(),
            });
        }

        let cases: Vec<CaseResult> = entries
            .iter()
            .map(|entry| {
                let _tool_schemas = Self::translate_functions_to_tool_schemas(&entry.functions);
                let actual_output: Vec<BfclToolCallOutput> = Vec::new();
                let (score, details) = Self::score_case(&entry.expected_output, &actual_output);
                CaseResult { case_id: entry.id.clone(), passed: score >= 1.0, score, details }
            })
            .collect();

        let passed_cases = cases.iter().filter(|case| case.passed).count();
        let total_cases = cases.len();
        let accuracy = match total_cases {
            0 => 0.0,
            n => passed_cases as f64 / n as f64,
        };
        let _leaderboard = self.generate_leaderboard_result(model, &entries, &cases);

        Ok(TaskQualityResult {
            adapter_name,
            model: model.to_string(),
            total_cases,
            passed_cases,
            accuracy,
            cases,
        })
    }
}
/// Structural equality for JSON values with numeric leniency.
///
/// * Values of different JSON types are never equal (`42` != `"42"`).
/// * Numbers are compared by `json_numbers_equal`, so `42` equals `42.0`.
/// * Arrays must have the same length and equal elements in the same order.
/// * Objects must have the same key set with equal values; key order is
///   irrelevant.
fn json_values_equal(a: &serde_json::Value, b: &serde_json::Value) -> bool {
    use serde_json::Value;
    match (a, b) {
        (Value::Null, Value::Null) => true,
        (Value::Bool(a), Value::Bool(b)) => a == b,
        (Value::Number(a), Value::Number(b)) => json_numbers_equal(a, b),
        (Value::String(a), Value::String(b)) => a == b,
        (Value::Array(a), Value::Array(b)) => {
            a.len() == b.len() && a.iter().zip(b.iter()).all(|(x, y)| json_values_equal(x, y))
        }
        (Value::Object(a), Value::Object(b)) => {
            // Equal sizes plus "every key of `a` matches in `b`" implies the
            // key sets are identical.
            a.len() == b.len()
                && a.iter()
                    .all(|(key, val)| b.get(key).is_some_and(|bval| json_values_equal(val, bval)))
        }
        _ => false,
    }
}

/// Compares two JSON numbers.
///
/// Integers are compared exactly: routing them through `f64` would make
/// distinct values above 2^53 collapse to the same float. Any other
/// combination is compared as `f64` with a tolerance scaled to the operands'
/// magnitude; a fixed absolute epsilon would treat all sufficiently small
/// values as equal to each other.
fn json_numbers_equal(a: &serde_json::Number, b: &serde_json::Number) -> bool {
    if let (Some(x), Some(y)) = (a.as_i64(), b.as_i64()) {
        return x == y;
    }
    if let (Some(x), Some(y)) = (a.as_u64(), b.as_u64()) {
        return x == y;
    }
    match (a.as_f64(), b.as_f64()) {
        (Some(x), Some(y)) => {
            x == y || (x - y).abs() <= f64::EPSILON * x.abs().max(y.abs())
        }
        _ => false,
    }
}
// Unit tests for JSON comparison, case scoring, schema translation,
// configuration defaults, entry deserialization and leaderboard aggregation.
#[cfg(test)]
mod tests {
use super::*;
// An integer and a float with the same numeric value compare equal.
#[test]
fn test_json_values_equal_numbers() {
let a = serde_json::json!(42);
let b = serde_json::json!(42.0);
assert!(json_values_equal(&a, &b));
}
// String comparison is exact and case-sensitive.
#[test]
fn test_json_values_equal_strings() {
let a = serde_json::json!("hello");
let b = serde_json::json!("hello");
assert!(json_values_equal(&a, &b));
let c = serde_json::json!("Hello");
assert!(!json_values_equal(&a, &c));
}
// Object equality does not depend on key order.
#[test]
fn test_json_values_equal_objects() {
let a = serde_json::json!({"x": 1, "y": 2});
let b = serde_json::json!({"y": 2, "x": 1});
assert!(json_values_equal(&a, &b));
}
// A number never equals a string, even if the string spells the same digits.
#[test]
fn test_json_values_equal_different_types() {
let a = serde_json::json!(42);
let b = serde_json::json!("42");
assert!(!json_values_equal(&a, &b));
}
// Identical function name and arguments yield a full score and no details.
#[test]
fn test_score_case_exact_match() {
let expected = vec![BfclExpectedOutput {
name: "get_weather".to_string(),
arguments: HashMap::from([
("city".to_string(), serde_json::json!("Seattle")),
("unit".to_string(), serde_json::json!("fahrenheit")),
]),
}];
let actual = vec![BfclToolCallOutput {
name: "get_weather".to_string(),
arguments: HashMap::from([
("city".to_string(), serde_json::json!("Seattle")),
("unit".to_string(), serde_json::json!("fahrenheit")),
]),
}];
let (score, details) = BfclAdapter::score_case(&expected, &actual);
assert_eq!(score, 1.0);
assert!(details.is_none());
}
// A different function name scores zero and names the expected function.
#[test]
fn test_score_case_wrong_function_name() {
let expected =
vec![BfclExpectedOutput { name: "get_weather".to_string(), arguments: HashMap::new() }];
let actual = vec![BfclToolCallOutput {
name: "get_temperature".to_string(),
arguments: HashMap::new(),
}];
let (score, details) = BfclAdapter::score_case(&expected, &actual);
assert_eq!(score, 0.0);
assert!(details.unwrap().contains("expected function 'get_weather'"));
}
// An expected argument absent from the actual call scores zero.
#[test]
fn test_score_case_missing_argument() {
let expected = vec![BfclExpectedOutput {
name: "search".to_string(),
arguments: HashMap::from([
("query".to_string(), serde_json::json!("rust")),
("limit".to_string(), serde_json::json!(10)),
]),
}];
let actual = vec![BfclToolCallOutput {
name: "search".to_string(),
arguments: HashMap::from([("query".to_string(), serde_json::json!("rust"))]),
}];
let (score, details) = BfclAdapter::score_case(&expected, &actual);
assert_eq!(score, 0.0);
assert!(details.unwrap().contains("missing argument"));
}
// An argument that was not expected scores zero as well.
#[test]
fn test_score_case_extra_argument() {
let expected = vec![BfclExpectedOutput {
name: "search".to_string(),
arguments: HashMap::from([("query".to_string(), serde_json::json!("rust"))]),
}];
let actual = vec![BfclToolCallOutput {
name: "search".to_string(),
arguments: HashMap::from([
("query".to_string(), serde_json::json!("rust")),
("extra".to_string(), serde_json::json!("unexpected")),
]),
}];
let (score, details) = BfclAdapter::score_case(&expected, &actual);
assert_eq!(score, 0.0);
assert!(details.unwrap().contains("unexpected extra argument"));
}
// A differing number of tool calls is rejected before any call is compared.
#[test]
fn test_score_case_wrong_count() {
let expected = vec![
BfclExpectedOutput { name: "a".to_string(), arguments: HashMap::new() },
BfclExpectedOutput { name: "b".to_string(), arguments: HashMap::new() },
];
let actual = vec![BfclToolCallOutput { name: "a".to_string(), arguments: HashMap::new() }];
let (score, details) = BfclAdapter::score_case(&expected, &actual);
assert_eq!(score, 0.0);
assert!(details.unwrap().contains("expected 2 tool call(s), got 1"));
}
// Name, description and parameters are carried over into the tool schema.
#[test]
fn test_translate_functions_to_tool_schemas() {
let functions = vec![BfclFunction {
name: "get_weather".to_string(),
description: "Get the weather for a city".to_string(),
parameters: serde_json::json!({
"type": "object",
"properties": {
"city": {"type": "string"}
},
"required": ["city"]
}),
}];
let schemas = BfclAdapter::translate_functions_to_tool_schemas(&functions);
assert_eq!(schemas.len(), 1);
assert_eq!(schemas[0]["name"], "get_weather");
assert_eq!(schemas[0]["description"], "Get the weather for a city");
assert!(schemas[0]["parameters"]["properties"]["city"].is_object());
}
// The default configuration points at `bfcl_dataset.jsonl` with no filters.
#[test]
fn test_bfcl_config_default() {
let config = BfclConfig::default();
assert_eq!(config.dataset_path, PathBuf::from("bfcl_dataset.jsonl"));
assert!(config.categories.is_empty());
assert!(config.max_cases.is_none());
}
// The default adapter reports the name "bfcl".
#[test]
fn test_bfcl_adapter_default() {
let adapter = BfclAdapter::default();
assert_eq!(adapter.name(), "bfcl");
}
// A record using the `function` and `expected_output` JSON keys deserializes
// into the corresponding `BfclEntry` fields.
#[test]
fn test_bfcl_entry_deserialization() {
let json = r#"{
"id": "test_1",
"category": "simple",
"question": "What is the weather in Seattle?",
"function": [
{
"name": "get_weather",
"description": "Get weather info",
"parameters": {"type": "object", "properties": {"city": {"type": "string"}}}
}
],
"expected_output": [
{
"name": "get_weather",
"arguments": {"city": "Seattle"}
}
]
}"#;
let entry: BfclEntry = serde_json::from_str(json).unwrap();
assert_eq!(entry.id, "test_1");
assert_eq!(entry.category, "simple");
assert_eq!(entry.functions.len(), 1);
assert_eq!(entry.expected_output.len(), 1);
assert_eq!(entry.expected_output[0].name, "get_weather");
}
// Three cases across two categories: "simple" has 1 of 2 passing and
// "multiple" has 1 of 1, giving an overall accuracy of 2/3.
#[test]
fn test_leaderboard_result_generation() {
let config = BfclConfig::default();
let adapter = BfclAdapter::new(config);
let entries = vec![
BfclEntry {
id: "test_1".to_string(),
category: "simple".to_string(),
question: "q1".to_string(),
functions: vec![],
expected_output: vec![],
},
BfclEntry {
id: "test_2".to_string(),
category: "simple".to_string(),
question: "q2".to_string(),
functions: vec![],
expected_output: vec![],
},
BfclEntry {
id: "test_3".to_string(),
category: "multiple".to_string(),
question: "q3".to_string(),
functions: vec![],
expected_output: vec![],
},
];
let cases = vec![
CaseResult { case_id: "test_1".to_string(), passed: true, score: 1.0, details: None },
CaseResult {
case_id: "test_2".to_string(),
passed: false,
score: 0.0,
details: Some("wrong function".to_string()),
},
CaseResult { case_id: "test_3".to_string(), passed: true, score: 1.0, details: None },
];
let result = adapter.generate_leaderboard_result("gemini-2.5-flash", &entries, &cases);
assert_eq!(result.model, "gemini-2.5-flash");
assert_eq!(result.total_cases, 3);
assert_eq!(result.passed_cases, 2);
assert!((result.overall_accuracy - 2.0 / 3.0).abs() < f64::EPSILON);
let simple = &result.category_accuracy["simple"];
assert_eq!(simple.total, 2);
assert_eq!(simple.passed, 1);
assert!((simple.accuracy - 0.5).abs() < f64::EPSILON);
let multiple = &result.category_accuracy["multiple"];
assert_eq!(multiple.total, 1);
assert_eq!(multiple.passed, 1);
assert!((multiple.accuracy - 1.0).abs() < f64::EPSILON);
}
}