formal-ai 0.188.0

coding_modification_suite_issue_362
  record_type "coding_modification_suite"
  id "issue_362_multilingual_coding_modification"
  title "Issue 362 multilingual coding-modification suite"
  purpose "Bulk multi-turn coding-modification checks: initial draft -> edit -> edit across English, Russian, Hindi, and Chinese."
  runner "cargo test --test unit issue_362_multilingual_multi_turn_coding_modification_ratchet -- --nocapture"
  network_runner "FORMAL_AI_BULK_BENCHMARK=1 cargo test --test unit issue_362_external_edit_datasets_download_on_test_only -- --ignored --nocapture"
  minimum_pass_count "4"
  pass_rate_basis_points "10000"
  ratchet_policy "CI asserts passed >= minimum_pass_count; raise the floor only after new cases pass locally and in CI."
  download_policy "External code-edit datasets are fetched at test time into target/formal-ai-benchmarks; full datasets are never committed."
  audit "Edit, But Verify (arXiv:2604.05100) warns that instructed code-editing benchmark scores can overstate real-world editing reliability when language coverage, edit domains, or tests are weak; this suite treats external sources as sample providers and keeps a deterministic local ratchet."
  updated_at "2026-05-30"
external_edit_dataset_canitedit
  record_type "external_edit_dataset"
  id "canitedit"
  title "CanItEdit"
  task_family "instruction_driven_code_editing"
  license "MIT"
  license_url "https://raw.githubusercontent.com/nuprl/CanItEdit/main/LICENSE"
  source_url "https://github.com/nuprl/CanItEdit"
  dataset_url "https://huggingface.co/datasets/nuprl/CanItEdit"
  download_url "https://huggingface.co/datasets/nuprl/CanItEdit/resolve/main/data/test-00000-of-00001.parquet"
  download_cache "target/formal-ai-benchmarks/canitedit-test.parquet"
  upstream_path "data/test-00000-of-00001.parquet"
  source_ref "github:74d15ea7e6207cb9fdeeecd761907371d4cc2e26; huggingface:3c07f38b1f9385f3214fcea94d4664c79df0d36a; lfs:9f78b1a2378b96b24d158a6fe83d69aa18e43a360ae3b7d0891c02f660cc6222"
  expected_min_bytes "200000"
  expected_file_kind "parquet"
  integration_mode "download_on_test"
  row_count "105"
  license_verified_at "2026-05-30"
  audit_note "Use as an edit-task source; keep the local formal-ai ratchet because the upstream benchmark is Python-only and hidden-test oriented."
external_edit_dataset_humanevalfix
  record_type "external_edit_dataset"
  id "humanevalfix"
  title "HumanEvalFix from HumanEvalPack"
  task_family "program_repair"
  license "MIT"
  license_url "https://huggingface.co/datasets/bigcode/humanevalpack/raw/main/README.md"
  source_url "https://github.com/bigcode-project/octopack"
  dataset_url "https://huggingface.co/datasets/bigcode/humanevalpack"
  download_url "https://huggingface.co/datasets/bigcode/humanevalpack/resolve/main/python/test-00000-of-00001.parquet"
  download_cache "target/formal-ai-benchmarks/humanevalfix-python-test.parquet"
  upstream_path "python/test-00000-of-00001.parquet"
  source_ref "github:e17a8f6470264286bc6a52eb8263582083bf3bf6; huggingface:9a41762f73a8cb23bb5811b73d5aab164efcf378; lfs:ed5f15d789156e21222bfcd556c425a39042355c84ae1e8b058abd6a3d7f8075"
  expected_min_bytes "150000"
  expected_file_kind "parquet"
  integration_mode "download_on_test"
  row_count "164"
  license_verified_at "2026-05-30"
  audit_note "Use as a bug-fix source and keep it separate from the instructed-edit ratchet because HumanEvalFix is narrower than multi-turn modification."
external_edit_dataset_editbench
  record_type "external_edit_dataset"
  id "editbench"
  title "EDIT-Bench"
  task_family "instruction_driven_code_editing"
  license "Apache-2.0"
  license_url "https://raw.githubusercontent.com/waynchi/editbench/main/LICENSE.md"
  source_url "https://github.com/waynchi/editbench"
  dataset_url "https://huggingface.co/datasets/copilot-arena/editbench"
  download_url "https://huggingface.co/datasets/copilot-arena/editbench/resolve/main/data/test-00000-of-00001.parquet"
  download_cache "target/formal-ai-benchmarks/editbench-test.parquet"
  upstream_path "data/test-00000-of-00001.parquet"
  source_ref "github:2ecd13159711d2d5bbdf36700119b4278f387dc0; huggingface:0d41afafcfe7c759adcd2eaceabfa486ab6eb0e2; lfs:0245660f5422cc1404da044f612d2aa9511c7feec252416cbda447c9fe0ee531"
  expected_min_bytes "2000000"
  expected_file_kind "parquet"
  integration_mode "download_on_test"
  row_count "648"
  license_verified_at "2026-05-30"
  audit_note "Use as a real-world edit-task source, but do not treat raw pass rate as sufficient because the audit found thin tests and benchmark artifacts in parts of EDIT-Bench."
coding_modification_case_en_reverse_sort
  record_type "coding_modification_case"
  id "en_reverse_sort"
  source "self_authored_multilingual_followup"
  language "en"
  turns "initial_draft|edit|edit"
  initial_prompt "Write me a Rust program that lists the files in the current directory"
  first_edit_prompt "Make the program accept a path as an argument"
  second_edit_prompt "Sort the results in reverse order"
  expected_intent "write_program"
  expected_answer_contains "```rust"
  expected_answer_contains "env::args"
  expected_answer_contains "names.sort_by(|a, b| b.cmp(a))"
  expected_links_contains "program_parameter:language rust"
  expected_links_contains "program_parameter:task list_files_arg_reverse_sort"
coding_modification_case_ru_issue_349_reverse_sort
  record_type "coding_modification_case"
  id "ru_issue_349_reverse_sort"
  source "issue_349_dialog"
  language "ru"
  turns "initial_draft|edit|edit"
  initial_prompt "Напиши мне программу на Rust, которая выдаёт список файлов в текущей директории"
  first_edit_prompt "Сделай так, чтобы программа принимала путь как аргумент"
  second_edit_prompt "Сделай сортировку результатов в обратном порядке"
  expected_intent "write_program"
  expected_answer_contains "```rust"
  expected_answer_contains "env::args"
  expected_answer_contains "names.sort_by(|a, b| b.cmp(a))"
  expected_links_contains "program_parameter:language rust"
  expected_links_contains "program_parameter:task list_files_arg_reverse_sort"
coding_modification_case_hi_reverse_sort
  record_type "coding_modification_case"
  id "hi_reverse_sort"
  source "self_authored_multilingual_followup"
  language "hi"
  turns "initial_draft|edit|edit"
  initial_prompt "Rust में फ़ाइलों की सूची दिखाने वाला प्रोग्राम लिखो"
  first_edit_prompt "इसे ऐसा बनाओ कि प्रोग्राम पथ को तर्क के रूप में स्वीकार करे"
  second_edit_prompt "परिणामों को उल्टे क्रम में क्रमबद्ध करो"
  expected_intent "write_program"
  expected_answer_contains "```rust"
  expected_answer_contains "env::args"
  expected_answer_contains "names.sort_by(|a, b| b.cmp(a))"
  expected_links_contains "program_parameter:language rust"
  expected_links_contains "program_parameter:task list_files_arg_reverse_sort"
coding_modification_case_zh_reverse_sort
  record_type "coding_modification_case"
  id "zh_reverse_sort"
  source "self_authored_multilingual_followup"
  language "zh"
  turns "initial_draft|edit|edit"
  initial_prompt "用 Rust 编写一个列出当前目录中文件的程序"
  first_edit_prompt "制作程序，使其接受路径作为参数"
  second_edit_prompt "把结果按相反顺序排序"
  expected_intent "write_program"
  expected_answer_contains "```rust"
  expected_answer_contains "env::args"
  expected_answer_contains "names.sort_by(|a, b| b.cmp(a))"
  expected_links_contains "program_parameter:language rust"
  expected_links_contains "program_parameter:task list_files_arg_reverse_sort"