[
{"question":"What is the user's favorite color?","question_type":"single-session-user","ground_truth":"blue","hypothesis":"The user's favorite color is blue.","expected_correct":true},
{"question":"What is the user's name?","question_type":"single-session-user","ground_truth":"John","hypothesis":"The user's name is John.","expected_correct":true},
{"question":"What is the user's name?","question_type":"single-session-user","ground_truth":"John","hypothesis":"The user's name is Mike.","expected_correct":false},
{"question":"What city does the user live in?","question_type":"single-session-user","ground_truth":"Austin","hypothesis":"Austin, Texas.","expected_correct":true},
{"question":"What city does the user live in?","question_type":"single-session-user","ground_truth":"Austin","hypothesis":"The user lives in Denver.","expected_correct":false},
{"question":"What programming language does the user prefer?","question_type":"single-session-user","ground_truth":"Rust","hypothesis":"Rust","expected_correct":true},
{"question":"What programming language does the user prefer?","question_type":"single-session-user","ground_truth":"Rust","hypothesis":"Python","expected_correct":false},
{"question":"What did the assistant recommend for dinner?","question_type":"single-session-assistant","ground_truth":"Italian restaurant on Main Street","hypothesis":"The assistant suggested an Italian place on Main Street.","expected_correct":true},
{"question":"What did the assistant recommend for dinner?","question_type":"single-session-assistant","ground_truth":"Italian restaurant on Main Street","hypothesis":"The assistant recommended a Chinese restaurant.","expected_correct":false},
{"question":"What book did the assistant suggest?","question_type":"single-session-assistant","ground_truth":"Sapiens by Yuval Noah Harari","hypothesis":"Sapiens","expected_correct":true},
{"question":"What book did the assistant suggest?","question_type":"single-session-assistant","ground_truth":"Sapiens by Yuval Noah Harari","hypothesis":"Thinking Fast and Slow","expected_correct":false},
{"question":"What exercise routine did the assistant outline?","question_type":"single-session-assistant","ground_truth":"30 minutes of cardio followed by strength training","hypothesis":"Cardio for 30 min then strength training.","expected_correct":true},
{"question":"Does the user prefer morning or evening workouts?","question_type":"single-session-preference","ground_truth":"morning","hypothesis":"The user prefers to work out in the morning.","expected_correct":true},
{"question":"Does the user prefer morning or evening workouts?","question_type":"single-session-preference","ground_truth":"morning","hypothesis":"Evening workouts.","expected_correct":false},
{"question":"What type of music does the user enjoy?","question_type":"single-session-preference","ground_truth":"jazz","hypothesis":"The user enjoys jazz music.","expected_correct":true},
{"question":"What type of music does the user enjoy?","question_type":"single-session-preference","ground_truth":"jazz","hypothesis":"Rock and heavy metal.","expected_correct":false},
{"question":"Does the user prefer cats or dogs?","question_type":"single-session-preference","ground_truth":"dogs","hypothesis":"Dogs.","expected_correct":true},
{"question":"What phone does the user currently have?","question_type":"knowledge-update","ground_truth":"iPhone 15","hypothesis":"The user has an iPhone 15.","expected_correct":true},
{"question":"What phone does the user currently have?","question_type":"knowledge-update","ground_truth":"iPhone 15","hypothesis":"The user has an iPhone 13.","expected_correct":false},
{"question":"Where does the user work now?","question_type":"knowledge-update","ground_truth":"Google","hypothesis":"The user currently works at Google.","expected_correct":true},
{"question":"Where does the user work now?","question_type":"knowledge-update","ground_truth":"Google","hypothesis":"The user works at Microsoft.","expected_correct":false},
{"question":"What car does the user drive?","question_type":"knowledge-update","ground_truth":"Tesla Model 3","hypothesis":"A Tesla Model 3.","expected_correct":true},
{"question":"What car does the user drive?","question_type":"knowledge-update","ground_truth":"Tesla Model 3","hypothesis":"Honda Civic.","expected_correct":false},
{"question":"What is the user's current salary?","question_type":"knowledge-update","ground_truth":"$150,000","hypothesis":"The user earns around $150K.","expected_correct":true},
{"question":"When did the user start their new job?","question_type":"temporal-reasoning","ground_truth":"March 2024","hypothesis":"March 2024.","expected_correct":true},
{"question":"When did the user start their new job?","question_type":"temporal-reasoning","ground_truth":"March 2024","hypothesis":"The user started in early March 2024.","expected_correct":true},
{"question":"When did the user start their new job?","question_type":"temporal-reasoning","ground_truth":"March 2024","hypothesis":"January 2024.","expected_correct":false},
{"question":"How long ago did the user move?","question_type":"temporal-reasoning","ground_truth":"about 6 months ago","hypothesis":"Approximately six months ago.","expected_correct":true},
{"question":"How long ago did the user move?","question_type":"temporal-reasoning","ground_truth":"about 6 months ago","hypothesis":"About 2 years ago.","expected_correct":false},
{"question":"When was the user's last vacation?","question_type":"temporal-reasoning","ground_truth":"December 2023","hypothesis":"Late December 2023.","expected_correct":true},
{"question":"When was the user's last vacation?","question_type":"temporal-reasoning","ground_truth":"December 2023","hypothesis":"Summer 2023.","expected_correct":false},
{"question":"Which trip happened first, Paris or Tokyo?","question_type":"temporal-reasoning","ground_truth":"Paris","hypothesis":"The Paris trip was first.","expected_correct":true},
{"question":"Which trip happened first, Paris or Tokyo?","question_type":"temporal-reasoning","ground_truth":"Paris","hypothesis":"Tokyo came first.","expected_correct":false},
{"question":"How many times has the user mentioned cooking?","question_type":"multi-session","ground_truth":"3 times","hypothesis":"The user mentioned cooking in three different sessions.","expected_correct":true},
{"question":"How many times has the user mentioned cooking?","question_type":"multi-session","ground_truth":"3 times","hypothesis":"Once.","expected_correct":false},
{"question":"What topics did the user discuss across all sessions?","question_type":"multi-session","ground_truth":"cooking, hiking, and photography","hypothesis":"The user talked about cooking, hiking, and photography.","expected_correct":true},
{"question":"What topics did the user discuss across all sessions?","question_type":"multi-session","ground_truth":"cooking, hiking, and photography","hypothesis":"Only cooking and hiking.","expected_correct":false},
{"question":"Did the user ever change their mind about moving?","question_type":"multi-session","ground_truth":"Yes, initially wanted to move to Seattle but later decided on Portland","hypothesis":"Yes, the user first considered Seattle but ultimately chose Portland.","expected_correct":true},
{"question":"Did the user ever change their mind about moving?","question_type":"multi-session","ground_truth":"Yes, initially wanted to move to Seattle but later decided on Portland","hypothesis":"No, the user always wanted Portland.","expected_correct":false},
{"question":"What is the user's blood type?","question_type":"single-session-user","ground_truth":"unanswerable","hypothesis":"I don't have information about the user's blood type.","expected_correct":true,"is_abstention":true},
{"question":"What is the user's blood type?","question_type":"single-session-user","ground_truth":"unanswerable","hypothesis":"The user's blood type is O positive.","expected_correct":false,"is_abstention":true},
{"question":"What gift did the user receive for their birthday?","question_type":"single-session-user","ground_truth":"unanswerable","hypothesis":"I don't have enough information to answer this.","expected_correct":true,"is_abstention":true},
{"question":"What gift did the user receive for their birthday?","question_type":"single-session-user","ground_truth":"unanswerable","hypothesis":"The user received a watch.","expected_correct":false,"is_abstention":true},
{"question":"What is the user's mother's maiden name?","question_type":"single-session-user","ground_truth":"unanswerable","hypothesis":"This information was not discussed in our conversations.","expected_correct":true,"is_abstention":true},
{"question":"What is the user's favorite restaurant?","question_type":"single-session-preference","ground_truth":"Olive Garden","hypothesis":"Olive Garden","expected_correct":true},
{"question":"What is the user's favorite restaurant?","question_type":"single-session-preference","ground_truth":"Olive Garden","hypothesis":"The Olive Garden on Elm Street","expected_correct":true},
{"question":"How old is the user?","question_type":"single-session-user","ground_truth":"34","hypothesis":"34 years old","expected_correct":true},
{"question":"How old is the user?","question_type":"single-session-user","ground_truth":"34","hypothesis":"Thirty-four","expected_correct":true},
{"question":"When did the user adopt their dog?","question_type":"temporal-reasoning","ground_truth":"last spring","hypothesis":"Spring of last year.","expected_correct":true},
{"question":"What did the user and assistant discuss in the second session?","question_type":"multi-session","ground_truth":"travel plans to Japan","hypothesis":"They discussed upcoming travel plans to Japan.","expected_correct":true}
]