import json
import os
from pathlib import Path
from typing import Dict, List, Tuple
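

# Load a JSONL results file: one JSON object per non-empty line.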
def parse_jsonl(file_path: str) -> List[Dict]:
    results = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                results.append(json.loads(line))
    return results
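

# Derive exploration metrics for a single run: totals per tool family
# (read / search / grep / map / glob) and per-turn ratios.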
def calculate_ratios(run: Dict) -> Dict:
    tool_calls = run.get('tool_calls', {})
    num_turns = run.get('num_turns', 0)
    num_tool_calls = run.get('num_tool_calls', 0)

    reads = 0
    searches = 0
    greps = 0
    maps = 0
    globs = 0
    for tool, count in tool_calls.items():
        # Substring matching is deliberately non-exclusive: a single tool name
        # may be counted in more than one family.
        if 'read' in tool.lower():
            reads += count
        if 'search' in tool.lower():
            searches += count
        if 'grep' in tool.lower():
            greps += count
        if 'map' in tool.lower():
            maps += count
        if 'glob' in tool.lower():
            globs += count

    ratios = {
        'reads_per_turn': reads / num_turns if num_turns > 0 else 0,
        'searches_per_turn': searches / num_turns if num_turns > 0 else 0,
        'greps_per_turn': greps / num_turns if num_turns > 0 else 0,
        'tools_per_turn': num_tool_calls / num_turns if num_turns > 0 else 0,
        'total_reads': reads,
        'total_searches': searches,
        'total_greps': greps,
        'total_maps': maps,
        'total_globs': globs,
    }

    # Aggregate counts do not capture ordering, so the first tool is inferred
    # heuristically from the run mode and the observed tool mix.
    first_tool = None
    if maps == 1 and run.get('mode') == 'srcwalk':
        first_tool = 'srcwalk_map'
    elif globs >= 1 and run.get('mode') == 'baseline':
        first_tool = 'Glob (likely)'
    ratios['likely_first_tool'] = first_tool
    return ratios
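

# Parse one results file and return (filename, per-run analysis dicts),
# skipping runs that recorded an error.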
def analyze_file(file_path: str) -> Tuple[str, List[Dict]]:
    runs = parse_jsonl(file_path)
    filename = os.path.basename(file_path)
    analyzed_runs = []
    for run in runs:
        if 'error' in run:
            continue
        analysis = {
            'task': run.get('task'),
            'repo': run.get('repo'),
            'mode': run.get('mode'),
            'model': run.get('model'),
            'num_turns': run.get('num_turns'),
            'num_tool_calls': run.get('num_tool_calls'),
            'tool_calls': run.get('tool_calls', {}),
            'correct': run.get('correct'),
            'srcwalk_version': run.get('srcwalk_version'),
        }
        analysis.update(calculate_ratios(run))
        analyzed_runs.append(analysis)
    return filename, analyzed_runs
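

# Compare average exploration metrics between baseline and srcwalk runs
# (Sonnet only) and print a side-by-side summary.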
def compare_modes(runs: List[Dict]):
    baseline_runs = [r for r in runs if r['mode'] == 'baseline' and r['model'] == 'sonnet']
    srcwalk_runs = [r for r in runs if r['mode'] == 'srcwalk' and r['model'] == 'sonnet']

    def calc_avg(runs, key):
        values = [r[key] for r in runs if key in r]
        return sum(values) / len(values) if values else 0

    print("\n" + "=" * 80)
    print("BASELINE vs SRCWALK COMPARISON (Sonnet only)")
    print("=" * 80)
    print(f"\nBaseline runs: {len(baseline_runs)}")
    print(f"Srcwalk runs: {len(srcwalk_runs)}")

    print("\n--- Average Exploration Metrics ---")
    print(f"{'Metric':<30} {'Baseline':>15} {'Srcwalk':>15} {'Delta':>15}")
    print("-" * 80)
    metrics = [
        'tools_per_turn',
        'reads_per_turn',
        'searches_per_turn',
        'greps_per_turn',
        'num_turns',
        'num_tool_calls',
    ]
    for metric in metrics:
        baseline_avg = calc_avg(baseline_runs, metric)
        srcwalk_avg = calc_avg(srcwalk_runs, metric)
        delta = srcwalk_avg - baseline_avg
        print(f"{metric:<30} {baseline_avg:>15.2f} {srcwalk_avg:>15.2f} {delta:>15.2f}")

    print("\n--- Tool Preference (Average per run) ---")
    print(f"{'Tool':<30} {'Baseline':>15} {'Srcwalk':>15}")
    print("-" * 80)
    baseline_reads = calc_avg(baseline_runs, 'total_reads')
    baseline_greps = calc_avg(baseline_runs, 'total_greps')
    baseline_globs = calc_avg(baseline_runs, 'total_globs')
    srcwalk_reads = calc_avg(srcwalk_runs, 'total_reads')
    srcwalk_searches = calc_avg(srcwalk_runs, 'total_searches')
    srcwalk_maps = calc_avg(srcwalk_runs, 'total_maps')
    print(f"{'Read operations':<30} {baseline_reads:>15.2f} {srcwalk_reads:>15.2f}")
    print(f"{'Search operations (Grep/Search)':<30} {baseline_greps:>15.2f} {srcwalk_searches:>15.2f}")
    print(f"{'Discovery (Glob/Map)':<30} {baseline_globs:>15.2f} {srcwalk_maps:>15.2f}")

    print("\n--- Srcwalk Map Usage ---")
    srcwalk_with_map = sum(1 for r in srcwalk_runs if r['total_maps'] > 0)
    map_pct = 100 * srcwalk_with_map / len(srcwalk_runs) if srcwalk_runs else 0
    print(f"Srcwalk runs that used the map tool: {srcwalk_with_map}/{len(srcwalk_runs)} ({map_pct:.1f}%)")

    print("\n--- Exploration Patterns ---")
    baseline_search_read_ratio = baseline_greps / baseline_reads if baseline_reads > 0 else 0
    srcwalk_search_read_ratio = srcwalk_searches / srcwalk_reads if srcwalk_reads > 0 else 0
    print(f"Baseline Grep:Read ratio: {baseline_search_read_ratio:.2f}:1")
    print(f"Srcwalk Search:Read ratio: {srcwalk_search_read_ratio:.2f}:1")

    return baseline_runs, srcwalk_runs
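

# Print a short, human-readable dump of the first few runs in a group.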
def print_detailed_runs(runs: List[Dict], title: str, limit: int = 5):
    print(f"\n{title}")
    print("=" * 80)
    for i, run in enumerate(runs[:limit], 1):
        print(f"\n{i}. {run['task']} ({run['repo']}) - {'✓' if run['correct'] else '✗'}")
        print(f" Turns: {run['num_turns']}, Tool calls: {run['num_tool_calls']}")
        print(f" Tools: {run['tool_calls']}")
        print(f" Ratios: tools/turn={run['tools_per_turn']:.2f}, reads/turn={run['reads_per_turn']:.2f}, searches/turn={run['searches_per_turn']:.2f}")
        if run['likely_first_tool']:
            print(f" Likely first tool: {run['likely_first_tool']}")
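

# Entry point: load the target benchmark files, compare modes, and print
# per-task details. The results directory is hard-coded for this environment.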
def main():
    results_dir = Path("/Users/flysikring/conductor/workspaces/srcwalk/almaty/benchmark/results")
    target_files = [
        "benchmark_20260213_131246.jsonl",
        "benchmark_20260213_135039.jsonl",
    ]

    all_runs = []
    for filename in target_files:
        file_path = results_dir / filename
        if file_path.exists():
            print(f"\nProcessing: {filename}")
            _, runs = analyze_file(str(file_path))
            all_runs.extend(runs)
            print(f" Found {len(runs)} valid runs")
    baseline_runs, srcwalk_runs = compare_modes(all_runs)

    print_detailed_runs(baseline_runs, "\nDETAILED BASELINE EXAMPLES", limit=3)
    print_detailed_runs(srcwalk_runs, "\nDETAILED SRCWALK EXAMPLES", limit=3)

    print("\n" + "=" * 80)
    print("TASK-SPECIFIC COMPARISON")
    print("=" * 80)
    tasks = set(r['task'] for r in all_runs)
    for task in sorted(tasks):
        task_baseline = [r for r in baseline_runs if r['task'] == task]
        task_srcwalk = [r for r in srcwalk_runs if r['task'] == task]
        if task_baseline and task_srcwalk:
            print(f"\n{task}:")
            for b_run, t_run in zip(task_baseline, task_srcwalk):
                file_marker = "OLD" if "131246" in str(t_run.get('srcwalk_version', '')) else "NEW"
print(f" Baseline: turns={b_run['num_turns']}, tools={b_run['num_tool_calls']}, {b_run['tool_calls']}")
print(f" Srcwalk ({file_marker}): turns={t_run['num_turns']}, tools={t_run['num_tool_calls']}, {t_run['tool_calls']}")
print(f" Efficiency: Baseline={b_run['tools_per_turn']:.2f} tools/turn, Srcwalk={t_run['tools_per_turn']:.2f} tools/turn")


if __name__ == "__main__":
    main()