from datasets import load_dataset
import os
# Dump the 'text' field of every example in the dataset's 'train' split into
# a single UTF-8 text file in the user's home directory, one example after
# another, each followed by a blank line.

# How many examples to write between progress messages.
PROGRESS_INTERVAL = 100000

print("Loading dataset...")
ds = load_dataset('alea-institute/kl3m-sft-hearings-sample-001')
output_path = os.path.expanduser('~/sample-001.txt')

# Look up the split and its size once; both are reused in the loop and in
# the status messages, and neither changes while the script runs.
train = ds['train']
total = len(train)

print(f"Extracting text from {total} examples...")
with open(output_path, 'w', encoding='utf-8') as f:
    # Count from 1 so the counter can be printed directly as "processed so far".
    for count, example in enumerate(train, start=1):
        # NOTE(review): assumes every example's 'text' value is a str;
        # f.write raises TypeError otherwise -- confirm against the dataset.
        f.write(example['text'])
        f.write('\n\n')
        if count % PROGRESS_INTERVAL == 0:
            print(f"Processed {count:,} / {total:,} examples...")

print(f"Successfully wrote {total:,} text examples to {output_path}")