import sys
import os
def _clean_words(column):
    """Return a text column lower-cased and stripped of surrounding whitespace.

    Missing values and entries that are empty after stripping are dropped so
    that they never end up as blank rows in the processed word list.
    """
    cleaned = column.str.lower().str.strip().dropna()
    return cleaned[cleaned != ""]


def _first_text_column(df):
    """Return the name of the first string-like column of *df*, or None."""
    # Imported here because pandas is an optional dependency that main()
    # verifies before any helper is called.
    from pandas.api.types import is_object_dtype, is_string_dtype

    for col in df.columns:
        # Object dtype is the classic pandas representation of strings;
        # is_string_dtype additionally recognises the dedicated string dtypes.
        if is_object_dtype(df[col]) or is_string_dtype(df[col]):
            return col
    return None


def main():
    """Download the common-words-79k dataset and write raw and processed CSVs.

    Steps:
      1. Verify that ``datasets`` and ``pandas`` can be imported.
      2. Download ``jaagli/common-words-79k`` (split ``whole``).
      3. Save the unmodified table to ``data/common-words-79k-raw.csv``.
      4. Build a de-duplicated, lower-cased word list from the ``alias``
         column (sorted by ``frequency`` when that column exists), or from
         the first text column when ``alias`` is absent, and save it to
         ``data/common-words-processed.csv``.

    Returns:
        int: 0 on success; 1 when a library is missing, the output directory
        cannot be created, the download fails, processing fails, or no text
        column is available to build the word list from.
    """
    print("=== Hugging Face Common Words Dataset Downloader ===\n")

    # Imported lazily so a missing dependency produces install hints instead
    # of a traceback at module import time.
    try:
        from datasets import load_dataset
        import pandas  # noqa: F401 -- availability check only
        print("✓ Required libraries found")
    except ImportError as e:
        print("✗ Missing required libraries")
        # Show the underlying error so the user can tell which library is
        # missing (or whether an installed one is broken).
        print(f" ({e})")
        print("\nPlease install them with:")
        print(" pip3 install datasets pandas")
        print("\nOr if using conda:")
        print(" conda install -c huggingface datasets pandas")
        return 1

    # Create the output directory before downloading so an unwritable
    # location fails fast and is not reported as a download problem.
    try:
        os.makedirs("data", exist_ok=True)
    except OSError as e:
        print(f"\n✗ Error creating output directory 'data': {e}")
        return 1

    # Top-level boundary: any download failure is reported with
    # network-oriented troubleshooting hints.
    try:
        print("\nDownloading common-words-79k dataset from Hugging Face...")
        print("This may take a few moments...\n")
        dataset = load_dataset("jaagli/common-words-79k", split="whole")
        print(f"✓ Downloaded {len(dataset)} entries")
    except Exception as e:
        print(f"\n✗ Error downloading dataset: {e}")
        print("\nTroubleshooting:")
        print("1. Check your internet connection")
        print("2. Ensure you have access to Hugging Face datasets")
        print("3. Try updating datasets library: pip3 install --upgrade datasets")
        return 1

    # Processing and saving are handled separately so that a disk or data
    # problem is not mislabelled as a download error.
    try:
        df = dataset.to_pandas()
        print(f"\nDataset columns: {df.columns.tolist()}")
        print(f"Dataset shape: {df.shape}")
        df.to_csv("data/common-words-79k-raw.csv", index=False)
        print("\n✓ Saved raw dataset to data/common-words-79k-raw.csv")

        if 'alias' in df.columns:
            words_df = _clean_words(df['alias']).to_frame('word')
            has_frequency = 'frequency' in df.columns
            if has_frequency:
                # Assignment aligns on the index, so rows dropped by
                # _clean_words simply do not receive a frequency.
                words_df['frequency'] = df['frequency']
                # Sort first so drop_duplicates keeps the highest-frequency
                # occurrence of each word.
                words_df = words_df.sort_values('frequency', ascending=False)
            words_df = words_df.drop_duplicates(subset=['word'])
            words_df.to_csv("data/common-words-processed.csv", index=False)
            print(f"✓ Saved {len(words_df)} unique words to data/common-words-processed.csv")

            print("\nSample words from dataset:")
            sample = words_df.head(20)
            if has_frequency:
                for word, frequency in zip(sample['word'], sample['frequency']):
                    print(f" {word} (frequency: {frequency})")
            else:
                for word in sample['word']:
                    print(f" {word}")
        else:
            print("\n⚠ Warning: 'alias' column not found in dataset")
            print("Available columns:", df.columns.tolist())
            text_col = _first_text_column(df)
            if text_col is None:
                # Without a text column no processed file can be written, so
                # do not claim success.
                print("\n✗ No text column found; processed word list was not created")
                return 1
            print(f"\nExtracting from column: {text_col}")
            words = _clean_words(df[text_col]).drop_duplicates()
            words.to_frame('word').to_csv("data/common-words-processed.csv", index=False)
            print(f"✓ Saved {len(words)} unique words")
    except Exception as e:
        print(f"\n✗ Error processing dataset: {e}")
        return 1

    print("\n✓ Dataset download complete!")
    print("\nNext steps:")
    print("1. Run: cargo run --bin process_frequency_dictionary")
    print("2. This will filter and prepare the dictionary for use")
    return 0
if __name__ == "__main__":
    # Run the downloader and hand its status code to the shell
    # (0 = success, 1 = failure).
    exit_status = main()
    sys.exit(exit_status)