from model2vec import distill
# Distillation tiers, keyed by tier name. For every tier:
#   "source"   - model identifier handed to `distill` as its first argument
#   "pca_dims" - value forwarded to `distill` as the `pca_dims` keyword
#   "output"   - path given to `save_pretrained` for the distilled model
MODELS = {
    "standard": {
        "source": "codesage/codesage-base-v2",
        "pca_dims": 256,
        "output": "models/codesage-m2v-256",
    },
    "large": {
        "source": "jinaai/jina-embeddings-v3",
        "pca_dims": 512,
        "output": "models/jina-code-m2v-512",
    },
}
def main() -> None:
    """Distill every tier in ``MODELS``, save it, and print follow-up steps.

    For each tier the source model is passed to ``distill`` together with the
    tier's ``pca_dims``; the returned model is written to the tier's
    ``output`` path via ``save_pretrained``. Progress is reported on stdout.
    """
    for tier, config in MODELS.items():
        print(f"Distilling {tier} tier: {config['source']} -> {config['output']}")
        print(f" PCA dims: {config['pca_dims']}")
        model = distill(
            config["source"],
            pca_dims=config["pca_dims"],
        )
        model.save_pretrained(config["output"])
        print(f" Saved to {config['output']}/")
        # Blank line to visually separate one tier's log from the next.
        print()

    print("Done. Next steps:")
    print(" 1. Check model sizes (standard ~8 MB, large ~30 MB)")
    # Build the benchmark commands from MODELS so the printed paths can never
    # drift from the directories the models were actually saved to.
    step = 2
    for step, config in enumerate(MODELS.values(), start=2):
        print(f" {step}. Run: prx bench-ndcg --model-path {config['output']}/")
    # With an empty MODELS the loop never runs and `step` stays at 2.
    final_step = step + 1 if MODELS else step
    print(f" {final_step}. Compare NDCG@10 against builtin baseline")


if __name__ == "__main__":
    main()