{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from usearch.io import load_matrix\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the embedding matrix from disk. `view=True` is forwarded to the loader;\n",
"# presumably it yields a view over the file rather than an in-memory copy\n",
"# (TODO confirm against the usearch.io documentation).\n",
"vectors = load_matrix(\n",
"    \"datasets/arxiv_2M/abstract.e5-base-v2.fbin\",\n",
"    view=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Matrix dimensions; axis 1 is what the index below uses as `ndim`.\n",
"vectors.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from usearch.index import Index, USES_SIMSIMD, USES_NATIVE_F16\n",
"\n",
"# Index sized to the embedding width, using the \"cos\" metric and \"i8\" dtype.\n",
"index = Index(\n",
"    ndim=vectors.shape[1],\n",
"    metric=\"cos\",\n",
"    dtype=\"i8\",\n",
")\n",
"\n",
"# Report the index's hardware acceleration next to the two USES_* flags\n",
"# exported by usearch.index.\n",
"(index.hardware_acceleration, USES_SIMSIMD, USES_NATIVE_F16)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# On-disk copy of the index, so a re-run can reuse it instead of re-adding\n",
"# every vector.\n",
"index_path = \"cluster.i8.usearch\"\n",
"\n",
"if os.path.exists(index_path):\n",
"    index.load(index_path)\n",
"\n",
"# Still empty (no file, or the file held no entries): add all vectors and persist.\n",
"# Keys are passed as `None` - presumably usearch assigns them itself (confirm).\n",
"if not len(index):\n",
"    index.add(None, vectors, log=True)\n",
"    index.save(index_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Display the index's repr as the cell output.\n",
"index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cluster the indexed vectors. `min_count` / `max_count` presumably bound the\n",
"# number of clusters (confirm against the `Index.cluster` documentation).\n",
"# NOTE(review): the thread count is hard-coded to 60 - adjust for the local machine.\n",
"clustering = index.cluster(\n",
"    min_count=10,\n",
"    max_count=15,\n",
"    threads=60,\n",
")\n",
"clustering"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The first element of `centroids_popularity` is counted as the unique clusters.\n",
"unique_centroids = clustering.centroids_popularity[0]\n",
"print(f\"{len(unique_centroids)} unique clusters for {len(index)} members\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Render the clustering object's built-in centroid-popularity plot.\n",
"clustering.plot_centroids_popularity()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import networkx as nx\n",
"\n",
"# Build the centroid graph in this cell so it runs on a fresh kernel\n",
"# (Restart & Run All) without relying on leftover kernel state.\n",
"# NOTE(review): relies on `Clustering.network` returning a NetworkX graph whose\n",
"# nodes carry `size` and whose edges carry `distance` (the attributes the next\n",
"# cell reads) - confirm for the installed usearch version.\n",
"g = clustering.network\n",
"\n",
"# Quick default rendering of the whole graph.\n",
"nx.draw_networkx(g)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import networkx as nx\n",
"\n",
"G = g\n",
"\n",
"# Scale node markers relative to the largest `size` attribute (largest -> 10).\n",
"size_by_node = nx.get_node_attributes(G, \"size\")\n",
"max_node_size = max(size_by_node.values())\n",
"node_sizes = [size * 10 / max_node_size for size in size_by_node.values()]\n",
"\n",
"# Label each edge with its `distance` attribute, formatted with the `.2` spec.\n",
"edge_labels = {\n",
"    edge: f\"{distance:.2}\"\n",
"    for edge, distance in nx.get_edge_attributes(G, \"distance\").items()\n",
"}\n",
"\n",
"# A fixed seed is passed so the spring layout is repeatable between runs.\n",
"pos = nx.spring_layout(G, seed=7)\n",
"\n",
"# Draw each layer onto the current axes explicitly.\n",
"ax = plt.gca()\n",
"nx.draw_networkx_nodes(G, pos, node_size=node_sizes, ax=ax)\n",
"nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=False), ax=ax)\n",
"nx.draw_networkx_labels(G, pos, font_size=10, font_family=\"sans-serif\", ax=ax)\n",
"nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=5, ax=ax)\n",
"\n",
"ax.margins(0.08)\n",
"ax.axis(\"off\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}