usearch 2.0.1

Smaller & Faster Single-File Vector Search Engine from Unum
Documentation
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from usearch.io import load_matrix\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectors = load_matrix(\"datasets/arxiv_2M/abstract.e5-base-v2.fbin\", view=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectors.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from usearch.index import Index, USES_SIMSIMD, USES_NATIVE_F16\n",
    "\n",
    "index = Index(ndim=vectors.shape[1], metric=\"cos\", dtype=\"i8\")\n",
    "index.hardware_acceleration, USES_SIMSIMD, USES_NATIVE_F16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Reuse a previously saved index when one exists, so a full re-run does not\n",
    "# have to rebuild it from the raw vectors.\n",
    "index_path = \"cluster.i8.usearch\"\n",
    "if os.path.exists(index_path):\n",
    "    index.load(index_path)\n",
    "\n",
    "# Nothing was loaded (or the loaded index is empty): add every vector and\n",
    "# persist the result for the next run.\n",
    "if len(index) == 0:\n",
    "    index.add(None, vectors, log=True)\n",
    "    index.save(index_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Keep the original ceiling of 60 worker threads, but never request more\n",
    "# threads than this machine reports cores (cpu_count() may return None).\n",
    "cluster_threads = min(60, os.cpu_count() or 1)\n",
    "\n",
    "clustering = index.cluster(min_count=10, max_count=15, threads=cluster_threads)\n",
    "clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"{len(clustering.centroids_popularity[0])} unique clusters for {len(index)} members\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clustering.plot_centroids_popularity()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "\n",
    "# Export the clustering as a NetworkX graph; `g` was previously never\n",
    "# defined, so this cell failed on a fresh kernel.\n",
    "g = clustering.network\n",
    "nx.draw_networkx(g)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import networkx as nx\n",
    "\n",
    "# Build the graph from the clustering result here, so the cell does not\n",
    "# depend on a variable left over from an earlier kernel session.\n",
    "G = clustering.network\n",
    "\n",
    "# Scale node markers relative to the largest \"size\" attribute (largest -> 10).\n",
    "raw_sizes = list(nx.get_node_attributes(G, \"size\").values())\n",
    "max_node_size = max(raw_sizes, default=1)  # default guards an empty graph\n",
    "node_sizes = [size * 10 / max_node_size for size in raw_sizes]\n",
    "\n",
    "# Label every edge with its \"distance\" attribute, shortened for readability.\n",
    "edge_labels = {\n",
    "    edge: f\"{distance:.2}\"\n",
    "    for edge, distance in nx.get_edge_attributes(G, \"distance\").items()\n",
    "}\n",
    "\n",
    "# A fixed seed keeps the spring layout identical between runs.\n",
    "pos = nx.spring_layout(G, seed=7)\n",
    "nx.draw_networkx_nodes(G, pos, node_size=node_sizes)\n",
    "nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=False))\n",
    "nx.draw_networkx_labels(G, pos, font_size=10, font_family=\"sans-serif\")\n",
    "nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=5)\n",
    "\n",
    "ax = plt.gca()\n",
    "ax.margins(0.08)\n",
    "plt.axis(\"off\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}