from __future__ import annotations
import numpy as np
from sentence_transformers import SentenceTransformer
from pyvicinity import DistanceMetric, HNSWIndex
# Demo corpus: 25 short factual sentences, five per theme, used as the
# document set for the HNSW nearest-neighbor search below.
CORPUS = [
# animals
"Border collies are widely considered the smartest dog breed.",
"House cats descend from African wildcats domesticated ~10,000 years ago.",
"Octopuses can solve puzzles and open childproof jars.",
"Honeybees communicate flower locations via the waggle dance.",
"Crows pass tools and grudges across generations.",
# food & drink
"Sourdough bread relies on wild yeast and lactic-acid bacteria.",
"Espresso is brewed under ~9 bar of pressure for ~25 seconds.",
"Tomatoes are botanically fruits but legally vegetables in the US.",
"Miso paste ferments soybeans with the koji mold Aspergillus oryzae.",
"Saffron is harvested by hand from crocus flowers.",
# space
"The Voyager 1 probe entered interstellar space in 2012.",
"A neutron star can spin hundreds of times per second.",
"Jupiter's Great Red Spot has been raging for at least 400 years.",
"The Sun loses 4 million tons of mass per second to fusion.",
"Saturn would float if you could find a bathtub big enough.",
# computing
"Rust's borrow checker prevents data races at compile time.",
"Python's GIL serializes bytecode execution per interpreter.",
"HNSW builds a hierarchical small-world graph for ANN search.",
"Vector databases store dense embeddings indexed by ANN.",
"Trusted publishing on PyPI removes long-lived API tokens.",
# geography
"The Mariana Trench reaches almost 11 km below sea level.",
"Antarctica is the largest desert on Earth by area.",
"Iceland sits on the Mid-Atlantic Ridge and is splitting in two.",
"Mount Everest grows ~4 mm taller each year.",
]
def main() -> None:
    """Embed the demo corpus, index it with cosine HNSW, and answer sample queries.

    Prints each query followed by its top-3 nearest corpus sentences with
    their distances.
    """
    print("loading model (all-MiniLM-L6-v2)...")
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Encode the corpus; float32 is the dtype the index consumes.
    vectors = encoder.encode(CORPUS, convert_to_numpy=True).astype(np.float32)
    n, dim = vectors.shape
    print(f"embedded {n} sentences -> dim={dim}")

    # Cosine metric with auto-normalization; fixed seed makes graph
    # construction reproducible across runs.
    index = HNSWIndex(dim=dim, metric=DistanceMetric.Cosine, auto_normalize=True, seed=0)
    index.add_items(vectors)
    index.build()

    queries = [
        "Which animal is unusually clever?",
        "How do you brew good coffee?",
        "Tell me something about deep oceans.",
        "What's the deal with HNSW?",
    ]
    query_matrix = encoder.encode(queries, convert_to_numpy=True).astype(np.float32)

    for q, query_vec in zip(queries, query_matrix):
        print(f"\n>> {q}")
        neighbor_ids, neighbor_dists = index.search(query_vec, k=3)
        hits = zip(neighbor_ids.tolist(), neighbor_dists.tolist())
        for rank, (idx, dist) in enumerate(hits, start=1):
            print(f" {rank}. ({dist:+.3f}) {CORPUS[idx]}")
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == "__main__":
    main()