simstring_rust 0.3.4

A native Rust implementation of the SimString algorithm
Documentation
import json
import time
from pathlib import Path
from statistics import mean, stdev
from typing import Callable

from simstring_rust.database import HashDb
from simstring_rust.extractors import CharacterNgrams
from simstring_rust.measures import Cosine
from simstring_rust.searcher import Searcher


def create_database(ngrams_size: int) -> HashDb:
    """Construct a new ``HashDb`` driven by a character n-gram extractor.

    Args:
        ngrams_size: Passed as ``n`` to ``CharacterNgrams``.

    Returns:
        A ``HashDb`` built from a ``CharacterNgrams`` extractor that uses a
        single space as its ``endmarker``.
    """
    return HashDb(CharacterNgrams(n=ngrams_size, endmarker=" "))


def measure_time(func: Callable) -> float:
    """Invoke *func* once with no arguments and report how long it took.

    Args:
        func: Zero-argument callable to time; its return value is discarded.

    Returns:
        The difference between two ``time.perf_counter()`` readings taken
        immediately before and after the call, in fractional seconds.
    """
    began = time.perf_counter()
    func()
    return time.perf_counter() - began


def load_company_names() -> list[str]:
    """Read the benchmark corpus of company names.

    The file is looked up at ``benches/data/company_names.txt`` relative to
    the current working directory, so the script must be launched from the
    directory that contains ``benches``.

    Returns:
        One entry per line of the file, with leading and trailing whitespace
        stripped. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``
            when run from another directory).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    file_path = Path.cwd() / "benches" / "data" / "company_names.txt"
    # Pin the encoding: without it open() falls back to the locale's
    # preferred encoding, so non-ASCII names would decode differently (or
    # fail) depending on the platform running the benchmark.
    with open(file_path, encoding="utf-8") as f:
        return [line.strip() for line in f]


def bench_insert(results: list) -> None:
    """Benchmark inserting every company name into a freshly created database.

    For each n-gram size in ``[2, 3, 4]`` one timed pass creates a new
    database via ``create_database`` and inserts all company names into it.
    Passes are repeated until 100 samples are collected or the 20-second
    budget is spent, whichever comes first. The budget is only checked
    between passes, so at least one pass always runs.

    Args:
        results: List that receives one summary dict per n-gram size. It is
            mutated in place. ``stats.mean`` and ``stats.stddev`` are the
            per-pass durations converted from seconds to milliseconds;
            ``stddev`` is 0 when only a single sample was collected.
    """
    company_names = load_company_names()
    iterations = 100
    measurement_time = 20  # time budget per n-gram size, in seconds

    for ngram_size in [2, 3, 4]:

        # Defined once per n-gram size rather than once per pass; it is only
        # ever called while ``ngram_size`` still holds this loop's value.
        def benchmark_iteration():
            db = create_database(ngram_size)
            for name in company_names:
                db.insert(name)

        measurements = []
        # Track the budget on the monotonic clock: time.time() is wall-clock
        # time and can jump (NTP, manual adjustment), which would cut the
        # sampling short or stretch it.
        deadline = time.monotonic() + measurement_time
        while time.monotonic() < deadline and len(measurements) < iterations:
            measurements.append(measure_time(benchmark_iteration))

        mean_time = mean(measurements)
        # stdev() needs at least two samples.
        stddev_val = stdev(measurements) if len(measurements) > 1 else 0

        results.append(
            {
                "language": "python",
                "backend": "simstring-rust (python bindings)",
                "benchmark": "insert",
                "parameters": {"ngram_size": ngram_size},
                "stats": {
                    "mean": mean_time * 1000,
                    "stddev": stddev_val * 1000,
                    "iterations": len(measurements),
                },
            }
        )


def bench_search(results: list) -> None:
    """Benchmark similarity search over a database of all company names.

    For each n-gram size in ``[2, 3, 4]`` a database is populated once with
    every company name and wrapped in a ``Searcher`` using the ``Cosine``
    measure. Then, for each threshold in ``[0.6, 0.7, 0.8, 0.9]``, one timed
    pass searches for each of the first 100 company names. Passes are
    repeated until 100 samples are collected or the 20-second budget is
    spent, whichever comes first. The budget is only checked between passes,
    so at least one pass always runs.

    Args:
        results: List that receives one summary dict per
            (n-gram size, threshold) pair. It is mutated in place.
            ``stats.mean`` and ``stats.stddev`` are the per-pass durations
            converted from seconds to milliseconds; ``stddev`` is 0 when
            only a single sample was collected.
    """
    company_names = load_company_names()
    search_terms = company_names[:100]
    iterations = 100
    measurement_time = 20  # time budget per configuration, in seconds
    similarity_thresholds = [0.6, 0.7, 0.8, 0.9]

    for ngram_size in [2, 3, 4]:
        # Database construction is setup work and is not part of the timing.
        db = create_database(ngram_size)
        for name in company_names:
            db.insert(name)

        searcher = Searcher(db, Cosine())

        for threshold in similarity_thresholds:

            # Defined once per configuration rather than once per pass; it is
            # only called while ``searcher`` and ``threshold`` still hold
            # this configuration's values.
            def benchmark_iteration():
                for term in search_terms:
                    searcher.search(term, threshold)

            measurements = []
            # Track the budget on the monotonic clock: time.time() is
            # wall-clock time and can jump (NTP, manual adjustment), which
            # would cut the sampling short or stretch it.
            deadline = time.monotonic() + measurement_time
            while time.monotonic() < deadline and len(measurements) < iterations:
                measurements.append(measure_time(benchmark_iteration))

            mean_time = mean(measurements)
            # stdev() needs at least two samples.
            stddev_val = stdev(measurements) if len(measurements) > 1 else 0

            results.append(
                {
                    "language": "python",
                    "backend": "simstring-rust (python bindings)",
                    "benchmark": "search",
                    "parameters": {"ngram_size": ngram_size, "threshold": threshold},
                    "stats": {
                        "mean": mean_time * 1000,
                        "stddev": stddev_val * 1000,
                        "iterations": len(measurements),
                    },
                }
            )


if __name__ == "__main__":
    # Run the insert benchmarks, then the search benchmarks, collecting
    # every summary into one list that is printed as indented JSON on stdout.
    results = []
    for run_benchmark in (bench_insert, bench_search):
        run_benchmark(results)
    report = json.dumps(results, indent=2)
    print(report)