import time
from simstring_rust.database import HashDb
from simstring_rust.errors import SearchError
from simstring_rust.extractors import CharacterNgrams, CustomExtractor
from simstring_rust.measures import Cosine
from simstring_rust.searcher import Searcher
def main():
    """Run an end-to-end demonstration of the simstring_rust Python API.

    The demo prints its progress to stdout and walks through four stages:

    1. Build a ``CharacterNgrams`` extractor, a ``CustomExtractor`` wrapping
       a small Python class, and a ``Cosine`` measure.
    2. Insert a small corpus into a ``HashDb`` and report the elapsed time.
    3. Run a ranked and an unranked search for a single query.
    4. Run a search with an alpha value the demo labels invalid (1.1) and
       report the ``SearchError`` if one is raised.
    """
    print("--- 1. Setting up components ---")
    extractor = CharacterNgrams(n=2, endmarker="$")
    print("Using extractor: CharacterNgrams(n=2, endmarker='$')")
    sample_embedding = extractor.apply("Some text")
    print(f"Sample embedding for 'Some text': {sample_embedding}")

    class LowerBigrams:
        """Python-side extractor that emits lowercase adjacent-word pairs."""

        def apply(self, text: str):
            """Return one ``"left|right"`` string per adjacent token pair.

            Tokens are produced by lowercasing *text* and splitting on
            whitespace; fewer than two tokens yields an empty list.
            """
            tokens = text.lower().split()
            return [f"{left}|{right}" for left, right in zip(tokens, tokens[1:])]

    custom_extractor = CustomExtractor(LowerBigrams())
    custom_embedding = custom_extractor.apply("Some Custom Text")
    print(f"Custom extractor embedding: {custom_embedding}")

    measure = Cosine()
    print("Using measure: Cosine()")

    print("\n--- 2. Indexing strings ---")
    db = HashDb(extractor)
    corpus = ["foo", "bar", "fooo", "foooobar", "apple", "apply", "apples"]
    print(f"Corpus: {corpus}")

    # perf_counter() is the clock intended for measuring short intervals;
    # unlike time.time() it is unaffected by system clock adjustments.
    start_time = time.perf_counter()
    for item in corpus:
        db.insert(item)
    elapsed_ms = (time.perf_counter() - start_time) * 1000
    print(f"Indexing {len(db)} strings took {elapsed_ms:.2f} ms.")

    print("\n--- 3. Performing searches ---")
    searcher = Searcher(db, measure)
    query = "apple"
    alpha = 0.8

    print(f"\nRanked search for '{query}' with alpha >= {alpha}:")
    ranked_results = searcher.ranked_search(query, alpha)
    if not ranked_results:
        print(" No matches found.")
    else:
        for item, score in ranked_results:
            print(f" - Match: '{item}', Score: {score:.4f}")

    print(f"\nUnranked search for '{query}' with alpha >= {alpha}:")
    unranked_results = searcher.search(query, alpha)
    print(f" - Matches: {unranked_results}")

    print("\n--- 4. Testing error handling ---")
    invalid_alpha = 1.1
    print(f"Attempting search with invalid alpha = {invalid_alpha}...")
    # Keep the try body to the single call expected to raise, so an
    # unrelated failure is never mistaken for the expected SearchError.
    try:
        searcher.search("test", invalid_alpha)
    except SearchError as e:
        print(f" Successfully caught expected error: {e}")
# Run the demo only when this file is executed as a script, so importing
# the module does not trigger indexing, searching, or any printing.
if __name__ == "__main__":
    main()