import os
from axterminator.vlm import configure_vlm, detect_element_visual
def main():
    """Walk through VLM backend selection and a sample visual lookup.

    The local ``mlx`` backend is tried first. If configuring it raises
    ``ImportError``, the first cloud backend whose API-key environment
    variable is set (Anthropic, then OpenAI, then Gemini) is configured
    instead. When no backend can be configured, a hint is printed and the
    function returns without running the detection demo.

    After a backend is configured, ``detect_element_visual`` is called with
    placeholder bytes and a 1920x1080 frame size, the outcome is reported,
    and a description of the self-healing strategy order is printed.
    """
    print("=== VLM Backend Configuration ===\n")

    # (environment variable, backend name, confirmation message), listed in
    # the order the fallbacks are attempted.
    cloud_fallbacks = (
        ("ANTHROPIC_API_KEY", "anthropic", "Anthropic backend configured"),
        ("OPENAI_API_KEY", "openai", "OpenAI backend configured"),
        ("GOOGLE_API_KEY", "gemini", "Gemini backend configured"),
    )

    try:
        configure_vlm(backend="mlx")
    except ImportError:
        print("MLX not installed. Trying cloud backends...")
        for env_var, backend_name, confirmation in cloud_fallbacks:
            if os.environ.get(env_var):
                configure_vlm(backend=backend_name)
                print(confirmation)
                break
        else:
            # The loop finished without a break: no API key was set.
            print("No VLM backend available.")
            print("Install mlx-vlm or set ANTHROPIC_API_KEY/OPENAI_API_KEY/GOOGLE_API_KEY")
            return
    else:
        print("MLX backend configured (local, ~50ms inference)")

    print("\n=== Visual Element Detection ===\n")

    # Placeholder payload only; a real caller would pass screenshot bytes.
    placeholder_image = b"PNG image data would go here"
    location = detect_element_visual(
        image_data=placeholder_image,
        description="Save button in the toolbar",
        image_width=1920,
        image_height=1080,
    )
    if not location:
        print("Element not found in image")
    else:
        x, y = location
        print(f"Element found at coordinates: ({x}, {y})")

    print("\n=== How Visual Healing Works ===\n")
    # The text lines below sit at column 0 on purpose: any leading
    # whitespace inside the literal would be printed verbatim.
    print("""
When a traditional locator fails, axterminator's self-healing system
tries these strategies in order:
1. data_testid - Most reliable, dev-specified
2. aria_label - Accessibility label
3. identifier - macOS accessibility identifier
4. title - Element title text
5. xpath - Structural path in accessibility tree
6. position - Relative position in parent
7. visual_vlm - Visual detection using VLM (this module!)
The visual_vlm strategy:
- Takes a screenshot of the application
- Sends it to the configured VLM with the element description
- VLM returns bounding box coordinates
- axterminator clicks at the center of the detected element
This enables testing apps with dynamic UIs where traditional
locators break frequently.
""")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()