import json
from typing import List, Optional
# OpenTelemetry is an optional dependency: when it is missing we degrade to
# no-op tracing (HAS_OTEL=False, tracer=None) instead of failing the import.
try:
    from opentelemetry import trace
    tracer = trace.get_tracer(__name__)
    HAS_OTEL = True
except ImportError:
    tracer = None
    HAS_OTEL = False
from briefcase.validation.extractors import Reference
from briefcase.validation.errors import ValidationError, ValidationErrorCode
class SemanticValidator:
    """Semantic validation layer: checks prompt references against lakeFS content.

    For each extracted ``Reference`` the current document is loaded from lakeFS
    and an LLM is asked whether the referenced content still exists in the same
    form. Confident "content changed" verdicts become warning-level
    ``ValidationError`` objects; everything else (missing documents, LLM
    failures, unparseable responses, low confidence) is skipped, because this
    layer is advisory and must never abort validation.
    """

    # LLM verdicts at or below this confidence are ignored as too uncertain.
    DEFAULT_CONFIDENCE_THRESHOLD = 0.7

    # Benign fallback verdict used whenever the LLM reply cannot be trusted.
    _DEFAULT_RESULT = {'exists': True, 'changes': None, 'confidence': 0.0}

    def __init__(
        self,
        llm_client,
        lakefs_client,
        repository: str,
        branch: str = "main",
        confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD
    ):
        """
        Args:
            llm_client: Client exposing ``complete(prompt) -> str``.
            lakefs_client: Client exposing
                ``read_object(repository=..., branch=..., path=...) -> str``.
            repository: lakeFS repository to read referenced documents from.
            branch: lakeFS branch to read from (defaults to ``"main"``).
            confidence_threshold: Minimum LLM confidence required before a
                "content changed" verdict is reported. Defaults to 0.7,
                preserving the previously hard-coded behaviour.
        """
        self.llm = llm_client
        self.lakefs = lakefs_client
        self.repository = repository
        self.branch = branch
        self.confidence_threshold = confidence_threshold

    def validate_semantic(
        self,
        prompt: str,
        references: List[Reference]
    ) -> List[ValidationError]:
        """Validate all references; returns a (possibly empty) list of errors.

        Wraps the check in an OpenTelemetry span when tracing is available.
        """
        if HAS_OTEL and tracer:
            with tracer.start_as_current_span("validation.semantic_check") as span:
                return self._validate_semantic_with_telemetry(prompt, references, span)
        return self._validate_semantic_internal(prompt, references)

    def _validate_semantic_with_telemetry(
        self,
        prompt: str,
        references: List[Reference],
        span
    ) -> List[ValidationError]:
        """Run validation while annotating the active OTel span."""
        span.set_attribute("validation.semantic.enabled", True)
        # Record which model produced the verdicts, when the client exposes it.
        if hasattr(self.llm, 'model_name'):
            span.set_attribute("validation.semantic.model", self.llm.model_name)
        return self._validate_semantic_internal(prompt, references)

    def _validate_semantic_internal(
        self,
        prompt: str,
        references: List[Reference]
    ) -> List[ValidationError]:
        """Check each reference against its current document content."""
        errors = []
        for ref in references:
            content = self._load_reference_content(ref)
            if content is None:
                # Document unavailable -> nothing to judge; skip, don't error.
                continue
            error = self._check_structure(prompt, ref, content)
            if error:
                errors.append(error)
        return errors

    def _load_reference_content(self, ref: Reference) -> Optional[str]:
        """Read the referenced object from lakeFS; ``None`` on any read failure.

        Best-effort by design: a missing or unreadable object is not itself a
        semantic validation error.
        """
        try:
            return self.lakefs.read_object(
                repository=self.repository,
                branch=self.branch,
                path=ref.path
            )
        except Exception:
            return None

    def _check_structure(
        self,
        prompt: str,
        ref: Reference,
        content: str
    ) -> Optional[ValidationError]:
        """Ask the LLM whether the reference still matches the document."""
        if HAS_OTEL and tracer:
            with tracer.start_as_current_span("validation.semantic.structure_check") as span:
                return self._check_structure_with_telemetry(prompt, ref, content, span)
        return self._check_structure_internal(prompt, ref, content)

    def _check_structure_with_telemetry(
        self,
        prompt: str,
        ref: Reference,
        content: str,
        span
    ) -> Optional[ValidationError]:
        """Structure check that records the LLM's confidence on the span."""
        error = self._check_structure_internal(prompt, ref, content)
        if error and error.metadata and 'confidence' in error.metadata:
            span.set_attribute("validation.semantic.confidence", error.metadata['confidence'])
        return error

    def _check_structure_internal(
        self,
        prompt: str,
        ref: Reference,
        content: str
    ) -> Optional[ValidationError]:
        """Return a warning-level error when the LLM confidently reports drift.

        Returns ``None`` when the content still exists, the LLM is not
        confident enough, or the LLM call/parse fails (advisory layer:
        failures must never abort validation).
        """
        validation_prompt = f"""
A prompt references: "{ref.text}"
Current document content (first 2000 characters):
{content[:2000]}...
Question: Does the referenced content still exist in the same form?
Are there any structural changes (e.g., section renumbered, content moved)?
Answer with JSON:
{{
"exists": true/false,
"changes": "description of changes" or null,
"confidence": 0.0-1.0
}}
"""
        try:
            response = self.llm.complete(validation_prompt)
            result = self._parse_llm_response(response)
            # Only report when the model says the content is gone AND is more
            # confident than the configured threshold.
            if not result['exists'] and result['confidence'] > self.confidence_threshold:
                return ValidationError(
                    code=ValidationErrorCode.REFERENCE_GONE,
                    message=f"Referenced content may have changed: {result['changes']}",
                    reference=ref.text,
                    severity="warning",
                    layer="semantic",
                    remediation="Review document structure changes",
                    metadata={'confidence': result['confidence']}
                )
        except Exception:
            # Best-effort: LLM/transport failures must not abort validation.
            pass
        return None

    def _parse_llm_response(self, response: str) -> dict:
        """Parse the LLM's JSON verdict into a normalized dict.

        Always returns a dict containing exactly the keys ``exists`` (bool),
        ``changes`` and ``confidence`` (float), so callers can index it
        without KeyError. Anything unparseable — invalid JSON, a non-object
        JSON value, a non-numeric confidence — degrades to the benign default
        (exists=True, confidence=0.0). Markdown code fences around the JSON
        (a common LLM quirk) are stripped before parsing.
        """
        text = response.strip()
        if text.startswith("```"):
            # Strip ```json ... ``` or ``` ... ``` fencing.
            text = text[3:]
            if text.startswith("json"):
                text = text[4:]
            if text.endswith("```"):
                text = text[:-3]
            text = text.strip()
        try:
            parsed = json.loads(text)
        except Exception:
            return dict(self._DEFAULT_RESULT)
        if not isinstance(parsed, dict):
            return dict(self._DEFAULT_RESULT)
        result = dict(self._DEFAULT_RESULT)
        result['exists'] = bool(parsed.get('exists', True))
        result['changes'] = parsed.get('changes')
        try:
            result['confidence'] = float(parsed.get('confidence', 0.0))
        except (TypeError, ValueError):
            result['confidence'] = 0.0
        return result