Skip to main content

semver_analyzer_core/
traits.rs

//! Trait definitions for language-pluggable analysis.
//!
//! Adding a new language means implementing these traits. The orchestrator,
//! diff engine, and output format are language-agnostic and reused unchanged.
//!
//! ## Trait ownership
//!
//! | Trait | Used by | Per-language? |
//! |---|---|---|
//! | `Language` | TD + BU | Yes (unified analysis pipeline) |
//! | `BehaviorAnalyzer` | BU | No (language-agnostic, LLM-based) |
12
13use crate::types::{
14    ApiSurface, BehavioralChangeKind, BodyAnalysisResult, BreakingVerdict, Caller, ChangedFunction,
15    EvidenceType, ExpectedChild, FunctionSpec, Reference, StructuralChange, Symbol, SymbolKind,
16    TestDiff, TestFile, Visibility,
17};
18use anyhow::Result;
19use serde::{de::DeserializeOwned, Serialize};
20use std::collections::{BTreeSet, HashMap};
21use std::fmt::Debug;
22use std::path::{Path, PathBuf};
23use std::sync::Arc;
24
// ── BU Traits (language-agnostic, LLM-based) ───────────────────────────
26
27/// Analyze behavioral changes via LLM-based spec inference.
28///
29/// Language-agnostic: the function body and signature are passed as
30/// strings. The LLM generates template-constrained `FunctionSpec`
31/// objects, which are compared mechanically (Tier 1) or via LLM
32/// fallback (Tier 2).
33///
34/// Implementations may use:
35/// - Direct LLM API calls (OpenAI, Anthropic, etc.)
36/// - `goose run --no-session -q -t "..."`
37/// - `opencode run "..."`
38/// - Any other agent CLI via `--llm-command`
39pub trait BehaviorAnalyzer {
40    /// Infer a function's behavioral spec from its body alone.
41    ///
42    /// Lower confidence than `infer_spec_with_test_context` because
43    /// the LLM has no grounded examples of expected behavior.
44    fn infer_spec(&self, function_body: &str, signature: &str) -> Result<FunctionSpec>;
45
46    /// Infer a spec with additional context from the test file.
47    ///
48    /// The test assertions give the LLM concrete examples of expected
49    /// behavior — reducing hallucination compared to body-only inference.
50    fn infer_spec_with_test_context(
51        &self,
52        function_body: &str,
53        signature: &str,
54        test_context: &TestDiff,
55    ) -> Result<FunctionSpec>;
56
57    /// Compare two specs and determine if the change is breaking.
58    ///
59    /// Uses a two-tier approach:
60    /// - Tier 1: Structural comparison on `FunctionSpec` fields
61    /// - Tier 2: LLM fallback for `notes` diffs and ambiguous matches
62    fn specs_are_breaking(&self, old: &FunctionSpec, new: &FunctionSpec)
63        -> Result<BreakingVerdict>;
64
65    /// Check whether a caller propagates a behavioral break from a callee.
66    ///
67    /// Given a caller's body/signature and evidence of a behavioral
68    /// break in a callee it invokes, determine whether the caller's
69    /// observable behavior actually changes. The caller might absorb
70    /// the break by:
71    ///   - Ignoring the callee's return value
72    ///   - Catching and handling the callee's new error behavior
73    ///   - Only invoking the callee on code paths that don't trigger
74    ///     the behavioral change
75    ///   - Applying its own validation that masks the change
76    ///
77    /// Returns true if the break propagates (caller IS affected),
78    /// false if the caller absorbs it (NOT affected).
79    fn check_propagation(
80        &self,
81        caller_body: &str,
82        caller_signature: &str,
83        callee_name: &str,
84        evidence_description: &str,
85    ) -> Result<bool>;
86}
87
// ── Language abstraction traits (multi-language architecture) ────────────
//
// These traits define the integration point for multi-language support.
// See `design/01-traits.md` for detailed documentation.
92
93/// Language-specific semantic rules consumed by the diff engine.
94///
95/// These encode the places where "is this breaking?" or "are these related?"
96/// differ fundamentally by language. The diff engine calls these methods
97/// instead of hardcoding language-specific rules.
98pub trait LanguageSemantics<M: Default + Clone + PartialEq = ()> {
99    /// Is adding this member to this container a breaking change?
100    ///
101    /// This is the single rule that differs most fundamentally by language:
102    /// - TypeScript: breaking only if the member is required (non-optional).
103    /// - Go: ALWAYS breaking for interfaces (all implementors must add it).
104    /// - Java: breaking for abstract methods, not for default methods.
105    /// - C#: breaking for abstract members on interfaces.
106    /// - Python: breaking for abstract methods on Protocol/ABC.
107    fn is_member_addition_breaking(&self, container: &Symbol<M>, member: &Symbol<M>) -> bool;
108
109    /// Are these two symbols part of the same logical family/group?
110    ///
111    /// Used to scope migration detection. When a symbol is removed, only
112    /// symbols in the same family are considered as potential absorption targets.
113    ///
114    /// - TypeScript/React: same component directory
115    /// - Go: same package
116    /// - Java: same package
117    /// - Python: same module
118    fn same_family(&self, a: &Symbol<M>, b: &Symbol<M>) -> bool;
119
120    /// Are these two symbols the same concept, possibly at different paths?
121    ///
122    /// When true, migration detection does a full member comparison (all members,
123    /// not just newly-added ones) because the candidate is assumed to be a direct
124    /// replacement for the removed symbol.
125    ///
126    /// Resolves companion types linked by naming convention:
127    /// - TypeScript: `Button` and `ButtonProps` (component + its props interface)
128    /// - Go: `Client` and `ClientOptions` (struct + its configuration)
129    /// - Java: `UserService` and `UserServiceImpl` (interface + implementation)
130    fn same_identity(&self, a: &Symbol<M>, b: &Symbol<M>) -> bool;
131
132    /// Numeric rank for a visibility level (higher = more visible).
133    ///
134    /// Used to determine if visibility was reduced (breaking) or increased.
135    /// The ordering differs by language:
136    /// - TypeScript: Private(0) < Internal(1) < Protected(1) < Public(2) < Exported(3)
137    /// - Java: Private(0) < PackagePrivate(1) < Protected(2) < Public(3)
138    /// - Go: Internal(0) < Exported(1)
139    fn visibility_rank(&self, v: Visibility) -> u8;
140
141    /// Parse union/constrained type values for fine-grained diffing.
142    ///
143    /// TypeScript: parse `'primary' | 'secondary' | 'danger'`.
144    /// Python: parse `Literal['a', 'b']`.
145    /// Most other languages return `None`.
146    fn parse_union_values(&self, _type_str: &str) -> Option<BTreeSet<String>> {
147        None
148    }
149
150    /// Whether a return type string represents an async wrapper.
151    ///
152    /// Used by the diff engine to detect sync→async and async→sync changes,
153    /// which are always breaking regardless of the inner type.
154    ///
155    /// TypeScript/JavaScript: `Promise<T>`
156    /// Python: `Coroutine[...]`, `Awaitable[...]`
157    /// Java: `CompletableFuture<T>`, `Future<T>`
158    /// Go: returns `false` (async handled via goroutines, not return types)
159    fn is_async_wrapper(&self, _type_str: &str) -> bool {
160        false
161    }
162
163    /// Format an import/use statement change hint for migration descriptions.
164    ///
165    /// When a symbol is renamed across packages, the diff engine includes
166    /// import guidance so consumers know to update their import paths.
167    ///
168    /// TypeScript: `"replace \`import { X } from 'old-pkg'\` with \`import { X } from 'new-pkg'\`"`
169    /// Go: `"replace \`\"old/pkg\"\` with \`\"new/pkg\"\`"`
170    /// Default: generic format without language-specific syntax.
171    fn format_import_change(&self, symbol: &str, old_path: &str, new_path: &str) -> String {
172        format!(
173            "replace import of `{}` from `{}` with `{}`",
174            symbol, old_path, new_path,
175        )
176    }
177
178    /// Should this symbol be excluded from diff analysis?
179    ///
180    /// Called by the diff engine to filter out symbols that should not be
181    /// compared. The most common case is TypeScript's `export * from '...'`
182    /// star re-export directives.
183    ///
184    /// TypeScript: `sym.name == "*"` (star re-exports)
185    /// Default: `false` (all symbols are analyzed)
186    fn should_skip_symbol(&self, _sym: &Symbol<M>) -> bool {
187        false
188    }
189
190    /// Human-readable label for members when building migration descriptions.
191    ///
192    /// TypeScript: `"props"` (component properties)
193    /// Go: `"fields"` (struct fields)
194    /// Default: `"members"`
195    fn member_label(&self) -> &'static str {
196        "members"
197    }
198
199    /// Extract a fallback key for rename matching from a symbol's metadata.
200    ///
201    /// When fingerprint-based rename detection fails, the diff engine uses
202    /// this method to extract an alternative matching key. For TypeScript
203    /// CSS tokens, this parses the resolved CSS value from the `.d.ts`
204    /// type annotation (e.g., the string `"#151515"` from a CSS variable).
205    ///
206    /// TypeScript: parses `["value"]: "..."` from the return type annotation
207    /// Default: `None` (no fallback key)
208    fn extract_rename_fallback_key(&self, _sym: &Symbol<M>) -> Option<String> {
209        None
210    }
211
212    /// Normalize a qualified name for relocation detection.
213    ///
214    /// Strips language-specific path segments that represent lifecycle
215    /// modifiers (e.g., TypeScript's `/deprecated/` and `/next/` directories).
216    /// Symbols with matching canonical names are detected as relocations
217    /// rather than separate removals and additions.
218    ///
219    /// TypeScript: strips `/deprecated/` and `/next/` segments
220    /// Default: returns the name unchanged
221    fn canonical_name_for_relocation(&self, qualified_name: &str) -> String {
222        qualified_name.to_string()
223    }
224
225    /// Classify a relocation based on old and new qualified names.
226    ///
227    /// Returns a human-readable label describing the relocation direction
228    /// (e.g., "moved to deprecated exports", "promoted from next to stable").
229    /// Returns `None` for generic relocations with no special classification.
230    ///
231    /// TypeScript: detects `/deprecated/` and `/next/` transitions
232    /// Default: `None` (no classification)
233    fn classify_relocation(&self, _old_qname: &str, _new_qname: &str) -> Option<&'static str> {
234        None
235    }
236
237    /// Derive the import subpath for a symbol, used in migration descriptions.
238    ///
239    /// When a symbol moves between submodules (e.g., from main exports to
240    /// `/deprecated/` exports), the import path changes. This method derives
241    /// the effective import path from the package name and qualified name.
242    ///
243    /// TypeScript: appends `/deprecated` or `/next` based on qualified name
244    /// Default: returns the package name unchanged
245    fn derive_import_subpath(&self, package: Option<&str>, _qualified_name: &str) -> String {
246        package.unwrap_or("unknown").to_string()
247    }
248
249    /// Produce additional structural changes by diffing language-specific
250    /// metadata on two matched symbols.
251    ///
252    /// Called by the diff engine for each pair of symbols that matched by
253    /// qualified name. The default implementation returns no changes.
254    ///
255    /// TypeScript: could diff `rendered_components` or `css` metadata.
256    /// Default: empty (no language-specific metadata diffing)
257    fn diff_language_data(&self, _old: &Symbol<M>, _new: &Symbol<M>) -> Vec<StructuralChange> {
258        vec![]
259    }
260
261    /// Post-process the change list before returning from diff_surfaces.
262    ///
263    /// TypeScript: dedup default export changes.
264    /// Most languages: no-op.
265    fn post_process(&self, _changes: &mut Vec<StructuralChange>) {}
266
267    /// If this language supports component hierarchy inference (e.g., React,
268    /// Vue, Django templates), return the hierarchy semantics implementation.
269    ///
270    /// The orchestrator uses this to prepare data for LLM hierarchy inference.
271    /// The trait is NOT responsible for LLM calls or prompt construction.
272    fn hierarchy(&self) -> Option<&dyn HierarchySemantics<M>> {
273        None
274    }
275
276    /// If this language supports LLM-based rename inference (e.g., CSS
277    /// physical→logical property renames, interface rename mappings),
278    /// return the rename semantics implementation.
279    ///
280    /// The orchestrator uses this to prepare data for LLM rename inference.
281    /// The trait is NOT responsible for LLM calls or prompt construction.
282    fn renames(&self) -> Option<&dyn RenameSemantics> {
283        None
284    }
285
286    /// If this language has deterministic body-level analysis (e.g., JSX diff,
287    /// CSS variable scanning for TypeScript), return the body analysis
288    /// implementation.
289    ///
290    /// The orchestrator calls this during BU Phase 1 to detect behavioral
291    /// breaks from function body changes without LLM assistance.
292    fn body_analyzer(&self) -> Option<&dyn BodyAnalysisSemantics> {
293        None
294    }
295
296    /// Primitive type names for this language, used by the diff engine's
297    /// structural similarity comparison.
298    ///
299    /// When two types are compared for structural similarity (e.g., during
300    /// rename detection), types matching these names are classified as
301    /// primitives. Two primitives of different names are structurally similar
302    /// (both are scalars), whereas a primitive vs a reference type is not.
303    ///
304    /// Default: common cross-language primitives (string, number, boolean,
305    /// void, null). Languages should override to add their own (e.g.,
306    /// TypeScript adds `undefined`, `never`, `any`, `unknown`; Java adds
307    /// `int`, `long`, `double`, `float`, `char`, `byte`, `short`).
308    fn primitive_type_names(&self) -> &[&str] {
309        &["string", "number", "boolean", "void", "null"]
310    }
311}
312
// ── Optional capability traits ──────────────────────────────────────────
//
// These traits represent optional analysis capabilities that some languages
// support. They are accessed via optional accessors on `LanguageSemantics`.
// The orchestrator checks for their presence and conditionally runs the
// corresponding analysis steps.
319
320/// Deterministic data preparation for component hierarchy inference.
321///
322/// Languages with component composition models (React, Vue, Django, etc.)
323/// implement this to tell the orchestrator what files belong to a component
324/// family and how families relate to each other.
325///
326/// The orchestrator uses `same_family` for symbol grouping, then these
327/// methods for data preparation. The LLM call itself stays in the orchestrator.
328///
329/// TODO: Reconsider — the methods that take repo/git_ref currently require
330/// language impls to know about git. A future refactor should have the
331/// orchestrator own all git plumbing and pass content to pure-logic methods.
332pub trait HierarchySemantics<M: Default + Clone + PartialEq = ()> {
333    /// Get file paths belonging to a component family directory.
334    ///
335    /// Given a family name (e.g., "Dropdown"), returns relative paths to
336    /// all source files in that family. Used to read content for the LLM prompt.
337    fn family_source_paths(&self, repo: &Path, git_ref: &str, family_name: &str) -> Vec<String>;
338
339    /// Get a human-readable family name from a group of symbols.
340    ///
341    /// TypeScript/React: extracts the component directory name
342    /// (e.g., "Dropdown" from "packages/react-core/src/components/Dropdown/...")
343    fn family_name_from_symbols(&self, symbols: &[&Symbol<M>]) -> Option<String>;
344
345    /// Detect cross-family relationships (e.g., React context imports).
346    ///
347    /// Returns pairs of (consumer_family, provider_family, relationship_name).
348    /// Used to include related component signatures in the LLM prompt.
349    fn cross_family_relationships(
350        &self,
351        repo: &Path,
352        git_ref: &str,
353    ) -> Vec<(String, String, String)>;
354
355    /// Read related component signatures for cross-family context.
356    ///
357    /// Given a provider family and the context/relationship names that
358    /// link it to a consumer, returns relevant source content to include
359    /// in the LLM prompt.
360    fn related_family_content(
361        &self,
362        repo: &Path,
363        git_ref: &str,
364        family_name: &str,
365        relationship_names: &[String],
366    ) -> Option<String>;
367
368    /// Whether a symbol is a candidate for hierarchy inference.
369    ///
370    /// The orchestrator calls this to filter symbols when grouping into
371    /// families. Only candidates are counted toward the minimum threshold.
372    ///
373    /// TypeScript/React: PascalCase Variable/Class/Function/Constant
374    /// (React components are PascalCase functions or classes).
375    fn is_hierarchy_candidate(&self, sym: &Symbol<M>) -> bool;
376
377    /// Minimum number of exported types for a family to qualify
378    /// for hierarchy inference. Default: 2.
379    fn min_components_for_hierarchy(&self) -> usize {
380        2
381    }
382
383    /// Compute component hierarchy deterministically.
384    ///
385    /// The default implementation returns an empty map. Language implementations
386    /// that support component hierarchy (e.g., TypeScript/React) override this
387    /// with the full algorithm using language-specific metadata.
388    ///
389    /// The method works on the NEW surface and structural changes. It returns
390    /// the expected hierarchy for the new version.
391    fn compute_deterministic_hierarchy(
392        &self,
393        new_surface: &ApiSurface<M>,
394        structural_changes: &[StructuralChange],
395    ) -> HashMap<String, HashMap<String, Vec<ExpectedChild>>> {
396        let _ = (new_surface, structural_changes);
397        HashMap::new()
398    }
399}
400
/// Deterministic data preparation for LLM-based rename inference.
///
/// Languages that benefit from LLM-detected rename patterns (e.g., CSS
/// physical→logical property renames, interface rename mappings) implement
/// this to prepare the data for the LLM call.
///
/// The orchestrator calls these methods to build LLM inputs. The LLM call
/// itself and prompt construction stay in the orchestrator/LLM crate.
pub trait RenameSemantics {
    /// Sample removed constants for rename pattern inference.
    ///
    /// Default implementation returns the first 30 entries of `removed`, in
    /// their original order (fewer if the slice is shorter), and ignores
    /// `_added`. Language impls can prioritize certain suffixes/patterns for
    /// better LLM pattern discovery.
    fn sample_removed_constants<'a>(
        &self,
        removed: &[&'a str],
        _added: &[&'a str],
    ) -> Vec<&'a str> {
        removed.iter().take(30).copied().collect()
    }

    /// Sample added constants for rename pattern inference.
    ///
    /// Default implementation returns the first 30 entries of `added`, in
    /// their original order (fewer if the slice is shorter), and ignores
    /// `_removed`.
    fn sample_added_constants<'a>(&self, _removed: &[&'a str], added: &[&'a str]) -> Vec<&'a str> {
        added.iter().take(30).copied().collect()
    }

    /// Minimum count of removed constants to trigger rename inference.
    /// Default: 50.
    fn min_removed_for_constant_inference(&self) -> usize {
        50
    }

    /// Minimum count of removed interfaces to trigger interface rename
    /// inference. Default: 2.
    fn min_removed_for_interface_inference(&self) -> usize {
        2
    }
}
441
442/// Deterministic body-level analysis for behavioral change detection.
443///
444/// Languages with framework-specific body patterns (e.g., JSX diff and CSS
445/// variable scanning for TypeScript/React) implement this to detect
446/// behavioral breaks from function body changes without LLM assistance.
447///
448/// The orchestrator calls `analyze_changed_body` during BU Phase 1 for each
449/// changed function that passes visibility filtering.
450///
451/// The `category_label` field on results uses the serde serialization format
452/// of the language's `Category` type. At the call site, the orchestrator
453/// deserializes this into `L::Category` via serde.
454pub trait BodyAnalysisSemantics {
455    /// Run deterministic analysis on a changed function's body.
456    ///
457    /// Returns a list of (description, category_label) pairs representing
458    /// behavioral breaks detected. The category_label is the string form
459    /// of the language's Category enum (e.g., "dom_structure" for
460    /// `TsCategory::DomStructure`).
461    ///
462    /// TypeScript: runs JSX diff + CSS variable scanning.
463    /// Other languages: may check annotation changes, decorator changes, etc.
464    fn analyze_changed_body(
465        &self,
466        old_body: &str,
467        new_body: &str,
468        func_name: &str,
469        file_path: &str,
470    ) -> Vec<BodyAnalysisResult>;
471}
472
473/// Language-specific human-readable descriptions for changes.
474///
475/// Each language owns its messaging entirely -- there is no generic
476/// template in core. These descriptions are consumed by LLMs downstream,
477/// so language-appropriate terminology matters.
478pub trait MessageFormatter {
479    /// Produce a human-readable description for a structural change.
480    fn describe(&self, change: &StructuralChange) -> String;
481}
482
// ── Worktree sharing ─────────────────────────────────────────────────────
484
/// Opaque handle to a checked-out worktree.
///
/// Keeps the worktree alive as long as the handle exists. The worktree
/// is cleaned up when the last `Arc<dyn WorktreeAccess>` drops.
///
/// Language crates implement this on their worktree guard type. The
/// orchestrator holds `Arc<dyn WorktreeAccess>` to share worktrees
/// between TD and SD pipelines via `std::sync::mpsc::channel`; the
/// `Send + Sync + 'static` bounds are what allow the handle to cross
/// threads.
pub trait WorktreeAccess: Send + Sync + 'static {
    /// Filesystem path to the worktree directory.
    fn path(&self) -> &Path;
}
497
498/// Result of `extract_keeping_worktree`: API surface + optional worktree handle.
499pub type ExtractionWithWorktree<M> = (ApiSurface<M>, Option<Arc<dyn WorktreeAccess>>);
500
// ── Extended analysis parameters ─────────────────────────────────────────
502
/// Parameters for `Language::run_extended_analysis`.
///
/// Bundles the repo/ref context with data computed by the orchestrator
/// so that language implementations can attach them to their extensions
/// without the orchestrator needing to know the concrete extension type.
///
/// The `dep_dir`, `removed_dep_components`, and `dep_repo_packages` fields
/// support languages with separate dependency repositories (e.g., a CSS
/// design system repo, a proto definitions repo). Languages without such
/// dependencies ignore these fields.
#[derive(Debug, Clone)]
pub struct ExtendedAnalysisParams {
    /// Path to the primary repository being analyzed.
    pub repo: PathBuf,
    /// Git ref for the old (from) version.
    pub from_ref: String,
    /// Git ref for the new (to) version.
    pub to_ref: String,
    /// Optional path to a dependency resource repository (already checked
    /// out and built). For TypeScript/CSS: the PatternFly CSS repo. For
    /// other languages: proto definitions, IDL files, etc.
    pub dep_dir: Option<PathBuf>,
    /// Component/module directories removed between old and new versions
    /// of the dependency repository. Computed by the orchestrator from
    /// directory-level diffing. For TypeScript/CSS: removed CSS component
    /// blocks (e.g., `["select", "chip"]`).
    pub removed_dep_components: Vec<String>,
    /// Dependency repo packages (name → version at new ref).
    /// Used to generate dep-update rules for packages outside the main
    /// analyzed monorepo.
    pub dep_repo_packages: HashMap<String, String>,

    /// Filesystem path to the from-ref worktree (if shared by TD).
    ///
    /// When set, the SD pipeline uses filesystem reads instead of
    /// `read_git_file` for the old version. Also enables `oxc_resolver`
    /// for robust import resolution (barrel files, package imports,
    /// tsconfig path aliases).
    pub from_worktree_path: Option<PathBuf>,

    /// Filesystem path to the to-ref worktree (if shared by TD).
    ///
    /// Same as `from_worktree_path` but for the new version.
    pub to_worktree_path: Option<PathBuf>,

    /// CSS classes where a naive version prefix swap (e.g., `pf-v5-` → `pf-v6-`)
    /// produces a class name that does not exist in the target CSS distribution.
    ///
    /// Each entry is `(old_class, dead_swapped_class)`. Used to generate rules
    /// that flag these dead classes and suppress the blind prefix swap fix.
    pub dead_css_classes_after_swap: Vec<(String, String)>,
}
555
// ── LLM category definitions ────────────────────────────────────────────
557
/// A behavioral change category definition for LLM prompts.
///
/// Each language provides a list of these to guide the LLM's output.
/// The `id` must match the serde name of the corresponding `Language::Category`
/// enum variant (e.g., `"dom_structure"` for `TsCategory::DomStructure`).
#[derive(Debug, Clone)]
pub struct LlmCategoryDefinition {
    /// Machine-readable identifier (e.g., `"dom_structure"`, `"annotation_change"`).
    /// Must match the serde serialization of `Language::Category` variants.
    pub id: String,
    /// Short human label (e.g., `"DOM/render changes"`, `"Annotation changes"`).
    pub label: String,
    /// Detailed description for the LLM prompt explaining what this category covers.
    pub description: String,
}
573
574/// The core language abstraction.
575///
576/// Composes `LanguageSemantics + MessageFormatter` and adds six associated
577/// types representing language-specific data flowing through the pipeline.
578///
579/// Code that only needs semantic rules can take `&dyn LanguageSemantics`
580/// (no generic parameter). Code that needs the associated types takes
581/// `L: Language`.
582///
583/// ## Konveyor rule generation
584///
585/// Konveyor rule generation is **not** part of this trait. Rule generation
586/// is language-specific and lives in each language crate (e.g.,
587/// `crates/ts/src/konveyor.rs` and `konveyor_v2.rs`). The shared rule
588/// types (`KonveyorRule`, `FixStrategy`) live in `crates/konveyor-core/`.
589/// If a second language needs full rule generation, consider introducing
590/// a `KonveyorGenerator<L>` trait.
591pub trait Language:
592    LanguageSemantics<Self::SymbolData> + MessageFormatter + Send + Sync + 'static
593{
594    /// Per-symbol metadata type carried in `Symbol<M>.language_data`.
595    ///
596    /// TypeScript: `TsSymbolData` (rendered components, CSS tokens).
597    /// Languages without per-symbol metadata: `()`.
598    type SymbolData: Debug
599        + Clone
600        + Default
601        + PartialEq
602        + Eq
603        + Serialize
604        + DeserializeOwned
605        + Send
606        + Sync;
607
608    /// Behavioral change categories for this language.
609    type Category: Debug + Clone + Serialize + DeserializeOwned + Eq + std::hash::Hash + Send + Sync;
610
611    /// Manifest change types for this language's package system.
612    type ManifestChangeType: Debug
613        + Clone
614        + Serialize
615        + DeserializeOwned
616        + Eq
617        + PartialEq
618        + Send
619        + Sync;
620
621    /// Evidence data carried on behavioral changes.
622    type Evidence: Debug + Clone + Serialize + DeserializeOwned + Send + Sync;
623
624    /// Language-specific report data.
625    type ReportData: Debug + Clone + Serialize + DeserializeOwned + Send + Sync;
626
627    /// Language-specific analysis extensions.
628    ///
629    /// Carries pipeline results that are specific to this language
630    /// (e.g., SD pipeline results, hierarchy deltas for TypeScript).
631    /// Replaces the concrete `sd_result` and `hierarchy_deltas` fields
632    /// that were previously on `AnalysisReport`/`AnalysisResult`.
633    ///
634    /// TypeScript: `TsAnalysisExtensions` (SD result, hierarchy deltas).
635    /// Languages without extended analysis: `EmptyExtensions`.
636    type AnalysisExtensions: Debug + Clone + Default + Serialize + DeserializeOwned + Send + Sync;
637
638    // ── Constants ────────────────────────────────────────────────────
639
640    /// Symbol kinds that represent type definitions eligible for rename inference.
641    /// TypeScript: `&[SymbolKind::Interface, SymbolKind::Class]`
642    /// Go: `&[SymbolKind::Struct, SymbolKind::Interface]`
643    const RENAMEABLE_SYMBOL_KINDS: &'static [SymbolKind];
644
645    /// Language identifier for serialization dispatch.
646    const NAME: &'static str;
647
648    /// Manifest file path(s) for this language's package system.
649    ///
650    /// TypeScript: `&["package.json"]`
651    /// Go: `&["go.mod"]`
652    /// Java: `&["pom.xml"]` or `&["build.gradle"]`
653    ///
654    /// TODO: Reconsider — the orchestrator currently reads these files via git
655    /// and passes content to `diff_manifest_content`. A future refactor should
656    /// unify all git plumbing in the orchestrator so language impls are pure
657    /// content processors.
658    const MANIFEST_FILES: &'static [&'static str];
659
660    /// Discover per-package manifest files in monorepos.
661    ///
662    /// Returns `(manifest_path, package_name)` pairs for sub-packages.
663    /// The orchestrator diffs each discovered manifest in addition to
664    /// the static `MANIFEST_FILES` and tags resulting changes with
665    /// `source_package`.
666    ///
667    /// Default: empty (non-monorepo projects or languages without workspaces).
668    fn discover_package_manifests(_repo: &Path, _git_ref: &str) -> Vec<(String, String)> {
669        vec![]
670    }
671
672    /// Source file glob patterns for `git diff --name-only` filtering.
673    ///
674    /// TypeScript: `&["*.ts", "*.tsx"]`
675    /// Go: `&["*.go"]`
676    /// Java: `&["*.java"]`
677    ///
678    /// TODO: Same reconsideration as MANIFEST_FILES.
679    const SOURCE_FILE_PATTERNS: &'static [&'static str];
680
681    // ── Analysis pipeline methods ───────────────────────────────────
682
683    /// Extract the public API surface from source code at a git ref.
684    ///
685    /// The implementation is responsible for checking out the ref,
686    /// running any required build steps, parsing the output, and
687    /// cleaning up temporary files.
688    ///
689    /// An optional `DegradationTracker` can be provided to record non-fatal
690    /// extraction issues (e.g., partial tsc success). These appear in the
691    /// end-of-run summary.
692    fn extract(
693        &self,
694        repo: &Path,
695        git_ref: &str,
696        degradation: Option<&crate::diagnostics::DegradationTracker>,
697    ) -> Result<ApiSurface<Self::SymbolData>>;
698
699    /// Extract the API surface and optionally keep the worktree alive.
700    ///
701    /// Like `extract()`, but returns an `Arc<dyn WorktreeAccess>` that
702    /// keeps the worktree alive after extraction. The orchestrator uses
703    /// this to share worktrees between TD and SD pipelines.
704    ///
705    /// Default implementation calls `extract()` and returns `None` for
706    /// the worktree handle (worktree is created and dropped internally).
707    /// Languages that support worktree sharing override this to wrap
708    /// the guard in `Arc` and return it.
709    fn extract_keeping_worktree(
710        &self,
711        repo: &Path,
712        git_ref: &str,
713        degradation: Option<&crate::diagnostics::DegradationTracker>,
714    ) -> Result<ExtractionWithWorktree<Self::SymbolData>> {
715        let surface = self.extract(repo, git_ref, degradation)?;
716        Ok((surface, None))
717    }
718
    /// Parse the diff between two git refs and identify all functions
    /// whose bodies changed (public AND private).
    ///
    /// # Parameters
    ///
    /// - `repo`: path to the repository being analyzed.
    /// - `from_ref` / `to_ref`: the two git refs whose diff is parsed.
    ///
    /// # Errors
    ///
    /// The failure conditions are defined by each implementation.
    fn parse_changed_functions(
        &self,
        repo: &Path,
        from_ref: &str,
        to_ref: &str,
    ) -> Result<Vec<ChangedFunction>>;
727
    /// Given a function, find what calls it (callers, not callees).
    ///
    /// The function is identified by `symbol_name` together with the `file`
    /// path passed alongside it.
    fn find_callers(&self, file: &Path, symbol_name: &str) -> Result<Vec<Caller>>;
730
    /// Given a public symbol, find all references to it across the project.
    ///
    /// The symbol is identified by `symbol_name` together with the `file`
    /// path passed alongside it.
    fn find_references(&self, file: &Path, symbol_name: &str) -> Result<Vec<Reference>>;
733
    /// Given a source file, find its associated test file(s) by convention.
    ///
    /// Which convention applies is left to each language implementation.
    fn find_tests(&self, repo: &Path, source_file: &Path) -> Result<Vec<TestFile>>;
736
    /// Diff the test file between two refs. Returns changed assertion lines.
    ///
    /// # Parameters
    ///
    /// - `repo`: path to the repository being analyzed.
    /// - `test_file`: the test file to diff.
    /// - `from_ref` / `to_ref`: the two git refs being compared.
    fn diff_test_assertions(
        &self,
        repo: &Path,
        test_file: &TestFile,
        from_ref: &str,
        to_ref: &str,
    ) -> Result<TestDiff>;
745
746    // ── Methods ─────────────────────────────────────────────────────
747
    /// Diff manifest content between two versions.
    ///
    /// The orchestrator reads the manifest file(s) at both refs and passes
    /// the raw content here as `old` and `new`. The language interprets the
    /// format and determines what changed and whether it's breaking.
    ///
    /// This is an associated function (no `self`) and requires
    /// `Self: Sized`.
    ///
    /// TODO: Reconsider whether git plumbing should be owned by the
    /// orchestrator rather than the language impl.
    fn diff_manifest_content(old: &str, new: &str) -> Vec<crate::types::ManifestChange<Self>>
    where
        Self: Sized;
758
    /// Whether a file path should be excluded from BU analysis.
    ///
    /// Filters out test files, build artifacts, index/barrel files, etc.
    ///
    /// TypeScript excludes `index.ts`, `.d.ts`, `.test.`, `.spec.`,
    /// `__tests__/`, and `dist/`.
    ///
    /// Returns `true` when `path` should be skipped.
    ///
    /// TODO: Same reconsideration as for the other path- and
    /// manifest-related items (git plumbing ownership).
    fn should_exclude_from_analysis(path: &Path) -> bool;
767
    /// Build the language-specific report from analysis results.
    ///
    /// This is the primary report-building entry point. The Language owns
    /// the entire report construction:
    ///
    /// - language-agnostic structure (grouping changes by file, counting
    ///   breaks), and
    /// - language-specific enrichment (component detection, hierarchy,
    ///   child components, etc.).
    ///
    /// The result is dropped into a `ReportEnvelope` by the caller.
    ///
    /// # Parameters
    ///
    /// - `results`: the analysis results to report on.
    /// - `repo`: path to the repository being analyzed.
    /// - `from_ref` / `to_ref`: the two git refs that were compared.
    fn build_report(
        &self,
        results: &crate::types::AnalysisResult<Self>,
        repo: &Path,
        from_ref: &str,
        to_ref: &str,
    ) -> crate::types::AnalysisReport<Self>
    where
        Self: Sized;
785
786    // ── Behavioral change methods ───────────────────────────────
787
    /// Determine the behavioral change kind from the evidence type.
    ///
    /// TypeScript: LLM/body analysis → Class (component-level),
    /// test delta → Function.
    ///
    /// Default: always `BehavioralChangeKind::Function`, regardless of the
    /// evidence type.
    fn behavioral_change_kind(&self, _evidence_type: &EvidenceType) -> BehavioralChangeKind {
        BehavioralChangeKind::Function
    }
794
795    /// Extract symbol references from a behavioral change description.
796    /// TypeScript: extracts PascalCase component names (e.g., `<Modal>`, `` `Button` ``)
797    /// Default: empty vec
798    fn extract_referenced_symbols(&self, _description: &str) -> Vec<String> {
799        vec![]
800    }
801
802    /// Format a qualified name for display in reports.
803    /// TypeScript: `src/Modal.tsx::Modal` → `Modal`
804    /// Default: return the qualified name as-is
805    fn display_name(&self, qualified_name: &str) -> String {
806        qualified_name.to_string()
807    }
808
809    /// Return the behavioral change categories for LLM prompts.
810    ///
811    /// Each category has an `id` that must match the serde serialization of
812    /// the corresponding `Language::Category` variant. The LLM prompt is
813    /// built dynamically from these definitions, so adding a new language
814    /// automatically gets language-appropriate behavioral categories.
815    ///
816    /// Default: empty (no behavioral categories — LLM skips category assignment).
817    fn llm_categories(&self) -> Vec<LlmCategoryDefinition> {
818        vec![]
819    }
820
821    // ── v2 Extended Analysis pipeline ───────────────────────────────
822
823    /// Run language-specific extended analysis.
824    ///
825    /// For TypeScript, this runs the SD (Source-Level Diff) pipeline:
826    /// reads component source files at both refs, extracts structured
827    /// profiles, diffs them, and builds composition trees.
828    ///
829    /// The `params` struct carries both common fields (repo, refs, CSS dir)
830    /// and data computed by the orchestrator (removed CSS blocks, dep-repo
831    /// packages) that the language impl can attach to its extensions.
832    ///
833    /// Default implementation returns empty extensions (no extended analysis).
834    fn run_extended_analysis(
835        &self,
836        _params: &ExtendedAnalysisParams,
837    ) -> Result<Self::AnalysisExtensions> {
838        Ok(Self::AnalysisExtensions::default())
839    }
840
    /// Post-process extensions after both TD and extended analysis complete.
    ///
    /// This is where language-specific cross-pipeline processing happens.
    /// For TypeScript, this runs deprecated replacement detection (requires
    /// both TD structural changes and SD source-level changes) and transforms
    /// structural changes accordingly.
    ///
    /// # Parameters
    ///
    /// - `_extensions`: the extensions to post-process, mutable so that
    ///   overriding implementations can update them in place.
    /// - `structural_changes`: the TD structural changes.
    /// - `_repo` / `_from_ref` / `_to_ref`: enable git-based analysis
    ///   (e.g., commit co-change analysis for deprecated replacement
    ///   detection).
    ///
    /// # Returns
    ///
    /// The (potentially modified) structural changes.
    ///
    /// The default implementation is a no-op: it leaves the extensions
    /// untouched and hands back `structural_changes` as received.
    fn finalize_extensions(
        &self,
        _extensions: &mut Self::AnalysisExtensions,
        structural_changes: Arc<Vec<StructuralChange>>,
        _repo: &Path,
        _from_ref: &str,
        _to_ref: &str,
    ) -> Arc<Vec<StructuralChange>> {
        structural_changes
    }
862
863    /// Return log-friendly summary lines for the extended analysis results.
864    ///
865    /// The orchestrator calls this for progress/logging output.
866    /// Default implementation returns empty (no summary).
867    fn extensions_log_summary(&self, _extensions: &Self::AnalysisExtensions) -> Vec<String> {
868        vec![]
869    }
870}
871
872// ── Convenience functions (TD) ──────────────────────────────────────────
873
874/// Compare two API surfaces using language-specific semantic rules.
875///
876/// This is the primary entry point for the TD (Top-Down) pipeline.
877/// The `semantics` parameter provides language-specific rules.
878pub fn diff_surfaces_with_semantics<M, S>(
879    old: &ApiSurface<M>,
880    new: &ApiSurface<M>,
881    semantics: &S,
882) -> Vec<StructuralChange>
883where
884    M: Default + Clone + PartialEq,
885    S: LanguageSemantics<M>,
886{
887    crate::diff::diff_surfaces_with_semantics(old, new, semantics)
888}
889
890/// Compare two API surfaces using minimal semantics (no language-specific rules).
891///
892/// This uses `MinimalSemantics` which is language-agnostic: no member additions
893/// are breaking, no union parsing, no post-processing. For language-aware
894/// diffing, use `diff_surfaces_with_semantics` with a `LanguageSemantics` impl.
895pub fn diff_surfaces<M: Default + Clone + PartialEq>(
896    old: &ApiSurface<M>,
897    new: &ApiSurface<M>,
898) -> Vec<StructuralChange> {
899    crate::diff::diff_surfaces(old, new)
900}
901
902// Hierarchy algorithm tests live in crates/ts/src/language.rs where
903// they can use TsSymbolData.rendered_components. The core default
904// implementation returns an empty map.