Skip to main content

cyt_indexer/
retrieve.rs

1//! Decomposed catalog retrieval: merge tool schemas, score filtering, and enum pruning.
2
3use crate::build::{catalog_index_from_value, CatalogIndex};
4use crate::paths::{self, decomposed_prefix, get_root_tool_key, json_ext, md_ext, tool_id_from_decomposed_rel};
5use crate::policies::{
6    append_description_reinstate_entries, effective_policy, mcp_required_enum_values,
7    needs_description_reinstate, required_enum_values_by_tool, system_required_enum_values,
8    PolicyContext, ToolPolicy,
9};
10use crate::runtime_config;
11use serde_json::{json, Map, Value};
12use std::collections::{HashMap, HashSet};
13use std::path::{Path, PathBuf};
14
15/// In-memory map of decomposed JSON schema files keyed by catalog-relative path.
16#[derive(Debug, Clone, Default)]
17pub struct DecomposedCatalog {
18    /// Parsed JSON object per decomposed file path.
19    pub(crate) json_files: HashMap<String, Value>,
20}
21
22impl DecomposedCatalog {
23    /// Wrap a pre-built path-to-JSON map.
24    #[must_use]
25    pub const fn from_json_files(json_files: HashMap<String, Value>) -> Self {
26        Self { json_files }
27    }
28
29    /// Borrow the underlying path-to-JSON map.
30    #[must_use]
31    pub const fn json_files(&self) -> &HashMap<String, Value> {
32        &self.json_files
33    }
34
35    /// Load decomposed JSON files from a [`CatalogIndex`] file table.
36    #[must_use]
37    pub fn from_catalog_index(index: &CatalogIndex) -> Self {
38        let mut json_files = HashMap::new();
39        for (rel_path, content) in &index.files {
40            if rel_path.starts_with(&decomposed_prefix()) && rel_path.ends_with(&json_ext())
41                && let Ok(parsed) = serde_json::from_str::<Value>(content)
42                    && parsed.is_object() {
43                        json_files.insert(rel_path.clone(), parsed);
44                    }
45        }
46        Self { json_files }
47    }
48
49    /// Load decomposed JSON from a survivor/catalog dict (`json` array entries).
50    #[must_use]
51    pub fn from_catalog_dict(data: &Value) -> Self {
52        let mut json_files = HashMap::new();
53        if let Some(entries) = data.get("json").and_then(|v| v.as_array()) {
54            for entry in entries {
55                let Some(obj) = entry.as_object() else {
56                    continue;
57                };
58                let Some(file_path) = obj.get("file_path").and_then(|v| v.as_str()) else {
59                    continue;
60                };
61                let Some(content) = obj.get("content") else {
62                    continue;
63                };
64                if !content.is_object() {
65                    continue;
66                }
67                if let Some(key) = paths::to_decomposed_key(file_path) {
68                    json_files.insert(key, content.clone());
69                }
70            }
71        }
72        Self { json_files }
73    }
74
75    /// Overlay another catalog's JSON files (later keys win).
76    pub fn merge_json_files(&mut self, other: &Self) {
77        self.json_files.extend(other.json_files.clone());
78    }
79
80    /// Resolve a survivor or absolute path to a stored decomposed key, if present.
81    #[must_use]
82    pub fn resolve_key(&self, file_path: &str) -> Option<String> {
83        let mut candidates = Vec::new();
84        if let Some(normalized) = paths::to_decomposed_key(file_path) {
85            candidates.push(normalized);
86        }
87        candidates.push(file_path.to_string());
88        candidates
89            .into_iter()
90            .find(|candidate| self.has_json(candidate))
91    }
92
93    /// Whether a decomposed JSON file exists under `key`.
94    #[must_use]
95    pub fn has_json(&self, key: &str) -> bool {
96        self.json_files.contains_key(key)
97    }
98
99    /// Borrow parsed JSON for a decomposed file key.
100    #[must_use]
101    pub fn get_json(&self, key: &str) -> Option<&Value> {
102        self.json_files.get(key)
103    }
104}
105
106/// Parse a host catalog value into [`DecomposedCatalog`] (index dict or json-files map).
107#[must_use]
108pub fn decomposed_catalog_from_value(val: &Value) -> DecomposedCatalog {
109    if val.get("tools").is_some() && val.get("files").is_some() {
110        let idx = catalog_index_from_value(val);
111        return DecomposedCatalog::from_catalog_index(&idx);
112    }
113    if let Some(map) = val.as_object() {
114        let mut json_files = HashMap::new();
115        for (k, v) in map {
116            if v.is_object() {
117                json_files.insert(k.clone(), v.clone());
118            }
119        }
120        if !json_files.is_empty() {
121            return DecomposedCatalog::from_json_files(json_files);
122        }
123    }
124    DecomposedCatalog::default()
125}
126
127/// Recursively merge JSON objects; non-object overrides replace the base value.
128#[must_use]
129pub fn deep_merge(base: &Value, override_val: &Value) -> Value {
130    match (base, override_val) {
131        (Value::Object(base_map), Value::Object(override_map)) => {
132            let mut result = base_map.clone();
133            for (key, val) in override_map {
134                if let Some(existing) = result.get(key)
135                    && existing.is_object() && val.is_object() {
136                        result.insert(key.clone(), deep_merge(existing, val));
137                        continue;
138                    }
139                result.insert(key.clone(), val.clone());
140            }
141            Value::Object(result)
142        }
143        _ => override_val.clone(),
144    }
145}
146
147/// Walk parent decomposed JSON files and deep-merge them over `leaf_path`.
148#[must_use]
149pub fn climb_and_merge(leaf_path: &str, catalog: &DecomposedCatalog) -> Value {
150    let leaf_key = catalog.resolve_key(leaf_path).unwrap_or_else(|| {
151        paths::to_decomposed_key(leaf_path).unwrap_or_else(|| leaf_path.to_string())
152    });
153
154    let Some(mut current) = catalog.get_json(&leaf_key).cloned() else {
155        return json!({});
156    };
157
158    let mut current_path = PathBuf::from(&leaf_key);
159    current_path.pop();
160
161    let decomposed_root = paths::decomposed_root();
162
163    loop {
164        let parent_dir = current_path.parent().map(std::path::Path::to_path_buf);
165        let Some(parent_dir) = parent_dir else {
166            break;
167        };
168        if parent_dir == decomposed_root || !parent_dir.starts_with(&decomposed_root) {
169            break;
170        }
171
172        let parent_key = format!(
173            "{}/{}{}",
174            parent_dir.to_string_lossy(),
175            current_path
176                .file_name()
177                .unwrap_or_default()
178                .to_string_lossy(),
179            json_ext(),
180        );
181        if let Some(parent) = catalog.get_json(&parent_key) {
182            current = deep_merge(parent, &current);
183        }
184        current_path = parent_dir;
185    }
186    current
187}
188
189/// Collect rerank scores keyed by markdown content or json `file_path`.
190#[must_use]
191pub fn extract_scores(data: &Value) -> HashMap<String, f64> {
192    let mut scores = HashMap::new();
193    let Some(obj) = data.as_object() else {
194        return scores;
195    };
196    if let Some(md) = obj.get("md").and_then(|v| v.as_array()) {
197        for entry in md {
198            if let Some(e) = entry.as_object()
199                && let (Some(content), Some(score)) = (
200                    e.get("content").and_then(|v| v.as_str()),
201                    json_f64(e.get("score")),
202                ) {
203                    scores.insert(content.to_string(), score);
204                }
205        }
206    }
207    if let Some(json_arr) = obj.get("json").and_then(|v| v.as_array()) {
208        for entry in json_arr {
209            if let Some(e) = entry.as_object()
210                && let (Some(fp), Some(score)) = (
211                    e.get("file_path").and_then(|v| v.as_str()),
212                    json_f64(e.get("score")),
213                ) {
214                    scores.insert(fp.to_string(), score);
215                }
216        }
217    }
218    scores
219}
220
221/// Parse a JSON number or numeric string (pruner snapshots often store scores as strings).
222fn json_f64(value: Option<&Value>) -> Option<f64> {
223    let v = value?;
224    if let Some(n) = v.as_f64() {
225        return Some(n);
226    }
227    v.as_str()
228        .and_then(|s| s.trim().parse::<f64>().ok())
229}
230
231fn extract_from_dict(
232    data: &Map<String, Value>,
233    apply_decomposed_score_filter: bool,
234) -> Vec<String> {
235    let mut input_files = Vec::new();
236    for (key, value) in data {
237        if key == "md" {
238            continue;
239        }
240        if let Some(arr) = value.as_array() {
241            for entry in arr {
242                if let Some(e) = entry.as_object()
243                    && let Some(fp) = e.get("file_path").and_then(|v| v.as_str()) {
244                        if key == "json" && apply_decomposed_score_filter {
245                            let score = json_f64(e.get("score")).unwrap_or(0.0);
246                            if score <= runtime_config::decomposed_score() {
247                                continue;
248                            }
249                        }
250                        input_files.push(fp.to_string());
251                    }
252            }
253        } else if let Some(e) = value.as_object()
254            && let Some(fp) = e.get("file_path").and_then(|v| v.as_str()) {
255                input_files.push(fp.to_string());
256            }
257    }
258    input_files
259}
260
261/// List input `file_path` values from pruner/rerank survivor data.
262#[must_use]
263pub fn extract_input_files(data: &Value, apply_decomposed_score_filter: bool) -> Vec<String> {
264    if let Some(obj) = data.as_object() {
265        return extract_from_dict(obj, apply_decomposed_score_filter);
266    }
267    if let Some(arr) = data.as_array() {
268        return arr
269            .iter()
270            .filter_map(|entry| {
271                entry
272                    .as_object()
273                    .and_then(|e| e.get("file_path"))
274                    .and_then(|v| v.as_str())
275                    .map(String::from)
276            })
277            .collect();
278    }
279    Vec::new()
280}
281
282/// Parse survivor data into input file paths and score map.
283#[must_use]
284pub fn parse_json_input(
285    data: &Value,
286    apply_decomposed_score_filter: bool,
287) -> (Vec<String>, HashMap<String, f64>) {
288    (
289        extract_input_files(data, apply_decomposed_score_filter),
290        extract_scores(data),
291    )
292}
293
294fn filter_items(items_with_scores: &[(Value, f64)]) -> Vec<Value> {
295    let first_3_above = items_with_scores
296        .iter()
297        .take(3)
298        .all(|(_, score)| *score >= runtime_config::enum_score());
299
300    if first_3_above {
301        items_with_scores
302            .iter()
303            .filter(|(_, score)| *score >= runtime_config::enum_score())
304            .map(|(item, _)| item.clone())
305            .collect()
306    } else {
307        items_with_scores
308            .iter()
309            .take(3)
310            .map(|(item, _)| item.clone())
311            .collect()
312    }
313}
314
315/// Prune and sort JSON-schema `enum` arrays using rerank scores and preserve sets.
316pub fn filter_and_sort_enums<S: std::hash::BuildHasher, P: std::hash::BuildHasher>(
317    schema: &mut Value,
318    scores: &HashMap<String, f64, S>,
319    preserve_values: Option<&HashSet<String, P>>,
320) {
321    match schema {
322        Value::Object(map) => {
323            let keys: Vec<String> = map.keys().cloned().collect();
324            for key in keys {
325                if key == "enum" {
326                    if let Some(Value::Array(items)) = map.get("enum").cloned() {
327                        let mut preserved = Vec::new();
328                        let mut prunable = Vec::new();
329                        for item in items {
330                            if preserve_values
331                                .is_some_and(|pv| pv.contains(&item.to_string()))
332                            {
333                                preserved.push(item);
334                            } else {
335                                prunable.push(item);
336                            }
337                        }
338                        let mut items_with_scores: Vec<(Value, f64)> = prunable
339                            .into_iter()
340                            .map(|item| {
341                                let score = scores.get(&item.to_string()).copied().unwrap_or(0.0);
342                                (item, score)
343                            })
344                            .collect();
345                        items_with_scores.sort_by(|a, b| {
346                            b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
347                        });
348                        preserved.extend(filter_items(&items_with_scores));
349                        map.insert("enum".into(), Value::Array(preserved));
350                    }
351                } else if let Some(val) = map.get(&key).cloned() {
352                    let mut inner = val;
353                    filter_and_sort_enums(&mut inner, scores, preserve_values);
354                    map.insert(key, inner);
355                }
356            }
357        }
358        Value::Array(items) => {
359            for item in items.iter_mut() {
360                filter_and_sort_enums(item, scores, preserve_values);
361            }
362        }
363        _ => {}
364    }
365}
366
367/// Group decomposed file paths by root tool key; track standalone tool JSON files.
368#[must_use]
369pub fn group_files(
370    input_files: &[String],
371    catalog: &DecomposedCatalog,
372) -> (HashMap<String, Vec<String>>, HashSet<String>) {
373    let mut groups: HashMap<String, Vec<String>> = HashMap::new();
374    let mut tool_files = HashSet::new();
375    let decomposed_root = paths::decomposed_root();
376
377    for file_path in input_files {
378        let Some(key) = catalog.resolve_key(file_path) else {
379            eprintln!("Warning: File not found: {file_path}");
380            continue;
381        };
382        let rel = Path::new(&key)
383            .strip_prefix(&decomposed_root)
384            .unwrap_or_else(|_| Path::new(&key));
385        let parts: Vec<_> = rel.components().collect();
386        let is_tool =
387            parts.len() == 1 && parts[0].as_os_str().to_string_lossy().ends_with(&json_ext());
388
389        let Some(root_tool) = paths::get_root_tool_key(&key) else {
390            continue;
391        };
392        if is_tool {
393            tool_files.insert(key.clone());
394        }
395        groups.entry(root_tool).or_default().push(key);
396    }
397    (groups, tool_files)
398}
399
400fn tool_shell_from_root_key(root_tool: &str) -> Value {
401    let name = Path::new(root_tool)
402        .file_stem()
403        .unwrap_or_default()
404        .to_string_lossy();
405    json!({
406        "name": name,
407        "inputSchema": {"type": "object", "properties": {}},
408    })
409}
410
411/// Build retrieve ``ProcessGroupsOptions`` from policy context and catalog state.
412#[must_use]
413pub fn build_process_groups_options(
414    ctx: &PolicyContext,
415    catalog_dict: &Value,
416    store: &DecomposedCatalog,
417    preserve_values: Option<Vec<String>>,
418) -> ProcessGroupsOptions {
419    let mut system_preserve = system_required_enum_values(catalog_dict);
420    if let Some(pv) = preserve_values
421        && system_preserve.is_empty() {
422            system_preserve = pv.into_iter().collect();
423        }
424    let mcp_preserve = mcp_required_enum_values(catalog_dict);
425    let required_by_tool = required_enum_values_by_tool(catalog_dict);
426
427    let mut prune_optional_tools = HashSet::new();
428    for key in store.json_files().keys() {
429        if let Some(root_tool) = get_root_tool_key(key) {
430            let tool_name = tool_id_from_decomposed_rel(&root_tool);
431            let policy = effective_policy(ctx, &tool_name);
432            if matches!(
433                policy,
434                ToolPolicy::PruneOptional | ToolPolicy::PruneOptionalDescriptions
435            ) {
436                prune_optional_tools.insert(tool_name);
437            }
438        }
439    }
440
441    ProcessGroupsOptions {
442        system_preserve: (!system_preserve.is_empty()).then_some(system_preserve),
443        mcp_preserve: (!mcp_preserve.is_empty()).then_some(mcp_preserve),
444        required_by_tool,
445        prune_optional_tools,
446    }
447}
448
/// Settings that steer enum preservation inside [`process_groups`].
#[derive(Debug, Clone, Default)]
pub struct ProcessGroupsOptions {
    /// System-level enum values to keep; second choice after a per-tool entry.
    pub system_preserve: Option<HashSet<String>>,
    /// MCP-level enum values to keep; used when neither of the above applies.
    pub mcp_preserve: Option<HashSet<String>>,
    /// Enum values to keep for a specific tool, keyed by tool name; takes precedence.
    pub required_by_tool: HashMap<String, HashSet<String>>,
    /// Tool names for which [`process_groups`] passes a preserve set to enum filtering.
    pub prune_optional_tools: HashSet<String>,
}
461
462/// Build [`ProcessGroupsOptions`] from optional policy fields (Python/Node FFI).
463#[must_use]
464pub fn process_groups_options_from_fields<S: std::hash::BuildHasher + Default>(
465    system_preserve: Option<Vec<String>>,
466    mcp_preserve: Option<Vec<String>>,
467    required_by_tool: Option<HashMap<String, Vec<String>, S>>,
468    required_enum_values_by_tool: Option<HashMap<String, Vec<String>, S>>,
469    prune_optional_tools: Option<Vec<String>>,
470) -> ProcessGroupsOptions {
471    let required_by_tool = required_by_tool
472        .or(required_enum_values_by_tool)
473        .unwrap_or_default()
474        .into_iter()
475        .map(|(k, v)| (k, v.into_iter().collect()))
476        .collect();
477    ProcessGroupsOptions {
478        system_preserve: system_preserve.map(|items| items.into_iter().collect()),
479        mcp_preserve: mcp_preserve.map(|items| items.into_iter().collect()),
480        required_by_tool,
481        prune_optional_tools: prune_optional_tools
482            .unwrap_or_default()
483            .into_iter()
484            .collect(),
485    }
486}
487
488/// Merge grouped decomposed files into final tool schema values.
489#[must_use]
490pub fn process_groups<S: std::hash::BuildHasher>(
491    groups: &HashMap<String, Vec<String>, S>,
492    tool_files: &HashSet<String, S>,
493    scores: &HashMap<String, f64, S>,
494    catalog: &DecomposedCatalog,
495    opts: &ProcessGroupsOptions,
496) -> Vec<Value> {
497    let mut tools = Vec::new();
498
499    for (root_tool, files) in groups {
500        let mut base_tool = catalog
501            .get_json(root_tool)
502            .cloned()
503            .unwrap_or_else(|| tool_shell_from_root_key(root_tool));
504
505        let tool_name_in_schema = base_tool
506            .get("name")
507            .and_then(|v| v.as_str())
508            .unwrap_or("")
509            .to_string();
510
511        for file_key in files {
512            if tool_files.contains(file_key) {
513                continue;
514            }
515            base_tool = deep_merge(&base_tool, &climb_and_merge(file_key, catalog));
516        }
517
518        let stem_name = Path::new(root_tool)
519            .file_stem()
520            .unwrap_or_default()
521            .to_string_lossy()
522            .into_owned();
523        let tool_name = base_tool
524            .get("name")
525            .and_then(|v| v.as_str())
526            .filter(|s| !s.is_empty())
527            .unwrap_or(if tool_name_in_schema.is_empty() {
528                stem_name.as_str()
529            } else {
530                tool_name_in_schema.as_str()
531            })
532            .to_string();
533
534        if let Some(obj) = base_tool.as_object().cloned() {
535            let mut obj = obj;
536            obj.insert("name".into(), Value::String(tool_name.clone()));
537            obj.remove("id");
538            base_tool = Value::Object(obj);
539        }
540
541        if !scores.is_empty() {
542            let enum_preserve = if opts.prune_optional_tools.contains(&tool_name) {
543                opts.required_by_tool
544                    .get(&tool_name)
545                    .cloned()
546                    .or_else(|| opts.system_preserve.clone())
547                    .or_else(|| opts.mcp_preserve.clone())
548            } else {
549                None
550            };
551            filter_and_sort_enums(&mut base_tool, scores, enum_preserve.as_ref());
552        }
553        tools.push(base_tool);
554    }
555    tools
556}
557
558/// Options for [`retrieve_core`] and [`retrieve_tools_from_catalog`].
559#[derive(Debug, Clone, Default)]
560pub struct RetrieveOptions {
561    /// Drop low-score decomposed json entries before grouping.
562    pub apply_decomposed_score_filter: bool,
563    /// Enum preservation and optional-tool pruning for merged schemas.
564    pub process_groups: ProcessGroupsOptions,
565}
566
567/// Resolve the full build catalog dict used for reinstatement and enum metadata.
568pub fn resolve_build_catalog(catalog: &Value, survivor_data: &Value) -> Value {
569    if catalog.get("tools").is_some() && catalog.get("files").is_some() {
570        return catalog_index_from_value(catalog).to_catalog_dict();
571    }
572    if catalog
573        .get("json")
574        .and_then(Value::as_array)
575        .is_some_and(|arr| !arr.is_empty())
576    {
577        return catalog.clone();
578    }
579    survivor_data.clone()
580}
581/// Returns mitigated `{json, md}` data and a survivor overlay whose chunk contents
582/// match the reinstated entries (stripped descriptions on pruned optionals).
583pub fn apply_description_reinstate_to_data(
584    ctx: &PolicyContext,
585    data: &Value,
586    build_catalog: &Value,
587) -> (Value, DecomposedCatalog) {
588    let mut retrieve_data = data.clone();
589    let mut survivor = DecomposedCatalog::from_catalog_dict(data);
590    if !needs_description_reinstate(ctx) {
591        return (retrieve_data, survivor);
592    }
593
594    let json_entries = data
595        .get("json")
596        .and_then(Value::as_array)
597        .map_or(&[] as &[Value], std::vec::Vec::as_slice);
598    let empty_index = CatalogIndex {
599        tools: Vec::new(),
600        files: HashMap::new(),
601    };
602    let mitigated = append_description_reinstate_entries(
603        ctx,
604        json_entries,
605        build_catalog,
606        &empty_index,
607    );
608    if let Some(obj) = retrieve_data.as_object_mut() {
609        obj.insert("json".into(), Value::Array(mitigated));
610    }
611    survivor = DecomposedCatalog::from_catalog_dict(&retrieve_data);
612    (retrieve_data, survivor)
613}
614
615/// High-level retrieve: description reinstatement (when configured) then merge.
616pub fn retrieve_tools_from_catalog(
617    ctx: &PolicyContext,
618    data: &Value,
619    build_catalog: &Value,
620    store: &mut DecomposedCatalog,
621    opts: &RetrieveOptions,
622) -> Vec<Value> {
623    let (retrieve_data, survivor) = apply_description_reinstate_to_data(ctx, data, build_catalog);
624    retrieve_core(&retrieve_data, store, &survivor, opts)
625}
626
627/// Merge survivor input into the catalog store and emit reconstructed tool schemas.
628pub fn retrieve_core(
629    data: &Value,
630    store: &mut DecomposedCatalog,
631    survivor_overlay: &DecomposedCatalog,
632    opts: &RetrieveOptions,
633) -> Vec<Value> {
634    if !survivor_overlay.json_files.is_empty() {
635        store.merge_json_files(survivor_overlay);
636    }
637
638    let (input_files, scores) = parse_json_input(data, opts.apply_decomposed_score_filter);
639    let (groups, tool_files) = group_files(&input_files, store);
640    process_groups(&groups, &tool_files, &scores, store, &opts.process_groups)
641}
642
/// Settings accepted by [`removed_chunks`].
#[derive(Debug, Clone, Default)]
pub struct RemovedChunksOptions {
    /// When true, `json` entries of `surviving` scoring at or below the decomposed
    /// threshold do not count as survivors (mirrors
    /// [`RetrieveOptions::apply_decomposed_score_filter`]).
    pub apply_decomposed_score_filter: bool,
}
650
651/// Normalized identity for a catalog chunk entry (`json` or `md` array item).
652#[must_use]
653pub fn chunk_survivor_key(entry: &Value, section: &str) -> Option<String> {
654    let obj = entry.as_object()?;
655    if let Some(fp) = obj.get("file_path").and_then(|v| v.as_str()) {
656        return paths::to_decomposed_key(fp).or_else(|| Some(fp.to_string()));
657    }
658    if section == "md"
659        && let Some(content) = obj.get("content").and_then(|v| v.as_str()) {
660            return Some(format!("md:content:{content}"));
661        }
662    None
663}
664
665fn survivor_key_sets(
666    surviving: &Value,
667    apply_decomposed_score_filter: bool,
668) -> (HashSet<String>, HashSet<String>) {
669    let mut json_keys = HashSet::new();
670    let mut md_keys = HashSet::new();
671    let Some(obj) = surviving.as_object() else {
672        return (json_keys, md_keys);
673    };
674    if let Some(arr) = obj.get("json").and_then(|v| v.as_array()) {
675        for entry in arr {
676            let Some(e) = entry.as_object() else {
677                continue;
678            };
679            if apply_decomposed_score_filter {
680                let score = json_f64(e.get("score")).unwrap_or(0.0);
681                if score <= runtime_config::decomposed_score() {
682                    continue;
683                }
684            }
685            if let Some(key) = chunk_survivor_key(entry, "json") {
686                json_keys.insert(key);
687            }
688        }
689    }
690    if let Some(arr) = obj.get("md").and_then(|v| v.as_array()) {
691        for entry in arr {
692            if let Some(key) = chunk_survivor_key(entry, "md") {
693                md_keys.insert(key);
694            }
695        }
696    }
697    (json_keys, md_keys)
698}
699
700fn removed_section(
701    full: &Value,
702    section: &str,
703    survivor_keys: &HashSet<String>,
704) -> Vec<Value> {
705    let Some(arr) = full.get(section).and_then(|v| v.as_array()) else {
706        return Vec::new();
707    };
708    let mut removed = Vec::new();
709    for entry in arr {
710        let key = chunk_survivor_key(entry, section);
711        if key.as_ref().is_some_and(|k| survivor_keys.contains(k)) {
712            continue;
713        }
714        removed.push(entry.clone());
715    }
716    removed
717}
718
719/// Chunks present in `full_catalog` but not in `surviving` (same `{json, md}` shape as survivors).
720#[must_use]
721pub fn removed_chunks(
722    full_catalog: &Value,
723    surviving: &Value,
724    opts: &RemovedChunksOptions,
725) -> Value {
726    let (json_keys, md_keys) =
727        survivor_key_sets(surviving, opts.apply_decomposed_score_filter);
728    let json = removed_section(full_catalog, "json", &json_keys);
729    let md = removed_section(full_catalog, "md", &md_keys);
730    json!({
731        "json": json,
732        "md": md,
733    })
734}
735
736/// Walk a directory tree and build a `{json, md}` catalog dict from decomposed files.
737///
738/// # Errors
739///
740/// Returns an error when `dir_path` is not a directory, or when a json file cannot be read or parsed.
741pub fn load_catalog_from_dir(dir_path: &str) -> Result<Value, String> {
742    let root = Path::new(dir_path);
743    if !root.is_dir() {
744        return Err(format!("Directory not found: {dir_path}"));
745    }
746
747    let mut md_entries = Vec::new();
748    let mut json_entries = Vec::new();
749
750    for entry in walkdir_light(root)? {
751        let path = entry;
752        if !path.is_file() {
753            continue;
754        }
755        let path_str = path.to_string_lossy();
756        if !paths::is_catalog_decomposed_path(&path_str) {
757            continue;
758        }
759        let suffix = path.extension().and_then(|s| s.to_str()).unwrap_or("");
760        let is_skills_md = paths::to_skills_decomposed_key(&path_str).is_some()
761            && suffix.eq_ignore_ascii_case(trim_dot(&md_ext()))
762            && path.file_name().and_then(|n| n.to_str()) != Some("document.json");
763        if is_skills_md || (paths::to_decomposed_key(&path_str).is_some()
764            && suffix.eq_ignore_ascii_case(trim_dot(&md_ext())))
765        {
766            if let Ok(content) = std::fs::read_to_string(&path) {
767                md_entries.push(json!({
768                    "id": path.file_stem().unwrap_or_default().to_string_lossy(),
769                    "file_path": path.to_string_lossy(),
770                    "score": 0.0,
771                    "start_line": 1,
772                    "end_line": 1,
773                    "language": "markdown",
774                    "content": content,
775                }));
776            }
777        } else if suffix.eq_ignore_ascii_case(trim_dot(&json_ext()))
778            && paths::to_decomposed_key(&path_str).is_some()
779        {
780            let raw_text = std::fs::read_to_string(&path).map_err(|e| e.to_string())?;
781            let content: Value = serde_json::from_str(&raw_text).map_err(|e| e.to_string())?;
782            let line_count = raw_text.lines().count();
783            let rel_path = path.to_string_lossy();
784            let decomposed_key = paths::to_decomposed_key(&rel_path);
785            let entry_id = content
786                .get("id")
787                .cloned()
788                .or_else(|| {
789                    decomposed_key
790                        .as_ref()
791                        .map(|k| Value::String(paths::tool_id_from_decomposed_rel(k)))
792                })
793                .unwrap_or_else(|| {
794                    Value::String(
795                        path.file_stem()
796                            .unwrap_or_default()
797                            .to_string_lossy()
798                            .into_owned(),
799                    )
800                });
801            json_entries.push(json!({
802                "id": entry_id,
803                "name": entry_id,
804                "file_path": rel_path,
805                "score": 0.0,
806                "start_line": 1,
807                "end_line": line_count,
808                "language": "json",
809                "content": content,
810            }));
811        }
812    }
813
814    if md_entries.is_empty() && json_entries.is_empty() {
815        eprintln!("Warning: No .json or .md files found in {dir_path}");
816    }
817
818    Ok(json!({
819        "md": md_entries,
820        "json": json_entries,
821    }))
822}
823
/// Drop a single leading `.` from an extension string, if there is one.
fn trim_dot(ext: &str) -> &str {
    match ext.strip_prefix('.') {
        Some(rest) => rest,
        None => ext,
    }
}
827
/// List every non-directory path beneath `root`, descending into subdirectories.
///
/// Directories are visited from an explicit stack rather than by recursion.
/// Any `read_dir` or entry error aborts the walk and is returned as its
/// display string.
fn walkdir_light(root: &Path) -> Result<Vec<PathBuf>, String> {
    let mut files = Vec::new();
    let mut pending = vec![root.to_path_buf()];

    while let Some(dir) = pending.pop() {
        for entry in std::fs::read_dir(&dir).map_err(|e| e.to_string())? {
            let path = entry.map_err(|e| e.to_string())?.path();
            let bucket = if path.is_dir() { &mut pending } else { &mut files };
            bucket.push(path);
        }
    }
    Ok(files)
}
845
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// `file_path` of every entry in `section` of a `removed_chunks` result.
    ///
    /// `None` means the section is missing or not an array; an inner `None`
    /// marks an entry without a string `file_path`.
    fn removed_paths<'a>(removed: &'a Value, section: &str) -> Option<Vec<Option<&'a str>>> {
        let entries = removed.get(section)?.as_array()?;
        Some(
            entries
                .iter()
                .map(|entry| entry.get("file_path").and_then(Value::as_str))
                .collect(),
        )
    }

    /// Survivor data with a single low-scoring `json` entry (score stored as a string).
    fn low_score_survivor() -> Value {
        json!({
            "json": [{
                "file_path": "schemas/decomposed/Agent.json",
                "score": "0.003",
            }]
        })
    }

    #[test]
    fn low_rerank_scores_kept_without_score_filter() {
        let files = extract_input_files(&low_score_survivor(), false);
        assert_eq!(files.len(), 1);
    }

    #[test]
    fn low_rerank_scores_dropped_with_score_filter() {
        let files = extract_input_files(&low_score_survivor(), true);
        assert!(files.is_empty());
    }

    #[test]
    fn removed_chunks_excludes_survivors_by_decomposed_key() {
        let full = json!({
            "json": [
                {"file_path": "schemas/decomposed/Agent.json", "content": {"name": "Agent"}},
                {"file_path": "schemas/decomposed/Agent/extra.json", "content": {}},
            ],
            "md": [
                {"file_path": "schemas/decomposed/haiku.md", "content": "haiku"},
                {"file_path": "schemas/decomposed/sonnet.md", "content": "sonnet"},
            ],
        });
        // Survivors use a longer path prefix; matching relies on key normalization.
        let surviving = json!({
            "json": [{"file_path": "src/catalog/schemas/decomposed/Agent.json"}],
            "md": [{"file_path": "src/catalog/schemas/decomposed/haiku.md"}],
        });

        let removed = removed_chunks(&full, &surviving, &RemovedChunksOptions::default());

        assert_eq!(
            removed_paths(&removed, "json"),
            Some(vec![Some("schemas/decomposed/Agent/extra.json")])
        );
        assert_eq!(
            removed_paths(&removed, "md"),
            Some(vec![Some("schemas/decomposed/sonnet.md")])
        );
    }

    #[test]
    fn removed_chunks_respects_score_filter_on_survivors() {
        let full = json!({
            "json": [
                {"file_path": "schemas/decomposed/Keep.json", "score": 0.9},
                {"file_path": "schemas/decomposed/Drop.json", "score": 0.9},
            ],
        });
        let surviving = json!({
            "json": [
                {"file_path": "schemas/decomposed/Keep.json", "score": 0.9},
                {"file_path": "schemas/decomposed/Drop.json", "score": 0.1},
            ],
        });
        let opts = RemovedChunksOptions {
            apply_decomposed_score_filter: true,
        };

        let removed = removed_chunks(&full, &surviving, &opts);

        assert_eq!(
            removed_paths(&removed, "json"),
            Some(vec![Some("schemas/decomposed/Drop.json")])
        );
    }
}
943}