Skip to main content

codemem_engine/index/
spec_parser.rs

//! OpenAPI and AsyncAPI spec file parsing.
//!
//! Parses OpenAPI (2.0/3.x) and AsyncAPI (2.x/3.0) specification files to extract
//! API endpoints, channels, schemas, and metadata. Supports both JSON and YAML formats.
//! Discovered endpoints are normalized via `api_surface::normalize_path_pattern()`.
6
7use serde::{Deserialize, Serialize};
8use std::path::Path;
9
10// ── OpenAPI Types ────────────────────────────────────────────────────────
11
12/// A single parsed endpoint from an OpenAPI/Swagger spec.
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct SpecEndpoint {
15    /// HTTP method (uppercase: GET, POST, etc.).
16    pub method: String,
17    /// URL path pattern (normalized).
18    pub path: String,
19    /// The `operationId` if present.
20    pub operation_id: Option<String>,
21    /// Operation description or summary.
22    pub description: Option<String>,
23    /// Stringified JSON of the request body schema.
24    pub request_schema: Option<String>,
25    /// Stringified JSON of the primary success response schema.
26    pub response_schema: Option<String>,
27    /// Path to the spec file this was extracted from.
28    pub spec_file: String,
29}
30
31/// Result of parsing an OpenAPI/Swagger spec file.
32#[derive(Debug, Clone)]
33pub struct SpecParseResult {
34    /// All endpoints discovered in the spec.
35    pub endpoints: Vec<SpecEndpoint>,
36    /// API title from `info.title`.
37    pub title: Option<String>,
38    /// API version from `info.version`.
39    pub version: Option<String>,
40}
41
42// ── AsyncAPI Types ───────────────────────────────────────────────────────
43
44/// A single parsed channel operation from an AsyncAPI spec.
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct SpecChannel {
47    /// Channel name/path.
48    pub channel: String,
49    /// Direction: `"publish"` or `"subscribe"`.
50    pub direction: String,
51    /// Protocol (e.g., `"kafka"`, `"amqp"`, `"mqtt"`).
52    pub protocol: Option<String>,
53    /// Stringified JSON of the message payload schema.
54    pub message_schema: Option<String>,
55    /// Operation description.
56    pub description: Option<String>,
57    /// The `operationId` if present.
58    pub operation_id: Option<String>,
59    /// Path to the spec file this was extracted from.
60    pub spec_file: String,
61}
62
63/// Result of parsing an AsyncAPI spec file.
64#[derive(Debug, Clone)]
65pub struct AsyncApiParseResult {
66    /// All channel operations discovered in the spec.
67    pub channels: Vec<SpecChannel>,
68    /// API title from `info.title`.
69    pub title: Option<String>,
70    /// API version from `info.version`.
71    pub version: Option<String>,
72}
73
74// ── Unified Result ───────────────────────────────────────────────────────
75
76/// A parsed spec file: either OpenAPI or AsyncAPI.
77#[derive(Debug, Clone)]
78pub enum SpecFileResult {
79    OpenApi(SpecParseResult),
80    AsyncApi(AsyncApiParseResult),
81}
82
83// ── Helpers ──────────────────────────────────────────────────────────────
84
/// HTTP methods recognized as operation keys inside an OpenAPI path item.
///
/// Lower case, exactly as they appear as keys in the spec. `trace` is part of the
/// OpenAPI 3.x Path Item Object; Swagger 2.0 documents simply never contain it.
const HTTP_METHODS: &[&str] = &[
    "get", "post", "put", "delete", "patch", "options", "head", "trace",
];
87
88/// Read a file and parse it into `serde_json::Value`, detecting JSON vs YAML by extension.
89fn read_spec_file(path: &Path) -> Option<serde_json::Value> {
90    let content = std::fs::read_to_string(path).ok()?;
91    let ext = path
92        .extension()
93        .and_then(|e| e.to_str())
94        .unwrap_or("")
95        .to_lowercase();
96
97    match ext.as_str() {
98        "json" => serde_json::from_str(&content).ok(),
99        "yaml" | "yml" => {
100            let yaml_val: serde_yaml::Value = serde_yaml::from_str(&content).ok()?;
101            // Convert serde_yaml::Value → serde_json::Value via serialization round-trip.
102            let json_str = serde_json::to_string(&yaml_val).ok()?;
103            serde_json::from_str(&json_str).ok()
104        }
105        _ => {
106            // Unknown extension: try JSON first, then YAML.
107            if let Ok(v) = serde_json::from_str::<serde_json::Value>(&content) {
108                return Some(v);
109            }
110            let yaml_val: serde_yaml::Value = serde_yaml::from_str(&content).ok()?;
111            let json_str = serde_json::to_string(&yaml_val).ok()?;
112            serde_json::from_str(&json_str).ok()
113        }
114    }
115}
116
117/// Stringify a `serde_json::Value` for schema storage.
118/// Returns `None` for `Value::Null`.
119fn stringify_schema(value: &serde_json::Value) -> Option<String> {
120    if value.is_null() {
121        return None;
122    }
123    Some(serde_json::to_string(value).unwrap_or_default())
124}
125
126/// Extract `info.title` from a parsed spec.
127fn extract_info_title(root: &serde_json::Value) -> Option<String> {
128    root.get("info")?
129        .get("title")?
130        .as_str()
131        .map(|s| s.to_string())
132}
133
134/// Extract `info.version` from a parsed spec.
135fn extract_info_version(root: &serde_json::Value) -> Option<String> {
136    root.get("info")?
137        .get("version")?
138        .as_str()
139        .map(|s| s.to_string())
140}
141
142// ── OpenAPI Parsing ──────────────────────────────────────────────────────
143
144/// Parse an OpenAPI (2.0 Swagger / 3.x) spec file.
145///
146/// Returns `None` if the file cannot be read, is not valid JSON/YAML,
147/// or lacks an `openapi` or `swagger` top-level key.
148pub fn parse_openapi(path: &Path) -> Option<SpecParseResult> {
149    let root = read_spec_file(path)?;
150    let obj = root.as_object()?;
151
152    // Confirm this is an OpenAPI/Swagger file.
153    let is_openapi = obj.contains_key("openapi");
154    let is_swagger = obj.contains_key("swagger");
155    if !is_openapi && !is_swagger {
156        return None;
157    }
158
159    let spec_file = path.to_string_lossy().to_string();
160    let title = extract_info_title(&root);
161    let version = extract_info_version(&root);
162
163    let mut endpoints = Vec::new();
164
165    let paths = match obj.get("paths").and_then(|v| v.as_object()) {
166        Some(p) => p,
167        None => {
168            return Some(SpecParseResult {
169                endpoints,
170                title,
171                version,
172            })
173        }
174    };
175
176    for (url_path, path_item) in paths {
177        let path_obj = match path_item.as_object() {
178            Some(o) => o,
179            None => continue,
180        };
181
182        let normalized = super::api_surface::normalize_path_pattern(url_path);
183
184        for method in HTTP_METHODS {
185            let operation = match path_obj.get(*method).and_then(|v| v.as_object()) {
186                Some(op) => op,
187                None => continue,
188            };
189
190            let operation_id = operation
191                .get("operationId")
192                .and_then(|v| v.as_str())
193                .map(|s| s.to_string());
194
195            // Prefer summary, fall back to description.
196            let description = operation
197                .get("summary")
198                .and_then(|v| v.as_str())
199                .or_else(|| operation.get("description").and_then(|v| v.as_str()))
200                .map(|s| s.to_string());
201
202            let request_schema = if is_swagger {
203                extract_swagger_request_schema(operation)
204            } else {
205                extract_openapi3_request_schema(operation)
206            };
207
208            let response_schema = if is_swagger {
209                extract_swagger_response_schema(operation)
210            } else {
211                extract_openapi3_response_schema(operation)
212            };
213
214            endpoints.push(SpecEndpoint {
215                method: method.to_uppercase(),
216                path: normalized.clone(),
217                operation_id,
218                description,
219                request_schema,
220                response_schema,
221                spec_file: spec_file.clone(),
222            });
223        }
224    }
225
226    Some(SpecParseResult {
227        endpoints,
228        title,
229        version,
230    })
231}
232
233/// Extract request body schema from an OpenAPI 3.x operation.
234/// Looks for `requestBody.content.application/json.schema`.
235fn extract_openapi3_request_schema(
236    operation: &serde_json::Map<String, serde_json::Value>,
237) -> Option<String> {
238    let schema = operation
239        .get("requestBody")?
240        .get("content")?
241        .get("application/json")?
242        .get("schema")?;
243    stringify_schema(schema)
244}
245
246/// Extract response schema from an OpenAPI 3.x operation.
247/// Checks `responses.200` then `responses.201` for `content.application/json.schema`.
248fn extract_openapi3_response_schema(
249    operation: &serde_json::Map<String, serde_json::Value>,
250) -> Option<String> {
251    let responses = operation.get("responses")?.as_object()?;
252
253    for status in &["200", "201"] {
254        if let Some(schema) = responses
255            .get(*status)
256            .and_then(|r| r.get("content"))
257            .and_then(|c| c.get("application/json"))
258            .and_then(|j| j.get("schema"))
259        {
260            return stringify_schema(schema);
261        }
262    }
263    None
264}
265
266/// Extract request body schema from a Swagger 2.0 operation.
267/// Looks for a parameter with `in: body` and extracts its `schema`.
268fn extract_swagger_request_schema(
269    operation: &serde_json::Map<String, serde_json::Value>,
270) -> Option<String> {
271    let parameters = operation.get("parameters")?.as_array()?;
272    for param in parameters {
273        if param.get("in").and_then(|v| v.as_str()) == Some("body") {
274            if let Some(schema) = param.get("schema") {
275                return stringify_schema(schema);
276            }
277        }
278    }
279    None
280}
281
282/// Extract response schema from a Swagger 2.0 operation.
283/// Checks `responses.200.schema` then `responses.201.schema`.
284fn extract_swagger_response_schema(
285    operation: &serde_json::Map<String, serde_json::Value>,
286) -> Option<String> {
287    let responses = operation.get("responses")?.as_object()?;
288
289    for status in &["200", "201"] {
290        if let Some(schema) = responses.get(*status).and_then(|r| r.get("schema")) {
291            return stringify_schema(schema);
292        }
293    }
294    None
295}
296
297// ── AsyncAPI Parsing ─────────────────────────────────────────────────────
298
299/// Parse an AsyncAPI (2.x / 3.0) spec file.
300///
301/// Returns `None` if the file cannot be read, is not valid JSON/YAML,
302/// or lacks an `asyncapi` top-level key.
303pub fn parse_asyncapi(path: &Path) -> Option<AsyncApiParseResult> {
304    let root = read_spec_file(path)?;
305    let obj = root.as_object()?;
306
307    if !obj.contains_key("asyncapi") {
308        return None;
309    }
310
311    let spec_file = path.to_string_lossy().to_string();
312    let title = extract_info_title(&root);
313    let version = extract_info_version(&root);
314
315    // Detect protocol from servers object.
316    let protocol = detect_asyncapi_protocol(obj);
317
318    let asyncapi_version = obj.get("asyncapi").and_then(|v| v.as_str()).unwrap_or("");
319
320    let channels = if asyncapi_version.starts_with("3.") {
321        parse_asyncapi_v3(obj, &spec_file, &protocol)
322    } else {
323        // 2.x (default)
324        parse_asyncapi_v2(obj, &spec_file, &protocol)
325    };
326
327    Some(AsyncApiParseResult {
328        channels,
329        title,
330        version,
331    })
332}
333
334/// Detect the protocol from the `servers` object (first server's `protocol` field).
335fn detect_asyncapi_protocol(obj: &serde_json::Map<String, serde_json::Value>) -> Option<String> {
336    let servers = obj.get("servers")?.as_object()?;
337    // Take the first server entry.
338    let (_name, server) = servers.iter().next()?;
339    server
340        .get("protocol")
341        .and_then(|v| v.as_str())
342        .map(|s| s.to_string())
343}
344
345/// Parse AsyncAPI 2.x channels.
346///
347/// Structure: `channels.<name>.publish` / `channels.<name>.subscribe`, each with
348/// `operationId`, `description`, `message.payload`.
349fn parse_asyncapi_v2(
350    obj: &serde_json::Map<String, serde_json::Value>,
351    spec_file: &str,
352    protocol: &Option<String>,
353) -> Vec<SpecChannel> {
354    let mut result = Vec::new();
355
356    let channels = match obj.get("channels").and_then(|v| v.as_object()) {
357        Some(c) => c,
358        None => return result,
359    };
360
361    for (channel_name, channel_value) in channels {
362        let channel_obj = match channel_value.as_object() {
363            Some(o) => o,
364            None => continue,
365        };
366
367        for direction in &["publish", "subscribe"] {
368            let operation = match channel_obj.get(*direction).and_then(|v| v.as_object()) {
369                Some(op) => op,
370                None => continue,
371            };
372
373            let operation_id = operation
374                .get("operationId")
375                .and_then(|v| v.as_str())
376                .map(|s| s.to_string());
377
378            let description = operation
379                .get("description")
380                .and_then(|v| v.as_str())
381                .or_else(|| operation.get("summary").and_then(|v| v.as_str()))
382                .map(|s| s.to_string());
383
384            let message_schema = operation
385                .get("message")
386                .and_then(|m| m.get("payload"))
387                .and_then(stringify_schema);
388
389            result.push(SpecChannel {
390                channel: channel_name.clone(),
391                direction: direction.to_string(),
392                protocol: protocol.clone(),
393                message_schema,
394                description,
395                operation_id,
396                spec_file: spec_file.to_string(),
397            });
398        }
399    }
400
401    result
402}
403
404/// Parse AsyncAPI 3.0 channels and operations.
405///
406/// In v3, channels are under `channels` and operations are under `operations`.
407/// Each operation has a `channel.$ref` pointing to a channel, plus `action` (send/receive).
408fn parse_asyncapi_v3(
409    obj: &serde_json::Map<String, serde_json::Value>,
410    spec_file: &str,
411    protocol: &Option<String>,
412) -> Vec<SpecChannel> {
413    let mut result = Vec::new();
414
415    let operations = match obj.get("operations").and_then(|v| v.as_object()) {
416        Some(o) => o,
417        None => return result,
418    };
419
420    for (_op_name, op_value) in operations {
421        let operation = match op_value.as_object() {
422            Some(o) => o,
423            None => continue,
424        };
425
426        // Resolve channel name from $ref: "#/channels/channelName"
427        let channel_name = operation
428            .get("channel")
429            .and_then(|c| {
430                // Could be a $ref object or a direct reference.
431                if let Some(ref_str) = c.get("$ref").and_then(|v| v.as_str()) {
432                    // Extract last segment: "#/channels/myChannel" → "myChannel"
433                    ref_str.rsplit('/').next().map(|s| s.to_string())
434                } else {
435                    c.as_str().map(|s| s.to_string())
436                }
437            })
438            .unwrap_or_default();
439
440        // action: "send" or "receive" → map to "publish"/"subscribe"
441        let direction = match operation.get("action").and_then(|v| v.as_str()) {
442            Some("send") => "publish".to_string(),
443            Some("receive") => "subscribe".to_string(),
444            Some(other) => other.to_string(),
445            None => continue,
446        };
447
448        let operation_id = operation
449            .get("operationId")
450            .and_then(|v| v.as_str())
451            .map(|s| s.to_string());
452
453        let description = operation
454            .get("description")
455            .and_then(|v| v.as_str())
456            .or_else(|| operation.get("summary").and_then(|v| v.as_str()))
457            .map(|s| s.to_string());
458
459        // In v3, messages can be on the operation or on the channel.
460        let message_schema = extract_v3_message_schema(operation, obj, &channel_name);
461
462        result.push(SpecChannel {
463            channel: channel_name,
464            direction,
465            protocol: protocol.clone(),
466            message_schema,
467            description,
468            operation_id,
469            spec_file: spec_file.to_string(),
470        });
471    }
472
473    result
474}
475
476/// Extract message payload schema for an AsyncAPI 3.0 operation.
477///
478/// Checks the operation's `messages` first, then falls back to the channel's
479/// `messages` in the root `channels` object.
480fn extract_v3_message_schema(
481    operation: &serde_json::Map<String, serde_json::Value>,
482    root: &serde_json::Map<String, serde_json::Value>,
483    channel_name: &str,
484) -> Option<String> {
485    // Try operation-level messages first.
486    if let Some(messages) = operation.get("messages") {
487        if let Some(schema) = first_message_payload(messages) {
488            return Some(schema);
489        }
490    }
491
492    // Fall back to channel-level messages.
493    let channel = root.get("channels")?.get(channel_name)?;
494    let messages = channel.get("messages")?;
495    first_message_payload(messages)
496}
497
498/// Extract the `payload` schema from the first message in a messages value.
499/// Messages can be an object (keyed by name) or an array.
500fn first_message_payload(messages: &serde_json::Value) -> Option<String> {
501    if let Some(obj) = messages.as_object() {
502        for (_name, msg) in obj {
503            if let Some(payload) = msg.get("payload") {
504                return stringify_schema(payload);
505            }
506        }
507    } else if let Some(arr) = messages.as_array() {
508        for msg in arr {
509            if let Some(payload) = msg.get("payload") {
510                return stringify_schema(payload);
511            }
512        }
513    }
514    None
515}
516
517// ── Directory Scanning ───────────────────────────────────────────────────
518
/// Conventional spec file names, all lower case.
///
/// A file whose lower-cased name appears here is treated as a spec candidate
/// without inspecting its contents.
const SPEC_FILE_NAMES: &[&str] = &[
    // OpenAPI 3.x
    "openapi.yaml",
    "openapi.yml",
    "openapi.json",
    // Swagger 2.0
    "swagger.yaml",
    "swagger.yml",
    "swagger.json",
    // AsyncAPI
    "asyncapi.yaml",
    "asyncapi.yml",
    "asyncapi.json",
];
531
532/// Scan a directory for API spec files (OpenAPI / AsyncAPI) and parse them.
533///
534/// Uses `ignore::WalkBuilder` to walk the directory tree, respecting `.gitignore`.
535/// Detects spec files by well-known filenames and by peeking at file contents
536/// for top-level `openapi`, `swagger`, or `asyncapi` keys.
537pub fn scan_api_specs(root: &Path) -> Vec<SpecFileResult> {
538    let mut results = Vec::new();
539
540    let walker = ignore::WalkBuilder::new(root)
541        .hidden(true)
542        .git_ignore(true)
543        .git_global(true)
544        .git_exclude(true)
545        .build();
546
547    for entry in walker {
548        let entry = match entry {
549            Ok(e) => e,
550            Err(_) => continue,
551        };
552
553        if !entry.file_type().is_some_and(|ft| ft.is_file()) {
554            continue;
555        }
556
557        let path = entry.path();
558        let ext = path
559            .extension()
560            .and_then(|e| e.to_str())
561            .unwrap_or("")
562            .to_lowercase();
563
564        // Only consider JSON/YAML files.
565        if !matches!(ext.as_str(), "json" | "yaml" | "yml") {
566            continue;
567        }
568
569        let file_name = path
570            .file_name()
571            .and_then(|n| n.to_str())
572            .unwrap_or("")
573            .to_lowercase();
574
575        let is_well_known = SPEC_FILE_NAMES.contains(&file_name.as_str());
576
577        if !is_well_known {
578            // Peek at the file to check for spec-identifying keys.
579            if !peek_is_spec_file(path) {
580                continue;
581            }
582        }
583
584        // Try OpenAPI first, then AsyncAPI.
585        if let Some(openapi) = parse_openapi(path) {
586            results.push(SpecFileResult::OpenApi(openapi));
587        } else if let Some(asyncapi) = parse_asyncapi(path) {
588            results.push(SpecFileResult::AsyncApi(asyncapi));
589        }
590    }
591
592    results
593}
594
/// Cheap pre-filter: does the start of the file look like an API spec?
///
/// Reads up to the first 200 bytes, lower-cases them (invalid UTF-8 is replaced
/// lossily) and checks for any of the spec-identifying keys, in JSON form
/// (`"openapi"`, `"swagger"`, `"asyncapi"`) or YAML form (`openapi:`, `swagger:`,
/// `asyncapi:`). This is a plain substring search, so it can report false positives;
/// callers are expected to confirm with a full parse.
///
/// Returns `false` if the file cannot be opened or read.
fn peek_is_spec_file(path: &Path) -> bool {
    use std::io::Read;

    /// Number of leading bytes inspected.
    const PEEK_BYTES: u64 = 200;
    /// Lower-case substrings that mark a spec file.
    const MARKERS: [&str; 6] = [
        "\"openapi\"",
        "\"swagger\"",
        "\"asyncapi\"",
        "openapi:",
        "swagger:",
        "asyncapi:",
    ];

    let Ok(file) = std::fs::File::open(path) else {
        return false;
    };

    // A single `read` call may legally return fewer bytes than are available;
    // `take` + `read_to_end` keeps reading until the window is full or EOF is hit.
    let mut head = Vec::new();
    if file.take(PEEK_BYTES).read_to_end(&mut head).is_err() {
        return false;
    }

    let snippet = String::from_utf8_lossy(&head).to_lowercase();
    MARKERS.iter().any(|marker| snippet.contains(*marker))
}
622
// Unit tests live in a separate file and are only compiled for test builds.
#[path = "tests/spec_parser_tests.rs"]
#[cfg(test)]
mod tests;