refyne 0.1.1

Official Rust SDK for the Refyne API - LLM-powered web extraction
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
#!/usr/bin/env python3
"""
Generate Rust types from the Refyne API OpenAPI specification.

This script fetches the OpenAPI spec from a configurable location and
generates Rust structs with serde derives. The output is written to src/types.rs.

Usage:
  python scripts/generate.py [options]

Options:
  --url <url>      Fetch spec from URL (default: http://localhost:8080/openapi.json)
  --file <path>    Read spec from local file
  --output <path>  Output file path (default: src/types.rs)
  --help           Show this help message

Environment Variables:
  OPENAPI_SPEC_URL   Override the default URL
  OPENAPI_SPEC_FILE  Use a local file instead of fetching
"""

import argparse
import json
import os
import sys
import urllib.request
from pathlib import Path
from typing import Any, Optional

DEFAULT_SPEC_URL = "http://localhost:8080/openapi.json"
DEFAULT_OUTPUT = "src/types.rs"

# Rust reserved keywords that need to be renamed
RUST_KEYWORDS = {
    "type", "fn", "let", "const", "static", "mut", "ref", "self", "super",
    "crate", "mod", "pub", "use", "struct", "enum", "trait", "impl", "for",
    "where", "loop", "while", "if", "else", "match", "return", "break",
    "continue", "move", "box", "async", "await", "dyn", "abstract", "become",
    "do", "final", "macro", "override", "priv", "typeof", "unsized", "virtual",
    "yield", "try", "union", "in", "as"
}

# Collected inline enums during processing
inline_enums: dict[str, list[str]] = {}


def parse_args() -> argparse.Namespace:
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="Generate Rust types from the Refyne API OpenAPI specification.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Fetch from local development server
  python scripts/generate.py

  # Fetch from production API
  python scripts/generate.py --url https://api.refyne.uk/openapi.json

  # Documentation: https://refyne.uk/docs

  # Use a local file
  python scripts/generate.py --file ./openapi.json

  # Using environment variables
  OPENAPI_SPEC_URL=http://localhost:8080/openapi.json python scripts/generate.py
"""
    )
    parser.add_argument(
        "--url",
        help=f"Fetch spec from URL (default: {DEFAULT_SPEC_URL})"
    )
    parser.add_argument(
        "--file",
        help="Read spec from local file"
    )
    parser.add_argument(
        "--output",
        default=DEFAULT_OUTPUT,
        help=f"Output file path (default: {DEFAULT_OUTPUT})"
    )

    args = parser.parse_args()

    # Check environment variables if not set via CLI
    if args.file is None and args.url is None:
        if os.environ.get("OPENAPI_SPEC_FILE"):
            args.file = os.environ["OPENAPI_SPEC_FILE"]
        elif os.environ.get("OPENAPI_SPEC_URL"):
            args.url = os.environ["OPENAPI_SPEC_URL"]
        else:
            args.url = DEFAULT_SPEC_URL

    return args


def fetch_spec(url: str) -> dict:
    """Fetch OpenAPI spec from URL."""
    print(f"Fetching OpenAPI spec from: {url}")
    try:
        with urllib.request.urlopen(url, timeout=30) as response:
            return json.loads(response.read().decode("utf-8"))
    except urllib.error.URLError as e:
        raise RuntimeError(f"Failed to fetch spec: {e}")


def load_spec_from_file(file_path: str) -> dict:
    """Load OpenAPI spec from a local file."""
    print(f"Loading OpenAPI spec from file: {file_path}")
    path = Path(file_path).resolve()
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def to_snake_case(name: str) -> str:
    """Convert camelCase or PascalCase to snake_case."""
    result = []
    for i, char in enumerate(name):
        if char.isupper():
            if i > 0 and (name[i-1].islower() or (i + 1 < len(name) and name[i+1].islower())):
                result.append("_")
            result.append(char.lower())
        else:
            result.append(char)
    return "".join(result)


def to_pascal_case(name: str) -> str:
    """Convert snake_case or kebab-case to PascalCase."""
    return "".join(word.capitalize() for word in name.replace("-", "_").split("_"))


def escape_rust_keyword(name: str) -> str:
    """Escape Rust keywords by prefixing with r#."""
    if name in RUST_KEYWORDS:
        return f"r#{name}"
    return name


def resolve_ref(ref: str, spec: dict) -> dict:
    """Resolve a JSON reference to its schema."""
    ref_path = ref.replace("#/", "").split("/")
    current = spec
    for part in ref_path:
        current = current[part]
    return current


def make_enum_name(parent_type: str, field_name: str) -> str:
    """Generate an enum name from parent type and field name."""
    return f"{parent_type}{to_pascal_case(field_name)}"


def openapi_type_to_rust(
    schema: dict,
    spec: dict,
    is_required: bool = True,
    parent_type: str = "",
    field_name: str = ""
) -> str:
    """Convert an OpenAPI schema type to a Rust type string."""
    global inline_enums

    if "$ref" in schema:
        ref_name = schema["$ref"].split("/")[-1]
        rust_type = ref_name
        return rust_type if is_required else f"Option<{rust_type}>"

    if "allOf" in schema:
        # For allOf, take the first ref or combine
        types = [openapi_type_to_rust(s, spec, True) for s in schema["allOf"]]
        # Usually allOf is used for composition, use first type
        rust_type = types[0] if types else "serde_json::Value"
        return rust_type if is_required else f"Option<{rust_type}>"

    if "oneOf" in schema or "anyOf" in schema:
        # For oneOf/anyOf, use serde_json::Value as a catch-all
        return "serde_json::Value" if is_required else "Option<serde_json::Value>"

    if "enum" in schema:
        # Generate a proper enum for inline enums
        if parent_type and field_name:
            enum_name = make_enum_name(parent_type, field_name)
            inline_enums[enum_name] = schema["enum"]
            return enum_name if is_required else f"Option<{enum_name}>"
        # Fallback to String for enums without context
        return "String" if is_required else "Option<String>"

    schema_type = schema.get("type", "object")
    schema_format = schema.get("format")

    if schema_type == "string":
        rust_type = "String"
    elif schema_type == "integer":
        if schema_format == "int64":
            rust_type = "i64"
        elif schema_format == "int32":
            rust_type = "i32"
        else:
            rust_type = "i64"  # Default to i64 for integers
    elif schema_type == "number":
        if schema_format == "float":
            rust_type = "f32"
        else:
            rust_type = "f64"  # Default to f64 for numbers
    elif schema_type == "boolean":
        rust_type = "bool"
    elif schema_type == "array":
        items = schema.get("items", {})
        item_type = openapi_type_to_rust(items, spec, True, parent_type, field_name)
        rust_type = f"Vec<{item_type}>"
    elif schema_type == "object":
        additional_props = schema.get("additionalProperties")
        if additional_props is True:
            rust_type = "serde_json::Value"
        elif isinstance(additional_props, dict):
            value_type = openapi_type_to_rust(additional_props, spec, True, parent_type, field_name)
            rust_type = f"std::collections::HashMap<String, {value_type}>"
        elif schema.get("properties"):
            # Inline object - use Value
            rust_type = "serde_json::Value"
        else:
            rust_type = "serde_json::Value"
    else:
        rust_type = "serde_json::Value"

    return rust_type if is_required else f"Option<{rust_type}>"


def is_request_type(name: str) -> bool:
    """Check if a type name represents a request type."""
    # InputBody is a request type, but OutputBody/ResponseBody are response types
    if name.endswith("OutputBody") or name.endswith("ResponseBody"):
        return False
    return name.endswith("Request") or name.endswith("Input") or name.endswith("InputBody")


def is_response_type(name: str) -> bool:
    """Check if a type name represents a response type."""
    return (
        name.endswith("Response") or
        name.endswith("Output") or
        name.endswith("Result") or
        name.endswith("OutputBody") or
        name.endswith("ResponseBody")
    )


def has_required_enum_fields(schema: dict) -> bool:
    """Check if schema has required fields that are enums."""
    required = set(schema.get("required", []))
    properties = schema.get("properties", {})

    for prop_name, prop_schema in properties.items():
        if prop_name in required:
            # Check if field is an enum (either inline or ref)
            if "enum" in prop_schema:
                return True
            # For refs, we can't easily check, so assume they might be enums
            # Actually, most refs will be to complex types, not enums
    return False


def get_serde_attributes(name: str, schema: dict) -> list[str]:
    """Get serde derive attributes for a struct."""
    attrs = ["Debug", "Clone"]

    # All types need both Serialize and Deserialize because types can be nested
    # within each other - a request type might be used as a field in another type
    # that needs to be deserialized
    attrs.extend(["Serialize", "Deserialize"])

    # Add Default for request types without required enum fields
    if is_request_type(name) and not has_required_enum_fields(schema):
        attrs.append("Default")

    return attrs


def generate_enum(name: str, values: list[str], description: str = "") -> list[str]:
    """Generate a Rust enum from enum values."""
    lines = []

    # Doc comment
    if description:
        lines.append(f"/// {description}")

    # Derive attributes
    lines.append("#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]")

    # Determine rename strategy
    all_lowercase = all(v == v.lower() for v in values)

    if all_lowercase:
        lines.append('#[serde(rename_all = "lowercase")]')

    lines.append(f"pub enum {name} {{")

    for value in values:
        # Convert to PascalCase for Rust variant
        variant = to_pascal_case(value)

        # Add serde rename if needed (for non-lowercase variants)
        if not all_lowercase:
            lines.append(f'    #[serde(rename = "{value}")]')

        lines.append(f"    /// {value}")
        lines.append(f"    {variant},")

    lines.append("}")
    return lines


def generate_struct(name: str, schema: dict, spec: dict) -> list[str]:
    """Generate a Rust struct from an OpenAPI schema."""
    lines = []

    # Doc comment
    description = schema.get("description", "")
    if description:
        lines.append(f"/// {description}")
    if schema.get("deprecated"):
        msg = schema.get("x-deprecated-message", "This type is deprecated.")
        lines.append(f"#[deprecated(note = \"{msg}\")]")

    # Derive attributes
    derives = get_serde_attributes(name, schema)
    lines.append(f"#[derive({', '.join(derives)})]")

    # Check if any fields use snake_case (have underscores)
    properties = schema.get("properties", {})
    has_snake_case = any("_" in k for k in properties.keys())

    # Only add rename_all if fields don't have underscores (are camelCase)
    if not has_snake_case and properties:
        lines.append('#[serde(rename_all = "camelCase")]')

    lines.append(f"pub struct {name} {{")

    required_fields = set(schema.get("required", []))

    for prop_name, prop_schema in properties.items():
        # Skip JSON Schema metadata fields (like $schema)
        if prop_name.startswith("$"):
            continue
        is_required = prop_name in required_fields
        rust_field_name = to_snake_case(prop_name)
        rust_field_name = escape_rust_keyword(rust_field_name)
        rust_type = openapi_type_to_rust(prop_schema, spec, is_required, name, prop_name)

        # Doc comment for field
        prop_description = prop_schema.get("description", "")
        if prop_description:
            lines.append(f"    /// {prop_description}")
        if prop_schema.get("deprecated"):
            msg = prop_schema.get("x-deprecated-message", "This field is deprecated.")
            lines.append(f"    #[deprecated(note = \"{msg}\")]")

        # Add serde rename for fields that need it
        actual_field_name = rust_field_name.replace("r#", "")
        needs_rename = False

        # Need rename if field is a Rust keyword
        if actual_field_name != rust_field_name.replace("r#", ""):
            needs_rename = True
        # Need rename if original has underscores but we converted to snake_case differently
        elif "_" not in prop_name and has_snake_case:
            needs_rename = True

        if needs_rename:
            lines.append(f'    #[serde(rename = "{prop_name}")]')

        # Skip serializing None for optional fields in request types
        if not is_required and is_request_type(name):
            lines.append("    #[serde(skip_serializing_if = \"Option::is_none\")]")

        lines.append(f"    pub {rust_field_name}: {rust_type},")

    lines.append("}")
    return lines


def generate_type_alias(name: str, schema: dict, spec: dict) -> list[str]:
    """Generate a Rust type alias from an OpenAPI schema."""
    lines = []

    description = schema.get("description", "")
    if description:
        lines.append(f"/// {description}")

    rust_type = openapi_type_to_rust(schema, spec, True)
    lines.append(f"pub type {name} = {rust_type};")

    return lines


def generate_types(spec: dict) -> str:
    """Generate all Rust types from an OpenAPI spec."""
    global inline_enums
    inline_enums = {}  # Reset for each run

    api_version = spec.get("info", {}).get("version", "unknown")

    lines = [
        "//! API types for the Refyne SDK.",
        "//!",
        "//! These types are generated from the OpenAPI specification.",
        "//! Do not edit this file manually - run `make generate` to regenerate.",
        "//!",
        f"//! Generated from API version: {api_version}",
        "",
        "#![allow(dead_code)]",
        "",
        "use serde::{Deserialize, Serialize};",
        "",
    ]

    schemas = spec.get("components", {}).get("schemas", {})

    if not schemas:
        lines.append("// No schemas found in OpenAPI specification")
        return "\n".join(lines)

    # First pass: collect all inline enums
    for name, schema in schemas.items():
        if schema.get("type") == "object" or "properties" in schema:
            for prop_name, prop_schema in schema.get("properties", {}).items():
                if "enum" in prop_schema and prop_schema.get("type") == "string":
                    enum_name = make_enum_name(name, prop_name)
                    inline_enums[enum_name] = prop_schema["enum"]

    # Group schemas by category
    request_schemas = []
    response_schemas = []
    enum_schemas = []
    other_schemas = []

    for name, schema in schemas.items():
        if "enum" in schema and "properties" not in schema:
            enum_schemas.append((name, schema))
        elif is_request_type(name):
            request_schemas.append((name, schema))
        elif is_response_type(name):
            response_schemas.append((name, schema))
        else:
            other_schemas.append((name, schema))

    # Generate top-level enums first
    if enum_schemas or inline_enums:
        lines.append("// " + "=" * 76)
        lines.append("// Enums")
        lines.append("// " + "=" * 76)
        lines.append("")

        # Top-level enums from schema
        for name, schema in enum_schemas:
            lines.extend(generate_enum(name, schema.get("enum", []), schema.get("description", "")))
            lines.append("")

        # Inline enums collected during processing
        for enum_name, values in sorted(inline_enums.items()):
            # Skip if already generated as top-level
            if any(name == enum_name for name, _ in enum_schemas):
                continue
            lines.extend(generate_enum(enum_name, values))
            lines.append("")

    # Generate request types
    if request_schemas:
        lines.append("// " + "=" * 76)
        lines.append("// Request Types")
        lines.append("// " + "=" * 76)
        lines.append("")
        for name, schema in request_schemas:
            if schema.get("type") == "object" or "properties" in schema:
                lines.extend(generate_struct(name, schema, spec))
            else:
                lines.extend(generate_type_alias(name, schema, spec))
            lines.append("")

    # Generate response types
    if response_schemas:
        lines.append("// " + "=" * 76)
        lines.append("// Response Types")
        lines.append("// " + "=" * 76)
        lines.append("")
        for name, schema in response_schemas:
            if schema.get("type") == "object" or "properties" in schema:
                lines.extend(generate_struct(name, schema, spec))
            else:
                lines.extend(generate_type_alias(name, schema, spec))
            lines.append("")

    # Generate other types
    if other_schemas:
        lines.append("// " + "=" * 76)
        lines.append("// Other Types")
        lines.append("// " + "=" * 76)
        lines.append("")
        for name, schema in other_schemas:
            if schema.get("type") == "object" or "properties" in schema:
                lines.extend(generate_struct(name, schema, spec))
            elif "allOf" in schema:
                lines.extend(generate_struct(name, schema, spec))
            else:
                lines.extend(generate_type_alias(name, schema, spec))
            lines.append("")

    # Add missing types that the SDK depends on but aren't in the OpenAPI spec
    lines.append("// " + "=" * 76)
    lines.append("// Additional Types (not in OpenAPI spec but required by SDK)")
    lines.append("// " + "=" * 76)
    lines.append("")

    # ProvidersResponse - used by list_providers()
    lines.extend([
        "/// Response containing available LLM providers.",
        "#[derive(Debug, Clone, Deserialize)]",
        "pub struct ProvidersResponse {",
        "    /// List of available provider names.",
        "    pub providers: Vec<String>,",
        "}",
        "",
    ])

    # Model type for ModelList items (if not already defined)
    if not any(name == "Model" for name, _ in other_schemas):
        lines.extend([
            "/// Available LLM model.",
            "#[derive(Debug, Clone, Deserialize)]",
            "pub struct Model {",
            "    /// Model identifier.",
            "    pub id: String,",
            "    /// Display name.",
            "    pub name: String,",
            "}",
            "",
        ])

    # Add type aliases for client.rs compatibility
    # Only add aliases for types that don't already exist in the schema
    schema_names = set(schemas.keys())

    lines.extend([
        "// ==========================================================================",
        "// Type Aliases for Client Compatibility",
        "// ==========================================================================",
        "",
    ])

    # Define all aliases - only add if alias name doesn't exist in schema
    type_aliases = [
        # Job types
        ("Job", "JobResponse", "Single job response."),
        ("JobList", "ListJobsOutputBody", "Job list response."),
        ("JobResults", "serde_json::Value", "Job extraction results (dynamic JSON)."),

        # Schema types
        ("Schema", "SchemaOutput", "Schema response."),
        ("SchemaList", "ListSchemasOutputBody", "Schema list response."),
        ("CreateSchemaRequest", "CreateSchemaInputBody", "Schema creation request."),

        # Site types
        ("Site", "SavedSiteOutput", "Saved site response."),
        ("SiteList", "ListSavedSitesOutputBody", "Saved site list response."),
        ("CreateSiteRequest", "CreateSavedSiteInputBody", "Site creation request."),

        # API key types
        ("ApiKeyList", "ListKeysOutputBody", "API key list response."),
        ("ApiKeyCreated", "CreateKeyOutputBody", "API key creation response."),

        # LLM key types
        ("LlmKey", "UserServiceKeyResponse", "User LLM service key response."),
        ("LlmKeyList", "ListUserServiceKeysOutputBody", "LLM service key list response."),
        ("UpsertLlmKeyRequest", "UserServiceKeyInput", "LLM key upsert request."),

        # LLM chain types
        ("LlmChain", "GetUserFallbackChainOutputBody", "LLM fallback chain."),
        ("LlmChainEntry", "UserFallbackChainEntryInput", "LLM fallback chain entry."),

        # Model types
        ("ModelList", "UserListModelsOutputBody", "Model list response."),

        # Extract types
        ("ExtractRequest", "ExtractInputBody", "Extract request."),
        ("ExtractResponse", "ExtractOutputBody", "Extract response."),

        # Crawl types
        ("CrawlRequest", "CreateCrawlJobInputBody", "Crawl request."),
        ("CrawlJobCreated", "CrawlJobResponseBody", "Crawl job created response."),

        # Analyze types
        ("AnalyzeRequest", "AnalyzeInputBody", "Analyze request."),
        ("AnalyzeResponse", "AnalyzeResponseBody", "Analyze response."),
    ]

    for alias_name, target_type, doc in type_aliases:
        # Skip if alias name already exists in the schema
        if alias_name in schema_names:
            continue
        lines.append(f"/// {doc}")
        lines.append(f"pub type {alias_name} = {target_type};")
        lines.append("")

    return "\n".join(lines)


def main() -> int:
    """Main entry point."""
    args = parse_args()

    try:
        # Load spec
        if args.file:
            spec = load_spec_from_file(args.file)
        else:
            spec = fetch_spec(args.url)

        print(f"OpenAPI version: {spec.get('openapi', 'unknown')}")
        print(f"API title: {spec.get('info', {}).get('title', 'unknown')}")
        print(f"API version: {spec.get('info', {}).get('version', 'unknown')}")

        # Generate types
        types_code = generate_types(spec)

        # Write output
        output_path = Path(args.output).resolve()
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(types_code)

        print(f"Types written to: {output_path}")

        # Count generated types
        schema_count = len(spec.get("components", {}).get("schemas", {}))
        enum_count = len(inline_enums)
        print(f"Generated {schema_count} types + {enum_count} inline enums")

        return 0

    except Exception as e:
        print(f"Error generating types: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())