finetype-cli 0.6.39

# FineType v2 — Technology Domain Definitions
#
# Hierarchy: domain.category.type (locale is a field, not in the key)
# Full label at inference time: domain.category.type.LOCALE
#
# Each definition is a transformation contract:
#   - broad_type:     Target DuckDB type
#   - format_string:  DuckDB strptime format (null if not strptime-based)
#   - transform:      DuckDB SQL expression ({col} = column placeholder)
#   - transform_ext:  Enhanced transform requiring a DuckDB extension
#   - validation:     JSON Schema fragment for data quality checks
#   - tier:           Path from root to parent in the inference graph
#   - decompose:      Optional struct expansion for multi-field output
#
# Technology domain covers:
#   - network formats (IPv4, IPv6, URLs, etc.)
#   - cryptographic formats (UUIDs, hashes, tokens)
#   - code identifiers (ISBN, IMEI, EAN, etc.)
#   - development metadata (versions, licenses, programming languages)
#   - hardware specifications
# ─────────────────────────────────────────────────────────────────────

# ═════════════════════════════════════════════════════════════════════
# CATEGORY: internet
#
# Network addresses, URLs, and web-related formats. These are
# machine-readable identifiers for network resources and protocols.
# ═════════════════════════════════════════════════════════════════════

# Dotted-quad IPv4 address. Kept as VARCHAR by default; transform_ext casts
# to INET instead.
technology.internet.ip_v4:
  title: 'IPv4 Address'
  description: >
    Standard 4-octet IPv4 address format (e.g., 192.168.1.1). Resolves to
    VARCHAR with INET validation. Can optionally parse to DuckDB INET type
    with the inet extension.
  designation: universal
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: 'CAST({col} AS INET)'
  decompose: null
  validation:
    type: string
    # Four dot-separated octets, each constrained to the 0-255 range.
    pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$"
  tier:
    - INET
    - internet
  release_priority: 5
  aliases:
    - ipv4
  samples:
    - '192.168.1.1'
    - '8.8.8.8'
    - '127.0.0.1'
    - '10.0.0.0'
  references: null
  notes: >
    v1 migration: Was internet.ip_v4. transform_ext uses DuckDB's inet
    extension if available. Without the extension, stored as VARCHAR but
    validated as IPv4.

# IPv4 address followed by ":port". Stored as VARCHAR; decompose splits the
# value into an address column and a numeric port column.
technology.internet.ip_v4_with_port:
  title: "IPv4 Address with Port"
  description: >
    IPv4 address with port number separated by colon (e.g., 192.168.1.1:8080).
    Resolves to VARCHAR. Can decompose to separate IP and port fields.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # DuckDB's REGEXP_EXTRACT returns group 0 (the whole match, delimiter
    # included) unless a group index is given, so group 1 is requested.
    ip: "REGEXP_EXTRACT({col}, '^([^:]+):', 1)"
    # Ports span 0-65535, which does not fit signed SMALLINT (max 32767).
    # TRY_CAST yields NULL for 5-digit values above 65535 that the
    # validation pattern below still admits.
    port: "TRY_CAST(REGEXP_EXTRACT({col}, ':([0-9]+)$', 1) AS USMALLINT)"
  validation:
    type: string
    # IPv4 dotted quad (octets 0-255), a colon, then 1-5 digits.
    pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?):[0-9]{1,5}$"
  tier: [INET, internet]
  release_priority: 4
  aliases: [ipv4_port]
  samples:
    - "192.168.1.1:8080"
    - "10.0.0.1:3306"
    - "127.0.0.1:5432"
  references: null
  notes: >
    v1 migration: Was internet.ip_v4_with_port. Includes decompose to split
    address and port into separate columns during transformation.

# IPv6 address in colon-hex text form, with or without "::" zero compression.
technology.internet.ip_v6:
  title: "IPv6 Address"
  description: >
    Standard IPv6 address format with 8 16-bit hex groups separated by colons.
    May use :: notation for zero compression. Resolves to VARCHAR.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: "CAST({col} AS INET)"
  decompose: null
  validation:
    type: string
    # One alternative for the full 8-group form, then one alternative per
    # position at which a single "::" may stand in for omitted zero groups
    # (at most 7 explicit groups alongside it). The previous pattern only
    # accepted the full form, so the compressed samples below failed it.
    # Embedded IPv4 tails (::ffff:192.0.2.1) and zone ids (%eth0) are not
    # covered.
    pattern: "^(?:(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:))$"
  tier: [INET, internet]
  release_priority: 4
  aliases: [ipv6]
  samples:
    - "2001:0db8:85a3:0000:0000:8a2e:0370:7334"
    - "2001:db8:85a3::8a2e:370:7334"
    - "::1"
    - "fe80::1"
  references: null
  notes: >
    v1 migration: Was internet.ip_v6. Complex regex due to :: compression notation.

# Six hex octets joined by colons or hyphens. Stored unchanged as VARCHAR.
technology.internet.mac_address:
  title: 'MAC Address'
  description: >
    Media Access Control (MAC) address in standard colon-separated hex
    format (e.g., 00:1a:2b:3c:4d:5e). Also accepts hyphen-separated format.
  designation: universal
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: null
  decompose: null
  validation:
    type: string
    # Five "XX:" / "XX-" pairs followed by a final hex pair; the separator
    # class is evaluated per position, and both hex cases are accepted.
    pattern: "^([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})$"
  tier:
    - INET
    - internet
  release_priority: 4
  aliases:
    - mac
  samples:
    - '00:1a:2b:3c:4d:5e'
    - '08-00-27-00-00-00'
    - 'a0:b1:c2:d3:e4:f5'
  references: null
  notes: >
    v1 migration: Was internet.mac_address. Pattern accepts both colon and
    hyphen separators. Data typically uppercase, but pattern is
    case-insensitive.

# Web address with an optional scheme and a mandatory "//" authority marker.
# Stored as VARCHAR; decompose exposes scheme, hostname and path.
technology.internet.url:
  title: "URL"
  description: >
    Uniform Resource Locator (complete web address including scheme).
    Includes protocol (http, https, ftp, etc.), hostname, and optional path.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
    format: "uri"
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # All three expressions are anchored at the start and request capture
    # group 1; DuckDB's REGEXP_EXTRACT otherwise returns group 0, i.e. the
    # whole match including "://" and other delimiters.
    # Scheme must start with a letter, so "//host:8080/..." yields ''
    # instead of mistaking the host for a scheme.
    scheme: "REGEXP_EXTRACT({col}, '^([A-Za-z][A-Za-z0-9+.-]*):', 1)"
    # Any scheme (not only http/https) or none; stops before the port,
    # path, query or fragment.
    hostname: "REGEXP_EXTRACT({col}, '^(?:[A-Za-z][A-Za-z0-9+.-]*:)?//([^/?#:]+)', 1)"
    # Path only: skips the authority and stops at '?' or '#'. Returns ''
    # when the URL has no path.
    path: "REGEXP_EXTRACT({col}, '^(?:[A-Za-z][A-Za-z0-9+.-]*:)?//[^/?#]*(/[^?#]*)', 1)"
  validation:
    type: string
    # Optional http/https/ftp/file scheme, required "//", a dotted hostname,
    # then optional port, path, query and fragment.
    pattern: "^(?:(?:https?|ftp|file):)?//(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\\.)+[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?::\\d+)?(?:/[^?#]*)?(?:\\?[^#]*)?(?:#.*)?$"
  tier: [VARCHAR, internet]
  release_priority: 5
  aliases: [web_url, uri]
  samples:
    - "https://example.com/path/to/resource"
    - "http://www.google.com"
    - "ftp://ftp.example.org/file.txt"
    - "https://example.com:8080/path?query=value#anchor"
    - "//cdn.example.com/lib.js"
  references: null
  notes: >
    v1 migration: Was internet.url. Includes optional decompose for parsing
    scheme, hostname, and path components.
    Phase 0: Merged technology.internet.uri into this type.
    URI had 37% training data overlap with URL (both use http/https schemes).
    Model could not distinguish them (regression analysis).
    Scheme is optional but `//` is required, so protocol-relative URLs
    (`//cdn.example.com/lib.js`) validate while bare hostnames and bare ids
    (`msg32812262`) do not. Closing the `//` gap makes the validator precise
    enough to gate the url header-hint over-emit (spec
    2026-06-25-sharpen-stage-audit; task t-00007f4d).

# Bare DNS-style name (no scheme, no path). Stored unchanged as VARCHAR.
technology.internet.hostname:
  title: 'Hostname'
  description: >
    Fully qualified domain name or simple hostname. Does not include
    protocol or path. Allows alphanumeric characters, hyphens, and dots.
    Subdomains supported.
  designation: universal
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: null
  decompose: null
  validation:
    type: string
    # Zero or more dot-terminated labels plus a final label. Each label is
    # 1-63 lowercase alphanumerics/hyphens and may not start or end with a
    # hyphen.
    pattern: "^(?:(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\\.)*[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)$"
  tier:
    - VARCHAR
    - internet
  release_priority: 4
  aliases:
    - domain_name
  samples:
    - 'example.com'
    - 'www.google.com'
    - 'subdomain.example.org'
    - 'localhost'
  references: null
  notes: >
    v1 migration: Was internet.hostname. Pattern requires lowercase (use
    LOWER() to normalize). Does not validate DNS resolution.

# technology.internet.port: REMOVED
# False positives on plain integer columns. Range 0-65535 overlaps heavily with integer_number.

# Domain suffix without the leading dot, optionally with one further
# dot-separated part (e.g. co.uk). Stored unchanged as VARCHAR.
technology.internet.top_level_domain:
  title: 'Top-Level Domain (TLD)'
  description: >
    Domain extension (e.g., .com, .org, .uk). May include second-level
    domains for country codes (e.g., .co.uk). Does not include the dot.
  designation: universal
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: null
  decompose: null
  validation:
    type: string
    # Either 2+ lowercase letters or an "xn--" label, optionally followed
    # by one ".letters" part.
    pattern: "^(?:[a-z]{2,}|xn--[a-z0-9]+)(?:\\.[a-z]{2,})?$"
  tier:
    - VARCHAR
    - internet
  release_priority: 4
  aliases:
    - tld
  samples:
    - 'com'
    - 'org'
    - 'uk'
    - 'co.uk'
    - 'xn--3e0b707e'
  references: null
  notes: >
    v1 migration: Was internet.tld (aliased to top_level_domain for
    clarity). Pattern allows IDN (xn--) format.

# technology.internet.slug: REMOVED in Phase 0
# Collapsed into representation.code.alphanumeric_id.
# Rarely analytically important; confused with hostname in CLDR regression.

# HTTP User-Agent header value. Stored unchanged as VARCHAR; recognised by
# a list of known leading tokens plus length bounds.
technology.internet.user_agent:
  title: 'User Agent String'
  description: >
    HTTP User-Agent header string identifying the client application.
    Typically contains browser, OS, and version information.
  designation: universal
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: null
  decompose: null
  validation:
    type: string
    # Prefix check only: the alternation is anchored at the start of the
    # value and nothing constrains what follows it.
    pattern: "^(Mozilla/|curl/|python-requests/|Wget/|Go-http-client/|axios/|PostmanRuntime/|kube-probe/|Java/|okhttp/|Apache-HttpClient/|libcurl/|node-fetch/|Dalvik/|CFNetwork/|Lynx/|Links |Scrapy/|Googlebot/|Bingbot/|Slackbot|Twitterbot/|facebookexternalhit/|LinkedInBot/|Prometheus/|Datadog/|Ruby/|Dart/|grpc-|HTTPie/|bot|spider|crawl)"
    minLength: 10
    maxLength: 500
  tier:
    - VARCHAR
    - internet
  release_priority: 3
  aliases: null
  samples:
    - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    - 'Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X)'
    - 'curl/7.64.1'
  references: null
  notes: >
    v1 migration: Was internet.user_agent. v13: Added prefix-based pattern
    for validation branch signal. Checks for known UA prefixes (Mozilla/,
    curl/, python-requests/, etc.) and common bot/spider/crawler
    identifiers. Parse with dedicated UA parser for full details.

# HTTP request verb in UPPER, lower or Title case. Stored unchanged as
# VARCHAR.
technology.internet.http_method:
  title: 'HTTP Method'
  description: >
    HTTP request method (GET, POST, PUT, DELETE, PATCH, HEAD, OPTIONS,
    TRACE, CONNECT). Broad categorization; lower release priority.
  designation: broad_words
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: null
  decompose: null
  validation:
    type: string
    # pattern and enum both list the same 27 spellings: 9 HTTP methods in
    # each of {UPPER, lower, Title} case. They are kept in lock-step
    # because CompiledValidator (crates/finetype-core/src/validator.rs,
    # which delegates to jsonschema::Validator) requires a value to satisfy
    # both. A case-insensitive (?i) pattern would not be enough on its own:
    # enum compares exact strings, so "Get" would pass the pattern yet fail
    # the enum.
    # See spec .orbit/specs/2026-04-20-distilled-data-relabel-7-types/spec.yaml (v1.3).
    pattern: "^(GET|Get|get|POST|Post|post|PUT|Put|put|DELETE|Delete|delete|PATCH|Patch|patch|HEAD|Head|head|OPTIONS|Options|options|TRACE|Trace|trace|CONNECT|Connect|connect)$"
    enum: [GET, Get, get, POST, Post, post, PUT, Put, put, DELETE, Delete, delete, PATCH, Patch, patch, HEAD, Head, head, OPTIONS, Options, options, TRACE, Trace, trace, CONNECT, Connect, connect]
  tier:
    - VARCHAR
    - internet
  release_priority: 3
  aliases: null
  samples:
    - 'GET'
    - 'post'
    - 'Delete'
  references: null
  notes: >
    v1 migration: Was internet.http_method marked broad_words. Lower
    priority due to high cardinality but limited semantic information.
    v13: Added explicit pattern for validation branch signal (100% pass
    rate for methods vs 33% for IATA's ^[A-Z]{3}$).
    v17: Expanded enum + pattern to 27 case variants (UPPER/lower/Title
    per method) to fix the v16 training-audit finding where distilled
    http_method rows (SAN JOAQUIN, GOAT, OPERATING, IN PROGRESS, ENROUTE)
    caused misclassification. Cascade: multi-branch validation-branch
    pass-rate feature becomes ~1.0 for real HTTP-method columns regardless
    of case (was ~1/3 when only UPPER variants passed). Realised only
    after retrain.

# technology.internet.http_status_code: REMOVED
# False positives on plain integer columns. Range 100-599 overlaps heavily with integer_number.

# ═════════════════════════════════════════════════════════════════════
# CATEGORY: cryptographic
#
# Hash functions, tokens, and cryptographic identifiers.
# These are deterministic outputs of cryptographic algorithms or
# random tokens used for security purposes.
# ═════════════════════════════════════════════════════════════════════

# technology.cryptographic.uuid: MOVED to representation.identifier.uuid in v0.5.1
# UUID is a universal identifier, not crypto-specific.

# Fixed-length lowercase hex digest. Stored unchanged as VARCHAR; decompose
# labels the algorithm purely from the string length.
technology.cryptographic.hash:
  title: 'Cryptographic Hash'
  description: >
    Output of a cryptographic hash function (MD5, SHA-1, SHA-256, etc.).
    Detected as hex string of fixed length. Broad categorization by length.
  designation: universal
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: null
  decompose:
    # Length-to-name lookup; any other length is reported as 'UNKNOWN'.
    algorithm: "CASE WHEN LENGTH({col}) = 32 THEN 'MD5' WHEN LENGTH({col}) = 40 THEN 'SHA1' WHEN LENGTH({col}) = 64 THEN 'SHA256' WHEN LENGTH({col}) = 128 THEN 'SHA512' ELSE 'UNKNOWN' END"
  validation:
    type: string
    # Exactly 32, 40, 64 or 128 lowercase hex characters.
    pattern: "^[0-9a-f]{32}$|^[0-9a-f]{40}$|^[0-9a-f]{64}$|^[0-9a-f]{128}$"
  tier:
    - VARCHAR
    - cryptographic
  release_priority: 4
  aliases: null
  samples:
    - '5d41402abc4b2a76b9719d911017c592'
    - 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'
    - '2c26b46911185131006dfb3b57a99d73'
  references: null
  notes: >
    v1 migration: Was cryptographic.hash. Decompose infers algorithm from
    length: 32→MD5, 40→SHA1, 64→SHA256, 128→SHA512. No cryptographic
    validation (can't verify true hash without known input).

# Opaque token drawn from the URL-safe Base64 alphabet. Stored unchanged as
# VARCHAR.
technology.cryptographic.token_urlsafe:
  title: 'URL-Safe Base64 Token'
  description: >
    Random token encoded in URL-safe Base64 (uses - and _ instead of + and
    /). Commonly used for auth tokens, reset links, and secure identifiers.
  designation: universal
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: null
  decompose: null
  validation:
    type: string
    # 12-128 characters from letters, digits, underscore and hyphen.
    pattern: "^[A-Za-z0-9_-]{12,128}$"
  tier:
    - VARCHAR
    - cryptographic
  release_priority: 3
  aliases:
    - urlsafe_token
  samples:
    - 'SFMyNDI5STExOjEwMQ'
    - 'dGVzdC10b2tlbg'
    - 'aGVsbG8td29ybGQ'
  references: null
  notes: >
    v1 migration: Was cryptographic.token_urlsafe. URL-safe Base64 avoids
    characters that need escaping in URLs. Length typically 12-128 chars.

# ═════════════════════════════════════════════════════════════════════
# CATEGORY: code
#
# Standardized codes and product identifiers (ISBN, IMEI, EAN, etc.).
# These have formal specifications and may include check digits.
# ═════════════════════════════════════════════════════════════════════

# technology.code.isbn: MOVED to identity.commerce.isbn in v0.5.1

# 15-digit mobile equipment identifier. Kept as VARCHAR so leading zeros
# survive (see the third sample).
technology.code.imei:
  title: 'IMEI'
  description: >
    International Mobile Equipment Identity. 15-digit identifier for mobile
    devices. Last digit is a check digit (Luhn algorithm).
  designation: universal
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: null
  decompose: null
  validation:
    type: string
    # Shape check only (exactly 15 digits); the check digit is not
    # verified by this regex.
    pattern: "^[0-9]{15}$"
  tier:
    - VARCHAR
    - code
  release_priority: 3
  aliases: null
  samples:
    - '490154203237518'
    - '352043068649149'
    - '010000862422488'
  references: null
  notes: >
    v1 migration: Was code.imei. 15 numeric digits. Check digit (last
    digit) uses Luhn algorithm but validation is done via crate, not regex.

# technology.code.ean: MOVED to identity.commerce.ean in v0.5.1

# technology.code.issn: MOVED to identity.commerce.issn in v0.5.1

# Digital Object Identifier of the form 10.<registrant>/<suffix>. Stored as
# VARCHAR; decompose splits registrant code and suffix.
technology.code.doi:
  title: "DOI"
  description: >
    Digital Object Identifier — a persistent identifier for digital content
    (journal articles, datasets, books). Format: 10.XXXX/suffix where XXXX
    is a registrant code and suffix is assigned by the registrant.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Group 1 is requested explicitly: DuckDB's REGEXP_EXTRACT defaults to
    # group 0, which would return "10.1038/" and the entire DOI respectively.
    registrant: "REGEXP_EXTRACT({col}, '^10\\.([0-9]+)/', 1)"
    suffix: "REGEXP_EXTRACT({col}, '^10\\.[0-9]+/(.*)', 1)"
  validation:
    type: string
    # "10.", a 4-9 digit registrant code, "/", then a whitespace-free suffix.
    pattern: "^10\\.[0-9]{4,9}/[^\\s]+$"
  tier: [VARCHAR, code]
  release_priority: 3
  aliases: null
  samples:
    - "10.1038/nature12373"
    - "10.1000/xyz123"
    - "10.1016/j.cell.2009.01.043"
    - "10.48550/arXiv.2301.07041"
  references: null
  notes: >
    New in v2. DOI prefix always starts with "10." followed by registrant
    code (4-9 digits), then "/" and a suffix. Common registrants include
    10.1038 (Nature), 10.1016 (Elsevier), 10.1126 (Science), 10.1145 (ACM),
    10.48550 (arXiv). Suffix may contain dots, hyphens, slashes.

# Language tag such as en, en-US or zh-Hans-CN. Stored as VARCHAR; decompose
# yields lowercased language and country subtags.
technology.code.locale_code:
  title: "Locale Code"
  description: >
    Language-region code in IETF BCP 47 format (e.g., en-US, fr-FR, zh-Hans).
    Allows for script subtags (e.g., zh-Hant) and extensions.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Both extractions run on LOWER({col}), so results are lowercase.
    # Group 1 is requested explicitly; DuckDB's REGEXP_EXTRACT defaults to
    # group 0, which would include the separator in the country value.
    language: "REGEXP_EXTRACT(LOWER({col}), '^([a-z]{2,3})', 1)"
    # A 2-letter subtag bounded by a separator or end of string. Accepts
    # '_' as well as '-' because the validation pattern admits both
    # (e.g. en_US); 4-letter script subtags are skipped.
    country: "REGEXP_EXTRACT(LOWER({col}), '[-_]([a-z]{2})(?:$|[-_])', 1)"
  validation:
    type: string
    # 2-3 letter primary subtag, then any number of 2-4 letter subtags
    # joined by '-' or '_'.
    pattern: "^[a-zA-Z]{2,3}(?:[-_][a-zA-Z]{2,4})*$"
  tier: [VARCHAR, code]
  release_priority: 4
  aliases: [bcp47, language_tag]
  samples:
    - "en"
    - "en-US"
    - "fr-FR"
    - "zh-Hans-CN"
    - "de-AT"
  references: null
  notes: >
    v1 migration: Was code.locale_code. IETF BCP 47 format. Common codes:
    en, en-US, fr, de, zh-Hans, zh-Hant. Decompose extracts language and country.
    bcp47 alias added — locale_code already covers BCP 47 format.

# MAJOR.MINOR.PATCH version string with optional "v" prefix, prerelease and
# build metadata. Stored as VARCHAR; decompose exposes each component.
technology.development.version:
  title: "Version Number"
  description: >
    Software or document version in semantic versioning format (e.g., 1.2.3)
    or variant formats (e.g., v1.2.3, 1.2.3-alpha, 1.2.3+build).
    Resolves to VARCHAR.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Every expression requests capture group 1. DuckDB's REGEXP_EXTRACT
    # defaults to group 0 (the whole match), which would hand values such
    # as "v1" to the integer casts.
    major: "CAST(REGEXP_EXTRACT({col}, '^v?([0-9]+)', 1) AS SMALLINT)"
    # minor and patch walk past the preceding components from the start of
    # the string. The earlier patterns began with '^\.', which can never
    # match a value that starts with a digit or "v".
    minor: "CAST(REGEXP_EXTRACT({col}, '^v?[0-9]+\\.([0-9]+)', 1) AS SMALLINT)"
    patch: "CAST(REGEXP_EXTRACT({col}, '^v?[0-9]+\\.[0-9]+\\.([0-9]+)', 1) AS SMALLINT)"
    # Optional parts: REGEXP_EXTRACT returns '' when they are absent.
    prerelease: "REGEXP_EXTRACT({col}, '-([a-zA-Z0-9.]+)(?:\\+|$)', 1)"
    metadata: "REGEXP_EXTRACT({col}, '\\+([a-zA-Z0-9.]+)$', 1)"
  validation:
    type: string
    # Optional "v", three dot-separated integers, optional "-prerelease",
    # optional "+metadata".
    pattern: "^v?[0-9]+\\.[0-9]+\\.[0-9]+(?:-[a-zA-Z0-9.]+)?(?:\\+[a-zA-Z0-9.]+)?$"
  tier: [VARCHAR, development]
  release_priority: 4
  aliases: null
  samples:
    - "1.2.3"
    - "v1.2.3"
    - "1.2.3-alpha"
    - "1.2.3-beta.1"
    - "1.2.3+build.123"
  references: null
  notes: >
    v1 migration: Was development.version. Semantic versioning (MAJOR.MINOR.PATCH).
    Decompose extracts major, minor, patch, prerelease, and metadata components.

# Date-based version string (YYYY.MM, optionally .DD and further parts).
# Stored as VARCHAR; decompose exposes year, month and day.
technology.development.calver:
  title: "Calendar Version"
  description: >
    Calendar versioning (CalVer) following date-based patterns like YYYY.MM.DD
    or YYYY.MM. Used by projects that version based on release calendar.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Group 1 is requested explicitly: DuckDB's REGEXP_EXTRACT defaults to
    # group 0, which would pass "2024.02" rather than "02" to the cast.
    year: "CAST(REGEXP_EXTRACT({col}, '^([0-9]{4})', 1) AS SMALLINT)"
    month: "CAST(REGEXP_EXTRACT({col}, '^[0-9]{4}\\.([0-9]{2})', 1) AS SMALLINT)"
    # The day is optional (YYYY.MM validates). REGEXP_EXTRACT then returns
    # '', which a plain CAST rejects, so TRY_CAST maps it to NULL.
    day: "TRY_CAST(REGEXP_EXTRACT({col}, '^[0-9]{4}\\.[0-9]{2}\\.([0-9]{2})', 1) AS SMALLINT)"
  validation:
    type: string
    # 4-digit year, 2-digit month, optional 2-digit day, optional free-form
    # dot-separated tail.
    pattern: "^[0-9]{4}\\.[0-9]{2}(?:\\.[0-9]{2})?(?:\\..*)?$"
  tier: [VARCHAR, development]
  release_priority: 3
  aliases: null
  samples:
    - "2024.02"
    - "2024.02.09"
    - "2023.12.31"
    - "2024.01.15.1"
  references: null
  notes: >
    v1 migration: Was development.calver. Decompose extracts year, month, day.
    Format: YYYY.MM or YYYY.MM.DD.

# Container image reference: [registry/]repository[:tag][@digest]. Stored as
# VARCHAR; decompose exposes the four components.
technology.development.docker_ref:
  title: "Docker/OCI Image Reference"
  description: >
    OCI container image reference in the form [registry/]repository[:tag][@digest].
    Used in Dockerfiles, Kubernetes manifests, and CI/CD pipelines.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Every expression requests capture group 1; DuckDB's REGEXP_EXTRACT
    # defaults to group 0, which would keep the '/', ':' and '@' delimiters.
    # Absent components come back as ''.
    # The first path component is always treated as the registry, mirroring
    # the validation pattern.
    registry: "REGEXP_EXTRACT({col}, '^(?:([a-zA-Z0-9.\\-]+(?::\\d+)?)/)', 1)"
    repository: "REGEXP_EXTRACT({col}, '^(?:(?:[a-zA-Z0-9.\\-]+(?::\\d+)?)/)?([a-z0-9._\\-/]+?)(?::|@|$)', 1)"
    # The tag is the last ':name' that precedes any '@digest' and runs to
    # '@' or end of string. This keeps a registry port (localhost:5000/app)
    # and the hex after "@sha256:" from being reported as the tag.
    tag: "REGEXP_EXTRACT({col}, '^[^@]*:([a-zA-Z0-9_.\\-]+)(?:@|$)', 1)"
    digest: "REGEXP_EXTRACT({col}, '@(sha256:[a-fA-F0-9]{64})', 1)"
  validation:
    type: string
    # Optional "registry[:port]/", a lowercase repository path, optional
    # ":tag", optional "@sha256:<64 hex>".
    pattern: "^(?:(?:[a-zA-Z0-9.\\-]+(?::\\d+)?)/)?(?:[a-z0-9._\\-/]+)(?::[a-zA-Z0-9_.\\-]+)?(?:@sha256:[a-fA-F0-9]{64})?$"
  tier: [VARCHAR, development]
  release_priority: 3
  aliases: [oci_image_ref, container_image]
  samples:
    - "nginx:latest"
    - "ghcr.io/myorg/myapp:v1.2.3"
    - "docker.io/library/postgres:16-alpine"
    - "gcr.io/project/service@sha256:abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890"
  references: null
  notes: >
    OCI image reference format. Includes optional registry,
    repository path, tag, and digest components.

# ═════════════════════════════════════════════════════════════════════
# CATEGORY: identifier
#
# Technology-specific identifiers with structured formats that encode
# timestamps or other metadata. Distinguished from representation.identifier
# by having technology-specific semantics.
# ═════════════════════════════════════════════════════════════════════

# 26-character uppercase Crockford Base32 identifier. Stored as VARCHAR;
# transform_ext and decompose both cast to a ULID type.
technology.identifier.ulid:
  title: 'ULID'
  description: >
    Universally Unique Lexicographically Sortable Identifier. 26-character
    Crockford Base32 string encoding a 48-bit timestamp and 80 bits of
    randomness.
  designation: universal
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  # NOTE(review): ULID is presumably a type supplied by a DuckDB extension;
  # confirm which one must be loaded.
  transform_ext: 'TRY_CAST({col} AS ULID)'
  decompose:
    timestamp_ms: 'EPOCH_MS(({col}::ULID)::BIGINT)'
  validation:
    type: string
    # Exactly 26 characters from digits and uppercase letters, with I, L, O
    # and U left out of the class.
    pattern: "^[0123456789ABCDEFGHJKMNPQRSTVWXYZ]{26}$"
  tier:
    - VARCHAR
    - identifier
  release_priority: 4
  aliases: null
  samples:
    - '01ARZ3NDEKTSV4RRFFQ69G5FAV'
    - '01H5J3XZQK0000000000000000'
    - '01FWHE4G8R2SXHM3F07YR3FW0T'
  references: null
  notes: >
    Crockford Base32 excludes I, L, O, U to avoid ambiguity. Always
    uppercase. First 10 chars encode millisecond timestamp.

# 32-character lowercase hex identifier with a leading timestamp. Stored
# unchanged as VARCHAR.
technology.identifier.tsid:
  title: 'TSID'
  description: >
    Time-Sorted ID. 32 lowercase hexadecimal characters with an embedded
    timestamp in the leading digits. Distinguished from UUID by having no
    hyphens, and from cryptographic hashes by the timestamp constraint.
  designation: universal
  locales:
    - UNIVERSAL
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  # NOTE(review): tsid_to_timestamp is not a function defined in this file;
  # presumably it comes from an extension or macro — confirm.
  transform_ext: 'tsid_to_timestamp({col})'
  decompose: null
  validation:
    type: string
    # Shape check only: exactly 32 lowercase hex characters. The regex does
    # not test the timestamp range.
    pattern: "^[0-9a-f]{32}$"
  tier:
    - VARCHAR
    - identifier
  release_priority: 3
  aliases: null
  samples:
    - '0190a1b2c3d4e5f6a7b8c9d0e1f2a3b4'
    - '018f47d5a6b70000000000000000abcd'
    - '0191c8e4f2a30000fedcba9876543210'
  references: null
  notes: >
    Leading 8-10 hex chars decode to a reasonable timestamp (2010-2050).
    Disambiguation from hash/token_hex requires header hints or timestamp
    validation.

# Snowflake-style 64-bit integer ID. Resolves to BIGINT; decompose unpacks
# the timestamp, worker and sequence bit fields.
technology.identifier.snowflake_id:
  title: "Snowflake ID"
  description: >
    Twitter/Discord-style snowflake identifier. A 17-20 digit integer
    encoding a millisecond timestamp, worker ID, and sequence number.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: BIGINT
  frictionless:
    type: integer
  format_string: null
  # transform yields the declared broad_type (BIGINT). It previously
  # returned epoch_ms(...), a TIMESTAMP, which contradicted broad_type,
  # frictionless.type and tier and discarded the ID itself. The embedded
  # time remains available through decompose.timestamp_ms.
  # NOTE(review): the pattern admits up to 20 digits, but BIGINT tops out
  # at 9223372036854775807 (19 digits); larger values fail this cast.
  transform: "CAST({col} AS BIGINT)"
  transform_ext: null
  decompose:
    # Bit layout per the notes: 41-bit timestamp | 10-bit worker | 12-bit
    # sequence. 1288834974657 is the Twitter epoch offset in milliseconds.
    # NOTE(review): the discord_id alias shares this epoch constant;
    # confirm whether Discord IDs need a different offset.
    timestamp_ms: "({col}::BIGINT >> 22) + 1288834974657"
    worker_id: "({col}::BIGINT >> 12) & 1023"
    sequence: "{col}::BIGINT & 4095"
  validation:
    type: string
    # 17-20 decimal digits.
    pattern: "^\\d{17,20}$"
  tier: [BIGINT, identifier]
  release_priority: 3
  aliases: [twitter_id, discord_id]
  samples:
    - "175928847299117063"
    - "1234567890123456789"
    - "80351110224678912"
  references: null
  notes: >
    Twitter epoch: 1288834974657 (2010-11-04).
    Bits: 41 timestamp + 10 worker + 12 sequence. Range ~10^17 to ~10^19.

# ═════════════════════════════════════════════════════════════════════
# CATEGORY: cloud
#
# Cloud provider resource identifiers and URIs.
# ═════════════════════════════════════════════════════════════════════

# Amazon Resource Name: arn:partition:service:region:account-id:resource.
# Stored as VARCHAR; decompose exposes each colon-separated field.
technology.cloud.aws_arn:
  title: "AWS ARN"
  description: >
    Amazon Resource Name. Globally unique identifier for AWS resources
    in the format arn:partition:service:region:account-id:resource.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Each expression walks the colon-separated fields from the start and
    # requests capture group 1. DuckDB's REGEXP_EXTRACT defaults to group 0,
    # which would return the whole "arn:...:" prefix rather than the field.
    # region and account_id use [^:]* because they may be empty.
    partition: "REGEXP_EXTRACT({col}, '^arn:([^:]+):', 1)"
    service: "REGEXP_EXTRACT({col}, '^arn:[^:]+:([^:]+):', 1)"
    region: "REGEXP_EXTRACT({col}, '^arn:[^:]+:[^:]+:([^:]*):', 1)"
    account_id: "REGEXP_EXTRACT({col}, '^arn:[^:]+:[^:]+:[^:]*:([^:]*):', 1)"
    # The resource is everything after the fifth colon and may itself
    # contain colons.
    resource: "REGEXP_EXTRACT({col}, '^arn:[^:]+:[^:]+:[^:]*:[^:]*:(.+)$', 1)"
  validation:
    type: string
    # Fixed partition list, a service name, an optional region, an account
    # id of 0-12 digits, and a non-empty resource.
    pattern: "^arn:(aws|aws-cn|aws-us-gov):[a-zA-Z0-9\\-]+:[a-z0-9\\-]*:\\d{0,12}:.+$"
  tier: [VARCHAR, cloud]
  release_priority: 3
  aliases: [amazon_resource_name]
  samples:
    - "arn:aws:s3:::my-bucket"
    - "arn:aws:iam::123456789012:user/johndoe"
    - "arn:aws:ec2:us-east-1:123456789012:instance/i-1234567890abcdef0"
    - "arn:aws:lambda:eu-west-1:123456789012:function:my-function"
  references: null
  notes: >
    Region and account-id may be empty for global
    resources (e.g., S3 buckets, IAM). Three partitions: aws, aws-cn, aws-us-gov.

# s3://bucket/key reference. Stored as VARCHAR; decompose splits bucket and
# key.
technology.cloud.s3_uri:
  title: "S3 URI"
  description: >
    Amazon S3 bucket/key URI in the format s3://bucket-name/key-path.
    Used in data pipelines, ETL tools, and cloud storage references.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Group 1 is requested explicitly: DuckDB's REGEXP_EXTRACT defaults to
    # group 0, which would keep the "s3://" prefix on both values.
    bucket: "REGEXP_EXTRACT({col}, '^s3://([^/]+)', 1)"
    # Everything after the first '/' following the bucket; '' when the URI
    # names only a bucket.
    key: "REGEXP_EXTRACT({col}, '^s3://[^/]+/(.*)', 1)"
  validation:
    type: string
    # Bucket: 3-63 chars of lowercase letters, digits, dots and hyphens,
    # starting and ending alphanumeric; then an optional "/key".
    pattern: "^s3://[a-z0-9][a-z0-9.\\-]{1,61}[a-z0-9](/.*)?$"
  tier: [VARCHAR, cloud]
  release_priority: 3
  aliases: null
  samples:
    - "s3://my-bucket/data/file.csv"
    - "s3://production-logs/2024/01/events.parquet"
    - "s3://data-lake-raw/ingestion/batch-001/part-00000.json"
  references: null
  notes: >
    Bucket names: 3-63 chars, lowercase, no underscores.
    Key paths can contain any characters after the bucket name.

# Three dot-separated base64url segments (header.payload.signature). Stored
# as VARCHAR; decompose returns each segment still encoded.
technology.cryptographic.jwt:
  title: "JSON Web Token"
  description: >
    JSON Web Token (JWT) consisting of three base64url-encoded segments
    separated by dots: header.payload.signature. Used for authentication
    and information exchange.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Every expression requests capture group 1; DuckDB's REGEXP_EXTRACT
    # defaults to group 0, which would keep the surrounding dots.
    header: "REGEXP_EXTRACT({col}, '^([A-Za-z0-9_-]+)\\.', 1)"
    # Skips the header segment first. The earlier pattern began with '^\.',
    # which can never match because a token does not start with a dot.
    payload: "REGEXP_EXTRACT({col}, '^[A-Za-z0-9_-]+\\.([A-Za-z0-9_-]+)\\.', 1)"
    signature: "REGEXP_EXTRACT({col}, '\\.([A-Za-z0-9_-]+)$', 1)"
  validation:
    type: string
    # Three segments of at least 10 base64url characters each.
    pattern: "^[A-Za-z0-9_-]{10,}\\.[A-Za-z0-9_-]{10,}\\.[A-Za-z0-9_-]{10,}$"
  tier: [VARCHAR, cryptographic]
  release_priority: 3
  aliases: [json_web_token]
  samples:
    - "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c"
    - "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJhdXRoLmV4YW1wbGUuY29tIn0.dBjftJeZ4CVP-mB92K27uhbUJU1p1r_wW1gFWFOEjXk"
  references: null
  notes: >
    Each segment is at least 10 base64url chars to
    distinguish from other dot-separated formats. Header typically starts
    with "eyJ" (base64 for '{"').

# IPv4 network in address/prefix form. Stored as VARCHAR (INET via
# transform_ext); decompose splits network address and prefix length.
technology.internet.cidr:
  title: "CIDR Notation"
  description: >
    Classless Inter-Domain Routing notation combining an IPv4 address
    with a prefix length (e.g., 192.168.1.0/24). Used in network
    configuration, firewall rules, and IP access control.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: "CAST({col} AS INET)"
  decompose:
    # Group 1 is requested explicitly: DuckDB's REGEXP_EXTRACT defaults to
    # group 0, which would return "192.168.1.0/" and "/24" — the latter
    # cannot be cast to TINYINT.
    network: "REGEXP_EXTRACT({col}, '^([^/]+)/', 1)"
    prefix_length: "CAST(REGEXP_EXTRACT({col}, '/(\\d+)$', 1) AS TINYINT)"
  validation:
    type: string
    # IPv4 dotted quad (octets 0-255), "/", then a prefix length of 0-32.
    pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)/([0-9]|[12]\\d|3[0-2])$"
  tier: [INET, internet]
  release_priority: 4
  aliases: [cidr_block, ip_range]
  samples:
    - "192.168.1.0/24"
    - "10.0.0.0/8"
    - "172.16.0.0/12"
    - "0.0.0.0/0"
  references: null
  notes: >
    IPv4 CIDR only. Prefix length 0-32. Distinguished
    from plain IPv4 by the /prefix suffix.

# Uniform Resource Name of the form urn:<nid>:<nss>. Stored as VARCHAR;
# decompose splits namespace id and namespace-specific string.
technology.internet.urn:
  title: "URN"
  description: >
    Uniform Resource Name — a persistent, location-independent identifier
    in the format urn:nid:nss. Used for namespaced identifiers like
    ISBN URNs, OIDs, and UUID URNs.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
    format: "uri"
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Group 1 is requested explicitly: DuckDB's REGEXP_EXTRACT defaults to
    # group 0, which would return "urn:isbn:" and the entire URN.
    nid: "REGEXP_EXTRACT({col}, '^urn:([^:]+):', 1)"
    # The NSS is everything after the second colon and may contain colons.
    nss: "REGEXP_EXTRACT({col}, '^urn:[^:]+:(.+)$', 1)"
  validation:
    type: string
    # "urn:", a 1-32 char lowercase alphanumeric/hyphen NID starting with
    # an alphanumeric, ":", then a non-empty NSS.
    pattern: "^urn:[a-z0-9][a-z0-9\\-]{0,31}:.+$"
  tier: [VARCHAR, internet]
  release_priority: 3
  aliases: [uniform_resource_name]
  samples:
    - "urn:isbn:0451450523"
    - "urn:ietf:rfc:2648"
    - "urn:uuid:6e8bc430-9c3a-11d9-9669-0800200c9a66"
    - "urn:oid:2.16.840"
  references: null
  notes: >
    NID (Namespace ID) is 1-32 lowercase alphanumeric
    or hyphen chars. NSS (Namespace Specific String) is the remainder.

# Inline "data:" URI. Stored as VARCHAR; decompose reports the media type
# and whether the payload is flagged as base64.
technology.internet.data_uri:
  title: "Data URI"
  description: >
    Inline data URI scheme embedding content directly in a string.
    Format: data:[mediatype][;base64],data. Used for embedding small
    files in HTML, CSS, and API responses.
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
    format: "uri"
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Group 1 is requested explicitly: DuckDB's REGEXP_EXTRACT defaults to
    # group 0, which would keep the "data:" prefix. Returns '' when the
    # media type is omitted.
    media_type: "REGEXP_EXTRACT({col}, '^data:([^;,]+)', 1)"
    # Looks for ";base64" only in the header, i.e. immediately before the
    # first comma. The earlier LIKE '%base64,%' test also fired when the
    # text "base64," merely appeared inside the payload.
    is_base64: "REGEXP_MATCHES({col}, '^data:[^,]*;base64,')"
  validation:
    type: string
    # "data:", optional type/subtype, optional ;key=value parameters,
    # optional ";base64", a comma, then a non-empty payload.
    pattern: "^data:(?:[a-zA-Z0-9]+/[a-zA-Z0-9\\-+.]+)?(?:;[a-zA-Z0-9\\-]+=[^;,]*)*(?:;base64)?,.+$"
  tier: [VARCHAR, internet]
  release_priority: 3
  aliases: null
  samples:
    - "data:text/plain;base64,SGVsbG8gV29ybGQ="
    - "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg=="
    - "data:application/json,{\"key\":\"value\"}"
    - "data:text/html,%3Ch1%3EHello%3C%2Fh1%3E"
  references: null
  notes: >
    Media type is optional (defaults to text/plain;charset=US-ASCII).
    Base64 encoding is indicated by ;base64 before the comma.

technology.filesystem.windows_path:
  # Absolute Windows path: drive-letter root or UNC root, then any characters
  # except CR/LF. Backslash-heavy values below use single-quoted or literal
  # block scalars so each backslash is written exactly once per character.
  title: 'Windows File Path'
  description: >
    An absolute Microsoft Windows filesystem path: a drive letter and colon
    (C:\, D:\) or a UNC share root (\\server\share) followed by
    backslash-separated segments. Distinct from URLs (no scheme) and POSIX
    paths (forward-slash rooted).
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: null
  decompose:
    # Leading drive designator such as 'C:'; the pattern cannot match a UNC
    # path, which has no drive letter.
    drive: |-
      REGEXP_EXTRACT({col}, '^([A-Za-z]:)')
    # Trailing run of non-backslash characters, i.e. the last path segment.
    filename: |-
      REGEXP_EXTRACT({col}, '([^\\]+)$')
  validation:
    type: string
    # Root is either <letter>:\ or a double backslash (UNC); the remainder
    # may be anything that is not a carriage return or line feed.
    pattern: '^([A-Za-z]:\\|\\\\)[^\r\n]*$'
    minLength: 3
  tier: [VARCHAR, filesystem]
  release_priority: 3
  aliases:
    - path
    - filepath
    - directory
  samples:
    - 'C:\Windows\System32\drivers\WdfLdr.sys'
    - 'D:\research\repos\project\src\Main.cs'
    - '\\server\share\reports\q3.xlsx'
  references: null
  notes: >
    New in v0.6.36 (spec 2026-06-19-plain-text-type-discovery). Highest
    volume x nameability type hiding behind representation.text.plain_text:
    7,651 distinct corpus datasets, blind-panel confidence 0.97 (unanimous).
    Pattern fires on <1.2% of multi-word prose. POSIX-path sibling deferred
    (below volume bar + URL-route ambiguity).

technology.internet.message_id:
  # Email Message-ID header value: '<' left '@' right '>'. Stored as VARCHAR;
  # `decompose` returns the two sides of the '@' without the delimiters.
  title: "Email Message-ID"
  description: >
    RFC 2822 Message-ID header — a globally-unique identifier for an email
    message: an angle-bracketed left@right token (<unique@domain>).
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: "CAST({col} AS VARCHAR)"
  transform_ext: null
  decompose:
    # Group 1 is requested explicitly: DuckDB's REGEXP_EXTRACT defaults to
    # group 0 (the entire match), which would leave the '<' / '@' / '>'
    # delimiters attached to the extracted parts.
    local: "REGEXP_EXTRACT({col}, '^<([^@]+)@', 1)"
    domain: "REGEXP_EXTRACT({col}, '@([^>]+)>$', 1)"
  validation:
    type: string
    # Exactly one '@' between the angle brackets; neither side may contain
    # '<', '>', '@' or whitespace.
    pattern: '^<[^<>@\s]+@[^<>@\s]+>$'
  tier: [VARCHAR, internet]
  release_priority: 3
  aliases: [messageid, msgid]
  samples:
    - "<30365805.1075860998985.JavaMail.evans@thyme>"
    - "<CAF2a3b9c@mail.gmail.com>"
    - "<199911abc@server.example.org>"
  references: null
  notes: >
    New in v0.6.36 (spec 2026-06-19-plain-text-type-discovery). 3,032
    distinct corpus datasets, blind-panel confidence 0.95 (unanimous).
    Angle-bracket + @ grammar is unambiguous. Corpus sample is Enron-heavy
    (single provenance) but the RFC 2822 type is universal. Sits beside
    url / urn / mac_address as an internet protocol identifier.

technology.code.qualified_name:
  # Dotted code identifier: one identifier segment followed by two or more
  # dot-prefixed identifier segments (see validation.pattern).
  title: 'Qualified Name'
  description: >
    A dotted, reverse-DNS-style fully-qualified identifier: three or more
    dot-separated identifier segments naming a namespaced code symbol
    (Java/.NET package or class, config key, module path).
  designation: universal
  locales: [UNIVERSAL]
  broad_type: VARCHAR
  frictionless:
    type: string
  format_string: null
  transform: 'CAST({col} AS VARCHAR)'
  transform_ext: null
  decompose:
    # First segment: the leading identifier, up to the first dot.
    root: |-
      REGEXP_EXTRACT({col}, '^([A-Za-z_][A-Za-z0-9_]*)')
    # Last segment: the trailing run of identifier characters.
    leaf: |-
      REGEXP_EXTRACT({col}, '([A-Za-z0-9_]+)$')
  validation:
    type: string
    # Every segment starts with a letter or underscore; {2,} enforces at
    # least three segments in total.
    pattern: '^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*){2,}$'
  tier: [VARCHAR, code]
  release_priority: 3
  aliases:
    - fqn
    - classname
    - namespace
  samples:
    - 'com.google.javascript.jscomp.Compiler'
    - 'org.apache.commons.math.fraction.Fraction'
    - 'Avalonia.UnitTests.Controls.ContentControl'
  references: null
  notes: >
    New in v0.6.36 (spec 2026-06-19-plain-text-type-discovery). 1,677
    distinct corpus datasets, blind-panel confidence 0.79 (unanimous
    "java/dotted FQN"). Pattern fires on ZERO multi-word prose. Boundary
    caveat: structurally overlaps technology.internet.hostname (www.x.com);
    the disambiguating signal is a CamelCase / non-TLD leaf segment — a
    Sharpen-precedence decision for the retrain, not the validator.