harn-stdlib 0.8.58

// std/web — deterministic HTTP-backed source ingest and extraction helpers.
import { cache_get, cache_put } from "std/cache"
import { safe_parse } from "std/json"

// Case-insensitive header lookup. Returns the value of the first entry whose
// key equals `name` ignoring case, or nil when `headers` is nil or nothing matches.
fn __web_header(headers, name) {
  let target = lowercase(name)
  let entries = headers ?? {}
  for pair in entries {
    if lowercase(pair.key) != target {
      continue
    }
    return pair.value
  }
  return nil
}

// Build the options dict passed to the std/cache calls: `{store: ...}` when
// the caller supplied a store, nil otherwise.
fn __web_store_options(opts) {
  let store = opts?.store
  if store != nil {
    return {store: store}
  }
  return nil
}

// Cache key for a fetch: the caller's `cache_key` option when present,
// otherwise a key derived from the upper-cased method and the URL.
fn __web_cache_key(method, url, opts) {
  let custom_key = opts?.cache_key
  if custom_key != nil {
    return custom_key
  }
  return "std/web:web_fetch:" + uppercase(method) + ":" + url
}

// Compute the URL a request effectively targets once `http_opts.query` is
// appended to the query string already present in `url`. Query entries with a
// nil value are skipped; when no query option is given, `url` is returned as-is.
fn __web_effective_url(url, http_opts) {
  let extra = http_opts?.query
  if extra == nil {
    return url
  }
  let parts = url_parse(url)
  var merged = query_parse(parts?.query ?? "")
  for param in extra {
    if param.value == nil {
      continue
    }
    merged = merged + [{key: param.key, value: to_string(param.value)}]
  }
  let rebuilt = {
    scheme: parts.scheme,
    host: parts.host,
    port: parts.port,
    path: parts.path ?? "/",
    query: query_stringify(merged),
    fragment: parts.fragment,
    username: parts.username,
    password: parts.password,
  }
  return url_build(rebuilt)
}

// Merge request headers from `opts.http_options.headers` and `opts.headers`;
// the top-level `headers` entries are applied on top of the http_options ones.
fn __web_base_headers(opts) {
  let from_http = opts?.http_options?.headers ?? {}
  let direct = opts?.headers
  if direct != nil {
    return from_http + direct
  }
  return from_http
}

// Add If-None-Match / If-Modified-Since validators taken from `previous`.
// Headers are returned untouched when conditional requests are disabled or
// there is no previous envelope; a validator header the caller already set
// (under any casing) is never overwritten.
fn __web_conditional_headers(headers, previous, enabled) {
  if !enabled || previous == nil {
    return headers
  }
  let known_etag = previous?.etag
  let known_modified = previous?.last_modified
  var out = headers
  if known_etag != nil && __web_header(out, "if-none-match") == nil {
    out = out + {"If-None-Match": known_etag}
  }
  if known_modified != nil && __web_header(out, "if-modified-since") == nil {
    out = out + {"If-Modified-Since": known_modified}
  }
  return out
}

// Normalize an HTTP response into the `web_fetch` envelope. Missing response
// fields fall back to: ok=false, body="", headers={}, final_url=source_url.
// `not_modified` is always false here.
fn __web_fetch_envelope(source_url, response, fetched_at, cache_status) {
  let response_headers = response?.headers ?? {}
  let content_type = __web_header(response_headers, "content-type")
  let etag = __web_header(response_headers, "etag")
  let last_modified = __web_header(response_headers, "last-modified")
  let final_url = response?.final_url ?? source_url
  return {
    ok: response?.ok ?? false,
    status: response?.status,
    body: response?.body ?? "",
    headers: response_headers,
    content_type: content_type,
    etag: etag,
    last_modified: last_modified,
    fetched_at: fetched_at,
    cache_status: cache_status,
    source_url: source_url,
    final_url: final_url,
    not_modified: false,
  }
}

// Build the envelope for a 304 response: start from `previous` (keeping its
// body), overwrite the status and provenance fields, then refresh etag,
// last_modified, and content_type only when the 304 response carries them.
fn __web_not_modified(source_url, response, previous, fetched_at) {
  let headers = response?.headers ?? {}
  let status_fields = {
    ok: true,
    status: 304,
    headers: headers,
    fetched_at: fetched_at,
    cache_status: "not_modified",
    source_url: source_url,
    final_url: response?.final_url ?? previous?.final_url ?? source_url,
    not_modified: true,
  }
  var merged = previous + status_fields
  let refreshable = [
    {field: "etag", header: "etag"},
    {field: "last_modified", header: "last-modified"},
    {field: "content_type", header: "content-type"},
  ]
  for spec in refreshable {
    let fresh = __web_header(headers, spec.header)
    if fresh != nil {
      let field = spec.field
      merged = merged + {[field]: fresh}
    }
  }
  return merged
}

/**
 * Fetch a web source through Harn's HTTP stack and normalize provenance.
 *
 * Returns an envelope dict with `ok`, `status`, `body`, `headers`,
 * `content_type`, `etag`, `last_modified`, `fetched_at`, `cache_status`,
 * `source_url`, `final_url`, and `not_modified`. `cache_status` is "bypass"
 * when no store is configured, "miss" or "refresh" when one is, and
 * "not_modified" when a 304 response reused the previous envelope.
 *
 * Options:
 *   - method, headers, query, body, timeout, retry, session, proxy, tls:
 *     forwarded to `http_request`.
 *   - http_options: explicit HTTP option dict when callers want web-specific
 *     options kept separate. When absent, the whole options dict is passed
 *     as the HTTP options.
 *   - previous: previous `web_fetch` envelope used for conditional headers.
 *   - store: `std/cache` store. When present, cached ETag / Last-Modified
 *     values are used for conditional re-fetches and 304 responses reuse the
 *     cached body. Only envelopes with `ok` set (and 304 reuses) are written back.
 *   - cache_key: override for the cache key.
 *   - conditional: set false to skip If-None-Match / If-Modified-Since.
 *   - fetched_at: deterministic timestamp override for fixtures.
 *
 * @effects: [net, time]
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: web_fetch(url, options)
 */
pub fn web_fetch(url: string, options = nil) -> dict {
  let opts = options ?? {}
  // A top-level `method` wins over `http_options.method`; the default is GET.
  let method = uppercase(opts?.method ?? opts?.http_options?.method ?? "GET")
  var http_opts = opts?.http_options ?? opts
  // source_url includes any `query` option, so the default cache key and the
  // reported provenance reflect the full request URL.
  let source_url = __web_effective_url(url, http_opts)
  let store_opts = __web_store_options(opts)
  let cache_key = __web_cache_key(method, source_url, opts)
  var previous = opts?.previous
  var cache_status = if store_opts == nil {
    "bypass"
  } else {
    "miss"
  }
  // An explicit `previous` envelope takes precedence; the store is consulted
  // only when the caller did not supply one.
  if previous == nil && store_opts != nil {
    let cached = cache_get(cache_key, store_opts)
    if cached.hit {
      previous = cached.value
      cache_status = "refresh"
    }
  }
  let headers = __web_conditional_headers(__web_base_headers(opts), previous, opts?.conditional ?? true)
  http_opts = http_opts + {headers: headers}
  // The request is issued against the original `url`; the query option
  // travels inside http_opts.
  let response = harness.net.request(method, url, http_opts)
  let fetched_at = opts?.fetched_at ?? harness.clock.timestamp()
  // A 304 is only meaningful with a previous envelope to reuse; without one
  // the response falls through to the regular envelope below.
  if response.status == 304 && previous != nil {
    let not_modified = __web_not_modified(source_url, response, previous, fetched_at)
    if store_opts != nil {
      cache_put(cache_key, not_modified, store_opts)
    }
    return not_modified
  }
  let envelope = __web_fetch_envelope(source_url, response, fetched_at, cache_status)
  // Responses that are not ok are returned to the caller but never cached.
  if store_opts != nil && envelope.ok {
    cache_put(cache_key, envelope, store_opts)
  }
  return envelope
}

// Split a query into unique lower-case search tokens. Runs of characters
// outside [A-Za-z0-9_./@:-] act as separators; tokens of length one or less
// are dropped. First-seen order is preserved.
fn __web_search_tokens(query) {
  let cleaned = trim(regex_replace("[^A-Za-z0-9_./@:-]+", " ", lowercase(query ?? "")))
  var unique = []
  for word in split(cleaned, " ") {
    if word == "" || len(word) <= 1 {
      continue
    }
    if unique.contains(word) {
      continue
    }
    unique = unique.push(word)
  }
  return unique
}

// Lower-cased searchable text for an index entry: title, name, URL, snippet,
// tags, and package joined by single spaces. Missing fields contribute an
// empty string, so the separators are always present.
fn __web_search_text(entry) {
  let fields = [
    to_string(entry?.title ?? ""),
    to_string(entry?.name ?? ""),
    to_string(entry?.url ?? entry?.href ?? entry?.source_url ?? ""),
    to_string(entry?.snippet ?? entry?.description ?? entry?.text ?? ""),
    join(entry?.tags ?? [], " "),
    to_string(entry?.package ?? ""),
  ]
  return lowercase(join(fields, " "))
}

// Relevance score for one curated entry.
//   +25  whole query phrase found in the entry text
//   +8 / +5 / +2 per token found in title / URL / anywhere else (best match only)
//   +6   docs or registry source types
//   +8   entries flagged authority / authoritative
//   +2   entries carrying a version or pinned_version
fn __web_search_score(entry, query, tokens) {
  let haystack = __web_search_text(entry)
  let title = lowercase(to_string(entry?.title ?? entry?.name ?? ""))
  let url = lowercase(to_string(entry?.url ?? entry?.href ?? entry?.source_url ?? ""))
  var total = 0
  let phrase = lowercase(trim(query ?? ""))
  if phrase != "" && contains(haystack, phrase) {
    total = total + 25
  }
  for token in tokens {
    var weight = 0
    if contains(title, token) {
      weight = 8
    } else if contains(url, token) {
      weight = 5
    } else if contains(haystack, token) {
      weight = 2
    }
    total = total + weight
  }
  let trusted_kinds = ["docs", "pinned_docs", "package_registry", "registry"]
  if trusted_kinds.contains(entry?.source_type ?? entry?.kind) {
    total = total + 6
  }
  if entry?.authority || entry?.authoritative {
    total = total + 8
  }
  if entry?.version != nil || entry?.pinned_version != nil {
    total = total + 2
  }
  return total
}

// Return at most `limit` leading items of `items` as a new list.
fn __web_search_take(items, limit) {
  var taken = []
  for candidate in items {
    if len(taken) < limit {
      taken = taken + [candidate]
    } else {
      return taken
    }
  }
  return taken
}

// Walk `value` along `path` and return what is found there, or `fallback`.
// `path` is either a list of keys or a dotted string ("a.b.c"); empty
// segments of a dotted string are ignored. A nil path, a nil value anywhere
// along the walk, or a nil final value all yield `fallback`.
fn __web_search_path_get(value, path, fallback = nil) {
  if path == nil {
    return fallback
  }
  var segments = path
  if type_of(path) != "list" {
    segments = split(to_string(path), ".").filter({ name -> name != "" })
  }
  var cursor = value
  for segment in segments {
    if cursor == nil {
      return fallback
    }
    cursor = cursor[segment]
  }
  if cursor == nil {
    return fallback
  }
  return cursor
}

// Merge two optional header dicts; entries from `right` are applied on top
// of `left`, and nil on either side counts as an empty dict.
fn __web_search_merge_headers(left, right) {
  return (left ?? {}) + (right ?? {})
}

// Options for a tool invocation: the registry-level `base` options overlaid
// with `args.options` when the call supplies it, otherwise with `args` itself.
fn __web_search_tool_options(base, args) {
  var overlay = args?.options
  if overlay == nil {
    overlay = args ?? {}
  }
  return (base ?? {}) + overlay
}

// Resolve which search backend to use. Precedence:
//   1. `opts.backend` given as a dict (kind/id defaults filled in) or a string
//   2. `opts.provider_results`
//   3. `opts.api`
//   4. the HARN_WEB_SEARCH_URL environment variable (with optional
//      HARN_WEB_SEARCH_BEARER_TOKEN and HARN_WEB_SEARCH_QUERY_PARAM)
//   5. the inline curated index
fn __web_search_backend(opts) {
  let configured = opts?.backend
  let configured_type = type_of(configured)
  if configured_type == "dict" {
    let defaults = {
      kind: configured?.kind ?? configured?.type ?? "curated_index",
      id: configured?.id ?? configured?.name ?? configured?.kind ?? "configured",
      deterministic: true,
    }
    return configured + defaults
  }
  if configured_type == "string" {
    return {kind: configured, id: configured, deterministic: true}
  }
  if opts?.provider_results != nil {
    return {kind: "provider_hosted", id: "provider_results", deterministic: true, evidence: "provider_results"}
  }
  let api = opts?.api
  if api != nil {
    return {
      kind: "api",
      id: api?.id ?? api?.name ?? api?.url ?? "configured_search_api",
      deterministic: true,
      evidence: "search_api",
      api: api,
    }
  }
  let endpoint = harness.env.get("HARN_WEB_SEARCH_URL")
  if endpoint == nil || endpoint == "" {
    return {
      kind: "curated_index",
      id: opts?.index_id ?? "inline",
      deterministic: true,
      evidence: "curated_index",
    }
  }
  var auth_headers = {}
  let token = harness.env.get("HARN_WEB_SEARCH_BEARER_TOKEN")
  if token != nil && token != "" {
    auth_headers = {Authorization: "Bearer " + token}
  }
  let env_api = {
    url: endpoint,
    query_param: harness.env.get_or("HARN_WEB_SEARCH_QUERY_PARAM", "q"),
    headers: auth_headers,
  }
  return {
    kind: "api",
    id: "env:HARN_WEB_SEARCH_URL",
    deterministic: true,
    evidence: "search_api",
    api: env_api,
  }
}

// Reduce a backend dict to the fields exposed in search results: kind, id,
// deterministic, evidence, plus source_url / final_url when present. Other
// backend fields (such as the `api` dict) are left out.
fn __web_search_public_backend(backend) {
  let source = backend?.source_url
  let resolved = backend?.final_url
  var summary = {
    kind: backend.kind,
    id: backend.id,
    deterministic: backend?.deterministic ?? true,
    evidence: backend?.evidence ?? backend.kind,
  }
  if source != nil {
    summary = summary + {source_url: source}
  }
  if resolved != nil {
    summary = summary + {final_url: resolved}
  }
  return summary
}

// Convert a raw entry into the normalized result shape, accepting the
// alternate field names different backends use (href, name, description, ...).
// The URL falls back through url, href, source_url, docs_url; the title falls
// back to the name and then to the URL.
fn __web_search_normalize(entry, rank, backend, score = nil) {
  let link = entry?.url ?? entry?.href ?? entry?.source_url ?? entry?.docs_url
  let origin = entry?.source_url ?? link
  let provenance = {
    backend_kind: backend.kind,
    backend_id: backend.id,
    evidence: backend?.evidence ?? backend.kind,
    score: score,
    source_url: origin,
    trust_score: entry?.trust_score,
    first_seen: entry?.first_seen,
    fetched_at: backend?.fetched_at,
  }
  return {
    rank: rank,
    title: to_string(entry?.title ?? entry?.name ?? link ?? ""),
    url: link,
    snippet: to_string(entry?.snippet ?? entry?.description ?? entry?.text ?? ""),
    source_url: origin,
    final_url: entry?.final_url ?? link,
    source_type: entry?.source_type ?? entry?.kind ?? "web",
    authority: entry?.authority ?? entry?.authoritative ?? false,
    package: entry?.package ?? entry?.name,
    registry: entry?.registry ?? entry?.ecosystem,
    version: entry?.version ?? entry?.pinned_version,
    provenance: provenance,
  }
}

// Search a curated index locally. Entries come from `opts.index`,
// `opts.results`, or `backend.index`; each is scored against the query,
// zero-score entries are dropped unless `include_zero_score` is set, and the
// highest-scoring `limit` entries (default 10) are returned in rank order.
fn __web_search_curated(query, opts, backend) {
  let tokens = __web_search_tokens(query)
  let max_results = opts?.limit ?? 10
  let candidates = opts?.index ?? opts?.results ?? backend?.index ?? []
  var ranked = []
  for candidate in candidates {
    let relevance = __web_search_score(candidate, query, tokens)
    if relevance > 0 || opts?.include_zero_score {
      ranked = ranked + [{entry: candidate, score: relevance}]
    }
  }
  // Sorting on the negated score puts the highest score first.
  let best = __web_search_take(ranked.sort_by({ scored -> 0 - scored.score }), max_results)
  var results = []
  for scored in best {
    let position = len(results) + 1
    results = results + [__web_search_normalize(scored.entry, position, backend, scored.score)]
  }
  return {
    ok: true,
    query: query,
    backend: __web_search_public_backend(backend),
    results: results,
    provenance: {resolver: "std/web", result_count: len(results), authoritative_first: true},
  }
}

// Normalize externally captured provider results. The provider's ordering is
// kept as the ranking (no local scoring) and at most `limit` entries
// (default 10) are returned.
fn __web_search_provider_results(query, opts, backend) {
  let captured = __web_search_take(opts.provider_results ?? [], opts?.limit ?? 10)
  var results = []
  for entry in captured {
    let position = len(results) + 1
    results = results + [__web_search_normalize(entry, position, backend)]
  }
  return {
    ok: true,
    query: query,
    backend: __web_search_public_backend(backend),
    results: results,
    provenance: {resolver: "std/web", result_count: len(results), authoritative_first: false},
  }
}

// Map one raw API result onto the title / url / snippet / source_url fields.
// Non-dict items become a title-and-snippet pair holding their string form.
// For dict items the configured `*_path` options choose where each field is
// read from, with the conventional field names as fallback.
fn __web_search_api_item(item, api) {
  if type_of(item) != "dict" {
    let text = to_string(item)
    return {title: text, snippet: text}
  }
  let title = __web_search_path_get(item, api?.title_path ?? "title", item?.title ?? item?.name)
  let url = __web_search_path_get(item, api?.url_path ?? "url", item?.url ?? item?.href)
  let snippet = __web_search_path_get(
    item,
    api?.snippet_path ?? "snippet",
    item?.snippet ?? item?.description ?? item?.text,
  )
  let source_url = __web_search_path_get(item, api?.source_url_path ?? "source_url", item?.source_url)
  return item + {title: title, url: url, snippet: snippet, source_url: source_url}
}

// Query a JSON search API through `web_fetch` and normalize its results.
//
// The API description comes from `backend.api`, falling back to `opts.api`.
// GET requests pass the query as a query-string parameter (`query_param`,
// default "q"); other methods send `api.body`, or a JSON body holding the
// query and limit when no body is configured. Results are read from
// `results_path` (default "results") in the parsed response body.
// `ok` is true only when the fetch succeeded and the body parsed.
fn __web_search_api(query, opts, backend) {
  let api = backend.api ?? opts.api
  let limit = opts?.limit ?? api?.limit ?? 10
  var fetch_options = api?.fetch_options ?? {}
  let method = uppercase(api?.method ?? fetch_options?.method ?? "GET")
  if method == "GET" {
    let query_param = api?.query_param ?? "q"
    // Keep query parameters already configured in fetch_options and add the search term.
    let existing_query = fetch_options?.query ?? {}
    fetch_options = fetch_options + {method: method, query: existing_query + {[query_param]: query}}
  } else {
    let headers = __web_search_merge_headers(fetch_options?.headers, api?.headers)
    fetch_options = fetch_options
      + {method: method, headers: headers, body: api?.body ?? json_stringify({query: query, limit: limit})}
  }
  // The non-GET branch merged headers above; GET requests pick up
  // `api.headers` here, and only when some are configured.
  if api?.headers != nil && method == "GET" {
    fetch_options = fetch_options + {headers: __web_search_merge_headers(fetch_options?.headers, api.headers)}
  }
  let fetched = web_fetch(api.url, fetch_options)
  let parsed = safe_parse(fetched.body)
  let items = __web_search_path_get(parsed, api?.results_path ?? "results", [])
  // Anything other than a list at the results path is treated as no results.
  let raw_results = if type_of(items) == "list" {
    items
  } else {
    []
  }
  var results = []
  var rank = 1
  for item in __web_search_take(raw_results, limit) {
    let normalized = __web_search_api_item(item, api)
    results = results
      .push(
      __web_search_normalize(
        normalized,
        rank,
        backend + {fetched_at: fetched.fetched_at},
        normalized?.score,
      ),
    )
    rank = rank + 1
  }
  let fetch_ok = fetched.ok ?? false
  // NOTE(review): assumes safe_parse yields nil for a body it cannot parse — confirm against std/json.
  let parsed_ok = parsed != nil
  return {
    ok: fetch_ok && parsed_ok,
    query: query,
    backend: __web_search_public_backend(
      backend + {source_url: fetched.source_url, final_url: fetched.final_url},
    ),
    results: results,
    provenance: {
      resolver: "std/web",
      result_count: len(results),
      authoritative_first: false,
      fetched_at: fetched.fetched_at,
      cache_status: fetched.cache_status,
      source_url: fetched.source_url,
      final_url: fetched.final_url,
    },
  }
}

/**
 * Search web, docs, or registry evidence and return normalized results with
 * provenance, whichever backend produced them.
 *
 * Backends:
 *   - `index` / `results`: deterministic curated entries scored locally.
 *   - `api`: configured JSON search API fetched through `web_fetch`.
 *   - `provider_results`: externally captured hosted-tool results, normalized
 *     without tying this primitive to any provider.
 *   - `HARN_WEB_SEARCH_URL`: optional process configuration for an API backend.
 *
 * Any backend kind other than "api" or "provider_hosted" is served by the
 * curated-index search.
 *
 * @effects: [net, time]
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: web_search(query, options)
 */
pub fn web_search(query: string, options = nil) -> dict {
  let opts = options ?? {}
  let backend = __web_search_backend(opts)
  let kind = backend.kind
  if kind == "api" {
    return __web_search_api(query, opts, backend)
  } else if kind == "provider_hosted" {
    return __web_search_provider_results(query, opts, backend)
  }
  return __web_search_curated(query, opts, backend)
}

/**
 * Deterministically verify package imports, and optionally symbol evidence,
 * for the given source files. Recognized options include `project_root`,
 * `installed_packages`, `registry`, `now`, `low_trust_threshold`, and
 * `min_package_age_days`; a nil `options` is treated as an empty dict.
 *
 * @effects: [fs]
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: verify_imports(paths, options)
 */
pub fn verify_imports(paths, options = nil) -> dict {
  let opts = options ?? {}
  return __verify_imports(paths, opts)
}

/**
 * Register model-callable `web_search` and `verify_imports` tools, each with
 * capability-gated grounding guidance, and return the updated registry.
 *
 * A fresh registry is created when `registry` is nil. `options` may rename
 * the tools (`search_tool_name`, `verify_tool_name`); it is also the base
 * option set merged under each call's arguments.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: web_grounding_tools(registry, options)
 */
pub fn web_grounding_tools(registry = nil, options = nil) {
  let opts = options ?? {}
  let search_name = opts?.search_tool_name ?? "web_search"
  let verify_name = opts?.verify_tool_name ?? "verify_imports"
  let search_spec = {
    parameters: {
      type: "object",
      required: ["query"],
      properties: {query: {type: "string"}, limit: {type: "integer"}, index: {type: "array"}, api: {type: "object"}},
    },
    annotations: {readOnlyHint: true},
    handler: { args -> web_search(args.query, __web_search_tool_options(opts, args)) },
    guidance: "For unfamiliar APIs, package names, or recent behavior, call web_search against authoritative docs or registries before coding. Prefer pinned docs and registry evidence over general web snippets, and carry source URLs into the answer or follow-up prompt.",
  }
  let verify_spec = {
    parameters: {
      type: "object",
      required: ["paths"],
      properties: {
        paths: {type: "array", items: {type: "string"}},
        project_root: {type: "string"},
        registry: {type: "array"},
        installed_packages: {},
      },
    },
    annotations: {readOnlyHint: true},
    // A call that supplies a single `path` instead of `paths` is wrapped into a one-element list.
    handler: { args -> verify_imports(args.paths ?? [args.path], __web_search_tool_options(opts, args)) },
    guidance: "After generating or editing code with imports, call verify_imports on the touched source files. Treat package_not_found and symbol_not_found as blockers. Treat fresh_package or low_trust_package warnings as a forced lookup trigger before proceeding.",
  }
  let with_search = tool_define(
    registry ?? tool_registry(),
    search_name,
    "Search authoritative docs, package registries, or a configured search API and return normalized results with provenance.",
    search_spec,
  )
  return tool_define(
    with_search,
    verify_name,
    "Verify imports in source files against manifests, installed-package hints, registry evidence, symbol metadata, and package trust signals.",
    verify_spec,
  )
}

/**
 * Resolve `href` against `base_url`, typically the URL a document was
 * fetched from.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: web_resolve_url(base_url, href)
 */
pub fn web_resolve_url(base_url: string, href: string) {
  let resolved = __web_resolve_url(base_url, href)
  return resolved
}

/**
 * Build a URL from the origin of `url` and the given `path` (default "/"),
 * dropping the original query and fragment.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: web_origin_url(url, path)
 */
pub fn web_origin_url(url: string, path: string = "/") -> string {
  let origin_url = __web_origin_url(url, path)
  return origin_url
}

/**
 * Parse an HTML document into its title, meta tags, canonical URL, links,
 * tables, JSON-LD blocks, and plain text. `source_url` is optional and is
 * passed through to the extractor.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: web_parse_html(html, source_url)
 */
pub fn web_parse_html(html: string, source_url = nil) -> dict {
  let extracted = __web_extract_html(html, source_url)
  return extracted
}

/**
 * Return the `title` field of the parsed HTML document.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_title(html)
 */
pub fn html_title(html: string) {
  let document = web_parse_html(html)
  return document.title
}

/**
 * Return the normalized HTML meta tags, keyed by lower-case name/property.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_meta(html)
 */
pub fn html_meta(html: string) {
  let document = web_parse_html(html)
  return document.meta
}

/**
 * Return the links found in anchors, resolved using `source_url` when given.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_links(html, source_url)
 */
pub fn html_links(html: string, source_url = nil) {
  let document = web_parse_html(html, source_url)
  return document.links
}

/**
 * Return the tables of the document: captions, headers, and rows.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_tables(html)
 */
pub fn html_tables(html: string) {
  let document = web_parse_html(html)
  return document.tables
}

/**
 * Return the parsed JSON-LD script blocks of the document.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_json_ld(html)
 */
pub fn html_json_ld(html: string) {
  let document = web_parse_html(html)
  return document.json_ld
}

/**
 * Return the normalized visible text of the document, without
 * script/style/noscript bodies.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: html_text(html)
 */
pub fn html_text(html: string) {
  let document = web_parse_html(html)
  return document.text
}

// Location of robots.txt for the origin that serves `url`.
fn __web_robots_url(url) {
  let robots_path = "/robots.txt"
  return web_origin_url(url, robots_path)
}

// Split a "Field: value" line at its first colon. The field is trimmed and
// lower-cased; the value keeps any later colons (as in URLs) and is trimmed.
// Returns nil for a line without a colon.
fn __web_line_value(line) {
  let pieces = split(line, ":")
  if len(pieces) >= 2 {
    let field = lowercase(trim(pieces[0]))
    let value = trim(join(pieces[1:], ":"))
    return {field: field, value: value}
  }
  return nil
}

// Parse a robots.txt body into {field, value} rows. Everything after a `#`
// is stripped as a comment; blank lines and lines without a colon are skipped.
fn __web_robots_rows(body) {
  var parsed = []
  for raw in split(body ?? "", "\n") {
    let content = trim(split(raw, "#")[0])
    if content == "" {
      continue
    }
    let row = __web_line_value(content)
    if row == nil {
      continue
    }
    parsed = parsed + [row]
  }
  return parsed
}

// How well a group's user-agent list matches `target`: 2 when some agent
// equals the target (case-insensitively, after trimming), 1 when the group
// only matches through the `*` wildcard, 0 when it does not apply.
fn __web_robots_group_score(agents, target) {
  var exact = false
  var wildcard = false
  for agent in agents {
    let name = lowercase(trim(agent))
    if name == target {
      exact = true
    } else if name == "*" {
      wildcard = true
    }
  }
  if exact {
    return 2
  }
  if wildcard {
    return 1
  }
  return 0
}

// Append a finished robots.txt group to `groups` when it is relevant: it
// needs at least one agent, at least one rule, and a positive match score
// for `target`. Otherwise `groups` is returned unchanged.
fn __web_robots_push_group(groups, agents, rules, target) {
  let has_content = len(agents) > 0 && len(rules) > 0
  if !has_content {
    return groups
  }
  let score = __web_robots_group_score(agents, target)
  if score > 0 {
    return groups + [{score: score, rules: rules}]
  }
  return groups
}

// Collect the Allow/Disallow rules that apply to `user_agent`.
//
// The body is split into groups: a run of consecutive User-agent rows
// followed by their rules. Each group is scored against the target agent and
// only the rules from the best-scoring groups are returned, so exact-match
// groups shadow `*` groups. Several groups with the same best score are
// concatenated. A nil `user_agent` is treated as `*`.
fn __web_robots_rules(body, user_agent) {
  let target = lowercase(trim(user_agent ?? "*"))
  var groups = []
  var agents = []
  var group_rules = []
  for row in __web_robots_rows(body) {
    if row.field == "user-agent" {
      // A User-agent row that follows rules starts a new group; one that
      // follows another User-agent row extends the current agent list.
      if len(group_rules) > 0 {
        groups = __web_robots_push_group(groups, agents, group_rules, target)
        agents = []
        group_rules = []
      }
      agents = agents + [row.value]
    } else if row.field == "allow" || row.field == "disallow" {
      group_rules = group_rules + [{kind: row.field, path: row.value}]
    }
    // Other fields (for example Sitemap) are ignored here.
  }
  // Flush the final group, which has no following User-agent row to close it.
  groups = __web_robots_push_group(groups, agents, group_rules, target)
  var best_score = 0
  for group in groups {
    if group.score > best_score {
      best_score = group.score
    }
  }
  var rules = []
  for group in groups {
    if group.score == best_score {
      rules = rules + group.rules
    }
  }
  return rules
}

// The part of `url` that robots.txt rules are matched against: the path
// (defaulting to "/"), followed by "?query" when the URL has a non-empty query.
fn __web_path_for_robots(url) {
  let parts = url_parse(url)
  let path = parts.path ?? "/"
  let query = parts?.query
  if query != nil && query != "" {
    return path + "?" + query
  }
  return path
}

/**
 * Decide whether robots.txt lets `user_agent` fetch `url`.
 *
 * A robots file that is missing or not served with a 2xx status allows
 * everything. Matching covers the deterministic subset recurring CI
 * workflows need: exact or `*` user-agent groups and Allow/Disallow path
 * prefixes. The longest matching prefix wins, and Allow wins a tie between
 * prefixes of equal length. Rules with an empty path are ignored.
 *
 * Options: `robots_url` overrides the robots.txt location and
 * `fetch_options` is passed to `web_fetch`.
 *
 * @effects: [net]
 * @allocation: stack-only
 * @errors: []
 * @api_stability: stable
 * @example: robots_allowed(url, user_agent, options)
 */
pub fn robots_allowed(url: string, user_agent: string = "*", options = nil) -> bool {
  let opts = options ?? {}
  let robots_url = opts?.robots_url ?? __web_robots_url(url)
  let response = web_fetch(robots_url, opts?.fetch_options ?? {})
  if !(response.ok ?? false) {
    return true
  }
  let path = __web_path_for_robots(url)
  var best_len = -1
  var allowed = true
  for rule in __web_robots_rules(response.body, user_agent) {
    if rule.path == "" || !starts_with(path, rule.path) {
      continue
    }
    let is_allow = rule.kind == "allow"
    let match_len = len(rule.path)
    // A shorter prefix never overrides a longer one already recorded.
    if match_len < best_len {
      continue
    }
    // At equal length only an Allow rule may replace the recorded decision.
    if match_len == best_len && !is_allow {
      continue
    }
    best_len = match_len
    allowed = is_allow
  }
  return allowed
}

// Decode the five predefined XML entities (&lt; &gt; &quot; &apos; &amp;) in `text`.
//
// `&amp;` is decoded last on purpose: decoding it first would turn an escaped
// entity such as `&amp;lt;` into `&lt;` and then decode it a second time to
// `<`, corrupting text that legitimately contains entity-like sequences.
fn __web_xml_unescape(text) {
  var out = regex_replace("&lt;", "<", text)
  out = regex_replace("&gt;", ">", out)
  out = regex_replace("&quot;", "\"", out)
  out = regex_replace("&apos;", "'", out)
  out = regex_replace("&amp;", "&", out)
  return out
}

// Extract the unique, non-empty contents of `<loc>` elements from a sitemap
// document, XML-unescaped and trimmed, in document order.
fn __web_sitemap_locs(xml) {
  var locations = []
  for found in regex_captures("(?is)<loc\\b[^>]*>(.*?)</loc>", xml ?? "") {
    let location = trim(__web_xml_unescape(found.groups[0] ?? ""))
    if location == "" || locations.contains(location) {
      continue
    }
    locations = locations + [location]
  }
  return locations
}

// Decide which sitemap URLs to fetch for `base_url`:
//   1. `opts.sitemap_urls`, returned as given when present;
//   2. the unique `Sitemap:` rows advertised by robots.txt;
//   3. `/sitemap.xml` on the origin when robots.txt lists none or is not ok.
fn __web_sitemap_candidates(base_url, opts) {
  let explicit = opts?.sitemap_urls
  if explicit != nil {
    return explicit
  }
  let robots_url = opts?.robots_url ?? web_origin_url(base_url, "/robots.txt")
  let robots = web_fetch(robots_url, opts?.fetch_options ?? {})
  var advertised = []
  if robots.ok {
    for row in __web_robots_rows(robots.body) {
      if row.field != "sitemap" || row.value == "" || advertised.contains(row.value) {
        continue
      }
      advertised = advertised + [row.value]
    }
  }
  if len(advertised) > 0 {
    return advertised
  }
  return [web_origin_url(base_url, "/sitemap.xml")]
}

/**
 * Discover URLs listed in the sitemaps robots.txt advertises, or in
 * `/sitemap.xml` when none are advertised.
 *
 * At most `max_sitemaps` (default 10) sitemap documents are fetched.
 * Sitemaps whose fetch is not ok are skipped, and each URL appears once in
 * the result.
 *
 * Options: sitemap_urls, robots_url, fetch_options, max_sitemaps.
 *
 * @effects: [net]
 * @allocation: heap
 * @errors: []
 * @api_stability: stable
 * @example: sitemap_urls(base_url, options)
 */
pub fn sitemap_urls(base_url: string, options = nil) -> list<string> {
  let opts = options ?? {}
  let budget = opts?.max_sitemaps ?? 10
  var collected = []
  var fetched = 0
  for candidate in __web_sitemap_candidates(base_url, opts) {
    if fetched >= budget {
      return collected
    }
    fetched = fetched + 1
    let response = web_fetch(candidate, opts?.fetch_options ?? {})
    if !response.ok {
      continue
    }
    for loc in __web_sitemap_locs(response.body) {
      if collected.contains(loc) {
        continue
      }
      collected = collected + [loc]
    }
  }
  return collected
}