harn-stdlib 0.8.24

// std/web — deterministic HTTP-backed source ingest and extraction helpers.
import { cache_get, cache_put } from "std/cache"

/**
 * Case-insensitive header lookup.
 *
 * Returns the value of the first entry in `headers` whose key equals `name`
 * when both are lower-cased, or nil when nothing matches. A nil `headers`
 * is treated as an empty dict.
 */
fn __web_header(headers, name) {
  let target = lowercase(name)
  let entries = headers ?? {}
  for pair in entries {
    if lowercase(pair.key) != target {
      continue
    }
    return pair.value
  }
  return nil
}

/**
 * Build the option dict handed to the cache helpers.
 *
 * Returns `{store: ...}` when `opts.store` is set, otherwise nil.
 */
fn __web_store_options(opts) {
  let store = opts?.store
  if store != nil {
    return {store: store}
  }
  return nil
}

/**
 * Cache key for a fetch.
 *
 * `opts.cache_key` wins when present; otherwise the key is derived from the
 * upper-cased method and the URL.
 */
fn __web_cache_key(method, url, opts) {
  let custom_key = opts?.cache_key
  if custom_key != nil {
    return custom_key
  }
  return "std/web:web_fetch:" + uppercase(method) + ":" + url
}

/**
 * Compute the URL a request targets once `http_opts.query` is applied.
 *
 * With no `query` option the input URL is returned untouched. Otherwise the
 * URL's existing query pairs are kept and each non-nil option entry is
 * appended (stringified) before the URL is rebuilt from its parsed parts.
 */
fn __web_effective_url(url, http_opts) {
  let extra = http_opts?.query
  if extra == nil {
    return url
  }
  let parts = url_parse(url)
  var merged = query_parse(parts?.query ?? "")
  for param in extra {
    // Entries with a nil value are omitted from the rebuilt query.
    if param.value == nil {
      continue
    }
    merged = merged + [{key: param.key, value: to_string(param.value)}]
  }
  let rebuilt = {
    scheme: parts.scheme,
    host: parts.host,
    port: parts.port,
    path: parts.path ?? "/",
    query: query_stringify(merged),
    fragment: parts.fragment,
    username: parts.username,
    password: parts.password,
  }
  return url_build(rebuilt)
}

/**
 * Combine request headers from `opts.http_options.headers` and `opts.headers`.
 *
 * The top-level `headers` dict is merged on top of the `http_options` one;
 * when it is absent the `http_options` headers (or an empty dict) are
 * returned as-is.
 */
fn __web_base_headers(opts) {
  let base = opts?.http_options?.headers ?? {}
  let extra = opts?.headers
  if extra != nil {
    return base + extra
  }
  return base
}

/**
 * Add conditional-request validators taken from a previous envelope.
 *
 * Returns `headers` unchanged when conditional requests are disabled or
 * there is no previous envelope. Otherwise `If-None-Match` and
 * `If-Modified-Since` are added from `previous.etag` and
 * `previous.last_modified`, but only when the caller has not already
 * supplied that header (checked case-insensitively).
 */
fn __web_conditional_headers(headers, previous, enabled) {
  if !enabled || previous == nil {
    return headers
  }
  let etag = previous?.etag
  let last_modified = previous?.last_modified
  var out = headers
  if etag != nil && __web_header(out, "if-none-match") == nil {
    out = out + {"If-None-Match": etag}
  }
  if last_modified != nil && __web_header(out, "if-modified-since") == nil {
    out = out + {"If-Modified-Since": last_modified}
  }
  return out
}

/**
 * Normalize an HTTP response into the `web_fetch` envelope.
 *
 * Missing response fields fall back to defaults: `ok` to false, `body` to
 * "", `headers` to {}, and `final_url` to `source_url`. `content_type`,
 * `etag` and `last_modified` are read from the response headers with a
 * case-insensitive lookup. `not_modified` is always false here.
 */
fn __web_fetch_envelope(source_url, response, fetched_at, cache_status) {
  let headers = response?.headers ?? {}
  let content_type = __web_header(headers, "content-type")
  let etag = __web_header(headers, "etag")
  let last_modified = __web_header(headers, "last-modified")
  let final_url = response?.final_url ?? source_url
  return {
    ok: response?.ok ?? false,
    status: response?.status,
    body: response?.body ?? "",
    headers: headers,
    content_type: content_type,
    etag: etag,
    last_modified: last_modified,
    fetched_at: fetched_at,
    cache_status: cache_status,
    source_url: source_url,
    final_url: final_url,
    not_modified: false,
  }
}

/**
 * Build the envelope returned for a 304 response.
 *
 * Starts from the `previous` envelope (so its body is retained) and
 * overlays the fresh status, headers, timestamp and URLs. `etag`,
 * `last_modified` and `content_type` are replaced only when the 304
 * response carries the corresponding header.
 */
fn __web_not_modified(source_url, response, previous, fetched_at) {
  let headers = response?.headers ?? {}
  let final_url = response?.final_url ?? previous?.final_url ?? source_url
  let refreshed = {
    ok: true,
    status: 304,
    headers: headers,
    fetched_at: fetched_at,
    cache_status: "not_modified",
    source_url: source_url,
    final_url: final_url,
    not_modified: true,
  }
  var merged = previous + refreshed
  let etag = __web_header(headers, "etag")
  let last_modified = __web_header(headers, "last-modified")
  let content_type = __web_header(headers, "content-type")
  if etag != nil {
    merged = merged + {etag: etag}
  }
  if last_modified != nil {
    merged = merged + {last_modified: last_modified}
  }
  if content_type != nil {
    merged = merged + {content_type: content_type}
  }
  return merged
}

/**
 * Fetch `url` via `http_request` and return a normalized provenance envelope.
 *
 * Options:
 *   - method, headers, query, body, timeout, retry, session, proxy, tls:
 *     passed through to `http_request`.
 *   - http_options: dedicated HTTP option dict, for callers that prefer to
 *     keep web-specific options apart from transport options.
 *   - previous: earlier `web_fetch` envelope whose ETag / Last-Modified seed
 *     the conditional request headers.
 *   - store: `std/cache` store. When set, the cached envelope supplies the
 *     validators for conditional re-fetches, and a 304 response reuses the
 *     cached body.
 *   - cache_key: replaces the derived cache key.
 *   - conditional: pass false to omit If-None-Match / If-Modified-Since.
 *   - fetched_at: fixed timestamp, for deterministic fixtures.
 */
pub fn web_fetch(url: string, options = nil) -> dict {
  let opts = options ?? {}
  let method = uppercase(opts?.method ?? opts?.http_options?.method ?? "GET")
  // Without an explicit http_options dict the whole option dict is forwarded.
  let base_http_opts = opts?.http_options ?? opts
  let source_url = __web_effective_url(url, base_http_opts)
  let store_opts = __web_store_options(opts)
  let use_store = store_opts != nil
  let cache_key = __web_cache_key(method, source_url, opts)
  var previous = opts?.previous
  var cache_status = "bypass"
  if use_store {
    cache_status = "miss"
    // A caller-supplied `previous` takes precedence over the cached envelope.
    if previous == nil {
      let cached = cache_get(cache_key, store_opts)
      if cached.hit {
        previous = cached.value
        cache_status = "refresh"
      }
    }
  }
  let conditional = opts?.conditional ?? true
  let request_headers = __web_conditional_headers(__web_base_headers(opts), previous, conditional)
  let response = http_request(method, url, base_http_opts + {headers: request_headers})
  let fetched_at = opts?.fetched_at ?? timestamp()
  if response.status == 304 && previous != nil {
    let unchanged = __web_not_modified(source_url, response, previous, fetched_at)
    if use_store {
      cache_put(cache_key, unchanged, store_opts)
    }
    return unchanged
  }
  let fresh = __web_fetch_envelope(source_url, response, fetched_at, cache_status)
  // Only envelopes with `ok` set are written back to the store.
  if use_store && fresh.ok {
    cache_put(cache_key, fresh, store_opts)
  }
  return fresh
}

/**
 * Resolve the URL reference `href` against `base_url` (typically the URL a
 * document was fetched from). Delegates to `__web_resolve_url`.
 */
pub fn web_resolve_url(base_url: string, href: string) {
  let resolved = __web_resolve_url(base_url, href)
  return resolved
}

/**
 * Return the origin of `url` joined with `path` (default "/"), with the
 * query and fragment dropped. Delegates to `__web_origin_url`.
 */
pub fn web_origin_url(url: string, path: string = "/") -> string {
  let origin = __web_origin_url(url, path)
  return origin
}

/**
 * Parse an HTML document into a dict holding its title, meta tags,
 * canonical URL, links, tables, JSON-LD blocks, and plain text.
 *
 * `source_url` is optional and is forwarded to `__web_extract_html`.
 */
pub fn web_parse_html(html: string, source_url = nil) -> dict {
  let document = __web_extract_html(html, source_url)
  return document
}

/** Return the `title` field of the document parsed by `web_parse_html`. */
pub fn html_title(html: string) {
  let document = web_parse_html(html)
  return document.title
}

/**
 * Return the `meta` field of the parsed document: normalized meta tags keyed
 * by lower-case name/property.
 */
pub fn html_meta(html: string) {
  let document = web_parse_html(html)
  return document.meta
}

/**
 * Return the `links` field of the parsed document (resolved anchor links).
 *
 * `source_url` is optional and is passed through to `web_parse_html`.
 */
pub fn html_links(html: string, source_url = nil) {
  let document = web_parse_html(html, source_url)
  return document.links
}

/**
 * Return the `tables` field of the parsed document: table captions,
 * headers, and rows.
 */
pub fn html_tables(html: string) {
  let document = web_parse_html(html)
  return document.tables
}

/** Return the `json_ld` field of the parsed document: parsed JSON-LD script blocks. */
pub fn html_json_ld(html: string) {
  let document = web_parse_html(html)
  return document.json_ld
}

/**
 * Return the `text` field of the parsed document: normalized visible text
 * with script/style/noscript bodies excluded.
 */
pub fn html_text(html: string) {
  let document = web_parse_html(html)
  return document.text
}

/** Location of `/robots.txt` on the origin of `url`. */
fn __web_robots_url(url) {
  let robots_url = web_origin_url(url, "/robots.txt")
  return robots_url
}

/**
 * Split a `field: value` line at its first colon.
 *
 * Returns `{field, value}` with the field lower-cased and both sides
 * trimmed, or nil when the line has no colon. Colons inside the value
 * (as in URLs) are preserved.
 */
fn __web_line_value(line) {
  let pieces = split(line, ":")
  if len(pieces) >= 2 {
    let field = lowercase(trim(pieces[0]))
    // Re-join the remainder so values containing ":" survive intact.
    let value = trim(join(pieces[1:], ":"))
    return {field: field, value: value}
  }
  return nil
}

/**
 * Parse a robots.txt body into a list of `{field, value}` rows.
 *
 * Everything after a `#` on a line is discarded; blank lines and lines
 * without a colon are skipped. A nil body yields an empty list.
 */
fn __web_robots_rows(body) {
  var parsed = []
  for raw in split(body ?? "", "\n") {
    let content = trim(split(raw, "#")[0])
    if content == "" {
      continue
    }
    let entry = __web_line_value(content)
    if entry == nil {
      continue
    }
    parsed = parsed + [entry]
  }
  return parsed
}

/**
 * Rank how well a group's user-agent list matches `target`.
 *
 * Returns 2 when any agent equals `target` after trimming and lower-casing,
 * 1 when the only match is the `*` wildcard, and 0 when nothing matches.
 */
fn __web_robots_group_score(agents, target) {
  var best = 0
  for raw in agents {
    let name = lowercase(trim(raw))
    if name == target {
      best = 2
      continue
    }
    // The wildcard never downgrades an exact match found earlier.
    if name == "*" && best == 0 {
      best = 1
    }
  }
  return best
}

/**
 * Append a finished robots group to `groups` when it applies to `target`.
 *
 * Groups with no agents, no rules, or a match score of 0 are dropped and
 * `groups` is returned unchanged. Otherwise `{score, rules}` is appended.
 */
fn __web_robots_push_group(groups, agents, rules, target) {
  let has_content = len(agents) > 0 && len(rules) > 0
  if !has_content {
    return groups
  }
  let score = __web_robots_group_score(agents, target)
  if score > 0 {
    return groups + [{score: score, rules: rules}]
  }
  return groups
}

/**
 * Collect the Allow/Disallow rules that apply to `user_agent`.
 *
 * Rows are grouped by consecutive `user-agent` lines; a `user-agent` line
 * that follows at least one rule starts a new group. Among the groups that
 * match, only those with the highest match score (exact name over `*`)
 * contribute, and their rules are concatenated in file order. A nil
 * `user_agent` is treated as `*`.
 */
fn __web_robots_rules(body, user_agent) {
  let target = lowercase(trim(user_agent ?? "*"))
  var groups = []
  var pending_agents = []
  var pending_rules = []
  for row in __web_robots_rows(body) {
    let field = row.field
    if field == "user-agent" {
      // A user-agent line after rules closes the group collected so far.
      if len(pending_rules) > 0 {
        groups = __web_robots_push_group(groups, pending_agents, pending_rules, target)
        pending_agents = []
        pending_rules = []
      }
      pending_agents = pending_agents + [row.value]
      continue
    }
    if field == "allow" || field == "disallow" {
      pending_rules = pending_rules + [{kind: field, path: row.value}]
    }
  }
  // Flush the trailing group.
  groups = __web_robots_push_group(groups, pending_agents, pending_rules, target)
  var top_score = 0
  for group in groups {
    if group.score > top_score {
      top_score = group.score
    }
  }
  var selected = []
  for group in groups {
    if group.score != top_score {
      continue
    }
    selected = selected + group.rules
  }
  return selected
}

/**
 * Path (plus `?query` when the query is non-empty) used for robots rule
 * matching. A missing path defaults to "/".
 */
fn __web_path_for_robots(url) {
  let parts = url_parse(url)
  let query = parts?.query
  let path = parts.path ?? "/"
  if query != nil && query != "" {
    return path + "?" + query
  }
  return path
}

/**
 * Decide whether robots.txt lets `user_agent` fetch `url`.
 *
 * The robots file is fetched with `web_fetch` from `options.robots_url`
 * (default: `/robots.txt` on the URL's origin) using `options.fetch_options`.
 * A robots response that is not `ok` allows the fetch. Matching is a
 * deterministic subset: exact or `*` user-agent groups, Allow/Disallow path
 * prefixes, and longest-prefix precedence, with Allow winning ties.
 */
pub fn robots_allowed(url: string, user_agent: string = "*", options = nil) -> bool {
  let opts = options ?? {}
  let robots_url = opts?.robots_url ?? __web_robots_url(url)
  let response = web_fetch(robots_url, opts?.fetch_options ?? {})
  let fetched_ok = response.ok ?? false
  if !fetched_ok {
    return true
  }
  let path = __web_path_for_robots(url)
  var best_len = -1
  var allowed = true
  for rule in __web_robots_rules(response.body, user_agent) {
    // Rules with an empty path, or whose path is not a prefix, do not apply.
    if rule.path == "" || !starts_with(path, rule.path) {
      continue
    }
    let specificity = len(rule.path)
    let is_allow = rule.kind == "allow"
    if specificity > best_len || (specificity == best_len && is_allow) {
      best_len = specificity
      allowed = is_allow
    }
  }
  return allowed
}

/**
 * Decode the five predefined XML entities (`&lt;`, `&gt;`, `&quot;`,
 * `&apos;`, `&amp;`) in `text` and return the decoded string.
 *
 * `&amp;` is decoded last. Decoding it first would turn an escaped
 * ampersand followed by entity-like text (for example `&amp;lt;`, which
 * encodes the literal text `&lt;`) into a second entity that the later
 * replacements would decode again.
 */
fn __web_xml_unescape(text) {
  var out = regex_replace("&lt;", "<", text)
  out = regex_replace("&gt;", ">", out)
  out = regex_replace("&quot;", "\"", out)
  out = regex_replace("&apos;", "'", out)
  // Must run after the other entities so each input entity is decoded once.
  out = regex_replace("&amp;", "&", out)
  return out
}

/**
 * Extract the distinct `<loc>` values from sitemap XML, in document order.
 *
 * Each value is XML-unescaped and trimmed; empty values and repeats are
 * skipped. A nil `xml` yields an empty list.
 */
fn __web_sitemap_locs(xml) {
  var seen = []
  let matches = regex_captures("(?is)<loc\\b[^>]*>(.*?)</loc>", xml ?? "")
  for hit in matches {
    let raw = hit.groups[0] ?? ""
    let loc = trim(__web_xml_unescape(raw))
    if loc == "" || seen.contains(loc) {
      continue
    }
    seen = seen + [loc]
  }
  return seen
}

/**
 * Decide which sitemap URLs to fetch for `base_url`.
 *
 * `opts.sitemap_urls` is returned verbatim when set. Otherwise robots.txt
 * is fetched (from `opts.robots_url` or the origin's `/robots.txt`) and its
 * distinct, non-empty `Sitemap:` values are used. When none are found the
 * origin's `/sitemap.xml` is the single candidate.
 */
fn __web_sitemap_candidates(base_url, opts) {
  let preset = opts?.sitemap_urls
  if preset != nil {
    return preset
  }
  let robots_url = opts?.robots_url ?? web_origin_url(base_url, "/robots.txt")
  let robots = web_fetch(robots_url, opts?.fetch_options ?? {})
  var advertised = []
  if robots.ok {
    for row in __web_robots_rows(robots.body) {
      if row.field != "sitemap" || row.value == "" || advertised.contains(row.value) {
        continue
      }
      advertised = advertised + [row.value]
    }
  }
  if len(advertised) > 0 {
    return advertised
  }
  return [web_origin_url(base_url, "/sitemap.xml")]
}

/**
 * Collect the distinct URLs listed in a site's sitemaps.
 *
 * Candidate sitemaps come from `options.sitemap_urls`, else from the
 * `Sitemap:` lines in robots.txt, else from `/sitemap.xml`. At most
 * `max_sitemaps` (default 10) candidates are fetched; responses that are
 * not `ok` are skipped.
 *
 * Options: sitemap_urls, robots_url, fetch_options, max_sitemaps.
 */
pub fn sitemap_urls(base_url: string, options = nil) -> list<string> {
  let opts = options ?? {}
  let limit = opts?.max_sitemaps ?? 10
  let fetch_options = opts?.fetch_options ?? {}
  var discovered = []
  var fetched = 0
  for sitemap_url in __web_sitemap_candidates(base_url, opts) {
    // Stop as soon as the fetch budget is used up.
    if fetched >= limit {
      return discovered
    }
    fetched = fetched + 1
    let response = web_fetch(sitemap_url, fetch_options)
    if !response.ok {
      continue
    }
    for loc in __web_sitemap_locs(response.body) {
      if discovered.contains(loc) {
        continue
      }
      discovered = discovered + [loc]
    }
  }
  return discovered
}