// std/web — deterministic HTTP-backed source ingest and extraction helpers.
import { cache_get, cache_put } from "std/cache"
/**
 * Look up a header value by name, ignoring case.
 *
 * `headers` may be nil; it is then treated as an empty dict. Returns the
 * value of the first entry whose key matches `name` case-insensitively, or
 * nil when no entry matches.
 */
fn __web_header(headers, name) {
  let needle = lowercase(name)
  let entries = headers ?? {}
  for pair in entries {
    if lowercase(pair.key) != needle {
      continue
    }
    return pair.value
  }
  return nil
}
/**
 * Build the options dict handed to the cache helpers.
 *
 * Returns `{store: opts.store}` when a store is configured, otherwise nil.
 */
fn __web_store_options(opts) {
  let store = opts?.store
  if store != nil {
    return {store: store}
  }
  return nil
}
/**
 * Compute the cache key for a fetch.
 *
 * An explicit `opts.cache_key` wins; otherwise the key is derived from the
 * upper-cased method and the URL.
 */
fn __web_cache_key(method, url, opts) {
  let custom_key = opts?.cache_key
  if custom_key != nil {
    return custom_key
  }
  return "std/web:web_fetch:" + uppercase(method) + ":" + url
}
/**
 * Return `url` with the `query` option from `http_opts` merged into it.
 *
 * When no `query` option is present the URL is returned untouched.
 * Otherwise the URL's existing query pairs are kept and each non-nil entry
 * of the `query` option is appended (stringified) after them, and the URL
 * is rebuilt from its parsed components.
 */
fn __web_effective_url(url, http_opts) {
  let extra = http_opts?.query
  if extra == nil {
    return url
  }
  let parts = url_parse(url)
  var merged = query_parse(parts?.query ?? "")
  for item in extra {
    // Entries with a nil value are skipped rather than emitted.
    if item.value == nil {
      continue
    }
    merged = merged + [{key: item.key, value: to_string(item.value)}]
  }
  let rebuilt = {
    scheme: parts.scheme,
    host: parts.host,
    port: parts.port,
    path: parts.path ?? "/",
    query: query_stringify(merged),
    fragment: parts.fragment,
    username: parts.username,
    password: parts.password,
  }
  return url_build(rebuilt)
}
/**
 * Combine request headers from `opts.http_options.headers` and
 * `opts.headers`.
 *
 * The top-level `headers` dict is merged on top of the `http_options`
 * headers; when it is absent the `http_options` headers (or an empty dict)
 * are returned as-is.
 */
fn __web_base_headers(opts) {
  let base = opts?.http_options?.headers ?? {}
  let overrides = opts?.headers
  if overrides != nil {
    return base + overrides
  }
  return base
}
/**
 * Add conditional-request headers derived from a previous fetch envelope.
 *
 * Returns `headers` unchanged when conditional fetching is disabled or no
 * previous envelope exists. Otherwise `If-None-Match` / `If-Modified-Since`
 * are added from `previous.etag` / `previous.last_modified`, but only when
 * the caller has not already supplied that header (in any letter case).
 */
fn __web_conditional_headers(headers, previous, enabled) {
  if !enabled || previous == nil {
    return headers
  }
  var merged = headers
  let known_etag = previous?.etag
  if __web_header(merged, "if-none-match") == nil && known_etag != nil {
    merged = merged + {"If-None-Match": known_etag}
  }
  let known_modified = previous?.last_modified
  if __web_header(merged, "if-modified-since") == nil && known_modified != nil {
    merged = merged + {"If-Modified-Since": known_modified}
  }
  return merged
}
/**
 * Wrap an HTTP response in the normalized `web_fetch` envelope.
 *
 * Missing response fields fall back to defaults: `ok` to false, `body` to
 * "", `headers` to an empty dict, and `final_url` to `source_url`.
 * `content_type`, `etag`, and `last_modified` are read from the response
 * headers case-insensitively. `not_modified` is always false here.
 */
fn __web_fetch_envelope(source_url, response, fetched_at, cache_status) {
  let response_headers = response?.headers ?? {}
  let content_type = __web_header(response_headers, "content-type")
  let etag = __web_header(response_headers, "etag")
  let last_modified = __web_header(response_headers, "last-modified")
  let final_url = response?.final_url ?? source_url
  return {
    ok: response?.ok ?? false,
    status: response?.status,
    body: response?.body ?? "",
    headers: response_headers,
    content_type: content_type,
    etag: etag,
    last_modified: last_modified,
    fetched_at: fetched_at,
    cache_status: cache_status,
    source_url: source_url,
    final_url: final_url,
    not_modified: false,
  }
}
/**
 * Build the envelope returned for a 304 response.
 *
 * Starts from the `previous` envelope (so its body is reused) and
 * overwrites status/provenance fields with the 304 response's data.
 * `etag`, `last_modified`, and `content_type` are only replaced when the
 * 304 response actually carries the corresponding header.
 */
fn __web_not_modified(source_url, response, previous, fetched_at) {
  let fresh_headers = response?.headers ?? {}
  let resolved_final = response?.final_url ?? previous?.final_url ?? source_url
  var merged = previous
    + {
      ok: true,
      status: 304,
      headers: fresh_headers,
      fetched_at: fetched_at,
      cache_status: "not_modified",
      source_url: source_url,
      final_url: resolved_final,
      not_modified: true,
    }
  // Validators on the 304 response supersede the previously stored ones.
  let fresh_etag = __web_header(fresh_headers, "etag")
  let fresh_modified = __web_header(fresh_headers, "last-modified")
  let fresh_type = __web_header(fresh_headers, "content-type")
  if fresh_etag != nil {
    merged = merged + {etag: fresh_etag}
  }
  if fresh_modified != nil {
    merged = merged + {last_modified: fresh_modified}
  }
  if fresh_type != nil {
    merged = merged + {content_type: fresh_type}
  }
  return merged
}
/**
 * Fetch `url` via `http_request` and return a normalized provenance
 * envelope (see `__web_fetch_envelope` for the fields).
 *
 * Options:
 * - http_options: dict passed to `http_request`. When absent, the whole
 *   `options` dict is passed instead, so method, headers, query, body,
 *   timeout, retry, session, proxy, tls can be given at the top level.
 * - method: HTTP method; falls back to `http_options.method`, then "GET".
 * - headers: merged on top of `http_options.headers`.
 * - previous: earlier `web_fetch` envelope used to build conditional
 *   headers; takes precedence over a cached entry.
 * - store: `std/cache` store. When present, a cached envelope supplies the
 *   ETag / Last-Modified validators, 304 responses reuse the cached body,
 *   and successful (`ok`) responses are written back.
 * - cache_key: override for the derived cache key.
 * - conditional: set false to skip If-None-Match / If-Modified-Since.
 * - fetched_at: deterministic timestamp override for fixtures.
 *
 * `cache_status` in the result is "bypass" (no store), "miss", "refresh"
 * (a cached entry existed but a full response came back), or
 * "not_modified" (304 answered from the previous envelope).
 */
pub fn web_fetch(url: string, options = nil) -> dict {
  let opts = options ?? {}
  let verb = uppercase(opts?.method ?? opts?.http_options?.method ?? "GET")
  let request_base = opts?.http_options ?? opts
  let source_url = __web_effective_url(url, request_base)
  let store_opts = __web_store_options(opts)
  let cache_key = __web_cache_key(verb, source_url, opts)

  var prior = opts?.previous
  var cache_status = "bypass"
  if store_opts != nil {
    cache_status = "miss"
    // Only consult the store when the caller did not pass `previous`.
    if prior == nil {
      let cached = cache_get(cache_key, store_opts)
      if cached.hit {
        prior = cached.value
        cache_status = "refresh"
      }
    }
  }

  let conditional = opts?.conditional ?? true
  let request_headers = __web_conditional_headers(__web_base_headers(opts), prior, conditional)
  let request_opts = request_base + {headers: request_headers}
  let response = http_request(verb, url, request_opts)
  // The timestamp is taken after the request completes.
  let fetched_at = opts?.fetched_at ?? timestamp()

  if response.status == 304 && prior != nil {
    let reused = __web_not_modified(source_url, response, prior, fetched_at)
    if store_opts != nil {
      cache_put(cache_key, reused, store_opts)
    }
    return reused
  }

  let envelope = __web_fetch_envelope(source_url, response, fetched_at, cache_status)
  // Failed responses are returned but never cached.
  if store_opts != nil && envelope.ok {
    cache_put(cache_key, envelope, store_opts)
  }
  return envelope
}
/** Resolve the reference `href` relative to `base_url` (delegates to `__web_resolve_url`). */
pub fn web_resolve_url(base_url: string, href: string) {
  let resolved = __web_resolve_url(base_url, href)
  return resolved
}
/** Build a URL from the origin of `url` plus `path` (default "/"), without query or fragment. */
pub fn web_origin_url(url: string, path: string = "/") -> string {
  let origin_url = __web_origin_url(url, path)
  return origin_url
}
/**
 * Parse an HTML document into a dict of extracted data: title, meta,
 * canonical URL, links, tables, JSON-LD, and plain text. `source_url` is
 * optional and is passed through to the extractor.
 */
pub fn web_parse_html(html: string, source_url = nil) -> dict {
  let extracted = __web_extract_html(html, source_url)
  return extracted
}
/** Return the `title` field of the parsed HTML document. */
pub fn html_title(html: string) {
  let parsed = web_parse_html(html)
  return parsed.title
}
/** Return the parsed document's `meta` field: normalized meta tags keyed by lower-case name/property. */
pub fn html_meta(html: string) {
  let parsed = web_parse_html(html)
  return parsed.meta
}
/** Return the parsed document's `links` field (anchor links); `source_url` is forwarded to the parser. */
pub fn html_links(html: string, source_url = nil) {
  let parsed = web_parse_html(html, source_url)
  return parsed.links
}
/** Return the parsed document's `tables` field (captions, headers, and rows). */
pub fn html_tables(html: string) {
  let parsed = web_parse_html(html)
  return parsed.tables
}
/** Return the parsed document's `json_ld` field (parsed JSON-LD script blocks). */
pub fn html_json_ld(html: string) {
  let parsed = web_parse_html(html)
  return parsed.json_ld
}
/** Return the parsed document's `text` field: normalized visible text without script/style/noscript bodies. */
pub fn html_text(html: string) {
  let parsed = web_parse_html(html)
  return parsed.text
}
/** Return the `/robots.txt` URL on the same origin as `url`. */
fn __web_robots_url(url) {
  let robots_url = web_origin_url(url, "/robots.txt")
  return robots_url
}
/**
 * Split a `field: value` line at its first colon.
 *
 * Returns `{field, value}` with the field lower-cased and both sides
 * trimmed, or nil when the line contains no colon. Colons inside the value
 * (for example in URLs) are preserved.
 */
fn __web_line_value(line) {
  let pieces = split(line, ":")
  if len(pieces) >= 2 {
    let name = lowercase(trim(pieces[0]))
    // Re-join everything after the first colon so the value keeps its own colons.
    let rest = join(pieces[1:], ":")
    return {field: name, value: trim(rest)}
  }
  return nil
}
/**
 * Parse a robots.txt body into a list of `{field, value}` rows.
 *
 * Text after `#` on each line is dropped as a comment. Blank lines and
 * lines without a colon are skipped. A nil body yields an empty list.
 */
fn __web_robots_rows(body) {
  var parsed = []
  let lines = split(body ?? "", "\n")
  for raw in lines {
    let content = trim(split(raw, "#")[0])
    if content == "" {
      continue
    }
    let entry = __web_line_value(content)
    if entry == nil {
      continue
    }
    parsed = parsed + [entry]
  }
  return parsed
}
/**
 * Score how well a group's user-agent list matches `target`.
 *
 * Returns 2 when any agent equals `target` (agents are trimmed and
 * lower-cased first; `target` is compared as given), 1 when the group only
 * matches through `*`, and 0 when it does not apply.
 */
fn __web_robots_group_score(agents, target) {
  var exact = false
  var wildcard = false
  for agent in agents {
    let name = lowercase(trim(agent))
    if name == target {
      exact = true
    } else if name == "*" {
      wildcard = true
    }
  }
  if exact {
    return 2
  }
  if wildcard {
    return 1
  }
  return 0
}
/**
 * Append a finished robots group to `groups` if it applies to `target`.
 *
 * Groups with no agents, no rules, or a match score of 0 are dropped and
 * `groups` is returned unchanged. Otherwise a `{score, rules}` entry is
 * appended.
 */
fn __web_robots_push_group(groups, agents, rules, target) {
  let incomplete = len(agents) == 0 || len(rules) == 0
  if incomplete {
    return groups
  }
  let score = __web_robots_group_score(agents, target)
  if score > 0 {
    return groups + [{score: score, rules: rules}]
  }
  return groups
}
/**
 * Collect the Allow/Disallow rules that apply to `user_agent`.
 *
 * Rows are grouped as in robots.txt: consecutive `User-agent` lines share
 * the rules that follow them, and a `User-agent` line seen after rules
 * starts a new group. Only groups with the best match score are used
 * (exact agent match beats `*`); their rules are concatenated in file
 * order. A nil `user_agent` is treated as `*`.
 */
fn __web_robots_rules(body, user_agent) {
  let target = lowercase(trim(user_agent ?? "*"))
  var groups = []
  var current_agents = []
  var current_rules = []
  for row in __web_robots_rows(body) {
    let field = row.field
    if field == "user-agent" {
      // A user-agent line after rules closes the group collected so far.
      if len(current_rules) > 0 {
        groups = __web_robots_push_group(groups, current_agents, current_rules, target)
        current_agents = []
        current_rules = []
      }
      current_agents = current_agents + [row.value]
      continue
    }
    if field == "allow" || field == "disallow" {
      current_rules = current_rules + [{kind: field, path: row.value}]
    }
  }
  // Flush the trailing group.
  groups = __web_robots_push_group(groups, current_agents, current_rules, target)

  var top_score = 0
  for candidate in groups {
    if candidate.score > top_score {
      top_score = candidate.score
    }
  }
  var selected = []
  for candidate in groups {
    if candidate.score != top_score {
      continue
    }
    selected = selected + candidate.rules
  }
  return selected
}
/**
 * Return the path (plus `?query` when a non-empty query exists) of `url`,
 * which is the string robots rules are matched against. A missing path
 * defaults to "/".
 */
fn __web_path_for_robots(url) {
  let parts = url_parse(url)
  let query = parts?.query
  let has_query = query != nil && query != ""
  let path = parts.path ?? "/"
  if has_query {
    return path + "?" + query
  }
  return path
}
/**
 * Decide whether robots.txt lets `user_agent` fetch `url`.
 *
 * The robots file is fetched with `web_fetch` from `options.robots_url`
 * (default: `/robots.txt` on the URL's origin) using
 * `options.fetch_options`. A robots response that is not `ok` allows
 * everything.
 *
 * Matching is a deterministic subset: exact or `*` user-agent groups and
 * plain Allow/Disallow path prefixes. The longest matching prefix wins; on
 * a tie in length, Allow wins. Rules with an empty path are ignored. When
 * no rule matches, the URL is allowed.
 */
pub fn robots_allowed(url: string, user_agent: string = "*", options = nil) -> bool {
  let opts = options ?? {}
  let robots_url = opts?.robots_url ?? __web_robots_url(url)
  let robots = web_fetch(robots_url, opts?.fetch_options ?? {})
  if !(robots.ok ?? false) {
    return true
  }
  let target_path = __web_path_for_robots(url)
  var verdict = true
  var longest = -1
  for rule in __web_robots_rules(robots.body, user_agent) {
    if rule.path != "" && starts_with(target_path, rule.path) {
      let size = len(rule.path)
      let is_allow = rule.kind == "allow"
      if size > longest || (size == longest && is_allow) {
        longest = size
        verdict = is_allow
      }
    }
  }
  return verdict
}
/**
 * Decode the five predefined XML entities in `text`:
 * `&lt;`, `&gt;`, `&quot;`, `&apos;`, and `&amp;`.
 *
 * `&amp;` is decoded last so that an escaped ampersand followed by entity
 * text (for example `&amp;lt;`) yields the literal `&lt;` instead of being
 * decoded a second time into `<`. Numeric character references such as
 * `&#38;` are not handled.
 */
fn __web_xml_unescape(text) {
  var out = regex_replace("&lt;", "<", text)
  out = regex_replace("&gt;", ">", out)
  out = regex_replace("&quot;", "\"", out)
  out = regex_replace("&apos;", "'", out)
  // Must run last; see the note on double-decoding above.
  out = regex_replace("&amp;", "&", out)
  return out
}
/**
 * Extract the distinct `<loc>` values from sitemap XML, in document order.
 *
 * Each value is XML-unescaped and trimmed; empty values and repeats are
 * skipped. A nil `xml` yields an empty list.
 */
fn __web_sitemap_locs(xml) {
  var seen = []
  let hits = regex_captures("(?is)<loc\\b[^>]*>(.*?)</loc>", xml ?? "")
  for hit in hits {
    let raw = hit.groups[0] ?? ""
    let loc = trim(__web_xml_unescape(raw))
    if loc == "" || seen.contains(loc) {
      continue
    }
    seen = seen + [loc]
  }
  return seen
}
/**
 * Work out which sitemap URLs to fetch for `base_url`.
 *
 * Precedence: an explicit `opts.sitemap_urls` list is returned as-is;
 * otherwise the distinct, non-empty `Sitemap:` entries of robots.txt
 * (fetched from `opts.robots_url` or the origin's `/robots.txt`); and when
 * none are advertised or robots.txt is not `ok`, the origin's
 * `/sitemap.xml`.
 */
fn __web_sitemap_candidates(base_url, opts) {
  let explicit = opts?.sitemap_urls
  if explicit != nil {
    return explicit
  }
  let robots_url = opts?.robots_url ?? web_origin_url(base_url, "/robots.txt")
  let robots = web_fetch(robots_url, opts?.fetch_options ?? {})
  var advertised = []
  if robots.ok {
    for row in __web_robots_rows(robots.body) {
      let is_new = row.field == "sitemap" && row.value != "" && !advertised.contains(row.value)
      if is_new {
        advertised = advertised + [row.value]
      }
    }
  }
  if len(advertised) > 0 {
    return advertised
  }
  return [web_origin_url(base_url, "/sitemap.xml")]
}
/**
 * Collect the distinct page URLs listed in a site's sitemaps.
 *
 * Sitemaps come from `options.sitemap_urls`, else from robots.txt
 * `Sitemap:` lines, else `/sitemap.xml`. At most `max_sitemaps` (default
 * 10) sitemap documents are fetched; sitemaps whose fetch is not `ok` are
 * skipped but still count toward the limit. Every `<loc>` value is
 * returned as found, in first-seen order.
 *
 * Options: sitemap_urls, robots_url, fetch_options, max_sitemaps.
 */
pub fn sitemap_urls(base_url: string, options = nil) -> list<string> {
  let opts = options ?? {}
  let limit = opts?.max_sitemaps ?? 10
  var collected = []
  var fetched = 0
  for candidate in __web_sitemap_candidates(base_url, opts) {
    if fetched >= limit {
      return collected
    }
    fetched = fetched + 1
    let page = web_fetch(candidate, opts?.fetch_options ?? {})
    if page.ok {
      for loc in __web_sitemap_locs(page.body) {
        if collected.contains(loc) {
          continue
        }
        collected = collected + [loc]
      }
    }
  }
  return collected
}