# crusty 0.12.0
#
# Fast && scalable Broad Web Crawler developed on top of crusty-core
---
# Logging configuration
log:
  # Fancy some colors? Disable this if the output is going to be redirected
  # to a file.
  ansi: false
  # Baseline log level
  level: warn
  # Filter directives, e.g.
  #   my_crate=info,my_crate::my_mod=debug,[my_span]=trace
  # see https://tracing.rs/tracing_subscriber/filter/struct.envfilter
  # Example (uncomment to use):
  #filter: ["[task{name=Crusty::go}]=info", "[task{name=Crusty::job_reader}]=info"]
# Identification values used for metrics
host: crawler-1 # host name, for metrics
app_id: crusty # application id, for metrics

# ClickHouse database settings
clickhouse:
  # Connection parameters
  url: http://clickhouse:8123
  username: default
  password: ""
  database: crusty

  # Every section below uses the same set of buffering settings; they are
  # explained once, on metrics_queue.
  # NOTE(review): `label` is not explained anywhere in this file; presumably
  # it tags the writer (two sections share the domain_discovery table) -
  # confirm against the consumer.
  #
  # We persist various queue metrics
  metrics_queue:
    table_name: metrics_queue
    label: ""
    # We always try to write in bulk: buffer up to this many items before
    # writing.
    buffer_capacity: 1000
    # While waiting for the buffer to fill, wake up once in a while (at this
    # interval) to check whether force_write_duration has elapsed.
    check_for_force_write_duration: 100ms
    # If this much time has elapsed since the last write but buffer_capacity
    # has not been reached yet, force the write anyway.
    force_write_duration: 500ms
  # We persist some db metrics for further analysis
  metrics_db:
    table_name: metrics_db
    label: ""
    buffer_capacity: 1000
    check_for_force_write_duration: 100ms
    force_write_duration: 500ms
  # We persist metrics and various metadata for each visited page
  metrics_task:
    table_name: metrics_task
    label: ""
    buffer_capacity: 10000
    check_for_force_write_duration: 100ms
    force_write_duration: 500ms
  # We persist candidates for newly discovered domains; the db performs the
  # final deduplication.
  domain_discovery_insert:
    table_name: domain_discovery
    label: insert
    buffer_capacity: 10000
    check_for_force_write_duration: 500ms
    force_write_duration: 2500ms
  # We persist confirmations that a domain has been checked, so that it won't
  # be re-selected unless special criteria are met.
  domain_discovery_update:
    table_name: domain_discovery
    label: update
    buffer_capacity: 10000
    check_for_force_write_duration: 500ms
    force_write_duration: 2500ms
# Resolver settings
# Leave commented out for auto-conf
#resolver:
  # Number of concurrent green threads for name resolution (be mindful of your DNS server capacity).
  # Configure this carefully: a low setting will lead to job starvation
  # (inability to satisfy the requested concurrency_profile.domain_concurrency).
  #concurrency: 64
# Domain discovery cache capacity. This cache helps to ease the load on
# ClickHouse (so we do not insert billions of duplicated records), but because
# the cache is local its effectiveness will drop when adding new crawler nodes.
# If one were to try running this at Google scale, it would most likely
# require a dedicated dedup layer before hitting ClickHouse.
ddc_cap: 25000000
# Recently discovered domains live in the cache for up to this duration
ddc_lifetime: 1h
# We monitor various internal queues and persist their status to the db
queue_monitor_interval: 1s
# We parse HTML in a separate thread pool whose stack size is configurable.
# Apparently even 32mib is not enough given a max_response_size of 2mib...
parser_processor_stack_size: 128mib
# Fancy local address binding for monster setups with several NICs
# (local port limitation)
networking_profile:
  values:
    # Local addresses to bind to; left empty in this configuration.
    # NOTE(review): the expected value shape (single address vs. list) is not
    # visible from this file - confirm before filling these in.
    bind_local_ipv4:
    bind_local_ipv6:
    socket_read_buffer_size: 32kib
    socket_write_buffer_size: 32kib
    connect_timeout: 5s
# Leave commented out for auto-conf
#concurrency_profile:
  # Defaults to the number of physical cores
  #parser_concurrency:
  # We check multiple domains concurrently; set this to saturate your hardware (CPU/network bound)
  #domain_concurrency: 100
# We select new jobs (domains) from a queue-like structure hosted in ClickHouse
job_reader:
  domain_table_name: domain_discovery
  # We resolve the IPs of all discovered domains and calculate addr_key:
  #   1. Take only ipv4
  #   2. Sort
  #   3. Take the first IP and apply addr_key_mask masking
  #   4. addr_key = addr_key | addr_key_4_mask;
  # addr_key is then used in shard calculation, and we never select more than
  # domain_top_n domains from a given addr_key. This ensures we are being
  # polite to websites with different domains hosted on the same IP (or
  # subnet, depending on addr_key_4_mask).
  # NOTE(review): addr_key_4_mask is not a setting in this file; presumably it
  # is derived from addr_key_mask - confirm.
  addr_key_mask: 24 # read as /24: the first 24 bits are significant, the last 8 are not (they will be masked)
  # Re-select checked domains after some time
  re_after_days: 3
  # The queue is sharded; do not ask the same shard for jobs again until this
  # duration has passed since the last time we asked.
  shard_min_last_read: 1s
  # Min shard number we have access to
  shard_min: 1
  # Max shard number we have access to
  shard_max: 25
  # Total number of all shards; in a multi-node setup
  # shard_total > shard_max - shard_min + 1 (always)
  shard_total: 25
  # Select up to N domains from a shard at once
  shard_select_limit: 100000
  # Buffer up to N domains and do not try to fetch new ones while we have enough
  job_buffer: 100000
  # Select up to N domains belonging to the same IP
  # (a.tumblr.com, b.tumblr.com, c.tumblr.com but not d.tumblr.com)
  # NOTE(review): the example lists three domains while the value below is 2;
  # presumably the example was written for N = 3 - confirm.
  domain_top_n: 2
  # These settings relate to a crawler running on one particular domain
  default_crawling_settings:
    # Up to N pages concurrently; keep this number low to avoid excess stress
    concurrency: 2
    internal_read_buffer_size: 32kib
    max_response_size: 2mib
    # Follow up to N redirects before giving up
    max_redirect: 5
    # 1s-5s is a safe bet to avoid extra stress
    delay: 1s
    # Vary the delay time by this jitter (0..)
    delay_jitter: 1s
    # Timeout for page loading and buffering
    load_timeout: 10s
    # After the soft timeout elapses we no longer queue new tasks for the domain
    job_soft_timeout: 30s
    # After the hard timeout elapses we forcibly stop the crawling job for
    # this domain
    job_hard_timeout: 60s
    user_agent: "crusty/0.12.0"
    compression: true
    # Custom headers are supported: header name mapped to a list of values
    custom_headers:
      accept:
        - "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
  # Initial list of seed URLs to start the broad crawl from (empty here).
  # Seeds are additionally read from the CRUSTY_SEEDS environment variable.
  seeds: []