# Workflow header: schema version, workflow name, and LLM settings.
schema: "nika/workflow@0.12"
workflow: parallel-web-scraper
# NOTE(review): {{PROVIDER}} / {{MODEL}} look like placeholders substituted
# before the workflow runs — confirm where they are filled in.
provider: "{{PROVIDER}}"
model: "{{MODEL}}"
# `dir` must be nested under `artifacts`; at the same column it would be
# parsed as an unrelated top-level key and `artifacts` would be null.
artifacts:
  dir: .
# Task list; each `- id:` entry below is one task.
tasks:
# Fetches every site listed under `for_each` and extracts it as markdown.
- id: scrape_sites
  # One entry per site; each entry is exposed to the templates below under
  # the alias given by `as` (referenced as `with.site`).
  for_each:
  - { url: "https://blog.rust-lang.org/", name: "Rust Blog" }
  - { url: "https://github.blog/", name: "GitHub Blog" }
  - { url: "https://news.ycombinator.com/", name: "Hacker News" }
  - { url: "https://www.rust-lang.org/", name: "Rust Homepage" }
  - { url: "https://crates.io/", name: "Crates.io" }
  - { url: "https://docs.rs/", name: "Docs.rs" }
  - { url: "https://httpbin.org/", name: "HTTPBin" }
  - { url: "https://jsonplaceholder.typicode.com/", name: "JSON Placeholder" }
  - { url: "https://www.wikipedia.org/", name: "Wikipedia" }
  - { url: "https://developer.mozilla.org/", name: "MDN Web Docs" }
  as: site
  # NOTE(review): presumably caps the number of simultaneous iterations at 5
  # — confirm against the engine documentation.
  concurrency: 5
  fetch:
    url: "{{with.site.url}}"
    extract: markdown
    # NOTE(review): unit is not stated here (presumably seconds), and this
    # key is assumed to belong to `fetch` rather than the task — confirm.
    timeout: 20
# Produces one short summary per item in the `scrape_sites` output.
- id: summarize_pages
  depends_on: [scrape_sites]
  # Binds the output of `scrape_sites` to the alias `pages`.
  with:
    pages: $scrape_sites
  # Iterates over the bound pages; each item is referenced as `with.page`.
  for_each: "{{with.pages}}"
  as: page
  # NOTE(review): presumably caps the number of simultaneous iterations at 5
  # — confirm against the engine documentation.
  concurrency: 5
  infer:
    # The block-scalar body must be indented deeper than `prompt`, otherwise
    # the text is not captured as the prompt value.
    # NOTE(review): `first(3000)` presumably truncates the page before it is
    # sent to the model — confirm the filter's unit (characters vs tokens).
    prompt: |
      Summarize this web page in 100 words:
      {{with.page | first(3000)}}
    temperature: 0.3
    max_tokens: 300
# Combines the per-page summaries into a single report written to a file.
- id: intelligence_report
  depends_on: [summarize_pages]
  # Binds the output of `summarize_pages` to the alias `summaries`.
  with:
    summaries: $summarize_pages
  infer:
    system: "You are an intelligence analyst."
    # The block-scalar body must be indented deeper than `prompt`, otherwise
    # the text is not captured as the prompt value.
    prompt: |
      Create a technology intelligence report from: {{with.summaries}}
      Include: Executive Overview, Key Themes, Emerging Patterns, Actions.
    temperature: 0.3
    max_tokens: 1500
  # NOTE(review): `artifact` is assumed to be a task-level key (sibling of
  # `infer`), and the path presumably resolves against the workflow-level
  # `artifacts.dir` — confirm both.
  artifact:
    path: output/intelligence-report.md