Skip to main content

crw_server/routes/
research.rs

1//! Firecrawl-compatible Research API routes (`/v1/search/research/*`).
2//!
3//! Stateless primitives over live data (no self-hosted index): our OWN fastCRW
4//! SearXNG search (web + research-mode, the primary recall driver) merged with
5//! OpenAlex + Semantic Scholar via [`crw_search::research`]. The agent brain
6//! (intent routing, exact-name reframing, leaderboard/survey) lives in the
7//! research SKILL, not here — exactly like Firecrawl.
8//!
9//! Response shapes mirror Firecrawl's v2 research SDK ([`crw_core::research_types`])
10//! so their SDK/CLI works drop-in against our base URL.
11
12use axum::Json;
13use axum::extract::{Path, Query, State};
14use crw_core::error::CrwError;
15use crw_core::research_types::{
16    GithubResponse, PaperMetaResponse, PapersResponse, ReadPaperResponse, ResearchGithubItem,
17    ResearchPassage, SimilarResponse,
18};
19use crw_search::research::{self, Mode, PaperHit, ResearchKeys, SearchFilters};
20use crw_search::{SearxngClient, SearxngParams};
21use serde::Deserialize;
22use std::sync::Arc;
23
24use crate::error::AppError;
25use crate::state::AppState;
26
/// Result count used by `clamp_k` when a request does not supply `k`.
const DEFAULT_K: usize = 40;
/// Largest result count `clamp_k` will return, whatever the caller asks for.
const MAX_K: usize = 100;
29
30fn keys(state: &AppState) -> ResearchKeys<'_> {
31    ResearchKeys {
32        openalex_key: state.config.search.openalex_api_key.as_deref(),
33        openalex_mailto: state.config.search.openalex_mailto.as_deref(),
34        s2_key: state.config.search.s2_api_key.as_deref(),
35    }
36}
37
38fn searxng(state: &AppState) -> Result<Arc<SearxngClient>, CrwError> {
39    state.searxng.as_ref().cloned().ok_or_else(|| {
40        CrwError::SearchDisabled(
41            "Search is disabled. Set [search].searxng_url or CRW_SEARCH__SEARXNG_URL.".into(),
42        )
43    })
44}
45
46fn clamp_k(k: Option<usize>) -> usize {
47    k.unwrap_or(DEFAULT_K).clamp(1, MAX_K)
48}
49
/// Join engine names with `,`, or return `None` when there is nothing to send.
///
/// Sending `engines=` (an empty value) can silently empty the SearXNG leg, so
/// `None` is returned both for an empty slice and for a slice that holds only
/// blank entries (for example `[""]`). Blank entries mixed in with real names
/// are skipped; every other entry is passed through unchanged.
fn join_nonempty(v: &[String]) -> Option<String> {
    let joined = v
        .iter()
        .map(String::as_str)
        .filter(|name| !name.trim().is_empty())
        .collect::<Vec<_>>()
        .join(",");
    // An empty result means no usable engine name survived the filter.
    (!joined.is_empty()).then_some(joined)
}
59
60/// One SearXNG leg → arXiv-only [`PaperHit`]s. `engines: None` = plain web
61/// (google/bing); `Some(joined)` = research-mode scholarly engines.
62async fn searxng_papers(
63    client: &SearxngClient,
64    engines: Option<String>,
65    query: &str,
66) -> Vec<PaperHit> {
67    let params = SearxngParams {
68        q: query.to_string(),
69        categories: None,
70        language: Some("en".to_string()),
71        time_range: None,
72        engines,
73        pageno: None,
74        safesearch: None,
75    };
76    let Ok(resp) = client.fetch(&params).await else {
77        return Vec::new();
78    };
79    resp.results
80        .into_iter()
81        .filter_map(|r| {
82            let title = r.title.clone().unwrap_or_default();
83            let blob = format!(
84                "{} {} {}",
85                r.url.unwrap_or_default(),
86                title,
87                r.content.unwrap_or_default()
88            );
89            PaperHit::from_searxng(&title, &blob, r.score.unwrap_or(0.0))
90        })
91        .collect()
92}
93
/// Query-string parameters read by `search_papers`.
#[derive(Deserialize)]
pub struct PapersQuery {
    /// Search text, sent to both SearXNG legs and to the OpenAlex/S2 pools.
    query: String,
    /// Requested result count; `search_papers` normalises it with `clamp_k`.
    k: Option<usize>,
    /// Forwarded unchanged as `SearchFilters::authors`.
    authors: Option<String>,
    /// Forwarded unchanged as `SearchFilters::categories`.
    categories: Option<String>,
    /// Forwarded unchanged as `SearchFilters::from` (expected format is not
    /// visible in this file — presumably a date bound; confirm in `crw_search`).
    from: Option<String>,
    /// Forwarded unchanged as `SearchFilters::to` (see the note on `from`).
    to: Option<String>,
}
103
104/// `GET /v1/search/research/papers` — ranked paper search. Merges our own
105/// fastCRW search (web + research-mode) with OpenAlex + SS, frequency-ranked
106/// (the `research_tools.py` cascade core).
107pub async fn search_papers(
108    State(state): State<AppState>,
109    Query(q): Query<PapersQuery>,
110) -> Result<Json<PapersResponse>, AppError> {
111    let client = searxng(&state)?;
112    let k = clamp_k(q.k);
113    let f = SearchFilters {
114        authors: q.authors,
115        categories: q.categories,
116        from: q.from,
117        to: q.to,
118    };
119    let kz = keys(&state);
120    let research_engines = join_nonempty(&state.config.search.research_engines);
121    // our own search (primary driver) + OpenAlex + SS, all in parallel
122    let (web, scholar, oa_ss) = tokio::join!(
123        searxng_papers(&client, None, &q.query),
124        searxng_papers(&client, research_engines, &q.query),
125        research::search_papers_pools(&kz, &q.query, k, &f),
126    );
127    let mut pools = vec![web, scholar];
128    pools.extend(oa_ss);
129    let results = research::merge_rank(pools, k);
130    Ok(Json(PapersResponse {
131        success: true,
132        results,
133    }))
134}
135
/// Query-string parameters read by `get_paper`.
#[derive(Deserialize)]
pub struct PaperQuery {
    /// When present, `get_paper` returns ranked abstract passages for this
    /// text; when absent, it returns the paper metadata only.
    query: Option<String>,
    /// Number of passages to return; `get_paper` defaults it to 4 and clamps
    /// it to `1..=20`. Unused when `query` is absent.
    k: Option<usize>,
}
141
142/// `GET /v1/search/research/papers/{id}` — inspect metadata, or (with `?query`)
143/// read top passages.
144///
145/// `ponytail:` read_passages is abstract-scoped (ranks abstract sentences by
146/// query-term overlap). Full arXiv-body passages are the upgrade: scrape
147/// `arxiv.org/html|pdf/<id>` via `state.renderer` + chunk + rank. Deferred —
148/// abstract relevance carries the common "does this paper mention X" check, and
149/// the heavy scrape plumbing (crw_crawl::single::scrape_url, 8 args) is a follow-up.
150pub async fn get_paper(
151    State(state): State<AppState>,
152    Path(id): Path<String>,
153    Query(q): Query<PaperQuery>,
154) -> Result<Json<serde_json::Value>, AppError> {
155    let kz = keys(&state);
156    let meta = research::inspect(&kz, &id)
157        .await
158        .ok_or_else(|| CrwError::NotFound(format!("paper not found: {id}")))?;
159    match q.query {
160        None => Ok(Json(
161            serde_json::to_value(PaperMetaResponse {
162                success: true,
163                paper: meta,
164            })
165            .map_err(|e| CrwError::Internal(e.to_string()))?,
166        )),
167        Some(query) => {
168            let k = q.k.unwrap_or(4).clamp(1, 20);
169            let passages = rank_abstract_passages(meta.abstract_.as_deref(), &query, k);
170            Ok(Json(
171                serde_json::to_value(ReadPaperResponse {
172                    success: true,
173                    paper_id: meta.paper_id.clone(),
174                    query,
175                    passages,
176                    paper: meta,
177                })
178                .map_err(|e| CrwError::Internal(e.to_string()))?,
179            ))
180        }
181    }
182}
183
184/// Split an abstract into sentences, score each by query-term overlap, return
185/// the top-k. Legal (abstract is CC0 metadata) and dependency-free.
186fn rank_abstract_passages(abstract_: Option<&str>, query: &str, k: usize) -> Vec<ResearchPassage> {
187    let Some(text) = abstract_ else {
188        return Vec::new();
189    };
190    let qterms: Vec<String> = query
191        .to_lowercase()
192        .split_whitespace()
193        .filter(|w| w.len() > 2)
194        .map(String::from)
195        .collect();
196    let mut scored: Vec<ResearchPassage> = text
197        .split(['.', '!', '?'])
198        .map(str::trim)
199        .filter(|s| s.len() > 20)
200        .map(|sentence| {
201            let low = sentence.to_lowercase();
202            let hits = qterms.iter().filter(|t| low.contains(*t)).count();
203            let score = if qterms.is_empty() {
204                0.0
205            } else {
206                hits as f64 / qterms.len() as f64
207            };
208            ResearchPassage {
209                text: sentence.to_string(),
210                score,
211            }
212        })
213        .collect();
214    scored.sort_by(|a, b| {
215        b.score
216            .partial_cmp(&a.score)
217            .unwrap_or(std::cmp::Ordering::Equal)
218    });
219    scored.truncate(k);
220    scored
221}
222
/// Query-string parameters read by `similar`.
#[derive(Deserialize)]
pub struct SimilarQuery {
    /// Must be present and non-blank; `similar` rejects the request otherwise.
    /// The handler only validates it — the value is not passed on.
    intent: Option<String>,
    /// `"citers"` or `"references"` select those expansions; any other value
    /// (or none) means similar-paper expansion.
    mode: Option<String>,
    /// Requested result count; `similar` normalises it with `clamp_k`.
    k: Option<usize>,
}
229
230/// `GET /v1/search/research/papers/{id}/similar` — citation-graph expansion.
231pub async fn similar(
232    State(state): State<AppState>,
233    Path(id): Path<String>,
234    Query(q): Query<SimilarQuery>,
235) -> Result<Json<SimilarResponse>, AppError> {
236    // Firecrawl requires `intent`; enforce for drop-in compat.
237    if q.intent.as_deref().unwrap_or("").trim().is_empty() {
238        return Err(CrwError::InvalidRequest("`intent` is required".into()).into());
239    }
240    let k = clamp_k(q.k);
241    let mode = match q.mode.as_deref() {
242        Some("citers") => Mode::Citers,
243        Some("references") => Mode::References,
244        _ => Mode::Similar,
245    };
246    let kz = keys(&state);
247    let results = research::related(&kz, &id, mode, k).await;
248    let pool_size = results.len();
249    Ok(Json(SimilarResponse {
250        success: true,
251        results,
252        pool_size,
253        truncated: pool_size >= k,
254        note: None,
255    }))
256}
257
/// Query-string parameters read by `github`.
#[derive(Deserialize)]
pub struct GithubQuery {
    /// Search text sent to SearXNG as `q`.
    query: String,
    /// Requested result count; `github` defaults it to 20 and clamps it to
    /// `1..=100`.
    k: Option<usize>,
}
263
264/// `GET /v1/search/research/github` — GitHub search via our SearXNG github
265/// engines. `ponytail:` SearXNG yields repo/readme hits, so `resultType` is
266/// `repo_readme`; issue/PR/discussion granularity (Firecrawl's `github_history`)
267/// is a follow-up if the github engines expose it.
268pub async fn github(
269    State(state): State<AppState>,
270    Query(q): Query<GithubQuery>,
271) -> Result<Json<GithubResponse>, AppError> {
272    let client = searxng(&state)?;
273    let k = q.k.unwrap_or(20).clamp(1, 100);
274    let params = SearxngParams {
275        q: q.query,
276        categories: None,
277        language: Some("en".to_string()),
278        time_range: None,
279        engines: join_nonempty(&state.config.search.github_engines),
280        pageno: None,
281        safesearch: None,
282    };
283    let resp = client
284        .fetch(&params)
285        .await
286        .map_err(|e| CrwError::HttpError(format!("github search failed: {e}")))?;
287    let results: Vec<ResearchGithubItem> = resp
288        .results
289        .into_iter()
290        .take(k)
291        .filter_map(|r| {
292            let url = r.url?;
293            // owner/name from a github URL path
294            let repo = url
295                .split("github.com/")
296                .nth(1)
297                .map(|p| p.split('/').take(2).collect::<Vec<_>>().join("/"))
298                .unwrap_or_default();
299            Some(ResearchGithubItem {
300                result_type: "repo_readme".to_string(),
301                repo,
302                url,
303                page_type: None,
304                number: None,
305                segment_count: None,
306                readme_url: None,
307                title: r.title.unwrap_or_default(),
308                snippet: r.content.clone().unwrap_or_default(),
309                content_md: r.content,
310            })
311        })
312        .collect();
313    Ok(Json(GithubResponse {
314        success: true,
315        results,
316    }))
317}