use std::sync::OnceLock;
use crate::coding::contains_cjk;
use crate::engine::normalize_prompt;
use crate::seed::{
self, Slot, WordForm, ROLE_CLAUSE_CONTINUATION_MARKER, ROLE_ENUMERATION_CONSTRAINT,
ROLE_ENUMERATION_REQUEST_OPENER, ROLE_FOLLOWUP_INSTRUCTION_VERB,
ROLE_RESEARCH_EVALUATION_DOMAIN, ROLE_RESEARCH_EVIDENCE_DOMAIN, ROLE_RESEARCH_QUESTION_OPENER,
ROLE_RESEARCH_SUPERLATIVE_MODIFIER, ROLE_WEB_SEARCH_ACTION, ROLE_WEB_SEARCH_EXPLICIT_PREFIX,
ROLE_WEB_SEARCH_IMPERATIVE_LEAD, ROLE_WEB_SEARCH_QUERY_LEADING_NOISE,
ROLE_WEB_SEARCH_QUERY_TRAILING_NOISE, ROLE_WEB_SEARCH_SIGNAL, ROLE_WEB_SEARCH_SOURCE_ONLY,
ROLE_WEB_SEARCH_STRONG_ACTION, ROLE_WEB_SEARCH_TOPIC_MARKER,
};
use super::web_requests::normalize_url_candidate;
/// Identifies which extraction strategy produced a [`WebSearchRequest`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum WebSearchQueryKind {
/// The prompt started with one of the explicit search prefixes.
ExplicitPrefix,
/// The query was recovered by `extract_semantic_web_search_query`.
SemanticAction,
/// The query was recovered by `extract_implicit_research_question`.
ImplicitResearchQuestion,
/// The query was recovered by `extract_enumeration_research_request`.
EnumerationResearchRequest,
}
impl WebSearchQueryKind {
pub(super) const fn as_str(self) -> &'static str {
match self {
Self::ExplicitPrefix => "explicit_prefix",
Self::SemanticAction => "semantic_action",
Self::ImplicitResearchQuestion => "implicit_research_question",
Self::EnumerationResearchRequest => "enumeration_research_request",
}
}
}
/// A web search extracted from a user prompt.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(super) struct WebSearchRequest {
/// The cleaned query text that passed `valid_search_query`.
pub(super) query: String,
/// The extraction strategy that produced `query`.
pub(super) kind: WebSearchQueryKind,
}
pub(super) fn extract_web_search_request(
prompt: &str,
normalized: &str,
) -> Option<WebSearchRequest> {
let normalized_words = normalize_prompt(prompt);
if normalized_words.starts_with("search conversations ")
|| normalized_words.starts_with("search my conversations ")
|| normalized_words.starts_with("search my chats ")
|| is_personal_fact_filter_request(&normalized_words)
{
return None;
}
for &prefix in &markers().explicit_prefixes {
if let Some(query) = normalized.strip_prefix(prefix) {
if let Some(query) = valid_search_query(query) {
return Some(WebSearchRequest {
query,
kind: WebSearchQueryKind::ExplicitPrefix,
});
}
}
if let Some(query) = normalized_words.strip_prefix(prefix) {
if let Some(query) = valid_search_query(query) {
return Some(WebSearchRequest {
query,
kind: WebSearchQueryKind::ExplicitPrefix,
});
}
}
}
if let Some(query) = extract_semantic_web_search_query(&normalized_words) {
return Some(WebSearchRequest {
query,
kind: WebSearchQueryKind::SemanticAction,
});
}
if let Some(query) = extract_enumeration_research_request(&normalized_words) {
return Some(WebSearchRequest {
query,
kind: WebSearchQueryKind::EnumerationResearchRequest,
});
}
extract_implicit_research_question(&normalized_words).map(|query| WebSearchRequest {
query,
kind: WebSearchQueryKind::ImplicitResearchQuestion,
})
}
/// Reports whether the normalized prompt contains one of the phrases that
/// refer to the user's own contributed facts.
fn is_personal_fact_filter_request(normalized: &str) -> bool {
    const PERSONAL_FACT_PHRASES: [&str; 4] = [
        "facts i have contributed",
        "facts ive contributed",
        "facts i contributed",
        "my facts",
    ];
    for phrase in PERSONAL_FACT_PHRASES.iter() {
        if normalized.contains(phrase) {
            return true;
        }
    }
    false
}
/// Normalizes a raw query fragment: strips surrounding whitespace and wrapper
/// punctuation, strips trailing sentence punctuation, and collapses every
/// internal run of whitespace to a single space.
///
/// Wrapper and trailing punctuation are removed from the end in a single pass
/// so that interleaved forms such as `"query".` or `(query).` are fully
/// unwrapped instead of leaving a dangling closing wrapper behind.
fn clean_search_query(value: &str) -> String {
    value
        .trim_start_matches(|c: char| c.is_whitespace() || is_url_wrapper_punctuation(c))
        .trim_end_matches(|c: char| {
            c.is_whitespace() || is_url_wrapper_punctuation(c) || is_url_trailing_punctuation(c)
        })
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
}

/// Returns `true` for bracket and quote characters that may wrap a query or URL.
const fn is_url_wrapper_punctuation(character: char) -> bool {
    matches!(
        character,
        '<' | '>' | '(' | ')' | '[' | ']' | '{' | '}' | '"' | '\'' | '`' | '«' | '»'
    )
}

/// Returns `true` for punctuation that may trail a query or URL at the end of
/// a sentence.
const fn is_url_trailing_punctuation(character: char) -> bool {
    matches!(character, '.' | ',' | '!' | '?' | ';' | ':' | '…')
}
/// Returns `true` for characters that end a sentence or clause, in both their
/// ASCII and CJK/fullwidth forms.
///
/// The fullwidth characters are written as escapes so they cannot be confused
/// with (or normalized into) their ASCII look-alikes, which would turn them
/// into unreachable duplicate patterns.
const fn is_sentence_boundary(character: char) -> bool {
    matches!(
        character,
        '.' | '?' | '!' | ';' | ':'
            | '\u{3002}' // ideographic full stop
            | '\u{FF1F}' // fullwidth question mark
            | '\u{FF01}' // fullwidth exclamation mark
            | '\u{FF1B}' // fullwidth semicolon
            | '\u{FF1A}' // fullwidth colon
    )
}
/// Lexicon-derived literals used by the web-search extraction heuristics.
/// Built once by `markers()` from the seed lexicon roles noted on each field.
struct WebSearchMarkers {
/// Prefix-slot forms of `ROLE_WEB_SEARCH_EXPLICIT_PREFIX`.
explicit_prefixes: Vec<&'static str>,
/// Bare forms of `ROLE_WEB_SEARCH_ACTION`.
action_markers: Vec<&'static str>,
/// Bare forms of `ROLE_WEB_SEARCH_STRONG_ACTION`.
strong_action_markers: Vec<&'static str>,
/// Bare forms of `ROLE_WEB_SEARCH_SIGNAL`.
signal_markers: Vec<&'static str>,
/// Prefix-slot forms of `ROLE_WEB_SEARCH_TOPIC_MARKER`; the query is the text after the marker.
topic_after_markers: Vec<&'static str>,
/// Suffix-slot forms of `ROLE_WEB_SEARCH_TOPIC_MARKER`; the query is the text before the marker.
topic_before_markers: Vec<&'static str>,
/// Prefix-slot forms of `ROLE_WEB_SEARCH_IMPERATIVE_LEAD`.
imperative_lead_markers: Vec<&'static str>,
/// Prefix-slot forms of `ROLE_WEB_SEARCH_QUERY_LEADING_NOISE`, stripped from the start of a query.
leading_noise: Vec<&'static str>,
/// Suffix-slot forms of `ROLE_WEB_SEARCH_QUERY_TRAILING_NOISE`, stripped from the end of a query.
trailing_noise: Vec<&'static str>,
/// Trimmed, lowercased words of `ROLE_WEB_SEARCH_SOURCE_ONLY`; a query equal to one of them is rejected.
source_only: Vec<String>,
/// Bare forms of `ROLE_FOLLOWUP_INSTRUCTION_VERB`, used to cut a trailing instruction off a query.
followup_verbs: Vec<&'static str>,
/// Bare forms of `ROLE_CLAUSE_CONTINUATION_MARKER`, peeled off before a follow-up verb.
continuation_markers: Vec<&'static str>,
/// Prefix-slot forms of `ROLE_RESEARCH_QUESTION_OPENER`.
research_question_prefixes: Vec<&'static str>,
/// Bare forms of `ROLE_RESEARCH_SUPERLATIVE_MODIFIER`.
research_modifiers: Vec<&'static str>,
/// Bare forms of `ROLE_RESEARCH_EVIDENCE_DOMAIN`.
research_evidence_domains: Vec<&'static str>,
/// Bare forms of `ROLE_RESEARCH_EVALUATION_DOMAIN`.
research_evaluation_domains: Vec<&'static str>,
/// Prefix-slot forms of `ROLE_ENUMERATION_REQUEST_OPENER`.
enumeration_prefixes: Vec<&'static str>,
/// Bare forms of `ROLE_ENUMERATION_CONSTRAINT`.
enumeration_constraint_markers: Vec<&'static str>,
}
/// Returns the process-wide marker tables, building them from the seed lexicon
/// on first use and caching them in a `OnceLock`.
fn markers() -> &'static WebSearchMarkers {
    static CACHE: OnceLock<WebSearchMarkers> = OnceLock::new();

    /// Reads every marker list from the lexicon, in field order.
    fn load() -> WebSearchMarkers {
        let explicit_prefixes = prefix_literals(ROLE_WEB_SEARCH_EXPLICIT_PREFIX);
        let action_markers = bare_literals(ROLE_WEB_SEARCH_ACTION);
        let strong_action_markers = bare_literals(ROLE_WEB_SEARCH_STRONG_ACTION);
        let signal_markers = bare_literals(ROLE_WEB_SEARCH_SIGNAL);
        let topic_after_markers = prefix_literals(ROLE_WEB_SEARCH_TOPIC_MARKER);
        let topic_before_markers = suffix_literals(ROLE_WEB_SEARCH_TOPIC_MARKER);
        let imperative_lead_markers = prefix_literals(ROLE_WEB_SEARCH_IMPERATIVE_LEAD);
        let leading_noise = prefix_literals(ROLE_WEB_SEARCH_QUERY_LEADING_NOISE);
        let trailing_noise = suffix_literals(ROLE_WEB_SEARCH_QUERY_TRAILING_NOISE);
        let source_only = source_literals(ROLE_WEB_SEARCH_SOURCE_ONLY);
        let followup_verbs = bare_literals(ROLE_FOLLOWUP_INSTRUCTION_VERB);
        let continuation_markers = bare_literals(ROLE_CLAUSE_CONTINUATION_MARKER);
        let research_question_prefixes = prefix_literals(ROLE_RESEARCH_QUESTION_OPENER);
        let research_modifiers = bare_literals(ROLE_RESEARCH_SUPERLATIVE_MODIFIER);
        let research_evidence_domains = bare_literals(ROLE_RESEARCH_EVIDENCE_DOMAIN);
        let research_evaluation_domains = bare_literals(ROLE_RESEARCH_EVALUATION_DOMAIN);
        let enumeration_prefixes = prefix_literals(ROLE_ENUMERATION_REQUEST_OPENER);
        let enumeration_constraint_markers = bare_literals(ROLE_ENUMERATION_CONSTRAINT);
        WebSearchMarkers {
            explicit_prefixes,
            action_markers,
            strong_action_markers,
            signal_markers,
            topic_after_markers,
            topic_before_markers,
            imperative_lead_markers,
            leading_noise,
            trailing_noise,
            source_only,
            followup_verbs,
            continuation_markers,
            research_question_prefixes,
            research_modifiers,
            research_evidence_domains,
            research_evaluation_domains,
            enumeration_prefixes,
            enumeration_constraint_markers,
        }
    }

    CACHE.get_or_init(load)
}
/// Collects, for every word form of `role` whose slot is `Slot::Prefix`, the
/// text returned by `WordForm::before_slot`.
fn prefix_literals(role: &str) -> Vec<&'static str> {
    let mut literals = Vec::new();
    for form in seed::lexicon().role_word_forms(role) {
        if form.slot() == Slot::Prefix {
            literals.push(WordForm::before_slot(form));
        }
    }
    literals
}
/// Collects, for every word form of `role` whose slot is `Slot::Suffix`, the
/// text returned by `WordForm::after_slot`.
fn suffix_literals(role: &str) -> Vec<&'static str> {
    let mut literals = Vec::new();
    for form in seed::lexicon().role_word_forms(role) {
        if form.slot() == Slot::Suffix {
            literals.push(WordForm::after_slot(form));
        }
    }
    literals
}
/// Collects the full text of every word form of `role` whose slot is
/// `Slot::Bare`.
fn bare_literals(role: &str) -> Vec<&'static str> {
    let mut literals = Vec::new();
    for form in seed::lexicon().role_word_forms(role) {
        if form.slot() == Slot::Bare {
            literals.push(form.text.as_str());
        }
    }
    literals
}
/// Returns every lexicon word for `role`, trimmed and lowercased, as owned
/// strings.
fn source_literals(role: &str) -> Vec<String> {
    let mut literals = Vec::new();
    for word in seed::lexicon().words_for_role(role).iter() {
        literals.push(word.trim().to_lowercase());
    }
    literals
}
/// Extracts a query from a prompt that expresses a search action in free form.
///
/// The prompt must contain an action marker and, in addition, either a strong
/// action marker or a signal marker. The query is then taken, in priority
/// order, from the text after a topic marker, the text before a topic marker,
/// or the text after an imperative lead. Only the first occurrence of each
/// marker is considered, and a candidate is skipped if it fails
/// `valid_search_query`.
fn extract_semantic_web_search_query(normalized: &str) -> Option<String> {
    let markers = markers();
    if !contains_any_search_marker(normalized, &markers.action_markers) {
        return None;
    }
    let corroborated = contains_any_search_marker(normalized, &markers.strong_action_markers)
        || contains_any_search_marker(normalized, &markers.signal_markers);
    if !corroborated {
        return None;
    }

    let query_after = |marker: &str| -> Option<String> {
        let at = normalized.find(marker)?;
        valid_search_query(&normalized[at + marker.len()..])
    };
    let query_before = |marker: &str| -> Option<String> {
        let at = normalized.find(marker)?;
        valid_search_query(&normalized[..at])
    };

    markers
        .topic_after_markers
        .iter()
        .find_map(|&marker| query_after(marker))
        .or_else(|| {
            markers
                .topic_before_markers
                .iter()
                .find_map(|&marker| query_before(marker))
        })
        .or_else(|| {
            markers
                .imperative_lead_markers
                .iter()
                .find_map(|&marker| query_after(marker))
        })
}
fn extract_implicit_research_question(normalized: &str) -> Option<String> {
let markers = markers();
if !starts_with_any(normalized, &markers.research_question_prefixes) {
return None;
}
let padded = format!(" {normalized} ");
let has_modifier = markers
.research_modifiers
.iter()
.any(|marker| padded.contains(marker));
let has_evidence_domain = markers
.research_evidence_domains
.iter()
.any(|marker| padded.contains(marker));
let has_evaluation_domain = markers
.research_evaluation_domains
.iter()
.any(|marker| padded.contains(marker));
if !(has_modifier || has_evidence_domain && has_evaluation_domain) {
return None;
}
let query = strip_implicit_research_prefix(normalized);
valid_search_query(query)
}
fn extract_enumeration_research_request(normalized: &str) -> Option<String> {
let query = strip_enumeration_research_prefix(normalized)?;
if !looks_like_enumeration_research_query(query) {
return None;
}
valid_search_query(query)
}
/// Reports whether `value` begins with at least one of `prefixes`.
fn starts_with_any(value: &str, prefixes: &[&str]) -> bool {
    for &candidate in prefixes {
        if value.starts_with(candidate) {
            return true;
        }
    }
    false
}
/// Removes the first matching research-question opener from the start of
/// `value`, or returns `value` unchanged when none matches.
fn strip_implicit_research_prefix(value: &str) -> &str {
    markers()
        .research_question_prefixes
        .iter()
        .find_map(|&opener| value.strip_prefix(opener))
        .unwrap_or(value)
}
/// Removes the first matching enumeration opener from the start of `value`,
/// or returns `None` when `value` starts with none of them.
fn strip_enumeration_research_prefix(value: &str) -> Option<&str> {
    markers()
        .enumeration_prefixes
        .iter()
        .find_map(|&opener| value.strip_prefix(opener))
}
/// Reports whether `query` has at least three whitespace-separated words and
/// contains an enumeration constraint marker.
fn looks_like_enumeration_research_query(query: &str) -> bool {
    let word_count = query.split_whitespace().count();
    word_count >= 3
        && contains_any_search_marker(query, &markers().enumeration_constraint_markers)
}
/// Reports whether `normalized` contains at least one of `markers`, using the
/// matching rules of `contains_search_marker`.
fn contains_any_search_marker(normalized: &str, markers: &[&str]) -> bool {
    for &candidate in markers {
        if contains_search_marker(normalized, candidate) {
            return true;
        }
    }
    false
}
/// Reports whether `normalized` contains `marker`.
///
/// A marker that starts or ends with a space is matched against the text
/// padded with one space on each side, so it can also match at the very
/// beginning or end of the text. Any other marker is a plain substring test.
fn contains_search_marker(normalized: &str, marker: &str) -> bool {
    let has_edge_space = marker.starts_with(' ') || marker.ends_with(' ');
    if !has_edge_space {
        return normalized.contains(marker);
    }
    let padded = [" ", normalized, " "].concat();
    padded.contains(marker)
}
/// Cleans `value` with `clean_semantic_search_query` and returns it when it is
/// usable as a search query.
///
/// A query is rejected when it is empty, when its lowercase form equals one of
/// the source-only words, or when `normalize_url_candidate` accepts it.
fn valid_search_query(value: &str) -> Option<String> {
    let query = clean_semantic_search_query(value);
    if query.is_empty() {
        return None;
    }
    let lowered = query.to_lowercase();
    if markers().source_only.contains(&lowered) {
        return None;
    }
    if normalize_url_candidate(&query).is_some() {
        return None;
    }
    Some(query)
}
/// Cuts a trailing follow-up instruction off a query.
///
/// Every occurrence of every follow-up verb is examined. Non-CJK verbs must
/// stand as whole tokens; CJK verbs match anywhere. When `boundary_before`
/// finds a clause boundary in front of an occurrence, the text is cut there.
/// The earliest such cut wins, and the result is trimmed. Returns `value`
/// trimmed when no cut applies.
fn truncate_search_instruction_tail(value: &str) -> &str {
    let markers = markers();
    // ASCII-only lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid slice indices into `value`.
    let lower = value.to_ascii_lowercase();
    let mut cut = value.len();
    for &verb in &markers.followup_verbs {
        // An empty pattern matches at the current offset without advancing
        // `from`, which would spin forever; such an entry carries no meaning.
        if verb.is_empty() {
            continue;
        }
        let cjk = contains_cjk(verb);
        let mut from = 0;
        while let Some(relative) = lower[from..].find(verb) {
            let start = from + relative;
            let end = start + verb.len();
            // Resume after this match whether or not it is accepted.
            from = end;
            if !cjk && (!is_token_start(&lower, start) || !is_token_end(&lower, end)) {
                continue;
            }
            if let Some(boundary) = boundary_before(&lower, start, markers) {
                cut = cut.min(boundary);
            }
        }
    }
    value[..cut].trim()
}
/// Reports whether byte offset `index` starts a token, i.e. the character
/// just before it is not alphanumeric or there is no such character.
///
/// `index` must lie on a character boundary of `text`.
fn is_token_start(text: &str, index: usize) -> bool {
    match text[..index].chars().next_back() {
        Some(previous) => !previous.is_alphanumeric(),
        None => true,
    }
}
/// Reports whether byte offset `index` ends a token, i.e. the character
/// starting at it is not alphanumeric or there is no such character.
///
/// `index` must lie on a character boundary of `text`.
fn is_token_end(text: &str, index: usize) -> bool {
    match text[index..].chars().next() {
        Some(following) => !following.is_alphanumeric(),
        None => true,
    }
}
/// Finds where the clause preceding a follow-up verb ends.
///
/// Looks at `text[..verb_start]` with trailing whitespace removed. Returns
/// `None` when nothing precedes the verb. When the preceding text ends with a
/// sentence boundary character, returns the length of that trimmed text.
/// Otherwise repeatedly peels continuation markers off the end and, if at
/// least one was removed, returns the trimmed length of what remains; `None`
/// when no continuation marker precedes the verb.
fn boundary_before(text: &str, verb_start: usize, markers: &WebSearchMarkers) -> Option<usize> {
    let head = text[..verb_start].trim_end();
    if head.is_empty() {
        return None;
    }
    if head.ends_with(is_sentence_boundary) {
        return Some(head.len());
    }

    let mut remaining = head;
    let mut peeled_any = false;
    loop {
        let trimmed = remaining.trim_end();
        let Some(marker) = markers
            .continuation_markers
            .iter()
            .copied()
            .find(|marker| ends_with_token(trimmed, marker))
        else {
            break;
        };
        remaining = &trimmed[..trimmed.len() - marker.len()];
        peeled_any = true;
    }

    if peeled_any {
        Some(remaining.trim_end().len())
    } else {
        None
    }
}
/// Reports whether `haystack` ends with `marker` as a token.
///
/// A marker for which `contains_cjk` is true only needs to be a suffix. Any
/// other marker must be the whole haystack or be preceded by whitespace.
fn ends_with_token(haystack: &str, marker: &str) -> bool {
    if contains_cjk(marker) {
        return haystack.ends_with(marker);
    }
    match haystack.strip_suffix(marker) {
        // An empty head means the haystack is exactly the marker.
        Some(head) => head.is_empty() || head.ends_with(char::is_whitespace),
        None => false,
    }
}
/// Produces the final query text from a raw fragment.
///
/// The fragment is first cut at any trailing follow-up instruction and
/// cleaned. Leading and trailing noise phrases are then stripped, re-cleaning
/// after each removal, and the passes repeat until one leaves the text
/// unchanged.
fn clean_semantic_search_query(value: &str) -> String {
    let markers = markers();
    let mut current = clean_search_query(truncate_search_instruction_tail(value));
    loop {
        let mut next = current.clone();
        for noise in markers.leading_noise.iter().copied() {
            if let Some(rest) = next.strip_prefix(noise) {
                next = clean_search_query(rest);
            }
        }
        for noise in markers.trailing_noise.iter().copied() {
            if let Some(rest) = next.strip_suffix(noise) {
                next = clean_search_query(rest);
            }
        }
        // Compare texts rather than tracking a "stripped" flag so that a pass
        // which strips nothing observable still terminates.
        if next == current {
            return current;
        }
        current = next;
    }
}