formal-ai 0.178.0

//! Scoped documentation-answer handlers for known project APIs.

use crate::engine::SymbolicAnswer;
use crate::event_log::EventLog;
use crate::language::{detect as detect_language, Language};
use crate::seed::{self, Slot};
use crate::solver_handlers::finalize_simple;

/// Official pandas API reference page for `DataFrame.join`.
///
/// Recorded in the event log as the answer's source and interpolated into the
/// Markdown source link at the end of every rendered answer.
const PANDAS_DATAFRAME_JOIN_DOCS_URL: &str =
    "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.join.html";

/// Answers prompts such as "how the join method works in pandas" with a short
/// summary scoped to the official pandas documentation.
///
/// Returns `None`, leaving `log` untouched, when
/// [`is_pandas_dataframe_join_prompt`] rejects the prompt. Otherwise the
/// request, project, method, source kind and source URL are appended to `log`
/// in that order, the summary is rendered in the language detected from
/// `prompt`, and the result is handed to [`finalize_simple`] together with the
/// literal `0.92` (presumably the answer's confidence — confirm against
/// `finalize_simple`).
pub fn try_docs_method_explanation(
    prompt: &str,
    normalized: &str,
    log: &mut EventLog,
) -> Option<SymbolicAnswer> {
    is_pandas_dataframe_join_prompt(prompt, normalized).then(|| {
        // Trace what was recognised before the answer itself is produced.
        log.append("docs_method:request", prompt.to_owned());
        log.append("docs_method:project", "pandas".to_owned());
        log.append("docs_method:method", "pandas.DataFrame.join".to_owned());
        log.append("docs_method:source_kind", "official-docs".to_owned());
        log.append("docs_method:source", PANDAS_DATAFRAME_JOIN_DOCS_URL);

        let language = detect_language(prompt);
        let summary = render_pandas_dataframe_join_answer(language);

        finalize_simple(
            prompt,
            log,
            "docs_method_explanation",
            "response:docs_method_explanation",
            &summary,
            0.92,
        )
    })
}

/// Decides whether the prompt asks how pandas' `DataFrame.join` works.
///
/// Three gates are checked first, in this order, against the trimmed
/// `normalized` text: it must not be an [`is_explicit_web_search`], it must
/// contain the whole word `pandas`, and it must read as an
/// [`is_explanation_request`].
///
/// After that, any one of the following counts as evidence of the join API:
///
/// * the lowercased raw `prompt` contains `dataframe.join` or `df.join` —
///   library symbol names, which are spelled the same whatever language the
///   question is written in;
/// * `normalized` contains the whole word `join` together with either the whole
///   word `dataframe` or a surface of the
///   [`code_method_noun`](seed::ROLE_CODE_METHOD_NOUN) lexicon role, so the
///   word "method" is never spelled out here in any language.
fn is_pandas_dataframe_join_prompt(prompt: &str, normalized: &str) -> bool {
    let text = normalized.trim();

    let passes_gates = !is_explicit_web_search(text)
        && has_word(text, "pandas")
        && is_explanation_request(text);
    if !passes_gates {
        return false;
    }

    // The dotted identifiers are looked up in the raw prompt; the lowercase
    // copy is only built once the gates above have passed.
    let raw = prompt.to_lowercase();
    if raw.contains("dataframe.join") || raw.contains("df.join") {
        return true;
    }

    // Every remaining kind of evidence needs the bare `join` identifier.
    if !has_word(text, "join") {
        return false;
    }
    has_word(text, "dataframe")
        || seed::lexicon().mentions_role(seed::ROLE_CODE_METHOD_NOUN, text)
}

/// Reports whether the prompt opens with an imperative to search the web, in
/// which case this narrow docs handler steps aside.
///
/// Two conditions must both hold:
///
/// 1. **Search lead.** Some [`Slot::Prefix`] form of the
///    [`web_search_imperative_lead`](seed::ROLE_WEB_SEARCH_IMPERATIVE_LEAD) role
///    has its before-slot literal at the very start of `normalized`.
/// 2. **Web medium.** Some surface of the [`web_medium`](seed::ROLE_WEB_MEDIUM)
///    role occurs in the prompt. A surface that begins or ends with a space is
///    searched for in a space-padded copy of the prompt, so it can also match
///    the first or last word; any other surface is searched for in the prompt
///    as given.
///
/// The padded copy is only allocated once the first condition has held.
fn is_explicit_web_search(normalized: &str) -> bool {
    let lexicon = seed::lexicon();

    let opens_with_search_lead = lexicon
        .role_word_forms(seed::ROLE_WEB_SEARCH_IMPERATIVE_LEAD)
        .into_iter()
        .any(|lead| {
            matches!(lead.slot(), Slot::Prefix) && normalized.starts_with(lead.before_slot())
        });
    if !opens_with_search_lead {
        return false;
    }

    let padded = format!(" {normalized} ");
    lexicon
        .role_word_forms(seed::ROLE_WEB_MEDIUM)
        .into_iter()
        .any(|medium| {
            let surface = medium.text.as_str();
            let space_wrapped = surface.starts_with(' ') || surface.ends_with(' ');
            let haystack = if space_wrapped {
                padded.as_str()
            } else {
                normalized
            };
            haystack.contains(surface)
        })
}

/// True when the prompt is phrased as a request to have something explained.
///
/// Every interrogative and imperative lead-in lives in the
/// [`explanation_request_lead`](seed::ROLE_EXPLANATION_REQUEST_LEAD) role rather
/// than in this function, so no question word is hardcoded here. Each surface
/// is matched according to its slot:
///
/// * A [`Slot::Prefix`] form ("how …", "explain …", "как …") is matched by the
///   literal before the ellipsis against the start of the prompt.
/// * A bare form without surrounding spaces ("कैसे काम", "如何工作") is matched
///   as a raw substring anywhere in the prompt.
/// * A space-wrapped bare form (" how ", " как ") is matched against a
///   space-padded copy of the prompt, so it matches only whole words.
///
/// The padding is the same whole-token convention `is_explicit_web_search`
/// uses. The caller passes a trimmed prompt, so without it a space-wrapped
/// form could never match the first or last word of the prompt.
fn is_explanation_request(normalized: &str) -> bool {
    // Padded once up front; only consulted for space-wrapped surfaces.
    let padded = format!(" {normalized} ");
    seed::lexicon()
        .role_word_forms(seed::ROLE_EXPLANATION_REQUEST_LEAD)
        .into_iter()
        .any(|form| match form.slot() {
            Slot::Prefix => normalized.starts_with(form.before_slot()),
            _ => {
                let surface = form.text.as_str();
                if surface.starts_with(' ') || surface.ends_with(' ') {
                    padded.contains(surface)
                } else {
                    normalized.contains(surface)
                }
            }
        })
}

/// Reports whether `word` appears in `normalized` as a complete
/// whitespace-delimited token.
///
/// The comparison is exact and case-sensitive: `"joins"` and `"pandas.join"`
/// do not contain the word `"join"`. An empty `word` never matches, because
/// whitespace splitting yields no empty tokens.
fn has_word(normalized: &str, word: &str) -> bool {
    for token in normalized.split_whitespace() {
        if token == word {
            return true;
        }
    }
    false
}

/// Renders the fixed `DataFrame.join` summary in the requested `language`.
///
/// Each arm is a self-contained `format!` template whose only interpolation is
/// [`PANDAS_DATAFRAME_JOIN_DOCS_URL`], placed in a Markdown source link at the
/// end of the text. Russian, Hindi and Chinese have dedicated texts;
/// `Language::English` and `Language::Unknown` share the English one.
///
/// The literals use `\` line continuations, which drop the line break and the
/// next line's leading whitespace, so every wrapped line must keep the space
/// before its trailing `\`.
fn render_pandas_dataframe_join_answer(language: Language) -> String {
    match language {
        Language::Russian => format!(
            "pandas `DataFrame.join` добавляет столбцы из `other` DataFrame \
             или именованной Series к вызывающему DataFrame и возвращает новый \
             DataFrame.\n\n\
             В рамках этого метода: по умолчанию это left join по индексу \
             вызывающего DataFrame. Если задан `on`, pandas сопоставляет этот \
             столбец или уровень индекса с индексом объекта `other`. Параметр \
             `how` управляет объединением ключей (`left`, `right`, `outer`, \
             `inner`, `cross`, `left_anti` или `right_anti`). `lsuffix` и \
             `rsuffix` нужны при совпадающих именах столбцов, `sort` сортирует \
             ключи join, а `validate` проверяет связи one-to-one, one-to-many, \
             many-to-one или many-to-many. Для join столбец-к-столбцу \
             документация pandas указывает на `DataFrame.merge`.\n\n\
             Источник: [pandas.DataFrame.join]({PANDAS_DATAFRAME_JOIN_DOCS_URL}) \
             (официальная документация pandas)."
        ),
        // NOTE(review): unlike the Russian text, the Hindi and Chinese texts end
        // with the English "Source: ... (official pandas docs)." line — confirm
        // that leaving the footer untranslated is intentional.
        Language::Hindi => format!(
            "pandas `DataFrame.join` कॉल करने वाले DataFrame में `other` \
             DataFrame या named Series के columns जोड़ता है और नया DataFrame \
             लौटाता है.\n\n\
             इस method के दायरे में: default रूप से यह caller के index पर left \
             join करता है. `on` देने पर pandas caller के उस column या index \
             level को `other` object के index से मिलाता है. `how` parameter \
             keys को मिलाने का तरीका चुनता है (`left`, `right`, `outer`, \
             `inner`, `cross`, `left_anti`, या `right_anti`). Column नाम टकराने \
             पर `lsuffix` और `rsuffix`, join keys को sort करने के लिए `sort`, \
             और one-to-one, one-to-many, many-to-one, या many-to-many संबंध \
             जांचने के लिए `validate` इस्तेमाल करें. Column-on-column joins \
             के लिए pandas docs `DataFrame.merge` की ओर भेजते हैं.\n\n\
             Source: [pandas.DataFrame.join]({PANDAS_DATAFRAME_JOIN_DOCS_URL}) \
             (official pandas docs)."
        ),
        Language::Chinese => format!(
            "pandas `DataFrame.join` 会把 `other` DataFrame 或具名 Series 的列加入调用方，并返回新的 DataFrame。\n\n\
             只看这个方法：默认情况下，它使用调用方的 index 执行 left join。设置 `on` 时，pandas 会把调用方的列或索引层级与 `other` 对象的 index 匹配。`how` 参数控制键的组合方式（`left`、`right`、`outer`、`inner`、`cross`、`left_anti` 或 `right_anti`）。列名冲突时使用 `lsuffix` 和 `rsuffix`，用 `sort` 排序 join keys，用 `validate` 检查 one-to-one、one-to-many、many-to-one 或 many-to-many 关系。对于列到列的 join，pandas 文档指向 `DataFrame.merge`。\n\n\
             Source: [pandas.DataFrame.join]({PANDAS_DATAFRAME_JOIN_DOCS_URL}) \
             (official pandas docs)."
        ),
        // English doubles as the fallback when the language was not recognised.
        Language::English | Language::Unknown => format!(
            "pandas `DataFrame.join` joins columns from the `other` DataFrame or \
             named Series into the caller and returns a new DataFrame.\n\n\
             Scoped to this method: by default, it performs a left join using the \
             caller's index. If `on` is set, pandas matches that caller column or \
             index level against the `other` object's index. The `how` parameter \
             controls key handling (`left`, `right`, `outer`, `inner`, `cross`, \
             `left_anti`, or `right_anti`). Use `lsuffix` and `rsuffix` when \
             column names overlap, `sort` to order join keys, and `validate` to \
             check one-to-one, one-to-many, many-to-one, or many-to-many \
             relationships. For column-on-column joins, the pandas docs point to \
             `DataFrame.merge`.\n\n\
             Source: [pandas.DataFrame.join]({PANDAS_DATAFRAME_JOIN_DOCS_URL}) \
             (official pandas docs)."
        ),
    }
}