use super::http::{HttpClient, HttpError};
use super::wiktionary::query_encode;
/// Client for the Wikidata endpoints used in this module (the SPARQL query
/// service and the `wbsearchentities` API action).
///
/// The client borrows its HTTP transport, so it is cheap to construct and
/// `T` may be unsized (for example a trait object).
pub struct Wikidata<'a, T: HttpClient + ?Sized> {
/// Borrowed HTTP transport through which every request is issued.
http: &'a T,
}
impl<'a, T: HttpClient + ?Sized> Wikidata<'a, T> {
    /// Creates a client that issues its requests through `http`.
    pub const fn new(http: &'a T) -> Self {
        Self { http }
    }

    /// Sends `query` to the Wikidata SPARQL endpoint, asking for JSON output,
    /// and returns the response body as received.
    ///
    /// # Errors
    ///
    /// Returns whatever error the underlying HTTP client reports.
    pub fn sparql(&self, query: &str) -> Result<String, HttpError> {
        let encoded = query_encode(query);
        let endpoint = format!("https://query.wikidata.org/sparql?format=json&query={encoded}");
        self.http.get(&endpoint)
    }

    /// Looks up lemmas of lexemes that share a sense meaning (`wdt:P5137`) and
    /// the lexical category with `source_lexeme_id`, restricted to lexemes
    /// whose language carries the `wdt:P218` code `target_lang_iso`.
    ///
    /// The response is scraped with [`parse_sparql_lemmas`].
    ///
    /// NOTE(review): both arguments are interpolated verbatim into the query
    /// text; callers are presumably expected to pass a plain lexeme id and a
    /// plain language code — confirm before feeding untrusted input.
    ///
    /// # Errors
    ///
    /// Returns the HTTP client's error when the request fails.
    pub fn lexeme_translations(
        &self,
        source_lexeme_id: &str,
        target_lang_iso: &str,
    ) -> Result<Vec<SparqlLemma>, HttpError> {
        let query = format!(
            "SELECT DISTINCT ?lemma WHERE {{ \
             wd:{source_lexeme_id} ontolex:sense ?source_sense . \
             wd:{source_lexeme_id} wikibase:lexicalCategory ?category . \
             ?source_sense wdt:P5137 ?meaning . \
             ?lexeme ontolex:sense ?sense . \
             ?sense wdt:P5137 ?meaning . \
             ?lexeme wikibase:lexicalCategory ?category . \
             ?lexeme dct:language ?language . \
             ?language wdt:P218 \"{target_lang_iso}\" . \
             ?lexeme wikibase:lemma ?lemma . \
             }}"
        );
        self.sparql(&query).map(|body| parse_sparql_lemmas(&body))
    }

    /// Searches Wikidata lexemes matching `lemma` via `wbsearchentities`,
    /// using `language_iso` for both the `language` and `uselang` parameters
    /// and asking for at most five results.
    ///
    /// The response is scraped with [`parse_wbsearch_hits`].
    ///
    /// # Errors
    ///
    /// Returns the HTTP client's error when the request fails.
    pub fn search_lexeme(
        &self,
        lemma: &str,
        language_iso: &str,
    ) -> Result<Vec<LexemeSearchHit>, HttpError> {
        let encoded_lemma = query_encode(lemma);
        let encoded_lang = query_encode(language_iso);
        let endpoint = format!(
            "https://www.wikidata.org/w/api.php?action=wbsearchentities\
             &search={encoded_lemma}&language={encoded_lang}&type=lexeme&format=json\
             &uselang={encoded_lang}&limit=5"
        );
        self.http
            .get(&endpoint)
            .map(|body| parse_wbsearch_hits(&body))
    }
}
/// One `lemma` binding scraped from a SPARQL JSON result by
/// [`parse_sparql_lemmas`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SparqlLemma {
/// Content of the binding's `value` member; empty when the member is absent.
pub value: String,
/// Content of the binding's `xml:lang` member, if it has one.
pub language: Option<String>,
}
/// One search result scraped from a `wbsearchentities` response by
/// [`parse_wbsearch_hits`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexemeSearchHit {
/// Content of the hit's `id` member.
pub id: String,
/// Content of the hit's `label` member; empty when no label was found.
pub label: String,
}
/// Extracts every `lemma` binding from a SPARQL JSON result body.
///
/// The body is scanned for `"lemma"` keys whose value is an object; from each
/// such object the `value` member (empty string when missing) and the
/// optional `xml:lang` member are read. Scanning stops early if an object is
/// not terminated, returning the rows collected so far.
///
/// A `"lemma"` token that is not followed by `:` and `{` is skipped. This
/// matters for the standard `head.vars` array (`"vars": ["lemma"]`), which
/// precedes the bindings: treating that entry as a binding would pair it with
/// the whole `results` object and swallow every row after the first.
#[must_use]
pub fn parse_sparql_lemmas(body: &str) -> Vec<SparqlLemma> {
    const LEMMA_KEY: &str = "\"lemma\"";

    let mut out: Vec<SparqlLemma> = Vec::new();
    let mut cursor = 0usize;
    while let Some(start) = body[cursor..].find(LEMMA_KEY) {
        let after_key = cursor + start + LEMMA_KEY.len();

        // Only `"lemma" : {` introduces a binding object.
        let binding = body[after_key..]
            .trim_start()
            .strip_prefix(':')
            .map(str::trim_start)
            .filter(|rest| rest.starts_with('{'));
        let Some(object_text) = binding else {
            cursor = after_key;
            continue;
        };

        let Some(object_end) = find_matching_brace(object_text) else {
            break;
        };
        let object_body = &object_text[..object_end];
        let value = read_json_field(object_body, "value").unwrap_or_default();
        let language = read_json_field(object_body, "xml:lang");
        out.push(SparqlLemma { value, language });

        // `object_text` is a suffix of `body`, so its start offset is the
        // difference of the two lengths.
        cursor = body.len() - object_text.len() + object_end;
    }
    out
}
/// Returns the length in bytes of the brace-delimited object that starts at
/// the beginning of `input`, including both braces.
///
/// Braces inside double-quoted strings are ignored, and a backslash inside a
/// string protects the byte that follows it. Returns `None` when `input` does
/// not start with `{` or the object is never closed.
fn find_matching_brace(input: &str) -> Option<usize> {
    if !input.starts_with('{') {
        return None;
    }

    let mut nesting: u32 = 0;
    let mut inside_string = false;
    let mut skip_next = false;

    for (index, &current) in input.as_bytes().iter().enumerate() {
        if skip_next {
            // This byte was escaped by the preceding backslash.
            skip_next = false;
            continue;
        }

        if inside_string {
            match current {
                b'\\' => skip_next = true,
                b'"' => inside_string = false,
                _ => {}
            }
            continue;
        }

        match current {
            b'"' => inside_string = true,
            b'{' => nesting += 1,
            b'}' => {
                // Cannot underflow: the first byte is `{`, so `nesting` is at
                // least 1 until the matching close returns.
                nesting -= 1;
                if nesting == 0 {
                    return Some(index + 1);
                }
            }
            _ => {}
        }
    }

    None
}
/// Reads the string value of the member called `name` from the JSON text in
/// `object`.
///
/// A match only counts when the quoted name is directly followed (ignoring
/// whitespace) by `:` and an opening `"`. Occurrences that do not have this
/// shape are skipped and the search continues. That covers the name appearing
/// as a string *value*, and members whose value is not a string. Without the
/// check, the next quoted token in the text (typically the following key)
/// would be returned as the value.
///
/// Returns `None` when no string-valued member with that name exists. The
/// lookup is textual and does not track nesting, so a member of a nested
/// object can match as well.
fn read_json_field(object: &str, name: &str) -> Option<String> {
    let needle = format!("\"{name}\"");
    let mut search_start = 0usize;
    while let Some(field_offset) = object[search_start..].find(&needle) {
        let after_key = search_start + field_offset + needle.len();

        // Expect `<ws> : <ws> "` immediately after the key.
        let string_value = object[after_key..]
            .trim_start()
            .strip_prefix(':')
            .and_then(|after_colon| after_colon.trim_start().strip_prefix('"'));
        if let Some(value) = string_value {
            return Some(read_json_string(value));
        }

        // Not a string-valued key here; keep looking further along.
        search_start = after_key;
    }
    None
}
/// Extracts the hits from a `wbsearchentities` JSON response body.
///
/// Scanning starts at the first `"search"` token. Every string-valued `"id"`
/// key found from there on yields one hit. The hit's label is the
/// string-valued `"label"` member at the same nesting level as the id; it is
/// empty when the enclosing object closes before such a member is seen.
///
/// Tokens that merely look like keys are ignored. An `"id"` or `"label"` that
/// is not followed by `:` and an opening quote is skipped. This covers, for
/// instance, a search term echoed back as the string value `"id"`, or a
/// nested object (such as a `display` object, if the response carries one)
/// that has its own `label` member.
#[must_use]
pub fn parse_wbsearch_hits(body: &str) -> Vec<LexemeSearchHit> {
    const ID_KEY: &str = "\"id\"";

    let mut out: Vec<LexemeSearchHit> = Vec::new();
    let Some(search_idx) = body.find("\"search\"") else {
        return out;
    };
    let mut cursor = search_idx;
    while let Some(id_offset) = body[cursor..].find(ID_KEY) {
        let after_key = cursor + id_offset + ID_KEY.len();
        let Some(value_offset) = string_value_offset(&body[after_key..]) else {
            // `"id"` used as a value, or an id that is not a string.
            cursor = after_key;
            continue;
        };
        let id_start = after_key + value_offset;
        let id = read_json_string(&body[id_start..]);
        let after_id = id_start + escaped_string_advance(&body[id_start..]);
        let label = top_level_label(&body[after_id..]).unwrap_or_default();
        out.push(LexemeSearchHit { id, label });
        cursor = after_id;
    }
    out
}

/// Given the text right after a key's closing quote, returns the offset of the
/// first character of its string value, or `None` when the key is not followed
/// by `:` and an opening `"`.
fn string_value_offset(after_key: &str) -> Option<usize> {
    let after_colon = after_key.trim_start().strip_prefix(':')?;
    let value = after_colon.trim_start().strip_prefix('"')?;
    Some(after_key.len() - value.len())
}

/// Finds the string-valued `"label"` member of the object that `tail`
/// continues, skipping string contents and nested objects, and stopping at
/// the brace that closes the object.
fn top_level_label(tail: &str) -> Option<String> {
    const LABEL_KEY: &str = "\"label\"";

    let mut depth = 0usize;
    let mut in_string = false;
    let mut escape = false;
    for (index, byte) in tail.bytes().enumerate() {
        if in_string {
            if escape {
                escape = false;
            } else if byte == b'\\' {
                escape = true;
            } else if byte == b'"' {
                in_string = false;
            }
            continue;
        }
        match byte {
            b'"' => {
                // A quote byte is ASCII, so `index` is a char boundary.
                if depth == 0 && tail[index..].starts_with(LABEL_KEY) {
                    let after_key = index + LABEL_KEY.len();
                    if let Some(value_offset) = string_value_offset(&tail[after_key..]) {
                        return Some(read_json_string(&tail[after_key + value_offset..]));
                    }
                }
                in_string = true;
            }
            b'{' => depth += 1,
            b'}' => {
                if depth == 0 {
                    // The object holding the id ended without a label.
                    return None;
                }
                depth -= 1;
            }
            _ => {}
        }
    }
    None
}
/// Decodes the contents of a JSON string literal.
///
/// `input` must start right after the opening quote. Decoding stops at the
/// first unescaped `"` or at the end of `input`, and the decoded text is
/// returned.
///
/// Handled escapes: `\n`, `\t`, `\r`, `\b`, `\f`, `\"`, `\\`, `\/` and
/// `\uXXXX`, including UTF-16 surrogate pairs. Any other escaped character is
/// kept as-is. A lone low surrogate is dropped.
///
/// Malformed `\u` escapes end decoding early and the text decoded so far is
/// returned. Malformed means non-hex digits, a high surrogate without a
/// following `\u` escape, or a following escape that is not a low surrogate.
fn read_json_string(input: &str) -> String {
    // Reads up to four characters and parses them as hexadecimal digits.
    // Only hex digits are accepted (`from_str_radix` alone would also allow a
    // leading `+`).
    fn read_hex4(chars: &mut std::str::Chars<'_>) -> Option<u32> {
        let digits: String = chars.by_ref().take(4).collect();
        if digits.is_empty() || !digits.chars().all(|c| c.is_ascii_hexdigit()) {
            return None;
        }
        u32::from_str_radix(&digits, 16).ok()
    }

    // `input` is usually the whole remainder of a response body, so sizing the
    // buffer from it would over-allocate on every call.
    let mut out = String::new();
    let mut chars = input.chars();
    while let Some(character) = chars.next() {
        match character {
            '"' => break,
            '\\' => {
                let Some(escaped) = chars.next() else { break };
                match escaped {
                    'n' => out.push('\n'),
                    't' => out.push('\t'),
                    'r' => out.push('\r'),
                    'b' => out.push('\u{0008}'),
                    'f' => out.push('\u{000C}'),
                    'u' => {
                        let Some(unit) = read_hex4(&mut chars) else {
                            break;
                        };
                        if (0xD800..=0xDBFF).contains(&unit) {
                            // High surrogate: the low half must follow as
                            // another `\u` escape.
                            if chars.next() != Some('\\') || chars.next() != Some('u') {
                                break;
                            }
                            let Some(low) = read_hex4(&mut chars) else {
                                break;
                            };
                            // Guards the subtraction below against underflow.
                            if !(0xDC00..=0xDFFF).contains(&low) {
                                break;
                            }
                            let combined = 0x1_0000 + ((unit - 0xD800) << 10) + (low - 0xDC00);
                            if let Some(c) = char::from_u32(combined) {
                                out.push(c);
                            }
                        } else if let Some(c) = char::from_u32(unit) {
                            out.push(c);
                        }
                    }
                    // Covers `"`, `\` and `/`, plus any unknown escape.
                    other => out.push(other),
                }
            }
            other => out.push(other),
        }
    }
    out
}
/// Returns how many bytes of `input` belong to a JSON string literal whose
/// opening quote has already been consumed, counting the closing quote.
///
/// A backslash consumes the byte after it; `\u` additionally consumes up to
/// four more bytes without inspecting them. When no closing quote is found the
/// whole length of `input` is returned.
fn escaped_string_advance(input: &str) -> usize {
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut pos = 0usize;

    while pos < len {
        let current = bytes[pos];
        pos += 1;
        match current {
            b'"' => return pos,
            // A trailing backslash has nothing left to escape.
            b'\\' if pos < len => {
                let escaped = bytes[pos];
                pos += 1;
                if escaped == b'u' {
                    // Skip the hex digits, clamped to the end of the input.
                    pos = (pos + 4).min(len);
                }
            }
            _ => {}
        }
    }

    pos
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two bindings in, two rows out, each with its value and language tag.
    #[test]
    fn parse_sparql_lemmas_extracts_values_and_languages() {
        let body = r#"{
"results": {
"bindings": [
{"lemma": {"xml:lang": "ru", "type": "literal", "value": "привет"}},
{"lemma": {"xml:lang": "fr", "type": "literal", "value": "bonjour"}}
]
}
}"#;
        let rows = parse_sparql_lemmas(body);
        let summary: Vec<(&str, Option<&str>)> = rows
            .iter()
            .map(|row| (row.value.as_str(), row.language.as_deref()))
            .collect();
        assert_eq!(
            summary,
            vec![("привет", Some("ru")), ("bonjour", Some("fr"))]
        );
    }

    /// An empty bindings array produces no rows.
    #[test]
    fn parse_sparql_lemmas_handles_empty_bindings() {
        let body = r#"{"results": {"bindings": []}}"#;
        assert_eq!(parse_sparql_lemmas(body).len(), 0);
    }

    /// Ids of every hit are returned in order, along with the first label.
    #[test]
    fn parse_wbsearch_hits_extracts_lexeme_ids_and_labels() {
        let body = r#"{"searchinfo": {"search": "hello"}, "search": [
{"id": "L8485", "label": "hello", "description": "..."},
{"id": "L52", "label": "hello", "description": "..."}
]}"#;
        let hits = parse_wbsearch_hits(body);
        let ids: Vec<&str> = hits.iter().map(|hit| hit.id.as_str()).collect();
        assert_eq!(ids, ["L8485", "L52"]);
        assert_eq!(hits[0].label, "hello");
    }

    /// A `\u` escape is decoded and reading stops at the closing quote.
    #[test]
    fn read_json_string_decodes_unicode_escapes() {
        let decoded = read_json_string("\\u0041BC\"rest");
        assert_eq!(decoded, "ABC");
    }

    /// The returned count includes the closing quote.
    #[test]
    fn escaped_string_advance_skips_to_closing_quote() {
        let consumed = escaped_string_advance("hello\"world");
        assert_eq!(consumed, 6);
    }

    /// A `\uXXXX` escape counts as six bytes before the closing quote.
    #[test]
    fn escaped_string_advance_handles_unicode_escapes() {
        let consumed = escaped_string_advance("\\u0041\"after");
        assert_eq!(consumed, 7);
    }
}