Skip to main content

zantetsu_vecdb/
matcher.rs

1use crate::error::{MatchResult, MatcherError};
2use dirs::data_dir;
3use flate2::read::GzDecoder;
4use reqwest::blocking::Client;
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7use std::collections::{HashMap, HashSet};
8use std::fs::File;
9use std::io::{BufRead, BufReader};
10use std::path::{Path, PathBuf};
11use strsim::jaro_winkler;
12use tracing::debug;
13
/// Timeout, in seconds, applied to every request made by the remote HTTP client.
const DEFAULT_TIMEOUT_SECS: u64 = 10;
15
16/// Which backend produced a canonical title match.
17#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
18pub enum MatchProvider {
19    /// Match derived from the local Kitsu SQL dump.
20    KitsuDump,
21    /// Match derived from a remote GraphQL endpoint.
22    RemoteEndpoint,
23}
24
25/// External and internal identifiers for a matched anime.
26#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
27pub struct AnimeIds {
28    /// Kitsu anime id.
29    pub kitsu: Option<u32>,
30    /// AniList-compatible media id.
31    pub anilist: Option<u32>,
32    /// MyAnimeList anime id.
33    pub mal: Option<u32>,
34}
35
36/// The best-scoring match for a query title.
37#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
38pub struct AnimeTitleMatch {
39    /// Which backend produced this result.
40    pub provider: MatchProvider,
41    /// Canonical title chosen for the anime.
42    pub canonical_title: String,
43    /// The specific alias that scored highest for the query.
44    pub matched_title: String,
45    /// Similarity score in the range `[0.0, 1.0]`.
46    pub score: f32,
47    /// Known IDs for the matched anime.
48    pub ids: AnimeIds,
49    /// Known aliases for the anime, canonical title first.
50    pub titles: Vec<String>,
51}
52
/// Selects which backend a [`TitleMatcher`] is built on.
#[derive(Clone, Debug)]
pub enum MatchSource {
    /// Match against a local Kitsu SQL dump found at `dump_path`.
    KitsuDump {
        /// Either the dump directory or a concrete `latest.sql` / `latest.sql.gz` file.
        dump_path: PathBuf,
    },
    /// Match by querying a remote GraphQL endpoint.
    RemoteEndpoint {
        /// URL of the remote endpoint.
        endpoint: String,
    },
}
67
68impl MatchSource {
69    /// Use the local Kitsu dump at the provided path.
70    #[must_use]
71    pub fn kitsu_dump(dump_path: impl Into<PathBuf>) -> Self {
72        Self::KitsuDump {
73            dump_path: dump_path.into(),
74        }
75    }
76
77    /// Use a remote GraphQL endpoint.
78    #[must_use]
79    pub fn remote_endpoint(endpoint: impl Into<String>) -> Self {
80        Self::RemoteEndpoint {
81            endpoint: endpoint.into(),
82        }
83    }
84}
85
86/// Returns the default dump directory used by `kitsu-sync`.
87#[must_use]
88pub fn default_kitsu_dump_dir() -> PathBuf {
89    data_dir()
90        .unwrap_or_else(|| PathBuf::from("."))
91        .join("zantetsu")
92        .join("kitsu-dumps")
93}
94
95/// Canonical title matcher backed by either a local Kitsu dump or a remote GraphQL endpoint.
96pub struct TitleMatcher {
97    backend: MatcherBackend,
98}
99
100enum MatcherBackend {
101    Kitsu(KitsuTitleMatcher),
102    RemoteEndpoint(RemoteTitleMatcher),
103}
104
105impl TitleMatcher {
106    /// Create a matcher from the selected source.
107    pub fn new(source: MatchSource) -> MatchResult<Self> {
108        let backend = match source {
109            MatchSource::KitsuDump { dump_path } => {
110                MatcherBackend::Kitsu(KitsuTitleMatcher::from_dump(dump_path)?)
111            }
112            MatchSource::RemoteEndpoint { endpoint } => {
113                MatcherBackend::RemoteEndpoint(RemoteTitleMatcher::new(endpoint)?)
114            }
115        };
116
117        Ok(Self { backend })
118    }
119
120    /// Create a matcher from a local Kitsu dump path.
121    pub fn from_kitsu_dump(dump_path: impl Into<PathBuf>) -> MatchResult<Self> {
122        Self::new(MatchSource::kitsu_dump(dump_path))
123    }
124
125    /// Create a matcher using a remote GraphQL endpoint.
126    pub fn from_remote_endpoint(endpoint: impl Into<String>) -> MatchResult<Self> {
127        Self::new(MatchSource::remote_endpoint(endpoint))
128    }
129
130    /// Return the best available match for the provided title.
131    pub fn match_title(&self, title: &str) -> MatchResult<Option<AnimeTitleMatch>> {
132        Ok(self.search_titles(title, 1)?.into_iter().next())
133    }
134
135    /// Search for the best matches for the provided title.
136    pub fn search_titles(&self, title: &str, limit: usize) -> MatchResult<Vec<AnimeTitleMatch>> {
137        let query = normalize_title(title);
138        if query.is_empty() {
139            return Err(MatcherError::EmptyQuery);
140        }
141
142        let limit = limit.max(1);
143        match &self.backend {
144            MatcherBackend::Kitsu(matcher) => Ok(score_entries(
145                MatchProvider::KitsuDump,
146                &matcher.entries,
147                title,
148                &query,
149                limit,
150            )),
151            MatcherBackend::RemoteEndpoint(matcher) => matcher.search_titles(title, &query, limit),
152        }
153    }
154}
155
156struct KitsuTitleMatcher {
157    entries: Vec<CatalogEntry>,
158}
159
160impl KitsuTitleMatcher {
161    fn from_dump(dump_path: PathBuf) -> MatchResult<Self> {
162        let resolved = resolve_dump_path(&dump_path)?;
163        debug!("loading Kitsu dump from {}", resolved.display());
164        let reader = open_dump_reader(&resolved)?;
165        let entries = parse_kitsu_dump(reader)?;
166        Ok(Self { entries })
167    }
168}
169
170struct RemoteTitleMatcher {
171    client: Client,
172    endpoint: String,
173}
174
175impl RemoteTitleMatcher {
176    fn new(endpoint: String) -> MatchResult<Self> {
177        let endpoint = endpoint.trim().to_string();
178        if endpoint.is_empty() {
179            return Err(MatcherError::InvalidResponse(
180                "remote endpoint URL cannot be empty".into(),
181            ));
182        }
183
184        let client = Client::builder()
185            .timeout(std::time::Duration::from_secs(DEFAULT_TIMEOUT_SECS))
186            .user_agent(format!("zantetsu/{}", env!("CARGO_PKG_VERSION")))
187            .build()?;
188        Ok(Self { client, endpoint })
189    }
190
191    fn search_titles(
192        &self,
193        raw_query: &str,
194        normalized_query: &str,
195        limit: usize,
196    ) -> MatchResult<Vec<AnimeTitleMatch>> {
197        let payload = RemoteGraphQlRequest {
198            query: REMOTE_GRAPHQL_QUERY,
199            variables: RemoteGraphQlVariables {
200                search: raw_query,
201                per_page: limit.max(5) as i64,
202            },
203        };
204
205        let response = self
206            .client
207            .post(&self.endpoint)
208            .json(&payload)
209            .send()?
210            .error_for_status()?;
211
212        let envelope: RemoteGraphQlEnvelope = response.json()?;
213        if let Some(errors) = envelope.errors {
214            let message = errors
215                .into_iter()
216                .map(|error| error.message)
217                .collect::<Vec<_>>()
218                .join("; ");
219            return Err(MatcherError::GraphQl(message));
220        }
221
222        let media = envelope
223            .data
224            .ok_or_else(|| MatcherError::InvalidResponse("missing data".into()))?
225            .page
226            .ok_or_else(|| MatcherError::InvalidResponse("missing page".into()))?
227            .media;
228
229        let entries = media
230            .into_iter()
231            .map(CatalogEntry::from_remote_media)
232            .collect::<Vec<_>>();
233
234        Ok(score_entries(
235            MatchProvider::RemoteEndpoint,
236            &entries,
237            raw_query,
238            normalized_query,
239            limit,
240        ))
241    }
242}
243
244#[derive(Debug, Clone)]
245struct CatalogEntry {
246    canonical_title: String,
247    titles: Vec<String>,
248    normalized_titles: Vec<String>,
249    ids: AnimeIds,
250}
251
252impl CatalogEntry {
253    fn new(canonical_title: String, titles: Vec<String>, ids: AnimeIds) -> Option<Self> {
254        let titles = dedupe_titles(&canonical_title, titles);
255        if titles.is_empty() {
256            return None;
257        }
258
259        let canonical_title = titles
260            .first()
261            .cloned()
262            .unwrap_or_else(|| canonical_title.trim().to_string());
263
264        let normalized_titles = titles.iter().map(|title| normalize_title(title)).collect();
265        Some(Self {
266            canonical_title,
267            titles,
268            normalized_titles,
269            ids,
270        })
271    }
272
273    fn from_remote_media(media: RemoteMedia) -> Self {
274        let mut titles = Vec::new();
275        if let Some(title) = media.title.user_preferred.clone() {
276            titles.push(title);
277        }
278        if let Some(title) = media.title.romaji.clone() {
279            titles.push(title);
280        }
281        if let Some(title) = media.title.english.clone() {
282            titles.push(title);
283        }
284        if let Some(title) = media.title.native.clone() {
285            titles.push(title);
286        }
287        titles.extend(media.synonyms.unwrap_or_default());
288
289        let canonical_title = media
290            .title
291            .user_preferred
292            .clone()
293            .or(media.title.romaji.clone())
294            .or(media.title.english.clone())
295            .or(media.title.native.clone())
296            .unwrap_or_else(|| format!("Remote {}", media.id));
297
298        Self::new(
299            canonical_title,
300            titles,
301            AnimeIds {
302                kitsu: None,
303                anilist: Some(media.id),
304                mal: media.id_mal,
305            },
306        )
307        .expect("remote media should always produce at least one title")
308    }
309}
310
/// Title data accumulated for one anime id while reading the dump.
#[derive(Default, Debug)]
struct RawAnimeRecord {
    /// Value of the `canonical_title` column, when present and non-null.
    canonical_title: Option<String>,
    /// Titles gathered from the `titles` and `abbreviated_titles` columns.
    titles: Vec<String>,
    /// Value of the `slug` column, when present and non-null.
    slug: Option<String>,
}
317
318#[derive(Debug, Clone)]
319enum DumpSection {
320    None,
321    Anime(AnimeColumns),
322    Mappings(MappingColumns),
323}
324
/// Positions of the columns of interest within an `anime` COPY row.
#[derive(Clone, Debug)]
struct AnimeColumns {
    /// Index of the required `id` column.
    id: usize,
    /// Index of `canonical_title`, if the COPY statement lists it.
    canonical_title: Option<usize>,
    /// Index of `titles`, if the COPY statement lists it.
    titles: Option<usize>,
    /// Index of `abbreviated_titles`, if the COPY statement lists it.
    abbreviated_titles: Option<usize>,
    /// Index of `slug`, if the COPY statement lists it.
    slug: Option<usize>,
}
333
334impl AnimeColumns {
335    fn try_from_columns(columns: &[String]) -> MatchResult<Self> {
336        let id = find_column_index(columns, "id")
337            .ok_or_else(|| MatcherError::InvalidDump("anime COPY is missing id column".into()))?;
338        Ok(Self {
339            id,
340            canonical_title: find_column_index(columns, "canonical_title"),
341            titles: find_column_index(columns, "titles"),
342            abbreviated_titles: find_column_index(columns, "abbreviated_titles"),
343            slug: find_column_index(columns, "slug"),
344        })
345    }
346}
347
/// Positions of the columns of interest within a `mappings` COPY row.
#[derive(Clone, Debug)]
struct MappingColumns {
    /// Index of the required `item_id` column.
    item_id: usize,
    /// Index of `item_type`, if the COPY statement lists it.
    item_type: Option<usize>,
    /// Index of the required `external_site` column.
    external_site: usize,
    /// Index of the required `external_id` column.
    external_id: usize,
}
355
356impl MappingColumns {
357    fn try_from_columns(columns: &[String]) -> MatchResult<Self> {
358        let item_id = find_column_index(columns, "item_id").ok_or_else(|| {
359            MatcherError::InvalidDump("mappings COPY is missing item_id column".into())
360        })?;
361        let external_site = find_column_index(columns, "external_site").ok_or_else(|| {
362            MatcherError::InvalidDump("mappings COPY is missing external_site column".into())
363        })?;
364        let external_id = find_column_index(columns, "external_id").ok_or_else(|| {
365            MatcherError::InvalidDump("mappings COPY is missing external_id column".into())
366        })?;
367
368        Ok(Self {
369            item_id,
370            item_type: find_column_index(columns, "item_type"),
371            external_site,
372            external_id,
373        })
374    }
375}
376
377fn score_entries(
378    provider: MatchProvider,
379    entries: &[CatalogEntry],
380    raw_query: &str,
381    normalized_query: &str,
382    limit: usize,
383) -> Vec<AnimeTitleMatch> {
384    let mut scored = entries
385        .iter()
386        .filter_map(|entry| score_entry(provider, entry, raw_query, normalized_query))
387        .collect::<Vec<_>>();
388
389    scored.sort_by(|left, right| {
390        right
391            .score
392            .partial_cmp(&left.score)
393            .unwrap_or(std::cmp::Ordering::Equal)
394            .then_with(|| left.canonical_title.cmp(&right.canonical_title))
395    });
396    scored.truncate(limit);
397    scored
398}
399
400fn score_entry(
401    provider: MatchProvider,
402    entry: &CatalogEntry,
403    _raw_query: &str,
404    normalized_query: &str,
405) -> Option<AnimeTitleMatch> {
406    let mut best_title = None;
407    let mut best_score = 0.0_f32;
408
409    for (index, normalized_title) in entry.normalized_titles.iter().enumerate() {
410        let score = similarity_score(normalized_query, normalized_title);
411        if score > best_score {
412            best_score = score;
413            best_title = entry.titles.get(index).cloned();
414        }
415    }
416
417    best_title.map(|matched_title| AnimeTitleMatch {
418        provider,
419        canonical_title: entry.canonical_title.clone(),
420        matched_title,
421        score: best_score,
422        ids: entry.ids.clone(),
423        titles: entry.titles.clone(),
424    })
425}
426
427fn similarity_score(query: &str, candidate: &str) -> f32 {
428    if query.is_empty() || candidate.is_empty() {
429        return 0.0;
430    }
431    if query == candidate {
432        return 1.0;
433    }
434
435    let jaro = jaro_winkler(query, candidate) as f32;
436    let token_overlap = token_overlap_score(query, candidate);
437    let contains = if query.contains(candidate) || candidate.contains(query) {
438        1.0
439    } else {
440        0.0
441    };
442    let prefix = if query.starts_with(candidate) || candidate.starts_with(query) {
443        1.0
444    } else {
445        0.0
446    };
447    let len_ratio = query.len().min(candidate.len()) as f32 / query.len().max(candidate.len()) as f32;
448
449    let mut score =
450        0.55 * jaro + 0.20 * token_overlap + 0.15 * contains + 0.10 * len_ratio + 0.05 * prefix;
451    if token_overlap == 0.0 && contains == 0.0 && jaro < 0.90 {
452        score *= 0.75;
453    }
454
455    score.clamp(0.0, 1.0)
456}
457
/// Returns the Sørensen–Dice coefficient of the whitespace-separated token sets of two strings.
///
/// The result is `2 * |shared| / (|left| + |right|)` over distinct tokens, or `0.0` when either
/// side has no tokens.
fn token_overlap_score(left: &str, right: &str) -> f32 {
    let left_tokens: HashSet<&str> = left.split_whitespace().collect();
    let right_tokens: HashSet<&str> = right.split_whitespace().collect();
    if left_tokens.is_empty() || right_tokens.is_empty() {
        return 0.0;
    }

    let shared = left_tokens
        .iter()
        .filter(|token| right_tokens.contains(*token))
        .count() as f32;
    let total = left_tokens.len() as f32 + right_tokens.len() as f32;
    (2.0 * shared) / total
}
468
/// Normalizes a title into the lowercase, space-separated form used for comparison.
///
/// * `×`, `✕` and `✖` are treated as the letter `x`.
/// * Letters are lowercased with full Unicode case mapping, so non-ASCII titles compare
///   case-insensitively too (`"ÉCLAIR"` and `"éclair"` normalize identically).
/// * `&`, whitespace, and the separators `-`, `_`, `.`, `:`, `/`, `\` collapse into a single space.
/// * Every other non-alphanumeric character is dropped without inserting a space.
///
/// The result has no leading or trailing space and may be empty.
fn normalize_title(title: &str) -> String {
    let mut normalized = String::with_capacity(title.len());
    // Starts as `true` so that leading separators never emit a space.
    let mut last_was_space = true;

    for ch in title.chars() {
        let mapped = match ch {
            '×' | '✕' | '✖' => 'x',
            '&' => ' ',
            other => other,
        };

        if mapped.is_alphanumeric() {
            // Lowercasing may expand to several chars (e.g. 'İ' -> 'i' + combining dot);
            // keep only the alphanumeric ones so stray combining marks cannot block a match.
            normalized.extend(mapped.to_lowercase().filter(|lower| lower.is_alphanumeric()));
            last_was_space = false;
        } else if mapped.is_whitespace() || matches!(mapped, '-' | '_' | '.' | ':' | '/' | '\\') {
            if !last_was_space {
                normalized.push(' ');
                last_was_space = true;
            }
        }
    }

    // At most one trailing space can remain, left by a trailing separator.
    if normalized.ends_with(' ') {
        normalized.pop();
    }
    normalized
}
493
494fn dedupe_titles(canonical_title: &str, titles: Vec<String>) -> Vec<String> {
495    let mut deduped = Vec::new();
496    let mut seen = HashSet::new();
497
498    let mut push_title = |title: String| {
499        let cleaned = title.trim();
500        if cleaned.is_empty() {
501            return;
502        }
503        let key = normalize_title(cleaned);
504        if key.is_empty() || !seen.insert(key) {
505            return;
506        }
507        deduped.push(cleaned.to_string());
508    };
509
510    push_title(canonical_title.to_string());
511    for title in titles {
512        push_title(title);
513    }
514
515    deduped
516}
517
518fn resolve_dump_path(path: &Path) -> MatchResult<PathBuf> {
519    if path.is_file() {
520        return Ok(path.to_path_buf());
521    }
522
523    if path.is_dir() {
524        let sql = path.join("latest.sql");
525        if sql.is_file() {
526            return Ok(sql);
527        }
528
529        let gzip = path.join("latest.sql.gz");
530        if gzip.is_file() {
531            return Ok(gzip);
532        }
533
534        return Err(MatcherError::InvalidDumpPath(format!(
535            "directory {} does not contain latest.sql or latest.sql.gz",
536            path.display()
537        )));
538    }
539
540    Err(MatcherError::InvalidDumpPath(format!(
541        "{} does not exist",
542        path.display()
543    )))
544}
545
546fn open_dump_reader(path: &Path) -> MatchResult<Box<dyn BufRead>> {
547    let file = File::open(path)?;
548    if path
549        .extension()
550        .and_then(|ext| ext.to_str())
551        .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
552    {
553        return Ok(Box::new(BufReader::new(GzDecoder::new(file))));
554    }
555
556    Ok(Box::new(BufReader::new(file)))
557}
558
559fn parse_kitsu_dump<R: BufRead>(reader: R) -> MatchResult<Vec<CatalogEntry>> {
560    let mut anime_by_id = HashMap::<u32, RawAnimeRecord>::new();
561    let mut ids_by_anime = HashMap::<u32, AnimeIds>::new();
562    let mut section = DumpSection::None;
563
564    for line in reader.lines() {
565        let line = line?;
566        if let Some((table, columns)) = parse_copy_statement(&line) {
567            section = match table.as_str() {
568                "anime" => DumpSection::Anime(AnimeColumns::try_from_columns(&columns)?),
569                "mappings" => DumpSection::Mappings(MappingColumns::try_from_columns(&columns)?),
570                _ => DumpSection::None,
571            };
572            continue;
573        }
574
575        if line == "\\." {
576            section = DumpSection::None;
577            continue;
578        }
579
580        match &section {
581            DumpSection::Anime(columns) => parse_anime_row(&line, columns, &mut anime_by_id)?,
582            DumpSection::Mappings(columns) => {
583                parse_mapping_row(&line, columns, &mut ids_by_anime)?;
584            }
585            DumpSection::None => {}
586        }
587    }
588
589    if anime_by_id.is_empty() {
590        return Err(MatcherError::InvalidDump(
591            "anime COPY section was not found".into(),
592        ));
593    }
594
595    let mut entries = anime_by_id
596        .into_iter()
597        .filter_map(|(kitsu_id, record)| {
598            let fallback_slug = record.slug.as_deref().map(slug_to_title);
599            let canonical_title = record
600                .canonical_title
601                .clone()
602                .or_else(|| record.titles.first().cloned())
603                .or(fallback_slug)?;
604
605            let mut titles = record.titles;
606            if let Some(slug) = record.slug {
607                titles.push(slug_to_title(&slug));
608            }
609
610            let mut ids = ids_by_anime.remove(&kitsu_id).unwrap_or_default();
611            ids.kitsu = Some(kitsu_id);
612            CatalogEntry::new(canonical_title, titles, ids)
613        })
614        .collect::<Vec<_>>();
615
616    entries.sort_by(|left, right| left.canonical_title.cmp(&right.canonical_title));
617    Ok(entries)
618}
619
620fn parse_copy_statement(line: &str) -> Option<(String, Vec<String>)> {
621    let rest = line.strip_prefix("COPY ")?;
622    let rest = rest.strip_suffix(" FROM stdin;")?;
623    let open = rest.find('(')?;
624    let close = rest.rfind(')')?;
625    if close <= open {
626        return None;
627    }
628
629    let table = normalize_identifier(&rest[..open]);
630    let columns = rest[open + 1..close]
631        .split(',')
632        .map(normalize_identifier)
633        .collect::<Vec<_>>();
634    Some((table, columns))
635}
636
637fn parse_anime_row(
638    line: &str,
639    columns: &AnimeColumns,
640    anime_by_id: &mut HashMap<u32, RawAnimeRecord>,
641) -> MatchResult<()> {
642    let fields = split_copy_row(line);
643    let id = field_value(&fields, columns.id)
644        .ok_or_else(|| MatcherError::InvalidDump("anime row is missing id".into()))?
645        .parse::<u32>()
646        .map_err(|_| MatcherError::InvalidDump("anime id is not numeric".into()))?;
647
648    let record = anime_by_id.entry(id).or_default();
649
650    if let Some(index) = columns.canonical_title {
651        if let Some(value) = field_value(&fields, index) {
652            record.canonical_title = Some(value.to_string());
653        }
654    }
655
656    if let Some(index) = columns.titles {
657        if let Some(value) = field_value(&fields, index) {
658            record.titles.extend(parse_json_titles(value));
659        }
660    }
661
662    if let Some(index) = columns.abbreviated_titles {
663        if let Some(value) = field_value(&fields, index) {
664            record.titles.extend(parse_json_titles(value));
665        }
666    }
667
668    if let Some(index) = columns.slug {
669        if let Some(value) = field_value(&fields, index) {
670            record.slug = Some(value.to_string());
671        }
672    }
673
674    Ok(())
675}
676
677fn parse_mapping_row(
678    line: &str,
679    columns: &MappingColumns,
680    ids_by_anime: &mut HashMap<u32, AnimeIds>,
681) -> MatchResult<()> {
682    let fields = split_copy_row(line);
683    if let Some(index) = columns.item_type {
684        if !field_value(&fields, index)
685            .is_some_and(|value| value.eq_ignore_ascii_case("anime"))
686        {
687            return Ok(());
688        }
689    }
690
691    let item_id = match field_value(&fields, columns.item_id)
692        .and_then(|value| value.parse::<u32>().ok())
693    {
694        Some(id) => id,
695        None => return Ok(()),
696    };
697
698    let external_site = match field_value(&fields, columns.external_site) {
699        Some(site) => site.to_ascii_lowercase(),
700        None => return Ok(()),
701    };
702    let external_id = match field_value(&fields, columns.external_id)
703        .and_then(|value| value.parse::<u32>().ok())
704    {
705        Some(id) => id,
706        None => return Ok(()),
707    };
708
709    let ids = ids_by_anime.entry(item_id).or_default();
710    if external_site.contains("anilist") {
711        ids.anilist = Some(external_id);
712    } else if external_site.contains("myanimelist") || external_site == "mal" {
713        ids.mal = Some(external_id);
714    }
715
716    Ok(())
717}
718
719fn split_copy_row(line: &str) -> Vec<Option<String>> {
720    line.split('\t').map(parse_copy_field).collect()
721}
722
/// Decodes one field of a PostgreSQL COPY text-format row.
///
/// Returns `None` for the null marker `\N`. Otherwise the backslash escapes `\b`, `\f`, `\n`,
/// `\r`, `\t`, `\v` and `\\` are decoded to the characters they stand for; a backslash before
/// any other character yields that character, and a lone trailing backslash is kept literally.
fn parse_copy_field(field: &str) -> Option<String> {
    if field == "\\N" {
        return None;
    }

    let mut output = String::with_capacity(field.len());
    let mut chars = field.chars();
    while let Some(ch) = chars.next() {
        if ch != '\\' {
            output.push(ch);
            continue;
        }

        let decoded = match chars.next() {
            Some('b') => '\u{0008}',
            Some('f') => '\u{000C}',
            Some('n') => '\n',
            Some('r') => '\r',
            Some('t') => '\t',
            // COPY writes vertical tab as `\v`; without this arm it decoded to the letter 'v'.
            Some('v') => '\u{000B}',
            // Covers `\\` as well as any character escaped without special meaning.
            Some(other) => other,
            None => '\\',
        };
        output.push(decoded);
    }

    Some(output)
}
749
/// Returns the field at `index` as a string slice, or `None` when the index is out of range or
/// the field is null.
fn field_value(fields: &[Option<String>], index: usize) -> Option<&str> {
    fields.get(index)?.as_deref()
}
753
/// Normalizes a SQL identifier: drops any schema qualifier (everything up to the last `.`),
/// trims whitespace, strips surrounding double quotes, and lowercases ASCII letters.
fn normalize_identifier(identifier: &str) -> String {
    let unqualified = match identifier.rsplit_once('.') {
        Some((_, name)) => name,
        None => identifier,
    };

    unqualified.trim().trim_matches('"').to_ascii_lowercase()
}
763
/// Returns the position of the first column whose name equals `target`, ignoring ASCII case.
fn find_column_index(columns: &[String], target: &str) -> Option<usize> {
    columns
        .iter()
        .enumerate()
        .find_map(|(index, column)| column.eq_ignore_ascii_case(target).then_some(index))
}
769
770fn parse_json_titles(raw: &str) -> Vec<String> {
771    let parsed = match serde_json::from_str::<Value>(raw) {
772        Ok(value) => value,
773        Err(_) => return Vec::new(),
774    };
775
776    match parsed {
777        Value::Object(map) => map
778            .into_values()
779            .filter_map(|value| value.as_str().map(ToOwned::to_owned))
780            .collect(),
781        Value::Array(items) => items
782            .into_iter()
783            .filter_map(|value| value.as_str().map(ToOwned::to_owned))
784            .collect(),
785        Value::String(title) => vec![title],
786        _ => Vec::new(),
787    }
788}
789
/// Turns a slug into a title by replacing every hyphen with a space and trimming the result.
fn slug_to_title(slug: &str) -> String {
    let spaced: String = slug
        .chars()
        .map(|ch| if ch == '-' { ' ' } else { ch })
        .collect();
    spaced.trim().to_string()
}
793
/// GraphQL document sent to the remote endpoint.
///
/// It requests the first page of `ANIME` media matching `$search`, `$perPage` results at most,
/// selecting the ids, the four title variants, and the synonyms of each result.
const REMOTE_GRAPHQL_QUERY: &str = r#"
query($search: String!, $perPage: Int!) {
  Page(page: 1, perPage: $perPage) {
    media(search: $search, type: ANIME) {
      id
      idMal
      title {
        romaji
        english
        native
        userPreferred
      }
      synonyms
    }
  }
}
"#;
811
812#[derive(Debug, Serialize)]
813struct RemoteGraphQlRequest<'a> {
814    query: &'a str,
815    variables: RemoteGraphQlVariables<'a>,
816}
817
818#[derive(Debug, Serialize)]
819struct RemoteGraphQlVariables<'a> {
820    search: &'a str,
821    #[serde(rename = "perPage")]
822    per_page: i64,
823}
824
825#[derive(Debug, Deserialize)]
826struct RemoteGraphQlEnvelope {
827    data: Option<RemoteGraphQlData>,
828    errors: Option<Vec<RemoteGraphQlError>>,
829}
830
831#[derive(Debug, Deserialize)]
832struct RemoteGraphQlData {
833    #[serde(rename = "Page")]
834    page: Option<RemoteGraphQlPage>,
835}
836
837#[derive(Debug, Deserialize)]
838struct RemoteGraphQlPage {
839    media: Vec<RemoteMedia>,
840}
841
842#[derive(Debug, Deserialize)]
843struct RemoteMedia {
844    id: u32,
845    #[serde(rename = "idMal")]
846    id_mal: Option<u32>,
847    title: RemoteTitle,
848    synonyms: Option<Vec<String>>,
849}
850
851#[derive(Debug, Deserialize)]
852struct RemoteTitle {
853    romaji: Option<String>,
854    english: Option<String>,
855    native: Option<String>,
856    #[serde(rename = "userPreferred")]
857    user_preferred: Option<String>,
858}
859
860#[derive(Debug, Deserialize)]
861struct RemoteGraphQlError {
862    message: String,
863}
864
#[cfg(test)]
mod tests {
    use super::*;
    use flate2::Compression;
    use flate2::write::GzEncoder;
    use std::fs;
    use std::io::Write;
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Keeps scratch directory names distinct even when two tests start in the same instant.
    static NEXT_SCRATCH_ID: AtomicUsize = AtomicUsize::new(0);

    /// Temporary directory that is removed when the guard is dropped, so it is cleaned up even
    /// when an assertion fails and the test unwinds.
    struct ScratchDir {
        path: PathBuf,
    }

    impl ScratchDir {
        /// Creates a fresh, uniquely named directory under the system temp dir.
        fn new() -> Self {
            let nanos = SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_nanos();
            let path = std::env::temp_dir().join(format!(
                "zantetsu-vecdb-test-{}-{}-{}",
                std::process::id(),
                nanos,
                NEXT_SCRATCH_ID.fetch_add(1, Ordering::Relaxed)
            ));
            fs::create_dir_all(&path).unwrap();
            Self { path }
        }

        /// Path of the scratch directory.
        fn path(&self) -> &Path {
            &self.path
        }
    }

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best effort: a cleanup failure must not mask the real test outcome.
            let _ = fs::remove_dir_all(&self.path);
        }
    }

    /// A plain `latest.sql` in a directory is discovered, parsed, and matched with its ids.
    #[test]
    fn matches_titles_from_plain_sql_dump() {
        let scratch = ScratchDir::new();
        fs::write(scratch.path().join("latest.sql"), sample_dump()).unwrap();

        let matcher = TitleMatcher::from_kitsu_dump(scratch.path()).unwrap();
        let matched = matcher.match_title("spy x family").unwrap().unwrap();

        assert_eq!(matched.provider, MatchProvider::KitsuDump);
        assert_eq!(matched.canonical_title, "Spy x Family");
        assert_eq!(matched.ids.kitsu, Some(1));
        assert_eq!(matched.ids.anilist, Some(777));
        assert_eq!(matched.ids.mal, Some(12345));
        assert!(matched.score > 0.90);
    }

    /// A gzip-compressed `latest.sql.gz` is decompressed transparently.
    #[test]
    fn reads_gzipped_dump_files() {
        let scratch = ScratchDir::new();
        write_gzip(
            &scratch.path().join("latest.sql.gz"),
            sample_dump().as_bytes(),
        );

        let matcher = TitleMatcher::from_kitsu_dump(scratch.path()).unwrap();
        let matches = matcher.search_titles("frieren", 2).unwrap();

        assert_eq!(matches[0].canonical_title, "Sousou no Frieren");
        assert_eq!(matches[0].ids.anilist, Some(888));
    }

    /// Normalization folds the multiplication sign, and scoring separates near from far titles.
    #[test]
    fn normalizes_titles_before_scoring() {
        assert_eq!(normalize_title("SPY×FAMILY"), "spyxfamily");
        assert!(similarity_score("spy x family", "spy family") > 0.75);
        assert!(similarity_score("jujutsu kaisen", "bleach") < 0.50);
    }

    /// Every string value of a JSON object is extracted as a title.
    #[test]
    fn parses_json_title_values() {
        let titles = parse_json_titles(r#"{"en":"Frieren","en_jp":"Sousou no Frieren"}"#);
        assert_eq!(titles.len(), 2);
        assert!(titles.contains(&"Frieren".to_string()));
        assert!(titles.contains(&"Sousou no Frieren".to_string()));
    }

    /// Writes `bytes` to `path` as a gzip stream.
    fn write_gzip(path: &Path, bytes: &[u8]) {
        let file = File::create(path).unwrap();
        let mut encoder = GzEncoder::new(file, Compression::default());
        encoder.write_all(bytes).unwrap();
        encoder.finish().unwrap();
    }

    /// Minimal dump with two anime rows and three external-id mappings.
    fn sample_dump() -> String {
        [
            "-- sample kitsu dump",
            "COPY public.anime (id, slug, titles, canonical_title, abbreviated_titles) FROM stdin;",
            "1\tspy-x-family\t{\"en\":\"Spy x Family\",\"en_jp\":\"SPY×FAMILY\",\"ja_jp\":\"スパイファミリー\"}\tSpy x Family\t[\"Spy Family\"]",
            "2\tsousou-no-frieren\t{\"en\":\"Frieren: Beyond Journey's End\",\"en_jp\":\"Sousou no Frieren\"}\tSousou no Frieren\t[\"Frieren\"]",
            "\\.",
            "COPY public.mappings (item_id, item_type, external_site, external_id) FROM stdin;",
            "1\tAnime\tanilist/anime\t777",
            "1\tAnime\tmyanimelist/anime\t12345",
            "2\tAnime\tanilist/anime\t888",
            "\\.",
            "",
        ]
        .join("\n")
    }
}