//! Canonical anime title matching for `zantetsu_vecdb` (matcher.rs), backed by
//! either a local Kitsu SQL dump or a remote AniList-compatible GraphQL endpoint.

1use crate::error::{MatchResult, MatcherError};
2use dirs::data_dir;
3use flate2::read::GzDecoder;
4use reqwest::blocking::Client;
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7use std::collections::{HashMap, HashSet};
8use std::fs::File;
9use std::io::{BufRead, BufReader};
10use std::path::{Path, PathBuf};
11use strsim::jaro_winkler;
12use tracing::debug;
13
/// Request timeout, in seconds, applied to the remote GraphQL HTTP client.
const DEFAULT_TIMEOUT_SECS: u64 = 10;
15
/// Which backend produced a canonical title match.
///
/// Carried on [`AnimeTitleMatch`] so callers can tell where a result came from.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum MatchProvider {
    /// Match derived from the local Kitsu SQL dump.
    KitsuDump,
    /// Match derived from a remote GraphQL endpoint.
    RemoteEndpoint,
}
24
/// External and internal identifiers for a matched anime.
///
/// Every id is optional; a match may be known to only some services.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct AnimeIds {
    /// Kitsu anime id.
    pub kitsu: Option<u32>,
    /// AniList-compatible media id.
    pub anilist: Option<u32>,
    /// MyAnimeList anime id.
    pub mal: Option<u32>,
}
35
/// The best-scoring match for a query title.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct AnimeTitleMatch {
    /// Which backend produced this result.
    pub provider: MatchProvider,
    /// Canonical title chosen for the anime.
    pub canonical_title: String,
    /// The specific alias that scored highest for the query.
    pub matched_title: String,
    /// Similarity score in the range `[0.0, 1.0]`.
    pub score: f32,
    /// Known IDs for the matched anime.
    pub ids: AnimeIds,
    /// Known aliases for the anime, canonical title first.
    pub titles: Vec<String>,
}
52
/// Backend choice for canonical title matching.
///
/// # Examples
///
/// ```rust
/// use zantetsu_vecdb::MatchSource;
///
/// let local = MatchSource::kitsu_dump("/tmp/kitsu-dumps");
/// let remote = MatchSource::remote_endpoint("https://graphql.anilist.co");
///
/// let _ = (local, remote);
/// ```
#[derive(Debug, Clone)]
pub enum MatchSource {
    /// Use a local Kitsu SQL dump located at `dump_path`.
    KitsuDump {
        /// Path to either the dump directory or a specific `latest.sql` / `latest.sql.gz` file.
        dump_path: PathBuf,
    },
    /// Use a remote GraphQL endpoint.
    RemoteEndpoint {
        /// Remote endpoint URL.
        endpoint: String,
    },
}
78
79impl MatchSource {
80    /// Use the local Kitsu dump at the provided path.
81    #[must_use]
82    pub fn kitsu_dump(dump_path: impl Into<PathBuf>) -> Self {
83        Self::KitsuDump {
84            dump_path: dump_path.into(),
85        }
86    }
87
88    /// Use a remote GraphQL endpoint.
89    #[must_use]
90    pub fn remote_endpoint(endpoint: impl Into<String>) -> Self {
91        Self::RemoteEndpoint {
92            endpoint: endpoint.into(),
93        }
94    }
95}
96
97/// Returns the default dump directory used by `kitsu-sync`.
98#[must_use]
99pub fn default_kitsu_dump_dir() -> PathBuf {
100    data_dir()
101        .unwrap_or_else(|| PathBuf::from("."))
102        .join("zantetsu")
103        .join("kitsu-dumps")
104}
105
/// Canonical title matcher backed by either a local Kitsu dump or a remote GraphQL endpoint.
///
/// Construct a matcher once and reuse it for many title lookups.
pub struct TitleMatcher {
    // Dispatches lookups to the configured backend.
    backend: MatcherBackend,
}
112
/// Internal backend dispatch for [`TitleMatcher`].
enum MatcherBackend {
    /// In-memory catalog parsed from a local Kitsu SQL dump.
    Kitsu(KitsuTitleMatcher),
    /// HTTP client querying a remote GraphQL endpoint.
    RemoteEndpoint(RemoteTitleMatcher),
}
117
118impl TitleMatcher {
119    /// Create a matcher from the selected source.
120    ///
121    /// # Examples
122    ///
123    /// ```rust,no_run
124    /// use zantetsu_vecdb::{MatchSource, TitleMatcher};
125    ///
126    /// let matcher = TitleMatcher::new(
127    ///     MatchSource::remote_endpoint("https://graphql.anilist.co"),
128    /// )
129    /// .unwrap();
130    ///
131    /// let _ = matcher;
132    /// ```
133    pub fn new(source: MatchSource) -> MatchResult<Self> {
134        let backend = match source {
135            MatchSource::KitsuDump { dump_path } => {
136                MatcherBackend::Kitsu(KitsuTitleMatcher::from_dump(dump_path)?)
137            }
138            MatchSource::RemoteEndpoint { endpoint } => {
139                MatcherBackend::RemoteEndpoint(RemoteTitleMatcher::new(endpoint)?)
140            }
141        };
142
143        Ok(Self { backend })
144    }
145
146    /// Create a matcher from a local Kitsu dump path.
147    pub fn from_kitsu_dump(dump_path: impl Into<PathBuf>) -> MatchResult<Self> {
148        Self::new(MatchSource::kitsu_dump(dump_path))
149    }
150
151    /// Create a matcher using a remote GraphQL endpoint.
152    pub fn from_remote_endpoint(endpoint: impl Into<String>) -> MatchResult<Self> {
153        Self::new(MatchSource::remote_endpoint(endpoint))
154    }
155
156    /// Return the best available match for the provided title.
157    ///
158    /// # Examples
159    ///
160    /// ```rust,no_run
161    /// use zantetsu_vecdb::{MatchSource, TitleMatcher};
162    ///
163    /// let matcher = TitleMatcher::new(
164    ///     MatchSource::remote_endpoint("https://graphql.anilist.co"),
165    /// )
166    /// .unwrap();
167    ///
168    /// let best = matcher.match_title("Spy x Family").unwrap();
169    /// assert!(best.is_some());
170    /// ```
171    pub fn match_title(&self, title: &str) -> MatchResult<Option<AnimeTitleMatch>> {
172        Ok(self.search_titles(title, 1)?.into_iter().next())
173    }
174
175    /// Search for the best matches for the provided title.
176    ///
177    /// Results are ordered from highest score to lowest score.
178    pub fn search_titles(&self, title: &str, limit: usize) -> MatchResult<Vec<AnimeTitleMatch>> {
179        let query = normalize_title(title);
180        if query.is_empty() {
181            return Err(MatcherError::EmptyQuery);
182        }
183
184        let limit = limit.max(1);
185        match &self.backend {
186            MatcherBackend::Kitsu(matcher) => Ok(score_entries(
187                MatchProvider::KitsuDump,
188                &matcher.entries,
189                title,
190                &query,
191                limit,
192            )),
193            MatcherBackend::RemoteEndpoint(matcher) => matcher.search_titles(title, &query, limit),
194        }
195    }
196}
197
/// Local backend: the full catalog parsed from a Kitsu SQL dump.
struct KitsuTitleMatcher {
    // Catalog entries, sorted by canonical title by `parse_kitsu_dump`.
    entries: Vec<CatalogEntry>,
}
201
202impl KitsuTitleMatcher {
203    fn from_dump(dump_path: PathBuf) -> MatchResult<Self> {
204        let resolved = resolve_dump_path(&dump_path)?;
205        debug!("loading Kitsu dump from {}", resolved.display());
206        let reader = open_dump_reader(&resolved)?;
207        let entries = parse_kitsu_dump(reader)?;
208        Ok(Self { entries })
209    }
210}
211
/// Remote backend: a blocking HTTP client plus the GraphQL endpoint URL.
struct RemoteTitleMatcher {
    // Reused across requests; configured with a timeout and user agent.
    client: Client,
    // Trimmed, non-empty endpoint URL.
    endpoint: String,
}
216
217impl RemoteTitleMatcher {
218    fn new(endpoint: String) -> MatchResult<Self> {
219        let endpoint = endpoint.trim().to_string();
220        if endpoint.is_empty() {
221            return Err(MatcherError::InvalidResponse(
222                "remote endpoint URL cannot be empty".into(),
223            ));
224        }
225
226        let client = Client::builder()
227            .timeout(std::time::Duration::from_secs(DEFAULT_TIMEOUT_SECS))
228            .user_agent(format!("zantetsu/{}", env!("CARGO_PKG_VERSION")))
229            .build()?;
230        Ok(Self { client, endpoint })
231    }
232
233    fn search_titles(
234        &self,
235        raw_query: &str,
236        normalized_query: &str,
237        limit: usize,
238    ) -> MatchResult<Vec<AnimeTitleMatch>> {
239        let payload = RemoteGraphQlRequest {
240            query: REMOTE_GRAPHQL_QUERY,
241            variables: RemoteGraphQlVariables {
242                search: raw_query,
243                per_page: limit.max(5) as i64,
244            },
245        };
246
247        let response = self
248            .client
249            .post(&self.endpoint)
250            .json(&payload)
251            .send()?
252            .error_for_status()?;
253
254        let envelope: RemoteGraphQlEnvelope = response.json()?;
255        if let Some(errors) = envelope.errors {
256            let message = errors
257                .into_iter()
258                .map(|error| error.message)
259                .collect::<Vec<_>>()
260                .join("; ");
261            return Err(MatcherError::GraphQl(message));
262        }
263
264        let media = envelope
265            .data
266            .ok_or_else(|| MatcherError::InvalidResponse("missing data".into()))?
267            .page
268            .ok_or_else(|| MatcherError::InvalidResponse("missing page".into()))?
269            .media;
270
271        let entries = media
272            .into_iter()
273            .map(CatalogEntry::from_remote_media)
274            .collect::<Vec<_>>();
275
276        Ok(score_entries(
277            MatchProvider::RemoteEndpoint,
278            &entries,
279            raw_query,
280            normalized_query,
281            limit,
282        ))
283    }
284}
285
/// One anime known to the matcher, with pre-normalized alias titles.
#[derive(Debug, Clone)]
struct CatalogEntry {
    // Display title; always the first element of `titles`.
    canonical_title: String,
    // Deduplicated alias titles, canonical title first.
    titles: Vec<String>,
    // `normalize_title` output for each element of `titles`, index-aligned.
    normalized_titles: Vec<String>,
    // Known external/internal ids.
    ids: AnimeIds,
}
293
294impl CatalogEntry {
295    fn new(canonical_title: String, titles: Vec<String>, ids: AnimeIds) -> Option<Self> {
296        let titles = dedupe_titles(&canonical_title, titles);
297        if titles.is_empty() {
298            return None;
299        }
300
301        let canonical_title = titles
302            .first()
303            .cloned()
304            .unwrap_or_else(|| canonical_title.trim().to_string());
305
306        let normalized_titles = titles.iter().map(|title| normalize_title(title)).collect();
307        Some(Self {
308            canonical_title,
309            titles,
310            normalized_titles,
311            ids,
312        })
313    }
314
315    fn from_remote_media(media: RemoteMedia) -> Self {
316        let mut titles = Vec::new();
317        if let Some(title) = media.title.user_preferred.clone() {
318            titles.push(title);
319        }
320        if let Some(title) = media.title.romaji.clone() {
321            titles.push(title);
322        }
323        if let Some(title) = media.title.english.clone() {
324            titles.push(title);
325        }
326        if let Some(title) = media.title.native.clone() {
327            titles.push(title);
328        }
329        titles.extend(media.synonyms.unwrap_or_default());
330
331        let canonical_title = media
332            .title
333            .user_preferred
334            .clone()
335            .or(media.title.romaji.clone())
336            .or(media.title.english.clone())
337            .or(media.title.native.clone())
338            .unwrap_or_else(|| format!("Remote {}", media.id));
339
340        Self::new(
341            canonical_title,
342            titles,
343            AnimeIds {
344                kitsu: None,
345                anilist: Some(media.id),
346                mal: media.id_mal,
347            },
348        )
349        .expect("remote media should always produce at least one title")
350    }
351}
352
/// Accumulator for one anime row while scanning the dump's `anime` COPY section.
#[derive(Debug, Default)]
struct RawAnimeRecord {
    // Value of the `canonical_title` column, when present.
    canonical_title: Option<String>,
    // Titles gathered from the `titles` and `abbreviated_titles` JSON columns.
    titles: Vec<String>,
    // Value of the `slug` column; used as a title fallback.
    slug: Option<String>,
}
359
/// Parser state: which COPY data section the dump scanner is currently inside.
#[derive(Debug, Clone)]
enum DumpSection {
    /// Not inside a COPY section of interest.
    None,
    /// Inside the `anime` table data, with its resolved column layout.
    Anime(AnimeColumns),
    /// Inside the `mappings` table data, with its resolved column layout.
    Mappings(MappingColumns),
}
366
/// Column positions resolved from the `anime` COPY header.
#[derive(Debug, Clone)]
struct AnimeColumns {
    // Index of the mandatory `id` column.
    id: usize,
    // Optional column indexes; absent columns are simply skipped per row.
    canonical_title: Option<usize>,
    titles: Option<usize>,
    abbreviated_titles: Option<usize>,
    slug: Option<usize>,
}
375
376impl AnimeColumns {
377    fn try_from_columns(columns: &[String]) -> MatchResult<Self> {
378        let id = find_column_index(columns, "id")
379            .ok_or_else(|| MatcherError::InvalidDump("anime COPY is missing id column".into()))?;
380        Ok(Self {
381            id,
382            canonical_title: find_column_index(columns, "canonical_title"),
383            titles: find_column_index(columns, "titles"),
384            abbreviated_titles: find_column_index(columns, "abbreviated_titles"),
385            slug: find_column_index(columns, "slug"),
386        })
387    }
388}
389
/// Column positions resolved from the `mappings` COPY header.
#[derive(Debug, Clone)]
struct MappingColumns {
    // Mandatory columns.
    item_id: usize,
    // Optional; when present, rows are filtered to `item_type == "anime"`.
    item_type: Option<usize>,
    external_site: usize,
    external_id: usize,
}
397
398impl MappingColumns {
399    fn try_from_columns(columns: &[String]) -> MatchResult<Self> {
400        let item_id = find_column_index(columns, "item_id").ok_or_else(|| {
401            MatcherError::InvalidDump("mappings COPY is missing item_id column".into())
402        })?;
403        let external_site = find_column_index(columns, "external_site").ok_or_else(|| {
404            MatcherError::InvalidDump("mappings COPY is missing external_site column".into())
405        })?;
406        let external_id = find_column_index(columns, "external_id").ok_or_else(|| {
407            MatcherError::InvalidDump("mappings COPY is missing external_id column".into())
408        })?;
409
410        Ok(Self {
411            item_id,
412            item_type: find_column_index(columns, "item_type"),
413            external_site,
414            external_id,
415        })
416    }
417}
418
419fn score_entries(
420    provider: MatchProvider,
421    entries: &[CatalogEntry],
422    raw_query: &str,
423    normalized_query: &str,
424    limit: usize,
425) -> Vec<AnimeTitleMatch> {
426    let mut scored = entries
427        .iter()
428        .filter_map(|entry| score_entry(provider, entry, raw_query, normalized_query))
429        .collect::<Vec<_>>();
430
431    scored.sort_by(|left, right| {
432        right
433            .score
434            .partial_cmp(&left.score)
435            .unwrap_or(std::cmp::Ordering::Equal)
436            .then_with(|| left.canonical_title.cmp(&right.canonical_title))
437    });
438    scored.truncate(limit);
439    scored
440}
441
442fn score_entry(
443    provider: MatchProvider,
444    entry: &CatalogEntry,
445    _raw_query: &str,
446    normalized_query: &str,
447) -> Option<AnimeTitleMatch> {
448    let mut best_title = None;
449    let mut best_score = 0.0_f32;
450
451    for (index, normalized_title) in entry.normalized_titles.iter().enumerate() {
452        let score = similarity_score(normalized_query, normalized_title);
453        if score > best_score {
454            best_score = score;
455            best_title = entry.titles.get(index).cloned();
456        }
457    }
458
459    best_title.map(|matched_title| AnimeTitleMatch {
460        provider,
461        canonical_title: entry.canonical_title.clone(),
462        matched_title,
463        score: best_score,
464        ids: entry.ids.clone(),
465        titles: entry.titles.clone(),
466    })
467}
468
469fn similarity_score(query: &str, candidate: &str) -> f32 {
470    if query.is_empty() || candidate.is_empty() {
471        return 0.0;
472    }
473    if query == candidate {
474        return 1.0;
475    }
476
477    let jaro = jaro_winkler(query, candidate) as f32;
478    let token_overlap = token_overlap_score(query, candidate);
479    let contains = if query.contains(candidate) || candidate.contains(query) {
480        1.0
481    } else {
482        0.0
483    };
484    let prefix = if query.starts_with(candidate) || candidate.starts_with(query) {
485        1.0
486    } else {
487        0.0
488    };
489    let len_ratio =
490        query.len().min(candidate.len()) as f32 / query.len().max(candidate.len()) as f32;
491
492    let mut score =
493        0.55 * jaro + 0.20 * token_overlap + 0.15 * contains + 0.10 * len_ratio + 0.05 * prefix;
494    if token_overlap == 0.0 && contains == 0.0 && jaro < 0.90 {
495        score *= 0.75;
496    }
497
498    score.clamp(0.0, 1.0)
499}
500
/// Dice coefficient over whitespace-separated tokens: `2·|A∩B| / (|A| + |B|)`.
fn token_overlap_score(left: &str, right: &str) -> f32 {
    let lhs: HashSet<&str> = left.split_whitespace().collect();
    let rhs: HashSet<&str> = right.split_whitespace().collect();
    if lhs.is_empty() || rhs.is_empty() {
        return 0.0;
    }

    let shared = lhs.intersection(&rhs).count() as f32;
    (2.0 * shared) / (lhs.len() as f32 + rhs.len() as f32)
}
511
/// Normalize a title for fuzzy comparison: Unicode-lowercase it, map common
/// multiplication-sign variants to `x`, treat `&` and punctuation separators
/// (`- _ . : / \` and whitespace) as single spaces, and drop every other
/// non-alphanumeric character.
fn normalize_title(title: &str) -> String {
    // Append one already-lowercased char, collapsing runs of separators into
    // a single space and ignoring all other punctuation.
    fn push_normalized(out: &mut String, last_was_space: &mut bool, ch: char) {
        if ch.is_alphanumeric() {
            out.push(ch);
            *last_was_space = false;
        } else if (ch.is_whitespace() || matches!(ch, '-' | '_' | '.' | ':' | '/' | '\\'))
            && !*last_was_space
        {
            out.push(' ');
            *last_was_space = true;
        }
    }

    let mut normalized = String::with_capacity(title.len());
    let mut last_was_space = true;

    for ch in title.chars() {
        match ch {
            '×' | '✕' | '✖' => push_normalized(&mut normalized, &mut last_was_space, 'x'),
            '&' => push_normalized(&mut normalized, &mut last_was_space, ' '),
            // BUG FIX: use full Unicode lowercasing (which may expand to several
            // chars) instead of `to_ascii_lowercase`, so e.g. "POKÉMON" and
            // "Pokémon" normalize identically. ASCII behavior is unchanged.
            _ => {
                for lower in ch.to_lowercase() {
                    push_normalized(&mut normalized, &mut last_was_space, lower);
                }
            }
        }
    }

    normalized.trim().to_string()
}
536
537fn dedupe_titles(canonical_title: &str, titles: Vec<String>) -> Vec<String> {
538    let mut deduped = Vec::new();
539    let mut seen = HashSet::new();
540
541    let mut push_title = |title: String| {
542        let cleaned = title.trim();
543        if cleaned.is_empty() {
544            return;
545        }
546        let key = normalize_title(cleaned);
547        if key.is_empty() || !seen.insert(key) {
548            return;
549        }
550        deduped.push(cleaned.to_string());
551    };
552
553    push_title(canonical_title.to_string());
554    for title in titles {
555        push_title(title);
556    }
557
558    deduped
559}
560
561fn resolve_dump_path(path: &Path) -> MatchResult<PathBuf> {
562    if path.is_file() {
563        return Ok(path.to_path_buf());
564    }
565
566    if path.is_dir() {
567        let sql = path.join("latest.sql");
568        if sql.is_file() {
569            return Ok(sql);
570        }
571
572        let gzip = path.join("latest.sql.gz");
573        if gzip.is_file() {
574            return Ok(gzip);
575        }
576
577        return Err(MatcherError::InvalidDumpPath(format!(
578            "directory {} does not contain latest.sql or latest.sql.gz",
579            path.display()
580        )));
581    }
582
583    Err(MatcherError::InvalidDumpPath(format!(
584        "{} does not exist",
585        path.display()
586    )))
587}
588
589fn open_dump_reader(path: &Path) -> MatchResult<Box<dyn BufRead>> {
590    let file = File::open(path)?;
591    if path
592        .extension()
593        .and_then(|ext| ext.to_str())
594        .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
595    {
596        return Ok(Box::new(BufReader::new(GzDecoder::new(file))));
597    }
598
599    Ok(Box::new(BufReader::new(file)))
600}
601
/// Parse a Kitsu Postgres SQL dump into catalog entries.
///
/// Scans the dump line by line for `COPY ... FROM stdin;` sections; only the
/// `anime` and `mappings` tables are consumed, everything else is skipped.
///
/// # Errors
///
/// Returns an error when reading fails, when a recognized COPY header lacks a
/// required column, when an anime row is malformed, or when no `anime` COPY
/// section exists at all.
fn parse_kitsu_dump<R: BufRead>(reader: R) -> MatchResult<Vec<CatalogEntry>> {
    let mut anime_by_id = HashMap::<u32, RawAnimeRecord>::new();
    let mut ids_by_anime = HashMap::<u32, AnimeIds>::new();
    // Which COPY section (if any) the current line belongs to.
    let mut section = DumpSection::None;

    for line in reader.lines() {
        let line = line?;
        if let Some((table, columns)) = parse_copy_statement(&line) {
            section = match table.as_str() {
                "anime" => DumpSection::Anime(AnimeColumns::try_from_columns(&columns)?),
                "mappings" => DumpSection::Mappings(MappingColumns::try_from_columns(&columns)?),
                _ => DumpSection::None,
            };
            continue;
        }

        // `\.` terminates a COPY data block in Postgres dumps.
        if line == "\\." {
            section = DumpSection::None;
            continue;
        }

        match &section {
            DumpSection::Anime(columns) => parse_anime_row(&line, columns, &mut anime_by_id)?,
            DumpSection::Mappings(columns) => {
                parse_mapping_row(&line, columns, &mut ids_by_anime)?;
            }
            DumpSection::None => {}
        }
    }

    if anime_by_id.is_empty() {
        return Err(MatcherError::InvalidDump(
            "anime COPY section was not found".into(),
        ));
    }

    let mut entries = anime_by_id
        .into_iter()
        .filter_map(|(kitsu_id, record)| {
            // Canonical title preference: explicit column, then first alias,
            // then a title derived from the slug.
            let fallback_slug = record.slug.as_deref().map(slug_to_title);
            let canonical_title = record
                .canonical_title
                .clone()
                .or_else(|| record.titles.first().cloned())
                .or(fallback_slug)?;

            let mut titles = record.titles;
            if let Some(slug) = record.slug {
                titles.push(slug_to_title(&slug));
            }

            // Attach external ids collected from the mappings section.
            let mut ids = ids_by_anime.remove(&kitsu_id).unwrap_or_default();
            ids.kitsu = Some(kitsu_id);
            CatalogEntry::new(canonical_title, titles, ids)
        })
        .collect::<Vec<_>>();

    // Deterministic output regardless of HashMap iteration order.
    entries.sort_by(|left, right| left.canonical_title.cmp(&right.canonical_title));
    Ok(entries)
}
662
663fn parse_copy_statement(line: &str) -> Option<(String, Vec<String>)> {
664    let rest = line.strip_prefix("COPY ")?;
665    let rest = rest.strip_suffix(" FROM stdin;")?;
666    let open = rest.find('(')?;
667    let close = rest.rfind(')')?;
668    if close <= open {
669        return None;
670    }
671
672    let table = normalize_identifier(&rest[..open]);
673    let columns = rest[open + 1..close]
674        .split(',')
675        .map(normalize_identifier)
676        .collect::<Vec<_>>();
677    Some((table, columns))
678}
679
680fn parse_anime_row(
681    line: &str,
682    columns: &AnimeColumns,
683    anime_by_id: &mut HashMap<u32, RawAnimeRecord>,
684) -> MatchResult<()> {
685    let fields = split_copy_row(line);
686    let id = field_value(&fields, columns.id)
687        .ok_or_else(|| MatcherError::InvalidDump("anime row is missing id".into()))?
688        .parse::<u32>()
689        .map_err(|_| MatcherError::InvalidDump("anime id is not numeric".into()))?;
690
691    let record = anime_by_id.entry(id).or_default();
692
693    if let Some(index) = columns.canonical_title {
694        if let Some(value) = field_value(&fields, index) {
695            record.canonical_title = Some(value.to_string());
696        }
697    }
698
699    if let Some(index) = columns.titles {
700        if let Some(value) = field_value(&fields, index) {
701            record.titles.extend(parse_json_titles(value));
702        }
703    }
704
705    if let Some(index) = columns.abbreviated_titles {
706        if let Some(value) = field_value(&fields, index) {
707            record.titles.extend(parse_json_titles(value));
708        }
709    }
710
711    if let Some(index) = columns.slug {
712        if let Some(value) = field_value(&fields, index) {
713            record.slug = Some(value.to_string());
714        }
715    }
716
717    Ok(())
718}
719
720fn parse_mapping_row(
721    line: &str,
722    columns: &MappingColumns,
723    ids_by_anime: &mut HashMap<u32, AnimeIds>,
724) -> MatchResult<()> {
725    let fields = split_copy_row(line);
726    if let Some(index) = columns.item_type {
727        if !field_value(&fields, index).is_some_and(|value| value.eq_ignore_ascii_case("anime")) {
728            return Ok(());
729        }
730    }
731
732    let item_id =
733        match field_value(&fields, columns.item_id).and_then(|value| value.parse::<u32>().ok()) {
734            Some(id) => id,
735            None => return Ok(()),
736        };
737
738    let external_site = match field_value(&fields, columns.external_site) {
739        Some(site) => site.to_ascii_lowercase(),
740        None => return Ok(()),
741    };
742    let external_id = match field_value(&fields, columns.external_id)
743        .and_then(|value| value.parse::<u32>().ok())
744    {
745        Some(id) => id,
746        None => return Ok(()),
747    };
748
749    let ids = ids_by_anime.entry(item_id).or_default();
750    if external_site.contains("anilist") {
751        ids.anilist = Some(external_id);
752    } else if external_site.contains("myanimelist") || external_site == "mal" {
753        ids.mal = Some(external_id);
754    }
755
756    Ok(())
757}
758
759fn split_copy_row(line: &str) -> Vec<Option<String>> {
760    line.split('\t').map(parse_copy_field).collect()
761}
762
/// Decode one COPY text-format field: `\N` means SQL NULL, and backslash
/// escapes (`\t`, `\n`, `\r`, `\b`, `\f`, `\\`) are expanded.
fn parse_copy_field(field: &str) -> Option<String> {
    if field == "\\N" {
        return None;
    }

    let mut decoded = String::with_capacity(field.len());
    let mut chars = field.chars();
    while let Some(ch) = chars.next() {
        if ch != '\\' {
            decoded.push(ch);
            continue;
        }
        // Unknown escapes keep the escaped char verbatim; a trailing
        // backslash is preserved as-is.
        let replacement = match chars.next() {
            Some('t') => '\t',
            Some('n') => '\n',
            Some('r') => '\r',
            Some('b') => '\u{0008}',
            Some('f') => '\u{000C}',
            Some('\\') => '\\',
            Some(other) => other,
            None => '\\',
        };
        decoded.push(replacement);
    }

    Some(decoded)
}
789
/// Borrow field `index` from a split COPY row; out-of-range indexes and SQL
/// NULL fields both read as `None`.
fn field_value(fields: &[Option<String>], index: usize) -> Option<&str> {
    match fields.get(index) {
        Some(Some(value)) => Some(value.as_str()),
        _ => None,
    }
}
793
/// Normalize a SQL identifier: take the last dotted segment (dropping any
/// schema prefix), strip surrounding whitespace and double quotes, lowercase.
fn normalize_identifier(identifier: &str) -> String {
    let last_segment = identifier.rsplit('.').next().unwrap_or(identifier);
    last_segment.trim().trim_matches('"').to_ascii_lowercase()
}
803
/// Case-insensitive position of `target` within a COPY column header.
fn find_column_index(columns: &[String], target: &str) -> Option<usize> {
    for (index, column) in columns.iter().enumerate() {
        if column.eq_ignore_ascii_case(target) {
            return Some(index);
        }
    }
    None
}
809
810fn parse_json_titles(raw: &str) -> Vec<String> {
811    let parsed = match serde_json::from_str::<Value>(raw) {
812        Ok(value) => value,
813        Err(_) => return Vec::new(),
814    };
815
816    match parsed {
817        Value::Object(map) => map
818            .into_values()
819            .filter_map(|value| value.as_str().map(ToOwned::to_owned))
820            .collect(),
821        Value::Array(items) => items
822            .into_iter()
823            .filter_map(|value| value.as_str().map(ToOwned::to_owned))
824            .collect(),
825        Value::String(title) => vec![title],
826        _ => Vec::new(),
827    }
828}
829
/// Turn a URL slug into a readable title, e.g. "spy-x-family" → "spy x family".
fn slug_to_title(slug: &str) -> String {
    let spaced = slug.replace('-', " ");
    spaced.trim().to_string()
}
833
/// AniList-compatible search query: first page of ANIME media with ids, every
/// title variant, and synonyms. Variables: `$search`, `$perPage`.
const REMOTE_GRAPHQL_QUERY: &str = r#"
query($search: String!, $perPage: Int!) {
  Page(page: 1, perPage: $perPage) {
    media(search: $search, type: ANIME) {
      id
      idMal
      title {
        romaji
        english
        native
        userPreferred
      }
      synonyms
    }
  }
}
"#;
851
/// Wire type: outgoing GraphQL request body.
#[derive(Debug, Serialize)]
struct RemoteGraphQlRequest<'a> {
    query: &'a str,
    variables: RemoteGraphQlVariables<'a>,
}
857
/// Wire type: variables for [`REMOTE_GRAPHQL_QUERY`].
#[derive(Debug, Serialize)]
struct RemoteGraphQlVariables<'a> {
    search: &'a str,
    #[serde(rename = "perPage")]
    per_page: i64,
}
864
/// Wire type: top-level GraphQL response (either `data` or `errors` may be set).
#[derive(Debug, Deserialize)]
struct RemoteGraphQlEnvelope {
    data: Option<RemoteGraphQlData>,
    errors: Option<Vec<RemoteGraphQlError>>,
}
870
/// Wire type: `data` object of the GraphQL response.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlData {
    #[serde(rename = "Page")]
    page: Option<RemoteGraphQlPage>,
}
876
/// Wire type: one page of media results.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlPage {
    media: Vec<RemoteMedia>,
}
881
/// Wire type: a single media result from the remote endpoint.
#[derive(Debug, Deserialize)]
struct RemoteMedia {
    id: u32,
    #[serde(rename = "idMal")]
    id_mal: Option<u32>,
    title: RemoteTitle,
    synonyms: Option<Vec<String>>,
}
890
/// Wire type: the title variants a media result may carry.
#[derive(Debug, Deserialize)]
struct RemoteTitle {
    romaji: Option<String>,
    english: Option<String>,
    native: Option<String>,
    #[serde(rename = "userPreferred")]
    user_preferred: Option<String>,
}
899
/// Wire type: one entry of the GraphQL `errors` array.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlError {
    message: String,
}
904
#[cfg(test)]
mod tests {
    use super::*;
    use flate2::Compression;
    use flate2::write::GzEncoder;
    use std::fs;
    use std::io::Write;
    use std::time::{SystemTime, UNIX_EPOCH};

    // End-to-end: parse a plain-text dump and match a title exactly.
    #[test]
    fn matches_titles_from_plain_sql_dump() {
        let temp_dir = unique_temp_dir();
        let dump_path = temp_dir.join("latest.sql");
        fs::write(&dump_path, sample_dump()).unwrap();

        let matcher = TitleMatcher::from_kitsu_dump(&temp_dir).unwrap();
        let matched = matcher.match_title("spy x family").unwrap().unwrap();

        assert_eq!(matched.provider, MatchProvider::KitsuDump);
        assert_eq!(matched.canonical_title, "Spy x Family");
        assert_eq!(matched.ids.kitsu, Some(1));
        assert_eq!(matched.ids.anilist, Some(777));
        assert_eq!(matched.ids.mal, Some(12345));
        assert!(matched.score > 0.90);

        fs::remove_dir_all(temp_dir).unwrap();
    }

    // Same dump content, but delivered as latest.sql.gz.
    #[test]
    fn reads_gzipped_dump_files() {
        let temp_dir = unique_temp_dir();
        let dump_path = temp_dir.join("latest.sql.gz");
        write_gzip(&dump_path, sample_dump().as_bytes());

        let matcher = TitleMatcher::from_kitsu_dump(&temp_dir).unwrap();
        let matches = matcher.search_titles("frieren", 2).unwrap();

        assert_eq!(matches[0].canonical_title, "Sousou no Frieren");
        assert_eq!(matches[0].ids.anilist, Some(888));

        fs::remove_dir_all(temp_dir).unwrap();
    }

    // Unit checks on normalization and the similarity metric.
    #[test]
    fn normalizes_titles_before_scoring() {
        assert_eq!(normalize_title("SPY×FAMILY"), "spyxfamily");
        assert!(similarity_score("spy x family", "spy family") > 0.75);
        assert!(similarity_score("jujutsu kaisen", "bleach") < 0.50);
    }

    #[test]
    fn parses_json_title_values() {
        let titles = parse_json_titles(r#"{"en":"Frieren","en_jp":"Sousou no Frieren"}"#);
        assert_eq!(titles.len(), 2);
        assert!(titles.contains(&"Frieren".to_string()));
        assert!(titles.contains(&"Sousou no Frieren".to_string()));
    }

    // Fresh per-test directory (pid + nanosecond timestamp) so parallel tests
    // never collide.
    fn unique_temp_dir() -> PathBuf {
        let unique = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let path = std::env::temp_dir().join(format!(
            "zantetsu-vecdb-test-{}-{}",
            std::process::id(),
            unique
        ));
        fs::create_dir_all(&path).unwrap();
        path
    }

    // Write `bytes` gzip-compressed to `path`.
    fn write_gzip(path: &Path, bytes: &[u8]) {
        let file = File::create(path).unwrap();
        let mut encoder = GzEncoder::new(file, Compression::default());
        encoder.write_all(bytes).unwrap();
        encoder.finish().unwrap();
    }

    // Minimal dump fixture: two anime rows plus their external-id mappings.
    fn sample_dump() -> String {
        [
            "-- sample kitsu dump",
            "COPY public.anime (id, slug, titles, canonical_title, abbreviated_titles) FROM stdin;",
            "1\tspy-x-family\t{\"en\":\"Spy x Family\",\"en_jp\":\"SPY×FAMILY\",\"ja_jp\":\"スパイファミリー\"}\tSpy x Family\t[\"Spy Family\"]",
            "2\tsousou-no-frieren\t{\"en\":\"Frieren: Beyond Journey's End\",\"en_jp\":\"Sousou no Frieren\"}\tSousou no Frieren\t[\"Frieren\"]",
            "\\.",
            "COPY public.mappings (item_id, item_type, external_site, external_id) FROM stdin;",
            "1\tAnime\tanilist/anime\t777",
            "1\tAnime\tmyanimelist/anime\t12345",
            "2\tAnime\tanilist/anime\t888",
            "\\.",
            "",
        ]
        .join("\n")
    }
}