1use crate::error::{MatchResult, MatcherError};
2use dirs::data_dir;
3use flate2::read::GzDecoder;
4use reqwest::blocking::Client;
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7use std::collections::{HashMap, HashSet};
8use std::fs::File;
9use std::io::{BufRead, BufReader};
10use std::path::{Path, PathBuf};
11use strsim::jaro_winkler;
12use tracing::debug;
13
/// HTTP timeout applied to every remote-endpoint request, in seconds.
const DEFAULT_TIMEOUT_SECS: u64 = 10;
15
/// Identifies which backend produced a title match.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum MatchProvider {
    /// Offline Kitsu PostgreSQL dump loaded from disk.
    KitsuDump,
    /// Remote GraphQL search endpoint queried over HTTP.
    RemoteEndpoint,
}
24
/// External database identifiers known for a single anime.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct AnimeIds {
    /// Kitsu id.
    pub kitsu: Option<u32>,
    /// AniList id.
    pub anilist: Option<u32>,
    /// MyAnimeList id.
    pub mal: Option<u32>,
}
35
/// Result of matching a query against the catalog.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct AnimeTitleMatch {
    /// Backend that produced this match.
    pub provider: MatchProvider,
    /// Primary display title of the matched anime.
    pub canonical_title: String,
    /// The specific title variant that scored best against the query.
    pub matched_title: String,
    /// Similarity score in `0.0..=1.0` (higher is better).
    pub score: f32,
    /// External database ids for the matched anime.
    pub ids: AnimeIds,
    /// All known title variants, canonical title first.
    pub titles: Vec<String>,
}
52
/// Where the matcher loads its catalog from.
#[derive(Debug, Clone)]
pub enum MatchSource {
    /// A Kitsu PostgreSQL dump on the local filesystem.
    KitsuDump {
        /// Path to a dump file, or a directory containing `latest.sql[.gz]`.
        dump_path: PathBuf,
    },
    /// A remote GraphQL search endpoint.
    RemoteEndpoint {
        /// Endpoint URL queried per search.
        endpoint: String,
    },
}
78
impl MatchSource {
    /// Convenience constructor for [`MatchSource::KitsuDump`].
    #[must_use]
    pub fn kitsu_dump(dump_path: impl Into<PathBuf>) -> Self {
        Self::KitsuDump {
            dump_path: dump_path.into(),
        }
    }

    /// Convenience constructor for [`MatchSource::RemoteEndpoint`].
    #[must_use]
    pub fn remote_endpoint(endpoint: impl Into<String>) -> Self {
        Self::RemoteEndpoint {
            endpoint: endpoint.into(),
        }
    }
}
96
97#[must_use]
99pub fn default_kitsu_dump_dir() -> PathBuf {
100 data_dir()
101 .unwrap_or_else(|| PathBuf::from("."))
102 .join("zantetsu")
103 .join("kitsu-dumps")
104}
105
/// Fuzzy anime-title matcher backed by either a local Kitsu dump or a
/// remote GraphQL search endpoint.
pub struct TitleMatcher {
    // Backend chosen at construction time; dispatched per query.
    backend: MatcherBackend,
}
112
/// Internal backend dispatch for [`TitleMatcher`].
enum MatcherBackend {
    Kitsu(KitsuTitleMatcher),
    RemoteEndpoint(RemoteTitleMatcher),
}
117
118impl TitleMatcher {
119 pub fn new(source: MatchSource) -> MatchResult<Self> {
134 let backend = match source {
135 MatchSource::KitsuDump { dump_path } => {
136 MatcherBackend::Kitsu(KitsuTitleMatcher::from_dump(dump_path)?)
137 }
138 MatchSource::RemoteEndpoint { endpoint } => {
139 MatcherBackend::RemoteEndpoint(RemoteTitleMatcher::new(endpoint)?)
140 }
141 };
142
143 Ok(Self { backend })
144 }
145
146 pub fn from_kitsu_dump(dump_path: impl Into<PathBuf>) -> MatchResult<Self> {
148 Self::new(MatchSource::kitsu_dump(dump_path))
149 }
150
151 pub fn from_remote_endpoint(endpoint: impl Into<String>) -> MatchResult<Self> {
153 Self::new(MatchSource::remote_endpoint(endpoint))
154 }
155
156 pub fn match_title(&self, title: &str) -> MatchResult<Option<AnimeTitleMatch>> {
172 Ok(self.search_titles(title, 1)?.into_iter().next())
173 }
174
175 pub fn search_titles(&self, title: &str, limit: usize) -> MatchResult<Vec<AnimeTitleMatch>> {
179 let query = normalize_title(title);
180 if query.is_empty() {
181 return Err(MatcherError::EmptyQuery);
182 }
183
184 let limit = limit.max(1);
185 match &self.backend {
186 MatcherBackend::Kitsu(matcher) => Ok(score_entries(
187 MatchProvider::KitsuDump,
188 &matcher.entries,
189 title,
190 &query,
191 limit,
192 )),
193 MatcherBackend::RemoteEndpoint(matcher) => matcher.search_titles(title, &query, limit),
194 }
195 }
196}
197
/// Catalog parsed from a Kitsu PostgreSQL dump, held fully in memory.
struct KitsuTitleMatcher {
    entries: Vec<CatalogEntry>,
}
201
202impl KitsuTitleMatcher {
203 fn from_dump(dump_path: PathBuf) -> MatchResult<Self> {
204 let resolved = resolve_dump_path(&dump_path)?;
205 debug!("loading Kitsu dump from {}", resolved.display());
206 let reader = open_dump_reader(&resolved)?;
207 let entries = parse_kitsu_dump(reader)?;
208 Ok(Self { entries })
209 }
210}
211
/// Matcher that performs a GraphQL search request per query.
struct RemoteTitleMatcher {
    // Blocking HTTP client with a fixed timeout and user agent.
    client: Client,
    // Trimmed, non-empty endpoint URL.
    endpoint: String,
}
216
217impl RemoteTitleMatcher {
218 fn new(endpoint: String) -> MatchResult<Self> {
219 let endpoint = endpoint.trim().to_string();
220 if endpoint.is_empty() {
221 return Err(MatcherError::InvalidResponse(
222 "remote endpoint URL cannot be empty".into(),
223 ));
224 }
225
226 let client = Client::builder()
227 .timeout(std::time::Duration::from_secs(DEFAULT_TIMEOUT_SECS))
228 .user_agent(format!("zantetsu/{}", env!("CARGO_PKG_VERSION")))
229 .build()?;
230 Ok(Self { client, endpoint })
231 }
232
233 fn search_titles(
234 &self,
235 raw_query: &str,
236 normalized_query: &str,
237 limit: usize,
238 ) -> MatchResult<Vec<AnimeTitleMatch>> {
239 let payload = RemoteGraphQlRequest {
240 query: REMOTE_GRAPHQL_QUERY,
241 variables: RemoteGraphQlVariables {
242 search: raw_query,
243 per_page: limit.max(5) as i64,
244 },
245 };
246
247 let response = self
248 .client
249 .post(&self.endpoint)
250 .json(&payload)
251 .send()?
252 .error_for_status()?;
253
254 let envelope: RemoteGraphQlEnvelope = response.json()?;
255 if let Some(errors) = envelope.errors {
256 let message = errors
257 .into_iter()
258 .map(|error| error.message)
259 .collect::<Vec<_>>()
260 .join("; ");
261 return Err(MatcherError::GraphQl(message));
262 }
263
264 let media = envelope
265 .data
266 .ok_or_else(|| MatcherError::InvalidResponse("missing data".into()))?
267 .page
268 .ok_or_else(|| MatcherError::InvalidResponse("missing page".into()))?
269 .media;
270
271 let entries = media
272 .into_iter()
273 .map(CatalogEntry::from_remote_media)
274 .collect::<Vec<_>>();
275
276 Ok(score_entries(
277 MatchProvider::RemoteEndpoint,
278 &entries,
279 raw_query,
280 normalized_query,
281 limit,
282 ))
283 }
284}
285
/// A single anime with every known title variant, pre-normalized for scoring.
#[derive(Debug, Clone)]
struct CatalogEntry {
    // Display title; equals the first element of `titles` after dedup.
    canonical_title: String,
    // Distinct trimmed title variants, canonical first.
    titles: Vec<String>,
    // `titles` passed through `normalize_title`, index-aligned with `titles`.
    normalized_titles: Vec<String>,
    // External database ids.
    ids: AnimeIds,
}
293
294impl CatalogEntry {
295 fn new(canonical_title: String, titles: Vec<String>, ids: AnimeIds) -> Option<Self> {
296 let titles = dedupe_titles(&canonical_title, titles);
297 if titles.is_empty() {
298 return None;
299 }
300
301 let canonical_title = titles
302 .first()
303 .cloned()
304 .unwrap_or_else(|| canonical_title.trim().to_string());
305
306 let normalized_titles = titles.iter().map(|title| normalize_title(title)).collect();
307 Some(Self {
308 canonical_title,
309 titles,
310 normalized_titles,
311 ids,
312 })
313 }
314
315 fn from_remote_media(media: RemoteMedia) -> Self {
316 let mut titles = Vec::new();
317 if let Some(title) = media.title.user_preferred.clone() {
318 titles.push(title);
319 }
320 if let Some(title) = media.title.romaji.clone() {
321 titles.push(title);
322 }
323 if let Some(title) = media.title.english.clone() {
324 titles.push(title);
325 }
326 if let Some(title) = media.title.native.clone() {
327 titles.push(title);
328 }
329 titles.extend(media.synonyms.unwrap_or_default());
330
331 let canonical_title = media
332 .title
333 .user_preferred
334 .clone()
335 .or(media.title.romaji.clone())
336 .or(media.title.english.clone())
337 .or(media.title.native.clone())
338 .unwrap_or_else(|| format!("Remote {}", media.id));
339
340 Self::new(
341 canonical_title,
342 titles,
343 AnimeIds {
344 kitsu: None,
345 anilist: Some(media.id),
346 mal: media.id_mal,
347 },
348 )
349 .expect("remote media should always produce at least one title")
350 }
351}
352
/// Partially-assembled anime row accumulated while scanning the dump.
#[derive(Debug, Default)]
struct RawAnimeRecord {
    // Value of the `canonical_title` column, when present.
    canonical_title: Option<String>,
    // Titles gathered from the `titles` and `abbreviated_titles` JSON columns.
    titles: Vec<String>,
    // URL slug, used both as a fallback title and as an extra variant.
    slug: Option<String>,
}
359
/// Which COPY section of the dump the parser is currently inside.
#[derive(Debug, Clone)]
enum DumpSection {
    /// Outside any COPY section of interest.
    None,
    /// Inside the `anime` table data.
    Anime(AnimeColumns),
    /// Inside the `mappings` table data.
    Mappings(MappingColumns),
}
366
/// Column indexes for the `anime` COPY section; only `id` is mandatory.
#[derive(Debug, Clone)]
struct AnimeColumns {
    id: usize,
    canonical_title: Option<usize>,
    titles: Option<usize>,
    abbreviated_titles: Option<usize>,
    slug: Option<usize>,
}
375
376impl AnimeColumns {
377 fn try_from_columns(columns: &[String]) -> MatchResult<Self> {
378 let id = find_column_index(columns, "id")
379 .ok_or_else(|| MatcherError::InvalidDump("anime COPY is missing id column".into()))?;
380 Ok(Self {
381 id,
382 canonical_title: find_column_index(columns, "canonical_title"),
383 titles: find_column_index(columns, "titles"),
384 abbreviated_titles: find_column_index(columns, "abbreviated_titles"),
385 slug: find_column_index(columns, "slug"),
386 })
387 }
388}
389
/// Column indexes for the `mappings` COPY section.
#[derive(Debug, Clone)]
struct MappingColumns {
    item_id: usize,
    // Optional: older dumps may omit the item_type column entirely.
    item_type: Option<usize>,
    external_site: usize,
    external_id: usize,
}
397
398impl MappingColumns {
399 fn try_from_columns(columns: &[String]) -> MatchResult<Self> {
400 let item_id = find_column_index(columns, "item_id").ok_or_else(|| {
401 MatcherError::InvalidDump("mappings COPY is missing item_id column".into())
402 })?;
403 let external_site = find_column_index(columns, "external_site").ok_or_else(|| {
404 MatcherError::InvalidDump("mappings COPY is missing external_site column".into())
405 })?;
406 let external_id = find_column_index(columns, "external_id").ok_or_else(|| {
407 MatcherError::InvalidDump("mappings COPY is missing external_id column".into())
408 })?;
409
410 Ok(Self {
411 item_id,
412 item_type: find_column_index(columns, "item_type"),
413 external_site,
414 external_id,
415 })
416 }
417}
418
419fn score_entries(
420 provider: MatchProvider,
421 entries: &[CatalogEntry],
422 raw_query: &str,
423 normalized_query: &str,
424 limit: usize,
425) -> Vec<AnimeTitleMatch> {
426 let mut scored = entries
427 .iter()
428 .filter_map(|entry| score_entry(provider, entry, raw_query, normalized_query))
429 .collect::<Vec<_>>();
430
431 scored.sort_by(|left, right| {
432 right
433 .score
434 .partial_cmp(&left.score)
435 .unwrap_or(std::cmp::Ordering::Equal)
436 .then_with(|| left.canonical_title.cmp(&right.canonical_title))
437 });
438 scored.truncate(limit);
439 scored
440}
441
442fn score_entry(
443 provider: MatchProvider,
444 entry: &CatalogEntry,
445 _raw_query: &str,
446 normalized_query: &str,
447) -> Option<AnimeTitleMatch> {
448 let mut best_title = None;
449 let mut best_score = 0.0_f32;
450
451 for (index, normalized_title) in entry.normalized_titles.iter().enumerate() {
452 let score = similarity_score(normalized_query, normalized_title);
453 if score > best_score {
454 best_score = score;
455 best_title = entry.titles.get(index).cloned();
456 }
457 }
458
459 best_title.map(|matched_title| AnimeTitleMatch {
460 provider,
461 canonical_title: entry.canonical_title.clone(),
462 matched_title,
463 score: best_score,
464 ids: entry.ids.clone(),
465 titles: entry.titles.clone(),
466 })
467}
468
469fn similarity_score(query: &str, candidate: &str) -> f32 {
470 if query.is_empty() || candidate.is_empty() {
471 return 0.0;
472 }
473 if query == candidate {
474 return 1.0;
475 }
476
477 let jaro = jaro_winkler(query, candidate) as f32;
478 let token_overlap = token_overlap_score(query, candidate);
479 let contains = if query.contains(candidate) || candidate.contains(query) {
480 1.0
481 } else {
482 0.0
483 };
484 let prefix = if query.starts_with(candidate) || candidate.starts_with(query) {
485 1.0
486 } else {
487 0.0
488 };
489 let len_ratio =
490 query.len().min(candidate.len()) as f32 / query.len().max(candidate.len()) as f32;
491
492 let mut score =
493 0.55 * jaro + 0.20 * token_overlap + 0.15 * contains + 0.10 * len_ratio + 0.05 * prefix;
494 if token_overlap == 0.0 && contains == 0.0 && jaro < 0.90 {
495 score *= 0.75;
496 }
497
498 score.clamp(0.0, 1.0)
499}
500
/// Sørensen–Dice coefficient over whitespace-separated tokens, in `0.0..=1.0`.
/// Returns 0.0 when either side has no tokens.
fn token_overlap_score(left: &str, right: &str) -> f32 {
    let lhs: HashSet<&str> = left.split_whitespace().collect();
    let rhs: HashSet<&str> = right.split_whitespace().collect();
    if lhs.is_empty() || rhs.is_empty() {
        return 0.0;
    }

    let shared = lhs.intersection(&rhs).count() as f32;
    (2.0 * shared) / (lhs.len() as f32 + rhs.len() as f32)
}
511
/// Normalizes a title for fuzzy comparison: lowercases, maps stylized
/// multiplication signs to `x`, treats `&` and common separator punctuation
/// as spaces, drops all other punctuation, and collapses whitespace runs.
fn normalize_title(title: &str) -> String {
    let mut normalized = String::with_capacity(title.len());
    // Start "true" so leading separators are never emitted.
    let mut last_was_space = true;

    for ch in title.chars() {
        let mapped = match ch {
            // Stylized multiplication signs common in anime titles.
            '×' | '✕' | '✖' => 'x',
            '&' => ' ',
            _ => ch,
        };

        if mapped.is_alphanumeric() {
            // Full Unicode lowercasing (not just ASCII) so titles differing
            // only in non-ASCII case — e.g. "CAFÉ" vs "café" — normalize
            // identically. `to_lowercase` may expand to multiple chars.
            for lowered in mapped.to_lowercase() {
                normalized.push(lowered);
            }
            last_was_space = false;
        } else if (mapped.is_whitespace() || matches!(mapped, '-' | '_' | '.' | ':' | '/' | '\\'))
            && !last_was_space
        {
            normalized.push(' ');
            last_was_space = true;
        }
    }

    normalized.trim().to_string()
}
536
537fn dedupe_titles(canonical_title: &str, titles: Vec<String>) -> Vec<String> {
538 let mut deduped = Vec::new();
539 let mut seen = HashSet::new();
540
541 let mut push_title = |title: String| {
542 let cleaned = title.trim();
543 if cleaned.is_empty() {
544 return;
545 }
546 let key = normalize_title(cleaned);
547 if key.is_empty() || !seen.insert(key) {
548 return;
549 }
550 deduped.push(cleaned.to_string());
551 };
552
553 push_title(canonical_title.to_string());
554 for title in titles {
555 push_title(title);
556 }
557
558 deduped
559}
560
561fn resolve_dump_path(path: &Path) -> MatchResult<PathBuf> {
562 if path.is_file() {
563 return Ok(path.to_path_buf());
564 }
565
566 if path.is_dir() {
567 let sql = path.join("latest.sql");
568 if sql.is_file() {
569 return Ok(sql);
570 }
571
572 let gzip = path.join("latest.sql.gz");
573 if gzip.is_file() {
574 return Ok(gzip);
575 }
576
577 return Err(MatcherError::InvalidDumpPath(format!(
578 "directory {} does not contain latest.sql or latest.sql.gz",
579 path.display()
580 )));
581 }
582
583 Err(MatcherError::InvalidDumpPath(format!(
584 "{} does not exist",
585 path.display()
586 )))
587}
588
589fn open_dump_reader(path: &Path) -> MatchResult<Box<dyn BufRead>> {
590 let file = File::open(path)?;
591 if path
592 .extension()
593 .and_then(|ext| ext.to_str())
594 .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
595 {
596 return Ok(Box::new(BufReader::new(GzDecoder::new(file))));
597 }
598
599 Ok(Box::new(BufReader::new(file)))
600}
601
/// Parses a Kitsu PostgreSQL dump into catalog entries.
///
/// Streams the dump line by line as a small state machine: a `COPY … FROM
/// stdin;` header switches the current section, `\.` ends it, and data rows
/// are routed to the matching row parser. Anime rows are accumulated by id,
/// then joined with external-id mappings at the end.
///
/// # Errors
/// Fails on I/O errors, malformed COPY headers/rows, or when the dump
/// contains no `anime` COPY section at all.
fn parse_kitsu_dump<R: BufRead>(reader: R) -> MatchResult<Vec<CatalogEntry>> {
    let mut anime_by_id = HashMap::<u32, RawAnimeRecord>::new();
    let mut ids_by_anime = HashMap::<u32, AnimeIds>::new();
    let mut section = DumpSection::None;

    for line in reader.lines() {
        let line = line?;
        // A COPY header starts a new section (or none, for other tables).
        if let Some((table, columns)) = parse_copy_statement(&line) {
            section = match table.as_str() {
                "anime" => DumpSection::Anime(AnimeColumns::try_from_columns(&columns)?),
                "mappings" => DumpSection::Mappings(MappingColumns::try_from_columns(&columns)?),
                _ => DumpSection::None,
            };
            continue;
        }

        // `\.` is PostgreSQL's end-of-COPY-data marker.
        if line == "\\." {
            section = DumpSection::None;
            continue;
        }

        match &section {
            DumpSection::Anime(columns) => parse_anime_row(&line, columns, &mut anime_by_id)?,
            DumpSection::Mappings(columns) => {
                parse_mapping_row(&line, columns, &mut ids_by_anime)?;
            }
            DumpSection::None => {}
        }
    }

    if anime_by_id.is_empty() {
        return Err(MatcherError::InvalidDump(
            "anime COPY section was not found".into(),
        ));
    }

    // Join anime records with their external-id mappings and build entries.
    let mut entries = anime_by_id
        .into_iter()
        .filter_map(|(kitsu_id, record)| {
            // Canonical title fallback chain: explicit column, first
            // alternate title, then a title derived from the slug.
            let fallback_slug = record.slug.as_deref().map(slug_to_title);
            let canonical_title = record
                .canonical_title
                .clone()
                .or_else(|| record.titles.first().cloned())
                .or(fallback_slug)?;

            // The slug also doubles as an extra searchable title variant.
            let mut titles = record.titles;
            if let Some(slug) = record.slug {
                titles.push(slug_to_title(&slug));
            }

            let mut ids = ids_by_anime.remove(&kitsu_id).unwrap_or_default();
            ids.kitsu = Some(kitsu_id);
            CatalogEntry::new(canonical_title, titles, ids)
        })
        .collect::<Vec<_>>();

    // HashMap iteration order is unstable; sort for deterministic output.
    entries.sort_by(|left, right| left.canonical_title.cmp(&right.canonical_title));
    Ok(entries)
}
662
663fn parse_copy_statement(line: &str) -> Option<(String, Vec<String>)> {
664 let rest = line.strip_prefix("COPY ")?;
665 let rest = rest.strip_suffix(" FROM stdin;")?;
666 let open = rest.find('(')?;
667 let close = rest.rfind(')')?;
668 if close <= open {
669 return None;
670 }
671
672 let table = normalize_identifier(&rest[..open]);
673 let columns = rest[open + 1..close]
674 .split(',')
675 .map(normalize_identifier)
676 .collect::<Vec<_>>();
677 Some((table, columns))
678}
679
680fn parse_anime_row(
681 line: &str,
682 columns: &AnimeColumns,
683 anime_by_id: &mut HashMap<u32, RawAnimeRecord>,
684) -> MatchResult<()> {
685 let fields = split_copy_row(line);
686 let id = field_value(&fields, columns.id)
687 .ok_or_else(|| MatcherError::InvalidDump("anime row is missing id".into()))?
688 .parse::<u32>()
689 .map_err(|_| MatcherError::InvalidDump("anime id is not numeric".into()))?;
690
691 let record = anime_by_id.entry(id).or_default();
692
693 if let Some(index) = columns.canonical_title {
694 if let Some(value) = field_value(&fields, index) {
695 record.canonical_title = Some(value.to_string());
696 }
697 }
698
699 if let Some(index) = columns.titles {
700 if let Some(value) = field_value(&fields, index) {
701 record.titles.extend(parse_json_titles(value));
702 }
703 }
704
705 if let Some(index) = columns.abbreviated_titles {
706 if let Some(value) = field_value(&fields, index) {
707 record.titles.extend(parse_json_titles(value));
708 }
709 }
710
711 if let Some(index) = columns.slug {
712 if let Some(value) = field_value(&fields, index) {
713 record.slug = Some(value.to_string());
714 }
715 }
716
717 Ok(())
718}
719
720fn parse_mapping_row(
721 line: &str,
722 columns: &MappingColumns,
723 ids_by_anime: &mut HashMap<u32, AnimeIds>,
724) -> MatchResult<()> {
725 let fields = split_copy_row(line);
726 if let Some(index) = columns.item_type {
727 if !field_value(&fields, index).is_some_and(|value| value.eq_ignore_ascii_case("anime")) {
728 return Ok(());
729 }
730 }
731
732 let item_id =
733 match field_value(&fields, columns.item_id).and_then(|value| value.parse::<u32>().ok()) {
734 Some(id) => id,
735 None => return Ok(()),
736 };
737
738 let external_site = match field_value(&fields, columns.external_site) {
739 Some(site) => site.to_ascii_lowercase(),
740 None => return Ok(()),
741 };
742 let external_id = match field_value(&fields, columns.external_id)
743 .and_then(|value| value.parse::<u32>().ok())
744 {
745 Some(id) => id,
746 None => return Ok(()),
747 };
748
749 let ids = ids_by_anime.entry(item_id).or_default();
750 if external_site.contains("anilist") {
751 ids.anilist = Some(external_id);
752 } else if external_site.contains("myanimelist") || external_site == "mal" {
753 ids.mal = Some(external_id);
754 }
755
756 Ok(())
757}
758
/// Splits a COPY data row on tabs and decodes each field (`None` = SQL NULL).
fn split_copy_row(line: &str) -> Vec<Option<String>> {
    line.split('\t').map(parse_copy_field).collect()
}
762
/// Decodes a single field from a PostgreSQL text-format COPY row.
///
/// Returns `None` for the NULL marker `\N`; otherwise resolves the COPY
/// backslash escapes (`\t`, `\n`, `\r`, `\b`, `\f`, `\v`, `\\`) and returns
/// the decoded text. Unknown escapes keep the escaped character, and a lone
/// trailing backslash is preserved literally.
fn parse_copy_field(field: &str) -> Option<String> {
    if field == "\\N" {
        return None;
    }

    let mut output = String::with_capacity(field.len());
    let mut chars = field.chars();
    while let Some(ch) = chars.next() {
        if ch == '\\' {
            match chars.next() {
                Some('t') => output.push('\t'),
                Some('n') => output.push('\n'),
                Some('r') => output.push('\r'),
                Some('b') => output.push('\u{0008}'),
                Some('f') => output.push('\u{000C}'),
                // Vertical tab: emitted by PostgreSQL COPY, but was
                // previously decoded as a literal 'v'.
                Some('v') => output.push('\u{000B}'),
                Some('\\') => output.push('\\'),
                Some(other) => output.push(other),
                None => output.push('\\'),
            }
        } else {
            output.push(ch);
        }
    }

    Some(output)
}
789
/// Returns the borrowed field text at `index`, or `None` when the index is
/// out of range or the field was SQL NULL.
fn field_value(fields: &[Option<String>], index: usize) -> Option<&str> {
    match fields.get(index) {
        Some(Some(value)) => Some(value.as_str()),
        _ => None,
    }
}
793
/// Normalizes a SQL identifier: keeps the last dotted segment (dropping any
/// schema qualifier), strips surrounding whitespace and double quotes, and
/// lowercases the result.
fn normalize_identifier(identifier: &str) -> String {
    let unqualified = identifier.rsplit('.').next().unwrap_or(identifier);
    unqualified.trim().trim_matches('"').to_ascii_lowercase()
}
803
/// Finds the position of `target` among column names, case-insensitively.
fn find_column_index(columns: &[String], target: &str) -> Option<usize> {
    for (index, column) in columns.iter().enumerate() {
        if column.eq_ignore_ascii_case(target) {
            return Some(index);
        }
    }
    None
}
809
810fn parse_json_titles(raw: &str) -> Vec<String> {
811 let parsed = match serde_json::from_str::<Value>(raw) {
812 Ok(value) => value,
813 Err(_) => return Vec::new(),
814 };
815
816 match parsed {
817 Value::Object(map) => map
818 .into_values()
819 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
820 .collect(),
821 Value::Array(items) => items
822 .into_iter()
823 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
824 .collect(),
825 Value::String(title) => vec![title],
826 _ => Vec::new(),
827 }
828}
829
/// Converts a URL slug such as `spy-x-family` into a space-separated title.
fn slug_to_title(slug: &str) -> String {
    let spaced: String = slug
        .chars()
        .map(|ch| if ch == '-' { ' ' } else { ch })
        .collect();
    spaced.trim().to_string()
}
833
/// AniList-compatible GraphQL search query used by the remote backend.
/// Fetches ids plus every title variant and synonyms so results can be
/// re-scored locally.
const REMOTE_GRAPHQL_QUERY: &str = r#"
query($search: String!, $perPage: Int!) {
  Page(page: 1, perPage: $perPage) {
    media(search: $search, type: ANIME) {
      id
      idMal
      title {
        romaji
        english
        native
        userPreferred
      }
      synonyms
    }
  }
}
"#;
851
/// Request body for the remote GraphQL endpoint.
#[derive(Debug, Serialize)]
struct RemoteGraphQlRequest<'a> {
    query: &'a str,
    variables: RemoteGraphQlVariables<'a>,
}

/// Variables for [`REMOTE_GRAPHQL_QUERY`].
#[derive(Debug, Serialize)]
struct RemoteGraphQlVariables<'a> {
    search: &'a str,
    #[serde(rename = "perPage")]
    per_page: i64,
}

/// Top-level GraphQL response envelope: either `data`, `errors`, or both.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlEnvelope {
    data: Option<RemoteGraphQlData>,
    errors: Option<Vec<RemoteGraphQlError>>,
}

/// `data` payload of the response.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlData {
    #[serde(rename = "Page")]
    page: Option<RemoteGraphQlPage>,
}

/// One page of media search results.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlPage {
    media: Vec<RemoteMedia>,
}

/// A single media record returned by the remote endpoint.
#[derive(Debug, Deserialize)]
struct RemoteMedia {
    id: u32,
    #[serde(rename = "idMal")]
    id_mal: Option<u32>,
    title: RemoteTitle,
    synonyms: Option<Vec<String>>,
}

/// Title variants of a media record; any subset may be present.
#[derive(Debug, Deserialize)]
struct RemoteTitle {
    romaji: Option<String>,
    english: Option<String>,
    native: Option<String>,
    #[serde(rename = "userPreferred")]
    user_preferred: Option<String>,
}

/// A single error object from the GraphQL `errors` array.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlError {
    message: String,
}
904
#[cfg(test)]
mod tests {
    use super::*;
    use flate2::Compression;
    use flate2::write::GzEncoder;
    use std::fs;
    use std::io::Write;
    use std::time::{SystemTime, UNIX_EPOCH};

    // End-to-end: plain-text dump → matcher → single best match with ids.
    #[test]
    fn matches_titles_from_plain_sql_dump() {
        let temp_dir = unique_temp_dir();
        let dump_path = temp_dir.join("latest.sql");
        fs::write(&dump_path, sample_dump()).unwrap();

        let matcher = TitleMatcher::from_kitsu_dump(&temp_dir).unwrap();
        let matched = matcher.match_title("spy x family").unwrap().unwrap();

        assert_eq!(matched.provider, MatchProvider::KitsuDump);
        assert_eq!(matched.canonical_title, "Spy x Family");
        assert_eq!(matched.ids.kitsu, Some(1));
        assert_eq!(matched.ids.anilist, Some(777));
        assert_eq!(matched.ids.mal, Some(12345));
        assert!(matched.score > 0.90);

        fs::remove_dir_all(temp_dir).unwrap();
    }

    // Same pipeline, but through the gzip decompression path.
    #[test]
    fn reads_gzipped_dump_files() {
        let temp_dir = unique_temp_dir();
        let dump_path = temp_dir.join("latest.sql.gz");
        write_gzip(&dump_path, sample_dump().as_bytes());

        let matcher = TitleMatcher::from_kitsu_dump(&temp_dir).unwrap();
        let matches = matcher.search_titles("frieren", 2).unwrap();

        assert_eq!(matches[0].canonical_title, "Sousou no Frieren");
        assert_eq!(matches[0].ids.anilist, Some(888));

        fs::remove_dir_all(temp_dir).unwrap();
    }

    // Unit checks for normalization and the blended similarity score.
    #[test]
    fn normalizes_titles_before_scoring() {
        assert_eq!(normalize_title("SPY×FAMILY"), "spyxfamily");
        assert!(similarity_score("spy x family", "spy family") > 0.75);
        assert!(similarity_score("jujutsu kaisen", "bleach") < 0.50);
    }

    // Object-form JSON payloads yield every string value.
    #[test]
    fn parses_json_title_values() {
        let titles = parse_json_titles(r#"{"en":"Frieren","en_jp":"Sousou no Frieren"}"#);
        assert_eq!(titles.len(), 2);
        assert!(titles.contains(&"Frieren".to_string()));
        assert!(titles.contains(&"Sousou no Frieren".to_string()));
    }

    // Creates a process- and time-unique temp directory for dump fixtures.
    fn unique_temp_dir() -> PathBuf {
        let unique = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let path = std::env::temp_dir().join(format!(
            "zantetsu-vecdb-test-{}-{}",
            std::process::id(),
            unique
        ));
        fs::create_dir_all(&path).unwrap();
        path
    }

    // Writes `bytes` to `path` gzip-compressed.
    fn write_gzip(path: &Path, bytes: &[u8]) {
        let file = File::create(path).unwrap();
        let mut encoder = GzEncoder::new(file, Compression::default());
        encoder.write_all(bytes).unwrap();
        encoder.finish().unwrap();
    }

    // Minimal dump with an anime COPY section and a mappings COPY section.
    fn sample_dump() -> String {
        [
            "-- sample kitsu dump",
            "COPY public.anime (id, slug, titles, canonical_title, abbreviated_titles) FROM stdin;",
            "1\tspy-x-family\t{\"en\":\"Spy x Family\",\"en_jp\":\"SPY×FAMILY\",\"ja_jp\":\"スパイファミリー\"}\tSpy x Family\t[\"Spy Family\"]",
            "2\tsousou-no-frieren\t{\"en\":\"Frieren: Beyond Journey's End\",\"en_jp\":\"Sousou no Frieren\"}\tSousou no Frieren\t[\"Frieren\"]",
            "\\.",
            "COPY public.mappings (item_id, item_type, external_site, external_id) FROM stdin;",
            "1\tAnime\tanilist/anime\t777",
            "1\tAnime\tmyanimelist/anime\t12345",
            "2\tAnime\tanilist/anime\t888",
            "\\.",
            "",
        ]
        .join("\n")
    }
}