1use crate::error::{MatchResult, MatcherError};
2use dirs::data_dir;
3use flate2::read::GzDecoder;
4use reqwest::blocking::Client;
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7use std::collections::{HashMap, HashSet};
8use std::fs::File;
9use std::io::{BufRead, BufReader};
10use std::path::{Path, PathBuf};
11use strsim::jaro_winkler;
12use tracing::debug;
13
/// Timeout, in seconds, configured on the HTTP client used for remote endpoint lookups.
const DEFAULT_TIMEOUT_SECS: u64 = 10;
15
/// Identifies which backend produced an [`AnimeTitleMatch`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum MatchProvider {
    /// The match was scored against entries loaded from a Kitsu SQL dump.
    KitsuDump,
    /// The match was scored against results returned by a remote GraphQL endpoint.
    RemoteEndpoint,
}
24
/// Identifiers for one anime across the catalogues this module knows about.
///
/// Every field is optional; the default value has all of them unset.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct AnimeIds {
    /// Kitsu id; filled from the `id` column when entries come from a Kitsu dump.
    pub kitsu: Option<u32>,
    /// AniList id; filled from dump mappings or from the remote media `id`.
    pub anilist: Option<u32>,
    /// MyAnimeList id; filled from dump mappings or from the remote `idMal`.
    pub mal: Option<u32>,
}
35
/// One scored candidate returned by [`TitleMatcher::search_titles`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct AnimeTitleMatch {
    /// Backend that produced this candidate.
    pub provider: MatchProvider,
    /// The entry's primary title.
    pub canonical_title: String,
    /// The specific title of the entry that scored best against the query.
    pub matched_title: String,
    /// Similarity between the query and `matched_title`, clamped to `0.0..=1.0`.
    pub score: f32,
    /// Known external identifiers for the entry.
    pub ids: AnimeIds,
    /// All de-duplicated titles known for the entry.
    pub titles: Vec<String>,
}
52
/// Describes where a [`TitleMatcher`] should get its catalogue from.
#[derive(Debug, Clone)]
pub enum MatchSource {
    /// Load entries from a Kitsu SQL dump (a file, or a directory containing one).
    KitsuDump {
        /// Path to the dump file or to the directory holding it.
        dump_path: PathBuf,
    },
    /// Query a remote GraphQL endpoint for every search.
    RemoteEndpoint {
        /// URL of the endpoint.
        endpoint: String,
    },
}

impl MatchSource {
    /// Builds a [`MatchSource::KitsuDump`] from anything convertible into a path.
    #[must_use]
    pub fn kitsu_dump(dump_path: impl Into<PathBuf>) -> Self {
        let dump_path: PathBuf = dump_path.into();
        Self::KitsuDump { dump_path }
    }

    /// Builds a [`MatchSource::RemoteEndpoint`] from anything convertible into a `String`.
    #[must_use]
    pub fn remote_endpoint(endpoint: impl Into<String>) -> Self {
        let endpoint: String = endpoint.into();
        Self::RemoteEndpoint { endpoint }
    }
}
85
86#[must_use]
88pub fn default_kitsu_dump_dir() -> PathBuf {
89 data_dir()
90 .unwrap_or_else(|| PathBuf::from("."))
91 .join("zantetsu")
92 .join("kitsu-dumps")
93}
94
/// Fuzzy title matcher backed by either a loaded Kitsu dump or a remote endpoint.
pub struct TitleMatcher {
    // Concrete backend selected when the matcher was constructed.
    backend: MatcherBackend,
}
99
/// Internal backend behind a [`TitleMatcher`].
enum MatcherBackend {
    /// Entries parsed from a Kitsu dump and held in memory.
    Kitsu(KitsuTitleMatcher),
    /// HTTP client plus endpoint URL queried on every search.
    RemoteEndpoint(RemoteTitleMatcher),
}
104
105impl TitleMatcher {
106 pub fn new(source: MatchSource) -> MatchResult<Self> {
108 let backend = match source {
109 MatchSource::KitsuDump { dump_path } => {
110 MatcherBackend::Kitsu(KitsuTitleMatcher::from_dump(dump_path)?)
111 }
112 MatchSource::RemoteEndpoint { endpoint } => {
113 MatcherBackend::RemoteEndpoint(RemoteTitleMatcher::new(endpoint)?)
114 }
115 };
116
117 Ok(Self { backend })
118 }
119
120 pub fn from_kitsu_dump(dump_path: impl Into<PathBuf>) -> MatchResult<Self> {
122 Self::new(MatchSource::kitsu_dump(dump_path))
123 }
124
125 pub fn from_remote_endpoint(endpoint: impl Into<String>) -> MatchResult<Self> {
127 Self::new(MatchSource::remote_endpoint(endpoint))
128 }
129
130 pub fn match_title(&self, title: &str) -> MatchResult<Option<AnimeTitleMatch>> {
132 Ok(self.search_titles(title, 1)?.into_iter().next())
133 }
134
135 pub fn search_titles(&self, title: &str, limit: usize) -> MatchResult<Vec<AnimeTitleMatch>> {
137 let query = normalize_title(title);
138 if query.is_empty() {
139 return Err(MatcherError::EmptyQuery);
140 }
141
142 let limit = limit.max(1);
143 match &self.backend {
144 MatcherBackend::Kitsu(matcher) => Ok(score_entries(
145 MatchProvider::KitsuDump,
146 &matcher.entries,
147 title,
148 &query,
149 limit,
150 )),
151 MatcherBackend::RemoteEndpoint(matcher) => matcher.search_titles(title, &query, limit),
152 }
153 }
154}
155
156struct KitsuTitleMatcher {
157 entries: Vec<CatalogEntry>,
158}
159
160impl KitsuTitleMatcher {
161 fn from_dump(dump_path: PathBuf) -> MatchResult<Self> {
162 let resolved = resolve_dump_path(&dump_path)?;
163 debug!("loading Kitsu dump from {}", resolved.display());
164 let reader = open_dump_reader(&resolved)?;
165 let entries = parse_kitsu_dump(reader)?;
166 Ok(Self { entries })
167 }
168}
169
170struct RemoteTitleMatcher {
171 client: Client,
172 endpoint: String,
173}
174
175impl RemoteTitleMatcher {
176 fn new(endpoint: String) -> MatchResult<Self> {
177 let endpoint = endpoint.trim().to_string();
178 if endpoint.is_empty() {
179 return Err(MatcherError::InvalidResponse(
180 "remote endpoint URL cannot be empty".into(),
181 ));
182 }
183
184 let client = Client::builder()
185 .timeout(std::time::Duration::from_secs(DEFAULT_TIMEOUT_SECS))
186 .user_agent(format!("zantetsu/{}", env!("CARGO_PKG_VERSION")))
187 .build()?;
188 Ok(Self { client, endpoint })
189 }
190
191 fn search_titles(
192 &self,
193 raw_query: &str,
194 normalized_query: &str,
195 limit: usize,
196 ) -> MatchResult<Vec<AnimeTitleMatch>> {
197 let payload = RemoteGraphQlRequest {
198 query: REMOTE_GRAPHQL_QUERY,
199 variables: RemoteGraphQlVariables {
200 search: raw_query,
201 per_page: limit.max(5) as i64,
202 },
203 };
204
205 let response = self
206 .client
207 .post(&self.endpoint)
208 .json(&payload)
209 .send()?
210 .error_for_status()?;
211
212 let envelope: RemoteGraphQlEnvelope = response.json()?;
213 if let Some(errors) = envelope.errors {
214 let message = errors
215 .into_iter()
216 .map(|error| error.message)
217 .collect::<Vec<_>>()
218 .join("; ");
219 return Err(MatcherError::GraphQl(message));
220 }
221
222 let media = envelope
223 .data
224 .ok_or_else(|| MatcherError::InvalidResponse("missing data".into()))?
225 .page
226 .ok_or_else(|| MatcherError::InvalidResponse("missing page".into()))?
227 .media;
228
229 let entries = media
230 .into_iter()
231 .map(CatalogEntry::from_remote_media)
232 .collect::<Vec<_>>();
233
234 Ok(score_entries(
235 MatchProvider::RemoteEndpoint,
236 &entries,
237 raw_query,
238 normalized_query,
239 limit,
240 ))
241 }
242}
243
244#[derive(Debug, Clone)]
245struct CatalogEntry {
246 canonical_title: String,
247 titles: Vec<String>,
248 normalized_titles: Vec<String>,
249 ids: AnimeIds,
250}
251
252impl CatalogEntry {
253 fn new(canonical_title: String, titles: Vec<String>, ids: AnimeIds) -> Option<Self> {
254 let titles = dedupe_titles(&canonical_title, titles);
255 if titles.is_empty() {
256 return None;
257 }
258
259 let canonical_title = titles
260 .first()
261 .cloned()
262 .unwrap_or_else(|| canonical_title.trim().to_string());
263
264 let normalized_titles = titles.iter().map(|title| normalize_title(title)).collect();
265 Some(Self {
266 canonical_title,
267 titles,
268 normalized_titles,
269 ids,
270 })
271 }
272
273 fn from_remote_media(media: RemoteMedia) -> Self {
274 let mut titles = Vec::new();
275 if let Some(title) = media.title.user_preferred.clone() {
276 titles.push(title);
277 }
278 if let Some(title) = media.title.romaji.clone() {
279 titles.push(title);
280 }
281 if let Some(title) = media.title.english.clone() {
282 titles.push(title);
283 }
284 if let Some(title) = media.title.native.clone() {
285 titles.push(title);
286 }
287 titles.extend(media.synonyms.unwrap_or_default());
288
289 let canonical_title = media
290 .title
291 .user_preferred
292 .clone()
293 .or(media.title.romaji.clone())
294 .or(media.title.english.clone())
295 .or(media.title.native.clone())
296 .unwrap_or_else(|| format!("Remote {}", media.id));
297
298 Self::new(
299 canonical_title,
300 titles,
301 AnimeIds {
302 kitsu: None,
303 anilist: Some(media.id),
304 mal: media.id_mal,
305 },
306 )
307 .expect("remote media should always produce at least one title")
308 }
309}
310
/// Title data accumulated for one anime id while reading the dump's `anime` rows.
#[derive(Debug, Default)]
struct RawAnimeRecord {
    // Value of the `canonical_title` column, when present and non-NULL.
    canonical_title: Option<String>,
    // Titles collected from the `titles` and `abbreviated_titles` columns.
    titles: Vec<String>,
    // Value of the `slug` column, when present and non-NULL.
    slug: Option<String>,
}
317
/// Which `COPY` data section of the dump the parser is currently inside.
#[derive(Debug, Clone)]
enum DumpSection {
    /// Outside any section of interest; lines are ignored.
    None,
    /// Inside the `anime` table data, with the resolved column positions.
    Anime(AnimeColumns),
    /// Inside the `mappings` table data, with the resolved column positions.
    Mappings(MappingColumns),
}
324
325#[derive(Debug, Clone)]
326struct AnimeColumns {
327 id: usize,
328 canonical_title: Option<usize>,
329 titles: Option<usize>,
330 abbreviated_titles: Option<usize>,
331 slug: Option<usize>,
332}
333
334impl AnimeColumns {
335 fn try_from_columns(columns: &[String]) -> MatchResult<Self> {
336 let id = find_column_index(columns, "id")
337 .ok_or_else(|| MatcherError::InvalidDump("anime COPY is missing id column".into()))?;
338 Ok(Self {
339 id,
340 canonical_title: find_column_index(columns, "canonical_title"),
341 titles: find_column_index(columns, "titles"),
342 abbreviated_titles: find_column_index(columns, "abbreviated_titles"),
343 slug: find_column_index(columns, "slug"),
344 })
345 }
346}
347
348#[derive(Debug, Clone)]
349struct MappingColumns {
350 item_id: usize,
351 item_type: Option<usize>,
352 external_site: usize,
353 external_id: usize,
354}
355
356impl MappingColumns {
357 fn try_from_columns(columns: &[String]) -> MatchResult<Self> {
358 let item_id = find_column_index(columns, "item_id").ok_or_else(|| {
359 MatcherError::InvalidDump("mappings COPY is missing item_id column".into())
360 })?;
361 let external_site = find_column_index(columns, "external_site").ok_or_else(|| {
362 MatcherError::InvalidDump("mappings COPY is missing external_site column".into())
363 })?;
364 let external_id = find_column_index(columns, "external_id").ok_or_else(|| {
365 MatcherError::InvalidDump("mappings COPY is missing external_id column".into())
366 })?;
367
368 Ok(Self {
369 item_id,
370 item_type: find_column_index(columns, "item_type"),
371 external_site,
372 external_id,
373 })
374 }
375}
376
377fn score_entries(
378 provider: MatchProvider,
379 entries: &[CatalogEntry],
380 raw_query: &str,
381 normalized_query: &str,
382 limit: usize,
383) -> Vec<AnimeTitleMatch> {
384 let mut scored = entries
385 .iter()
386 .filter_map(|entry| score_entry(provider, entry, raw_query, normalized_query))
387 .collect::<Vec<_>>();
388
389 scored.sort_by(|left, right| {
390 right
391 .score
392 .partial_cmp(&left.score)
393 .unwrap_or(std::cmp::Ordering::Equal)
394 .then_with(|| left.canonical_title.cmp(&right.canonical_title))
395 });
396 scored.truncate(limit);
397 scored
398}
399
400fn score_entry(
401 provider: MatchProvider,
402 entry: &CatalogEntry,
403 _raw_query: &str,
404 normalized_query: &str,
405) -> Option<AnimeTitleMatch> {
406 let mut best_title = None;
407 let mut best_score = 0.0_f32;
408
409 for (index, normalized_title) in entry.normalized_titles.iter().enumerate() {
410 let score = similarity_score(normalized_query, normalized_title);
411 if score > best_score {
412 best_score = score;
413 best_title = entry.titles.get(index).cloned();
414 }
415 }
416
417 best_title.map(|matched_title| AnimeTitleMatch {
418 provider,
419 canonical_title: entry.canonical_title.clone(),
420 matched_title,
421 score: best_score,
422 ids: entry.ids.clone(),
423 titles: entry.titles.clone(),
424 })
425}
426
427fn similarity_score(query: &str, candidate: &str) -> f32 {
428 if query.is_empty() || candidate.is_empty() {
429 return 0.0;
430 }
431 if query == candidate {
432 return 1.0;
433 }
434
435 let jaro = jaro_winkler(query, candidate) as f32;
436 let token_overlap = token_overlap_score(query, candidate);
437 let contains = if query.contains(candidate) || candidate.contains(query) {
438 1.0
439 } else {
440 0.0
441 };
442 let prefix = if query.starts_with(candidate) || candidate.starts_with(query) {
443 1.0
444 } else {
445 0.0
446 };
447 let len_ratio = query.len().min(candidate.len()) as f32 / query.len().max(candidate.len()) as f32;
448
449 let mut score =
450 0.55 * jaro + 0.20 * token_overlap + 0.15 * contains + 0.10 * len_ratio + 0.05 * prefix;
451 if token_overlap == 0.0 && contains == 0.0 && jaro < 0.90 {
452 score *= 0.75;
453 }
454
455 score.clamp(0.0, 1.0)
456}
457
/// Returns the overlap between the whitespace-separated token sets of two strings.
///
/// The value is `2 * shared / (|left tokens| + |right tokens|)`, computed over
/// distinct tokens, and is `0.0` when either side has no tokens.
fn token_overlap_score(left: &str, right: &str) -> f32 {
    let lhs: HashSet<&str> = left.split_whitespace().collect();
    let rhs: HashSet<&str> = right.split_whitespace().collect();
    if lhs.is_empty() || rhs.is_empty() {
        return 0.0;
    }

    let common = lhs.iter().filter(|token| rhs.contains(*token)).count() as f32;
    let total = lhs.len() as f32 + rhs.len() as f32;
    (2.0 * common) / total
}
468
/// Normalizes a title into the lowercase, single-spaced key used for matching.
///
/// - The multiplication signs `×`, `✕` and `✖` become the letter `x`.
/// - Alphanumeric characters are kept and lowercased with full Unicode case
///   mapping, so `POKÉMON` and `pokémon` produce the same key.
/// - `&`, whitespace and the separators `- _ . : / \` collapse into a single
///   space.
/// - Every other character is dropped without inserting a space.
///
/// The result has no leading or trailing space and may be empty.
fn normalize_title(title: &str) -> String {
    let mut normalized = String::with_capacity(title.len());
    // Starts `true` so leading separators never emit a space.
    let mut last_was_space = true;

    for ch in title.chars() {
        match ch {
            '×' | '✕' | '✖' => {
                normalized.push('x');
                last_was_space = false;
            }
            _ if ch.is_alphanumeric() => {
                // Lowercasing can expand to several chars (e.g. a base letter
                // plus a combining mark); keep only the alphanumeric ones.
                normalized.extend(ch.to_lowercase().filter(|lower| lower.is_alphanumeric()));
                last_was_space = false;
            }
            _ if ch == '&'
                || ch.is_whitespace()
                || matches!(ch, '-' | '_' | '.' | ':' | '/' | '\\') =>
            {
                if !last_was_space {
                    normalized.push(' ');
                    last_was_space = true;
                }
            }
            _ => {}
        }
    }

    // At most one trailing separator space can remain.
    if normalized.ends_with(' ') {
        normalized.pop();
    }
    normalized
}
493
494fn dedupe_titles(canonical_title: &str, titles: Vec<String>) -> Vec<String> {
495 let mut deduped = Vec::new();
496 let mut seen = HashSet::new();
497
498 let mut push_title = |title: String| {
499 let cleaned = title.trim();
500 if cleaned.is_empty() {
501 return;
502 }
503 let key = normalize_title(cleaned);
504 if key.is_empty() || !seen.insert(key) {
505 return;
506 }
507 deduped.push(cleaned.to_string());
508 };
509
510 push_title(canonical_title.to_string());
511 for title in titles {
512 push_title(title);
513 }
514
515 deduped
516}
517
518fn resolve_dump_path(path: &Path) -> MatchResult<PathBuf> {
519 if path.is_file() {
520 return Ok(path.to_path_buf());
521 }
522
523 if path.is_dir() {
524 let sql = path.join("latest.sql");
525 if sql.is_file() {
526 return Ok(sql);
527 }
528
529 let gzip = path.join("latest.sql.gz");
530 if gzip.is_file() {
531 return Ok(gzip);
532 }
533
534 return Err(MatcherError::InvalidDumpPath(format!(
535 "directory {} does not contain latest.sql or latest.sql.gz",
536 path.display()
537 )));
538 }
539
540 Err(MatcherError::InvalidDumpPath(format!(
541 "{} does not exist",
542 path.display()
543 )))
544}
545
546fn open_dump_reader(path: &Path) -> MatchResult<Box<dyn BufRead>> {
547 let file = File::open(path)?;
548 if path
549 .extension()
550 .and_then(|ext| ext.to_str())
551 .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
552 {
553 return Ok(Box::new(BufReader::new(GzDecoder::new(file))));
554 }
555
556 Ok(Box::new(BufReader::new(file)))
557}
558
559fn parse_kitsu_dump<R: BufRead>(reader: R) -> MatchResult<Vec<CatalogEntry>> {
560 let mut anime_by_id = HashMap::<u32, RawAnimeRecord>::new();
561 let mut ids_by_anime = HashMap::<u32, AnimeIds>::new();
562 let mut section = DumpSection::None;
563
564 for line in reader.lines() {
565 let line = line?;
566 if let Some((table, columns)) = parse_copy_statement(&line) {
567 section = match table.as_str() {
568 "anime" => DumpSection::Anime(AnimeColumns::try_from_columns(&columns)?),
569 "mappings" => DumpSection::Mappings(MappingColumns::try_from_columns(&columns)?),
570 _ => DumpSection::None,
571 };
572 continue;
573 }
574
575 if line == "\\." {
576 section = DumpSection::None;
577 continue;
578 }
579
580 match §ion {
581 DumpSection::Anime(columns) => parse_anime_row(&line, columns, &mut anime_by_id)?,
582 DumpSection::Mappings(columns) => {
583 parse_mapping_row(&line, columns, &mut ids_by_anime)?;
584 }
585 DumpSection::None => {}
586 }
587 }
588
589 if anime_by_id.is_empty() {
590 return Err(MatcherError::InvalidDump(
591 "anime COPY section was not found".into(),
592 ));
593 }
594
595 let mut entries = anime_by_id
596 .into_iter()
597 .filter_map(|(kitsu_id, record)| {
598 let fallback_slug = record.slug.as_deref().map(slug_to_title);
599 let canonical_title = record
600 .canonical_title
601 .clone()
602 .or_else(|| record.titles.first().cloned())
603 .or(fallback_slug)?;
604
605 let mut titles = record.titles;
606 if let Some(slug) = record.slug {
607 titles.push(slug_to_title(&slug));
608 }
609
610 let mut ids = ids_by_anime.remove(&kitsu_id).unwrap_or_default();
611 ids.kitsu = Some(kitsu_id);
612 CatalogEntry::new(canonical_title, titles, ids)
613 })
614 .collect::<Vec<_>>();
615
616 entries.sort_by(|left, right| left.canonical_title.cmp(&right.canonical_title));
617 Ok(entries)
618}
619
620fn parse_copy_statement(line: &str) -> Option<(String, Vec<String>)> {
621 let rest = line.strip_prefix("COPY ")?;
622 let rest = rest.strip_suffix(" FROM stdin;")?;
623 let open = rest.find('(')?;
624 let close = rest.rfind(')')?;
625 if close <= open {
626 return None;
627 }
628
629 let table = normalize_identifier(&rest[..open]);
630 let columns = rest[open + 1..close]
631 .split(',')
632 .map(normalize_identifier)
633 .collect::<Vec<_>>();
634 Some((table, columns))
635}
636
637fn parse_anime_row(
638 line: &str,
639 columns: &AnimeColumns,
640 anime_by_id: &mut HashMap<u32, RawAnimeRecord>,
641) -> MatchResult<()> {
642 let fields = split_copy_row(line);
643 let id = field_value(&fields, columns.id)
644 .ok_or_else(|| MatcherError::InvalidDump("anime row is missing id".into()))?
645 .parse::<u32>()
646 .map_err(|_| MatcherError::InvalidDump("anime id is not numeric".into()))?;
647
648 let record = anime_by_id.entry(id).or_default();
649
650 if let Some(index) = columns.canonical_title {
651 if let Some(value) = field_value(&fields, index) {
652 record.canonical_title = Some(value.to_string());
653 }
654 }
655
656 if let Some(index) = columns.titles {
657 if let Some(value) = field_value(&fields, index) {
658 record.titles.extend(parse_json_titles(value));
659 }
660 }
661
662 if let Some(index) = columns.abbreviated_titles {
663 if let Some(value) = field_value(&fields, index) {
664 record.titles.extend(parse_json_titles(value));
665 }
666 }
667
668 if let Some(index) = columns.slug {
669 if let Some(value) = field_value(&fields, index) {
670 record.slug = Some(value.to_string());
671 }
672 }
673
674 Ok(())
675}
676
677fn parse_mapping_row(
678 line: &str,
679 columns: &MappingColumns,
680 ids_by_anime: &mut HashMap<u32, AnimeIds>,
681) -> MatchResult<()> {
682 let fields = split_copy_row(line);
683 if let Some(index) = columns.item_type {
684 if !field_value(&fields, index)
685 .is_some_and(|value| value.eq_ignore_ascii_case("anime"))
686 {
687 return Ok(());
688 }
689 }
690
691 let item_id = match field_value(&fields, columns.item_id)
692 .and_then(|value| value.parse::<u32>().ok())
693 {
694 Some(id) => id,
695 None => return Ok(()),
696 };
697
698 let external_site = match field_value(&fields, columns.external_site) {
699 Some(site) => site.to_ascii_lowercase(),
700 None => return Ok(()),
701 };
702 let external_id = match field_value(&fields, columns.external_id)
703 .and_then(|value| value.parse::<u32>().ok())
704 {
705 Some(id) => id,
706 None => return Ok(()),
707 };
708
709 let ids = ids_by_anime.entry(item_id).or_default();
710 if external_site.contains("anilist") {
711 ids.anilist = Some(external_id);
712 } else if external_site.contains("myanimelist") || external_site == "mal" {
713 ids.mal = Some(external_id);
714 }
715
716 Ok(())
717}
718
719fn split_copy_row(line: &str) -> Vec<Option<String>> {
720 line.split('\t').map(parse_copy_field).collect()
721}
722
/// Decodes one field of a PostgreSQL text-format `COPY` row.
///
/// Returns `None` for the NULL marker `\N`. Otherwise the backslash escapes
/// `\b`, `\f`, `\n`, `\r`, `\t`, `\v` and `\\` are decoded, a backslash before
/// any other character yields that character, and a lone trailing backslash
/// is kept literally. Octal and hex escapes are not interpreted.
fn parse_copy_field(field: &str) -> Option<String> {
    if field == "\\N" {
        return None;
    }
    // Most fields contain no escapes at all; skip the per-character pass.
    if !field.contains('\\') {
        return Some(field.to_owned());
    }

    let mut output = String::with_capacity(field.len());
    let mut chars = field.chars();
    while let Some(ch) = chars.next() {
        if ch != '\\' {
            output.push(ch);
            continue;
        }

        let decoded = match chars.next() {
            Some('b') => '\u{0008}',
            Some('f') => '\u{000C}',
            Some('n') => '\n',
            Some('r') => '\r',
            Some('t') => '\t',
            // Vertical tab is one of the sequences COPY emits.
            Some('v') => '\u{000B}',
            Some(other) => other,
            None => '\\',
        };
        output.push(decoded);
    }

    Some(output)
}
749
/// Returns the field at `index` as a string slice.
///
/// Yields `None` when the index is out of range or the field is NULL.
fn field_value(fields: &[Option<String>], index: usize) -> Option<&str> {
    fields.get(index)?.as_deref()
}
753
/// Normalizes an SQL identifier taken from a `COPY` statement.
///
/// Keeps only the part after the last `.`, trims surrounding whitespace,
/// strips surrounding double quotes, and lowercases ASCII letters.
fn normalize_identifier(identifier: &str) -> String {
    let unqualified = match identifier.rsplit_once('.') {
        Some((_schema, name)) => name,
        None => identifier,
    };
    let unquoted = unqualified.trim().trim_matches('"');
    unquoted.to_ascii_lowercase()
}
763
/// Returns the position of the first column named `target`, ignoring ASCII case.
fn find_column_index(columns: &[String], target: &str) -> Option<usize> {
    for (index, column) in columns.iter().enumerate() {
        if column.eq_ignore_ascii_case(target) {
            return Some(index);
        }
    }
    None
}
769
770fn parse_json_titles(raw: &str) -> Vec<String> {
771 let parsed = match serde_json::from_str::<Value>(raw) {
772 Ok(value) => value,
773 Err(_) => return Vec::new(),
774 };
775
776 match parsed {
777 Value::Object(map) => map
778 .into_values()
779 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
780 .collect(),
781 Value::Array(items) => items
782 .into_iter()
783 .filter_map(|value| value.as_str().map(ToOwned::to_owned))
784 .collect(),
785 Value::String(title) => vec![title],
786 _ => Vec::new(),
787 }
788}
789
/// Turns a URL slug into a title by replacing each `-` with a space and trimming the ends.
fn slug_to_title(slug: &str) -> String {
    let spaced: String = slug
        .chars()
        .map(|ch| if ch == '-' { ' ' } else { ch })
        .collect();
    spaced.trim().to_owned()
}
793
/// GraphQL document posted to the remote endpoint.
///
/// Takes `$search` and `$perPage`, asks for the first page of `ANIME` media
/// matching the search, and selects the ids, the four title variants and the
/// synonyms that `RemoteMedia` deserializes.
const REMOTE_GRAPHQL_QUERY: &str = r#"
query($search: String!, $perPage: Int!) {
 Page(page: 1, perPage: $perPage) {
 media(search: $search, type: ANIME) {
 id
 idMal
 title {
 romaji
 english
 native
 userPreferred
 }
 synonyms
 }
 }
}
"#;
811
/// JSON body of the GraphQL request: the query document plus its variables.
#[derive(Debug, Serialize)]
struct RemoteGraphQlRequest<'a> {
    query: &'a str,
    variables: RemoteGraphQlVariables<'a>,
}
817
/// Variables supplied with the GraphQL query.
#[derive(Debug, Serialize)]
struct RemoteGraphQlVariables<'a> {
    // Search text, sent exactly as the caller provided it.
    search: &'a str,
    // Serialized under the query's variable name `perPage`.
    #[serde(rename = "perPage")]
    per_page: i64,
}
824
/// Top-level GraphQL response: an optional payload and an optional error list.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlEnvelope {
    data: Option<RemoteGraphQlData>,
    errors: Option<Vec<RemoteGraphQlError>>,
}
830
/// The `data` object of the response.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlData {
    // Read from the response key `Page` (capitalized, as selected in the query).
    #[serde(rename = "Page")]
    page: Option<RemoteGraphQlPage>,
}
836
/// The `Page` object of the response, holding the matched media records.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlPage {
    media: Vec<RemoteMedia>,
}
841
/// One media record returned by the remote endpoint.
#[derive(Debug, Deserialize)]
struct RemoteMedia {
    // Stored as the AniList id of the resulting catalogue entry.
    id: u32,
    // Stored as the MyAnimeList id of the resulting catalogue entry.
    #[serde(rename = "idMal")]
    id_mal: Option<u32>,
    title: RemoteTitle,
    // Additional titles; a missing or null list is treated as empty.
    synonyms: Option<Vec<String>>,
}
850
/// Title variants of a remote media record; each may be absent.
#[derive(Debug, Deserialize)]
struct RemoteTitle {
    romaji: Option<String>,
    english: Option<String>,
    native: Option<String>,
    // First choice for the canonical title when present.
    #[serde(rename = "userPreferred")]
    user_preferred: Option<String>,
}
859
/// One entry of the response's `errors` list; only the message is read.
#[derive(Debug, Deserialize)]
struct RemoteGraphQlError {
    message: String,
}
864
// Unit tests covering dump loading (plain and gzipped), title normalization,
// similarity scoring and JSON title parsing.
#[cfg(test)]
mod tests {
    use super::*;
    use flate2::Compression;
    use flate2::write::GzEncoder;
    use std::fs;
    use std::io::Write;
    use std::time::{SystemTime, UNIX_EPOCH};

    // Loads `latest.sql` from a directory and checks the best match and its ids.
    #[test]
    fn matches_titles_from_plain_sql_dump() {
        let temp_dir = unique_temp_dir();
        let dump_path = temp_dir.join("latest.sql");
        fs::write(&dump_path, sample_dump()).unwrap();

        let matcher = TitleMatcher::from_kitsu_dump(&temp_dir).unwrap();
        let matched = matcher.match_title("spy x family").unwrap().unwrap();

        assert_eq!(matched.provider, MatchProvider::KitsuDump);
        assert_eq!(matched.canonical_title, "Spy x Family");
        assert_eq!(matched.ids.kitsu, Some(1));
        assert_eq!(matched.ids.anilist, Some(777));
        assert_eq!(matched.ids.mal, Some(12345));
        assert!(matched.score > 0.90);

        fs::remove_dir_all(temp_dir).unwrap();
    }

    // Loads `latest.sql.gz` from a directory and checks the top search result.
    #[test]
    fn reads_gzipped_dump_files() {
        let temp_dir = unique_temp_dir();
        let dump_path = temp_dir.join("latest.sql.gz");
        write_gzip(&dump_path, sample_dump().as_bytes());

        let matcher = TitleMatcher::from_kitsu_dump(&temp_dir).unwrap();
        let matches = matcher.search_titles("frieren", 2).unwrap();

        assert_eq!(matches[0].canonical_title, "Sousou no Frieren");
        assert_eq!(matches[0].ids.anilist, Some(888));

        fs::remove_dir_all(temp_dir).unwrap();
    }

    // Checks the `×` mapping and rough score thresholds for near and far titles.
    #[test]
    fn normalizes_titles_before_scoring() {
        assert_eq!(normalize_title("SPY×FAMILY"), "spyxfamily");
        assert!(similarity_score("spy x family", "spy family") > 0.75);
        assert!(similarity_score("jujutsu kaisen", "bleach") < 0.50);
    }

    // A JSON object contributes all of its string values.
    #[test]
    fn parses_json_title_values() {
        let titles = parse_json_titles(r#"{"en":"Frieren","en_jp":"Sousou no Frieren"}"#);
        assert_eq!(titles.len(), 2);
        assert!(titles.contains(&"Frieren".to_string()));
        assert!(titles.contains(&"Sousou no Frieren".to_string()));
    }

    // Creates a fresh directory under the system temp dir, named from the
    // process id and the current time in nanoseconds.
    fn unique_temp_dir() -> PathBuf {
        let unique = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let path = std::env::temp_dir().join(format!(
            "zantetsu-vecdb-test-{}-{}",
            std::process::id(),
            unique
        ));
        fs::create_dir_all(&path).unwrap();
        path
    }

    // Writes `bytes` to `path` as a gzip stream.
    fn write_gzip(path: &Path, bytes: &[u8]) {
        let file = File::create(path).unwrap();
        let mut encoder = GzEncoder::new(file, Compression::default());
        encoder.write_all(bytes).unwrap();
        encoder.finish().unwrap();
    }

    // Minimal dump with two anime rows and three mapping rows.
    fn sample_dump() -> String {
        [
            "-- sample kitsu dump",
            "COPY public.anime (id, slug, titles, canonical_title, abbreviated_titles) FROM stdin;",
            "1\tspy-x-family\t{\"en\":\"Spy x Family\",\"en_jp\":\"SPY×FAMILY\",\"ja_jp\":\"スパイファミリー\"}\tSpy x Family\t[\"Spy Family\"]",
            "2\tsousou-no-frieren\t{\"en\":\"Frieren: Beyond Journey's End\",\"en_jp\":\"Sousou no Frieren\"}\tSousou no Frieren\t[\"Frieren\"]",
            "\\.",
            "COPY public.mappings (item_id, item_type, external_site, external_id) FROM stdin;",
            "1\tAnime\tanilist/anime\t777",
            "1\tAnime\tmyanimelist/anime\t12345",
            "2\tAnime\tanilist/anime\t888",
            "\\.",
            "",
        ]
        .join("\n")
    }
}