use super::Index;
use crate::{ApiConfig, IIStatus, QueryConfig};
use crate::chunk::{self, ChunkSource};
use crate::error::Error;
use crate::index::{BuildConfig, ImageDescription, tfidf};
use crate::uid::{self, Uid};
use ragit_fs::{
basename,
exists,
file_name,
parent,
read_bytes,
read_string,
set_extension,
};
use ragit_pdl::JsonType;
use serde_json::Value;
use std::collections::{HashMap, HashSet};
impl Index {
pub fn check(&self) -> Result<(), Error> {
let mut images = HashMap::new();
let mut chunks_to_files = HashMap::with_capacity(self.chunk_count);
let mut processed_files = HashSet::with_capacity(self.processed_files.len());
let mut all_chunk_uids = HashSet::with_capacity(self.chunk_count);
let uids_to_files = self.processed_files.iter().map(|(file, uid)| (uid.to_string(), file.to_string())).collect::<HashMap<_, _>>();
let mut file_uid_checks = uids_to_files.keys().map(|uid| (uid.to_string(), false )).collect::<HashMap<_, _>>();
let mut chunk_count = 0;
for chunk_file in self.get_all_chunk_files()? {
let chunk_prefix = basename(&parent(&chunk_file)?)?;
let chunk_suffix = file_name(&chunk_file)?;
let chunk_uid = Uid::from_prefix_and_suffix(&chunk_prefix, &chunk_suffix)?;
let chunk = chunk::load_from_file(&chunk_file)?;
chunk_count += 1;
all_chunk_uids.insert(chunk_uid);
if chunk_uid != chunk.uid { return Err(Error::BrokenIndex(format!("Corrupted chunk: `{chunk_file}`'s uid is supposed to be `{chunk_uid}`, but is `{}`.", chunk.uid)));
}
match &chunk.source {
ChunkSource::File { path, index, page: _ } => {
chunks_to_files.insert(chunk_uid, (path.to_string(), *index));
processed_files.insert(path.to_string());
},
}
for image in chunk.images.iter() {
images.insert(*image, false );
}
let tfidf_file = set_extension(&chunk_file, "tfidf")?;
tfidf::load_from_file(&tfidf_file)?;
}
for tfidf_file in self.get_all_tfidf_files()? {
let chunk_file = set_extension(&tfidf_file, "chunk")?;
if !exists(&chunk_file) {
return Err(Error::BrokenIndex(format!("`{tfidf_file}` exists, but `{chunk_file}` doesn't.")));
}
}
for processed_file in processed_files.iter() {
if !self.processed_files.contains_key(processed_file) { return Err(Error::BrokenIndex(format!("There's a chunk of `{processed_file}`, but self.processed_files does not have its entry.")));
}
}
for file_index in self.get_all_file_indexes()? {
let uid_prefix = basename(&parent(&file_index)?)?;
let uid_suffix = file_name(&file_index)?;
let file_uid = format!("{uid_prefix}{uid_suffix}");
let file_name = match uids_to_files.get(&file_uid) {
Some(file_name) => file_name.to_string(),
None => { return Err(Error::BrokenIndex(format!("There's a file_index for `{file_uid}`, but self.processed_files does not have an entry with such hash value.")));
},
};
match file_uid_checks.get_mut(&file_uid) {
Some(exists) => { *exists = true; },
None => unreachable!(), }
for (index1, uid) in uid::load_from_file(&file_index)?.iter().enumerate() {
match chunks_to_files.get(uid) {
Some((file_name_from_chunk, index2)) => {
if &file_name != file_name_from_chunk { return Err(Error::BrokenIndex(format!("`{file_index}`'s file name is `{file_name}` and it has a chunk `{uid}`. But the chunk points to `{file_name_from_chunk}`.")));
}
if index1 != *index2 {
return Err(Error::BrokenIndex(format!("`{file_index}`'s {index1}th chunk uid is `{uid}`, but the chunk's index is {index2}.")));
}
},
None => { return Err(Error::BrokenIndex(format!("`{file_index}` has a chunk `{uid}`, but there's no such chunk in `.ragit/chunks`.")));
},
}
}
}
for (file_uid, exists) in file_uid_checks.iter() {
if !*exists { let file_name = uids_to_files.get(file_uid).unwrap();
return Err(Error::BrokenIndex(format!("`{file_name}` doesn't have an index.")));
}
}
if chunk_count != self.chunk_count { return Err(Error::BrokenIndex(format!("self.chunk_count is {}, but the actual number is {chunk_count}", self.chunk_count)));
}
for image_file in self.get_all_image_files()? {
let image_uid = Uid::from_prefix_and_suffix(
&file_name(&parent(&image_file)?)?,
&file_name(&image_file)?,
)?;
let image_description_path = set_extension(&image_file, "json")?;
match images.get_mut(&image_uid) {
Some(exists) => { *exists = true; },
None => {
},
}
let image_bytes = read_bytes(&image_file)?;
image::load_from_memory_with_format( &image_bytes,
image::ImageFormat::Png,
)?;
let image_description = read_string(&image_description_path)?;
if serde_json::from_str::<ImageDescription>(&image_description).is_err() {
return Err(Error::BrokenIndex(format!("`{image_file}` exists, but `{image_description_path}` does not exist.")));
}
}
for (image_file_hash, exists) in images.iter() {
if !*exists { return Err(Error::BrokenIndex(format!("`{image_file_hash}.png` not found.")));
}
}
serde_json::from_str::<BuildConfig>(
&read_string(&self.get_build_config_path()?)?,
)?;
serde_json::from_str::<QueryConfig>(
&read_string(&self.get_query_config_path()?)?,
)?;
serde_json::from_str::<ApiConfig>(
&read_string(&self.get_api_config_path()?)?,
)?;
let mut keys = HashSet::new();
for path in [
self.get_build_config_path()?,
self.get_api_config_path()?,
self.get_query_config_path()?,
] {
let j = read_string(&path)?;
let j = serde_json::from_str::<Value>(&j)?;
match j {
Value::Object(obj) => {
for (key, _) in obj.iter() {
if keys.contains(key) {
return Err(Error::BrokenIndex(format!("Key conflict in config file {path:?}: {key:?}")));
}
keys.insert(key.to_string());
}
},
_ => {
return Err(Error::JsonTypeError {
expected: JsonType::Object,
got: (&j).into(),
});
},
}
}
if self.ii_status == IIStatus::Complete {
self.check_ii()?;
}
if let Some(uid) = self.uid {
let c_uid = self.calculate_uid(true )?;
if uid != c_uid {
return Err(Error::BrokenIndex(format!("self.uid is {uid}, but the calculated uid is {c_uid}")));
}
}
Ok(())
}
}