use crate::chunk::Chunk;
use crate::constant::{CHUNK_DIR_NAME, IMAGE_DIR_NAME, INDEX_DIR_NAME};
use crate::error::Error;
use crate::index::Index;
use crate::query::QueryTurn;
use ragit_fs::{
WriteMode,
extension,
file_name,
file_size,
get_relative_path,
is_dir,
join3,
read_bytes,
read_bytes_offset,
read_dir,
write_bytes,
write_string,
};
use serde::{Deserialize, Serialize};
use sha3::{Digest, Sha3_256};
use std::fmt;
use std::str::FromStr;
mod query;
#[cfg(test)]
mod tests;
pub use query::{UidQueryConfig, UidQueryResult};
/// A 256-bit identifier, stored as two 128-bit halves and rendered as a
/// 64-character lowercase hex string (see the `Display`/`FromStr` impls).
///
/// Values come from sha3-256 hashes (see the `Uid::new_*` constructors),
/// except that the lower 64 bits of `low` are overwritten with metadata:
/// a small type tag at bit 32 (`Uid::CHUNK_TYPE` etc.) and a 32-bit
/// data-size field in bits 0..32 (`Uid::get_data_size`).
#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, Ord, PartialEq, PartialOrd, Serialize)]
pub struct Uid {
    // most-significant 128 bits of the underlying hash
    pub(crate) high: u128,
    // least-significant 128 bits; the lower 64 bits are metadata, not hash bits
    pub(crate) low: u128,
}
/// The kind of object a `Uid` identifies, recovered from the type tag in the
/// uid's metadata bits by `Uid::get_uid_type`.
///
/// NOTE(review): `Uid::new_query_turn` tags uids with a distinct
/// `QUERY_TURN_TYPE`, but there is no matching variant here, so those uids
/// make `get_uid_type` return an error — confirm whether that is intended.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum UidType {
    Chunk,
    Image,
    File,
    KnowledgeBase,
    Summary,
    Group,
}
/// On-disk format used by `save_to_file` (and auto-detected by
/// `load_from_file` from the file's first byte).
#[derive(Clone, Copy, Debug)]
pub enum UidWriteMode {
    /// Newline-separated 64-character hex strings; human-readable.
    Naive,
    /// Delta-compressed binary: uids are sorted, then the first uid and the
    /// differences between consecutive uids are written as fixed-width
    /// big-endian records. `save_to_file` falls back to the naive format
    /// when there are fewer than two uids.
    Compact,
}
pub fn load_from_file(path: &str) -> Result<Vec<Uid>, Error> {
let bytes = read_bytes(path)?;
match bytes.get(0) {
Some((b'a'..=b'f') | (b'0'..=b'9')) => match String::from_utf8(bytes) {
Ok(s) => {
let mut result = vec![];
for line in s.lines() {
result.push(line.parse::<Uid>()?);
}
Ok(result)
},
Err(_) => Err(Error::CorruptedFile { path: path.to_string(), message: Some(String::from("the file is not a valid utf-8")) }),
},
Some(192..=224) => {
let byte_len = (bytes[0] & 0b0011_1111) as usize;
let mut result = vec![];
let mut curr_uid = Uid::decode_partial(&bytes[1..(byte_len + 1)])?;
result.push(curr_uid);
let mut cursor = byte_len + 1;
loop {
let d = Uid::decode_partial(&bytes[cursor..(cursor + byte_len)])?;
curr_uid = curr_uid + d;
result.push(curr_uid);
cursor += byte_len;
if cursor + byte_len > bytes.len() {
break;
}
}
if cursor != bytes.len() {
Err(Error::CorruptedFile { path: path.to_string(), message: Some(format!("the file is {} bytes, but a byte_len is {byte_len}", bytes.len())) })
}
else {
Ok(result)
}
},
Some(b) => Err(Error::CorruptedFile { path: path.to_string(), message: Some(format!("unexpected uid prefix: `{b}`")) }),
None => Ok(vec![]),
}
}
/// Writes `uids` to `path` in the requested format (see `UidWriteMode`).
///
/// Compact mode sorts the uids and stores the first uid plus fixed-width
/// deltas between neighbors, so the on-disk order is ascending regardless of
/// input order. With fewer than two uids it falls back to the naive,
/// newline-separated hex format. Both modes write atomically.
///
/// # Errors
///
/// Propagates filesystem errors from the underlying write.
pub fn save_to_file(
    path: &str,
    uids: &[Uid],
    write_mode: UidWriteMode,
) -> Result<(), Error> {
    match write_mode {
        UidWriteMode::Compact if uids.len() > 1 => {
            let mut uids = uids.to_vec();
            uids.sort();
            let mut diffs = Vec::with_capacity(uids.len() - 1);
            let mut max_byte_len = uids[0].byte_len();

            for i in 1..uids.len() {
                // unwrap is safe: `uids` is sorted, so `uids[i] >= uids[i - 1]`
                let diff = uids[i].checked_sub(uids[i - 1]).unwrap();
                max_byte_len = max_byte_len.max(diff.byte_len());
                diffs.push(diff);
            }

            // `Uid::byte_len` returns 0 for the all-zero uid; a record width
            // of 0 would emit a header-only file that carries no data and
            // that a reader cannot advance through, so clamp to 1. The
            // header stays in the reader's accepted `192..=224` range.
            let max_byte_len = max_byte_len.max(1);

            let mut result = Vec::with_capacity(1 + max_byte_len * uids.len());
            result.push(0b1100_0000 | max_byte_len as u8);
            uids[0].encode_partial(max_byte_len, &mut result);

            for d in diffs.iter() {
                d.encode_partial(max_byte_len, &mut result);
            }

            Ok(write_bytes(
                path,
                &result,
                WriteMode::Atomic,
            )?)
        },
        // naive format (also the fallback for 0 or 1 uid in compact mode)
        _ => Ok(write_string(
            path,
            &uids.iter().map(|uid| uid.to_string()).collect::<Vec<_>>().join("\n"),
            WriteMode::Atomic,
        )?),
    }
}
impl Uid {
    /// The lower 64 bits of `low` are a metadata region (type tag + data
    /// size); this mask keeps only the hash half of `low`.
    const METADATA_MASK: u128 = 0xffff_ffff_ffff_ffff_0000_0000_0000_0000;

    // Type tags, stored starting at bit 32 of `low`. The 32-bit data-size
    // field sits below the tag, in bits 0..32.
    const CHUNK_TYPE: u128 = (0x1 << 32);
    const IMAGE_TYPE: u128 = (0x2 << 32);
    const FILE_TYPE: u128 = (0x3 << 32);
    const GROUP_TYPE: u128 = (0x4 << 32);
    const KNOWLEDGE_BASE_TYPE: u128 = (0x5 << 32);
    const SUMMARY_TYPE: u128 = (0x6 << 32);
    const QUERY_TURN_TYPE: u128 = (0x7 << 32);

    /// Decodes a big-endian uid of up to 32 bytes, as produced by
    /// `encode_partial`; shorter inputs are zero-extended on the left.
    /// Anything longer than 32 bytes is an error.
    pub(crate) fn decode_partial(bytes: &[u8]) -> Result<Self, Error> {
        match bytes.len() {
            0 => Ok(Uid { high: 0, low: 0 }),
            // NOTE(review): the `1..=15` and `16` arms are identical and
            // could be merged into a single `1..=16` arm.
            1..=15 => Ok(Uid { high: 0, low: u128_from_bytes(bytes)? }),
            16 => Ok(Uid { high: 0, low: u128_from_bytes(bytes)? }),
            // more than 16 bytes: the trailing 16 bytes are `low`, the rest is `high`
            17..=31 => Ok(Uid { high: u128_from_bytes(&bytes[..(bytes.len() - 16)])?, low: u128_from_bytes(&bytes[(bytes.len() - 16)..])? }),
            32 => Ok(Uid { high: u128_from_bytes(&bytes[0..16])?, low: u128_from_bytes(&bytes[16..])? }),
            _ => Err(Error::CannotDecodeUid),
        }
    }

    /// Appends the lowest `len` bytes of the 32-byte big-endian
    /// representation to `buffer`. High bytes beyond `len` are silently
    /// dropped, so callers must pass `len >= self.byte_len()`.
    pub(crate) fn encode_partial(&self, len: usize, buffer: &mut Vec<u8>) {
        for b in self.high.to_be_bytes().into_iter().chain(self.low.to_be_bytes().into_iter()).skip(32 - len) {
            buffer.push(b);
        }
    }

    /// Decodes a full 32-byte big-endian uid; any other length is an error.
    pub fn decode(bytes: &[u8]) -> Result<Self, Error> {
        match bytes.len() {
            32 => Ok(Uid { high: u128_from_bytes(&bytes[0..16])?, low: u128_from_bytes(&bytes[16..])? }),
            _ => Err(Error::CannotDecodeUid),
        }
    }

    /// Appends the full 32-byte big-endian representation to `buffer`.
    pub fn encode(&self, buffer: &mut Vec<u8>) {
        for b in self.high.to_be_bytes().into_iter().chain(self.low.to_be_bytes().into_iter()) {
            buffer.push(b);
        }
    }

    /// The minimal number of big-endian bytes needed to represent this uid
    /// (i.e. without leading zero bytes). Returns 0 for the all-zero uid.
    pub(crate) fn byte_len(&self) -> usize {
        if self.high == 0 {
            // bytes needed for `low` alone: ceil((128 - leading_zeros) / 8)
            (255 - self.low.leading_zeros() as usize) / 8 - 15
        } else {
            // 16 bytes for `low` plus the bytes needed for `high`
            (255 - self.high.leading_zeros() as usize) / 8 + 1
        }
    }

    /// The all-zero uid, used as a placeholder and as the additive identity
    /// when accumulating uids (see `new_group` / `new_knowledge_base`).
    pub(crate) fn dummy() -> Self {
        Uid {
            high: 0,
            low: 0,
        }
    }

    /// Derives a chunk uid: sha3-256 over the chunk's source hash, title,
    /// summary and data, with the metadata region set to the chunk tag plus
    /// the data length (truncated to 32 bits).
    pub fn new_chunk(chunk: &Chunk) -> Self {
        let mut hasher = Sha3_256::new();
        hasher.update(format!("{}{}{}{}", chunk.source.hash_str(), chunk.title, chunk.summary, chunk.data).as_bytes());
        let mut result = format!("{:064x}", hasher.finalize()).parse::<Uid>().unwrap();
        result = result.clear_metadata();
        result.low |= Uid::CHUNK_TYPE;
        result.low |= (chunk.data.len() as u128) & 0xffff_ffff;
        result
    }

    /// Derives an image uid: sha3-256 over the raw bytes, with the metadata
    /// region set to the image tag plus the byte length (truncated to 32 bits).
    pub fn new_image(bytes: &[u8]) -> Self {
        let mut hasher = Sha3_256::new();
        hasher.update(bytes);
        let mut result = format!("{:064x}", hasher.finalize()).parse::<Uid>().unwrap();
        result = result.clear_metadata();
        result.low |= Uid::IMAGE_TYPE;
        result.low |= (bytes.len() as u128) & 0xffff_ffff;
        result
    }

    /// Derives a file uid from the xor of two sha3-256 hashes: one over the
    /// root-relative path, one over the file content. The metadata region is
    /// set to the file tag plus the file size (truncated to 32 bits).
    ///
    /// Files of 32 MiB or more are hashed in 32 MiB blocks so the whole file
    /// is never held in memory at once.
    pub fn new_file(root_dir: &str, path: &str) -> Result<Self, Error> {
        let size = file_size(path)?;
        let rel_path = get_relative_path(&root_dir.to_string(), &path.to_string())?;
        let mut file_path_hasher = Sha3_256::new();
        file_path_hasher.update(rel_path.as_bytes());
        let file_path_uid = format!("{:064x}", file_path_hasher.finalize()).parse::<Uid>().unwrap();
        let mut file_content_hasher = Sha3_256::new();

        if size < 32 * 1024 * 1024 {
            let bytes = read_bytes(path)?;
            file_content_hasher.update(&bytes);
        }
        else {
            let block_size = 32 * 1024 * 1024;
            let mut offset = 0;

            loop {
                // NOTE(review): assumes `read_bytes_offset` clamps the end
                // offset to the file size — confirm against ragit_fs.
                let bytes = read_bytes_offset(path, offset, offset + block_size)?;
                file_content_hasher.update(&bytes);
                offset += block_size;

                if offset >= size {
                    break;
                }
            }
        }

        let mut result = format!("{:064x}", file_content_hasher.finalize()).parse::<Uid>().unwrap();
        result ^= file_path_uid;
        result = result.clear_metadata();
        result.low |= Uid::FILE_TYPE;
        result.low |= (size as u128) & 0xffff_ffff;
        Ok(result)
    }

    /// Derives a group uid as the (wrapping) sum of the member uids. The
    /// metadata region stores the group tag plus a recursive child count:
    /// a member that is itself a group contributes its own data-size
    /// (child count) instead of 1.
    pub fn new_group(uids: &[Uid]) -> Self {
        let mut result = Uid::dummy();
        let mut child_count = 0;

        for uid in uids.iter() {
            result += *uid;

            match uid.get_uid_type() {
                Ok(UidType::Group) => { child_count += uid.get_data_size(); },
                _ => { child_count += 1; },
            }
        }

        result = result.clear_metadata();
        result.low |= Uid::GROUP_TYPE;
        result.low |= (child_count as u128) & 0xffff_ffff;
        result
    }

    /// Derives a knowledge-base uid as the (wrapping) sum of the member
    /// uids, with the metadata region set to the knowledge-base tag plus the
    /// number of members (truncated to 32 bits).
    pub fn new_knowledge_base(uids: &[Uid]) -> Self {
        let mut result = Uid::dummy();

        for uid in uids.iter() {
            result += *uid;
        }

        result = result.clear_metadata();
        result.low |= Uid::KNOWLEDGE_BASE_TYPE;
        result.low |= (uids.len() as u128) & 0xffff_ffff;
        result
    }

    /// Derives a summary uid: sha3-256 over the summary text, with the
    /// metadata region set to the summary tag plus the text's byte length
    /// (truncated to 32 bits).
    pub fn new_summary(summary: &str) -> Self {
        let mut hasher = Sha3_256::new();
        hasher.update(summary.as_bytes());
        let mut result = format!("{:064x}", hasher.finalize()).parse::<Uid>().unwrap();
        result = result.clear_metadata();
        result.low |= Uid::SUMMARY_TYPE;
        result.low |= (summary.len() as u128) & 0xffff_ffff;
        result
    }

    /// Derives a query-turn uid: sha3-256 over the query, model name,
    /// response text and timestamp, plus the (wrapping) sum of the retrieved
    /// chunks' uids. The metadata region stores the query-turn tag plus the
    /// lower 32 bits of the timestamp (clamped to be non-negative).
    ///
    /// NOTE(review): `get_uid_type` has no arm for `QUERY_TURN_TYPE`, so the
    /// uids built here currently fail that lookup — confirm intended.
    pub fn new_query_turn(turn: &QueryTurn) -> Self {
        let mut hasher = Sha3_256::new();
        hasher.update(turn.query.as_bytes());
        hasher.update(turn.response.model.as_bytes());
        hasher.update(turn.response.response.as_bytes());
        hasher.update(&turn.timestamp.to_le_bytes());
        let mut result = format!("{:064x}", hasher.finalize()).parse::<Uid>().unwrap();

        for chunk in turn.response.retrieved_chunks.iter() {
            result += chunk.uid;
        }

        result = result.clear_metadata();
        result.low |= Uid::QUERY_TURN_TYPE;
        result.low |= turn.timestamp.max(0) as u128 & 0xffff_ffff;
        result
    }

    /// Rewrites a file uid after a rename: xors out the old path's hash and
    /// xors in the new path's hash. Both path hashes are masked so the
    /// metadata region (type tag + size) is left untouched.
    ///
    /// NOTE(review): the paths are hashed as given, while `new_file` hashes
    /// the root-relative path — callers presumably pass paths in that same
    /// relative form; confirm against call sites.
    pub fn update_file_uid(mut old: Uid, old_path: &str, new_path: &str) -> Self {
        let mut old_path_hasher = Sha3_256::new();
        old_path_hasher.update(old_path.as_bytes());
        let mut old_path_uid = format!("{:064x}", old_path_hasher.finalize()).parse::<Uid>().unwrap();
        // keep only the hash half of `low` so the xor can't disturb metadata
        old_path_uid.low &= Uid::METADATA_MASK;
        let mut new_path_hasher = Sha3_256::new();
        new_path_hasher.update(new_path.as_bytes());
        let mut new_path_uid = format!("{:064x}", new_path_hasher.finalize()).parse::<Uid>().unwrap();
        new_path_uid.low &= Uid::METADATA_MASK;
        old ^= old_path_uid;
        old ^= new_path_uid;
        old
    }

    /// Rebuilds a uid from its on-disk split: a 2-hex-char directory prefix
    /// and a 62-hex-char file name (together the full 64-char hex string).
    pub(crate) fn from_prefix_and_suffix(prefix: &str, suffix: &str) -> Result<Self, Error> {
        if prefix.len() != 2 || suffix.len() != 62 {
            Err(Error::InvalidUid(format!("{prefix}{suffix}")))
        }
        else {
            // `prefix` + first 30 suffix chars = `high` (32 hex chars);
            // the remaining 32 chars = `low`
            match (suffix.get(0..30), suffix.get(30..)) {
                (Some(high_suff), Some(low)) => match (
                    u128::from_str_radix(&format!("{prefix}{high_suff}"), 16),
                    u128::from_str_radix(low, 16),
                ) {
                    (Ok(high), Ok(low)) => Ok(Uid { high, low }),
                    _ => Err(Error::InvalidUid(format!("{prefix}{suffix}"))),
                },
                _ => Err(Error::InvalidUid(format!("{prefix}{suffix}"))),
            }
        }
    }

    /// The first 2 hex characters (top byte of `high`), used as the
    /// directory name when the uid is stored on disk.
    pub fn get_prefix(&self) -> String {
        format!("{:02x}", self.high >> 120)
    }

    /// The remaining 62 hex characters (everything after the prefix), used
    /// as the file name when the uid is stored on disk.
    pub fn get_suffix(&self) -> String {
        format!("{:030x}{:032x}", self.high & 0xff_ffff_ffff_ffff_ffff_ffff_ffff_ffff, self.low)
    }

    /// The first `n` characters of the 64-char hex representation, capped
    /// at 64. For `n <= 32` only `high` is formatted, avoiding the full
    /// string build.
    pub fn abbrev(&self, n: usize) -> String {
        match n {
            0 => String::new(),
            1..=32 => {
                let s = format!("{:032x}", self.high);
                s.get(0..n).unwrap().to_string()
            },
            33.. => {
                let s = self.to_string();
                s.get(0..n.min(64)).unwrap().to_string()
            },
        }
    }

    /// Whether `s` could be the leading part of a uid's hex string: at most
    /// 64 characters, all lowercase hex digits.
    pub fn is_valid_prefix(s: &str) -> bool {
        s.len() <= 64 && s.chars().all(
            |c| match c {
                '0'..='9' => true,
                'a'..='f' => true,
                _ => false,
            }
        )
    }

    /// Returns a copy with the metadata region (lower 64 bits of `low`)
    /// zeroed, ready for a fresh type tag and size to be or-ed in.
    #[must_use = "method returns a new uid and does not mutate the original value"]
    pub(crate) fn clear_metadata(&self) -> Uid {
        let mut result = *self;
        result.low &= Uid::METADATA_MASK;
        result
    }

    /// Reads the type tag from the metadata region.
    ///
    /// NOTE(review): `QUERY_TURN_TYPE` (0x7) has no arm and falls through to
    /// `InvalidUid` — confirm whether query-turn uids should be recognized.
    pub(crate) fn get_uid_type(&self) -> Result<UidType, Error> {
        // re-align the tag nibble to bit 32 so it compares against the consts
        let field = ((self.low >> 32) & 0xf) << 32;

        match field {
            Uid::CHUNK_TYPE => Ok(UidType::Chunk),
            Uid::IMAGE_TYPE => Ok(UidType::Image),
            Uid::FILE_TYPE => Ok(UidType::File),
            Uid::GROUP_TYPE => Ok(UidType::Group),
            Uid::KNOWLEDGE_BASE_TYPE => Ok(UidType::KnowledgeBase),
            Uid::SUMMARY_TYPE => Ok(UidType::Summary),
            _ => Err(Error::InvalidUid(self.to_string())),
        }
    }

    /// Reads the 32-bit data-size field from the metadata region. Its
    /// meaning depends on the type tag (data length, file size, child
    /// count, ...) — see the `new_*` constructors.
    pub(crate) fn get_data_size(&self) -> usize {
        (self.low & 0xffff_ffff) as usize
    }

    /// 256-bit subtraction with borrow propagation from `low` into `high`;
    /// returns `None` if `other > self`.
    fn checked_sub(&self, other: Uid) -> Option<Uid> {
        let (carry, low) = match self.low.checked_sub(other.low) {
            Some(n) => (0, n),
            // underflow: compute `self.low - other.low` modulo 2^128 and
            // borrow 1 from `high`
            None => (1, u128::MAX - other.low + self.low + 1),
        };

        if carry > self.high {
            None
        }
        else {
            match (self.high - carry).checked_sub(other.high) {
                Some(high) => Some(Uid { high, low }),
                None => None,
            }
        }
    }
}
impl fmt::Display for Uid {
fn fmt(&self, fmt: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(fmt, "{:032x}{:032x}", self.high, self.low)
}
}
impl FromStr for Uid {
type Err = Error;
fn from_str(s: &str) -> Result<Self, Error> {
if s.len() != 64 {
Err(Error::InvalidUid(s.to_string()))
}
else {
match (s.get(0..32), s.get(32..)) {
(Some(high), Some(low)) => match (
u128::from_str_radix(high, 16),
u128::from_str_radix(low, 16),
) {
(Ok(high), Ok(low)) => Ok(Uid { high, low }),
_ => Err(Error::InvalidUid(s.to_string())),
},
_ => Err(Error::InvalidUid(s.to_string())),
}
}
}
}
impl std::ops::BitXor for Uid {
    type Output = Self;

    /// Bitwise xor, applied independently to both 128-bit halves.
    fn bitxor(self, rhs: Self) -> Self {
        let high = self.high ^ rhs.high;
        let low = self.low ^ rhs.low;
        Uid { high, low }
    }
}
impl std::ops::BitXorAssign for Uid {
    /// In-place xor, applied independently to both 128-bit halves.
    fn bitxor_assign(&mut self, rhs: Self) {
        self.high ^= rhs.high;
        self.low ^= rhs.low;
    }
}
impl std::ops::Add for Uid {
    type Output = Self;

    /// 256-bit wrapping addition: the two `low` halves are added first, and
    /// an overflow there carries one into the `high` half. Overflow of the
    /// full 256-bit value wraps silently.
    fn add(self, rhs: Self) -> Self {
        let (low, overflowed) = self.low.overflowing_add(rhs.low);
        let high = self
            .high
            .wrapping_add(rhs.high)
            .wrapping_add(overflowed as u128);
        Uid { low, high }
    }
}
impl std::ops::AddAssign for Uid {
    /// In-place 256-bit wrapping addition; delegates to the `Add` impl.
    fn add_assign(&mut self, rhs: Self) {
        *self = *self + rhs;
    }
}
impl Index {
pub fn get_all_chunk_uids(&self) -> Result<Vec<Uid>, Error> {
let mut result = vec![];
for internal in read_dir(&join3(&self.root_dir, &INDEX_DIR_NAME, &CHUNK_DIR_NAME)?, false)? {
let prefix = file_name(&internal)?;
if !is_dir(&internal) {
continue;
}
for chunk_file in read_dir(&internal, false)? {
if extension(&chunk_file).unwrap_or(None).unwrap_or(String::new()) == "chunk" {
result.push(Uid::from_prefix_and_suffix(&prefix, &file_name(&chunk_file)?)?);
}
}
}
result.sort();
Ok(result)
}
pub fn get_all_image_uids(&self) -> Result<Vec<Uid>, Error> {
let mut result = vec![];
for internal in read_dir(&join3(&self.root_dir, &INDEX_DIR_NAME, &IMAGE_DIR_NAME)?, false)? {
let prefix = file_name(&internal)?;
if !is_dir(&internal) {
continue;
}
for image_file in read_dir(&internal, false)? {
if extension(&image_file).unwrap_or(None).unwrap_or(String::new()) == "png" {
result.push(Uid::from_prefix_and_suffix(&prefix, &file_name(&image_file)?)?);
}
}
}
result.sort();
Ok(result)
}
}
fn u128_from_bytes(bytes: &[u8]) -> Result<u128, Error> {
match bytes.len() {
0 => Ok(0),
1..=15 => {
let mut padded = [0; 16];
padded[(16 - bytes.len())..].copy_from_slice(bytes);
Ok(u128::from_be_bytes(padded))
},
16 => Ok(u128::from_be_bytes(bytes.try_into().unwrap())),
_ => Err(Error::CannotDecodeUid),
}
}