import argparse
import json
from tqdm import tqdm
import os
import struct
import numpy as np
import gzip
def write_sparse_vectors_to_binary_file(filename, term_id):
    """Write a collection of sparse vectors to ``filename`` in binary form.

    File layout (every field is little-endian):

    * number of vectors, as an unsigned 32-bit integer;
    * for each vector: its number of entries (u32), then all component ids
      (u32 each) in ascending order, then the matching values as 32-bit
      floats in the same order.

    Args:
        filename: Path of the output file; it is created or truncated.
        term_id: Sized iterable of mappings ``{component_id: value}``.

    Raises:
        OverflowError: If a count or component id does not fit in an
            unsigned 32-bit integer.
    """

    def write_binary_sequence(lst_pairs, file):
        """Write one vector given as a list of ``(component_id, value)`` pairs."""
        count = len(lst_pairs)
        file.write(count.to_bytes(4, byteorder="little", signed=False))
        # Emit all component ids with a single write instead of one per id.
        file.write(
            b"".join(
                int(component).to_bytes(4, byteorder="little", signed=False)
                for component, _ in lst_pairs
            )
        )
        # "<f" pins the float byte order to little-endian so it matches the
        # integers above; a bare "f" would follow the host's native order.
        file.write(struct.pack(f"<{count}f", *(value for _, value in lst_pairs)))

    with open(filename, "wb") as fout:
        fout.write(len(term_id).to_bytes(4, byteorder="little", signed=False))
        for d in tqdm(term_id):
            # Sorting the (id, value) pairs orders the entries by component id.
            write_binary_sequence(sorted(d.items()), fout)
def write_sparse_vectors_to_binary_file_2(filename, term_id):
    """Write sparse vectors given as ``(ids, values)`` pairs to ``filename``.

    File layout (every field is little-endian):

    * number of vectors, as an unsigned 32-bit integer;
    * for each vector: its number of entries (u32), then all component ids
      (u32 each) in ascending order, then the matching values as 32-bit
      floats in the same order.

    Args:
        filename: Path of the output file; it is created or truncated.
        term_id: Sized iterable whose items expose the component ids at
            index 0 and the values at index 1, as two parallel sequences.

    Raises:
        OverflowError: If a count or component id does not fit in an
            unsigned 32-bit integer.
    """

    def write_binary_sequence(lst_pairs, file):
        """Write one vector given as a list of ``(component_id, value)`` pairs."""
        count = len(lst_pairs)
        file.write(count.to_bytes(4, byteorder="little", signed=False))
        # Emit all component ids with a single write instead of one per id.
        file.write(
            b"".join(
                int(component).to_bytes(4, byteorder="little", signed=False)
                for component, _ in lst_pairs
            )
        )
        # "<f" pins the float byte order to little-endian so it matches the
        # integers above; a bare "f" would follow the host's native order.
        file.write(struct.pack(f"<{count}f", *(value for _, value in lst_pairs)))

    with open(filename, "wb") as fout:
        fout.write(len(term_id).to_bytes(4, byteorder="little", signed=False))
        for dd in tqdm(term_id):
            # Going through a dict keeps only the last value of a repeated id.
            d = dict(zip(dd[0], dd[1]))
            write_binary_sequence(sorted(d.items()), fout)
def convert_documents_with_no_token_conversion(document_folder):
    """Load already id-encoded document vectors from a folder of JSON files.

    Every ``*.json`` file in ``document_folder`` is read, in the order given
    by sorting the names on their ``split(".", maxsplit=1)`` parts. Each file
    must hold an object with a ``"vectors"`` list whose elements provide
    ``"coordinates"``, ``"values"`` and ``"id"``.

    Args:
        document_folder: Directory containing the JSON files.

    Returns:
        A ``(documents, doc_ids)`` tuple: ``documents`` is a list of
        ``{coordinate: value}`` dicts and ``doc_ids`` the matching ids.
    """
    json_names = [
        name for name in os.listdir(document_folder) if name.endswith(".json")
    ]
    json_names.sort(key=lambda name: name.split(".", maxsplit=1))

    documents = []
    doc_ids = []
    for json_name in tqdm(json_names):
        json_path = os.path.join(document_folder, json_name)
        with open(json_path) as handle:
            payload = json.load(handle)
        documents += [
            dict(zip(vec["coordinates"], vec["values"]))
            for vec in payload["vectors"]
        ]
        doc_ids += [vec["id"] for vec in payload["vectors"]]
    return documents, doc_ids
def convert_queries_with_no_token_conversion(queries_path):
    """Load already id-encoded query vectors from a single JSON file.

    The file must hold an object with a ``"vectors"`` list whose elements
    provide ``"coordinates"``, ``"values"`` and ``"id"``.

    Args:
        queries_path: Path to the JSON file.

    Returns:
        A ``(queries, query_ids)`` tuple: ``queries`` is a list of
        ``{coordinate: value}`` dicts and ``query_ids`` the matching ids.
    """
    with open(queries_path) as handle:
        payload = json.load(handle)

    queries = [
        dict(zip(vec["coordinates"], vec["values"])) for vec in payload["vectors"]
    ]
    query_ids = [vec["id"] for vec in payload["vectors"]]
    return queries, query_ids
def convert_documents_from_compressed_file(document_path):
    """Convert a gzip-compressed JSON-lines document file to id-based vectors.

    The file is read twice: a first pass collects every token that appears
    as a key of a line's ``"vector"`` object and assigns ids following the
    sorted token order; a second pass converts each line into a pair of
    NumPy arrays. Both passes skip the first 512 bytes and stop at the first
    line that cannot be parsed as JSON, which is treated as the footer.

    Args:
        document_path: Path to the gzip-compressed file.

    Returns:
        A ``(documents, doc_ids, token_to_id_mapping)`` tuple where
        ``documents`` is a list of ``(ids, values)`` array pairs (values are
        ``float32``), ``doc_ids`` holds each line's ``"id"`` and
        ``token_to_id_mapping`` maps each token to its integer id.
    """
    # NOTE(review): presumably the size of the tar header of a single-member
    # .tar.gz archive -- confirm against whatever produces these files.
    header_bytes = 512

    tokens_set = set()
    print("Scanning the documents to build the token to ids mapping")
    with gzip.open(document_path, "r") as f:
        f.seek(header_bytes)
        for line in tqdm(f):
            try:
                result = json.loads(line)
            except ValueError:
                # JSONDecodeError and UnicodeDecodeError are both ValueError
                # subclasses; unlike a bare ``except`` this does not swallow
                # KeyboardInterrupt or unrelated failures.
                print("Footer skipped\n")
                break
            tokens_set.update(result["vector"].keys())

    token_to_id_mapping = {
        token: index for index, token in enumerate(sorted(tokens_set))
    }

    documents = []
    doc_ids = []
    with gzip.open(document_path, "r") as file:
        file.seek(header_bytes)
        for line in tqdm(file):
            try:
                line_data = json.loads(line.strip())
            except ValueError:
                print("Footer skipped\n")
                break
            vector = line_data["vector"]
            vs = np.array(list(vector.values()), dtype=np.float32)
            ks = np.array([token_to_id_mapping[k] for k in vector])
            documents.append((ks, vs))
            doc_ids.append(line_data["id"])
    return documents, doc_ids, token_to_id_mapping
def convert_documents_from_folder(document_path):
    """Convert a folder of JSON-lines document files to id-based vectors.

    The files are read twice: a first pass collects every token that appears
    as a key of a line's ``"vector"`` object and assigns ids following the
    sorted token order; a second pass converts each line into a pair of
    NumPy arrays.

    Args:
        document_path: Directory whose entries are all JSON-lines files.

    Returns:
        A ``(documents, doc_ids, token_to_id_mapping)`` tuple where
        ``documents`` is a list of ``(ids, values)`` array pairs (values are
        ``float32``), ``doc_ids`` holds each line's ``"id"`` and
        ``token_to_id_mapping`` maps each token to its integer id.
    """
    # os.listdir returns entries in arbitrary order. Take one sorted snapshot
    # so both passes see exactly the same files and the document order is
    # reproducible across runs and filesystems.
    file_names = sorted(os.listdir(document_path))

    tokens_set = set()
    print("Scanning the documents to build the token to ids mapping")
    for file_name in tqdm(file_names):
        with open(os.path.join(document_path, file_name), "r") as file:
            for line in file:
                line_data = json.loads(line.strip())
                tokens_set.update(line_data["vector"].keys())

    token_to_id_mapping = {
        token: index for index, token in enumerate(sorted(tokens_set))
    }

    documents = []
    doc_ids = []
    for file_name in tqdm(file_names):
        with open(os.path.join(document_path, file_name), "r") as file:
            for line in file:
                line_data = json.loads(line.strip())
                vector = line_data["vector"]
                vs = np.array(list(vector.values()), dtype=np.float32)
                ks = np.array([token_to_id_mapping[k] for k in vector])
                documents.append((ks, vs))
                doc_ids.append(line_data["id"])
    return documents, doc_ids, token_to_id_mapping
def convert_documents_from_file(document_path):
    """Convert one JSON-lines document file to id-based sparse vectors.

    The file is read twice: first to gather every token used as a key of a
    line's ``"vector"`` object (ids follow the sorted token order), then to
    turn each line into a pair of NumPy arrays.

    Args:
        document_path: Path to the JSON-lines file.

    Returns:
        A ``(documents, doc_ids, token_to_id_mapping)`` tuple where
        ``documents`` is a list of ``(ids, values)`` array pairs (values are
        ``float32``), ``doc_ids`` holds each line's ``"id"`` and
        ``token_to_id_mapping`` maps each token to its integer id.
    """
    vocabulary = set()
    print("Scanning the documents to build the token to ids mapping")
    with open(document_path, "r") as handle:
        for raw_line in tqdm(handle):
            record = json.loads(raw_line.strip())
            vocabulary.update(record["vector"].keys())

    ordered_tokens = sorted(vocabulary)
    token_to_id_mapping = dict(zip(ordered_tokens, range(len(ordered_tokens))))

    documents, doc_ids = [], []
    with open(document_path, "r") as handle:
        for raw_line in tqdm(handle):
            record = json.loads(raw_line.strip())
            vector = record["vector"]
            values = np.array(list(vector.values()), dtype=np.float32)
            ids = np.array([token_to_id_mapping[token] for token in vector])
            documents.append((ids, values))
            doc_ids.append(record["id"])
    return documents, doc_ids, token_to_id_mapping
def convert_queries_from_compressed_file(queries_path, token_to_id_mapping=None):
    """Convert a gzip-compressed JSON-lines query file to id-based vectors.

    The first 512 bytes are skipped, all remaining lines are loaded, and
    conversion stops at the first line that cannot be decoded and parsed as
    JSON, which is treated as the footer.

    Args:
        queries_path: Path to the gzip-compressed file.
        token_to_id_mapping: Mapping from token to integer id. Despite the
            ``None`` default it is required: it is subscripted with every
            token of every query, so each query token must be a key.

    Returns:
        A ``(queries, queries_ids)`` tuple: ``queries`` is a list of
        ``{token_id: value}`` dicts and ``queries_ids`` each line's ``"id"``.
    """
    queries = []
    queries_ids = []
    with gzip.open(queries_path, "r") as f:
        # NOTE(review): presumably the tar header of a single-member .tar.gz
        # archive -- confirm against whatever produces these files.
        f.seek(512)
        # Materialise the lines so the progress bar knows the total.
        json_list = list(f)

    for json_str in tqdm(json_list):
        try:
            result = json.loads(json_str.decode())
        except ValueError:
            # JSONDecodeError and UnicodeDecodeError are both ValueError
            # subclasses; unlike a bare ``except`` this does not swallow
            # KeyboardInterrupt or unrelated failures.
            print("Footer skipped\n")
            break
        queries.append(
            {token_to_id_mapping[k]: v for k, v in result["vector"].items()}
        )
        queries_ids.append(result["id"])
    print(f"Number of queries: {len(queries)}")
    return queries, queries_ids
def convert_queries_from_file(queries_path, token_to_id_mapping=None):
    """Convert a JSON-lines query file to id-based sparse vectors.

    Args:
        queries_path: Path to the JSON-lines file; every line must be an
            object with a ``"vector"`` mapping and an ``"id"``.
        token_to_id_mapping: Mapping from token to integer id. It is
            subscripted with every token of every query.

    Returns:
        A ``(queries, queries_ids)`` tuple: ``queries`` is a list of
        ``{token_id: value}`` dicts and ``queries_ids`` each line's ``"id"``.
    """
    with open(queries_path, "r") as handle:
        raw_lines = handle.readlines()

    queries, queries_ids = [], []
    for raw_line in tqdm(raw_lines):
        record = json.loads(raw_line)
        converted = {}
        for token, weight in record["vector"].items():
            converted[token_to_id_mapping[token]] = weight
        queries.append(converted)
        queries_ids.append(record["id"])
    return queries, queries_ids
def main():
    """Command-line entry point: convert documents and queries to binary files.

    Parses the command line, reads the documents and queries (building a
    token-to-id mapping unless ``--skip-token-conversion`` is given) and
    writes into ``<output-dir>/data``:

    * ``documents.bin`` and ``queries.bin`` -- the sparse vectors;
    * ``doc_ids.npy`` and ``queries_ids.npy`` -- the matching ids;
    * ``token_to_id_mapping.json`` -- only when tokens were converted.

    Raises:
        NotImplementedError: If ``--large-dataset`` is passed.
    """
    parser = argparse.ArgumentParser(
        description="Parser for documents and queries conversion to Seismic format."
    )
    # The three paths are used unconditionally below, so let argparse report
    # a missing one instead of failing later on a None value.
    parser.add_argument(
        "--document-path-or-folder",
        required=True,
        help="Path to the documents file",
    )
    parser.add_argument("--query-path", required=True, help="Path to the queries file")
    parser.add_argument(
        "--output-dir",
        required=True,
        help="Path to the output dir. Will create a 'data' repo inside it. ",
    )
    parser.add_argument(
        "--skip-token-conversion",
        action="store_true",
        default=False,
        help="Whether you want to skip the token to id converison (if you already have done it)",
    )
    parser.add_argument(
        "--large-dataset",
        action="store_true",
        default=False,
        help="Set to true if you are dealing with very large datasets and you don't want to load the entire collection in memory.",
    )
    args = parser.parse_args()

    # Reject the unsupported mode before touching the filesystem.
    if args.large_dataset:
        raise NotImplementedError("Logic for large datasets not implement yet!")

    document_path = args.document_path_or_folder
    query_path = args.query_path

    data_dir = os.path.join(args.output_dir, "data")
    os.makedirs(data_dir, exist_ok=True)
    print(f"Saving into {data_dir}")

    seismic_format_doc_path = os.path.join(data_dir, "documents.bin")
    seismic_format_query_path = os.path.join(data_dir, "queries.bin")

    if args.skip_token_conversion:
        print("Reading and converting documents...")
        documents, doc_ids = convert_documents_with_no_token_conversion(document_path)
        print("Reading and converting queries...")
        queries, queries_ids = convert_queries_with_no_token_conversion(query_path)

        print("Saving to Seismic format")
        # Documents are {coordinate: value} dicts on this path.
        write_sparse_vectors_to_binary_file(seismic_format_doc_path, documents)
    else:
        print(f"Reading and converting documents from {document_path}")
        if document_path.endswith("tar.gz"):
            documents, doc_ids, token_id_mapping = (
                convert_documents_from_compressed_file(document_path)
            )
        elif os.path.isdir(document_path):
            documents, doc_ids, token_id_mapping = convert_documents_from_folder(
                document_path
            )
        else:
            documents, doc_ids, token_id_mapping = convert_documents_from_file(
                document_path
            )

        print(f"Reading and converting queries from {query_path}")
        if query_path.endswith("tar.gz"):
            queries, queries_ids = convert_queries_from_compressed_file(
                query_path, token_id_mapping
            )
        else:
            queries, queries_ids = convert_queries_from_file(
                query_path, token_id_mapping
            )

        print("Saving to Seismic format")
        # Documents are (ids, values) array pairs on this path, hence the
        # second writer.
        write_sparse_vectors_to_binary_file_2(seismic_format_doc_path, documents)
        token2id_path = os.path.join(data_dir, "token_to_id_mapping.json")
        with open(token2id_path, "w") as fp:
            json.dump(token_id_mapping, fp)

    # Queries are dicts on both paths, so they share one writer.
    write_sparse_vectors_to_binary_file(seismic_format_query_path, queries)
    np.save(os.path.join(data_dir, "doc_ids.npy"), doc_ids)
    np.save(os.path.join(data_dir, "queries_ids.npy"), queries_ids)
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()