first commit
259 AIEC-RAG/atlas_rag/vectorstore/create_neo4j_index.py Normal file
@@ -0,0 +1,259 @@
import faiss
import numpy as np
import time
import logging
from atlas_rag.kg_construction.utils.csv_processing.csv_to_npy import convert_csv_to_npy

def create_faiss_index(output_directory, filename_pattern, index_type="HNSW,Flat"):
    """
    Create FAISS indexes for the graph. For the available index types, see
    https://github.com/facebookresearch/faiss/wiki/Faiss-indexes

    "IVF65536_HNSW32,Flat" for 1M to 10M nodes

    "HNSW,Flat" for toy datasets
    """
    # Convert csv to npy
    convert_csv_to_npy(
        csv_path=f"{output_directory}/triples_csv/triple_nodes_{filename_pattern}_from_json_with_emb.csv",
        npy_path=f"{output_directory}/vector_index/triple_nodes_{filename_pattern}_from_json_with_emb.npy",
    )

    convert_csv_to_npy(
        csv_path=f"{output_directory}/triples_csv/text_nodes_{filename_pattern}_from_json_with_emb.csv",
        npy_path=f"{output_directory}/vector_index/text_nodes_{filename_pattern}_from_json_with_emb.npy",
    )

    convert_csv_to_npy(
        csv_path=f"{output_directory}/triples_csv/triple_edges_{filename_pattern}_from_json_with_concept_with_emb.csv",
        npy_path=f"{output_directory}/vector_index/triple_edges_{filename_pattern}_from_json_with_concept_with_emb.npy",
    )

    build_faiss_from_npy(
        index_type=index_type,
        index_path=f"{output_directory}/vector_index/triple_nodes_{filename_pattern}_from_json_with_emb_non_norm.index",
        npy_path=f"{output_directory}/vector_index/triple_nodes_{filename_pattern}_from_json_with_emb.npy",
    )

    build_faiss_from_npy(
        index_type=index_type,
        index_path=f"{output_directory}/vector_index/text_nodes_{filename_pattern}_from_json_with_emb_non_norm.index",
        npy_path=f"{output_directory}/vector_index/text_nodes_{filename_pattern}_from_json_with_emb.npy",
    )

    build_faiss_from_npy(
        index_type=index_type,
        index_path=f"{output_directory}/vector_index/triple_edges_{filename_pattern}_from_json_with_concept_with_emb_non_norm.index",
        npy_path=f"{output_directory}/vector_index/triple_edges_{filename_pattern}_from_json_with_concept_with_emb.npy",
    )
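
# Example (sketch, not part of the original pipeline): how create_faiss_index might be
# invoked for a small corpus. The "./import/demo" directory and the "demo" filename
# pattern are illustrative assumptions only.
# create_faiss_index(
#     output_directory="./import/demo",
#     filename_pattern="demo",
#     index_type="HNSW,Flat",
# )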

# Cannot avoid loading the embeddings into memory when training,
# so simply try to load everything and train on all of it.
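# Note (assumption inferred from the loading loops below): each .npy file produced by
# convert_csv_to_npy may hold several arrays written back-to-back, so np.load is called
# repeatedly on the same file handle until it raises, which ends the loop.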
def build_faiss_from_npy(index_type, index_path, npy_path):
    # check npy size.
    # shapes = []
    start_time = time.time()
    # with open(npy_path, "rb") as f:
    #     while True:
    #         try:
    #             array = np.load(f)
    #             shapes.append(array.shape)
    #         except Exception as e:
    #             print(f"Stopped loading due to: {str(e)}")
    #             break
    # if shapes:
    #     total_rows = sum(shape[0] for shape in shapes)
    #     dimension = shapes[0][1]
    #     print(f"Total embeddings in {npy_path}\n {total_rows}, Dimension: {dimension}")

    # minilm is 32
    # get the dimension from the npy file
    with open(npy_path, "rb") as f:
        array = np.load(f)
        dimension = array.shape[1]
    print(f"Dimension: {dimension}")
    index = faiss.index_factory(dimension, index_type, faiss.METRIC_INNER_PRODUCT)

    if index_type.startswith("IVF"):
        index_ivf = faiss.extract_index_ivf(index)
        clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(index_ivf.d))
        index_ivf.clustering_index = clustering_index
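        # Note: index_cpu_to_all_gpus assumes the GPU build of faiss (faiss-gpu) is
        # installed; it is used here only so that k-means for the IVF coarse quantizer
        # runs on GPU, while the index being built stays on CPU.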

    # Load data to match the training sample size.
    # Done by randomly picking indexes from shapes and checking whether the sum of the picked shapes exceeds the sample size.
    # If yes, read those arrays and start training; skip the np.load part for non-chosen indexes.

    # selected_indices = set()
    # possible_indices = list(range(len(shapes)))
    # selected_training_samples = 0
    # while selected_training_samples < max_training_samples and possible_indices:
    #     idx = random.choice(possible_indices)
    #     selected_indices.add(idx)
    #     selected_training_samples += shapes[idx][0]
    #     possible_indices.remove(idx)
    # print(f"Selected total: {selected_training_samples} samples for training")

    xt = []
    current_index = 0
    with open(npy_path, "rb") as f:
        while True:
            try:
                # array = np.load(f)
                # if current_index in selected_indices:
                array = np.load(f)
                # faiss.normalize_L2(array)
                xt.append(array)
                # current_index += 1
            except Exception as e:
                logging.info(f"Stopped loading due to: {str(e)}")
                break
    if xt:
        xt = np.vstack(xt)
        logging.info(f"Loading time: {time.time() - start_time:.2f} seconds")
        start_time = time.time()
        index.train(xt)
        end_time = time.time()
        logging.info(f"Training time: {end_time - start_time:.2f} seconds")
        del xt

    start_time = time.time()
    with open(npy_path, "rb") as f:
        while True:
            try:
                array = np.load(f)
                # faiss.normalize_L2(array)
                index.add(array)
            except Exception as e:
                logging.info(f"Stopped loading due to: {str(e)}")
                break
    logging.info(f"Adding time: {time.time() - start_time:.2f} seconds")

    # Convert the GPU index to a CPU index for saving
    index = faiss.index_gpu_to_cpu(index)
    # Save the CPU index to a file
    faiss.write_index(index, index_path)
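
# Sketch (not part of the original module): a minimal way to load one of the saved
# indexes and query it. The function name and shapes are illustrative assumptions;
# query vectors must be float32 and share the dimension of the stored embeddings.
def search_faiss_index(index_path, query_embeddings, top_k=5):
    index = faiss.read_index(index_path)
    queries = np.asarray(query_embeddings, dtype=np.float32)
    if queries.ndim == 1:
        queries = queries.reshape(1, -1)
    # Scores are raw inner products, matching METRIC_INNER_PRODUCT used at build time.
    scores, ids = index.search(queries, top_k)
    return scores, ids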

def train_and_write_indexes(keyword, npy_dir="./import"):
    keyword_to_paths = {
        'cc_en': {
            'npy': {
                'node': f"{npy_dir}/triple_nodes_cc_en_from_json_2.npy",
                # 'edge': f"{npy_dir}/triple_edges_cc_en_from_json_2.npy",
                'text': f"{npy_dir}/text_nodes_cc_en_from_json_with_emb_2.npy",
            },
            'index': {
                'node': f"{npy_dir}/triple_nodes_cc_en_from_json_non_norm.index",
                # 'edge': f"{npy_dir}/triple_edges_cc_en_from_json_non_norm.index",
                'text': f"{npy_dir}/text_nodes_cc_en_from_json_with_emb_non_norm.index",
            },
            'index_type': {
                'node': "IVF1048576_HNSW32,Flat",
                # 'edge': "IVF1048576_HNSW32,Flat",
                'text': "IVF262144_HNSW32,Flat",
            },
            'csv': {
                'node': f"{npy_dir}/triple_nodes_cc_en_from_json.csv",
                # 'edge': f"{npy_dir}/triple_edges_cc_en_from_json.csv",
                'text': f"{npy_dir}/text_nodes_cc_en_from_json_with_emb.csv",
            }
        },
        'pes2o_abstract': {
            'npy': {
                'node': f"{npy_dir}/triple_nodes_pes2o_abstract_from_json.npy",
                # 'edge': f"{npy_dir}/triple_edges_pes2o_abstract_from_json.npy",
                'text': f"{npy_dir}/text_nodes_pes2o_abstract_from_json_with_emb.npy",
            },
            'index': {
                'node': f"{npy_dir}/triple_nodes_pes2o_abstract_from_json_non_norm.index",
                # 'edge': f"{npy_dir}/triple_edges_pes2o_abstract_from_json_non_norm.index",
                'text': f"{npy_dir}/text_nodes_pes2o_abstract_from_json_with_emb_non_norm.index",
            },
            'index_type': {
                'node': "IVF1048576_HNSW32,Flat",
                # 'edge': "IVF1048576_HNSW32,Flat",
                'text': "IVF65536_HNSW32,Flat",
            },
            'csv': {
                'node': f"{npy_dir}/triple_nodes_pes2o_abstract_from_json.csv",
                # 'edge': f"{npy_dir}/triple_edges_pes2o_abstract_from_json.csv",
                'text': f"{npy_dir}/text_nodes_pes2o_abstract_from_json_with_emb.csv",
            }
        },
        'en_simple_wiki_v0': {
            'npy': {
                'node': f"{npy_dir}/triple_nodes_en_simple_wiki_v0_from_json.npy",
                # 'edge': f"{npy_dir}/triple_edges_en_simple_wiki_v0_from_json.npy",
                'text': f"{npy_dir}/text_nodes_en_simple_wiki_v0_from_json_with_emb.npy",
            },
            'index': {
                'node': f"{npy_dir}/triple_nodes_en_simple_wiki_v0_from_json_non_norm.index",
                # 'edge': f"{npy_dir}/triple_edges_en_simple_wiki_v0_from_json_non_norm.index",
                'text': f"{npy_dir}/text_nodes_en_simple_wiki_v0_from_json_with_emb_non_norm.index",
            },
            'index_type': {
                'node': "IVF1048576_HNSW32,Flat",
                # 'edge': "IVF1048576_HNSW32,Flat",
                'text': "IVF65536_HNSW32,Flat",
            },
            'csv': {
                'node': f"{npy_dir}/triple_nodes_en_simple_wiki_v0_from_json.csv",
                # 'edge': f"{npy_dir}/triple_edges_en_simple_wiki_v0_from_json.csv",
                'text': f"{npy_dir}/text_nodes_en_simple_wiki_v0_from_json_with_emb.csv",
            }
        }
    }
    emb_list = ['node', 'text']  # Add 'edge' if needed and uncomment the related path lines
    for emb in emb_list:
        npy_path = keyword_to_paths[keyword]['npy'][emb]
        index_path = keyword_to_paths[keyword]['index'][emb]
        index_type = keyword_to_paths[keyword]['index_type'][emb]
        logging.info(f"Index {index_path}, Building...")
        # For cc_en the recommended number of training samples is 600_000_000; for the rest we can afford to train on all data.
        build_faiss_from_npy(index_type, index_path, npy_path)

    # # Test the index
    # for emb in emb_list:
    #     index_path = keyword_to_paths[keyword]['index'][emb]
    #     print(f"Index {index_path}, Testing...")
    #     test_and_search_faiss_index(index_path, keyword_to_paths[keyword]['csv'][emb])

if __name__ == "__main__":
    # keyword = "cc_en"  # Replace with your actual keyword
    # logging.basicConfig(
    #     filename=f'{keyword}_faiss_creation.log',  # Log file
    #     level=logging.INFO,  # Set the logging level
    #     format='%(asctime)s - %(levelname)s - %(message)s'  # Log format
    # )

    # argparser = argparse.ArgumentParser(description="Train and write FAISS indexes for LKG construction.")
    # argparser.add_argument("--npy_dir", type=str, default="./import", help="Directory containing the .npy files.")
    # argparser.add_argument("--keyword", type=str, default=keyword, help="Keyword to select the dataset.")

    # args = argparser.parse_args()
    # keyword = args.keyword
    # npy_dir = args.npy_dir

    # train_and_write_indexes(keyword, npy_dir)
    # index_type = "IVF65536_HNSW32,Flat"
    index_type = "HNSW,Flat"

    output_directory = "/home/jbai/AutoSchemaKG/import/Dulce"
    filename_pattern = "Dulce"

    build_faiss_from_npy(
        index_type=index_type,
        index_path=f"{output_directory}/vector_index/triple_nodes_{filename_pattern}_from_json_with_emb_non_norm.index",
        npy_path=f"{output_directory}/vector_index/triple_nodes_{filename_pattern}_from_json_with_emb.npy",
    )

    build_faiss_from_npy(
        index_type=index_type,
        index_path=f"{output_directory}/vector_index/text_nodes_{filename_pattern}_from_json_with_emb_non_norm.index",
        npy_path=f"{output_directory}/vector_index/text_nodes_{filename_pattern}_from_json_with_emb.npy",
    )

    build_faiss_from_npy(
        index_type=index_type,
        index_path=f"{output_directory}/vector_index/triple_edges_{filename_pattern}_from_json_with_concept_with_emb_non_norm.index",
        npy_path=f"{output_directory}/vector_index/triple_edges_{filename_pattern}_from_json_with_concept_with_emb.npy",
    )