first commit

闫旭隆 committed 2025-09-24 09:29:12 +08:00
parent 6339cdebb9
commit 2308536f66
360 changed files with 136381 additions and 0 deletions


@@ -0,0 +1,160 @@
import csv
from tqdm import tqdm

def check_created_csv_header(keyword, csv_dir):
    keyword_to_paths = {
        'cc_en': {
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_cc_en_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_cc_en_from_json_without_emb_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_cc_en_from_json_with_numeric_id.csv",
            # NB: as written, this entry points at the pes2o_abstract concept file
            'concept_with_numeric_id': f"{csv_dir}/concept_nodes_pes2o_abstract_from_json_without_emb_with_numeric_id.csv",
        },
        'pes2o_abstract': {
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_pes2o_abstract_from_json_without_emb_full_concept_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_pes2o_abstract_from_json_with_numeric_id.csv",
        },
        'en_simple_wiki_v0': {
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json_without_emb_full_concept_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json_with_numeric_id.csv",
        },
    }
    for key, path in keyword_to_paths[keyword].items():
        with open(path) as infile:
            reader = csv.reader(infile)
            header = next(reader)
            print(f"Header of {key}: {header}")
            # Print the first data row as a sample
            for i, row in enumerate(reader):
                if i < 1:
                    print(row)
                else:
                    break

def _insert_numeric_id_column(in_csv, out_csv, anchor_col):
    """Copy in_csv to out_csv, inserting a sequential 'numeric_id' column before anchor_col."""
    with open(in_csv) as infile, open(out_csv, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        header = next(reader)
        print(header)
        anchor_index = header.index(anchor_col)
        header.insert(anchor_index, 'numeric_id')  # Add the new column name
        writer.writerow(header)
        for row_number, row in tqdm(enumerate(reader), desc="Adding numeric ID"):
            row.insert(anchor_index, row_number)  # Add the numeric ID before the anchor column
            writer.writerow(row)

def add_csv_columns(node_csv, edge_csv, text_csv, node_with_numeric_id, edge_with_numeric_id, text_with_numeric_id):
    _insert_numeric_id_column(node_csv, node_with_numeric_id, ':LABEL')
    _insert_numeric_id_column(edge_csv, edge_with_numeric_id, ':TYPE')  # edges use :TYPE rather than :LABEL
    _insert_numeric_id_column(text_csv, text_with_numeric_id, ':LABEL')

if __name__ == "__main__":
    keyword = "en_simple_wiki_v0"
    csv_dir = "./import"  # Change this to your CSV directory
    keyword_to_paths = {
        'cc_en': {
            'node_csv': f"{csv_dir}/triple_nodes_cc_en_from_json_without_emb.csv",
            'edge_csv': f"{csv_dir}/triple_edges_cc_en_from_json_without_emb.csv",
            'text_csv': f"{csv_dir}/text_nodes_cc_en_from_json.csv",
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_cc_en_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_cc_en_from_json_without_emb_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_cc_en_from_json_with_numeric_id.csv",
        },
        'pes2o_abstract': {
            'node_csv': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json_without_emb.csv",
            'edge_csv': f"{csv_dir}/triple_edges_pes2o_abstract_from_json_without_emb_full_concept.csv",
            'text_csv': f"{csv_dir}/text_nodes_pes2o_abstract_from_json.csv",
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_pes2o_abstract_from_json_without_emb_full_concept_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_pes2o_abstract_from_json_with_numeric_id.csv",
        },
        'en_simple_wiki_v0': {
            'node_csv': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json_without_emb.csv",
            'edge_csv': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json_without_emb_full_concept.csv",
            'text_csv': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json.csv",
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json_without_emb_full_concept_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json_with_numeric_id.csv",
        },
    }
    paths = keyword_to_paths[keyword]
    add_csv_columns(paths['node_csv'], paths['edge_csv'], paths['text_csv'],
                    paths['node_with_numeric_id'], paths['edge_with_numeric_id'], paths['text_with_numeric_id'])
    # check_created_csv_header(keyword, csv_dir)


@@ -0,0 +1,189 @@
import networkx as nx
import csv
import ast
import hashlib
import os
from atlas_rag.kg_construction.triple_config import ProcessingConfig
import pickle

def get_node_id(entity_name, entity_to_id=None):
    """Return the existing hash-based ID for an entity, or create and cache a new one."""
    if entity_to_id is None:
        entity_to_id = {}
    if entity_name not in entity_to_id:
        # Use SHA-256 to generate a deterministic, collision-resistant ID
        hash_object = hashlib.sha256(entity_name.encode('utf-8'))
        hash_hex = hash_object.hexdigest()  # Hexadecimal representation of the hash
        entity_to_id[entity_name] = hash_hex  # The full hex digest is used as the ID
    return entity_to_id[entity_name]

def csvs_to_temp_graphml(triple_node_file, triple_edge_file, config: ProcessingConfig = None):
    g = nx.DiGraph()
    entity_to_id = {}
    # Add triple nodes
    with open(triple_node_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            node_id = row["name:ID"]
            mapped_id = get_node_id(node_id, entity_to_id)
            if mapped_id not in g.nodes:
                g.add_node(mapped_id, id=node_id, type=row["type"])
    # Add triple edges
    with open(triple_edge_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            start_id = get_node_id(row[":START_ID"], entity_to_id)
            end_id = get_node_id(row[":END_ID"], entity_to_id)
            # Check if edge already exists to prevent duplicates
            if not g.has_edge(start_id, end_id):
                g.add_edge(start_id, end_id, relation=row["relation"], type=row[":TYPE"])
    # Save the intermediate graph as a pickle file
    output_name = f"{config.output_directory}/kg_graphml/{config.filename_pattern}_without_concept.pkl"
    # Create the output directory if it does not exist
    output_dir = os.path.dirname(output_name)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_name, 'wb') as output_file:
        pickle.dump(g, output_file)
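
# Illustrative sketch (added, not part of the original module): the pickle
# written by csvs_to_temp_graphml can be reloaded like this; the path is
# reconstructed from the same config fields the function uses above.
def load_temp_graph(config: ProcessingConfig) -> nx.DiGraph:
    path = f"{config.output_directory}/kg_graphml/{config.filename_pattern}_without_concept.pkl"
    with open(path, 'rb') as f:
        return pickle.load(f)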

def csvs_to_graphml(triple_node_file, text_node_file, concept_node_file,
                    triple_edge_file, text_edge_file, concept_edge_file,
                    output_file):
    '''
    Convert multiple CSV files into a single GraphML file.

    Types of nodes added to the graph:
    - Triple nodes: nodes representing triples, with properties like subject, predicate, object.
    - Text nodes: nodes representing text, with properties like text content.
    - Concept nodes: nodes representing concepts, with properties like concept name and type.

    Types of edges added to the graph:
    - Triple edges: edges representing relationships between triples, with properties like relation type.
    - Text edges: edges representing relationships between text and nodes, with properties like text type.
    - Concept edges: edges representing relationships between concepts and nodes, with properties like concept type.

    DiGraph networkx attributes:
    Node:
        - type: type of the node (e.g., entity, event, text, concept).
        - file_id: list of text IDs the node is associated with.
        - id: node name.
    Edge:
        - relation: relation name.
        - file_id: list of text IDs the edge is associated with.
        - type: type of the edge (e.g., Source, Relation, Concept).
        - synsets: list of synsets associated with the edge.
    '''
    g = nx.DiGraph()
    entity_to_id = {}
    # Add triple nodes
    with open(triple_node_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            node_id = row["name:ID"]
            mapped_id = get_node_id(node_id, entity_to_id)
            # Check if node already exists to prevent duplicates
            if mapped_id not in g.nodes:
                g.add_node(mapped_id, id=node_id, type=row["type"])
    # Add text nodes
    with open(text_node_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            node_id = row["text_id:ID"]
            # Check if node already exists to prevent duplicates
            if node_id not in g.nodes:
                g.add_node(node_id, file_id=node_id, id=row["original_text"], type="passage")
    # Add concept nodes
    with open(concept_node_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            node_id = row["concept_id:ID"]
            # Check if node already exists to prevent duplicates
            if node_id not in g.nodes:
                g.add_node(node_id, file_id="concept_file", id=row["name"], type="concept")
    # file_id for triple and concept nodes is filled in while adding the edges below
    # Add triple edges
    with open(triple_edge_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            start_id = get_node_id(row[":START_ID"], entity_to_id)
            end_id = get_node_id(row[":END_ID"], entity_to_id)
            # Check if edge already exists to prevent duplicates
            if not g.has_edge(start_id, end_id):
                g.add_edge(start_id, end_id, relation=row["relation"], type=row[":TYPE"])
            # Add file_id to start and end nodes if they are triple or concept nodes
            for node_id in [start_id, end_id]:
                if g.nodes[node_id]['type'] in ['triple', 'concept'] and 'file_id' not in g.nodes[node_id]:
                    g.nodes[node_id]['file_id'] = row.get("file_id", "triple_file")
            # Add concepts to the edge, avoiding duplicates
            concepts = ast.literal_eval(row["concepts"])
            for concept in concepts:
                if "concepts" not in g.edges[start_id, end_id]:
                    g.edges[start_id, end_id]['concepts'] = str(concept)
                else:
                    current_concepts = g.edges[start_id, end_id]['concepts'].split(",")
                    if str(concept) not in current_concepts:
                        g.edges[start_id, end_id]['concepts'] += "," + str(concept)
    # Add text edges
    with open(text_edge_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            start_id = get_node_id(row[":START_ID"], entity_to_id)
            end_id = row[":END_ID"]
            # Check if edge already exists to prevent duplicates
            if not g.has_edge(start_id, end_id):
                g.add_edge(start_id, end_id, relation="mention in", type=row[":TYPE"])
            # Record the passage ID in the start node's file_id list
            if 'file_id' in g.nodes[start_id]:
                g.nodes[start_id]['file_id'] += "," + str(end_id)
            else:
                g.nodes[start_id]['file_id'] = str(end_id)
    # Add concept edges between triple nodes and concept nodes
    with open(concept_edge_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            start_id = get_node_id(row[":START_ID"], entity_to_id)
            end_id = row[":END_ID"]  # the end ID is a concept node ID
            if not g.has_edge(start_id, end_id):
                g.add_edge(start_id, end_id, relation=row["relation"], type=row[":TYPE"])
    # Write to GraphML, creating the output directory if needed
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    nx.write_graphml(g, output_file, infer_numeric_types=True)

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Convert CSV files to GraphML format.')
    parser.add_argument('--triple_node_file', type=str, required=True, help='Path to the triple node CSV file.')
    parser.add_argument('--text_node_file', type=str, required=True, help='Path to the text node CSV file.')
    parser.add_argument('--concept_node_file', type=str, required=True, help='Path to the concept node CSV file.')
    parser.add_argument('--triple_edge_file', type=str, required=True, help='Path to the triple edge CSV file.')
    parser.add_argument('--text_edge_file', type=str, required=True, help='Path to the text edge CSV file.')
    parser.add_argument('--concept_edge_file', type=str, required=True, help='Path to the concept edge CSV file.')
    parser.add_argument('--output_file', type=str, required=True, help='Path to the output GraphML file.')
    args = parser.parse_args()
    csvs_to_graphml(args.triple_node_file, args.text_node_file, args.concept_node_file,
                    args.triple_edge_file, args.text_edge_file, args.concept_edge_file,
                    args.output_file)
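
# Example invocation (added; the script name and paths are illustrative,
# assuming this file is saved as csvs_to_graphml.py):
#   python csvs_to_graphml.py \
#       --triple_node_file import/triple_nodes.csv \
#       --text_node_file import/text_nodes.csv \
#       --concept_node_file import/concept_nodes.csv \
#       --triple_edge_file import/triple_edges.csv \
#       --text_edge_file import/text_edges.csv \
#       --concept_edge_file import/concept_edges.csv \
#       --output_file kg_graphml/kg.graphml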


@@ -0,0 +1,70 @@
import pandas as pd
import numpy as np
from ast import literal_eval # Safer string-to-list conversion
import os
CHUNKSIZE = 100_000 # Adjust based on your RAM (100K rows per chunk)
EMBEDDING_COL = "embedding:STRING" # Column name with embeddings
# DIMENSION = 32 # Update with your embedding dimension
ENTITY_ONLY = True

def parse_embedding(embed_str):
    """Convert an embedding string such as "[0.1, 0.2, ...]" to a numpy array."""
    # literal_eval parses the bracketed string into a Python list
    return np.array(literal_eval(embed_str), dtype=np.float32)

# Stream embeddings from the CSV into a binary file, one np.save'd chunk at a time
def convert_csv_to_npy(csv_path, npy_path):
    total_embeddings = 0
    # Create the output directory if it does not exist
    os.makedirs(os.path.dirname(npy_path), exist_ok=True)
    with open(npy_path, "wb") as f:
        pass  # Initialize an empty file
    # Process the CSV in chunks
    for chunk_idx, df_chunk in enumerate(
        pd.read_csv(csv_path, chunksize=CHUNKSIZE, usecols=[EMBEDDING_COL])
    ):
        # Parse embeddings
        embeddings = np.stack(
            df_chunk[EMBEDDING_COL].apply(parse_embedding).values
        )
        # Verify dimensions
        # assert embeddings.shape[1] == DIMENSION, \
        #     f"Dimension mismatch at chunk {chunk_idx}"
        total_embeddings += embeddings.shape[0]
        # Append this chunk to the .npy file
        with open(npy_path, "ab") as f:
            np.save(f, embeddings.astype(np.float32))
        print(f"Processed chunk {chunk_idx} ({total_embeddings} rows so far)")
    print(f"Total number of embeddings: {total_embeddings}")
    print("Conversion complete!")

if __name__ == "__main__":
    keyword = 'cc_en'  # Change this to your desired keyword
    csv_dir = "./import"  # Change this to your CSV directory
    keyword_to_paths = {
        'cc_en': {
            'node_csv': f"{csv_dir}/triple_nodes_cc_en_from_json_2.csv",
            # 'edge_csv': f"{csv_dir}/triple_edges_cc_en_from_json_2.csv",
            'text_csv': f"{csv_dir}/text_nodes_cc_en_from_json_with_emb.csv",
        },
        'pes2o_abstract': {
            'node_csv': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json.csv",
            # 'edge_csv': f"{csv_dir}/triple_edges_pes2o_abstract_from_json.csv",
            'text_csv': f"{csv_dir}/text_nodes_pes2o_abstract_from_json_with_emb.csv",
        },
        'en_simple_wiki_v0': {
            'node_csv': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json.csv",
            # 'edge_csv': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json.csv",
            'text_csv': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json_with_emb.csv",
        },
    }
    for key, path in keyword_to_paths[keyword].items():
        npy_path = path.replace(".csv", ".npy")
        convert_csv_to_npy(path, npy_path)
        print(f"Converted {path} to {npy_path}")


@@ -0,0 +1,27 @@
import os
import glob

def merge_csv_files(output_file, input_dir):
    """
    Merge all CSV files in the input directory into a single output file.

    Args:
        output_file (str): Path to the output CSV file.
        input_dir (str): Directory containing the input CSV files.
    """
    # Delete the output file if it already exists
    if os.path.exists(output_file):
        os.remove(output_file)
    # Write the header to the output file
    with open(output_file, 'w') as outfile:
        outfile.write("node,conceptualized_node,node_type\n")
    # Append the contents of all CSV files in the input directory
    for csv_file in glob.glob(os.path.join(input_dir, '*.csv')):
        # Skip the output file itself in case it lives inside input_dir
        if os.path.abspath(csv_file) == os.path.abspath(output_file):
            continue
        with open(csv_file, 'r') as infile:
            # Skip the header line
            next(infile)
            # Append the remaining lines to the output file
            with open(output_file, 'a') as outfile:
                outfile.writelines(infile)
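
# Example usage (added; paths are illustrative):
#   merge_csv_files("./merged/concept_csvs_merged.csv", "./concept_csvs")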