first commit

闫旭隆
2025-10-17 09:31:28 +08:00
commit 4698145045
589 changed files with 196795 additions and 0 deletions

View File

@@ -0,0 +1,160 @@
import csv
from tqdm import tqdm
def check_created_csv_header(keyword, csv_dir):
keyword_to_paths ={
'cc_en':{
'node_with_numeric_id': f"{csv_dir}/triple_nodes_cc_en_from_json_without_emb_with_numeric_id.csv",
'edge_with_numeric_id': f"{csv_dir}/triple_edges_cc_en_from_json_without_emb_with_numeric_id.csv",
'text_with_numeric_id': f"{csv_dir}/text_nodes_cc_en_from_json_with_numeric_id.csv",
'concept_with_numeric_id': f"{csv_dir}/concept_nodes_pes2o_abstract_from_json_without_emb_with_numeric_id.csv",
},
'pes2o_abstract':{
'node_with_numeric_id': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json_without_emb_with_numeric_id.csv",
'edge_with_numeric_id': f"{csv_dir}/triple_edges_pes2o_abstract_from_json_without_emb_full_concept_with_numeric_id.csv",
'text_with_numeric_id': f"{csv_dir}/text_nodes_pes2o_abstract_from_json_with_numeric_id.csv",
},
'en_simple_wiki_v0':{
'node_with_numeric_id': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json_without_emb_with_numeric_id.csv",
'edge_with_numeric_id': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json_without_emb_full_concept_with_numeric_id.csv",
'text_with_numeric_id': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json_with_numeric_id.csv",
},
}
for key, path in keyword_to_paths[keyword].items():
with open(path) as infile:
reader = csv.reader(infile)
header = next(reader)
print(f"Header of {key}: {header}")
            # print the first data row as a sanity check
for i, row in enumerate(reader):
if i < 1:
print(row)
else:
break
def add_csv_columns(node_csv, edge_csv, text_csv, node_with_numeric_id, edge_with_numeric_id, text_with_numeric_id):
with open(node_csv) as infile, open(node_with_numeric_id, 'w', newline='') as outfile:
reader = csv.reader(infile)
writer = csv.writer(outfile)
header = next(reader)
print(header)
label_index = header.index(':LABEL')
header.insert(label_index, 'numeric_id') # Add new column name
writer.writerow(header)
for row_number, row in tqdm(enumerate(reader), desc="Adding numeric ID"):
row.insert(label_index, row_number) # Add numeric ID before ':LABEL'
writer.writerow(row)
with open(edge_csv) as infile, open(edge_with_numeric_id, 'w', newline='') as outfile:
reader = csv.reader(infile)
writer = csv.writer(outfile)
header = next(reader)
print(header)
label_index = header.index(':TYPE')
header.insert(label_index, 'numeric_id') # Add new column name
writer.writerow(header)
for row_number, row in tqdm(enumerate(reader), desc="Adding numeric ID"):
            row.insert(label_index, row_number)  # Add numeric ID before ':TYPE'
writer.writerow(row)
with open(text_csv) as infile, open(text_with_numeric_id, 'w', newline='') as outfile:
reader = csv.reader(infile)
writer = csv.writer(outfile)
header = next(reader)
print(header)
label_index = header.index(':LABEL')
header.insert(label_index, 'numeric_id') # Add new column name
writer.writerow(header)
for row_number, row in tqdm(enumerate(reader), desc="Adding numeric ID"):
row.insert(label_index, row_number) # Add numeric ID before ':LABEL'
writer.writerow(row)
# def add_csv_columns(keyword, csv_dir):
# keyword_to_paths ={
# 'cc_en':{
# 'node_csv': f"{csv_dir}/triple_nodes_cc_en_from_json_without_emb.csv",
# 'edge_csv': f"{csv_dir}/triple_edges_cc_en_from_json_without_emb.csv",
# 'text_csv': f"{csv_dir}/text_nodes_cc_en_from_json.csv",
# 'node_with_numeric_id': f"{csv_dir}/triple_nodes_cc_en_from_json_without_emb_with_numeric_id.csv",
# 'edge_with_numeric_id': f"{csv_dir}/triple_edges_cc_en_from_json_without_emb_with_numeric_id.csv",
# 'text_with_numeric_id': f"{csv_dir}/text_nodes_cc_en_from_json_with_numeric_id.csv"
# },
# 'pes2o_abstract':{
# 'node_csv': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json_without_emb.csv",
# 'edge_csv': f"{csv_dir}/triple_edges_pes2o_abstract_from_json_without_emb_full_concept.csv",
# 'text_csv': f"{csv_dir}/text_nodes_pes2o_abstract_from_json.csv",
# 'node_with_numeric_id': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json_without_emb_with_numeric_id.csv",
# 'edge_with_numeric_id': f"{csv_dir}/triple_edges_pes2o_abstract_from_json_without_emb_full_concept_with_numeric_id.csv",
# 'text_with_numeric_id': f"{csv_dir}/text_nodes_pes2o_abstract_from_json_with_numeric_id.csv"
# },
# 'en_simple_wiki_v0':{
# 'node_csv': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json_without_emb.csv",
# 'edge_csv': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json_without_emb_full_concept.csv",
# 'text_csv': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json.csv",
# 'node_with_numeric_id': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json_without_emb_with_numeric_id.csv",
# 'edge_with_numeric_id': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json_without_emb_full_concept_with_numeric_id.csv",
# 'text_with_numeric_id': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json_with_numeric_id.csv"
# },
# }
# # ouput node
# with open(keyword_to_paths[keyword]['node_csv']) as infile, open(keyword_to_paths[keyword]['node_with_numeric_id'], 'w') as outfile:
# reader = csv.reader(infile)
# writer = csv.writer(outfile)
# # Read the header
# header = next(reader)
# print(header)
# # Insert 'numeric_id' before ':LABEL'
# label_index = header.index(':LABEL')
# header.insert(label_index, 'numeric_id') # Add new column name
# writer.writerow(header)
# # Process each row and add a numeric ID
# for row_number, row in tqdm(enumerate(reader), desc="Adding numeric ID"):
# row.insert(label_index, row_number) # Add numeric ID before ':LABEL'
# writer.writerow(row)
# # output edge (TYPE instead of LABEL for edge)
# with open(keyword_to_paths[keyword]['edge_csv']) as infile, open(keyword_to_paths[keyword]['edge_with_numeric_id'], 'w') as outfile:
# reader = csv.reader(infile)
# writer = csv.writer(outfile)
# # Read the header
# header = next(reader)
# print(header)
# # Insert 'numeric_id' before ':TYPE'
# label_index = header.index(':TYPE')
# header.insert(label_index, 'numeric_id') # Add new column name
# writer.writerow(header)
# # Process each row and add a numeric ID
# for row_number, row in tqdm(enumerate(reader), desc="Adding numeric ID"):
# row.insert(label_index, row_number) # Add numeric ID before ':LABEL'
# writer.writerow(row)
# # output text
# with open(keyword_to_paths[keyword]['text_csv']) as infile, open(keyword_to_paths[keyword]['text_with_numeric_id'], 'w') as outfile:
# reader = csv.reader(infile)
# writer = csv.writer(outfile)
# # Read the header
# header = next(reader)
# print(header)
# # Insert 'numeric_id' before ':LABEL'
# label_index = header.index(':LABEL')
# header.insert(label_index, 'numeric_id') # Add new column name
# writer.writerow(header)
# # Process each row and add a numeric ID
# for row_number, row in tqdm(enumerate(reader), desc="Adding numeric ID"):
# row.insert(label_index, row_number) # Add numeric ID before ':LABEL'
# writer.writerow(row)
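# Illustrative helper (a sketch, not called by this script): map numeric_id back to the node
# name from one of the *_with_numeric_id.csv files written by add_csv_columns. Assumes the node
# header contains 'name:ID' plus the inserted 'numeric_id' column.
def load_numeric_id_mapping(node_with_numeric_id):
    mapping = {}
    with open(node_with_numeric_id, newline='') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            # numeric_id was written as the running row number, so it is unique per row
            mapping[int(row['numeric_id'])] = row['name:ID']
    return mapping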
if __name__ == "__main__":
keyword = "en_simple_wiki_v0"
csv_dir = "./import" # Change this to your CSV directory
add_csv_columns(keyword, csv_dir)
# check_created_csv_header(keyword)

View File

@@ -0,0 +1,189 @@
import networkx as nx
import csv
import ast
import hashlib
import os
from atlas_rag.kg_construction.triple_config import ProcessingConfig
import pickle
def get_node_id(entity_name, entity_to_id=None):
    """Returns the existing ID for an entity, or creates a new hash-based one."""
    if entity_to_id is None:
        entity_to_id = {}
    if entity_name not in entity_to_id:
        # Use SHA-256 to generate a deterministic, collision-resistant ID
        hash_object = hashlib.sha256(entity_name.encode('utf-8'))
        hash_hex = hash_object.hexdigest()  # hexadecimal representation of the hash
        # Use the full hex digest as the ID (shorten it here if more compact IDs are needed)
        entity_to_id[entity_name] = hash_hex
    return entity_to_id[entity_name]
def csvs_to_temp_graphml(triple_node_file, triple_edge_file, config:ProcessingConfig=None):
g = nx.DiGraph()
entity_to_id = {}
# Add triple nodes
with open(triple_node_file, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
node_id = row["name:ID"]
mapped_id = get_node_id(node_id, entity_to_id)
if mapped_id not in g.nodes:
g.add_node(mapped_id, id=node_id, type=row["type"])
# Add triple edges
with open(triple_edge_file, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
start_id = get_node_id(row[":START_ID"], entity_to_id)
end_id = get_node_id(row[":END_ID"], entity_to_id)
# Check if edge already exists to prevent duplicates
if not g.has_edge(start_id, end_id):
g.add_edge(start_id, end_id, relation=row["relation"], type=row[":TYPE"])
    # Save the graph as a pickle under the configured output directory
output_name = f"{config.output_directory}/kg_graphml/{config.filename_pattern}_without_concept.pkl"
# check if output file directory exists
output_dir = os.path.dirname(output_name)
if output_dir and not os.path.exists(output_dir):
os.makedirs(output_dir)
# store the graph to a pickle file
with open(output_name, 'wb') as output_file:
pickle.dump(g, output_file)
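# Minimal sketch (assumed usage, not invoked anywhere in this module): reload the pickle written
# by csvs_to_temp_graphml and report its size. pickle_path is whichever
# {output_directory}/kg_graphml/{filename_pattern}_without_concept.pkl your ProcessingConfig produced.
def load_temp_graph(pickle_path):
    with open(pickle_path, 'rb') as f:
        g = pickle.load(f)
    print(f"Loaded graph with {g.number_of_nodes()} nodes and {g.number_of_edges()} edges")
    return g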
def csvs_to_graphml(triple_node_file, text_node_file, concept_node_file,
triple_edge_file, text_edge_file, concept_edge_file,
output_file):
'''
Convert multiple CSV files into a single GraphML file.
Types of nodes to be added to the graph:
- Triple nodes: Nodes representing triples, with properties like subject, predicate, object.
- Text nodes: Nodes representing text, with properties like text content.
- Concept nodes: Nodes representing concepts, with properties like concept name and type.
Types of edges to be added to the graph:
- Triple edges: Edges representing relationships between triples, with properties like relation type.
- Text edges: Edges representing relationships between text and nodes, with properties like text type.
- Concept edges: Edges representing relationships between concepts and nodes, with properties like concept type.
DiGraph networkx attributes:
Node:
- type: Type of the node (e.g., entity, event, text, concept).
- file_id: List of text IDs the node is associated with.
- id: Node Name
Edge:
- relation: relation name
- file_id: List of text IDs the edge is associated with.
- type: Type of the edge (e.g., Source, Relation, Concept).
- synsets: List of synsets associated with the edge.
'''
g = nx.DiGraph()
entity_to_id = {}
# Add triple nodes
with open(triple_node_file, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
node_id = row["name:ID"]
mapped_id = get_node_id(node_id, entity_to_id)
# Check if node already exists to prevent duplicates
if mapped_id not in g.nodes:
g.add_node(mapped_id, id=node_id, type=row["type"])
# Add text nodes
with open(text_node_file, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
node_id = row["text_id:ID"]
# Check if node already exists to prevent duplicates
if node_id not in g.nodes:
g.add_node(node_id, file_id=node_id, id=row["original_text"], type="passage")
# Add concept nodes
with open(concept_node_file, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
node_id = row["concept_id:ID"]
# Check if node already exists to prevent duplicates
if node_id not in g.nodes:
g.add_node(node_id, file_id="concept_file", id=row["name"], type="concept")
    # file_id for triple and concept nodes is filled in below, while processing the edges
# Add triple edges
with open(triple_edge_file, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
start_id = get_node_id(row[":START_ID"], entity_to_id)
end_id = get_node_id(row[":END_ID"], entity_to_id)
# Check if edge already exists to prevent duplicates
if not g.has_edge(start_id, end_id):
g.add_edge(start_id, end_id, relation=row["relation"], type=row[":TYPE"])
# Add file_id to start and end nodes if they are triple or concept nodes
for node_id in [start_id, end_id]:
if g.nodes[node_id]['type'] in ['triple', 'concept'] and 'file_id' not in g.nodes[node_id]:
g.nodes[node_id]['file_id'] = row.get("file_id", "triple_file")
# Add concepts to the edge
concepts = ast.literal_eval(row["concepts"])
for concept in concepts:
if "concepts" not in g.edges[start_id, end_id]:
g.edges[start_id, end_id]['concepts'] = str(concept)
else:
# Avoid duplicate concepts by checking if concept is already in the list
current_concepts = g.edges[start_id, end_id]['concepts'].split(",")
if str(concept) not in current_concepts:
g.edges[start_id, end_id]['concepts'] += "," + str(concept)
# Add text edges
with open(text_edge_file, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
start_id = get_node_id(row[":START_ID"], entity_to_id)
end_id = row[":END_ID"]
# Check if edge already exists to prevent duplicates
if not g.has_edge(start_id, end_id):
g.add_edge(start_id, end_id, relation="mention in", type=row[":TYPE"])
            # Record which passages mention this node by appending the text ID to its file_id
if 'file_id' in g.nodes[start_id]:
g.nodes[start_id]['file_id'] += "," + str(end_id)
else:
g.nodes[start_id]['file_id'] = str(end_id)
# Add concept edges between triple nodes and concept nodes
with open(concept_edge_file, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
start_id = get_node_id(row[":START_ID"], entity_to_id)
end_id = row[":END_ID"] # end id is concept node id
if not g.has_edge(start_id, end_id):
g.add_edge(start_id, end_id, relation=row["relation"], type=row[":TYPE"])
# Write to GraphML
# check if output file directory exists
output_dir = os.path.dirname(output_file)
if output_dir and not os.path.exists(output_dir):
os.makedirs(output_dir)
nx.write_graphml(g, output_file, infer_numeric_types=True)
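# Hedged example (not part of the conversion itself): reload the GraphML written above and
# summarise the node/edge attributes documented in the csvs_to_graphml docstring.
def inspect_graphml(output_file):
    g = nx.read_graphml(output_file)
    type_counts = {}
    for _, attrs in g.nodes(data=True):
        node_type = attrs.get("type", "unknown")
        type_counts[node_type] = type_counts.get(node_type, 0) + 1
    print(f"Node counts by type: {type_counts}")
    print(f"Total edges: {g.number_of_edges()}")
    return g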
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Convert CSV files to GraphML format.')
parser.add_argument('--triple_node_file', type=str, required=True, help='Path to the triple node CSV file.')
parser.add_argument('--text_node_file', type=str, required=True, help='Path to the text node CSV file.')
parser.add_argument('--concept_node_file', type=str, required=True, help='Path to the concept node CSV file.')
parser.add_argument('--triple_edge_file', type=str, required=True, help='Path to the triple edge CSV file.')
parser.add_argument('--text_edge_file', type=str, required=True, help='Path to the text edge CSV file.')
parser.add_argument('--concept_edge_file', type=str, required=True, help='Path to the concept edge CSV file.')
parser.add_argument('--output_file', type=str, required=True, help='Path to the output GraphML file.')
args = parser.parse_args()
csvs_to_graphml(args.triple_node_file, args.text_node_file, args.concept_node_file,
args.triple_edge_file, args.text_edge_file, args.concept_edge_file,
args.output_file)

View File

@@ -0,0 +1,70 @@
import pandas as pd
import numpy as np
from ast import literal_eval # Safer string-to-list conversion
import os
CHUNKSIZE = 100_000 # Adjust based on your RAM (100K rows per chunk)
EMBEDDING_COL = "embedding:STRING" # Column name with embeddings
# DIMENSION = 32 # Update with your embedding dimension
ENTITY_ONLY = True
def parse_embedding(embed_str):
"""Convert embedding string to numpy array"""
# Remove brackets and convert to list
return np.array(literal_eval(embed_str), dtype=np.float32)
# Create memory-mapped numpy file
def convert_csv_to_npy(csv_path, npy_path):
total_embeddings = 0
    # Create the output directory if it does not exist
    out_dir = os.path.dirname(npy_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
with open(npy_path, "wb") as f:
pass # Initialize empty file
# Process CSV in chunks
for chunk_idx, df_chunk in enumerate(
pd.read_csv(csv_path, chunksize=CHUNKSIZE, usecols=[EMBEDDING_COL])
):
# Parse embeddings
embeddings = np.stack(
df_chunk[EMBEDDING_COL].apply(parse_embedding).values
)
# Verify dimensions
# assert embeddings.shape[1] == DIMENSION, \
# f"Dimension mismatch at chunk {chunk_idx}"
total_embeddings += embeddings.shape[0]
# Append to .npy file
with open(npy_path, "ab") as f:
np.save(f, embeddings.astype(np.float32))
print(f"Processed chunk {chunk_idx} ({CHUNKSIZE*(chunk_idx+1)} rows)")
print(f"Total number of embeddings: {total_embeddings}")
print("Conversion complete!")
if __name__ == "__main__":
keyword = 'cc_en' # Change this to your desired keyword
csv_dir="./import" # Change this to your CSV directory
keyword_to_paths ={
'cc_en':{
'node_csv': f"{csv_dir}/triple_nodes_cc_en_from_json_2.csv",
# 'edge_csv': f"{csv_dir}/triple_edges_cc_en_from_json_2.csv",
'text_csv': f"{csv_dir}/text_nodes_cc_en_from_json_with_emb.csv",
},
'pes2o_abstract':{
'node_csv': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json.csv",
# 'edge_csv': f"{csv_dir}/triple_edges_pes2o_abstract_from_json.csv",
'text_csv': f"{csv_dir}/text_nodes_pes2o_abstract_from_json_with_emb.csv",
},
'en_simple_wiki_v0':{
'node_csv': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json.csv",
# 'edge_csv': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json.csv",
'text_csv': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json_with_emb.csv",
},
}
for key, path in keyword_to_paths[keyword].items():
npy_path = path.replace(".csv", ".npy")
convert_csv_to_npy(path, npy_path)
print(f"Converted {path} to {npy_path}")

View File

@@ -0,0 +1,27 @@
import os
import glob
def merge_csv_files(output_file, input_dir):
"""
Merge all CSV files in the input directory into a single output file.
Args:
output_file (str): Path to the output CSV file.
input_dir (str): Directory containing the input CSV files.
"""
# Delete the output file if it exists
if os.path.exists(output_file):
os.remove(output_file)
# Write the header to the output file
with open(output_file, 'w') as outfile:
outfile.write("node,conceptualized_node,node_type\n")
# Append the contents of all CSV files in the input directory
for csv_file in glob.glob(os.path.join(input_dir, '*.csv')):
with open(csv_file, 'r') as infile:
# Skip the header line
next(infile)
# Append the remaining lines to the output file
with open(output_file, 'a') as outfile:
outfile.writelines(infile)
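if __name__ == "__main__":
    # Example invocation (paths are illustrative; point them at your own concept CSV shards)
    merge_csv_files(output_file="./merged_concepts.csv", input_dir="./concept_csvs")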

View File

@@ -0,0 +1,277 @@
from tqdm import tqdm
import argparse
import os
import csv
import json
import re
import hashlib
# Increase the field size limit
csv.field_size_limit(10 * 1024 * 1024) # 10 MB limit
# Function to compute a hash ID from text
def compute_hash_id(text):
# Use SHA-256 to generate a hash
hash_object = hashlib.sha256(text.encode('utf-8'))
return hash_object.hexdigest() # Return hash as a hex string
def clean_text(text):
    # Replace common control/whitespace characters with spaces, and semicolons with commas
    # so they cannot break the CSV ("\x1b" is the ESC character)
    for ch in ("\n", "\r", "\t", "\v", "\f", "\b", "\a", "\x1b"):
        text = text.replace(ch, " ")
    new_text = text.replace(";", ",")
    # remove NUL as well
    new_text = new_text.replace("\x00", "")
    new_text = re.sub(r'\s+', ' ', new_text).strip()
    return new_text
def remove_NUL(text):
return text.replace("\x00", "")
def json2csv(dataset, data_dir, output_dir, test=False):
"""
Convert JSON files to CSV files for nodes, edges, and missing concepts.
Args:
dataset (str): Name of the dataset.
data_dir (str): Directory containing the JSON files.
output_dir (str): Directory to save the output CSV files.
test (bool): If True, run in test mode (process only 3 files).
"""
visited_nodes = set()
visited_hashes = set()
all_entities = set()
all_events = set()
all_relations = set()
file_dir_list = [f for f in os.listdir(data_dir) if dataset in f]
file_dir_list = sorted(file_dir_list)
if test:
file_dir_list = file_dir_list[:3]
print("Loading data from the json files")
print("Number of files: ", len(file_dir_list))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Define output file paths
node_csv_without_emb = os.path.join(output_dir, f"triple_nodes_{dataset}_from_json_without_emb.csv")
edge_csv_without_emb = os.path.join(output_dir, f"triple_edges_{dataset}_from_json_without_emb.csv")
node_text_file = os.path.join(output_dir, f"text_nodes_{dataset}_from_json.csv")
edge_text_file = os.path.join(output_dir, f"text_edges_{dataset}_from_json.csv")
missing_concepts_file = os.path.join(output_dir, f"missing_concepts_{dataset}_from_json.csv")
if test:
node_text_file = os.path.join(output_dir, f"text_nodes_{dataset}_from_json_test.csv")
edge_text_file = os.path.join(output_dir, f"text_edges_{dataset}_from_json_test.csv")
node_csv_without_emb = os.path.join(output_dir, f"triple_nodes_{dataset}_from_json_without_emb_test.csv")
edge_csv_without_emb = os.path.join(output_dir, f"triple_edges_{dataset}_from_json_without_emb_test.csv")
missing_concepts_file = os.path.join(output_dir, f"missing_concepts_{dataset}_from_json_test.csv")
# Open CSV files for writing
with open(node_text_file, "w", newline='', encoding='utf-8', errors='ignore') as csvfile_node_text, \
open(edge_text_file, "w", newline='', encoding='utf-8', errors='ignore') as csvfile_edge_text, \
open(node_csv_without_emb, "w", newline='', encoding='utf-8', errors='ignore') as csvfile_node, \
open(edge_csv_without_emb, "w", newline='', encoding='utf-8', errors='ignore') as csvfile_edge:
csv_writer_node_text = csv.writer(csvfile_node_text)
csv_writer_edge_text = csv.writer(csvfile_edge_text)
writer_node = csv.writer(csvfile_node)
writer_edge = csv.writer(csvfile_edge)
# Write headers
csv_writer_node_text.writerow(["text_id:ID", "original_text", ":LABEL"])
csv_writer_edge_text.writerow([":START_ID", ":END_ID", ":TYPE"])
writer_node.writerow(["name:ID", "type", "concepts", "synsets", ":LABEL"])
writer_edge.writerow([":START_ID", ":END_ID", "relation", "concepts", "synsets", ":TYPE"])
# Process each file
for file_dir in tqdm(file_dir_list):
print("Processing file for file ids: ", file_dir)
with open(os.path.join(data_dir, file_dir), "r") as jsonfile:
for line in jsonfile:
data = json.loads(line.strip())
original_text = data["original_text"]
original_text = remove_NUL(original_text)
if "Here is the passage." in original_text:
original_text = original_text.split("Here is the passage.")[-1]
eot_token = "<|eot_id|>"
original_text = original_text.split(eot_token)[0]
text_hash_id = compute_hash_id(original_text)
# Write the original text as nodes
if text_hash_id not in visited_hashes:
visited_hashes.add(text_hash_id)
csv_writer_node_text.writerow([text_hash_id, original_text, "Text"])
file_id = str(data["id"])
entity_relation_dict = data["entity_relation_dict"]
event_entity_relation_dict = data["event_entity_relation_dict"]
event_relation_dict = data["event_relation_dict"]
# Process entity triples
entity_triples = []
for entity_triple in entity_relation_dict:
try:
assert isinstance(entity_triple["Head"], str)
assert isinstance(entity_triple["Relation"], str)
assert isinstance(entity_triple["Tail"], str)
head_entity = entity_triple["Head"]
relation = entity_triple["Relation"]
tail_entity = entity_triple["Tail"]
# Clean the text
head_entity = clean_text(head_entity)
relation = clean_text(relation)
tail_entity = clean_text(tail_entity)
if head_entity.isspace() or len(head_entity) == 0 or tail_entity.isspace() or len(tail_entity) == 0:
continue
entity_triples.append((head_entity, relation, tail_entity))
                        except (AssertionError, KeyError, TypeError):
print(f"Error processing entity triple: {entity_triple}")
continue
# Process event triples
event_triples = []
for event_triple in event_relation_dict:
try:
assert isinstance(event_triple["Head"], str)
assert isinstance(event_triple["Relation"], str)
assert isinstance(event_triple["Tail"], str)
head_event = event_triple["Head"]
relation = event_triple["Relation"]
tail_event = event_triple["Tail"]
# Clean the text
head_event = clean_text(head_event)
relation = clean_text(relation)
tail_event = clean_text(tail_event)
if head_event.isspace() or len(head_event) == 0 or tail_event.isspace() or len(tail_event) == 0:
continue
event_triples.append((head_event, relation, tail_event))
                        except (AssertionError, KeyError, TypeError):
print(f"Error processing event triple: {event_triple}")
# Process event-entity triples
event_entity_triples = []
for event_entity_participations in event_entity_relation_dict:
if "Event" not in event_entity_participations or "Entity" not in event_entity_participations:
continue
if not isinstance(event_entity_participations["Event"], str) or not isinstance(event_entity_participations["Entity"], list):
continue
for entity in event_entity_participations["Entity"]:
if not isinstance(entity, str):
continue
entity = clean_text(entity)
event = clean_text(event_entity_participations["Event"])
if event.isspace() or len(event) == 0 or entity.isspace() or len(entity) == 0:
continue
event_entity_triples.append((event, "is participated by", entity))
# Write nodes and edges to CSV files
for entity_triple in entity_triples:
head_entity, relation, tail_entity = entity_triple
if head_entity is None or tail_entity is None or relation is None:
continue
if head_entity.isspace() or tail_entity.isspace() or relation.isspace():
continue
if len(head_entity) == 0 or len(tail_entity) == 0 or len(relation) == 0:
continue
# Add nodes to files
if head_entity not in visited_nodes:
visited_nodes.add(head_entity)
all_entities.add(head_entity)
writer_node.writerow([head_entity, "entity", [], [], "Node"])
csv_writer_edge_text.writerow([head_entity, text_hash_id, "Source"])
if tail_entity not in visited_nodes:
visited_nodes.add(tail_entity)
all_entities.add(tail_entity)
writer_node.writerow([tail_entity, "entity", [], [], "Node"])
csv_writer_edge_text.writerow([tail_entity, text_hash_id, "Source"])
all_relations.add(relation)
writer_edge.writerow([head_entity, tail_entity, relation, [], [], "Relation"])
for event_triple in event_triples:
head_event, relation, tail_event = event_triple
if head_event is None or tail_event is None or relation is None:
continue
if head_event.isspace() or tail_event.isspace() or relation.isspace():
continue
if len(head_event) == 0 or len(tail_event) == 0 or len(relation) == 0:
continue
# Add nodes to files
if head_event not in visited_nodes:
visited_nodes.add(head_event)
all_events.add(head_event)
writer_node.writerow([head_event, "event", [], [], "Node"])
csv_writer_edge_text.writerow([head_event, text_hash_id, "Source"])
if tail_event not in visited_nodes:
visited_nodes.add(tail_event)
all_events.add(tail_event)
writer_node.writerow([tail_event, "event", [], [], "Node"])
csv_writer_edge_text.writerow([tail_event, text_hash_id, "Source"])
all_relations.add(relation)
writer_edge.writerow([head_event, tail_event, relation, [], [], "Relation"])
for event_entity_triple in event_entity_triples:
head_event, relation, tail_entity = event_entity_triple
if head_event is None or tail_entity is None or relation is None:
continue
if head_event.isspace() or tail_entity.isspace() or relation.isspace():
continue
if len(head_event) == 0 or len(tail_entity) == 0 or len(relation) == 0:
continue
# Add nodes to files
if head_event not in visited_nodes:
visited_nodes.add(head_event)
all_events.add(head_event)
writer_node.writerow([head_event, "event", [], [], "Node"])
csv_writer_edge_text.writerow([head_event, text_hash_id, "Source"])
if tail_entity not in visited_nodes:
visited_nodes.add(tail_entity)
all_entities.add(tail_entity)
writer_node.writerow([tail_entity, "entity", [], [], "Node"])
csv_writer_edge_text.writerow([tail_entity, text_hash_id, "Source"])
all_relations.add(relation)
writer_edge.writerow([head_event, tail_entity, relation, [], [], "Relation"])
# Write missing concepts to CSV
with open(missing_concepts_file, "w", newline='', encoding='utf-8', errors='ignore') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["Name", "Type"])
for entity in all_entities:
writer.writerow([entity, "Entity"])
for event in all_events:
writer.writerow([event, "Event"])
for relation in all_relations:
writer.writerow([relation, "Relation"])
print("Data to CSV completed successfully.")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, required=True, help="[pes2o_abstract, en_simple_wiki_v0, cc_en]")
parser.add_argument("--data_dir", type=str, required=True, help="Directory containing the graph raw JSON files")
parser.add_argument("--output_dir", type=str, required=True, help="Directory to save the output CSV files")
parser.add_argument("--test", action="store_true", help="Test the script")
args = parser.parse_args()
json2csv(dataset=args.dataset, data_dir=args.data_dir, output_dir=args.output_dir, test=args.test)

View File

@@ -0,0 +1,169 @@
import networkx as nx
import json
from tqdm import tqdm
import os
import hashlib
def get_node_id(entity_name, entity_to_id):
    """Returns the existing ID for an entity, or creates a new hash-based one prefixed with 'n'."""
    if entity_name not in entity_to_id:
        # Hash the entity name with MD5 to get a deterministic ID
        hash_object = hashlib.md5(entity_name.encode())
        hash_hex = hash_object.hexdigest()  # hexadecimal representation of the hash
        # Keep the first 16 hex characters of the hash, prefixed with 'n' so the ID starts with a letter
        entity_to_id[entity_name] = f'n{hash_hex[:16]}'
return entity_to_id[entity_name]
def clean_text(text):
    # Replace common control/whitespace characters with spaces, semicolons with commas, and drop
    # NUL bytes ("\x1b" is the ESC character)
    for ch in ("\n", "\r", "\t", "\v", "\f", "\b", "\a", "\x1b"):
        text = text.replace(ch, " ")
    new_text = text.replace(";", ",").replace("\x00", "")
    return new_text
def process_kg_data(input_passage_dir, input_triple_dir, output_dir, keyword):
# Get file names containing the keyword
file_names = [file for file in list(os.listdir(input_triple_dir)) if keyword in file]
print(f"Keyword: {keyword}")
print(f"Number of files: {len(file_names)}")
print(file_names)
passage_file_names = [file for file in list(os.listdir(input_passage_dir)) if keyword in file]
print(f'Passage file names: {passage_file_names}')
g = nx.DiGraph()
print("Graph created.")
entity_to_id = {}
# check if output directory exists, if not create it
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print(f"Output directory {output_dir} created.")
output_path = f"{output_dir}/{keyword}_kg_from_corpus.graphml"
# Create the original_text to node_id dictionary and add passage node to the graph
with open(f"{input_passage_dir}/{passage_file_names[0]}") as f:
data = json.load(f)
for item in tqdm(data, desc="Processing passages"):
passage_id = item["id"]
passage_text = item["text"]
node_id = get_node_id(passage_text, entity_to_id)
if passage_text.isspace() or len(passage_text) == 0:
continue
# Add the passage node to the graph
g.add_node(node_id, type="passage", id=passage_text, file_id=passage_id)
for file_name in tqdm(file_names):
print(f"Processing {file_name}")
input_file_path = f"{input_triple_dir}/{file_name}"
with open(input_file_path) as f:
for line in tqdm(f):
data = json.loads(line)
metadata = data["metadata"]
file_id = data["id"]
original_text = data["original_text"]
entity_relation_dict = data["entity_relation_dict"]
event_entity_relation_dict = data["event_entity_relation_dict"]
event_relation_dict = data["event_relation_dict"]
# Process entity triples
entity_triples = []
for entity_triple in entity_relation_dict:
if not all(key in entity_triple for key in ["Head", "Relation", "Tail"]):
continue
head_entity = clean_text(entity_triple["Head"])
relation = clean_text(entity_triple["Relation"])
tail_entity = clean_text(entity_triple["Tail"])
if head_entity.isspace() or len(head_entity) == 0 or tail_entity.isspace() or len(tail_entity) == 0:
continue
entity_triples.append((head_entity, relation, tail_entity))
# Add entity triples to the graph
for triple in entity_triples:
head_id = get_node_id(triple[0], entity_to_id)
tail_id = get_node_id(triple[2], entity_to_id)
g.add_node(head_id, type="entity", id=triple[0])
g.add_node(tail_id, type="entity", id=triple[2])
g.add_edge(head_id, get_node_id(original_text, entity_to_id), relation='mention in')
g.add_edge(tail_id, get_node_id(original_text, entity_to_id), relation='mention in')
g.add_edge(head_id, tail_id, relation=triple[1])
for node_id in [head_id, tail_id]:
if "file_id" not in g.nodes[node_id]:
g.nodes[node_id]["file_id"] = str(file_id)
else:
g.nodes[node_id]["file_id"] += "," + str(file_id)
edge = g.edges[head_id, tail_id]
if "file_id" not in edge:
edge["file_id"] = str(file_id)
else:
edge["file_id"] += "," + str(file_id)
# Process event triples
event_triples = []
for event_triple in event_relation_dict:
if not all(key in event_triple for key in ["Head", "Relation", "Tail"]):
continue
head_event = clean_text(event_triple["Head"])
relation = clean_text(event_triple["Relation"])
tail_event = clean_text(event_triple["Tail"])
if head_event.isspace() or len(head_event) == 0 or tail_event.isspace() or len(tail_event) == 0:
continue
event_triples.append((head_event, relation, tail_event))
# Add event triples to the graph
for triple in event_triples:
head_id = get_node_id(triple[0], entity_to_id)
tail_id = get_node_id(triple[2], entity_to_id)
g.add_node(head_id, type="event", id=triple[0])
g.add_node(tail_id, type="event", id=triple[2])
g.add_edge(head_id, get_node_id(original_text, entity_to_id), relation='mention in')
g.add_edge(tail_id, get_node_id(original_text, entity_to_id), relation='mention in')
g.add_edge(head_id, tail_id, relation=triple[1])
for node_id in [head_id, tail_id]:
if "file_id" not in g.nodes[node_id]:
g.nodes[node_id]["file_id"] = str(file_id)
else:
g.nodes[node_id]["file_id"] += "," + str(file_id)
edge = g.edges[head_id, tail_id]
if "file_id" not in edge:
edge["file_id"] = str(file_id)
else:
edge["file_id"] += "," + str(file_id)
# Process event-entity triples
event_entity_triples = []
for event_entity_participations in event_entity_relation_dict:
if not all(key in event_entity_participations for key in ["Event", "Entity"]):
continue
event = clean_text(event_entity_participations["Event"])
if event.isspace() or len(event) == 0:
continue
for entity in event_entity_participations["Entity"]:
if not isinstance(entity, str) or entity.isspace() or len(entity) == 0:
continue
entity = clean_text(entity)
event_entity_triples.append((event, "is participated by", entity))
# Add event-entity triples to the graph
for triple in event_entity_triples:
head_id = get_node_id(triple[0], entity_to_id)
tail_id = get_node_id(triple[2], entity_to_id)
g.add_node(head_id, type="event", id=triple[0])
g.add_node(tail_id, type="entity", id=triple[2])
g.add_edge(head_id, tail_id, relation=triple[1])
                    for node_id in [head_id, tail_id]:
                        if "file_id" not in g.nodes[node_id]:
                            g.nodes[node_id]["file_id"] = str(file_id)
                        else:
                            g.nodes[node_id]["file_id"] += "," + str(file_id)
edge = g.edges[head_id, tail_id]
if "file_id" not in edge:
edge["file_id"] = str(file_id)
else:
edge["file_id"] += "," + str(file_id)
print(f"Number of nodes: {g.number_of_nodes()}")
print(f"Number of edges: {g.number_of_edges()}")
print(f"Graph density: {nx.density(g)}")
with open(output_path, 'wb') as f:
nx.write_graphml(g, f, infer_numeric_types=True)
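if __name__ == "__main__":
    # Example invocation with assumed directory names; adjust to wherever your passage JSON and
    # triple JSONL extractions actually live.
    process_kg_data(
        input_passage_dir="./passages",
        input_triple_dir="./triples",
        output_dir="./kg_graphml",
        keyword="en_simple_wiki_v0",
    )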

View File

@@ -0,0 +1,63 @@
import argparse
import json
import os
import sys
from pathlib import Path
# Set up argument parser
parser = argparse.ArgumentParser(description="Convert all Markdown files in a folder to separate JSON files.")
parser.add_argument(
"--input", required=True, help="Path to the folder containing Markdown files"
)
parser.add_argument(
"--output", default=None, help="Output folder for JSON files (defaults to input folder if not specified)"
)
# Parse arguments
args = parser.parse_args()
# Resolve input folder path
input_folder = Path(args.input)
if not input_folder.is_dir():
print(f"Error: '{args.input}' is not a directory.", file=sys.stderr)
sys.exit(1)
# Set output folder (use input folder if not specified)
output_folder = Path(args.output) if args.output else input_folder
output_folder.mkdir(parents=True, exist_ok=True)
# Find all .md files in the input folder
markdown_files = [f for f in input_folder.iterdir() if f.suffix.lower() == ".md"]
if not markdown_files:
print(f"Error: No Markdown files found in '{args.input}'.", file=sys.stderr)
sys.exit(1)
# Process each Markdown file
for file in markdown_files:
try:
# Read the content of the file
with open(file, "r", encoding="utf-8") as f:
content = f.read()
# Create the JSON object
obj = {
"id": "1",
"text": content,
"metadata": {
"lang": "en"
}
}
# Create output JSON filename (e.g., file1.md -> file1.json)
output_file = output_folder / f"{file.stem}.json"
# Write JSON to file
with open(output_file, "w", encoding="utf-8") as f:
json.dump([obj], f, indent=4)
print(f"Successfully converted '{file}' to '{output_file}'")
except FileNotFoundError:
print(f"Error: File '{file}' not found.", file=sys.stderr)
except Exception as e:
print(f"Error processing file '{file}': {e}", file=sys.stderr)