Files
AIEC-new/AIEC-RAG/atlas_rag/kg_construction/concept_to_csv.py
2025-10-17 09:31:28 +08:00

153 lines
5.8 KiB
Python

import ast
import uuid
import csv
from tqdm import tqdm
import hashlib
import os
def generate_uuid():
    """Return a fresh random UUID4 as its canonical string form."""
    return f"{uuid.uuid4()}"
def parse_concepts(s):
    """Parse a stringified Python list of concept names.

    Args:
        s: Text such as ``"['a', 'b']"`` (typically a CSV cell), or
           None/empty/'[]' for "no concepts".

    Returns:
        A list of stripped, non-empty concept strings. Returns [] for
        empty input, unparseable input, or input that is not a
        list/tuple of strings.
    """
    if not s or s == '[]':
        return []
    try:
        parsed = ast.literal_eval(s)
    # Narrow catch: literal_eval raises ValueError/SyntaxError on bad
    # input. The previous bare `except:` also hid the bug below.
    except (ValueError, SyntaxError):
        return []
    # Guard against scalar literals: literal_eval("'abc'") yields the
    # string 'abc', and iterating it would produce ['a', 'b', 'c'].
    if not isinstance(parsed, (list, tuple)):
        return []
    # Keep only genuine, non-blank string entries.
    return [c.strip() for c in parsed if isinstance(c, str) and c.strip()]
def compute_hash_id(text):
    """Derive a stable node ID for a concept.

    Appends the '_concept' suffix (so concept IDs never collide with
    IDs hashed from the bare text) and returns the SHA-256 hex digest.
    """
    suffixed = f"{text}_concept"
    return hashlib.sha256(suffixed.encode('utf-8')).hexdigest()
def all_concept_triples_csv_to_csv(node_file, edge_file, concepts_file, output_node_file, output_edge_file, output_full_concept_triple_edges):
    """Materialize concept nodes/edges from a conceptualization map.

    Reads the (usually small) concepts CSV fully into memory, then
    streams over the triple nodes to emit node->concept edges, emits
    one node row per distinct concept, and finally streams over the
    triple edges to emit edges whose 'concepts' column is augmented
    with the relation's mapped concepts.

    Args:
        node_file: Input CSV of triple nodes
            (header: name:ID,type,concepts,synsets,:LABEL).
        edge_file: Input CSV of triple edges
            (header: :START_ID,:END_ID,relation,concepts,synsets,:TYPE).
        concepts_file: CSV with columns node, node_type
            ('relation' or node), conceptualized_node (comma-separated).
        output_node_file: Output CSV of concept nodes.
        output_edge_file: Output CSV of node->concept edges.
        output_full_concept_triple_edges: Output CSV of augmented
            triple edges.

    Returns:
        None. Side effects: creates output directories as needed and
        writes the three output CSVs.
    """
    # Ensure each output directory exists. os.path.dirname() returns ''
    # for a bare filename and os.makedirs('') raises FileNotFoundError,
    # so guard on truthiness; exist_ok avoids the check-then-create race.
    for out_path in (output_node_file, output_edge_file, output_full_concept_triple_edges):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    node_to_concepts = {}
    relation_to_concepts = {}
    all_concepts = set()

    # Load the conceptualization map into memory, deduplicating the
    # concept lists as rows accumulate per key.
    with open(concepts_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        print("Loading concepts...")
        for row in tqdm(reader):
            concepts = [c.strip() for c in row['conceptualized_node'].split(',') if c.strip()]
            # Relations and nodes go to separate maps; same merge logic.
            target = relation_to_concepts if row['node_type'] == 'relation' else node_to_concepts
            key = row['node']
            if key not in target:
                target[key] = concepts
            else:
                # NOTE: set() dedup does not preserve order (matches the
                # original implementation's behavior).
                target[key] = list(set(target[key] + concepts))
    print("Loading concepts done.")
    print(f"Relation to concepts: {len(relation_to_concepts)}")
    print(f"Node to concepts: {len(node_to_concepts)}")

    # Pass 1: stream triple nodes, emitting node->concept edges from
    # both the loaded map and the node row's own 'concepts' column,
    # while collecting the set of all concepts seen.
    print("Processing triple nodes...")
    with open(node_file, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # skip header: name:ID,type,concepts,synsets,:LABEL
        with open(output_edge_file, 'w', newline='', encoding='utf-8') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
            writer.writerow([':START_ID', ':END_ID', 'relation', ':TYPE'])
            for row in tqdm(reader):
                node_name = row[0]
                for concept in node_to_concepts.get(node_name, []):
                    writer.writerow([node_name, compute_hash_id(concept), 'has_concept', 'Concept'])
                    all_concepts.add(concept)
                for concept in parse_concepts(row[2]):
                    writer.writerow([node_name, compute_hash_id(concept), 'has_concept', 'Concept'])
                    all_concepts.add(concept)

    # Pass 2: one node row per distinct concept.
    print("Processing concept nodes...")
    with open(output_node_file, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
        writer.writerow(['concept_id:ID', 'name', ':LABEL'])
        for concept in tqdm(all_concepts):
            writer.writerow([compute_hash_id(concept), concept, 'Concept'])

    # Pass 3: stream triple edges, merging each edge's own concepts
    # with the concepts mapped to its relation.
    print("Processing triple edges...")
    with open(edge_file, 'r', encoding='utf-8') as f:
        with open(output_full_concept_triple_edges, 'w', newline='', encoding='utf-8') as f_out:
            reader = csv.reader(f)
            writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
            next(reader)  # skip input header
            writer.writerow([':START_ID', ':END_ID', 'relation', 'concepts', 'synsets', ':TYPE'])
            for row in tqdm(reader):
                src_id, end_id, relation, concepts, synsets = row[0], row[1], row[2], row[3], row[4]
                original_concepts = parse_concepts(concepts)
                for concept in relation_to_concepts.get(relation, []):
                    if concept not in original_concepts:
                        original_concepts.append(concept)
                original_concepts = list(set(original_concepts))
                # The concepts column is written as the Python repr of a
                # list (matches the original output format).
                writer.writerow([src_id, end_id, relation, original_concepts, synsets, 'Relation'])
    return