import ast
import csv
import hashlib
import os
import uuid

from tqdm import tqdm


def generate_uuid():
    """Generate a random UUID."""
    return str(uuid.uuid4())


def parse_concepts(s):
    """Parse a concepts field (a list literal) and filter out empty values."""
    try:
        parsed = ast.literal_eval(s) if s and s != '[]' else []
        return [c.strip() for c in parsed if c.strip()]
    except (ValueError, SyntaxError):
        return []


def compute_hash_id(text):
    """Compute a SHA-256 hash ID for a concept name."""
    text = text + '_concept'
    hash_object = hashlib.sha256(text.encode('utf-8'))
    return hash_object.hexdigest()  # Return the hash as a hex string


def all_concept_triples_csv_to_csv(node_file, edge_file, concepts_file,
                                   output_node_file, output_edge_file,
                                   output_full_concept_triple_edges):
    # To produce the concept nodes, concept edges, and the new full triple edges,
    # we first read the concept maps into memory (they are usually not too large),
    # then iterate over the triple nodes to create concept edges,
    # and finally iterate over the triple edges to create the full triple edges.

    # Make sure all output directories exist
    for output_path in (output_node_file, output_edge_file,
                        output_full_concept_triple_edges):
        output_dir = os.path.dirname(output_path)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

    node_to_concepts = {}
    relation_to_concepts = {}
    all_concepts = set()

    # Load the concept mapping (node/relation -> conceptualized nodes)
    print("Loading concepts...")
    with open(concepts_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in tqdm(reader):
            concepts = [c.strip() for c in row['conceptualized_node'].split(',') if c.strip()]
            if row['node_type'] == 'relation':
                relation = row['node']
                if relation not in relation_to_concepts:
                    relation_to_concepts[relation] = concepts
                else:
                    relation_to_concepts[relation].extend(concepts)
                    relation_to_concepts[relation] = list(set(relation_to_concepts[relation]))
            else:
                node = row['node']
                if node not in node_to_concepts:
                    node_to_concepts[node] = concepts
                else:
                    node_to_concepts[node].extend(concepts)
                    node_to_concepts[node] = list(set(node_to_concepts[node]))

    print("Loading concepts done.")
    print(f"Relation to concepts: {len(relation_to_concepts)}")
    print(f"Node to concepts: {len(node_to_concepts)}")

    # Read the triple nodes and write the concept edges file
    print("Processing triple nodes...")
    with open(node_file, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        header = next(reader)  # name:ID,type,concepts,synsets,:LABEL
        with open(output_edge_file, 'w', newline='', encoding='utf-8') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
            writer.writerow([':START_ID', ':END_ID', 'relation', ':TYPE'])
            for row in tqdm(reader):
                node_name = row[0]
                # Edges from the node to its mapped concepts
                if node_name in node_to_concepts:
                    for concept in node_to_concepts[node_name]:
                        concept_id = compute_hash_id(concept)
                        writer.writerow([row[0], concept_id, 'has_concept', 'Concept'])
                        all_concepts.add(concept)
                # Edges from the node to the concepts already listed in its row
                for concept in parse_concepts(row[2]):
                    concept_id = compute_hash_id(concept)
                    writer.writerow([row[0], concept_id, 'has_concept', 'Concept'])
                    all_concepts.add(concept)

    # Write the collected concepts to the concept nodes file
    print("Processing concept nodes...")
    with open(output_node_file, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
        writer.writerow(['concept_id:ID', 'name', ':LABEL'])
        for concept in tqdm(all_concepts):
            concept_id = compute_hash_id(concept)
            writer.writerow([concept_id, concept, 'Concept'])

    # Read the triple edges and write the full concept triple edges file
    print("Processing triple edges...")
    with open(edge_file, 'r', encoding='utf-8') as f, \
         open(output_full_concept_triple_edges, 'w', newline='', encoding='utf-8') as f_out:
        reader = csv.reader(f)
        writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
        header = next(reader)  # skip the input header
        writer.writerow([':START_ID', ':END_ID', 'relation', 'concepts', 'synsets', ':TYPE'])
        for row in tqdm(reader):
            src_id, end_id, relation, concepts, synsets = row[0], row[1], row[2], row[3], row[4]
            # Merge the edge's own concepts with those mapped from its relation
            original_concepts = parse_concepts(concepts)
            if relation in relation_to_concepts:
                for concept in relation_to_concepts[relation]:
                    if concept not in original_concepts:
                        original_concepts.append(concept)
                original_concepts = list(set(original_concepts))
            writer.writerow([src_id, end_id, relation, original_concepts, synsets, 'Relation'])
    return
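

# Example driver: a minimal sketch, not part of the original pipeline. The file
# paths below are hypothetical placeholders, and the expected CSV layouts are
# inferred from the code above (nodes: name:ID,type,concepts,synsets,:LABEL;
# edges: :START_ID,:END_ID,relation,concepts,synsets,:TYPE; concepts file:
# columns node, node_type, conceptualized_node).
if __name__ == '__main__':
    all_concept_triples_csv_to_csv(
        node_file='import/triple_nodes.csv',        # hypothetical input path
        edge_file='import/triple_edges.csv',        # hypothetical input path
        concepts_file='import/concepts.csv',        # hypothetical input path
        output_node_file='import/concept_nodes.csv',
        output_edge_file='import/concept_edges.csv',
        output_full_concept_triple_edges='import/full_concept_triple_edges.csv',
    )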