first commit

闫旭隆 committed 2025-09-24 09:29:12 +08:00
parent 6339cdebb9
commit 2308536f66
360 changed files with 136381 additions and 0 deletions


@@ -0,0 +1,160 @@
import csv
from tqdm import tqdm

def check_created_csv_header(keyword, csv_dir):
    keyword_to_paths = {
        'cc_en': {
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_cc_en_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_cc_en_from_json_without_emb_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_cc_en_from_json_with_numeric_id.csv",
            # NB: as written, this entry points at the pes2o_abstract concept file
            'concept_with_numeric_id': f"{csv_dir}/concept_nodes_pes2o_abstract_from_json_without_emb_with_numeric_id.csv",
        },
        'pes2o_abstract': {
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_pes2o_abstract_from_json_without_emb_full_concept_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_pes2o_abstract_from_json_with_numeric_id.csv",
        },
        'en_simple_wiki_v0': {
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json_without_emb_full_concept_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json_with_numeric_id.csv",
        },
    }
    for key, path in keyword_to_paths[keyword].items():
        with open(path) as infile:
            reader = csv.reader(infile)
            header = next(reader)
            print(f"Header of {key}: {header}")
            # Print the first data row as a sample
            for i, row in enumerate(reader):
                if i < 1:
                    print(row)
                else:
                    break

def _insert_numeric_id_column(in_csv, out_csv, anchor_col):
    """Copy in_csv to out_csv, inserting a sequential 'numeric_id' column before anchor_col."""
    with open(in_csv) as infile, open(out_csv, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        header = next(reader)
        print(header)
        anchor_index = header.index(anchor_col)
        header.insert(anchor_index, 'numeric_id')  # Add the new column name
        writer.writerow(header)
        for row_number, row in tqdm(enumerate(reader), desc="Adding numeric ID"):
            row.insert(anchor_index, row_number)  # Add the numeric ID before the anchor column
            writer.writerow(row)

def add_csv_columns(node_csv, edge_csv, text_csv, node_with_numeric_id, edge_with_numeric_id, text_with_numeric_id):
    _insert_numeric_id_column(node_csv, node_with_numeric_id, ':LABEL')
    _insert_numeric_id_column(edge_csv, edge_with_numeric_id, ':TYPE')  # edges use :TYPE rather than :LABEL
    _insert_numeric_id_column(text_csv, text_with_numeric_id, ':LABEL')

if __name__ == "__main__":
    keyword = "en_simple_wiki_v0"
    csv_dir = "./import"  # Change this to your CSV directory
    keyword_to_paths = {
        'cc_en': {
            'node_csv': f"{csv_dir}/triple_nodes_cc_en_from_json_without_emb.csv",
            'edge_csv': f"{csv_dir}/triple_edges_cc_en_from_json_without_emb.csv",
            'text_csv': f"{csv_dir}/text_nodes_cc_en_from_json.csv",
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_cc_en_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_cc_en_from_json_without_emb_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_cc_en_from_json_with_numeric_id.csv",
        },
        'pes2o_abstract': {
            'node_csv': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json_without_emb.csv",
            'edge_csv': f"{csv_dir}/triple_edges_pes2o_abstract_from_json_without_emb_full_concept.csv",
            'text_csv': f"{csv_dir}/text_nodes_pes2o_abstract_from_json.csv",
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_pes2o_abstract_from_json_without_emb_full_concept_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_pes2o_abstract_from_json_with_numeric_id.csv",
        },
        'en_simple_wiki_v0': {
            'node_csv': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json_without_emb.csv",
            'edge_csv': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json_without_emb_full_concept.csv",
            'text_csv': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json.csv",
            'node_with_numeric_id': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json_without_emb_with_numeric_id.csv",
            'edge_with_numeric_id': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json_without_emb_full_concept_with_numeric_id.csv",
            'text_with_numeric_id': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json_with_numeric_id.csv",
        },
    }
    paths = keyword_to_paths[keyword]
    add_csv_columns(paths['node_csv'], paths['edge_csv'], paths['text_csv'],
                    paths['node_with_numeric_id'], paths['edge_with_numeric_id'], paths['text_with_numeric_id'])
    # check_created_csv_header(keyword, csv_dir)


@@ -0,0 +1,189 @@
import networkx as nx
import csv
import ast
import hashlib
import os
from atlas_rag.kg_construction.triple_config import ProcessingConfig
import pickle

def get_node_id(entity_name, entity_to_id=None):
    """Return the existing hash-based ID for an entity, or create and cache a new one."""
    if entity_to_id is None:
        entity_to_id = {}
    if entity_name not in entity_to_id:
        # Use SHA-256 to generate a deterministic, collision-resistant ID
        hash_object = hashlib.sha256(entity_name.encode('utf-8'))
        hash_hex = hash_object.hexdigest()  # Hexadecimal representation of the hash
        entity_to_id[entity_name] = hash_hex  # The full hex digest is used as the ID
    return entity_to_id[entity_name]

def csvs_to_temp_graphml(triple_node_file, triple_edge_file, config: ProcessingConfig = None):
    g = nx.DiGraph()
    entity_to_id = {}
    # Add triple nodes
    with open(triple_node_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            node_id = row["name:ID"]
            mapped_id = get_node_id(node_id, entity_to_id)
            if mapped_id not in g.nodes:
                g.add_node(mapped_id, id=node_id, type=row["type"])
    # Add triple edges
    with open(triple_edge_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            start_id = get_node_id(row[":START_ID"], entity_to_id)
            end_id = get_node_id(row[":END_ID"], entity_to_id)
            # Check if edge already exists to prevent duplicates
            if not g.has_edge(start_id, end_id):
                g.add_edge(start_id, end_id, relation=row["relation"], type=row[":TYPE"])
    # Save the intermediate graph as a pickle file
    output_name = f"{config.output_directory}/kg_graphml/{config.filename_pattern}_without_concept.pkl"
    # Create the output directory if it does not exist
    output_dir = os.path.dirname(output_name)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_name, 'wb') as output_file:
        pickle.dump(g, output_file)
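
# Illustrative sketch (added, not part of the original module): the pickle
# written by csvs_to_temp_graphml can be reloaded like this; the path is
# reconstructed from the same config fields the function uses above.
def load_temp_graph(config: ProcessingConfig) -> nx.DiGraph:
    path = f"{config.output_directory}/kg_graphml/{config.filename_pattern}_without_concept.pkl"
    with open(path, 'rb') as f:
        return pickle.load(f)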

def csvs_to_graphml(triple_node_file, text_node_file, concept_node_file,
                    triple_edge_file, text_edge_file, concept_edge_file,
                    output_file):
    '''
    Convert multiple CSV files into a single GraphML file.

    Types of nodes added to the graph:
    - Triple nodes: nodes representing triples, with properties like subject, predicate, object.
    - Text nodes: nodes representing text, with properties like text content.
    - Concept nodes: nodes representing concepts, with properties like concept name and type.

    Types of edges added to the graph:
    - Triple edges: edges representing relationships between triples, with properties like relation type.
    - Text edges: edges representing relationships between text and nodes, with properties like text type.
    - Concept edges: edges representing relationships between concepts and nodes, with properties like concept type.

    DiGraph networkx attributes:
    Node:
        - type: type of the node (e.g., entity, event, text, concept).
        - file_id: list of text IDs the node is associated with.
        - id: node name.
    Edge:
        - relation: relation name.
        - file_id: list of text IDs the edge is associated with.
        - type: type of the edge (e.g., Source, Relation, Concept).
        - synsets: list of synsets associated with the edge.
    '''
    g = nx.DiGraph()
    entity_to_id = {}
    # Add triple nodes
    with open(triple_node_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            node_id = row["name:ID"]
            mapped_id = get_node_id(node_id, entity_to_id)
            # Check if node already exists to prevent duplicates
            if mapped_id not in g.nodes:
                g.add_node(mapped_id, id=node_id, type=row["type"])
    # Add text nodes
    with open(text_node_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            node_id = row["text_id:ID"]
            # Check if node already exists to prevent duplicates
            if node_id not in g.nodes:
                g.add_node(node_id, file_id=node_id, id=row["original_text"], type="passage")
    # Add concept nodes
    with open(concept_node_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            node_id = row["concept_id:ID"]
            # Check if node already exists to prevent duplicates
            if node_id not in g.nodes:
                g.add_node(node_id, file_id="concept_file", id=row["name"], type="concept")
    # file_id for triple and concept nodes is filled in while adding the edges below
    # Add triple edges
    with open(triple_edge_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            start_id = get_node_id(row[":START_ID"], entity_to_id)
            end_id = get_node_id(row[":END_ID"], entity_to_id)
            # Check if edge already exists to prevent duplicates
            if not g.has_edge(start_id, end_id):
                g.add_edge(start_id, end_id, relation=row["relation"], type=row[":TYPE"])
            # Add file_id to start and end nodes if they are triple or concept nodes
            for node_id in [start_id, end_id]:
                if g.nodes[node_id]['type'] in ['triple', 'concept'] and 'file_id' not in g.nodes[node_id]:
                    g.nodes[node_id]['file_id'] = row.get("file_id", "triple_file")
            # Add concepts to the edge, avoiding duplicates
            concepts = ast.literal_eval(row["concepts"])
            for concept in concepts:
                if "concepts" not in g.edges[start_id, end_id]:
                    g.edges[start_id, end_id]['concepts'] = str(concept)
                else:
                    current_concepts = g.edges[start_id, end_id]['concepts'].split(",")
                    if str(concept) not in current_concepts:
                        g.edges[start_id, end_id]['concepts'] += "," + str(concept)
    # Add text edges
    with open(text_edge_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            start_id = get_node_id(row[":START_ID"], entity_to_id)
            end_id = row[":END_ID"]
            # Check if edge already exists to prevent duplicates
            if not g.has_edge(start_id, end_id):
                g.add_edge(start_id, end_id, relation="mention in", type=row[":TYPE"])
            # Record the passage ID in the start node's file_id list
            if 'file_id' in g.nodes[start_id]:
                g.nodes[start_id]['file_id'] += "," + str(end_id)
            else:
                g.nodes[start_id]['file_id'] = str(end_id)
    # Add concept edges between triple nodes and concept nodes
    with open(concept_edge_file, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            start_id = get_node_id(row[":START_ID"], entity_to_id)
            end_id = row[":END_ID"]  # the end ID is a concept node ID
            if not g.has_edge(start_id, end_id):
                g.add_edge(start_id, end_id, relation=row["relation"], type=row[":TYPE"])
    # Write to GraphML, creating the output directory if needed
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    nx.write_graphml(g, output_file, infer_numeric_types=True)

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Convert CSV files to GraphML format.')
    parser.add_argument('--triple_node_file', type=str, required=True, help='Path to the triple node CSV file.')
    parser.add_argument('--text_node_file', type=str, required=True, help='Path to the text node CSV file.')
    parser.add_argument('--concept_node_file', type=str, required=True, help='Path to the concept node CSV file.')
    parser.add_argument('--triple_edge_file', type=str, required=True, help='Path to the triple edge CSV file.')
    parser.add_argument('--text_edge_file', type=str, required=True, help='Path to the text edge CSV file.')
    parser.add_argument('--concept_edge_file', type=str, required=True, help='Path to the concept edge CSV file.')
    parser.add_argument('--output_file', type=str, required=True, help='Path to the output GraphML file.')
    args = parser.parse_args()
    csvs_to_graphml(args.triple_node_file, args.text_node_file, args.concept_node_file,
                    args.triple_edge_file, args.text_edge_file, args.concept_edge_file,
                    args.output_file)
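
# Example invocation (added; the script name and paths are illustrative,
# assuming this file is saved as csvs_to_graphml.py):
#   python csvs_to_graphml.py \
#       --triple_node_file import/triple_nodes.csv \
#       --text_node_file import/text_nodes.csv \
#       --concept_node_file import/concept_nodes.csv \
#       --triple_edge_file import/triple_edges.csv \
#       --text_edge_file import/text_edges.csv \
#       --concept_edge_file import/concept_edges.csv \
#       --output_file kg_graphml/kg.graphml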


@@ -0,0 +1,70 @@
import pandas as pd
import numpy as np
from ast import literal_eval # Safer string-to-list conversion
import os
CHUNKSIZE = 100_000 # Adjust based on your RAM (100K rows per chunk)
EMBEDDING_COL = "embedding:STRING" # Column name with embeddings
# DIMENSION = 32 # Update with your embedding dimension
ENTITY_ONLY = True

def parse_embedding(embed_str):
    """Convert an embedding string such as "[0.1, 0.2, ...]" to a numpy array."""
    # literal_eval parses the bracketed string into a Python list
    return np.array(literal_eval(embed_str), dtype=np.float32)

# Stream embeddings from the CSV into a binary file, one np.save'd chunk at a time
def convert_csv_to_npy(csv_path, npy_path):
    total_embeddings = 0
    # Create the output directory if it does not exist
    os.makedirs(os.path.dirname(npy_path), exist_ok=True)
    with open(npy_path, "wb") as f:
        pass  # Initialize an empty file
    # Process the CSV in chunks
    for chunk_idx, df_chunk in enumerate(
        pd.read_csv(csv_path, chunksize=CHUNKSIZE, usecols=[EMBEDDING_COL])
    ):
        # Parse embeddings
        embeddings = np.stack(
            df_chunk[EMBEDDING_COL].apply(parse_embedding).values
        )
        # Verify dimensions
        # assert embeddings.shape[1] == DIMENSION, \
        #     f"Dimension mismatch at chunk {chunk_idx}"
        total_embeddings += embeddings.shape[0]
        # Append this chunk to the .npy file
        with open(npy_path, "ab") as f:
            np.save(f, embeddings.astype(np.float32))
        print(f"Processed chunk {chunk_idx} ({total_embeddings} rows so far)")
    print(f"Total number of embeddings: {total_embeddings}")
    print("Conversion complete!")

if __name__ == "__main__":
    keyword = 'cc_en'  # Change this to your desired keyword
    csv_dir = "./import"  # Change this to your CSV directory
    keyword_to_paths = {
        'cc_en': {
            'node_csv': f"{csv_dir}/triple_nodes_cc_en_from_json_2.csv",
            # 'edge_csv': f"{csv_dir}/triple_edges_cc_en_from_json_2.csv",
            'text_csv': f"{csv_dir}/text_nodes_cc_en_from_json_with_emb.csv",
        },
        'pes2o_abstract': {
            'node_csv': f"{csv_dir}/triple_nodes_pes2o_abstract_from_json.csv",
            # 'edge_csv': f"{csv_dir}/triple_edges_pes2o_abstract_from_json.csv",
            'text_csv': f"{csv_dir}/text_nodes_pes2o_abstract_from_json_with_emb.csv",
        },
        'en_simple_wiki_v0': {
            'node_csv': f"{csv_dir}/triple_nodes_en_simple_wiki_v0_from_json.csv",
            # 'edge_csv': f"{csv_dir}/triple_edges_en_simple_wiki_v0_from_json.csv",
            'text_csv': f"{csv_dir}/text_nodes_en_simple_wiki_v0_from_json_with_emb.csv",
        },
    }
    for key, path in keyword_to_paths[keyword].items():
        npy_path = path.replace(".csv", ".npy")
        convert_csv_to_npy(path, npy_path)
        print(f"Converted {path} to {npy_path}")


@@ -0,0 +1,27 @@
import os
import glob

def merge_csv_files(output_file, input_dir):
    """
    Merge all CSV files in the input directory into a single output file.

    Args:
        output_file (str): Path to the output CSV file.
        input_dir (str): Directory containing the input CSV files.
    """
    # Delete the output file if it already exists
    if os.path.exists(output_file):
        os.remove(output_file)
    # Write the header to the output file
    with open(output_file, 'w') as outfile:
        outfile.write("node,conceptualized_node,node_type\n")
    # Append the contents of all CSV files in the input directory
    for csv_file in glob.glob(os.path.join(input_dir, '*.csv')):
        # Skip the output file itself in case it lives inside input_dir
        if os.path.abspath(csv_file) == os.path.abspath(output_file):
            continue
        with open(csv_file, 'r') as infile:
            # Skip the header line
            next(infile)
            # Append the remaining lines to the output file
            with open(output_file, 'a') as outfile:
                outfile.writelines(infile)
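
# Example usage (added; paths are illustrative):
#   merge_csv_files("./merged/concept_csvs_merged.csv", "./concept_csvs")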