Files
AIEC-RAG---/AIEC-RAG/elasticsearch_vectorization/config.py
2025-09-25 10:33:37 +08:00

209 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Elasticsearch配置文件
包含云端ES连接配置和相关常量
"""
import os
from typing import Optional
class ElasticsearchConfig:
    """Connection settings and shared constants for Elasticsearch.

    The three connection values are resolved once, when the class body runs:
    each is read from its environment variable and falls back to the literal
    default written here when that variable is unset.
    """

    # --- Connection (environment variables take precedence) ---
    # NOTE(review): the fallbacks embed a concrete host address and password
    # in the source; consider supplying them through the environment only.
    ES_HOST = os.getenv("ELASTICSEARCH_HOST", "http://101.200.154.78:9200")
    ES_USERNAME = os.getenv("ELASTICSEARCH_USERNAME", "elastic")
    ES_PASSWORD = os.getenv("ELASTICSEARCH_PASSWORD", "Abcd123456")

    # Alternative when the port is not open: go through an SSH tunnel that is
    # mapped to a local port.
    # ES_HOST = "http://localhost:9200"

    # --- Index naming ---
    INDEX_PREFIX = "hipporag"
    EDGES_INDEX_SUFFIX = "edges"
    PASSAGES_INDEX_SUFFIX = "passages"

    # --- Vectors ---
    VECTOR_DIMENSION = 1024  # dimension of text-embedding-v3
    SIMILARITY_METRIC = "cosine"

    # --- Batch sizes ---
    DEFAULT_BATCH_SIZE = 100
    DEFAULT_TEXT_BATCH_SIZE = 40
    DEFAULT_EDGE_BATCH_SIZE = 256

    # --- Search ---
    DEFAULT_TOP_K = 10
    DEFAULT_NUM_CANDIDATES_MULTIPLIER = 2

    @classmethod
    def get_edges_index_name(cls, keyword: str) -> str:
        """Return ``<INDEX_PREFIX>_<keyword>_<EDGES_INDEX_SUFFIX>``."""
        prefix, suffix = cls.INDEX_PREFIX, cls.EDGES_INDEX_SUFFIX
        return f"{prefix}_{keyword}_{suffix}"

    @classmethod
    def get_passages_index_name(cls, keyword: str) -> str:
        """Return ``<INDEX_PREFIX>_<keyword>_<PASSAGES_INDEX_SUFFIX>``."""
        prefix, suffix = cls.INDEX_PREFIX, cls.PASSAGES_INDEX_SUFFIX
        return f"{prefix}_{keyword}_{suffix}"

    @classmethod
    def get_es_config(cls) -> dict:
        """Build the dictionary of Elasticsearch connection options.

        Returns:
            A new dict on every call holding the host list, the credentials,
            and the TLS, timeout, retry, sniffing and pooling options.

        NOTE(review): the credentials appear under both ``basic_auth`` and
        ``http_auth`` and the timeout under both ``request_timeout`` and
        ``timeout``; presumably this covers more than one client version.
        Confirm that the consumer of this dict accepts every key.
        """
        credentials = (cls.ES_USERNAME, cls.ES_PASSWORD)
        five_minutes = 300

        options = {"hosts": [cls.ES_HOST], "basic_auth": credentials}

        # Certificate verification is off, for self-signed certificates.
        options["verify_certs"] = False
        options["ssl_show_warn"] = False

        # Long request timeout (5 minutes) with up to 5 retries.
        options["request_timeout"] = five_minutes
        options["retry_on_timeout"] = True
        options["max_retries"] = 5

        # Node sniffing is disabled, at start-up and on connection failure,
        # to avoid network problems.
        options["sniff_on_start"] = False
        options["sniff_on_connection_fail"] = False
        options["sniff_timeout"] = 10

        options["http_compress"] = True  # enable HTTP compression
        options["http_auth"] = credentials
        options["timeout"] = five_minutes  # connection timeout

        # None selects the default connection class and selector.
        options["connection_class"] = None
        options["selector_class"] = None

        options["dead_timeout"] = 60  # dead-connection timeout
        # HTTP status codes that trigger a retry.
        options["retry_on_status"] = {502, 503, 504, 408, 429}
        options["maxsize"] = 25  # connection pool size
        return options
class SSHTunnelConfig:
    """Settings for reaching Elasticsearch through an SSH tunnel.

    The SSH user name and key path ship as placeholders; the real values
    have to be obtained from the administrator and filled in before use.
    """

    # Template value of SSH_KEY_PATH. While the key path still equals it, no
    # private key has been configured.
    _PLACEHOLDER_KEY_PATH = "path/to/your/private_key.pem"

    # --- SSH connection ---
    SSH_HOST = "101.200.154.78"  # SSH server address
    SSH_PORT = 22  # SSH port
    SSH_USERNAME = "your_username"  # placeholder: replace with the real user
    SSH_KEY_PATH = _PLACEHOLDER_KEY_PATH  # placeholder: replace with the real key path
    # SSH_PASSWORD = "your_password"  # define this for password authentication

    # --- Tunnel mapping ---
    LOCAL_PORT = 9200  # local port
    REMOTE_HOST = "localhost"  # Elasticsearch address inside the server
    REMOTE_PORT = 9200  # Elasticsearch port inside the server

    @classmethod
    def get_ssh_config(cls) -> dict:
        """Build the SSH tunnel configuration dictionary.

        The six connection and mapping keys are always present. At most one
        authentication key is added:

        * ``ssh_key_path`` when ``SSH_KEY_PATH`` is set to a non-empty value
          other than the shipped placeholder;
        * otherwise ``ssh_password`` when the class defines ``SSH_PASSWORD``.

        Returns:
            A new dict on every call.
        """
        config = {
            'ssh_host': cls.SSH_HOST,
            'ssh_port': cls.SSH_PORT,
            'ssh_username': cls.SSH_USERNAME,
            'local_port': cls.LOCAL_PORT,
            'remote_host': cls.REMOTE_HOST,
            'remote_port': cls.REMOTE_PORT,
        }

        # A key path of None or "" means "no key configured", exactly like the
        # placeholder. Without the truthiness test such a value would be
        # emitted as the key path and the password branch would be skipped.
        key_path = getattr(cls, 'SSH_KEY_PATH', None)
        if key_path and key_path != cls._PLACEHOLDER_KEY_PATH:
            config['ssh_key_path'] = key_path
        elif hasattr(cls, 'SSH_PASSWORD'):
            config['ssh_password'] = cls.SSH_PASSWORD
        return config
class HippoRAGConfig:
    """Parameters for HippoRAG retrieval."""

    # --- Inference ---
    TOPK_EDGES = 10
    TOPK_NODES = 30
    WEIGHT_ADJUST = 0.05
    PPR_ALPHA = 0.85
    PPR_MAX_ITER = 100
    PPR_TOL = 1e-6

    # --- Retrieval mode; listed options: query2edge, query2node, ner2node ---
    RETRIEVAL_MODE = "query2edge"

    @classmethod
    def get_oneapi_config(cls) -> dict:
        """Read the OneAPI settings from the environment.

        Returns:
            A dict with the keys ``api_key``, ``base_url``, ``model_embed``
            and ``model_gen``. The first two are ``None`` when their
            variables are unset; the model names fall back to built-in
            defaults.
        """
        # (result key, environment variable, fallback when unset)
        sources = (
            ("api_key", 'ONEAPI_KEY', None),
            ("base_url", 'ONEAPI_BASE_URL', None),
            ("model_embed", 'ONEAPI_MODEL_EMBED', 'text-embedding-v3'),
            ("model_gen", 'ONEAPI_MODEL_GEN', 'qwen2-7b-instruct'),
        )
        return {key: os.getenv(variable, fallback) for key, variable, fallback in sources}
# Index mapping templates (compatibility version)

# Mapping and settings template for the edges index. Each row of the table
# below is (field name, Elasticsearch field type); the single dense_vector row
# expands to the full 1024-dimension, indexed, cosine-similarity definition.
EDGES_INDEX_MAPPING = {
    "mappings": {
        "properties": {
            field_name: (
                {
                    "type": "dense_vector",
                    "dims": 1024,
                    "index": True,
                    "similarity": "cosine",
                }
                if field_type == "dense_vector"
                else {"type": field_type}
            )
            for field_name, field_type in (
                ("edge_index", "integer"),
                ("head_node_id", "keyword"),
                ("tail_node_id", "keyword"),
                ("head_entity", "keyword"),
                ("relation", "keyword"),
                ("tail_entity", "keyword"),
                ("triple_text", "text"),
                ("embedding", "dense_vector"),
                ("created_at", "date"),
                ("keyword", "keyword"),
            )
        }
    },
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
}
# Mapping and settings template for the passages index. Each row of the table
# below is (field name, Elasticsearch field type); the single dense_vector row
# expands to the full 1024-dimension, indexed, cosine-similarity definition.
PASSAGES_INDEX_MAPPING = {
    "mappings": {
        "properties": {
            field_name: (
                {
                    "type": "dense_vector",
                    "dims": 1024,
                    "index": True,
                    "similarity": "cosine",
                }
                if field_type == "dense_vector"
                else {"type": field_type}
            )
            for field_name, field_type in (
                ("passage_id", "keyword"),
                ("content", "text"),
                ("file_id", "keyword"),
                ("evidence", "keyword"),
                ("embedding", "dense_vector"),
                ("created_at", "date"),
                ("keyword", "keyword"),
            )
        }
    },
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
}
# Added: Elasticsearch index mapping for Node documents. Each row of the table
# below is (field name, Elasticsearch field type); the single dense_vector row
# expands to the full 1024-dimension, indexed, cosine-similarity definition.
NODES_INDEX_MAPPING = {
    "mappings": {
        "properties": {
            field_name: (
                {
                    "type": "dense_vector",
                    "dims": 1024,
                    "index": True,
                    "similarity": "cosine",
                }
                if field_type == "dense_vector"
                else {"type": field_type}
            )
            for field_name, field_type in (
                ("node_id", "keyword"),
                ("name", "text"),
                ("type", "keyword"),
                ("concepts", "text"),
                ("synsets", "text"),
                ("embedding", "dense_vector"),
                ("created_at", "date"),
                ("keyword", "keyword"),
            )
        }
    },
    "settings": {
        "number_of_shards": 1,
        # Set back to 0 to avoid a yellow status.
        "number_of_replicas": 0,
    },
}