first commit
580
retriver/langsmith/langsmith_retriever.py
Normal file
@@ -0,0 +1,580 @@
"""
Iterative retriever with LangSmith monitoring integration.
Token monitoring is handled automatically by OneAPILLM.
"""

import os
import time
import json
from datetime import datetime
from typing import Dict, Any, Optional

# LangSmith tracing support
from langsmith import traceable

from retriver.langgraph.iterative_retriever import IterativeRetriever
from retriver.langgraph.graph_state import create_initial_state

class LangSmithIterativeRetriever(IterativeRetriever):
    """
    Iterative retriever with LangSmith monitoring integration.

    Token consumption is recorded automatically by OneAPILLM's _generate method.
    """

    def __init__(
        self,
        keyword: str,
        top_k: int = 2,
        max_iterations: int = 2,
        max_parallel_retrievals: int = 2,
        oneapi_key: Optional[str] = None,
        oneapi_base_url: Optional[str] = None,
        model_name: Optional[str] = None,
        embed_model_name: Optional[str] = None,
        complexity_model_name: Optional[str] = None,
        sufficiency_model_name: Optional[str] = None,
        langsmith_project: Optional[str] = None,
        skip_llm_generation: bool = False
    ):
        # Configure LangSmith
        self._setup_langsmith(langsmith_project)

        # Initialize the parent class
        super().__init__(
            keyword=keyword,
            top_k=top_k,
            max_iterations=max_iterations,
            max_parallel_retrievals=max_parallel_retrievals,
            oneapi_key=oneapi_key,
            oneapi_base_url=oneapi_base_url,
            model_name=model_name,
            embed_model_name=embed_model_name,
            complexity_model_name=complexity_model_name,
            sufficiency_model_name=sufficiency_model_name,
            skip_llm_generation=skip_llm_generation
        )

        print("[SEARCH] LangSmith monitoring enabled")

    def _setup_langsmith(self, project_name: Optional[str] = None):
        """Configure the LangSmith environment."""
        # Set environment variables
        if not os.getenv("LANGCHAIN_TRACING_V2"):
            os.environ["LANGCHAIN_TRACING_V2"] = "true"

        if not os.getenv("LANGCHAIN_ENDPOINT"):
            os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

        # Set the project name
        if project_name:
            os.environ["LANGCHAIN_PROJECT"] = project_name
        elif not os.getenv("LANGCHAIN_PROJECT"):
            os.environ["LANGCHAIN_PROJECT"] = "hipporag-retriever"

        # Check the API key
        if not os.getenv("LANGCHAIN_API_KEY") or os.getenv("LANGCHAIN_API_KEY") == "your_langsmith_api_key_here":
            print("[WARNING] Please set a valid LANGCHAIN_API_KEY environment variable")
            print("Visit https://smith.langchain.com to obtain an API key")

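    # A minimal sketch of the environment this setup expects (illustrative values;
    # the key below is a placeholder, not a real credential):
    #
    #   export LANGCHAIN_TRACING_V2=true
    #   export LANGCHAIN_ENDPOINT=https://api.smith.langchain.com
    #   export LANGCHAIN_PROJECT=hipporag-retriever
    #   export LANGCHAIN_API_KEY=<your LangSmith API key>
    #
    # _setup_langsmith only fills in defaults; it never overwrites values that are
    # already present in the environment.
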
@traceable(name="Complete_Retrieval_Process")
|
||||
def retrieve(self, query: str, mode: str = "0") -> Dict[str, Any]:
|
||||
"""
|
||||
带LangSmith追踪的完整检索过程
|
||||
"""
|
||||
print(f"[STARTING] 开始检索过程 (LangSmith追踪)")
|
||||
print(f"[POINT] 项目: {os.getenv('LANGCHAIN_PROJECT', 'hipporag-retriever')}")
|
||||
print(f"[SEARCH] 查询: {query}")
|
||||
print(f"[BUG] 调试模式: {mode}")
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# 创建初始状态
|
||||
initial_state = create_initial_state(
|
||||
original_query=query,
|
||||
max_iterations=self.max_iterations,
|
||||
debug_mode=mode
|
||||
)
|
||||
|
||||
# 执行工作流 - LangChain会自动追踪所有LLM调用
|
||||
# 配置LangSmith避免追踪大型数据结构
|
||||
config = {
|
||||
"recursion_limit": 50,
|
||||
"metadata": {
|
||||
"note": "大型PageRank数据仅保存本地,不上传LangSmith"
|
||||
},
|
||||
"callbacks": [], # 禁用额外的回调避免数据泄漏
|
||||
"run_name": f"Retrieval-{query[:20]}..."
|
||||
}
|
||||
|
||||
result_state = self.workflow.invoke(initial_state, config=config)
|
||||
|
||||
# 构建结果
|
||||
end_time = time.time()
|
||||
total_time = end_time - start_time
|
||||
|
||||
# 构建完整结果(本地使用)
|
||||
full_result = self._build_final_result(result_state, total_time)
|
||||
|
||||
# 清理结果状态中的大型数据结构以避免LangSmith传输
|
||||
cleaned_state = self._clean_state_for_langsmith(result_state)
|
||||
|
||||
# 为LangSmith创建轻量级版本,移除大型数据结构
|
||||
langsmith_result = self._build_langsmith_safe_result(cleaned_state, total_time)
|
||||
|
||||
print(f"[OK] 检索完成 (耗时: {total_time:.2f}秒)")
|
||||
print(f"[LINK] 在LangSmith中查看详细追踪信息")
|
||||
|
||||
# 返回轻量级版本给LangSmith,但将完整结果保存到文件
|
||||
self._save_full_result_to_file(full_result)
|
||||
|
||||
# 为了兼容langsmith_example.py,返回包含真实文档数据的结果
|
||||
# 但移除PageRank等大型数据以避免LangSmith传输问题
|
||||
result_with_docs = {
|
||||
"query": langsmith_result.get("query", ""),
|
||||
"answer": langsmith_result.get("answer", ""),
|
||||
"total_passages": langsmith_result.get("total_passages", 0),
|
||||
"retrieval_path": langsmith_result.get("retrieval_path", "unknown"),
|
||||
"iterations": langsmith_result.get("iterations", 0),
|
||||
"is_sufficient": langsmith_result.get("is_sufficient", False),
|
||||
"sub_queries": langsmith_result.get("sub_queries", [])[:3],
|
||||
|
||||
# 包含真实的文档数据(为langsmith_example.py使用)
|
||||
"all_documents": result_state.get('all_documents', []),
|
||||
"all_passages": result_state.get('all_passages', []),
|
||||
"passage_sources": result_state.get('passage_sources', []),
|
||||
|
||||
# 其他完整信息
|
||||
"query_complexity": result_state.get('query_complexity', {}),
|
||||
"decomposed_sub_queries": result_state.get('decomposed_sub_queries', []),
|
||||
"sufficiency_check": result_state.get('sufficiency_check', {}),
|
||||
|
||||
"debug_info": {
|
||||
"total_time": total_time,
|
||||
"langsmith_project": langsmith_result.get("debug_info", {}).get("langsmith_project", ""),
|
||||
"note": "包含真实文档数据的完整版本"
|
||||
}
|
||||
}
|
||||
|
||||
return result_with_docs
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print(f"\n[WARNING] 检索被用户中断")
|
||||
end_time = time.time()
|
||||
return {
|
||||
"query": query,
|
||||
"answer": "检索被用户中断",
|
||||
"error": "KeyboardInterrupt",
|
||||
"total_time": end_time - start_time,
|
||||
"iterations": 0,
|
||||
"total_passages": 0,
|
||||
"sub_queries": [],
|
||||
"debug_info": {"error": "KeyboardInterrupt", "total_time": end_time - start_time}
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"[ERROR] 检索过程出错: {e}")
|
||||
end_time = time.time()
|
||||
return {
|
||||
"query": query,
|
||||
"answer": f"抱歉,检索过程中遇到错误: {str(e)}",
|
||||
"error": str(e),
|
||||
"total_time": end_time - start_time,
|
||||
"iterations": 0,
|
||||
"total_passages": 0,
|
||||
"sub_queries": [],
|
||||
"debug_info": {"error": str(e), "total_time": end_time - start_time}
|
||||
}
|
||||
|
||||
    def _build_final_result(self, final_state, total_time: float) -> Dict[str, Any]:
        """Build the final result."""
        # Get the final token usage statistics
        final_token_info = self._get_token_usage_info()

        return {
            "query": final_state.get('original_query', ''),
            "answer": final_state.get('final_answer', '') or "Failed to generate an answer",

            # Query complexity information
            "query_complexity": final_state.get('query_complexity', {}),
            "is_complex_query": final_state.get('is_complex_query', False),
            "retrieval_path": "complex_hipporag" if final_state.get('is_complex_query', False) else "simple_vector",

            "iterations": final_state.get('current_iteration', 0),
            "total_passages": len(final_state.get('all_passages', [])),
            "sub_queries": final_state.get('sub_queries', []),
            "decomposed_sub_queries": final_state.get('decomposed_sub_queries', []),
            "initial_retrieval_details": final_state.get('initial_retrieval_details', {}),
            "sufficiency_check": final_state.get('sufficiency_check', {}),
            "current_sub_queries": final_state.get('current_sub_queries', []),
            "is_sufficient": final_state.get('is_sufficient', False),

            # Complete document and passage data (local file only)
            "all_documents": final_state.get('all_documents', []),
            "all_passages": final_state.get('all_passages', []),
            "passage_sources": final_state.get('passage_sources', []),

            # PageRank data is no longer kept in the state; it is stored in local files instead (avoids LangSmith transfer)
            "pagerank_data_available": final_state.get('pagerank_data_available', False),
            "pagerank_summary": final_state.get('pagerank_summary', {}),
            "concept_exploration_results": final_state.get('concept_exploration_results', {}),
            "exploration_round": final_state.get('exploration_round', 0),

            "debug_info": {
                "total_time": total_time,
                "retrieval_calls": final_state.get('debug_info', {}).get('retrieval_calls', 0),
                "llm_calls": final_state.get('debug_info', {}).get('llm_calls', 0),
                "langsmith_project": os.getenv('LANGCHAIN_PROJECT', 'hipporag-retriever'),

                # Token usage statistics
                "token_usage_summary": final_token_info,

                # Path statistics
                "complexity_analysis": {
                    "is_complex": final_state.get('is_complex_query', False),
                    "complexity_level": final_state.get('query_complexity', {}).get('complexity_level', 'unknown'),
                    "confidence": final_state.get('query_complexity', {}).get('confidence', 0),
                    "reason": final_state.get('query_complexity', {}).get('reason', '')
                },

                # Debug mode information
                "debug_mode_analysis": {
                    "debug_mode": final_state.get('debug_mode', '0'),
                    "debug_override": final_state.get('query_complexity', {}).get('debug_override', {}),
                    "path_override_applied": bool(final_state.get('query_complexity', {}).get('debug_override', {}))
                },

                # Sufficiency check history
                "sufficiency_analysis": {
                    "final_sufficiency": final_state.get('is_sufficient', False),
                    "sufficiency_check_details": final_state.get('sufficiency_check', {}),
                    "iteration_sufficiency_history": [
                        {
                            "iteration": item.get('iteration', 0),
                            "is_sufficient": item.get('is_sufficient', False),
                            "confidence": item.get('sufficiency_check', {}).get('confidence', 0),
                            "reason": item.get('sufficiency_check', {}).get('reason', '')
                        }
                        for item in final_state.get('iteration_history', [])
                        if 'sufficiency_check' in item
                    ],
                    "sufficiency_progression": self._analyze_sufficiency_progression(final_state)
                },

                # Routing decision history
                "routing_analysis": {
                    "total_routing_decisions": len([
                        item for item in final_state.get('iteration_history', [])
                        if item.get('action') in ['sufficiency_check', 'sub_query_generation', 'parallel_retrieval', 'collect_pagerank_scores']
                    ]),
                    "sub_query_generation_count": len([
                        item for item in final_state.get('iteration_history', [])
                        if item.get('action') == 'sub_query_generation'
                    ]),
                    "parallel_retrieval_count": len([
                        item for item in final_state.get('iteration_history', [])
                        if item.get('action') == 'parallel_retrieval'
                    ]),
                    "pagerank_collection_count": len([
                        item for item in final_state.get('iteration_history', [])
                        if item.get('action') == 'collect_pagerank_scores'
                    ])
                },

                # Concept exploration analysis (new)
                "concept_exploration_analysis": {
                    "exploration_enabled": final_state.get('exploration_round', 0) > 0,
                    "exploration_rounds": final_state.get('exploration_round', 0),
                    "pagerank_nodes_analyzed": len(final_state.get('pagerank_summary', {}).get('all_nodes_sorted', [])),
                    "successful_branches_total": sum([
                        round_data.get('successful_branches', 0)
                        for round_key, round_data in final_state.get('concept_exploration_results', {}).items()
                        if round_key.startswith('round_')
                    ]),
                    "total_branches_attempted": sum([
                        round_data.get('total_branches', 0)
                        for round_key, round_data in final_state.get('concept_exploration_results', {}).items()
                        if round_key.startswith('round_')
                    ])
                }
            },
            "all_passages": final_state.get('all_passages', []),
            "all_documents": final_state.get('all_documents', []),  # Document list
            "iteration_history": final_state.get('iteration_history', [])
        }

    def _clean_state_for_langsmith(self, state: Dict[str, Any]) -> Dict[str, Any]:
        """Strip large data structures from the state so they are not sent to LangSmith."""
        cleaned_state = state.copy()

        # PageRank data has already been removed from the state, so there is nothing to clean here
        if cleaned_state.get('pagerank_data_available', False):
            print("[?] PageRank data is stored locally and is not included in the state")

        # Clean large data out of the concept exploration results
        if 'concept_exploration_results' in cleaned_state:
            cleaned_exploration = {}
            for key, value in cleaned_state['concept_exploration_results'].items():
                if isinstance(value, dict):
                    # Keep only the statistics; drop the detailed node data
                    cleaned_exploration[key] = {
                        'total_branches': value.get('total_branches', 0),
                        'successful_branches': value.get('successful_branches', 0),
                        'exploration_type': value.get('exploration_type', 'unknown')
                    }
                else:
                    cleaned_exploration[key] = value
            cleaned_state['concept_exploration_results'] = cleaned_exploration
            print("[?] Cleaned detailed concept exploration results, keeping statistics only")

        # Clean detailed node data out of the PageRank summary
        if 'pagerank_summary' in cleaned_state:
            summary = cleaned_state['pagerank_summary']
            if isinstance(summary, dict) and 'all_nodes_sorted' in summary:
                nodes_count = len(summary.get('all_nodes_sorted', []))
                cleaned_summary = {k: v for k, v in summary.items() if k != 'all_nodes_sorted'}
                cleaned_summary['nodes_count'] = nodes_count
                cleaned_state['pagerank_summary'] = cleaned_summary
                print(f"[?] Cleaned {nodes_count} PageRank node details, keeping statistics only")

        return cleaned_state

    def _analyze_sufficiency_progression(self, final_state) -> Dict[str, Any]:
        """Analyze how the sufficiency checks progressed across iterations."""
        iteration_history = final_state.get('iteration_history', [])
        sufficiency_checks = [
            item for item in iteration_history
            if 'sufficiency_check' in item
        ]

        if not sufficiency_checks:
            return {"status": "no_sufficiency_checks"}

        # Analyze the progression pattern
        confidences = [sc.get('sufficiency_check', {}).get('confidence', 0) for sc in sufficiency_checks]
        sufficiencies = [sc.get('is_sufficient', False) for sc in sufficiency_checks]

        progression_pattern = "unknown"
        if len(sufficiencies) >= 2:
            if not sufficiencies[0] and sufficiencies[-1]:
                progression_pattern = "improved_to_sufficient"
            elif all(sufficiencies):
                progression_pattern = "consistently_sufficient"
            elif not any(sufficiencies):
                progression_pattern = "consistently_insufficient"
            else:
                progression_pattern = "mixed"

        return {
            "total_checks": len(sufficiency_checks),
            "confidence_progression": confidences,
            "sufficiency_progression": sufficiencies,
            "pattern": progression_pattern,
            "final_confidence": confidences[-1] if confidences else 0,
            "confidence_improvement": confidences[-1] - confidences[0] if len(confidences) >= 2 else 0
        }

    def _get_token_usage_info(self) -> Dict[str, Any]:
        """
        Get the current token usage information.
        """
        try:
            # Try to read token information from the different attribute paths
            debug_info = {}

            # Check whether self.llm exists
            if hasattr(self, 'llm'):
                debug_info["has_llm"] = True

                # Check whether oneapi_generator exists
                if hasattr(self.llm, 'oneapi_generator'):
                    debug_info["has_generator"] = True
                    generator = self.llm.oneapi_generator

                    # Get the token statistics
                    last_usage = getattr(generator, 'last_token_usage', {})
                    total_usage = getattr(generator, 'total_token_usage', {})
                    model_name = getattr(generator, 'model_name', 'unknown')

                    debug_info.update({
                        "last_call": last_usage,
                        "total_usage": total_usage,
                        "model_name": model_name,
                        "has_last_usage": bool(last_usage),
                        "has_total_usage": bool(total_usage)
                    })

                    return debug_info
                else:
                    debug_info["has_generator"] = False
            else:
                debug_info["has_llm"] = False

            debug_info["error"] = "Token information not found"
            return debug_info

        except Exception as e:
            return {
                "error": f"Failed to get token information: {str(e)}",
                "exception_type": type(e).__name__
            }

@traceable(name="Simple_Retrieve")
|
||||
def retrieve_simple(self, query: str, mode: str = "0") -> str:
|
||||
"""简单检索接口"""
|
||||
result = self.retrieve(query, mode)
|
||||
return result.get('answer', '')
|
||||
|
||||
    def _build_langsmith_safe_result(self, final_state, total_time: float) -> Dict[str, Any]:
        """Build a LangSmith-safe result, dropping large data structures to stay under transfer limits."""
        final_token_info = self._get_token_usage_info()

        return {
            "query": final_state.get('original_query', ''),
            "answer": final_state.get('final_answer', '') or "Failed to generate an answer",

            # Query complexity information
            "query_complexity": final_state.get('query_complexity', {}),
            "is_complex_query": final_state.get('is_complex_query', False),
            "retrieval_path": "complex_hipporag" if final_state.get('is_complex_query', False) else "simple_vector",

            "iterations": final_state.get('current_iteration', 0),
            "total_passages": len(final_state.get('all_passages', [])),
            "sub_queries": final_state.get('sub_queries', []),
            "decomposed_sub_queries": final_state.get('decomposed_sub_queries', []),
            "sufficiency_check": final_state.get('sufficiency_check', {}),
            "is_sufficient": final_state.get('is_sufficient', False),

            # Document and passage data (langsmith_example.py needs these fields).
            # Note: this data is only consumed by local scripts and is not sent to the LangSmith web UI.
            "all_documents": final_state.get('all_documents', []),
            "all_passages": final_state.get('all_passages', []),

            # Simplified statistics, without large data structures
            "pagerank_summary_stats": {
                "data_available": final_state.get('pagerank_data_available', False),
                "exploration_rounds": final_state.get('exploration_round', 0),
                "has_concept_exploration": bool(final_state.get('concept_exploration_results', {}))
            },

            "debug_info": {
                "total_time": total_time,
                "retrieval_calls": final_state.get('debug_info', {}).get('retrieval_calls', 0),
                "llm_calls": final_state.get('debug_info', {}).get('llm_calls', 0),
                "langsmith_project": os.getenv('LANGCHAIN_PROJECT', 'hipporag-retriever'),
                "token_usage_summary": final_token_info,
                "complexity_analysis": {
                    "is_complex": final_state.get('is_complex_query', False),
                    "complexity_level": final_state.get('query_complexity', {}).get('complexity_level', 'unknown'),
                    "confidence": final_state.get('query_complexity', {}).get('confidence', 0),
                    "reason": final_state.get('query_complexity', {}).get('reason', '')
                },
                "final_sufficiency": final_state.get('is_sufficient', False),
                "note": "The full result has been saved to a local file; this is the LangSmith-optimized version"
            }
        }

    def _save_full_result_to_file(self, full_result: Dict[str, Any]):
        """Save the full result to a local file."""
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"langsmith_full_{timestamp}.json"

            # Create the json_langsmith directory (if it does not exist)
            json_dir = os.path.join(os.path.dirname(__file__), "json_langsmith")
            os.makedirs(json_dir, exist_ok=True)

            filepath = os.path.join(json_dir, filename)

            # Serialization needs special handling for numpy arrays, Document objects, etc.
            def json_serializer(obj):
                # Handle langchain Document objects
                if hasattr(obj, 'page_content') and hasattr(obj, 'metadata'):
                    return {
                        'page_content': obj.page_content,
                        'metadata': obj.metadata
                    }
                # Handle numpy arrays
                elif hasattr(obj, 'tolist'):
                    return obj.tolist()
                # Handle other custom objects
                elif hasattr(obj, '__dict__'):
                    try:
                        return obj.__dict__
                    except Exception:
                        return str(obj)
                else:
                    return str(obj)

            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(full_result, f, ensure_ascii=False, indent=2, default=json_serializer)

            print(f"[FOLDER] Full result saved: {filename}")

        except Exception as e:
            print(f"[WARNING] Failed to save full result: {e}")


def create_langsmith_retriever(
    keyword: str,
    top_k: int = 2,
    max_iterations: int = 2,
    max_parallel_retrievals: int = 2,
    langsmith_project: Optional[str] = None,
    **kwargs
) -> LangSmithIterativeRetriever:
    """Create an iterative retriever with LangSmith monitoring."""
    return LangSmithIterativeRetriever(
        keyword=keyword,
        top_k=top_k,
        max_iterations=max_iterations,
        max_parallel_retrievals=max_parallel_retrievals,
        langsmith_project=langsmith_project,
        **kwargs
    )


# Utility function for checking the LangSmith connection status
def check_langsmith_connection() -> bool:
    """Check the LangSmith connection status, with a retry mechanism."""
    import time
    max_retries = 3
    retry_delay = 2  # seconds

    for attempt in range(max_retries):
        try:
            from langsmith import Client

            api_key = os.getenv("LANGCHAIN_API_KEY")
            if not api_key or api_key == "your_langsmith_api_key_here":
                print("[ERROR] LangSmith API key is missing or invalid")
                return False

            # Keep the check short
            client = Client()
            # Try to list projects to verify the connection
            projects = list(client.list_projects(limit=1))
            print("[OK] LangSmith connection is working")
            return True

        except Exception as e:
            error_msg = str(e)

            # Check for known service-unavailable errors
            if any(keyword in error_msg.lower() for keyword in ['503', 'service unavailable', 'server error', 's3', 'timeout', 'deadline exceeded']):
                print(f"[WARNING] LangSmith service temporarily unavailable (attempt {attempt + 1}/{max_retries}): {error_msg[:100]}...")

                if attempt < max_retries - 1:  # Retries remaining
                    print(f"⏰ Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    retry_delay *= 1.5  # Back off the retry interval
                    continue
                else:
                    print("[ERROR] LangSmith service is persistently unavailable; running in local mode")
                    return False
            else:
                print(f"[ERROR] LangSmith connection failed: {e}")
                return False

    return False
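

# Minimal usage sketch (assumptions: the parent IterativeRetriever can be built from the
# keyword below; "documents", the project name, and the query are placeholders, not values
# defined by this module).
if __name__ == "__main__":
    if check_langsmith_connection():
        retriever = create_langsmith_retriever(
            keyword="documents",  # placeholder keyword for the underlying index
            langsmith_project="hipporag-retriever-demo",  # placeholder project name
        )
        result = retriever.retrieve("What is HippoRAG?")  # placeholder query
        print(result.get("answer", ""))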