微信扫码
添加专属顾问
我要投稿
手把手教你用DeepEval自定义模型评估RAG实例,代码级实战指南。 核心内容: 1. 快速导入DeepEval评估框架的关键依赖包 2. 自定义Qwen模型对接DeepEval的完整实现 3. 同步/异步API调用的工程化处理技巧
# Dependencies
import asyncio
import json
import time

import requests

from deepeval.metrics import (
    FaithfulnessMetric,
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
)
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase

from services.ChatService import ChatService
2、自定义模型
# Custom evaluation model: adapts a Qwen chat-completions endpoint to DeepEval.
class QwenModel(DeepEvalBaseLLM):
    """DeepEval model wrapper that proxies prompts to a Qwen chat API."""

    def __init__(self):
        # SECURITY NOTE(review): credentials are hard-coded in source.
        # Move the key to an environment variable or secrets store.
        self.api_key = "fastgpt-*******"
        self.base_url = "https://jz-fastgpt-stable.djtest.cn/api/v1"
        self.model_name = "qwen-max"

    def load_model(self):
        # DeepEval calls this to obtain the underlying model object;
        # this wrapper is itself the model.
        return self

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the chat-completions API and return the reply text.

        Raises:
            RuntimeError: if the API responds with a non-200 status.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model_name,
            "messages": [{"role": "user", "content": prompt}],
            # temperature 0 for deterministic, reproducible evaluation runs
            "temperature": 0,
        }
        response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=payload,   # requests serializes the body itself
            timeout=120,    # fail fast instead of hanging forever on a dead endpoint
        )
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        raise RuntimeError(f"API 调用失败: {response.status_code}, {response.text}")

    async def a_generate(self, prompt: str) -> str:
        """Async variant of :meth:`generate`.

        The original implementation called ``self.generate`` directly, which
        blocks the event loop for the whole HTTP round-trip; running it in a
        worker thread keeps other coroutines responsive.
        """
        return await asyncio.to_thread(self.generate, prompt)

    def get_model_name(self):
        # Identifier DeepEval shows in metric reports.
        return self.model_name
# Evaluation wrapper: runs the four DeepEval RAG metrics against one Q/A pair.
class EvalService:
    """Builds LLMTestCases from FastGPT responses and scores them with DeepEval."""

    def _measure(self, metric, label: str, ques: str, response):
        """Run one DeepEval metric and return {'score', 'reason'}.

        Shared implementation for the four public ``get_*`` methods below,
        which previously duplicated this body verbatim.
        """
        test_case = self.get_test_case(ques, response)
        metric.measure(test_case)
        outcome = {"score": metric.score, "reason": metric.reason}
        print(f"{label}:{outcome}")
        return outcome

    def get_faithfulness(self, ques: str, response):
        """Score how faithful the answer is to the retrieved context."""
        return self._measure(
            FaithfulnessMetric(model=QwenModel()), "faithfulness", ques, response
        )

    def get_contextprecision(self, ques: str, response):
        """Score whether relevant retrieved chunks are ranked first."""
        return self._measure(
            ContextualPrecisionMetric(model=QwenModel()), "contextprecision", ques, response
        )

    def get_contextrecall(self, ques: str, response):
        """Score how much of the expected answer the retrieved context covers."""
        return self._measure(
            ContextualRecallMetric(model=QwenModel()), "contextrecall", ques, response
        )

    def get_contextrelevant(self, ques: str, response):
        """Score how relevant the retrieved context is to the question."""
        return self._measure(
            ContextualRelevancyMetric(model=QwenModel()), "contextrelevant", ques, response
        )

    def get_test_case(self, ques: str, result):
        """Convert a FastGPT chat-completions payload into an LLMTestCase.

        NOTE(review): assumes ``responseData[1]`` carries ``quoteList`` and
        ``responseData[2]`` carries ``historyPreview`` — confirm against the
        FastGPT response schema; an IndexError/KeyError here means the app's
        pipeline shape changed.
        """
        quote_list = result["responseData"][1]["quoteList"]
        retrival_context = [f"{quote['q']}:{quote['a']}" for quote in quote_list]
        historypreview = result["responseData"][2]["historyPreview"]
        context = [history['value'] for history in historypreview]
        answer = result["choices"][0]["message"]["content"]
        # NOTE(review): expected_output is set to the model's own answer, so
        # precision/recall are measured against the answer rather than an
        # independent ground truth — scores will be optimistic.
        return LLMTestCase(
            input=ques,
            actual_output=answer,
            expected_output=answer,
            context=context,
            retrieval_context=retrival_context,
        )
if __name__ == "__main__":
    # Target AI application (FastGPT-compatible chat-completions endpoint).
    endpoint = 'https://XXXXXX/api/v1/chat/completions'
    api_key = 'fastgpt-XXXXXX'
    chat = ChatService(endpoint, api_key)

    # Ask the AI application once and reuse the raw response for every metric.
    question = "XXX怎么收费?"
    result = chat.question_response(question)

    evaluator = EvalService()
    evaluator.get_faithfulness(question, result)
    evaluator.get_contextprecision(question, result)
    evaluator.get_contextrecall(question, result)
    evaluator.get_contextrelevant(question, result)
faithfulness:{'score': 1.0, 'reason': '实际输出与检索上下文完全一致,没有任何矛盾之处,所以得到了满分1.00的忠实度评分。'}contextprecision:{'score': 1.0, 'reason': '得分为1.00,因为相关的节点(即第一个节点)被正确地排在了最前面。'}contextrecall:{'score': 0.5, 'reason': '分数为0.50,因为虽然节点在检索上下文中提到了'}contextrelevant:{'score': 0.16666666666666666, 'reason': "分数为0.17,因为大部分检索内容并未涉及XXX问题,例如……"}
53AI,企业落地大模型首选服务商
产品:场景落地咨询+大模型应用平台+行业解决方案
承诺:免费POC验证,效果达标后再合作。零风险落地应用大模型,已交付160+中大型企业
2025-08-30
涌现观点|RAG评估的"不可能三角":当独角兽公司因AI评估失误损失10亿美元时,我们才意识到这个被忽视的技术死角
2025-08-29
RAG2.0进入“即插即用”时代!清华YAML+MCP让复杂RAG秒变“乐高”
2025-08-29
利用RAG构建智能问答平台实战经验分享
2025-08-29
RAG如七夕,鹊桥大工程:再看文档解析实际落地badcase
2025-08-29
基于智能体增强生成式检索(Agentic RAG)的流程知识提取技术研究
2025-08-29
RAG 为何能瞬间找到答案?向量数据库告诉你
2025-08-28
寻找RAG通往上下文工程之桥:生成式AI的双重基石重构
2025-08-28
万字长文详解优图RAG技术
2025-06-05
2025-06-06
2025-06-05
2025-06-05
2025-06-20
2025-06-20
2025-07-15
2025-06-24
2025-06-24
2025-06-05