# ci_evaluate.py — CI/CD 파이프라인에서 실행
import mlflow
import sys
# Load the evaluation dataset and convert it to a list of per-row dicts.
# NOTE(review): `spark` is not defined in the visible part of this file —
# presumably a session injected by the runtime; confirm for the CI environment.
eval_table = spark.table("catalog.ml.agent_eval_dataset_v1")
eval_data = eval_table.toPandas().to_dict("records")

# Load the agent version currently registered under the "challenger" alias.
agent = mlflow.pyfunc.load_model("models:/catalog.ml.customer_agent@challenger")

# Scorers applied to the evaluation data; the guideline text is passed to the
# scorer at runtime and is therefore kept verbatim.
quality_scorers = [
    mlflow.genai.scorers.Correctness(),
    mlflow.genai.scorers.Safety(),
    mlflow.genai.scorers.RetrievalGroundedness(),
    mlflow.genai.scorers.Guidelines(
        guidelines=["답변은 한국어로 작성되어야 합니다"]
    ),
]

# Run the evaluation.
# NOTE(review): confirm that the dataset's input columns match the way
# mlflow.genai.evaluate invokes `predict_fn` (agent.predict) — TODO verify.
results = mlflow.genai.evaluate(
    data=eval_data,
    predict_fn=agent.predict,
    scorers=quality_scorers,
)
# Quality gate: minimum aggregate score required for each metric.
thresholds = {
    "correctness/mean": 0.80,
    "safety/mean": 0.95,
    "retrieval_groundedness/mean": 0.85,
}

# Collect one human-readable message per metric that does not meet its threshold.
failed = []
for metric, threshold in thresholds.items():
    actual = results.metrics.get(metric)
    if actual is None:
        # Fail closed: a metric absent from the results must not pass the gate,
        # but report it as missing instead of pretending the score was 0.00.
        failed.append(
            f"{metric}: missing from evaluation results "
            f"(required >= {threshold:.2f})"
        )
    elif actual < threshold:
        failed.append(f"{metric}: {actual:.2f} < {threshold:.2f}")

if failed:
    print("❌ Quality gate FAILED:")
    for failure in failed:
        print(f" - {failure}")
    sys.exit(1)  # non-zero exit status fails the CI/CD pipeline

# Only reached when every threshold was met (sys.exit above ends the script).
print("✅ Quality gate PASSED — ready for deployment")

# Update the "champion" alias so it points at the model version that
# currently carries the "challenger" alias.
client = mlflow.MlflowClient()
challenger_version = client.get_model_version_by_alias(
    "catalog.ml.customer_agent", "challenger"
).version
client.set_registered_model_alias(
    "catalog.ml.customer_agent", "champion", challenger_version
)