import mlflow
import numpy as np

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt import SparkTrials
from hyperopt import space_eval
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
# 1. Define the hyperparameter search space.
# hp.choice samples from a discrete list, hp.uniform samples a float in a
# linear range, and hp.loguniform samples on a log scale (appropriate for
# learning rates and regularization strengths spanning orders of magnitude).
search_space = {
    # Tree structure / ensemble size
    "max_depth": hp.choice("max_depth", [3, 5, 7, 9, 11]),
    "n_estimators": hp.choice("n_estimators", [100, 200, 300, 500]),
    # Learning rate on a log scale between 0.01 and 0.3
    "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
    # Overfitting controls
    "min_child_weight": hp.uniform("min_child_weight", 1, 10),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
    "gamma": hp.uniform("gamma", 0, 5),
    # L1/L2 regularization, log scale between 0.001 and 10
    "reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(10)),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.001), np.log(10)),
}
# 2. Define the objective function that Hyperopt minimizes.
def objective(params):
    """Evaluate one hyperparameter sample and return its Hyperopt loss.

    Trains an ``XGBClassifier`` with *params* and scores it with 5-fold
    cross-validation (weighted F1) on the module-level ``X_train`` /
    ``y_train`` (assumed to be defined earlier in the pipeline — TODO
    confirm). The sampled parameters and the mean CV score are logged to
    a nested MLflow run so each trial appears under the parent run.

    Returns:
        dict: ``{"loss": -mean_f1, "status": STATUS_OK}`` — Hyperopt
        minimizes ``loss``, so the score to be maximized is negated.
    """
    with mlflow.start_run(nested=True):
        # Log this trial's sampled hyperparameters.
        mlflow.log_params(params)
        # Train and cross-validate the model.
        # NOTE: use_label_encoder=False is a no-op (and deprecated) in
        # xgboost >= 1.6; kept for compatibility with older versions.
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric="logloss")
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1_weighted")
        # Cast to plain float so MLflow receives a native Python number.
        avg_score = float(np.mean(scores))
        mlflow.log_metric("cv_f1_score", avg_score)
        # Hyperopt minimizes, so return the negated score as the loss.
        return {"loss": -avg_score, "status": STATUS_OK}
# 3. Run the distributed tuning session under a parent MLflow run.
with mlflow.start_run(run_name="xgboost-tuning"):
    # SparkTrials distributes trials across a Spark cluster, running up to
    # 8 trials concurrently.
    spark_trials = SparkTrials(parallelism=8)
    best_indices = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,   # Tree-structured Parzen Estimator
        max_evals=100,      # at most 100 trials
        trials=spark_trials,
    )
    # BUG FIX: for hp.choice dimensions, fmin returns the *index* into the
    # choice list (e.g. max_depth=2 means the value 7), not the value
    # itself. space_eval resolves the raw result back to actual values.
    best_params = space_eval(search_space, best_indices)
    print(f"Best parameters: {best_params}")