import os
import cloudpickle
from tune.api.factory import TUNE_OBJECT_FACTORY
from typing import Any, Optional, Tuple
from uuid import uuid4
import numpy as np
import pandas as pd
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_val_score
from triad import FileSystem
from tune import NonIterativeObjectiveFunc, Trial, TrialReport
from tune.constants import (
SPACE_MODEL_NAME,
TUNE_DATASET_DF_DEFAULT_NAME,
TUNE_DATASET_VALIDATION_DF_DEFAULT_NAME,
)
from tune_sklearn.utils import to_sk_model, to_sk_model_expr
class SKObjective(NonIterativeObjectiveFunc):
    """Objective that fits a scikit-learn model and scores it on a validation set.

    Each call to :meth:`run` builds a model from the trial parameters, fits it
    on the dataframe stored under ``TUNE_DATASET_DF_DEFAULT_NAME`` and scores
    it on the dataframe stored under
    ``TUNE_DATASET_VALIDATION_DF_DEFAULT_NAME``.

    :param scoring: value handed to ``sklearn.metrics.get_scorer``
    :param feature_prefix: only columns whose names start with this prefix are
        used as features; the default ``""`` keeps every non-label column
    :param label_col: name of the label column, defaults to ``"label"``
    :param checkpoint_path: when not None it is resolved through
        ``TUNE_OBJECT_FACTORY.get_path_or_temp`` and every fitted model is
        pickled into that location
    """

    def __init__(
        self,
        scoring: Any,
        feature_prefix: str = "",
        label_col: str = "label",
        checkpoint_path: Optional[str] = None,
    ) -> None:
        super().__init__()
        # User supplied configuration.
        self._scoring = scoring
        self._feature_prefix = feature_prefix
        self._label_col = label_col
        # State cached between calls to ``run``; it is refreshed whenever the
        # incoming trial id differs from ``_last_id``.
        self._last_id = ""
        self._model_type: Any = None
        self._model_expr: str = ""
        # ``None`` disables checkpointing; any other value is resolved once here.
        self._checkpoint_path = (
            None
            if checkpoint_path is None
            else TUNE_OBJECT_FACTORY.get_path_or_temp(checkpoint_path)
        )

    def generate_sort_metric(self, value: float) -> float:
        """Return the negated ``value`` to be used as the sort metric.

        NOTE(review): presumably the framework treats a smaller sort metric as
        better while sklearn scorers are greater-is-better -- confirm.
        """
        return -value

    def run(self, trial: Trial) -> TrialReport:
        """Fit the model described by ``trial`` and report its validation score.

        The model class, its expression and the shuffled train/validation data
        are rebuilt only when ``trial.trial_id`` differs from the id seen on
        the previous call. When a checkpoint path is configured, the fitted
        model is pickled to ``<checkpoint_path>/<uuid4>.pkl`` and that file
        path is stored in the report metadata under ``"checkpoint_path"``.

        :param trial: the trial carrying the parameters and the dataframes
        :return: a report holding the score, the metadata and the sort metric
        """
        model_kwargs = dict(trial.params.simple_value)
        # The model name is not a constructor argument, so it is always removed.
        model_name = model_kwargs.pop(SPACE_MODEL_NAME)
        if trial.trial_id != self._last_id:
            self._model_type = to_sk_model(model_name)
            self._model_expr = to_sk_model_expr(self._model_type)
            train_df = trial.dfs[TUNE_DATASET_DF_DEFAULT_NAME]
            self._train_x, self._train_y = self._reset_xy(train_df)
            validation_df = trial.dfs[TUNE_DATASET_VALIDATION_DF_DEFAULT_NAME]
            self._test_x, self._test_y = self._reset_xy(validation_df)
            self._last_id = trial.trial_id

        estimator = self._model_type(**model_kwargs)
        fitted = estimator.fit(self._train_x, self._train_y)
        scorer = get_scorer(self._scoring)
        metric = scorer(fitted, self._test_x, self._test_y)

        metadata = {"model": self._model_expr}
        if self._checkpoint_path is not None:
            # A random file name keeps checkpoints of different runs apart.
            fp = os.path.join(self._checkpoint_path, f"{uuid4()}.pkl")
            with FileSystem().openbin(fp, mode="wb") as f:
                cloudpickle.dump(fitted, f)
            metadata["checkpoint_path"] = fp

        sort_metric = self.generate_sort_metric(metric)
        return TrialReport(
            trial,
            metric=metric,
            metadata=metadata,
            sort_metric=sort_metric,
        )

    def _reset_xy(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Shuffle ``df`` and split it into features and labels.

        Rows are shuffled with a fixed ``random_state=0`` and the index is
        reset. Features are the non-label columns whose names start with the
        configured feature prefix.

        NOTE(review): the labels are selected with a single column key, which
        in pandas normally yields a Series rather than the annotated
        DataFrame -- confirm before tightening the annotation.

        :param df: dataframe containing the feature columns and the label column
        :return: the ``(features, labels)`` pair
        """
        shuffled = df.sample(frac=1, random_state=0).reset_index(drop=True)
        candidates = shuffled.drop(columns=[self._label_col])
        prefix = self._feature_prefix
        feature_cols = [name for name in candidates.columns if name.startswith(prefix)]
        return candidates[feature_cols], shuffled[self._label_col]
class SKCVObjective(SKObjective):
    """Objective that scores a scikit-learn model with cross validation.

    Only the dataframe stored under ``TUNE_DATASET_DF_DEFAULT_NAME`` is used;
    the reported metric is the mean of the per-fold scores returned by
    ``sklearn.model_selection.cross_val_score``.

    :param scoring: value passed as ``scoring`` to ``cross_val_score``
    :param cv: value passed as ``cv`` to ``cross_val_score``, defaults to 5
    :param feature_prefix: only columns whose names start with this prefix are
        used as features
    :param label_col: name of the label column, defaults to ``"label"``
    :param checkpoint_path: when not None, a model refitted on the whole
        training data is pickled under this location after scoring
    """

    def __init__(
        self,
        scoring: Any,
        cv: int = 5,
        feature_prefix: str = "",
        label_col: str = "label",
        checkpoint_path: Optional[str] = None,
    ) -> None:
        super().__init__(
            scoring=scoring,
            feature_prefix=feature_prefix,
            label_col=label_col,
            checkpoint_path=checkpoint_path,
        )
        self._cv = cv

    def run(self, trial: Trial) -> TrialReport:
        """Cross validate the model described by ``trial`` and report the mean score.

        The model class, its expression and the shuffled training data are
        rebuilt only when ``trial.trial_id`` differs from the id seen on the
        previous call. The individual fold scores are stored in the metadata
        under ``"cv_scores"``. When a checkpoint path is configured, the model
        is fitted on the full training data, pickled to
        ``<checkpoint_path>/<uuid4>.pkl`` and the file path is stored under
        ``"checkpoint_path"``.

        :param trial: the trial carrying the parameters and the dataframes
        :return: a report holding the mean score, the metadata and the sort
            metric
        """
        model_kwargs = dict(trial.params.simple_value)
        # The model name is not a constructor argument, so it is always removed.
        model_name = model_kwargs.pop(SPACE_MODEL_NAME)
        if trial.trial_id != self._last_id:
            self._model_type = to_sk_model(model_name)
            self._model_expr = to_sk_model_expr(self._model_type)
            train_df = trial.dfs[TUNE_DATASET_DF_DEFAULT_NAME]
            self._train_x, self._train_y = self._reset_xy(train_df)
            self._last_id = trial.trial_id

        estimator = self._model_type(**model_kwargs)
        fold_scores = cross_val_score(
            estimator,
            self._train_x,
            self._train_y,
            cv=self._cv,
            scoring=self._scoring,
        )
        metadata = {
            "model": self._model_expr,
            "cv_scores": [float(score) for score in fold_scores],
        }
        if self._checkpoint_path is not None:
            # The estimator handed to cross_val_score is fitted here on the
            # complete training data before being persisted.
            estimator.fit(self._train_x, self._train_y)
            fp = os.path.join(self._checkpoint_path, f"{uuid4()}.pkl")
            with FileSystem().openbin(fp, mode="wb") as f:
                cloudpickle.dump(estimator, f)
            metadata["checkpoint_path"] = fp

        metric = float(np.mean(fold_scores))
        sort_metric = self.generate_sort_metric(metric)
        return TrialReport(
            trial,
            metric=metric,
            metadata=metadata,
            sort_metric=sort_metric,
        )