# Source code for hyperts.experiment

# -*- coding:utf-8 -*-
"""

"""
from hypernets.searchers import make_searcher
from hypernets.discriminators import make_discriminator
from hypernets.experiment.cfg import ExperimentCfg as cfg
from hypernets.tabular.cache import clear as _clear_cache
from hypernets.utils import logging, isnotebook, load_module

from hyperts.utils import get_tool_box
from hyperts.utils import consts, set_random_state
from hyperts.hyper_ts import HyperTS as hyper_ts_cls
from hyperts.framework.compete import TSCompeteExperiment

logger = logging.get_logger(__name__)

def make_experiment(train_data,
                    task,
                    eval_data=None,
                    test_data=None,
                    mode='stats',
                    max_trials=50,
                    eval_size=0.2,
                    cv=False,
                    num_folds=3,
                    ensemble_size=10,
                    target=None,
                    freq=None,
                    timestamp=None,
                    forecast_train_data_periods=None,
                    forecast_drop_part_sample=False,
                    timestamp_format='%Y-%m-%d %H:%M:%S',
                    covariates=None,
                    dl_forecast_window=None,
                    dl_forecast_horizon=1,
                    contamination=0.05,
                    id=None,
                    searcher=None,
                    search_space=None,
                    search_callbacks=None,
                    searcher_options=None,
                    callbacks=None,
                    early_stopping_rounds=20,
                    early_stopping_time_limit=3600,
                    early_stopping_reward=None,
                    reward_metric=None,
                    optimize_direction=None,
                    discriminator=None,
                    hyper_model_options=None,
                    tf_gpu_usage_strategy=0,
                    tf_memory_limit=2048,
                    final_retrain_on_wholedata=True,
                    verbose=1,
                    log_level=None,
                    random_state=None,
                    clear_cache=None,
                    **kwargs):
    """Build a runnable time series experiment (``TSCompeteExperiment``).

    Parameters
    ----------
    train_data : str, Pandas or Dask or Cudf DataFrame.
        Feature data for training with target column.
        For str, it should be the data path in the file system and will be loaded as
        a pandas DataFrame. The data format is detected from this path
        (only .csv and .parquet are supported now).
    task : str.
        Task could be 'univariate-forecast', 'multivariate-forecast', 'univariate-binaryclass',
        'univariate-multiclass', 'multivariate-binaryclass', and 'multivariate-multiclass'.
        Notably, task can also be 'forecast', 'classification', 'regression', and 'detection'.
        Besides, the abbreviations 'tsf', 'utsf', 'mtsf', 'tsc', 'tsr', 'tsd' ('tsa', 'tsad')
        are also accepted. For the generic names, HyperTS infers the detailed task type from
        the data combined with the other known column information.
    eval_data : str, Pandas or Dask or Cudf DataFrame, optional.
        Feature data for evaluation, should be None or have the same python type as 'train_data'.
    test_data : str, Pandas or Dask or Cudf DataFrame, optional.
        Feature data for testing without target column, should be None or have the same python
        type as 'train_data'.
    mode : str, default 'stats'.
        Optional {'stats', 'dl', 'nas'}, where,
        'stats' indicates that all the models selected in the experiment are statistical models.
        'dl' indicates that all the models selected in the experiment are deep learning models.
        'nas' indicates that the selected model of the experiment will be a deep network model
        found by neural architecture search (no NAS search space exists for detection tasks).
    max_trials : int, maximum number of trials (model search), optional, (default=50).
    eval_size : float or int, (default=0.2).
        When eval_data is None, the ratio used to split the eval_data from the train_data.
        An int indicates the prediction length of the forecast task.
    cv : bool, default False.
        If True, use cross-validation instead of evaluation set reward to guide the search process.
    num_folds : int, default 3.
        Number of cross-validated folds, only valid when cv is true.
    ensemble_size : int or None, default 10.
        The number of estimators to ensemble. During the AutoML process, a lot of models will be
        generated with different preprocessing pipelines, different models, and different
        hyperparameters. Usually selecting some of the models that perform well to ensemble can
        obtain better generalization ability than just selecting the single best model.
    target : str or list, optional.
        Target feature name for training, which must be one of the train_data columns for
        classification [str], regression [str] or univariate forecast task [list].
        For multivariate forecast task, it is multiple columns of training data.
    freq : str, DateOffset or None, default None.
        Note: If your task is a discontinuous time series, you can specify the freq as 'Discrete'.
    timestamp : str, forecast task 'timestamp' cannot be None, (default=None).
    forecast_train_data_periods : int, (default=None).
        Cut off a certain period of data from the train data from back to front as a train set.
    forecast_drop_part_sample : bool, (default=False).
        Forwarded to the default search space as ``drop_observed_sample``.
    timestamp_format : str, the date format of timestamp col for forecast task,
        (default='%Y-%m-%d %H:%M:%S').
    covariates/covariables : list[n*str], (default=None).
        If the data contains covariates, specify the covariable column names.
    dl_forecast_window : int, list or None, (default=None).
        When selecting 'dl' or 'nas' mode, you can specify window, which is the sequence
        length of each sample (lag).
    dl_forecast_horizon : int or None, (default=1).
        When selecting 'dl' or 'nas' mode, you can specify horizon, which is the length of
        the interval between the input and the target.
    contamination : float, should be in the interval (0, 1], optional (default=0.05).
        This parameter is adopted only in anomaly detection task to generate pseudo ground truth.
        The amount of contamination of the data set, i.e. the proportion of outliers in the data
        set. Used when fitting to define the threshold on the scores of the samples.
    id : str or None, (default=None).
        The experiment id. When None, an id is derived from a hash of the data and task.
    searcher : str, searcher class, search object, optional.
        The hypernets Searcher instance to explore search space, default is EvolutionSearcher
        instance. For str, should be one of 'evolution', 'mcts', 'random'.
        For class, should be one of EvolutionSearcher, MCTSSearcher, RandomSearcher, or subclass
        of hypernets Searcher. For other, should be instance of hypernets Searcher.
    search_space : callable, optional
        Used to initialize searcher instance (if searcher is None, str or class).
    search_callbacks
        Hypernets search callbacks, used to initialize searcher instance (if searcher is None,
        str or class). If log_level >= WARNING, default is EarlyStoppingCallback only.
        If log_level < WARNING, default is EarlyStoppingCallback plus SummaryCallback.
    searcher_options : dict, optional, default is None.
        The options to create searcher, is used if searcher is str or class.
        The dict is not modified.
    callbacks : list of ExperimentCallback, optional.
        ExperimentCallback list.
    early_stopping_rounds : int, optional.
        Setting of EarlyStoppingCallback, is used if EarlyStoppingCallback instance not found
        from search_callbacks. Set zero or None to disable it, default is 20.
    early_stopping_time_limit : int, optional.
        Setting of EarlyStoppingCallback, is used if EarlyStoppingCallback instance not found
        from search_callbacks. Set zero or None to disable it, default is 3600 seconds.
    early_stopping_reward : float, optional.
        Setting of EarlyStoppingCallback, is used if EarlyStoppingCallback instance not found
        from search_callbacks. Set zero or None to disable it, default is None.
    reward_metric : str, callable, optional.
        Hypernets search reward metric name or callable. Defaults to 'mae' for forecast,
        'accuracy' for classification, 'rmse' for regression and 'f1' for detection tasks.
        Possible values:
            - accuracy
            - auc
            - f1
            - logloss
            - mse
            - mae
            - rmse
            - mape
            - smape
            - msle
            - precision
            - r2
            - recall
    optimize_direction : str, optional.
        Hypernets search reward metric direction, default is detected from reward_metric.
    discriminator : instance of hypernets.discriminator.BaseDiscriminator, optional
        Discriminator is used to determine whether to continue training.
    hyper_model_options : dict, optional.
        Options to initialize HyperModel except *reward_metric*, *task*, *callbacks*,
        *discriminator*.
    tf_gpu_usage_strategy : int, optional {0, 1, 2}.
        Deep neural net models (tensorflow) gpu usage strategy.
        0:cpu | 1:gpu-memory growth | 2: gpu-memory limit.
    tf_memory_limit : int, GPU memory limit, default 2048.
    final_retrain_on_wholedata : bool, default True.
        After the search, whether to retrain the optimal model on the whole data set.
    verbose : int, 0, 1, or 2, (default=1).
        0 = silent, 1 = progress bar, 2 = one line per epoch (DL mode).
        Print order selection output to the screen.
    log_level : int, str, or None, (default=None),
        Level of logging (``logging.WARN`` is used when None), possible values:
            -logging.CRITICAL
            -logging.FATAL
            -logging.ERROR
            -logging.WARNING
            -logging.WARN
            -logging.INFO
            -logging.DEBUG
            -logging.NOTSET
    random_state : int or None, default None.
    clear_cache : bool, optional, (default None, treated as False)
        Clear cache store before running the experiment.
    kwargs :
        Parameters to initialize experiment instance, reference TSCompeteExperiment for more
        details. The keys 'covariables', 'dl_gpu_usage_strategy' and 'dl_memory_limit' are
        accepted as aliases of 'covariates', 'tf_gpu_usage_strategy' and 'tf_memory_limit';
        'scorer', 'pos_label' and 'data_cleaner_args' are also consumed here.

    Returns
    -------
    Runnable experiment object.

    Raises
    ------
    ValueError
        Unknown task, missing timestamp for forecast/detection tasks, unsupported GPU strategy,
        invalid window/period settings, a str scorer, or no matching default search space.
    RuntimeError
        tensorflow is missing in 'dl'/'nas' mode, 'stats' mode is combined with discrete
        frequency, or the frequency cannot be inferred.
    NotImplementedError
        No default search space is implemented for the (mode, task) combination.
    AssertionError
        Basic argument checks (train data presence, matching data types, window bounds).
    """

    def find_target(df):
        # Return the first column whose lower-cased name is a configured default target.
        columns = df.columns.to_list()
        for col in columns:
            if col.lower() in cfg.experiment_default_target_set:
                return col
        raise ValueError(f'Not found one of {cfg.experiment_default_target_set} from your data,'
                         f' implicit target must be specified.')

    def to_search_object(searcher, search_space):
        # Turn None / name / class into a Searcher instance; pass instances through.
        from hypernets.core.searcher import Searcher as SearcherSpec
        from hypernets.searchers import EvolutionSearcher

        if searcher is None:
            searcher = default_searcher(EvolutionSearcher, search_space, searcher_options)
        elif isinstance(searcher, (type, str)):
            searcher = default_searcher(searcher, search_space, searcher_options)
        elif not isinstance(searcher, SearcherSpec):
            logger.warning(f'Unrecognized searcher "{searcher}".')

        return searcher

    def to_metric_str(metrics):
        # Normalise the reward metric into the form handed to the search spaces.
        if callable(metrics):
            metrics = [metrics.__name__]
        elif isinstance(metrics, str):
            metrics = [metrics.lower()]
        else:
            metrics = 'auto'
        return metrics

    def default_search_space(task, metrics=None, covariates=None):
        # Select the built-in search space for the current (mode, task) pair.
        metrics = to_metric_str(metrics)

        if mode == consts.Mode_STATS and task in consts.TASK_LIST_FORECAST:
            from hyperts.framework.search_space import StatsForecastSearchSpace
            search_space = StatsForecastSearchSpace(task=task, timestamp=timestamp,
                                                    covariables=covariates,
                                                    drop_observed_sample=forecast_drop_part_sample)
        elif mode == consts.Mode_STATS and task in consts.TASK_LIST_CLASSIFICATION:
            from hyperts.framework.search_space import StatsClassificationSearchSpace
            search_space = StatsClassificationSearchSpace(task=task, timestamp=timestamp)
        elif mode == consts.Mode_STATS and task in consts.TASK_LIST_REGRESSION:
            raise NotImplementedError(
                'STATSRegressionSearchSpace is not implemented yet.'
            )
        elif mode == consts.Mode_DL and task in consts.TASK_LIST_FORECAST:
            from hyperts.framework.search_space import DLForecastSearchSpace
            search_space = DLForecastSearchSpace(task=task, timestamp=timestamp, metrics=metrics,
                                                 covariables=covariates,
                                                 window=dl_forecast_window,
                                                 horizon=dl_forecast_horizon,
                                                 drop_observed_sample=forecast_drop_part_sample)
        elif mode == consts.Mode_DL and task in consts.TASK_LIST_CLASSIFICATION:
            from hyperts.framework.search_space import DLClassRegressSearchSpace
            search_space = DLClassRegressSearchSpace(task=task, timestamp=timestamp, metrics=metrics)
        elif mode == consts.Mode_DL and task in consts.TASK_LIST_REGRESSION:
            from hyperts.framework.search_space import DLClassRegressSearchSpace
            search_space = DLClassRegressSearchSpace(task=task, timestamp=timestamp, metrics=metrics)
        elif mode == consts.Mode_NAS and task in consts.TASK_LIST_FORECAST:
            from hyperts.framework.search_space.micro_search_space import TSNASGenrealSearchSpace
            search_space = TSNASGenrealSearchSpace(task=task, timestamp=timestamp, metrics=metrics,
                                                   covariables=covariates,
                                                   window=dl_forecast_window,
                                                   horizon=dl_forecast_horizon)
        elif mode == consts.Mode_NAS and task in consts.TASK_LIST_CLASSIFICATION:
            from hyperts.framework.search_space.micro_search_space import TSNASGenrealSearchSpace
            search_space = TSNASGenrealSearchSpace(task=task, timestamp=timestamp, metrics=metrics,
                                                   covariables=covariates,
                                                   window=dl_forecast_window,
                                                   horizon=dl_forecast_horizon)
        elif mode == consts.Mode_NAS and task in consts.TASK_LIST_REGRESSION:
            from hyperts.framework.search_space.micro_search_space import TSNASGenrealSearchSpace
            search_space = TSNASGenrealSearchSpace(task=task, timestamp=timestamp, metrics=metrics,
                                                   covariables=covariates,
                                                   window=dl_forecast_window,
                                                   horizon=dl_forecast_horizon)
        elif mode == consts.Mode_STATS and task in consts.TASK_LIST_DETECTION:
            from hyperts.framework.search_space.macro_search_space import StatsDetectionSearchSpace
            search_space = StatsDetectionSearchSpace(task=task, timestamp=timestamp,
                                                     covariables=covariates,
                                                     drop_observed_sample=forecast_drop_part_sample)
        elif mode == consts.Mode_DL and task in consts.TASK_LIST_DETECTION:
            from hyperts.framework.search_space.macro_search_space import DLDetectionSearchSpace
            search_space = DLDetectionSearchSpace(task=task, timestamp=timestamp, metrics=metrics,
                                                  covariables=covariates,
                                                  window=dl_forecast_window,
                                                  horizon=dl_forecast_horizon,
                                                  drop_observed_sample=forecast_drop_part_sample)
        elif mode == consts.Mode_NAS and task in consts.TASK_LIST_DETECTION:
            raise NotImplementedError(
                'NASDetectionSearchSpace is not implemented yet.'
            )
        else:
            raise ValueError('The default search space was not found!')

        return search_space

    def default_searcher(cls, search_space, options):
        # Instantiate a searcher from its name or class.
        assert search_space is not None, '"search_space" should be specified when "searcher" is None or str.'
        assert optimize_direction in {'max', 'min'}
        # Work on a copy so the caller's ``searcher_options`` dict is never mutated.
        options = {} if options is None else dict(options)
        options['optimize_direction'] = optimize_direction
        s = make_searcher(cls, search_space, **options)

        return s

    def default_experiment_callbacks():
        cbs = cfg.experiment_callbacks_notebook if isnotebook() else cfg.experiment_callbacks_console
        cbs = [load_module(cb)() if isinstance(cb, str) else cb for cb in cbs]
        return cbs

    def default_search_callbacks():
        cbs = cfg.hyper_model_callbacks_notebook if isnotebook() else cfg.hyper_model_callbacks_console
        cbs = [load_module(cb)() if isinstance(cb, str) else cb for cb in cbs]
        return cbs

    def append_early_stopping_callbacks(cbs):
        # Prepend an EarlyStoppingCallback unless the caller already supplied one.
        from hypernets.core.callbacks import EarlyStoppingCallback

        assert isinstance(cbs, (tuple, list))
        if any(isinstance(cb, EarlyStoppingCallback) for cb in cbs):
            return cbs

        op = optimize_direction if optimize_direction is not None \
            else 'max' if scorer._sign > 0 else 'min'
        es = EarlyStoppingCallback(early_stopping_rounds, op,
                                   time_limit=early_stopping_time_limit,
                                   expected_reward=early_stopping_reward)
        # ``list(cbs)`` keeps tuples (explicitly allowed above) from raising TypeError.
        return [es] + list(cbs)

    def task_omit_mapping(task):
        # Expand task abbreviations; unknown names are returned untouched.
        assert isinstance(task, str)

        if task.lower() == 'tsf':
            return consts.Task_FORECAST
        elif task.lower() == 'utsf':
            return consts.Task_UNIVARIATE_FORECAST
        elif task.lower() == 'mtsf':
            return consts.Task_MULTIVARIATE_FORECAST
        elif task.lower() == 'tsc':
            return consts.Task_CLASSIFICATION
        elif task.lower() == 'tsr':
            return consts.Task_REGRESSION
        elif task.lower() in ['tsa', 'tsd', 'tsad']:
            return consts.Task_DETECTION
        else:
            return task

    kwargs = kwargs.copy()

    kwargs['max_trials'] = max_trials
    kwargs['eval_size'] = eval_size
    kwargs['cv'] = cv
    kwargs['num_folds'] = num_folds
    kwargs['verbose'] = verbose

    # Alias keyword names: only honoured when the primary argument is still at its default.
    if kwargs.get('covariables') is not None and covariates is None:
        covariates = kwargs.pop('covariables')
    if kwargs.get('dl_gpu_usage_strategy') is not None and tf_gpu_usage_strategy == 0:
        tf_gpu_usage_strategy = kwargs.pop('dl_gpu_usage_strategy')
    if kwargs.get('dl_memory_limit') is not None and tf_memory_limit == 2048:
        tf_memory_limit = kwargs.pop('dl_memory_limit')

    # 1. Set Log Level
    if log_level is None:
        log_level = logging.WARN
    logging.set_level(log_level)

    # 2. Set Random State
    if random_state is not None:
        set_random_state(seed=random_state, mode=mode)

    if mode != consts.Mode_STATS:
        try:
            from tensorflow import __version__
            logger.info(f'The tensorflow version is {str(__version__)}.')
        except ImportError as err:
            raise RuntimeError(
                'Please install `tensorflow` package first. command: pip install tensorflow.') from err

    # 3. Check Data, Task and Mode
    assert train_data is not None, 'train data is required.'
    assert eval_data is None or type(eval_data) is type(train_data)
    assert test_data is None or type(test_data) is type(train_data)

    TASK_LIST = consts.TASK_LIST_FORECAST + consts.TASK_LIST_CLASSIFICATION + \
                consts.TASK_LIST_REGRESSION + consts.TASK_LIST_DETECTION
    assert task is not None, f'task is required. Task naming paradigm: {TASK_LIST}.'
    task = task_omit_mapping(task)
    if task not in TASK_LIST:
        raise ValueError(f'Task naming paradigm: {TASK_LIST}')

    if task in consts.TASK_LIST_FORECAST + consts.TASK_LIST_DETECTION and timestamp is None:
        raise ValueError("Forecast task 'timestamp' cannot be None.")

    if task in consts.TASK_LIST_FORECAST + consts.TASK_LIST_DETECTION and covariates is None:
        logger.info('If the data contains covariates, specify the covariate column names.')

    # Compare by value: these are user-supplied strings, so identity (``is``) is unreliable.
    if freq == consts.DISCRETE_FORECAST and mode == consts.Mode_STATS:
        raise RuntimeError('Note: `stats` mode does not support discrete data forecast.')

    # 4. Set GPU Usage Strategy for DL or NAS Mode
    if mode in [consts.Mode_DL, consts.Mode_NAS]:
        if tf_gpu_usage_strategy == 0:
            import os
            os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        elif tf_gpu_usage_strategy == 1:
            from hyperts.utils import tf_gpu
            tf_gpu.set_memory_growth()
        elif tf_gpu_usage_strategy == 2:
            from hyperts.utils import tf_gpu
            tf_gpu.set_memory_limit(limit=tf_memory_limit)
        else:
            raise ValueError(f'The GPU strategy is not supported. '
                             f'Default [0:cpu | 1:gpu-memory growth | 2: gpu-memory limit].')

    # 5. Load data
    if isinstance(train_data, str):
        import pandas as pd
        tb = get_tool_box(pd.DataFrame)
        train_data = tb.load_data(train_data, reset_index=True)
        eval_data = tb.load_data(eval_data, reset_index=True) if eval_data is not None else None
        X_test = tb.load_data(test_data, reset_index=True) if test_data is not None else None
    else:
        tb = get_tool_box(train_data, eval_data, test_data)
        train_data = tb.reset_index(train_data)
        eval_data = tb.reset_index(eval_data) if eval_data is not None else None
        X_test = tb.reset_index(test_data) if test_data is not None else None

    if task in consts.TASK_LIST_FORECAST + consts.TASK_LIST_DETECTION:
        if timestamp == consts.MISSING_TIMESTAMP:
            # No timestamp column in the data: synthesise a pseudo timestamp column.
            timestamp = consts.TIMESTAMP
            if freq is None or freq == consts.DISCRETE_FORECAST:
                generate_freq = 'H'
                freq = consts.DISCRETE_FORECAST
            else:
                generate_freq = freq
            pseudo_timestamp = tb.DataFrame({f'{timestamp}': tb.date_range(start=consts.PSEUDO_DATE_START,
                                                                           periods=len(train_data),
                                                                           freq=generate_freq)})
            train_data = tb.concat_df([pseudo_timestamp, train_data], axis=1)
            kwargs['train_end_date'] = pseudo_timestamp[timestamp].max()
            kwargs['generate_freq'] = generate_freq

        # A frequency containing 'N' disables the explicit timestamp format.
        # Frequency inference is only attempted when the given freq does not already decide it,
        # and a failed inference (None) must not break the membership test.
        has_n_freq = freq is not None and 'N' in freq
        if not has_n_freq:
            inferred_freq = tb.infer_ts_freq(train_data, ts_name=timestamp)
            has_n_freq = inferred_freq is not None and 'N' in inferred_freq
        if has_n_freq:
            timestamp_format = None

        train_data[timestamp] = tb.datetime_format(train_data[timestamp], format=timestamp_format)
        if eval_data is not None:
            eval_data[timestamp] = tb.datetime_format(eval_data[timestamp], format=timestamp_format)
        if X_test is not None:
            X_test[timestamp] = tb.datetime_format(X_test[timestamp], format=timestamp_format)

    # 6. Split X_train, y_train, X_eval, y_eval
    X_train, y_train, X_eval, y_eval = None, None, None, None
    unsupervised_anomaly_detection_task = False
    anomaly_detection_label = None
    if task in consts.TASK_LIST_CLASSIFICATION + consts.TASK_LIST_REGRESSION:
        if target is None:
            target = find_target(train_data)
        X_train = train_data.copy()
        y_train = tb.pop(X_train, item=target)
        if eval_data is not None:
            X_eval = eval_data.copy()
            y_eval = tb.pop(X_eval, item=target)
    elif task in consts.TASK_LIST_FORECAST + consts.TASK_LIST_DETECTION:
        excluded_variables = [timestamp] + covariates if covariates is not None else [timestamp]
        all_variables = tb.columns_tolist(train_data)
        if target is None:
            unsupervised_anomaly_detection_task = True
            target = tb.list_diff(all_variables, excluded_variables)
        elif target is not None:
            if task in consts.TASK_LIST_FORECAST and isinstance(target, str):
                target = [target]
            elif task in consts.TASK_LIST_DETECTION:
                # For detection the given target is the label column; every other
                # non-excluded column is modelled.
                assert isinstance(target, str)
                anomaly_detection_label = target
                target = tb.list_diff(all_variables, excluded_variables)
        X_train, y_train = train_data[excluded_variables], train_data[target]
        if eval_data is not None:
            X_eval, y_eval = eval_data[excluded_variables], eval_data[target]

        if freq is None:
            freq = tb.infer_ts_freq(X_train, ts_name=timestamp)
            if freq is None:
                raise RuntimeError('Unable to infer correct frequency, '
                                   'please check data or specify frequency.')
        elif freq is not None and freq != consts.DISCRETE_FORECAST:
            infer_freq = tb.infer_ts_freq(X_train, ts_name=timestamp)
            if freq != infer_freq:
                logger.warning(f'The specified frequency is {freq}, but '
                               f'the inferred frequency is {infer_freq}.')

        if anomaly_detection_label is not None:
            target = tb.list_diff(target, [anomaly_detection_label])

    # 7. Covarite Transformer
    if covariates is not None:
        from hyperts.utils.transformers import CovariateTransformer
        cs = CovariateTransformer(
            covariables=covariates,
            data_cleaner_args=kwargs.pop('data_cleaner_args', None)
        ).fit(X_train)
        actual_covariates = cs.covariables_
    else:
        from hyperts.utils.transformers import IdentityTransformer
        cs = IdentityTransformer().fit(X_train)
        actual_covariates = covariates

    # 8. Infer Forecast Window for DL Mode
    if mode in [consts.Mode_DL, consts.Mode_NAS] and task in consts.TASK_LIST_FORECAST + consts.TASK_LIST_DETECTION:
        if forecast_train_data_periods is None:
            X_train_length = len(X_train)
        elif isinstance(forecast_train_data_periods, int) and forecast_train_data_periods < len(X_train):
            X_train_length = forecast_train_data_periods
        else:
            raise ValueError(f'forecast_train_data_periods can not be greater than {len(X_train)}.')

        if cv:
            X_train_length = int(X_train_length // num_folds)

        if eval_data is not None:
            max_win_size = int((X_train_length + dl_forecast_horizon - 1) / 2)
        elif isinstance(eval_size, int):
            # NOTE(review): this guard uses `eval_size - horizon + 1` while the message and the
            # formula below suggest `eval_size + horizon - 1`; left unchanged - confirm intent.
            if X_train_length > eval_size - dl_forecast_horizon + 1:
                max_win_size = int((X_train_length - eval_size - dl_forecast_horizon + 1) / 2)
            else:
                raise ValueError(f'eval_size has to be less than {X_train_length - dl_forecast_horizon + 1}.')
        else:
            max_win_size = int((X_train_length * (1 - eval_size) - dl_forecast_horizon + 1) / 2)

        if max_win_size < 1:
            logger.warning(f'The trian data is too short to start {mode} mode, '
                           'stats mode has been automatically switched.')
            mode = consts.Mode_STATS
            hist_store_upper_limit = consts.HISTORY_UPPER_LIMIT
        else:
            if dl_forecast_window is None:
                import numpy as np
                if max_win_size <= 10:
                    dl_forecast_window = list(filter(lambda x: x <= max_win_size, [2, 4, 6, 8, 10]))
                else:
                    if task in consts.TASK_LIST_FORECAST:
                        candidate_windows = [3, 8, 12, 24, 30, 48, 60, 72, 96, 168, 183]
                    else:
                        candidate_windows = [4, 8, 16, 24, 32]
                    dl_forecast_window = list(filter(lambda x: x <= max_win_size, candidate_windows))
                # Add the most frequent FFT-inferred period across target columns as a candidate.
                periods = [tb.fft_infer_period(y_train[col]) for col in target]
                period = int(np.argmax(np.bincount(periods)))
                if period > 0 and period <= max_win_size and period < 367:
                    dl_forecast_window.append(period)
                if not dl_forecast_window:
                    # Previously surfaced as a cryptic ValueError from max() on an empty list.
                    raise ValueError(f'No candidate slide window fits into the '
                                     f'maximum window size {max_win_size}.')
            elif isinstance(dl_forecast_window, int):
                assert dl_forecast_window < max_win_size, f'The slide window can not be greater than {max_win_size}'
                dl_forecast_window = [dl_forecast_window]
            elif isinstance(dl_forecast_window, list):
                assert max(
                    dl_forecast_window) < max_win_size, f'The slide window can not be greater than {max_win_size}'
            else:
                raise ValueError(f'This type of {dl_forecast_window} is not supported.')
            logger.info(f'The slide window length of {mode} mode list is: {dl_forecast_window}')
            hist_store_upper_limit = max(dl_forecast_window) + 1
    else:
        hist_store_upper_limit = consts.HISTORY_UPPER_LIMIT

    # 9. Task Type Infering
    if task in [consts.Task_FORECAST] and len(y_train.columns) == 1:
        task = consts.Task_UNIVARIATE_FORECAST
    elif task in [consts.Task_FORECAST] and len(y_train.columns) > 1:
        task = consts.Task_MULTIVARIATE_FORECAST

    if task in [consts.Task_CLASSIFICATION]:
        if y_train.nunique() == 2:
            if len(X_train.columns) == 1:
                task = consts.Task_UNIVARIATE_BINARYCLASS
            else:
                task = consts.Task_MULTIVARIATE_BINARYCLASS
        else:
            if len(X_train.columns) == 1:
                task = consts.Task_UNIVARIATE_MULTICALSS
            else:
                task = consts.Task_MULTIVARIATE_MULTICALSS

    if task in [consts.Task_DETECTION]:
        if unsupervised_anomaly_detection_task:
            if len(train_data.columns) - 1 == 1:
                task = consts.Task_UNIVARIATE_DETECTION
            elif len(train_data.columns) - 1 > 1:
                task = consts.Task_MULTIVARIATE_DETECTION
        else:
            if actual_covariates is not None:
                len_covariates = len(actual_covariates)
            else:
                len_covariates = 0
            if len(y_train.columns) + len_covariates - 1 == 1:
                task = consts.Task_UNIVARIATE_DETECTION
            elif len(y_train.columns) + len_covariates - 1 > 1:
                task = consts.Task_MULTIVARIATE_DETECTION
    logger.info(f'Inference task type could be [{task}].')

    # 10. Configuration
    if reward_metric is None:
        if task in consts.TASK_LIST_FORECAST:
            reward_metric = 'mae'
        if task in consts.TASK_LIST_CLASSIFICATION:
            reward_metric = 'accuracy'
        if task in consts.TASK_LIST_REGRESSION:
            reward_metric = 'rmse'
        if task in consts.TASK_LIST_DETECTION:
            reward_metric = 'f1'
        logger.info(f'No reward metric specified, use "{reward_metric}" for {task} task by default.')
    if isinstance(reward_metric, str):
        logger.info(f'Reward_metric is [{reward_metric}].')
    else:
        logger.info(f'Reward_metric is [{reward_metric.__name__}].')

    # 11. Get scorer
    # Always remove 'scorer' from kwargs: an explicit ``scorer=None`` would otherwise collide
    # with the ``scorer=`` argument passed to TSCompeteExperiment below.
    scorer = kwargs.pop('scorer', None)
    if scorer is None:
        kwargs['pos_label'] = tb.infer_pos_label(y_train, task, anomaly_detection_label, kwargs.get('pos_label'))
        scorer = tb.metrics.metric_to_scorer(reward_metric, task=task, pos_label=kwargs.get('pos_label'),
                                             optimize_direction=optimize_direction)
    elif isinstance(scorer, str):
        raise ValueError('scorer should be a [make_scorer(metric, greater_is_better)] type.')

    # 12. Specify optimization direction
    if optimize_direction is None or len(optimize_direction) == 0:
        optimize_direction = 'max' if scorer._sign > 0 else 'min'
    logger.info(f'Optimize direction is [{optimize_direction}].')

    # 13. Get search space
    if search_space is None:
        # Build the default space whenever a searcher still has to be constructed
        # (None, name or class). A ready-made Searcher instance is used as-is.
        if searcher is None or isinstance(searcher, (type, str)):
            search_space = default_search_space(task=task, metrics=reward_metric, covariates=actual_covariates)
            search_space.update_init_params(freq=freq)
    else:
        search_space.update_init_params(
            task=task,
            timestamp=timestamp,
            metrics=to_metric_str(reward_metric),
            covariables=actual_covariates,
            window=dl_forecast_window,
            horizon=dl_forecast_horizon,
            freq=freq)

    # 14. Get searcher
    searcher = to_search_object(searcher, search_space)
    logger.info(f'Searcher is [{searcher.__class__.__name__}].')

    # 15. Define callbacks
    if search_callbacks is None:
        search_callbacks = default_search_callbacks()
    search_callbacks = append_early_stopping_callbacks(search_callbacks)

    if callbacks is None:
        callbacks = default_experiment_callbacks()

    # 16. Define discriminator
    if discriminator is None and cfg.experiment_discriminator is not None and len(cfg.experiment_discriminator) > 0:
        discriminator = make_discriminator(cfg.experiment_discriminator,
                                           optimize_direction=optimize_direction,
                                           **(cfg.experiment_discriminator_options or {}))

    # 17. Define id
    if id is None:
        hasher = tb.data_hasher()
        id = hasher(dict(X_train=X_train, y_train=y_train, X_eval=X_eval, y_eval=y_eval,
                         eval_size=kwargs.get('eval_size'), target=target, task=task))
        id = f'{hyper_ts_cls.__name__}_{id}'

    # 18. Define hyper_model
    if hyper_model_options is None:
        hyper_model_options = {'covariates': covariates}
    hyper_model = hyper_ts_cls(searcher, mode=mode, timestamp=timestamp, reward_metric=reward_metric, task=task,
                               callbacks=search_callbacks, discriminator=discriminator, **hyper_model_options)

    # 19. Build Experiment
    experiment = TSCompeteExperiment(hyper_model, X_train=X_train, y_train=y_train, X_eval=X_eval, y_eval=y_eval,
                                     task=task, mode=mode, timestamp_col=timestamp, target_col=target,
                                     covariate_cols=[covariates, actual_covariates], covariate_cleaner=cs,
                                     freq=freq, log_level=log_level, random_state=random_state,
                                     optimize_direction=optimize_direction, scorer=scorer, id=id,
                                     forecast_train_data_periods=forecast_train_data_periods,
                                     hist_store_upper_limit=hist_store_upper_limit,
                                     ensemble_size=ensemble_size, callbacks=callbacks,
                                     anomaly_label_col=anomaly_detection_label,
                                     contamination=contamination,
                                     final_retrain_on_wholedata=final_retrain_on_wholedata,
                                     **kwargs)

    # 20. Clear Cache
    if clear_cache:
        _clear_cache()

    if logger.is_info_enabled():
        train_shape = tb.get_shape(X_train)
        test_shape = tb.get_shape(X_test, allow_none=True)
        eval_shape = tb.get_shape(X_eval, allow_none=True)
        if anomaly_detection_label is None:
            actual_target = target
        else:
            actual_target = anomaly_detection_label
        logger.info(f'make_experiment with train data:{train_shape}, '
                    f'test data:{test_shape}, eval data:{eval_shape}, target:{actual_target}, task:{task}')

    return experiment