# fedhpobench/utils/surrogate_dataloader.py
import datetime
import numpy as np
import os
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate as sk_cross_validate
from tqdm import tqdm
from fedhpobench.utils.tabular_dataloader import load_data
def sampling(X, Y, over_rate=1, down_rate=1.0, cvg_score=0.5):
    """Rebalance (X, Y) around a coverage threshold.

    Rows whose score exceeds ``cvg_score`` are oversampled by repeating them
    ``over_rate`` times; the remaining rows are randomly downsampled (without
    replacement) to a ``down_rate`` fraction. Returns the concatenated
    (X, Y) pair, oversampled rows first.
    """
    high = Y > cvg_score
    low = Y <= cvg_score

    # Oversample the high-score rows by simple repetition.
    rep_X = np.repeat(X[high], over_rate, axis=0)
    rep_Y = np.repeat(Y[high], over_rate, axis=0)

    # Randomly keep a down_rate fraction of the low-score rows.
    low_X = np.array(X[low])
    low_Y = np.array(Y[low])
    n_keep = int(low_X.shape[0] * down_rate)
    picked = np.random.choice(low_X.shape[0], size=n_keep, replace=False)

    new_X = np.concatenate([rep_X, low_X[picked]], axis=0)
    new_Y = np.concatenate([rep_Y, low_Y[picked]], axis=0)
    return new_X, new_Y
def load_surrogate_model(modeldir, model, dname, algo):
    """Load the pickled surrogate models and their metadata from disk.

    Looks under ``modeldir/model/dname/algo`` for files named
    ``surrogate_model*`` (the pickled estimators), ``info.pkl`` (metadata
    dict), and the cached ``X.npy`` / ``Y.npy`` training matrices.

    Returns:
        tuple: (model_list, info, X, Y).
    """
    path = os.path.join(modeldir, model, dname, algo)
    model_list = []
    # sorted() makes the model order deterministic; os.listdir order is
    # platform-arbitrary. Also avoid rebinding the `model` parameter, which
    # the original loop shadowed with each unpickled estimator.
    for fname in sorted(os.listdir(path)):
        if not fname.startswith('surrogate_model'):
            continue
        # SECURITY: unpickling executes arbitrary code — only load model
        # files from trusted sources.
        with open(os.path.join(path, fname), 'rb') as f:
            model_list.append(pickle.load(f))
    with open(os.path.join(path, 'info.pkl'), 'rb') as f:
        info = pickle.load(f)
    # TODO: remove X and Y
    X = np.load(os.path.join(path, 'X.npy'))
    Y = np.load(os.path.join(path, 'Y.npy'))
    return model_list, info, X, Y
def _load_or_build_xy(savedir, table, meta_info, configuration_space, key):
    """Build the (X, Y) training matrices for the surrogate, with .npy caching.

    Each X row is [config values..., sample_client, round]; the matching Y
    entry is the value of ``key`` at the best-validation-loss round seen so
    far. Reuses cached arrays in ``savedir`` when both files exist.
    """
    x_path = os.path.join(savedir, 'X.npy')
    y_path = os.path.join(savedir, 'Y.npy')
    if os.path.exists(x_path) and os.path.exists(y_path):
        print('Loading cache...')
        return np.load(x_path), np.load(y_path)

    print('Building data mat...')
    X, Y = [], []
    for idx in tqdm(range(len(table))):
        row = table.iloc[idx]
        x = [row[col]
             for col in configuration_space] + [row['sample_client']]
        # SECURITY: eval() on a stored string executes arbitrary code if the
        # table file is untrusted; consider ast.literal_eval instead.
        result = eval(row['result'])
        val_loss = result['val_avg_loss']
        for rnd in range(len(val_loss)):
            X.append(x + [rnd * meta_info['eval_freq']])
            # Early-stopping semantics: report the metric at the round with
            # the lowest validation loss observed up to round `rnd`.
            best_round = np.argmin(val_loss[:rnd + 1])
            Y.append(result[key][best_round])
    X, Y = np.array(X), np.array(Y)
    np.save(x_path, X)
    np.save(y_path, Y)
    return X, Y


def build_surrogate_model(datadir, model, dname, algo, key='val_acc'):
    r"""
    from TabularBenchmark to SurrogateBenchmark data format

    Builds (or loads from cache) the training matrices, rebalances and
    shuffles them, grid-searches a RandomForestRegressor over a small
    hyperparameter grid with 10-fold CV, and persists the ten CV estimators
    of the best setting plus an ``info.pkl`` describing the spaces.

    Returns:
        tuple: (best_models, info, X, Y).
    """
    table, meta_info = load_data(datadir, model, dname, algo)
    savedir = os.path.join('data/surrogate_model', model, dname, algo)
    os.makedirs(savedir, exist_ok=True)

    fidelity_space = sorted(['sample_client', 'round'])
    # NOTE(review): X columns end with [sample_client, round], while
    # fidelity_space is stored sorted (['round', 'sample_client']) — confirm
    # downstream consumers agree on the column order.
    configuration_space = sorted(
        list(set(table.keys()) - {'result', 'seed'} - set(fidelity_space)))

    X, Y = _load_or_build_xy(savedir, table, meta_info, configuration_space,
                             key)

    # Rebalance, then shuffle so CV folds are not ordered by configuration.
    new_X, new_Y = sampling(X, Y, over_rate=1, down_rate=1)
    perm = np.random.permutation(np.arange(len(new_Y)))
    new_X, new_Y = new_X[perm], new_Y[perm]

    best_res = -np.inf
    best_models = None
    # Ten-fold validation to get ten surrogate_model per hyperparameter
    # setting; keep the estimators of the best-scoring setting.
    for n_estimators in [10, 20]:
        for max_depth in [10, 15, 20]:
            regr = RandomForestRegressor(n_estimators=n_estimators,
                                         max_depth=max_depth)
            # dict_keys(['fit_time', 'score_time', 'estimator',
            # 'test_score', 'train_score'])
            res = sk_cross_validate(regr,
                                    new_X,
                                    new_Y,
                                    cv=10,
                                    n_jobs=-1,
                                    scoring='neg_mean_absolute_error',
                                    return_estimator=True,
                                    return_train_score=True)
            test_metric = np.mean(res['test_score'])
            train_metric = np.mean(res['train_score'])
            print(f'n_estimators: {n_estimators}, max_depth: {max_depth}, '
                  f'train_metric: {train_metric}, test_metric: {test_metric}')
            if test_metric > best_res:
                best_res = test_metric
                best_models = res['estimator']

    # Save the winning estimators.
    for i, rf in enumerate(best_models):
        file_name = f'surrogate_model_{i}.pkl'
        model_state = pickle.dumps(rf)
        with open(os.path.join(savedir, file_name), 'wb') as f:
            f.write(model_state)

    # Save info describing the search spaces alongside the models.
    info = {
        'configuration_space': configuration_space,
        'fidelity_space': fidelity_space
    }
    pkl = pickle.dumps(info)
    with open(os.path.join(savedir, 'info.pkl'), 'wb') as f:
        f.write(pkl)
    return best_models, info, X, Y