# fedhpobench/utils/surrogate_dataloader.py
import datetime
import numpy as np
import os
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate as sk_cross_validate
from tqdm import tqdm
from fedhpobench.utils.tabular_dataloader import load_data
def sampling(X, Y, over_rate=1, down_rate=1.0, cvg_score=0.5):
    """Rebalance (X, Y) around a coverage threshold.

    Rows whose score exceeds ``cvg_score`` are oversampled by repeating them
    ``over_rate`` times; the remaining rows are randomly downsampled (without
    replacement) to a ``down_rate`` fraction. Returns the concatenated
    (X, Y) pair, oversampled rows first.
    """
    high = Y > cvg_score
    low = Y <= cvg_score

    # Oversample the high-score rows by simple repetition.
    rep_X = np.repeat(X[high], over_rate, axis=0)
    rep_Y = np.repeat(Y[high], over_rate, axis=0)

    # Randomly keep a down_rate fraction of the low-score rows.
    low_X = np.array(X[low])
    low_Y = np.array(Y[low])
    n_keep = int(low_X.shape[0] * down_rate)
    picked = np.random.choice(low_X.shape[0], size=n_keep, replace=False)

    new_X = np.concatenate([rep_X, low_X[picked]], axis=0)
    new_Y = np.concatenate([rep_Y, low_Y[picked]], axis=0)
    return new_X, new_Y
def load_surrogate_model(modeldir, model, dname, algo):
    """Load the pickled surrogate models and their metadata from disk.

    Looks under ``modeldir/model/dname/algo`` for files named
    ``surrogate_model*`` (the pickled estimators), ``info.pkl`` (metadata
    dict), and the cached ``X.npy`` / ``Y.npy`` training matrices.

    Returns:
        tuple: (model_list, info, X, Y).
    """
    path = os.path.join(modeldir, model, dname, algo)
    model_list = []
    # sorted() makes the model order deterministic; os.listdir order is
    # platform-arbitrary. Also avoid rebinding the `model` parameter, which
    # the original loop shadowed with each unpickled estimator.
    for fname in sorted(os.listdir(path)):
        if not fname.startswith('surrogate_model'):
            continue
        # SECURITY: unpickling executes arbitrary code — only load model
        # files from trusted sources.
        with open(os.path.join(path, fname), 'rb') as f:
            model_list.append(pickle.load(f))
    with open(os.path.join(path, 'info.pkl'), 'rb') as f:
        info = pickle.load(f)
    # TODO: remove X and Y
    X = np.load(os.path.join(path, 'X.npy'))
    Y = np.load(os.path.join(path, 'Y.npy'))
    return model_list, info, X, Y
def _load_or_build_xy(savedir, table, meta_info, configuration_space, key):
    """Build the (X, Y) training matrices for the surrogate, with .npy caching.

    Each X row is [config values..., sample_client, round]; the matching Y
    entry is the value of ``key`` at the best-validation-loss round seen so
    far. Reuses cached arrays in ``savedir`` when both files exist.
    """
    x_path = os.path.join(savedir, 'X.npy')
    y_path = os.path.join(savedir, 'Y.npy')
    if os.path.exists(x_path) and os.path.exists(y_path):
        print('Loading cache...')
        return np.load(x_path), np.load(y_path)

    print('Building data mat...')
    X, Y = [], []
    for idx in tqdm(range(len(table))):
        row = table.iloc[idx]
        x = [row[col]
             for col in configuration_space] + [row['sample_client']]
        # SECURITY: eval() on a stored string executes arbitrary code if the
        # table file is untrusted; consider ast.literal_eval instead.
        result = eval(row['result'])
        val_loss = result['val_avg_loss']
        for rnd in range(len(val_loss)):
            X.append(x + [rnd * meta_info['eval_freq']])
            # Early-stopping semantics: report the metric at the round with
            # the lowest validation loss observed up to round `rnd`.
            best_round = np.argmin(val_loss[:rnd + 1])
            Y.append(result[key][best_round])
    X, Y = np.array(X), np.array(Y)
    np.save(x_path, X)
    np.save(y_path, Y)
    return X, Y


def build_surrogate_model(datadir, model, dname, algo, key='val_acc'):
    r"""
    from TabularBenchmark to SurrogateBenchmark data format

    Builds (or loads from cache) the training matrices, rebalances and
    shuffles them, grid-searches a RandomForestRegressor over a small
    hyperparameter grid with 10-fold CV, and persists the ten CV estimators
    of the best setting plus an ``info.pkl`` describing the spaces.

    Returns:
        tuple: (best_models, info, X, Y).
    """
    table, meta_info = load_data(datadir, model, dname, algo)
    savedir = os.path.join('data/surrogate_model', model, dname, algo)
    os.makedirs(savedir, exist_ok=True)

    fidelity_space = sorted(['sample_client', 'round'])
    # NOTE(review): X columns end with [sample_client, round], while
    # fidelity_space is stored sorted (['round', 'sample_client']) — confirm
    # downstream consumers agree on the column order.
    configuration_space = sorted(
        list(set(table.keys()) - {'result', 'seed'} - set(fidelity_space)))

    X, Y = _load_or_build_xy(savedir, table, meta_info, configuration_space,
                             key)

    # Rebalance, then shuffle so CV folds are not ordered by configuration.
    new_X, new_Y = sampling(X, Y, over_rate=1, down_rate=1)
    perm = np.random.permutation(np.arange(len(new_Y)))
    new_X, new_Y = new_X[perm], new_Y[perm]

    best_res = -np.inf
    best_models = None
    # Ten-fold validation to get ten surrogate_model per hyperparameter
    # setting; keep the estimators of the best-scoring setting.
    for n_estimators in [10, 20]:
        for max_depth in [10, 15, 20]:
            regr = RandomForestRegressor(n_estimators=n_estimators,
                                         max_depth=max_depth)
            # dict_keys(['fit_time', 'score_time', 'estimator',
            # 'test_score', 'train_score'])
            res = sk_cross_validate(regr,
                                    new_X,
                                    new_Y,
                                    cv=10,
                                    n_jobs=-1,
                                    scoring='neg_mean_absolute_error',
                                    return_estimator=True,
                                    return_train_score=True)
            test_metric = np.mean(res['test_score'])
            train_metric = np.mean(res['train_score'])
            print(f'n_estimators: {n_estimators}, max_depth: {max_depth}, '
                  f'train_metric: {train_metric}, test_metric: {test_metric}')
            if test_metric > best_res:
                best_res = test_metric
                best_models = res['estimator']

    # Save the winning estimators.
    for i, rf in enumerate(best_models):
        file_name = f'surrogate_model_{i}.pkl'
        model_state = pickle.dumps(rf)
        with open(os.path.join(savedir, file_name), 'wb') as f:
            f.write(model_state)

    # Save info describing the search spaces alongside the models.
    info = {
        'configuration_space': configuration_space,
        'fidelity_space': fidelity_space
    }
    pkl = pickle.dumps(info)
    with open(os.path.join(savedir, 'info.pkl'), 'wb') as f:
        f.write(pkl)
    return best_models, info, X, Y