import logging
import os
import os.path as osp

import numpy as np
import pandas as pd

from torchvision.datasets.utils import download_and_extract_archive

logger = logging.getLogger(__name__)


class Credit(object):
    """
    Give Me Some Credit Data Set
    (https://www.kaggle.com/competitions/GiveMeSomeCredit)

    Data set: cs-training.csv, 150,000 instances and 12 attributes.
    The first attribute is the user ID, which we do not need; the second
    attribute is the label, determining whether a loan should be granted.

    Arguments:
        root (str): root path
        num_of_clients (int): number of clients
        feature_partition (list): cumulative column indices that split
            the features among the clients, e.g., [4, 7, 10] assigns
            feature columns 0-3 to client 1, 4-6 to client 2,
            and 7-9 to client 3
        tr_frac (float): train set proportion for each task; default=0.8
        args (dict): set True or False to decide whether to normalize
            or standardize the data,
            e.g., {'normalization': False, 'standardization': False}
        algo (str): the running model, 'lr'/'xgb'/'gbdt'/'rf'
        debug_size (int): use a subset for debugging;
            0 for using the entire dataset
        download (bool): indicator to download the dataset
        seed (int): a random seed
    """
    base_folder = 'givemesomecredit'
    url = 'https://federatedscope.oss-cn-beijing.aliyuncs.com/cs-training.zip'
    raw_file = 'cs-training.csv'

    def __init__(self,
                 root,
                 num_of_clients,
                 feature_partition,
                 args,
                 algo=None,
                 tr_frac=0.8,
                 debug_size=0,
                 download=True,
                 seed=123):
        super(Credit, self).__init__()
        self.root = root
        self.num_of_clients = num_of_clients
        self.feature_partition = feature_partition
        self.tr_frac = tr_frac
        self.seed = seed
        self.args = args
        self.algo = algo
        self.data_size_for_debug = debug_size
        self.data_dict = {}
        self.data = {}

        if download:
            self.download()
        if not self._check_existence():
            raise RuntimeError("Dataset not found or corrupted. "
                               "You can use download=True to download it")

        self._get_data()
        self._partition_data()

    def _get_data(self):
        fpath = os.path.join(self.root, self.base_folder)
        file = osp.join(fpath, self.raw_file)
        data = self._read_raw(file)
        # Drop the first column (user ID); column 0 is now the label.
        data = data[:, 1:]

        # The following code is used to choose balanced data; it may be
        # removed later. Note that np.random.choice is called with its
        # default replace=True, so the smaller class is oversampled with
        # replacement to reach sample_size / 2 rows per class.
        sample_size = 150000

        def balance_sample(sample_size, y):
            y_ones_idx = (y == 1).nonzero()[0]
            y_ones_idx = np.random.choice(y_ones_idx,
                                          size=int(sample_size / 2))
            y_zeros_idx = (y == 0).nonzero()[0]
            y_zeros_idx = np.random.choice(y_zeros_idx,
                                           size=int(sample_size / 2))

            y_index = np.concatenate([y_zeros_idx, y_ones_idx], axis=0)
            np.random.shuffle(y_index)
            return y_index

        sample_idx = balance_sample(sample_size, data[:, 0])
        data = data[sample_idx]

        # Optionally keep only a shuffled subset for debugging.
        if self.data_size_for_debug != 0:
            subset_size = min(len(data), self.data_size_for_debug)
            np.random.shuffle(data)
            data = data[:subset_size]

        # Split into train and test sets according to tr_frac.
        train_num = int(self.tr_frac * len(data))
        self.data_dict['train'] = data[:train_num]
        self.data_dict['test'] = data[train_num:]

    def _read_raw(self, file_path):
        data = pd.read_csv(file_path)
        # Forward-fill missing values; fillna(method='ffill') is
        # deprecated in recent pandas versions.
        data = data.ffill()
        data = data.values
        return data

    def _check_existence(self):
        fpath = os.path.join(self.root, self.base_folder, self.raw_file)
        return osp.exists(fpath)

    def download(self):
        if self._check_existence():
            logger.info("Files already exist")
            return
        download_and_extract_archive(self.url,
                                     os.path.join(self.root,
                                                  self.base_folder),
                                     filename=self.url.split('/')[-1])

    def _partition_data(self):
        # Column 0 is the label; the remaining columns are features.
        x = self.data_dict['train'][:, 1:]
        y = self.data_dict['train'][:, 0]

        test_data = {
            'x': self.data_dict['test'][:, 1:],
            'y': self.data_dict['test'][:, 0]
        }
        test_x = test_data['x']
        test_y = test_data['y']

        # Vertical partition: client 0 (the server) holds the full test
        # set, client 1 holds the feature columns [0, feature_partition[0]),
        # and each client i >= 2 holds the columns
        # [feature_partition[i - 2], feature_partition[i - 1]).
        self.data = dict()
        for i in range(self.num_of_clients + 1):
            self.data[i] = dict()
            if i == 0:
                self.data[0]['train'] = None
                self.data[0]['test'] = test_data
            elif i == 1:
                self.data[1]['train'] = {
                    'x': x[:, :self.feature_partition[0]]
                }
                self.data[1]['test'] = {
                    'x': test_x[:, :self.feature_partition[0]]
                }
            else:
                start = self.feature_partition[i - 2]
                end = self.feature_partition[i - 1]
                self.data[i]['train'] = {'x': x[:, start:end]}
                self.data[i]['test'] = {'x': test_x[:, start:end]}
            self.data[i]['val'] = None

        # Only the last client holds the labels.
        self.data[self.num_of_clients]['train']['y'] = y
        self.data[self.num_of_clients]['test']['y'] = test_y[:]
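

# A minimal usage sketch (an illustration, not part of the original module):
# the root path, client count, feature boundaries, and debug size below are
# assumptions, chosen so that the 10 feature columns of cs-training.csv are
# split 4/3/3 across three clients.
if __name__ == '__main__':
    credit = Credit(root='data',
                    num_of_clients=3,
                    feature_partition=[4, 7, 10],
                    args={'normalization': False, 'standardization': False},
                    algo='xgb',
                    debug_size=2000)
    # Client 0 (the server) holds the full test set; only the last client
    # (here, client 3) holds the labels.
    print(credit.data[3]['train']['x'].shape,
          credit.data[3]['train']['y'].shape)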