# FS-TFP/federatedscope/vertical_fl/dataset/blog.py

import glob
import logging
import os
import os.path as osp
import numpy as np
import pandas as pd
from torchvision.datasets.utils import download_and_extract_archive

logger = logging.getLogger(__name__)


class Blog(object):
"""
BlogFeedback Data Set
(https://archive.ics.uci.edu/ml/datasets/BlogFeedback)
Data Set Information:
This data originates from blog posts. The raw HTML-documents
of the blog posts were crawled and processed.
The prediction task associated with the data is the prediction
of the number of comments in the upcoming 24 hours. In order
to simulate this situation, we choose a basetime (in the past)
and select the blog posts that were published at most
72 hours before the selected base date/time. Then, we calculate
all the features of the selected blog posts from the information
that was available at the basetime, therefore each instance
corresponds to a blog post. The target is the number of
comments that the blog post received in the next 24 hours
relative to the basetime.
Number of Instances: 60021
Number of Attributes: 281, the last one is the number of comments
in the next 24 hours
Training set: 'blogData_train.csv', 52397 instances
Testing set: 'blogData_test*.csv', 60 files, 7624 instances totally
Arguments:
root (str): root path
num_of_clients(int): number of clients
feature_partition(list): the number of features
partitioned to each client
tr_frac (float): train set proportion for each task; default=0.8
args (dict): set Ture or False to decide whether
to normalize or standardize the data or not,
e.g., {'normalization': False, 'standardization': False}
algo(str): the running model, 'lr'/'xgb'/'gbdt'/'rf'
debug_size(int): use a subset for debug,
0 for using entire dataset
download (bool): indicator to download dataset
seed: a random seed
"""

    base_folder = 'blogfeedback'
    url = 'https://federatedscope.oss-cn-beijing.aliyuncs.com/BlogFeedback.zip'
    raw_file = 'BlogFeedback.zip'

    def __init__(self,
                 root,
                 num_of_clients,
                 feature_partition,
                 args,
                 algo=None,
                 tr_frac=0.8,
                 debug_size=0,
                 download=True,
                 seed=123):
        super(Blog, self).__init__()
        self.root = root
        self.num_of_clients = num_of_clients
        self.tr_frac = tr_frac
        self.feature_partition = feature_partition
        self.seed = seed
        self.args = args
        self.algo = algo
        self.data_size_for_debug = debug_size
        self.data_dict = {}
        self.data = {}

        if download:
            self.download()
        if not self._check_existence():
            raise RuntimeError("Dataset not found or corrupted. "
                               "You can use download=True to download it")
        self._get_data()
        self._partition_data()

    def _get_data(self):
        fpath = os.path.join(self.root, self.base_folder)
        train_file = osp.join(fpath, 'blogData_train.csv')
        train_data = self._read_raw(train_file)

        # The test set is split across 60 per-basetime files; read them
        # in sorted order and stack them into one array.
        test_files = glob.glob(osp.join(fpath, "blogData_test*.csv"))
        test_files.sort()
        test_data = np.concatenate([self._read_raw(f) for f in test_files],
                                   axis=0)

        if self.data_size_for_debug != 0:
            # Keep a random subset of the training data for debugging;
            # seed the shuffle so debug runs are reproducible.
            np.random.seed(self.seed)
            subset_size = min(len(train_data), self.data_size_for_debug)
            np.random.shuffle(train_data)
            train_data = train_data[:subset_size]

        self.data_dict['train'] = train_data
        self.data_dict['test'] = test_data

    def _read_raw(self, file_path):
        # Each row has 281 columns: 280 features plus the comment count
        # in the last column.
        data = pd.read_csv(file_path, header=None, usecols=list(range(281)))
        # Forward-fill any missing values (fillna(method='ffill') is
        # deprecated in recent pandas).
        data = data.ffill()
        data = data.values
        return data
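
    # Note: reading 'blogData_train.csv' this way should yield a numeric
    # ndarray of shape (52397, 281), per the UCI description above.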

    def _check_existence(self):
        fpath = os.path.join(self.root, self.base_folder, self.raw_file)
        return osp.exists(fpath)

    def download(self):
        if self._check_existence():
            logger.info("Files already exist")
            return
        download_and_extract_archive(self.url,
                                     os.path.join(self.root,
                                                  self.base_folder),
                                     filename=self.url.split('/')[-1])

    def _partition_data(self):
        # feature_partition[-1] is the index of the label column; all
        # columns before it are features.
        x = self.data_dict['train'][:, :self.feature_partition[-1]]
        y = self.data_dict['train'][:, self.feature_partition[-1]]
        test_data = dict()
        test_data['x'] = self.data_dict['test'][
            :, :self.feature_partition[-1]]
        test_data['y'] = self.data_dict['test'][
            :, self.feature_partition[-1]]
        test_x = test_data['x']
        test_y = test_data['y']

        self.data = dict()
        for i in range(self.num_of_clients + 1):
            self.data[i] = dict()
            if i == 0:
                # Index 0 is the server: it holds the full test set but
                # no training features.
                self.data[0]['train'] = None
                self.data[0]['test'] = test_data
            elif i == 1:
                self.data[1]['train'] = {
                    'x': x[:, :self.feature_partition[0]]
                }
                self.data[1]['test'] = {
                    'x': test_x[:, :self.feature_partition[0]]
                }
            else:
                # Client i holds the feature columns between two
                # consecutive partition boundaries.
                self.data[i]['train'] = {
                    'x': x[:, self.feature_partition[i - 2]:
                           self.feature_partition[i - 1]]
                }
                self.data[i]['test'] = {
                    'x': test_x[:, self.feature_partition[i - 2]:
                                self.feature_partition[i - 1]]
                }
            self.data[i]['val'] = None

        # In this vertical FL setup, only the last client holds the labels.
        self.data[self.num_of_clients]['train']['y'] = y[:]
        self.data[self.num_of_clients]['test']['y'] = test_y[:]
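

# A minimal smoke-test sketch, assuming network access to the OSS mirror
# above and a writable './data' directory; the expected shapes follow the
# dataset description in the class docstring.
if __name__ == '__main__':
    blog = Blog(root='data',
                num_of_clients=2,
                feature_partition=[100, 280],
                args={'normalization': False, 'standardization': False},
                algo='xgb')
    print(blog.data[1]['train']['x'].shape)  # expected: (52397, 100)
    print(blog.data[2]['train']['x'].shape)  # expected: (52397, 180)
    print('y' in blog.data[2]['train'])  # True: the last client holds labels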