# FS-TFP/federatedscope/vertical_fl/dataset/blog.py

import glob
import logging
import os
import os.path as osp
import numpy as np
import pandas as pd
from torchvision.datasets.utils import download_and_extract_archive

logger = logging.getLogger(__name__)


class Blog(object):
"""
BlogFeedback Data Set
(https://archive.ics.uci.edu/ml/datasets/BlogFeedback)
Data Set Information:
This data originates from blog posts. The raw HTML-documents
of the blog posts were crawled and processed.
The prediction task associated with the data is the prediction
of the number of comments in the upcoming 24 hours. In order
to simulate this situation, we choose a basetime (in the past)
and select the blog posts that were published at most
72 hours before the selected base date/time. Then, we calculate
all the features of the selected blog posts from the information
that was available at the basetime, therefore each instance
corresponds to a blog post. The target is the number of
comments that the blog post received in the next 24 hours
relative to the basetime.
Number of Instances: 60021
Number of Attributes: 281, the last one is the number of comments
in the next 24 hours
Training set: 'blogData_train.csv', 52397 instances
Testing set: 'blogData_test*.csv', 60 files, 7624 instances totally
Arguments:
root (str): root path
num_of_clients(int): number of clients
feature_partition(list): the number of features
partitioned to each client
tr_frac (float): train set proportion for each task; default=0.8
args (dict): set Ture or False to decide whether
to normalize or standardize the data or not,
e.g., {'normalization': False, 'standardization': False}
algo(str): the running model, 'lr'/'xgb'/'gbdt'/'rf'
debug_size(int): use a subset for debug,
0 for using entire dataset
download (bool): indicator to download dataset
seed: a random seed
"""

    base_folder = 'blogfeedback'
    url = 'https://federatedscope.oss-cn-beijing.aliyuncs.com/BlogFeedback.zip'
    raw_file = 'BlogFeedback.zip'

    def __init__(self,
                 root,
                 num_of_clients,
                 feature_partition,
                 args,
                 algo=None,
                 tr_frac=0.8,
                 debug_size=0,
                 download=True,
                 seed=123):
        super(Blog, self).__init__()
        self.root = root
        self.num_of_clients = num_of_clients
        self.tr_frac = tr_frac
        self.feature_partition = feature_partition
        self.seed = seed
        self.args = args
        self.algo = algo
        self.data_size_for_debug = debug_size
        self.data_dict = {}
        self.data = {}

        if download:
            self.download()
        if not self._check_existence():
            raise RuntimeError("Dataset not found or corrupted. "
                               "You can use download=True to download it")
        self._get_data()
        self._partition_data()

    def _get_data(self):
        fpath = os.path.join(self.root, self.base_folder)
        train_file = osp.join(fpath, 'blogData_train.csv')
        train_data = self._read_raw(train_file)

        # The test set is split across 60 per-basetime files; read them
        # in sorted order and stack them into one array.
        test_files = glob.glob(osp.join(fpath, "blogData_test*.csv"))
        test_files.sort()
        test_data = np.concatenate([self._read_raw(f) for f in test_files],
                                   axis=0)

        if self.data_size_for_debug != 0:
            # Keep a random subset of the training data for debugging;
            # seed the shuffle so debug runs are reproducible.
            np.random.seed(self.seed)
            subset_size = min(len(train_data), self.data_size_for_debug)
            np.random.shuffle(train_data)
            train_data = train_data[:subset_size]

        self.data_dict['train'] = train_data
        self.data_dict['test'] = test_data

    def _read_raw(self, file_path):
        # Each row has 281 columns: 280 features plus the comment count
        # in the last column.
        data = pd.read_csv(file_path, header=None, usecols=list(range(281)))
        # Forward-fill any missing values (fillna(method='ffill') is
        # deprecated in recent pandas).
        data = data.ffill()
        data = data.values
        return data
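
    # Note: reading 'blogData_train.csv' this way should yield a numeric
    # ndarray of shape (52397, 281), per the UCI description above.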

    def _check_existence(self):
        fpath = os.path.join(self.root, self.base_folder, self.raw_file)
        return osp.exists(fpath)

    def download(self):
        if self._check_existence():
            logger.info("Files already exist")
            return
        download_and_extract_archive(self.url,
                                     os.path.join(self.root,
                                                  self.base_folder),
                                     filename=self.url.split('/')[-1])

    def _partition_data(self):
        # feature_partition[-1] is the index of the label column; all
        # columns before it are features.
        x = self.data_dict['train'][:, :self.feature_partition[-1]]
        y = self.data_dict['train'][:, self.feature_partition[-1]]
        test_data = dict()
        test_data['x'] = self.data_dict['test'][
            :, :self.feature_partition[-1]]
        test_data['y'] = self.data_dict['test'][
            :, self.feature_partition[-1]]
        test_x = test_data['x']
        test_y = test_data['y']

        self.data = dict()
        for i in range(self.num_of_clients + 1):
            self.data[i] = dict()
            if i == 0:
                # Index 0 is the server: it holds the full test set but
                # no training features.
                self.data[0]['train'] = None
                self.data[0]['test'] = test_data
            elif i == 1:
                self.data[1]['train'] = {
                    'x': x[:, :self.feature_partition[0]]
                }
                self.data[1]['test'] = {
                    'x': test_x[:, :self.feature_partition[0]]
                }
            else:
                # Client i holds the feature columns between two
                # consecutive partition boundaries.
                self.data[i]['train'] = {
                    'x': x[:, self.feature_partition[i - 2]:
                           self.feature_partition[i - 1]]
                }
                self.data[i]['test'] = {
                    'x': test_x[:, self.feature_partition[i - 2]:
                                self.feature_partition[i - 1]]
                }
            self.data[i]['val'] = None

        # In this vertical FL setup, only the last client holds the labels.
        self.data[self.num_of_clients]['train']['y'] = y[:]
        self.data[self.num_of_clients]['test']['y'] = test_y[:]
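

# A minimal smoke-test sketch, assuming network access to the OSS mirror
# above and a writable './data' directory; the expected shapes follow the
# dataset description in the class docstring.
if __name__ == '__main__':
    blog = Blog(root='data',
                num_of_clients=2,
                feature_partition=[100, 280],
                args={'normalization': False, 'standardization': False},
                algo='xgb')
    print(blog.data[1]['train']['x'].shape)  # expected: (52397, 100)
    print(blog.data[2]['train']['x'].shape)  # expected: (52397, 180)
    print('y' in blog.data[2]['train'])  # True: the last client holds labels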