# FS-TFP/federatedscope/vertical_fl/dataloader/utils.py

import numpy as np
import math


def batch_iter(data, batch_size, shuffled=True):
    """
    An endless batch iteration that cycles over the data epoch after epoch

    Arguments:
        data(dict): data, must contain the keys 'x' and 'y'; both values
            are converted with ``np.asarray`` and the number of samples is
            taken from the length of 'y'
        batch_size (int): the batch size, must be positive; the last batch
            of an epoch is smaller when the number of samples is not a
            multiple of ``batch_size``
        shuffled (bool): whether to shuffle the data at the start of each epoch
    :returns: a generator yielding sample index, batch of x, batch_of y
    :rtype: ndarray, ndarray, ndarray
    :raises ValueError: if ``batch_size`` is not positive or the data is
        empty. As this is a generator function, the checks run when the
        first batch is requested.
    """

    assert 'x' in data and 'y' in data, "data must contain 'x' and 'y'"
    if batch_size <= 0:
        raise ValueError(
            f'batch_size must be a positive integer, got {batch_size}')

    data_x = np.asarray(data['x'])
    data_y = np.asarray(data['y'])
    data_size = len(data_y)
    if data_size == 0:
        # Without this guard every epoch would contain zero batches and the
        # endless loop below would spin forever without yielding anything.
        raise ValueError('cannot iterate over an empty dataset')

    num_batches_per_epoch = math.ceil(data_size / batch_size)
    # The sequential order never changes, so build it once and reuse it for
    # every unshuffled epoch.
    ordered_index = np.arange(data_size)

    while True:
        # A fresh permutation is drawn per epoch when shuffling is enabled.
        epoch_index = np.random.permutation(
            ordered_index) if shuffled else ordered_index
        for batch in range(num_batches_per_epoch):
            start_index = batch * batch_size
            end_index = min(data_size, start_index + batch_size)
            sample_index = epoch_index[start_index:end_index]
            yield sample_index, data_x[sample_index], data_y[sample_index]


class VerticalDataSampler(object):
    """
    VerticalDataSampler is used to sample a subset of instances and/or
    features from data

    Arguments:
        data(dict): data, must contain the key 'x' (converted with
            ``np.asarray``; its first two dimensions are read as instances
            and features); the key 'y' is optional
        replace (bool): Whether the sample is with or without replacement
        use_full_trainset (bool): if True, ``sample_data`` always returns
            the entire dataset instead of a sample
        feature_frac (float): fraction of the features kept by
            ``sample_feature``; the resulting number of features is clamped
            to the range [1, number of features]
    """
    def __init__(self,
                 data,
                 replace=False,
                 use_full_trainset=True,
                 feature_frac=1.0):
        assert 'x' in data, "data must contain 'x'"
        self.data_x = np.asarray(data['x'])
        self.data_y = np.asarray(data['y']) if 'y' in data else None
        self.data_size = self.data_x.shape[0]
        self.feature_size = self.data_x.shape[1]
        self.replace = replace
        self.use_full_trainset = use_full_trainset
        # Clamp on both sides: at least one feature is always kept, and a
        # fraction above 1.0 must not request more features than exist,
        # since sampling without replacement would then fail.
        requested_feature_num = int(self.feature_size * feature_frac)
        self.selected_feature_num = min(self.feature_size,
                                        max(1, requested_feature_num))
        # Set by ``sample_feature`` whenever a strict subset is drawn.
        self.selected_feature_index = None

    def sample_data(self, sample_size, index=None):
        """
        Return a subset of the instances

        Arguments:
            sample_size (int): number of instances to draw, clamped to the
                dataset size; ignored when ``index`` is given or the full
                trainset is used
            index: optional indices of the instances to return; when
                omitted, indices are drawn at random
        :returns: index, sampled x, sampled y (None if there are no labels)
        """

        # use the entire dataset
        if self.use_full_trainset:
            return range(len(self.data_x)), self.data_x, self.data_y

        if index is None:
            # No explicit selection given, so draw one at random.
            sample_size = min(sample_size, self.data_size)
            index = np.random.choice(a=self.data_size,
                                     size=sample_size,
                                     replace=self.replace)

        sampled_x = self.data_x[index]
        sampled_y = self.data_y[index] if self.data_y is not None else None
        return index, sampled_x, sampled_y

    def sample_feature(self, x):
        """
        Return a random subset of the feature columns of ``x``

        Arguments:
            x (ndarray): 2-d array whose columns are the features
        :returns: feature index, x restricted to the selected features;
            when all features are selected, ``x`` itself is returned
            together with the range over its last dimension
        """
        if self.selected_feature_num == self.feature_size:
            return range(x.shape[-1]), x

        feature_index = np.random.choice(a=self.feature_size,
                                         size=self.selected_feature_num,
                                         replace=False)
        # Remember the latest selection on the sampler itself.
        self.selected_feature_index = feature_index
        return feature_index, x[:, feature_index]