150 lines
6.6 KiB
Python
150 lines
6.6 KiB
Python
import logging
|
|
|
|
from importlib import import_module
|
|
from federatedscope.core.data.utils import RegexInverseMap, load_dataset, \
|
|
convert_data_mode
|
|
from federatedscope.core.auxiliaries.utils import setup_seed
|
|
|
|
import federatedscope.register as register
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Modifications:
# 1. Add FedDGCN support: register the 'trafficflow' data type under
#    'DummyDataTranslator' in TRANS_DATA_MAP.
#    (2024-10-8, czzhangheng)
|
|
|
|
# Pull in user-contributed data modules. A failed import is reported as a
# warning rather than aborting the import of this module.
try:
    from federatedscope.contrib.data import *
except ImportError as exc:
    contrib_warning = (f'{exc} in `federatedscope.contrib.data`, '
                       'some modules are not available.')
    logger.warning(contrib_warning)
|
|
|
|
# TODO: Add PyGNodeDataTranslator and PyGLinkDataTranslator
# TODO: move splitter to PyGNodeDataTranslator and PyGLinkDataTranslator
# Translator class name -> list of ``data.type`` patterns served by it.
# The class name is resolved on ``federatedscope.core.data`` in ``get_data``.
# NOTE(review): entries are presumably matched as regular expressions by
# ``RegexInverseMap`` - confirm against its implementation.
TRANS_DATA_MAP = {
    'BaseDataTranslator': [
        '.*?@.*?',
        'hiv',
        'proteins',
        'imdb-binary',
        'bbbp',
        'tox21',
        'bace',
        'sider',
        'clintox',
        'esol',
        'freesolv',
        'lipo',
        'cifar4cl',
        'cifar4lp',
    ],
    # Dummy translator for datasets that are already federated;
    # 'trafficflow' was added to this list.
    'DummyDataTranslator': [
        'toy',
        'quadratic',
        'femnist',
        'celeba',
        'shakespeare',
        'twitter',
        'subreddit',
        'synthetic',
        'ciao',
        'epinions',
        '.*?vertical_fl_data.*?',
        '.*?movielens.*?',
        '.*?netflix.*?',
        '.*?cikmcup.*?',
        'graph_multi_domain.*?',
        'cora',
        'citeseer',
        'pubmed',
        'dblp_conf',
        'dblp_org',
        'csbm.*?',
        'fb15k-237',
        'wn18',
        'adult',
        'abalone',
        'credit',
        'blog',
        'trafficflow',
    ],
    'RawDataTranslator': [
        'hetero_nlp_tasks',
    ],
}

# Inverse lookup (data type -> translator class name), built with ``None``
# as the second argument.
DATA_TRANS_MAP = RegexInverseMap(TRANS_DATA_MAP, None)
|
|
|
|
|
|
def get_data(config, client_cfgs=None):
    """Instantiate the data and update the configuration accordingly if
    necessary.

    Data generation always runs under the fixed seed ``12345``. The
    user-specified ``config.seed`` is restored before this function exits,
    whichever path is taken (registered builder, built-in loader, or an
    exception).

    Arguments:
        config: a cfg node object
        client_cfgs: dict of client-specific cfg node object

    Returns:
        The dataset object and the updated configuration. When a function \
        registered in ``register.data_dict`` returns a non-``None`` value, \
        that value is returned as is. When ``load_dataset`` yields no \
        dataset, the data part of the result is ``None``.

    Note:
      The available ``data.type`` is shown below:
        ==================================  ===========================
        Data type                           Domain
        ==================================  ===========================
        FEMNIST                             CV
        Celeba                              CV
        ``${DNAME}@torchvision``            CV
        Shakespeare                         NLP
        SubReddit                           NLP
        Twitter (Sentiment140)              NLP
        ``${DNAME}@torchtext``              NLP
        ``${DNAME}@huggingface_datasets``   NLP
        Cora                                Graph (node-level)
        CiteSeer                            Graph (node-level)
        PubMed                              Graph (node-level)
        DBLP_conf                           Graph (node-level)
        DBLP_org                            Graph (node-level)
        csbm                                Graph (node-level)
        Epinions                            Graph (link-level)
        Ciao                                Graph (link-level)
        FB15k                               Graph (link-level)
        FB15k-237                           Graph (link-level)
        WN18                                Graph (link-level)
        MUTAG                               Graph (graph-level)
        BZR                                 Graph (graph-level)
        COX2                                Graph (graph-level)
        DHFR                                Graph (graph-level)
        PTC_MR                              Graph (graph-level)
        AIDS                                Graph (graph-level)
        NCI1                                Graph (graph-level)
        ENZYMES                             Graph (graph-level)
        DD                                  Graph (graph-level)
        PROTEINS                            Graph (graph-level)
        COLLAB                              Graph (graph-level)
        IMDB-BINARY                         Graph (graph-level)
        IMDB-MULTI                          Graph (graph-level)
        REDDIT-BINARY                       Graph (graph-level)
        HIV                                 Graph (graph-level)
        ESOL                                Graph (graph-level)
        FREESOLV                            Graph (graph-level)
        LIPO                                Graph (graph-level)
        PCBA                                Graph (graph-level)
        MUV                                 Graph (graph-level)
        BACE                                Graph (graph-level)
        BBBP                                Graph (graph-level)
        TOX21                               Graph (graph-level)
        TOXCAST                             Graph (graph-level)
        SIDER                               Graph (graph-level)
        CLINTOX                             Graph (graph-level)
        graph_multi_domain_mol              Graph (graph-level)
        graph_multi_domain_small            Graph (graph-level)
        graph_multi_domain_biochem          Graph (graph-level)
        cikmcup                             Graph (graph-level)
        toy                                 Tabular
        synthetic                           Tabular
        quadratic                           Tabular
        ``${DNAME}@openml``                 Tabular
        vertical_fl_data                    Tabular(vertical)
        VFLMovieLens1M                      Recommendation
        VFLMovieLens10M                     Recommendation
        HFLMovieLens1M                      Recommendation
        HFLMovieLens10M                     Recommendation
        VFLNetflix                          Recommendation
        HFLNetflix                          Recommendation
        trafficflow                         Traffic Flow Prediction
        ==================================  ===========================
    """
    # Fix the seed for data generation
    setup_seed(12345)

    try:
        # Registered data builders take precedence: the first one that
        # returns something other than None decides the result.
        for func in register.data_dict.values():
            data_and_config = func(config, client_cfgs)
            if data_and_config is not None:
                return data_and_config

        # Load dataset from source files
        dataset, modified_config = load_dataset(config, client_cfgs)

        if dataset is None:
            return None, modified_config

        # Apply translator to non-FL dataset to transform it into its
        # federated counterpart
        data_module = import_module('federatedscope.core.data')
        translator_name = DATA_TRANS_MAP[config.data.type.lower()]
        translator_cls = getattr(data_module, translator_name)
        translator = translator_cls(modified_config, client_cfgs)
        data = translator(dataset)

        # Convert `StandaloneDataDict` to `ClientData` when in distribute
        # mode
        data = convert_data_mode(data, modified_config)

        return data, modified_config
    finally:
        # Restore the user-specified seed after the data generation. Doing
        # this in `finally` covers the early return for registered builders
        # (and error paths), which previously left the fixed seed active.
        setup_seed(config.seed)
|