# federatedscope/core/gpu_manager.py
import os
def check_gpus():
    """Return True iff the ``nvidia-smi`` tool is usable on this host.

    Detection looks for the tool's banner in its help output; when the
    binary is missing, ``os.popen`` yields empty output and we return
    False after printing a short notice.
    """
    if 'NVIDIA System Management' not in os.popen('nvidia-smi -h').read():
        print("'nvidia-smi' tool not found.")
        return False
    return True
class GPUManager(object):
    """Allocates a computation device for the caller.

    ``auto_choice`` returns the GPU with the largest free memory,
    unless ``specified_device`` pins a particular device. When GPUs
    are unavailable it falls back to ``'cpu'``.

    The implementation is adapted from
    https://github.com/QuantumLiu/tf_gpu_manager

    Args:
        gpu_available: whether the caller wants to use GPUs at all;
            combined with a runtime ``nvidia-smi`` availability check.
        specified_device: if >= 0, ``auto_choice`` always returns
            ``'cuda:<specified_device>'`` instead of auto-selecting.
    """
    def __init__(self, gpu_available=False, specified_device=-1):
        self.gpu_available = gpu_available and check_gpus()
        # NOTE: keep the original misspelled attribute so any existing
        # caller that reads `gpu_avaiable` keeps working.
        self.gpu_avaiable = self.gpu_available
        self.specified_device = specified_device
        if self.gpu_available:
            self.gpus = self._query_gpus()
            for gpu in self.gpus:
                gpu['allocated'] = False
        else:
            self.gpus = None

    def _sort_by_memory(self, gpus, by_size=False):
        """Return ``gpus`` sorted with the most free GPU first.

        Args:
            gpus: list of dicts as produced by ``_query_gpus``.
            by_size: sort by absolute free memory (MiB) when True,
                otherwise by the free/total memory ratio.
        """
        if by_size:
            return sorted(gpus, key=lambda d: d['memory.free'], reverse=True)
        print('Sorted by free memory rate')
        return sorted(
            gpus,
            key=lambda d: float(d['memory.free']) / d['memory.total'],
            reverse=True)

    def _query_gpus(self):
        """Query ``nvidia-smi`` and return one info dict per GPU."""
        args = ['index', 'gpu_name', 'memory.free', 'memory.total']
        cmd = 'nvidia-smi --query-gpu={} --format=csv,noheader'.format(
            ','.join(args))
        results = os.popen(cmd).readlines()
        return [self._parse(line, args) for line in results]

    def _parse(self, line, args):
        """Parse one CSV line from ``nvidia-smi`` into a dict.

        Memory fields are converted to int MiB; every other field is
        kept as a stripped string.
        """
        numeric_args = ['memory.free', 'memory.total']

        def to_number(value):
            # e.g. ' 15000 MiB' -> 15000.0; the 'W' strip also covers
            # power fields should they be added to the query later.
            return float(
                value.upper().strip().replace('MIB', '').replace('W', ''))

        def process(key, value):
            return int(to_number(value)) \
                if key in numeric_args else value.strip()

        return {k: process(k, v) for k, v in zip(args,
                                                 line.strip().split(','))}

    def auto_choice(self):
        """Allocate and return a device string: 'cpu' or 'cuda:<i>'."""
        if self.gpus is None:
            return 'cpu'
        if self.specified_device >= 0:
            # Allow users to pin the device explicitly.
            return 'cuda:{}'.format(self.specified_device)
        # Refresh memory statistics in place, keeping allocation flags.
        for old_info, new_info in zip(self.gpus, self._query_gpus()):
            old_info.update(new_info)
        unallocated = [gpu for gpu in self.gpus if not gpu['allocated']]
        if not unallocated:
            # Every GPU has been handed out once: reset and reuse all.
            unallocated = self.gpus
            for gpu in self.gpus:
                gpu['allocated'] = False
        chosen = self._sort_by_memory(unallocated, True)[0]
        chosen['allocated'] = True
        return 'cuda:{}'.format(chosen['index'])
# for testing
if __name__ == '__main__':
gpu_manager = GPUManager(gpu_available=True, specified_device=0)
for i in range(20):
print(gpu_manager.auto_choice())