Source code for akid.train.tuner

"""
A module to provide a mechanism to ease network tuning.
"""
from __future__ import print_function

import sys
import inspect
import multiprocessing
from .semaphore import Semaphore
import subprocess
import os

from jinja2 import Template
from tqdm import tqdm
import pycuda.autoinit
import pycuda.driver as cuda
import gflags as flags

FLAGS = flags.FLAGS
flags.DEFINE_boolean("use_sub_shell", True, "Deprecated; not used."
                     " Whether to use a sub shell to run training"
                     " instances. This is to get around the issue that"
                     " tensorflow will not release memory after a process"
                     " finishes its work. So to force tensorflow to release"
                     " resources, just run the process in a sub shell.")
flags.DEFINE_integer("gpu_start_No", 0, "The No of the first GPU to use."
                     " This flag makes sure the correct GPU mask is passed"
                     " when running training instances in a subshell. For"
                     " example, if you are going to use GPUs 1-9, pass"
                     " --gpu_start_No=1.")

NETWORK_LOG_HEADER = "Network Setup: \\n"


def spawn_using_sub_shell(setup_func, work_dir, idxs):
    gpu_No_str = ""
    # Make a comma-separated GPU No list.
    for idx in idxs:
        gpu_No_str += "{},".format(idx + FLAGS.gpu_start_No)

    # Add training code to the end.
    training_call = """


kid = setup()
import inspect
from akid.utils import glog as log
log.info("{}" + inspect.getsource(setup))
kid.practice()
    """.format(NETWORK_LOG_HEADER)

    training_code = setup_func + training_call
    # Save code to file.
    file_name = "net_{}.py".format(gpu_No_str)
    with open(os.path.join(work_dir, file_name), 'w') as f:
        f.write(training_code)
    # Run.
    subprocess.call(
        "cd {}; CUDA_VISIBLE_DEVICES={} python {}".format(
            work_dir,
            gpu_No_str,
            file_name),
        shell=True)
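# For illustration: if the rendered `setup_func` source passed in above is,
# say,
#
#     def setup():
#         from akid import Kid
#         ...
#         return kid
#
# then the generated file (e.g. `net_0,.py` when only GPU 0 is acquired) is
# that source with `training_call` appended, so the subshell builds the
# `Kid`, logs the rendered setup code, and runs `practice()` under the
# CUDA_VISIBLE_DEVICES mask.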


def spawn(s, l, gpu_mask, gpu_num, return_values, setup_func, repeat):
    s.acquire(gpu_num)

    # Look up GPU(s) and mark them as used.
    # A lock is unnecessary for a manager list, but in order to keep the
    # printed information in order, a lock is used to control access to
    # stdout.
    with l:
        acquired_gpu = 0
        idxs = []
        for idx, avail in enumerate(gpu_mask):
            if avail == 1:
                gpu_mask[idx] = 0
                idxs.append(idx)
                acquired_gpu += 1
                if acquired_gpu == gpu_num:
                    break

        print("GPU mask {}.".format(gpu_mask))
        print("Using GPU {}.".format(idxs))

    repeat_folder = str(repeat)
    # Create folder to hold one training repeat.
    if not os.path.exists(repeat_folder):
        os.mkdir(repeat_folder)
    work_dir = repeat_folder

    spawn_using_sub_shell(setup_func, work_dir, idxs)

    # Release the GPU.
    with l:
        for idx in idxs:
            print("Released GPU {}.".format(idx))
            gpu_mask[idx] = 1

    s.release(gpu_num)


def tune(template,
         opt_paras_list=[{}],
         net_paras_list=[{}],
         repeat_times=1,
         gpu_num_per_instance=1,
         debug=False):
    """
    A function that takes a Brain jinja2 template class and parameters to
    fill the template at runtime. The parameters provided should complete
    whatever network parameters are left open in the template. The tuner is
    not aware of the content of the list items; it is up to the user to
    define the template correctly, so parameters are filled in the right
    places.

    The jinja2 template must be a function named `setup` that returns a set
    up `Kid`. All necessary module imports should be put inside the function
    rather than at module level, as is usually done.

    The `tune` function uses all available GPUs to train networks with all
    the given sets of parameters. If there are not enough GPUs available, the
    instances that cannot run yet wait until others finish and then get their
    turn.

    ## Parameter Tuning Usage

    Tunable parameters are divided into two sets: network hyper parameters,
    `net_paras_list`, and optimization hyper parameters, `opt_paras_list`.
    Each set is specified by a list whose items are dictionaries that hold
    the actual values of whatever hyper parameters are defined as jinja2
    templates. Each item in a list corresponds to a tentative training
    instance.

    Network parameters and optimization parameters combine as a Cartesian
    product, which is to say, if you have two items in the network parameter
    list and two in the optimization parameter list, the total number of
    training instances is four.

    Final training precisions are returned as a list. Since the final
    precision is normally not the optimal one, which usually occurs at some
    point during training, the returned values are currently used for testing
    purposes only.

    ## Run repeated experiments

    To run repeated experiments, just leave `opt_paras_list` and
    `net_paras_list` at their default values and set `repeat_times` to the
    desired number of repeats.

    ## GPU Resources Allocation

    `gpu_num_per_instance` may be an int, in which case each training
    instance is allocated that many GPUs, or a list whose length equals the
    number of training instances (i.e. `#opt_paras_list * #net_paras_list *
    repeat_times`), giving the GPU count of each instance individually.

    Given the number of available GPUs, a semaphore is created to control
    access to GPUs. A lock is created to control access to the mask that
    indicates which GPUs are available. After a process has modified the GPU
    mask, it releases the lock immediately so other processes can access the
    mask, but the semaphore is not yet released, since it controls access to
    the actual GPUs. A training instance is launched in a subshell using the
    GPUs acquired. The semaphore is only released after the training has
    finished.

    ## Example

    For example, to tune the activation function and learning rate of a
    network, first set up the network parameters in `net_paras_list` and the
    optimization parameters in `opt_paras_list`, build a network in the
    `setup` function, then pass all of them to tune::

        net_paras_list = []
        net_paras_list.append({
            "activation": [
                {"type": "relu"},
                {"type": "relu"},
                {"type": "relu"},
                {"type": "relu"}],
            "bn": True})
        net_paras_list.append({
            "activation": [
                {"type": "maxout", "group_size": 2},
                {"type": "maxout", "group_size": 2},
                {"type": "maxout", "group_size": 2},
                {"type": "maxout", "group_size": 5}],
            "bn": True})

        opt_paras_list = []
        opt_paras_list.append({"lr": 0.025})
        opt_paras_list.append({"lr": 0.05})

        def setup(graph):
            brain.attach(cnn_block(
                ksize=[8, 8],
                init_para={
                    "name": "uniform",
                    "range": 0.005},
                wd={"type": "l2", "scale": 0.0005},
                out_channel_num=384,
                pool_size=[4, 4],
                pool_stride=[2, 2],
                activation={{ net_paras["activation"][1] }},
                keep_prob=0.5,
                bn={{ net_paras["bn"] }}))

        tune(setup, opt_paras_list, net_paras_list)
    """
    # Parse command line flags.
    FLAGS(sys.argv)

    # Set up data structures.
    # #########################################################################
    manager = multiprocessing.Manager()
    gpu_num = cuda.Device.count()
    gpu_mask = manager.list([1] * gpu_num)
    return_values = manager.list()

    net_num = len(net_paras_list)
    opt_num = len(opt_paras_list)
    if type(gpu_num_per_instance) is not int:
        if len(net_paras_list) * len(opt_paras_list) * repeat_times \
                != len(gpu_num_per_instance):
            raise Exception("""
The number of GPUs used per training instance should match
`#net_paras_list({}) * #opt_paras_list({}) * repeat_times({}): {}`,
or be a single int.
            """.format(net_num,
                       opt_num,
                       repeat_times,
                       net_num * opt_num * repeat_times))

    # Logistics
    # #########################################################################
    s = Semaphore(len(gpu_mask))
    l = multiprocessing.Lock()
    process_pool = []
    template_str = Template(inspect.getsource(template))

    # Start tuning.
    # #########################################################################
    for repeat in xrange(0, repeat_times):
        for i, opt_paras in enumerate(opt_paras_list):
            for j, net_paras in enumerate(net_paras_list):
                setup_func = template_str.render(opt_paras=opt_paras,
                                                 net_paras=net_paras)
                _gpu_num = gpu_num_per_instance[
                    repeat*(net_num * opt_num) + i*net_num + j] \
                    if type(gpu_num_per_instance) is list \
                    else gpu_num_per_instance
                p = multiprocessing.Process(
                    target=spawn,
                    args=(s, l, gpu_mask, _gpu_num, return_values,
                          setup_func, repeat))
                process_pool.append(p)
                p.start()

    # Wait for all processes to finish.
    for p in tqdm(process_pool):
        p.join()

    # TODO(Shuai): Think about what the return value should be for the
    # subprocess call.
    return return_values
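

# A minimal usage sketch (the `setup` template below is hypothetical, not part
# of this module): run the same network five times, with two GPUs per run.
#
#     from akid.train.tuner import tune
#
#     def setup():
#         from akid import Kid
#         ...  # build the network and the `Kid` here
#         return kid
#
#     tune(setup, repeat_times=5, gpu_num_per_instance=2)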