#
# mcfly
#
# Copyright 2017 Netherlands eScience Center
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Summary:
This module provides the main functionality of mcfly: searching for an
optimal model architecture. The work flow is as follows:
Function generate_models from modelgen.py generates and compiles models.
Function train_models_on_samples trains those models.
Function find_best_architecture is a wrapper function that combines
these steps.
Example function calls can be found in the tutorial notebook
(https://github.com/NLeSC/mcfly-tutorial)
"""
import json
import os
import warnings
import numpy as np
from sklearn import neighbors, metrics as sklearnmetrics
from tensorflow.keras import metrics
from tensorflow.keras.callbacks import EarlyStopping
from . import modelgen
def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True, outputfile=None,
                            model_path=None, early_stopping=False,
                            batch_size=20, metric='accuracy', class_weight=None):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given size of the subset is
    larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, modeltype) tuples
        List of compiled keras models to train, each with its
        hyperparameters and model type
    nr_epochs : int, optional
        nr of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        Filename to store the model training results
    model_path : str, optional
        Directory to store the models as HDF5 files
    early_stopping : bool, optional
        Stop when validation loss does not decrease
    batch_size : int, optional
        nr of samples per batch
    metric : str, optional
        metric to read from the history object; it has to be one of the
        metrics the models were compiled with
    class_weight : dict, optional
        Dictionary containing class weights (example: {0: 0.5, 1: 2.})

    Returns
    ----------
    histories : list of Keras History objects
        train histories for all models
    val_metrics : list of floats
        validation metric of each model after the last epoch
    val_losses : list of floats
        validation loss of each model after the last epoch

    Raises
    ------
    ValueError
        If `metric` is not among the metrics a model was compiled with.
    """
    # Slicing beyond the end of the array simply yields the complete data set.
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    metric_name = _get_metric_name(metric)

    histories = []
    val_metrics = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        model_metrics = [_get_metric_name(model_metric.name)
                         for model_metric in model.metrics]
        if metric_name not in model_metrics:
            raise ValueError('Invalid metric: "{}" is not among the metrics the models was compiled with ({}).'
                             .format(metric_name, model_metrics))
        if early_stopping:
            callbacks = [
                EarlyStopping(monitor='val_loss', patience=0, verbose=verbose, mode='auto')]
        else:
            callbacks = []
        history = model.fit(X_train_sub, y_train_sub,
                            epochs=nr_epochs, batch_size=batch_size,
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks,
                            class_weight=class_weight)
        histories.append(history)

        # Only the value after the final epoch is reported per model.
        val_metrics.append(_get_from_history('val_' + metric_name, history.history)[-1])
        val_losses.append(_get_from_history('val_loss', history.history)[-1])
        if outputfile is not None:
            # Pass the metric that was actually tracked; the callee would
            # otherwise fall back to 'accuracy' and fail for any other metric.
            store_train_hist_as_json(params, model_types, history.history,
                                     outputfile, metric_name)
        if model_path is not None:
            model.save(os.path.join(model_path, 'model_{}.h5'.format(i)))

    return histories, val_metrics, val_losses
def _get_from_history(metric_name, history_history):
"""Gets the metric from the history object. Tries to solve inconsistencies in abbreviation of accuracy between
Tensorflow/Keras versions. """
if metric_name == 'val_accuracy':
return _get_either_from_history('val_accuracy', 'val_acc', history_history)
elif metric_name == 'accuracy':
return _get_either_from_history('accuracy', 'acc', history_history)
else:
return history_history[metric_name]
def _get_either_from_history(option1, option2, history_history):
try:
return history_history[option1]
except KeyError:
try:
return history_history[option2]
except KeyError:
raise KeyError('No {} or {} in history.'.format(option1, option2))
def store_train_hist_as_json(params, model_type, history, outputfile, metric_name='accuracy'):
    """
    This function stores the model parameters, the loss and metric history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        type of the model; stored under the key 'modeltype'
    history : dict
        training history from one model
    outputfile : str
        path where the json file needs to be stored
    metric_name : str, optional
        name of metric from history to store
    """
    jsondata = params.copy()
    jsondata['train_metric'] = _get_from_history(metric_name, history)
    jsondata['train_loss'] = _get_from_history('loss', history)
    jsondata['val_metric'] = _get_from_history('val_' + metric_name, history)
    jsondata['val_loss'] = _get_from_history('val_loss', history)
    jsondata['modeltype'] = model_type
    jsondata['metric'] = metric_name

    # The json module cannot serialise numpy types, so convert them to
    # built-in types: element-wise for sequences, directly for scalars.
    for key, value in jsondata.items():
        if isinstance(value, (np.ndarray, list)):
            jsondata[key] = [_cast_to_primitive_type(element) for element in value]
        else:
            jsondata[key] = _cast_to_primitive_type(value)

    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as infile:
            previousdata = json.load(infile)
    else:
        previousdata = []
    previousdata.append(jsondata)

    # Serialise before opening the file for writing, so that a failing
    # serialisation cannot truncate the results that are already stored.
    serialized = json.dumps(previousdata, sort_keys=True,
                            indent=4, ensure_ascii=False)
    with open(outputfile, 'w') as outfile:
        outfile.write(serialized)
def _cast_to_primitive_type(obj):
if isinstance(obj, np.floating):
return float(obj)
elif isinstance(obj, np.integer):
return int(obj)
else:
return obj
def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, model_path=None, metric='accuracy',
                           class_weight=None,
                           **kwargs):
    """
    Generates a number of models, trains each of them on a subsample of the
    data, and returns the model with the highest value of the validation
    metric, together with its hyperparameters. A kNN classifier fitted on
    the same subsample serves as a baseline; a warning is issued when the
    best model scores below it.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    model_path : str, optional
        Directory to save the models as HDF5 files
    metric : str, optional
        metric that is used to evaluate the model on the validation set.
        See https://keras.io/metrics/ for possible metrics
    class_weight : dict, optional
        Dictionary containing class weights (example: {0: 0.5, 1: 2.})
    **kwargs : key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    candidates = modelgen.generate_models(
        X_train.shape, y_train.shape[1],
        number_of_models=number_of_models,
        metrics=[metric],
        **kwargs)

    _, val_scores, _ = train_models_on_samples(
        X_train, y_train, X_val, y_val, candidates, nr_epochs,
        subset_size=subset_size,
        verbose=verbose,
        outputfile=outputpath,
        model_path=model_path,
        metric=metric,
        class_weight=class_weight)

    # The candidate with the largest validation metric wins.
    winner = np.argmax(val_scores)
    best_model, best_params, best_model_type = candidates[winner]

    # Baseline: kNN fitted on the same subsample the models were trained on.
    X_sub = X_train[:subset_size, :, :]
    y_sub = y_train[:subset_size, :]
    knn_acc = kNN_accuracy(X_sub, y_sub, X_val, y_val)

    if verbose:
        print('Best model: model ', winner)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print(str(metric) + ' on validation set: ', val_scores[winner])
        print('Accuracy of kNN on validation set', knn_acc)

    if val_scores[winner] < knn_acc:
        comparison = ' vs '.join((str(val_scores[winner]), str(knn_acc)))
        warnings.warn('Best model not better than kNN: ' + comparison)
    return best_model, best_params, best_model_type, knn_acc
def _get_metric_name(name):
"""
Gives the keras name for a metric
Parameters
----------
name : str
original name of the metric
Returns
-------
"""
if name == 'acc' or name == 'accuracy':
return 'accuracy'
try:
metric_fn = metrics.get(name)
return metric_fn.__name__
except:
pass
return name
def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Each sample is flattened to a single feature vector of length
    num_timesteps * num_channels before the classifier is fitted.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    num_train, num_timesteps, num_channels = X_train.shape
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train.reshape(num_train, num_timesteps * num_channels), y_train)

    num_val, num_timesteps, num_channels = X_val.shape
    val_predict = clf.predict(
        X_val.reshape(num_val, num_timesteps * num_channels))

    # accuracy_score expects the true labels first, then the predictions.
    return sklearnmetrics.accuracy_score(y_val, val_predict)