This notebook trains multiple models for soil moisture prediction maps. The model training data is based on weekly averaged soil moisture probes and multiple spatial-temporal dependent covariates.
The prediction models that are tested are based on Gaussian Process regression with two different base function: Random Forest (RF) and Bayesian Linear Regression (BLR). The base functions are used to account for the non-linear relationship between the covariates and the soil moisture. Gaussian Process Regression (GPR) is used to account for the spatial-temporal correlation of the soil moisture probes. When GPR is enabled, base model were used as the mean function of GPR.
The model selection is based on the cross-validation of the prediction error (normalized RMSE, R2) for the test data. Test data is selected based on n-fold cross-validation. Different test data is selected for each fold.
Note that test data is n-fold split based on the spatial location of the probes, and not on time. If test data would be selected based on unique location in space and time rather than on space only, the cross-validation would reveal in superficial low prediction errors, because the temporal variability of the soil moisture over short time periods is very low relative to the spatial variations, and hence test data would be strongly correlated with the training data.
User settings, such as input/output paths and all other options, are set in the settings file, e.g.:
settings_testmodel_spatial.yaml
This package is part of the machine learning project developed for the Agricultural Research Federation (AgReFed).
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import yaml
import argparse
from types import SimpleNamespace
from matplotlib.image import imread
import time
# Custom local libraries:
sys.path.append('../../python_scripts')
from utils import print2, truncate_data
from preprocessing import gen_kfold
import GPmodel as gp # GP model plus kernel functions and distance matrix calculation
import model_blr as blr
import model_rf as rf
from soilmod_xval import runmodel
All settings are specified in the .yaml file to make analysis reproducible. Below we will read and inspect the settings.
# Define name of settings file to save configuration
fname_settings = 'settings_testmodel_spatial.yaml'
path_settings = '.'
# Load settings from yaml file
with open(os.path.join(path_settings,fname_settings), 'r') as f:
settings = yaml.load(f, Loader=yaml.FullLoader)
# Parse settings dictinary as namespace (settings are available as
# settings.variable_name rather than settings['variable_name'])
settings = SimpleNamespace(**settings)
# Add temporal or vertical component
if settings.axistype == 'temporal':
settings.colname_zcoord = settings.colname_tcoord
settings.colname_zmin = settings.colname_tmin
settings.colname_zmax = settings.colname_tmax
if type(settings.model_functions) != list:
settings.model_functions = [settings.model_functions]
# check if outpath exists, if not create direcory
os.makedirs(settings.outpath, exist_ok = True)
# Intialise output info file:
print('init')
print(f'--- Parameter Settings ---')
print(f'Selected Model Functions: {settings.model_functions}')
print(f'Target Name: {settings.name_target}')
print(f'--------------------------')
# Print features selected
print("")
print("--- Features Selected ---")
for key in settings.__dict__:
if key == "name_features":
for feature in settings.name_features:
print(f"'{feature}'")
init --- Parameter Settings --- Selected Model Functions: ['rf', 'blr', 'rf-gp', 'blr-gp'] Target Name: SM -------------------------- --- Features Selected --- 'DepthBot' 'DepthTop' 'Bucket' 'ET20m_df_999' 'CLY' 'SOC' 'NDVI_95' 'SND' 'Rain_df_999' 'TWI' 'ET20m_df_90' 'ET20m_df_99' 'ET20m' 'Rain' 'NDVI_50' 'ET20m_df_95' 'Total' 'Rain_df_99' 'Rain_df_50' 'ET20m_df_70' 'ET20m_df_50'
The data preprocessing includes the following steps:
print('Reading data into dataframe...')
# Read in data
dfsel = pd.read_csv(os.path.join(settings.inpath, settings.infname))
# Rename x and y coordinates of input data
if settings.colname_xcoord != 'x':
dfsel.rename(columns={settings.colname_xcoord: 'x'}, inplace = True)
if settings.colname_ycoord != 'y':
dfsel.rename(columns={settings.colname_ycoord: 'y'}, inplace = True)
if (settings.axistype == 'vertical') & (settings.colname_zcoord != 'z'):
dfsel.rename(columns={settings.colname_zcoord: 'z'}, inplace = True)
else:
dfsel.rename(columns={settings.colname_tcoord: 'z'}, inplace = True)
dfsel.rename(columns={settings.colname_zcoord: 'z'}, inplace = True)
settings.name_features.append('z')
# Select data between zmin and zmax
dfsel = dfsel[(dfsel['z'] >= settings.colname_zmin) & (dfsel['z'] <= settings.colname_zmax)]
# Generate n-fold indices
print(f'Generating {settings.nfold}-fold test sets based on moisture probe locations..')
# dfsel = gen_kfold(dfsel, nfold = settings.nfold, label_nfold = 'nfold', id_unique = ['z'], precision_unique = 0.01, sort=True)
dfsel = gen_kfold(dfsel, nfold = settings.nfold, label_nfold = 'nfold', id_unique = ['x', 'y'], precision_unique = 0.01, sort=False)
## Get coordinates for training data and set coord origin to (0,0)
print(f'Setting coordinate origin to (0,0)...')
bound_xmin = dfsel.x.min()
bound_xmax = dfsel.x.max()
bound_ymin = dfsel.y.min()
bound_ymax = dfsel.y.max()
# Set origin to (0,0)
dfsel['x'] = dfsel['x'] - bound_xmin
dfsel['y'] = dfsel['y'] - bound_ymin
print('Preprocessing data finished.')
Reading data into dataframe... Generating 5-fold test sets based on moisture probe locations.. Setting coordinate origin to (0,0)... Preprocessing data finished.
Here we train and test multiple models as specified in the settings: settings.model_functions. The models are a combination of Gaussian Process regression with either a Random Forest or Bayesian linear regression model as base function.
This training step will take a couple of minutes given that a new model needs to be trained and evaluated for each cross-validation and model type.
The results for each model are saved as:
# Define stats result lists
stats_summaries = []
# Loop over model functions and evaluate
for model_function in settings.model_functions:
# run and evaluate model
time_start = time.time()
dfsum, stats_summary, model_outpath = runmodel(dfsel, model_function, settings)
time_end = time.time()
print(f'Time elapsed: {(time_end - time_start)/3600} hours.)')
print(f'All output files of {model_function} saved in {model_outpath}')
print('')
# save results
stats_summaries.append(stats_summary)
Computing 5-fold xrossvalidation for function model: rf Processing for nfold 1 Using default hyperparameters for Random Forest RMSE: 14.4671 Normalized RMSE: 0.4262 Normalized ROOT MEDIAN SE: 0.2163 R^2: 0.8183 Lin's CCC: 0.8991 Nash Sutcliffe Efficiency: 0.8183 Mean Theta: 0.7825 Median Theta: 0.0 Processing for nfold 2 Using default hyperparameters for Random Forest RMSE: 16.4298 Normalized RMSE: 0.4577 Normalized ROOT MEDIAN SE: 0.2374 R^2: 0.7905 Lin's CCC: 0.8789 Nash Sutcliffe Efficiency: 0.7905 Mean Theta: 1.1752 Median Theta: 1.0 Processing for nfold 3 Using default hyperparameters for Random Forest RMSE: 19.4168 Normalized RMSE: 0.5296 Normalized ROOT MEDIAN SE: 0.2629 R^2: 0.7195 Lin's CCC: 0.8519 Nash Sutcliffe Efficiency: 0.7195 Mean Theta: 1.2683 Median Theta: 1.0 Processing for nfold 4 Using default hyperparameters for Random Forest RMSE: 17.8703 Normalized RMSE: 0.5303 Normalized ROOT MEDIAN SE: 0.2536 R^2: 0.7188 Lin's CCC: 0.8443 Nash Sutcliffe Efficiency: 0.7188 Mean Theta: 1.9405 Median Theta: 1.0 Processing for nfold 5 Using default hyperparameters for Random Forest RMSE: 14.2455 Normalized RMSE: 0.4411 Normalized ROOT MEDIAN SE: 0.2337 R^2: 0.8054 Lin's CCC: 0.8992 Nash Sutcliffe Efficiency: 0.8054 Mean Theta: 0.8088 Median Theta: 0.0 Saving all predictions in dataframe ... ---- X-validation Summary ----- Mean RMSE: 16.486 +/- 1.98 Mean normalized RMSE: 0.477 +/- 0.044 Median normalized RMSE: 0.237 Mean R^2: 0.771 Mean LCCC: 0.875 Mean NSE: 0.771 Mean Theta: 1.195 +/- 0.42 Time elapsed: 0.042336971627341374 hours.) All output files of rf saved in testmodel_spatial/Xval_5-fold_rf_SM Computing 5-fold xrossvalidation for function model: blr Processing for nfold 1 RMSE: 32.7737 Normalized RMSE: 0.9656 Normalized ROOT MEDIAN SE: 0.8211 R^2: 0.0676 Lin's CCC: 0.1598 Nash Sutcliffe Efficiency: 0.0676 Mean Theta: 2.0885 Median Theta: 2.0 Processing for nfold 2 RMSE: 32.6492 Normalized RMSE: 0.9096 Normalized ROOT MEDIAN SE: 0.7485 R^2: 0.1727 Lin's CCC: 0.2466 Nash Sutcliffe Efficiency: 0.1727 Mean Theta: 2.0413 Median Theta: 1.0 Processing for nfold 3 RMSE: 35.241 Normalized RMSE: 0.9613 Normalized ROOT MEDIAN SE: 0.7575 R^2: 0.0759 Lin's CCC: 0.1709 Nash Sutcliffe Efficiency: 0.0759 Mean Theta: 2.4475 Median Theta: 2.0 Processing for nfold 4 RMSE: 32.4634 Normalized RMSE: 0.9633 Normalized ROOT MEDIAN SE: 0.785 R^2: 0.0721 Lin's CCC: 0.1826 Nash Sutcliffe Efficiency: 0.0721 Mean Theta: 2.0072 Median Theta: 1.0 Processing for nfold 5 RMSE: 30.6514 Normalized RMSE: 0.9492 Normalized ROOT MEDIAN SE: 0.7945 R^2: 0.0991 Lin's CCC: 0.1952 Nash Sutcliffe Efficiency: 0.0991 Mean Theta: 1.7338 Median Theta: 1.0 Saving all predictions in dataframe ... ---- X-validation Summary ----- Mean RMSE: 32.756 +/- 1.463 Mean normalized RMSE: 0.95 +/- 0.021 Median normalized RMSE: 0.785 Mean R^2: 0.098 Mean LCCC: 0.191 Mean NSE: 0.098 Mean Theta: 2.064 +/- 0.228 Time elapsed: 0.006215808855162727 hours.) All output files of blr saved in testmodel_spatial/Xval_5-fold_blr_SM Computing 5-fold xrossvalidation for function model: rf-gp Processing for nfold 1 Using default hyperparameters for Random Forest Mean of Y: -0.0085 +/- 4.3632 Mean of Mean function: 51.1927 +/- 32.998 Mean of Mean function noise: 7.6754 +/- 5.1775 Optimizing GP hyperparameters... Mean Input Noise: 1.7591479628827345 Optimized Hyperparameters (amplitude, y_noise_fac, lengthscale_z, lengths_xy): [1.01197236e-02 9.63804094e-04 2.61097468e+02 2.87698448e+04] Marginal Log Likelihood: -7273.257882669782 Computing GP predictions for test set nfold 1 Logl: -7273.257882669782 Logl: -7273.257882669782 GP Marginal Log-Likelihood: -7273.26 RMSE: 14.526 Normalized RMSE: 0.428 Normalized ROOT MEDIAN SE: 0.2137 R^2: 0.8168 Lin's CCC: 0.899 Nash Sutcliffe Efficiency: 0.8168 Mean Theta: 0.7885 Median Theta: 0.0 Processing for nfold 2 Using default hyperparameters for Random Forest Mean of Y: -0.0168 +/- 4.3589 Mean of Mean function: 50.8739 +/- 32.5323 Mean of Mean function noise: 7.7182 +/- 5.1795 Optimizing GP hyperparameters... Mean Input Noise: 1.7706833616160353 Optimized Hyperparameters (amplitude, y_noise_fac, lengthscale_z, lengths_xy): [1.00000004e-02 2.28945271e-03 2.68167271e+02 1.13519946e+04] Marginal Log Likelihood: -7316.932066834188 Computing GP predictions for test set nfold 2 Logl: -7316.932066834188 Logl: -7316.932066834188 GP Marginal Log-Likelihood: -7316.93 RMSE: 16.4177 Normalized RMSE: 0.4574 Normalized ROOT MEDIAN SE: 0.2358 R^2: 0.7908 Lin's CCC: 0.8798 Nash Sutcliffe Efficiency: 0.7908 Mean Theta: 1.1654 Median Theta: 1.0 Processing for nfold 3 Using default hyperparameters for Random Forest Mean of Y: -0.005 +/- 4.3672 Mean of Mean function: 49.4068 +/- 32.2749 Mean of Mean function noise: 7.7389 +/- 4.9954 Optimizing GP hyperparameters... Mean Input Noise: 1.7720341685909913 Optimized Hyperparameters (amplitude, y_noise_fac, lengthscale_z, lengths_xy): [1.00000017e-02 1.62770272e-04 2.94874581e+02 2.04489966e+04] Marginal Log Likelihood: -7399.49403769255 Computing GP predictions for test set nfold 3 Logl: -7399.49403769255 Logl: -7399.49403769255 GP Marginal Log-Likelihood: -7399.49 RMSE: 19.3666 Normalized RMSE: 0.5283 Normalized ROOT MEDIAN SE: 0.2591 R^2: 0.7209 Lin's CCC: 0.8519 Nash Sutcliffe Efficiency: 0.7209 Mean Theta: 1.2566 Median Theta: 1.0 Processing for nfold 4 Using default hyperparameters for Random Forest Mean of Y: -0.0022 +/- 4.3562 Mean of Mean function: 50.8271 +/- 33.1048 Mean of Mean function noise: 7.66 +/- 5.2189 Optimizing GP hyperparameters... Mean Input Noise: 1.7584099651744856 Optimized Hyperparameters (amplitude, y_noise_fac, lengthscale_z, lengths_xy): [1.01133551e-02 3.84599703e-04 3.21713984e+02 2.61618154e+04] Marginal Log Likelihood: -7195.709911523896 Computing GP predictions for test set nfold 4 Logl: -7195.709911523896 Logl: -7195.709911523896 GP Marginal Log-Likelihood: -7195.71 RMSE: 17.8958 Normalized RMSE: 0.531 Normalized ROOT MEDIAN SE: 0.2529 R^2: 0.718 Lin's CCC: 0.8442 Nash Sutcliffe Efficiency: 0.718 Mean Theta: 1.9522 Median Theta: 1.0 Processing for nfold 5 Using default hyperparameters for Random Forest Mean of Y: -0.011 +/- 4.5851 Mean of Mean function: 50.086 +/- 33.389 Mean of Mean function noise: 8.0091 +/- 5.4148 Optimizing GP hyperparameters... Mean Input Noise: 1.7467756861273704 Optimized Hyperparameters (amplitude, y_noise_fac, lengthscale_z, lengths_xy): [1.01209730e-02 1.66100415e-04 2.91065980e+02 2.81217120e+04] Marginal Log Likelihood: -6929.128156322768 Computing GP predictions for test set nfold 5 Logl: -6929.128156322768 Logl: -6929.128156322768 GP Marginal Log-Likelihood: -6929.13 RMSE: 14.2446 Normalized RMSE: 0.4411 Normalized ROOT MEDIAN SE: 0.233 R^2: 0.8054 Lin's CCC: 0.8994 Nash Sutcliffe Efficiency: 0.8054 Mean Theta: 0.8057 Median Theta: 0.0 Saving all predictions in dataframe ... ---- X-validation Summary ----- Mean RMSE: 16.49 +/- 1.957 Mean normalized RMSE: 0.477 +/- 0.044 Median normalized RMSE: 0.236 Mean R^2: 0.77 Mean LCCC: 0.875 Mean NSE: 0.77 Mean Theta: 1.194 +/- 0.423 Time elapsed: 2.5581216569079293 hours.) All output files of rf-gp saved in testmodel_spatial/Xval_5-fold_rf-gp_SM Computing 5-fold xrossvalidation for function model: blr-gp Processing for nfold 1 Mean of Y: 0.0 +/- 32.5942 Mean of Mean function: 51.1843 +/- 11.7475 Mean of Mean function noise: 22.6627 +/- 0.0146 Optimizing GP hyperparameters... Mean Input Noise: 0.6952984824706497 Optimized Hyperparameters (amplitude, y_noise_fac, lengthscale_z, lengths_xy): [3.01545068e-01 1.00446158e-04 2.64445581e+01 4.86907956e+02] Marginal Log Likelihood: -7297.860250294331 Computing GP predictions for test set nfold 1 Logl: -7297.860250294331 Logl: -7297.860250294331 GP Marginal Log-Likelihood: -7297.86 RMSE: 31.2975 Normalized RMSE: 0.9221 Normalized ROOT MEDIAN SE: 0.7716 R^2: 0.1497 Lin's CCC: 0.2982 Nash Sutcliffe Efficiency: 0.1497 Mean Theta: 1.3127 Median Theta: 1.0 Processing for nfold 2 Mean of Y: 0.0 +/- 32.6452 Mean of Mean function: 50.8571 +/- 10.1873 Mean of Mean function noise: 22.8369 +/- 0.0136 Optimizing GP hyperparameters... Mean Input Noise: 0.6995489483730474 Optimized Hyperparameters (amplitude, y_noise_fac, lengthscale_z, lengths_xy): [2.97495765e-01 1.12035703e-04 2.56817293e+01 4.28172440e+02] Marginal Log Likelihood: -7242.352682713499 Computing GP predictions for test set nfold 2 Logl: -7242.352682713499 Logl: -7242.352682713499 GP Marginal Log-Likelihood: -7242.35 RMSE: 31.0349 Normalized RMSE: 0.8646 Normalized ROOT MEDIAN SE: 0.7311 R^2: 0.2525 Lin's CCC: 0.342 Nash Sutcliffe Efficiency: 0.2525 Mean Theta: 1.2206 Median Theta: 1.0 Processing for nfold 3 Mean of Y: 0.0 +/- 31.9826 Mean of Mean function: 49.4018 +/- 11.3231 Mean of Mean function noise: 22.5271 +/- 0.016 Optimizing GP hyperparameters... Mean Input Noise: 0.7043552258042414 Optimized Hyperparameters (amplitude, y_noise_fac, lengthscale_z, lengths_xy): [2.96698224e-01 1.66056929e-04 2.52498374e+01 7.82734308e+02] Marginal Log Likelihood: -7197.8781152629135 Computing GP predictions for test set nfold 3 Logl: -7197.8781152629135 Logl: -7197.8781152629135 GP Marginal Log-Likelihood: -7197.88 RMSE: 36.0817 Normalized RMSE: 0.9842 Normalized ROOT MEDIAN SE: 0.7913 R^2: 0.0313 Lin's CCC: 0.2864 Nash Sutcliffe Efficiency: 0.0313 Mean Theta: 2.2472 Median Theta: 2.0 Processing for nfold 4 Mean of Y: 0.0 +/- 32.6635 Mean of Mean function: 50.8249 +/- 11.8049 Mean of Mean function noise: 22.9097 +/- 0.0164 Optimizing GP hyperparameters... Mean Input Noise: 0.7013860844316251 Optimized Hyperparameters (amplitude, y_noise_fac, lengthscale_z, lengths_xy): [3.05669467e-01 1.00326079e-04 2.64975307e+01 7.87787845e+02] Marginal Log Likelihood: -7208.405907211392 Computing GP predictions for test set nfold 4 Logl: -7208.405907211392 Logl: -7208.405907211392 GP Marginal Log-Likelihood: -7208.41 RMSE: 33.7539 Normalized RMSE: 1.0016 Normalized ROOT MEDIAN SE: 0.9005 R^2: -0.0031 Lin's CCC: 0.2736 Nash Sutcliffe Efficiency: -0.0031 Mean Theta: 1.7978 Median Theta: 1.0 Processing for nfold 5 Mean of Y: -0.0 +/- 33.1783 Mean of Mean function: 50.075 +/- 11.6365 Mean of Mean function noise: 23.2732 +/- 0.0164 Optimizing GP hyperparameters... Mean Input Noise: 0.7014604788325597 Optimized Hyperparameters (amplitude, y_noise_fac, lengthscale_z, lengths_xy): [3.16710464e-01 1.00276778e-04 2.67318905e+01 5.26628953e+02] Marginal Log Likelihood: -6933.260123205554 Computing GP predictions for test set nfold 5 Logl: -6933.260123205554 Logl: -6933.260123205554 GP Marginal Log-Likelihood: -6933.26 RMSE: 29.0253 Normalized RMSE: 0.8988 Normalized ROOT MEDIAN SE: 0.7799 R^2: 0.1922 Lin's CCC: 0.3553 Nash Sutcliffe Efficiency: 0.1922 Mean Theta: 1.1371 Median Theta: 1.0 Saving all predictions in dataframe ... ---- X-validation Summary ----- Mean RMSE: 32.239 +/- 2.438 Mean normalized RMSE: 0.934 +/- 0.052 Median normalized RMSE: 0.78 Mean R^2: 0.125 Mean LCCC: 0.311 Mean NSE: 0.125 Mean Theta: 1.543 +/- 0.42 Time elapsed: 1.990724447634485 hours.) All output files of blr-gp saved in testmodel_spatial/Xval_5-fold_blr-gp_SM
for model_function, stats_summary in zip(settings.model_functions, stats_summaries):
print(f'--- {model_function} ---')
for key, value in stats_summary.items():
print(f'{key}: \t{value[0]}\t\u00b1 {value[1]}')
--- rf --- RMSE: 16.486 ± 1.98 NRMSE: 0.477 ± 0.044 R2: 1.195 ± 0.42 LCCC: 0.875 ± 0.023 NSE: 0.771 ± 0.043 Theta: 1.195 ± 0.42 --- blr --- RMSE: 32.756 ± 1.463 NRMSE: 0.95 ± 0.021 R2: 2.064 ± 0.228 LCCC: 0.191 ± 0.03 NSE: 0.098 ± 0.039 Theta: 2.064 ± 0.228 --- rf-gp --- RMSE: 16.49 ± 1.957 NRMSE: 0.477 ± 0.044 R2: 1.194 ± 0.423 LCCC: 0.875 ± 0.023 NSE: 0.77 ± 0.042 Theta: 1.194 ± 0.423 --- blr-gp --- RMSE: 32.239 ± 2.438 NRMSE: 0.934 ± 0.052 R2: 1.543 ± 0.42 LCCC: 0.311 ± 0.032 NSE: 0.125 ± 0.097 Theta: 1.543 ± 0.42
Results for each cross-validation set (here 8) are saved in a separate folder, including result csv files multiple plots for residual analysis and ground truth versus prediction. To inspect please check images in output folder. Here we show only a few overview results of the combined cross-validation results.
plt.rcParams.update({'font.size': 10})
plt.rcParams.update({'font.family': 'Arial'})
plt.rcParams.update({'axes.titlesize': 10})
plt.rcParams.update({'axes.labelsize': 8})
plt.rcParams.update({'xtick.labelsize': 8})
plt.rcParams.update({'ytick.labelsize': 8})
plt.rcParams.update({'legend.fontsize': 8})
plt.rcParams.update({'figure.dpi': 100})
# Set bar chart color
# colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown']
# Function: Plot stats of all models
def plot_models_nfold(respaths, modelnames, statname = 'RMSE'):
dfstats = []
for respath in respaths:
dfstats.append(pd.read_csv(os.path.join(respath,f'{settings.name_target}_nfold_summary_stats.csv')))
fig, ax = plt.subplots(1,1, figsize = (6,2))
# plot bars for each model side by side
count = len(modelnames)
width = 1/(count+1)
mid = width * (count-1)/2
# set color cycle
ax.set_prop_cycle(color = ['tab:blue', 'tab:blue', 'tab:red', 'tab:red', 'tab:green', 'tab:green', 'tab:orange', 'tab:orange'])
for df, modelname, idx in zip(dfstats, modelnames, range(0, len(modelnames))):
hatch = '///' if 'GP' in modelname else ''
# Other hatch options: https://matplotlib.org/3.1.1/gallery/shapes_and_collections/hatch_style_reference.html
ax.bar(df.nfold - mid + idx*width, df[statname], label = modelname + ' model',
alpha = 0.6, width = width, edgecolor = 'k', linewidth = 1, hatch = hatch)
# df.plot(y=statname, x = 'nfold', kind = 'bar', title = f'{statname} for {modelname}', ax=ax)
ax.set_xlim(0.5, settings.nfold+0.5)
ax.set_xticks(range(1,settings.nfold+1))
ax.set_xticklabels(range(1,settings.nfold+1))
ax.set_xlabel('X-validation fold')
ax.set_title(f'{statname} for all models')
ax.legend()
plt.savefig(os.path.join(settings.outpath, f'results_{statname}_nfold.png'))
plt.show()
# Function: Plot stats of all models
def plot_models(respaths, modelnames, statname = 'RMSE'):
dfstats = []
for respath in respaths:
dfstats.append(pd.read_csv(os.path.join(respath,f'{settings.name_target}_nfold_summary_stats.csv')))
fig, ax = plt.subplots(1,1, figsize = (4,2), dpi=100)
# set color cycle
ax.set_prop_cycle(color = ['tab:blue', 'tab:blue', 'tab:red', 'tab:red'])
df_out = []
for df, modelname, idx in zip(dfstats, modelnames, range(0, len(modelnames))):
hatch = '///' if 'GP' in modelname else ''
df_mean = df.mean()
df_std = df.std()
df_out.append(pd.DataFrame({modelname + ' mean': df_mean, modelname + ' std':df_std}).drop('nfold'))
ax.bar(idx, df_mean[statname], yerr = df_std[statname], label = modelname,
alpha = 0.6, edgecolor = 'k', linewidth = 1, hatch = hatch)
df_out = pd.concat(df_out, axis = 1)
df_out.to_csv(os.path.join(settings.outpath, 'results_stats.csv'))
# Grid line for y = 0
ax.set_xticks(range(0,len(modelnames)))
ax.set_xticklabels(modelnames, ha = 'center')
ax.set_ylabel(statname)
ax.set_title(f'{statname} for all models')
if (ax.get_ylim()[0]) < 0:
ax.axhline(y=0, color='k', linestyle='--', lw=1)
fig.subplots_adjust(hspace=0.4, wspace=0.2, left=0.15, right=0.95, bottom=0.12, top=0.88)
# Set background color to transparent
# fig.patch.set_facecolor('none')
# ax.patch.set_facecolor('none')
plt.savefig(os.path.join(settings.outpath, f'results_{statname}.png'), dpi=300, transparent=True)
plt.show()
respaths = [os.path.join(settings.outpath, f'Xval_{settings.nfold}-fold_rf_{settings.name_target}'),
os.path.join(settings.outpath, f'Xval_{settings.nfold}-fold_rf-gp_{settings.name_target}'),
os.path.join(settings.outpath, f'Xval_{settings.nfold}-fold_blr_{settings.name_target}'),
os.path.join(settings.outpath, f'Xval_{settings.nfold}-fold_blr-gp_{settings.name_target}'),
]
modelnames = ['RF',
'RF-GP',
'BLR',
'BLR-GP'
]
# Statistics of folds
plot_models_nfold(respaths, modelnames, statname = 'RMSE')
plot_models_nfold(respaths, modelnames, statname = 'R2')
plot_models(respaths, modelnames, statname = 'RMSE')
plot_models(respaths, modelnames, statname = 'R2')
plot_models(respaths, modelnames, statname = 'LCCC')
plot_models(respaths, modelnames, statname = 'Theta')
respaths_gp = respaths[1:4:2]
modelnames_gp = modelnames[1:4:2]
plot_models(respaths_gp, modelnames_gp, statname = 'amplitude')
plot_models(respaths_gp, modelnames_gp, statname = 'noise_std')
plot_models(respaths_gp, modelnames_gp, statname = 'lengthscale_z')
plot_models(respaths_gp, modelnames_gp, statname = 'lengthscale_xy')
plot_models(respaths_gp, modelnames_gp, statname = 'LogL')