Food Scanner - Soil Data¶
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pylab as pl
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from IPython.display import display
from module_utils.utils import describe_plus, r2, quantile, plot_dist, plot_mean_std
from module_utils.model_training import plot_mae_and_predict, train_model,\
compute_losses_reg, compute_losses_train_test_reg
from module_utils.scoreRegression import scoreClassif, scoreClassifier
# Global notebook settings: seaborn's default theme for all plots,
# silence library warnings, and cap NumPy print precision at 2 decimals.
sns.set()
warnings.filterwarnings('ignore')
np.set_printoptions(precision=2)
Import File¶
# Load the merged soil/food dataset and give the six nutrient result
# columns human-readable names (vegetable + nutrient + unit).
df = pd.read_csv('Soil-Food.csv', sep=',')
_nutrient_labels = {
    "proteinMgPer100g": "protein (mg/100g)",
    "polyphenolsMgGae100gFw": "polyphenols (mg/100g)",
    "antioxidentsFrap": "antioxidants (µmol/100g)",
}
df.rename(columns={f"{veg}.veg_results.data.{raw}": f"{veg.capitalize()} {label}"
                   for veg in ("carrots", "spinach")
                   for raw, label in _nutrient_labels.items()},
          inplace=True)
# Peek at a few raw spectral columns (carrots vs spinach wavelengths).
df_preview_sample = df[["carrots.carrotscan.median_500", "carrots.carrotscan.median_530",
                        "spinach.spinachscan.median_850", "spinach.spinachscan.median_880"]].copy(deep=True)
df_preview_sample.head()
carrots.carrotscan.median_500 | carrots.carrotscan.median_530 | spinach.spinachscan.median_850 | spinach.spinachscan.median_880 | |
---|---|---|---|---|
0 | NaN | NaN | 40.661500 | 43.661500 |
1 | NaN | NaN | 40.747533 | 43.373233 |
2 | 4.640765 | 6.752740 | NaN | NaN |
3 | 2.274004 | 5.964625 | NaN | NaN |
4 | NaN | NaN | NaN | NaN |
# Preview the renamed nutrient target columns.
_target_cols = ["Carrots protein (mg/100g)", "Carrots polyphenols (mg/100g)",
                "Carrots antioxidants (µmol/100g)", "Spinach protein (mg/100g)",
                "Spinach antioxidants (µmol/100g)"]
df_preview_target = df[_target_cols].copy(deep=True)
df_preview_target.head()
Carrots protein (mg/100g) | Carrots polyphenols (mg/100g) | Carrots antioxidants (µmol/100g) | Spinach protein (mg/100g) | Spinach antioxidants (µmol/100g) | |
---|---|---|---|---|---|
0 | NaN | NaN | NaN | 2.18 | 294.49 |
1 | NaN | NaN | NaN | 4.28 | 168.33 |
2 | 1.70 | 2.38 | 14.43 | NaN | NaN |
3 | 1.94 | 3.62 | 20.65 | NaN | NaN |
4 | NaN | NaN | NaN | NaN | NaN |
# Joint regression plot: copper at the top soil layer vs the lower layer,
# with the r2 statistic annotated.
# NOTE(review): columns are named 0_15 / 15_30 (centimetres?) but the axis
# labels say 0-6" / 6-12" — confirm the depth units against the data source.
sns.jointplot(x='soil_0_15_Cu', y='soil_15_30_Cu', data=df, kind='reg', stat_func=r2, height=8)
plt.subplots_adjust(top=1.2)  # leave room for the title above the joint grid
plt.xlabel('Copper in soil, 0-6" depths', fontsize=16)
plt.ylabel('Copper in soil, 6-12" depths', fontsize=16)
_ = plt.title('Copper in soil, 0 – 6” and 6 to 12” depths', fontsize=16, weight='bold')
Soil potassium and soil respiration¶
# Joint regression plot: potassium at the top soil layer vs the lower layer.
# NOTE(review): same depth-unit mismatch as the copper plot (0_15/15_30 vs inches).
sns.jointplot(x='soil_0_15_K', y='soil_15_30_K', data=df, stat_func=r2, kind='reg', height=8)
plt.subplots_adjust(top=1.2)  # leave room for the title above the joint grid
plt.xlabel('Potassium in soil, 0-6" depth', fontsize=16)
plt.ylabel('Potassium in soil, 6-12" depth', fontsize=16)
_ = plt.title('Potassium in soil, 0-6" and 6-12" depths', fontsize=16, weight='bold')
Soil carbon and soil respiration¶
# Total organic carbon vs soil respiration for the 'soil3' sample set.
sns.jointplot(x='soil3.display_loi.data.Total Organic C %', y='soil3.display_respiration.data.ugc_gsoil', data=df, stat_func=r2, kind="reg", height=8)
plt.subplots_adjust(top=1.2)  # leave room for the title above the joint grid
plt.xlabel('Total Organic Carbon', fontsize=16)
plt.ylabel('Soil Respiration', fontsize=16)
_ = plt.title('Soil carbon by soil respiration', fontsize=16, weight='bold')
# Same carbon-vs-respiration plot for the 'soil9' sample set.
sns.jointplot(x='soil9.display_loi.data.Total Organic C %', y='soil9.display_respiration.data.ugc_gsoil', data=df, stat_func=r2, kind="reg", height=8)
plt.subplots_adjust(top=1.2)  # leave room for the title above the joint grid
plt.xlabel('Total Organic Carbon', fontsize=16)
plt.ylabel('Soil Respiration', fontsize=16)
_ = plt.title('Soil carbon by soil respiration', fontsize=16, weight='bold')
Polyphenols and antioxidants¶
We created a dataframe with the total amount of polyphenols and antioxidants (in carrots and spinach) to see if there's a correlation.
# Combine carrot + spinach totals for polyphenols and antioxidants,
# then check whether the two nutrients are correlated.
_poly_total = df['Carrots polyphenols (mg/100g)'].fillna(0) + df['Spinach polyphenols (mg/100g)'].fillna(0)
_anti_total = df['Carrots antioxidants (µmol/100g)'].fillna(0) + df['Spinach antioxidants (µmol/100g)'].fillna(0)
df_poly_anti = pd.concat([_poly_total, _anti_total], axis=1)
# exact zeros only arise where both vegetables were missing -> restore NaN
df_poly_anti.replace(0.00, np.nan, inplace=True)
df_poly_anti.rename(columns={0: "Polyphenols", 1: "Antioxidants"}, inplace=True)
sns.jointplot(x='Polyphenols', y='Antioxidants', data=df_poly_anti, stat_func=r2, kind='reg', height=8)
plt.subplots_adjust(top=1.2)
plt.xlabel('Polyphenols', fontsize=16)
plt.ylabel('Antioxidants', fontsize=16)
_ = plt.title('Antioxidants by Polyphenols, carrots and spinach', fontsize=16, weight='bold')
Antioxidants and soil respiration¶
We create a dataframe with the total amount of antioxidants and soil respiration to see if they are correlated.
# Combine soil respiration (soil3 + soil9) with total antioxidants and
# check for a correlation, as stated in the text above.
# FIX: the original summed the 'display_loi...Total Organic C %' columns
# (organic carbon) while labelling them "Soil Respiration"; use the actual
# respiration columns (ugc_gsoil), as plotted in the carbon/respiration cells.
df_anti_soilresp = pd.concat([df['soil3.display_respiration.data.ugc_gsoil'].fillna(0) + df['soil9.display_respiration.data.ugc_gsoil'].fillna(0),
                              df['Carrots antioxidants (µmol/100g)'].fillna(0) + df['Spinach antioxidants (µmol/100g)'].fillna(0)], axis=1)
# exact zeros only arise where all contributing values were missing -> restore NaN
df_anti_soilresp.replace(0.00, np.nan, inplace=True)
df_anti_soilresp.rename(columns={0: "Soil Respiration", 1: "Antioxidants"}, inplace=True)
sns.jointplot(x='Soil Respiration', y='Antioxidants', data=df_anti_soilresp, stat_func=r2, kind='reg', height=8)
plt.subplots_adjust(top=1.2)
plt.xlabel('Soil Respiration', fontsize=16)
plt.ylabel('Antioxidants', fontsize=16)
_ = plt.title('Antioxidants by Soil Respiration, carrots and spinach', fontsize=16, weight='bold')
Model Algorithm¶
The available data we have to predict the nutritional value of carrots and spinach are essentially quantitative variables, which is why we turn to a regression model. We'll use RandomForest Regressor and Linear Regression. Then we will direct the project towards a classification problem, by training the model to classify vegetables (carrots and spinach) as rich or poor in nutrients. For this we will use the Random Forest Classifier as classification algorithm. The goal is to compare the performances of the regression model and the classification model.
Variables to predict¶
Now we are going to visualize our independent variables (those to predict), according to carrots and spinach:
- polyphenol (mg/100g)
- protein (mg/100g)
- antioxidant (µmol/100g)
# group the target variables (the 3 nutrients per vegetable) and plot the distribution
df_target_carrots = df[['Carrots polyphenols (mg/100g)', 'Carrots protein (mg/100g)', 'Carrots antioxidants (µmol/100g)']]
df_target_spinach = df[['Spinach polyphenols (mg/100g)', 'Spinach protein (mg/100g)', 'Spinach antioxidants (µmol/100g)']]
# plot histogram for each columns of the dataframe
plot_dist(df_target_carrots)
plot_dist(df_target_spinach)
Features¶
Then, we create a dataframe with the spectral data of the Bionutrient Meter Beta coming from carrots and spinach (10 wavelengths from 365 nm to 940 nm). It is only on this data that we will work for the moment to predict the nutritional values of carrots and spinach.
Carrots¶
# Carrot spectral features: the 10 Bionutrient Meter wavelengths (365-940 nm).
# Build the column list and the rename mapping from one wavelength list, and
# chain .rename instead of an inplace rename on the df[[...]] slice — the
# original pattern relies on the slice being a copy and triggers
# SettingWithCopyWarning (hidden here by filterwarnings).
_CARROT_WAVELENGTHS = [365, 385, 450, 500, 530, 587, 632, 850, 880, 940]
df_carrotscan = df[[f"carrots.carrotscan.median_{w}" for w in _CARROT_WAVELENGTHS]].rename(
    columns={f"carrots.carrotscan.median_{w}": str(w) for w in _CARROT_WAVELENGTHS})
# Extended descriptive statistics of the carrot nutrient targets.
df_target_carrots_describe = describe_plus(df_target_carrots)
display(df_target_carrots_describe)
count | mean | std | min | max | 25% | 50% | 75% | |
---|---|---|---|---|---|---|---|---|
Carrots polyphenols (mg/100g) | 532.0 | 4.41 | 3.46 | -1.42 | 27.39 | 2.41 | 3.52 | 5.24 |
Carrots protein (mg/100g) | 528.0 | 1.92 | 1.08 | 0.01 | 9.37 | 1.34 | 1.70 | 2.19 |
Carrots antioxidants (µmol/100g) | 527.0 | 48.18 | 44.87 | -0.02 | 281.82 | 19.76 | 32.93 | 61.83 |
# Mean ± standard deviation of the carrot spectra, per wavelength.
plot_mean_std(df_carrotscan, 'Wavelength in nm', 'Absorbance', 'Mean and standard deviation of spectral data from carrots')
Spinach¶
# Spinach spectral features: same 10 wavelengths as the carrot scan.
# As for the carrots, derive column names from one wavelength list and use a
# chained .rename rather than an inplace rename on a df[[...]] slice
# (SettingWithCopyWarning-prone, hidden by filterwarnings).
_SPINACH_WAVELENGTHS = [365, 385, 450, 500, 530, 587, 632, 850, 880, 940]
df_spinachscan = df[[f"spinach.spinachscan.median_{w}" for w in _SPINACH_WAVELENGTHS]].rename(
    columns={f"spinach.spinachscan.median_{w}": str(w) for w in _SPINACH_WAVELENGTHS})
# Extended descriptive statistics of the spinach nutrient targets.
df_target_spinach_describe = describe_plus(df_target_spinach)
display(df_target_spinach_describe)
count | mean | std | min | max | 25% | 50% | 75% | |
---|---|---|---|---|---|---|---|---|
Spinach polyphenols (mg/100g) | 108.0 | 50.04 | 37.10 | 4.31 | 331.76 | 26.69 | 52.10 | 64.53 |
Spinach protein (mg/100g) | 100.0 | 5.90 | 6.20 | 0.84 | 26.16 | 3.38 | 4.14 | 4.79 |
Spinach antioxidants (µmol/100g) | 108.0 | 383.38 | 398.82 | 25.06 | 2974.90 | 56.56 | 272.32 | 647.82 |
# Mean ± standard deviation of the spinach spectra, per wavelength.
plot_mean_std(df_spinachscan, 'Wavelength in nm', 'Absorbance', 'Mean and standard deviation of spectral data from spinach')
Carrot sample - Training models¶
We are going to use the spectral data from the Bionutrient Meter (10 wavelengths from 365nm to 940nm) from our carrots samples to feed our learning algorithms. First we are going to use 2 regression algorithms : Linear Regression and Random Forest Regressor. Then we will use a classification algorithm : Random Forest Classifier.
For each of our learning algorithms we are going to compute metrics to determine which of them is the most performant on this specific data set.
# remove all missing values from the carrots dataframe:
# join spectra with nutrient targets, keep only complete rows
df_carrots_raw = pd.concat([df_carrotscan, df_target_carrots], axis=1 ,sort=False)
df_carrots_noNA = df_carrots_raw.dropna()
Random Forest Regressor¶
We use a Random Forest Regressor model to try to predict the nutritional value of carrots in terms of polyphenol, protein and antioxidant based on spectral data.
# Independent variables
x = df_carrots_noNA[df_carrots_noNA.columns[:-3]].values
# variables to predict (the 3 nutrients)
y = df_carrots_noNA[df_carrots_noNA.columns[-3:]].values
# We split the dataset into training set and testing set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)
print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)
Training Features Shape: (352, 10) Training Labels Shape: (352, 3) Testing Features Shape: (151, 10) Testing Labels Shape: (151, 3)
# Define the model: 1000-tree random forest, fixed seed for reproducibility
RF_regressor = RandomForestRegressor(n_estimators=1000, random_state=0)
# Train model with Random Forest Regressor on the carrot training split
train_model(RF_regressor, x_train, y_train)
Here we compute the loss metrics of the Random Forest Regressor algorithm on carrots testing set. So we obtain a dataframe including mae, mse, rmse, r2 and Score classifier metrics. We display the errors made by the model and the predicted values compared to the observed values.
Then we display two plots: the first is to visualize the error made by the model on the 3 nutrients. The second is to see the distribution of the predicted values compared to the observed values.
# Evaluate the random forest on the held-out carrot test set
# (MAE, MSE, RMSE, R2 and the classifier-style score).
resume_metrics_rfr = compute_losses_reg(RF_regressor, x_test, y_test)
# y-axis labels for the three predicted nutrients
nutriments_units = ['Polyphenols (mg/100g)', 'Protein (mg/100g)', 'Antioxidant (µmol/100g)']
# visualize per-nutrient MAE and predicted-vs-observed distributions
plot_mae_and_predict(model=RF_regressor, x=x_test, y=y_test, ylabel=nutriments_units)
Linear Regression¶
We use a Linear Regression model to try to predict the nutritional value of carrots in terms of polyphenol, protein and antioxidant based on spectral data.
# Ordinary least-squares baseline on the same carrot split.
lin_regression = LinearRegression()
train_model(lin_regression, x_train, y_train)
# same metrics as for the random forest, for a fair comparison
resume_metrics_lin = compute_losses_reg(lin_regression, x_test, y_test)
# per-nutrient MAE and predicted-vs-observed plots
plot_mae_and_predict(model=lin_regression, x=x_test, y=y_test, ylabel=nutriments_units)
Random Forest Classifier¶
Then we use Random Forest Classifier algorithm, not to predict the exact nutritional value of vegetables, but to determine whether a carrot (and spinach) is rich or poor in nutrients. In order for the model to be able to classify, we establish a threshold which is the median of the nutrient values (32.73 for the antioxidant). Thus, a value below the threshold will be of class 0 (poor) and 1 for vegetables being above the threshold (rich).
# Minimally prepare dataset for classification.
# Features: the spectral wavelengths (all columns but the last 3).
x = df_carrots_noNA[df_carrots_noNA.columns[:-3]].values
# Single target: the last column (antioxidants), binarized by quantile()
# at the median — 0 = poor, 1 = rich (the classifier takes one output).
y = quantile(df_carrots_noNA[df_carrots_noNA.columns[-1:]].values)
# Same 70/30 split and seed as the regression experiments.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)
for _label, _arr in (('Training Features', x_train), ('Training Labels', y_train),
                     ('Testing Features', x_test), ('Testing Labels', y_test)):
    print(f'{_label} Shape:', _arr.shape)
Training Features Shape: (352, 10) Training Labels Shape: (352, 1) Testing Features Shape: (151, 10) Testing Labels Shape: (151, 1)
# Define the model
# NOTE(review): unlike the regressors, no random_state is set here, so the
# classifier score varies between runs — consider fixing the seed.
classifier = RandomForestClassifier(n_estimators=1000)
train_model(classifier, x_train, y_train)
# predict whether the carrot is rich or poor in nutrient
# We get a score by looking where the model was right, how many times has the model correctly classified
score_classifier = scoreClassifier(classifier, x_test, y_test)
Comparison of the 3 learning algorithms on carrots dataset¶
# Report performances: one column per model, one row per metric.
df_resume_performance_carrot = pd.DataFrame.from_dict({
    'Linear Regression': resume_metrics_lin,
    'RandomForest Regressor': resume_metrics_rfr,
    'RandomForest Classifier': np.nan
})
# Single-step .loc assignment: the original chained form
# df[col].loc[row] = value assigns through an intermediate object, which
# warns today and does not write back under pandas copy-on-write.
df_resume_performance_carrot.loc['Score Classifier', 'RandomForest Classifier'] = score_classifier
# Blank out the metrics that do not apply to the classifier column.
df_resume_performance_carrot.fillna('', inplace=True)
df_resume_performance_carrot
Linear Regression | RandomForest Regressor | RandomForest Classifier | |
---|---|---|---|
MAE | 11.071481 | 11.647704 | |
MSE | 591.216511 | 646.383389 | |
RMSE | 24.314944 | 25.424071 | |
R2 | 0.148209 | 0.068728 | |
Score Classifier | 0.540839 | 0.518764 | 0.609272 |
We compare our 3 models to see if they can determine whether a carrot is rich in nutrients or not (polyphenols, protein, antioxidant). Our RandomForest Regressor obtains a R2 of 0.06 and our Linear Regression does better by obtaining 0.14 at the R2.
By comparing the results of the three models based on the Score Classifier, we find that better performance is obtained using a RandomForest Classifier to predict whether a carrot will be good or bad in nutritional value.
Concerning the Score Classifier metric, we create a function that measures the performance of our regression algorithms on a classification problem. We tried to determine how well our regression model had predicted the nutritional values, by classifying the carrots as being poor or rich in nutrients (polyphenol, antioxidant, protein). For this, we established a threshold based on the 50th quantile of the observed values.
Spinach sample - Training models¶
We are going to proceed with the same steps as before but this time on data from spinach, that is to say use the spectral data for the learning of our regression and classification models, then evaluate them and finally compare their performance. For each of our learning algorithms we are going to compute metrics to determine which of them is the most performant on this specific data set.
# remove all missing values from the spinach dataframe:
# join spectra with nutrient targets, keep only complete rows
df_spinach_raw = pd.concat([df_spinachscan, df_target_spinach], axis=1 ,sort=False)
df_spinach_noNA = df_spinach_raw.dropna()
Random Forest Regressor¶
# Spinach features (spectral wavelengths) and the 3 nutrient targets.
x = df_spinach_noNA[df_spinach_noNA.columns[:-3]].values
y = df_spinach_noNA[df_spinach_noNA.columns[-3:]].values
# 70/30 split with the same seed as the carrot experiments.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)
for _label, _arr in (('Training Features', x_train), ('Training Labels', y_train),
                     ('Testing Features', x_test), ('Testing Labels', y_test)):
    print(f'{_label} Shape:', _arr.shape)
Training Features Shape: (68, 10) Training Labels Shape: (68, 3) Testing Features Shape: (30, 10) Testing Labels Shape: (30, 3)
# 1000-tree forest, fixed seed for reproducibility.
RF_regressor = RandomForestRegressor(n_estimators=1000, random_state=42)
train_model(RF_regressor, x_train, y_train)
# Train vs test metrics side by side, to gauge over/under-fitting on the
# small spinach sample; errors and predicted-vs-observed are plotted below.
resume_metrics_rfr = compute_losses_train_test_reg(model=RF_regressor,
                                                   x_train=x_train, y_train=y_train,
                                                   x_test=x_test, y_test=y_test)
resume_metrics_rfr.rename(columns={"Train": "RForest Regressor Train",
                                   "Test": "RForest Regressor Test"}, inplace=True)
# per-nutrient MAE and predicted-vs-observed plots on the test set
plot_mae_and_predict(model=RF_regressor, x=x_test, y=y_test, ylabel=nutriments_units)
Linear Regression¶
# Linear-regression baseline on the spinach split.
lin_regression = LinearRegression()
train_model(lin_regression, x_train, y_train)
# Train vs test metrics side by side, same layout as the forest's.
resume_metrics_lin = compute_losses_train_test_reg(model=lin_regression,
                                                   x_train=x_train, y_train=y_train,
                                                   x_test=x_test, y_test=y_test)
resume_metrics_lin.rename(columns={"Train": "Linear Regression Train", "Test": "Linear Regression Test"}, inplace=True)
# per-nutrient MAE and predicted-vs-observed plots on the test set
plot_mae_and_predict(model=lin_regression, x=x_test, y=y_test, ylabel=nutriments_units)
Random Forest Classifier¶
# Features: the spinach spectral wavelengths.
x = df_spinach_noNA[df_spinach_noNA.columns[:-3]].values
# Single target: antioxidants (last column), binarized at the median by quantile().
y = quantile(df_spinach_noNA[df_spinach_noNA.columns[-1:]].values)
# Same 70/30 split and seed as everywhere else.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)
for _label, _arr in (('Training Features', x_train), ('Training Labels', y_train),
                     ('Testing Features', x_test), ('Testing Labels', y_test)):
    print(f'{_label} Shape:', _arr.shape)
Training Features Shape: (68, 10) Training Labels Shape: (68, 1) Testing Features Shape: (30, 10) Testing Labels Shape: (30, 1)
# Define the model
# NOTE(review): no fixed random_state — the classifier score varies between runs.
classifier = RandomForestClassifier(n_estimators=1000)
train_model(classifier, x_train, y_train)
# predict whether the spinach is rich or poor in nutrient
# We get a score by looking where the model was right, how many times has the model correctly classified
score_classifier = scoreClassifier(classifier, x_test, y_test)
Comparison of the 3 learning algorithms on spinach dataset¶
# Assemble the spinach comparison table: train/test regression metrics side
# by side, plus the classifier accuracy in the last row of its own column.
df_resume_performance_spinach = pd.concat([resume_metrics_lin, resume_metrics_rfr], axis=1)
df_resume_performance_spinach.insert(4, "Random Forest Classifier", [np.nan] * 4 + [score_classifier])
# Blank out the metrics that do not apply to the classifier column.
df_resume_performance_spinach.fillna('', inplace=True)
df_resume_performance_spinach
Linear Regression Train | Linear Regression Test | RForest Regressor Train | RForest Regressor Test | Random Forest Classifier | |
---|---|---|---|---|---|
MAE | 69.964061 | 100.900334 | 31.061594 | 86.980749 | |
MSE | 19235.321732 | 64166.645762 | 3746.876220 | 26405.740931 | |
RMSE | 138.691462 | 253.311361 | 61.211733 | 162.498434 | |
R2 | 0.405606 | -1.286898 | 0.884217 | 0.058900 | |
Score Classifier | 0.622549 | 0.655556 | 0.848039 | 0.655556 | 0.633333 |
Regarding the difference in results between the training set and the testing set for Linear Regression and Random Forest Regressor on the spinach data set, we can assume that we do not have enough data to allow the algorithms to understand the data, and therefore the models fail to generalize.