Tutorial for multiple postpartum disease prediction with ensemble machine learning

Dan Lin, Jessica McArt

Dec 2024

Figure: workflow of the ensemble pipeline.

Usage: python main.py (expects spc.csv in the working directory)
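
The script expects spc.csv to contain one numeric-named column per spectral wavenumber (that is how preprocess_data in main.py detects them), the static columns milkweightlbs, parity, and cells, and a group label taking the values health, met, or mast. Below is a minimal sketch for generating a synthetic spc.csv to smoke-test the pipeline; the sample count, wavenumber grid, and value ranges are all made up.

In [ ]:
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_samples, n_wavenumbers = 30, 300  # hypothetical sizes

# Spectral columns are named by wavenumber so that
# col.replace('.', '').isdigit() in preprocess_data() picks them up
wavenumbers = np.linspace(1000.0, 3000.0, n_wavenumbers)
df = pd.DataFrame(rng.random((n_samples, n_wavenumbers)),
                  columns=[f"{w:.1f}" for w in wavenumbers])

# Static covariates and the label column expected by preprocess_data()
df["milkweightlbs"] = rng.uniform(40, 120, n_samples)
df["parity"] = rng.integers(1, 5, n_samples)
df["cells"] = rng.integers(10, 1000, n_samples)
df["group"] = rng.choice(["health", "met", "mast"], n_samples)

df.to_csv("spc.csv", index=False)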

main.py

In [ ]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from models import create_xgboost_model, create_mlp_model, create_lstm_model
from evaluation import evaluate_ensemble

# Hide all GPUs so TensorFlow runs on CPU only
tf.config.set_visible_devices([], 'GPU')

# Data preprocessing function
def preprocess_data(df):
    # Separate features and labels: spectral columns are named by their
    # wavenumber (purely numeric), which distinguishes them from the static columns
    spectral_cols = [col for col in df.columns if col.replace('.', '').isdigit()]
    static_cols = ['milkweightlbs', 'parity', 'cells']
    X_spectral = df[spectral_cols].values
    X_static = df[static_cols].values
    y = df['group'].values
    
    # Min-max scaling to [0, 1], with separate scalers for spectral and static features
    spectral_scaler = MinMaxScaler()
    static_scaler = MinMaxScaler()
    X_spectral_scaled = spectral_scaler.fit_transform(X_spectral)
    X_static_scaled = static_scaler.fit_transform(X_static)
    
    # Label encoding
    label_map = {'health': 0, 'met': 1, 'mast': 2}
    y_encoded = np.array([label_map[label] for label in y])
    
    return X_spectral_scaled, X_static_scaled, y_encoded

# Data splitting function
def split_data(X_spectral, X_static, y):
    # First hold out 15% as the validation set (used only for the final ensemble evaluation)
    X_spectral_temp, X_spectral_val, X_static_temp, X_static_val, y_temp, y_val = train_test_split(
        X_spectral, X_static, y, test_size=0.15, stratify=y, random_state=42
    )
    
    # Then split out the test set: 0.176 ≈ 0.15/0.85, giving a 70/15/15 train/test/validation split
    X_spectral_train, X_spectral_test, X_static_train, X_static_test, y_train, y_test = train_test_split(
        X_spectral_temp, X_static_temp, y_temp, test_size=0.176, stratify=y_temp, random_state=42
    )
    
    return (X_spectral_train, X_static_train, y_train,
            X_spectral_test, X_static_test, y_test,
            X_spectral_val, X_static_val, y_val)

# Oversampling function
def oversample_data(X_spectral, X_static, y):
    # Randomly duplicate minority-class samples until all classes are balanced
    ros = RandomOverSampler(random_state=42)
    
    # Combine features for resampling
    X_combined = np.hstack((X_spectral, X_static))
    X_resampled, y_resampled = ros.fit_resample(X_combined, y)
    
    # Split back into spectral and static features
    n_spectral = X_spectral.shape[1]
    X_spectral_resampled = X_resampled[:, :n_spectral]
    X_static_resampled = X_resampled[:, n_spectral:]
    
    return X_spectral_resampled, X_static_resampled, y_resampled

if __name__ == "__main__":
    # Read data
    df = pd.read_csv('spc.csv')
    
    # Preprocess data
    X_spectral, X_static, y = preprocess_data(df)
    
    # Split data
    (X_spectral_train, X_static_train, y_train,
     X_spectral_test, X_static_test, y_test,
     X_spectral_val, X_static_val, y_val) = split_data(X_spectral, X_static, y)
    
    # Oversample training set
    X_spectral_train_ros, X_static_train_ros, y_train_ros = oversample_data(
        X_spectral_train, X_static_train, y_train
    )
    
    # Train XGBoost
    X_train_combined = np.hstack((X_spectral_train_ros, X_static_train_ros))
    X_test_combined = np.hstack((X_spectral_test, X_static_test))
    xgb_model = create_xgboost_model()
    xgb_model.fit(X_train_combined, y_train_ros)
    
    # Train MLP (the 'test' split is only used for epoch-level monitoring;
    # the held-out validation split is reserved for the final ensemble evaluation)
    mlp_model = create_mlp_model((X_train_combined.shape[1],))
    mlp_model.fit(X_train_combined, y_train_ros,
                  validation_data=(X_test_combined, y_test),
                  epochs=50, batch_size=32, verbose=1)
    
    # Train LSTM (spectra are a 1-D sequence, so add a trailing channel
    # axis to give the 3D input Keras expects: samples x wavenumbers x 1)
    X_spectral_train_lstm = X_spectral_train_ros[..., np.newaxis]
    X_spectral_test_lstm = X_spectral_test[..., np.newaxis]
    lstm_model = create_lstm_model(
        (X_spectral_train.shape[1], 1),
        (X_static_train.shape[1],)
    )
    lstm_model.fit(
        [X_spectral_train_lstm, X_static_train_ros],
        y_train_ros,
        validation_data=([X_spectral_test_lstm, X_static_test], y_test),
        epochs=3, batch_size=32, verbose=1
    )
    
    # Evaluate the majority-vote ensemble on the held-out validation set
    # (evaluate_ensemble combines the features internally)
    ensemble_pred, class_accuracies = evaluate_ensemble(
        xgb_model, mlp_model, lstm_model,
        X_spectral_val, X_static_val, y_val
    )
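
main.py does not save the trained models; if you want to reuse them without retraining, here is a minimal sketch using the standard xgboost and Keras persistence APIs (file names are arbitrary, and the .keras format assumes TensorFlow 2.12 or newer):

In [ ]:
import xgboost as xgb
from tensorflow import keras

# Save (run after main.py has trained the three models)
xgb_model.save_model('xgb_model.json')
mlp_model.save('mlp_model.keras')
lstm_model.save('lstm_model.keras')

# Load later without retraining
xgb_loaded = xgb.XGBClassifier()
xgb_loaded.load_model('xgb_model.json')
mlp_loaded = keras.models.load_model('mlp_model.keras')
lstm_loaded = keras.models.load_model('lstm_model.keras')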

models.py

In [ ]:
from tensorflow import keras
import xgboost as xgb

def create_lstm_model(spectral_shape, static_shape, num_classes=3):
    # Spectral input: an (n_wavenumbers, 1) sequence fed through stacked LSTMs
    spectral_input = keras.layers.Input(shape=spectral_shape)
    lstm_out = keras.layers.LSTM(64, return_sequences=True)(spectral_input)
    lstm_out = keras.layers.LSTM(32)(lstm_out)
    
    # Static variables input
    static_input = keras.layers.Input(shape=static_shape)
    static_dense = keras.layers.Dense(16, activation='relu')(static_input)
    static_dense = keras.layers.Dense(8, activation='relu')(static_dense)
    
    # Concatenate the spectral and static branches
    merged = keras.layers.Concatenate()([lstm_out, static_dense])

    dense = keras.layers.Dense(32, activation='relu')(merged)
    dense = keras.layers.Dropout(0.3)(dense)
    output = keras.layers.Dense(num_classes, activation='softmax')(dense)

    model = keras.Model(inputs=[spectral_input, static_input], outputs=output)
    model.compile(optimizer='adam',
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
    
    return model

def create_mlp_model(input_shape, num_classes=3):
    model = keras.Sequential([
        keras.layers.Input(shape=input_shape),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(num_classes, activation='softmax')
    ])
    
    model.compile(optimizer='adam',
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
    
    return model

def create_xgboost_model():
    return xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=3,
        learning_rate=0.1,
        max_depth=6,
        n_estimators=100,
        random_state=42
    )
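
The builders can be sanity-checked on their own; below is a quick sketch with hypothetical sizes (300 wavenumbers and 3 static features, matching the synthetic data example above):

In [ ]:
# Shape check with made-up dimensions
lstm = create_lstm_model(spectral_shape=(300, 1), static_shape=(3,))
mlp = create_mlp_model(input_shape=(303,))  # 300 spectral + 3 static columns
xgb_clf = create_xgboost_model()
lstm.summary()
mlp.summary()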

evaluation.py

In [ ]:
import numpy as np
from sklearn.metrics import accuracy_score

def evaluate_ensemble(xgb_model, mlp_model, lstm_model, 
                     X_spectral, X_static, y_true):
    # Get hard-label predictions from each model
    X_combined = np.hstack((X_spectral, X_static))
    xgb_pred = xgb_model.predict(X_combined)
    
    mlp_pred = np.argmax(mlp_model.predict(X_combined), axis=1)
    # The LSTM expects 3D input: samples x wavenumbers x 1
    lstm_pred = np.argmax(
        lstm_model.predict([X_spectral[..., np.newaxis], X_static]), axis=1
    )
    
    # Ensemble prediction by majority vote; on a three-way tie,
    # np.bincount(...).argmax() falls back to the lowest class index
    predictions = np.array([xgb_pred, mlp_pred, lstm_pred])
    ensemble_pred = np.apply_along_axis(
        lambda x: np.bincount(x).argmax(), 
        axis=0, 
        arr=predictions
    )
    
    # Per-class accuracy, i.e., recall: accuracy restricted to samples of that class
    classes = [0, 1, 2]  # health, met, mast
    class_accuracies = {}
    for c in classes:
        mask = y_true == c
        if np.any(mask):
            acc = accuracy_score(y_true[mask], ensemble_pred[mask])
            class_accuracies[c] = acc
    
    # Print detailed results
    label_map = {0: 'health', 1: 'met', 2: 'mast'}
    print("\nDetailed Predictions:")
    for i in range(len(y_true)):
        print(f"Sample {i+1}:")
        print(f"XGBoost: {label_map[xgb_pred[i]]}")
        print(f"MLP: {label_map[mlp_pred[i]]}")
        print(f"LSTM: {label_map[lstm_pred[i]]}")
        print(f"Ensemble prediction: {label_map[ensemble_pred[i]]}")
        print(f"True label: {label_map[y_true[i]]}\n")
    
    print("\nClass-wise Accuracies:")
    for c in classes:
        print(f"{label_map[c]}: {class_accuracies[c]:.3f}")
    
    return ensemble_pred, class_accuracies
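
One note on the voting rule: when all three models disagree, np.bincount(x).argmax() returns the lowest class index, so a three-way tie resolves to 'health' (class 0). A tiny illustration with synthetic votes:

In [ ]:
import numpy as np

# Rows are models (XGBoost, MLP, LSTM); columns are samples
votes = np.array([[0, 1, 2],
                  [1, 1, 0],
                  [2, 1, 2]])
ensemble = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=votes)
print(ensemble)  # [0 1 2] -- the first sample is a three-way tie, so class 0 ('health') wins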