main.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
from tensorflow import keras
import tensorflow as tf
from models import create_xgboost_model, create_mlp_model, create_lstm_model
from evaluation import evaluate_ensemble
tf.config.set_visible_devices([], 'GPU')
# Data preprocessing function
def preprocess_data(df):
    """Turn the raw table into scaled feature arrays and integer labels.

    Columns whose name is made only of digits once every '.' is removed are
    taken as the spectral features; 'milkweightlbs', 'parity' and 'cells'
    are the static features; 'group' holds the class name.

    Each feature group is min-max scaled by its own ``MinMaxScaler``. The
    scalers are fitted on every row of ``df`` and are not returned, so a
    caller that splits the data afterwards has scaled all parts with
    statistics from the full table.

    Args:
        df: table with the columns described above.

    Returns:
        Tuple ``(spectral, static, labels)``: the two scaled feature arrays
        and the labels encoded as health -> 0, met -> 1, mast -> 2.

    Raises:
        KeyError: if a value in 'group' is not one of the three known names.
    """
    class_ids = {'health': 0, 'met': 1, 'mast': 2}
    static_names = ['milkweightlbs', 'parity', 'cells']

    # Pick out the numerically named columns, keeping their table order.
    spectral_names = []
    for name in df.columns:
        if name.replace('.', '').isdigit():
            spectral_names.append(name)

    raw_spectral = df[spectral_names].values
    raw_static = df[static_names].values
    raw_labels = df['group'].values

    # Independent min-max scaling for the two feature groups.
    scaled_spectral = MinMaxScaler().fit_transform(raw_spectral)
    scaled_static = MinMaxScaler().fit_transform(raw_static)

    # Map class names to integer ids.
    encoded = np.array([class_ids[name] for name in raw_labels])
    return scaled_spectral, scaled_static, encoded
# Data splitting function
def split_data(X_spectral, X_static, y, *, val_size=0.15, test_size=0.176,
               random_state=42):
    """Split features and labels into train, test and validation parts.

    Two stratified splits are applied in sequence: first ``val_size`` of all
    rows is held out as the validation part, then ``test_size`` of the
    *remaining* rows is held out as the test part. With the defaults the
    second step takes 0.176 of the remaining 85 %, i.e. roughly 15 % of the
    original rows.

    Args:
        X_spectral: spectral feature array, one row per sample.
        X_static: static feature array, one row per sample.
        y: class labels, used for stratification in both splits.
        val_size: ``test_size`` passed to the first ``train_test_split``.
        test_size: ``test_size`` passed to the second ``train_test_split``
            (relative to what is left after the validation hold-out).
        random_state: seed passed to both splits.

    Returns:
        A 9-tuple ordered as
        ``(X_spectral_train, X_static_train, y_train,
        X_spectral_test, X_static_test, y_test,
        X_spectral_val, X_static_val, y_val)``.
    """
    # First split out validation set
    (X_spectral_temp, X_spectral_val,
     X_static_temp, X_static_val,
     y_temp, y_val) = train_test_split(
        X_spectral, X_static, y,
        test_size=val_size, stratify=y, random_state=random_state,
    )
    # Then split out test set from what remains
    (X_spectral_train, X_spectral_test,
     X_static_train, X_static_test,
     y_train, y_test) = train_test_split(
        X_spectral_temp, X_static_temp, y_temp,
        test_size=test_size, stratify=y_temp, random_state=random_state,
    )
    return (X_spectral_train, X_static_train, y_train,
            X_spectral_test, X_static_test, y_test,
            X_spectral_val, X_static_val, y_val)
# Oversampling function
def oversample_data(X_spectral, X_static, y):
    """Resample both feature groups together with ``RandomOverSampler``.

    The two feature arrays are joined column-wise so each resampled row keeps
    its spectral and static parts together, then the result is cut back into
    the two groups at the original spectral width.

    Args:
        X_spectral: spectral feature array, one row per sample.
        X_static: static feature array, one row per sample.
        y: class labels for the rows.

    Returns:
        Tuple ``(X_spectral_resampled, X_static_resampled, y_resampled)``.
    """
    sampler = RandomOverSampler(random_state=42)

    # Resample on the joined matrix so both groups stay row-aligned.
    joined = np.hstack((X_spectral, X_static))
    joined_resampled, y_resampled = sampler.fit_resample(joined, y)

    # Cut the joined matrix back at the spectral/static column boundary.
    boundary = X_spectral.shape[1]
    spectral_part, static_part = np.hsplit(joined_resampled, [boundary])
    return spectral_part, static_part, y_resampled
if __name__ == "__main__":
    # End-to-end run: load the table, preprocess, split, oversample the
    # training part, fit the three models and report the majority vote.

    # Read data
    df = pd.read_csv('spc.csv')

    # Preprocess data
    X_spectral, X_static, y = preprocess_data(df)

    # Split data.
    # How the parts are used below: the "test" part is handed to Keras as
    # validation_data while fitting; the "val" part is not passed to any
    # fit call and is used only for the final ensemble evaluation.
    (X_spectral_train, X_static_train, y_train,
     X_spectral_test, X_static_test, y_test,
     X_spectral_val, X_static_val, y_val) = split_data(X_spectral, X_static, y)

    # Oversample training set only; test/val keep their original rows.
    X_spectral_train_ros, X_static_train_ros, y_train_ros = oversample_data(
        X_spectral_train, X_static_train, y_train
    )

    # Train XGBoost on spectral + static features joined column-wise
    X_train_combined = np.hstack((X_spectral_train_ros, X_static_train_ros))
    X_test_combined = np.hstack((X_spectral_test, X_static_test))
    xgb_model = create_xgboost_model()
    xgb_model.fit(X_train_combined, y_train_ros)

    # Train MLP on the same joined features
    mlp_model = create_mlp_model((X_train_combined.shape[1],))
    mlp_model.fit(X_train_combined, y_train_ros,
                  validation_data=(X_test_combined, y_test),
                  epochs=50, batch_size=32, verbose=1)

    # Train LSTM: spectral columns are declared as a sequence with one
    # feature per step, static columns go in through a second input.
    # NOTE(review): the arrays passed are 2-D (samples, n_spectral) while the
    # declared input is (n_spectral, 1); this relies on Keras adding the
    # trailing axis -- confirm for the Keras version in use.
    lstm_model = create_lstm_model(
        (X_spectral_train.shape[1], 1),
        (X_static_train.shape[1],)
    )
    lstm_model.fit(
        [X_spectral_train_ros, X_static_train_ros],
        y_train_ros,
        validation_data=([X_spectral_test, X_static_test], y_test),
        epochs=3, batch_size=32, verbose=1
    )

    # Evaluate ensemble on validation set (evaluate_ensemble joins the two
    # feature groups itself, so no combined matrix is built here).
    ensemble_pred, class_accuracies = evaluate_ensemble(
        xgb_model, mlp_model, lstm_model,
        X_spectral_val, X_static_val, y_val
    )
models.py
import tensorflow as tf
from tensorflow import keras
import xgboost as xgb
def create_lstm_model(spectral_shape, static_shape, num_classes=3):
    """Build and compile the two-input classifier.

    One branch runs the spectral input through two stacked LSTM layers
    (64 units returning the full sequence, then 32 units); the other runs
    the static input through two ReLU dense layers (16, then 8 units). The
    branch outputs are concatenated and passed through a 32-unit ReLU layer,
    dropout of 0.3 and a softmax layer with ``num_classes`` units.

    Args:
        spectral_shape: per-sample shape of the spectral input.
        static_shape: per-sample shape of the static input.
        num_classes: width of the softmax output layer.

    Returns:
        A ``keras.Model`` taking ``[spectral, static]`` inputs, compiled with
        Adam, sparse categorical cross-entropy and an accuracy metric.
    """
    layers = keras.layers

    # Spectral branch: stacked LSTMs.
    spectra_in = layers.Input(shape=spectral_shape)
    sequence = layers.LSTM(64, return_sequences=True)(spectra_in)
    spectra_features = layers.LSTM(32)(sequence)

    # Static branch: two small dense layers.
    static_in = layers.Input(shape=static_shape)
    static_features = static_in
    for width in (16, 8):
        static_features = layers.Dense(width, activation='relu')(static_features)

    # Join the branches and classify.
    joined = layers.Concatenate()([spectra_features, static_features])
    hidden = layers.Dense(32, activation='relu')(joined)
    hidden = layers.Dropout(0.3)(hidden)
    probabilities = layers.Dense(num_classes, activation='softmax')(hidden)

    network = keras.Model(inputs=[spectra_in, static_in], outputs=probabilities)
    compile_options = {
        'optimizer': 'adam',
        'loss': 'sparse_categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    network.compile(**compile_options)
    return network
def create_mlp_model(input_shape, num_classes=3):
    """Build and compile the fully connected classifier.

    Three hidden stages of 256, 128 and 64 ReLU units, each followed by
    batch normalization and dropout of 0.3, then a softmax layer with
    ``num_classes`` units.

    Args:
        input_shape: per-sample input shape, given to the first dense layer.
        num_classes: width of the softmax output layer.

    Returns:
        A ``keras.Sequential`` model compiled with Adam, sparse categorical
        cross-entropy and an accuracy metric.
    """
    stack = []
    for position, width in enumerate((256, 128, 64)):
        # Only the first layer declares the input shape.
        first_layer_args = {'input_shape': input_shape} if position == 0 else {}
        stack.append(
            keras.layers.Dense(width, activation='relu', **first_layer_args)
        )
        stack.append(keras.layers.BatchNormalization())
        stack.append(keras.layers.Dropout(0.3))
    stack.append(keras.layers.Dense(num_classes, activation='softmax'))

    network = keras.Sequential(stack)
    compile_options = {
        'optimizer': 'adam',
        'loss': 'sparse_categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    network.compile(**compile_options)
    return network
def create_xgboost_model():
    """Return an unfitted ``XGBClassifier`` with the project's fixed settings.

    The settings are a 3-class ``multi:softprob`` objective, learning rate
    0.1, maximum tree depth 6, 100 estimators and seed 42.
    """
    settings = {
        'objective': 'multi:softprob',
        'num_class': 3,
        'learning_rate': 0.1,
        'max_depth': 6,
        'n_estimators': 100,
        'random_state': 42,
    }
    return xgb.XGBClassifier(**settings)
evaluation.py
import numpy as np
from sklearn.metrics import accuracy_score
def evaluate_ensemble(xgb_model, mlp_model, lstm_model,
                      X_spectral, X_static, y_true):
    """Combine three models by majority vote and report per-class accuracy.

    XGBoost and the MLP predict from the spectral and static features joined
    column-wise; the LSTM model receives the two groups as separate inputs.
    The MLP and LSTM outputs are reduced to class ids with ``argmax``.

    The ensemble label of a sample is the most frequent of the three votes.
    When all three models disagree, the smallest class id wins (that is what
    ``np.bincount(...).argmax()`` returns on a tie).

    Per-sample predictions and per-class accuracies are printed. A class
    that does not occur in ``y_true`` has no accuracy: it is left out of the
    returned dict and printed as "n/a" instead of raising ``KeyError``.

    Args:
        xgb_model: object whose ``predict(X)`` returns class ids.
        mlp_model: object whose ``predict(X)`` returns per-class scores.
        lstm_model: object whose ``predict([X_spectral, X_static])`` returns
            per-class scores.
        X_spectral: spectral feature array, one row per sample.
        X_static: static feature array, one row per sample.
        y_true: true class ids (0 = health, 1 = met, 2 = mast).

    Returns:
        Tuple ``(ensemble_pred, class_accuracies)`` where ``ensemble_pred``
        is the array of voted class ids and ``class_accuracies`` maps each
        class id present in ``y_true`` to the fraction of its samples that
        the ensemble labelled correctly.
    """
    classes = [0, 1, 2]  # health, met, mast
    label_map = {0: 'health', 1: 'met', 2: 'mast'}
    # Accept lists as well as arrays; masks below need array semantics.
    y_true = np.asarray(y_true)

    # Get predictions from each model
    X_combined = np.hstack((X_spectral, X_static))
    xgb_pred = xgb_model.predict(X_combined)
    mlp_pred = np.argmax(mlp_model.predict(X_combined), axis=1)
    lstm_pred = np.argmax(lstm_model.predict([X_spectral, X_static]), axis=1)

    # Get ensemble prediction (majority voting); one column per sample.
    votes = np.array([xgb_pred, mlp_pred, lstm_pred])
    ensemble_pred = np.apply_along_axis(
        lambda column: np.bincount(column).argmax(),
        axis=0,
        arr=votes
    )

    # Calculate accuracy for each class that actually occurs in y_true.
    class_accuracies = {}
    for c in classes:
        mask = y_true == c
        if np.any(mask):
            # Every true label under the mask equals c, so the accuracy on
            # this subset is the share of ensemble votes equal to c.
            class_accuracies[c] = float(np.mean(ensemble_pred[mask] == c))

    # Print detailed results
    print("\nDetailed Predictions:")
    per_sample = zip(xgb_pred, mlp_pred, lstm_pred, ensemble_pred, y_true)
    for number, (from_xgb, from_mlp, from_lstm, voted, truth) in enumerate(
            per_sample, start=1):
        print(f"Sample {number}:")
        print(f"XGBoost: {label_map[from_xgb]}")
        print(f"MLP: {label_map[from_mlp]}")
        print(f"LSTM: {label_map[from_lstm]}")
        print(f"Ensemble prediction: {label_map[voted]}")
        print(f"True label: {label_map[truth]}\n")

    print("\nClass-wise Accuracies:")
    for c in classes:
        if c in class_accuracies:
            print(f"{label_map[c]}: {class_accuracies[c]:.3f}")
        else:
            # No sample of this class in y_true, so there is nothing to score.
            print(f"{label_map[c]}: n/a (no samples)")
    return ensemble_pred, class_accuracies