Tutorial for milk FTIR spectra pre-processing

Dan Lin, Jessica McArt

Dec 2024

Step 1: batch process all .spc files in a specific directory

options(warn=-1)
# Function for batch processing spectral data
# Input: Path to a directory containing all .spc files
# Output: A CSV file named 'spectra.csv' which contains all averaged spectra, where each row represents the average absorbance values for a specific sample, and each column corresponds to the absorbance value at a given wavenumber

batch_process_spc_files <- function(directory) {
    
  # Load required library
  if (!require(hyperSpec, quietly = TRUE)) {
    stop("Package 'hyperSpec' is required but is not installed.")
  }
  if (!require(progress, quietly = TRUE)) {
    stop("Package 'progress' is required but is not installed.")
  }
  if (!require(plotly, quietly = TRUE)) {
    stop("Package 'plotly' is required but is not installed.")
  }
  
  
  # Get all .spc files in the directory
  files <- list.files(directory, pattern = "\\.spc$", full.names = TRUE)
  
  # Initialize an empty data frame to store results
  result_df <- data.frame()
  
  # Initialize progress bar
  pb <- progress_bar$new(format = "  Processing [:bar] :percent in :elapsed secs", total = length(files), clear = FALSE, width = 60)
  
  # Iterate over each .spc file
  for (file in files) {
    # Update progress bar
    #pb$tick()
    
    # Read the .spc file
    spc_data <- read.spc(file, log.txt = FALSE)
    
    # Get the spectrum data and calculate the average
    avg_spectrum <- colMeans(spc_data@data$spc)
    
    # Append the averaged spectrum to the results data frame
    result_df <- rbind(result_df, avg_spectrum)
    rownames(result_df)[nrow(result_df)] <- basename(file)
    colnames(result_df) <- spc_data@wavelength
  }
  
  # Create the output file path in the same directory
  output_csv <- file.path(directory, "spectra.csv")
  
  # Save the result to a .csv file
  write.csv(result_df, file = output_csv, row.names = TRUE)
  
  message("Output saved to: ", output_csv)
    
  # Calculate mean and standard deviation for each wavenumber
  mean_spectrum <- colMeans(result_df)
  sd_spectrum <- apply(result_df, 2, sd)
  
  # Create a data frame for plotting
  plot_data <- data.frame(
    Wavenumber = as.numeric(colnames(result_df)),
    Mean = mean_spectrum,
    SD = sd_spectrum
  )
  
  # Use plotly to plot the mean and standard deviation
  p <- plot_ly(plot_data, x = ~Wavenumber) %>%
    add_trace(y = ~Mean, type = 'scatter', mode = 'lines', name = 'Mean', line = list(color = '#0077b6', width = 4)) %>%
    add_ribbons(ymin = ~Mean - SD, ymax = ~Mean + SD, name = 'Mean ± SD', fillcolor = 'rgba(255, 0, 0, 0.2)', line = list(color = 'transparent')) %>%
    layout(
      title = 'Spectra Mean and Standard Deviation',
      xaxis = list(title = 'Wavenumber'),
      yaxis = list(title = 'Absorbance'),
      showlegend = FALSE
    )
  
  # Display the plot
  return(p)
}
# Usage
# Assuming .spc files are located in "./spc_files" directory
batch_process_spc_files("./spc_demo/")
## Output saved to: ./spc_demo//spectra.csv

Step 2: Savitzky-Golay filters

It could be used for: 1) geting the first/second derivative of spectra to correct for potential baseline drift; 2) smoothing the spectra for noise and perturbations removal; 3) both

# Function for Savitzky-Golay filter
# Input: 
# 1) input_csv: A spectra.csv file that contains all spectra. Each row represents the average for each sample, and each column represents the absorbance value at each wavenumber.
# 2) m: The derivative order. m = 0 means only smoothing, without computing the derivative.
# 3) p: The polynomial order for fitting. Commonly set to 2 or 3.
# 4) w: The window size for the smoothing. Larger window sizes give more smoothing.
# Output: A filtered spectra CSV file saved in the same directory and a plots showing the mean and standard deviation of spectra before/after filtering.

sg_smooth_plot <- function(input_csv, m = 2, p = 2, w = 7) {
    
  # Load required libraries
  if (!require(plotly, quietly = TRUE)) {
    stop("Package 'plotly' is required but is not installed.")
  }
  if (!require(prospectr, quietly = TRUE)) {
    stop("Package 'prospectr' is required but is not installed.")
  }
  
  # Read the .csv file
  spectra_data <- read.csv(input_csv, row.names = 1)
  if (any(grepl("^X", names(spectra_data)))) {
    names(spectra_data) <- gsub("^X", "", names(spectra_data))
  }
  spectra_data <- spectra_data[, (names(spectra_data) > 1000 & names(spectra_data) < 3000) & !(names(spectra_data) >= 1585 & names(spectra_data) <= 1700)]
  
  # Apply Savitzky-Golay smoothing to each row
  smoothed_data <- t(apply(spectra_data, 1, function(x) {
    savitzkyGolay(x, m = m, p = p, w = w)
  }))
  
  # Remove the first and last w/2 columns due to NA values after smoothing
  half_window <- floor(w / 2)
  smoothed_data <- smoothed_data[, (half_window + 1):(ncol(smoothed_data) - half_window)]
    
  # Save the smoothed data to a new CSV file
  output_csv <- file.path(dirname(input_csv), paste0("smoothed_m", m, "_p", p, "_w", w, ".csv"))
  write.csv(smoothed_data, file = output_csv, row.names = TRUE)
  
  # Calculate the mean and standard deviation of spectra before Savitzky-Golay filters
  plot_data <- data.frame(
    Wavenumber = as.numeric(colnames(spectra_data)), # wavenumbers,
    Mean = colMeans(spectra_data, na.rm = TRUE),
    SD = apply(spectra_data, 2, sd, na.rm = TRUE)
  )
  
  # Use plotly to plot the mean and standard deviation of spectra before Savitzky-Golay filters
  p_original <- plot_ly(plot_data, x = ~Wavenumber) %>%
    add_trace(y = ~Mean, type = 'scatter', mode = 'lines', name = 'Mean', line = list(color = '#0077b6', width = 4)) %>%
    add_ribbons(ymin = ~Mean - SD, ymax = ~Mean + SD, name = 'Mean ± SD', fillcolor = 'rgba(255, 0, 0, 0.2)', line = list(color = 'transparent')) %>%
    layout(
      title = 'Spectra before Savitzky-Golay filters',
      xaxis = list(title = 'Wavenumber'),
      yaxis = list(title = 'Absorbance'),
      showlegend = FALSE
    )
  
  # Calculate the mean and standard deviation of spectra after Savitzky-Golay filters
  plot_data <- data.frame(
    Wavenumber = as.numeric(colnames(smoothed_data)), # wavenumbers,
    Mean = colMeans(smoothed_data, na.rm = TRUE),
    SD = apply(smoothed_data, 2, sd, na.rm = TRUE)
  )
  
  # Use plotly to plot the mean and standard deviation of spectra after Savitzky-Golay filters
  p_smoothed <- plot_ly(plot_data, x = ~Wavenumber) %>%
    add_trace(y = ~Mean, type = 'scatter', mode = 'lines', name = 'Mean', line = list(color = '#0077b6', width = 4)) %>%
    add_ribbons(ymin = ~Mean - SD, ymax = ~Mean + SD, name = 'Mean ± SD', fillcolor = 'rgba(255, 0, 0, 0.2)', line = list(color = 'transparent')) %>%
    layout(
      title = 'Spectra after Savitzky-Golay filters',
      xaxis = list(title = 'Wavenumber'),
      yaxis = list(title = 'Absorbance'),
      showlegend = FALSE
    )
  
  # Return the plot
  return(list(original_plot = p_original, smoothed_plot = p_smoothed))
}
# Usage
# Assuming the input file is ./spc_demo/spectra.csv and m = 0, p = 2, w = 7
sg_smooth_plot("./spc_demo/spectra.csv", m = 1, p = 3, w = 5)$original_plot
sg_smooth_plot("./spc_demo/spectra.csv", m = 1, p = 3, w = 5)$smoothed_plot

Step 3: GH/standardized Mahalanobis distance to identify potential outliers (Whitfield et al., 1987)

a. PCA is performed to define uncorrelated principal components (PC)
b. Calculate GH distance:

\[ GH = \sqrt{\frac{(x - \mu)^T S^{-1} (x - \mu)}{nPC}} \]

where x is the PC score of a spectrum; μ is the mean of PC scores estimated from the entire data set; S corresponds to the (co)variance matrix between PC scores estimated from the entire data set; nPC is the number of PC used in the calculation.
c. The potential outliers are defined if their GH are larger than a specific threshold, like 2,3,5
# Function for identification of potential outliers using global Mahalanobis distance
# Input: 
# 1) input_csv: A spectra.csv file that contains all spectra. Each row represents the average for each sample, and each column represents the absorbance value at each wavenumber.
# 2) threshold: The threshold factor for determining outliers based on Mahalanobis distance.
# Output: A filtered spectra CSV file and a plot showing outliers and non-outliers.

remove_outliers_pca_gh <- function(input_csv, threshold = 5) {
    
  # Load required libraries
  if (!require(FactoMineR, quietly = TRUE)) {
    stop("Package 'FactoMineR' is required but is not installed.")
  }
  if (!require(MASS, quietly = TRUE)) {
    stop("Package 'MASS' is required but is not installed.")
  }
  if (!require(plotly, quietly = TRUE)) {
    stop("Package 'plotly' is required but is not installed.")
  }
  
  # Read the .csv file
  spectra_data <- read.csv(input_csv, row.names = 1)
  if (any(grepl("^X", names(spectra_data)))) {
    names(spectra_data) <- gsub("^X", "", names(spectra_data))
  }
  spectra_data <- spectra_data[, (names(spectra_data) > 1000 & names(spectra_data) < 3000) & !(names(spectra_data) >= 1585 & names(spectra_data) <= 1700)]
  wavenumbers <- as.numeric(colnames(spectra_data))
  
  # Perform initial PCA to determine the number of principal components
  initial_pca_result <- PCA(spectra_data, graph = FALSE, ncp = ncol(spectra_data))
  eigenvalues <- initial_pca_result$eig[, 1]
  nPC <- sum(eigenvalues > 1)
  
  # Perform PCA with the determined number of principal components
  pca_result <- PCA(spectra_data, graph = FALSE, ncp = nPC)
  pc_scores <- pca_result$ind$coord[, 1:nPC]
  mean_pc_scores <- colMeans(pc_scores)
  cov_matrix <- cov(pc_scores)
  if (det(cov_matrix) == 0) {
    cov_matrix <- cov_matrix + diag(1e-6, nrow = ncol(cov_matrix))
  }
  
  # Calculate GH values for each spectrum
  gh_values <- apply(pc_scores, 1, function(x) {
    diff <- x - mean_pc_scores
    sqrt(t(diff) %*% solve(cov_matrix) %*% diff / ncol(pc_scores))
  })
  
  # Remove spectra with GH values greater than the threshold
  filtered_data <- spectra_data[gh_values <= threshold, ]
  
  # Save the filtered data to a new CSV file
  output_csv <- file.path(dirname(input_csv), "spectra_removed_outlier_gh.csv")
  write.csv(filtered_data, file = output_csv, row.names = TRUE)
  
  # Use plotly to plot the spectra, differentiating outliers and non-outliers by color
  p <- plot_ly()
  
  for (i in 1:nrow(spectra_data)) {
    spectrum <- as.numeric(spectra_data[i, ])
    color <- ifelse(gh_values[i] > threshold, '#bc4749', '#b5e2fa')
    p <- p %>% add_trace(x = wavenumbers, y = spectrum, type = 'scatter', mode = 'lines', name = rownames(spectra_data)[i], line = list(color = color))
  }
  
  p <- layout(p, title = 'Spectra with Outliers Highlighted (red)',
              xaxis = list(title = 'Wavenumber'),
              yaxis = list(title = 'Absorbance'),
              showlegend = FALSE)
  
  # Return the plot
  return(p)
}
# Usage
# Assuming the input file is ./spc_demo/spectra.csv and threshold = 3
remove_outliers_pca_gh("./spc_demo/spectra.csv", threshold = 3)

Reference for 3-Outlier identification:

  1. Whitfield, R G, Gerger M E, Sharp R L. Near-infra- red spectrum qualification via Mahalanobis distance determina- tion. Appl. Spectrosc. 1987, 41:1204–1213. https://doi.org/10.1366/0003702874447572.

  2. Soyeurt H, Grelet C, McParland S, et al. A comparison of 4 different machine learning algorithms to predict lactoferrin content in bovine milk from mid-infrared spectra. Journal of dairy science, 2020, 103(12): 11585-11596. https://doi.org/10.3168/jds.2020-18870.

  3. Ho P N, Luke T D W, Pryce J E. Validation of milk mid-infrared spectroscopy for predicting the metabolic status of lactating dairy cows in Australia. Journal of Dairy Science, 2021, 104(4): 4467-4477. https://doi.org/10.3168/jds.2020-19603.

  4. Lou W, Bonfatti V, Bovenhuis H, et al. Prediction of likelihood of conception in dairy cows using milk mid-infrared spectra collected before the first insemination and machine learning algorithms. Journal of Dairy Science, 2024. https://doi.org/10.3168/jds.2023-24621.