options(warn=-1)
# Function for batch processing spectral data
# Input: Path to a directory containing all .spc files
# Output: A CSV file named 'spectra.csv' which contains all averaged spectra, where each row represents the average absorbance values for a specific sample, and each column corresponds to the absorbance value at a given wavenumber
batch_process_spc_files <- function(directory) {
# Load required library
if (!require(hyperSpec, quietly = TRUE)) {
stop("Package 'hyperSpec' is required but is not installed.")
}
if (!require(progress, quietly = TRUE)) {
stop("Package 'progress' is required but is not installed.")
}
if (!require(plotly, quietly = TRUE)) {
stop("Package 'plotly' is required but is not installed.")
}
# Get all .spc files in the directory
files <- list.files(directory, pattern = "\\.spc$", full.names = TRUE)
# Initialize an empty data frame to store results
result_df <- data.frame()
# Initialize progress bar
pb <- progress_bar$new(format = " Processing [:bar] :percent in :elapsed secs", total = length(files), clear = FALSE, width = 60)
# Iterate over each .spc file
for (file in files) {
# Update progress bar
#pb$tick()
# Read the .spc file
spc_data <- read.spc(file, log.txt = FALSE)
# Get the spectrum data and calculate the average
avg_spectrum <- colMeans(spc_data@data$spc)
# Append the averaged spectrum to the results data frame
result_df <- rbind(result_df, avg_spectrum)
rownames(result_df)[nrow(result_df)] <- basename(file)
colnames(result_df) <- spc_data@wavelength
}
# Create the output file path in the same directory
output_csv <- file.path(directory, "spectra.csv")
# Save the result to a .csv file
write.csv(result_df, file = output_csv, row.names = TRUE)
message("Output saved to: ", output_csv)
# Calculate mean and standard deviation for each wavenumber
mean_spectrum <- colMeans(result_df)
sd_spectrum <- apply(result_df, 2, sd)
# Create a data frame for plotting
plot_data <- data.frame(
Wavenumber = as.numeric(colnames(result_df)),
Mean = mean_spectrum,
SD = sd_spectrum
)
# Use plotly to plot the mean and standard deviation
p <- plot_ly(plot_data, x = ~Wavenumber) %>%
add_trace(y = ~Mean, type = 'scatter', mode = 'lines', name = 'Mean', line = list(color = '#0077b6', width = 4)) %>%
add_ribbons(ymin = ~Mean - SD, ymax = ~Mean + SD, name = 'Mean ± SD', fillcolor = 'rgba(255, 0, 0, 0.2)', line = list(color = 'transparent')) %>%
layout(
title = 'Spectra Mean and Standard Deviation',
xaxis = list(title = 'Wavenumber'),
yaxis = list(title = 'Absorbance'),
showlegend = FALSE
)
# Display the plot
return(p)
}
# Usage
# Assuming .spc files are located in "./spc_files" directory
batch_process_spc_files("./spc_demo/")
## Output saved to: ./spc_demo//spectra.csv
It could be used for: 1) geting the first/second derivative of spectra to correct for potential baseline drift; 2) smoothing the spectra for noise and perturbations removal; 3) both
# Function for Savitzky-Golay filter
# Input:
# 1) input_csv: A spectra.csv file that contains all spectra. Each row represents the average for each sample, and each column represents the absorbance value at each wavenumber.
# 2) m: The derivative order. m = 0 means only smoothing, without computing the derivative.
# 3) p: The polynomial order for fitting. Commonly set to 2 or 3.
# 4) w: The window size for the smoothing. Larger window sizes give more smoothing.
# Output: A filtered spectra CSV file saved in the same directory and a plots showing the mean and standard deviation of spectra before/after filtering.
sg_smooth_plot <- function(input_csv, m = 2, p = 2, w = 7) {
# Load required libraries
if (!require(plotly, quietly = TRUE)) {
stop("Package 'plotly' is required but is not installed.")
}
if (!require(prospectr, quietly = TRUE)) {
stop("Package 'prospectr' is required but is not installed.")
}
# Read the .csv file
spectra_data <- read.csv(input_csv, row.names = 1)
if (any(grepl("^X", names(spectra_data)))) {
names(spectra_data) <- gsub("^X", "", names(spectra_data))
}
spectra_data <- spectra_data[, (names(spectra_data) > 1000 & names(spectra_data) < 3000) & !(names(spectra_data) >= 1585 & names(spectra_data) <= 1700)]
# Apply Savitzky-Golay smoothing to each row
smoothed_data <- t(apply(spectra_data, 1, function(x) {
savitzkyGolay(x, m = m, p = p, w = w)
}))
# Remove the first and last w/2 columns due to NA values after smoothing
half_window <- floor(w / 2)
smoothed_data <- smoothed_data[, (half_window + 1):(ncol(smoothed_data) - half_window)]
# Save the smoothed data to a new CSV file
output_csv <- file.path(dirname(input_csv), paste0("smoothed_m", m, "_p", p, "_w", w, ".csv"))
write.csv(smoothed_data, file = output_csv, row.names = TRUE)
# Calculate the mean and standard deviation of spectra before Savitzky-Golay filters
plot_data <- data.frame(
Wavenumber = as.numeric(colnames(spectra_data)), # wavenumbers,
Mean = colMeans(spectra_data, na.rm = TRUE),
SD = apply(spectra_data, 2, sd, na.rm = TRUE)
)
# Use plotly to plot the mean and standard deviation of spectra before Savitzky-Golay filters
p_original <- plot_ly(plot_data, x = ~Wavenumber) %>%
add_trace(y = ~Mean, type = 'scatter', mode = 'lines', name = 'Mean', line = list(color = '#0077b6', width = 4)) %>%
add_ribbons(ymin = ~Mean - SD, ymax = ~Mean + SD, name = 'Mean ± SD', fillcolor = 'rgba(255, 0, 0, 0.2)', line = list(color = 'transparent')) %>%
layout(
title = 'Spectra before Savitzky-Golay filters',
xaxis = list(title = 'Wavenumber'),
yaxis = list(title = 'Absorbance'),
showlegend = FALSE
)
# Calculate the mean and standard deviation of spectra after Savitzky-Golay filters
plot_data <- data.frame(
Wavenumber = as.numeric(colnames(smoothed_data)), # wavenumbers,
Mean = colMeans(smoothed_data, na.rm = TRUE),
SD = apply(smoothed_data, 2, sd, na.rm = TRUE)
)
# Use plotly to plot the mean and standard deviation of spectra after Savitzky-Golay filters
p_smoothed <- plot_ly(plot_data, x = ~Wavenumber) %>%
add_trace(y = ~Mean, type = 'scatter', mode = 'lines', name = 'Mean', line = list(color = '#0077b6', width = 4)) %>%
add_ribbons(ymin = ~Mean - SD, ymax = ~Mean + SD, name = 'Mean ± SD', fillcolor = 'rgba(255, 0, 0, 0.2)', line = list(color = 'transparent')) %>%
layout(
title = 'Spectra after Savitzky-Golay filters',
xaxis = list(title = 'Wavenumber'),
yaxis = list(title = 'Absorbance'),
showlegend = FALSE
)
# Return the plot
return(list(original_plot = p_original, smoothed_plot = p_smoothed))
}
# Usage
# Assuming the input file is ./spc_demo/spectra.csv and m = 0, p = 2, w = 7
sg_smooth_plot("./spc_demo/spectra.csv", m = 1, p = 3, w = 5)$original_plot
sg_smooth_plot("./spc_demo/spectra.csv", m = 1, p = 3, w = 5)$smoothed_plot
\[ GH = \sqrt{\frac{(x - \mu)^T S^{-1} (x - \mu)}{nPC}} \]
# Function for identification of potential outliers using global Mahalanobis distance
# Input:
# 1) input_csv: A spectra.csv file that contains all spectra. Each row represents the average for each sample, and each column represents the absorbance value at each wavenumber.
# 2) threshold: The threshold factor for determining outliers based on Mahalanobis distance.
# Output: A filtered spectra CSV file and a plot showing outliers and non-outliers.
remove_outliers_pca_gh <- function(input_csv, threshold = 5) {
# Load required libraries
if (!require(FactoMineR, quietly = TRUE)) {
stop("Package 'FactoMineR' is required but is not installed.")
}
if (!require(MASS, quietly = TRUE)) {
stop("Package 'MASS' is required but is not installed.")
}
if (!require(plotly, quietly = TRUE)) {
stop("Package 'plotly' is required but is not installed.")
}
# Read the .csv file
spectra_data <- read.csv(input_csv, row.names = 1)
if (any(grepl("^X", names(spectra_data)))) {
names(spectra_data) <- gsub("^X", "", names(spectra_data))
}
spectra_data <- spectra_data[, (names(spectra_data) > 1000 & names(spectra_data) < 3000) & !(names(spectra_data) >= 1585 & names(spectra_data) <= 1700)]
wavenumbers <- as.numeric(colnames(spectra_data))
# Perform initial PCA to determine the number of principal components
initial_pca_result <- PCA(spectra_data, graph = FALSE, ncp = ncol(spectra_data))
eigenvalues <- initial_pca_result$eig[, 1]
nPC <- sum(eigenvalues > 1)
# Perform PCA with the determined number of principal components
pca_result <- PCA(spectra_data, graph = FALSE, ncp = nPC)
pc_scores <- pca_result$ind$coord[, 1:nPC]
mean_pc_scores <- colMeans(pc_scores)
cov_matrix <- cov(pc_scores)
if (det(cov_matrix) == 0) {
cov_matrix <- cov_matrix + diag(1e-6, nrow = ncol(cov_matrix))
}
# Calculate GH values for each spectrum
gh_values <- apply(pc_scores, 1, function(x) {
diff <- x - mean_pc_scores
sqrt(t(diff) %*% solve(cov_matrix) %*% diff / ncol(pc_scores))
})
# Remove spectra with GH values greater than the threshold
filtered_data <- spectra_data[gh_values <= threshold, ]
# Save the filtered data to a new CSV file
output_csv <- file.path(dirname(input_csv), "spectra_removed_outlier_gh.csv")
write.csv(filtered_data, file = output_csv, row.names = TRUE)
# Use plotly to plot the spectra, differentiating outliers and non-outliers by color
p <- plot_ly()
for (i in 1:nrow(spectra_data)) {
spectrum <- as.numeric(spectra_data[i, ])
color <- ifelse(gh_values[i] > threshold, '#bc4749', '#b5e2fa')
p <- p %>% add_trace(x = wavenumbers, y = spectrum, type = 'scatter', mode = 'lines', name = rownames(spectra_data)[i], line = list(color = color))
}
p <- layout(p, title = 'Spectra with Outliers Highlighted (red)',
xaxis = list(title = 'Wavenumber'),
yaxis = list(title = 'Absorbance'),
showlegend = FALSE)
# Return the plot
return(p)
}
# Usage
# Assuming the input file is ./spc_demo/spectra.csv and threshold = 3
remove_outliers_pca_gh("./spc_demo/spectra.csv", threshold = 3)