Setup codes

##load the required packages
library(ggplot2)
library(baseline)
library(splus2R)

#customize and define the root_dir to access the downloaded repo
#root_dir = "./"
code_dir <- paste0(root_dir,"source_codes/") 
data_dir <- paste0(root_dir,"input_data/")
output_dir <- paste0(root_dir,"output/")

setwd(code_dir)
#run the following commands to load SPA-STOCSY's functions
{ 
  source("align.r")
  source("baseline_removal.r")
  source("Noise.estimate.r")
  source("PQ_norm_signal.r")
  source("Gen.spa.group.noints.r")
  source("Cor.stat.r")
  source("cor_landscape.r")
  source("norm_kern.r")
  source("Epane_kern.r")
  source("tri_cube_kern.r")
  source("Landscape_cut.r")
  source("Clust.member.r")
  source("pred.strength.r")
  source("Gen_co_member.r")
  source("Gen.ct.r")
  source("Select.gam.r")
  source("plot_pred.streth.r")
  source("Ints_avg_mod_highd.R")
  source("myImagePlot2.r")
  source("gen.J.couple.highd.r")
  source("peak_pick_local_max.R")
  source("combine_cluster_peak.r")
  source("gen_summary_profile.r")
  source("gen_SPA_peaks.r")
  source("gen_stocsy_corr_clusters.r")
  source("Build_new.r")
  source("FindMeta.r")
  source("Auto_dect_one.r")
  source("PkinReg.r")
  source("clus.r")
  source("clus_region.r")
  source("Peak_simple.r")
  
}

Load input data

Set file path

setwd(data_dir)
filename = "htt_c12.csv" #change to user's filename
htt.dat <- read.csv(filename, header = T) 

Extract ppm and adjust data format

ppm0 <- htt.dat[,1] #chemical shift (ppm)
dat0 <- t(htt.dat[, -1]) #spectra data
avg <- apply(dat0, 2, mean) #average spectrum
## [1] "Head of data:"
##              [,1]        [,2]        [,3]        [,4]        [,5]
## C1211 0.000248516 0.000266609 0.000282267 0.000296669 0.000314481
## C1212 0.000306765 0.000324108 0.000343583 0.000364306 0.000388333
## C1213 0.000310101 0.000321215 0.000335582 0.000353444 0.000377573
## C1214 0.000240847 0.000247723 0.000256431 0.000268619 0.000286862
## C1215 0.000166313 0.000170369 0.000180503 0.000199383 0.000230108
## [1] "Raw data loaded."

Raw data preprocessing steps:

Samples alignment:

region = c(-.05, .05)
ppm = ppm0
##find alignment marker
ali.mark =  ppm[which.min(abs(ppm))]
dat = dat0

##use Align function
ares <- Align(region, dat, ppm, ali.mark)
ahtt <- ares$tdat
appm <- ares$appm
avg.c <- apply(ahtt, 1, mean)

# set new standard: 
ppm0 = appm #aligned chemical shift values
dat0 = ahtt  # aligned NMR spectra
## [1] "Alignment results:"

Baseline correction

dat <- t(dat0) 
ppm <- ppm0 

dat.bl.rm <- baseline_removal(dat)
dat.bl <- dat.bl.rm$dat.bl #baseline-corrected NMR spectra
avg.bl <- apply(dat.bl, 2, mean) 
## [1] "Baseline corretion results:"

Keep informative spectra in 0.5-4.0ppm

dat.bl1 <- dat.bl
dat.bl1 <- dat.bl1[, which(ppm < 4 & ppm >0.5)]
ppm1 <- ppm[which(ppm < 4 & ppm >0.5)]

dim(dat.bl1) 
length(ppm1)
## [1]   10 2788
## [1] 2788

Noise estimation

nos.est <- Noise.est(dat.bl, pos.standar = c(0.08, 0.58), ppm)
nos <- nos.est * 5
nos #estimated noise threshold
## [1] 2.245626e-05
#red line is noise cutting line
qplot(ppm[which(ppm>0.5)], avg.bl[which(ppm>0.5)], geom = "line") + 
  geom_hline(yintercept = nos,col="red")+
  xlab("Chemical shift (ppm)")+
  ylab("Intensity (a.u.)")+
  scale_x_reverse()

Quotient normalization

qut_dat.bl1 <- PQ_norm_signal(t(dat.bl), method = "median", ppm, c(0.08, 0.58))
quot.bl1 <- qut_dat.bl1$quot
quot.final.bl1 <- qut_dat.bl1$quot.final
quot.final.bl1
##  [1] 1.0908776 1.1271445 1.0033805 0.7905787 1.0578980 0.7606097 0.9779892
##  [8] 0.9225305 1.1185942 0.9939858
dat_pqnorm.bl1 <- matrix(NA, dim(dat.bl1)[1], dim(dat.bl1)[2])
for(i in 1 : length(quot.final.bl1)){
    dat_pqnorm.bl1[i, ] <- dat.bl1[i, ] / quot.final.bl1[i]
}

# set noise to 0
dat_pqnorm.bl2 <- dat_pqnorm.bl1 
dat_pqnorm.bl2[which(dat_pqnorm.bl2 < nos)] <- 0

SPA process:

PACF

dat = dat_pqnorm.bl1 
ppm = ppm1 
avg.d1 <- apply(dat_pqnorm.bl1, 2, mean)

pacf(avg.d1) #estimate the optimal window size

## [1] "Take pacf result as reference and set k."
## [1] "Here k is set to 7."
k=7 #window size

Parameters for SPA

#find the detailed parameter information in the source code
span = 3
a1 = 0.8
a2 = 0.999
n.gam = 50
b2 = 0.99
kern.type = "tri_cube"
k = 7
n.tr <- dim(dat)[1]/2
n.cv <- 10

##set up a path to save output from SPA function
path=output_dir

Run SPA

ptm <- proc.time()
grp.res <- Gen.spa.clus(dat, ppm, k, method="mean", a1, a2, n.gam, n.tr, n.cv, 
                        b2, kern.type,auto_gam=TRUE,gam=0.8,path)
proc.time() - ptm
##    user  system elapsed 
##  20.232  13.012  33.250
#save out the SPA results for further reference
saveRDS(grp.res,paste(output_dir,"SPA_rds.rds",sep=""))
#grp.res=readRDS(paste(output_dir,"SPA_rds.rds",sep=""))
#cluster member assigned to every ppm point
clus.mem <- grp.res$clus.mem

#separation of ppm points into clusters and background 
land.cut <- grp.res$land.cut #2 for metabolite signals, 0 for background noise

##boundary label for every cluster
bond <- grp.res$bond

##chosen optimal threshold for the correlation landscape
gam <- grp.res$gam
## [1] "Total cluster numbers: "
max(clus.mem)
## [1] 50
## [1] "SPA results visualized in the spectra:blue zones are the separations among metabolite clusters (red)"

STOCSY process:

Prepare input for STOCSY

###For high-dim cases, instead of taking the mean of peak intensities within a cluster as the input for STOCSY, we keep all the peaks within a cluster as STOCSY's input
ints_highd <- Ints_avg_mod_highd(clus.mem, dat,nos,step=1)
ints = ints_highd$clus_peak
peak_clus = ints_highd$peak_clus
dim(ints) 
length(peak_clus) 

dat.spa <- ints 
cor.dat.spa <- cor(ints) #the correlations are calculated among peaks from different clusters
dim(ints) 
dim(cor.dat.spa)
range(cor.dat.spa)
## [1]  10 132
## [1] 132
## [1]  10 132
## [1] 132 132
## [1] -0.9290018  1.0000000
myImagePlot2(cor.dat.spa)

Heatmap for correlated clusters

#set threshold for correlation among clusters
#Here the threshold is set as 0.8

a.8 <- cor.dat.spa
a.8[a.8 < 0.8] <- 0

myImagePlot2(a.8)

Summarize and visulize correlated clusters

##Have highly correlated clusters in list
##The correlation between every two clusters are determined by the maximum/mean peak-to-peak correlation within the two clusters
a8 <- gen.J.couple(cor.dat.spa, .8,peak_clus,method="max")

Peak detection

Local maximum peak detection

pks = peak_local_max(dat,nos,step=1)
length(pks)
## [1] 122
plot(ppm,avg.d1,type="l")
points(ppm[pks],avg.d1[pks],col="red")

pk.ppm = ppm[pks]

Summarize clusters and peaks

cluster_peak=combine_cluster_peak(clus.mem,ppm,pk.ppm)
a8_summary=gen_summary_profile(cor_clus=a8,cluster_peak)

Automatic metabolites identification

Prepare inputs from SPA-STOCSY results

##summarize peak lists in every SPA cluster
clus_mem=gen_SPA_peaks(profile=a8_summary,cluster_peak)
write.csv(clus_mem,paste(output_dir,"SPA_cluster_peaks.csv",sep=""))

##summarize highly-correlated clusters
spa_stocsy_corr=gen_stocsy_corr(profile=a8_summary)
write.csv(spa_stocsy_corr, paste(output_dir,"STOCSY_corr_clusters.csv",sep = ""))

Prepare reference library

##load reference library data
setwd(data_dir)
filename = "ref_final_lib.csv"

# Read data
Chx_lib <- read.csv(filename, header = T)
Chx_lib <- as.data.frame(Chx_lib)
Chx_lib = Chx_lib[,-1]
colnames(Chx_lib) <- c("meta_name", "clus_ct")
meta_name <- as.character(unique(Chx_lib$meta_name))

#clus_ct: cluster central chemical shift value
head(Chx_lib)
##                    meta_name clus_ct
## 1       1,3-Dihydroxyacetone 3.56759
## 2       1,3-Dihydroxyacetone 4.41130
## 3          1,3-Dimethylurate 3.30829
## 4          1,3-Dimethylurate 3.43921
## 5 1,6-Anhydro-beta-D-Glucose 3.52645
## 6 1,6-Anhydro-beta-D-Glucose 3.52645
##shift to set upper and lower boundaries for each cluster
##the ul and ll are implemented to improve SPA-STOCSY's robustness to chemical shift variations
ppm_shift_thre = 0.025
new_lib <- Build_new(meta_name, ppm_shift_thre)
head(new_lib)
##                    meta_name clus_ct      ul      ll
## 1       1,3-Dihydroxyacetone 3.56759 3.59259 3.54259
## 2       1,3-Dihydroxyacetone  4.4113  4.4363  4.3863
## 3          1,3-Dimethylurate 3.30829 3.33329 3.28329
## 4          1,3-Dimethylurate 3.43921 3.46421 3.41421
## 5 1,6-Anhydro-beta-D-Glucose 3.52645 3.55145 3.50145
## 6 1,6-Anhydro-beta-D-Glucose 3.67457 3.69957 3.64957
##Load previous saved SPA_cluster_peaks.csv and STOCSY_corr_clusters.csv

setwd(output_dir)
clus_mem <- read.csv("SPA_cluster_peaks.csv", header = T)

col_name=NULL
for(nc in 1:(ncol(clus_mem)-1)){
  col_name=c(col_name,paste("pk",nc,sep=""))
}
colnames(clus_mem)=c("clus_id",col_name)

head(clus_mem) #chemical shift values for the identified peaks in each cluster
##   clus_id     pk1     pk2     pk3     pk4 pk5 pk6 pk7 pk8 pk9 pk10 pk11 pk12
## 1       1 3.97810      NA      NA      NA  NA  NA  NA  NA  NA   NA   NA   NA
## 2       2 3.95432 3.94055 3.92428 3.91301  NA  NA  NA  NA  NA   NA   NA   NA
## 3       3 3.90050      NA      NA      NA  NA  NA  NA  NA  NA   NA   NA   NA
## 4       4 3.88548 3.86044 3.84667 3.82790  NA  NA  NA  NA  NA   NA   NA   NA
## 5       5 3.77157      NA      NA      NA  NA  NA  NA  NA  NA   NA   NA   NA
## 6       6      NA      NA      NA      NA  NA  NA  NA  NA  NA   NA   NA   NA
spa_stocsy_corr <- read.csv('STOCSY_corr_clusters.csv', header = T)
spa_stocsy_corr=spa_stocsy_corr[,-1]
head(spa_stocsy_corr) #each row is one high-correlated group, with the highly-correlated cluster ids
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1  1 14 17 NA NA NA NA NA NA  NA  NA  NA
## 2  2 NA NA NA NA NA NA NA NA  NA  NA  NA
## 3  3  4  7 13 14 NA NA NA NA  NA  NA  NA
## 4  4  5  7  9 13 14 17 20 32  36  39  42
## 5  5  9 11 13 14 17 20 32 34  36  39  42
## 6  6  9 10 20 NA NA NA NA NA  NA  NA  NA

Automatic identification across all groups

##Load the metabolite list with one cluster in 0.5 to 4.0 ppm
##used when single_filter = TRUE in FindMeta()
##Limit the identification uncerntainty for single-cluster metabolites
single_meta = read.csv(paste0(data_dir,"chx_meta_with_single_clus.csv"))
single_meta=as.character(single_meta$x)
length(single_meta)
## [1] 54
#set detection rate threshold
d.thre = 0.55
now <- proc.time()
cand_res <- FindMeta(new_lib, spa_stocsy_corr, clus_mem, d.thre, single_filter = TRUE, single_meta = single_meta)
proc.time()- now
##    user  system elapsed 
##  22.812   0.036  22.848
all_nam <- unique(cand_res$all_name)
all_nam=all_nam[!(all_nam%in%c("No metabolite","No quanlified candidate"))]

write.csv(all_nam,paste(output_dir,"auto_identify_metabolites_highd.csv",sep=""))
## 150 metabolites are automatically identified with the detection rate set as 0.55