##load the required packages
library(ggplot2)
library(baseline)
library(msProcess)
library(splus2R)
#customize and define the root_dir to access the downloaded repo
#root_dir = "./"
code_dir <- paste0(root_dir,"source_codes/")
data_dir <- paste0(root_dir,"input_data/")
output_dir <- paste0(root_dir,"output/")
setwd(code_dir)
#run the following commands to load SPA-STOCSY's functions
{
source("align.r")
source("baseline_removal.r")
source("Noise.estimate.r")
source("PQ_norm_signal.r")
source("Gen.spa.group.noints.r")
source("Cor.stat.r")
source("cor_landscape.r")
source("norm_kern.r")
source("Epane_kern.r")
source("tri_cube_kern.r")
source("Landscape_cut.r")
source("Clust.member.r")
source("pred.strength.r")
source("Gen_co_member.r")
source("Gen.ct.r")
source("Select.gam.r")
source("plot_pred.streth.r")
source("Ints_avg_mod.r")
source("myImagePlot2.r")
source("gen.J.couple.r")
source("peak_pick_local_max.R")
source("combine_cluster_peak.r")
source("gen_summary_profile.r")
source("gen_SPA_peaks.r")
source("gen_stocsy_corr_clusters.r")
source("Build_new.r")
source("FindMeta.r")
source("Auto_dect_one.r")
source("PkinReg.r")
source("clus.r")
source("clus_region.r")
source("Peak_simple.r")
}
setwd(data_dir)
filename = "htt_c12.csv" #change to user's filename
htt.dat <- read.csv(filename, header = T)
ppm0 <- htt.dat[,1] #chemical shift (ppm)
dat0 <- t(htt.dat[, -1]) #spectra data
avg <- apply(dat0, 2, mean) #average spectrum
## [1] "Head of data:"
## [,1] [,2] [,3] [,4] [,5]
## C1211 0.000248516 0.000266609 0.000282267 0.000296669 0.000314481
## C1212 0.000306765 0.000324108 0.000343583 0.000364306 0.000388333
## C1213 0.000310101 0.000321215 0.000335582 0.000353444 0.000377573
## C1214 0.000240847 0.000247723 0.000256431 0.000268619 0.000286862
## C1215 0.000166313 0.000170369 0.000180503 0.000199383 0.000230108
## [1] "Raw data loaded."
region = c(-.05, .05)
ppm = ppm0
##find alignment marker
ali.mark = ppm[which.min(abs(ppm))]
dat = dat0
##use Align function
ares <- Align(region, dat, ppm, ali.mark)
ahtt <- ares$tdat
appm <- ares$appm
avg.c <- apply(ahtt, 1, mean)
# set new standard:
ppm0 = appm #aligned chemical shift values
dat0 = ahtt # aligned NMR spectra
## [1] "Alignment results:"
dat <- t(dat0)
ppm <- ppm0
dat.bl.rm <- baseline_removal(dat)
dat.bl <- dat.bl.rm$dat.bl #baseline-corrected NMR spectra
avg.bl <- apply(dat.bl, 2, mean)
## [1] "Baseline corretion results:"
dat.bl1 <- dat.bl
dat.bl1 <- dat.bl1[, which(ppm < 4 & ppm >0.5)]
ppm1 <- ppm[which(ppm < 4 & ppm >0.5)]
dim(dat.bl1)
length(ppm1)
## [1] 10 2788
## [1] 2788
nos.est <- Noise.est(dat.bl, pos.standar = c(0.08, 0.58), ppm)
nos <- nos.est * 5
nos #estimated noise threshold
## [1] 2.245626e-05
#red line is noise cutting line
qplot(ppm[which(ppm>0.5)], avg.bl[which(ppm>0.5)], geom = "line") +
geom_hline(yintercept = nos,col="red")+
xlab("Chemical shift (ppm)")+
ylab("Intensity (a.u.)")+
scale_x_reverse()
qut_dat.bl1 <- PQ_norm_signal(t(dat.bl), method = "median", ppm, c(0.08, 0.58))
quot.bl1 <- qut_dat.bl1$quot
quot.final.bl1 <- qut_dat.bl1$quot.final
quot.final.bl1
## [1] 1.0908776 1.1271445 1.0033805 0.7905787 1.0578980 0.7606097 0.9779892
## [8] 0.9225305 1.1185942 0.9939858
dat_pqnorm.bl1 <- matrix(NA, dim(dat.bl1)[1], dim(dat.bl1)[2])
for(i in 1 : length(quot.final.bl1)){
dat_pqnorm.bl1[i, ] <- dat.bl1[i, ] / quot.final.bl1[i]
}
# set noise to 0
dat_pqnorm.bl2 <- dat_pqnorm.bl1
dat_pqnorm.bl2[which(dat_pqnorm.bl2 < nos)] <- 0
dat = dat_pqnorm.bl1
ppm = ppm1
avg.d1 <- apply(dat_pqnorm.bl1, 2, mean)
pacf(avg.d1) #estimate the optimal window size
## [1] "Take pacf result as reference and set k."
## [1] "Here k is set to 7."
k=7 #window size
#find the detailed parameter information in the source code
span = 3
a1 = 0.8
a2 = 0.999
n.gam = 50
b2 = 0.99
kern.type = "tri_cube"
k = 7
n.tr <- dim(dat)[1]/2
n.cv <- 10
##set up a path to save output from SPA function
path=output_dir
ptm <- proc.time()
grp.res <- Gen.spa.clus(dat, ppm, k, method="mean", a1, a2, n.gam, n.tr, n.cv,
b2, kern.type,auto_gam=TRUE,gam=0.8,path)
proc.time() - ptm
## user system elapsed
## 23.984 13.160 37.147
#save out the SPA results for further reference
saveRDS(grp.res,paste(output_dir,"SPA_rds.rds",sep=""))
#grp.res=readRDS(paste(output_dir,"SPA_rds.rds",sep=""))
#cluster member assigned to every ppm point
clus.mem <- grp.res$clus.mem
#separation of ppm points into clusters and background
land.cut <- grp.res$land.cut #2 for metabolite signals, 0 for background noise
##boundary label for every cluster
bond <- grp.res$bond
##chosen optimal threshold for the correlation landscape
gam <- grp.res$gam
## [1] "Total cluster numbers: "
max(clus.mem)
## [1] 50
## [1] "SPA results visualized in the spectra:blue zones are the separations among metabolite clusters (red)"
ints = Ints_avg_mod(clus.mem, dat)
dat.spa <- ints # intensities are calcualted by averaging the hump intensities with in that cluster.
cor.dat.spa <- cor(ints)
dim(ints)
dim(cor.dat.spa)
range(cor.dat.spa)
## [1] 10 50
## [1] 50 50
## [1] -0.8892866 1.0000000
myImagePlot2(cor.dat.spa)
peaks=splus2R::peaks #specify that the peaks function is from splus2R package
pks <- Peak_simple(dat, ppm, thresh.scale = 10, snr = 3, snr.thresh=10)
## [1] "Number of peaks detected:"
## [1] 114
idx.pk <- pks$idx.pk
pk.ppm <- ppm[idx.pk]
cluster_peak=combine_cluster_peak(clus.mem,ppm,pk.ppm)
a8_summary=gen_summary_profile(cor_clus=a8,cluster_peak)
##summarize peak lists in every SPA cluster
clus_mem=gen_SPA_peaks(profile=a8_summary,cluster_peak)
write.csv(clus_mem,paste(output_dir,"SPA_cluster_peaks.csv",sep=""))
##summarize highly-correlated clusters
spa_stocsy_corr=gen_stocsy_corr(profile=a8_summary)
write.csv(spa_stocsy_corr, paste(output_dir,"STOCSY_corr_clusters.csv",sep = ""))
##load reference library data
setwd(data_dir)
filename = "ref_final_lib.csv"
# Read data
Chx_lib <- read.csv(filename, header = T)
Chx_lib <- as.data.frame(Chx_lib)
Chx_lib = Chx_lib[,-1]
colnames(Chx_lib) <- c("meta_name", "clus_ct")
meta_name <- as.character(unique(Chx_lib$meta_name))
head(Chx_lib)
## meta_name clus_ct
## 1 1,3-Dihydroxyacetone 3.56759
## 2 1,3-Dihydroxyacetone 4.41130
## 3 1,3-Dimethylurate 3.30829
## 4 1,3-Dimethylurate 3.43921
## 5 1,6-Anhydro-beta-D-Glucose 3.52645
## 6 1,6-Anhydro-beta-D-Glucose 3.52645
#clus_ct: cluster central chemical shift value
##shift to set upper and lower boundaries for each cluster
##the ul and ll are implemented to improve SPA-STOCSY's robustness to chemical shift variations
ppm_shift_thre = 0.025
new_lib <- Build_new(meta_name, ppm_shift_thre)
head(new_lib)
## meta_name clus_ct ul ll
## 1 1,3-Dihydroxyacetone 3.56759 3.59259 3.54259
## 2 1,3-Dihydroxyacetone 4.4113 4.4363 4.3863
## 3 1,3-Dimethylurate 3.30829 3.33329 3.28329
## 4 1,3-Dimethylurate 3.43921 3.46421 3.41421
## 5 1,6-Anhydro-beta-D-Glucose 3.52645 3.55145 3.50145
## 6 1,6-Anhydro-beta-D-Glucose 3.67457 3.69957 3.64957
##Load previous saved SPA_cluster_peaks.csv and STOCSY_corr_clusters.csv
setwd(output_dir)
clus_mem <- read.csv("SPA_cluster_peaks.csv", header = T)
col_name=NULL
for(nc in 1:(ncol(clus_mem)-1)){
col_name=c(col_name,paste("pk",nc,sep=""))
}
colnames(clus_mem)=c("clus_id",col_name)
head(clus_mem) #chemical shift values for the identified peaks in each cluster
## clus_id pk1 pk2 pk3 pk4 pk5 pk6 pk7 pk8 pk9 pk10 pk11
## 1 1 3.97810 NA NA NA NA NA NA NA NA NA NA
## 2 2 3.95432 3.94055 3.92428 3.91301 NA NA NA NA NA NA NA
## 3 3 3.90050 NA NA NA NA NA NA NA NA NA NA
## 4 4 3.88548 3.86044 3.84667 3.82790 NA NA NA NA NA NA NA
## 5 5 3.77157 NA NA NA NA NA NA NA NA NA NA
## 6 6 NA NA NA NA NA NA NA NA NA NA NA
spa_stocsy_corr <- read.csv('STOCSY_corr_clusters.csv', header = T)
spa_stocsy_corr=spa_stocsy_corr[,-1]
head(spa_stocsy_corr) #each row is one high-correlated group, with the highly-correlated cluster ids
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 1 NA NA NA NA NA NA NA
## 2 2 NA NA NA NA NA NA NA
## 3 3 7 13 14 NA NA NA NA
## 4 4 9 14 42 NA NA NA NA
## 5 5 9 11 17 34 39 42 NA
## 6 6 10 NA NA NA NA NA NA
#take the third highly-correlated group for example
#calculate the detection rate for all the metabolites in the reference library from this group
n=3
res <- Auto_dect_one(new_lib,as.numeric(spa_stocsy_corr[n,])[!is.na(as.numeric(spa_stocsy_corr[n,]))], clus_mem)
meta_ct <- res$meta_ct
# list the metabolite that has dect_rate above a set threshold d.thre:
#n_tot_clus: metabolite total cluster numbers
#n_known_clus: metabolite cluster numbers within 0.5 to 4.0ppm
#n_dect: metabolite cluster numbers detected within the highly-correlated group
#dect_rate = n_dect/n_known_clus
d.thre=0.55
meta_ct[which(meta_ct$dect_rate > d.thre), ]
## meta_name n_tot_clus n_known_clus n_dect dect_rate
## 38 3-Hydroxykynurenine 5 1 1 1
## 39 3-Hydroxyphenylacetate 5 1 1 1
## 47 4-Aminohippurate 4 1 1 1
## 51 4-Hydroxyphenylacetate 3 1 1 1
## 116 Ferulate 6 1 1 1
## 181 Vanillate 4 1 1 1
## 182 Glucose 14 12 7 0.583333333333333
## 190 Kynurenine 6 1 1 1
## 238 trans-Aconitate 2 1 1 1
## 256 Homogentisate 3 1 1 1
now <- proc.time()
cand_res <- FindMeta(new_lib, spa_stocsy_corr, clus_mem, d.thre, single_filter = FALSE)
proc.time()- now
## user system elapsed
## 23.268 0.044 23.314
all_nam <- unique(cand_res$all_name)
all_nam=all_nam[!(all_nam%in%c("No metabolite","No quanlified candidate"))]
## save the identified metabolites
write.csv(all_nam,paste(output_dir,"auto_identify_metabolites.csv",sep=""))
## 131 metabolites are automatically identified with the detection rate set as 0.55