##load the required packages
library(ggplot2)
library(baseline)
library(splus2R)
#customize and define the root_dir to access the downloaded repo
#root_dir = "./"
code_dir <- paste0(root_dir,"source_codes/")
data_dir <- paste0(root_dir,"input_data/")
output_dir <- paste0(root_dir,"output/")
setwd(code_dir)
#run the following commands to load SPA-STOCSY's functions
{
source("align.r")
source("baseline_removal.r")
source("Noise.estimate.r")
source("PQ_norm_signal.r")
source("Gen.spa.group.noints.r")
source("Cor.stat.r")
source("cor_landscape.r")
source("norm_kern.r")
source("Epane_kern.r")
source("tri_cube_kern.r")
source("Landscape_cut.r")
source("Clust.member.r")
source("pred.strength.r")
source("Gen_co_member.r")
source("Gen.ct.r")
source("Select.gam.r")
source("plot_pred.streth.r")
source("Ints_avg_mod_highd.R")
source("myImagePlot2.r")
source("gen.J.couple.highd.r")
source("peak_pick_local_max.R")
source("combine_cluster_peak.r")
source("gen_summary_profile.r")
source("gen_SPA_peaks.r")
source("gen_stocsy_corr_clusters.r")
source("Build_new.r")
source("FindMeta.r")
source("Auto_dect_one.r")
source("PkinReg.r")
source("clus.r")
source("clus_region.r")
source("Peak_simple.r")
}
setwd(data_dir)
filename = "htt_c12.csv" #change to user's filename
htt.dat <- read.csv(filename, header = T)
ppm0 <- htt.dat[,1] #chemical shift (ppm)
dat0 <- t(htt.dat[, -1]) #spectra data
avg <- apply(dat0, 2, mean) #average spectrum
## [1] "Head of data:"
## [,1] [,2] [,3] [,4] [,5]
## C1211 0.000248516 0.000266609 0.000282267 0.000296669 0.000314481
## C1212 0.000306765 0.000324108 0.000343583 0.000364306 0.000388333
## C1213 0.000310101 0.000321215 0.000335582 0.000353444 0.000377573
## C1214 0.000240847 0.000247723 0.000256431 0.000268619 0.000286862
## C1215 0.000166313 0.000170369 0.000180503 0.000199383 0.000230108
## [1] "Raw data loaded."
region = c(-.05, .05)
ppm = ppm0
##find alignment marker
ali.mark = ppm[which.min(abs(ppm))]
dat = dat0
##use Align function
ares <- Align(region, dat, ppm, ali.mark)
ahtt <- ares$tdat
appm <- ares$appm
avg.c <- apply(ahtt, 1, mean)
# set new standard:
ppm0 = appm #aligned chemical shift values
dat0 = ahtt # aligned NMR spectra
## [1] "Alignment results:"
dat <- t(dat0)
ppm <- ppm0
dat.bl.rm <- baseline_removal(dat)
dat.bl <- dat.bl.rm$dat.bl #baseline-corrected NMR spectra
avg.bl <- apply(dat.bl, 2, mean)
## [1] "Baseline corretion results:"
dat.bl1 <- dat.bl
dat.bl1 <- dat.bl1[, which(ppm < 4 & ppm >0.5)]
ppm1 <- ppm[which(ppm < 4 & ppm >0.5)]
dim(dat.bl1)
length(ppm1)
## [1] 10 2788
## [1] 2788
nos.est <- Noise.est(dat.bl, pos.standar = c(0.08, 0.58), ppm)
nos <- nos.est * 5
nos #estimated noise threshold
## [1] 2.245626e-05
#red line is noise cutting line
qplot(ppm[which(ppm>0.5)], avg.bl[which(ppm>0.5)], geom = "line") +
geom_hline(yintercept = nos,col="red")+
xlab("Chemical shift (ppm)")+
ylab("Intensity (a.u.)")+
scale_x_reverse()
qut_dat.bl1 <- PQ_norm_signal(t(dat.bl), method = "median", ppm, c(0.08, 0.58))
quot.bl1 <- qut_dat.bl1$quot
quot.final.bl1 <- qut_dat.bl1$quot.final
quot.final.bl1
## [1] 1.0908776 1.1271445 1.0033805 0.7905787 1.0578980 0.7606097 0.9779892
## [8] 0.9225305 1.1185942 0.9939858
dat_pqnorm.bl1 <- matrix(NA, dim(dat.bl1)[1], dim(dat.bl1)[2])
for(i in 1 : length(quot.final.bl1)){
dat_pqnorm.bl1[i, ] <- dat.bl1[i, ] / quot.final.bl1[i]
}
# set noise to 0
dat_pqnorm.bl2 <- dat_pqnorm.bl1
dat_pqnorm.bl2[which(dat_pqnorm.bl2 < nos)] <- 0
dat = dat_pqnorm.bl1
ppm = ppm1
avg.d1 <- apply(dat_pqnorm.bl1, 2, mean)
pacf(avg.d1) #estimate the optimal window size
## [1] "Take pacf result as reference and set k."
## [1] "Here k is set to 7."
k=7 #window size
#find the detailed parameter information in the source code
span = 3
a1 = 0.8
a2 = 0.999
n.gam = 50
b2 = 0.99
kern.type = "tri_cube"
k = 7
n.tr <- dim(dat)[1]/2
n.cv <- 10
##set up a path to save output from SPA function
path=output_dir
ptm <- proc.time()
grp.res <- Gen.spa.clus(dat, ppm, k, method="mean", a1, a2, n.gam, n.tr, n.cv,
b2, kern.type,auto_gam=TRUE,gam=0.8,path)
proc.time() - ptm
## user system elapsed
## 20.232 13.012 33.250
#save out the SPA results for further reference
saveRDS(grp.res,paste(output_dir,"SPA_rds.rds",sep=""))
#grp.res=readRDS(paste(output_dir,"SPA_rds.rds",sep=""))
#cluster member assigned to every ppm point
clus.mem <- grp.res$clus.mem
#separation of ppm points into clusters and background
land.cut <- grp.res$land.cut #2 for metabolite signals, 0 for background noise
##boundary label for every cluster
bond <- grp.res$bond
##chosen optimal threshold for the correlation landscape
gam <- grp.res$gam
## [1] "Total cluster numbers: "
max(clus.mem)
## [1] 50
## [1] "SPA results visualized in the spectra:blue zones are the separations among metabolite clusters (red)"
###For high-dim cases, instead of taking the mean of peak intensities within a cluster as the input for STOCSY, we keep all the peaks within a cluster as STOCSY's input
ints_highd <- Ints_avg_mod_highd(clus.mem, dat,nos,step=1)
ints = ints_highd$clus_peak
peak_clus = ints_highd$peak_clus
dim(ints)
length(peak_clus)
dat.spa <- ints
cor.dat.spa <- cor(ints) #the correlations are calculated among peaks from different clusters
dim(ints)
dim(cor.dat.spa)
range(cor.dat.spa)
## [1] 10 132
## [1] 132
## [1] 10 132
## [1] 132 132
## [1] -0.9290018 1.0000000
myImagePlot2(cor.dat.spa)
pks = peak_local_max(dat,nos,step=1)
length(pks)
## [1] 122
plot(ppm,avg.d1,type="l")
points(ppm[pks],avg.d1[pks],col="red")
pk.ppm = ppm[pks]
cluster_peak=combine_cluster_peak(clus.mem,ppm,pk.ppm)
a8_summary=gen_summary_profile(cor_clus=a8,cluster_peak)
##summarize peak lists in every SPA cluster
clus_mem=gen_SPA_peaks(profile=a8_summary,cluster_peak)
write.csv(clus_mem,paste(output_dir,"SPA_cluster_peaks.csv",sep=""))
##summarize highly-correlated clusters
spa_stocsy_corr=gen_stocsy_corr(profile=a8_summary)
write.csv(spa_stocsy_corr, paste(output_dir,"STOCSY_corr_clusters.csv",sep = ""))
##load reference library data
setwd(data_dir)
filename = "ref_final_lib.csv"
# Read data
Chx_lib <- read.csv(filename, header = T)
Chx_lib <- as.data.frame(Chx_lib)
Chx_lib = Chx_lib[,-1]
colnames(Chx_lib) <- c("meta_name", "clus_ct")
meta_name <- as.character(unique(Chx_lib$meta_name))
#clus_ct: cluster central chemical shift value
head(Chx_lib)
## meta_name clus_ct
## 1 1,3-Dihydroxyacetone 3.56759
## 2 1,3-Dihydroxyacetone 4.41130
## 3 1,3-Dimethylurate 3.30829
## 4 1,3-Dimethylurate 3.43921
## 5 1,6-Anhydro-beta-D-Glucose 3.52645
## 6 1,6-Anhydro-beta-D-Glucose 3.52645
##shift to set upper and lower boundaries for each cluster
##the ul and ll are implemented to improve SPA-STOCSY's robustness to chemical shift variations
ppm_shift_thre = 0.025
new_lib <- Build_new(meta_name, ppm_shift_thre)
head(new_lib)
## meta_name clus_ct ul ll
## 1 1,3-Dihydroxyacetone 3.56759 3.59259 3.54259
## 2 1,3-Dihydroxyacetone 4.4113 4.4363 4.3863
## 3 1,3-Dimethylurate 3.30829 3.33329 3.28329
## 4 1,3-Dimethylurate 3.43921 3.46421 3.41421
## 5 1,6-Anhydro-beta-D-Glucose 3.52645 3.55145 3.50145
## 6 1,6-Anhydro-beta-D-Glucose 3.67457 3.69957 3.64957
##Load previous saved SPA_cluster_peaks.csv and STOCSY_corr_clusters.csv
setwd(output_dir)
clus_mem <- read.csv("SPA_cluster_peaks.csv", header = T)
col_name=NULL
for(nc in 1:(ncol(clus_mem)-1)){
col_name=c(col_name,paste("pk",nc,sep=""))
}
colnames(clus_mem)=c("clus_id",col_name)
head(clus_mem) #chemical shift values for the identified peaks in each cluster
## clus_id pk1 pk2 pk3 pk4 pk5 pk6 pk7 pk8 pk9 pk10 pk11 pk12
## 1 1 3.97810 NA NA NA NA NA NA NA NA NA NA NA
## 2 2 3.95432 3.94055 3.92428 3.91301 NA NA NA NA NA NA NA NA
## 3 3 3.90050 NA NA NA NA NA NA NA NA NA NA NA
## 4 4 3.88548 3.86044 3.84667 3.82790 NA NA NA NA NA NA NA NA
## 5 5 3.77157 NA NA NA NA NA NA NA NA NA NA NA
## 6 6 NA NA NA NA NA NA NA NA NA NA NA NA
spa_stocsy_corr <- read.csv('STOCSY_corr_clusters.csv', header = T)
spa_stocsy_corr=spa_stocsy_corr[,-1]
head(spa_stocsy_corr) #each row is one high-correlated group, with the highly-correlated cluster ids
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 1 14 17 NA NA NA NA NA NA NA NA NA
## 2 2 NA NA NA NA NA NA NA NA NA NA NA
## 3 3 4 7 13 14 NA NA NA NA NA NA NA
## 4 4 5 7 9 13 14 17 20 32 36 39 42
## 5 5 9 11 13 14 17 20 32 34 36 39 42
## 6 6 9 10 20 NA NA NA NA NA NA NA NA
##Load the metabolite list with one cluster in 0.5 to 4.0 ppm
##used when single_filter = TRUE in FindMeta()
##Limit the identification uncerntainty for single-cluster metabolites
single_meta = read.csv(paste0(data_dir,"chx_meta_with_single_clus.csv"))
single_meta=as.character(single_meta$x)
length(single_meta)
## [1] 54
#set detection rate threshold
d.thre = 0.55
now <- proc.time()
cand_res <- FindMeta(new_lib, spa_stocsy_corr, clus_mem, d.thre, single_filter = TRUE, single_meta = single_meta)
proc.time()- now
## user system elapsed
## 22.812 0.036 22.848
all_nam <- unique(cand_res$all_name)
all_nam=all_nam[!(all_nam%in%c("No metabolite","No quanlified candidate"))]
write.csv(all_nam,paste(output_dir,"auto_identify_metabolites_highd.csv",sep=""))
## 150 metabolites are automatically identified with the detection rate set as 0.55