SPA_STOCSY

Setup codes

##load the required packages
library(ggplot2)
library(baseline)
library(msProcess)
library(splus2R)

#customize and define the root_dir to access the downloaded repo
#root_dir = "./"
code_dir <- paste0(root_dir,"source_codes/") 
data_dir <- paste0(root_dir,"input_data/")
output_dir <- paste0(root_dir,"output/")
setwd(code_dir)

#run the following commands to load SPA-STOCSY's functions
{ 
  source("align.r")
  source("baseline_removal.r")
  source("Noise.estimate.r")
  source("PQ_norm_signal.r")
  source("Gen.spa.group.noints.r")
  source("Cor.stat.r")
  source("cor_landscape.r")
  source("norm_kern.r")
  source("Epane_kern.r")
  source("tri_cube_kern.r")
  source("Landscape_cut.r")
  source("Clust.member.r")
  source("pred.strength.r")
  source("Gen_co_member.r")
  source("Gen.ct.r")
  source("Select.gam.r")
  source("plot_pred.streth.r")
  source("Ints_avg_mod.r")
  source("myImagePlot2.r")
  source("gen.J.couple.r")
  source("peak_pick_local_max.R")
  source("combine_cluster_peak.r")
  source("gen_summary_profile.r")
  source("gen_SPA_peaks.r")
  source("gen_stocsy_corr_clusters.r")
  source("Build_new.r")
  source("FindMeta.r")
  source("Auto_dect_one.r")
  source("PkinReg.r")
  source("clus.r")
  source("clus_region.r")
  source("Peak_simple.r")
  
}

Load input data

Set file path

setwd(data_dir)
filename = "htt_c12.csv" #change to user's filename
htt.dat <- read.csv(filename, header = T)

Extract ppm and adjust data format

ppm0 <- htt.dat[,1] #chemical shift (ppm)
dat0 <- t(htt.dat[, -1]) #spectra data
avg <- apply(dat0, 2, mean) #average spectrum

## [1] "Head of data:"

##              [,1]        [,2]        [,3]        [,4]        [,5]
## C1211 0.000248516 0.000266609 0.000282267 0.000296669 0.000314481
## C1212 0.000306765 0.000324108 0.000343583 0.000364306 0.000388333
## C1213 0.000310101 0.000321215 0.000335582 0.000353444 0.000377573
## C1214 0.000240847 0.000247723 0.000256431 0.000268619 0.000286862
## C1215 0.000166313 0.000170369 0.000180503 0.000199383 0.000230108

## [1] "Raw data loaded."

Raw data preprocessing steps:

Samples alignment:

region = c(-.05, .05)
ppm = ppm0
##find alignment marker
ali.mark =  ppm[which.min(abs(ppm))]
dat = dat0

##use Align function
ares <- Align(region, dat, ppm, ali.mark)
ahtt <- ares$tdat
appm <- ares$appm
avg.c <- apply(ahtt, 1, mean)

# set new standard: 
ppm0 = appm #aligned chemical shift values
dat0 = ahtt  # aligned NMR spectra

## [1] "Alignment results:"

Baseline correction

dat <- t(dat0) 
ppm <- ppm0 

dat.bl.rm <- baseline_removal(dat)
dat.bl <- dat.bl.rm$dat.bl #baseline-corrected NMR spectra
avg.bl <- apply(dat.bl, 2, mean)

## [1] "Baseline corretion results:"

Keep informative spectra in 0.5-4.0ppm

dat.bl1 <- dat.bl
dat.bl1 <- dat.bl1[, which(ppm < 4 & ppm >0.5)]
ppm1 <- ppm[which(ppm < 4 & ppm >0.5)]

dim(dat.bl1) 
length(ppm1)

## [1]   10 2788
## [1] 2788

Noise estimation

nos.est <- Noise.est(dat.bl, pos.standar = c(0.08, 0.58), ppm)
nos <- nos.est * 5
nos #estimated noise threshold

## [1] 2.245626e-05

#red line is noise cutting line
qplot(ppm[which(ppm>0.5)], avg.bl[which(ppm>0.5)], geom = "line") + 
  geom_hline(yintercept = nos,col="red")+
  xlab("Chemical shift (ppm)")+
  ylab("Intensity (a.u.)")+
  scale_x_reverse()

Quotient normalization

qut_dat.bl1 <- PQ_norm_signal(t(dat.bl), method = "median", ppm, c(0.08, 0.58))
quot.bl1 <- qut_dat.bl1$quot
quot.final.bl1 <- qut_dat.bl1$quot.final
quot.final.bl1

##  [1] 1.0908776 1.1271445 1.0033805 0.7905787 1.0578980 0.7606097 0.9779892
##  [8] 0.9225305 1.1185942 0.9939858

dat_pqnorm.bl1 <- matrix(NA, dim(dat.bl1)[1], dim(dat.bl1)[2])
for(i in 1 : length(quot.final.bl1)){
    dat_pqnorm.bl1[i, ] <- dat.bl1[i, ] / quot.final.bl1[i]
}

# set noise to 0
dat_pqnorm.bl2 <- dat_pqnorm.bl1 
dat_pqnorm.bl2[which(dat_pqnorm.bl2 < nos)] <- 0

SPA process:

PACF

dat = dat_pqnorm.bl1 
ppm = ppm1 
avg.d1 <- apply(dat_pqnorm.bl1, 2, mean)

pacf(avg.d1) #estimate the optimal window size

## [1] "Take pacf result as reference and set k."

## [1] "Here k is set to 7."

k=7 #window size

Parameters for SPA

#find the detailed parameter information in the source code
span = 3
a1 = 0.8
a2 = 0.999
n.gam = 50
b2 = 0.99
kern.type = "tri_cube"
k = 7
n.tr <- dim(dat)[1]/2
n.cv <- 10

##set up a path to save output from SPA function
path=output_dir

Run SPA

ptm <- proc.time()
grp.res <- Gen.spa.clus(dat, ppm, k, method="mean", a1, a2, n.gam, n.tr, n.cv, 
                        b2, kern.type,auto_gam=TRUE,gam=0.8,path)
proc.time() - ptm

##    user  system elapsed 
##  23.984  13.160  37.147

#save out the SPA results for further reference
saveRDS(grp.res,paste(output_dir,"SPA_rds.rds",sep=""))
#grp.res=readRDS(paste(output_dir,"SPA_rds.rds",sep=""))

#cluster member assigned to every ppm point
clus.mem <- grp.res$clus.mem

#separation of ppm points into clusters and background 
land.cut <- grp.res$land.cut #2 for metabolite signals, 0 for background noise

##boundary label for every cluster
bond <- grp.res$bond

##chosen optimal threshold for the correlation landscape
gam <- grp.res$gam

## [1] "Total cluster numbers: "

max(clus.mem)

## [1] 50

## [1] "SPA results visualized in the spectra:blue zones are the separations among metabolite clusters (red)"

STOCSY process:

Prepare input for STOCSY

ints = Ints_avg_mod(clus.mem, dat)
dat.spa <- ints  # intensities are calcualted by averaging the hump intensities with in that cluster.
cor.dat.spa <- cor(ints)
dim(ints)
dim(cor.dat.spa)
range(cor.dat.spa)

## [1] 10 50
## [1] 50 50
## [1] -0.8892866  1.0000000

myImagePlot2(cor.dat.spa)

Heatmap for correlated clusters

#set threshold for correlation among clusters
#Here the threshold is set as 0.8

a.8 <- cor.dat.spa
a.8[a.8 < 0.8] <- 0

myImagePlot2(a.8)

Summarize and visulize correlated clusters

##Have highly correlated clusters in list
a8 <- gen.J.couple(cor.dat.spa, .8)

##Visualize the positions of different clusters with 'clus' function
##user can use this function to check the pattern for each highly-correlated group
clus(id=c(5,9,11,17))

Peak detection

Simple peak detection

peaks=splus2R::peaks #specify that the peaks function is from splus2R package
pks <- Peak_simple(dat, ppm, thresh.scale = 10, snr = 3, snr.thresh=10)

## [1] "Number of peaks detected:"
## [1] 114

Extract detected peaks

idx.pk <- pks$idx.pk
pk.ppm <- ppm[idx.pk]

Summarize clusters and peaks

cluster_peak=combine_cluster_peak(clus.mem,ppm,pk.ppm)
a8_summary=gen_summary_profile(cor_clus=a8,cluster_peak)

Automatic metabolites identification

Prepare inputs from SPA-STOCSY results

##summarize peak lists in every SPA cluster
clus_mem=gen_SPA_peaks(profile=a8_summary,cluster_peak)
write.csv(clus_mem,paste(output_dir,"SPA_cluster_peaks.csv",sep=""))

##summarize highly-correlated clusters
spa_stocsy_corr=gen_stocsy_corr(profile=a8_summary)
write.csv(spa_stocsy_corr, paste(output_dir,"STOCSY_corr_clusters.csv",sep = ""))

Prepare reference library

##load reference library data
setwd(data_dir)
filename = "ref_final_lib.csv"

# Read data
Chx_lib <- read.csv(filename, header = T)
Chx_lib <- as.data.frame(Chx_lib)
Chx_lib = Chx_lib[,-1]
colnames(Chx_lib) <- c("meta_name", "clus_ct")
meta_name <- as.character(unique(Chx_lib$meta_name))

head(Chx_lib)

##                    meta_name clus_ct
## 1       1,3-Dihydroxyacetone 3.56759
## 2       1,3-Dihydroxyacetone 4.41130
## 3          1,3-Dimethylurate 3.30829
## 4          1,3-Dimethylurate 3.43921
## 5 1,6-Anhydro-beta-D-Glucose 3.52645
## 6 1,6-Anhydro-beta-D-Glucose 3.52645

#clus_ct: cluster central chemical shift value

##shift to set upper and lower boundaries for each cluster
##the ul and ll are implemented to improve SPA-STOCSY's robustness to chemical shift variations
ppm_shift_thre = 0.025
new_lib <- Build_new(meta_name, ppm_shift_thre)
head(new_lib)

##                    meta_name clus_ct      ul      ll
## 1       1,3-Dihydroxyacetone 3.56759 3.59259 3.54259
## 2       1,3-Dihydroxyacetone  4.4113  4.4363  4.3863
## 3          1,3-Dimethylurate 3.30829 3.33329 3.28329
## 4          1,3-Dimethylurate 3.43921 3.46421 3.41421
## 5 1,6-Anhydro-beta-D-Glucose 3.52645 3.55145 3.50145
## 6 1,6-Anhydro-beta-D-Glucose 3.67457 3.69957 3.64957

##Load previous saved SPA_cluster_peaks.csv and STOCSY_corr_clusters.csv

setwd(output_dir)
clus_mem <- read.csv("SPA_cluster_peaks.csv", header = T)

col_name=NULL
for(nc in 1:(ncol(clus_mem)-1)){
  col_name=c(col_name,paste("pk",nc,sep=""))
}
colnames(clus_mem)=c("clus_id",col_name)

head(clus_mem) #chemical shift values for the identified peaks in each cluster

##   clus_id     pk1     pk2     pk3     pk4 pk5 pk6 pk7 pk8 pk9 pk10 pk11
## 1       1 3.97810      NA      NA      NA  NA  NA  NA  NA  NA   NA   NA
## 2       2 3.95432 3.94055 3.92428 3.91301  NA  NA  NA  NA  NA   NA   NA
## 3       3 3.90050      NA      NA      NA  NA  NA  NA  NA  NA   NA   NA
## 4       4 3.88548 3.86044 3.84667 3.82790  NA  NA  NA  NA  NA   NA   NA
## 5       5 3.77157      NA      NA      NA  NA  NA  NA  NA  NA   NA   NA
## 6       6      NA      NA      NA      NA  NA  NA  NA  NA  NA   NA   NA

spa_stocsy_corr <- read.csv('STOCSY_corr_clusters.csv', header = T)
spa_stocsy_corr=spa_stocsy_corr[,-1]
head(spa_stocsy_corr) #each row is one high-correlated group, with the highly-correlated cluster ids

##   V1 V2 V3 V4 V5 V6 V7 V8
## 1  1 NA NA NA NA NA NA NA
## 2  2 NA NA NA NA NA NA NA
## 3  3  7 13 14 NA NA NA NA
## 4  4  9 14 42 NA NA NA NA
## 5  5  9 11 17 34 39 42 NA
## 6  6 10 NA NA NA NA NA NA

Automatic identification on one group

#take the third highly-correlated group for example
#calculate the detection rate for all the metabolites in the reference library from this group

n=3
res <- Auto_dect_one(new_lib,as.numeric(spa_stocsy_corr[n,])[!is.na(as.numeric(spa_stocsy_corr[n,]))], clus_mem)

meta_ct <- res$meta_ct

# list the metabolite that has dect_rate above a set threshold d.thre: 
#n_tot_clus: metabolite total cluster numbers
#n_known_clus: metabolite cluster numbers within 0.5 to 4.0ppm
#n_dect: metabolite cluster numbers detected within the highly-correlated group
#dect_rate = n_dect/n_known_clus
d.thre=0.55
meta_ct[which(meta_ct$dect_rate > d.thre), ]

##                  meta_name n_tot_clus n_known_clus n_dect         dect_rate
## 38     3-Hydroxykynurenine          5            1      1                 1
## 39  3-Hydroxyphenylacetate          5            1      1                 1
## 47        4-Aminohippurate          4            1      1                 1
## 51  4-Hydroxyphenylacetate          3            1      1                 1
## 116               Ferulate          6            1      1                 1
## 181              Vanillate          4            1      1                 1
## 182                Glucose         14           12      7 0.583333333333333
## 190             Kynurenine          6            1      1                 1
## 238        trans-Aconitate          2            1      1                 1
## 256          Homogentisate          3            1      1                 1

Automatic identification across all groups

now <- proc.time()
cand_res <- FindMeta(new_lib, spa_stocsy_corr, clus_mem, d.thre, single_filter = FALSE)
proc.time()- now

##    user  system elapsed 
##  23.268   0.044  23.314

all_nam <- unique(cand_res$all_name)
all_nam=all_nam[!(all_nam%in%c("No metabolite","No quanlified candidate"))]

## save the identified metabolites
write.csv(all_nam,paste(output_dir,"auto_identify_metabolites.csv",sep=""))

## 131 metabolites are automatically identified with the detection rate set as 0.55

SPA_STOCSY_menu

Setup codes

Load input data

Set file path

Extract ppm and adjust data format

Raw data preprocessing steps:

Samples alignment:

Baseline correction

Keep informative spectra in 0.5-4.0ppm

Noise estimation

Quotient normalization

SPA process:

PACF

Parameters for SPA

Run SPA

STOCSY process:

Prepare input for STOCSY

Heatmap for correlated clusters

Summarize and visulize correlated clusters

Peak detection

Simple peak detection

Extract detected peaks

Summarize clusters and peaks

Automatic metabolites identification

Prepare inputs from SPA-STOCSY results

Prepare reference library

Automatic identification on one group

Automatic identification across all groups