Visualize the annotated reference file

library(TraceQC)
ref_file <- system.file("extdata/test_data/ref","ref_carlin.txt",package="TraceQC")
ref <- parse_ref_file(ref_file)
plot_construct(ref,chr_per_row=50,chr_size=5)

Sequence alignment

input_file <- system.file("extdata/test_data/raw_sequence","carlin_example.bam",package="TraceQC")
output_file <- "./aligned_reads.txt"
sequence_alignment_for_10x(input_file=input_file,ref_file=ref_file,
                   output_file=output_file)

Find optimal mutation cutoff using permutation

library(readr)
library(ggplot2)
library(dplyr)

sequence_permutation(ref_file,out="./seq_permutate.txt")
## [1] TRUE
seq_permutate <- read_tsv("./seq_permutate.txt")
model <- loess(score~permutate_percent,data=seq_permutate)
aligned_reads <- read_tsv("./aligned_reads.txt")
ggplot(aligned_reads) +
  geom_histogram(aes(x=score,y=stat(count)/sum(count)),bins=50) +
  geom_vline(xintercept=predict(model,0.4),linetype="dashed",color="red",size=2) +
  xlab("alignment score") +
  ylab("frequency") +
  theme_classic()

Identify mutations from the aligned reads

# group the identical sequence together for faster runtime
unique_sequences <- group_by(aligned_reads,target_seq,target_ref) %>%
  summarise(score=max(score)) %>%
  ungroup
alignment_score_cutoff <- predict(model,0.4)
mutation <- seq_to_character(unique_sequences,use_CPM=FALSE,
                             alignment_score_cutoff=alignment_score_cutoff)
write_tsv(mutation,"./mutation.txt")
mutation <- read_tsv("./mutation.txt")

Merge mutations based on UMI and cell barcodes

mutation <- left_join(mutation,
                      select(aligned_reads,target_seq,CB,UB))
rc_per_UB <- get_read_count_per_UB(aligned_reads)
mutation <- left_join(mutation,rc_per_UB)
# select mutations appear in more than 50% reads in each UMI
mutation <- filter_mutations_per_UMI(mutation,include_max=FALSE,freq_threshold=0.5)
# select mutations appear in more than 50% UMI in each CB
ub_per_cell <- get_UMI_count_per_CB(aligned_reads)
mutation <- left_join(mutation,ub_per_cell)
mutation <- filter_mutations_per_cell(mutation,freq_threshold=0.5)

Visualize mutation results

mutation_type_donut(mutation)

plot_deletion_hotspot(mutation,ref,use_log_count = FALSE)

plot_insertion_hotspot(mutation,ref,use_log_count = FALSE)

plot_point_substitution_hotspot(mutation,ref)

write formatted output file

library(knitr)
out_df <- format_mutation_df(mutation,is_singlecell=TRUE)
kable(head(out_df))
character type start length alt cell
D:S68L192 deletion 68 192 AACCATGCAACAACCT-1,ACGAGGAAGGAGTAGA-1,ACTGAACTCCTTGGTC-1,ACTGTCCTCAGTCAGT-1,AGAATAGGTCGAATCT-1,AGATCTGCACACCGCA-1,AGATTGCCAGCTGCTG-1,AGCAGCCCATCTGGTA-1,CAACCAACAATGAAAC-1,CACCTTGAGCAGGTCA-1,CCCAGTTCAAGCCCAC-1,CCCAGTTTCGAGGTAG-1,CTCACACCAGGCTGAA-1,CTCTACGTCACATGCA-1,CTGAAACAGATGTAAC-1,CTGCCTAGTTGTGGAG-1,GACGTGCTCACTATTC-1,GCGGGTTAGGCCCTTG-1,GGACAGAAGCGAAGGG-1,GTGCGGTTCATCGATG-1,GTTCTCGAGCCAGGAT-1,TCTCATAAGGTAGCTG-1,TCTTCGGCATGAAGTA-1
D:S1L28 deletion 1 28 AACGTTGTCTTGCCGT-1
D:S62L9 deletion 62 9 AACGTTGTCTTGCCGT-1
D:S178L3 deletion 178 3 AACGTTGTCTTGCCGT-1
S:S32L1->G substitution 32 1 G AACGTTGTCTTGCCGT-1
S:S33L1->A substitution 33 1 A AACGTTGTCTTGCCGT-1
write_tsv(out_df,"./mutations.traceqc.txt")