Using atacr for Enriched RNAseq and ATACseq analysis

atacr is a package for creating statistics and diagnostic plots for short read sequence data from capture enriched RNAseq and ATACseq experiments.

Sample data

The function simulate_counts() will give us a small simulated data set of three replicates from a control and treatment. Each of the six sets of counts follows a mixed distribution of 10 counts drawn from a log-normal distribution with logmean 4 and SD 1, and 40 counts with logmean 10 and SD 1. This mimics the enrichment pattern we see with capture enriched data. 10 of the counts are multiplied by a value drawn from the normal distribution with mean 2 and SD 1 so can appear differentially expressed. These counts represent bait-windows - regions of the genome for which baits were designed. The bait-window counts are mixed with 50 non-bait-windows which have 0 counts.

library(atacr)
counts <-simulate_counts()

Experiment Summary Information

It’s very easy to get information on the coverage for bait/non-bait windows on a per sample basis

summary(counts)

## ATAC-seq experiment of 2 treatments in 6 samples
##  Treatments: control,treatment 
##  Samples: control_001,control_002,control_003,treatment_001,treatment_002,treatment_003 
##  Bait regions used: 50 
##  Total Windows: 100 
##  
##  On/Off target read counts:
##           sample off_target on_target percent_on_target
## 1   control_001          0   1096557               100
## 2   control_002          0   1195717               100
## 3   control_003          0   1103689               100
## 4 treatment_001          0    997809               100
## 5 treatment_002          0   1102814               100
## 6 treatment_003          0   1375378               100 
##  Quantiles: 
##  $bait_windows
##     control_001 control_002 control_003 treatment_001 treatment_002
## 1%         7.37        0.00        3.45         18.68         10.31
## 5%        35.50       27.55       26.15         52.40         43.95
## 95%    77893.35    89079.70    71063.25      68795.85      95217.85
## 99%   113062.17   180385.21   150238.19     117918.69     113095.22
##     treatment_003
## 1%           2.00
## 5%          57.25
## 95%     110209.25
## 99%     138024.45
## 
## $non_bait_windows
##     control_001 control_002 control_003 treatment_001 treatment_002
## 1%           NA          NA          NA            NA            NA
## 5%           NA          NA          NA            NA            NA
## 95%          NA          NA          NA            NA            NA
## 99%          NA          NA          NA            NA            NA
##     treatment_003
## 1%             NA
## 5%             NA
## 95%            NA
## 99%            NA
##  
##  Read depths:
##           sample off_target on_target
## 1   control_001         NA  21931.14
## 2   control_002         NA  23914.34
## 3   control_003         NA  22073.78
## 4 treatment_001         NA  19956.18
## 5 treatment_002         NA  22056.28
## 6 treatment_003         NA  27507.56

plot(counts)

## Picking joint bandwidth of 0.303

These plots can be generated individually with the following functions

QC Plots

Plot for coverage by sequence and sample

plot_count_by_chromosome(counts)

## Warning: Expected 3 pieces. Additional pieces discarded in 300 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

Correlations between sample counts

sample_correlation_plot(counts)

Count windows below a threshold.

windows_below_coverage_threshold_plot(counts, which = "bait_windows", from=0, to=1000)

MA plots

ma_plot(counts)

Normalisation

Normalisation strategies are easy to implement with atacr and helpful functions are included

counts$library_size_normalised <- library_size_normalisation(counts)
ma_plot(counts, which = "library_size_normalised")

Normalisation by control windows. Requires a text file with the control window positions

window_file <- "control_windows.txt"
counts$control_window_normalisation <- control_window_normalise(sim_counts, window_file)

Detect differentially expressed/accessible windows

Using a simple bootstrap t-test method for simple two-way comparisons.

result <- estimate_fdr(sim_counts, "treatment", "control", which = "bait_windows")

pander::pandoc.table(head(result))

Table continues below
window	t	mean_count_a	mean_count_b
synth_chrom:1-50:-	6.418	87	25
synth_chrom:51-100:+	-7.602	1.333	30.67
synth_chrom:101-150:+	4.744	79.67	28.67
synth_chrom:151-200:+	2.654	56	30.67
synth_chrom:201-250:+	4.181	44.33	27.67
synth_chrom:251-300:+	3.801	47.33	32.67

sd_a	sd_b	log2_fc	is_sig
16.09	4.583	1.799	TRUE
0.5774	6.658	-4.524	TRUE
17.79	5.508	1.475	TRUE
15.59	5.508	0.8688	TRUE
4.726	5.033	0.6802	TRUE
5.859	3.215	0.535	TRUE

This can also be done for multiclass designs with multiple samples against a common reference.

multi_result <- estimate_fdr_multiclass(sim_counts, "control", which = "bait_windows")
pander::pandoc.table(head(multi_result))

Table continues below
window	t	p_value	fdr	mean_count_a
synth_chrom:1-50:-	6.418	0	0	87
synth_chrom:51-100:+	-7.602	0.1	0.4342	1.333
synth_chrom:101-150:+	4.744	0	0	79.67
synth_chrom:151-200:+	2.654	0	0	56
synth_chrom:201-250:+	4.181	0	0	44.33
synth_chrom:251-300:+	3.801	0.1	0.4342	47.33

mean_count_b	sd_a	sd_b	log2_fc	is_sig	a	b
25	16.09	4.583	1.799	TRUE	treatment	control
30.67	0.5774	6.658	-4.524	FALSE	treatment	control
28.67	17.79	5.508	1.475	TRUE	treatment	control
30.67	15.59	5.508	0.8688	TRUE	treatment	control
27.67	4.726	5.033	0.6802	TRUE	treatment	control
32.67	5.859	3.215	0.535	FALSE	treatment	control

Visualise windows with the same pattern across experiments

library(UpSetR)
u <- make_UpSetR(multi_result)

upset(fromList(u))

## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?