16.2.22 UTAP: Ribo-seq pipeline guidelines
The pipeline will process sequences and using htseq counts will quantify expression on genes UTR and CDS, in addition it will find the peaks and shift them.
Need to have a genome from igenomes bowtie2index , genome gtf file, rRNA indexed for bowtie (described below how to have such a file) and gtf file of UTR and CDS explained below how to prepare.
and chromosome size file such as hg38.chrom.sizes
rRNA indexed for bowtie
human U:\10and12drugs\rRNA
mouse U:\RPanalysis\rRNA
module load IGVTools/2.3.26
module load jre
module load samtools
module load fastqc
module load tophat
module load bowtie
module load bedtools
module load python/bio-2.7.13 (contains macs)
This pipline is for human
sample name HaringDMSO_2_S4_L001_R1_001
step 1
#cutadapt
cutadapt -a CTGTAGGCACCATCAATAGATCGGAAGAGCACACGTCTGAACTCCAGTCAC --discard-untrimmed --times 1 -u 1 -m 20 -M 50 -o HaringDMSO_2_S4_L001_R1_001_cutadapt.fastq ../HarringtoninFP_Drug_10_12/Fastq_RF_Drugs/HaringDMSO_2_S4_L001_R1_001.fastq.gz > log_HaringDMSO_2_S4_L001_R1_001 2>&1
step 2
#fastqc
mkdir HaringDMSO_2_S4_L001_R1_001
fastqc --outdir HaringDMSO_2_S4_L001_R1_001 --threads 5 --casava ../1-cutadapt/HaringDMSO_2_S4_L001_R1_001_cutadapt.fastq
step 3
#filtering rRNA, makes a fastq of sequences that do not align to rRNA
bowtie -p 10 -norc --un HaringDMSO_2_S4_L001_R1_001_no_rRNA_cutadapt.fastq ../rRNA/rRNA ../1-cutadapt/HaringDMSO_2_S4_L001_R1_001_cutadapt.fastq HaringDMSO_2_S4_L001_R1_001_rRNA.sam > log_HaringDMSO_2_S4_L001_R1_001.txt 2>&1
step 4
#cutadapt to select sequences of a certain length
#m M are a parameter
cutadapt -m 24 -M 35 -o HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt.fastq ../3-bowtie-rRNA/HaringDMSO_2_S4_L001_R1_001_no_rRNA_cutadapt.fastq
or
cutadapt -m 28 -M 32 -o HaringDMSO_2_S4_L001_R1_001_no_rRNA_28-32_cutadapt.fastq ../3-bowtie-rRNA/HaringDMSO_2_S4_L001_R1_001_no_rRNA_cutadapt.fastq
step 5
#fastqc
mkdir fastqc/HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt_fastqc
fastqc --outdir fastqc/HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt_fastqc --threads 5 --casava HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt.fastq
step 6
#tophat, copy and index
bsub -q bio -n 20 -R "rusage[mem=2000] span[hosts=1]" -J harring_DMSO_2 -o job_harring_DMSO_2.txt -e err_harring_DMSO_2_1.txt "tophat -N 1 --no-novel-juncs --library-type fr-firststrand -p 20 -o harring_DMSO_2 --GTF /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/genome ../10-cutadapt28-32/HaringDMSO_2_S4_L001_R1_001_no_rRNA_28-32_cutadapt.fastq;cp harring_DMSO_2/accepted_hits.bam /home/labs/rivkadk/Collaboration/10and12drugs/13-tophat28-32/harring_DMSO_2.bam;samtools index harring_DMSO_2.bam"
step 7
#filter alignments and index
samtools view -b -q 10 harring_DMSO_2.bam > harring_DMSO_2.filtered.bam;samtools index harring_DMSO_2.filtered.bam
step 7
#tdf
igvtools count -z 5 -w 1 harring_DMSO_2.filtered.bam harring_DMSO_2.filtered.tdf ../5-tophat24-35/hg38.chrom.sizes
step 8
#macs (not macs2!)
macs -w --call-subpeaks -t ../5-tophat24-35/harring_DMSO_2.bam -c ../5-tophat24-35/CHX_DMSO_2.bam -f BAM -g hs --keep-dup all -n macs_harring_DMSO_2_vs_CHX_DMSO_2 --nomodel --shiftsize=1
step 9
#shifting 12 bases and extending the the summit to 3 bases peak according to gene orientation
sortBed -i macs_harring_DMSO_2_vs_CHX_DMSO_2_summits.bed | intersectBed -wao -a - -b /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf | cut -f1-5,12 | uniq | awk 'BEGIN {OFS="\t"}; {if($6~"-" ) print $1,$2-13,$3-11,$4,$5,$6,$7 ; else if ($6~"+" ) print $1,$2+11,$3+13,$4,$5,$6,$7 }' > macs_harring_DMSO_2_vs_CHX_DMSO_2_summits_mv13_by_strand.bed
step 10
#htseq counting 5UTR and CDS counts
htseq-count ../13-tophat28-32/harring_DMSO_2.filtered.bam -f bam -s yes -t exon -m intersection-nonempty -i gene_id hg38_cds_UTR_without_overlap.gtf > htseq_harring_DMSO_2.filtered.txt
#need to merge all htseq counts to a file and add header
ls -1 htseqfiltered.txt > merged.htseq
awk 'NF > 1{ a[$1] = a[$1]"\t"$2} END {for( i in a ) print i a[i]}' htseqfiltered.txt >> merged.htseq
#END
###########
Downloading rRNA
You can find the intervals using the UCSC Table browser. For this, you go to
and then set group:all tables, table:rmsk, and filter to "repClass (does match) rRNA"
then output it as a GTF file. Voila! Works for both mouse and human.
preparing files for htseq UTR and CDS counts
for hg38 (ran the below in /home/labs/rivkadk/Collaboration/10and12drugs/30_hg38_CDS_UTR_gtf
R script part-
First, import the GTF-file that you have also used as input for htseq-count
library(GenomicFeatures)
library(dplyr)
setwd("/home/labs/rivkadk/Collaboration/10and12drugs/9-UTR-counts")
txdb <- makeTxDbFromGFF("/shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf",format="gtf")
then collect the exons per gene id
exons.list.per.gene <- exonsBy(txdb,by="gene")
cds.list.per.gene <- cdsBy(txdb,by="gene")
fiveUtr.list.per.transcript <- fiveUTRsByTranscript(txdb,use.names=TRUE)
write.table(as.data.frame(fiveUtr.list.per.transcript), file="hg38_fiveUtr.per.transcript.txt",sep="\t")
write.table(as.data.frame(cds.list.per.gene), file="hg38_cds.per.gene.txt",sep="\t")
#Running the R script to get the CDS and 5UTR
awk 'BEGIN {OFS="\t"}; {print $4 , $5, $6, $3,$7,$8}' hg38_cds.per.gene.txt |sed 's/"//g' |grep -v 'start' > hg38_cds.per.gene.bed
awk 'BEGIN {OFS="\t"}; {print $4 , $5, $6, $3,$7,$8}' hg38_fiveUtr.per.transcript.txt |sed 's/"//g' |grep -v 'start' > hg38_fiveUtr.per.transcript.bed
intersectBed -s -v -a hg38_fiveUtr.per.transcript.bed -b hg38_cds.per.gene.bed > hg38_fiveUtr.per.transcript.nointersect.gene.bed
intersectBed -s -v -a hg38_cds.per.gene.bed -b hg38_fiveUtr.per.transcript.bed > hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed
wc -l *bed
230885 hg38_cds.per.gene.bed
223456 hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed
71961 hg38_fiveUtr.per.transcript.bed
61272 hg38_fiveUtr.per.transcript.nointersect.gene.bed
intersectBed -s -a /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf -b hg38_fiveUtr.per.transcript.nointersect.gene.bed> hg38_fiveUtr.per.transcript.nointersect.gene.gtf
intersectBed -s -a /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf -b hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed > hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed.gtf
sed 's/gene_id "/gene_id "CDS-/' hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.gtf> hg38_cds.changeID.per.gene.nointersect.fiveUtr.gtf
sed 's/gene_id "/gene_id "5UTR-/' hg38_fiveUtr.per.transcript.nointersect.gene.gtf > hg38_fiveUTR.changeID.per.gene.nointersect.cds.gtf
cat hg38_cds.changeID.per.gene.nointersect.fiveUtr.gtf hg38_fiveUTR.changeID.per.gene.nointersect.cds.gtf | sort -k 1,1 -k2,2n -k3,3n > hg38_cds_UTR_without_overlap.gtf
for mm10
R script
First, import the GTF-file that you have also used as input for htseq-count
library(GenomicFeatures)
library(dplyr)
setwd("/home/labs/rivkadk/Collaboration/RPanalysis/21-CDS")
txdb <- makeTxDbFromGFF("/shareDB/iGenomes/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf",format="gtf")
then collect the exons per gene id
exons.list.per.gene <- exonsBy(txdb,by="gene")
cds.list.per.gene <- cdsBy(txdb,by="gene")
fiveUtr.list.per.transcript <- fiveUTRsByTranscript(txdb,use.names=TRUE)
write.table(as.data.frame(fiveUtr.list.per.transcript), file="fiveUtr.per.transcript.txt",sep="\t")
write.table(as.data.frame(cds.list.per.gene), file="cds.per.gene.txt",sep="\t")
intersectBed -s -a /shareDB/iGenomes/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf -b fiveUtr.per.transcript.nointersect.gene.bed> fiveUtr.per.transcript.nointersect.gene.gtf
intersectBed -s -a /shareDB/iGenomes/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf -b cds.per.gene.nointersect.fiveUtr.per.transcript.bed > cds.per.gene.nointersect.fiveUtr.per.transcript.gtf
sed 's/gene_id "/gene_id "CDS-/' ../21-CDS/cds.per.gene.nointersect.fiveUtr.per.transcript.gtf> mm10_cds.changeID.per.gene.nointersect.fiveUtr.gtf
sed 's/gene_id "/gene_id "5UTR-/' ../21-CDS/fiveUtr.per.transcript.nointersect.gene.gtf > mm10_fiveUTR.changeID.per.gene.nointersect.cds.gtf
cat mm10_cds.changeID.per.gene.nointersect.fiveUtr.gtf mm10_fiveUTR.changeID.per.gene.nointersect.cds.gtf | sort -k 1,1 -k2,2n -k3,3n > mm10_cds_UTR_without_overlap.gtf