16.2.22 UTAP: Ribo-seq pipeline guidelines

The pipeline will process sequences and using htseq counts will quantify expression on genes UTR and CDS, in addition it will find the peaks and shift them.

Need to have a genome from igenomes bowtie2index , genome gtf file, rRNA indexed for bowtie (described below how to have such a file) and gtf file of UTR and CDS explained below how to prepare.
and chromosome size file such as hg38.chrom.sizes
rRNA indexed for bowtie
human U:\10and12drugs\rRNA
mouse U:\RPanalysis\rRNA

module load IGVTools/2.3.26
module load jre
module load samtools
module load fastqc
module load tophat
module load bowtie
module load bedtools
module load python/bio-2.7.13 (contains macs)

This pipline is for human
sample name HaringDMSO_2_S4_L001_R1_001

step 1
#cutadapt
cutadapt -a CTGTAGGCACCATCAATAGATCGGAAGAGCACACGTCTGAACTCCAGTCAC --discard-untrimmed --times 1 -u 1 -m 20 -M 50 -o HaringDMSO_2_S4_L001_R1_001_cutadapt.fastq ../HarringtoninFP_Drug_10_12/Fastq_RF_Drugs/HaringDMSO_2_S4_L001_R1_001.fastq.gz > log_HaringDMSO_2_S4_L001_R1_001 2>&1

step 2
#fastqc
mkdir HaringDMSO_2_S4_L001_R1_001
fastqc --outdir HaringDMSO_2_S4_L001_R1_001 --threads 5 --casava ../1-cutadapt/HaringDMSO_2_S4_L001_R1_001_cutadapt.fastq

step 3
#filtering rRNA, makes a fastq of sequences that do not align to rRNA
bowtie -p 10 -norc --un HaringDMSO_2_S4_L001_R1_001_no_rRNA_cutadapt.fastq ../rRNA/rRNA ../1-cutadapt/HaringDMSO_2_S4_L001_R1_001_cutadapt.fastq HaringDMSO_2_S4_L001_R1_001_rRNA.sam > log_HaringDMSO_2_S4_L001_R1_001.txt 2>&1

step 4
#cutadapt to select sequences of a certain length
#m M are a parameter
cutadapt -m 24 -M 35 -o HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt.fastq ../3-bowtie-rRNA/HaringDMSO_2_S4_L001_R1_001_no_rRNA_cutadapt.fastq
or
cutadapt -m 28 -M 32 -o HaringDMSO_2_S4_L001_R1_001_no_rRNA_28-32_cutadapt.fastq ../3-bowtie-rRNA/HaringDMSO_2_S4_L001_R1_001_no_rRNA_cutadapt.fastq

step 5
#fastqc
mkdir fastqc/HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt_fastqc
fastqc --outdir fastqc/HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt_fastqc --threads 5 --casava HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt.fastq

step 6
#tophat, copy and index
bsub -q bio -n 20 -R "rusage[mem=2000] span[hosts=1]" -J harring_DMSO_2 -o job_harring_DMSO_2.txt -e err_harring_DMSO_2_1.txt "tophat -N 1 --no-novel-juncs --library-type fr-firststrand -p 20 -o harring_DMSO_2 --GTF /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/genome ../10-cutadapt28-32/HaringDMSO_2_S4_L001_R1_001_no_rRNA_28-32_cutadapt.fastq;cp harring_DMSO_2/accepted_hits.bam /home/labs/rivkadk/Collaboration/10and12drugs/13-tophat28-32/harring_DMSO_2.bam;samtools index harring_DMSO_2.bam"

step 7
#filter alignments and index
samtools view -b -q 10 harring_DMSO_2.bam > harring_DMSO_2.filtered.bam;samtools index harring_DMSO_2.filtered.bam

step 7
#tdf
igvtools count -z 5 -w 1 harring_DMSO_2.filtered.bam harring_DMSO_2.filtered.tdf ../5-tophat24-35/hg38.chrom.sizes

step 8
#macs (not macs2!)
macs -w --call-subpeaks -t ../5-tophat24-35/harring_DMSO_2.bam -c ../5-tophat24-35/CHX_DMSO_2.bam -f BAM -g hs --keep-dup all -n macs_harring_DMSO_2_vs_CHX_DMSO_2 --nomodel --shiftsize=1

step 9
#shifting 12 bases and extending the the summit to 3 bases peak according to gene orientation
sortBed -i macs_harring_DMSO_2_vs_CHX_DMSO_2_summits.bed | intersectBed -wao -a - -b /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf | cut -f1-5,12 | uniq | awk 'BEGIN {OFS="\t"}; {if($6~"-" ) print $1,$2-13,$3-11,$4,$5,$6,$7 ; else if ($6~"+" ) print $1,$2+11,$3+13,$4,$5,$6,$7 }' > macs_harring_DMSO_2_vs_CHX_DMSO_2_summits_mv13_by_strand.bed

step 10
#htseq counting 5UTR and CDS counts
htseq-count ../13-tophat28-32/harring_DMSO_2.filtered.bam -f bam -s yes -t exon -m intersection-nonempty -i gene_id hg38_cds_UTR_without_overlap.gtf > htseq_harring_DMSO_2.filtered.txt

#need to merge all htseq counts to a file and add header
ls -1 htseqfiltered.txt > merged.htseq
awk 'NF > 1{ a[$1] = a[$1]"\t"$2} END {for( i in a ) print i a[i]}' htseqfiltered.txt >> merged.htseq

#END

###########
Downloading rRNA

You can find the intervals using the UCSC Table browser. For this, you go to

http://genome.ucsc.edu/cgi-bin/hgTables

and then set group:all tables, table:rmsk, and filter to "repClass (does match) rRNA"

then output it as a GTF file. Voila! Works for both mouse and human.

preparing files for htseq UTR and CDS counts

for hg38 (ran the below in /home/labs/rivkadk/Collaboration/10and12drugs/30_hg38_CDS_UTR_gtf
R script part-

First, import the GTF-file that you have also used as input for htseq-count

library(GenomicFeatures)
library(dplyr)
setwd("/home/labs/rivkadk/Collaboration/10and12drugs/9-UTR-counts")

txdb <- makeTxDbFromGFF("/shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf",format="gtf")

then collect the exons per gene id

exons.list.per.gene <- exonsBy(txdb,by="gene")

cds.list.per.gene <- cdsBy(txdb,by="gene")
fiveUtr.list.per.transcript <- fiveUTRsByTranscript(txdb,use.names=TRUE)
write.table(as.data.frame(fiveUtr.list.per.transcript), file="hg38_fiveUtr.per.transcript.txt",sep="\t")
write.table(as.data.frame(cds.list.per.gene), file="hg38_cds.per.gene.txt",sep="\t")

#Running the R script to get the CDS and 5UTR

awk 'BEGIN {OFS="\t"}; {print $4 , $5, $6, $3,$7,$8}' hg38_cds.per.gene.txt |sed 's/"//g' |grep -v 'start' > hg38_cds.per.gene.bed
awk 'BEGIN {OFS="\t"}; {print $4 , $5, $6, $3,$7,$8}' hg38_fiveUtr.per.transcript.txt |sed 's/"//g' |grep -v 'start' > hg38_fiveUtr.per.transcript.bed

intersectBed -s -v -a hg38_fiveUtr.per.transcript.bed -b hg38_cds.per.gene.bed > hg38_fiveUtr.per.transcript.nointersect.gene.bed
intersectBed -s -v -a hg38_cds.per.gene.bed -b hg38_fiveUtr.per.transcript.bed > hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed

wc -l *bed
230885 hg38_cds.per.gene.bed
223456 hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed
71961 hg38_fiveUtr.per.transcript.bed
61272 hg38_fiveUtr.per.transcript.nointersect.gene.bed

intersectBed -s -a /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf -b hg38_fiveUtr.per.transcript.nointersect.gene.bed> hg38_fiveUtr.per.transcript.nointersect.gene.gtf

intersectBed -s -a /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf -b hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed > hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed.gtf

sed 's/gene_id "/gene_id "CDS-/' hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.gtf> hg38_cds.changeID.per.gene.nointersect.fiveUtr.gtf
sed 's/gene_id "/gene_id "5UTR-/' hg38_fiveUtr.per.transcript.nointersect.gene.gtf > hg38_fiveUTR.changeID.per.gene.nointersect.cds.gtf
cat hg38_cds.changeID.per.gene.nointersect.fiveUtr.gtf hg38_fiveUTR.changeID.per.gene.nointersect.cds.gtf | sort -k 1,1 -k2,2n -k3,3n > hg38_cds_UTR_without_overlap.gtf

for mm10

R script

First, import the GTF-file that you have also used as input for htseq-count

library(GenomicFeatures)
library(dplyr)
setwd("/home/labs/rivkadk/Collaboration/RPanalysis/21-CDS")

txdb <- makeTxDbFromGFF("/shareDB/iGenomes/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf",format="gtf")

then collect the exons per gene id

exons.list.per.gene <- exonsBy(txdb,by="gene")

cds.list.per.gene <- cdsBy(txdb,by="gene")
fiveUtr.list.per.transcript <- fiveUTRsByTranscript(txdb,use.names=TRUE)
write.table(as.data.frame(fiveUtr.list.per.transcript), file="fiveUtr.per.transcript.txt",sep="\t")
write.table(as.data.frame(cds.list.per.gene), file="cds.per.gene.txt",sep="\t")

intersectBed -s -a /shareDB/iGenomes/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf -b fiveUtr.per.transcript.nointersect.gene.bed> fiveUtr.per.transcript.nointersect.gene.gtf

intersectBed -s -a /shareDB/iGenomes/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf -b cds.per.gene.nointersect.fiveUtr.per.transcript.bed > cds.per.gene.nointersect.fiveUtr.per.transcript.gtf

sed 's/gene_id "/gene_id "CDS-/' ../21-CDS/cds.per.gene.nointersect.fiveUtr.per.transcript.gtf> mm10_cds.changeID.per.gene.nointersect.fiveUtr.gtf
sed 's/gene_id "/gene_id "5UTR-/' ../21-CDS/fiveUtr.per.transcript.nointersect.gene.gtf > mm10_fiveUTR.changeID.per.gene.nointersect.cds.gtf
cat mm10_cds.changeID.per.gene.nointersect.fiveUtr.gtf mm10_fiveUTR.changeID.per.gene.nointersect.cds.gtf | sort -k 1,1 -k2,2n -k3,3n > mm10_cds_UTR_without_overlap.gtf