Ribo-Seq Flow

The pipeline will go from sequences to htseq counts on UTR and CDS, and will find the peaks and shift them.

Need to have a genome from igenomes bowtie2index , genome gtf file, rRNA indexed for bowtie (descibed below how to have such a file) and gtf file of UTR and CDS explained below how to prepare.
and chromosome size file such as hg38.chrom.sizes
rRNA indexed for bowtie
human U:\10and12drugs\rRNA
mouse U:\RPanalysis\rRNA

module load IGVTools/2.3.26
module load jre
module load samtools
module load fastqc
module load tophat
module load bowtie
module load bedtools
module load python/bio-2.7.13 (contains macs)

This pipline is for human
sample name HaringDMSO_2_S4_L001_R1_001

step 1
#cutadapt
cutadapt -a CTGTAGGCACCATCAATAGATCGGAAGAGCACACGTCTGAACTCCAGTCAC --discard-untrimmed --times 1 -u 1 -m 20 -M 50 -o HaringDMSO_2_S4_L001_R1_001_cutadapt.fastq ../HarringtoninFP_Drug_10_12/Fastq_RF_Drugs/HaringDMSO_2_S4_L001_R1_001.fastq.gz > log_HaringDMSO_2_S4_L001_R1_001 2>&1

step 2
#fastqc
mkdir HaringDMSO_2_S4_L001_R1_001
fastqc --outdir HaringDMSO_2_S4_L001_R1_001 --threads 5 --casava ../1-cutadapt/HaringDMSO_2_S4_L001_R1_001_cutadapt.fastq

step 3
#filtering rRNA, makes a fastq of sequences that do not align to rRNA
bowtie -p 10 -norc --un HaringDMSO_2_S4_L001_R1_001_no_rRNA_cutadapt.fastq ../rRNA/rRNA ../1-cutadapt/HaringDMSO_2_S4_L001_R1_001_cutadapt.fastq HaringDMSO_2_S4_L001_R1_001_rRNA.sam > log_HaringDMSO_2_S4_L001_R1_001.txt 2>&1

step 4
#cutadapt to select sequences of a certain length
#m M are a parameter
cutadapt -m 24 -M 35 -o HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt.fastq ../3-bowtie-rRNA/HaringDMSO_2_S4_L001_R1_001_no_rRNA_cutadapt.fastq
or
cutadapt -m 28 -M 32 -o HaringDMSO_2_S4_L001_R1_001_no_rRNA_28-32_cutadapt.fastq ../3-bowtie-rRNA/HaringDMSO_2_S4_L001_R1_001_no_rRNA_cutadapt.fastq

step 5
#fastqc
mkdir fastqc/HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt_fastqc
fastqc --outdir fastqc/HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt_fastqc --threads 5 --casava HaringDMSO_2_S4_L001_R1_001_no_rRNA_24-35_cutadapt.fastq

step 6
#tophat, copy and index
bsub -q bio -n 20 -R "rusage[mem=2000] span[hosts=1]" -J harring_DMSO_2 -o job_harring_DMSO_2.txt -e err_harring_DMSO_2_1.txt "tophat -N 1 --no-novel-juncs --library-type fr-firststrand -p 20 -o harring_DMSO_2 --GTF /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/genome ../10-cutadapt28-32/HaringDMSO_2_S4_L001_R1_001_no_rRNA_28-32_cutadapt.fastq;cp harring_DMSO_2/accepted_hits.bam /home/labs/rivkadk/Collaboration/10and12drugs/13-tophat28-32/harring_DMSO_2.bam;samtools index harring_DMSO_2.bam"

step 7
#filter alignments and index
samtools view -b -q 10 harring_DMSO_2.bam > harring_DMSO_2.filtered.bam;samtools index harring_DMSO_2.filtered.bam

step 7
#tdf
igvtools count -z 5 -w 1 harring_DMSO_2.filtered.bam harring_DMSO_2.filtered.tdf ../5-tophat24-35/hg38.chrom.sizes

step 8
#macs (not macs2!)
macs -w --call-subpeaks -t ../5-tophat24-35/harring_DMSO_2.bam -c ../5-tophat24-35/CHX_DMSO_2.bam -f BAM -g hs --keep-dup all -n macs_harring_DMSO_2_vs_CHX_DMSO_2 --nomodel --shiftsize=1

step 9
#shifting 12 bases and extending the the summit to 3 bases peak according to gene orientation
sortBed -i macs_harring_DMSO_2_vs_CHX_DMSO_2_summits.bed | intersectBed -wao -a - -b /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf | cut -f1-5,12 | uniq | awk 'BEGIN {OFS="\t"}; {if($6~"-" ) print $1,$2-13,$3-11,$4,$5,$6,$7 ; else if ($6~"+" ) print $1,$2+11,$3+13,$4,$5,$6,$7 }' > macs_harring_DMSO_2_vs_CHX_DMSO_2_summits_mv13_by_strand.bed

step 10
#htseq counting 5UTR and CDS counts
htseq-count ../13-tophat28-32/harring_DMSO_2.filtered.bam -f bam -s yes -t exon -m intersection-nonempty -i gene_id hg38_cds_UTR_without_overlap.gtf > htseq_harring_DMSO_2.filtered.txt

#need to merge all htseq counts to a file and add header
ls -1 htseqfiltered.txt > merged.htseq
awk 'NF > 1{ a[$1] = a[$1]"\t"$2} END {for( i in a ) print i a[i]}' htseqfiltered.txt >> merged.htseq

#END

###########
Downloading rRNA

You can find the intervals using the UCSC Table browser. For this, you go to

http://genome.ucsc.edu/cgi-bin/hgTables

and then set group:all tables, table:rmsk, and filter to "repClass (does match) rRNA"

then output it as a GTF file. Voila! Works for both mouse and human.

preparing files for htseq UTR and CDS counts

for hg38 (ran the below in /home/labs/rivkadk/Collaboration/10and12drugs/30_hg38_CDS_UTR_gtf
R script part-

First, import the GTF-file that you have also used as input for htseq-count

library(GenomicFeatures)
library(dplyr)
setwd("/home/labs/rivkadk/Collaboration/10and12drugs/9-UTR-counts")

txdb <- makeTxDbFromGFF("/shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf",format="gtf")

then collect the exons per gene id

exons.list.per.gene <- exonsBy(txdb,by="gene")

cds.list.per.gene <- cdsBy(txdb,by="gene")
fiveUtr.list.per.transcript <- fiveUTRsByTranscript(txdb,use.names=TRUE)
write.table(as.data.frame(fiveUtr.list.per.transcript), file="hg38_fiveUtr.per.transcript.txt",sep="\t")
write.table(as.data.frame(cds.list.per.gene), file="hg38_cds.per.gene.txt",sep="\t")

#Running the R script to get the CDS and 5UTR

awk 'BEGIN {OFS="\t"}; {print $4 , $5, $6, $3,$7,$8}' hg38_cds.per.gene.txt |sed 's/"//g' |grep -v 'start' > hg38_cds.per.gene.bed
awk 'BEGIN {OFS="\t"}; {print $4 , $5, $6, $3,$7,$8}' hg38_fiveUtr.per.transcript.txt |sed 's/"//g' |grep -v 'start' > hg38_fiveUtr.per.transcript.bed

intersectBed -s -v -a hg38_fiveUtr.per.transcript.bed -b hg38_cds.per.gene.bed > hg38_fiveUtr.per.transcript.nointersect.gene.bed
intersectBed -s -v -a hg38_cds.per.gene.bed -b hg38_fiveUtr.per.transcript.bed > hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed

wc -l *bed
230885 hg38_cds.per.gene.bed
223456 hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed
71961 hg38_fiveUtr.per.transcript.bed
61272 hg38_fiveUtr.per.transcript.nointersect.gene.bed

intersectBed -s -a /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf -b hg38_fiveUtr.per.transcript.nointersect.gene.bed> hg38_fiveUtr.per.transcript.nointersect.gene.gtf

intersectBed -s -a /shareDB/iGenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf -b hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed > hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.bed.gtf

sed 's/gene_id "/gene_id "CDS-/' hg38_cds.per.gene.nointersect.fiveUtr.per.transcript.gtf> hg38_cds.changeID.per.gene.nointersect.fiveUtr.gtf
sed 's/gene_id "/gene_id "5UTR-/' hg38_fiveUtr.per.transcript.nointersect.gene.gtf > hg38_fiveUTR.changeID.per.gene.nointersect.cds.gtf
cat hg38_cds.changeID.per.gene.nointersect.fiveUtr.gtf hg38_fiveUTR.changeID.per.gene.nointersect.cds.gtf | sort -k 1,1 -k2,2n -k3,3n > hg38_cds_UTR_without_overlap.gtf

for mm10

R script

First, import the GTF-file that you have also used as input for htseq-count

library(GenomicFeatures)
library(dplyr)
setwd("/home/labs/rivkadk/Collaboration/RPanalysis/21-CDS")

txdb <- makeTxDbFromGFF("/shareDB/iGenomes/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf",format="gtf")

then collect the exons per gene id

exons.list.per.gene <- exonsBy(txdb,by="gene")

cds.list.per.gene <- cdsBy(txdb,by="gene")
fiveUtr.list.per.transcript <- fiveUTRsByTranscript(txdb,use.names=TRUE)
write.table(as.data.frame(fiveUtr.list.per.transcript), file="fiveUtr.per.transcript.txt",sep="\t")
write.table(as.data.frame(cds.list.per.gene), file="cds.per.gene.txt",sep="\t")

intersectBed -s -a /shareDB/iGenomes/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf -b fiveUtr.per.transcript.nointersect.gene.bed> fiveUtr.per.transcript.nointersect.gene.gtf

intersectBed -s -a /shareDB/iGenomes/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf -b cds.per.gene.nointersect.fiveUtr.per.transcript.bed > cds.per.gene.nointersect.fiveUtr.per.transcript.gtf

sed 's/gene_id "/gene_id "CDS-/' ../21-CDS/cds.per.gene.nointersect.fiveUtr.per.transcript.gtf> mm10_cds.changeID.per.gene.nointersect.fiveUtr.gtf
sed 's/gene_id "/gene_id "5UTR-/' ../21-CDS/fiveUtr.per.transcript.nointersect.gene.gtf > mm10_fiveUTR.changeID.per.gene.nointersect.cds.gtf
cat mm10_cds.changeID.per.gene.nointersect.fiveUtr.gtf mm10_fiveUTR.changeID.per.gene.nointersect.cds.gtf | sort -k 1,1 -k2,2n -k3,3n > mm10_cds_UTR_without_overlap.gtf

Bioinformatics public

Ribo-Seq Flow

Analytics

Related content