bash scripts for Lexogen QuantSeq 3prime UTR mRNA-Seq

2024/01/12

Set up the files

cd ${working_dir}
mkdir -p R_analysis  raw_data  results scripts  trim_data  workflow
cd R_analysis 
mkdir -p main_analysis data

FASTQC

#!/bin/bash
#SBATCH --job-name=FASTQC-job
#SBATCH -N 1
#SBATCH --mem=3G
#SBATCH --cpus-per-task=10
#SBATCH -t 5:00:00
#SBATCH --mail-user=antonio.ahn@petermac.org
#SBATCH --mail-type=end
#SBATCH --output=FASTQC_raw-%j.out
#SBATCH --error=FASTQC_raw-%j.err

# saved to fastqc.sh at ${working_dir}/scripts

module load fastqc/0.11.5
module load multiqc/1.8

set -xe

# make a variable for the path where data is
raw_data_dir=/pipeline/Archives/NextSeq/231130_VH01624_45_AACF3KNHV/ProjectFolders/Project_Christabella-Mahendra
fastqc_out_dir="${working_dir}/raw_data"

cd $raw_data_dir

# make a variable for the path where output will be stored
mkdir -p $fastqc_out_dir/fastqc_out

# Run fastqc
for i in *R1_001.fastq.gz;
do
(cd "$i" && fastqc -t 9 -o $fastqc_out_dir/fastqc_out *_001.fastq.gz) ;
done

cd $fastqc_out_dir/fastqc_out
multiqc --filename multiqc_RNAseq *fastqc.zip

trim script

#!/bin/bash
#SBATCH --job-name=trimfastq-job
#SBATCH -N 1
#SBATCH --mem=12G
#SBATCH --cpus-per-task=15
#SBATCH -t 12:00:00
#SBATCH --mail-user=antonio.ahn@petermac.org
#SBATCH --mail-type=end
#SBATCH --output=Trim-%j.out
#SBATCH --error=Trim-%j.err

# saved as trim.sh at ${working_dir}/scripts

# load module
module load fastqc/0.11.5
module load multiqc/1.8

raw_data_dir="/pipeline/Archives/NextSeq/231130_VH01624_45_AACF3KNHV/ProjectFolders/Project_Christabella-Mahendra"
trim_output="${working_dir}/trim_data"
rawdata_fastqc_dir="${working_dir}/raw_data/fastqc_out"

mkdir -p $trim_output
mkdir -p $trim_output/fastqc_out_trim
cd $raw_data_dir

set -xe


for infile in *; 
do
(cd "$infile" && for i in *_R1_001.fastq.gz; 

do
base=`basename ${i} _R1_001.fastq.gz`
echo "sample name is ${base}"
/researchers/antonio.ahn/tools/bbmap/bbduk.sh in=${i} out=$trim_output/${base}_R1.trim.fastq.gz ref=/researchers/antonio.ahn/tools/bbmap/resources/truseq.fa.gz \
ktrim=r k=23 mink=11 hdist=1 tpe tbo qtrim=r trimq=5 minlen=20 \
stats=$trim_output/${base}_bbduk.trimstats.txt refstats=$trim_output/${base}_bbduk.refstats.txt &> $trim_output/${base}_bbduk.stdout_stats.txt;

done);

done


cd $trim_output

# perform fastqc
fastqc -t 14 -o fastqc_out_trim *.fastq.gz

# perform multiqc
cd $trim_output/fastqc_out_trim
multiqc --filename fastqc_out_trim *_fastqc.zip \
$rawdata_fastqc_dir/*_fastqc.zip \
$trim_output/*.txt

STAR

#!/bin/bash
#SBATCH --job-name=STARalign-job
#SBATCH -N 1
#SBATCH --mem-per-cpu=15G
#SBATCH --cpus-per-task=15
#SBATCH -t 24:00:00
#SBATCH --mail-user=antonio.ahn@petermac.org
#SBATCH --mail-type=end
#SBATCH --output=STARalign-%j.out
#SBATCH --error=STARalign-%j.err

# script saved to STARalign.sh at ${working_dir}/scripts
# This script takes a fastq file of RNA-Seq data, runs STAR and outputs BAM files.

# paths
trim_data_dir="${working_dir}/trim_data"
GENOME="/researchers/antonio.ahn/refgenome/mouse/mm10_UCSC_STAR/STAR"
outputfiles="${working_dir}/results/STAR_bam_files"
cd $trim_data_dir

# load module
module load star/2.7.5b

# set -x is a debugging tool that will make bash display the command before executing it. In case of an issue with the commands in the shell script, this type of debugging lets you quickly pinpoint the step that is throwing an error. Often, tools will display the error that caused the program to stop running, so keep this in mind for times when you are running into issues where this is not available. You can turn this functionality off by saying set +x
# By default, a shell script containing a command that fails will not cause the entire shell script to exit: the shell script will just continue on to the next line. We always want errors to be loud and noticeable. This option prevents this, by terminating the script if any command exited with a nonzero exit status.
set -ex

# Unmapped reads can be output into the SAM/BAM Aligned.* file(s) with --outSAMunmapped Within option
# string: a string of desired SAM attributes, in the order desired for the output SAM. All gives NH HI AS nM NM MD jM jI MC ch

mkdir -p $outputfiles

for infile in *_R1.trim.fastq.gz;
do
base=`basename ${infile} _R1.trim.fastq.gz`
echo "Sample name is $base and fastq name is ${infile}"
STAR --runThreadN 14 \
--genomeDir $GENOME \
--readFilesIn ${infile} \
--outFileNamePrefix $outputfiles/${base} \
--outSAMtype BAM SortedByCoordinate \
--outSAMunmapped Within \
--outSAMattributes All \
--readFilesCommand zcat;
done

fcounts

#!/bin/bash
#SBATCH --job-name=fcounts-job
#SBATCH -N 1
#SBATCH --mem=12G
#SBATCH --cpus-per-task=12
#SBATCH -t 12:00:00
#SBATCH --mail-user=antonio.ahn@petermac.org
#SBATCH --mail-type=end
#SBATCH --output=fcounts-%j.out
#SBATCH --error=fcounts-%j.err

# script saved to fcounts_STAR.sh at ${working_dir}/scripts
# script to obtain counts

# set paths
bamfiles="${working_dir}/results/STAR_bam_files"
GENOME="/researchers/antonio.ahn/refgenome/mouse/mm10_UCSC_STAR"
FC="/home/aahn/tools/featurecounts/subread-2.0.1-Linux-x86_64/bin"

mkdir -p $bamfiles/featurecounts_unstranded
mkdir -p $bamfiles/featurecounts_stranded_1
mkdir -p $bamfiles/featurecounts_stranded_2

:
# -T Number of the threads
# -p If specified, fragments (or templates) will be counted instead of reads. This option is only applicable for paired-end reads.
# -s Perform strand-specific read counting. Possible values: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). 0 by default.
# -t Specify feature type in GTF annotation. `exon' by default. Features used for read counting will be extracted from annotation using the provided value
# -g Specify attribute type in GTF annotation. `gene_id' by default. Meta-features used for read counting will be extracted from annotation using the provided value.


$FC/featureCounts -T 11 -s 0 -t exon -g gene_id \
-a $GENOME/gencode.vM25.basic.annotation.gtf \
-o $bamfiles/featurecounts_unstranded/featurecounts_unstranded.txt \
$bamfiles/*sortedByCoord.out.bam

$FC/featureCounts -T 11 -s 1 -t exon -g gene_id \
-a $GENOME/gencode.vM25.basic.annotation.gtf \
-o $bamfiles/featurecounts_stranded_1/featurecounts_stranded_1.txt \
$bamfiles/*sortedByCoord.out.bam

$FC/featureCounts -T 11 -s 2 -t exon -g gene_id \
-a $GENOME/gencode.vM25.basic.annotation.gtf \
-o $bamfiles/featurecounts_stranded_2/featurecounts_stranded_2.txt \
$bamfiles/*sortedByCoord.out.bam

multiQC

#!/bin/bash
#SBATCH --job-name=MultiQC-job
#SBATCH -N 1
#SBATCH --mem=5G
#SBATCH --cpus-per-task=5
#SBATCH -t 10:00:00
#SBATCH --mail-user=antonio.ahn@petermac.org
#SBATCH --mail-type=end
#SBATCH --output=MultiQC-%j.out
#SBATCH --error=MultiQC-%j.err

# saved as multiQC.sh at ${working_dir}/scripts

STAR_dir="${working_dir}/results/STAR_bam_files"
fastqc_dir="${working_dir}/trim_data"
output_dir="${working_dir}/results/multiqc"
fcounts_dir="${STAR_dir}/featurecounts_stranded_1"

mkdir -p $output_dir

module load multiqc/1.8

set -xe

cd $output_dir

multiqc --filename multiqc_summary \
$fastqc_dir/fastqc_out_trim/*.trim_fastqc.zip \
$fastqc_dir/*_stats.txt \
$STAR_dir/*Log.final.out \
$fcounts_dir/*txt.summary

generate bigwig files

#!/bin/bash
#SBATCH --job-name=bigwig-job
#SBATCH -N 1
#SBATCH --mem=5G
#SBATCH --cpus-per-task=15
#SBATCH -t 12:00:00
#SBATCH --mail-user=antonio.ahn@petermac.org
#SBATCH --mail-type=end
#SBATCH --output=bigwig-%j.out
#SBATCH --error=bigwig-%j.err

# script saved to bigwig.sh at ${working_dir}/scripts

module load deeptools/3.5.0
module load samtools/1.9


set -xe

# set paths
bamfiles="${working_dir}/results/STAR_bam_files"
bw_output=${working_dir}/results/STAR_bam_files/bigwig

cd $bamfiles

for bam in *Aligned.sortedByCoord.out.bam;
do

base=$(basename ${bam} Aligned.sortedByCoord.out.bam)
echo "sample name is ${base}"
samtools index -@ 14 $bam
bamCoverage -b $bam \
-o $bw_output/${base}.bw \
--normalizeUsing RPKM \
-p 14;

done

subread (optional)

build index for subread

#!/bin/bash
#SBATCH --job-name=index-job
#SBATCH -N 1
#SBATCH --mem=12G
#SBATCH --cpus-per-task=12
#SBATCH -t 12:00:00
#SBATCH --mail-user=antonio.ahn@petermac.org
#SBATCH --mail-type=end
#SBATCH --output=index-%j.out
#SBATCH --error=index-%j.err

# script saved to subread_index.sh at ${working_dir}/scripts
# script to obtain counts

# set paths
bamfiles="${working_dir}/results/bamfiles"
GENOME="/researchers/antonio.ahn/refgenome/mouse/mm10_UCSC_subread"
FC="/home/aahn/tools/featurecounts/subread-2.0.1-Linux-x86_64/bin"


cd $GEOME

$FC/subread-buildindex -o index mm10.fa

(optional)

#!/bin/bash
#SBATCH --job-name=subread-job
#SBATCH -N 1
#SBATCH --mem=12G
#SBATCH --cpus-per-task=12
#SBATCH -t 12:00:00
#SBATCH --mail-user=antonio.ahn@petermac.org
#SBATCH --mail-type=end
#SBATCH --output=subread-%j.out
#SBATCH --error=subread-%j.err

# script saved to subread_align.sh at ${working_dir}/scripts
# script to obtain counts

# set paths
input_dir="${working_dir}/trim_data"
output_dir="${working_dir}/results/subread_bam_files"
GENOME_index_dir="/researchers/antonio.ahn/refgenome/mouse/mm10_UCSC_subread"
FC="/home/aahn/tools/featurecounts/subread-2.0.1-Linux-x86_64/bin"

mkdir -p $output_dir

cd $input_dir

for infile in *_R1.trim.fastq.gz;
do
base=`basename ${infile} _R1.trim.fastq.gz`
echo "Sample name is $base and fastq name is ${infile}"
${FC}/subread-align -t 0 -T 11 -i $GENOME_index_dir/index -r $infile -o ${output_dir}/${base}.bam;
done
#!/bin/bash
#SBATCH --job-name=fcounts-job
#SBATCH -N 1
#SBATCH --mem=12G
#SBATCH --cpus-per-task=12
#SBATCH -t 12:00:00
#SBATCH --mail-user=antonio.ahn@petermac.org
#SBATCH --mail-type=end
#SBATCH --output=fcounts-%j.out
#SBATCH --error=fcounts-%j.err

# script saved to subread_fcounts.sh at ${working_dir}/scripts
# script to obtain counts

# set paths
bamfiles_dir="${working_dir}/results/subread_bam_files"
annotation_dir="/home/aahn/tools/featurecounts/subread-2.0.1-Linux-x86_64/annotation"
FC="/home/aahn/tools/featurecounts/subread-2.0.1-Linux-x86_64/bin"

mkdir -p $bamfiles_dir/featurecounts_unstranded
mkdir -p $bamfiles_dir/featurecounts_stranded_1
mkdir -p $bamfiles_dir/featurecounts_stranded_2

# -T Number of the threads
# -p If specified, fragments (or templates) will be counted instead of reads. This option is only applicable for paired-end reads.
# -s Perform strand-specific read counting. Possible values: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). 0 by default.
# -t Specify feature type in GTF annotation. `exon' by default. Features used for read counting will be extracted from annotation using the provided value
# -g Specify attribute type in GTF annotation. `gene_id' by default. Meta-features used for read counting will be extracted from annotation using the provided value.


$FC/featureCounts -T 11 -p -s 0 -F SAF \
-a $annotation_dir/mm10_RefSeq_exon.txt \
-o $bamfiles_dir/featurecounts_unstranded/featurecounts_unstranded.txt \
$bamfiles_dir/*.bam

$FC/featureCounts -T 11 -p -s 1 -F SAF \
-a $annotation_dir/mm10_RefSeq_exon.txt \
-o $bamfiles_dir/featurecounts_stranded_1/featurecounts_stranded_1.txt \
$bamfiles_dir/*.bam

$FC/featureCounts -T 11 -p -s 2 -F SAF \
-a $annotation_dir/mm10_RefSeq_exon.txt \
-o $bamfiles_dir/featurecounts_stranded_2/featurecounts_stranded_2.txt \
$bamfiles_dir/*.bam

#!/bin/bash
#SBATCH --job-name=bigwig-job
#SBATCH -N 1
#SBATCH --mem=5G
#SBATCH --cpus-per-task=15
#SBATCH -t 12:00:00
#SBATCH --mail-user=antonio.ahn@petermac.org
#SBATCH --mail-type=end
#SBATCH --output=bigwig-%j.out
#SBATCH --error=bigwig-%j.err

# script saved to [bigwig sh](bigwig.sh) at ${working_dir}/scripts

module load deeptools/3.5.0
module load samtools/1.9


set -xe

# set paths
bamfiles="${working_dir}/results/subread_bam_files"
bw_output=${working_dir}/results/subread_bam_files/bigwig

cd $bamfiles

mkdir -p $bw_output

for bam in *bam;
do

base=$(basename ${bam} .bam)
echo "sample name is ${base}"
samtools sort -@ 14 $bam -o ${base}_sorted.bam
samtools index -@ 14 ${base}_sorted.bam
bamCoverage -b ${base}_sorted.bam \
-o $bw_output/${base}.bw \
--normalizeUsing RPKM \
-p 14;

done