| 1 | |
| 2 | == Creating an analysis pipeline of compressed files == |
| 3 | |
| 4 | Almost all steps of a short-read analysis pipeline can be created with gzipped files to save storage space. For example, |
| 5 | |
| 6 | # Untar from solexa_public, gunzip, and gzip \\ |
| 7 | tar -xOzf /lab/solexa_public/LAB/RUN/QualityScore/Foo.1.tar.gz | gzip -f > fastq/My_sample_2.1.fq.gz \\ |
| 8 | tar -xOzf /lab/solexa_public/LAB/RUN/QualityScore/Foo.2.tar.gz | gzip -f > fastq/My_sample_2.2.fq.gz \\ |
| 9 | |
| 10 | # Run FastQC \\ |
| 11 | fastqc fastq/My_sample_2.1.fq.gz \\ |
| 12 | fastqc fastq/My_sample_2.2.fq.gz \\ |
| 13 | |
| 14 | # Trim adapters \\ |
| 15 | gunzip -c fastq/My_sample_2.1.fq.gz | fastx_clipper -v -z -l 30 -a GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG -o fastq/My_sample_2.noVec.1.fq.gz \\ |
| 16 | gunzip -c fastq/My_sample_2.2.fq.gz | fastx_clipper -v -z -l 30 -a GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG -o fastq/My_sample_2.noVec.2.fq.gz \\ |
| 17 | fastqc fastq/My_sample_2.noVec.1.fq.gz \\ |
| 18 | fastqc fastq/My_sample_2.noVec.2.fq.gz \\ |
| 19 | |
| 20 | # Drop reads of low quality \\ |
| 21 | gunzip -c fastq/My_sample_2.noVec.1.fq.gz | fastq_quality_filter -v -q 20 -p 75 -z -o fastq/My_sample_2.noVec.qgt20.1.fq.gz \\ |
| 22 | gunzip -c fastq/My_sample_2.noVec.2.fq.gz | fastq_quality_filter -v -q 20 -p 75 -z -o fastq/My_sample_2.noVec.qgt20.2.fq.gz \\ |
| 23 | |
| 24 | # Trim low-quality bases at end of reads \\ |
| 25 | gunzip -c fastq/My_sample_2.noVec.qgt20.1.fq.gz | fastq_quality_trimmer -v -t 20 -l 30 -z -o fastq/My_sample_2.noVec.qgt20.t.1.fq.gz \\ |
| 26 | gunzip -c fastq/My_sample_2.noVec.qgt20.2.fq.gz | fastq_quality_trimmer -v -t 20 -l 30 -z -o fastq/My_sample_2.noVec.qgt20.t.2.fq.gz \\ |
| 27 | |
| 28 | # After trimming, get reads that are still paired (and output as regular fastq) \\ |
| 29 | /nfs/BaRC_Public/BaRC_code/Perl/cmpfastq/cmpfastqgz.pl fastq/My_sample_2.noVec.qgt20.t.1.fq.gz fastq/My_sample_2.noVec.qgt20.t.2.fq.gz |
| 30 | |
| 31 | # Map paired reads with bowtie \\ |
| 32 | # THIS STEP CANNOT USE GZ FILES \\ |
| 33 | bowtie -l 30 -n 1 -X 1000 --best -S /nfs/genomes/a.thaliana_TAIR_10/bowtie/tair10_SI -1 fastq/My_sample_2.noVec.qgt20.t.1.fq-common.out -2 fastq/My_sample_2.noVec.qgt20.t.2.fq-common.out mapped_reads/My_sample_2.30.1.sam |
| 34 | |
| 35 | # Sort and index mapped reads \\ |
| 36 | /nfs/BaRC_Public/BaRC_code/Perl/SAM_to_BAM_sort_index/SAM_to_BAM_sort_index.pl mapped_reads/My_sample_2.30.1.sam \\ |
| 37 | # Delete SAM file (since BAM file has all the same info) \\ |
| 38 | rm -f mapped_reads/My_sample_2.30.1.sam \\ |
| 39 | |
| 40 | # Get summary counts \\ |
| 41 | zcat fastq/My_sample_2.1.fq.gz | echo $((`wc -l`/4)) \\ |
| 42 | zcat fastq/My_sample_2.noVec.1.fq.gz | echo $((`wc -l`/4)) \\ |
| 43 | zcat fastq/My_sample_2.noVec.2.fq.gz | echo $((`wc -l`/4)) \\ |
| 44 | zcat fastq/My_sample_2.noVec.qgt20.1.fq.gz | echo $((`wc -l`/4)) \\ |
| 45 | zcat fastq/My_sample_2.noVec.qgt20.2.fq.gz | echo $((`wc -l`/4)) \\ |
| 46 | more fastq/My_sample_2.noVec.qgt20.t.1.fq | echo $((`wc -l`/4)) \\ |
| 47 | more fastq/My_sample_2.noVec.qgt20.t.2.fq | echo $((`wc -l`/4)) \\ |
| 48 | zmore fastq/My_sample_2.noVec.qgt20.t.1.fq-common.out.gz | echo $((`wc -l`/4)) \\ |
| 49 | samtools flagstat mapped_reads/My_sample_2.30.1.sorted.bam > mapped_reads/My_sample_2.30.1.flagstat.txt \\ |
| 50 | grep ' mapped (' mapped_reads/My_sample_2.30.1.flagstat.txt |
| 51 | |
| 52 | |