{{{ #!/usr/bin/perl -w ####################################################################### # Copyright(c) 2011 Whitehead Institute for Biomedical Research. # All Rights Reserved # # Author: Inmaculada Barrasa # Bioinformatics and Research Computing # wibr-bioinformatics@wi.mit.edu # # Version: 1.1 # # Comment: This program produces the input files needed to run FIRE from the command line: # 1. Reads several files with sequences # 2. Adds a unique identifier to each sequence so FIRE thinks that they are all different # 3. Makes a file with all the sequences and a file with all the sequences identifiers and the group they belong to. # # ####################################################################### use strict; use Getopt::Std; my (%opts, $filea, $fileb, $filec, $filed, $filee, $filef, $fileg, $fileh, $filei, $filej, $name); getopts('a:b:c:d:e:f:g:h:i:j:n:',\%opts); my $time = time(); my %files; if ( $opts{'n'} ) { $name = $opts{'n'}; print STDERR "Name: $name\n";} if ( $opts{'a'} ) { $filea = $opts{'a'}; print STDERR "FileA: $filea\n"; $files{a} = $filea; print FH "a\t$filea\n";} if ( $opts{'b'} ) { $fileb = $opts{'b'}; print STDERR "FileB: $fileb\n"; $files{b} = $fileb; print FH "b\t$fileb\n";} if ( $opts{'c'} ) { $filec = $opts{'c'}; print STDERR "FileC: $filec\n"; $files{c} = $filec; print FH "c\t$filec\n";} if ( $opts{'d'} ) { $filed = $opts{'d'}; print STDERR "FileD: $filed\n"; $files{d} = $filed; print FH "d\t$filed\n";} if ( $opts{'e'} ) { $filee = $opts{'e'}; print STDERR "FileE: $filee\n"; $files{e} = $filee; print FH "e\t$filee\n";} if ( $opts{'f'} ) { $filef = $opts{'f'}; print STDERR "FileF: $filef\n"; $files{f} = $filef; print FH "f\t$filef\n";} if ( $opts{'g'} ) { $fileg = $opts{'g'}; print STDERR "FileG: $fileg\n"; $files{g} = $fileg; print FH "g\t$fileg\n";} if ( $opts{'h'} ) { $fileh = $opts{'h'}; print STDERR "FileH: $fileh\n"; $files{h} = $fileh; print FH "h\t$fileh\n";} if ( $opts{'i'} ) { $filei = $opts{'i'}; print STDERR "FileI: $filei\n"; $files{i} = $filei; print FH "i\t$filei\n";} if ( $opts{'j'} ) { $filej = $opts{'j'}; print STDERR "FileJ: $filej\n"; $files{j} = $filej; print FH "j\t$filej\n";} my $Instruct ="Use:\nPrepareFilesForFIRE_keepall.pl -a FileA -b FileB ... -n name\noptions \n-a to -j are used to identify the fasta files with sequences corresponding to the different groups. -n Name of the current analysis and files\n"; my $outPut = "Output files are:\n\"AllSequences.txt:\" is a fasta file with all the sequences.\n\"enrichmentFileAllSequences.txt\"has the list of sequences IDs and the groups they belong to\n"; if (!keys %opts) {print STDERR "$Instruct\n$outPut\n";} if (!$name) {die;} open (FH, ">log.$name.$time") || die "can not open log.$name.$time\n"; my %key2numb; $key2numb{a} = 0; $key2numb{b} = 1; $key2numb{c} = 2; $key2numb{d} = 3; $key2numb{e} = 4; $key2numb{f} = 5; $key2numb{g} = 6; $key2numb{h} = 7; $key2numb{i} = 8; $key2numb{j} = 9; my %regions; my %repSeqs; my %uniqueSeqs; my %AllSeqs; for (sort (keys %files)){ my $key = $_; my $file = $files{$key}; readFile($key, $file); } open (FH2, ">RepSeqs.$name.txt") || die "can not open RepSeqs.$name.txt\n"; for (sort keys %repSeqs) { my $ID = $_; print FH2 "$ID\t"; my @array = @{$repSeqs{$ID}}; my $seq = shift @array; for (@array) { my $key = $_; print FH2 "$files{$key}\t" } print FH2 "\n$seq\n"; } close FH2; open (FH3, ">UniqueSeqs.$name.txt") || die "can not open UniqueSeqs.txt\n"; open (FH4, ">enrichmentFile.$name.txt") || die "can not open enrichmentFile.txt\n"; for(sort keys %uniqueSeqs) { my $ID = $_; print FH3 "$ID\n"; my @arr = @{$uniqueSeqs{$ID}}; print FH3 "$arr[0]\n"; print FH4 "$ID\t$arr[1]\n"; } close FH3; close FH4; open (FH3, ">UniquePlusRepSeqs.$name.txt") || die "can not open UniqueSeqs.txt\n"; open (FH4, ">enrichmentFileUniqPlusRep.$name.txt") || die "can not open enrichmentFile.txt\n"; for(sort keys %uniqueSeqs) { my $ID = $_; print FH3 "$ID\n"; my @arr = @{$uniqueSeqs{$ID}}; print FH3 "$arr[0]\n"; print FH4 "$ID\t$arr[1]\n"; } for(sort keys %repSeqs) { my $ID = $_; print FH3 "$ID\n"; my @arr = @{$repSeqs{$ID}}; print FH3 "$arr[0]\n"; print FH4 "$ID\tM\n"; } open (FH5, ">AllSequences.$name.txt") || die "can not open AllSequences.$name.txt\n"; open (FH6, ">enrichmentFileAllSequences.$name.txt") || die "can not open enrichmentFile.txt\n"; for (sort keys %AllSeqs) { my $ID = $_; print FH5 "$ID\n"; my @arr = @{$AllSeqs{$ID}}; print FH5 "$arr[0]\n"; $ID =~ s/\>//; print FH6 "$ID\t$key2numb{$arr[1]}\n"; } sub readFile { my $key = $_[0]; my $file = $_[1]; my $seq = "N"; my $ID = "N"; open (FH1, "$file") || die "can not open $file\n"; my @lanes = ; for (@lanes) { chomp;} for (@lanes) { my $lane = $_; if ($lane =~ /\>/) { checkRedundant($ID, $seq, $key); $ID = $lane; $seq = ""; } else { $seq = $seq.$_; } } checkRedundant($ID, $seq, $key); close FH1; } sub checkRedundant{ my ($ID, $seq, $key) = @_; if ($repSeqs{$ID}) {push @{$repSeqs{$ID}}, $key;} elsif ($uniqueSeqs{$ID}) { push @{$repSeqs{$ID}}, @{$uniqueSeqs{$ID}}, $key; delete $uniqueSeqs{$ID}; } else { @{$uniqueSeqs{$ID}} = ($seq, $key); } #Make a new array with all sequences with unique identifiers even if some of the seqs are repeted on different groups my $ID2 = $ID."_".$key; @{$AllSeqs{$ID2}} = ($seq, $key); } }}}