wiki:SOPs/PrepareFilesForFIRE_keepall.txt
#!/usr/bin/perl -w

#######################################################################
# Copyright(c) 2011 Whitehead Institute for Biomedical Research.
#              All Rights Reserved
#
# Author: Inmaculada Barrasa
#         Bioinformatics and Research Computing
#         wibr-bioinformatics@wi.mit.edu
#
# Version: 1.1
#
# Comment: This program produces the input files needed to run FIRE from the command line:
#    1. Reads several files with sequences
#    2. Adds a unique identifier to each sequence so FIRE thinks that they are all different
#    3. Makes a file with all the sequences and a file with all the sequences identifiers and the group they belong to.
#
#
#######################################################################






use strict;
use Getopt::Std;

my (%opts,  $filea, $fileb,  $filec,  $filed,  $filee,  $filef,  $fileg,  $fileh,  $filei,  $filej, $name);
getopts('a:b:c:d:e:f:g:h:i:j:n:',\%opts);
my $time = time();

my %files;


if ( $opts{'n'} ) { $name = $opts{'n'}; print STDERR "Name: $name\n";}





if ( $opts{'a'} ) {	$filea = $opts{'a'}; print STDERR "FileA: $filea\n"; $files{a} = $filea; print FH "a\t$filea\n";}

if ( $opts{'b'} ) { $fileb = $opts{'b'}; print STDERR "FileB: $fileb\n"; $files{b} = $fileb; print FH "b\t$fileb\n";}

if ( $opts{'c'} ) { $filec = $opts{'c'}; print STDERR "FileC: $filec\n"; $files{c} = $filec; print FH "c\t$filec\n";}

if ( $opts{'d'} ) { $filed = $opts{'d'}; print STDERR "FileD: $filed\n"; $files{d} = $filed; print FH "d\t$filed\n";}

if ( $opts{'e'} ) { $filee = $opts{'e'}; print STDERR "FileE: $filee\n"; $files{e} = $filee; print FH "e\t$filee\n";}

if ( $opts{'f'} ) { $filef = $opts{'f'}; print STDERR "FileF: $filef\n"; $files{f} = $filef; print FH "f\t$filef\n";}

if ( $opts{'g'} ) { $fileg = $opts{'g'}; print STDERR "FileG: $fileg\n"; $files{g} = $fileg; print FH "g\t$fileg\n";}

if ( $opts{'h'} ) { $fileh = $opts{'h'}; print STDERR "FileH: $fileh\n"; $files{h} = $fileh; print FH "h\t$fileh\n";}

if ( $opts{'i'} ) { $filei = $opts{'i'}; print STDERR "FileI: $filei\n"; $files{i} = $filei; print FH "i\t$filei\n";}

if ( $opts{'j'} ) { $filej = $opts{'j'}; print STDERR "FileJ: $filej\n"; $files{j} = $filej; print FH "j\t$filej\n";}

my $Instruct ="Use:\nPrepareFilesForFIRE_keepall.pl -a FileA -b FileB ... -n name\noptions \n-a to -j are used to identify the fasta files with sequences corresponding to the different groups.
-n Name of the current analysis and files\n";

my $outPut = "Output files are:\n\"AllSequences.txt:\" is a fasta file with all the sequences.\n\"enrichmentFileAllSequences.txt\"has the list of sequences IDs and the groups they belong to\n";



 if (!keys %opts) {print STDERR "$Instruct\n$outPut\n";}

if (!$name) {die;}
open (FH, ">log.$name.$time") || die "can not open log.$name.$time\n";

my %key2numb;
$key2numb{a} = 0;  $key2numb{b} = 1; $key2numb{c} = 2; $key2numb{d} = 3; $key2numb{e} = 4; $key2numb{f} = 5; $key2numb{g} = 6; 
$key2numb{h} = 7; $key2numb{i} = 8; $key2numb{j} = 9; 



my %regions;
my %repSeqs;
my %uniqueSeqs;
my %AllSeqs;

for (sort (keys %files)){
	my $key = $_;
	my $file = $files{$key};
	readFile($key, $file);
}

open (FH2, ">RepSeqs.$name.txt") || die "can not open RepSeqs.$name.txt\n"; 
 for (sort keys %repSeqs) {
 	my $ID = $_;	
 	print FH2 "$ID\t";
 	my @array = @{$repSeqs{$ID}}; 
 	my $seq = shift @array;
 	
 	for (@array) {
 		my $key = $_;
 		print FH2 "$files{$key}\t" 
 	}
 	
 	print FH2 "\n$seq\n";
 }
close FH2;

open (FH3, ">UniqueSeqs.$name.txt") ||  die "can not open UniqueSeqs.txt\n"; 
open (FH4, ">enrichmentFile.$name.txt") ||  die "can not open enrichmentFile.txt\n"; 
for(sort keys %uniqueSeqs) {
	my $ID = $_;
	print FH3 "$ID\n";
	my @arr = @{$uniqueSeqs{$ID}};
	print FH3 "$arr[0]\n";
	print FH4 "$ID\t$arr[1]\n";
}
close FH3; close FH4;

open (FH3, ">UniquePlusRepSeqs.$name.txt") ||  die "can not open UniqueSeqs.txt\n"; 
open (FH4, ">enrichmentFileUniqPlusRep.$name.txt") ||  die "can not open enrichmentFile.txt\n"; 
for(sort keys %uniqueSeqs) {
	my $ID = $_;
	print FH3 "$ID\n";
	my @arr = @{$uniqueSeqs{$ID}};
	print FH3 "$arr[0]\n";
	print FH4 "$ID\t$arr[1]\n";
}

for(sort keys  %repSeqs) {
	my $ID = $_;
	print FH3 "$ID\n";
	my @arr = @{$repSeqs{$ID}};
	print FH3 "$arr[0]\n";
	print FH4 "$ID\tM\n";
}


open (FH5, ">AllSequences.$name.txt") ||  die "can not open AllSequences.$name.txt\n"; 
open (FH6, ">enrichmentFileAllSequences.$name.txt") ||  die "can not open enrichmentFile.txt\n"; 

for (sort keys %AllSeqs) {
	my $ID = $_;
	print FH5 "$ID\n";
	my @arr = @{$AllSeqs{$ID}};
	print FH5 "$arr[0]\n";
	$ID =~ s/\>//;
	print FH6 "$ID\t$key2numb{$arr[1]}\n";
}


sub readFile {
	my $key = $_[0]; 
	my $file = $_[1];
	my $seq = "N"; my $ID = "N";
	open (FH1, "$file") || die "can  not open $file\n";
	my @lanes = <FH1>;
	for (@lanes) { chomp;}
	for (@lanes) {
		my $lane = $_;
		if ($lane =~ /\>/) {
			checkRedundant($ID, $seq, $key);
			$ID = $lane;
			$seq = "";
		} else {
			$seq = $seq.$_;
		}
	}
	checkRedundant($ID, $seq, $key);
	close FH1;
} 

sub checkRedundant{
	my ($ID, $seq, $key) = @_;
	if ($repSeqs{$ID}) {push @{$repSeqs{$ID}}, $key;}
	elsif ($uniqueSeqs{$ID}) {
		push @{$repSeqs{$ID}}, @{$uniqueSeqs{$ID}}, $key;
		delete $uniqueSeqs{$ID};
	}  else {
		@{$uniqueSeqs{$ID}} = ($seq, $key);
	}
	#Make a new array with all sequences with unique identifiers even if some of the seqs are repeted on different groups
	my $ID2 = $ID."_".$key;
	@{$AllSeqs{$ID2}} = ($seq, $key);	
	
	
}


Note: See TracWiki for help on using the wiki.