wiki:SOPs/MapRegions_mm9.pl
#!/usr/bin/perl -w


#######################################################################
# Copyright(c) 2011 Whitehead Institute for Biomedical Research.
#              All Rights Reserved
#
# Author: Inmaculada Barrasa
#         Bioinformatics and Research Computing
#         wibr-bioinformatics@wi.mit.edu
#
# Version: 1.1
#
# Comment: This program produces takes a set of mouse genomic regions (in bed like format) and maps them to genes, exons, introns, intergenic
#         using a GFF database containing mm9 annotation
#        
#        to map human regions replace mm9 with hg18 or hg19
#######################################################################




my $info =
"This script 
1. Takes a bed file like
chr	start	stop	any other columns

2. Takes a cut off distance, this defines how far upstream of the TSS we will look.

3. Maps the regions defined with  chr	start	stop to genes

The input file and cut off are added to the output file names

There are several output files

Annotation: looks like the input file, but each row has extra columns with the information about the region intragenic or intergenic, if intra it print if it falls within and exon intron, 5UTR or 3UTR if intergeninc it will look up and down the region using the cut off distance and print out the genes in those areas and their dintance to the region.
Genes:  List of genes that have a region falling within the gene body
Genes3P: List of genes that have a gene within the cut off distance from the end of the gene
Genes5P: List of genes that have a gene within the cut off distance from the TSS.

GenesExon: List of genes that have a region falling within the exon of the gene
GenesIntron: List of genes that have a region falling within the intron of the gene
Counts: this file has the number of regions that fall withing each category


PARAMETERS:
-f inputFile
-c CutOffDistance    (distance should be in bp)

Sample command:

MapRegions_mm9.pl -f INPUTFILE -c 1000

To Map regions for a human replace mm9 with hg19 in the code

"; 





use strict;
use warnings;
use Bio::Seq;
use Bio::DB::GFF;
#use Bio::Graphics; 
use Getopt::Long;
use Getopt::Std;
use DBI;





###################################Getting input

my %opts;
my $file; #f
my $debug = 0;
my ($chrom,$start,$stop);
my $cutoff;
my $cutoffFoldEnrich;
getopts('f:c:d:e:',\%opts);


if ( $opts{'f'} ) {
	$file = $opts{'f'};
	print STDERR "File: $file\n";
}


if ( $opts{'c'} ) {
	$cutoff = $opts{'c'};
	print STDERR "cutoff distance: $cutoff\n";
}

#if ( $opts{'e'} ) {
#	$cutoffFoldEnrich = $opts{'e'};
#	print STDERR "Fold enrichment: $cutoffFoldEnrich\n";
#}

if ( $opts{'d'} ) {
	$debug = $opts{'d'};
	print STDERR "debug: $debug\n";
}

if (!$file || !$cutoff) {
	print "$info";
	die;
}


###########################






my $user_entrez = "entrezgene";
my $passwd_entrez = "wibr"; 

my $user = "gffinma";
my $passwd = "Hu289maN";




my $db_path = "dbi:mysql:host=canna.wi.mit.edu:mouse_mm9";

#my $db_path = "dbi:mysql:host=canna.wi.mit.edu:scerevisiae_oct09";
my $db    = Bio::DB::GFF->new( -adaptor => 'dbi::mysql',
	                                 -dsn     => $db_path,
	                                 -user => $user,
	                                 -pass => $passwd
	                                 ) or die();
	                                
print "connection done\n";
#



###To retrieve the geneIDs and gene symbols for NM IDs connect to our entrez database
my $entrezdb_path = "dbi:mysql:host=canna.wi.mit.edu:entrez_gene";

my $entrezdb    = DBI ->connect("$entrezdb_path", $user_entrez, $passwd_entrez, {RaiseError => 1});


#select  * FROM `entrez_gene`.`gene2refseq` where `RNA_nuc_access_version`= "NM_001011874";
my $refseqIDtogeneIDQ = $entrezdb->prepare('SELECT gene_id, RNA_nuc_access_version FROM gene2refseq WHERE RNA_nuc_access_version = ?')
                or die "Couldn't prepare statement: " . $entrezdb->errstr;

my $geneIDtoSymbolQ = $entrezdb->prepare('SELECT gene_id, symbol FROM gene_info WHERE gene_id = ?')
                or die "Couldn't prepare statement: " . $entrezdb->errstr;

print "connection 2 done\n";


#From prev script
#my $db_path = "dbi:mysql:host=canna.wi.mit.edu:entrez_gene";
#
#my $dbh    = DBI ->connect("$db_path", $user, $passwd, {RaiseError => 1});
#
#my $sth = $dbh->prepare('SELECT gene_id, symbol FROM gene_info WHERE gene_id = ?')
#                or die "Couldn't prepare statement: " . $dbh->errstr;





print "pass test point\n";


open (FH, "$file") || die "I can not open $file\n";
my @Lanes = <FH>;
close FH;

my $IntraCounts = 0;
my $InterCounts = 0;
my $INTRONCounts = 0;
my $fiveUTRCounts = 0;
my $threeUTRCounts= 0;
my $CodingExonCounts = 0;

open (FHG, ">$file.$cutoff.Genes.txt") || die "can not open $file.$cutoff.Genes.txt\n";
open (FHG3P, ">$file.$cutoff.Genes3P.txt") || die "can not open $file.$cutoff.Genes3P.txt\n";
open (FHG5P, ">$file.$cutoff.Genes5P.txt") || die "$file.$cutoff.Genes3P.txt\n";

open (FHA, ">$file.$cutoff.Annotation.txt") || die "can not open $file.$cutoff.Annotation.txt\n";

open (CHECK, ">$file.$cutoff.check.txt") || die "can not open $file.$cutoff.check.txt\n";
open (FHGE, ">$file.$cutoff.GenesExon.txt") || die "can not open $file.$cutoff.GenesExon.txt\n";
open (FHGI, ">$file.$cutoff.GenesIntron.txt") || die "can not open $file.$cutoff.GenesIntron.txt\n";


open (FHC, ">$file.$cutoff.Counts.txt") || die "can not open $file.$cutoff.Counts.txt\n";

#put list of genes regulated in an array and make it unique before I print it.
my @genes;    #genesinGeneand5P;
my @genes3P;
my @genes5P;
my @genesExon_5UTR;
my @genesIntron;


for (@Lanes){	
	chomp;
	my $lane = $_;
	#print FH1 "$_\t";
	
	my @data = split (/\t/, $_);
	my $chr = $data[0];
	my $start = $data[1];
	my $end = $data[2];
	#################
	#MAP the peak
	#################
	#get the length of the chr so we are not outside
	my $chrsegment = $db->segment($chr);
	my $chrStart = $chrsegment -> start(); my $chrEnd = $chrsegment -> stop();
	#if ($data[7] > $cutoffFoldEnrich) {
		print FHA "$_\t";
		my $Up = $start - $cutoff; if ($Up < $chrStart) {$Up = $chrStart;}
		my $Down = $end + $cutoff;  if ($Down > $chrEnd) {$Down = $chrEnd;}
		
		my $DMRSegment =  $db->segment($chr,$start,$end);
		my $segmentUpstream = $db->segment($chr,$Up,$start);
		my $segmentDownstream = $db->segment($chr,$end,$Down);
		
		if ($DMRSegment) {
			#print "got segment for $chr $summit\t$Up\t$Down\n";
		
		}
		if (!$DMRSegment ||!$segmentUpstream ||!$segmentUpstream)  {print "could not retrieve all fragments for $chr, $start, $end\n"};
		my @genesDMR = $DMRSegment ->features(-type => ['gene:mm9_refGene']);
		if (@genesDMR) {
			#print FH1 "intragenic\n";
			print FHA "intragenic\t";
			$IntraCounts++;
			
			my @intronsDMR = $DMRSegment ->features(-type => ['intron:mm9_refGene']);
			my @exonsDMR = $DMRSegment ->features(-type => ['coding_exon:mm9_refGene']);
			my @threeUTRDMR = $DMRSegment ->features(-type => ['utr3_exon:mm9_refGene']);
			my @fiveUTRDMR = $DMRSegment ->features(-type => ['utr5_exon:mm9_refGene']);
			
			
			if (@exonsDMR){
				
				print FHA "coding exon\t";
				$CodingExonCounts++;
				for (@exonsDMR) {
					#my $name = $_ -> name();
					#my $exon_id = $_ -> id();
					
					my $Name = $_ -> name();
					my  $geneStart = $_ -> start();
					my  $geneEnd = $_ -> stop();
					my  $geneStrand = $_ -> strand();
					my $entrezID = getEntrezIDfromRefSeqID($Name);
					my $symbol = getSymbolfromEntrezID($entrezID);
					push (@genesExon_5UTR, $symbol);
					
#					if ($name) {
#						print FHA "$name\t";
#						print STDERR "$name\t";
#					}
#					if ($exon_id) {
#						print FHA "$exon_id\t";
#						print STDERR "$exon_id\t";
#					}
				}
			}
			elsif (@threeUTRDMR) {
				print FHA "3UTR\t";
				$threeUTRCounts++;
			}
			elsif (@fiveUTRDMR){
				print FHA "5UTR\t";
				$fiveUTRCounts++;
#				for (@fiveUTRDMR) {
#					my $Name = $_ -> name();
#					my  $geneStart = $_ -> start();
#					my  $geneEnd = $_ -> stop();
#					my  $geneStrand = $_ -> strand();
#					my $entrezID = getEntrezIDfromRefSeqID($Name);
#					my $symbol = getSymbolfromEntrezID($entrezID);
#					push (@genesExon_5UTR, $symbol);
#				}
				
			}
			elsif (@intronsDMR){
				print FHA "INTRON\t";
				$INTRONCounts++;
				
				for (@intronsDMR) {
					my $Name = $_ -> name();
					my  $geneStart = $_ -> start();
					my  $geneEnd = $_ -> stop();
					my  $geneStrand = $_ -> strand();
					my $entrezID = getEntrezIDfromRefSeqID($Name);
					my $symbol = getSymbolfromEntrezID($entrezID);
					push (@genesIntron, $symbol);
				}
				
			}
			else {print STDERR "CHECK\t$chr\t$start\t$stop";}
			
			#print gene name.
			for (@genesDMR) {
				
				
				
				
				my $Name = $_ -> name();
				my  $geneStart = $_ -> start();
				my  $geneEnd = $_ -> stop();
				my  $geneStrand = $_ -> strand();
				my $entrezID = getEntrezIDfromRefSeqID($Name);
				my $symbol = getSymbolfromEntrezID($entrezID);
#				print "$Name\t$entrezID\t$symbol\n";
				print FHA "$symbol\t";
				print FHG "$symbol\n";
				push (@genes, $symbol);
				if ($geneStrand ==1) {$geneStrand = "+";}
				elsif ($geneStrand == -1) {$geneStrand = "-";}
			
				my $lengthS = length $geneStrand;
				
				
				
				
				
				
				
				
				
				
				#print "$Name\t$entrezID\t$symbol\t$geneStart\t$geneEnd\t$geneStrand\t$lengthS\n";
			}
			
			print FHA "\n";
		} else {
			print FHA "intergenic\t";
			$InterCounts++;
			my @genesUP= $segmentUpstream  ->features(-type => ['gene:mm9_refGene']);
			my @genesDown= $segmentDownstream  ->features(-type => ['gene:mm9_refGene']);
	#		print "gene UP\n";
			if (@genesUP) {
				#get closer and find out if it is the 5' or the 3' that is the closer
				my ($symbol, $distance, $strand)=  getClosestGene ($start,@genesUP);
				my $lengthS = length $strand;
				
				print FHA "UP\t$symbol\t$distance\t$strand\t";
				
				if ($lengthS> 1) {
					#2 genesymbols are at the same distance 
					print STDERR "2 genesymbols are at the same distance\n$lane\tUP\t$symbol\t$distance\t$strand\n";	
					print CHECK "$lane\tUP\t$symbol\t$distance\t$strand\n";
					my @strands = split (//,$strand);		
					my @geneSymbols = split (/_/, $symbol);
					for (my $i = 0; $i <=$#strands; $i++ ){
						if ($strands[$i] eq "-" ) {
							if ($geneSymbols[$i]) {
								push (@genes5P, $geneSymbols[$i]);
							}
						} else {
							if ($geneSymbols[$i]) {
								push (@genes3P, $geneSymbols[$i]);
							}
						}
					}
				
				} else{
					if ($strand eq "-") { #the 5' is the closest to the peak
						#print FHG "$symbol\n";
						push (@genes5P, $symbol);
					} else {
						#print FHG3P "$symbol\n";
						push (@genes3P, $symbol);
					}
				}

			} else {
				print FHA "UP\t#\t#\t#\t";
			}
			
			
			
		#	print "gene DOWN\n";
			if (@genesDown) {
				my ($symbol, $distance, $strand)=  getClosestGene ($end,@genesDown);
				my $lengthS = length $strand;
				print FHA "DOWN\t$symbol\t$distance\t$strand\t";
				
				if ($lengthS > 1) {
					#2 genesymbols are at the same distance 
					print STDERR "2 genesymbols are at the same distance\t$lane\tDOWN\t$symbol\t$distance\t$strand\n";	
					print CHECK "$lane\tDOWN\t$symbol\t$distance\t$strand\n";
					
					my @strands = split (//,$strand);		
					my @geneSymbols = split (/_/, $symbol);
					for (my $i = 0; $i <=$#strands; $i++ ){
						if ($strands[$i] eq "+" ) {
							if ($geneSymbols[$i]) {
								push (@genes5P, $geneSymbols[$i]);
							}
						} else {
							if ($geneSymbols[$i]) {
								push (@genes3P, $geneSymbols[$i]);
							}
						}
					}
					
					
					
				} else{
					if ($strand eq "+") { #the 5' is the closest to the peak
						#print FHG "$symbol\n";
						push (@genes5P, $symbol);
					} else {
						#print FHG3P "$symbol\n";
						push (@genes3P, $symbol);
					}
				}
			}  else {
				print FHA "DOWN\t#\t#\t#\t";
			}
			print FHA "\n";
		}
	#}#if ($data[7] > $cutoffFoldEnrich) {
}	

my @genesU = makeUnique(@genes);
my @genes3PU = makeUnique(@genes3P);
my @genes5PU = makeUnique(@genes5P);
my  @genesExon_5UTR_U =  makeUnique(@genesExon_5UTR);
my @genesIntron_U = makeUnique(@genesIntron);

for (@genesU) {print FHG "$_\n";}
for (@genes3PU) {print FHG3P "$_\n";}
for (@genes5PU) {print FHG5P "$_\n";}
for (@genesExon_5UTR_U) {print FHGE "$_\n";}
for (@genesIntron_U) {print FHGI"$_\n";}

#$InterCounts;
#$IntraCounts; $fiveUTRCounts; $threeUTRCounts; $CodingExonCounts;$INTRONCounts;
print FHC "Intragenic\t$IntraCounts\nINTRON\t$INTRONCounts\nCodingExons\t$CodingExonCounts\nfiveUTR\t$fiveUTRCounts\nthreeUTR\t$threeUTRCounts\nInterGenic\t$InterCounts";



sub getEntrezIDfromRefSeqID{
	my $NM = shift;
	$refseqIDtogeneIDQ->execute($NM)
	 or die "Couldn't execute statement: " . $entrezdb->errstr;
 	my @data;
 	my $NMIDcheck=""; my $entrezID="";
		while (@data = $refseqIDtogeneIDQ->fetchrow_array()) {
	        $entrezID = $data[0];
	        $NMIDcheck = $data[1];
#	        for (@data){
#	        	print STDERR "$_\n";
#	        }
	    }
	    if ($NMIDcheck) {
			if ($NMIDcheck ne $NM) {print STDERR "check sql query\n";}
 			return $entrezID;
	    }
	
	#have to get entrez gene ID from table and then use that to get the genesymbol  
	
}


sub getSymbolfromEntrezID {
	my $entrezID = shift;
	$geneIDtoSymbolQ->execute($entrezID)             # Execute the query
        or die "Couldn't execute statement: " . $entrezdb->errstr;
 		my @data;
 		my $entrezIDcheck=""; my $symbol="";
		while (@data = $geneIDtoSymbolQ->fetchrow_array()) {
	        $entrezIDcheck = $data[0];
	        $symbol = $data[1];
#	        for (@data){
#	        	print STDERR "$_\n";
#	        }
	    }
	    if ($entrezIDcheck) {
			if ($entrezIDcheck != $entrezID) {print STDERR "check sql query\n";}
 			return $symbol;
	    }

}


sub getClosestGene{
	my @args = @_;
	my $snp = shift @args; 
	my @genes = @args;
	my $closestG; my $distance = "d"; my $strand;
	for (@genes){
		my $name = $_ -> name();
		my $geneStart = $_ -> start();
		my $geneEnd = $_ -> stop();		
		my  $geneStrand = $_ -> strand();	
		if ($geneStrand ==1) {
			$geneStrand = "+";
		}elsif ($geneStrand == -1) {
			$geneStrand = "-";
		}
		my $entrezID = getEntrezIDfromRefSeqID($name);
		my $symbol = getSymbolfromEntrezID($entrezID);
		
		my $d1 = abs($geneStart-$snp);
		my $d2 = abs($geneEnd-$snp);
		my $d;
		if ($d1 < $d2) {$d = $d1;} else {$d = $d2;} 
		if ($distance eq "d") {
			$distance = $d; $closestG = $symbol;  $strand = $geneStrand;
		} else {
			if ($d < $distance) {
				$distance = $d; $closestG = $symbol; $strand = $geneStrand;
			} elsif ($d == $distance) {				
				if ($closestG ne $symbol) {
					$distance = $d; $closestG .=  "_"."$symbol"; $strand .= $geneStrand;
				}
			} 
		}
	}	
	
	return ($closestG, $distance, $strand);	
}
############################
sub makeUnique {
  my @array = @_;
  my %unique;
  my @UniqueArray = ();
  for (@array) {
    $unique{$_} = '';
  }
  for (sort (keys %unique)) {
    push @UniqueArray, $_;
  }
  return @UniqueArray;
}
Note: See TracWiki for help on using the wiki.