2 Replies - 1553 Views - Last Post: 25 August 2010 - 09:01 PM

#1 dstorey  Icon User is offline

  • New D.I.C Head

Reputation: 0
  • View blog
  • Posts: 11
  • Joined: 14-September 09

Memory Allocation issues in Perl

Posted 24 August 2010 - 10:19 PM

I'm currently running a script to fragment a genome into all possible peptide fragments. Everything appears to be going well but I'm pretty sure I have a memory leak as a 20mb text sequence causes the program to fail. I'm getting this error currently.

( I'm on a Mac Pro with 2GB of RAM OSX leopard)

perl(604) malloc: *** mmap(size=134217728) failed (error code=12)
*** error: can't allocate region
*** set a breakpoint in malloc_error_break to debug
Out of memory!

Here is the code. Any advice would be greatly appreciated - even if its just how to go about looking at the problem with tools !


#!/usr/bin/perl
if (scalar(@ARGV) != 2)
	{die "Not enough arguments";}
$file_in = $ARGV[0];
$min_aminoacid_length = $ARGV[1];
$file_out = $file_in."_SixFrames.fasta_".$min_aminoacid_length;
open ( FILE_IN , "<" , "$file_in")  or die "Cannot open $file_in";
unlink ("$fileout");
%sequence_hash;
print "Reading File in , This could take a while depending on how the file is formatted - go grab a cup of coffee !!\n";
while(my $line = <FILE_IN>)
	{
	chomp $line;
	if ($line =~/>/)
		{
		$s_count++;
		$scaffold = $line;
#		print $s_count."\n";
		}
	else 
		{
#		$seqcount++;
		$sequence_hash{$scaffold}.= $line;
		}
	}
print "Input is in memory\n";
close FILE_IN;
@keys = sort(keys(%sequence_hash));
$total_loops = scalar(@keys);
foreach $key(@keys)
	{
	$counter++;
	open(PRINTOUT, "+>",$counter) || die $!;
	print PRINTOUT $key ."\n";
	print PRINTOUT$sequence_hash{$key} . "\n";
	push(@file_list,$counter);
	close PRINTOUT;
	}
undef %sequence_hash;
foreach $file(@file_list)
	{
	$value = '';
	$rev_value = '';
	@value = ();
	@rev_value =();
	open(FILE,"<","$file");
		while(my $line = <FILE>)
			{
			chomp $line;
			if ($line =~ />/)
				{$name=$line;}
			else
				{$value=$line;}
			}
	close FILE;
	$rev_value = reverse($value);
	$rev_value =~ tr/[A,T,G,C]/[T,A,C,G]/;
	@value = split(//,$value);
	@rev_value = split(//,$rev_value);
	#first frame #
	$plus1 = $value;
	undef $value;
	&frame_plus($plus1,0,$name,"1");
	undef $plus1;
	$minus1 = $rev_value;
	undef $rev_value ;
	&frame_minus($minus1,0,$name,"4");
	undef $minus1;
	#shifts +/- 2 for next reading frames#
	shift(@value);
	shift(@rev_value);
	$plus2 = join('',@value);
	&frame_plus($plus2,1,$name,"2");
 	undef $plus2;
	$minus2 = join ('',@rev_value);
	&frame_minus($minus2,1,$name,"5");
	undef $minus2;
	### end shifts for reading frames##
	## shifts +/- 3 for next reading frames ##
	shift(@value);
	shift(@rev_value);
	$plus3 = join('',@value);
	&frame_plus($plus3,2,$name,"3");
	undef $plus3;
	$minus3 = join('',@rev_value);
	&frame_minus($minus3,2,$name,"6");	
	undef $minus3;
	## end shift for next reading frames ##
	$current_loop++;
	print "$current_loop of $total_loops finished ( ". $current_loop/$total_loops ." complete )\n";
	unlink($file);
	}

#Finds and reports on peptides greater than 34 AA long#	
sub frame_plus
{
open ( SIXFRAMES , ">>", "$file_out") or die $!;
#my $dna = $_[0];
#my $shift = $_[1];
#my $contig = $_[2];
#my $frame = $_[3];
my $first_nt=1;
my $last_nt=length($_[0]);
my $protein = "";
my $aminoacid_counter = 0;
for(my $i=0; $i < ($last_nt-2) ; $i += 3)
	{
	$aminoacid = codon2aa(substr($_[0],$i,3));
	if ($aminoacid ne "_")
		{
		$protein .=$aminoacid;
		$aminoacid_counter++;
		}
	else 
		{
		if ($aminoacid_counter >= $min_aminoacid_length) ## change this to make for longer or shorter AA ##
			{
			$starting_nt = ($first_nt + $_[1]);
			$ending_nt = (($first_nt + ($aminoacid_counter*3))-1);
			print SIXFRAMES $contig."_".$_[3]."_".$starting_nt."_".$ending_nt."_"."Forward\n";
			print SIXFRAMES $protein."\n";
			$protein = '';
			$first_nt = $i+3;
			$aminoacid_counter =0;
			}
		}
	}
if ($aminoacid_counter >= $min_aminoacid_length)
	{
	$starting_nt = ($first_nt + $_[1]);
	$ending_nt = (($first_nt + ($aminoacid_counter*3))-1);
	print SIXFRAMES "::LEFTOVER::\n";
	print SIXFRAMES $contig."_".$_[3]."_".$starting_nt."_".$ending_nt."_"."Forward\n";
	print SIXFRAMES $protein."\n";
	$protein='';
	}
close SIXFRAMES;
}


sub frame_minus
{
open ( SIXFRAMES , ">>", "$file_out") or die $!;
#my $dna = $_[0];
#my $shift = $_[1];
#my $contig = $_[2];
#my $frame = $_[3];
my $first_nt=1;
my $last_nt=length($_[0]);
my $length = length($_[0]);
my $protein = "";
my $aminoacid_counter = 0;
for(my $i=0; $i < ($last_nt-2) ; $i += 3)
	{
	$aminoacid = codon2aa(substr($_[0],$i,3));
	if ($aminoacid ne "_")
		{
		$protein .=$aminoacid;
		$aminoacid_counter++;
		}
	else 
		{
		if ($aminoacid_counter >= $min_aminoacid_length) ## change this to make for longer or shorter AA ##
			{
			$starting_nt = ($first_nt + $_[1]);
			$ending_nt = (($first_nt + ($aminoacid_counter*3))-1);
			## Index correction ##
			$fixed_starting_nt = ($length - $starting_nt);
			$fixed_ending_nt = ($length - $ending_nt);
			print SIXFRAMES $_[2]."_".$_[3]."_".$fixed_starting_nt."_".$fixed_ending_nt."_"."Reverse\n";
			print SIXFRAMES $protein."\n";
			$protein = '';
			$first_nt = $i+3;
			$aminoacid_counter =0;
			}
		}
	}
if ($aminoacid_counter >= $min_aminoacid_length)
	{
	$starting_nt = ($first_nt + $_[1]);
	$ending_nt = (($first_nt + ($aminoacid_counter*3))-1);
	## Index correction ##
	$fixed_starting_nt = ($length - $starting_nt);
	$fixed_ending_nt = ($length - $ending_nt);
	print SIXFRAMES "::LEFTOVER::\n";
	print SIXFRAMES $_[2]."_".$_[3]."_".$fixed_starting_nt."_".$fixed_ending_nt."_"."Reverse\n";
	print SIXFRAMES $protein."\n";
	$protein='';
	}
close SIXFRAMES;
}

## Returns the AA for a given Codon ##
sub codon2aa 
{
	my($codon) = @_;	
	$codon = uc $codon;
	$stop='_';
	if($codon =~ /[XN]/)
		{
		return $stop;
		}

my %genetic_code = ('TCA' => 'S','TCC' => 'S','TCG' => 'S','TCT' => 'S','TTC' => 'F','TTT' => 'F','TTA' => 'L','TTG' => 'L','TAC' => 'Y','TAT' => 'Y','TAA'=> '_','TAG' => '_','TGC' => 'C','TGT' => 'C','TGA' => '_','TGG' => 'W','CTA' => 'L','CTC' => 'L','CTG' => 'L','CTT' => 'L','CCA' => 'P','CCC' => 'P','CCG' => 'P','CCT' => 'P','CAC' => 'H','CAT' => 'H','CAA' => 'Q','CAG' => 'Q','CGA' => 'R','CGC' => 'R','CGG' => 'R','CGT' => 'R','ATA' => 'I','ATC' => 'I','ATT' => 'I','ATG' => 'M','ACA' => 'T','ACC' => 'T','ACG' => 'T','ACT' => 'T','AAC' => 'N','AAT' => 'N','AAA' => 'K','AAG' => 'K','AGC' => 'S','AGT' => 'S','AGA' => 'R','AGG' => 'R','GTA' => 'V','GTC' => 'V','GTG' => 'V','GTT' => 'V','GCA' => 'A',   'GCC' => 'A','GCG' => 'A','GCT' => 'A','GAC' => 'D','GAT' => 'D','GAA' => 'E','GAG' => 'E','GGA' => 'G','GGC' => 'G','GGG' => 'G','GGT' => 'G');

if(exists $genetic_code{$codon}) 
    	{
        return $genetic_code{$codon};
    	}
else
	{
        print STDOUT "Bad codon \"$codon\"!!q\n";
        exit;
    	}
}






Is This A Good Question/Topic? 0
  • +

Replies To: Memory Allocation issues in Perl

#2 dsherohman  Icon User is offline

  • Perl Parson
  • member icon

Reputation: 227
  • View blog
  • Posts: 654
  • Joined: 29-March 09

Re: Memory Allocation issues in Perl

Posted 25 August 2010 - 07:09 AM

Is there a specific point in the processing where it tends to die? If so, have you tried rearranging the order that the input sequences are processed (either don't sort @keys or reverse the sort) to see whether it still dies on the same data or after going through about the same amount of the total process?

Aside from that, a few thoughts which may or may not be relevant to your problem:

  • Use "my" more, particularly with variables used within loops. By restricting their scope to the loop, you can be sure that their value (and the memory required to store it) won't survive into the next iteration[1]. Adding "use strict" at the top will help you find non-"my" variables very quickly by refusing to compile if there are any undeclared variables. (While you're at it, consider adding "use warnings" also.)

  • Conversely, since %genetic_code will always be the same, move its definition outside sub codon2aa so that it won't disappear and have to be reallocated/reinitialized every time the sub is called.

  • In a similar vein, using lexical filehandles rather than globs may help avoid leaks in that area. In other words, use "open ( my $file_in_handle , "<" , $file_in)" instead of "open ( FILE_IN , "<" , "$file_in")". (I also took out the double quotes around "$file_in" because they weren't doing anything other than contributing visual noise.)

  • "undef %sequence_hash;" probably doesn't do what you intend it to. Use "%sequence_hash = ()" instead. "undef @foo"/"undef %bar" undefines the first element of the array/hash. It does not clear the entire structure. (And, again, there would be no need to explicitly clear %sequence_hash if you used "my" to limit its scope to a block, as it would vanish completely on exiting that block.)

  • Your @file_list array will end up as a list of numbers - the first value will always be 1, the second 2, etc. - so you can get rid of it entirely and replace "foreach $file(@file_list)" with "foreach $file(1 .. $total_loops)". (Or "for my $file (1 .. $total_loops)", which is how I'd write it - "for" and "foreach" are synonymous in Perl.)

  • Don't prefix user-defined function calls with "&", as in "&frame_plus($plus1,0,$name,"1");" Use just "frame_plus($plus1,0,$name,"1");". The "&" is a holdover from Perl 4 which has side effects which, again, are probably not what you intend.

  • You can save some memory in your prints by printing a list without concatenating, i.e., "print SIXFRAMES $contig, "_", $_[3], "_", $starting_nt, "_", $ending_nt, "_", "Forward\n";". Unless individual prints are extremely long, I doubt this will make a significant difference, though.



[1] ...unless you've stashed a reference to the value somewhere, but I didn't see any use of references in your code.
Was This Post Helpful? 1
  • +
  • -

#3 dstorey  Icon User is offline

  • New D.I.C Head

Reputation: 0
  • View blog
  • Posts: 11
  • Joined: 14-September 09

Re: Memory Allocation issues in Perl

Posted 25 August 2010 - 09:01 PM

Thanks for the advice ,
tried it all out , while it did decrease the amount of memory used I still was running over. - I've decided to take a route where I break everything up into the smallest possible pieces prior to translation.

Also I attempted to run this on a 32Gb RAM windows machine and got a out of memory issue at an earlier point than my 2GB mac pro - would anyone care to comment on why that is?
Was This Post Helpful? 0
  • +
  • -

Page 1 of 1