#! /usr/bin/perl
#
# Created by Roger Cole 9/4/01.
#
# version 1.7, 1/24/01
#
# Modified by AG Palmer 6/15/10
# Script explicitly searches for "I(0)" and "Rate" in curvfit output
# in order to extract rate constants, rather than counting lines of output.
#
# This takes a Sparky relaxation output file and fits the rate constants.
# using more sophisticated statistical techniques.
#
# The header of the Sparky file is parsed to get the necessary timepoints.
# The rest of the file is then parsed and duplicate timepoints used for
# error estimates. The errors from these points are then linearly interpolated
# to the remaining points.
#
# The data is then fed into Curvefit and fit to an exponential using a
# jackknife algorithm. XMGR output files are created and stored in an
# appropriately named directory. These files are them parsed into
# a single file containing a list of residues and rate constants.
#
# This script assumes that some data points are duplicated, but that
# no more than 2 pts are collected at the same relaxation time
#
# This script ignores any assignments containing "?" anywhere in the label
#
$curvefit = "/usr/local/bin/curvefit";
$curvefitcall = "$curvefit -grid";
# A subroutine to sort assignment names in a quasi-rational fashion
sub resort
{
if (($a =~ /\d+/) && ($b =~ /\d+/))
{
($c = $a) =~ s/[a-zA-Z]//g;
($d = $b) =~ s/[a-zA-Z]//g;
return ($c <=> $d);
}
elsif ($a =~ /\d+/) {return -1;}
elsif ($b =~ /\d+/) {return 1;}
else {return ($a cmp $b);}
}
# First set up the master hash
$ifile = @ARGV[0];
if (!($ifile))
{
print "\nThis script takes a Sparky relaxation output file and fits\n";
print "the rate constants using Curvefit. XMGR graph files of the\n";
print "various decay curves are made and the final calculated\n";
print "rates are gathered into a single file. Please see the\n";
print "script header for more details. \n";
print "\nUsage: process {input file}\n\n";
exit;
}
system ("mkdir $ifile.dat");
open (INPUT, $ifile) or die "Could not open file $ifile!\n";
@lines = ;
close INPUT;
# First get the number of time points and corresponding times
@par = split /\s+/, $lines[0];
$numpts = $#par - 3;
$times=();
$c=1;
$timemax=-1;
foreach $element (@par)
{
if ($element =~ /\//)
{
@par2 = split /\//, $element;
$times[$c]=$par2[1];
if ($par2[1] > $timemax) {$timemax = $par2[1];}
$c++;
}
}
$numtimes=$c-1;
# Next parse the rest of the file
$numres = 0;
$data = ();
$Rest = ();
foreach $line (@lines)
{
@par = split /\s+/, $line;
if (($par[1] =~ /-/) && ($par[1] !~ /\?/))
{
@par = split /\s+/, $line;
$seq = $par[1];
$seq =~ s/N-H//g;
$seq =~ s/\-//g;
$seq =~ s/\//\./g;
for ($i=1; $i<=$numpts; $i++)
{
$data{$seq}[$i]=$par[$i+3];
$Rest{$seq}=$par[2];
}
$numres++;
}
}
printf ("Data file has $numres peaks and $numtimes timepoints\n");
# Next I need to calculate the errors where I have duplicate data
# In order to do that I need to know which of my data are duplicate runs
$dup=();
$d=0;
for ($i=1;$i<=$numpts;$i++)
{
$test=$times[$i];
for ($j=$i;$j<=$numpts;$j++)
{
if (($i != $j) && ($times[$i] == $times[$j]))
{
$dup{$d}[0]=$i;
$dup{$d}[1]=$j;
$d++;
}
# GO SOONERS!
}
}
$numdup=$d;
if ($numdup < 2)
{
print "\n*** Warning: only 1 set of duplicate points. Errors from this\n";
print "*** point will be used for all other points.\n\n";
}
if ($numdup == 0)
{
print "\n*** Error: No duplicate data points. Error analysis\n";
print "*** not possible.\n\n";
exit;
}
# Initialize error so I can easily detect the ones not calculated
$error=();
for ($i=1;$i<=$numpts;$i++) {$error[$i] = -1;}
print "\n";
foreach $d (keys %dup)
{
$i=$dup{$d}[0];
$j=$dup{$d}[1];
print "Calculating errors for duplicates $i and $j.\n";
$intdiff=();
$intsum=0;
foreach $res (keys %data)
{
$int1=$data{$res}[$i];
$int2=$data{$res}[$j];
$intdiff{$res}=$int1-$int2;
$intsum=$intsum+$intdiff{$res};
}
$intave = $intsum/$numres;
$intstd = 0;
foreach $res (keys %data)
{
$d2 = $intdiff{$res} - $intave;
$intstd = $intstd +($d2 * $d2);
}
$intstd = sqrt(($intstd / ($numres -1)));
printf ("\tAverage: %.3f Unc: %.3f\n", $intave, $intstd);
$error[$i]=$intstd;
$error[$j]=$intstd;
}
print "\n";
%realerror=();
for ($i=1;$i<=$numpts;$i++)
{
if ($error[$i] != -1)
{
$realerror{$i}=$error[$i];
}
}
#
#
# Next I have to interpolate the errors to all the other data points
# Basically this invovles 3 possible cases depending on the situation
#
if ($numdup > 1)
{
$error[1]=-1;
for ($i=1;$i<=$numpts;$i++)
{
if ($error[$i] == -1) # If the error is unknown
{
$prev=-1;
$next=-1;
$next2=-1;
$prev2=-1;
for ($j=1;$j<=$numpts;$j++)
{
if (($j < $i) && ($error[$j] != -1))
{
$prev=$j;
}
if (($j > $i) && ($error[$j] != -1))
{
if ($next == -1)
{
$next=$j;
}
}
}
# If I have a data point on either side, do a standard
# linear interpolation
if (($prev != -1) && ($next != -1))
{
$slope = ($error[$next] - $error[$prev])/($times[$next] - $times[$prev]);
$dx = $times[$i] - $times[$prev];
$dy = $dx * $slope;
$error[$i] = $dy + $error[$prev];
print "For pt $i, interpolating error between pts $prev and $next.\n";
}
# If this is on the left edge, extrapolate backwards
if (($next != -1) && ($prev == -1))
{
$next2=-1;
$j=$next+1;
while (($next2 == -1) && ($j <= $numpts))
{
if (($error[$j] != -1) && ($error[$j] != $error[$next]))
{
$next2 = $j;
}
$j++;
}
if ($next2 ==-1)
{
print "Error extrapolating point $i\n";
die;
}
$slope = ($error[$next2]-$error[$next])/($times[$next2] - $times[$next]);
$dx = $times[$i] - $times[$next];
$error[$i] = $error[$next]+($dx*$slope);
print "For pt $i, extrapolating error from pts $next and $next2\n";
}
# If this is on the right edge, extrapolate forwards
if (($next == -1) && ($prev != -1))
{
$error[$i] = $error[$prev];
}
}
}
}
# If I only have 1 error point then I have to use that for all other
# data points. Always better to have more than 1 duplicate, though.
if ($numdup == 1)
{
$err=-1;
$i=1;
while ($err == -1)
{
if ($error[$i] != -1)
{
$err=$error[$i];
}
$i++;
}
for ($i=1;$i<=$numpts;$i++)
{
$error[$i]=$err;
}
}
open (OUTPUT, ">errors.xmgr");
$xmax=$timemax*1.1;
print OUTPUT "\@ TITLE \"Plot of Error vs Timepoint\"\n";
print OUTPUT "\@ SUBTITLE \"Solid points calculated, empty points estimated\"\n";
print OUTPUT "\@ XAXIS LABEL \"time (sec)\"\n";
print OUTPUT "\@ YAXIS LABEL \"Error (arb. units)\"\n";
print OUTPUT "\@ VIEW XMAX $xmax\n";
print OUTPUT "\@ WORLD XMAX $xmax\n";
print OUTPUT "\@ s0 symbol 4\n";
print OUTPUT "\@ s0 symbol fill 1\n";
print OUTPUT "\@ s0 symbol size 1\n";
print OUTPUT "\@ s0 symbol color 1\n";
print OUTPUT "\@ s0 linestyle 0\n";
print OUTPUT "\@ s0 color 1\n";
print OUTPUT "\@ s1 symbol 4\n";
print OUTPUT "\@ s1 symbol fill 0\n";
print OUTPUT "\@ s1 symbol size 1\n";
print OUTPUT "\@ s1 symbol color 4\n";
print OUTPUT "\@ s1 linestyle 1\n";
print OUTPUT "\@ s1 color 4\n";
print OUTPUT "\@TARGET S0\n";
print OUTPUT "\@TYPE xy\n";
foreach $x (sort {$a <=> $b} keys %realerror)
{
print OUTPUT "\t$times[$x]\t$realerror{$x}\n";
}
print OUTPUT "\&\n";
print OUTPUT "\@TARGET S1\n\@TYPE xy\n";
print "\nList of errors passed to Curvefit:\n";
for ($i=1;$i<=$numpts;$i++)
{
printf "%d:\t%.3f\t%.3e\n",$i,$times[$i],$error[$i];
printf OUTPUT "%.3f\t%.3e\n",$times[$i],$error[$i];
}
print OUTPUT "\&\n";
close OUTPUT;
#
# New section to check and see if any peak has a negative height.
# If so, throw out this peak and any later peaks.
#
foreach $res (keys %data)
{
$last{$res}=0;
for ($i=1; $i<=$numpts; $i++)
{
if ($data{$res}[$i] < 0)
{
if ($last{$res} == 0)
{
$last{$res}=$times[$i];
}
if ($times[$i] < $last{$res})
{
$last{$res}=$times[$i];
}
}
}
for ($i=1; $i<=$numpts; $i++)
{
if (($last{$res} > 0) && ($times[$i] < $last{$res}))
{$data2{$res}{$i}=$data{$res}[$i];}
elsif ($last{$res} == 0)
{$data2{$res}{$i}=$data{$res}[$i];}
}
}
#
# Next I need to output a series of input files for Curvefit
# Then run curvefit and assemble the output into a final
# list of rates as a function of peptide sequence
#
%Rate=();
%Rateerr=();
%I=();
%Ierr=();
print "\n";
foreach $res (sort resort keys %data)
{
print "Fitting relaxation for residue $res\n";
$imin=$data2{$res}{1}*0.5;
$imax=$data2{$res}{1}*1.5;
if ($Rest{$res} == 0) {$Rest{$res} = 0.001;}
$rmax=(1/$Rest{$res})*1.5;
$fname = $res;
open (OUTPUT, "> .\/$ifile.dat\/$fname.dat") or die "Error opening file!\n";
print OUTPUT "title \"$res\"\n\n";
print OUTPUT "function exponential\n";
printf OUTPUT " I(0) %.2e %.2e 200\n",$imin,$imax;
print OUTPUT "# $times[1] $data2{$res}{1}\n";
printf OUTPUT " Rate 0.00 %.2f 500\n",$rmax;
print OUTPUT "xmgr\n";
print OUTPUT "\t@ XAXIS LABEL \"time (sec)\"\n";
print OUTPUT "\t@ YAXIS LABEL \"intensity (arb. units)\"\n";
print OUTPUT "\t@ XAXIS TICKLABEL FORMAT DECIMAL\n";
print OUTPUT "\t@ YAXIS TICKLABEL FORMAT DECIMAL\n";
print OUTPUT "\t@ XAXIS TICKLABEL CHAR SIZE 0.8\n";
print OUTPUT "\t@ YAXIS TICKLABEL CHAR SIZE 0.8\n";
print OUTPUT "data\n";
foreach $i (sort resort keys %{$data2{$res}})
{
printf OUTPUT "%3.4f\t%.5e\t%.5e\n",$times[$i],$data2{$res}{$i},$error[$i];
}
close OUTPUT;
$d2 = system("$curvefitcall -xmgr -f .\/$ifile.dat\/$fname.dat > .\/$ifile.dat\/$fname.xmgr");
$lines=();
open (INPUT, ".\/$ifile.dat\/$fname.xmgr") or die "Error opening file!\n";
@lines=;
close INPUT;
# Modified by AGP 6/15/10
for ($iline=0; $iline<=$#lines; $iline++)
{
@l1 = split /\s+/, $lines[$iline];
if ($l1[1] eq "I(0)") {
$I{$res} = $l1[2];
$Ierr{$res} = $l1[5];
}
elsif ($l1[1] eq "Rate") {
$Rate{$res} = $l1[2];
$Rateerr{$res} = $l1[5];
}
}
# End of modification
# print "I:$I{$res}\t$Ierr{$res}\nRate:$Rate{$res}\t$Rateerr{$res}\n";
}
open (OUTPUT, "> $ifile.rates") or die "Error opening file!\n";
foreach $res (sort resort keys %Rate)
{
print OUTPUT "$res\t$Rate{$res}\t$Rateerr{$res}\n";
}
close OUTPUT;
open (OUTPUT, "> $ifile.rates.xmgr") or die "Error opening file!\n";
foreach $res (sort resort keys %Rate)
{
if ($res =~ /\d+/)
{
($r2 = $res) =~ s/[a-zA-Z]//g;
print OUTPUT "$r2\t$Rate{$res}\t$Rateerr{$res}\n";
}
}
close OUTPUT;
system ("rm .\/$ifile.dat\/*.dat");
print "\nFinal rate data saved to file $ifile.rates\n\n";
print "Rate data saved in XMGR compatible format in $ifile.rates.xmgr\n\n";