#! /usr/bin/perl # # Created by Roger Cole 9/4/01. # # version 1.7, 1/24/01 # # Modified by AG Palmer 6/15/10 # Script explicitly searches for "I(0)" and "Rate" in curvfit output # in order to extract rate constants, rather than counting lines of output. # # This takes a Sparky relaxation output file and fits the rate constants. # using more sophisticated statistical techniques. # # The header of the Sparky file is parsed to get the necessary timepoints. # The rest of the file is then parsed and duplicate timepoints used for # error estimates. The errors from these points are then linearly interpolated # to the remaining points. # # The data is then fed into Curvefit and fit to an exponential using a # jackknife algorithm. XMGR output files are created and stored in an # appropriately named directory. These files are them parsed into # a single file containing a list of residues and rate constants. # # This script assumes that some data points are duplicated, but that # no more than 2 pts are collected at the same relaxation time # # This script ignores any assignments containing "?" anywhere in the label # $curvefit = "/usr/local/bin/curvefit"; $curvefitcall = "$curvefit -grid"; # A subroutine to sort assignment names in a quasi-rational fashion sub resort { if (($a =~ /\d+/) && ($b =~ /\d+/)) { ($c = $a) =~ s/[a-zA-Z]//g; ($d = $b) =~ s/[a-zA-Z]//g; return ($c <=> $d); } elsif ($a =~ /\d+/) {return -1;} elsif ($b =~ /\d+/) {return 1;} else {return ($a cmp $b);} } # First set up the master hash $ifile = @ARGV[0]; if (!($ifile)) { print "\nThis script takes a Sparky relaxation output file and fits\n"; print "the rate constants using Curvefit. XMGR graph files of the\n"; print "various decay curves are made and the final calculated\n"; print "rates are gathered into a single file. Please see the\n"; print "script header for more details. \n"; print "\nUsage: process {input file}\n\n"; exit; } system ("mkdir $ifile.dat"); open (INPUT, $ifile) or die "Could not open file $ifile!\n"; @lines = ; close INPUT; # First get the number of time points and corresponding times @par = split /\s+/, $lines[0]; $numpts = $#par - 3; $times=(); $c=1; $timemax=-1; foreach $element (@par) { if ($element =~ /\//) { @par2 = split /\//, $element; $times[$c]=$par2[1]; if ($par2[1] > $timemax) {$timemax = $par2[1];} $c++; } } $numtimes=$c-1; # Next parse the rest of the file $numres = 0; $data = (); $Rest = (); foreach $line (@lines) { @par = split /\s+/, $line; if (($par[1] =~ /-/) && ($par[1] !~ /\?/)) { @par = split /\s+/, $line; $seq = $par[1]; $seq =~ s/N-H//g; $seq =~ s/\-//g; $seq =~ s/\//\./g; for ($i=1; $i<=$numpts; $i++) { $data{$seq}[$i]=$par[$i+3]; $Rest{$seq}=$par[2]; } $numres++; } } printf ("Data file has $numres peaks and $numtimes timepoints\n"); # Next I need to calculate the errors where I have duplicate data # In order to do that I need to know which of my data are duplicate runs $dup=(); $d=0; for ($i=1;$i<=$numpts;$i++) { $test=$times[$i]; for ($j=$i;$j<=$numpts;$j++) { if (($i != $j) && ($times[$i] == $times[$j])) { $dup{$d}[0]=$i; $dup{$d}[1]=$j; $d++; } # GO SOONERS! } } $numdup=$d; if ($numdup < 2) { print "\n*** Warning: only 1 set of duplicate points. Errors from this\n"; print "*** point will be used for all other points.\n\n"; } if ($numdup == 0) { print "\n*** Error: No duplicate data points. Error analysis\n"; print "*** not possible.\n\n"; exit; } # Initialize error so I can easily detect the ones not calculated $error=(); for ($i=1;$i<=$numpts;$i++) {$error[$i] = -1;} print "\n"; foreach $d (keys %dup) { $i=$dup{$d}[0]; $j=$dup{$d}[1]; print "Calculating errors for duplicates $i and $j.\n"; $intdiff=(); $intsum=0; foreach $res (keys %data) { $int1=$data{$res}[$i]; $int2=$data{$res}[$j]; $intdiff{$res}=$int1-$int2; $intsum=$intsum+$intdiff{$res}; } $intave = $intsum/$numres; $intstd = 0; foreach $res (keys %data) { $d2 = $intdiff{$res} - $intave; $intstd = $intstd +($d2 * $d2); } $intstd = sqrt(($intstd / ($numres -1))); printf ("\tAverage: %.3f Unc: %.3f\n", $intave, $intstd); $error[$i]=$intstd; $error[$j]=$intstd; } print "\n"; %realerror=(); for ($i=1;$i<=$numpts;$i++) { if ($error[$i] != -1) { $realerror{$i}=$error[$i]; } } # # # Next I have to interpolate the errors to all the other data points # Basically this invovles 3 possible cases depending on the situation # if ($numdup > 1) { $error[1]=-1; for ($i=1;$i<=$numpts;$i++) { if ($error[$i] == -1) # If the error is unknown { $prev=-1; $next=-1; $next2=-1; $prev2=-1; for ($j=1;$j<=$numpts;$j++) { if (($j < $i) && ($error[$j] != -1)) { $prev=$j; } if (($j > $i) && ($error[$j] != -1)) { if ($next == -1) { $next=$j; } } } # If I have a data point on either side, do a standard # linear interpolation if (($prev != -1) && ($next != -1)) { $slope = ($error[$next] - $error[$prev])/($times[$next] - $times[$prev]); $dx = $times[$i] - $times[$prev]; $dy = $dx * $slope; $error[$i] = $dy + $error[$prev]; print "For pt $i, interpolating error between pts $prev and $next.\n"; } # If this is on the left edge, extrapolate backwards if (($next != -1) && ($prev == -1)) { $next2=-1; $j=$next+1; while (($next2 == -1) && ($j <= $numpts)) { if (($error[$j] != -1) && ($error[$j] != $error[$next])) { $next2 = $j; } $j++; } if ($next2 ==-1) { print "Error extrapolating point $i\n"; die; } $slope = ($error[$next2]-$error[$next])/($times[$next2] - $times[$next]); $dx = $times[$i] - $times[$next]; $error[$i] = $error[$next]+($dx*$slope); print "For pt $i, extrapolating error from pts $next and $next2\n"; } # If this is on the right edge, extrapolate forwards if (($next == -1) && ($prev != -1)) { $error[$i] = $error[$prev]; } } } } # If I only have 1 error point then I have to use that for all other # data points. Always better to have more than 1 duplicate, though. if ($numdup == 1) { $err=-1; $i=1; while ($err == -1) { if ($error[$i] != -1) { $err=$error[$i]; } $i++; } for ($i=1;$i<=$numpts;$i++) { $error[$i]=$err; } } open (OUTPUT, ">errors.xmgr"); $xmax=$timemax*1.1; print OUTPUT "\@ TITLE \"Plot of Error vs Timepoint\"\n"; print OUTPUT "\@ SUBTITLE \"Solid points calculated, empty points estimated\"\n"; print OUTPUT "\@ XAXIS LABEL \"time (sec)\"\n"; print OUTPUT "\@ YAXIS LABEL \"Error (arb. units)\"\n"; print OUTPUT "\@ VIEW XMAX $xmax\n"; print OUTPUT "\@ WORLD XMAX $xmax\n"; print OUTPUT "\@ s0 symbol 4\n"; print OUTPUT "\@ s0 symbol fill 1\n"; print OUTPUT "\@ s0 symbol size 1\n"; print OUTPUT "\@ s0 symbol color 1\n"; print OUTPUT "\@ s0 linestyle 0\n"; print OUTPUT "\@ s0 color 1\n"; print OUTPUT "\@ s1 symbol 4\n"; print OUTPUT "\@ s1 symbol fill 0\n"; print OUTPUT "\@ s1 symbol size 1\n"; print OUTPUT "\@ s1 symbol color 4\n"; print OUTPUT "\@ s1 linestyle 1\n"; print OUTPUT "\@ s1 color 4\n"; print OUTPUT "\@TARGET S0\n"; print OUTPUT "\@TYPE xy\n"; foreach $x (sort {$a <=> $b} keys %realerror) { print OUTPUT "\t$times[$x]\t$realerror{$x}\n"; } print OUTPUT "\&\n"; print OUTPUT "\@TARGET S1\n\@TYPE xy\n"; print "\nList of errors passed to Curvefit:\n"; for ($i=1;$i<=$numpts;$i++) { printf "%d:\t%.3f\t%.3e\n",$i,$times[$i],$error[$i]; printf OUTPUT "%.3f\t%.3e\n",$times[$i],$error[$i]; } print OUTPUT "\&\n"; close OUTPUT; # # New section to check and see if any peak has a negative height. # If so, throw out this peak and any later peaks. # foreach $res (keys %data) { $last{$res}=0; for ($i=1; $i<=$numpts; $i++) { if ($data{$res}[$i] < 0) { if ($last{$res} == 0) { $last{$res}=$times[$i]; } if ($times[$i] < $last{$res}) { $last{$res}=$times[$i]; } } } for ($i=1; $i<=$numpts; $i++) { if (($last{$res} > 0) && ($times[$i] < $last{$res})) {$data2{$res}{$i}=$data{$res}[$i];} elsif ($last{$res} == 0) {$data2{$res}{$i}=$data{$res}[$i];} } } # # Next I need to output a series of input files for Curvefit # Then run curvefit and assemble the output into a final # list of rates as a function of peptide sequence # %Rate=(); %Rateerr=(); %I=(); %Ierr=(); print "\n"; foreach $res (sort resort keys %data) { print "Fitting relaxation for residue $res\n"; $imin=$data2{$res}{1}*0.5; $imax=$data2{$res}{1}*1.5; if ($Rest{$res} == 0) {$Rest{$res} = 0.001;} $rmax=(1/$Rest{$res})*1.5; $fname = $res; open (OUTPUT, "> .\/$ifile.dat\/$fname.dat") or die "Error opening file!\n"; print OUTPUT "title \"$res\"\n\n"; print OUTPUT "function exponential\n"; printf OUTPUT " I(0) %.2e %.2e 200\n",$imin,$imax; print OUTPUT "# $times[1] $data2{$res}{1}\n"; printf OUTPUT " Rate 0.00 %.2f 500\n",$rmax; print OUTPUT "xmgr\n"; print OUTPUT "\t@ XAXIS LABEL \"time (sec)\"\n"; print OUTPUT "\t@ YAXIS LABEL \"intensity (arb. units)\"\n"; print OUTPUT "\t@ XAXIS TICKLABEL FORMAT DECIMAL\n"; print OUTPUT "\t@ YAXIS TICKLABEL FORMAT DECIMAL\n"; print OUTPUT "\t@ XAXIS TICKLABEL CHAR SIZE 0.8\n"; print OUTPUT "\t@ YAXIS TICKLABEL CHAR SIZE 0.8\n"; print OUTPUT "data\n"; foreach $i (sort resort keys %{$data2{$res}}) { printf OUTPUT "%3.4f\t%.5e\t%.5e\n",$times[$i],$data2{$res}{$i},$error[$i]; } close OUTPUT; $d2 = system("$curvefitcall -xmgr -f .\/$ifile.dat\/$fname.dat > .\/$ifile.dat\/$fname.xmgr"); $lines=(); open (INPUT, ".\/$ifile.dat\/$fname.xmgr") or die "Error opening file!\n"; @lines=; close INPUT; # Modified by AGP 6/15/10 for ($iline=0; $iline<=$#lines; $iline++) { @l1 = split /\s+/, $lines[$iline]; if ($l1[1] eq "I(0)") { $I{$res} = $l1[2]; $Ierr{$res} = $l1[5]; } elsif ($l1[1] eq "Rate") { $Rate{$res} = $l1[2]; $Rateerr{$res} = $l1[5]; } } # End of modification # print "I:$I{$res}\t$Ierr{$res}\nRate:$Rate{$res}\t$Rateerr{$res}\n"; } open (OUTPUT, "> $ifile.rates") or die "Error opening file!\n"; foreach $res (sort resort keys %Rate) { print OUTPUT "$res\t$Rate{$res}\t$Rateerr{$res}\n"; } close OUTPUT; open (OUTPUT, "> $ifile.rates.xmgr") or die "Error opening file!\n"; foreach $res (sort resort keys %Rate) { if ($res =~ /\d+/) { ($r2 = $res) =~ s/[a-zA-Z]//g; print OUTPUT "$r2\t$Rate{$res}\t$Rateerr{$res}\n"; } } close OUTPUT; system ("rm .\/$ifile.dat\/*.dat"); print "\nFinal rate data saved to file $ifile.rates\n\n"; print "Rate data saved in XMGR compatible format in $ifile.rates.xmgr\n\n";