#!/usr/bin/perl use strict; use Getopt::Std; our ($opt_i, $opt_r, $opt_c, $opt_o); # command line options getopts('i:r:c:o:'); $opt_r = 2 if ($opt_r =~ /^$/); $opt_c = 2 if ($opt_c =~ /^$/); # Check if input and output files were specified die qq(usage: $0 -i -o [-r ] [-c ] -i : a file containing gene X array, whitespace delimited, microarray data. -r : optional integer indicating the first row on which data occures (default=2) -c : optional integer indicating the first column on which data occures (default=2) -o : filename to which the output will be written.\n) unless ($opt_i && $opt_o); my $arraydata = new ZipfsNormalize($opt_i, $opt_r, $opt_c); $arraydata->readfile; $arraydata->normalize; # Read the comments for the _output_data subroutine # for an explaination of the options passed. $arraydata->output_data('NORMDATA', $opt_o, 'N'); # Command line command to log transform the data textfiles # Leaves columns and row headers intact only if they contain at least one letter # perl -lane '@F = map {($_ =~ /[a-zA-Z]/) ? $_ : sprintf "%4.2f", $_} map {($_ <= 0 || $_ =~ /[a-zA-Z]/) ? $_ : log $_ } @F; print join "\t", @F' # Object to impliment the normalization package ZipfsNormalize; use PDL; sub new { my $proto = shift; my $class = ref($proto) || $proto; my $opt_i = shift; my $opt_r = shift; my $opt_c = shift; my $self = { DATAFILE => $opt_i, # the name of the file with the un-normalized data DATAROW => $opt_r, # the first row in which data occures DATACOL => $opt_c, # the first column in which data occures DATA => undef, # the PDL object to hold the data NORMDATA => undef, # the normalized PDL data object ROWHEADS => undef, # the row headers COLHEADS => undef, # the column headers SIGFIG => undef, # the number of significant figures used in the data RANKS => undef # the ranks (for outputing norm data with ranks) }; return bless ($self, $class); } sub readfile { my $self = shift; ($self->{ROWHEADS}, $self->{COLHEADS}, $self->{SIGFIG}) = $self->_preserve_headers(); # Create a piddle object $self->{DATA} = pdl $self->_read_file(); } sub normalize { my $self = shift; my $data = $self->{DATA}; my $badvalues = $data->copy; # Mark the values that are less than or equal to zero as bad $data = $data->setbadif( $data <= 0 ); # Data matrix dimensions my ($arrays, $genes) = $data->dims; # The median of each gene expression, # this is the standard curve to which all arrays are normalized. # Bad values are ignored in computing the median my $median_expressions = $data->medover; # The ranks of the median expression values. # qsorti is called twice, once to get the ranks after sorting, # and once again to transform them back into pre-sorted ranks. my $ranks = $median_expressions->qsorti->qsorti; # Transform ranks to decending, starting with 1 (not zero) $ranks .= -($ranks - ($ranks->max + 1)); $self->{RANKS} = $ranks; # Compute the regression statistics my ($median_slope, $median_intercept) = _regression($ranks, $median_expressions); my ($array_slopes, $array_intercepts) = _regression($ranks, $data->xchg(1,0)); my $scaling_factor = _scaling_factor($data, $genes, $array_slopes, $array_intercepts); # Normalize # En = exp( ln(E) x ( ( m x ln(r) + b ) / ( M x ln(r) + B) ) my $median_line = $median_slope * log($ranks) + $median_intercept + $scaling_factor; my $array_line = $array_slopes->dummy * log($ranks->dummy(1,$arrays)) + $array_intercepts->dummy + $scaling_factor; $self->{NORMDATA} = exp( log($data) * xchg($median_line / $array_line, 1, 0)) * exp(-$scaling_factor); # Copy the original 'bad' values to the normalized dataset. $self->{NORMDATA} = $self->{NORMDATA}->setbadtoval(0) + $badvalues * $data->isbad; $self->{DATA} = $self->{DATA}->setbadtoval(0) + $badvalues * $data->isbad; } sub output { my $self = shift; my $outfile = shift; # Output the normalized data $self->_output_normalized_data($outfile); } # Saves the row and column headers so that they can be pasted back onto the normalized results sub _preserve_headers { my $self = shift; my (@column_headers, @row_headers, $significant_figures); open FILE, qq(< $self->{DATAFILE}); for (my $c=0; $c<$self->{DATACOL}; $c++) { my $line = ; push @column_headers, $line; } while () { my @F = split /\s+/, $_; push @row_headers, join qq(\t), @F[0..$self->{DATAROW}-1]; foreach my $value (@F[$self->{DATAROW}..$#F]) { die qq(Non-numeric value found in data field.\n) if ($value !~ m/^[\d\.-]+$/); $value =~ s/\d*\.(\d*)/$1/; $significant_figures = length($value) if ($significant_figures < length($value)); } } close FILE; return (\@row_headers, \@column_headers, $significant_figures); } # Read data from a text file and return an array reference. sub _read_file { my $self = shift; my @data; open FILE, qq(< $self->{DATAFILE}); for (my $c=0; $c<$self->{DATACOL}; $c++) { ; } while () { my @F = split /\s+/, $_; next if (join '', @F == 0); # skip if line is empty (ie at the end of the file) push @data, [ @F[$self->{DATAROW}..$#F] ]; } close FILE; return \@data; } # Performs a least squares regression on the natural log data sub _regression { my $x = shift; my $y = shift; # Values marked as 'bad' in the PDL are excluded from regression calculation my $x_average = average(log($x->setbadif($y->isbad))); my $y_average = average(log($y)); my $sum_of_products = sumover((log($x->setbadif($y->isbad)) - $x_average->dummy) * (log($y) - $y_average->dummy)); my $sum_of_squares = sumover((log($x->setbadif($y->isbad)) - $x_average->dummy)**2); my $regression_coefficients = $sum_of_products / $sum_of_squares; my $regression_constants = $y_average - $regression_coefficients * $x_average; return ($regression_coefficients, $regression_constants); } # Deturmine the scaling factor required to the regression line from crossing into negative sub _scaling_factor { my $data = shift; my $number_of_genes = shift; my $array_slopes = shift; my $array_intercepts = shift; my $scaling_factor; my $min_data = log(min($data)); my $min_regression_point = min($array_slopes->dummy * log($number_of_genes) + $array_intercepts->dummy); # Set the scaling factor to the smaller of min_data and min_regression_point if ($min_data < $min_regression_point) { $scaling_factor = $min_data } else { $scaling_factor = $min_regression_point; } # Scaling is only neccessary if scaling factor is less than zero (i.e. data point is less than one) if ($scaling_factor < 0) { $scaling_factor *= -exp(1); } else { $scaling_factor = 0; } $data /= exp(-$scaling_factor); return $scaling_factor; } # Output the normalized data to a file with ranks # # Pass DATA or NORMDATA in 1st position to output # pre-normalized or normalized data. # # Pass the output filename in the 2nd position. # # Pass 'Y' or 'N' in 3rd position to print ranks. # sub output_data { my $self = shift; my $dataset = shift; my $outfile = shift; my $withranks = shift; my $data = $self->{$dataset}; my $row_headers = $self->{ROWHEADS}; my $column_headers = $self->{COLHEADS}; my $significant_figures = $self->{SIGFIG}; my $ranks = $self->{RANKS}; my ($arrays, $genes) = $data->dims; my $format = qq(%0.$significant_figures) . q(f); open OUTFILE, qq(> $outfile); for (my $i=0; $i < @$column_headers; $i++) { print OUTFILE $column_headers->[$i]; } for (my $g=0; $g < $genes; $g++) { print OUTFILE qq($row_headers->[$g]\t) if ($row_headers->[$g]); print OUTFILE $ranks->at($g,0) . qq(\t) if ($withranks =~ 'Y'); for (my $a=0; $a < $arrays; $a++) { print OUTFILE sprintf($format, $data->at($a,$g)) . qq(\t); } print OUTFILE qq(\n); } close OUTFILE; }