MakePED makePED.pl

Revision as of 14:34, 18 October 2007 by Bjagla (talk | contribs)

test file collection | Comments on geWorkbench | SAEC notes | SAEC protocol | SAEC executive summary | Other


#!/usr/bin/perl
## Create PED files for PLINK 
# C:\SAEC\scripts>perl makePED.pl -mapfile mapping_info.txt -snpfile snp_ids.txt -dir ../data -phenofile phenotypes.txt -outfile makePED.out 
 
use Getopt::Long;
use Data::Dumper;
use strict;

my $VERSION = '1_00';
my $mappingFile = 'C:/SAEC/SJS Delivery from GSK/PGX40001_Illumina1M/Documents/Locus_Annotation_Files/illumina1M.map';
# Check if programm called correctly

my ($snpfp, $mapfp, $datadir, $phefp, $outfp);
my $result = GetOptions ("snpfile=s"	=> \$snpfp,
						 "mapfile=s"	=> \$mapfp,
						 "dir=s"		=> \$datadir,
						 "phenofile=s"	=> \$phefp,
						 "outfile=s"		=> \$outfp);
if($snpfp eq "" || $mapfp eq "" || $datadir eq "" || $phefp eq "" || $outfp eq "" ){
	print <<EOF;
usage: perl makePED.pl 
		-snpfile <SNP ids (1st column)> 
		-mapfile <individual numbers (2nd column)> 
		-dir <directory with snp data files>  
		-phenofile <phenotype file (1st col Individual ID, 2nd col sex, 3rd col phenotype)>
		-outfile <output file>\n
EOF
	exit(-1);
}

open(OUTPUT2, '>', $outfp) || die("Could not open file $outfp!");
print2("#perl makePED.pl -snpfile $snpfp -mapfile $mapfp -dir $datadir -phenofile $phefp -outfile $outfp");
print2("#Version: $VERSION\n");



#read in information about mapping between Illumina individuals and GSK individuals
# file has been created from "PGX40001_GSK_SJS_B137_29Aug2007_DNAReport.xls"
# we are only interested in the second column
print2("#Reading Mapfile $mapfp\n");
open(DAT, $mapfp) || die("Could not open file  $mapfp!");
my @map_data=<DAT>;
my %indi; #individual IDs
foreach my $m (@map_data) {
	if ($m =~ !/^#/) {#ignore comment lines
		$m =~ s/[\r\n]//g; #remove end of line characters
		my @a = split(/\t/, $m);
		$indi{lc $a[1]} = 1;
	}
}
close(DAT);
if (!defined %indi) {
	print2("#%indi not defined");
	die("%indi not defined");
}
#print Dumper(\%indi);

#read in information about which SNPs should be there
print2("#Reading SNP file $mappingFile\n");
open(DAT, $mappingFile) || die("Could not open file  $mappingFile!");
my @snp_data=<DAT>;
my %snps; #SNP names
foreach my $m (@snp_data) {
	if ($m =~ /^#/) { #ignore comment lines
		next;
	}
	$m =~ s/[\r\n]//g; #remove end of line characters
	my @a = split(/\t/, $m);
	$snps{$a[1]} = " 0 0"; #missing value
}
close(DAT);

if (!defined %snps) {
	print2("#%snps not defined");
	die("%snps not defined");
}


#read in information phenotypes
print2("#Reading Phenotype file $phefp\n");
open(DAT, $phefp) || die("Could not open file  $phefp!");
my @phe_data=<DAT>;
my %phes; #phenotypic data plus SNP data

#print Dumper(\@phe_data);
#initialize, we are only interested in things that have been declared in $mapfp
foreach my $key (keys %indi) {
	$phes{$key, "S"} = ""; # sex
	$phes{$key, "P"} = ""; # phenotype
}
foreach my $m (@phe_data) {
	#print $m ;
	if ($m =~ /^#/) { #ignore comment lines
		next;
	}
	$m =~ s/[\r\n]//g; #remove end of line characters
	my @a = split(/\t/, $m);
	#print "D1:" . Dumper(\@a);
	if (exists $phes{$a[0], "S"}) {
		$phes{$a[0], "S"} = $a[1];
		$phes{$a[0], "P"} = $a[2];
	}else {
		print2("#Unknown phenotype " . $a[0] . "\n");
	}

}
close(DAT);
#print Dumper(\%phes);
#exit(-1);

foreach my $key (keys %indi) {
	if ($phes{$key, "S"} eq "") {
		print2("#No Information for Individual $key\n");
		next;
	}
	
	if (!open(DAT, $datadir . "/" . $key . ".txt")) {
		print2("#No data file for $key\n");
		next;
	}
	#initialize SNP hash
	foreach my $m (keys %snps) {
		$snps{$m} = " 0 0"; #missing value
	}
	print2( "#working on $key ");
	# read the data
	my $co=0;
	while (<DAT>) {
		#print "$_";
		if (/^#/){next;} #ignore comment lines
		s/[\r\n]//g; #remove end of line characters
		my @line = split(/\t/);
		if (!exists $snps{$line[0]}) {
			print2("#Unknown SNP in Individual SNP file " . $line[0] . "\n");
			next;
		}
		$co ++;
		$line[1] =~ s/-/0/;
		$line[2] =~ s/-/0/;
		if ($line[1] !~ /[DIACGT0]/ || $line[2] !~ /[DIACGT0]/) {
			print ("#$line[0] $line[1] $line[2]\n");
		}
		$snps{$line[0]} = " " . $line[1] . " " . $line[2];
		#print Dumper(\@line) . "\n";
	}
	close(DAT);
	print2("found $co SNPs\n");

	#print out data
	my $sex = '0';
	my $phenotype = "-9";
	if (uc $phes{$key, "S"} eq 'M') {
		$sex = '1';
	}
	if (uc $phes{$key, "S"} eq 'F') {
		$sex = '2';
	}
	if (uc $phes{$key, "P"} eq 'CO') {
		$phenotype = '1';
	}
	if (uc $phes{$key, "P"} eq 'CA') {
		$phenotype = '2';
	}
	my $co2=0;
	foreach my $m (keys %snps) {
		if ($snps{$m} eq " 0 0") {
			$co2++;
		}
	}
	print2("#found no info for $co2 SNPs\n");
	print2("$key 1 0 0 $sex $phenotype ");
	foreach my $m (sort keys %snps) {
		#print this only to the outputfile
#		print( OUTPUT2 "#$m");
		print( OUTPUT2 $snps{$m});
	}
	print2 ("\n");

}



if (!defined %snps) {
	print2("#%snps not defined");
	die("%snps not defined");
}

#===============================

sub print2 {
	
	my ($str) = @_;
	print $str ;
	print(OUTPUT2 $str);
}