Difference between revisions of "MakePED makePED.pl"

 
Line 1: Line 1:
{{BJNav}}
+
 
#!/usr/bin/perl
 
## Create PED files for PLINK
 
# C:\SAEC\scripts>perl makePED.pl -mapfile mapping_info.txt -snpfile snp_ids.txt -dir ../data -phenofile phenotypes.txt -outfile makePED.out
 
 
 
use Getopt::Long;
 
use Data::Dumper;
 
use strict;
 
 
my $VERSION = '1_00';
 
my $mappingFile = 'C:/SAEC/SJS Delivery from GSK/PGX40001_Illumina1M/Documents/Locus_Annotation_Files/illumina1M.map';
 
# Check if programm called correctly
 
 
my ($snpfp, $mapfp, $datadir, $phefp, $outfp);
 
my $result = GetOptions ("snpfile=s" => \$snpfp,
 
"mapfile=s" => \$mapfp,
 
"dir=s" => \$datadir,
 
"phenofile=s" => \$phefp,
 
"outfile=s" => \$outfp);
 
if($snpfp eq "" || $mapfp eq "" || $datadir eq "" || $phefp eq "" || $outfp eq "" ){
 
print <<EOF;
 
usage: perl makePED.pl
 
-snpfile <SNP ids (1st column)>
 
-mapfile <individual numbers (2nd column)>
 
-dir <directory with snp data files> 
 
-phenofile <phenotype file (1st col Individual ID, 2nd col sex, 3rd col phenotype)>
 
-outfile <output file>\n
 
EOF
 
exit(-1);
 
}
 
 
open(OUTPUT2, '>', $outfp) || die("Could not open file $outfp!");
 
print2("#perl makePED.pl -snpfile $snpfp -mapfile $mapfp -dir $datadir -phenofile $phefp -outfile $outfp");
 
print2("#Version: $VERSION\n");
 
 
 
 
#read in information about mapping between Illumina individuals and GSK individuals
 
# file has been created from "PGX40001_GSK_SJS_B137_29Aug2007_DNAReport.xls"
 
# we are only interested in the second column
 
print2("#Reading Mapfile $mapfp\n");
 
open(DAT, $mapfp) || die("Could not open file  $mapfp!");
 
my @map_data=<DAT>;
 
my %indi; #individual IDs
 
foreach my $m (@map_data) {
 
if ($m =~ !/^#/) {#ignore comment lines
 
$m =~ s/[\r\n]//g; #remove end of line characters
 
my @a = split(/\t/, $m);
 
$indi{lc $a[1]} = 1;
 
}
 
}
 
close(DAT);
 
if (!defined %indi) {
 
print2("#%indi not defined");
 
die("%indi not defined");
 
}
 
#print Dumper(\%indi);
 
 
#read in information about which SNPs should be there
 
print2("#Reading SNP file $mappingFile\n");
 
open(DAT, $mappingFile) || die("Could not open file  $mappingFile!");
 
my @snp_data=<DAT>;
 
my %snps; #SNP names
 
foreach my $m (@snp_data) {
 
if ($m =~ /^#/) { #ignore comment lines
 
next;
 
}
 
$m =~ s/[\r\n]//g; #remove end of line characters
 
my @a = split(/\t/, $m);
 
$snps{$a[1]} = " 0 0"; #missing value
 
}
 
close(DAT);
 
 
if (!defined %snps) {
 
print2("#%snps not defined");
 
die("%snps not defined");
 
}
 
 
 
#read in information phenotypes
 
print2("#Reading Phenotype file $phefp\n");
 
open(DAT, $phefp) || die("Could not open file  $phefp!");
 
my @phe_data=<DAT>;
 
my %phes; #phenotypic data plus SNP data
 
 
#print Dumper(\@phe_data);
 
#initialize, we are only interested in things that have been declared in $mapfp
 
foreach my $key (keys %indi) {
 
$phes{$key, "S"} = ""; # sex
 
$phes{$key, "P"} = ""; # phenotype
 
}
 
foreach my $m (@phe_data) {
 
#print $m ;
 
if ($m =~ /^#/) { #ignore comment lines
 
next;
 
}
 
$m =~ s/[\r\n]//g; #remove end of line characters
 
my @a = split(/\t/, $m);
 
#print "D1:" . Dumper(\@a);
 
if (exists $phes{$a[0], "S"}) {
 
$phes{$a[0], "S"} = $a[1];
 
$phes{$a[0], "P"} = $a[2];
 
}else {
 
print2("#Unknown phenotype " . $a[0] . "\n");
 
}
 
 
}
 
close(DAT);
 
#print Dumper(\%phes);
 
#exit(-1);
 
 
foreach my $key (keys %indi) {
 
if ($phes{$key, "S"} eq "") {
 
print2("#No Information for Individual $key\n");
 
next;
 
}
 
 
if (!open(DAT, $datadir . "/" . $key . ".txt")) {
 
print2("#No data file for $key\n");
 
next;
 
}
 
#initialize SNP hash
 
foreach my $m (keys %snps) {
 
$snps{$m} = " 0 0"; #missing value
 
}
 
print2( "#working on $key ");
 
# read the data
 
my $co=0;
 
while (<DAT>) {
 
#print "$_";
 
if (/^#/){next;} #ignore comment lines
 
s/[\r\n]//g; #remove end of line characters
 
my @line = split(/\t/);
 
if (!exists $snps{$line[0]}) {
 
print2("#Unknown SNP in Individual SNP file " . $line[0] . "\n");
 
next;
 
}
 
$co ++;
 
$line[1] =~ s/-/0/;
 
$line[2] =~ s/-/0/;
 
if ($line[1] !~ /[DIACGT0]/ || $line[2] !~ /[DIACGT0]/) {
 
print ("#$line[0] $line[1] $line[2]\n");
 
}
 
$snps{$line[0]} = " " . $line[1] . " " . $line[2];
 
#print Dumper(\@line) . "\n";
 
}
 
close(DAT);
 
print2("found $co SNPs\n");
 
 
#print out data
 
my $sex = '0';
 
my $phenotype = "-9";
 
if (uc $phes{$key, "S"} eq 'M') {
 
$sex = '1';
 
}
 
if (uc $phes{$key, "S"} eq 'F') {
 
$sex = '2';
 
}
 
if (uc $phes{$key, "P"} eq 'CO') {
 
$phenotype = '1';
 
}
 
if (uc $phes{$key, "P"} eq 'CA') {
 
$phenotype = '2';
 
}
 
my $co2=0;
 
foreach my $m (keys %snps) {
 
if ($snps{$m} eq " 0 0") {
 
$co2++;
 
}
 
}
 
print2("#found no info for $co2 SNPs\n");
 
print2("$key 1 0 0 $sex $phenotype ");
 
foreach my $m (sort keys %snps) {
 
#print this only to the outputfile
 
# print( OUTPUT2 "#$m");
 
print( OUTPUT2 $snps{$m});
 
}
 
print2 ("\n");
 
 
}
 
 
 
 
if (!defined %snps) {
 
print2("#%snps not defined");
 
die("%snps not defined");
 
}
 
 
#===============================
 
 
sub print2 {
 
 
my ($str) = @_;
 
print $str ;
 
print(OUTPUT2 $str);
 
}
 

Latest revision as of 14:34, 18 October 2007