MakePED
test file collection | Comments on geWorkbench | SAEC notes | SAEC protocol | SAEC executive summary | Other |
#!/usr/bin/perl ## Create PED files for PLINK # C:\SAEC\scripts>perl makePED.pl -mapfile mapping_info.txt -snpfile snp_ids.txt -dir ../data -phenofile phenotypes.txt -outfile makePED.out use Getopt::Long; use Data::Dumper; use strict; my $VERSION = '1_00'; my $mappingFile = 'C:/SAEC/SJS Delivery from GSK/PGX40001_Illumina1M/Documents/Locus_Annotation_Files/illumina1M.map'; # Check if programm called correctly my ($snpfp, $mapfp, $datadir, $phefp, $outfp); my $result = GetOptions ("snpfile=s" => \$snpfp, "mapfile=s" => \$mapfp, "dir=s" => \$datadir, "phenofile=s" => \$phefp, "outfile=s" => \$outfp); if($snpfp eq "" || $mapfp eq "" || $datadir eq "" || $phefp eq "" || $outfp eq "" ){ print <<EOF; usage: perl makePED.pl -snpfile <SNP ids (1st column)> -mapfile <individual numbers (2nd column)> -dir <directory with snp data files> -phenofile <phenotype file (1st col Individual ID, 2nd col sex, 3rd col phenotype)> -outfile <output file>\n EOF exit(-1); } open(OUTPUT2, '>', $outfp) || die("Could not open file $outfp!"); print2("#perl makePED.pl -snpfile $snpfp -mapfile $mapfp -dir $datadir -phenofile $phefp -outfile $outfp"); print2("#Version: $VERSION\n"); #read in information about mapping between Illumina individuals and GSK individuals # file has been created from "PGX40001_GSK_SJS_B137_29Aug2007_DNAReport.xls" # we are only interested in the second column print2("#Reading Mapfile $mapfp\n"); open(DAT, $mapfp) || die("Could not open file $mapfp!"); my @map_data=<DAT>; my %indi; #individual IDs foreach my $m (@map_data) { if ($m =~ !/^#/) {#ignore comment lines $m =~ s/[\r\n]//g; #remove end of line characters my @a = split(/\t/, $m); $indi{lc $a[1]} = 1; } } close(DAT); if (!defined %indi) { print2("#%indi not defined"); die("%indi not defined"); } #print Dumper(\%indi); #read in information about which SNPs should be there print2("#Reading SNP file $mappingFile\n"); open(DAT, $mappingFile) || die("Could not open file $mappingFile!"); my @snp_data=<DAT>; my %snps; #SNP names foreach my $m (@snp_data) { if ($m =~ /^#/) { #ignore comment lines next; } $m =~ s/[\r\n]//g; #remove end of line characters my @a = split(/\t/, $m); $snps{$a[1]} = " 0 0"; #missing value } close(DAT); if (!defined %snps) { print2("#%snps not defined"); die("%snps not defined"); } #read in information phenotypes print2("#Reading Phenotype file $phefp\n"); open(DAT, $phefp) || die("Could not open file $phefp!"); my @phe_data=<DAT>; my %phes; #phenotypic data plus SNP data #print Dumper(\@phe_data); #initialize, we are only interested in things that have been declared in $mapfp foreach my $key (keys %indi) { $phes{$key, "S"} = ""; # sex $phes{$key, "P"} = ""; # phenotype } foreach my $m (@phe_data) { #print $m ; if ($m =~ /^#/) { #ignore comment lines next; } $m =~ s/[\r\n]//g; #remove end of line characters my @a = split(/\t/, $m); #print "D1:" . Dumper(\@a); if (exists $phes{$a[0], "S"}) { $phes{$a[0], "S"} = $a[1]; $phes{$a[0], "P"} = $a[2]; }else { print2("#Unknown phenotype " . $a[0] . "\n"); } } close(DAT); #print Dumper(\%phes); #exit(-1); foreach my $key (keys %indi) { if ($phes{$key, "S"} eq "") { print2("#No Information for Individual $key\n"); next; } if (!open(DAT, $datadir . "/" . $key . ".txt")) { print2("#No data file for $key\n"); next; } #initialize SNP hash foreach my $m (keys %snps) { $snps{$m} = " 0 0"; #missing value } print2( "#working on $key "); # read the data my $co=0; while (<DAT>) { #print "$_"; if (/^#/){next;} #ignore comment lines s/[\r\n]//g; #remove end of line characters my @line = split(/\t/); if (!exists $snps{$line[0]}) { print2("#Unknown SNP in Individual SNP file " . $line[0] . "\n"); next; } $co ++; $line[1] =~ s/-/0/; $line[2] =~ s/-/0/; if ($line[1] !~ /[DIACGT0]/ || $line[2] !~ /[DIACGT0]/) { print ("#$line[0] $line[1] $line[2]\n"); } $snps{$line[0]} = " " . $line[1] . " " . $line[2]; #print Dumper(\@line) . "\n"; } close(DAT); print2("found $co SNPs\n"); #print out data my $sex = '0'; my $phenotype = "-9"; if (uc $phes{$key, "S"} eq 'M') { $sex = '1'; } if (uc $phes{$key, "S"} eq 'F') { $sex = '2'; } if (uc $phes{$key, "P"} eq 'CO') { $phenotype = '1'; } if (uc $phes{$key, "P"} eq 'CA') { $phenotype = '2'; } my $co2=0; foreach my $m (keys %snps) { if ($snps{$m} eq " 0 0") { $co2++; } } print2("#found no info for $co2 SNPs\n"); print2("$key 1 0 0 $sex $phenotype "); foreach my $m (sort keys %snps) { #print this only to the outputfile # print( OUTPUT2 "#$m"); print( OUTPUT2 $snps{$m}); } print2 ("\n"); } if (!defined %snps) { print2("#%snps not defined"); die("%snps not defined"); } #=============================== sub print2 { my ($str) = @_; print $str ; print(OUTPUT2 $str); }