#!/usr/bin/perl
## Create PED files for PLINK
# C:\SAEC\scripts>perl makePED.pl -mapfile mapping_info.txt -snpfile snp_ids.txt -dir ../data -phenofile phenotypes.txt -outfile makePED.out
use Getopt::Long;
use Data::Dumper;
use strict;
my $VERSION = '1_00';
my $mappingFile = 'C:/SAEC/SJS Delivery from GSK/PGX40001_Illumina1M/Documents/Locus_Annotation_Files/illumina1M.map';
# Check if programm called correctly
my ($snpfp, $mapfp, $datadir, $phefp, $outfp);
my $result = GetOptions ("snpfile=s" => \$snpfp,
"mapfile=s" => \$mapfp,
"dir=s" => \$datadir,
"phenofile=s" => \$phefp,
"outfile=s" => \$outfp);
if($snpfp eq "" || $mapfp eq "" || $datadir eq "" || $phefp eq "" || $outfp eq "" ){
print <<EOF;
usage: perl makePED.pl
-snpfile <SNP ids (1st column)>
-mapfile <individual numbers (2nd column)>
-dir <directory with snp data files>
-phenofile <phenotype file (1st col Individual ID, 2nd col sex, 3rd col phenotype)>
-outfile <output file>\n
EOF
exit(-1);
}
open(OUTPUT2, '>', $outfp) || die("Could not open file $outfp!");
print2("#perl makePED.pl -snpfile $snpfp -mapfile $mapfp -dir $datadir -phenofile $phefp -outfile $outfp");
print2("#Version: $VERSION\n");
#read in information about mapping between Illumina individuals and GSK individuals
# file has been created from "PGX40001_GSK_SJS_B137_29Aug2007_DNAReport.xls"
# we are only interested in the second column
print2("#Reading Mapfile $mapfp\n");
open(DAT, $mapfp) || die("Could not open file $mapfp!");
my @map_data=<DAT>;
my %indi; #individual IDs
foreach my $m (@map_data) {
if ($m =~ !/^#/) {#ignore comment lines
$m =~ s/[\r\n]//g; #remove end of line characters
my @a = split(/\t/, $m);
$indi{lc $a[1]} = 1;
}
}
close(DAT);
if (!defined %indi) {
print2("#%indi not defined");
die("%indi not defined");
}
#print Dumper(\%indi);
#read in information about which SNPs should be there
print2("#Reading SNP file $mappingFile\n");
open(DAT, $mappingFile) || die("Could not open file $mappingFile!");
my @snp_data=<DAT>;
my %snps; #SNP names
foreach my $m (@snp_data) {
if ($m =~ /^#/) { #ignore comment lines
next;
}
$m =~ s/[\r\n]//g; #remove end of line characters
my @a = split(/\t/, $m);
$snps{$a[1]} = " 0 0"; #missing value
}
close(DAT);
if (!defined %snps) {
print2("#%snps not defined");
die("%snps not defined");
}
#read in information phenotypes
print2("#Reading Phenotype file $phefp\n");
open(DAT, $phefp) || die("Could not open file $phefp!");
my @phe_data=<DAT>;
my %phes; #phenotypic data plus SNP data
#print Dumper(\@phe_data);
#initialize, we are only interested in things that have been declared in $mapfp
foreach my $key (keys %indi) {
$phes{$key, "S"} = ""; # sex
$phes{$key, "P"} = ""; # phenotype
}
foreach my $m (@phe_data) {
#print $m ;
if ($m =~ /^#/) { #ignore comment lines
next;
}
$m =~ s/[\r\n]//g; #remove end of line characters
my @a = split(/\t/, $m);
#print "D1:" . Dumper(\@a);
if (exists $phes{$a[0], "S"}) {
$phes{$a[0], "S"} = $a[1];
$phes{$a[0], "P"} = $a[2];
}else {
print2("#Unknown phenotype " . $a[0] . "\n");
}
}
close(DAT);
#print Dumper(\%phes);
#exit(-1);
foreach my $key (keys %indi) {
if ($phes{$key, "S"} eq "") {
print2("#No Information for Individual $key\n");
next;
}
if (!open(DAT, $datadir . "/" . $key . ".txt")) {
print2("#No data file for $key\n");
next;
}
#initialize SNP hash
foreach my $m (keys %snps) {
$snps{$m} = " 0 0"; #missing value
}
print2( "#working on $key ");
# read the data
my $co=0;
while (<DAT>) {
#print "$_";
if (/^#/){next;} #ignore comment lines
s/[\r\n]//g; #remove end of line characters
my @line = split(/\t/);
if (!exists $snps{$line[0]}) {
print2("#Unknown SNP in Individual SNP file " . $line[0] . "\n");
next;
}
$co ++;
$line[1] =~ s/-/0/;
$line[2] =~ s/-/0/;
if ($line[1] !~ /[DIACGT0]/ || $line[2] !~ /[DIACGT0]/) {
print ("#$line[0] $line[1] $line[2]\n");
}
$snps{$line[0]} = " " . $line[1] . " " . $line[2];
#print Dumper(\@line) . "\n";
}
close(DAT);
print2("found $co SNPs\n");
#print out data
my $sex = '0';
my $phenotype = "-9";
if (uc $phes{$key, "S"} eq 'M') {
$sex = '1';
}
if (uc $phes{$key, "S"} eq 'F') {
$sex = '2';
}
if (uc $phes{$key, "P"} eq 'CO') {
$phenotype = '1';
}
if (uc $phes{$key, "P"} eq 'CA') {
$phenotype = '2';
}
my $co2=0;
foreach my $m (keys %snps) {
if ($snps{$m} eq " 0 0") {
$co2++;
}
}
print2("#found no info for $co2 SNPs\n");
print2("$key 1 0 0 $sex $phenotype ");
foreach my $m (sort keys %snps) {
#print this only to the outputfile
# print( OUTPUT2 "#$m");
print( OUTPUT2 $snps{$m});
}
print2 ("\n");
}
if (!defined %snps) {
print2("#%snps not defined");
die("%snps not defined");
}
#===============================
sub print2 {
my ($str) = @_;
print $str ;
print(OUTPUT2 $str);
}