|
|
Line 1: |
Line 1: |
− | {{BJNav}}
| + | |
− | #!/usr/bin/perl
| |
− | ## Create PED files for PLINK
| |
− | # C:\SAEC\scripts>perl makePED.pl -mapfile mapping_info.txt -snpfile snp_ids.txt -dir ../data -phenofile phenotypes.txt -outfile makePED.out
| |
− |
| |
− | use Getopt::Long;
| |
− | use Data::Dumper;
| |
− | use strict;
| |
− |
| |
− | my $VERSION = '1_00';
| |
− | my $mappingFile = 'C:/SAEC/SJS Delivery from GSK/PGX40001_Illumina1M/Documents/Locus_Annotation_Files/illumina1M.map';
| |
− | # Check if programm called correctly
| |
− |
| |
− | my ($snpfp, $mapfp, $datadir, $phefp, $outfp);
| |
− | my $result = GetOptions ("snpfile=s" => \$snpfp,
| |
− | "mapfile=s" => \$mapfp,
| |
− | "dir=s" => \$datadir,
| |
− | "phenofile=s" => \$phefp,
| |
− | "outfile=s" => \$outfp);
| |
− | if($snpfp eq "" || $mapfp eq "" || $datadir eq "" || $phefp eq "" || $outfp eq "" ){
| |
− | print <<EOF;
| |
− | usage: perl makePED.pl
| |
− | -snpfile <SNP ids (1st column)>
| |
− | -mapfile <individual numbers (2nd column)>
| |
− | -dir <directory with snp data files>
| |
− | -phenofile <phenotype file (1st col Individual ID, 2nd col sex, 3rd col phenotype)>
| |
− | -outfile <output file>\n
| |
− | EOF
| |
− | exit(-1);
| |
− | }
| |
− |
| |
− | open(OUTPUT2, '>', $outfp) || die("Could not open file $outfp!");
| |
− | print2("#perl makePED.pl -snpfile $snpfp -mapfile $mapfp -dir $datadir -phenofile $phefp -outfile $outfp");
| |
− | print2("#Version: $VERSION\n");
| |
− |
| |
− |
| |
− |
| |
− | #read in information about mapping between Illumina individuals and GSK individuals
| |
− | # file has been created from "PGX40001_GSK_SJS_B137_29Aug2007_DNAReport.xls"
| |
− | # we are only interested in the second column
| |
− | print2("#Reading Mapfile $mapfp\n");
| |
− | open(DAT, $mapfp) || die("Could not open file $mapfp!");
| |
− | my @map_data=<DAT>;
| |
− | my %indi; #individual IDs
| |
− | foreach my $m (@map_data) {
| |
− | if ($m =~ !/^#/) {#ignore comment lines
| |
− | $m =~ s/[\r\n]//g; #remove end of line characters
| |
− | my @a = split(/\t/, $m);
| |
− | $indi{lc $a[1]} = 1;
| |
− | }
| |
− | }
| |
− | close(DAT);
| |
− | if (!defined %indi) {
| |
− | print2("#%indi not defined");
| |
− | die("%indi not defined");
| |
− | }
| |
− | #print Dumper(\%indi);
| |
− |
| |
− | #read in information about which SNPs should be there
| |
− | print2("#Reading SNP file $mappingFile\n");
| |
− | open(DAT, $mappingFile) || die("Could not open file $mappingFile!");
| |
− | my @snp_data=<DAT>;
| |
− | my %snps; #SNP names
| |
− | foreach my $m (@snp_data) {
| |
− | if ($m =~ /^#/) { #ignore comment lines
| |
− | next;
| |
− | }
| |
− | $m =~ s/[\r\n]//g; #remove end of line characters
| |
− | my @a = split(/\t/, $m);
| |
− | $snps{$a[1]} = " 0 0"; #missing value
| |
− | }
| |
− | close(DAT);
| |
− |
| |
− | if (!defined %snps) {
| |
− | print2("#%snps not defined");
| |
− | die("%snps not defined");
| |
− | }
| |
− |
| |
− |
| |
− | #read in information phenotypes
| |
− | print2("#Reading Phenotype file $phefp\n");
| |
− | open(DAT, $phefp) || die("Could not open file $phefp!");
| |
− | my @phe_data=<DAT>;
| |
− | my %phes; #phenotypic data plus SNP data
| |
− |
| |
− | #print Dumper(\@phe_data);
| |
− | #initialize, we are only interested in things that have been declared in $mapfp
| |
− | foreach my $key (keys %indi) {
| |
− | $phes{$key, "S"} = ""; # sex
| |
− | $phes{$key, "P"} = ""; # phenotype
| |
− | }
| |
− | foreach my $m (@phe_data) {
| |
− | #print $m ;
| |
− | if ($m =~ /^#/) { #ignore comment lines
| |
− | next;
| |
− | }
| |
− | $m =~ s/[\r\n]//g; #remove end of line characters
| |
− | my @a = split(/\t/, $m);
| |
− | #print "D1:" . Dumper(\@a);
| |
− | if (exists $phes{$a[0], "S"}) {
| |
− | $phes{$a[0], "S"} = $a[1];
| |
− | $phes{$a[0], "P"} = $a[2];
| |
− | }else {
| |
− | print2("#Unknown phenotype " . $a[0] . "\n");
| |
− | }
| |
− |
| |
− | }
| |
− | close(DAT);
| |
− | #print Dumper(\%phes);
| |
− | #exit(-1);
| |
− |
| |
− | foreach my $key (keys %indi) {
| |
− | if ($phes{$key, "S"} eq "") {
| |
− | print2("#No Information for Individual $key\n");
| |
− | next;
| |
− | }
| |
− |
| |
− | if (!open(DAT, $datadir . "/" . $key . ".txt")) {
| |
− | print2("#No data file for $key\n");
| |
− | next;
| |
− | }
| |
− | #initialize SNP hash
| |
− | foreach my $m (keys %snps) {
| |
− | $snps{$m} = " 0 0"; #missing value
| |
− | }
| |
− | print2( "#working on $key ");
| |
− | # read the data
| |
− | my $co=0;
| |
− | while (<DAT>) {
| |
− | #print "$_";
| |
− | if (/^#/){next;} #ignore comment lines
| |
− | s/[\r\n]//g; #remove end of line characters
| |
− | my @line = split(/\t/);
| |
− | if (!exists $snps{$line[0]}) {
| |
− | print2("#Unknown SNP in Individual SNP file " . $line[0] . "\n");
| |
− | next;
| |
− | }
| |
− | $co ++;
| |
− | $line[1] =~ s/-/0/;
| |
− | $line[2] =~ s/-/0/;
| |
− | if ($line[1] !~ /[DIACGT0]/ || $line[2] !~ /[DIACGT0]/) {
| |
− | print ("#$line[0] $line[1] $line[2]\n");
| |
− | }
| |
− | $snps{$line[0]} = " " . $line[1] . " " . $line[2];
| |
− | #print Dumper(\@line) . "\n";
| |
− | }
| |
− | close(DAT);
| |
− | print2("found $co SNPs\n");
| |
− |
| |
− | #print out data
| |
− | my $sex = '0';
| |
− | my $phenotype = "-9";
| |
− | if (uc $phes{$key, "S"} eq 'M') {
| |
− | $sex = '1';
| |
− | }
| |
− | if (uc $phes{$key, "S"} eq 'F') {
| |
− | $sex = '2';
| |
− | }
| |
− | if (uc $phes{$key, "P"} eq 'CO') {
| |
− | $phenotype = '1';
| |
− | }
| |
− | if (uc $phes{$key, "P"} eq 'CA') {
| |
− | $phenotype = '2';
| |
− | }
| |
− | my $co2=0;
| |
− | foreach my $m (keys %snps) {
| |
− | if ($snps{$m} eq " 0 0") {
| |
− | $co2++;
| |
− | }
| |
− | }
| |
− | print2("#found no info for $co2 SNPs\n");
| |
− | print2("$key 1 0 0 $sex $phenotype ");
| |
− | foreach my $m (sort keys %snps) {
| |
− | #print this only to the outputfile
| |
− | # print( OUTPUT2 "#$m");
| |
− | print( OUTPUT2 $snps{$m});
| |
− | }
| |
− | print2 ("\n");
| |
− |
| |
− | }
| |
− |
| |
− |
| |
− |
| |
− | if (!defined %snps) {
| |
− | print2("#%snps not defined");
| |
− | die("%snps not defined");
| |
− | }
| |
− |
| |
− | #===============================
| |
− |
| |
− | sub print2 {
| |
− |
| |
− | my ($str) = @_;
| |
− | print $str ;
| |
− | print(OUTPUT2 $str);
| |
− | }
| |