#!/usr/bin/perl
## Extract individual patients from the original GSK file
# we are going to use the file mapping_info.txt to map Illumina IDs with patient IDs
# input will be
use Getopt::Long;
use Data::Dumper;
use strict;
my $VERSION = '1_00';
# Check if programm called correctly
my ($fp, $outdir, $fpout);
my $result = GetOptions ("infile=s" => \$fp,
"outfile=s"=> \$fpout,
"outdir=s"=> \$outdir);
if($fp eq "" || $fpout eq "" || $outdir eq "" ){
print "usage: perl extractPatients.pl -infile <csv file with SNP data> -outdir <output directory> -outfile <output file>\n";
exit(-1);
}
open(OUTPUT2, '>', $fpout) || die("Could not open file $fpout!");
print2("#perl extractPatients.pl -infile $fp -outdir $outdir -outfile $fpout\n");
print2("#Version: $VERSION\n");
#read in information about mapping between Illumina individuals and GSK individuals
# file has been created from "PGX40001_GSK_SJS_B137_29Aug2007_DNAReport.xls"
my $mapfp = 'mapping_info.txt';
open(DAT, $mapfp) || die("Could not open file $mapfp!");
my @map_data=<DAT>;
my %map;
foreach my $m (@map_data) {
if ($m =~ !/^#/) {
$m =~ /(.*)\t(.*)\n/;
$map{lc $1} = $2
}
}
close(DAT);
if (!defined %map) {
print2("%map not defined");
die("%map not defined");
}
#print Dumper(\%map);
#$ head -30 PGx40001_12278-DNA.csv
#[Header]
#BSGT Version,3.1.10
#Processing Date,8/28/2007 11:53 AM
#Content,,Human1Mv1_C.bpm
#Num SNPs,1069083
#Total SNPs,1072820
#Num Samples,94
#Total Samples,292
#[Data]
#SNP Name,Sample ID,Allele1 - Forward,Allele2 - Forward,Allele1 - AB,Allele2 - AB,X,Y,X Raw,Y Raw,GC Score
#rs758676,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.420,0.611,3306,4684,0.8100
#rs3916934,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.341,0.511,2686,3914,0.9114
#rs2711935,WG0012278-DNA_A02_2949_A02,T,T,A,A,1.033,0.030,8298,255,0.9032
#rs17126880,WG0012278-DNA_A02_2949_A02,C,C,B,B,0.031,1.178,262,9201,0.9278
#rs12831433,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.947,1.015,7620,7947,0.8485
#rs12797197,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.132,0.290,1062,2265,0.8524
open(DAT, $fp) || die("Could not open file $fp!");
my $header=0;
my $l = 0;
my $oldId = "empty";
while (<DAT>) {
#ignore lines until we hit "[Data]"
#then read one more line
if ($header<2) {
if (/^\[Data\]/ || $header >0) {
$header ++;
}
next;
}
$ l ++;
if (!($l % 100000)) {
print2 ("processing line $l\n");
}
my @line = split(/,/);
if (!defined $map{lc $line[1]}) {
print2( "unknown ID: " . $map{lc $line[1]} . ":::" . $line[1] . "\n");
}
if ($oldId ne $line[1]) {
if (<OUTFILE>) {
close (OUTFILE);
}
open(OUTFILE, ">", $outdir . "/" . $map{lc $line[1]} . ".txt");
print(OUTFILE "#perl extractPatients.pl -infile $fp -outdir $outdir -outfile $fpout\n");
print(OUTFILE "#Version: $VERSION\n");
}
$oldId = $line[1];
print (OUTFILE $line[0] . "\t" . $line[2] . "\t" . $line[3] . "\n");
#print "$_";
}
close(OUTFILE);
close(OUTPUT2);
#===============================
sub print2 {
my ($str) = @_;
print $str ;
print(OUTPUT2 $str);
}