ExtractPatients
test file collection | Comments on geWorkbench | SAEC notes | SAEC protocol | SAEC executive summary | Other |
#!/usr/bin/perl ## Extract individual patients from the original GSK file # we are going to use the file mapping_info.txt to map Illumina IDs with patient IDs # input will be use Getopt::Long; use Data::Dumper; use strict; my $VERSION = '1_00'; # Check if programm called correctly my ($fp, $outdir, $fpout); my $result = GetOptions ("infile=s" => \$fp, "outfile=s"=> \$fpout, "outdir=s"=> \$outdir); if($fp eq "" || $fpout eq "" || $outdir eq "" ){ print "usage: perl extractPatients.pl -infile <csv file with SNP data> -outdir <output directory> -outfile <output file>\n"; exit(-1); } open(OUTPUT2, '>', $fpout) || die("Could not open file $fpout!"); print2("#perl extractPatients.pl -infile $fp -outdir $outdir -outfile $fpout\n"); print2("#Version: $VERSION\n"); #read in information about mapping between Illumina individuals and GSK individuals # file has been created from "PGX40001_GSK_SJS_B137_29Aug2007_DNAReport.xls" my $mapfp = 'mapping_info.txt'; open(DAT, $mapfp) || die("Could not open file $mapfp!"); my @map_data=<DAT>; my %map; foreach my $m (@map_data) { if ($m =~ !/^#/) { $m =~ /(.*)\t(.*)\n/; $map{lc $1} = $2 } } close(DAT); if (!defined %map) { print2("%map not defined"); die("%map not defined"); } #print Dumper(\%map); #$ head -30 PGx40001_12278-DNA.csv #[Header] #BSGT Version,3.1.10 #Processing Date,8/28/2007 11:53 AM #Content,,Human1Mv1_C.bpm #Num SNPs,1069083 #Total SNPs,1072820 #Num Samples,94 #Total Samples,292 #[Data] #SNP Name,Sample ID,Allele1 - Forward,Allele2 - Forward,Allele1 - AB,Allele2 - AB,X,Y,X Raw,Y Raw,GC Score #rs758676,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.420,0.611,3306,4684,0.8100 #rs3916934,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.341,0.511,2686,3914,0.9114 #rs2711935,WG0012278-DNA_A02_2949_A02,T,T,A,A,1.033,0.030,8298,255,0.9032 #rs17126880,WG0012278-DNA_A02_2949_A02,C,C,B,B,0.031,1.178,262,9201,0.9278 #rs12831433,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.947,1.015,7620,7947,0.8485 #rs12797197,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.132,0.290,1062,2265,0.8524 open(DAT, $fp) || die("Could not open file $fp!"); my $header=0; my $l = 0; my $oldId = "empty"; while (<DAT>) { #ignore lines until we hit "[Data]" #then read one more line if ($header<2) { if (/^\[Data\]/ || $header >0) { $header ++; } next; } $ l ++; if (!($l % 100000)) { print2 ("processing line $l\n"); } my @line = split(/,/); if (!defined $map{lc $line[1]}) { print2( "unknown ID: " . $map{lc $line[1]} . ":::" . $line[1] . "\n"); } if ($oldId ne $line[1]) { if (<OUTFILE>) { close (OUTFILE); } open(OUTFILE, ">", $outdir . "/" . $map{lc $line[1]} . ".txt"); print(OUTFILE "#perl extractPatients.pl -infile $fp -outdir $outdir -outfile $fpout\n"); print(OUTFILE "#Version: $VERSION\n"); } $oldId = $line[1]; print (OUTFILE $line[0] . "\t" . $line[2] . "\t" . $line[3] . "\n"); #print "$_"; } close(OUTFILE); close(OUTPUT2); #=============================== sub print2 { my ($str) = @_; print $str ; print(OUTPUT2 $str); }