|
|
Line 1: |
Line 1: |
− | #!/usr/bin/perl
| + | |
− | ## Extract individual patients from the original GSK file
| |
− | # we are going to use the file mapping_info.txt to map Illumina IDs with patient IDs
| |
− | # input will be
| |
− |
| |
− | use Getopt::Long;
| |
− | use Data::Dumper;
| |
− | use strict;
| |
− |
| |
− | my $VERSION = '1_00';
| |
− |
| |
− | # Check if programm called correctly
| |
− |
| |
− | my ($fp, $outdir, $fpout);
| |
− | my $result = GetOptions ("infile=s" => \$fp,
| |
− | "outfile=s"=> \$fpout,
| |
− | "outdir=s"=> \$outdir);
| |
− | if($fp eq "" || $fpout eq "" || $outdir eq "" ){
| |
− | print "usage: perl extractPatients.pl -infile <csv file with SNP data> -outdir <output directory> -outfile <output file>\n";
| |
− | exit(-1);
| |
− | }
| |
− |
| |
− | open(OUTPUT2, '>', $fpout) || die("Could not open file $fpout!");
| |
− | print2("#perl extractPatients.pl -infile $fp -outdir $outdir -outfile $fpout\n");
| |
− | print2("#Version: $VERSION\n");
| |
− |
| |
− |
| |
− | #read in information about mapping between Illumina individuals and GSK individuals
| |
− | # file has been created from "PGX40001_GSK_SJS_B137_29Aug2007_DNAReport.xls"
| |
− |
| |
− | my $mapfp = 'mapping_info.txt';
| |
− | open(DAT, $mapfp) || die("Could not open file $mapfp!");
| |
− | my @map_data=<DAT>;
| |
− | my %map;
| |
− | foreach my $m (@map_data) {
| |
− | if ($m =~ !/^#/) {
| |
− | $m =~ /(.*)\t(.*)\n/;
| |
− | $map{lc $1} = $2
| |
− | }
| |
− | }
| |
− | close(DAT);
| |
− | if (!defined %map) {
| |
− | print2("%map not defined");
| |
− | die("%map not defined");
| |
− | }
| |
− | #print Dumper(\%map);
| |
− |
| |
− | #$ head -30 PGx40001_12278-DNA.csv
| |
− | #[Header]
| |
− | #BSGT Version,3.1.10
| |
− | #Processing Date,8/28/2007 11:53 AM
| |
− | #Content,,Human1Mv1_C.bpm
| |
− | #Num SNPs,1069083
| |
− | #Total SNPs,1072820
| |
− | #Num Samples,94
| |
− | #Total Samples,292
| |
− | #[Data]
| |
− | #SNP Name,Sample ID,Allele1 - Forward,Allele2 - Forward,Allele1 - AB,Allele2 - AB,X,Y,X Raw,Y Raw,GC Score
| |
− | #rs758676,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.420,0.611,3306,4684,0.8100
| |
− | #rs3916934,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.341,0.511,2686,3914,0.9114
| |
− | #rs2711935,WG0012278-DNA_A02_2949_A02,T,T,A,A,1.033,0.030,8298,255,0.9032
| |
− | #rs17126880,WG0012278-DNA_A02_2949_A02,C,C,B,B,0.031,1.178,262,9201,0.9278
| |
− | #rs12831433,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.947,1.015,7620,7947,0.8485
| |
− | #rs12797197,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.132,0.290,1062,2265,0.8524
| |
− |
| |
− |
| |
− | open(DAT, $fp) || die("Could not open file $fp!");
| |
− |
| |
− | my $header=0;
| |
− | my $l = 0;
| |
− | my $oldId = "empty";
| |
− |
| |
− | while (<DAT>) {
| |
− | #ignore lines until we hit "[Data]"
| |
− | #then read one more line
| |
− | if ($header<2) {
| |
− | if (/^\[Data\]/ || $header >0) {
| |
− | $header ++;
| |
− | }
| |
− | next;
| |
− | }
| |
− | $ l ++;
| |
− | if (!($l % 100000)) {
| |
− | print2 ("processing line $l\n");
| |
− | }
| |
− | my @line = split(/,/);
| |
− |
| |
− | if (!defined $map{lc $line[1]}) {
| |
− | print2( "unknown ID: " . $map{lc $line[1]} . ":::" . $line[1] . "\n");
| |
− | }
| |
− | if ($oldId ne $line[1]) {
| |
− | if (<OUTFILE>) {
| |
− | close (OUTFILE);
| |
− | }
| |
− | open(OUTFILE, ">", $outdir . "/" . $map{lc $line[1]} . ".txt");
| |
− | print(OUTFILE "#perl extractPatients.pl -infile $fp -outdir $outdir -outfile $fpout\n");
| |
− | print(OUTFILE "#Version: $VERSION\n");
| |
− |
| |
− | }
| |
− | $oldId = $line[1];
| |
− |
| |
− | print (OUTFILE $line[0] . "\t" . $line[2] . "\t" . $line[3] . "\n");
| |
− |
| |
− |
| |
− | #print "$_";
| |
− | }
| |
− | close(OUTFILE);
| |
− | close(OUTPUT2);
| |
− | #===============================
| |
− |
| |
− | sub print2 {
| |
− |
| |
− | my ($str) = @_;
| |
− | print $str ;
| |
− | print(OUTPUT2 $str);
| |
− | }
| |