geWorkbench

@@ Line 1: / Line 1: @@
- #!/usr/bin/perl
- ## Extract individual patients from the original GSK file
- #  we are going to use the file mapping_info.txt to map Illumina IDs with patient IDs
- #  input will be
- use Getopt::Long;
- use Data::Dumper;
- use strict;
- my $VERSION = '1_00';
- # Check if programm called correctly
- my ($fp, $outdir, $fpout);
- my $result = GetOptions ("infile=s" => \$fp,
- 						 "outfile=s"=> \$fpout,
- 						 "outdir=s"=> \$outdir);
- if($fp eq "" || $fpout eq "" || $outdir eq "" ){
- 	print "usage: perl extractPatients.pl -infile <csv file with SNP data> -outdir <output directory> -outfile <output file>\n";
- 	exit(-1);
- }
- open(OUTPUT2, '>', $fpout) || die("Could not open file $fpout!");
- print2("#perl extractPatients.pl -infile $fp -outdir $outdir -outfile $fpout\n");
- print2("#Version: $VERSION\n");
- #read in information about mapping between Illumina individuals and GSK individuals
- # file has been created from "PGX40001_GSK_SJS_B137_29Aug2007_DNAReport.xls"
- my $mapfp = 'mapping_info.txt';
- open(DAT, $mapfp) || die("Could not open file  $mapfp!");
- my @map_data=<DAT>;
- my %map;
- foreach my $m (@map_data) {
- 	if ($m =~ !/^#/) {
- 		$m =~ /(.*)\t(.*)\n/;
- 		$map{lc $1} = $2
- 	}
- }
- close(DAT);
- if (!defined %map) {
- 	print2("%map not defined");
- 	die("%map not defined");
- }
- #print Dumper(\%map);
- #$ head -30 PGx40001_12278-DNA.csv
- #[Header]
- #BSGT Version,3.1.10
- #Processing Date,8/28/2007 11:53 AM
- #Content,,Human1Mv1_C.bpm
- #Num SNPs,1069083
- #Total SNPs,1072820
- #Num Samples,94
- #Total Samples,292
- #[Data]
- #SNP Name,Sample ID,Allele1 - Forward,Allele2 - Forward,Allele1 - AB,Allele2 - AB,X,Y,X Raw,Y Raw,GC Score
- #rs758676,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.420,0.611,3306,4684,0.8100
- #rs3916934,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.341,0.511,2686,3914,0.9114
- #rs2711935,WG0012278-DNA_A02_2949_A02,T,T,A,A,1.033,0.030,8298,255,0.9032
- #rs17126880,WG0012278-DNA_A02_2949_A02,C,C,B,B,0.031,1.178,262,9201,0.9278
- #rs12831433,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.947,1.015,7620,7947,0.8485
- #rs12797197,WG0012278-DNA_A02_2949_A02,T,C,A,B,0.132,0.290,1062,2265,0.8524
- open(DAT, $fp) || die("Could not open file $fp!");
- my $header=0;
- my $l = 0;
- my $oldId = "empty";
- while (<DAT>) {
- 	#ignore lines until we hit "[Data]"
- 	#then read one more line
- 	if ($header<2) {
- 		if (/^\[Data\]/ || $header >0) {
- 			$header ++;
- 		}
- 		next;
- 	}
- 	$ l ++;
- 	if (!($l % 100000)) {
- 		print2 ("processing line $l\n");
- 	}
- 	my @line = split(/,/);
- 	if (!defined $map{lc $line[1]}) {
- 		print2( "unknown ID: " . $map{lc $line[1]} . ":::" . $line[1] . "\n");
- 	}
- 	if ($oldId ne $line[1]) {
- 		if (<OUTFILE>) {
- 			close (OUTFILE);
- 		}
- 		open(OUTFILE, ">", $outdir . "/" . $map{lc $line[1]} . ".txt");
- 		print(OUTFILE "#perl extractPatients.pl -infile $fp -outdir $outdir -outfile $fpout\n");
- 		print(OUTFILE "#Version: $VERSION\n");
- 	}
- 	$oldId = $line[1];
- 	print (OUTFILE $line[0] . "\t" .  $line[2] . "\t" . $line[3] . "\n");
- 	#print "$_";
- }
- close(OUTFILE);
- close(OUTPUT2);
- #===============================
- sub print2 {
- 	my ($str) = @_;
- 	print $str ;
- 	print(OUTPUT2 $str);
- }

geWorkbench

Difference between revisions of "ExtractPatients extractPatients"

Latest revision as of 14:32, 18 October 2007

Search

Personal tools

Tools