# Program Name: piS # it so that it calculates pi (the mean pairwise difference) and # S, the number of segregating sites. # # The format for a *.seq file is like this: # sequence_name1 SEQUENCE1 # sequence_name2 SEQUENCE2 # for example, # Comments are delimited by sharp signs, as in the file you are now # reading. # Line 1: Contains number of samples, length of each sequence # Remaining lines have two fields: # Field 1: name of sequence # Field 2: the sequence # Example: # hiv1 GACTGACTGATCG # hiv2 GCATGCTAGCTAG # # Caveats: Sequences need - to be aligned # - to be the same length # - to be on one line (lines can run over the edge, like this one). # BEGIN { ngenes = 0; } NF==0 { # skip empty lines next } NR==1 { # read the number of genes and of sites G = $1; # number of genes nsites = $2; # number of sites } NR>1 { if(NF != 2){ printf("ERROR: line %d has %d fields (should have 2)\n", NR, NF); printf("bad line: \"%s\"\n", $0); exit; } # printf("%d: [%s] [%s]\n", NR, $1, $2); # put sequences into an array ++ngenes; sequence[ngenes]=$2; } END{ # Did we read the right number of sequences? if(ngenes != G){ printf("ERROR read %d sequences; expected %d\n", ngenes, G); exit; } # Check for uneven sequence length for (i=1; i<=G; i++){ if (length(sequence[i]) != nsites){ printf("ERROR - length(%d)=%d; nsites=%d\n", i, length(sequence[i]), nsites); exit; } } # Using the first sequence as a reference, see if any sites # in each other sequence are different. If a site is different, # mark it in the varsites array. In the varsites array, any # cell containing a value > 0 is variable. split(sequence[1], refseq, ""); for (i=2; i<=G; i++){ split(sequence[i], compseq, ""); for (j=1; j<=nsites; j++){ if (refseq[j] != compseq[j]) varsites[j] = 1; } } # Count segregating sites. for (i=1; i<=nsites; i++){ S += varsites[i]; } # Count differences between each pair of sequences printf("Doing %d pairwise comparisons...be patient\n", G*(G-1)/2); for(i=1; i<=G; i++){ printf("%2d:\r", i); split(sequence[i], seqi, ""); for(j=1; j<i; j++){ split(sequence[j], seqj, ""); for(k=1; k<=nsites; k++) if(seqi[k] != seqj[k]) pi++; } } # divide by the number of pairwise comparisons pi /= G*(G-1)/2; # divisor for estimating theta from S a1 = a2 = 0; for(i=1; i <= (G-1); i++) { a1 += 1/i; a2 += 1/(i*i); } thetaS = S/a1; printf("%d sequences, %d sites\n", G, nsites); printf(" pi: %f \n", pi); printf(" Segregating sites: %d/%d\n", S, nsites); printf("theta_hat[estimated from S]: %f\n", thetaS); # Calculate Tajima's D: b1 = (G+1.0) / (3.0*(G-1.0)); b2 = 2.0*(G^2 + G + 3)/(9.0*G*(G-1)); c1 = b1 - 1.0/a1; c2 = b2 - (G+2)/(a1*G) + a2/(a1*a1); e1 = c1/a1; e2 = c2/(a1*a1 + a2); D = (pi - thetaS)/sqrt(e1*S + e2*S*(S-1)); printf(" Tajima's D: %f\n", D); printf("a1=%f a2=%f b1=%f b2=%f\n", a1, a2, b1, b2); printf("c1=%f c2=%f e1=%f e2=%f\n", c1, c2, e1, e2); }