00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <cmath>
00023 #include <cfloat>
00024 #include <algorithm>
00025 #include <cctype>
00026 #include <Sequence/PolyTable.hpp>
00027 #include <Sequence/SimData.hpp>
00028 #include <Sequence/Recombination.hpp>
00029 #include <Sequence/stateCounter.hpp>
00030 #include <Sequence/SeqProperties.hpp>
00031 using std::pow;
00032 using std::vector;
00033
00034 namespace Sequence
00035 {
00036 namespace Recombination
00037 {
00038
00045 const double PRECISION = FLT_EPSILON;
00049 const double CMAX = 10000;
00050
00051 namespace
00052 {
00053
00054
00055
00056
00057 double CalcSamplingVariance (const Sequence::PolyTable * data,
00058 const bool haveOutgroup,
00059 const int outrgoup,
00060 int totsam, int ss);
00061 void GetPairDiffs (const Sequence::PolyTable * data,
00062 const bool haveOutgroup,
00063 const int outgroup,
00064 const int totsam, vector<int> &list,
00065 const int ss);
00066 void get_h_vals (const int ss, const int totsam,
00067 const Sequence::PolyTable * data, const bool haveOutgroup,
00068 const int outgroup, double *sumhi, double *sumhisq);
00069 double chatsub (const int totsam, const double sksq,
00070 const double sumhi, const double sumhisq);
00071 double solveit (const int nsam, const double stat);
00072 double g (const int nsam, const double c);
00073
00074 double
00075 CalcSamplingVariance (const Sequence::PolyTable *data,
00076 const bool haveOutgroup,
00077 const int outgroup, int totsam, int ss)
00082 {
00083 int i;
00084 double Svar = 0.0, meandiffs = 0.0;
00085 unsigned __meandiffs=0;
00086 double nsam = double (totsam);
00087 vector < int >pairdiffs;
00088 GetPairDiffs (data, haveOutgroup, outgroup, totsam, pairdiffs,
00089 ss);
00090
00091 for (i = 0; unsigned(i) < pairdiffs.size (); ++i)
00092 __meandiffs += pairdiffs[i];
00093 meandiffs = double(__meandiffs)/pow (nsam, 2.0);
00094 for (i = 0; unsigned(i) < pairdiffs.size (); ++i)
00095 Svar += pow ((double(pairdiffs[i]) - meandiffs), 2);
00096 Svar /= pow (nsam, 2.);
00097 return (Svar);
00098 }
00099
00100 void
00101 GetPairDiffs (const Sequence::PolyTable *data, const bool haveOutgroup,
00102 const int outgroup, const int totsam,
00103 vector < int > &list, const int ss)
00109 {
00110 unsigned i,j,k;
00111 for (i = 0; int(i) < totsam; ++i)
00112 {
00113 for (j = 0; int(j) < totsam ; ++j)
00114 {
00115 if ((!(haveOutgroup))
00116 || (haveOutgroup && i != unsigned(outgroup)
00117 && j != unsigned(outgroup)))
00118 {
00119 if(i!=j)
00120 {
00121 int ndiffs=0;
00122 for (k = 0; k < unsigned(ss); ++k)
00123 {
00124
00125 char ch1 = char(std::toupper((*data)[i][k])), ch2 = char(std::toupper((*data)[j][k]));
00126 if( ch2 !='N' && ch2 != 'N' )
00127 if ( ch1 != ch2 )
00128 ++ndiffs;
00129 }
00130 list.push_back (ndiffs);
00131 }
00132 else
00133 list.push_back(0);
00134 }
00135 }
00136 }
00137 }
00138
00139
00140 void
00141 get_h_vals (const int ss, const int totsam,
00142 const Sequence::PolyTable *data,
00143 const bool haveOutgroup, const int outgroup,
00144 double *sumhi, double *sumhisq)
00150 {
00151 *sumhi = 0.;
00152 *sumhisq = 0.;
00153
00154 if (ss == 0)
00155 {
00156
00157
00158
00159 }
00160 else
00161 {
00162 for (int i = 0; i < ss; ++i)
00163 {
00164 stateCounter Counts;
00165
00166 unsigned samplesize = totsam;
00167 for (unsigned j = 0; j < data->size (); ++j)
00168 {
00169 if ((!(haveOutgroup))
00170 || (haveOutgroup && j != unsigned(outgroup)))
00171 {
00172 samplesize -= (std::toupper((*data)[j][i]) == 'N') ? 1 : 0;
00173 Counts ((*data)[j][i]);
00174 }
00175 }
00176
00177
00178 double SSH = 0.;
00179 double denom = (double(samplesize)* (double(samplesize) - 1.0));
00180 SSH += (Counts.a > 0) ? double(Counts.a) * double (Counts.a-1) /denom : 0. ;
00181 SSH += (Counts.g > 0) ? double(Counts.g) * double (Counts.g-1) /denom : 0. ;
00182 SSH += (Counts.c > 0) ? double(Counts.c) * double (Counts.c-1) /denom : 0. ;
00183 SSH += (Counts.t > 0) ? double(Counts.t) * double (Counts.t-1) /denom : 0. ;
00184 SSH += (Counts.zero > 0) ? double(Counts.zero) * double (Counts.zero-1) /denom : 0. ;
00185 SSH += (Counts.one > 0) ? double(Counts.one) * double (Counts.one-1) /denom : 0. ;
00186 *sumhi += (1.0 - SSH);
00187 *sumhisq += pow(1.0-SSH,2.0);
00188 }
00189 }
00190 }
00191
00192
00193 double
00194 chatsub (const int totsam, const double sksq,
00195 const double sumhi, const double sumhisq)
00200 {
00201 double stat, thetahat, estimate;
00202 thetahat = sumhi;
00203 stat = (sksq - sumhi + sumhisq) / (thetahat * thetahat);
00204 estimate = solveit (totsam, stat);
00205 return (estimate);
00206 }
00207
00208
00209 double
00210 solveit (const int totsam, const double stat)
00216 {
00217 double xbig, xsmall, xtemp;
00218
00219 xbig = 10.;
00220 xsmall = PRECISION;
00221
00222 if(fabs(g(totsam,xsmall))-stat<=DBL_EPSILON)
00223 return (xsmall);
00224
00225 while (fabs(g (totsam, xbig))-stat >= DBL_EPSILON && xbig<CMAX)
00226 xbig += 10.;
00227 if ((xbig >= CMAX) && fabs(g (totsam, xbig)-stat) > DBL_EPSILON)
00228 return (CMAX);
00229 if (xbig > 10.)
00230 xsmall = xbig - 10.;
00231
00232 while ((xbig - xsmall) > PRECISION)
00233 {
00234 xtemp = (xbig + xsmall) / 2.;
00235 if (fabs(g (totsam, xtemp))-stat > DBL_EPSILON)
00236 xsmall = xtemp;
00237 else
00238 xbig = xtemp;
00239 }
00240 return ((xbig + xsmall) / 2.);
00241
00242 }
00243
00244
00245 double
00246 g (const int totsam, const double c)
00252 {
00253 double x, y, esk;
00254 double r97, i1, i2, sumpd, n, b1, b2, b3;
00255
00256 n = double (totsam);
00257
00258 r97 = sqrt (97.);
00259 x = 13. + r97;
00260 y = 13. - r97;
00261 i2 = (log (((2. * c + y) * x) / ((2. * c + x) * y))) / r97;
00262 i1 = (0.5) * log ((c * c + 13. * c + 18.) / 18.) - 13. * i2 / 2.;
00263 sumpd = (-c + (c - 1.) * i1 + 2. * (7. * c + 9.) * i2) / (c * c);
00264 b1 = 1. / 2. + 1. / c + ((5. - c) * i1 -
00265 18. * (c + 1.) * i2) / (c * c);
00266 b2 = -1. / 2. + 2. / c - 2. * ((c + 9.) * i1 +
00267 2. * (2. * c + 9.) * i2) / (c * c);
00268 b3 = -2. / c + 2. * ((c + 7.) * i1 + 6. * (c + 3.) * i2) / (c * c);
00269 esk = sumpd + b1 / n + b2 / (n * n) + b3 / (n * n * n);
00270 return (2. * esk);
00271 }
00272 }
00273 double
00274 HudsonsC (const Sequence::PolyTable *data, const bool & haveOutgroup,
00275 const unsigned & outgroup)
00283 {
00284 unsigned totsam = unsigned(data->size());
00285 if (haveOutgroup)
00286 --totsam;
00287 int ss = int((*data)[0].length ());
00288 double sumhi = 0.0, sumhisq = 0.0, sksq = 0.0;
00289 get_h_vals (ss, totsam, data, haveOutgroup, outgroup, &sumhi, &sumhisq);
00290 sksq = CalcSamplingVariance (data, haveOutgroup, outgroup, totsam, ss);
00291 return( chatsub (totsam, sksq, sumhi, sumhisq) );
00292 }
00293
00294 std::vector< std::vector<double> > Disequilibrium (const Sequence::PolyTable *data,
00295 const bool & haveOutgroup, const unsigned & outgroup,
00296 const unsigned & mincount,
00297 const double max_distance)
00298 {
00299 enum {SITEI,SITEJ,RSQ,D,DPRIME};
00300
00301 const char _alphabet[10] = {'A','G','C','T','0','1','a','g','c','t'};
00302 static const unsigned alphsize = 6;
00303 PolyTable::const_site_iterator beg = data->sbegin();
00304
00305 unsigned ss = data->numsites();
00306 const size_t datasize = data->size();
00307 const size_t nsam = datasize-haveOutgroup;
00308
00309
00310
00311
00312 std::pair<char,char> chars1,chars2;
00313
00314
00315
00316 std::pair<ptrdiff_t,ptrdiff_t> counts1,counts2;
00317
00318 std::vector< std::vector<double> > returnList;
00319
00320 unsigned states1=0,states2=0;
00321
00322 for(unsigned i = 0 ; i < ss-1 ; ++i)
00323 {
00324 chars1.first = 'Z';
00325 chars1.second = 'Z';
00326 counts1.first=0;
00327 counts1.second=0;
00328 states1=0;
00329 std::string site1((beg+i)->second);
00330 for(unsigned letter = 0 ; letter < alphsize ; ++letter)
00331 {
00332 std::transform(site1.begin(),site1.end(),site1.begin(),
00333 ::toupper);
00334 if(std::find(site1.begin(),
00335 site1.end(),
00336 _alphabet[letter]) != site1.end())
00337 {
00338 if (chars1.first == 'Z')
00339 {
00340 chars1.first = char(std::toupper(_alphabet[letter]));
00341 ++states1;
00342 }
00343 else
00344 {
00345 chars1.second = char(std::toupper(_alphabet[letter]));
00346 if(chars1.first!=chars1.second)
00347 {
00348 ++states1;
00349 }
00350 }
00351 }
00352 }
00353
00354 if (states1 == 2)
00355 {
00356 if(haveOutgroup)
00357 {
00358 site1.replace(outgroup,1,"");
00359 }
00360 for(unsigned j = i+1 ; j < ss ; ++j)
00361 {
00362 if ( std::abs( data->position(j) - data->position(i) ) <= max_distance )
00363 {
00364 std::string site2( (beg+j)->second );
00365 std::transform(site2.begin(),site2.end(),site2.begin(),
00366 ::toupper);
00367 chars2.first = 'Z';
00368 chars2.second = 'Z';
00369 counts2.first=0;
00370 counts2.second=0;
00371 states2=0;
00372 for(unsigned letter = 0 ; letter < alphsize ; ++letter)
00373 {
00374 if(std::find(site2.begin(),
00375 site2.end(),
00376 _alphabet[letter]) != site2.end())
00377 {
00378 if (chars2.first == 'Z')
00379 {
00380 chars2.first = char(std::toupper(_alphabet[letter]));
00381 ++states2;
00382 }
00383 else
00384 {
00385 chars2.second = char(std::toupper(_alphabet[letter]));
00386 if(chars2.first!=chars2.second)
00387 {
00388 ++states2;
00389 }
00390 }
00391 }
00392 }
00393
00394
00395 if (states2 == 2)
00396 {
00397 if(haveOutgroup)
00398 {
00399 site2.replace(outgroup,1,"");
00400 }
00401
00402
00403 if( std::find(site1.begin(),site1.end(),'N')==site1.end()
00404 && std::find(site2.begin(),site2.end(),'N')==site2.end() )
00405 {
00406
00407
00408
00409
00410
00411
00412
00413
00414 if ( haveOutgroup == false)
00415 {
00416
00417
00418 size_t x = std::count(site1.begin(),
00419 site1.end(),
00420 chars1.first);
00421
00422
00423 size_t y = std::count(site2.begin(),
00424 site2.end(),
00425 chars2.first);
00426
00427 if ( x > nsam-x )
00428 {
00429
00430
00431
00432 std::swap(chars1.first,chars1.second);
00433 counts1.first = nsam-x;
00434 counts1.second = x;
00435 }
00436 else
00437 {
00438 counts1.first = x;
00439 counts1.second = nsam-x;
00440 }
00441
00442 if ( y > nsam-y )
00443 {
00444
00445
00446
00447 std::swap(chars2.first,chars2.second);
00448 counts2.first = nsam-y;
00449 counts2.second = y;
00450 }
00451 else
00452 {
00453 counts2.first = y;
00454 counts2.second = nsam-y;
00455 }
00456 }
00457 else if (haveOutgroup == true)
00458 {
00459
00460
00461
00462 if (chars1.first == char(std::toupper((beg+i)->second[outgroup])))
00463 {
00464 std::swap(chars1.first,chars1.second);
00465 }
00466 counts1.first = std::count(site1.begin(),
00467 site1.end(),
00468 chars1.first);
00469 counts1.second = nsam-counts1.first;
00470
00471
00472
00473
00474 if (chars2.first == char(std::toupper((beg+j)->second[outgroup])))
00475 {
00476 std::swap(chars2.first,chars2.second);
00477 }
00478 counts2.first = std::count(site2.begin(),
00479 site2.end(),
00480 chars2.first);
00481 counts2.second = nsam-counts2.first;
00482 }
00483
00484
00485
00486
00487 if ( counts1.first >= ptrdiff_t(mincount) &&
00488 counts2.first >= ptrdiff_t(mincount) )
00489 {
00490
00491 std::vector<double> temp(5);
00492 temp[SITEI] = (beg+i)->first;
00493 temp[SITEJ] = (beg+j)->first;
00494
00495
00496 unsigned counts00=0,counts01=0,counts10=0,counts11=0;
00497 for(unsigned k = 0 ; k < nsam ; ++k)
00498 {
00499 if(site1[k]==chars1.first &&
00500 site2[k]==chars2.first)
00501 ++counts11;
00502
00503 if(site1[k]==chars1.first &&
00504 site2[k]==chars2.second)
00505 ++counts10;
00506
00507 if(site1[k]==chars1.second &&
00508 site2[k]==chars2.first)
00509 ++counts01;
00510
00511 if(site1[k]==chars1.second &&
00512 site2[k]==chars2.second)
00513 ++counts00;
00514 }
00515
00516 double p0 = double(counts1.second)/double(nsam);
00517 double p1 = double(counts1.first)/double(nsam);
00518 double q0 = double(counts2.second)/double(nsam);
00519 double q1 = double(counts2.first)/double(nsam);
00520
00521 temp[D] = double(counts11)/double(nsam) -p1*q1;
00522 temp[RSQ] = (temp[D]*temp[D])/(p0*p1*q0*q1);
00523 double dmin = std::max(-p0*q0,-p1*q1);
00524 double dmax = std::min(p1*q0,p0*q1);
00525 temp[DPRIME] = (temp[D] < 0 ? -(temp[D]/dmin) : temp[D]/dmax);
00526 returnList.push_back(temp);
00527 }
00528 }
00529 }
00530 }
00531 }
00532 }
00533 }
00534 return returnList;
00535 }
00536
00537 bool Disequilibrium (const Sequence::PolyTable *data,
00538 vector<double> & return_values,
00539 unsigned * i , unsigned * j,
00540 const bool & haveOutgroup,
00541 const unsigned & outgroup,
00542 const unsigned & mincount,
00543 const double max_distance )
00556 {
00557 unsigned ss = data->numsites();
00558 if ( ! (*i < ss - 1 ) ) return false;
00559 enum {SITEI,SITEJ,RSQ,D,DPRIME,SKIP};
00560
00561 const char _alphabet[10] = {'A','G','C','T','0','1','a','g','c','t'};
00562 static const unsigned alphsize = 6;
00563 PolyTable::const_site_iterator beg = data->sbegin();
00564 const double pos1=(beg + *i)->first,pos2=(beg + *j)->first;
00565 const size_t datasize = data->size();
00566 const size_t nsam = datasize-haveOutgroup;
00567
00568
00569
00570
00571 std::pair<char,char> chars1,chars2;
00572
00573
00574
00575 std::pair<ptrdiff_t,ptrdiff_t> counts1,counts2;
00576
00577 unsigned states1=0,states2=0;
00578
00579 chars1.first = 'Z';
00580 chars1.second = 'Z';
00581 counts1.first=0;
00582 counts1.second=0;
00583 states1=0;
00584 std::string site1((beg+*i)->second);
00585 for(unsigned letter = 0 ; letter < alphsize ; ++letter)
00586 {
00587 std::transform(site1.begin(),site1.end(),site1.begin(),
00588 ::toupper);
00589 if(std::find(site1.begin(),
00590 site1.end(),
00591 _alphabet[letter]) != site1.end())
00592 {
00593 if (chars1.first == 'Z')
00594 {
00595 chars1.first = char(std::toupper(_alphabet[letter]));
00596 ++states1;
00597 }
00598 else
00599 {
00600 chars1.second = char(std::toupper(_alphabet[letter]));
00601 if(chars1.first!=chars1.second)
00602 {
00603 ++states1;
00604 }
00605 }
00606 }
00607 }
00608
00609 if (states1 == 2)
00610 {
00611 if(haveOutgroup)
00612 {
00613 site1.replace(outgroup,1,"");
00614 }
00615 if ( std::abs( data->position(*j) - data->position(*i) ) <= max_distance )
00616 {
00617 std::string site2( (beg+*j)->second );
00618 std::transform(site2.begin(),site2.end(),site2.begin(),
00619 ::toupper);
00620 chars2.first = 'Z';
00621 chars2.second = 'Z';
00622 counts2.first=0;
00623 counts2.second=0;
00624 states2=0;
00625 for(unsigned letter = 0 ; letter < alphsize ; ++letter)
00626 {
00627 if(std::find(site2.begin(),
00628 site2.end(),
00629 _alphabet[letter]) != site2.end())
00630 {
00631 if (chars2.first == 'Z')
00632 {
00633 chars2.first = char(std::toupper(_alphabet[letter]));
00634 ++states2;
00635 }
00636 else
00637 {
00638 chars2.second = char(std::toupper(_alphabet[letter]));
00639 if(chars2.first!=chars2.second)
00640 {
00641 ++states2;
00642 }
00643 }
00644 }
00645 }
00646
00647
00648 if (states2 == 2)
00649 {
00650 if(haveOutgroup)
00651 {
00652 site2.replace(outgroup,1,"");
00653 }
00654
00655
00656 if( std::find(site1.begin(),site1.end(),'N')==site1.end()
00657 && std::find(site2.begin(),site2.end(),'N')==site2.end() )
00658 {
00659
00660
00661
00662
00663
00664
00665
00666
00667 if ( haveOutgroup == false)
00668 {
00669
00670
00671 size_t x = std::count(site1.begin(),
00672 site1.end(),
00673 chars1.first);
00674
00675
00676 size_t y = std::count(site2.begin(),
00677 site2.end(),
00678 chars2.first);
00679
00680 if ( x > nsam-x )
00681 {
00682
00683
00684
00685 std::swap(chars1.first,chars1.second);
00686 counts1.first = nsam-x;
00687 counts1.second = x;
00688 }
00689 else
00690 {
00691 counts1.first = x;
00692 counts1.second = nsam-x;
00693 }
00694
00695 if ( y > nsam-y )
00696 {
00697
00698
00699
00700 std::swap(chars2.first,chars2.second);
00701 counts2.first = nsam-y;
00702 counts2.second = y;
00703 }
00704 else
00705 {
00706 counts2.first = y;
00707 counts2.second = nsam-y;
00708 }
00709 }
00710 else if (haveOutgroup == true)
00711 {
00712
00713
00714
00715 if (chars1.first == char(std::toupper((beg+*i)->second[outgroup])))
00716 {
00717 std::swap(chars1.first,chars1.second);
00718 }
00719 counts1.first = std::count(site1.begin(),
00720 site1.end(),
00721 chars1.first);
00722 counts1.second = nsam-counts1.first;
00723
00724
00725
00726
00727 if (chars2.first == char(std::toupper((beg+*j)->second[outgroup])))
00728 {
00729 std::swap(chars2.first,chars2.second);
00730 }
00731 counts2.first = std::count(site2.begin(),
00732 site2.end(),
00733 chars2.first);
00734 counts2.second = nsam-counts2.first;
00735 }
00736
00737
00738
00739
00740 if ( counts1.first >= ptrdiff_t(mincount) &&
00741 counts2.first >= ptrdiff_t(mincount) )
00742 {
00743
00744 return_values[SITEI] = pos1;
00745 return_values[SITEJ] = pos2;
00746
00747 unsigned counts00=0,counts01=0,counts10=0,counts11=0;
00748 for(unsigned k = 0 ; k < nsam ; ++k)
00749 {
00750 if(site1[k]==chars1.first &&
00751 site2[k]==chars2.first)
00752 ++counts11;
00753
00754 if(site1[k]==chars1.first &&
00755 site2[k]==chars2.second)
00756 ++counts10;
00757
00758 if(site1[k]==chars1.second &&
00759 site2[k]==chars2.first)
00760 ++counts01;
00761
00762 if(site1[k]==chars1.second &&
00763 site2[k]==chars2.second)
00764 ++counts00;
00765 }
00766
00767 double p0 = double(counts1.second)/double(nsam);
00768 double p1 = double(counts1.first)/double(nsam);
00769 double q0 = double(counts2.second)/double(nsam);
00770 double q1 = double(counts2.first)/double(nsam);
00771
00772 return_values[D] = double(counts11)/double(nsam) - p1*q1;
00773 return_values[RSQ]=(return_values[D]*return_values[D])/(p0*p1*q0*q1);
00774 double dmin = std::max(-p0*q0,-p1*q1);
00775 double dmax = std::min(p1*q0,p0*q1);
00776 return_values[DPRIME] = (return_values[D] < 0 ?
00777 -(return_values[D]/dmin) :
00778 return_values[D]/dmax);
00779 return_values[SKIP]=0;
00780 }
00781 else
00782 {
00783 return_values[SKIP]=1;
00784 }
00785 }
00786 else
00787 {
00788 return_values[SKIP]=1;
00789 }
00790 }
00791 else
00792 {
00793 return_values[SKIP]=1;
00794 }
00795 }
00796 else
00797 {
00798 return_values[SKIP]=1;
00799 }
00800 }
00801 (*j)++;
00802 if(*j > ss-1)
00803 {
00804 (*i)++;
00805 *j = (*i+1);
00806 }
00807 return ( *i < ss-1 );
00808 }
00809
00810 }
00811 }
00812