00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00032 #include <Sequence/Alignment.hpp>
00033 #include <Sequence/SeqConstants.hpp>
00034 #include <Sequence/SeqProperties.hpp>
00035 #include <boost/static_assert.hpp>
00036 #include <boost/type_traits.hpp>
00037 #include <iterator>
00038 #include <algorithm>
00039 namespace Sequence
00040 {
00041 namespace Alignment
00042 {
00043 template < typename T >
00044 void GetData (std::vector < T >&seqarray,
00045 const char *infilename)
00046
00077 {
00078 if(!infilename || infilename == NULL)
00079 return;
00080
00081 std::ifstream infile(infilename);
00082 if (infile)
00083 {
00084 T temp;
00085 while (!(infile.eof ()))
00086 {
00087 infile >> temp;
00088 seqarray.push_back(temp);
00089 }
00090 }
00091 }
00092
00093 template < typename T >
00094 std::istream & GetData (std::vector <
00095 T >&seqarray,
00096 std::istream & input_stream)
00097
00106 {
00107 T temp;
00108 if (input_stream)
00109 {
00110 while (!(input_stream.eof ()))
00111 {
00112 input_stream >> temp;
00113 seqarray.push_back(temp);
00114 }
00115 }
00116 return input_stream;
00117 }
00118
00119 template < typename T >
00120 std::istream & ReadNObjects ( std::vector < T > &seqarray,
00121 unsigned n,
00122 std::istream & input_stream)
00135 {
00136 for(unsigned i = 0 ; i < n ; ++i)
00137 {
00138 if (!input_stream || input_stream.eof())
00139 return input_stream;
00140 T temp;
00141 input_stream >> temp;
00142 seqarray.push_back(temp);
00143 }
00144 return input_stream;
00145 }
00146
00147 template < typename T >
00148 void EmptyVector (std::vector< T * > &seqarray)
00154 {
00155 for(unsigned i = 0 ; i < seqarray.size() ; ++i)
00156 delete seqarray[i];
00157 seqarray.resize(0);
00158 }
00159
00160 template < typename T >
00161 bool Gapped (const std::vector < T >&data)
00167 {
00168 BOOST_STATIC_ASSERT((boost::is_base_and_derived<std::pair<std::string,std::string>,T>::value
00169 || boost::is_same<std::pair<std::string,std::string>,T>::value));
00170 for (int i = 0; unsigned (i) < data.size (); ++i)
00171
00172 {
00173 if( data[i].second.find('-') != std::string::npos )
00174 return true;
00175 }
00176 return false;
00177 }
00178
00179 template < typename T >
00180 bool IsAlignment (const std::vector < T >&data)
00186 {
00187 BOOST_STATIC_ASSERT((boost::is_base_and_derived<std::pair<std::string,std::string>,T>::value
00188 || boost::is_same<std::pair<std::string,std::string>,T>::value));
00189 for (int i = 0; unsigned (i) < data.size (); ++i)
00190 if (data[i].second.length () != data[0].second.length ())
00191 return 0;
00192
00193 return 1;
00194 }
00195
00196 template<typename Iterator>
00197 bool validForPolyAnalysis( Iterator beg,
00198 Iterator end )
00203 {
00204
00205
00206
00207 BOOST_STATIC_ASSERT( (boost::is_base_and_derived<
00208 std::pair<std::string,std::string>,
00209 typename std::iterator_traits<Iterator>::value_type
00210 >::value ||
00211 boost::is_same<
00212 std::pair<std::string,std::string>,
00213 typename std::iterator_traits<Iterator>::value_type
00214 >::value));
00215 while(beg < end)
00216 {
00217 if (std::find_if(beg->second.begin(),beg->second.end(),
00218 Sequence::invalidPolyChar())
00219 != beg->second.end())
00220 {
00221 return false;
00222 }
00223 ++beg;
00224 }
00225 return true;
00226 }
00227
00228 template < typename T >
00229 unsigned UnGappedLength (const std::vector <T>&data)
00236 {
00237 BOOST_STATIC_ASSERT((boost::is_base_and_derived<std::pair<std::string,std::string>,T>::value
00238 || boost::is_same<std::pair<std::string,std::string>,T>::value));
00239 bool site_gapped = 0;
00240 int len = 0;
00241 if (!IsAlignment(data))
00242 return Sequence::SEQMAXUNSIGNED;
00243
00244 for (size_t j = 0; j < data[0].second.length (); ++j)
00245
00246 {
00247 site_gapped = 0;
00248 for (size_t i = 0; i < data.size (); ++i)
00249 {
00250 if (data[i][j] == '-')
00251 {
00252 site_gapped = 1;
00253 i = data.size();
00254 }
00255 }
00256 if (!(site_gapped))
00257 ++len;
00258 }
00259 return len;
00260 }
00261
00262 template <typename T>
00263 void RemoveGaps (std::vector <T> &data)
00269 {
00270
00271 BOOST_STATIC_ASSERT((boost::is_base_and_derived<std::pair<std::string,std::string>,T>::value
00272 || boost::is_same<std::pair<std::string,std::string>,T>::value));
00273 size_t i, j,datasize=data.size();
00274 size_t length = data[0].second.length ();
00275 std::vector < std::string > ungapped_sequences(data.size());
00276 bool site_is_gapped;
00277
00278 for (i = 0; i < length; ++i)
00279 {
00280 for ( j = 0, site_is_gapped = 0;
00281 j < datasize; ++j)
00282 {
00283 if (data[j].second[i] == '-')
00284 {
00285 site_is_gapped = 1;
00286 j = datasize;
00287 }
00288 }
00289 if (!(site_is_gapped))
00290 {
00291 for ( j = 0 ; j != data.size(); ++j)
00292 ungapped_sequences[j] += data[j].second[i];
00293 }
00294 }
00295
00296
00297 for (j = 0; j != datasize ; ++j)
00298 {
00299 data[j] = T(data[j].first,ungapped_sequences[j]);
00300 }
00301 }
00302
00303
00304
00305 template < typename T >
00306 void RemoveTerminalGaps (std::vector <T>&data)
00312 {
00313 BOOST_STATIC_ASSERT((boost::is_base_and_derived<std::pair<std::string,std::string>,T>::value
00314 || boost::is_same<std::pair<std::string,std::string>,T>::value));
00315 size_t i,j,length = data[0].second.length ();
00316 std::vector < std::string > trimmed_sequences;
00317 size_t leftmost, rightmost, numUngapped;
00318 size_t offset;
00319 size_t size = data.size ();
00320
00321 leftmost = SEQMAXUNSIGNED;
00322 rightmost = length + 1;
00323
00324
00325 for (i = 0; i < length; ++i)
00326 {
00327 for (numUngapped = 0, j = 0; j != data.size (); ++j)
00328 {
00329 if (data[j].second[i] != '-')
00330 ++numUngapped;
00331 }
00332 if (numUngapped == size)
00333 {
00334 leftmost = i;
00335 i = length + 1;
00336 }
00337 }
00338
00339
00340 bool exit_condition = false;
00341 for (i = length - 1; i < data[0].second.length() && exit_condition == false; --i)
00342 {
00343 for (numUngapped = 0, j = 0; j != data.size (); ++j)
00344 {
00345 if (data[j].second[i] != '-')
00346 ++numUngapped;
00347 }
00348 if (numUngapped == size)
00349 {
00350 rightmost = i;
00351 exit_condition = true;
00352 }
00353 }
00354
00355
00356 offset = rightmost - leftmost;
00357 for (j = 0; j != data.size (); ++j)
00358 trimmed_sequences.push_back (data[j].second.substr (leftmost, rightmost-leftmost+1));
00359
00360
00361 for ( j = 0; j != data.size (); ++j)
00362 {
00363 data[j] =T(data[j].first,trimmed_sequences[j]);
00364 }
00365 }
00366
00367 template < typename T >
00368 void RemoveFixedOutgroupInsertions( std::vector<T> & data,
00369 unsigned site,
00370 const unsigned & ref )
00378 {
00379 BOOST_STATIC_ASSERT((boost::is_base_and_derived<std::pair<std::string,std::string>,T>::value
00380 || boost::is_same<std::pair<std::string,std::string>,T>::value));
00381 size_t nsam = data.size()-1;
00382 size_t s = data[0].second.length()-1,max=s+1;
00383 for(size_t i=0 ; i<max ; --s,++i)
00384 {
00385 unsigned ngap=0;
00386 for(size_t ind=0;ind<data.size();++ind)
00387 {
00388 if (ind != ref && data[ind].second[s] == '-')
00389 {
00390 ngap++;
00391 }
00392 }
00393 if(ngap==nsam)
00394 {
00395 for(unsigned ind=0;ind<data.size();++ind)
00396 {
00397 data[ind].second.erase(s,1);
00398 }
00399 }
00400 }
00401 }
00402
00403 template < typename T >
00404 std::vector < T >Trim (const std::vector < T >&data,
00405 const std::vector <int> &sites) throw (Sequence::SeqException)
00423 {
00424 BOOST_STATIC_ASSERT((boost::is_base_and_derived<std::pair<std::string,std::string>,T>::value
00425 || boost::is_same<std::pair<std::string,std::string>,T>::value));
00426 size_t i, j, numseqs = data.size (), numIntervals = sites.size ();
00427 unsigned start, stop;
00428 std::vector < T >trimmedData(numseqs);
00429 std::vector < std::string > trimmedTemp(numseqs);
00430 if (sites.empty ())
00431 {
00432 throw SeqException ("Sequence::Alignment::Trim(): empty vector of positions passed to function");
00433 }
00434 if (numIntervals % 2 != 0)
00435 {
00436 throw SeqException ("Sequence::Alignment::Trim(): odd number of positions passed");
00437 }
00438
00439 for (i = 0; i < numIntervals; i += 2)
00440 {
00441 start = sites[i];
00442 stop = sites[i + 1];
00443 for (j = 0; j < numseqs; ++j)
00444 {
00445 trimmedTemp[j] += data[j].second.substr (start, stop - start + 1);
00446 }
00447 }
00448
00449 for (i = 0; i < numseqs; ++i)
00450 {
00451 trimmedData[i]=T(data[i].first,trimmedTemp[i]);
00452 }
00453
00454 return trimmedData;
00455 }
00456
00457 template < typename T >
00458 std::vector < T >TrimComplement (const std::vector < T >&data,
00459 const std::vector < int > &sites)
00460 throw (Sequence::SeqException)
00477 {
00478 BOOST_STATIC_ASSERT((boost::is_base_and_derived<std::pair<std::string,std::string>,T>::value
00479 || boost::is_same<std::pair<std::string,std::string>,T>::value));
00480 std::vector < int >newSites;
00481 size_t i, j, start, stop, numseqs = data.size (), numIntervals = sites.size (), lastval;
00482
00483 if (sites.empty ())
00484 {
00485 throw SeqException ("Sequence::Alignment::TrimComplement(): empty vector of positions passed to function");
00486 }
00487 if (numIntervals % 2 != 0)
00488 {
00489 throw SeqException ("Sequence::Alignment::TrimComplement(): odd numer of positions passed to function");
00490 }
00491
00492 std::vector < T >trimmedData(numseqs);
00493 std::vector < std::string > trimmedTemp(numseqs);
00494
00495 size_t odd_even;
00496 if (sites[0] == 0)
00497 {
00498 for (i = 1; i < numIntervals; ++i)
00499 {
00500 odd_even = i+1;
00501 if (odd_even%2==0.)
00502 {
00503 newSites.push_back (sites[i] + 1);
00504 }
00505 else if (odd_even%2!=0.)
00506 {
00507 newSites.push_back (sites[i] - 1);
00508 }
00509 }
00510 }
00511 else if (sites[0] > 0)
00512 {
00513 newSites.push_back (0);
00514 for (i = 0; i < numIntervals; ++i)
00515 {
00516 odd_even = i+1;
00517 if (odd_even%2==0.)
00518 {
00519 newSites.push_back (sites[i] + 1);
00520 }
00521 else if (odd_even%2!=0.)
00522 {
00523 newSites.push_back (sites[i] - 1);
00524 }
00525 }
00526 }
00527
00528 lastval = newSites[newSites.size () - 1];
00529 newSites.pop_back ();
00530 numIntervals = newSites.size ();
00531 for (i = 0; i < numIntervals; i += 2)
00532 {
00533 start = newSites[i];
00534 stop = newSites[i + 1];
00535 for (j = 0; j < numseqs; ++j)
00536 {
00537 trimmedTemp[j] +=
00538 data[j].second.substr (start, stop - start + 1);
00539 }
00540 }
00541
00542 for (j = 0; j < numseqs; ++j)
00543 {
00544 trimmedTemp[j] += data[j].second.substr (lastval);
00545 }
00546
00547 for (i = 0; i < numseqs; ++i)
00548 {
00549 trimmedData[i] = T(data[i].first,trimmedTemp[i]);
00550 }
00551
00552 return trimmedData;
00553 }
00554
00555 }
00556 }
00557
00558