Polymorphism tables for sequence data. More...
#include <Sequence/PolySites.hpp>
Public Types | |
| typedef std::string & | reference |
| typedef const std::string & | const_reference |
|
typedef std::vector < std::string >::size_type | size_type |
| typedef std::vector < std::string >::iterator | data_iterator |
| typedef std::vector < std::string > ::const_iterator | const_data_iterator |
| typedef std::vector< double > ::iterator | pos_iterator |
| typedef std::vector< double > ::const_iterator | const_pos_iterator |
| typedef Sequence::polySiteVector::const_iterator | const_site_iterator |
Public Member Functions | |
| template<typename __DataType > | |
| PolySites (const std::vector< __DataType > &alignment, bool strictInfSites=0, bool ignoregaps=1, bool skipMissing=false, bool skipAdjSNP=false, unsigned freqfilter=0) | |
| PolySites (const std::vector< double > &List, const std::vector< std::string > &stringList) | |
| PolySites (PolyTable::const_site_iterator beg, PolyTable::const_site_iterator end) | |
| std::istream & | read (std::istream &s) throw (Sequence::badFormat,std::exception) |
| std::ostream & | print (std::ostream &stream) const |
| output a tab-delimited array of positions and character states | |
| data_iterator | begin () |
| const_data_iterator | begin () const |
| data_iterator | end () |
| const_data_iterator | end () const |
| pos_iterator | pbegin () |
| const_pos_iterator | pbegin () const |
| pos_iterator | pend () |
| const_pos_iterator | pend () const |
| const_site_iterator | sbegin () const |
| const_site_iterator | send () const |
| std::vector< double > | GetPositions (void) const |
| std::vector< std::string > | GetData (void) const |
| virtual void | ApplyFreqFilter (unsigned mincount, bool haveOutgroup=false, unsigned outgroup=0) |
| virtual void | RemoveMultiHits (bool skipOutgroup=false, unsigned outgroup=0) |
| virtual void | RemoveMissing (bool skipOutgroup=false, unsigned outgroup=0) |
| virtual void | RemoveAmbiguous (bool skipOutgroup=false, unsigned outgroup=0) |
| virtual void | Binary (bool haveOutgroup=false, unsigned outgroup=0, bool strictInfSites=true) |
| virtual bool | operator== (const PolyTable &rhs) const |
| virtual bool | operator!= (const PolyTable &rhs) const |
| operator Sequence::polySiteVector () const | |
| const_reference | operator[] (const size_type &i) const |
| reference | operator[] (const size_type &i) |
| bool | empty () const |
| bool | assign (PolyTable::const_site_iterator beg, PolyTable::const_site_iterator end) |
| template<typename numeric_type , typename string_type > | |
| bool | assign (const numeric_type *_positions, const size_t &_num_positions, const string_type *_data, const size_t &_num_individuals) |
| size_type | size (void) const |
| double | position (const std::vector< double >::size_type &i) const |
| unsigned | numsites (void) const |
Protected Attributes | |
| size_t | numseqs |
| size_t | seqlen |
Polymorphism tables for sequence data.
This is one of the more useful classes in namespace Sequence. Its purpose is to take a bunch of data (a vector<Fasta> in fact), and turn it into a list of variable positions. It doesn't matter whether or not you have an outgroup in your vector (except that if you want to use it for later analysis, it had better be present).
The default behavior of this class is just to play with the std::strings themselves. So what you end up with is a vector of variable sites, stored in PolyData::positions, and a vector of std::strings containing the variable sites, stored in PolyData::data. Note that if you include an outgroup in your vector<Fasta>, and it contains a different character than the ingroup at some site, then the site is considered variable.
You can also try and turn the data into a "binary" (i.e. 0 and 1) format, by a call to Sequence::PolySites::Binary.
EXAMPLE:
Here is a common use of a PolySites class. You have a file, "gene.fasta", containing some number of sequences that represent polymorphism data. The file is assumed to be aligned, but we'll check for that, just in case you forgot to run ClustalW or something.
#include <string> #include <iostream> #include <Sequence/Fasta.hpp> #include <Sequence/Alignment.hpp> #include <Sequence/SeqExceptions.hpp> int main(int argc, char *argv[]) { const char *infile = "gene.fasta"; vector<Fasta> data; try { Sequence::Alignment::GetData (data,infile); assert( Sequence::Alignment::IsAlignment (data) ); if ( Sequence::Alignment::Gapped (data) ) Sequence::Alignment::RemoveTerminalGaps (data); } catch (SeqException &e) { cerr << "uh-oh! processing file gene.fasta resulted in throwing an exception"<<endl; e.print(cerr); cerr << endl; exit(1); } PolySites *polytable = new PolySites(data); }
Removing the terminal gaps guarantees that polymorphic site positions are labelled starting from the first ungapped position. Of course, a lot of the extra syntax in the example can be eliminated by giving the following 2 using declarations:
using namespace Sequence;
using namespace Sequence::Alignment;
For a second example, assume the data are in the file "gene.aln", the results of a ClustalW alignment.
#include <iostream> #include <Sequence/Clustalw.hpp> #include <Sequence/Fasta.hpp> #include <Sequence/SeqExceptions.hpp> int main(int argc, char *argv[]) { istream in; in.open("gene.aln"); ClustalW<Fasta> aligned_data; try { in >> aligned_data; assert(aligned_data.IsAlignment()); if(aligned_data.Gapped()) aligned_data.RemoveTerminalGaps(); } catch (SeqException &e) { cerr << "uh-oh! processing file gene.aln resulted in throwing an exception"<<endl; e.print(cerr); cerr << endl; exit(1); } PolySites *polytable = new PolySites(aligned_data.GetData()); }
PolyTableIterators.cc, slidingWindow.cc, and slidingWindow2.cc.
Definition at line 33 of file PolySites.hpp.
typedef std::vector<std::string>::const_iterator Sequence::PolyTable::const_data_iterator [inherited] |
const iterator to the data
Definition at line 90 of file PolyTable.hpp.
typedef std::vector<double>::const_iterator Sequence::PolyTable::const_pos_iterator [inherited] |
const iterator to the positions
Definition at line 98 of file PolyTable.hpp.
typedef Sequence::polySiteVector::const_iterator Sequence::PolyTable::const_site_iterator [inherited] |
Const iterator to segregating sites. The value type of this iterator is const std::pair<double,std::string>, where the double is the position of the segregating site, and the string the list of states at the site. The first character in the string corresponds to the state of the first character in the PolyTable (i.e. (*this)[0]), etc.
Definition at line 107 of file PolyTable.hpp.
typedef std::vector<std::string>::iterator Sequence::PolyTable::data_iterator [inherited] |
non-const iterator to the data
Definition at line 86 of file PolyTable.hpp.
typedef std::vector<double>::iterator Sequence::PolyTable::pos_iterator [inherited] |
non-const iterator to the positions
Definition at line 94 of file PolyTable.hpp.
| Sequence::PolySites::PolySites | ( | const std::vector< __DataType > & | alignment, | |
| bool | strictInfSites = 0, |
|||
| bool | ignoregaps = 1, |
|||
| bool | skipMissing = false, |
|||
| bool | skipAdjSNP = false, |
|||
| unsigned | freqfilter = 0 | |||
| ) | [inline] |
This is the constructor if you are using "string-like" data, such as std::string, or Sequence::Fasta. Note that the vector name is alignment, and that means that every sequence had better be the same length!
By default, there is no limit to how many characters can "segregate" at a variable position, although if there are more than 4, most biologists will start to worry. There are, however, times when you may wish to only consider sites that have a total of 2 character states. (NOTE: by two states, I mean including BOTH the ingroup and the outgroup sequence.) Setting strictInfSites to 1 will result in making a polymorphic sites object containing only sites with 2 states.
| alignment | vector of data | |
| strictInfSites | if true, throw out all sites with > 2 states | |
| ignoregaps | if true, do not count gapped sites as polymorphisms | |
| skipMissing | if true, ignore ALL sites with missing data ('N') | |
| skipAdjSNP | does nothing; a placeholder for a future feature | |
| freqfilter | Defaults to 0. For a polymorphic site to be included in the final table, the minor allele count in the data (i.e. the number of times the minor allele occurs at that site) must be strictly greater than freqfilter |
Definition at line 48 of file PolySites.tcc.
| Sequence::PolySites::PolySites | ( | const std::vector< double > & | List, | |
| const std::vector< std::string > & | stringList | |||
| ) |
Use this constructor if you already have a list of positions and characters
| List | a list of doubles representing positions of polymorphic positions | |
| stringList | a vector of strings representing the polymorphic characters |
Definition at line 123 of file PolySites.cc.
| void Sequence::PolyTable::ApplyFreqFilter | ( | unsigned | mincount, | |
| bool | haveOutgroup = false, |
|||
| unsigned | outgroup = 0 | |||
| ) | [virtual, inherited] |
go through the data and remove all positions where there is a variant at count (# of occurrences in the sample) < mincount
| mincount | minimum count of a variant in the data. Variants that occur < mincount times are thrown out. | |
| haveOutgroup | true if an outgroup is present in the data, false otherwise | |
| outgroup | the index in the data array containing the outgroup (if present) |
Definition at line 256 of file PolyTable.cc.
| bool Sequence::PolyTable::assign | ( | const numeric_type * | _positions, | |
| const size_t & | _num_positions, | |||
| const string_type * | _data, | |||
| const size_t & | _num_individuals | |||
| ) | [inline, inherited] |
Assign SNP data to the polymorphism table from a vector/array.
| _positions | an array representing the positions of the SNPs | |
| _num_positions | the number of elements in _positions | |
| _data | an array containing the characters for each SNP in each individual | |
| _num_individuals | the number of elements in _data |
Sequence::PolySites snpTable; std::vector<double> positions; std::vector<std::string> data; //fill positions and data... if ( snpTable.assign(&positions[0],positions.size(),&data[0],data.size()) == true ) { //ok } else { //assignment failed for some reason... }
Definition at line 34 of file PolyTable.tcc.
| bool Sequence::PolyTable::assign | ( | PolyTable::const_site_iterator | beg, | |
| PolyTable::const_site_iterator | end | |||
| ) | [inherited] |
Assignment operation, allowing a range of polymorphic sites to be assigned to a polymorphism table. This exists mainly for two purposes. One is the ability to assign tables from "slices" of other tables. Second is to facilitate the writing of "sliding window" routines.
Definition at line 71 of file PolyTable.cc.
| PolyTable::const_data_iterator Sequence::PolyTable::begin | ( | ) | const [inherited] |
Definition at line 173 of file PolyTable.cc.
| PolyTable::data_iterator Sequence::PolyTable::begin | ( | ) | [inherited] |
Definition at line 153 of file PolyTable.cc.
| void Sequence::PolyTable::Binary | ( | bool | haveOutgroup = false, |
|
| unsigned | outgroup = 0, |
|||
| bool | strictInfSites = true | |||
| ) | [virtual, inherited] |
Recode the polymorphism table in 0,1 (binary notation)
| haveOutgroup | use true if an outgroup is present, false otherwise | |
| outgroup | the index of the outgroup in the data vector used to construct the object | |
| strictInfSites | if true, throw out all sites with > 2 character states (including outgroup!) |
If haveOutgroup is true, then 0 means an ancestral state and 1 a derived state in the resulting table. Note: if haveOutgroup == true, and there are sites with missing data in the outgroup sequence, those sites are removed from the data, since it's assumed you actually want to know ancestral/derived for every site. Reimplemented in Sequence::SimData.
Definition at line 440 of file PolyTable.cc.
| bool Sequence::PolyTable::empty | ( | ) | const [inherited] |
Definition at line 66 of file PolyTable.cc.
| PolyTable::const_data_iterator Sequence::PolyTable::end | ( | ) | const [inherited] |
Definition at line 182 of file PolyTable.cc.
| PolyTable::data_iterator Sequence::PolyTable::end | ( | ) | [inherited] |
Definition at line 163 of file PolyTable.cc.
| std::vector< std::string > Sequence::PolyTable::GetData | ( | void | ) | const [inherited] |
Returns PolyTable::data, a vector of std::strings containing polymorphic sites. Assuming the vector is returned to a vector<string> called data, accessing data[i][j] accesses the j-th site of the i-th sequence
Definition at line 527 of file PolyTable.cc.
| std::vector< double > Sequence::PolyTable::GetPositions | ( | void | ) | const [inherited] |
Returns PolyTable::positions.
Definition at line 519 of file PolyTable.cc.
| unsigned Sequence::PolyTable::numsites | ( | void | ) | const [inline, inherited] |
Return how many positions are stored in PolyTable::positions
Definition at line 233 of file PolyTable.hpp.
| Sequence::PolyTable::operator Sequence::polySiteVector | ( | ) | const [inherited] |
allow (implicit) typecast of Sequence::PolyTable to Sequence::polySiteVector
Definition at line 140 of file PolyTable.cc.
| reference Sequence::PolyTable::operator[] | ( | const size_type & | i | ) | [inline, inherited] |
Return the i-th element of PolyTable::data.
Definition at line 160 of file PolyTable.hpp.
| const_reference Sequence::PolyTable::operator[] | ( | const size_type & | i | ) | const [inline, inherited] |
Return the i-th element of PolyTable::data.
Definition at line 150 of file PolyTable.hpp.
| PolyTable::const_pos_iterator Sequence::PolyTable::pbegin | ( | ) | const [inherited] |
Definition at line 209 of file PolyTable.cc.
| PolyTable::pos_iterator Sequence::PolyTable::pbegin | ( | ) | [inherited] |
Definition at line 191 of file PolyTable.cc.
| PolyTable::const_pos_iterator Sequence::PolyTable::pend | ( | ) | const [inherited] |
Definition at line 217 of file PolyTable.cc.
| PolyTable::pos_iterator Sequence::PolyTable::pend | ( | ) | [inherited] |
Definition at line 200 of file PolyTable.cc.
| double Sequence::PolyTable::position | ( | const std::vector< double >::size_type & | i | ) | const [inline, inherited] |
Return the i-th position from the PolyTable::positions.
Definition at line 223 of file PolyTable.hpp.
| std::ostream & Sequence::PolySites::print | ( | std::ostream & | stream | ) | const [virtual] |
output a tab-delimited array of positions and character states
Allows objects of type Sequence::PolySites to be written to output streams. The output is a simple, tab-delimited table of variable site positions and characters
Implements Sequence::PolyTable.
Definition at line 169 of file PolySites.cc.
| std::istream & Sequence::PolySites::read | ( | std::istream & | h | ) | throw (Sequence::badFormat,std::exception) [virtual] |
read is a pure virtual function. Calls to istream & operator>> (istream & s, PolyTable & c) act via this routine, which must be defined in all derived classes
Implements Sequence::PolyTable.
Definition at line 142 of file PolySites.cc.
| void Sequence::PolyTable::RemoveAmbiguous | ( | bool | skipOutgroup = false, |
|
| unsigned | outgroup = 0 | |||
| ) | [virtual, inherited] |
go through the data and remove all the sites with states other than {A,G,C,T,N,-}
| skipOutgroup | default is false. If true, the character state of the outgroup is ignored. | |
| outgroup | the index of the outgroup in the data vector |
Definition at line 402 of file PolyTable.cc.
| void Sequence::PolyTable::RemoveMissing | ( | bool | skipOutgroup = false, |
|
| unsigned | outgroup = 0 | |||
| ) | [virtual, inherited] |
go through the data and remove all the sites with missing data (the character N).
| skipOutgroup | default is false. If true, the character state of the outgroup is ignored. | |
| outgroup | the index of the outgroup in the data vector |
Definition at line 360 of file PolyTable.cc.
| void Sequence::PolyTable::RemoveMultiHits | ( | bool | skipOutgroup = false, |
|
| unsigned | outgroup = 0 | |||
| ) | [virtual, inherited] |
go through the data and remove all the sites with more than 2 states segregating. By default, this routine also removes sites where there are 2 states segregating in the ingroup, and the outgroup (if present) has a 3rd state.
| skipOutgroup | default is false. If true, the character state of the outgroup is ignored. | |
| outgroup | the index of the outgroup in the data vector |
Definition at line 321 of file PolyTable.cc.
| PolyTable::const_site_iterator Sequence::PolyTable::sbegin | ( | ) | const [inherited] |
Definition at line 226 of file PolyTable.cc.
| PolyTable::const_site_iterator Sequence::PolyTable::send | ( | ) | const [inherited] |
Definition at line 241 of file PolyTable.cc.
| size_type Sequence::PolyTable::size | ( | void | ) | const [inline, inherited] |
Return how many std::strings are stored in PolyTable::data.
Definition at line 214 of file PolyTable.hpp.
1.6.3