00001 #ifndef OLIGOFAR_CSEQCODING__HPP
00002 #define OLIGOFAR_CSEQCODING__HPP
00003
00004 #include "defs.hpp"
00005 #include <cmath>
00006
00007 BEGIN_OLIGOFAR_SCOPES
00008
00009 class CIupacnaBase;
00010 class CNcbipnaBase;
00011 class CNcbiqnaBase;
00012 class CNcbi8naBase;
00013 class CNcbi4naBase;
00014 class CNcbi2naBase;
00015 class CColorTwoBase;
00016
/// Common base of the per-base coding classes; it only contributes the
/// coding and strand enumerations they share.
class CSeqCoding
{
public:
    /// Identifiers of the supported base encodings.
    enum ECoding {
        eCoding_iupacna,
        eCoding_ncbipna,
        eCoding_ncbi8na,
        eCoding_ncbi4na,
        eCoding_ncbi2na,
        eCoding_ncbiqna,
        eCoding_colorsp
    };

    /// Strand selector; the values 0 and 1 allow two strands to be combined with XOR.
    enum EStrand {
        eStrand_pos = 0,
        eStrand_neg = 1
    };

    /// Combine two strand flags: equal flags yield eStrand_pos, different flags eStrand_neg.
    friend EStrand operator ^ ( EStrand a, EStrand b )
    {
        return static_cast<EStrand>( static_cast<int>( a ) ^ static_cast<int>( b ) );
    }
};
00036
00037 class CIupacnaBase : public CSeqCoding
00038 {
00039 public:
00040 CIupacnaBase( const char * c, EStrand strand ) : m_base( *c ) { if( strand == eStrand_neg ) m_base = s_complement[(int)m_base]; }
00041 CIupacnaBase( const char * c ) : m_base( *c ) {}
00042 CIupacnaBase( char c ) : m_base( c ) {}
00043 CIupacnaBase( const CNcbi8naBase& );
00044 CIupacnaBase( const CNcbi4naBase& );
00045 CIupacnaBase( const CNcbi2naBase& );
00046 CIupacnaBase( const CNcbiqnaBase& , int cutoff = 5 );
00047 CIupacnaBase( const CNcbipnaBase& , int score = 127 );
00048 CIupacnaBase( const CColorTwoBase& b );
00049 CIupacnaBase( const CIupacnaBase& b, const CColorTwoBase& c );
00050 CIupacnaBase Complement() const { return s_complement[(int)m_base]; }
00051 CIupacnaBase Get( EStrand strand ) const { return strand == eStrand_neg ? Complement() : *this; }
00052 operator char () const { return m_base; }
00053 static CIupacnaBase Any() { return CIupacnaBase('N'); }
00054 static int BytesPerBase() { return 1; }
00055 protected:
00056 char m_base;
00057 static char s_complement[];
00058 };
00059
00060 class CNcbipnaBase : public CSeqCoding
00061 {
00062 public:
00063 CNcbipnaBase( const char * base, EStrand strand ) { if( strand == eStrand_neg ) { copy( base + 4, base, m_base ); m_base[4] = base[4]; } else copy( base, base+5, m_base ); }
00064 CNcbipnaBase( const char * base ) { copy( base, base+5, m_base ); }
00065 CNcbipnaBase( const unsigned char * base ) { copy( base, base+5, m_base ); }
00066 CNcbipnaBase( const CIupacnaBase& b );
00067 CNcbipnaBase( const CNcbi8naBase& b ) { x_Init( b ); }
00068 CNcbipnaBase( const CNcbi4naBase& b );
00069 CNcbipnaBase( const CNcbi2naBase& );
00070 CNcbipnaBase( const CNcbiqnaBase& );
00071 CNcbipnaBase( const CColorTwoBase& b ) { THROW( logic_error, "CNcbipnaBase( CColorTwoBase ) should not be called" ); }
00072 operator char () const { THROW( logic_error, "CNcbiqnaBase::operator char () should not be called!" ); }
00073 operator const char * () const { return m_base; }
00074 unsigned char operator [] (int i) const { return m_base[i]; }
00075 CNcbipnaBase Complement() const { return CNcbipnaBase( m_base, eStrand_neg ); }
00076 CNcbipnaBase Get( EStrand strand ) const { return strand == eStrand_neg ? Complement() : *this; }
00077 static int BytesPerBase() { return 5; }
00078 protected:
00079 void x_Init( const CNcbi8naBase& );
00080 char m_base[5];
00081 };
00082
00083 class CNcbiqnaBase : public CSeqCoding
00084 {
00085 public:
00086 CNcbiqnaBase( const char * base, EStrand strand ): m_base( *base ) { if( strand == eStrand_neg ) m_base = m_base ^ '\x03'; }
00087 CNcbiqnaBase( const char * c ) : m_base( *c ) {}
00088 CNcbiqnaBase( char c ) : m_base( c ) {}
00089 CNcbiqnaBase( int c ) : m_base( c ) {}
00090 CNcbiqnaBase( const CNcbipnaBase& b );
00091 CNcbiqnaBase( const CNcbi8naBase& b, unsigned score = 63 );
00092 CNcbiqnaBase( const CNcbi4naBase& b, unsigned score = 63 );
00093 CNcbiqnaBase( const CNcbi2naBase& b, unsigned score = 63 );
00094 CNcbiqnaBase( const CIupacnaBase& b, unsigned score = 63 );
00095 CNcbiqnaBase( const CColorTwoBase& b ) { THROW( logic_error, "CNcbiqnaBase( CColorTwoBase ) should not be called" ); }
00096 CNcbiqnaBase Complement() const { return m_base ^ '\x03'; }
00097 CNcbiqnaBase Get( EStrand strand ) const { return ( strand == eStrand_neg ) ? Complement() : *this; }
00098 operator char () const { return m_base; }
00099 int GetPhrapScore() const { return ((unsigned char)m_base) >> 2; }
00100 static char AdjustScore( int score );
00101 static CNcbiqnaBase Any() { return CNcbiqnaBase( 0 ); }
00102 static int BytesPerBase() { return 1; }
00103 protected:
00104 char m_base;
00105 };
00106
00107 class CNcbi8naBase : public CSeqCoding
00108 {
00109 public:
00110 enum EBase { fBase_A = 0x01, fBase_C = 0x02, fBase_G = 0x04, fBase_T = 0x08 };
00111 CNcbi8naBase( const char * c, EStrand strand ) : m_base( *c ) { if( strand == eStrand_neg ) m_base = s_complement[(int)m_base]; }
00112 CNcbi8naBase( const char * c ) : m_base( *c ) {}
00113 CNcbi8naBase( char c, EStrand strand = eStrand_pos );
00114 CNcbi8naBase( int c, EStrand strand = eStrand_pos );
00115 CNcbi8naBase( unsigned int c, EStrand strand = eStrand_pos );
00116 CNcbi8naBase( const CIupacnaBase& b );
00117 CNcbi8naBase( const CNcbipnaBase& b, int score = 127 );
00118 CNcbi8naBase( const CNcbiqnaBase& b, int cutoff = 5 );
00119 CNcbi8naBase( const CNcbi8naBase& b, int ) : m_base( b ) {}
00120 CNcbi8naBase( const CNcbi4naBase& b );
00121 CNcbi8naBase( const CNcbi2naBase& b );
00122 CNcbi8naBase( const CColorTwoBase& b ) { THROW( logic_error, "CNcbi8naBase( CColorTwoBase ) should not be called" ); }
00123 CNcbi8naBase( const CNcbi8naBase& b, const CColorTwoBase& c );
00124 CNcbi8naBase Complement() const;
00125 CNcbi8naBase Get( EStrand strand ) const { return ( strand == eStrand_neg ) ? Complement() : *this; }
00126 bool IsAmbiguous() const { return GetAltCount() > 1; }
00127 int GetAltCount() const { return s_altCount[(int)m_base]; }
00128 operator char () const { return m_base; }
00129 static CNcbi8naBase Any() { return CNcbi8naBase( '\x0f' ); }
00130 static int BytesPerBase() { return 1; }
00131 CNcbi2naBase GetSmallestNcbi2na() const;
00132 protected:
00133 char m_base;
00134 static char s_fromIupacna[];
00135 static char s_complement[];
00136 static char s_altCount[];
00137 static char s_smallestNcbi2na[];
00138 };
00139
00140 class CNcbi4naBase : public CNcbi8naBase
00141 {
00142 public:
00143 CNcbi4naBase( const CNcbi8naBase& b ) : CNcbi8naBase( b ) {}
00144 };
00145
00146 class CNcbi2naBase : public CSeqCoding
00147 {
00148 public:
00149 CNcbi2naBase( const char * c, EStrand strand ) : m_base( *c ) { if( strand == eStrand_neg ) m_base = '\x03' ^ m_base; }
00150 CNcbi2naBase( const char * c ) : m_base( *c ) {}
00151 CNcbi2naBase( char c ) : m_base( c ) {}
00152 CNcbi2naBase( int c ) : m_base( c ) {}
00153 CNcbi2naBase( unsigned int c ) : m_base( c ) {}
00154 CNcbi2naBase( const CNcbipnaBase& b, int score = 127 );
00155 CNcbi2naBase( const CIupacnaBase& b );
00156 CNcbi2naBase( const CNcbi8naBase& b );
00157 CNcbi2naBase( const CNcbi4naBase& b );
00158 CNcbi2naBase( const CNcbiqnaBase& b );
00159 operator char () const { return m_base; }
00160 CNcbi2naBase Complement() const { return '\x03' ^ m_base; }
00161 CNcbi2naBase Get( EStrand strand ) const { return ( strand == eStrand_neg ) ? Complement() : *this; }
00162 protected:
00163 char m_base;
00164 static char s_iupacnaTable[];
00165 static char s_ncbi8naTable[];
00166 };
00167
00168 class CColorTwoBase : public CSeqCoding
00169 {
00170 public:
00171 enum EColorHi { eColor_BLUE = 0x00, eColor_GREEN = 0x10, eColor_YELLOW = 0x20, eColor_RED = 30 };
00172 enum EFromASCII { eFromASCII };
00173 CColorTwoBase( char c ) : m_base( c ) {}
00174 CColorTwoBase( char c, EStrand ) : m_base( c ) {}
00175 CColorTwoBase( const char * c ) : m_base( *c ) {}
00176 CColorTwoBase( const char * c, EStrand ) : m_base( *c ) {}
00177 CColorTwoBase( EFromASCII, char c ) { x_Init( c ); }
00178 CColorTwoBase( CNcbi2naBase b ) : m_base( CNcbi8naBase( b ) ) {}
00179 CColorTwoBase( CNcbiqnaBase b ) : m_base( CNcbi8naBase( b ) ) {}
00180 CColorTwoBase( CNcbi8naBase b ) : m_base( b ) {}
00181 CColorTwoBase( CNcbipnaBase b ) : m_base( CNcbi8naBase(b) ) {}
00182 CColorTwoBase( CNcbi2naBase prev, CNcbi2naBase b ) :
00183 m_base( s_dibase2code[(prev << 2) | b] | CColorTwoBase(b) ) {}
00184 CColorTwoBase( CNcbiqnaBase prev, CNcbiqnaBase b ) :
00185 m_base( s_dibase2code[(prev&0x30 << 2) | b&0x30] | CColorTwoBase(b) ) {}
00186 CColorTwoBase( CNcbi2naBase prev, CNcbi8naBase b ) :
00187 m_base( s_dibase2code[(prev << 2) | b.GetSmallestNcbi2na()] | CColorTwoBase(b) ) {}
00188 CColorTwoBase( CColorTwoBase prev, CColorTwoBase b ) { THROW( logic_error, "CColorTwoBase( CColorTwoBase, CColorTwoBase ) should never be called" ); }
00189 CColorTwoBase( CNcbi8naBase prev, CNcbi8naBase b ) :
00190 m_base( s_dibase2code[(prev.GetSmallestNcbi2na() << 2) | b.GetSmallestNcbi2na()] | CColorTwoBase(b) ) {}
00191 CColorTwoBase( CNcbipnaBase prev, CNcbipnaBase b ) :
00192 m_base( s_dibase2code[(CNcbi8naBase(prev).GetSmallestNcbi2na() << 2) | CNcbi8naBase(b).GetSmallestNcbi2na()] | CColorTwoBase(b) ) {}
00193
00194
00195 operator char () const { return m_base; }
00196 CNcbi8naBase GetBaseCalls() const { return m_base & 0xf; }
00197 CColorTwoBase Complement() const { return GetBaseCalls().Complement() | GetColor(); }
00198 int GetColor() const { return m_base & 0x30; }
00199 int GetColorOrd() const { return GetColor() >> 4; }
00200 char AsAscii( bool tryBaseCalls = true ) const { return tryBaseCalls && GetBaseCalls() ? (char)CIupacnaBase( GetBaseCalls() ) : ('0' + GetColorOrd()); }
00201 static int BytesPerBase() { return 1; }
00202 protected:
00203 void x_Init( char c );
00204 protected:
00205 static char s_dibase2code[];
00206 char m_base;
00207 };
00208
00209
/// Generic match score: both arguments are converted to CNcbi8naBase and the
/// result is 1 when their bit masks share at least one bit, otherwise 0.
template<class BaseA, class BaseB>
inline int ComputeScore( const BaseA& a, const BaseB& b )
{
    const bool overlap = ( CNcbi8naBase( a ) & CNcbi8naBase( b ) ) != 0;
    return overlap ? 1 : 0;
}
00215
00216 inline int ComputeScore( const CNcbi8naBase& a, const CNcbipnaBase& b )
00217 {
00218 return
00219 (( a & 0x01 ? b[0] : 0 ) +
00220 ( a & 0x02 ? b[1] : 0 ) +
00221 ( a & 0x04 ? b[2] : 0 ) +
00222 ( a & 0x08 ? b[3] : 0 )) / (b[0] + b[1] + b[2] + b[3]);
00223 }
00224
00225 inline int ComputeScore( const CNcbipnaBase& b, const CNcbi8naBase& a )
00226 {
00227 return
00228 (( a & 0x01 ? b[0] : 0 ) +
00229 ( a & 0x02 ? b[1] : 0 ) +
00230 ( a & 0x04 ? b[2] : 0 ) +
00231 ( a & 0x08 ? b[3] : 0 )) / (b[0] + b[1] + b[2] + b[3]);
00232 }
00233
00234 inline int operator * ( const CNcbi2naBase& a, const CNcbi2naBase& b )
00235 {
00236 return a == b;
00237 }
00238
00239
00240
00241 inline CIupacnaBase::CIupacnaBase( const CNcbi8naBase& b ) : m_base("-ACMGRSVTWYHKDBN"[b&0x0f]) {}
00242 inline CIupacnaBase::CIupacnaBase( const CNcbi4naBase& b ) : m_base("-ACMGRSVTWYHKDBN"[b&0x0f]) {}
00243 inline CIupacnaBase::CIupacnaBase( const CNcbi2naBase& b ) : m_base("ACGT"[b&0x03]) {}
00244 inline CIupacnaBase::CIupacnaBase( const CNcbipnaBase& b, int score ) : m_base(0) { m_base = CIupacnaBase( CNcbi8naBase( b, score ) ); }
00245 inline CIupacnaBase::CIupacnaBase( const CNcbiqnaBase& b, int cutoff ) : m_base( b.GetPhrapScore() > cutoff ? "ACGT"[b&3] : 'N' ) {}
00246 inline CIupacnaBase::CIupacnaBase( const CColorTwoBase& b ) { m_base = b.GetBaseCalls() ? char(CIupacnaBase( b.GetBaseCalls() )) : char(( b.GetColorOrd() + '0' )); }
00247 inline CIupacnaBase::CIupacnaBase( const CIupacnaBase& b, const CColorTwoBase& c )
00248 { m_base = "ACGTCATGGTACTGCA"[(CNcbi2naBase(b) << 2)|c.GetColorOrd()]; }
00249
00250
00251
00252 inline CNcbipnaBase::CNcbipnaBase( const CIupacnaBase& b )
00253 {
00254 x_Init( CNcbi8naBase( b ) );
00255 }
00256
00257 inline CNcbipnaBase::CNcbipnaBase( const CNcbi2naBase& b )
00258 {
00259 fill( m_base, m_base + 5, 0 );
00260 m_base[b] = m_base[4] = '\xff';
00261 }
00262
00263 inline CNcbipnaBase::CNcbipnaBase( const CNcbi4naBase& b )
00264 {
00265 x_Init( b );
00266 }
00267
00268 inline void CNcbipnaBase::x_Init( const CNcbi8naBase& b )
00269 {
00270 fill( m_base, m_base + 5, 0 );
00271 if( b & 0x01 ) m_base[0] = '\xff';
00272 if( b & 0x02 ) m_base[1] = '\xff';
00273 if( b & 0x03 ) m_base[2] = '\xff';
00274 if( b & 0x04 ) m_base[3] = '\xff';
00275 m_base[4] = b ? '\xff' : 0;
00276 }
00277
00278 inline CNcbipnaBase::CNcbipnaBase( const CNcbiqnaBase& b )
00279 {
00280 Uint1 val = Uint1( 255 * ( std::pow(10.0, b.GetPhrapScore() / 10 ) ) );
00281 int x = CNcbi2naBase( b );
00282 m_base[0] = x-- ? val : '\xff';
00283 m_base[1] = x-- ? val : '\xff';
00284 m_base[2] = x-- ? val : '\xff';
00285 m_base[3] = x-- ? val : '\xff';
00286 m_base[4] = '\xff';
00287 }
00288
00289
00290
00291 inline char CNcbiqnaBase::AdjustScore( int score )
00292 {
00293 if( score < 0 ) score = 0;
00294 if( score > 63 ) score = 63;
00295 return score;
00296 }
00297
00298 inline CNcbiqnaBase::CNcbiqnaBase( const CNcbipnaBase& b ) : m_base(0)
00299 {
00300 int max = 0, next = 0;
00301 for( int i = 0; i < 4; ++i ) {
00302 if( b[i] > max ) {
00303 m_base = i;
00304 next = max;
00305 max = b[i];
00306 } else if( b[i] > next ) next = b[i];
00307 }
00308 if( next == 0 ) m_base |= 0xfc;
00309 else {
00310 double pscore = 10*std::log10( double( max ) / next );
00311 m_base |= AdjustScore( int(pscore) ) << 2;
00312 }
00313 }
00314 inline CNcbiqnaBase::CNcbiqnaBase( const CNcbi8naBase& b, unsigned score )
00315 {
00316 if( b.GetAltCount() > 1 ) m_base = 0;
00317 else m_base = CNcbi2naBase( b ) | (AdjustScore(score) << 2);
00318 }
00319
00320 inline CNcbiqnaBase::CNcbiqnaBase( const CNcbi4naBase& b, unsigned score )
00321 {
00322 if( b.GetAltCount() > 1 ) m_base = 0;
00323 else m_base = CNcbi2naBase( b ) | (AdjustScore(score) << 2);
00324 }
00325
00326 inline CNcbiqnaBase::CNcbiqnaBase( const CNcbi2naBase& b, unsigned score )
00327 {
00328 m_base = b | (AdjustScore(score) << 2);
00329 }
00330
00331 inline CNcbiqnaBase::CNcbiqnaBase( const CIupacnaBase& b, unsigned score )
00332 {
00333 CNcbi8naBase _b( b );
00334 if( _b.GetAltCount() > 1 ) m_base = 0;
00335 else m_base = CNcbi2naBase( b ) | (AdjustScore(score) << 2);
00336 }
00337
00338
00339
00340 inline CNcbi8naBase::CNcbi8naBase( char c, EStrand strand ) : m_base( c )
00341 {
00342 if( strand == eStrand_neg ) m_base = s_complement[int(m_base)];
00343 }
00344
00345 inline CNcbi8naBase::CNcbi8naBase( int c, EStrand strand ) : m_base( c )
00346 {
00347 if( strand == eStrand_neg ) m_base = s_complement[int(m_base)];
00348 }
00349
00350 inline CNcbi8naBase::CNcbi8naBase( unsigned int c, EStrand strand ) : m_base( c )
00351 {
00352 if( strand == eStrand_neg ) m_base = s_complement[int(m_base)];
00353 }
00354
00355 inline CNcbi8naBase::CNcbi8naBase( const CNcbi4naBase& b ) : m_base( b ) {}
00356 inline CNcbi8naBase::CNcbi8naBase( const CNcbi2naBase& b ) : m_base( "\x01\x02\x04\x08"[(int)b] ) {}
00357
00358 inline CNcbi8naBase::CNcbi8naBase( const CNcbipnaBase& b, int score ) : m_base(0)
00359 {
00360 if( b[0] > score ) m_base |= 0x01;
00361 if( b[1] > score ) m_base |= 0x02;
00362 if( b[2] > score ) m_base |= 0x04;
00363 if( b[3] > score ) m_base |= 0x08;
00364 if( (!m_base) && ( b[4] > score ) ) m_base = 0x0f;
00365 }
00366
00367 inline CNcbi8naBase::CNcbi8naBase( const CIupacnaBase& b ) : m_base( s_fromIupacna[b] )
00368 {
00369 if( m_base == '\xf0' ) THROW( runtime_error, "Invalid IUPACNA base ASCII[" << int(b) << "] " << b );
00370 }
00371
00372 inline CNcbi8naBase::CNcbi8naBase( const CNcbi8naBase& b, const CColorTwoBase& c )
00373 { m_base = "\x1\x2\x4\x8\x2\x1\x8\x4\x4\x8\x1\x2\x8\x4\x2\x1"[(CNcbi2naBase(b) << 2)|c.GetColorOrd()]; }
00374
00375 inline CNcbi8naBase::CNcbi8naBase( const CNcbiqnaBase& b, int cutoff ) : m_base( b.GetPhrapScore() > cutoff ? CNcbi8naBase( CNcbi2naBase( b&3 ) ) : Any() ) {}
00376 inline CNcbi8naBase CNcbi8naBase::Complement() const { return s_complement[(int)m_base]; }
00377 inline CNcbi2naBase CNcbi8naBase::GetSmallestNcbi2na() const { return s_smallestNcbi2na[int(m_base)]; }
00378
00379
00380
00381 inline CNcbi2naBase::CNcbi2naBase( const CIupacnaBase& b ) : m_base( s_iupacnaTable[b] )
00382 {
00383 if( m_base == '\xf0' ) THROW( runtime_error, "Invalid IUPACNA base ASCII[" << int(b) << "] " << b );
00384 }
00385
00386 inline CNcbi2naBase::CNcbi2naBase( const CNcbi8naBase& b ) : m_base( s_ncbi8naTable[b] )
00387 {
00388 if( m_base == '\xf0' ) THROW( runtime_error, "Invalid NCBI8NA base 0x" << setw(2) << hex << setfill('0') << int(b) << " IUPACNA " << CIupacnaBase(b) );
00389 }
00390
00391 inline CNcbi2naBase::CNcbi2naBase( const CNcbi4naBase& b ) : m_base( s_ncbi8naTable[b] )
00392 {
00393 if( m_base == '\xf0' ) THROW( runtime_error, "Invalid NCBI8NA base 0x" << setw(2) << hex << setfill('0') << int(b) << " IUPACNA " << CIupacnaBase(b) );
00394 }
00395
00396 inline CNcbi2naBase::CNcbi2naBase( const CNcbiqnaBase& b ) : m_base( b & 3 ) {}
00397
00398
00399
00400 inline void CColorTwoBase::x_Init( char c )
00401 {
00402 static const char * t = "0123ACGT";
00403 const char * x = strchr( t, toupper( c ) );
00404 if( x == 0 || *x == 0 ) THROW( runtime_error, "Invalid colorspace base " << c );
00405 m_base = x - t;
00406 if( m_base < 4 ) (m_base <<= 4);
00407 else m_base = CNcbi8naBase( CNcbi2naBase( m_base & 0x03 ) );
00408 }
00409
00410 END_OLIGOFAR_SCOPES
00411
00412 #endif
00413
00414