00001 #ifndef OLIGOFAR_CSAMALIGNMENTS__HPP
00002 #define OLIGOFAR_CSAMALIGNMENTS__HPP
00003
00004 #include "ialignmentmap.hpp"
00005 #include "csam.hpp"
00006
00007 BEGIN_OLIGOFAR_SCOPES
00008
00009 class CSRead : public IShortRead
00010 {
00011 public:
00012 ~CSRead() { delete[] m_data; --s_count; }
00013
00014 virtual EPairType GetPairType() const { return x_GetPairType(); }
00015 virtual CSeqCoding::ECoding GetSequenceCoding() const { return x_GetSequenceCoding(); }
00016 virtual int GetSequenceLength() const { return x_GetSequenceLength(); }
00017 const IShortRead * GetMate() const { return x_GetPairType() ? x_GetMate() : 0; }
00018 const char * GetSequenceData() const { return m_data + x_GetSequenceOffset(); }
00019 const char * GetId() const { return x_GetPairType() == eRead_second ? x_GetMate()->GetId() : m_data + x_GetIdOffset(); }
00020
00021 CSRead( const string& id, CSeqCoding::ECoding coding, const char * sequence, int length, EPairType intended = eRead_single );
00022 CSRead( const string& id, const string& sequence, const string& quality = "", EPairType intended = eRead_single );
00023 CSRead( const CSRead& r );
00024 CSRead& operator = ( const CSRead& r );
00025
00026 void SetMate( CSRead * other );
00027
00028 EPairType GetIntendedPairType() const { return EPairType((m_data[2] >> 2)&3); }
00029
00030 string GetIupacna( CSeqCoding::EStrand = CSeqCoding::eStrand_pos ) const;
00031
00032 static int GetCount() { return s_count; }
00033
00034 protected:
00035 Uint2 x_GetIdLength() const { return Uint1( m_data[0] ); }
00036 Uint2 x_GetSequenceLength() const { return Uint1( m_data[1] ); }
00037 EPairType x_GetPairType() const { return EPairType( m_data[2] & 3 ); }
00038 CSeqCoding::ECoding x_GetSequenceCoding() const { return CSeqCoding::ECoding( Uint1( m_data[3] ) ); }
00039
00040 Uint1 x_GetHeaderSize() const { return 4 + ( x_GetPairType() ? sizeof( void* ) : 0 ); }
00041 Uint2 x_GetIdOffset() const { return x_GetHeaderSize(); }
00042 Uint2 x_GetSequenceOffset() const { return x_GetIdOffset() + x_GetIdLength() + 1; }
00043 Uint2 x_GetDataSize() const { return x_GetSequenceOffset() + x_GetSequenceLength() + 1; }
00044 Uint2 x_GetDataSize( int idlen, int seqlen, EPairType type ) const {
00045 switch( type ) {
00046 case eRead_single: return 6 + idlen + seqlen;
00047 case eRead_first: return 6 + sizeof( void * ) + idlen + seqlen;
00048 case eRead_second: return 5 + sizeof( void * ) + seqlen;
00049 }
00050 THROW( logic_error, "Unknown read pair type" );
00051 }
00052 const CSRead * x_GetMate() const { return *(CSRead**)(m_data + 4); }
00053 CSRead * & x_SetMate() { return *(CSRead**)(m_data + 4); }
00054 protected:
00055 char * m_data;
00056 static int s_count;
00057 };
00058
00059 class CContig : public INucSeq
00060 {
00061 public:
00062 ~CContig() { if( m_owns ) delete[] m_data; --s_count; }
00063 CContig( const string& name );
00064 CContig( const CContig& ctg );
00065 void SetSequenceData( char * data, size_t length, CSeqCoding::ECoding coding = CSeqCoding::eCoding_ncbi8na, bool owns = false ) {
00066 if( m_owns ) delete[] m_data;
00067 m_data = data; m_size = length; m_coding = coding; m_owns = owns;
00068 }
00069 virtual const char * GetId() const { return m_id.c_str(); }
00070 virtual const char * GetSequenceData() const { return m_data; }
00071 virtual int GetSequenceLength() const { return m_size; }
00072 virtual CSeqCoding::ECoding GetSequenceCoding() const { return m_coding; }
00073
00074 static int GetCount() { return s_count; }
00075 private:
00076 CContig& operator = ( const CContig& );
00077 protected:
00078 string m_id;
00079 char * m_data;
00080 size_t m_size;
00081 CSeqCoding::ECoding m_coding;
00082 bool m_owns;
00083
00084 static int s_count;
00085 };
00086
00087 class IAligner;
00088 class CMappedAlignment : public IMappedAlignment
00089 {
00090 public:
00091 typedef map<string,string> TTags;
00092 enum EFlags { fOwnQuery = 0x01, fOwnSubject = 0x02, fOwnSeqs = fOwnQuery|fOwnSubject };
00093 enum ETagType { eType_int = 'i', eType_float = 'f', eType_string = 'Z', eType_NONE = 0 };
00094 virtual const char * GetId() const { return 0; }
00095 virtual const IShortRead * GetQuery() const { return m_query; }
00096 virtual const INucSeq * GetSubject() const { return m_subject; }
00097 virtual TRange GetSubjectBounding() const { return TRange( m_sstart, m_sstart + m_slength - 1 ); }
00098 virtual TRange GetQueryBounding() const { return m_qlength > 0 ? TRange( m_qstart, m_qstart + m_qlength - 1 ) : TRange( m_qstart + m_qlength + 1, m_qstart ); }
00099 virtual const TTrSequence GetCigar() const { return m_cigar; }
00100 virtual bool IsReverseComplement() const { return m_qlength < 0; }
00101 virtual const IMappedAlignment * GetMate() const { return m_mate; }
00102 void SetMate( CMappedAlignment * other );
00103 void WriteAsSam( ostream& out ) const;
00104 void SetFlags( int flags, bool on = true ) { if( on ) m_flags |= flags; else m_flags &= ~flags; }
00105 void AdjustQueryFromBy( int );
00106 void AdjustQueryToBy( int );
00107 void AdjustSubjectFromBy( int );
00108 void AdjustSubjectToBy( int );
00109 void AdjustQueryFromTo( int );
00110 void AdjustQueryToTo( int );
00111 void AdjustSubjectFromTo( int );
00112 void AdjustSubjectToTo( int );
00113 void SetTagS( const string& name, const string& val );
00114 void SetTagI( const string& name, int val );
00115 void SetTagF( const string& name, float val );
00116 char GetTagType( const string& name ) const;
00117 string GetTagS( const string& name, char type = 'Z' ) const;
00118 int GetTagI( const string& name ) const;
00119 float GetTagF( const string& name ) const;
00120 void RemoveTag( const string& name );
00121 void Assign( const CMappedAlignment * other );
00122 void SetTags( TTags * tags ) { delete m_tags; m_tags = tags; }
00123 const TTags& GetTags() const { static TTags notags; if( m_tags ) return *m_tags; else return notags; }
00124 CMappedAlignment * MakeExtended( IAligner * aligner ) const;
00125 CMappedAlignment * Clone() const { return new CMappedAlignment( *this ); }
00126 CMappedAlignment( const CSRead * query, const CContig * subject, int from, const TTrVector& cigar, CSeqCoding::EStrand strand, int flags = 0, TTags * tags = 0 );
00127 ~CMappedAlignment() { if( m_flags & fOwnQuery ) delete m_query; if( m_flags & fOwnSubject ) delete m_subject; delete m_tags; --s_count; }
00128 void PrintDebug( ostream& out ) const {
00129 out << m_cigar.ToString() << DISPLAY( m_sstart ) << DISPLAY( m_slength ) << DISPLAY( m_qstart ) << DISPLAY( m_qlength ) << hex << DISPLAY( m_flags ) << dec;
00130 }
00131 bool ValidateConsistency( const string& context = "" ) const;
00132 bool operator == ( const CMappedAlignment& other ) const {
00133 return
00134 strcmp( m_query->GetId(), other.m_query->GetId() ) == 0 &&
00135 strcmp( m_subject->GetId(), other.m_subject->GetId() ) == 0 &&
00136 m_sstart == other.m_sstart && m_slength == other.m_slength &&
00137 m_qstart == other.m_qstart && m_qlength == other.m_qlength &&
00138 m_cigar == other.m_cigar;
00139 }
00140 bool operator != ( const CMappedAlignment& other ) const { return !operator == ( other ); }
00141
00142
00143 static int GetCount() { return s_count; }
00144
00145 double ScoreAlignment( double id = 1, double mm = -1, double go = -3, double ge = -1.5 ) const;
00146 static double ScoreAlignment( const TTrSequence& cigar, double id = 1, double mm = -1, double go = -3, double ge = -1.5 );
00147
00148 private:
00149 CMappedAlignment& operator = ( const CMappedAlignment& );
00150 enum EAdjustMode { eAdjust_query, eAdjust_subj };
00151 void AdjustAlignment( EAdjustMode, int fromBy, int toBy );
00152 protected:
00153
00154 CMappedAlignment( const CMappedAlignment& a );
00155 protected:
00156 const CSRead * m_query;
00157 const CContig * m_subject;
00158 const CMappedAlignment * m_mate;
00159 Int4 m_sstart;
00160 Int2 m_slength;
00161 Int2 m_qstart;
00162 Int2 m_qlength;
00163 Int2 m_flags;
00164 TTrSequence m_cigar;
00165 TTags * m_tags;
00166 static int s_count;
00167 public:
00168 static bool s_validate_consistency;
00169 };
00170
00171 class CSamSource : public IAlignmentMap, public CSamBase
00172 {
00173 public:
00174 typedef vector<CMappedAlignment*> TAlignmentList;
00175 typedef vector<Uint8> TFileOffsets;
00176 typedef map<string,CSRead*> TReads;
00177 typedef map<string,CContig*> TContigs;
00178 typedef map<string,TAlignmentList> TAlignmentMap;
00179 typedef map<string,TFileOffsets> TAlignmentOffsetMap;
00180 typedef map<pair<string, pair<string,int> >, CMappedAlignment* > TPendingHits;
00181
00182 enum ERegisterFlags { fRegisterQuery = 0x01, fRegisterSubject = 0x02, fRegisterAlignmentByQuery = 0x04, fRegisterAlignmentBySubject = 0x08 };
00183
00184 ~CSamSource();
00185 CMappedAlignment * ParseSamLine( const vector<string>& samLine, int flags = 0 );
00186 CMappedAlignment * ParseSamLine( const string& samLine, int flags = 0 );
00187
00188 void IndexFile( const string& samFile, unsigned start, unsigned limit );
00189
00190 static TTrVector GetFullCigar( const TTrVector& cigar, const string& mismatches );
00191
00192 virtual void GetAlignmentsForQueryId( ICallback *, const char * id, int mates = 3 ) {}
00193 virtual void GetAlignmentsForSubjectId( ICallback *, const char * id, int strands = 3, int from = kSequenceBegin, int to = kSequenceEnd ) {}
00194 virtual void GetAllAlignments( ICallback * ) {}
00195 virtual void GetAllQueries( ICallback * ) {}
00196 virtual void GetAllSubjects( ICallback * ) {}
00197
00198 virtual void AddAlignment( IMappedAlignment * ma ) {}
00199 virtual void AddSubject( INucSeq * ma ) {}
00200 virtual void AddQuery( IShortRead * ma ) {}
00201
00202 const TAlignmentOffsetMap& GetAlignmentOffsetMap() const { return m_alignmentOffsets; }
00203 const TFileOffsets& GetAlignmentOffsets( const string& ctg ) const {
00204 TAlignmentOffsetMap::const_iterator x = m_alignmentOffsets.find( ctg );
00205 if( x == m_alignmentOffsets.end() ) {
00206 static TFileOffsets null;
00207 return null;
00208 }
00209 else return x->second;
00210 }
00211 protected:
00212 TContigs m_contigs;
00213 TReads m_reads;
00214 TAlignmentMap m_alignmentsByContig;
00215 TAlignmentMap m_alignmentsByRead;
00216 TAlignmentOffsetMap m_alignmentOffsets;
00217 TPendingHits m_pendingHits;
00218 auto_ptr<ifstream> m_samFile;
00219 };
00220
00221 inline void CMappedAlignment::AdjustQueryFromBy( int by )
00222 {
00223 AdjustAlignment( eAdjust_query, by, 0 );
00224 }
00225
00226 inline void CMappedAlignment::AdjustQueryToBy( int by )
00227 {
00228 AdjustAlignment( eAdjust_query, 0, by );
00229 }
00230
00231 inline void CMappedAlignment::AdjustSubjectFromBy( int by )
00232 {
00233 AdjustAlignment( eAdjust_subj, by, 0 );
00234 }
00235
00236 inline void CMappedAlignment::AdjustSubjectToBy( int by )
00237 {
00238 AdjustAlignment( eAdjust_subj, 0, by );
00239 }
00240
00241 inline void CMappedAlignment::AdjustQueryFromTo( int newFrom )
00242 {
00243 AdjustQueryFromBy( newFrom - GetQueryBounding().GetFrom() );
00244 }
00245
00246 inline void CMappedAlignment::AdjustQueryToTo( int newTo )
00247 {
00248 AdjustQueryToBy( newTo - GetQueryBounding().GetTo() );
00249 }
00250
00251 inline void CMappedAlignment::AdjustSubjectFromTo( int newFrom )
00252 {
00253 AdjustSubjectFromBy( newFrom - GetSubjectBounding().GetFrom() );
00254 }
00255
00256 inline void CMappedAlignment::AdjustSubjectToTo( int newTo )
00257 {
00258 AdjustSubjectFromBy( newTo - GetSubjectBounding().GetTo() );
00259 }
00260
00261 END_OLIGOFAR_SCOPES
00262
00263 #endif
00264
00265