00001 #ifndef ALGO_ALIGN_SPLIGN__HPP
00002 #define ALGO_ALIGN_SPLIGN__HPP
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036 #include <corelib/ncbistd.hpp>
00037
00038 #include <objmgr/scope.hpp>
00039 #include <algo/align/nw/nw_formatter.hpp>
00040 #include <algo/align/util/blast_tabular.hpp>
00041
00042
00043 BEGIN_NCBI_SCOPE
00044
00045 class CBlastTabular; // hit type; aliased as CSplign::THit further down in this header
00046
00047 BEGIN_SCOPE(objects) // forward declarations of the objects:: types named in CSplign's interface
00048 class CScope; // held by CSplign through CRef<objects::CScope>
00049 class CSeq_id; // passed by const reference to CSplign::x_LoadSequence()
00050 class CScore_set; // element type of CSplign::TScoreSets
00051 class CSeq_align_set; // input of the CSplign::s_ComputeStats() overload taking a set
00052 END_SCOPE(objects)
00053
00054
00055 class CSplign: public CObject
00056 {
00057 public:
00058
00059 typedef CSplicedAligner TAligner;
00060
00061 CSplign(void);
00062 ~CSplign();
00063
00064
00065 CRef<TAligner>& SetAligner(void);
00066 CConstRef<TAligner> GetAligner(void) const;
00067 static CRef<CSplicedAligner> s_CreateDefaultAligner(bool low_query_quality);
00068
00069 CRef<objects::CScope> GetScope(void) const;
00070 CRef<objects::CScope>& SetScope(void);
00071 void PreserveScope(bool preserve_scope = true);
00072
00073 void SetEndGapDetection(bool on);
00074 bool GetEndGapDetection(void) const;
00075
00076 void SetPolyaDetection(bool on);
00077 bool GetPolyaDetection(void) const;
00078
00079 void SetStrand(bool strand);
00080 bool GetStrand(void) const;
00081
00082 void SetMaxGenomicExtent(size_t mge);
00083 static size_t s_GetDefaultMaxGenomicExtent(void);
00084 size_t GetMaxGenomicExtent(void) const;
00085
00086 void SetMaxIntron(size_t max_intron);
00087 size_t GetMaxIntron(void) const;
00088
00089 void SetCompartmentPenalty(double penalty);
00090 static double s_GetDefaultCompartmentPenalty(void);
00091 double GetCompartmentPenalty(void) const;
00092
00093 void SetMinCompartmentIdentity(double idty);
00094 static double s_GetDefaultMinCompartmentIdty(void);
00095 double GetMinCompartmentIdentity(void) const;
00096
00097 void SetMinSingletonIdentity(double idty);
00098 double GetMinSingletonIdentity(void) const;
00099
00100 void SetMinSingletonIdentityBps(size_t idty);
00101 size_t GetMinSingletonIdentityBps(void) const;
00102
00103 void SetMinExonIdentity(double idty);
00104 static double s_GetDefaultMinExonIdty(void);
00105 double GetMinExonIdentity(void) const;
00106
00107 void SetStartModelId(size_t model_id) {
00108 m_model_id = model_id - 1;
00109 }
00110 size_t GetNextModelId(void) const {
00111 return m_model_id + 1;
00112 }
00113
00114 void SetMaxCompsPerQuery(size_t m);
00115 size_t GetMaxCompsPerQuery(void) const;
00116
00117 typedef CNWFormatter::SSegment TSegment;
00118 typedef vector<TSegment> TSegments;
00119
00120
00121 struct SAlignedCompartment {
00122
00123 size_t m_Id;
00124
00125 enum ECompartmentStatus {
00126 eStatus_Ok,
00127 eStatus_Empty,
00128 eStatus_Error
00129 };
00130
00131 ECompartmentStatus m_Status;
00132
00133 string m_Msg;
00134 bool m_QueryStrand, m_SubjStrand;
00135 size_t m_Cds_start, m_Cds_stop;
00136 size_t m_QueryLen;
00137 size_t m_PolyA;
00138 float m_Score;
00139 TSegments m_Segments;
00140
00141 SAlignedCompartment(void):
00142 m_Id(0),
00143 m_Status(eStatus_Empty),
00144 m_Cds_start(0), m_Cds_stop(0),
00145 m_QueryLen (0),
00146 m_PolyA(0),
00147 m_Score(0)
00148 {}
00149
00150 SAlignedCompartment(size_t id, const char* msg):
00151 m_Id(id),
00152 m_Status(eStatus_Empty),
00153 m_Msg(msg),
00154 m_Cds_start(0), m_Cds_stop(0),
00155 m_QueryLen(0),
00156 m_PolyA(0),
00157 m_Score(0)
00158 {}
00159
00160
00161 double GetIdentity(void) const;
00162
00163
00164 void GetBox(Uint4* box) const;
00165
00166
00167 typedef vector<char> TNetCacheBuffer;
00168 void ToBuffer (TNetCacheBuffer* buf) const;
00169 void FromBuffer (const TNetCacheBuffer& buf);
00170 };
00171
00172 typedef CBlastTabular THit;
00173 typedef CRef<THit> THitRef;
00174 typedef vector<THitRef> THitRefs;
00175
00176
00177 void Run(THitRefs* hitrefs);
00178 typedef vector<SAlignedCompartment> TResults;
00179
00180
00181 const TResults& GetResult(void) const {
00182 return m_result;
00183 }
00184
00185
00186 bool AlignSingleCompartment(THitRefs* hitrefs,
00187 size_t range_left, size_t range_right,
00188 SAlignedCompartment* result);
00189
00190
00191 bool AlignSingleCompartment(CRef<objects::CSeq_align> compartment,
00192 SAlignedCompartment* result);
00193
00194
00195
00196 void ClearMem(void);
00197
00198 typedef pair<size_t,size_t> TOrf;
00199 typedef pair<TOrf,TOrf> TOrfPair;
00200 TOrfPair GetCds(const THit::TId & id, const vector<char> * seq_data = 0);
00201
00202 static size_t s_TestPolyA(const char * seq, size_t dim, size_t cds_stop = 0);
00203
00204
00205
00206 enum ECompartmentScores {
00207 eCS_Matches = 6,
00208 eCS_OverallIdentity = 10,
00209 eCS_InframeMatches = 20,
00210 eCS_InframeIdentity = 22,
00211 eCS_Splices = 23,
00212 eCS_ConsensusSplices = 24,
00213 eCS_ProductCoverage = 27,
00214 eCS_ExonIdentity = 28,
00215 eCS_CombinationIdentity = 32
00216 };
00217
00218 enum EStatFlags {
00219 eSF_BasicNonCds = 1 << 0,
00220 eSF_BasicCds = 1 << 1
00221 };
00222
00223 typedef list<CRef<objects::CScore_set> > TScoreSets;
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239 static size_t s_ComputeStats(
00240 CRef<objects::CSeq_align_set> sas,
00241 TScoreSets * output_stats,
00242 TOrf cds = TOrf(0, 0),
00243 EStatFlags flags = eSF_BasicNonCds);
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259 static CRef<objects::CScore_set> s_ComputeStats(
00260 CRef<objects::CSeq_align> sa,
00261 bool embed_scoreset = true,
00262 TOrf cds = TOrf(0, 0),
00263 EStatFlags flags = eSF_BasicNonCds);
00264
00265 protected:
00266
00267
00268 CRef<TAligner> m_aligner;
00269
00270
00271 CRef<objects::CScope> m_Scope;
00272 bool m_CanResetHistory;
00273
00274
00275 vector<size_t> m_pattern;
00276
00277
00278 double m_MinExonIdty;
00279
00280
00281 double m_CompartmentPenalty;
00282
00283
00284 double m_MinCompartmentIdty;
00285
00286
00287
00288
00289
00290 double m_MinSingletonIdty;
00291
00292 size_t m_MinSingletonIdtyBps;
00293
00294
00295
00296 bool m_endgaps;
00297
00298
00299 struct SAlnMapElem {
00300 size_t m_box [4];
00301 int m_pattern_start, m_pattern_end;
00302 };
00303 vector<SAlnMapElem> m_alnmap;
00304
00305 typedef map<string,TOrfPair> TStrIdToOrfs;
00306 TStrIdToOrfs m_OrfMap;
00307
00308
00309 vector<char> m_mrna;
00310 bool m_strand;
00311 size_t m_polya_start;
00312 bool m_nopolya;
00313
00314 size_t m_cds_start;
00315 size_t m_cds_stop;
00316
00317
00318 vector<char> m_genomic;
00319
00320
00321 size_t m_max_genomic_ext;
00322
00323
00324 size_t m_MaxIntron;
00325
00326
00327
00328 pair<size_t, size_t> m_BoundingRange;
00329
00330
00331 TSegments m_segments;
00332
00333
00334 size_t m_model_id;
00335 TResults m_result;
00336
00337 size_t m_MaxCompsPerQuery;
00338
00339 size_t m_MinPatternHitLength;
00340
00341 SAlignedCompartment x_RunOnCompartment( THitRefs* hitrefs,
00342 size_t range_left,
00343 size_t range_right);
00344
00345 float x_Run(const char* seq1, const char* seq2);
00346
00347 void x_SplitQualifyingHits(THitRefs* phitrefs);
00348 void x_SetPattern(THitRefs* hitrefs);
00349 bool x_ProcessTermSegm(TSegment** term_segs, Uint1 side) const;
00350 Uint4 x_GetGenomicExtent(const Uint4 query_extent, Uint4 max_ext = 0) const;
00351 void x_FinalizeAlignedCompartment(SAlignedCompartment & ac);
00352
00353 void x_LoadSequence(vector<char>* seq,
00354 const objects::CSeq_id& seqid,
00355 THit::TCoord start,
00356 THit::TCoord finish,
00357 bool retain);
00358
00359 static THitRef sx_NewHit(THit::TCoord q0, THit::TCoord q,
00360 THit::TCoord s0, THit::TCoord s);
00361
00362
00363 CSplign(const CSplign&);
00364 CSplign& operator=(const CSplign&);
00365 };
00366
00367
00368 END_NCBI_SCOPE
00369
00370
00371 #endif
00372
00373