00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #include <ncbi_pch.hpp>
00033
00034 #include "compart.hpp"
00035 #include "em.hpp"
00036
00037 #include <algo/align/util/compartment_finder.hpp>
00038 #include <objtools/data_loaders/genbank/gbloader.hpp>
00039 #include <objects/seqloc/Seq_id.hpp>
00040 #include <objmgr/util/seq_loc_util.hpp>
00041
00042 #include <math.h>
00043
00044 BEGIN_NCBI_SCOPE
00045
00046
00047 void CCompartApp::Init()
00048 {
00049 HideStdArgs(fHideLogfile | fHideConffile | fHideVersion);
00050
00051 auto_ptr<CArgDescriptions> argdescr(new CArgDescriptions);
00052 argdescr->SetUsageContext(GetArguments().GetProgramName(),
00053 "Compart v.1.35. Unless -qdb and -sdb are specified, "
00054 "the tool expects tabular blast hits at stdin collated "
00055 "by query and subject, e.g. with 'sort -k 1,1 -k 2,2'");
00056
00057 argdescr->AddOptionalKey ("qdb", "qdb", "cDNA BLAST database",
00058 CArgDescriptions::eString);
00059
00060 argdescr->AddOptionalKey ("sdb", "sdb", "Genomic BLAST database",
00061 CArgDescriptions::eString);
00062
00063 argdescr->AddFlag ("ho", "Print raw hits only - no compartments");
00064
00065 argdescr->AddDefaultKey("penalty", "penalty", "Per-compartment penalty",
00066 CArgDescriptions::eDouble, "0.55");
00067
00068 argdescr->AddDefaultKey("min_idty", "min_idty", "Minimal overall identity",
00069 CArgDescriptions::eDouble, "0.70");
00070
00071 argdescr->AddDefaultKey("min_singleton_idty", "min_singleton_idty",
00072 "Minimal identity for singleton compartments. "
00073 "The actual parameter passed to the compartmentization "
00074 "procedure is least of this parameter multipled "
00075 "by the seq length, and min_singleton_idty_bps.",
00076 CArgDescriptions::eDouble, "0.70");
00077
00078 argdescr->AddDefaultKey("min_singleton_idty_bps", "min_singleton_idty_bps",
00079 "Minimal identity for singleton compartments "
00080 "in base pairs. Default = parameter disabled.",
00081 CArgDescriptions::eInteger, "9999999");
00082
00083 argdescr->AddDefaultKey("dropoff", "dropoff",
00084 "Max score drop-off during hit extension.",
00085 CArgDescriptions::eInteger,
00086 NStr::IntToString(CElementaryMatching::
00087 s_GetDefaultDropOff()));
00088
00089 argdescr->AddDefaultKey("min_query_len", "min_query_len",
00090 "Minimum length for individual cDNA sequences.",
00091 CArgDescriptions::eInteger, "50");
00092
00093 argdescr->AddDefaultKey("min_hit_len", "min_hit_len",
00094 "Minimum length for reported hits in hits-only mode. "
00095 "No effect in compartments mode.",
00096 CArgDescriptions::eInteger, "16");
00097
00098 argdescr->AddDefaultKey ("maxvol", "maxvol",
00099 "Maximum index volume size in MB (approximate)",
00100 CArgDescriptions::eInteger,
00101 "512");
00102
00103 argdescr->AddFlag("noxf", "[With external hits] Suppress overlap x-filtering: "
00104 "print all compartment hits intact.");
00105
00106 argdescr->AddOptionalKey("seqlens", "seqlens",
00107 "[With external hits] Two-column file with sequence IDs "
00108 "and their lengths. If none supplied, the program will "
00109 "attempt fetching the lengths from GenBank. "
00110 "Cannot be used with -qdb.",
00111 CArgDescriptions::eInputFile);
00112
00113 argdescr->AddDefaultKey("N", "N",
00114 "[With external hits] Max number of compartments "
00115 "per query (0 = All).",
00116 CArgDescriptions::eInteger, "0");
00117
00118 CArgAllow* constrain01 (new CArgAllow_Doubles(0.0, 1.0));
00119 argdescr->SetConstraint("penalty", constrain01);
00120 argdescr->SetConstraint("min_idty", constrain01);
00121 argdescr->SetConstraint("min_singleton_idty", constrain01);
00122
00123 CArgAllow_Integers* constrain_maxvol (new CArgAllow_Integers(128,1024));
00124 argdescr->SetConstraint("maxvol", constrain_maxvol);
00125
00126 CArgAllow_Integers* constrain_minqlen (new CArgAllow_Integers(21,99999));
00127 argdescr->SetConstraint("min_query_len", constrain_minqlen);
00128
00129 CArgAllow_Integers* constrain_minhitlen (new CArgAllow_Integers(1,99999));
00130 argdescr->SetConstraint("min_hit_len", constrain_minhitlen);
00131
00132 SetupArgDescriptions(argdescr.release());
00133 }
00134
00135
00136 void CCompartApp::x_ReadSeqLens(CNcbiIstream& istr)
00137 {
00138 m_id2len.clear();
00139 while(istr) {
00140 string id;
00141 istr >> id;
00142 if(id.size() && id[0] != '#') {
00143 size_t len (0);
00144 istr >> len;
00145 if(len != 0) {
00146 m_id2len[id] = len;
00147 }
00148 }
00149 }
00150 }
00151
00152
00153 size_t CCompartApp::x_GetSeqLength(const string& id)
00154 {
00155 TStrIdToLen::const_iterator ie (m_id2len.end()), im (m_id2len.find(id));
00156 if(im != ie) {
00157 return im->second;
00158 }
00159 else {
00160 USING_SCOPE(objects);
00161
00162 CRef<CSeq_id> seqid;
00163 try { seqid.Reset(new CSeq_id(id)); }
00164 catch(CSeqIdException& e) {
00165 return 0;
00166 }
00167
00168 const size_t len (sequence::GetLength(*seqid, m_Scope.GetNonNullPointer()));
00169
00170 m_id2len[id] = len;
00171
00172 if(m_id2len.size() >= 1000) {
00173 m_Scope->ResetHistory();
00174 }
00175
00176 return len;
00177 }
00178 }
00179
00180
00181 int CCompartApp::Run()
00182 {
00183 const CArgs& args (GetArgs());
00184
00185 const bool is_qdb (args["qdb"]);
00186 const bool is_sdb (args["sdb"]);
00187 const bool is_seqlens (args["seqlens"]);
00188 const bool is_ho (args["ho"]);
00189 const bool is_maxvol (args["maxvol"]);
00190 const bool is_n (args["N"]);
00191
00192 bool invalid_args (false);
00193 if(is_qdb ^ is_sdb) { invalid_args = true; }
00194 if(is_qdb && is_seqlens) { invalid_args = true; }
00195 if(is_qdb && is_n) { invalid_args = true; }
00196 if(!is_qdb && is_ho) { invalid_args = true; }
00197 if(!is_qdb && is_maxvol) { invalid_args = true; }
00198
00199 m_NoXF = args["noxf"];
00200 m_penalty = args["penalty"].AsDouble();
00201 m_min_idty = args["min_idty"].AsDouble();
00202 m_min_singleton_idty = args["min_singleton_idty"].AsDouble();
00203 m_min_singleton_idty_bps = args["min_singleton_idty_bps"].AsInteger();
00204 m_min_query_len = args["min_query_len"].AsInteger();
00205
00206 int rv (0);
00207 if(!is_qdb) {
00208 if(is_seqlens) {
00209 x_ReadSeqLens(args["seqlens"].AsInputFile());
00210 }
00211 else {
00212 USING_SCOPE(objects);
00213 CRef<CObjectManager> objmgr (CObjectManager::GetInstance());
00214 CGBDataLoader::RegisterInObjectManager(*objmgr);
00215 m_Scope = new CScope(*objmgr);
00216 m_Scope->AddDefaults();
00217 }
00218 m_MaxCompsPerQuery = args["N"].AsInteger();
00219 rv = x_DoWithExternalHits();
00220 }
00221 else {
00222 CRef<CElementaryMatching> matcher (
00223 new CElementaryMatching(args["qdb"].AsString(),
00224 args["sdb"].AsString()));
00225
00226 matcher->SetMinQueryLength(m_min_query_len);
00227
00228 matcher->SetPenalty(m_penalty);
00229 matcher->SetMinIdty(m_min_idty);
00230 matcher->SetMinSingletonIdty(m_min_singleton_idty);
00231
00232 matcher->SetHitsOnly(args["ho"]);
00233 matcher->SetMinHitLength(args["min_hit_len"].AsInteger());
00234 matcher->SetMaxVolSize(1024 * 1024 * (args["maxvol"].AsInteger()));
00235
00236 matcher->SetDropOff(args["dropoff"].AsInteger());
00237
00238 try { matcher->Run(); }
00239 catch(std::bad_alloc&) {
00240 NCBI_THROW(CException, eUnknown,
00241 "Not enough memory available to run this program");
00242 }
00243 }
00244
00245 return rv;
00246 }
00247
00248
00249 int CCompartApp::x_DoWithExternalHits(void)
00250 {
00251 m_CompartmentsPermanent.resize(0);
00252 m_Allocated = 0;
00253
00254 THitRefs hitrefs;
00255
00256 typedef map<string,string> TIdToId;
00257 TIdToId id2id;
00258
00259 char line [1024];
00260 string query0, subj0;
00261 while(cin) {
00262
00263 cin.getline(line, sizeof line, '\n');
00264 string s (NStr::TruncateSpaces(line));
00265 if(s.size()) {
00266
00267 THitRef hit (new THit(s.c_str()));
00268
00269 const string query (hit->GetQueryId()->GetSeqIdString(true));
00270 const string subj (hit->GetSubjId()->GetSeqIdString(true));
00271
00272 if(query0.size() == 0 || subj0.size() == 0) {
00273 query0 = query;
00274 subj0 = subj;
00275 id2id[query0] = subj0;
00276 }
00277 else {
00278
00279 if(query != query0 || subj != subj0) {
00280
00281 const int rv (x_ProcessPair(query0, hitrefs));
00282 if(rv != 0) return rv;
00283
00284 if(query != query0) {
00285
00286 x_RankAndStore();
00287
00288 if(m_Allocated > 128 * 1024 * 1024) {
00289
00290 stable_sort(m_CompartmentsPermanent.begin(),
00291 m_CompartmentsPermanent.end());
00292
00293 ITERATE(TCompartRefs, ii, m_CompartmentsPermanent) {
00294 cout << **ii << endl;
00295 m_Allocated -= (*ii)->GetHitCount()*sizeof(THit);
00296 }
00297 m_CompartmentsPermanent.clear();
00298 }
00299 }
00300
00301 query0 = query;
00302 subj0 = subj;
00303 hitrefs.clear();
00304
00305 TIdToId::const_iterator im = id2id.find(query0);
00306 if(im == id2id.end() || im->second != subj0) {
00307 id2id[query0] = subj0;
00308 }
00309 else {
00310 cerr << "Input hit stream not properly ordered" << endl;
00311 return 2;
00312 }
00313 }
00314 }
00315
00316 hitrefs.push_back(hit);
00317 }
00318 }
00319
00320 if(hitrefs.size()) {
00321 int rv = x_ProcessPair(query0, hitrefs);
00322 if(rv != 0) return rv;
00323 x_RankAndStore();
00324 hitrefs.clear();
00325 }
00326
00327 stable_sort(m_CompartmentsPermanent.begin(), m_CompartmentsPermanent.end());
00328
00329 ITERATE(TCompartRefs, ii, m_CompartmentsPermanent) {
00330 cout << **ii << endl;
00331 }
00332
00333 m_CompartmentsPermanent.clear();
00334
00335 return 0;
00336 }
00337
00338
00339 int CCompartApp::x_ProcessPair(const string& query0, THitRefs& hitrefs)
00340 {
00341 const size_t qlen (x_GetSeqLength(query0));
00342
00343 if(qlen == 0) {
00344 cerr << "Cannot retrieve sequence lengths for: "
00345 << query0 << endl;
00346 return 1;
00347 }
00348
00349 if(qlen < m_min_query_len) {
00350 return 0;
00351 }
00352
00353 typedef CCompartmentAccessor<THit> TAccessor;
00354 typedef TAccessor::TCoord TCoord;
00355
00356 const TCoord penalty_bps (TCoord(m_penalty * qlen + 0.5));
00357 const TCoord min_matches (TCoord(m_min_idty * qlen + 0.5));
00358 const TCoord msm1 (TCoord(m_min_singleton_idty * qlen + 0.5));
00359 const TCoord msm2 (m_min_singleton_idty_bps);
00360 const TCoord min_singleton_matches (min(msm1, msm2));
00361
00362 TAccessor ca (hitrefs.begin(), hitrefs.end(),
00363 penalty_bps,
00364 min_matches,
00365 min_singleton_matches,
00366 !m_NoXF);
00367
00368 THitRefs comp;
00369 for(bool b0 (ca.GetFirst(comp)); b0 ; b0 = ca.GetNext(comp)) {
00370
00371 TCompartRef cr (new CCompartment (comp, qlen));
00372 m_Compartments.push_back(cr);
00373 }
00374
00375 return 0;
00376 }
00377
00378
00379 bool PCompartmentRanker(const CCompartApp::TCompartRef& lhs,
00380 const CCompartApp::TCompartRef& rhs)
00381 {
00382
00383
00384 #ifdef PCOMPARTMENT_RANKER_M1
00385
00386 const size_t exons_lhs (lhs->GetExonCount());
00387 const size_t exons_rhs (rhs->GetExonCount());
00388 if(exons_lhs == exons_rhs) {
00389 return lhs->GetMatchCount() > rhs->GetMatchCount();
00390 }
00391 else {
00392 return exons_lhs > exons_rhs;
00393 }
00394
00395 #else
00396
00397 const size_t idtybin_lhs (lhs->GetIdentityBin());
00398 const size_t idtybin_rhs (rhs->GetIdentityBin());
00399 if(idtybin_lhs == idtybin_rhs) {
00400 const size_t exons_lhs (lhs->GetExonCount());
00401 const size_t exons_rhs (rhs->GetExonCount());
00402 if(exons_lhs == exons_rhs) {
00403 return lhs->GetMatchCount() > rhs->GetMatchCount();
00404 }
00405 else {
00406 return exons_lhs > exons_rhs;
00407 }
00408 }
00409 else {
00410 return idtybin_lhs > idtybin_rhs;
00411 }
00412 #endif
00413
00414 #undef PCOMPARTMENT_RANKER_M1
00415 }
00416
00417
00418 void CCompartApp::x_RankAndStore(void)
00419 {
00420 const size_t cdim (m_Compartments.size());
00421 if(cdim == 0) {
00422 return;
00423 }
00424
00425 if(m_MaxCompsPerQuery > 0 && cdim > m_MaxCompsPerQuery) {
00426 stable_sort(m_Compartments.begin(), m_Compartments.end(), PCompartmentRanker);
00427 m_Compartments.resize(m_MaxCompsPerQuery);
00428 }
00429
00430 for(size_t i (0), in (m_Compartments.size()); i < in; ++i) {
00431 TCompartRef cr (m_Compartments[i]);
00432 m_CompartmentsPermanent.push_back(cr);
00433 m_Allocated += cr->GetHitCount() * sizeof(THit);
00434 }
00435
00436 m_Compartments.resize(0);
00437 }
00438
00439
00440 void CCompartApp::Exit()
00441 {
00442 return;
00443 }
00444
00445
00446 CCompartApp::CCompartment::TRange CCompartApp::CCompartment::GetSpan(void) const
00447 {
00448 if(m_HitRefs.size() == 0) {
00449 NCBI_THROW(CException, eUnknown, "Span requested for empty compartment");
00450 }
00451 THit::TCoord a (m_HitRefs.front()->GetSubjStart()),
00452 b (m_HitRefs.back()->GetSubjStop());
00453 if(a > b) {
00454 THit::TCoord c (a);
00455 a = b;
00456 b = c;
00457 }
00458
00459 return CCompartApp::CCompartment::TRange(a, b);
00460 }
00461
00462 CCompartApp::CCompartment::CCompartment(const THitRefs& hitrefs, size_t length):
00463 m_SeqLength(length), m_IdentityBin(0), m_ExonCount(0), m_MatchCount(0)
00464 {
00465 if(hitrefs.size() == 0) {
00466 NCBI_THROW(CException, eUnknown,
00467 "Cannot init compartment with empty hit list");
00468 }
00469
00470 for(THitRefs::const_reverse_iterator ii(hitrefs.rbegin()), ie(hitrefs.rend());
00471 ii != ie; x_AddHit(*ii++));
00472
00473 x_EvalExons();
00474 }
00475
00476
00477 void CCompartApp::CCompartment::x_AddHit(const THitRef& hitref)
00478 {
00479 if(m_HitRefs.size() == 0) {
00480 m_HitRefs.push_back(hitref);
00481 }
00482 else {
00483
00484 const THitRef& hb (m_HitRefs.back());
00485 const bool cs (hb->GetSubjStrand());
00486 if(cs != hitref->GetSubjStrand()) {
00487 NCBI_THROW(CException, eUnknown, "Hit being added has strand "
00488 "different from that of the compartment.");
00489 }
00490
00491 m_HitRefs.push_back(hitref);
00492 }
00493 }
00494
00495
00496 bool CCompartApp::CCompartment::GetStrand(void) const
00497 {
00498 if(m_HitRefs.size()) {
00499 return m_HitRefs.front()->GetSubjStrand();
00500 }
00501 NCBI_THROW(CException, eUnknown, "Cannot determine compartment strand");
00502 }
00503
00504
00505
00506 bool CCompartApp::CCompartment::operator < (const CCompartApp::CCompartment& rhs)
00507 const
00508 {
00509 const THit::TId& subjid_lhs (m_HitRefs.front()->GetSubjId());
00510 const THit::TId& subjid_rhs (rhs.m_HitRefs.front()->GetSubjId());
00511 const int co (subjid_lhs->CompareOrdered(*subjid_rhs));
00512 if(co == 0) {
00513
00514 const THit::TId& queryid_lhs (m_HitRefs.front()->GetQueryId());
00515 const THit::TId& queryid_rhs (rhs.m_HitRefs.front()->GetQueryId());
00516 const int co (queryid_lhs->CompareOrdered(*queryid_rhs));
00517
00518 if(co == 0) {
00519
00520 const bool strand_lhs (GetStrand());
00521 const bool strand_rhs (rhs.GetStrand());
00522 if(strand_lhs == strand_rhs) {
00523 if(strand_lhs) {
00524 return GetSpan().first < rhs.GetSpan().first;
00525 }
00526 else {
00527 return GetSpan().first > rhs.GetSpan().first;
00528 }
00529 }
00530 else {
00531 return strand_lhs < strand_rhs;
00532 }
00533 }
00534 else {
00535 return co < 0;
00536 }
00537 }
00538 else {
00539 return co < 0;
00540 }
00541 }
00542
00543
00544 bool operator < (const CCompartApp::TCompartRef& lhs,
00545 const CCompartApp::TCompartRef& rhs)
00546 {
00547 return *lhs < *rhs;
00548 }
00549
00550
00551
00552
00553
00554
00555 void CCompartApp::CCompartment::x_EvalExons(void)
00556 {
00557 const size_t kMinIntronLength (25);
00558 const size_t kMinExonLength (10);
00559
00560 size_t exons (1);
00561 THitRef& h (m_HitRefs.front());
00562 double matches ( h->GetLength() * h->GetIdentity() );
00563
00564 if(m_HitRefs.size() > 1) {
00565
00566 if(GetStrand()) {
00567
00568 THitRef prev;
00569 ITERATE(THitRefs, ii, m_HitRefs) {
00570
00571 const THitRef& h (*ii);
00572 if(prev.NotEmpty()) {
00573
00574 const THit::TCoord q0 (prev->GetQueryStop());
00575 if(q0 + kMinExonLength <= h->GetQueryStop()) {
00576
00577 const THit::TCoord s0 (h->GetSubjStart()
00578 - (h->GetQueryStart() - q0));
00579 if(prev->GetSubjStop() + kMinIntronLength <= s0) {
00580 ++exons;
00581 }
00582 const THit::TCoord q0max (max(q0,h->GetQueryStart()));
00583 matches += (h->GetQueryStop() - q0max) * h->GetIdentity();
00584 }
00585 }
00586 prev = h;
00587 }
00588 }
00589 else {
00590
00591 THitRef prev;
00592 ITERATE(THitRefs, ii, m_HitRefs) {
00593
00594 const THitRef& h (*ii);
00595 if(prev.NotEmpty()) {
00596
00597 const THit::TCoord q0 (prev->GetQueryStop());
00598 if(q0 + kMinExonLength <= h->GetQueryStop()) {
00599
00600 const THit::TCoord s0 (h->GetSubjStart()
00601 + h->GetQueryStart() - q0);
00602 if(s0 + kMinIntronLength <= prev->GetSubjStop()) {
00603 ++exons;
00604 }
00605 const THit::TCoord q0max (max(q0,h->GetQueryStart()));
00606 matches += (h->GetQueryStop() - q0max) * h->GetIdentity();
00607 }
00608 }
00609 prev = h;
00610 }
00611 }
00612 }
00613
00614 m_ExonCount = exons;
00615 m_MatchCount = size_t(round(matches));
00616 m_IdentityBin = size_t(floor(double(m_MatchCount) / m_SeqLength / 0.1));
00617 }
00618
00619
00620 ostream& operator << (ostream& ostr, const CCompartApp::CCompartment& rhs)
00621 {
00622 ITERATE(CCompartApp::THitRefs, ii, rhs.m_HitRefs) {
00623 ostr << **ii << endl;
00624 }
00625 return ostr;
00626 }
00627
00628
00629 END_NCBI_SCOPE
00630
00631
00632 USING_NCBI_SCOPE;
00633
00634 int main(int argc, const char* argv[])
00635 {
00636 return CCompartApp().AppMain(argc, argv, 0, eDS_Default, 0);
00637 }
00638
00639