src/algo/align/util/demo/compart/compart.cpp

Go to the documentation of this file.
00001 /* $Id: compart.cpp 170238 2009-09-10 14:55:17Z kapustin $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE                          
00005  *               National Center for Biotechnology Information
00006  *                                                                          
00007  *  This software/database is a "United States Government Work" under the   
00008  *  terms of the United States Copyright Act.  It was written as part of    
00009  *  the author's official duties as a United States Government employee and 
00010  *  thus cannot be copyrighted.  This software/database is freely available 
00011  *  to the public for use. The National Library of Medicine and the U.S.    
00012  *  Government have not placed any restriction on its use or reproduction.  
00013  *                                                                          
00014  *  Although all reasonable efforts have been taken to ensure the accuracy  
00015  *  and reliability of the software and data, the NLM and the U.S.          
00016  *  Government do not and cannot warrant the performance or results that    
00017  *  may be obtained by using this software or data. The NLM and the U.S.    
00018  *  Government disclaim all warranties, express or implied, including       
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.                                                                
00021  *                                                                          
00022  *  Please cite the author in any work or product based on this material.   
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Yuri Kapustin
00027  *
00028  * File Description: cDNA-to-Genomic local alignment (same species)
00029  *                   and compartmentization utility
00030 */
00031 
00032 #include <ncbi_pch.hpp>
00033 
00034 #include "compart.hpp"
00035 #include "em.hpp"
00036 
00037 #include <algo/align/util/compartment_finder.hpp>
00038 #include <objtools/data_loaders/genbank/gbloader.hpp>
00039 #include <objects/seqloc/Seq_id.hpp>
00040 #include <objmgr/util/seq_loc_util.hpp>
00041 
00042 #include <math.h>
00043 
00044 BEGIN_NCBI_SCOPE
00045 
00046 
00047 void CCompartApp::Init()
00048 {
00049     HideStdArgs(fHideLogfile | fHideConffile | fHideVersion);
00050 
00051     auto_ptr<CArgDescriptions> argdescr(new CArgDescriptions);
00052     argdescr->SetUsageContext(GetArguments().GetProgramName(),
00053                               "Compart v.1.35. Unless -qdb and -sdb are specified, "
00054                               "the tool expects tabular blast hits at stdin collated "
00055                               "by query and subject, e.g. with 'sort -k 1,1 -k 2,2'");
00056 
00057     argdescr->AddOptionalKey ("qdb", "qdb", "cDNA BLAST database", 
00058                               CArgDescriptions::eString);
00059 
00060     argdescr->AddOptionalKey ("sdb", "sdb", "Genomic BLAST database", 
00061                               CArgDescriptions::eString);
00062 
00063     argdescr->AddFlag ("ho", "Print raw hits only - no compartments");
00064 
00065     argdescr->AddDefaultKey("penalty", "penalty", "Per-compartment penalty",
00066                             CArgDescriptions::eDouble, "0.55");
00067     
00068     argdescr->AddDefaultKey("min_idty", "min_idty", "Minimal overall identity",
00069                             CArgDescriptions::eDouble, "0.70");
00070     
00071     argdescr->AddDefaultKey("min_singleton_idty", "min_singleton_idty", 
00072                             "Minimal identity for singleton compartments. "
00073                             "The actual parameter passed to the compartmentization "
00074                             "procedure is least of this parameter multipled "
00075                             "by the seq length, and min_singleton_idty_bps.",
00076                             CArgDescriptions::eDouble, "0.70");
00077 
00078     argdescr->AddDefaultKey("min_singleton_idty_bps", "min_singleton_idty_bps", 
00079                             "Minimal identity for singleton compartments "
00080                             "in base pairs. Default = parameter disabled.",
00081                             CArgDescriptions::eInteger, "9999999");
00082     
00083     argdescr->AddDefaultKey("dropoff", "dropoff", 
00084                             "Max score drop-off during hit extension.",
00085                             CArgDescriptions::eInteger,
00086                             NStr::IntToString(CElementaryMatching::
00087                                               s_GetDefaultDropOff()));
00088 
00089     argdescr->AddDefaultKey("min_query_len", "min_query_len", 
00090                             "Minimum length for individual cDNA sequences.",
00091                             CArgDescriptions::eInteger, "50");
00092 
00093     argdescr->AddDefaultKey("min_hit_len", "min_hit_len", 
00094                             "Minimum length for reported hits in hits-only mode. "
00095                             "No effect in compartments mode.",
00096                             CArgDescriptions::eInteger, "16");
00097   
00098     argdescr->AddDefaultKey ("maxvol", "maxvol", 
00099                              "Maximum index volume size in MB (approximate)",
00100                              CArgDescriptions::eInteger,
00101                              "512");
00102 
00103     argdescr->AddFlag("noxf", "[With external hits] Suppress overlap x-filtering: "
00104                       "print all compartment hits intact.");
00105 
00106     argdescr->AddOptionalKey("seqlens", "seqlens", 
00107                              "[With external hits] Two-column file with sequence IDs "
00108                              "and their lengths. If none supplied, the program will "
00109                              "attempt fetching the lengths from GenBank. "
00110                              "Cannot be used with -qdb.",
00111                              CArgDescriptions::eInputFile);
00112 
00113     argdescr->AddDefaultKey("N", "N", 
00114                             "[With external hits] Max number of compartments "
00115                             "per query (0 = All).",
00116                             CArgDescriptions::eInteger, "0");
00117 
00118     CArgAllow* constrain01 (new CArgAllow_Doubles(0.0, 1.0));
00119     argdescr->SetConstraint("penalty", constrain01);
00120     argdescr->SetConstraint("min_idty", constrain01);
00121     argdescr->SetConstraint("min_singleton_idty", constrain01);
00122 
00123     CArgAllow_Integers* constrain_maxvol (new CArgAllow_Integers(128,1024));
00124     argdescr->SetConstraint("maxvol", constrain_maxvol);
00125 
00126     CArgAllow_Integers* constrain_minqlen (new CArgAllow_Integers(21,99999));
00127     argdescr->SetConstraint("min_query_len", constrain_minqlen);
00128 
00129     CArgAllow_Integers* constrain_minhitlen (new CArgAllow_Integers(1,99999));
00130     argdescr->SetConstraint("min_hit_len", constrain_minhitlen);
00131 
00132     SetupArgDescriptions(argdescr.release());
00133 }
00134 
00135 
00136 void CCompartApp::x_ReadSeqLens(CNcbiIstream& istr)
00137 {
00138     m_id2len.clear();
00139     while(istr) {
00140         string id;
00141         istr >> id;
00142         if(id.size() && id[0] != '#') {
00143             size_t len (0);
00144             istr >> len;
00145             if(len != 0) {
00146                 m_id2len[id] = len;
00147             }
00148         }
00149     }
00150 }
00151 
00152 
00153 size_t CCompartApp::x_GetSeqLength(const string& id)
00154 {
00155     TStrIdToLen::const_iterator ie (m_id2len.end()), im (m_id2len.find(id));
00156     if(im != ie) {
00157         return im->second;
00158     }
00159     else {
00160         USING_SCOPE(objects);
00161 
00162         CRef<CSeq_id> seqid;
00163         try { seqid.Reset(new CSeq_id(id)); }
00164         catch(CSeqIdException& e) {
00165             return 0;
00166         }
00167 
00168         const size_t len (sequence::GetLength(*seqid, m_Scope.GetNonNullPointer()));
00169         
00170         m_id2len[id] = len;
00171 
00172         if(m_id2len.size() >= 1000) {
00173             m_Scope->ResetHistory();
00174         }
00175 
00176         return len;
00177     }
00178 }
00179 
00180 
00181 int CCompartApp::Run()
00182 {   
00183     const CArgs& args (GetArgs());
00184 
00185     const bool is_qdb     (args["qdb"]);
00186     const bool is_sdb     (args["sdb"]);
00187     const bool is_seqlens (args["seqlens"]);
00188     const bool is_ho      (args["ho"]);
00189     const bool is_maxvol  (args["maxvol"]);
00190     const bool is_n       (args["N"]);
00191     
00192     bool invalid_args (false);
00193     if(is_qdb ^ is_sdb)        { invalid_args = true; }
00194     if(is_qdb  && is_seqlens)  { invalid_args = true; }
00195     if(is_qdb  && is_n)        { invalid_args = true; }
00196     if(!is_qdb && is_ho)       { invalid_args = true; }
00197     if(!is_qdb && is_maxvol)   { invalid_args = true; }
00198 
00199     m_NoXF                     = args["noxf"];
00200     m_penalty                  = args["penalty"].AsDouble();
00201     m_min_idty                 = args["min_idty"].AsDouble();
00202     m_min_singleton_idty       = args["min_singleton_idty"].AsDouble();
00203     m_min_singleton_idty_bps   = args["min_singleton_idty_bps"].AsInteger();
00204     m_min_query_len            = args["min_query_len"].AsInteger();
00205 
00206     int rv (0);
00207     if(!is_qdb) {
00208         if(is_seqlens) {
00209             x_ReadSeqLens(args["seqlens"].AsInputFile());
00210         }
00211         else {
00212             USING_SCOPE(objects);    
00213             CRef<CObjectManager> objmgr (CObjectManager::GetInstance());
00214             CGBDataLoader::RegisterInObjectManager(*objmgr);
00215             m_Scope = new CScope(*objmgr);
00216             m_Scope->AddDefaults();
00217         }
00218         m_MaxCompsPerQuery         = args["N"].AsInteger();
00219         rv = x_DoWithExternalHits();
00220     }
00221     else {
00222         CRef<CElementaryMatching> matcher (
00223                      new CElementaryMatching(args["qdb"].AsString(),
00224                                              args["sdb"].AsString()));
00225 
00226         matcher->SetMinQueryLength(m_min_query_len);
00227 
00228         matcher->SetPenalty(m_penalty);
00229         matcher->SetMinIdty(m_min_idty);
00230         matcher->SetMinSingletonIdty(m_min_singleton_idty);
00231 
00232         matcher->SetHitsOnly(args["ho"]);
00233         matcher->SetMinHitLength(args["min_hit_len"].AsInteger());
00234         matcher->SetMaxVolSize(1024 * 1024 * (args["maxvol"].AsInteger()));
00235 
00236         matcher->SetDropOff(args["dropoff"].AsInteger());
00237 
00238         try { matcher->Run(); }
00239         catch(std::bad_alloc&) {
00240             NCBI_THROW(CException, eUnknown, 
00241                        "Not enough memory available to run this program");
00242         }      
00243     }
00244 
00245     return rv;
00246 }
00247 
00248 
00249 int CCompartApp::x_DoWithExternalHits(void)
00250 {
00251     m_CompartmentsPermanent.resize(0);
00252     m_Allocated = 0;
00253 
00254     THitRefs hitrefs;
00255 
00256     typedef map<string,string> TIdToId;
00257     TIdToId id2id;
00258 
00259     char line [1024];
00260     string query0, subj0;
00261     while(cin) {
00262 
00263         cin.getline(line, sizeof line, '\n');
00264         string s (NStr::TruncateSpaces(line));
00265         if(s.size()) {
00266 
00267             THitRef hit (new THit(s.c_str()));
00268 
00269             const string query (hit->GetQueryId()->GetSeqIdString(true));
00270             const string subj  (hit->GetSubjId()->GetSeqIdString(true));
00271 
00272             if(query0.size() == 0 || subj0.size() == 0) {
00273                 query0 = query;
00274                 subj0 = subj;
00275                 id2id[query0] = subj0;
00276             }
00277             else {
00278 
00279                 if(query != query0 || subj != subj0) {
00280 
00281                     const int rv (x_ProcessPair(query0, hitrefs));
00282                     if(rv != 0) return rv;
00283 
00284                     if(query != query0) {
00285 
00286                         x_RankAndStore();
00287 
00288                         if(m_Allocated > 128 * 1024 * 1024) {
00289 
00290                             stable_sort(m_CompartmentsPermanent.begin(),
00291                                         m_CompartmentsPermanent.end());
00292 
00293                             ITERATE(TCompartRefs, ii, m_CompartmentsPermanent) {
00294                                 cout << **ii << endl;
00295                                 m_Allocated -= (*ii)->GetHitCount()*sizeof(THit);
00296                             }
00297                             m_CompartmentsPermanent.clear();
00298                         }
00299                     }
00300 
00301                     query0 = query;
00302                     subj0 = subj;
00303                     hitrefs.clear();
00304 
00305                     TIdToId::const_iterator im = id2id.find(query0);
00306                     if(im == id2id.end() || im->second != subj0) {
00307                         id2id[query0] = subj0;
00308                     }
00309                     else {
00310                         cerr << "Input hit stream not properly ordered" << endl;
00311                         return 2;
00312                     }
00313                 }
00314             }
00315 
00316             hitrefs.push_back(hit);
00317         }
00318     }
00319 
00320     if(hitrefs.size()) {
00321         int rv = x_ProcessPair(query0, hitrefs);
00322         if(rv != 0) return rv;
00323         x_RankAndStore();
00324         hitrefs.clear();
00325     }
00326 
00327     stable_sort(m_CompartmentsPermanent.begin(), m_CompartmentsPermanent.end());
00328 
00329     ITERATE(TCompartRefs, ii, m_CompartmentsPermanent) {
00330         cout << **ii << endl;
00331     }
00332 
00333     m_CompartmentsPermanent.clear();
00334 
00335     return 0;
00336 }
00337 
00338 
00339 int CCompartApp::x_ProcessPair(const string& query0, THitRefs& hitrefs)
00340 {
00341     const size_t qlen (x_GetSeqLength(query0));
00342 
00343     if(qlen == 0) {
00344         cerr << "Cannot retrieve sequence lengths for: " 
00345              << query0 << endl;
00346         return 1;
00347     }
00348 
00349     if(qlen < m_min_query_len) {
00350         return 0;
00351     }
00352 
00353     typedef CCompartmentAccessor<THit> TAccessor;
00354     typedef TAccessor::TCoord          TCoord;
00355 
00356     const TCoord penalty_bps (TCoord(m_penalty * qlen + 0.5));
00357     const TCoord min_matches (TCoord(m_min_idty * qlen + 0.5));
00358     const TCoord msm1        (TCoord(m_min_singleton_idty * qlen + 0.5));
00359     const TCoord msm2        (m_min_singleton_idty_bps);
00360     const TCoord min_singleton_matches (min(msm1, msm2));
00361 
00362     TAccessor ca (hitrefs.begin(), hitrefs.end(),
00363                   penalty_bps,
00364                   min_matches,
00365                   min_singleton_matches,
00366                   !m_NoXF);
00367 
00368     THitRefs comp;
00369     for(bool b0 (ca.GetFirst(comp)); b0 ; b0 = ca.GetNext(comp)) {
00370 
00371         TCompartRef cr (new CCompartment (comp, qlen));
00372         m_Compartments.push_back(cr);
00373     }
00374 
00375     return 0;
00376 }
00377 
00378 
00379 bool PCompartmentRanker(const CCompartApp::TCompartRef& lhs,
00380                         const CCompartApp::TCompartRef& rhs)
00381 {
00382     //#define PCOMPARTMENT_RANKER_M1
00383 
00384 #ifdef PCOMPARTMENT_RANKER_M1
00385 
00386     const size_t exons_lhs (lhs->GetExonCount());
00387     const size_t exons_rhs (rhs->GetExonCount());
00388     if(exons_lhs == exons_rhs) {
00389         return lhs->GetMatchCount() > rhs->GetMatchCount();
00390     }
00391     else {
00392         return exons_lhs > exons_rhs;
00393     }
00394 
00395 #else
00396 
00397     const size_t idtybin_lhs (lhs->GetIdentityBin());
00398     const size_t idtybin_rhs (rhs->GetIdentityBin());
00399     if(idtybin_lhs == idtybin_rhs) {
00400         const size_t exons_lhs (lhs->GetExonCount());
00401         const size_t exons_rhs (rhs->GetExonCount());
00402         if(exons_lhs == exons_rhs) {
00403             return lhs->GetMatchCount() > rhs->GetMatchCount();
00404         }
00405         else {
00406             return exons_lhs > exons_rhs;
00407         }
00408     }
00409     else {
00410         return idtybin_lhs > idtybin_rhs;
00411     }
00412 #endif
00413 
00414 #undef PCOMPARTMENT_RANKER_M1
00415 }
00416 
00417 
00418 void CCompartApp::x_RankAndStore(void)
00419 {
00420     const size_t cdim (m_Compartments.size());
00421     if(cdim == 0) {
00422         return;
00423     }
00424 
00425     if(m_MaxCompsPerQuery > 0 && cdim > m_MaxCompsPerQuery) {
00426         stable_sort(m_Compartments.begin(), m_Compartments.end(), PCompartmentRanker);
00427         m_Compartments.resize(m_MaxCompsPerQuery);
00428     }
00429 
00430     for(size_t i (0), in (m_Compartments.size()); i < in; ++i) {
00431         TCompartRef cr (m_Compartments[i]);
00432         m_CompartmentsPermanent.push_back(cr);
00433         m_Allocated += cr->GetHitCount() * sizeof(THit);
00434     }
00435     
00436     m_Compartments.resize(0);
00437 }
00438 
00439 
00440 void CCompartApp::Exit()
00441 {
00442     return;
00443 }
00444 
00445  
00446 CCompartApp::CCompartment::TRange CCompartApp::CCompartment::GetSpan(void) const
00447 {
00448     if(m_HitRefs.size() == 0) {
00449         NCBI_THROW(CException, eUnknown, "Span requested for empty compartment");
00450     }
00451     THit::TCoord a (m_HitRefs.front()->GetSubjStart()),
00452         b (m_HitRefs.back()->GetSubjStop());
00453     if(a > b) {
00454         THit::TCoord c (a);
00455         a = b;
00456         b = c;
00457     }
00458 
00459     return CCompartApp::CCompartment::TRange(a, b);
00460 }
00461 
00462 CCompartApp::CCompartment::CCompartment(const THitRefs& hitrefs, size_t length):
00463     m_SeqLength(length), m_IdentityBin(0), m_ExonCount(0), m_MatchCount(0)
00464 {
00465     if(hitrefs.size() == 0) {
00466         NCBI_THROW(CException, eUnknown,
00467                    "Cannot init compartment with empty hit list");
00468     }
00469 
00470     for(THitRefs::const_reverse_iterator ii(hitrefs.rbegin()), ie(hitrefs.rend());
00471         ii != ie; x_AddHit(*ii++));
00472 
00473     x_EvalExons();
00474 }
00475 
00476 
00477 void CCompartApp::CCompartment::x_AddHit(const THitRef& hitref)
00478 {
00479     if(m_HitRefs.size() == 0) {
00480         m_HitRefs.push_back(hitref);
00481     }
00482     else {
00483 
00484         const THitRef& hb (m_HitRefs.back());
00485         const bool cs (hb->GetSubjStrand());
00486         if(cs != hitref->GetSubjStrand()) {
00487             NCBI_THROW(CException, eUnknown, "Hit being added has strand "
00488                        "different from that of the compartment.");
00489         }
00490 
00491         m_HitRefs.push_back(hitref);
00492     }
00493 }
00494 
00495 
00496 bool CCompartApp::CCompartment::GetStrand(void) const
00497 {
00498     if(m_HitRefs.size()) {
00499         return m_HitRefs.front()->GetSubjStrand();
00500     }
00501     NCBI_THROW(CException, eUnknown, "Cannot determine compartment strand");
00502 }
00503 
00504 
00505 // compares by subject, query, strand, then order on the subject
00506 bool CCompartApp::CCompartment::operator < (const CCompartApp::CCompartment& rhs)
00507 const
00508 {
00509     const THit::TId& subjid_lhs (m_HitRefs.front()->GetSubjId());
00510     const THit::TId& subjid_rhs (rhs.m_HitRefs.front()->GetSubjId());
00511     const int co (subjid_lhs->CompareOrdered(*subjid_rhs));
00512     if(co == 0) {
00513 
00514         const THit::TId& queryid_lhs (m_HitRefs.front()->GetQueryId());
00515         const THit::TId& queryid_rhs (rhs.m_HitRefs.front()->GetQueryId());
00516         const int co (queryid_lhs->CompareOrdered(*queryid_rhs));
00517 
00518         if(co == 0) {
00519 
00520             const bool strand_lhs (GetStrand());
00521             const bool strand_rhs (rhs.GetStrand());
00522             if(strand_lhs == strand_rhs) {
00523                 if(strand_lhs) {
00524                     return GetSpan().first < rhs.GetSpan().first;
00525                 }
00526                 else {
00527                     return GetSpan().first > rhs.GetSpan().first;
00528                 }
00529             }
00530             else {
00531                 return strand_lhs < strand_rhs;
00532             }
00533         }
00534         else {
00535             return co < 0;
00536         }
00537     }
00538     else {
00539         return co < 0;
00540     }
00541 }
00542 
00543 
00544 bool operator < (const CCompartApp::TCompartRef& lhs,
00545                  const CCompartApp::TCompartRef& rhs)
00546 {
00547     return *lhs < *rhs;
00548 }
00549 
00550 
00551 // Evaluate all variables used in comaprtment ranking. These are:
00552 // - m_IdentityBin
00553 // - m_ExonCount
00554 // - m_MatchCount
00555 void CCompartApp::CCompartment::x_EvalExons(void)
00556 {
00557     const size_t kMinIntronLength (25);
00558     const size_t kMinExonLength   (10);
00559 
00560     size_t exons (1);
00561     THitRef& h (m_HitRefs.front());
00562     double matches ( h->GetLength() * h->GetIdentity() );
00563 
00564     if(m_HitRefs.size() > 1) {
00565 
00566         if(GetStrand()) {
00567 
00568             THitRef prev;
00569             ITERATE(THitRefs, ii, m_HitRefs) {
00570 
00571                 const THitRef& h (*ii);
00572                 if(prev.NotEmpty()) {
00573 
00574                     const THit::TCoord q0 (prev->GetQueryStop());
00575                     if(q0 + kMinExonLength <= h->GetQueryStop()) {
00576 
00577                         const THit::TCoord s0 (h->GetSubjStart() 
00578                                                - (h->GetQueryStart() - q0));
00579                         if(prev->GetSubjStop() + kMinIntronLength <= s0) {
00580                             ++exons;
00581                         }
00582                         const THit::TCoord q0max (max(q0,h->GetQueryStart()));
00583                         matches += (h->GetQueryStop() - q0max) * h->GetIdentity();
00584                     }
00585                 }
00586                 prev = h;
00587             }
00588         }
00589         else {
00590 
00591             THitRef prev;
00592             ITERATE(THitRefs, ii, m_HitRefs) {
00593 
00594                 const THitRef& h (*ii);
00595                 if(prev.NotEmpty()) {
00596 
00597                     const THit::TCoord q0 (prev->GetQueryStop());
00598                     if(q0 + kMinExonLength <= h->GetQueryStop()) {
00599 
00600                         const THit::TCoord s0 (h->GetSubjStart() 
00601                                                + h->GetQueryStart() - q0);
00602                         if(s0 + kMinIntronLength <= prev->GetSubjStop()) {
00603                             ++exons;
00604                         }
00605                         const THit::TCoord q0max (max(q0,h->GetQueryStart()));
00606                         matches += (h->GetQueryStop() - q0max) * h->GetIdentity();
00607                     }
00608                 }
00609                 prev = h;
00610             }
00611         }
00612     }
00613     
00614     m_ExonCount = exons;
00615     m_MatchCount = size_t(round(matches));
00616     m_IdentityBin = size_t(floor(double(m_MatchCount) / m_SeqLength / 0.1));
00617 }
00618 
00619 
00620 ostream& operator << (ostream& ostr, const CCompartApp::CCompartment& rhs)
00621 {
00622     ITERATE(CCompartApp::THitRefs, ii, rhs.m_HitRefs) {
00623         ostr << **ii << endl;
00624     }
00625     return ostr;
00626 }
00627 
00628 
00629 END_NCBI_SCOPE
00630 
00631 
00632 USING_NCBI_SCOPE;
00633 
00634 int main(int argc, const char* argv[]) 
00635 {
00636     return CCompartApp().AppMain(argc, argv, 0, eDS_Default, 0);
00637 }
00638 
00639 

Generated on Sun Dec 6 22:16:46 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:20:49 2009 by modify_doxy.py rev. 173732