src/app/splign/splign_app.cpp

Go to the documentation of this file.
00001 /* $Id: splign_app.cpp 176653 2009-11-19 18:45:38Z kapustin $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE                          
00005  *               National Center for Biotechnology Information
00006  *                                                                          
00007  *  This software/database is a "United States Government Work" under the   
00008  *  terms of the United States Copyright Act.  It was written as part of    
00009  *  the author's official duties as a United States Government employee and 
00010  *  thus cannot be copyrighted.  This software/database is freely available 
00011  *  to the public for use. The National Library of Medicine and the U.S.    
00012  *  Government have not placed any restriction on its use or reproduction.  
00013  *                                                                          
00014  *  Although all reasonable efforts have been taken to ensure the accuracy  
00015  *  and reliability of the software and data, the NLM and the U.S.          
00016  *  Government do not and cannot warrant the performance or results that    
00017  *  may be obtained by using this software or data. The NLM and the U.S.    
00018  *  Government disclaim all warranties, express or implied, including       
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.                                                                
00021  *                                                                          
00022  *  Please cite the author in any work or product based on this material.   
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Yuri Kapustin
00027  *
00028  * File Description: Splign application
00029  *                   
00030 */
00031 
00032 #include <ncbi_pch.hpp>
00033 
00034 #include "splign_app.hpp"
00035 #include "splign_app_exception.hpp"
00036 
00037 #include <corelib/ncbistd.hpp>
00038 #include <corelib/ncbi_system.hpp>
00039 
00040 #include <serial/objostrasn.hpp>
00041 #include <serial/serial.hpp>
00042 
00043 #include <algo/align/nw/nw_spliced_aligner16.hpp>
00044 #include <algo/align/splign/splign_cmdargs.hpp>
00045 #include <algo/align/util/hit_comparator.hpp>
00046 
00047 #include <algo/blast/api/bl2seq.hpp>
00048 #include <algo/blast/api/local_blast.hpp>
00049 #include <algo/blast/api/objmgr_query_data.hpp>
00050 #include <algo/blast/api/local_db_adapter.hpp>
00051 
00052 #include <objmgr/seq_vector.hpp>
00053 
00054 #include <objects/seq/Bioseq.hpp>
00055 #include <objects/seqloc/Seq_loc.hpp>
00056 
00057 #include <objtools/readers/fasta.hpp>
00058 #include <objtools/readers/reader_exception.hpp>
00059 #include <objtools/lds/lds_manager.hpp>
00060 #include <objtools/data_loaders/lds/lds_dataloader.hpp>
00061 #include <objtools/data_loaders/blastdb/bdbloader.hpp>
00062 
00063 #include <algorithm>
00064 #include <memory>
00065 
00066 
00067 #ifndef ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY
00068 #define GENOME_PIPELINE
00069 #endif
00070 
// Accepted values of the "direction" command-line argument.
namespace {
    // Placeholder accepted only by the public binary; resolved to
    // "auto" for mRNA queries and to "both" otherwise.
    const char kDirDefault[]   = "default";

    // Fixed query orientations.
    const char kDirSense[]     = "sense";
    const char kDirAntisense[] = "antisense";

    // Multi-pass orientations: try both strands / choose automatically.
    const char kDirBoth[]      = "both";
    const char kDirAuto[]      = "auto";
}
00078 
00079 
00080 BEGIN_NCBI_SCOPE
00081 
00082 CSplignApp::CSplignApp(void):
00083     m_AppName("Splign v.1.39")
00084 {
00085     SetVersion(CVersionInfo(1, 39, 0, "Splign"));
00086 #ifdef GENOME_PIPELINE
00087     m_AppName += 'p';
00088 #endif
00089 }
00090 
00091 
00092 void CSplignApp::Init()
00093 {
00094 #ifndef GENOME_PIPELINE
00095     HideStdArgs(fHideHelp    | fHideLogfile | fHideConffile |
00096                 fHideVersion | fHideFullVersion | fHideDryRun  |
00097                 fHideXmlHelp | fHideFullHelp);
00098 #endif
00099 
00100 
00101     auto_ptr<CArgDescriptions> argdescr(new CArgDescriptions);
00102     argdescr->SetUsageContext(GetArguments().GetProgramName(), m_AppName);
00103     
00104     argdescr->AddOptionalKey
00105         ("hits", "hits",
00106          "[Batch mode] Externally computed local alignments "
00107          "(such as blast hits), in blast tabular format. "
00108          "The file must be collated by subject and query "
00109          "(e.g. sort -k 2,2 -k 1,1).",
00110          CArgDescriptions::eInputFile);
00111    
00112     argdescr->AddOptionalKey
00113         ("comps", "comps",
00114          "[Batch mode] Compartments computed with Compart utility.",
00115          CArgDescriptions::eInputFile);
00116 
00117     argdescr->AddOptionalKey
00118         ("mklds", "mklds",
00119          "[Batch mode] "
00120          "Make LDS DB under the specified directory "
00121          "with cDNA and genomic FASTA files or symlinks.",
00122          CArgDescriptions::eString);
00123 
00124     argdescr->AddOptionalKey
00125         ("ldsdir", "ldsdir",
00126          "[Batch mode] Directory holding LDS subdirectory.",
00127          CArgDescriptions::eString);
00128 
00129     argdescr->AddOptionalKey
00130         ("query", "query",
00131          "[Pairwise mode] FASTA file with the spliced sequence.",
00132          CArgDescriptions::eInputFile);
00133     
00134     argdescr->AddOptionalKey
00135         ("subj", "subj",
00136          "[Pairwise mode] FASTA file with the genomic sequence.",
00137          CArgDescriptions::eInputFile);
00138     
00139     argdescr->AddFlag
00140         ("disc",
00141          "[Pairwise mode] Use discontiguous megablast to facilitate "
00142          "alignment of more divergent sequences such as those "
00143          "from different organisms (cross-species alignment).");
00144 
00145     argdescr->AddDefaultKey
00146         ("W", "mbwordsize", "[Pairwise mode] Megablast word size",
00147          CArgDescriptions::eInteger,
00148          "28");
00149   
00150     CSplignArgUtil::SetupArgDescriptions(argdescr.get());
00151 
00152     argdescr->AddDefaultKey
00153         ("direction", 
00154          "direction", 
00155          "Query sequence orientation. "
00156          "Auto orientation begins with the longest ORF direction (d1) "
00157          "and proceeds with the opposite direction (d2) "
00158          "if found a non-consensus splice in d1 or poly-a tail in d2. "
00159 
00160 #ifdef ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY
00161          "Default translates to 'auto' in mRNA and "
00162          "'both' in EST mode", 
00163          CArgDescriptions::eString,   kDirDefault
00164 #else
00165          , CArgDescriptions::eString, kDirSense
00166 #endif
00167          );
00168 
00169     argdescr->AddDefaultKey("log", "log", "Splign log file",
00170                             CArgDescriptions::eOutputFile,
00171                             "splign.log");
00172     
00173     argdescr->AddOptionalKey("asn", "asn", "ASN.1 output file name", 
00174                              CArgDescriptions::eOutputFile);
00175 
00176     argdescr->AddOptionalKey("aln", "aln", "Pairwise alignment output file name", 
00177                              CArgDescriptions::eOutputFile);
00178     
00179     CArgAllow_Strings * constrain_direction (new CArgAllow_Strings);
00180     constrain_direction
00181 #ifdef ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY
00182         ->Allow(kDirDefault)
00183 #endif
00184         ->Allow(kDirSense)
00185         ->Allow(kDirAntisense)
00186         ->Allow(kDirBoth)
00187         ->Allow(kDirAuto);
00188     
00189     argdescr->SetConstraint("direction", constrain_direction);
00190 
00191     SetupArgDescriptions(argdescr.release());
00192 
00193     m_ObjMgr = CObjectManager::GetInstance();
00194 }
00195 
00196 
00197 CSplign::THitRef CSplignApp::s_ReadBlastHit(const string& m8)
00198 {
00199     THitRef rv (new CBlastTabular(m8.c_str()));
00200 
00201 #ifdef SPLIGNAPP_UNDECORATED_ARE_LOCALS
00202     // make seq-id local if no type specified in the original m8
00203     string::const_iterator ie = m8.end(), i0 = m8.begin(), i1 = i0;
00204     while(i1 != ie && *i1 !='\t') ++i1;
00205     if(i1 != ie) {
00206         string::const_iterator i2 = ++i1;
00207         while(i2 != ie && *i2 !='\t') ++i2;
00208         if(i2 != ie) {
00209             if(find(i0, i1, '|') == i1) {
00210                 const string strid = rv->GetQueryId()->GetSeqIdString(true);
00211                 CRef<CSeq_id> seqid (new CSeq_id(CSeq_id::e_Local, strid));
00212                 rv->SetQueryId(seqid);
00213             }
00214             if(find(i1, i2, '|') == i2) {
00215                 const string strid = rv->GetSubjId()->GetSeqIdString(true);
00216                 CRef<CSeq_id> seqid (new CSeq_id(CSeq_id::e_Local, strid));
00217                 rv->SetSubjId(seqid);
00218             }
00219             return rv;
00220         }
00221     }
00222     const string errmsg = string("Incorrectly formatted blast hit:\n") + m8;
00223     NCBI_THROW(CSplignAppException, eBadData, errmsg);
00224 #else
00225     return rv;
00226 #endif
00227 }
00228 
00229 
00230 bool CSplignApp::x_GetNextPair(const THitRefs& hitrefs, THitRefs* hitrefs_pair)
00231 {
00232     USING_SCOPE(objects);
00233     
00234     hitrefs_pair->resize(0);
00235     
00236     const size_t dim = hitrefs.size();
00237     if(dim == 0) {
00238         return false;
00239     }
00240     
00241     if(m_CurHitRef == dim) {
00242         m_CurHitRef = numeric_limits<size_t>::max();
00243         return false;
00244     }
00245  
00246     if(m_CurHitRef == numeric_limits<size_t>::max()) {
00247         m_CurHitRef = 0;
00248     }
00249     
00250     CConstRef<CSeq_id> query (hitrefs[m_CurHitRef]->GetQueryId());
00251     CConstRef<CSeq_id> subj  (hitrefs[m_CurHitRef]->GetSubjId());
00252     while(m_CurHitRef < dim 
00253           && hitrefs[m_CurHitRef]->GetQueryId()->Match(*query)
00254            && hitrefs[m_CurHitRef]->GetSubjId()->Match(*subj)  ) 
00255     {
00256         hitrefs_pair->push_back(hitrefs[m_CurHitRef++]);
00257     }
00258     return true;
00259 }
00260 
00261 
00262 bool CSplignApp::x_GetNextPair(istream& ifs, THitRefs* hitrefs)
00263 {
00264     hitrefs->resize(0);
00265 
00266     if(!m_PendingHits.size() && !ifs ) {
00267         return false;
00268     }
00269     
00270     if(!m_PendingHits.size()) {
00271 
00272         THit::TId query, subj;
00273 
00274         if(m_firstline.size()) {
00275 
00276             THitRef hitref (s_ReadBlastHit(m_firstline));
00277             query = hitref->GetQueryId();
00278             subj  = hitref->GetSubjId();
00279             m_PendingHits.push_back(hitref);
00280         }
00281 
00282         char buf [1024];
00283         while(ifs) {
00284 
00285             buf[0] = 0;
00286             CT_POS_TYPE pos0 = ifs.tellg();
00287             ifs.getline(buf, sizeof buf, '\n');
00288             CT_POS_TYPE pos1 = ifs.tellg();
00289             if(pos1 == pos0) break; // GCC hack
00290             if(buf[0] == '#') continue; // skip comments
00291             const char* p = buf; // skip leading spaces
00292             while(*p == ' ' || *p == '\t') ++p;
00293             if(*p == 0) continue; // skip empty lines
00294             
00295             THitRef hit (s_ReadBlastHit(p));
00296             if(query.IsNull()) {
00297                 query = hit->GetQueryId();
00298             }
00299             if(subj.IsNull()) {
00300                 subj = hit->GetSubjId();
00301             }
00302             if(hit->GetQueryStrand() == false) {
00303                 hit->FlipStrands();
00304             }
00305             if(hit->GetSubjStop() == hit->GetSubjStart()) {
00306                 // skip single bases
00307                 continue;
00308             }
00309             
00310             if(hit->GetQueryId()->Match(*query) == false || 
00311                hit->GetSubjId()->Match(*subj) == false) {
00312 
00313                 m_firstline = p;
00314                 break;
00315             }
00316             
00317             m_PendingHits.push_back(hit);
00318         }
00319     }
00320 
00321     const size_t pending_size = m_PendingHits.size();
00322     if(pending_size) {
00323 
00324         THit::TId query = m_PendingHits[0]->GetQueryId();
00325         THit::TId subj  = m_PendingHits[0]->GetSubjId();
00326         size_t i = 1;
00327         for(; i < pending_size; ++i) {
00328 
00329             THitRef h = m_PendingHits[i];
00330             if(h->GetQueryId()->Match(*query) == false || 
00331                h->GetSubjId()->Match(*subj) == false) {
00332                 break;
00333             }
00334         }
00335         hitrefs->resize(i);
00336         copy(m_PendingHits.begin(), m_PendingHits.begin() + i, 
00337              hitrefs->begin());
00338         m_PendingHits.erase(m_PendingHits.begin(), m_PendingHits.begin() + i);
00339     }
00340     
00341     return hitrefs->size() > 0;
00342 }
00343 
00344 
00345 void ReadCompartment(istream& istr, CSplign::THitRefs* phitrefs)
00346 {
00347     phitrefs->clear();
00348     while(istr) {
00349         string line;
00350         getline(istr, line);
00351         if(line.empty()) {
00352             if(phitrefs->empty()) continue; else break;
00353         }
00354         CSplign::THitRef h (new CSplign::THit(line.c_str()));
00355         phitrefs->push_back(h);
00356     }
00357 }
00358 
00359 
00360 bool CSplignApp::x_GetNextComp(istream& ifs,
00361                                THitRefs* phitrefs,
00362                                THit::TCoord* psubj_min,
00363                                THit::TCoord* psubj_max)
00364 {
00365     static THitRefs hitrefs_next;
00366     THitRefs & hitrefs (*phitrefs);
00367 
00368     const THit::TCoord kUndef (numeric_limits<THit::TCoord>::max());
00369     const THit::TCoord kMax (numeric_limits<THit::TCoord>::max() - 1);
00370     static THit::TCoord smin (kUndef), smax (kUndef);
00371 
00372     if(!hitrefs_next.empty()) {
00373         hitrefs.resize(hitrefs_next.size());
00374         copy(hitrefs_next.begin(), hitrefs_next.end(), hitrefs.begin());
00375         hitrefs_next.clear();
00376     }
00377     else {
00378         // read the first compartment
00379         ReadCompartment(ifs, phitrefs);
00380     }
00381 
00382     // read the next compartment
00383     ReadCompartment(ifs, &hitrefs_next);
00384 
00385     // init coord range - may clarify further
00386     if(smin != kUndef) {
00387         *psubj_min = smin;
00388         *psubj_max = kMax;
00389     }
00390     else if(smax != kUndef) {
00391         *psubj_min = 0;
00392         *psubj_max = smax;
00393     }
00394     else {
00395         *psubj_min = 0;
00396         *psubj_max = kMax;
00397     }
00398     
00399     if(!hitrefs_next.empty()
00400        && hitrefs.front()->GetSubjStrand() == hitrefs_next.front()->GetSubjStrand()
00401        && hitrefs.front()->GetQueryId()->Match(*(hitrefs_next.front()->GetQueryId()))
00402        && hitrefs.front()->GetSubjId()->Match(*(hitrefs_next.front()->GetSubjId())))
00403     {
00404         if(hitrefs.front()->GetSubjStart() < hitrefs_next.front()->GetSubjStart()) {
00405             *psubj_min = smin != kUndef? smin: 0;
00406             *psubj_max = min(hitrefs_next.front()->GetSubjMin(),
00407                              hitrefs_next.back()->GetSubjMin());
00408             smin = max(hitrefs.front()->GetSubjMax(),
00409                        hitrefs.back()->GetSubjMax());
00410             smax = kUndef;
00411         }
00412         else {
00413             *psubj_min = max(hitrefs_next.front()->GetSubjMax(),
00414                              hitrefs_next.back()->GetSubjMax());
00415             *psubj_max = smax != kUndef? smax: kMax;
00416             smin = kUndef;
00417             smax = min(hitrefs.front()->GetSubjMin(),
00418                        hitrefs.back()->GetSubjMin());
00419         }
00420     }
00421     else {
00422         smin = smax = kUndef;
00423     }
00424     
00425     return !hitrefs.empty();
00426 }
00427 
00428 
00429 void CSplignApp::x_LogCompartmentStatus(const THit::TId & query, 
00430                                         const THit::TId & subj, 
00431                                         const CSplign::SAlignedCompartment & ac)
00432 {
00433     typedef CSplign::SAlignedCompartment TCompartment;
00434 
00435     switch(ac.m_Status) {
00436 
00437         case TCompartment::eStatus_Ok: {
00438 
00439             if(ac.m_Id == 0) {
00440                 NCBI_THROW(CSplignAppException, eInternal, "Missing compartment id.");
00441             }
00442 
00443             *m_logstream << (ac.m_QueryStrand? '+': '-') << ac.m_Id
00444                          << '\t' << query->GetSeqIdString(true)
00445                          << '\t' << subj->GetSeqIdString(true)
00446                          << '\t' << ac.m_Msg
00447                          << '\t' << ac.m_Score
00448                          << endl;
00449         }
00450         break;
00451 
00452         case TCompartment::eStatus_Error: {
00453 
00454             *m_logstream << '-'
00455                          << '\t' << query->GetSeqIdString(true)
00456                          << '\t' << subj->GetSeqIdString(true)
00457                          << '\t' << ac.m_Msg
00458                          << '\t' << '-'
00459                          << endl;
00460         }
00461         break;
00462 
00463         case TCompartment::eStatus_Empty:
00464         break;
00465 
00466         default: {
00467             NCBI_THROW(CSplignAppException, eInternal,
00468                        "Unexpected compartment status.");
00469         }
00470     }
00471 
00472 }
00473 
00474 
00475 CRef<blast::CBlastOptionsHandle> 
00476 CSplignApp::x_SetupBlastOptions(bool use_disc)
00477 {
00478     USING_SCOPE(blast);
00479 
00480     m_BlastProgram = use_disc? eDiscMegablast: eMegablast;
00481 
00482     CRef<CBlastOptionsHandle> blast_options_handle
00483         (CBlastOptionsFactory::Create(m_BlastProgram));
00484 
00485     blast_options_handle->SetDefaults();
00486 
00487     CBlastOptions& blast_opt = blast_options_handle->SetOptions();
00488 
00489     if(!use_disc) {
00490 
00491         const CArgs& args = GetArgs();
00492         blast_opt.SetWordSize(args["W"].AsInteger());
00493         blast_opt.SetMaskAtHash(true);
00494         blast_opt.SetDustFiltering(false);
00495     }
00496 
00497     if(blast_options_handle->Validate() == false) {
00498         NCBI_THROW(CSplignAppException,
00499                    eInternal,
00500                    "Incorrect blast setup");
00501     }
00502 
00503     return blast_options_handle;
00504 }
00505 
00506 
/// Operating mode selected from the combination of command-line arguments.
enum ERunMode {
    /// no valid argument combination recognized
    eNotSet,
    /// single query vs single subj
    ePairwise,
    /// use external raw blast hits
    eBatch1,
    /// use pre-computed compartments
    eBatch2
};
00513 
00514 
// Database name passed to the LDS manager / LDS database objects.
const string kSplignLdsDb = "splign.ldsdb";
00516 
00517 string GetLdsDbDir(const string& fasta_dir)
00518 {
00519     string lds_db_dir = fasta_dir;
00520     const char sep = CDirEntry::GetPathSeparator();
00521     const size_t fds = fasta_dir.size();
00522     if(fds > 0 && fasta_dir[fds-1] != sep) {
00523         lds_db_dir += sep;
00524     }
00525     lds_db_dir += "_SplignLDS_";
00526     return lds_db_dir;
00527 }
00528 
00529 
00530 CRef<objects::CSeq_id> CSplignApp::x_ReadFastaSetId(const CArgValue& argval,
00531                                                     CRef<objects::CScope> scope)
00532 {
00533     USING_SCOPE(objects);
00534 
00535     CRef<ILineReader> line_reader;
00536     try {
00537         line_reader.Reset(
00538             new CMemoryLineReader(new CMemoryFile(argval.AsString()),
00539                                   eTakeOwnership));
00540     } catch (...) { // fall back to streams
00541         line_reader.Reset(new CStreamLineReader(argval.AsInputFile()));
00542     }
00543     CFastaReader fasta_reader(* line_reader,
00544                               CFastaReader::fAssumeNuc | CFastaReader::fOneSeq);
00545     CConstRef<CSeq_entry> se (fasta_reader.ReadOneSeq());
00546 
00547     scope->AddTopLevelSeqEntry(*se);
00548     const CSeq_entry::TSeq& bioseq = se->GetSeq();    
00549     const CSeq_entry::TSeq::TId& seqid = bioseq.GetId();
00550     return seqid.back();
00551 }
00552 
00553 
00554 int CSplignApp::Run()
00555 { 
00556     USING_SCOPE(objects);
00557 
00558     const CArgs & args (GetArgs());
00559 
00560     // check that modes aren't mixed
00561 
00562     const bool is_mklds   = args["mklds"];
00563     const bool is_ldsdir  = args["ldsdir"];
00564 
00565     const bool is_hits    = args["hits"];
00566     const bool is_query   = args["query"];
00567     const bool is_subj    = args["subj"];
00568 
00569     const bool is_comps   = args["comps"];
00570 
00571     const bool use_disc_megablast (args["disc"]);
00572 
00573     if(is_mklds) {
00574 
00575         // create LDS DB and exit
00576         string fa_dir = args["mklds"].AsString();
00577         if(CDirEntry::IsAbsolutePath(fa_dir) == false) {
00578             string curdir = CDir::GetCwd();
00579             const char sep = CDirEntry::GetPathSeparator();            
00580             const size_t curdirsize = curdir.size();
00581             if(curdirsize && curdir[curdirsize-1] != sep) {
00582                 curdir += sep;
00583             }
00584             fa_dir = curdir + fa_dir;
00585         }
00586 
00587         const string lds_db_dir (GetLdsDbDir(fa_dir));
00588 
00589 // #define  CPPTOOLKIT_LDS_MANAGEMENT
00590 #ifdef   CPPTOOLKIT_LDS_MANAGEMENT
00591 
00592         CLDS_Database ldsdb (lds_db_dir, kSplignLdsDb);
00593         CLDS_Management ldsmgt (ldsdb);
00594         ldsmgt.Create();
00595         ldsmgt.SyncWithDir(fa_dir,
00596                            CLDS_Management::eRecurseSubDirs,
00597                            CLDS_Management::eNoControlSum);
00598 #else
00599         CLDS_Manager ldsmgr (fa_dir, lds_db_dir, kSplignLdsDb);
00600         ldsmgr.Index(CLDS_Manager::eRecurseSubDirs,
00601                      CLDS_Manager::eNoControlSum);
00602 #endif
00603 
00604         return 0;
00605     }
00606 
00607     // determine mode and verify arguments
00608     ERunMode run_mode (eNotSet);
00609     
00610     if(is_query && is_subj && !(is_hits || is_comps || is_ldsdir)) {
00611         run_mode = ePairwise;
00612     }
00613     else if(is_hits && is_ldsdir && !(is_comps ||is_query || is_subj)) {
00614         run_mode = eBatch1;
00615     }
00616     else if(is_comps && is_ldsdir && !(is_hits ||is_query || is_subj)) {
00617         run_mode = eBatch2;
00618     }
00619 
00620     if(run_mode == eNotSet) {
00621         NCBI_THROW(CSplignAppException,
00622                    eBadParameter,
00623                    "Incomplete or inconsistent set of arguments specified. "
00624                    "Specify -help to print arguments." );
00625     }   
00626 
00627     // open log stream
00628     m_logstream = & args["log"].AsOutputFile();
00629     
00630     // open asn output stream, if any
00631     m_AsnOut = args["asn"]? & args["asn"].AsOutputFile(): NULL;
00632     
00633     // open paiwise alignment output stream, if any
00634     m_AlnOut = args["aln"]? & args["aln"].AsOutputFile(): NULL;
00635     
00636     // in pairwise, batch 2 or incremental mode, setup blast options
00637     if(run_mode != eBatch1 && run_mode != eBatch2) {
00638         m_BlastOptionsHandle = x_SetupBlastOptions(use_disc_megablast);
00639     }
00640 
00641     // splign and formatter setup    
00642     m_Splign.Reset(new CSplign);
00643     CSplignArgUtil::ArgsToSplign(m_Splign, args);
00644 
00645     m_Splign->SetStartModelId(1);
00646 
00647     // splign formatter object    
00648     m_Formatter.Reset(new CSplignFormatter(*m_Splign));
00649 
00650     // do mode-specific preparations
00651     CRef<CScope> scope;
00652     CRef<CSeq_id> seqid_query, seqid_subj;
00653     if(run_mode == ePairwise) {
00654 
00655         scope.Reset (new CScope(*m_ObjMgr));
00656         scope->AddDefaults();
00657         seqid_query = x_ReadFastaSetId(args["query"], scope);
00658         seqid_subj  = x_ReadFastaSetId(args["subj"] , scope);
00659     }
00660     else if(run_mode == eBatch1 || run_mode == eBatch2) {
00661         
00662         const string fasta_dir = args["ldsdir"].AsString();
00663         const string ldsdb_dir = GetLdsDbDir(fasta_dir);
00664         CLDS_Database* ldsdb (
00665               new CLDS_Database(ldsdb_dir, kSplignLdsDb));
00666         m_LDS_db.reset(ldsdb);
00667         m_LDS_db->Open();
00668         CLDS_DataLoader::RegisterInObjectManager(
00669             *m_ObjMgr, *ldsdb, CObjectManager::eDefault);
00670         scope.Reset (new CScope(*m_ObjMgr));
00671         scope->AddDefaults();
00672     }
00673     else {
00674         NCBI_THROW(CSplignAppException,
00675                    eGeneral,
00676                    "Requested mode not implemented." );
00677     }
00678 
00679     m_Splign->SetScope() = scope;
00680 
00681     // run splign in selected mode 
00682     if(run_mode == ePairwise) {
00683 
00684         THitRefs hitrefs;
00685         x_GetBl2SeqHits(seqid_query, seqid_subj, scope, &hitrefs);
00686         x_ProcessPair(hitrefs, args);
00687     }
00688     else if (run_mode == eBatch1) {
00689 
00690         THitRefs hitrefs;
00691         CNcbiIstream& hit_stream = args["hits"].AsInputFile();
00692         while(x_GetNextPair(hit_stream, &hitrefs) ) {
00693             x_ProcessPair(hitrefs, args);
00694         }
00695     }
00696     else if (run_mode == eBatch2) {
00697 
00698         CNcbiIstream& hit_stream (args["comps"].AsInputFile());
00699         THitRefs hitrefs;
00700         THit::TCoord subj_min, subj_max;
00701 
00702         while(x_GetNextComp(hit_stream, &hitrefs, &subj_min, &subj_max) ) {
00703 
00704             if(hitrefs.front()->GetScore() > 0) {
00705                 x_ProcessPair(hitrefs, args, subj_min, subj_max);
00706             }
00707         }
00708     }
00709     else {
00710         NCBI_THROW(CSplignAppException,
00711                    eInternal,
00712                    "Mode not implemented");
00713     }
00714      
00715     cout << "# END" << endl;
00716    
00717     return 0;
00718 }
00719 
00720 
00721 void CSplignApp::x_GetBl2SeqHits(
00722     CRef<objects::CSeq_id> seqid_query,  
00723     CRef<objects::CSeq_id> seqid_subj, 
00724     CRef<objects::CScope>  scope,  
00725     THitRefs* phitrefs)
00726 {
00727     USING_SCOPE(blast);
00728     USING_SCOPE(objects);
00729 
00730     phitrefs->resize(0);
00731     phitrefs->reserve(100);
00732 
00733     CRef<CSeq_loc> seqloc_query (new CSeq_loc);
00734     seqloc_query->SetWhole().Assign(*seqid_query);
00735     CRef<CSeq_loc> seqloc_subj (new CSeq_loc);
00736     seqloc_subj->SetWhole().Assign(*seqid_subj);
00737 
00738     CBl2Seq Blast( SSeqLoc(seqloc_query.GetNonNullPointer(),
00739                            scope.GetNonNullPointer()),
00740                    SSeqLoc(seqloc_subj.GetNonNullPointer(),
00741                            scope.GetNonNullPointer()),
00742                    m_BlastProgram);
00743     
00744     Blast.SetOptionsHandle() = *m_BlastOptionsHandle;
00745 
00746     TSeqAlignVector blast_output (Blast.Run());
00747 
00748     ITERATE(TSeqAlignVector, ii, blast_output) {
00749         if((*ii)->IsSet()) {
00750             const CSeq_align_set::Tdata &sas0 = (*ii)->Get();
00751             ITERATE(CSeq_align_set::Tdata, sa_iter, sas0) {
00752                     CRef<CBlastTabular> hitref (new CBlastTabular(**sa_iter));
00753                     if(hitref->GetQueryStrand() == false) {
00754                         hitref->FlipStrands();
00755                     }
00756                     phitrefs->push_back(hitref);
00757             }
00758         }
00759     }
00760 }
00761 
00762 void CSplignApp::x_RunSplign(bool raw_hits, THitRefs* phitrefs, 
00763                              THit::TCoord smin, THit::TCoord smax,
00764                              CSplign::TResults * psplign_results)
00765 {
00766     if(raw_hits) {
00767         m_Splign->Run(phitrefs);
00768         const CSplign::TResults& results (m_Splign->GetResult());
00769         copy(results.begin(), results.end(), back_inserter(*psplign_results));
00770     }
00771     else {
00772         CSplign::SAlignedCompartment ac;
00773         m_Splign->AlignSingleCompartment(phitrefs, smin, smax, &ac);
00774         psplign_results->push_back(ac);
00775     }
00776 }
00777 
00778 
00779 // get the number of non-consensus splices in a compartment 
00780 // with the highest match count
00781 size_t GetNonConsensusSpliceCount(const CSplign::TResults & splign_results)
00782 {
00783     size_t top_matches (0);
00784     size_t rv (0);
00785     ITERATE(CSplign::TResults, ii, splign_results) {
00786 
00787         const CSplign::SAlignedCompartment & ac (*ii);
00788         size_t matches (0), nc_count(0);
00789         typedef CSplign::TSegments::const_iterator TIterator;
00790         char dnr [] = {0, 0, 0};
00791         char acc [] = {0, 0, 0};
00792         size_t exon_count (0);
00793 
00794         for(TIterator jjb (ac.m_Segments.begin()), jje (ac.m_Segments.end()), jj(jjb);
00795             jj != jje; ++jj)
00796         {
00797 
00798             if(jj->m_exon) {
00799 
00800                 const char * p (jj->m_details.data()), * pe (p +jj->m_details.size());
00801                 int n (-1);
00802                 for(; p != pe; ++p) {
00803                     if(*p == 'M') {
00804                         if(n == 0) ++matches; else if(n > 0) matches += n;
00805                         n = 0;
00806                     }
00807                     else if(isdigit(*p) && n >= 0) {
00808                         n = n * 10 + *p - '0';
00809                     }
00810                     else {
00811                         if(n == 0) {
00812                             ++matches;
00813                         }
00814                         n = -1;
00815                     }
00816                 }
00817                 if(n == 0) ++matches; else if(n > 0) matches += n;
00818 
00819                 if(exon_count > 0) {
00820 
00821                     if(jj->m_annot[2] == '<') {
00822 
00823                         acc[0] = jj->m_annot[0];
00824                         acc[1] = jj->m_annot[1];
00825 
00826                         if(!CNWFormatter::SSegment::s_IsConsensusSplice(dnr, acc)) {
00827                             ++nc_count;
00828                         }
00829                     }
00830                     acc[0] = acc[1] = 0;
00831                 }
00832 
00833                 p = jj->m_annot.data();
00834                 while(*p++ != '>');
00835                 dnr[0] = *p++;
00836                 dnr[1] = *p;
00837 
00838                 ++exon_count;
00839             }
00840         }
00841 
00842         if(matches > top_matches) {
00843             rv = nc_count;
00844             top_matches = matches;
00845         }
00846     }
00847 
00848     return rv;
00849 }
00850 
00851 
/// Functor mapping an upper-case nucleotide letter to its complement
/// (A <-> T, G <-> C).  Any other character is returned unchanged.
struct SComplement
{
    char operator() (char c) {
        if(c == 'A') return 'T';
        if(c == 'T') return 'A';
        if(c == 'G') return 'C';
        if(c == 'C') return 'G';
        return c;
    }
};
00864 
00865 
00866 void CSplignApp::x_ProcessPair(THitRefs& hitrefs, const CArgs& args,
00867                                THit::TCoord smin, THit::TCoord smax)
00868 {
00869 
00870 #ifdef GENOME_PIPELINE
00871     const CSplignFormatter::ETextFlags flags (CSplignFormatter::eTF_UseFastaStyleIds);
00872 #else
00873     const CSplignFormatter::ETextFlags flags (CSplignFormatter::eTF_NoExonScores);
00874 #endif
00875 
00876     const bool raw_hits (!args["comps"]);
00877 
00878     if(hitrefs.size() == 0) {
00879         return;
00880     }
00881 
00882     // skip void compartments but obey their bounds
00883     if(hitrefs.front()->GetScore() < 0) {
00884         return;
00885     }
00886 
00887     THit::TId query (hitrefs.front()->GetQueryId());
00888     THit::TId subj  (hitrefs.front()->GetSubjId());
00889     
00890     m_Formatter->SetSeqIds(query, subj);
00891 
00892     string strand (args["direction"].AsString());
00893 
00894 #ifdef ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY
00895     if(strand == kDirDefault) {
00896         strand = (args["type"].AsString() == kQueryType_mRNA)? kDirAuto: kDirBoth;
00897     }
00898 #endif
00899 
00900     CSplign::TResults splign_results;
00901 
00902     if(strand == kDirSense) {
00903 
00904         m_Splign->SetStrand(true);
00905         x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
00906     }
00907     else if(strand == kDirAntisense) {
00908             
00909         m_Splign->SetStrand(false);
00910         x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
00911     }
00912     else if(strand == kDirBoth) {
00913 
00914         // save original hits
00915         THitRefs hits0;
00916         ITERATE(THitRefs, ii, hitrefs) {
00917             const THitRef & h0 (*ii);
00918             THitRef h1 (new THit (*h0));
00919             hits0.push_back(h1);
00920         }
00921 
00922         static size_t mid (1);
00923         size_t mid_plus, mid_minus;
00924         {{
00925             m_Splign->SetStrand(true);
00926             m_Splign->SetStartModelId(mid);
00927             x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
00928             mid_plus = m_Splign->GetNextModelId();
00929         }}
00930         {{
00931             m_Splign->SetStrand(false);
00932             m_Splign->SetStartModelId(mid);
00933             x_RunSplign(raw_hits, &hits0, smin, smax, &splign_results);
00934             mid_minus = m_Splign->GetNextModelId();
00935         }}
00936         mid = max(mid_plus, mid_minus);
00937     }
00938     else {
00939 
00940         // 'auto' means to align both directions when in doubt
00941 
00942         THitRefs hits0;
00943         ITERATE(THitRefs, ii, hitrefs) {
00944             const THitRef & h0 (*ii);
00945             THitRef h1 (new THit (*h0));
00946             hits0.push_back(h1);
00947         }
00948 
00949         // determine the direction with the longest ORF
00950         const CSplign::TOrfPair orfs (m_Splign->GetCds(hitrefs.front()->GetQueryId()));
00951         const size_t orf_sense (orfs.first.second - orfs.first.first);
00952         const size_t orf_antisense (orfs.second.first - orfs.second.second);
00953         const bool sense_first (orf_sense >= orf_antisense);
00954         
00955         static size_t mid (1);
00956         size_t mid_first, mid_second;
00957 
00958         // align in the longest ORF direction
00959         m_Splign->SetStrand(sense_first);
00960         m_Splign->SetStartModelId(mid);
00961         x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
00962         mid_first = m_Splign->GetNextModelId();
00963 
00964         // if there is a non-consensus splice, also align in the opposite direction
00965         const size_t nc_count (GetNonConsensusSpliceCount(splign_results));
00966 
00967         // same if there is a poly-a in the opposite direction 
00968         bool polya_found (false);
00969         if(nc_count == 0) {
00970             CRef<CScope> scope (m_Splign->GetScope());
00971             CConstRef<CSeq_id> seqid_query (hits0.front()->GetQueryId());
00972             CBioseq_Handle bh (scope->GetBioseqHandle(*seqid_query));
00973                                CSeqVector sv (bh.GetSeqVector(CBioseq_Handle
00974                                                               ::eCoding_Iupac));
00975             string str;
00976             sv.GetSeqData(0, sv.size(), str);
00977             if(sense_first) {
00978                 reverse (str.begin(), str.end());
00979                 transform(str.begin(), str.end(), str.begin(), SComplement());
00980             }
00981             const size_t polya (CSplign::s_TestPolyA(str.data(), str.size()));
00982             polya_found = (0 < polya && polya < str.size());
00983         }
00984 
00985         if(nc_count > 0 || polya_found) {
00986             m_Splign->SetStrand(!sense_first);
00987             m_Splign->SetStartModelId(mid);
00988             x_RunSplign(raw_hits, &hits0, smin, smax, &splign_results);
00989             mid_second = m_Splign->GetNextModelId();
00990             mid = max(mid_first, mid_second);
00991         }
00992         else {
00993             mid = mid_first;
00994         }
00995     }
00996     
00997     cout << m_Formatter->AsExonTable(&splign_results, flags);
00998 
00999     if(m_AsnOut) {
01000         CRef<CSeq_align_set> sas (
01001             m_Formatter-> AsSeqAlignSet(&splign_results,
01002                                         CSplignFormatter::
01003                                         eAF_SplicedSegWithParts));
01004         *m_AsnOut << MSerial_AsnText  << *sas << endl;
01005     }
01006     
01007     if(m_AlnOut) {       
01008         *m_AlnOut << m_Formatter->AsAlignmentText(m_Splign->GetScope(),
01009                                                   &splign_results);
01010     }
01011         
01012     ITERATE(CSplign::TResults, ii, splign_results) {
01013         x_LogCompartmentStatus(query, subj, *ii);
01014     }
01015 }
01016 
01017 
01018 END_NCBI_SCOPE
01019 
01020 /////////////////////////////////////
01021 
01022 USING_NCBI_SCOPE;
01023 
01024 int main(int argc, const char* argv[]) 
01025 {
01026     const int rv (CSplignApp().AppMain(argc, argv, 0, eDS_Default, 0));
01027     return rv;
01028 }
01029 
01030 

Generated on Wed Dec 9 04:12:07 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Wed Dec 09 08:17:54 2009 by modify_doxy.py rev. 173732