00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #include <ncbi_pch.hpp>
00033
00034 #include "splign_app.hpp"
00035 #include "splign_app_exception.hpp"
00036
00037 #include <corelib/ncbistd.hpp>
00038 #include <corelib/ncbi_system.hpp>
00039
00040 #include <serial/objostrasn.hpp>
00041 #include <serial/serial.hpp>
00042
00043 #include <algo/align/nw/nw_spliced_aligner16.hpp>
00044 #include <algo/align/splign/splign_cmdargs.hpp>
00045 #include <algo/align/util/hit_comparator.hpp>
00046
00047 #include <algo/blast/api/bl2seq.hpp>
00048 #include <algo/blast/api/local_blast.hpp>
00049 #include <algo/blast/api/objmgr_query_data.hpp>
00050 #include <algo/blast/api/local_db_adapter.hpp>
00051
00052 #include <objmgr/seq_vector.hpp>
00053
00054 #include <objects/seq/Bioseq.hpp>
00055 #include <objects/seqloc/Seq_loc.hpp>
00056
00057 #include <objtools/readers/fasta.hpp>
00058 #include <objtools/readers/reader_exception.hpp>
00059 #include <objtools/lds/lds_manager.hpp>
00060 #include <objtools/data_loaders/lds/lds_dataloader.hpp>
00061 #include <objtools/data_loaders/blastdb/bdbloader.hpp>
00062
00063 #include <algorithm>
00064 #include <memory>
00065
00066
// Every build other than the public binary is compiled as a
// GENOME_PIPELINE build.
#if !defined(ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY)
#  define GENOME_PIPELINE
#endif

// File-local spellings of the values accepted by the "direction" argument.
namespace {
    const char kDirSense     [] = "sense";
    const char kDirAntisense [] = "antisense";
    const char kDirBoth      [] = "both";
    const char kDirAuto      [] = "auto";
    const char kDirDefault   [] = "default";
}
00078
00079
00080 BEGIN_NCBI_SCOPE
00081
00082 CSplignApp::CSplignApp(void):
00083 m_AppName("Splign v.1.39")
00084 {
00085 SetVersion(CVersionInfo(1, 39, 0, "Splign"));
00086 #ifdef GENOME_PIPELINE
00087 m_AppName += 'p';
00088 #endif
00089 }
00090
00091
00092 void CSplignApp::Init()
00093 {
00094 #ifndef GENOME_PIPELINE
00095 HideStdArgs(fHideHelp | fHideLogfile | fHideConffile |
00096 fHideVersion | fHideFullVersion | fHideDryRun |
00097 fHideXmlHelp | fHideFullHelp);
00098 #endif
00099
00100
00101 auto_ptr<CArgDescriptions> argdescr(new CArgDescriptions);
00102 argdescr->SetUsageContext(GetArguments().GetProgramName(), m_AppName);
00103
00104 argdescr->AddOptionalKey
00105 ("hits", "hits",
00106 "[Batch mode] Externally computed local alignments "
00107 "(such as blast hits), in blast tabular format. "
00108 "The file must be collated by subject and query "
00109 "(e.g. sort -k 2,2 -k 1,1).",
00110 CArgDescriptions::eInputFile);
00111
00112 argdescr->AddOptionalKey
00113 ("comps", "comps",
00114 "[Batch mode] Compartments computed with Compart utility.",
00115 CArgDescriptions::eInputFile);
00116
00117 argdescr->AddOptionalKey
00118 ("mklds", "mklds",
00119 "[Batch mode] "
00120 "Make LDS DB under the specified directory "
00121 "with cDNA and genomic FASTA files or symlinks.",
00122 CArgDescriptions::eString);
00123
00124 argdescr->AddOptionalKey
00125 ("ldsdir", "ldsdir",
00126 "[Batch mode] Directory holding LDS subdirectory.",
00127 CArgDescriptions::eString);
00128
00129 argdescr->AddOptionalKey
00130 ("query", "query",
00131 "[Pairwise mode] FASTA file with the spliced sequence.",
00132 CArgDescriptions::eInputFile);
00133
00134 argdescr->AddOptionalKey
00135 ("subj", "subj",
00136 "[Pairwise mode] FASTA file with the genomic sequence.",
00137 CArgDescriptions::eInputFile);
00138
00139 argdescr->AddFlag
00140 ("disc",
00141 "[Pairwise mode] Use discontiguous megablast to facilitate "
00142 "alignment of more divergent sequences such as those "
00143 "from different organisms (cross-species alignment).");
00144
00145 argdescr->AddDefaultKey
00146 ("W", "mbwordsize", "[Pairwise mode] Megablast word size",
00147 CArgDescriptions::eInteger,
00148 "28");
00149
00150 CSplignArgUtil::SetupArgDescriptions(argdescr.get());
00151
00152 argdescr->AddDefaultKey
00153 ("direction",
00154 "direction",
00155 "Query sequence orientation. "
00156 "Auto orientation begins with the longest ORF direction (d1) "
00157 "and proceeds with the opposite direction (d2) "
00158 "if found a non-consensus splice in d1 or poly-a tail in d2. "
00159
00160 #ifdef ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY
00161 "Default translates to 'auto' in mRNA and "
00162 "'both' in EST mode",
00163 CArgDescriptions::eString, kDirDefault
00164 #else
00165 , CArgDescriptions::eString, kDirSense
00166 #endif
00167 );
00168
00169 argdescr->AddDefaultKey("log", "log", "Splign log file",
00170 CArgDescriptions::eOutputFile,
00171 "splign.log");
00172
00173 argdescr->AddOptionalKey("asn", "asn", "ASN.1 output file name",
00174 CArgDescriptions::eOutputFile);
00175
00176 argdescr->AddOptionalKey("aln", "aln", "Pairwise alignment output file name",
00177 CArgDescriptions::eOutputFile);
00178
00179 CArgAllow_Strings * constrain_direction (new CArgAllow_Strings);
00180 constrain_direction
00181 #ifdef ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY
00182 ->Allow(kDirDefault)
00183 #endif
00184 ->Allow(kDirSense)
00185 ->Allow(kDirAntisense)
00186 ->Allow(kDirBoth)
00187 ->Allow(kDirAuto);
00188
00189 argdescr->SetConstraint("direction", constrain_direction);
00190
00191 SetupArgDescriptions(argdescr.release());
00192
00193 m_ObjMgr = CObjectManager::GetInstance();
00194 }
00195
00196
00197 CSplign::THitRef CSplignApp::s_ReadBlastHit(const string& m8)
00198 {
00199 THitRef rv (new CBlastTabular(m8.c_str()));
00200
00201 #ifdef SPLIGNAPP_UNDECORATED_ARE_LOCALS
00202
00203 string::const_iterator ie = m8.end(), i0 = m8.begin(), i1 = i0;
00204 while(i1 != ie && *i1 !='\t') ++i1;
00205 if(i1 != ie) {
00206 string::const_iterator i2 = ++i1;
00207 while(i2 != ie && *i2 !='\t') ++i2;
00208 if(i2 != ie) {
00209 if(find(i0, i1, '|') == i1) {
00210 const string strid = rv->GetQueryId()->GetSeqIdString(true);
00211 CRef<CSeq_id> seqid (new CSeq_id(CSeq_id::e_Local, strid));
00212 rv->SetQueryId(seqid);
00213 }
00214 if(find(i1, i2, '|') == i2) {
00215 const string strid = rv->GetSubjId()->GetSeqIdString(true);
00216 CRef<CSeq_id> seqid (new CSeq_id(CSeq_id::e_Local, strid));
00217 rv->SetSubjId(seqid);
00218 }
00219 return rv;
00220 }
00221 }
00222 const string errmsg = string("Incorrectly formatted blast hit:\n") + m8;
00223 NCBI_THROW(CSplignAppException, eBadData, errmsg);
00224 #else
00225 return rv;
00226 #endif
00227 }
00228
00229
00230 bool CSplignApp::x_GetNextPair(const THitRefs& hitrefs, THitRefs* hitrefs_pair)
00231 {
00232 USING_SCOPE(objects);
00233
00234 hitrefs_pair->resize(0);
00235
00236 const size_t dim = hitrefs.size();
00237 if(dim == 0) {
00238 return false;
00239 }
00240
00241 if(m_CurHitRef == dim) {
00242 m_CurHitRef = numeric_limits<size_t>::max();
00243 return false;
00244 }
00245
00246 if(m_CurHitRef == numeric_limits<size_t>::max()) {
00247 m_CurHitRef = 0;
00248 }
00249
00250 CConstRef<CSeq_id> query (hitrefs[m_CurHitRef]->GetQueryId());
00251 CConstRef<CSeq_id> subj (hitrefs[m_CurHitRef]->GetSubjId());
00252 while(m_CurHitRef < dim
00253 && hitrefs[m_CurHitRef]->GetQueryId()->Match(*query)
00254 && hitrefs[m_CurHitRef]->GetSubjId()->Match(*subj) )
00255 {
00256 hitrefs_pair->push_back(hitrefs[m_CurHitRef++]);
00257 }
00258 return true;
00259 }
00260
00261
00262 bool CSplignApp::x_GetNextPair(istream& ifs, THitRefs* hitrefs)
00263 {
00264 hitrefs->resize(0);
00265
00266 if(!m_PendingHits.size() && !ifs ) {
00267 return false;
00268 }
00269
00270 if(!m_PendingHits.size()) {
00271
00272 THit::TId query, subj;
00273
00274 if(m_firstline.size()) {
00275
00276 THitRef hitref (s_ReadBlastHit(m_firstline));
00277 query = hitref->GetQueryId();
00278 subj = hitref->GetSubjId();
00279 m_PendingHits.push_back(hitref);
00280 }
00281
00282 char buf [1024];
00283 while(ifs) {
00284
00285 buf[0] = 0;
00286 CT_POS_TYPE pos0 = ifs.tellg();
00287 ifs.getline(buf, sizeof buf, '\n');
00288 CT_POS_TYPE pos1 = ifs.tellg();
00289 if(pos1 == pos0) break;
00290 if(buf[0] == '#') continue;
00291 const char* p = buf;
00292 while(*p == ' ' || *p == '\t') ++p;
00293 if(*p == 0) continue;
00294
00295 THitRef hit (s_ReadBlastHit(p));
00296 if(query.IsNull()) {
00297 query = hit->GetQueryId();
00298 }
00299 if(subj.IsNull()) {
00300 subj = hit->GetSubjId();
00301 }
00302 if(hit->GetQueryStrand() == false) {
00303 hit->FlipStrands();
00304 }
00305 if(hit->GetSubjStop() == hit->GetSubjStart()) {
00306
00307 continue;
00308 }
00309
00310 if(hit->GetQueryId()->Match(*query) == false ||
00311 hit->GetSubjId()->Match(*subj) == false) {
00312
00313 m_firstline = p;
00314 break;
00315 }
00316
00317 m_PendingHits.push_back(hit);
00318 }
00319 }
00320
00321 const size_t pending_size = m_PendingHits.size();
00322 if(pending_size) {
00323
00324 THit::TId query = m_PendingHits[0]->GetQueryId();
00325 THit::TId subj = m_PendingHits[0]->GetSubjId();
00326 size_t i = 1;
00327 for(; i < pending_size; ++i) {
00328
00329 THitRef h = m_PendingHits[i];
00330 if(h->GetQueryId()->Match(*query) == false ||
00331 h->GetSubjId()->Match(*subj) == false) {
00332 break;
00333 }
00334 }
00335 hitrefs->resize(i);
00336 copy(m_PendingHits.begin(), m_PendingHits.begin() + i,
00337 hitrefs->begin());
00338 m_PendingHits.erase(m_PendingHits.begin(), m_PendingHits.begin() + i);
00339 }
00340
00341 return hitrefs->size() > 0;
00342 }
00343
00344
00345 void ReadCompartment(istream& istr, CSplign::THitRefs* phitrefs)
00346 {
00347 phitrefs->clear();
00348 while(istr) {
00349 string line;
00350 getline(istr, line);
00351 if(line.empty()) {
00352 if(phitrefs->empty()) continue; else break;
00353 }
00354 CSplign::THitRef h (new CSplign::THit(line.c_str()));
00355 phitrefs->push_back(h);
00356 }
00357 }
00358
00359
00360 bool CSplignApp::x_GetNextComp(istream& ifs,
00361 THitRefs* phitrefs,
00362 THit::TCoord* psubj_min,
00363 THit::TCoord* psubj_max)
00364 {
00365 static THitRefs hitrefs_next;
00366 THitRefs & hitrefs (*phitrefs);
00367
00368 const THit::TCoord kUndef (numeric_limits<THit::TCoord>::max());
00369 const THit::TCoord kMax (numeric_limits<THit::TCoord>::max() - 1);
00370 static THit::TCoord smin (kUndef), smax (kUndef);
00371
00372 if(!hitrefs_next.empty()) {
00373 hitrefs.resize(hitrefs_next.size());
00374 copy(hitrefs_next.begin(), hitrefs_next.end(), hitrefs.begin());
00375 hitrefs_next.clear();
00376 }
00377 else {
00378
00379 ReadCompartment(ifs, phitrefs);
00380 }
00381
00382
00383 ReadCompartment(ifs, &hitrefs_next);
00384
00385
00386 if(smin != kUndef) {
00387 *psubj_min = smin;
00388 *psubj_max = kMax;
00389 }
00390 else if(smax != kUndef) {
00391 *psubj_min = 0;
00392 *psubj_max = smax;
00393 }
00394 else {
00395 *psubj_min = 0;
00396 *psubj_max = kMax;
00397 }
00398
00399 if(!hitrefs_next.empty()
00400 && hitrefs.front()->GetSubjStrand() == hitrefs_next.front()->GetSubjStrand()
00401 && hitrefs.front()->GetQueryId()->Match(*(hitrefs_next.front()->GetQueryId()))
00402 && hitrefs.front()->GetSubjId()->Match(*(hitrefs_next.front()->GetSubjId())))
00403 {
00404 if(hitrefs.front()->GetSubjStart() < hitrefs_next.front()->GetSubjStart()) {
00405 *psubj_min = smin != kUndef? smin: 0;
00406 *psubj_max = min(hitrefs_next.front()->GetSubjMin(),
00407 hitrefs_next.back()->GetSubjMin());
00408 smin = max(hitrefs.front()->GetSubjMax(),
00409 hitrefs.back()->GetSubjMax());
00410 smax = kUndef;
00411 }
00412 else {
00413 *psubj_min = max(hitrefs_next.front()->GetSubjMax(),
00414 hitrefs_next.back()->GetSubjMax());
00415 *psubj_max = smax != kUndef? smax: kMax;
00416 smin = kUndef;
00417 smax = min(hitrefs.front()->GetSubjMin(),
00418 hitrefs.back()->GetSubjMin());
00419 }
00420 }
00421 else {
00422 smin = smax = kUndef;
00423 }
00424
00425 return !hitrefs.empty();
00426 }
00427
00428
00429 void CSplignApp::x_LogCompartmentStatus(const THit::TId & query,
00430 const THit::TId & subj,
00431 const CSplign::SAlignedCompartment & ac)
00432 {
00433 typedef CSplign::SAlignedCompartment TCompartment;
00434
00435 switch(ac.m_Status) {
00436
00437 case TCompartment::eStatus_Ok: {
00438
00439 if(ac.m_Id == 0) {
00440 NCBI_THROW(CSplignAppException, eInternal, "Missing compartment id.");
00441 }
00442
00443 *m_logstream << (ac.m_QueryStrand? '+': '-') << ac.m_Id
00444 << '\t' << query->GetSeqIdString(true)
00445 << '\t' << subj->GetSeqIdString(true)
00446 << '\t' << ac.m_Msg
00447 << '\t' << ac.m_Score
00448 << endl;
00449 }
00450 break;
00451
00452 case TCompartment::eStatus_Error: {
00453
00454 *m_logstream << '-'
00455 << '\t' << query->GetSeqIdString(true)
00456 << '\t' << subj->GetSeqIdString(true)
00457 << '\t' << ac.m_Msg
00458 << '\t' << '-'
00459 << endl;
00460 }
00461 break;
00462
00463 case TCompartment::eStatus_Empty:
00464 break;
00465
00466 default: {
00467 NCBI_THROW(CSplignAppException, eInternal,
00468 "Unexpected compartment status.");
00469 }
00470 }
00471
00472 }
00473
00474
00475 CRef<blast::CBlastOptionsHandle>
00476 CSplignApp::x_SetupBlastOptions(bool use_disc)
00477 {
00478 USING_SCOPE(blast);
00479
00480 m_BlastProgram = use_disc? eDiscMegablast: eMegablast;
00481
00482 CRef<CBlastOptionsHandle> blast_options_handle
00483 (CBlastOptionsFactory::Create(m_BlastProgram));
00484
00485 blast_options_handle->SetDefaults();
00486
00487 CBlastOptions& blast_opt = blast_options_handle->SetOptions();
00488
00489 if(!use_disc) {
00490
00491 const CArgs& args = GetArgs();
00492 blast_opt.SetWordSize(args["W"].AsInteger());
00493 blast_opt.SetMaskAtHash(true);
00494 blast_opt.SetDustFiltering(false);
00495 }
00496
00497 if(blast_options_handle->Validate() == false) {
00498 NCBI_THROW(CSplignAppException,
00499 eInternal,
00500 "Incorrect blast setup");
00501 }
00502
00503 return blast_options_handle;
00504 }
00505
00506
/// Operating mode selected in CSplignApp::Run() from the combination of
/// command-line arguments.
enum ERunMode {
    eNotSet   = 0,  ///< no valid combination of arguments recognized
    ePairwise = 1,  ///< "query" + "subj"
    eBatch1   = 2,  ///< "hits" + "ldsdir"
    eBatch2   = 3   ///< "comps" + "ldsdir"
};
00513
00514
00515 const string kSplignLdsDb ("splign.ldsdb");
00516
00517 string GetLdsDbDir(const string& fasta_dir)
00518 {
00519 string lds_db_dir = fasta_dir;
00520 const char sep = CDirEntry::GetPathSeparator();
00521 const size_t fds = fasta_dir.size();
00522 if(fds > 0 && fasta_dir[fds-1] != sep) {
00523 lds_db_dir += sep;
00524 }
00525 lds_db_dir += "_SplignLDS_";
00526 return lds_db_dir;
00527 }
00528
00529
00530 CRef<objects::CSeq_id> CSplignApp::x_ReadFastaSetId(const CArgValue& argval,
00531 CRef<objects::CScope> scope)
00532 {
00533 USING_SCOPE(objects);
00534
00535 CRef<ILineReader> line_reader;
00536 try {
00537 line_reader.Reset(
00538 new CMemoryLineReader(new CMemoryFile(argval.AsString()),
00539 eTakeOwnership));
00540 } catch (...) {
00541 line_reader.Reset(new CStreamLineReader(argval.AsInputFile()));
00542 }
00543 CFastaReader fasta_reader(* line_reader,
00544 CFastaReader::fAssumeNuc | CFastaReader::fOneSeq);
00545 CConstRef<CSeq_entry> se (fasta_reader.ReadOneSeq());
00546
00547 scope->AddTopLevelSeqEntry(*se);
00548 const CSeq_entry::TSeq& bioseq = se->GetSeq();
00549 const CSeq_entry::TSeq::TId& seqid = bioseq.GetId();
00550 return seqid.back();
00551 }
00552
00553
00554 int CSplignApp::Run()
00555 {
00556 USING_SCOPE(objects);
00557
00558 const CArgs & args (GetArgs());
00559
00560
00561
00562 const bool is_mklds = args["mklds"];
00563 const bool is_ldsdir = args["ldsdir"];
00564
00565 const bool is_hits = args["hits"];
00566 const bool is_query = args["query"];
00567 const bool is_subj = args["subj"];
00568
00569 const bool is_comps = args["comps"];
00570
00571 const bool use_disc_megablast (args["disc"]);
00572
00573 if(is_mklds) {
00574
00575
00576 string fa_dir = args["mklds"].AsString();
00577 if(CDirEntry::IsAbsolutePath(fa_dir) == false) {
00578 string curdir = CDir::GetCwd();
00579 const char sep = CDirEntry::GetPathSeparator();
00580 const size_t curdirsize = curdir.size();
00581 if(curdirsize && curdir[curdirsize-1] != sep) {
00582 curdir += sep;
00583 }
00584 fa_dir = curdir + fa_dir;
00585 }
00586
00587 const string lds_db_dir (GetLdsDbDir(fa_dir));
00588
00589
00590 #ifdef CPPTOOLKIT_LDS_MANAGEMENT
00591
00592 CLDS_Database ldsdb (lds_db_dir, kSplignLdsDb);
00593 CLDS_Management ldsmgt (ldsdb);
00594 ldsmgt.Create();
00595 ldsmgt.SyncWithDir(fa_dir,
00596 CLDS_Management::eRecurseSubDirs,
00597 CLDS_Management::eNoControlSum);
00598 #else
00599 CLDS_Manager ldsmgr (fa_dir, lds_db_dir, kSplignLdsDb);
00600 ldsmgr.Index(CLDS_Manager::eRecurseSubDirs,
00601 CLDS_Manager::eNoControlSum);
00602 #endif
00603
00604 return 0;
00605 }
00606
00607
00608 ERunMode run_mode (eNotSet);
00609
00610 if(is_query && is_subj && !(is_hits || is_comps || is_ldsdir)) {
00611 run_mode = ePairwise;
00612 }
00613 else if(is_hits && is_ldsdir && !(is_comps ||is_query || is_subj)) {
00614 run_mode = eBatch1;
00615 }
00616 else if(is_comps && is_ldsdir && !(is_hits ||is_query || is_subj)) {
00617 run_mode = eBatch2;
00618 }
00619
00620 if(run_mode == eNotSet) {
00621 NCBI_THROW(CSplignAppException,
00622 eBadParameter,
00623 "Incomplete or inconsistent set of arguments specified. "
00624 "Specify -help to print arguments." );
00625 }
00626
00627
00628 m_logstream = & args["log"].AsOutputFile();
00629
00630
00631 m_AsnOut = args["asn"]? & args["asn"].AsOutputFile(): NULL;
00632
00633
00634 m_AlnOut = args["aln"]? & args["aln"].AsOutputFile(): NULL;
00635
00636
00637 if(run_mode != eBatch1 && run_mode != eBatch2) {
00638 m_BlastOptionsHandle = x_SetupBlastOptions(use_disc_megablast);
00639 }
00640
00641
00642 m_Splign.Reset(new CSplign);
00643 CSplignArgUtil::ArgsToSplign(m_Splign, args);
00644
00645 m_Splign->SetStartModelId(1);
00646
00647
00648 m_Formatter.Reset(new CSplignFormatter(*m_Splign));
00649
00650
00651 CRef<CScope> scope;
00652 CRef<CSeq_id> seqid_query, seqid_subj;
00653 if(run_mode == ePairwise) {
00654
00655 scope.Reset (new CScope(*m_ObjMgr));
00656 scope->AddDefaults();
00657 seqid_query = x_ReadFastaSetId(args["query"], scope);
00658 seqid_subj = x_ReadFastaSetId(args["subj"] , scope);
00659 }
00660 else if(run_mode == eBatch1 || run_mode == eBatch2) {
00661
00662 const string fasta_dir = args["ldsdir"].AsString();
00663 const string ldsdb_dir = GetLdsDbDir(fasta_dir);
00664 CLDS_Database* ldsdb (
00665 new CLDS_Database(ldsdb_dir, kSplignLdsDb));
00666 m_LDS_db.reset(ldsdb);
00667 m_LDS_db->Open();
00668 CLDS_DataLoader::RegisterInObjectManager(
00669 *m_ObjMgr, *ldsdb, CObjectManager::eDefault);
00670 scope.Reset (new CScope(*m_ObjMgr));
00671 scope->AddDefaults();
00672 }
00673 else {
00674 NCBI_THROW(CSplignAppException,
00675 eGeneral,
00676 "Requested mode not implemented." );
00677 }
00678
00679 m_Splign->SetScope() = scope;
00680
00681
00682 if(run_mode == ePairwise) {
00683
00684 THitRefs hitrefs;
00685 x_GetBl2SeqHits(seqid_query, seqid_subj, scope, &hitrefs);
00686 x_ProcessPair(hitrefs, args);
00687 }
00688 else if (run_mode == eBatch1) {
00689
00690 THitRefs hitrefs;
00691 CNcbiIstream& hit_stream = args["hits"].AsInputFile();
00692 while(x_GetNextPair(hit_stream, &hitrefs) ) {
00693 x_ProcessPair(hitrefs, args);
00694 }
00695 }
00696 else if (run_mode == eBatch2) {
00697
00698 CNcbiIstream& hit_stream (args["comps"].AsInputFile());
00699 THitRefs hitrefs;
00700 THit::TCoord subj_min, subj_max;
00701
00702 while(x_GetNextComp(hit_stream, &hitrefs, &subj_min, &subj_max) ) {
00703
00704 if(hitrefs.front()->GetScore() > 0) {
00705 x_ProcessPair(hitrefs, args, subj_min, subj_max);
00706 }
00707 }
00708 }
00709 else {
00710 NCBI_THROW(CSplignAppException,
00711 eInternal,
00712 "Mode not implemented");
00713 }
00714
00715 cout << "# END" << endl;
00716
00717 return 0;
00718 }
00719
00720
00721 void CSplignApp::x_GetBl2SeqHits(
00722 CRef<objects::CSeq_id> seqid_query,
00723 CRef<objects::CSeq_id> seqid_subj,
00724 CRef<objects::CScope> scope,
00725 THitRefs* phitrefs)
00726 {
00727 USING_SCOPE(blast);
00728 USING_SCOPE(objects);
00729
00730 phitrefs->resize(0);
00731 phitrefs->reserve(100);
00732
00733 CRef<CSeq_loc> seqloc_query (new CSeq_loc);
00734 seqloc_query->SetWhole().Assign(*seqid_query);
00735 CRef<CSeq_loc> seqloc_subj (new CSeq_loc);
00736 seqloc_subj->SetWhole().Assign(*seqid_subj);
00737
00738 CBl2Seq Blast( SSeqLoc(seqloc_query.GetNonNullPointer(),
00739 scope.GetNonNullPointer()),
00740 SSeqLoc(seqloc_subj.GetNonNullPointer(),
00741 scope.GetNonNullPointer()),
00742 m_BlastProgram);
00743
00744 Blast.SetOptionsHandle() = *m_BlastOptionsHandle;
00745
00746 TSeqAlignVector blast_output (Blast.Run());
00747
00748 ITERATE(TSeqAlignVector, ii, blast_output) {
00749 if((*ii)->IsSet()) {
00750 const CSeq_align_set::Tdata &sas0 = (*ii)->Get();
00751 ITERATE(CSeq_align_set::Tdata, sa_iter, sas0) {
00752 CRef<CBlastTabular> hitref (new CBlastTabular(**sa_iter));
00753 if(hitref->GetQueryStrand() == false) {
00754 hitref->FlipStrands();
00755 }
00756 phitrefs->push_back(hitref);
00757 }
00758 }
00759 }
00760 }
00761
00762 void CSplignApp::x_RunSplign(bool raw_hits, THitRefs* phitrefs,
00763 THit::TCoord smin, THit::TCoord smax,
00764 CSplign::TResults * psplign_results)
00765 {
00766 if(raw_hits) {
00767 m_Splign->Run(phitrefs);
00768 const CSplign::TResults& results (m_Splign->GetResult());
00769 copy(results.begin(), results.end(), back_inserter(*psplign_results));
00770 }
00771 else {
00772 CSplign::SAlignedCompartment ac;
00773 m_Splign->AlignSingleCompartment(phitrefs, smin, smax, &ac);
00774 psplign_results->push_back(ac);
00775 }
00776 }
00777
00778
00779
00780
00781 size_t GetNonConsensusSpliceCount(const CSplign::TResults & splign_results)
00782 {
00783 size_t top_matches (0);
00784 size_t rv (0);
00785 ITERATE(CSplign::TResults, ii, splign_results) {
00786
00787 const CSplign::SAlignedCompartment & ac (*ii);
00788 size_t matches (0), nc_count(0);
00789 typedef CSplign::TSegments::const_iterator TIterator;
00790 char dnr [] = {0, 0, 0};
00791 char acc [] = {0, 0, 0};
00792 size_t exon_count (0);
00793
00794 for(TIterator jjb (ac.m_Segments.begin()), jje (ac.m_Segments.end()), jj(jjb);
00795 jj != jje; ++jj)
00796 {
00797
00798 if(jj->m_exon) {
00799
00800 const char * p (jj->m_details.data()), * pe (p +jj->m_details.size());
00801 int n (-1);
00802 for(; p != pe; ++p) {
00803 if(*p == 'M') {
00804 if(n == 0) ++matches; else if(n > 0) matches += n;
00805 n = 0;
00806 }
00807 else if(isdigit(*p) && n >= 0) {
00808 n = n * 10 + *p - '0';
00809 }
00810 else {
00811 if(n == 0) {
00812 ++matches;
00813 }
00814 n = -1;
00815 }
00816 }
00817 if(n == 0) ++matches; else if(n > 0) matches += n;
00818
00819 if(exon_count > 0) {
00820
00821 if(jj->m_annot[2] == '<') {
00822
00823 acc[0] = jj->m_annot[0];
00824 acc[1] = jj->m_annot[1];
00825
00826 if(!CNWFormatter::SSegment::s_IsConsensusSplice(dnr, acc)) {
00827 ++nc_count;
00828 }
00829 }
00830 acc[0] = acc[1] = 0;
00831 }
00832
00833 p = jj->m_annot.data();
00834 while(*p++ != '>');
00835 dnr[0] = *p++;
00836 dnr[1] = *p;
00837
00838 ++exon_count;
00839 }
00840 }
00841
00842 if(matches > top_matches) {
00843 rv = nc_count;
00844 top_matches = matches;
00845 }
00846 }
00847
00848 return rv;
00849 }
00850
00851
/// Functor returning the complement of an upper-case nucleotide
/// (A<->T, G<->C); any other character is returned unchanged.
/// The call operator is const so that the functor can also be invoked
/// through a const object or reference.
struct SComplement
{
    char operator() (char c) const {
        switch(c) {
        case 'A': return 'T';
        case 'G': return 'C';
        case 'T': return 'A';
        case 'C': return 'G';
        }
        return c;
    }
};
00864
00865
00866 void CSplignApp::x_ProcessPair(THitRefs& hitrefs, const CArgs& args,
00867 THit::TCoord smin, THit::TCoord smax)
00868 {
00869
00870 #ifdef GENOME_PIPELINE
00871 const CSplignFormatter::ETextFlags flags (CSplignFormatter::eTF_UseFastaStyleIds);
00872 #else
00873 const CSplignFormatter::ETextFlags flags (CSplignFormatter::eTF_NoExonScores);
00874 #endif
00875
00876 const bool raw_hits (!args["comps"]);
00877
00878 if(hitrefs.size() == 0) {
00879 return;
00880 }
00881
00882
00883 if(hitrefs.front()->GetScore() < 0) {
00884 return;
00885 }
00886
00887 THit::TId query (hitrefs.front()->GetQueryId());
00888 THit::TId subj (hitrefs.front()->GetSubjId());
00889
00890 m_Formatter->SetSeqIds(query, subj);
00891
00892 string strand (args["direction"].AsString());
00893
00894 #ifdef ALGOALIGN_NW_SPLIGN_MAKE_PUBLIC_BINARY
00895 if(strand == kDirDefault) {
00896 strand = (args["type"].AsString() == kQueryType_mRNA)? kDirAuto: kDirBoth;
00897 }
00898 #endif
00899
00900 CSplign::TResults splign_results;
00901
00902 if(strand == kDirSense) {
00903
00904 m_Splign->SetStrand(true);
00905 x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
00906 }
00907 else if(strand == kDirAntisense) {
00908
00909 m_Splign->SetStrand(false);
00910 x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
00911 }
00912 else if(strand == kDirBoth) {
00913
00914
00915 THitRefs hits0;
00916 ITERATE(THitRefs, ii, hitrefs) {
00917 const THitRef & h0 (*ii);
00918 THitRef h1 (new THit (*h0));
00919 hits0.push_back(h1);
00920 }
00921
00922 static size_t mid (1);
00923 size_t mid_plus, mid_minus;
00924 {{
00925 m_Splign->SetStrand(true);
00926 m_Splign->SetStartModelId(mid);
00927 x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
00928 mid_plus = m_Splign->GetNextModelId();
00929 }}
00930 {{
00931 m_Splign->SetStrand(false);
00932 m_Splign->SetStartModelId(mid);
00933 x_RunSplign(raw_hits, &hits0, smin, smax, &splign_results);
00934 mid_minus = m_Splign->GetNextModelId();
00935 }}
00936 mid = max(mid_plus, mid_minus);
00937 }
00938 else {
00939
00940
00941
00942 THitRefs hits0;
00943 ITERATE(THitRefs, ii, hitrefs) {
00944 const THitRef & h0 (*ii);
00945 THitRef h1 (new THit (*h0));
00946 hits0.push_back(h1);
00947 }
00948
00949
00950 const CSplign::TOrfPair orfs (m_Splign->GetCds(hitrefs.front()->GetQueryId()));
00951 const size_t orf_sense (orfs.first.second - orfs.first.first);
00952 const size_t orf_antisense (orfs.second.first - orfs.second.second);
00953 const bool sense_first (orf_sense >= orf_antisense);
00954
00955 static size_t mid (1);
00956 size_t mid_first, mid_second;
00957
00958
00959 m_Splign->SetStrand(sense_first);
00960 m_Splign->SetStartModelId(mid);
00961 x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
00962 mid_first = m_Splign->GetNextModelId();
00963
00964
00965 const size_t nc_count (GetNonConsensusSpliceCount(splign_results));
00966
00967
00968 bool polya_found (false);
00969 if(nc_count == 0) {
00970 CRef<CScope> scope (m_Splign->GetScope());
00971 CConstRef<CSeq_id> seqid_query (hits0.front()->GetQueryId());
00972 CBioseq_Handle bh (scope->GetBioseqHandle(*seqid_query));
00973 CSeqVector sv (bh.GetSeqVector(CBioseq_Handle
00974 ::eCoding_Iupac));
00975 string str;
00976 sv.GetSeqData(0, sv.size(), str);
00977 if(sense_first) {
00978 reverse (str.begin(), str.end());
00979 transform(str.begin(), str.end(), str.begin(), SComplement());
00980 }
00981 const size_t polya (CSplign::s_TestPolyA(str.data(), str.size()));
00982 polya_found = (0 < polya && polya < str.size());
00983 }
00984
00985 if(nc_count > 0 || polya_found) {
00986 m_Splign->SetStrand(!sense_first);
00987 m_Splign->SetStartModelId(mid);
00988 x_RunSplign(raw_hits, &hits0, smin, smax, &splign_results);
00989 mid_second = m_Splign->GetNextModelId();
00990 mid = max(mid_first, mid_second);
00991 }
00992 else {
00993 mid = mid_first;
00994 }
00995 }
00996
00997 cout << m_Formatter->AsExonTable(&splign_results, flags);
00998
00999 if(m_AsnOut) {
01000 CRef<CSeq_align_set> sas (
01001 m_Formatter-> AsSeqAlignSet(&splign_results,
01002 CSplignFormatter::
01003 eAF_SplicedSegWithParts));
01004 *m_AsnOut << MSerial_AsnText << *sas << endl;
01005 }
01006
01007 if(m_AlnOut) {
01008 *m_AlnOut << m_Formatter->AsAlignmentText(m_Splign->GetScope(),
01009 &splign_results);
01010 }
01011
01012 ITERATE(CSplign::TResults, ii, splign_results) {
01013 x_LogCompartmentStatus(query, subj, *ii);
01014 }
01015 }
01016
01017
01018 END_NCBI_SCOPE
01019
01020
01021
01022 USING_NCBI_SCOPE;
01023
01024 int main(int argc, const char* argv[])
01025 {
01026 const int rv (CSplignApp().AppMain(argc, argv, 0, eDS_Default, 0));
01027 return rv;
01028 }
01029
01030