NCBI C++ ToolKit
igblast.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 #ifndef SKIP_DOXYGEN_PROCESSING
00002 static char const rcsid[] =
00003     "$Id: igblast.cpp 61512 2014-01-28 19:44:58Z jianye $";
00004 #endif /* SKIP_DOXYGEN_PROCESSING */
00005 /* ===========================================================================
00006  *
00007  *                            PUBLIC DOMAIN NOTICE
00008  *               National Center for Biotechnology Information
00009  *
00010  *  This software/database is a "United States Government Work" under the
00011  *  terms of the United States Copyright Act.  It was written as part of
00012  *  the author's official duties as a United States Government employee and
00013  *  thus cannot be copyrighted.  This software/database is freely available
00014  *  to the public for use. The National Library of Medicine and the U.S.
00015  *  Government have not placed any restriction on its use or reproduction.
00016  *
00017  *  Although all reasonable efforts have been taken to ensure the accuracy
00018  *  and reliability of the software and data, the NLM and the U.S.
00019  *  Government do not and cannot warrant the performance or results that
00020  *  may be obtained by using this software or data. The NLM and the U.S.
00021  *  Government disclaim all warranties, express or implied, including
00022  *  warranties of performance, merchantability or fitness for any particular
00023  *  purpose.
00024  *
00025  *  Please cite the author in any work or product based on this material.
00026  *
00027  * ===========================================================================
00028  *
00029  * Author:  Ning Ma
00030  *
00031  */
00032 
00033 /** @file igblast.cpp
00034  * Implementation of CIgBlast.
00035  */
00036 
00037 #include <ncbi_pch.hpp>
00038 #include <algo/blast/igblast/igblast.hpp>
00039 #include <algo/blast/api/local_blast.hpp>
00040 #include <algo/blast/api/bl2seq.hpp>
00041 #include <algo/blast/api/remote_blast.hpp>
00042 #include <algo/blast/api/objmgr_query_data.hpp>
00043 #include <objtools/alnmgr/alnmap.hpp>
00044 #include <algo/blast/composition_adjustment/composition_constants.h>
00045 #include <objmgr/object_manager.hpp>
00046 
00047 
00048 /** @addtogroup AlgoBlast
00049  *
00050  * @{
00051  */
00052 
00053 BEGIN_NCBI_SCOPE
00054 USING_SCOPE(objects);
00055 BEGIN_SCOPE(blast)
00056 
// Distance limits used by CIgBlast::x_FindDJAln when filtering D and J
// alignments relative to the V end position on the query (q_ve).
// Units are query coordinates -- presumably bases; confirm against callers.

// Initial limit for how far past the V end a J alignment may start.
static int max_allowed_VJ_distance_with_D = 90;
// Limit used instead once x_FindDJAln has discarded every D alignment
// because of the query chain type.
static int max_allowed_VJ_distance_without_D = 40;
// Limit for how far from the V end a D alignment may lie ("v end test").
static int max_allowed_VD_distance = 40;
00060 
00061 static void s_ReadLinesFromFile(const string& fn, vector<string>& lines)
00062 {
00063     CNcbiIfstream fs(fn.c_str(), IOS_BASE::in);
00064     lines.clear();
00065 
00066     if (CFile(fn).Exists() && ! fs.fail()) {
00067         char line[256];
00068         while(true) {
00069             fs.getline(line, 256);
00070             if (fs.eof()) break;
00071             if (line[0] == '#') continue;
00072             string l(line);
00073             lines.push_back(l);
00074         }
00075     }
00076     fs.close();
00077 };
00078 
00079 CIgAnnotationInfo::CIgAnnotationInfo(CConstRef<CIgBlastOptions> &ig_opt)
00080 {
00081     vector<string> lines;
00082 
00083     // read domain info from pdm or ndm file
00084     const string suffix = (ig_opt->m_IsProtein) ? ".pdm." : ".ndm.";
00085     string fn(SeqDB_ResolveDbPath(ig_opt->m_IgDataPath + "/" + ig_opt->m_Origin + "/" 
00086                                + ig_opt->m_Origin + suffix + ig_opt->m_DomainSystem));
00087     if (fn == "") {
00088         NCBI_THROW(CBlastException,  eInvalidArgument, 
00089               "Domain annotation data file could not be found in [internal_data] directory");
00090     }
00091     s_ReadLinesFromFile(fn, lines);
00092     int index = 0;
00093     ITERATE(vector<string>, l, lines) {
00094         vector<string> tokens;
00095         NStr::Tokenize(*l, " \t\n\r", tokens, NStr::eMergeDelims);
00096         if (!tokens.empty()) {
00097             m_DomainIndex[tokens[0]] = index;
00098             for (int i=1; i<11; ++i) {
00099                 m_DomainData.push_back(NStr::StringToInt(tokens[i]));
00100             }
00101             index += 10;
00102             m_DomainChainType[tokens[0]] = tokens[11];
00103             int frame = NStr::StringToInt(tokens[12]);
00104             if (frame != -1) {
00105                 m_FrameOffset[tokens[0]] = frame;
00106             }
00107         } 
00108     }
00109 
00110     // read frame info from aux files
00111     if (ig_opt->m_IsProtein) return;
00112     fn = ig_opt->m_AuxFilename;
00113     s_ReadLinesFromFile(fn, lines);
00114     if (lines.size() == 0) {
00115         ERR_POST(Warning << "Auxilary data file could not be found");
00116     }
00117     ITERATE(vector<string>, l, lines) {
00118         vector<string> tokens;
00119         NStr::Tokenize(*l, " \t\n\r", tokens, NStr::eMergeDelims);
00120         if (!tokens.empty()) {
00121             int frame = NStr::StringToInt(tokens[1]);
00122             if (frame != -1) {
00123                 m_FrameOffset[tokens[0]] = frame;
00124             }
00125             if (tokens.size() == 3) { //just backward compatible as there was no such field
00126                 m_DJChainType[tokens[0]] = tokens[2];
00127             }
00128         }
00129     }
00130 };
00131 
00132 CRef<CSearchResultSet>
00133 CIgBlast::Run()
00134 {
00135     vector<CRef <CIgAnnotation> > annots;
00136     CRef<CSearchResultSet> final_results;
00137     CRef<IQueryFactory> qf;
00138     CRef<CBlastOptionsHandle> opts_hndl(CBlastOptionsFactory
00139            ::Create((m_IgOptions->m_IsProtein)? eBlastp: eBlastn));
00140     CRef<CSearchResultSet> results[4], result;
00141 
00142     /*** search V germline */
00143     {
00144         x_SetupVSearch(qf, opts_hndl);
00145         CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[0]);
00146         results[0] = blast.Run();
00147         x_ConvertResultType(results[0]);
00148         s_SortResultsByEvalue(results[0]);
00149         x_AnnotateV(results[0], annots);
00150     }
00151 
00152     /*** search internal V for domain annotation */
00153     {
00154         opts_hndl->SetHitlistSize(20);  // use a larger number to ensure annotation
00155         CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[3]);
00156         results[3] = blast.Run();
00157         s_SortResultsByEvalue(results[3]);
00158         x_AnnotateDomain(results[0], results[3], annots);
00159     }
00160 
00161     /*** search DJ germline */
00162     int num_genes =  (m_IgOptions->m_IsProtein) ? 1 : 3;
00163     if (num_genes > 1) {
00164         
00165         for (int gene = 1; gene < num_genes; ++gene) {
00166             x_SetupDJSearch(annots, qf, opts_hndl, gene);
00167             CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[gene]);
00168             try {
00169                 results[gene] = blast.Run();
00170                 x_ConvertResultType(results[gene]);
00171             } catch(...) {
00172                 num_genes = 1;
00173                 break;
00174             }
00175         }
00176         if (num_genes > 1) 
00177             x_AnnotateDJ(results[1], results[2], annots);
00178     }
00179 
00180     /*** collect germline search results */
00181     for (int gene = 0; gene  < num_genes; ++gene) {
00182         s_AppendResults(results[gene], m_IgOptions->m_NumAlign[gene], gene, final_results);
00183     }
00184 
00185     /*** search user specified db */
00186     bool skipped = false;
00187     if (m_IsLocal) {
00188         if (&(*m_LocalDb) != &(*(m_IgOptions->m_Db[0]))) {
00189             x_SetupDbSearch(annots, qf);
00190             CLocalBlast blast(qf, m_Options, m_LocalDb);
00191             blast.SetNumberOfThreads(m_NumThreads);
00192             result = blast.Run();
00193         } else {
00194             skipped = true;
00195         }
00196     } else {
00197         x_SetupDbSearch(annots, qf);
00198         CRef<CRemoteBlast> blast;
00199         if (m_RemoteDb.NotEmpty()) {
00200             _ASSERT(m_Subject.Empty());
00201             blast.Reset(new CRemoteBlast(qf, m_Options, *m_RemoteDb));
00202             if(m_EntrezQuery != NcbiEmptyString){
00203                 blast->SetEntrezQuery(m_EntrezQuery.c_str());
00204             }
00205         } else {
00206             blast.Reset(new CRemoteBlast(qf, m_Options, m_Subject));
00207         }
00208         result = blast->GetResultSet();
00209     }
00210     if (! skipped) {
00211         x_ConvertResultType(result);
00212         s_SortResultsByEvalue(result);
00213         s_AppendResults(result, -1, -1, final_results);
00214     }
00215 
00216     /*** set chain type info */
00217     x_SetChainType(final_results, annots);
00218 
00219     /*** attach annotation info back to the results */
00220     s_SetAnnotation(annots, final_results);
00221 
00222     return final_results;
00223 };
00224 
00225 void CIgBlast::x_SetupVSearch(CRef<IQueryFactory>       &qf,
00226                               CRef<CBlastOptionsHandle> &opts_hndl)
00227 {
00228     CBlastOptions & opts = opts_hndl->SetOptions();
00229     if (m_IgOptions->m_IsProtein) {
00230         opts.SetCompositionBasedStats(eNoCompositionBasedStats);
00231     } else {
00232         int penalty = m_Options->GetOptions().GetMismatchPenalty();
00233         opts.SetMatchReward(1);
00234         opts.SetMismatchPenalty(penalty);
00235         opts.SetWordSize(9);
00236         if (penalty == -1) {
00237             opts.SetGapOpeningCost(4);
00238             opts.SetGapExtensionCost(1);
00239         }
00240     }
00241     opts_hndl->SetEvalueThreshold(20.0);
00242     opts_hndl->SetFilterString("F");
00243     opts_hndl->SetHitlistSize(15+ m_IgOptions->m_NumAlign[0]);
00244     qf.Reset(new CObjMgr_QueryFactory(*m_Query));
00245 
00246 };
00247 
00248 void CIgBlast::x_SetupDJSearch(const vector<CRef <CIgAnnotation> > &annots,
00249                                CRef<IQueryFactory>           &qf,
00250                                CRef<CBlastOptionsHandle>     &opts_hndl,
00251                                int db_type)
00252 {
00253     // Only igblastn will search DJ
00254     CBlastOptions & opts = opts_hndl->SetOptions();
00255     opts.SetMatchReward(1);
00256     if (db_type == 2){ //J genes are longer so if can afford more reliable identification
00257         opts.SetWordSize(7);
00258         opts.SetMismatchPenalty(-3);
00259     } else {
00260         opts.SetWordSize(m_IgOptions->m_Min_D_match);
00261         opts.SetMismatchPenalty(m_IgOptions->m_D_penalty);
00262     }
00263 
00264     opts.SetGapOpeningCost(5);
00265     opts.SetGapExtensionCost(2);
00266     opts_hndl->SetEvalueThreshold((db_type == 2) ? 1000.0 : 100000.0);
00267     opts_hndl->SetFilterString("F");
00268     opts_hndl->SetHitlistSize(max(max(50, 
00269                m_IgOptions->m_NumAlign[1]), 
00270                m_IgOptions->m_NumAlign[2]));
00271 
00272     // Mask query for D, J search
00273     int iq = 0;
00274     ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
00275         CRef<CBlastSearchQuery> query = m_Query->GetBlastSearchQuery(iq);
00276         CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
00277         int len = query->GetLength();
00278         if ((*annot)->m_GeneInfo[0] == -1) {
00279             // This is not a germline sequence.  Mask it out
00280             TMaskedQueryRegions mask_list;
00281             CRef<CSeqLocInfo> mask(
00282                   new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
00283             mask_list.push_back(mask);
00284             m_Query->SetMaskedRegions(iq, mask_list);
00285         } else {
00286             // Excluding the V gene except the last 7 bp for D and J gene search;
00287             // also limit the J match to 150bp beyond V gene.
00288             bool ms = (*annot)->m_MinusStrand;
00289             int begin = (ms)? 
00290               (*annot)->m_GeneInfo[0] - 150: (*annot)->m_GeneInfo[1] - 7;
00291             int end = (ms)? 
00292               (*annot)->m_GeneInfo[0] + 7: (*annot)->m_GeneInfo[1] + 150;
00293             if (begin > 0) {
00294                 CRef<CSeqLocInfo> mask(
00295                   new CSeqLocInfo(new CSeq_interval(*q_id, 0, begin-1), 0));
00296                 m_Query->AddMask(iq, mask);
00297             }
00298             if (end < len) {
00299                 CRef<CSeqLocInfo> mask(
00300                   new CSeqLocInfo(new CSeq_interval(*q_id, end, len-1), 0));
00301                 m_Query->AddMask(iq, mask);
00302             }
00303         }
00304         ++iq;
00305     }
00306 
00307     // Generate query factory
00308     qf.Reset(new CObjMgr_QueryFactory(*m_Query));
00309 };
00310 
00311 void CIgBlast::x_SetupDbSearch(vector<CRef <CIgAnnotation> > &annots,
00312                                CRef<IQueryFactory>           &qf)
00313 {
00314     // Options already passed in as m_Options.  Only set up (mask) the query
00315     int iq = 0;
00316     ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
00317         CRef<CBlastSearchQuery> query = m_Query->GetBlastSearchQuery(iq);
00318         CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
00319         int len = query->GetLength();
00320         TMaskedQueryRegions mask_list;
00321         if ((*annot)->m_GeneInfo[0] ==-1) {
00322             // This is not a germline sequence.  Mask it out
00323             CRef<CSeqLocInfo> mask(
00324                       new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
00325             mask_list.push_back(mask);
00326         } else if (m_IgOptions->m_FocusV) {
00327             // Restrict to V gene 
00328             int begin = (*annot)->m_GeneInfo[0];
00329             int end = (*annot)->m_GeneInfo[1];
00330             if (begin > 0) {
00331                 CRef<CSeqLocInfo> mask(
00332                       new CSeqLocInfo(new CSeq_interval(*q_id, 0, begin-1), 0));
00333                 mask_list.push_back(mask);
00334             }
00335             if (end < len) {
00336                 CRef<CSeqLocInfo> mask(
00337                       new CSeqLocInfo(new CSeq_interval(*q_id, end, len-1), 0));
00338                 mask_list.push_back(mask);
00339             }
00340         }
00341         m_Query->SetMaskedRegions(iq, mask_list);
00342         ++iq;
00343     }
00344     qf.Reset(new CObjMgr_QueryFactory(*m_Query));
00345 };
00346 
00347 // Compare the second seqalign to see if it is as good as the first one
00348 static bool s_IsSeqAlignAsGood(const CRef<CSeq_align> &x, 
00349                                const CRef<CSeq_align> &y)
00350 {
00351     double sx, sy;
00352     x->GetNamedScore(CSeq_align::eScore_EValue, sx);
00353     y->GetNamedScore(CSeq_align::eScore_EValue, sy);
00354     if (sx < 0.999999 * sy || sy < 0.999999 * sx) return false;
00355     int ix, iy;
00356     x->GetNamedScore(CSeq_align::eScore_Score, ix);
00357     y->GetNamedScore(CSeq_align::eScore_Score, iy);
00358     if (ix > iy) return false;
00359     x->GetNamedScore(CSeq_align::eScore_IdentityCount, ix);
00360     y->GetNamedScore(CSeq_align::eScore_IdentityCount, iy);
00361     int dx, dy;
00362     dx = x->GetAlignLength();
00363     dy = y->GetAlignLength();
00364     return (ix*dy <= iy*dx);
00365 }
00366 
/// Strip a leading "lcl|" from a seq-id label.
///
/// @param sid  Seq-id label.
/// @return     sid without its first four characters when it starts with
///             "lcl|" (case-sensitive); otherwise sid unchanged.
static string s_RemoveLocalPrefix(const string & sid)
{
    static const char kLocalPrefix[] = "lcl|";
    const string::size_type kPrefixLen = sizeof(kLocalPrefix) - 1;

    const bool has_prefix = (sid.compare(0, kPrefixLen, kLocalPrefix) == 0);
    return has_prefix ? sid.substr(kPrefixLen) : sid;
}
00373 
00374 void CIgBlast::x_AnnotateV(CRef<CSearchResultSet>        &results, 
00375                            vector<CRef <CIgAnnotation> > &annots)
00376 {
00377     ITERATE(CSearchResultSet, result, *results) {
00378 
00379         CIgAnnotation *annot = new CIgAnnotation();
00380         annots.push_back(CRef<CIgAnnotation>(annot));
00381  
00382         if ((*result)->HasAlignments()) {
00383             const CSeq_align_set::Tdata & align_list = (*result)->GetSeqAlign()->Get();
00384             CRef<CSeq_align> align = align_list.front();
00385             annot->m_GeneInfo[0] = align->GetSeqStart(0);
00386             annot->m_GeneInfo[1] = align->GetSeqStop(0)+1;
00387             annot->m_TopGeneIds[0] = "";
00388 
00389             int ii=0;
00390             ITERATE(CSeq_align_set::Tdata, it, align_list) {
00391                 if (ii++ < m_IgOptions->m_NumAlign[0] && s_IsSeqAlignAsGood(align, (*it))) {
00392                     if (annot->m_TopGeneIds[0] != "") annot->m_TopGeneIds[0] += ",";
00393                     annot->m_TopGeneIds[0] += s_RemoveLocalPrefix((*it)->GetSeq_id(1).AsFastaString());
00394                 } else break;
00395             }
00396         } 
00397     }
00398 };
00399 
00400 // Test if the alignment is already in the align_list
00401 static bool s_SeqAlignInSet(CSeq_align_set::Tdata & align_list, CRef<CSeq_align> &align)
00402 {
00403     ITERATE(CSeq_align_set::Tdata, it, align_list) {
00404         if ((*it)->GetSeq_id(1).Match(align->GetSeq_id(1)) &&
00405             (*it)->GetSeqStart(1) == align->GetSeqStart(1) &&
00406             (*it)->GetSeqStop(1) == align->GetSeqStop(1)) return true;
00407     }
00408     return false;
00409 };
00410 
00411 // Compare two seqaligns according to their evalue and coverage
00412 static bool s_CompareSeqAlignByEvalue(const CRef<CSeq_align> &x, 
00413                                       const CRef<CSeq_align> &y)
00414 {
00415     double sx, sy;
00416     x->GetNamedScore(CSeq_align::eScore_EValue, sx);
00417     y->GetNamedScore(CSeq_align::eScore_EValue, sy);
00418     if (sx < 0.999999 * sy) return true;
00419     if (sy < 0.999999 * sx) return false;
00420     int ix, iy;
00421     x->GetNamedScore(CSeq_align::eScore_Score, ix);
00422     y->GetNamedScore(CSeq_align::eScore_Score, iy);
00423     if (ix != iy) return (ix > iy);
00424     x->GetNamedScore(CSeq_align::eScore_IdentityCount, ix);
00425     y->GetNamedScore(CSeq_align::eScore_IdentityCount, iy);
00426     int dx, dy;
00427     dx = x->GetAlignLength();
00428     dy = y->GetAlignLength();
00429     return (ix*dy >= iy*dx);
00430 };
00431 
00432 // Compare two seqaligns according to their evalue and coverage
00433 static bool s_CompareSeqAlignByScore(const CRef<CSeq_align> &x, const CRef<CSeq_align> &y)
00434 {
00435     int sx, sy;
00436     x->GetNamedScore(CSeq_align::eScore_Score, sx);
00437     y->GetNamedScore(CSeq_align::eScore_Score, sy);
00438     if (sx != sy) return (sx > sy);
00439     x->GetNamedScore(CSeq_align::eScore_IdentityCount, sx);
00440     y->GetNamedScore(CSeq_align::eScore_IdentityCount, sy);
00441     return (sx <= sy);
00442 };
00443 
00444 // Test if D and J annotation not compatible
00445 static bool s_DJNotCompatible(const CSeq_align &d, const CSeq_align &j, bool ms)
00446 {
00447     int ds = d.GetSeqStart(0);
00448     int de = d.GetSeqStop(0);
00449     int js = j.GetSeqStart(0);
00450     int je = j.GetSeqStop(0);
00451     if (ms) {
00452         if (ds < js + 3 || de < je + 3) return true;
00453     } else { 
00454         if (ds > js - 3 || de > je - 3) return true;
00455     }
00456     return false;
00457 };
00458 
00459 /*
00460 static bool s_IsTopMatchJD(CSearchResults& res_J, CIgAnnotationInfo& annotation_info){
00461     bool result = true; //default
00462     CRef<CSeq_align_set> align_J;
00463     if (res_J.HasAlignments()) {
00464         align_J.Reset(const_cast<CSeq_align_set *>
00465                       (&*(res_J.GetSeqAlign())));
00466         CSeq_align_set::Tdata & align_list = align_J->Set();
00467         CSeq_align_set::Tdata::iterator it = align_list.begin();
00468         int prev_score = 0;
00469         result = false;
00470         while (it != align_list.end()) {
00471             int current_score;
00472             (*it)->GetNamedScore(CSeq_align::eScore_Score, current_score);
00473             if(current_score >= prev_score){
00474                 string j_id;
00475                 (*it)->GetSeq_id(1).GetLabel(&j_id, CSeq_id::eContent);
00476                 string j_chain_type = annotation_info.GetDJChainType(j_id);
00477                 if (j_chain_type == "N/A"){
00478                     //assume J gene id style 
00479                     
00480                     string sid = NStr::ToUpper(j_id);
00481                     if (sid.substr(0, 2) == "TR" && sid[3] == 'J') {
00482                         j_chain_type = "J" + sid.substr(2,1);
00483                     } else if (sid[0] == 'J') {
00484                         j_chain_type = sid.substr(0,2);
00485                     }
00486                 }
00487                 if (j_chain_type == "JD"){
00488                     result = true;
00489                     break;
00490                 }
00491                 
00492             } else {
00493                 break;
00494             } 
00495             prev_score = current_score;
00496             ++it;
00497         }
00498            
00499     }
00500     return result;
00501 };
00502 */
00503 void CIgBlast::x_FindDJAln(CRef<CSeq_align_set>& align_D,
00504                            CRef<CSeq_align_set>& align_J,
00505                            string q_ct,
00506                            bool q_ms,
00507                            ENa_strand q_st,
00508                            int q_ve,
00509                            int iq,
00510                            bool va_or_vd_as_heavy_chain) {
00511     
00512     int allowed_VJ_distance = max_allowed_VJ_distance_with_D;
00513         /* preprocess D */
00514         if (align_D && !align_D->Get().empty()) {
00515             CSeq_align_set::Tdata & align_list = align_D->Set();
00516             CSeq_align_set::Tdata::iterator it = align_list.begin();
00517             /* chain type test */
00518             if (q_ct!="VH" && q_ct!="VD" && q_ct!="VA" && q_ct!="VB" && q_ct!="N/A") {
00519                 while (it != align_list.end()) {
00520                     it = align_list.erase(it);
00521                 }
00522                 allowed_VJ_distance = max_allowed_VJ_distance_without_D;
00523             } else if (q_ct =="VA" || q_ct =="VD") {
00524                 if (va_or_vd_as_heavy_chain) {
00525                     //VA could behave like VD and is allowed to rearrange to JA or DD/JD
00526                     q_ct = "VD";
00527                     //annot->m_ChainType[0] = "VD";
00528                 } else {
00529                     q_ct = "VA";
00530                     while (it != align_list.end()) {
00531                         it = align_list.erase(it);
00532                     } 
00533                     allowed_VJ_distance = max_allowed_VJ_distance_without_D;
00534                 }
00535             }
00536             //test compatability between V and D
00537             it = align_list.begin();
00538             while (it != align_list.end()) {
00539                 bool keep = true;
00540                 /* chain type test */
00541                 if (q_ct!="N/A") {
00542                     char s_ct = q_ct[1];
00543                     string d_id;
00544                     (*it)->GetSeq_id(1).GetLabel(&d_id, CSeq_id::eContent);
00545                     string d_chain_type = m_AnnotationInfo.GetDJChainType(d_id);
00546                     if (d_chain_type != "N/A"){
00547                         if (d_chain_type[1] != q_ct[1]) keep = false;
00548                     } else { //assume D gene id style 
00549                         string sid = (*it)->GetSeq_id(1).AsFastaString();
00550                         sid = NStr::ToUpper(sid);
00551                         if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
00552                         if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
00553                             && sid[3] == 'D') {
00554                             s_ct = sid[2];
00555                         }
00556                         if (s_ct!='B' && s_ct!='D') s_ct = q_ct[1];
00557                         if (s_ct != q_ct[1]) keep = false;
00558                     }
00559                 }
00560                 
00561                 /* remove failed seq_align */
00562                 if (!keep) it = align_list.erase(it);
00563                 else ++it;
00564             }
00565 
00566 
00567             /* strand test */
00568             bool strand_found = false;
00569             ITERATE(CSeq_align_set::Tdata, it, align_list) {
00570                 if ((*it)->GetSeqStrand(0) == q_st) {
00571                     strand_found = true;
00572                     break;
00573                 }
00574             }
00575             if (strand_found) {
00576                 it = align_list.begin();
00577                 while (it != align_list.end()) {
00578                     if ((*it)->GetSeqStrand(0) != q_st) {
00579                         it = align_list.erase(it);
00580                     } else ++it;
00581                 }
00582             }
00583             /* v end test */
00584             it = align_list.begin();
00585             while (it != align_list.end()) {
00586                 bool keep = false;
00587                 int q_ds = (*it)->GetSeqStart(0);
00588                 int q_de = (*it)->GetSeqStop(0);
00589                 if (q_ms) keep = (q_de >= q_ve - max_allowed_VD_distance && q_ds <= q_ve - 3);
00590                 else      keep = (q_ds <= q_ve + max_allowed_VD_distance && q_de >= q_ve + 3);
00591                 if (!keep) it = align_list.erase(it);
00592                 else ++it;
00593             }
00594             /* sort according to score */
00595             align_list.sort(s_CompareSeqAlignByScore);
00596         }
00597 
00598         /* preprocess J */
00599         if (align_J && !align_J->Get().empty()) {
00600             CSeq_align_set::Tdata & align_list = align_J->Set();
00601             CSeq_align_set::Tdata::iterator it = align_list.begin();
00602             while (it != align_list.end()) {
00603                 bool keep = true;
00604                 /* chain type test */
00605                 if (q_ct!="N/A") {
00606                     char s_ct = q_ct[1];
00607                     string j_id;
00608                     (*it)->GetSeq_id(1).GetLabel(&j_id, CSeq_id::eContent);
00609                     string j_chain_type = m_AnnotationInfo.GetDJChainType(j_id);
00610                     if (j_chain_type != "N/A"){
00611                         if (j_chain_type[1] != q_ct[1]) keep = false;
00612                     } else { //assume J gene id style 
00613                         string sid = (*it)->GetSeq_id(1).AsFastaString();
00614                         sid = NStr::ToUpper(sid);
00615                         if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
00616                         if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
00617                             && sid[3] == 'J') {
00618                             s_ct = sid[2];
00619                         } else if (sid[0] == 'J') {
00620                             s_ct = sid[1];
00621                         }
00622                         if (s_ct!='H' && s_ct!='L' && s_ct!='K' &&
00623                             s_ct!='A' && s_ct!='B' && s_ct!='D' && s_ct!='G') s_ct = q_ct[1];
00624                         if (s_ct != q_ct[1]) keep = false;
00625                     }
00626                 }
00627                 /* strand test */
00628                 if ((*it)->GetSeqStrand(0) != q_st) keep = false;
00629                 /* subject start test */
00630                 if ((*it)->GetSeqStart(1) > 32) keep = false;
00631                 /* v end test */
00632                 int q_js = (*it)->GetSeqStart(0);
00633                 int q_je = (*it)->GetSeqStop(0);
00634                 if (q_ms) { 
00635                     if (q_je < q_ve - allowed_VJ_distance  || q_js > q_ve) keep = false;
00636                 } else {
00637                     if (q_js > q_ve + allowed_VJ_distance || q_je < q_ve) keep = false;
00638                 }
00639                 /* remove failed seq_align */
00640                 if (!keep) it = align_list.erase(it);
00641                 else ++it;
00642             }
00643             /* sort according to score */
00644             align_list.sort(s_CompareSeqAlignByScore);
00645         }
00646 
00647         /* which one to keep, D or J? */
00648         if (align_D.NotEmpty() && !align_D->IsEmpty() &&
00649             align_J.NotEmpty() && !align_J->IsEmpty()) {
00650             CSeq_align_set::Tdata & al_D = align_D->Set();
00651             CSeq_align_set::Tdata & al_J = align_J->Set();
00652             CSeq_align_set::Tdata::iterator it;
00653             bool keep_J = s_CompareSeqAlignByScore(*(al_J.begin()), *(al_D.begin()));
00654             if (keep_J) {
00655                 it = al_D.begin();
00656                 while (it != al_D.end()) {
00657                     if (s_DJNotCompatible(**it, **(al_J.begin()), q_ms)) {
00658                         it = al_D.erase(it);
00659                     } else ++it;
00660                 }
00661 
00662                 if (align_D.NotEmpty() && !align_D->IsEmpty()){
00663                     it = al_J.begin();
00664                     while (it != al_J.end()) {
00665                         if (s_DJNotCompatible(**(al_D.begin()), **it, q_ms)) {
00666                             it = al_J.erase(it);
00667                         } else ++it;
00668                     }
00669                 }
00670             } else {
00671                 it = al_J.begin();
00672                 while (it != al_J.end()) {
00673                     if (s_DJNotCompatible(**(al_D.begin()), **it, q_ms)) {
00674                         it = al_J.erase(it);
00675                     } else ++it;
00676                 }
00677                 if (align_J.NotEmpty() && !align_J->IsEmpty()) {
00678                     it = al_D.begin();
00679                     while (it != al_D.end()) {
00680                         if (s_DJNotCompatible(**it, **(al_J.begin()), q_ms)) {
00681                             it = al_D.erase(it);
00682                         } else ++it;
00683                     }
00684                     
00685                 }
00686             }
00687         }
00688                    
00689 
00690 }
00691 
00692 
00693 void CIgBlast::x_FindDJ(CRef<CSearchResultSet>& results_D,
00694                         CRef<CSearchResultSet>& results_J,
00695                         CRef <CIgAnnotation>& annot,
00696                         CRef<CSeq_align_set>& align_D,
00697                         CRef<CSeq_align_set>& align_J,
00698                         string q_ct,
00699                         bool q_ms,
00700                         ENa_strand q_st,
00701                         int q_ve,
00702                         int iq) {
00703     
00704     CRef<CSeq_align_set> original_align_D(new CSeq_align_set);
00705     CRef<CSeq_align_set> original_align_J(new CSeq_align_set);
00706     
00707         /* preprocess D */
00708         CSearchResults& res_D = (*results_D)[iq];
00709         if (res_D.HasAlignments()) {
00710 
00711             align_D.Reset(const_cast<CSeq_align_set *>
00712                                            (&*(res_D.GetSeqAlign())));
00713             original_align_D->Assign(*align_D);
00714 ;
00715         }
00716 
00717         /* preprocess J */
00718         CSearchResults& res_J = (*results_J)[iq];
00719         if (res_J.HasAlignments()) {
00720             align_J.Reset(const_cast<CSeq_align_set *>
00721                                            (&*(res_J.GetSeqAlign())));
00722             original_align_J->Assign(*align_J);
00723            
00724         } 
00725         //try as VA
00726         x_FindDJAln(align_D, align_J, q_ct, q_ms, q_st, q_ve, iq, false);
00727         if (q_ct =="VA" || q_ct =="VD") {
00728             annot->m_ChainType[0] = "VA";
00729             //try as VD
00730             x_FindDJAln(original_align_D, original_align_J, q_ct, q_ms, q_st, q_ve, iq, true);
00731             int as_heavy_chain_score = 0;
00732             int as_light_chain_score = 0;
00733             int d_score = 0;
00734             if(original_align_J.NotEmpty() && !original_align_J->Get().empty()){
00735                 original_align_J->Get().front()->GetNamedScore(CSeq_align::eScore_Score, as_heavy_chain_score);
00736             }
00737            
00738             if(original_align_D.NotEmpty() && !original_align_D->Get().empty()){
00739                 original_align_D->Get().front()->GetNamedScore(CSeq_align::eScore_Score, d_score);
00740             }
00741             if (align_J.NotEmpty() && !align_J->Get().empty()){
00742                 align_J->Get().front()->GetNamedScore(CSeq_align::eScore_Score, as_light_chain_score);
00743             }
00744             if (as_heavy_chain_score + d_score> as_light_chain_score){
00745                 if (align_D.NotEmpty() && original_align_D.NotEmpty()){
00746                     align_D->Assign(*original_align_D);
00747                 }
00748                 if (align_J.NotEmpty() && original_align_J.NotEmpty()){
00749                     align_J->Assign(*original_align_J);
00750                 }
00751                 
00752                 annot->m_ChainType[0] = "VD";
00753             }
00754             
00755         }
00756         
00757 }
00758 
00759 void CIgBlast::x_AnnotateDJ(CRef<CSearchResultSet>        &results_D,
00760                             CRef<CSearchResultSet>        &results_J,
00761                             vector<CRef <CIgAnnotation> > &annots)
00762 {
00763     int iq = 0;
00764     NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
00765 
00766         string q_ct = (*annot)->m_ChainType[0];
00767         bool q_ms = (*annot)->m_MinusStrand;
00768         ENa_strand q_st = (q_ms) ? eNa_strand_minus : eNa_strand_plus;
00769         int q_ve = (q_ms) ? (*annot)->m_GeneInfo[0] : (*annot)->m_GeneInfo[1] - 1;
00770 
00771         CRef<CSeq_align_set> align_D, align_J;
00772 
00773         x_FindDJ( results_D, results_J, *annot, align_D, align_J, q_ct, q_ms, q_st, q_ve, iq); 
00774 
00775        
00776         /* annotate D */    
00777         if (align_D.NotEmpty() && !align_D->IsEmpty()) {
00778             const CSeq_align_set::Tdata & align_list = align_D->Get();
00779             CRef<CSeq_align> align = align_list.front();
00780             (*annot)->m_GeneInfo[2] = align->GetSeqStart(0);
00781             (*annot)->m_GeneInfo[3] = align->GetSeqStop(0)+1;
00782             (*annot)->m_TopGeneIds[1] = "";
00783 
00784             int ii=0;
00785             ITERATE(CSeq_align_set::Tdata, it, align_list) {
00786                 if (ii++ < m_IgOptions->m_NumAlign[1] && s_IsSeqAlignAsGood(align, (*it))) {
00787                     if ((*annot)->m_TopGeneIds[1] != "") (*annot)->m_TopGeneIds[1] += ",";
00788                     (*annot)->m_TopGeneIds[1] += s_RemoveLocalPrefix((*it)->GetSeq_id(1).AsFastaString());
00789                 } else break;
00790             }
00791         }
00792             
00793         /* annotate J */    
00794         if (align_J.NotEmpty() && !align_J->IsEmpty()) {
00795             const CSeq_align_set::Tdata & align_list = align_J->Get();
00796             CRef<CSeq_align> align = align_list.front();
00797             (*annot)->m_GeneInfo[4] = align->GetSeqStart(0);
00798             (*annot)->m_GeneInfo[5] = align->GetSeqStop(0)+1;
00799             string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
00800             int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
00801             if (frame_offset >= 0) {
00802                 int frame_adj = (align->GetSeqStart(1) + 3 - frame_offset) % 3;
00803                 (*annot)->m_FrameInfo[2] = (q_ms) ?
00804                                            align->GetSeqStop(0)  + frame_adj 
00805                                          : align->GetSeqStart(0) - frame_adj;
00806             } 
00807             (*annot)->m_TopGeneIds[2] = "";
00808 
00809             int ii=0;
00810             ITERATE(CSeq_align_set::Tdata, it, align_list) {
00811                 if (ii++ < m_IgOptions->m_NumAlign[2] && s_IsSeqAlignAsGood(align, (*it))) {
00812                     if ((*annot)->m_TopGeneIds[2] != "") (*annot)->m_TopGeneIds[2] += ",";
00813                     (*annot)->m_TopGeneIds[2] += s_RemoveLocalPrefix((*it)->GetSeq_id(1).AsFastaString());
00814                 } else break;
00815             }
00816         }
00817      
00818         /* next set of results */
00819         ++iq;
00820     }
00821 };
00822 
00823 // query chain type and domain is annotated by germline alignment
00824 void CIgBlast::x_AnnotateDomain(CRef<CSearchResultSet>        &gl_results,
00825                                 CRef<CSearchResultSet>        &dm_results, 
00826                                 vector<CRef <CIgAnnotation> > &annots)
00827 {
00828     CRef<CObjectManager> mgr = CObjectManager::GetInstance();
00829     CScope scope_q(*mgr), scope_s(*mgr);
00830     CRef<CSeqDB> db_V, db_domain;
00831     bool annotate_subject = false;
00832     if (m_IgOptions->m_Db[0]->IsBlastDb()) {
00833         string db_name_V = m_IgOptions->m_Db[0]->GetDatabaseName(); 
00834         string db_name_domain = m_IgOptions->m_Db[3]->GetDatabaseName(); 
00835         CSeqDB::ESeqType db_type = (m_IgOptions->m_IsProtein)? 
00836                                    CSeqDB::eProtein : CSeqDB::eNucleotide;
00837         db_V.Reset(new CSeqDB(db_name_V, db_type));
00838         if (db_name_V == db_name_domain) {
00839             db_domain.Reset(&(*db_V));
00840         } else {
00841             db_domain.Reset(new CSeqDB(db_name_domain, db_type));
00842         }
00843         annotate_subject = true;
00844     }
00845 
00846     int iq = 0;
00847     ITERATE(CSearchResultSet, result, *dm_results) {
00848 
00849         CIgAnnotation *annot = &*(annots[iq]);
00850         annot->m_ChainType.push_back("NON");  // Assuming non-ig sequence first
00851         annot->m_ChainTypeToShow = "NON";
00852         if ((*result)->HasAlignments() && (*gl_results)[iq].HasAlignments()) {
00853 
00854 
00855             CConstRef<CSeq_align> master_align = 
00856                             (*gl_results)[iq].GetSeqAlign()->Get().front();
00857             CAlnMap q_map(master_align->GetSegs().GetDenseg());
00858 
00859             if (master_align->GetSeqStrand(0) == eNa_strand_minus) {
00860                 annot->m_MinusStrand = true;
00861             }
00862 
00863             int q_ends[2], q_dir;
00864 
00865             if (annot->m_MinusStrand) {
00866                 q_ends[1] = master_align->GetSeqStart(0);
00867                 q_ends[0] = master_align->GetSeqStop(0);
00868                 q_dir = -1;
00869 
00870             } else {
00871                 q_ends[0] = master_align->GetSeqStart(0);
00872                 q_ends[1] = master_align->GetSeqStop(0);
00873                 q_dir = 1;
00874             }
00875 
00876             const CSeq_align_set::Tdata & align_list = (*result)->GetSeqAlign()->Get();
00877 
00878             ITERATE(CSeq_align_set::Tdata, it, align_list) {
00879 
00880                 string sid = s_RemoveLocalPrefix((*it)->GetSeq_id(1).AsFastaString());
00881                 annot->m_ChainType[0] = m_AnnotationInfo.GetDomainChainType(sid);
00882                 annot->m_ChainTypeToShow = annot->m_ChainType[0];
00883                 int domain_info[10];
00884 
00885                 if (m_AnnotationInfo.GetDomainInfo(sid, domain_info)) {
00886 
00887 
00888                     CAlnMap s_map((*it)->GetSegs().GetDenseg());
00889                     int s_start = (*it)->GetSeqStart(1);
00890                     int s_stop = (*it)->GetSeqStop(1);
00891 
00892                     CRef<CAlnMap> d_map;
00893                     int d_start = -1;
00894                     int d_stop = -1;
00895 
00896                     int start, stop;
00897 
00898                     if (annotate_subject) {
00899                         CRef<CBioseq> seq_q = db_domain->SeqidToBioseq((*it)->GetSeq_id(1));
00900                         CBioseq_Handle hdl_q = scope_q.AddBioseq(*seq_q);
00901                         CRef<CBioseq> seq_s = db_V->SeqidToBioseq(master_align->GetSeq_id(1));
00902                         CBioseq_Handle hdl_s = scope_s.AddBioseq(*seq_s);
00903                         CSeq_loc query, subject;
00904                         query.SetWhole();
00905                         query.SetId((*it)->GetSeq_id(1));
00906                         subject.SetWhole();
00907                         subject.SetId(master_align->GetSeq_id(1));
00908                         SSeqLoc q_loc(&query, &scope_q);
00909                         SSeqLoc s_loc(&subject, &scope_s);
00910                         CBl2Seq bl2seq(q_loc, s_loc, (m_IgOptions->m_IsProtein)? eBlastp: eBlastn);
00911                         const CSearchResults& result = (*(bl2seq.RunEx()))[0];
00912                         if (result.HasAlignments()) {
00913                             CConstRef<CSeq_align> subject_align = result.GetSeqAlign()->Get().front();
00914                             d_map.Reset(new CAlnMap(subject_align->GetSegs().GetDenseg()));
00915                             d_start = subject_align->GetSeqStart(0);
00916                             d_stop = subject_align->GetSeqStop(0);
00917                         }
00918                         scope_q.RemoveBioseq(hdl_q);
00919                         scope_s.RemoveBioseq(hdl_s);
00920                     }
00921 
00922                     for (int i =0; i<10; i+=2) {
00923 
00924                         start = domain_info[i] - 1;
00925                         stop = domain_info[i+1] - 1;
00926 
00927                         if (start <= d_stop && stop >= d_start) {
00928                             int start_copy = start;
00929                             int stop_copy = stop;
00930                             if (start_copy < d_start) start_copy = d_start;
00931                             if (stop_copy > d_stop) stop_copy = d_stop;
00932                             if (start_copy <= stop_copy) {
00933                                 if (i>0 && annot->m_DomainInfo_S[i-1]>=0) {
00934                                     annot->m_DomainInfo_S[i] = annot->m_DomainInfo_S[i-1] + 1;
00935                                 } else {
00936                                     annot->m_DomainInfo_S[i] = 
00937                                        d_map->GetSeqPosFromSeqPos(1, 0, start_copy, IAlnExplorer::eForward);
00938                                 }
00939                                 annot->m_DomainInfo_S[i+1] = 
00940                                    d_map->GetSeqPosFromSeqPos(1, 0, stop_copy, IAlnExplorer::eBackwards);
00941                             }
00942                         }
00943                     
00944                         if (start > s_stop || stop < s_start) continue;
00945 
00946                         if (start < s_start) start = s_start;
00947 
00948                         if (stop > s_stop) stop = s_stop;
00949 
00950                         if (start > stop) continue;
00951 
00952                         start = s_map.GetSeqPosFromSeqPos(0, 1, start, IAlnExplorer::eForward);
00953                         stop = s_map.GetSeqPosFromSeqPos(0, 1, stop, IAlnExplorer::eBackwards);
00954 
00955                         if ((start - q_ends[1])*q_dir > 0 || (stop - q_ends[0])*q_dir < 0) continue;
00956 
00957                         if ((start - q_ends[0])*q_dir < 0) start = q_ends[0];
00958 
00959                         if ((stop - q_ends[1])*q_dir > 0) stop = q_ends[1];
00960 
00961                         if ((start - stop)*q_dir > 0) continue;
00962 
00963                         start = q_map.GetSeqPosFromSeqPos(1, 0, start, IAlnExplorer::eForward);
00964                         stop = q_map.GetSeqPosFromSeqPos(1, 0, stop, IAlnExplorer::eBackwards);
00965 
00966                         start = q_map.GetSeqPosFromSeqPos(0, 1, start);
00967                         stop = q_map.GetSeqPosFromSeqPos(0, 1, stop);
00968  
00969                         if ((start - stop)*q_dir > 0) continue;
00970 
00971                         annot->m_DomainInfo[i] = start;
00972                         annot->m_DomainInfo[i+1] = stop;
00973                     }
00974 
00975                     // any extra alignments after FWR3 are attributed to CDR3
00976                     start = annot->m_DomainInfo[9];
00977 
00978                     if (start >= 0 && (start - q_ends[1])*q_dir < 0) {
00979                         start = q_map.GetSeqPosFromSeqPos(1, 0, start+q_dir, IAlnExplorer::eForward);
00980                         start = q_map.GetSeqPosFromSeqPos(0, 1, start);
00981  
00982                         if ((start - q_ends[1])*q_dir <= 0) {
00983                             annot->m_DomainInfo[10] = start;
00984                             annot->m_DomainInfo[11] = q_ends[1];
00985                         }
00986                     }
00987 
00988                     // extension of the first and last annotated domain (if any)
00989                     int i = 0;
00990                     while (i<10 && annot->m_DomainInfo[i] < 0) i+=2;
00991                     annot->m_DomainInfo[i] += (domain_info[i] - 1 -
00992                                        s_map.GetSeqPosFromSeqPos(1, 0, annot->m_DomainInfo[i],
00993                                                                  IAlnExplorer::eBackwards))*q_dir;
00994                     if (annot->m_DomainInfo[i] < 0) annot->m_DomainInfo[i] = 0;
00995                     i+=2;
00996                     while (i<10 && annot->m_DomainInfo[i] >=0) {
00997                         annot->m_DomainInfo[i] = annot->m_DomainInfo[i-1] + q_dir;
00998                         i+=2;
00999                     }
01000                     i = 9;
01001                     while (i>0 && annot->m_DomainInfo[i] < 0) i-=2;
01002                     if (i >= 0) {
01003                         annot->m_DomainInfo[i] += (domain_info[i] - 1 -
01004                                                    s_map.GetSeqPosFromSeqPos(1, 0, annot->m_DomainInfo[i],
01005                                                                              IAlnExplorer::eForward))*q_dir;
01006                         if (annot->m_DomainInfo[i] < 0) annot->m_DomainInfo[i] = 0;
01007                     }
01008 
01009                  
01010                     // annotate the query frame offset
01011                     int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
01012                     if (frame_offset >= 0) {
01013                         int q_start = (*it)->GetSeqStart(0); 
01014                         int q_stop = (*it)->GetSeqStop(0); 
01015                         int q_mid = q_start + q_stop;
01016                         int q_dif = q_stop - q_start;
01017                         int frame_adj = (3 - ((*it)->GetSeqStart(1) + 3 - frame_offset) % 3) %3;
01018                         annot->m_FrameInfo[0] = (q_mid - q_dir *q_dif)/2 + q_dir * frame_adj;
01019                         frame_adj = ((*it)->GetSeqStop(1) + 3 - frame_offset) % 3;
01020                         annot->m_FrameInfo[1] = (q_mid + q_dir *q_dif)/2 - q_dir * frame_adj;
01021                     }
01022                     break;
01023 
01024                 }
01025             }
01026         }
01027         ++iq;
01028     }
01029 };
01030 
01031 void CIgBlast::x_SetChainType(CRef<CSearchResultSet> &results,
01032                               vector<CRef <CIgAnnotation> > &annots)
01033 {
01034     int iq = 0;
01035     ITERATE(CSearchResultSet, result, *results) {
01036 
01037         CIgAnnotation *annot = &*(annots[iq++]);
01038 
01039         if ((*result)->HasAlignments()) {
01040             int num_aligns = (*result)->GetSeqAlign()->Size();
01041             CIgBlastResults *ig_result = dynamic_cast<CIgBlastResults *>
01042                                      (const_cast<CSearchResults *>(&**result));
01043             for (int i=0; i<ig_result->m_NumActualV; ++i, --num_aligns) {
01044                  annot->m_ChainType.push_back("V");
01045             }
01046             for (int i=0; i<ig_result->m_NumActualD; ++i, --num_aligns) {
01047                  annot->m_ChainType.push_back("D");
01048             }
01049             for (int i=0; i<ig_result->m_NumActualJ; ++i, --num_aligns) {
01050                  annot->m_ChainType.push_back("J");
01051             }
01052             for (int i=0; i<num_aligns; ++i) {
01053                  annot->m_ChainType.push_back("N/A");
01054             }
01055         }
01056     }
01057 };
01058 
01059 void CIgBlast::s_SortResultsByEvalue(CRef<CSearchResultSet>& results)
01060 {
01061     ITERATE(CSearchResultSet, result, *results) {
01062         if ((*result)->HasAlignments()) {
01063             CRef<CSeq_align_set> align(const_cast<CSeq_align_set *>
01064                                    (&*((*result)->GetSeqAlign())));
01065             CSeq_align_set::Tdata & align_list = align->Set();
01066             align_list.sort(s_CompareSeqAlignByEvalue);
01067         }
01068     }
01069 };
01070 
01071 // convert sequencecomparison to database mode
01072 void CIgBlast::x_ConvertResultType(CRef<CSearchResultSet> &result) 
01073 {
01074     if (result->GetResultType() != eSequenceComparison) {
01075         return;
01076     }
01077 
01078     int num_queries = m_Query->Size();
01079     int num_results = result->GetNumResults();
01080     int ir = 0;
01081     CSearchResultSet *retv = new CSearchResultSet();
01082 
01083     for (int iq = 0; iq< num_queries && ir< num_results; ++iq) {
01084 
01085         CSearchResults &res = (*result)[ir++];
01086         CRef<CBlastAncillaryData> ancillary = res.GetAncillaryData();
01087         TQueryMessages errmsg = res.GetErrors();
01088         CConstRef<CSeq_id> rid = res.GetSeqId();
01089         CRef<CSeq_align_set> align(const_cast<CSeq_align_set *>
01090                           (&*(res.GetSeqAlign())));
01091         CSeq_align_set::Tdata & align_list = align->Set();
01092 
01093         CConstRef<CSeq_id> qid = m_Query->GetBlastSearchQuery(iq)->GetQueryId();
01094         while(!qid->Match(*rid)) {
01095             CRef<CSeq_align_set> empty;
01096             CRef<CSearchResults> r(new CSearchResults(qid, empty, errmsg, ancillary));
01097             retv->push_back(r);
01098             qid = m_Query->GetBlastSearchQuery(++iq)->GetQueryId();
01099         }
01100 
01101         while(ir < num_results && (*result)[ir].GetSeqId()->Match(*qid)) {
01102             CSearchResults &add_res = (*result)[ir++];
01103             CRef<CSeq_align_set> add;
01104             add.Reset(const_cast<CSeq_align_set *>
01105                           (&*(add_res.GetSeqAlign())));
01106             CSeq_align_set::Tdata & add_list = add->Set();
01107             align_list.insert(align_list.end(), add_list.begin(), add_list.end());
01108         }
01109         CRef<CSearchResults> r(new CSearchResults(qid, align, errmsg, ancillary));
01110         retv->push_back(r);
01111     }
01112     
01113     result.Reset(retv);
01114 };
01115 
01116 void CIgBlast::s_AppendResults(CRef<CSearchResultSet> &results,
01117                                int                     num_aligns,
01118                                int                     gene,
01119                                CRef<CSearchResultSet> &final_results)
01120 {
01121     bool  new_result = (final_results.Empty());
01122     if (new_result) {
01123         final_results.Reset(new CSearchResultSet());
01124     }
01125 
01126     int iq = 0;
01127     ITERATE(CSearchResultSet, result, *results) {
01128 
01129         CRef<CSeq_align_set> align;
01130         int actual_align = 0;
01131 
01132         if ((*result)->HasAlignments()) {
01133             align.Reset(const_cast<CSeq_align_set *>
01134                                    (&*((*result)->GetSeqAlign())));
01135 
01136             // keep only the first num_alignments
01137             if (num_aligns >= 0) {
01138                 CSeq_align_set::Tdata & align_list = align->Set();
01139                 if (align_list.size() > (CSeq_align_set::Tdata::size_type)num_aligns) {
01140                     CSeq_align_set::Tdata::iterator it = align_list.begin();
01141                     for (int i=0; i<num_aligns; ++i) ++it;
01142                     align_list.erase(it, align_list.end());
01143                     actual_align = num_aligns;
01144                 } else {
01145                     actual_align = align_list.size();
01146                 }
01147             }
01148         }
01149 
01150         TQueryMessages errmsg = (*result)->GetErrors();
01151         CConstRef<CSeq_id> query = (*result)->GetSeqId();
01152 
01153         CIgBlastResults *ig_result;
01154         if (new_result) {
01155             // TODO maybe we need the db ancillary instead?
01156             CRef<CBlastAncillaryData> ancillary = (*result)->GetAncillaryData();
01157             ig_result = new CIgBlastResults(query, align, errmsg, ancillary);
01158             CRef<CSearchResults> r(ig_result);
01159             final_results->push_back(r);
01160         } else {
01161             while( !(*final_results)[iq].GetSeqId()->Match(*query)) ++iq;
01162             ig_result = dynamic_cast<CIgBlastResults *> (&(*final_results)[iq]);
01163             if (!align.Empty()) {
01164                 CSeq_align_set::Tdata & ig_list = ig_result->SetSeqAlign()->Set();
01165                 CSeq_align_set::Tdata & align_list = align->Set();
01166 
01167                 if (gene < 0) {
01168                     // Remove duplicate seq_aligns
01169                     CSeq_align_set::Tdata::iterator it = align_list.begin();
01170                     while (it != align_list.end()) {
01171                         if (s_SeqAlignInSet(ig_list, *it)) it = align_list.erase(it);
01172                         else ++it;
01173                     }
01174                 }
01175 
01176                 if (!align_list.empty()) {
01177                     ig_list.insert(ig_list.end(), align_list.begin(), align_list.end());
01178                     ig_result->GetErrors().Combine(errmsg);
01179                 }
01180             }
01181         }
01182 
01183         switch(gene) {
01184         case 0: ig_result->m_NumActualV = actual_align; break;
01185         case 1: ig_result->m_NumActualD = actual_align; break;
01186         case 2: ig_result->m_NumActualJ = actual_align; break;
01187         default: break;
01188         }
01189     }
01190 };
01191 
01192 void CIgBlast::s_SetAnnotation(vector<CRef <CIgAnnotation> > &annots,
01193                                CRef<CSearchResultSet> &final_results)
01194 {
01195     int iq = 0;
01196     ITERATE(CSearchResultSet, result, *final_results) {
01197         CIgBlastResults *ig_result = dynamic_cast<CIgBlastResults *>
01198                                      (const_cast<CSearchResults *>(&**result));
01199         CIgAnnotation *annot = &*(annots[iq++]);
01200         ig_result->SetIgAnnotation().Reset(annot);
01201     }
01202 };
01203 
01204 END_SCOPE(blast)
01205 END_NCBI_SCOPE
01206 
01207 /* @} */
Modified on Sun Jul 13 17:38:28 2014 by modify_doxy.py rev. 426318