NCBI C++ ToolKit
problems.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /*
00002 * ===========================================================================
00003 *    
00004 *                            PUBLIC DOMAIN NOTICE
00005 *               National Center for Biotechnology Information
00006 *    
00007 *  This software/database is a "United States Government Work" under the
00008 *  terms of the United States Copyright Act.  It was written as part of
00009 *  the author's official duties as a United States Government employee and
00010 *  thus cannot be copyrighted.  This software/database is freely available
00011 *  to the public for use. The National Library of Medicine and the U.S.
00012 *  Government have not placed any restriction on its use or reproduction.
00013 *
00014 *  Although all reasonable efforts have been taken to ensure the accuracy
00015 *  and reliability of the software and data, the NLM and the U.S.
00016 *  Government do not and cannot warrant the performance or results that
00017 *  may be obtained by using this software or data. The NLM and the U.S.
00018 *  Government disclaim all warranties, express or implied, including
00019 *  warranties of performance, merchantability or fitness for any particular
00020 *  purpose.
00021 * 
00022 *  Please cite the author in any work or product based on this material.
00023 *
00024 * ===========================================================================
00025 * 
00026 * Author: Azat Badretdin
00027 *   
00028 * File Description:            
00029 *     
00030 * ===========================================================================
00031 */  
#include <ncbi_pch.hpp>
#include "read_blast_result.hpp"

#include <stdexcept>
00034 
00035 
00036 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out,
00037    const CBioseq::TAnnot& annots, const EProblem type)
00038 {
00039    ITERATE(CBioseq::TAnnot, gen_feature, annots)
00040      {
00041      if ( !(*gen_feature)->GetData().IsFtable() ) continue;
00042      reportProblems(report_and_forget, diag, out, (*gen_feature)->GetData().GetFtable(), type);
00043      }
00044 }
00045 
00046 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out,
00047    const CSeq_annot::C_Data::TFtable& feats, const EProblem type)
00048 {
00049   ITERATE(CSeq_annot::C_Data::TFtable, f, feats)
00050     {
00051     if( !(*f)->GetData().IsGene() ) continue;
00052     string qname; (*f)->GetData().GetGene().GetLabel(&qname);
00053     if( diag.find(qname) == diag.end() ) continue;
00054     reportProblems(qname, diag, out, type);
00055     if(report_and_forget) erase_problems(qname, diag, type);
00056     }
00057 }
00058 void CReadBlastApp::reportProblems(const string& qname, diagMap& diag, ostream& out, const EProblem type)
00059 {
00060       if(!hasProblems(qname, diag, type)) return;
00061       if(PrintDetails()) NcbiCerr << "problem type: " << ProblemType(type) << NcbiEndl;
00062       reportProblemSequenceName(qname, out);
00063 
00064 // reporting
00065       IncreaseVerbosity();
00066       NON_CONST_ITERATE(list<problemStr>, problem, diag[qname].problems)
00067         {
00068           if(PrintDetails()) NcbiCerr << "current problem: " << ProblemType(problem->type) << NcbiEndl;
00069           if( !(problem->type & type) ) continue;
00070           if(PrintDetails()) NcbiCerr << "it is that problem"  << NcbiEndl;
00071           if(!problem->message.size()) continue;
00072           if(PrintDetails()) NcbiCerr << "has nonzero message"  << NcbiEndl;
00073           reportProblemType(problem->type, out);
00074           reportProblemMessage(problem->message, out);
00075           if(PrintDetails()) NcbiCerr << "current problem: done: " << problem->type << NcbiEndl;
00076         }
00077       DecreaseVerbosity();
00078 }
00079 void CReadBlastApp::erase_problems(const string& qname, diagMap& diag, const EProblem type)
00080 {
00081       IncreaseVerbosity();
00082       for(list<problemStr>::iterator problem = diag[qname].problems.begin(); problem!=diag[qname].problems.end();)
00083         {
00084         if(PrintDetails()) NcbiCerr << "erasing? current problem: " << problem->type << NcbiEndl;
00085         if( !(problem->type & type) )
00086              {
00087              problem++;
00088              continue;
00089              }
00090         if(PrintDetails()) NcbiCerr << "it is that problem for erazing"  << NcbiEndl;
00091         problem=diag[qname].problems.erase(problem);
00092         if(PrintDetails()) NcbiCerr << "erased"  << NcbiEndl;
00093         if(PrintDetails()) NcbiCerr << "next current problem: " << problem->type << NcbiEndl;
00094         }
00095       DecreaseVerbosity();
00096 }
00097 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out, const EProblem type)
00098 {
00099    IncreaseVerbosity();
00100    map<string,bool> done;
00101    if(PrintDetails()) NcbiCerr << "reportProblems: all problems of type ("  << ProblemType(type) << ")" << NcbiEndl;
00102    for (CTypeConstIterator<CBioseq> seq = ConstBegin();  seq;  ++seq)
00103       {
00104       if ( !is_prot_entry(*seq) )
00105         {
00106         reportProblems(report_and_forget, diag, out, seq->GetAnnot(), type);
00107         }
00108       string qname1 = GetStringDescr (*seq);
00109       string qname2 = CSeq_id::GetStringDescr (*seq, CSeq_id::eFormat_FastA);
00110       if(PrintDetails()) NcbiCerr << "reportProblems: start: " 
00111           << qname1 << " or "
00112           << qname2 
00113           << NcbiEndl;
00114       string qnames[2];
00115       qnames[0]=qname1; qnames[1]=qname2;
00116       for(int i=0; i<2; i++)
00117         {
00118         string& qname = qnames[i];
00119         if( diag.find(qname) != diag.end() )
00120           {
00121           reportProblems(qname, diag, out, type);
00122           if(report_and_forget) erase_problems(qname, diag, type);
00123           done[qname]=true;
00124           if(PrintDetails()) NcbiCerr << "reportProblems: full end: " << qname << NcbiEndl;
00125           }
00126         }
00127       }
00128 
00129    ITERATE(diagMap, problem, diag)
00130      {
00131      string qname = problem->first;
00132      if(done.find(qname)!=done.end()) continue;
00133      reportProblems(qname, diag, out, type);
00134      if(report_and_forget) erase_problems(qname, diag, type);
00135      if(PrintDetails()) NcbiCerr << "reportProblems: alternative problems: " << qname << NcbiEndl;
00136      }
00137    DecreaseVerbosity();
00138 }
00139 // hasProblemType(seq, diag[qname].problems), problem->type
00140 
00141 bool CReadBlastApp::hasProblems(const CBioseq& seq, diagMap& diag, const EProblem type)
00142 {
00143    string qname = GetStringDescr (seq);
00144    string qname2 = CSeq_id::GetStringDescr (seq, CSeq_id::eFormat_FastA);
00145    return hasProblems(qname, diag, type) || hasProblems(qname2, diag, type);
00146 }
00147 
00148 bool CReadBlastApp::hasProblems(const string& qname, diagMap& diag, const EProblem type)
00149 {
00150   if(PrintDetails()) NcbiCerr << "hasProblems: start: " << qname << NcbiEndl;
00151   if( type != eAllProblems && diag.find(qname) == diag.end() ) return false;
00152   if ( type == eAllProblems && diag[qname].problems.size()>0) return true;
00153 
00154   IncreaseVerbosity();
00155   ITERATE(list<problemStr>, problem, diag[qname].problems)
00156     {
00157     if(PrintDetails()) NcbiCerr << "hasProblems: checking problem type: " << "\t"
00158                                << ProblemType(problem->type) << "\t"
00159                                << qname << "\t"
00160                                << NcbiEndl;
00161     if (problem->type & type)
00162       {
00163       if(PrintDetails()) NcbiCerr << "hasProblems: end: does have problem: " << qname << NcbiEndl;
00164       return true;
00165       }
00166     }
00167   DecreaseVerbosity();
00168   if(PrintDetails()) NcbiCerr << "hasProblems: end: no problem: " << qname << NcbiEndl;
00169   return false;
00170 }
00171 
00172 void CReadBlastApp::reportProblemMessage(const string& message, ostream& out)
00173 {
00174    out << message.c_str() << NcbiEndl;
00175 }
00176 
00177 string CReadBlastApp::ProblemType(const EProblem type)
00178 {
00179    strstream strres;
00180    strres << "unknown_problem_type=" << type << '\0';
00181    string result=strres.str();
00182    if(type & eOverlap)
00183       result = "Potential overlap found";
00184    else if(type & eRnaOverlap)
00185       result =  "Potential RNA overlap found";
00186    else if(type & eCompleteOverlap)
00187       result =  "Complete overlap found";
00188    else if(type & eRemoveOverlap)
00189       result =  "overlap marked for removal";
00190    else if(type & eRelFrameShift)
00191       result =  "Something relevant to frame shift found";
00192    else if(type & eFrameShift)
00193       result =  "Potential frame shift evidence found";
00194    else if(type & eMayBeNotFrameShift)
00195       result =  "Evidence absolving from the frame shift accusation found";
00196    else if(type & ePartial)
00197       result =  "Potential partial protein annotation found";
00198    else if(type & eShortProtein)
00199       result =  "Short annotation found";
00200    else if(type & eTRNAMissing)
00201       result =  "tRNA is missing in the list of independently annotated tRNAs";
00202    else if(type & eTRNAAbsent)
00203       result =  "RNA is missing in the list of annotated RNAs in the input";
00204    else if(type & eTRNABadStrand)
00205       result =  "RNA is present at the wrong strand";
00206    else if(type & eTRNAUndefStrand)
00207       result =  "RNA is present with undefined strand";
00208    else if(type & eTRNAComMismatch)
00209       result =  "tRNA is a complete mismatch";
00210    else if(type & eTRNAMismatch)
00211       result =  "tRNA has mismatched ends";
00212 
00213    return result;
00214 
00215 }
00216 
00217 void CReadBlastApp::reportProblemType(const EProblem type, ostream& out)
00218 {
00219    out
00220       << "---"
00221       << " ";
00222    string stype = ProblemType(type);
00223    if(!stype.empty())
00224       {
00225       out << stype;
00226       }
00227    else
00228       {
00229       NcbiCerr << "FATAL: internal error: unknown problem type" << type << NcbiEndl;
00230       throw;
00231       }
00232    out
00233       << " "
00234       << "---"
00235       << NcbiEndl;
00236 }
00237 
00238 void CReadBlastApp::reportProblemSequenceName(const string& name, ostream& out)
00239 {
00240    out
00241       << "====="
00242       << " ";
00243    out << name.c_str() ;
00244    out
00245       << " "
00246       << "====="
00247       << NcbiEndl;
00248 }
00249 
00250 int CReadBlastApp::FixStrands(void)
00251 {
00252   TProblem_locs problem_locs;
00253   if(PrintDetails()) NcbiCerr << "FixStrands: start " << NcbiEndl;
00254 
00255 // relevant problems
00256   CollectRNAFeatures(problem_locs);
00257 
00258 // first, count features matching each range
00259   for(CTypeIterator<CSeq_feat> feat=Begin(); feat; ++feat)
00260     {
00261     if (!feat->GetData().IsRna() && !feat->GetData().IsGene()) continue;
00262     const CSeq_loc&  loc = feat->GetLocation();
00263     ENa_strand strand;
00264     TSeqPos from, to;
00265     getFromTo(loc, from, to, strand); 
00266     string range = printed_range(from, to);
00267     if(PrintDetails()) NcbiCerr << "FixStrands: rna or gene [" << range << "]" << NcbiEndl;
00268     
00269     if(problem_locs.find(range)==problem_locs.end()) continue; // not relevant
00270     if(PrintDetails()) NcbiCerr << "FixStrands: range: " << range << NcbiEndl;
00271     problem_locs[range].count++;
00272     if(feat->GetData().IsRna()) problem_locs[range].rnacount++;
00273     if(feat->GetData().IsGene()) problem_locs[range].genecount++;
00274     }
00275 // now fix
00276   for(CTypeIterator<CSeq_feat> feat=Begin(); feat; ++feat)
00277     {
00278     if (!feat->GetData().IsRna() && !feat->GetData().IsGene()) continue;
00279     if(PrintDetails()) NcbiCerr << "FixStrands: rna or gene for fixing" << NcbiEndl;
00280     CSeq_loc&  loc = feat->SetLocation();
00281     ENa_strand strand;
00282     TSeqPos from, to;
00283     getFromTo(loc, from, to, strand);
00284     string range = printed_range(from, to);
00285     if(problem_locs.find(range)==problem_locs.end()) continue; // not relevant
00286     if(PrintDetails()) NcbiCerr << "FixStrands: range for fix: " << range << NcbiEndl;
00287     if(   problem_locs[range].count!=2 
00288        || problem_locs[range].rnacount!=1 
00289        || problem_locs[range].genecount!=1 
00290       )
00291       {
00292       // no touching
00293       // warning
00294       NcbiCerr << "CReadBlastApp::FixStrands: "
00295                << "WARNING: "
00296                << "location found, but the number of features with that location is confusing, "
00297                << "no fixing for "
00298                << "[" << problem_locs[range].name << "]"
00299                << "(" << range << ")"
00300                << NcbiEndl;
00301       continue;
00302       }
00303 // over all intervals
00304     int ninter=0; 
00305     for(CTypeIterator<CSeq_interval> inter = ::Begin(loc);  inter; ++inter, ++ninter)
00306       {
00307       inter->SetStrand(problem_locs[range].strand);
00308       NcbiCerr << "CReadBlastApp::FixStrands: "
00309                << "[" << problem_locs[range].name << "] "
00310                << "fixed"
00311                << NcbiEndl;
00312       }
00313     NcbiCerr << "CReadBlastApp::FixStrands: ninters= " << ninter  << NcbiEndl;
00314     } // for(CTypeIterator<CSeq_feat>
00315   if(PrintDetails()) NcbiCerr << "FixStrands: end" << NcbiEndl;
00316   return 1;
00317 }
00318 
00319 int CReadBlastApp::RemoveProblems(map<string,string>& problem_names, LocMap& loc_map)
00320 {
00321    if(PrintDetails()) NcbiCerr << "RemoveProblems: start " << NcbiEndl;
00322 
00323    PushVerbosity();
00324    if(IsSubmit())
00325      { 
00326      if ( m_Submit.GetData().IsEntrys()) 
00327        {
00328        for(CSeq_submit::C_Data::TEntrys::iterator entry = m_Submit.SetData().SetEntrys().begin();
00329            entry != m_Submit.SetData().SetEntrys().end();)
00330        // NON_CONST_ITERATE(CSeq_submit::C_Data::TEntrys, entry, m_Submit.SetData().SetEntrys())
00331          {
00332          int removeme = RemoveProblems(**entry, problem_names, loc_map);
00333          if(PrintDetails())
00334             NcbiCerr
00335                  << "RemoveProblems(void): doing entry: removeme =  "
00336                  << removeme
00337                  << NcbiEndl;
00338          if(removeme) 
00339            {
00340            NcbiCerr << "RemoveProblems(): WARNING: "
00341                     << "CSeq_entry deleted, loss of annotation might occur"
00342                     << NcbiEndl;
00343            entry=m_Submit.SetData().SetEntrys().erase(entry);
00344            }
00345          else entry++;
00346          }
00347        }
00348      else
00349        {
00350        NcbiCerr << "ERROR: submit file does not have proper seqset"<< NcbiEndl;
00351        }
00352      }
00353    else 
00354      { 
00355      if(PrintDetails())
00356          NcbiCerr
00357                  << "RemoveProblems(void): case is single entry "
00358                  << NcbiEndl;
00359      RemoveProblems(m_Entry, problem_names, loc_map);
00360      }
00361 
00362    PopVerbosity();
00363    if(PrintDetails()) NcbiCerr << "RemoveProblems: end" << NcbiEndl;
00364    return 1;
00365 }
00366 
00367 int CReadBlastApp::RemoveProblems(CSeq_entry& entry, map<string, string>& problem_seqs, LocMap& loc_map)
00368 {
00369    int removeme=0;
00370    if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_entry): start" << NcbiEndl;
00371    if(entry.IsSeq()) 
00372      {
00373      removeme=RemoveProblems(entry.SetSeq(), problem_seqs, loc_map);
00374      if(PrintDetails())
00375          NcbiCerr
00376                  << "RemoveProblems(CSeq_entry)(seq case): removeme = "
00377                  << removeme
00378                  << NcbiEndl;
00379 
00380      }
00381    else //several seqs
00382      {
00383      CBioseq_set& bioset = entry.SetSet();
00384      removeme=RemoveProblems(bioset, problem_seqs, loc_map);
00385      CBioseq_set::TSeq_set& entries =  bioset.SetSeq_set();
00386      int size=0;
00387      for (CTypeConstIterator<CBioseq> seq = ::ConstBegin(entry);  seq;  ++seq, size++);
00388      if(PrintDetails())
00389          NcbiCerr
00390                  << "RemoveProblems(CSeq_entry)(set case): removeme = "
00391                  << removeme
00392                  << ", entries.size = "
00393                  << entries.size()
00394                  << ", total seqs = "
00395                  << size
00396                  << NcbiEndl;
00397      if (size<=1) NormalizeSeqentry(entry);
00398      } // entry.IsSet()
00399    if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_entry): end" << NcbiEndl;
00400    return removeme;
00401 }
00402 
00403 void CReadBlastApp::NormalizeSeqentry(CSeq_entry& entry)
00404 {
00405   if(!entry.IsSet()) return;
00406   CBioseq_set& bioset = entry.SetSet();
00407   CBioseq_set::TSeq_set& entries =  bioset.SetSeq_set();
00408   if(entries.size()!=1) return;
00409 // 1. merge descriptions
00410   CBioseq& seq = (*entries.begin())->SetSeq();
00411   if(bioset.IsSetDescr())
00412     {
00413     CSeq_descr::Tdata& descs = bioset.SetDescr().Set();
00414     for(CSeq_descr::Tdata::iterator desc = descs.begin(); desc!=descs.end(); )
00415       {
00416       seq.SetDescr().Set().push_back(*desc);
00417       desc=descs.erase(desc);
00418       }
00419     } // if(entry.SetSet().IsSetDescr())
00420 // 2.  move CBioseq under the CSeq_entr
00421   CRef<CBioseq> pseq (&seq);
00422   entry.SetSeq(*pseq);
00423   NcbiCerr << "NormalizeSeqentry(CSeq_entry...): "
00424            << "WARNING: "
00425            << "converted sequence set to sequence"
00426            << NcbiEndl;
00427   return;
00428 }
00429 
00430 int CReadBlastApp::RemoveProblems(CBioseq_set& setseq, map<string, string>& problem_seqs, LocMap& loc_map)
00431 {
00432    bool noseqs=false;
00433    bool noannot=false;
00434    int removeme=0;
00435    if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq_set): start" << NcbiEndl;
00436    if(setseq.IsSetSeq_set())
00437      {
00438      int all_entries_removed = RemoveProblems(setseq.SetSeq_set(), problem_seqs, loc_map);
00439      if(all_entries_removed > 0) {/* mandatory, no deletion */; noseqs=true;}
00440      }
00441    if(setseq.IsSetAnnot())
00442      {
00443      int all_annot_removed = RemoveProblems(setseq.SetAnnot(), problem_seqs, loc_map);
00444      if(all_annot_removed > 0) {setseq.ResetAnnot(); noannot=true;}
00445      }
00446    if(noseqs ) removeme = 1;
00447    if(PrintDetails())
00448          NcbiCerr
00449                  << "RemoveProblems(CBioseq_set): noseqs = "
00450                  << noseqs
00451                  << ", noannot = "
00452                  << noannot
00453                  << ", removeme (return) = "
00454                  << removeme
00455                  << NcbiEndl;
00456 
00457    return removeme;
00458 }
00459 
00460 int CReadBlastApp::RemoveProblems(CBioseq& seq, map<string, string>& problem_seqs, LocMap& loc_map)
00461 {      
00462    int remove=0;
00463    if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq): start" << NcbiEndl;
00464    if(!seq.IsAa())  // nucleotide sequnce
00465      {           
00466      if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq): nuc" << NcbiEndl;
00467      if(seq.IsSetAnnot())
00468        {
00469        int annotations_removed = RemoveProblems(seq.SetAnnot(), problem_seqs, loc_map);
00470        if(annotations_removed) seq.ResetAnnot();
00471        }
00472      }           
00473    else // aminoacid sequence
00474 // checkif needed to kill it
00475      {
00476      string thisName = GetStringDescr(seq);
00477      string origName = thisName;
00478      string::size_type ipos = thisName.rfind('|'); if(ipos!=string::npos) thisName.erase(0, ipos+1);
00479      ipos = thisName.rfind('_'); if(ipos!=string::npos) ipos= thisName.rfind('_', ipos-1);
00480      if(PrintDetails())
00481          NcbiCerr
00482                  << "RemoveProblems(CBioseq): remove? sequence "
00483                  << "[" << origName << "]"
00484                  << " looking for "
00485                  << "[" << thisName << "]"
00486                  << NcbiEndl;
00487 
00488      if(problem_seqs.find(thisName) != problem_seqs.end()) 
00489        {
00490        NcbiCerr
00491                  << "RemoveProblems(CBioseq): sequence "
00492                  << "[" << origName << "]"
00493                  << " is marked for removal, because of a match to " 
00494                  << "[" << thisName << "]"
00495                  << NcbiEndl;
00496        remove=1; // whack the sequence
00497        }
00498      }
00499    if(PrintDetails())
00500          NcbiCerr
00501                  << "RemoveProblems(CBioseq): remove =  "
00502                  << remove 
00503                  << NcbiEndl;
00504 
00505 
00506    return remove;
00507 }          
00508 
00509 
00510 int CReadBlastApp::RemoveProblems(CBioseq_set::TSeq_set& entries, map<string, string>& problem_seqs, LocMap& loc_map)
00511 {
00512    IncreaseVerbosity();
00513    int remove=0;
00514    for(CBioseq_set::TSeq_set::iterator entries_end = entries.end(), entry=entries.begin(); entry != entries_end; )
00515      {
00516      int removeseq=RemoveProblems(**entry, problem_seqs, loc_map);
00517      if(PrintDetails())
00518          NcbiCerr
00519                  << "RemoveProblems(CBioseq_set::TSeq_set): removeseq = "
00520                  << removeseq
00521                  << NcbiEndl;
00522 
00523      if(removeseq) entry=entries.erase(entry);
00524      else entry++;
00525      } // each seqs
00526    if(entries.size()==0) remove=1;
00527    if(PrintDetails())
00528          NcbiCerr
00529                  << "RemoveProblems(CBioseq_set::TSeq_set): nentries = "
00530                  << entries.size()
00531                  << NcbiEndl;
00532 
00533    DecreaseVerbosity();
00534    return remove;
00535 }
00536 
00537 int CReadBlastApp::RemoveProblems(CBioseq::TAnnot& annots, map<string, string>& problem_seqs, LocMap& loc_map)
00538 {
00539   int remove=0;
00540   if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq::TAnnot): start" << NcbiEndl;
00541   for(CBioseq::TAnnot::iterator annot=annots.begin(); annot!=annots.end(); )
00542     {
00543     int removeme=0;
00544     if( (*annot)->GetData().IsFtable()) removeme=RemoveProblems((*annot)->SetData().SetFtable(), problem_seqs, loc_map);
00545     if(removeme) 
00546       {
00547       NcbiCerr << "RemoveProblems(annots, problem_seqs): "
00548                  << "INFO: "
00549                  << "annotation has empty feature table and it will be removed"
00550                  << NcbiEndl;
00551       annot=annots.erase(annot);
00552       }
00553     else annot++;
00554     }
00555   if(annots.size()==0) remove=1;
00556   if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq::TAnnot): end" << NcbiEndl;
00557 
00558   return remove;
00559 }
00560 
00561 int CReadBlastApp::RemoveProblems(CSeq_annot::C_Data::TFtable& table, map<string, string>& problem_seqs, LocMap& loc_map)
00562 // this one needs cleaning
00563 {
00564   int removeme=0;
00565   if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): start" << NcbiEndl;
00566   GetLocMap(loc_map, table);
00567   for(CSeq_annot::C_Data::TFtable::iterator feat_end = table.end(), feat = table.begin(); feat != feat_end;)
00568     {
00569     if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat 000" << NcbiEndl;
00570     bool gene, cdregion;
00571     gene = (*feat)->GetData().IsGene();
00572     cdregion = (*feat)->GetData().IsCdregion();
00573     bool del_feature=false;
00574     if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat I" << NcbiEndl;
00575     string real_loc_string = GetLocationString((*feat)->GetLocation());
00576     if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat II" << NcbiEndl;
00577 
00578     string loc_string = GetLocusTag(**feat, loc_map); // more general, returns location string
00579     if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat: (" << real_loc_string  << ")(" << loc_string << ")" << NcbiEndl;
00580 //
00581 // case *: matching locus tag
00582 //
00583     if(problem_seqs.find(loc_string) != problem_seqs.end()) 
00584       {
00585       if((*feat)->GetData().IsImp() &&
00586          (*feat)->GetData().GetImp().CanGetKey())
00587          {
00588          NcbiCerr << "RemoveProblems: INFO: feature " << loc_string << ": imp, key = " << (*feat)->GetData().GetImp().GetKey()  << NcbiEndl;
00589          }
00590       if((*feat)->GetData().IsImp() &&
00591           (*feat)->CanGetComment() )
00592          {
00593          NcbiCerr << "RemoveProblems: INFO: feature " << loc_string << ": imp, comment = " << (*feat)->GetComment()  << NcbiEndl;
00594          }
00595 /*
00596       if((*feat)->GetData().IsImp() &&
00597          (*feat)->GetData().GetImp().CanGetKey() &&
00598          (*feat)->GetData().GetImp().GetKey() == "misc_feature" &&
00599          (*feat)->CanGetComment() &&
00600          (*feat)->GetComment().find("potential frameshift") != string::npos
00601         ) del_feature = false; // this is a new feature, that we are not supposed to delete
00602 */
00603       if((*feat)->GetData().IsImp() &&
00604          (*feat)->GetData().GetImp().CanGetKey() &&
00605          (*feat)->GetData().GetImp().GetKey() == "misc_feature"
00606         ) del_feature = false; // this is a new feature, we do not delete them
00607       else del_feature=true;
00608       }
00609 
00610     if ( PrintDetails() )
00611       {
00612       NcbiCerr << "RemoveProblems: feature " << loc_string << ": ";
00613       if(del_feature) NcbiCerr << "WILL BE REMOVED";
00614       else            NcbiCerr << "stays until further analysis for it";
00615       NcbiCerr << NcbiEndl;
00616       }
00617     if(del_feature)
00618         {
00619         NcbiCerr << "RemoveProblems: WARNING: feature " 
00620                  << "{" << (*feat)->GetData().SelectionName((*feat)->GetData().Which()) << "} "
00621                  << loc_string << ": ";
00622         NcbiCerr << "will be removed because of a problem: ";
00623         NcbiCerr << problem_seqs.find(loc_string)->second;
00624         NcbiCerr << NcbiEndl;
00625         }
00626     if(!del_feature && gene  && (*feat)->GetData().GetGene().CanGetLocus_tag() )
00627 //
00628 // case *: gene
00629 //
00630       {
00631       string locus_tag = (*feat)->GetData().GetGene().GetLocus_tag();
00632       if(problem_seqs.find(locus_tag) != problem_seqs.end()) del_feature=true;
00633       if ( PrintDetails() )
00634         {
00635         NcbiCerr << "RemoveProblems: gene " << locus_tag << ": ";
00636         if(del_feature)
00637            NcbiCerr << "WILL BE REMOVED";
00638         else
00639            NcbiCerr << "stays";
00640         NcbiCerr << NcbiEndl;
00641         }
00642       if(del_feature)
00643         {
00644         NcbiCerr << "RemoveProblems: WARNING: gene " << locus_tag << ": ";
00645         NcbiCerr << "will be removed because of a problem: ";
00646         NcbiCerr << problem_seqs.find(locus_tag)->second;
00647         NcbiCerr << NcbiEndl;
00648         }
00649       }
00650     if(!del_feature && cdregion && (*feat)->CanGetProduct() )
00651       {
00652 //
00653 // case *: cdregion
00654 //
00655       string productName;
00656       if( (*feat)->CanGetProduct() &&
00657           (*feat)->GetProduct().IsWhole() &&
00658           (*feat)->GetProduct().GetWhole().IsGeneral() &&
00659           (*feat)->GetProduct().GetWhole().GetGeneral().CanGetTag() &&
00660           (*feat)->GetProduct().GetWhole().GetGeneral().GetTag().IsStr() )
00661         {
00662         productName = (*feat)->GetProduct().GetWhole().GetGeneral().GetTag().GetStr();
00663         }
00664       else if ( 
00665            (*feat)->CanGetProduct() &&
00666           (*feat)->GetProduct().IsWhole())
00667         {
00668         productName = (*feat)->GetProduct().GetWhole().AsFastaString();
00669         }
00670 // strip leading contig ID if any
00671       string::size_type ipos=productName.rfind('_', productName.size());
00672       if(ipos != string::npos) 
00673         {
00674         string::size_type ipos2;
00675         ipos2=productName.rfind('_', ipos-1);
00676         if(ipos2 != string::npos) productName.erase(0, ipos2+1);
00677       // "1103032000567_RAZWK3B_00550" -> "RAZWK3B_00550";
00678         else 
00679           {
00680           ipos2=productName.rfind('|', ipos-1); 
00681           if(ipos2 != string::npos) productName.erase(0, ipos2+1);
00682           }
00683 // lcl|Xoryp_00025 -> Xoryp_00025
00684         }
00685       if(productName.length() && problem_seqs.find(productName) != problem_seqs.end()) del_feature=true;
00686       if ( PrintDetails() )
00687         {
00688         NcbiCerr << "RemoveProblems: cdregion " << productName << ": ";
00689         if(del_feature)
00690            NcbiCerr << "WILL BE REMOVED";
00691         else
00692            NcbiCerr << "stays";
00693         NcbiCerr << NcbiEndl;
00694         }
00695       }
00696     if(del_feature)
00697       {
00698       if(problem_seqs.find(real_loc_string) == problem_seqs.end()) 
00699         {
00700         problem_seqs[real_loc_string]=problem_seqs[loc_string]; // saving for later
00701         }
00702       }
00703     if(del_feature) feat=table.erase(feat);
00704     else feat++;
00705     }
00706   if(table.size()==0) removeme=1; 
00707   if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): end" << NcbiEndl;
00708   return removeme;
00709 }
00710 
00711 // RemoveInterim
00712 
00713 int CReadBlastApp::RemoveInterim(void)
00714 {
00715 
00716    int nremoved=0;
00717 
00718    if(PrintDetails()) NcbiCerr << "RemoveInterim: start " << NcbiEndl;
00719    PushVerbosity();
00720    for(CTypeIterator<CBioseq> seq=Begin(); seq; ++seq)
00721      {
00722      if(seq->IsSetAnnot() && seq->IsAa()) nremoved+= RemoveInterim(seq->SetAnnot());
00723      if(seq->IsSetAnnot() && seq->IsNa()) nremoved+= RemoveInterim2(seq->SetAnnot());
00724      }
00725 
00726    PopVerbosity();
00727    if(PrintDetails()) NcbiCerr << "RemoveInterim: end" << NcbiEndl;
00728    return nremoved;
00729 }
00730 // protein sequence annotations
00731 int CReadBlastApp::RemoveInterim(CBioseq::TAnnot& annots)
00732 {
00733    int nremoved=0;
00734    if(PrintDetails()) NcbiCerr << "RemoveInterim(annots): start " << NcbiEndl;
00735 
00736    for(CBioseq::TAnnot::iterator annot=annots.begin(), annot_end = annots.end(); annot != annot_end; )
00737      {
00738      bool erased = false;
00739      if((*annot)->GetData().IsAlign())
00740        {
00741        nremoved++; erased = true;
00742        }
00743      if ( (*annot)->GetData().IsFtable())
00744        {
00745        int dremoved=0;
00746        CSeq_annot::C_Data::TFtable& table = (*annot)->SetData().SetFtable();
00747        for(CSeq_annot::C_Data::TFtable::iterator feat=table.begin(), feat_end= table.end(); feat !=  feat_end; )
00748          {
00749          string test = "Genomic Location:"; 
00750          if ((*feat)->IsSetData() && (*feat)->GetData().IsProt() && 
00751               (*feat)->IsSetComment() && (*feat)->GetComment().substr(0, test.size()) == test  )
00752            {
00753            table.erase(feat++); dremoved++; 
00754            nremoved++;
00755            }
00756          else
00757            {
00758            feat++;
00759            }
00760          }
00761 
00762 /* 
00763   this is really crappy way of doing it!
00764 */
00765 /*
00766        while( (*annot)->GetData().GetFtable().size()>1 )
00767          {
00768          (*annot)->SetData().SetFtable().pop_back();
00769          nremoved++;
00770          dremoved++;
00771          }
00772 */
00773        if(PrintDetails()) NcbiCerr << "RemoveInterim(CBioseq::TAnnot& annots): dremoved = "
00774         << dremoved
00775         << ", left=" << (*annot)->GetData().GetFtable().size()
00776         << NcbiEndl;
00777        if((*annot)->SetData().SetFtable().size() == 0) 
00778          {
00779          nremoved++;
00780          erased = true;
00781          }
00782        }
00783      if(erased) annot=annots.erase(annot);
00784      else annot++;
00785      }
00786 
00787   if(PrintDetails()) NcbiCerr << "RemoveInterim(annots): end" << NcbiEndl;
00788   return nremoved;
00789 }
00790 
00791 // nucleotide sequence annotations
00792 int CReadBlastApp::RemoveInterim2(CBioseq::TAnnot& annots)
00793 {
00794    int nremoved=0;
00795    if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): start " << NcbiEndl;
00796 
00797    NON_CONST_ITERATE(CBioseq::TAnnot, gen_feature, annots)
00798      {
00799      if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature start" << NcbiEndl;
00800      if ( !(*gen_feature)->GetData().IsFtable() ) continue;
00801      if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature is ftable" << NcbiEndl;
00802      map<string,bool> feat_defined;
00803      CSeq_annot::C_Data::TFtable& table = (*gen_feature)->SetData().SetFtable();
00804      for(CSeq_annot::C_Data::TFtable::iterator feat_end = table.end(), feat = table.begin(); feat != feat_end;)
00805        {
00806        if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat start" << NcbiEndl;
00807        const CSeq_feat& featr = **feat;
00808        strstream buff;
00809        buff << MSerial_AsnText  << featr;
00810        buff << '\0';
00811        if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): feat ASN:" << NcbiEndl;
00812        if(PrintDetails()) NcbiCerr << buff.str() << NcbiEndl;
00813        if(feat_defined.find(buff.str()) != feat_defined.end()) // dupe
00814          {
00815          if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat dupe: erase" << NcbiEndl;
00816          feat=table.erase(feat);
00817          }
00818        else 
00819          {
00820          if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat new: add to feat_defined map" << NcbiEndl;
00821          feat_defined[buff.str()]=true;
00822          if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat new: add to feat_defined map done" << NcbiEndl;
00823          // feat_defined[featr]=true;
00824          feat++;
00825          }
00826        if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat end" << NcbiEndl;
00827        }
00828      
00829      if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature end" << NcbiEndl;
00830      }
00831 
00832   if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): end" << NcbiEndl;
00833   return nremoved;
00834 }
00835 
00836 
00837 
00838 
// Builds the key "<type>|<value>" from its two parts.
string diagName(const string& type, const string& value)
{
   string key(type);
   key += "|";
   key += value;
   return key;
}
00843                 
00844 int addProblems(list<problemStr>& dest, const list<problemStr>& src)
00845 {    
00846    int n=0;
00847    ITERATE(list<problemStr>, src_p, src)
00848      {
00849      dest.push_back(*src_p);
00850      n++;
00851      } 
00852   return n;
00853 }
00854 
00855 int CReadBlastApp::CollectRNAFeatures(TProblem_locs& problem_locs)
00856 {
00857   ITERATE( diagMap, feat, m_diag)
00858     {
00859     ITERATE(list<problemStr>, problem, feat->second.problems)
00860       {
00861       bool added = false;
00862       string name = feat->first;
00863       string::size_type ipos = name.rfind('|'); if(ipos!=string::npos) name.erase(0, ipos+1);
00864       ipos = name.rfind('_'); if(ipos!=string::npos) ipos= name.rfind('_', ipos-1);
00865       if(ipos!=string::npos) name.erase(0, ipos+1);
00866       string range = printed_range(problem->i1, problem->i2);
00867       if( 
00868           (problem->type & eTRNABadStrand || problem->type & eTRNAUndefStrand)   // irrelevant problem
00869          && !problem->misc_feat_message.empty()
00870        )
00871         { 
00872         problem_locs[range].strand = problem->strand;
00873         problem_locs[range].name = name;
00874         problem_locs[range].count = 
00875         problem_locs[range].rnacount = 
00876         problem_locs[range].genecount =  0;
00877         added=true; 
00878         }
00879       if(PrintDetails()) 
00880         NcbiCerr << "CReadBlastApp::CollectRNAFeatures: " << feat->first
00881         << "[" << range << "]: "
00882         << "(" << name << ")"
00883         << (added ? "added" : "skipped")  << NcbiEndl;
00884       }
00885     }
00886   return problem_locs.size();
00887 
00888 }
00889 
00890 int CReadBlastApp::CollectFrameshiftedSeqs(map<string,string>& problem_names)
00891 {
00892   CArgs args = GetArgs();
00893   bool keep_frameshifted = args["kfs"].HasValue();
00894   ITERATE( diagMap, feat, m_diag)
00895     {
00896     ITERATE(list<problemStr>, problem, feat->second.problems)
00897       {
00898       bool added = false;
00899       string name = feat->first;
00900       string::size_type ipos = name.rfind('|'); if(ipos!=string::npos) name.erase(0, ipos+1);
00901       ipos = name.rfind('_'); if(ipos!=string::npos) ipos= name.rfind('_', ipos-1);
00902       if(ipos!=string::npos) name.erase(0, ipos+1);
00903       if( 
00904           (problem->type == eFrameShift && !keep_frameshifted)
00905           || 
00906           problem->type == eRemoveOverlap
00907           ||
00908           problem->type & eShortProtein
00909         ) 
00910         { problem_names[name]=ProblemType(problem->type); added=true; }
00911       if(PrintDetails()) 
00912         NcbiCerr << "CollectFrameshiftedSeqs: " << feat->first
00913         << ": "
00914         << "(" << name << ")"
00915         << (added ? "added" : "skipped")  << NcbiEndl;
00916       }
00917     }
00918   return problem_names.size();
00919 }
00920 
00921 void CReadBlastApp::append_misc_feature(CBioseq_set::TSeq_set& seqs, const string& name, EProblem problem_type)
00922 {   
00923   if(m_diag.find(name)==m_diag.end())
00924     {
00925     // should not happen
00926     NcbiCerr << "append_misc_feature: FATAL: do not have problems for " << name << NcbiEndl;
00927     throw;
00928     return;
00929     } 
00930 
00931   NON_CONST_ITERATE(  CBioseq_set::TSeq_set, na, seqs)
00932     {
00933     if( is_prot_entry((*na)->GetSeq())  ) continue;
00934     list<CRef<CSeq_id> >& na_id = (*na)->SetSeq().SetId();
00935     NON_CONST_ITERATE(CBioseq::TAnnot, gen_feature, (*na)->SetSeq().SetAnnot())
00936       {
00937       if ( !(*gen_feature)->GetData().IsFtable() ) continue;
00938       typedef map<string, bool> Tmessage_misced;
00939       typedef map<EProblem, Tmessage_misced> Tproblem_misced;
00940       Tproblem_misced problem_misced; // list of problems fixed
00941       ITERATE(list<problemStr>, problem, m_diag[name].problems)
00942         {
00943         if ( !(problem->type & problem_type) ) continue;
00944         int from=-1, to=-1;
00945 //        string lt1, lt2;
00946         ENa_strand strand;
00947         string message="";
00948         from = problem->i1;
00949         if(from<0) continue;
00950 // add new feature
00951         to   = problem->i2;
00952 /*
00953         lt1  = problem->id1;
00954         lt2  = problem->id2;
00955 */
00956         strand = problem->strand;
00957         message = problem->misc_feat_message;
00958         if(message.size()==0) continue; // do not print empty misc_feat, they are empty for a reason
00959         if(problem_misced.find(problem->type) != problem_misced.end() &&
00960            problem_misced[problem->type].find(message) != problem_misced[problem->type].end()
00961           ) continue;
00962         else problem_misced[problem->type][message] = true;
00963         SIZE_TYPE pos;
00964         while((pos=message.find_first_of("\n\r"))!=string::npos)
00965           {
00966           message[pos]=' ';
00967           }
00968         CRef<CSeq_feat> feat(new CSeq_feat());
00969 
00970         feat->SetComment(message);
00971         feat->SetData().SetImp().SetKey("misc_feature");
00972         feat->SetLocation().SetInt().SetFrom(from);
00973         feat->SetLocation().SetInt().SetTo(to);
00974         feat->SetLocation().SetInt().SetId(**na_id.begin());
00975         feat->SetLocation().SetInt().SetStrand(strand);
00976         (*gen_feature)->SetData().SetFtable().push_back(feat);
00977         }
00978 
00979       }
00980     break;
00981     }
00982 
00983   return;
00984 }
00985 
00986 
00987 
00988 
Modified on Mon Sep 15 17:17:27 2014 by modify_doxy.py rev. 426318