NCBI C++ ToolKit
read_blast_result_lib.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: read_blast_result_lib.cpp 62839 2014-05-12 17:42:01Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 * major top-level routines, including main
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include "read_blast_result.hpp"
35 
36 
37 // static members definition
38 // algorithm control
48 // verbosity
52 
53 ESerialDataFormat s_GetFormat(const string& name)
54 {
55  if (name == "asn") {
56  return eSerial_AsnText;
57  } else if (name == "asnb") {
58  return eSerial_AsnBinary;
59  } else if (name == "xml") {
60  return eSerial_Xml;
61  } else {
62  // Should be caught by argument processing, but as an illustration...
63  THROW1_TRACE(runtime_error, "Bad serial format name " + name);
64  }
65 }
66 
67 
68 vector<long> CReadBlastApp::getGIs(CBioseq::TAnnot::const_iterator& annot)
69 {
70  vector<long> result;
71  result.clear();
72  ITERATE(CSeq_align::TScore, score, (*(*annot)->GetData().GetAlign().begin())->GetScore())
73  {
74  string name = (*score)->GetId().GetStr();
75  if(name!="use_this_gi") continue;
76  result.push_back((*score)->GetValue().GetInt());
77  }
78 
79  return result;
80 }
81 
82 bool CReadBlastApp::giMatch(const vector<long>& left, const vector<long>& right)
83 {
84 
85  bool result=false;
86  if(PrintDetails()) NcbiCerr << "giMatch starts" << NcbiEndl;
88  ITERATE(vector<long>, gi1, left)
89  {
91  ITERATE(vector<long>, gi2, right)
92  {
93  if(PrintDetails()) NcbiCerr << "try: gi1, gi2: " << *gi1 << " " << *gi2 << NcbiEndl;
94  if (*gi1==*gi2)
95  {
96  if(PrintDetails()) NcbiCerr << "giMatch!!! : gi1, gi2: " << *gi1 << " " << *gi2 << NcbiEndl;
97  return true;
98  }
99  }
101  }
103  if(PrintDetails()) NcbiCerr << "giMatch oops" << NcbiEndl;
104 
105  return result;
106 }
107 
108 void CReadBlastApp::dumpAlignment( const string& alignment, const string& file)
109 {
110  ofstream out(file.c_str());
111  out<< alignment;
112 }
113 
115 {
116  // open out2 here
117  string fn = GetArgs()["in"].AsString();
118  string::size_type ipos=fn.rfind(".");
119  if(ipos!=string::npos) fn.erase(ipos);
120  ipos=fn.rfind("/");
121  fn.erase(0,ipos+1);
122  string fncdd = fn + "_cdd_html.fsa";
123  fn+= "_html.fsa";
124 
126  CScope scope(*objmgr);
127 
128  auto_ptr<CNcbiOfstream> out2 (new CNcbiOfstream (fn.c_str()));
129  CFastaOstream fasta_out(*out2);
130 
131  auto_ptr<CNcbiOfstream> out2_cdd ( new CNcbiOfstream (fncdd.c_str()));
132  CFastaOstream fasta_out_cdd(*out2_cdd);
133 
134  for (CTypeConstIterator<CBioseq> seq = ConstBegin(); seq; ++seq)
135  {
136  if( hasProblems(*seq, diag, eFrameShift )||
137  hasProblems(*seq, diag, ePartial)
138  )
139  {
140  CBioseq_Handle handle = scope.AddBioseq(*seq);
141  if( hasProblems(*seq, diag, eFrameShift )) fasta_out.Write(handle);
142  if( hasProblems(*seq, diag, ePartial)) fasta_out_cdd.Write(handle);
143  }
144  }
145 
146 }
147 
149  (
150  distanceReportStr *report,
151  ostream& out
152  )
153 {
154  out << "q1 | lcl | " << report->q_id_left << " | " << report->q_name_left << NcbiEndl;
155  out << "q2 | lcl | " << report->q_id_right << " | " << report->q_name_right << NcbiEndl;
156  out << report->q_id_left
157  << (report->left_strand == eNa_strand_plus ? "(+)" : "(-)" )
158  << "\t"
159  << report->q_id_right
160  << (report->right_strand == eNa_strand_plus ? "(+)" : "(-)" )
161  << NcbiEndl;
162  string range1 = report->loc1.Empty()
163  ? printed_range(report->q_loc_left_from, report->q_loc_left_to)
164  : printed_ranges(*(report->loc1));
165  string range2 = report->loc2.Empty()
166  ? printed_range(report->q_loc_right_from, report->q_loc_right_to)
167  : printed_ranges(*(report->loc2));
168  out
169  << "[" << range1 << "] " << report->left_frame << "\t\t"
170  << "[" << range2 << "] " << report->right_frame
171  << NcbiEndl;
172  out << NcbiEndl;
173 
174  out << "Overlap: " << report->space << " aa" << NcbiEndl;
175 }
176 
178  (
179  const perfectHitStr& hit,
180  ostream& out
181  )
182 {
183 
184  out << "Perfect hit: " << hit.s_name << NcbiEndl;
185 
186 }
187 
189 {
190  out << "General info extracted from the header of the submission:" << NcbiEndl;
191  out << NcbiEndl;
192  out << "reldate: ";
193  if ( IsSubmit() && m_Submit.GetSub().CanGetReldate() )
194  {
195  string date;
196  m_Submit.GetSub().GetReldate().GetDate(&date);
197  out << date.c_str();
198  }
199  else
200  out << "unspecified";
201  out << NcbiEndl;
202 
203 
204 // simple texts
205  out << "tool: ";
206  if ( IsSubmit() && m_Submit.GetSub().CanGetTool() )
207  out << m_Submit.GetSub().GetTool();
208  else
209  out << "unspecified";
210  out << NcbiEndl;
211 
212  out << "user_tag: ";
213  if ( IsSubmit() && m_Submit.GetSub().CanGetUser_tag() )
214  out << m_Submit.GetSub().GetUser_tag();
215  else
216  out << "unspecified";
217  out << NcbiEndl;
218 
219  out << "comment: ";
220  if ( IsSubmit() && m_Submit.GetSub().CanGetComment() )
221  out << m_Submit.GetSub().GetComment();
222  else
223  out << "unspecified";
224  out << NcbiEndl;
225 
226 
227  out << NcbiEndl;
228 }
229 
231  (
232  distanceReportStr *report,
233  ostream& out
234  )
235 {
236  out << "q1 | lcl | " << report->q_id_left << " | " << report->q_name_left << NcbiEndl;
237  out << "q2 | lcl | " << report->q_id_right << " | " << report->q_name_right << NcbiEndl;
238  out << "s | " << report->s_name << NcbiEndl;
239  out << report->q_id_left
240  << (report->left_strand == eNa_strand_plus ? "(+)" : "(-)" )
241  << "\t"
242  << report->q_id_right
243  << (report->right_strand == eNa_strand_plus ? "(+)" : "(-)" )
244  << NcbiEndl;
245  out
246  << "[" << report->q_loc_left_from << "..." << report->q_loc_left_to << "]\t\t"
247  << "[" << report->q_loc_right_from << "..." << report->q_loc_right_to << "]"
248  << NcbiEndl;
249  out << NcbiEndl;
250  out
251  << "q\t" << report->q_left_left << "\t" << report->q_left_middle << "\t" << report->q_left_right
252  << "\t" << report->space
253  << "\t" << report->q_right_left << "\t" << report->q_right_middle << "\t" << report->q_right_right
254  << NcbiEndl;
255  out
256  << "s\t" << report->s_left_left << "\t" << report->s_left_middle << "\t" << "..."
257  << "\t" << "..."
258  << "\t" << "..." << "\t" << report->s_left_right << "\t" << "..."
259  << NcbiEndl;
260  out
261  << "s\t" << "..." << "\t" << report->s_right_left << "\t" << "..."
262  << "\t" << "..."
263  << "\t" << "..." << "\t" << report->s_right_middle << "\t" << report->s_right_right
264  << NcbiEndl;
265  out << "diff_left, diff_right: " << report->diff_left << ", " << report->diff_right << NcbiEndl;
266  out << "diff_edge_left, diff_edge_right: " << report->diff_edge_left << ", " << report->diff_edge_right << NcbiEndl;
267  if(report->diff_edge_left > 0)
268  {
269  out << "Potential deletetion of a nucleotide sequence equivalent to " << report->diff_edge_left << " occurred." << NcbiEndl;
270  }
271  else if (report->diff_edge_left < 0)
272  {
273  if ( -report->diff_edge_left < report->space )
274  out << "Potential insertion of a nucleotide sequence equivalent to " << -report->diff_edge_left << " occurred." << NcbiEndl;
275  else
276  out << "Subjects are shifted apart by " << -report->diff_edge_left << NcbiEndl;
277  }
278  else
279  {
280  out << "Potential sequencing error or replacement mutation without insertion or deletion of a nucleotide sequence occurred." << NcbiEndl;
281  }
282 
283 
284 }
285 
287  (
288  const LocMap& loc_map,
289  CSeq_annot::C_Data::TFtable& rna_feats,
290  const CSeq_annot::C_Data::TFtable& feats
291  )
292 {
293  if(PrintDetails()) NcbiCerr << "GetRNAfeats() starts" << NcbiEndl;
294  IncreaseVerbosity();
296  {
297  if(PrintDetails()) NcbiCerr << "GetRNAfeats(): feature start " << NcbiEndl;
298  if( !(*f)->GetData().IsRna() ) continue;
299  if(PrintDetails()) NcbiCerr << "GetRNAfeats(): feature is RNA" << NcbiEndl;
300  string loc_string = GetLocationString( (*f)->GetLocation() );
301  if(PrintDetails()) NcbiCerr << "GetRNAfeats(): ["
302  << loc_string
303  << "] feature location" << NcbiEndl;
304  LocMap::const_iterator aname = loc_map.find(loc_string);
305  if(aname == loc_map.end())
306  {
307  // error
308  NcbiCerr << "CReadBlastApp::GetRNAfeats(): ERROR: cannot find gene for location "
309  << loc_string
310  << NcbiEndl;
311  continue;
312  }
313  rna_feats.push_back(aname->second);
314  if(PrintDetails()) NcbiCerr << "GetRNAfeats(): ["
315  << aname->first
316  << "] feature end" << NcbiEndl;
317  }
318  DecreaseVerbosity();
319  if(PrintDetails()) NcbiCerr << "GetRNAfeats(): rna_feats size: "
320  << rna_feats.size()
321  << NcbiEndl;
322  if(PrintDetails()) NcbiCerr << "GetRNAfeats() ends" << NcbiEndl;
323 
324  return;
325 }
326 
327 
328 string GetLocusTag(const CSeq_feat& f, const LocMap& loc_map)
329 {
330  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: start" << NcbiEndl;
331  string loc_string = GetLocationString(f.GetLocation());
332  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: I" << NcbiEndl;
333  LocMap::const_iterator aname = loc_map.find(loc_string);
334  if(aname == loc_map.end())
335  {
336  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: not in locmap" << NcbiEndl;
337  return loc_string;
338  }
339  else
340  {
341  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: in locmap" << NcbiEndl;
342  const CGene_ref& gene=aname->second->GetData().GetGene();
343  string qname="";
344  if(gene.IsSetLocus_tag())
345  {
346  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: getting it" << NcbiEndl;
347  return gene.GetLocus_tag();
348  }
349 
350  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: not getting it" << NcbiEndl;
351  return qname;
352  }
353 
354  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: INTERNAL ERROR: should be never here" << NcbiEndl;
355  return loc_string;
356 }
358 {
359  const CBioseq_set::TSeq_set* seqset = get_parent_seqset(seq);
360  const CSeq_loc& loc = getGenomicLocation(seq);
362  const CSeq_id& nu_id = inter->GetId();
363 
364  ITERATE(CBioseq_set::TSeq_set, seq_nu, *seqset)
365  {
366  if(!(*seq_nu)->IsSeq()) continue;
367  if(!(*seq_nu)->GetSeq().IsNa()) continue;
368  const CBioseq::TId& nu_ids = (*seq_nu)->GetSeq().GetId();
369  ITERATE(CBioseq::TId, cnu_id , nu_ids)
370  {
371  if((*cnu_id)->Compare(nu_id) == CSeq_id::e_YES) return (*seq_nu)->GetSeq();
372  }
373  }
374  NcbiCerr << "CReadBlastApp::get_nucleotide_seq: INTERNAL FATAL ERROR: could not find nucleotide sequence" << NcbiEndl;
375  throw;
376  return seq; // formality
377 }
378 
380 {
381  string name = GetStringDescr(seq);
382  CSeq_entry* parent = seq.GetParentEntry();
383  if(parent==NULL)
384  {
385  NcbiCerr << "get_parent_seqset: WARNING: " << name << ": no parent\n";
386  return NULL;
387  }
388  if(parent->IsSeq())
389  {
390  while(parent!=NULL && !parent->IsSet())
391  parent = parent->GetParentEntry();
392  if(parent==NULL)
393  {
394  NcbiCerr << "get_parent_seqset: WARNING: " << name << ": no set ancestor\n";
395  return NULL;
396  }
397  }
398  else if(!parent->IsSet()) return NULL;
399 
400  if(!parent->GetSet().CanGetSeq_set())
401  {
402  NcbiCerr << "get_parent_seqset: WARNING: " << name << ": (grand)parent set does not have Seq_set\n";
403  return NULL;
404  }
405  return &(parent->SetSet().SetSeq_set());
406 }
407 
408 
409 string let1_2_let3(char let1)
410 {
411  switch(let1)
412  {
413  case 'A': return "Ala"; break;
414  case 'Q': return "Gln"; break;
415  case 'W': return "Trp"; break;
416  case 'E': return "Glu"; break;
417  case 'R': return "Arg"; break;
418  case 'T': return "Thr"; break;
419  case 'Y': return "Tyr"; break;
420  case 'U': return "Sec"; break;
421  case 'I': return "Ile"; break;
422  case 'J': return "Xeu"; break; // Ile or Leu, there is no conventional three-letter code
423  case 'P': return "Pro"; break;
424  case 'S': return "Ser"; break;
425  case 'D': return "Asp"; break;
426  case 'F': return "Phe"; break;
427  case 'G': return "Gly"; break;
428  case 'H': return "His"; break;
429  case 'K': return "Lys"; break;
430  case 'L': return "Leu"; break;
431  case 'Z': return "Glx"; break;
432  case 'X': return "Ukn"; break;
433  case 'C': return "Cys"; break;
434  case 'V': return "Val"; break;
435  case 'B': return "Asx"; break;
436  case 'N': return "Asn"; break;
437  case 'M': return "Met"; break;
438  case 'O': return "Pyl"; break;
439  default: break;
440  }
441  NcbiCerr << "let1_2_let3: ERROR: char " << let1 << "(" << (int)let1 << ") is not handled" << NcbiEndl;
442 
443  return "";
444 }
445 
446 int CReadBlastApp::collectPerfectHits(vector<perfectHitStr>& perfect, const CBioseq& seq)
447 {
448  ITERATE(CBioseq::TAnnot, annot, seq.GetAnnot())
449  {
450  if(!(*annot)->GetData().IsAlign()) continue;
451  check_alignment(annot, seq, perfect);
452  }
453  return perfect.size();
454 }
455 
457  (
458  CBioseq::TAnnot::const_iterator& annot,
459  const CBioseq& seq,
460  vector<perfectHitStr>& results
461  )
462 {
463  int sLen = getLenScore(annot);
464  int qLen = getQueryLen(seq);
465  string q_name = GetProtName(seq);
466 
467  string s_name = getAnnotName(annot);
468 
469  int qFrom, qTo, sFrom, sTo;
470  getBounds(annot, &qFrom, &qTo, &sFrom, &sTo);
471 
472  bool result=false;
473 
474  int qtails = qLen - qTo + qFrom - 1;
475  int stails = sLen - sTo + sFrom - 1;
476  double thr = m_small_tails_threshold; // default should be .1
477  result =
478  (double)qtails/qLen < thr &&
479  (double)stails/sLen < thr &&
480  s_name.find("hypothetical") == string::npos;
481 // hypothetical exhonerating hits do not count
482 
483  if(result)
484  {
485  results.resize(results.size()+1);
486  int ihit = results.size()-1;
487  results[ihit].s_name = s_name;
488  }
489 }
490 
492 {
493  bool result;
494  // if(PrintDetails()) NcbiCerr << "is_prot_entry?" << NcbiEndl;
495  result=seq.GetInst().IsAa();
496  // if(PrintDetails()) NcbiCerr << result << NcbiEndl;
497  return result;
498 }
499 
500 
502 {
503  if(PrintDetails()) NcbiCerr << "SortSeqs: start " << NcbiEndl;
504  PushVerbosity();
505  if(IsSubmit())
506  {
507  if(PrintDetails()) NcbiCerr << "SortSeqs: IsSubmit" << NcbiEndl;
508  if (
510  &&
511  (*m_Submit.GetData().GetEntrys().begin())->IsSet()
512  )
513  {
514  SortSeqs((*m_Submit.SetData().SetEntrys().begin())->SetSet().SetSeq_set());
515  }
516  else
517  {
518  NcbiCerr << "ERROR: submit file does not have proper seqset" << NcbiEndl;
519  }
520  }
521  else
522  {
523  if(PrintDetails()) NcbiCerr << "SortSeqs: IsEntry" << NcbiEndl;
525  }
526 
527  PopVerbosity();
528  if(PrintDetails()) NcbiCerr << "SortSeqs: end" << NcbiEndl;
529  return -1;
530 }
531 
532 
533 
535 {
536 /*
537  NON_CONST_ITERATE( CBioseq_set::TSeq_set, left, where)
538  {
539  if((*left)->IsSet())
540  {
541  CBioseq_set::TSeq_set& seqs_down = (*left)->SetSet().SetSeq_set();
542  SetParents(parent, seqs_down);
543  }
544  else
545  {
546  (*left)->SetParentEntry(parent);
547  }
548  }
549 */
550  return 0;
551 }
552 
553 // TSimpleSeq
554 
556  const TSimpleSeq& second)
557 {
558  return first.key < second.key;
559 } // less_seq
560 
561 bool CReadBlastApp::less_pair(const pair<int,int>& first,
562  const pair<int,int>& second)
563 {
564  if(first.first != second.first) return first.first < second.first;
565  return first.second < second.second;
566 }
567 
568 
569 
571  const CRef<CSeq_entry>& second)
572 {
573  // if(PrintDetails()) NcbiCerr << "less_seq start" << NcbiEndl;
574  if (first->IsSet() || second->IsSet())
575  {
576  return false;
577  }
578  // if(PrintDetails()) NcbiCerr << "less_seq both seqs" << NcbiEndl;
579  if (!hasGenomicLocation(first->GetSeq()))
580  {
581  if(first->GetSeq().IsAa())
582  NcbiCerr << "less_seq first does not have genomic location or is nucleotide seq" << NcbiEndl;
583  return true; // to take care of nucleotide sequence
584  }
585  if (!hasGenomicLocation(second->GetSeq()))
586  {
587  if(second->GetSeq().IsAa())
588  NcbiCerr << "less_seq second does not have genomic location or is nucleotide seq" << NcbiEndl;
589  return false;
590  }
591  // if(PrintDetails()) NcbiCerr << "less_seq both have genomic locations" << NcbiEndl;
592  const CSeq_loc& left_genomic_int = getGenomicLocation(first->GetSeq());
593  const CSeq_loc& right_genomic_int = getGenomicLocation(second->GetSeq());
594  TSeqPos from1, to1, from2, to2;
595 
596  ENa_strand strand1;
597  ENa_strand strand2;
598 
599  getFromTo(left_genomic_int, from1, to1, strand1);
600  getFromTo(right_genomic_int, from2, to2, strand2);
601 /*
602  if(PrintDetails()) NcbiCerr << "less_seq comparing"
603  << ": " << GetStringDescr( first->GetSeq()) << " = " << from1 << "-" << to1
604  << ", " << GetStringDescr(second->GetSeq()) << " = " << from2 << "-" << to2
605  << NcbiEndl;
606 */
607  return from1 < from2;
608 } // less_seq
609 
611 {
612  if(PrintDetails()) NcbiCerr << "SortSeqs(CBioseq_set::TSeq_set& seqs): start"
613  << ": " << seqs.size()
614  << NcbiEndl;
615 
617  {
618  if ((*seq)->IsSet()) SortSeqs((*seq)->SetSet().SetSeq_set());
619  }
620 
621  seqs.sort(&less_seq); // does not do anything
622  int n =1;
623 
624  if(PrintDetails()) NcbiCerr << "SortSeqs(CBioseq_set::TSeq_set& seqs): stop" << NcbiEndl;
625  return n;
626 }
627 
629 {
630  char buffer[1024];
631  in.getline(buffer, 1024);
632  in.seekg(0);
633 
634  string result = buffer;
635  result.erase(result.find_first_of(": "));
636 
637  if(result=="Seq-entry")
638  {
639  result = buffer;
640  if(result.find("set") != string::npos)
641  return eEntry;
642  return eUndefined;
643  }
644  if(result=="Seq-submit") return eSubmit;
645  if(NStr::StartsWith(result, ">Feature")) return eTbl;
646  return eUndefined;
647 }
648 
650 {
651  return m_coreDataType == eSubmit;
652 }
653 
655 {
656  return m_coreDataType == eEntry;
657 }
658 
660 {
661  return m_coreDataType == eTbl;
662 }
663 
664 
666 {
669 
670  return ::Begin(m_Submit); // some platforms might have warnings without it
671 }
672 
674 {
677 
678  return ::ConstBegin(m_Submit); // some platforms might have warnings without it
679 }
680 
681 
682 char *CReadBlastApp::next_w(char *w)
683 {
684  while(*w && !isspace(*w)) ++w;
685  while(isspace(*w)) ++w;
686  return(w);
687 
688 }
690 {
691  while(*w && isspace(*w)) ++w;
692  return(w);
693 
694 }
696 {
697  int nums=0;
698  for(nums=0 ;seq.IsValid(); ++seq, nums++)
699  {
700  const CSeq_inst& inst = seq->GetInst();
701  if(!inst.IsAa()) continue;
702  break;
703  }
704  return nums;
705 }
706 
708 {
709  int nums=0;
710  for(nums=0 ;seq; ++seq, nums++)
711  {
712  const CSeq_inst& inst = seq->GetInst();
713  if(!inst.IsAa()) continue;
714  break;
715  }
716  return nums;
717 }
718 
719 bool CReadBlastApp::skip_toprot(CBioseq_set::TSeq_set::const_iterator& seq,
720  const CBioseq_set::TSeq_set& seqs)
721 {
722 
724  while( seq != seqs.end() &&
725  (
726  (*seq)->IsSet() || // set of sequences, skip
727  !is_prot_entry((*seq)->GetSeq()) // not a protein, skip
728  )
729  )
730  {
731  if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
732  // << CSeq_id::GetStringDescr ((*seq)->GetSeq(), CSeq_id::eFormat_FastA) << NcbiEndl;
733  << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
734  ++seq;
735  }
737 
738  return (seq != seqs.end()) ;
739 
740 }
741 
742 
743 bool CReadBlastApp::skip_toprot(CBioseq_set::TSeq_set::iterator& seq,
744  CBioseq_set::TSeq_set& seqs)
745 {
746 
748  while( seq != seqs.end() &&
749  (
750  (*seq)->IsSet() || // set of sequences, skip
751  !is_prot_entry((*seq)->GetSeq()) // not a protein, skip
752  )
753  )
754  {
755  if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
756  << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
757  ++seq;
758  }
760 
761  return (seq != seqs.end()) ;
762 
763 }
764 
765 
767  CBioseq_set::TSeq_set::const_iterator& seq,
768  const CBioseq_set::TSeq_set& seqs)
769 {
771  while( seq != seqs.end() &&
772  (
773  (*seq)->IsSet() || // set of sequences, skip
774  !is_prot_entry((*seq)->GetSeq()) || // not a protein, skip
775  !has_blast_hits((*seq)->GetSeq()) // does not have a blast , skip
776  )
777  )
778  {
779  if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
780  << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
781  ++seq;
782  }
784 
785  return (seq != seqs.end()) ;
786 }
787 
789  CBioseq_set::TSeq_set::iterator& seq,
790  CBioseq_set::TSeq_set& seqs)
791 {
793  while( seq != seqs.end() &&
794  (
795  (*seq)->IsSet() || // set of sequences, skip
796  !is_prot_entry((*seq)->GetSeq()) || // not a protein, skip
797  !has_blast_hits((*seq)->GetSeq()) // does not have a blast , skip
798  )
799  )
800  {
801  if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
802  << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
803  ++seq;
804  }
806 
807  return (seq != seqs.end()) ;
808 }
809 
810 // ReadPreviousAcc(args["parentacc"].AsString(), m_previous_genome)
811 bool CReadBlastApp::ReadPreviousAcc(const string& file, list<long>& input_acc)
812 {
813  const unsigned int max_acc_len=0xF;
814  ifstream in(file.c_str());
815  if(!in.is_open()) return false; // the input parameter is not a file
816  list<long> scratch_acc;
817  while(in.good())
818  {
819  char buffer[max_acc_len+2];
820  in.getline(buffer, max_acc_len+1);
821 /*
822  if(buffer[0] < 'A' || buffer[0] > 'Z' ||
823  buffer[1] < 'A' || buffer[1] > 'Z' ||
824  buffer[2] != '_') return false;
825 */
826  string test(buffer);
827 // remove white spaces
828  string::size_type ipos=string::npos;
829  while((ipos=test.find_first_of("\t\n\r ")) != string::npos)
830  test.erase(ipos,1);
831 // too long, may be it is a wrong file
832  if(test.size()>max_acc_len) return false;
833 // has non-digits, go away
834  if(test.find_first_not_of("0123456789") != string::npos) return false;
835 // accept
836  scratch_acc.push_back(atol(test.c_str()));
837  }
838  if(!scratch_acc.size()) return false;
839  input_acc = scratch_acc;
840  return true;
841 }
842 
843 /////////////////////////////////////////////////////////////////////////////
844 // MAIN
845 
846 
847 int main(int argc, const char* argv[])
848 {
849  // Execute main application function
850  return CReadBlastApp().AppMain(argc, argv, 0, eDS_Default, 0);
851 }
852 
853 
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:996
CBioseq_Handle –.
#define NcbiCerr
Definition: ncbistre.hpp:399
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:802
string let1_2_let3(char let1)
static void PushVerbosity(void)
const TData & GetData(void) const
Get the Data member data.
static void IncreaseVerbosity(void)
string GetLocusTag(const CSeq_feat &f, const LocMap &loc_map)
int main(int argc, const char *argv[])
ESerialDataFormat s_GetFormat(const string &name)
string printed_ranges(const CSeq_loc &seq_interval)
Definition: shortcuts.cpp:341
static bool has_blast_hits(const CBioseq &seq)
Definition: fit_blast.cpp:368
bool CanGetReldate(void) const
Check if it is safe to call GetReldate method.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:855
static bool less_pair(const pair< int, int > &first, const pair< int, int > &second)
static double m_eThreshold
std::ofstream out("events_result.xml")
main entry point for tests
bool IsAa(void) const
Definition: Bioseq.cpp:350
bool CanGetUser_tag(void) const
Check if it is safe to call GetUser_tag method.
Template class for iteration on objects of class C (non-medifiable version)
Definition: iterator.hpp:762
static void dumpAlignment(const string &alignment, const string &file)
FASTA-format output; see also ReadFasta in
Definition: sequence.hpp:743
static bool giMatch(const vector< long > &left, const vector< long > &right)
ASN.1 text.
Definition: serialdef.hpp:73
CSeq_submit m_Submit
void printGeneralInfo(ostream &out=NcbiCerr)
#define NcbiEndl
Definition: ncbistre.hpp:403
static double m_small_tails_threshold
static bool less_seq(const CRef< CSeq_entry > &first, const CRef< CSeq_entry > &second)
static int collectPerfectHits(vector< perfectHitStr > &perfect, const CBioseq &seq)
const TTool & GetTool(void) const
Get the Tool member data.
const NCBI_NS_NCBI::CEnumeratedTypeValues *ENUM_METHOD_NAME() ENa_strand(void)
Access to ENa_strand's attributes (values, names) as defined in spec.
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq.hpp:174
static int m_rna_overlapThreshold
#define NULL
Definition: ncbistd.hpp:225
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:626
static int skip_toprot(CTypeIterator< CBioseq > &seq)
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
static ECoreDataType getCoreDataType(istream &in)
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:496
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:795
const TReldate & GetReldate(void) const
Get the Reldate member data.
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1107
FILE * file
static bool less_simple_seq(const TSimpleSeq &first, const TSimpleSeq &second)
#define THROW1_TRACE(exception_class, exception_arg)
Throw trace.
Definition: ncbiexpt.hpp:258
static void printPerfectHit(const perfectHitStr &hit, ostream &out=NcbiCout)
list< CRef< CSeq_entry > > TSeq_set
const TSub & GetSub(void) const
Get the Sub member data.
CRef< const CSeq_loc > loc1
void SetData(TData &value)
Assign a value to Data data member.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const_iterator end() const
Definition: map.hpp:152
static void printReport(distanceReportStr *report, ostream &out=NcbiCout)
static bool PrintDetails(int current_verbosity=m_current_verbosity)
static int m_shortProteinThreshold
static int m_cds_overlapThreshold
static int m_verbosity_threshold
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TComment & GetComment(void) const
Get the Comment member data.
bool CanGetSeq_set(void) const
Check if it is safe to call GetSeq_set method.
static bool is_prot_entry(const CBioseq &seq)
const_iterator find(const key_type &key) const
Definition: map.hpp:153
CSeq_entry * GetParentEntry(void) const
Definition: Seq_entry.hpp:131
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static double m_partThreshold
static bool hasProblems(const CBioseq &seq, diagMap &diag, const EProblem type)
Definition: problems.cpp:141
bool ReadPreviousAcc(const string &file, list< long > &input_acc)
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:714
const TUser_tag & GetUser_tag(void) const
Get the User_tag member data.
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
virtual void Write(const CSeq_entry_Handle &handle, const CSeq_loc *location=0)
Unspecified locations designate complete sequences; non-empty custom titles override the usual title ...
Definition: sequence.cpp:2607
static int m_current_verbosity
static double m_trnascan_scoreThreshold
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5141
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
string printed_range(const TSeqPos from2, const TSeqPos to2)
Definition: shortcuts.cpp:320
static void printOverlapReport(distanceReportStr *report, ostream &out=NcbiCout)
static bool hasGenomicLocation(const CBioseq &seq)
Definition: locations.cpp:100
static bool skip_to_valid_seq_cand(CBioseq_set::TSeq_set::const_iterator &seq, const CBioseq_set::TSeq_set &seqs)
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1763
CBioseq_set::TSeq_set * get_parent_seqset(const CBioseq &seq)
#define test(a, b, c, d, e)
Definition: numeric.c:170
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:198
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:748
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
static char * next_w(char *w)
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
static vector< long > getGIs(CBioseq::TAnnot::const_iterator &annot)
static char * skip_space(char *w)
CConstBeginInfo ConstBegin(void)
static void DecreaseVerbosity(void)
Class holding information about root of non-modifiable object hierarchy Do not use it directly...
Definition: iterator.hpp:85
CScope –.
Definition: scope.hpp:90
CBeginInfo Begin(void)
void GetDate(string *label, bool year_only=false) const
Append a standardized string representation of the date to the label.
Definition: Date.hpp:149
Definition: Seq_entry.hpp:55
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
string GetStringDescr(const CBioseq &bioseq)
Definition: shortcuts.cpp:309
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
static bool IsAa(EMol mol)
Definition: Seq_inst.hpp:99
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1153
static const CSeq_loc & getGenomicLocation(const CBioseq &seq)
Definition: locations.cpp:120
Class holding information about root of non-modifiable object hierarchy Do not use it directly...
Definition: iterator.hpp:57
bool IsEntrys(void) const
Check if variant Entrys is selected.
const TEntrys & GetEntrys(void) const
Get the variant data.
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
ECoreDataType m_coreDataType
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
string GetLocationString(const CSeq_feat &f)
Definition: locations.cpp:228
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
yy_size_t n
static void PopVerbosity(void)
bool IsValid(void) const
Definition: iterator.hpp:252
const CBioseq & get_nucleotide_seq(const CBioseq &seq)
namespace ncbi::objects::
Definition: Seq_feat.hpp:56
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
static void check_alignment(CBioseq::TAnnot::const_iterator &annot, const CBioseq &seq, vector< perfectHitStr > &results)
void dump_fasta_for_pretty_blast(diagMap &diag)
int isspace(Uchar c)
Definition: ncbictype.hpp:69
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
const TId & GetId(void) const
Get the Id member data.
std::istream & in(std::istream &in_, double &x_)
static double m_entireThreshold
static stack< int > m_saved_verbosity
bool CanGetTool(void) const
Check if it is safe to call GetTool method.
static void getFromTo(const CSeq_loc &loc, TSeqPos &from, TSeqPos &to, ENa_strand &strand)
Definition: locations.cpp:34
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:318
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:760
CRef< CTestThread > thr[k_NumThreadsMax]
Definition: test_mt.cpp:245
static int result
Definition: cursor5.c:11
bool CanGetComment(void) const
Check if it is safe to call GetComment method.
CRef< const CSeq_loc > loc2
SeqIds compared, but are different.
Definition: Seq_id.hpp:441
static int m_n_best_hit
static uschar * buffer
Definition: pcretest.c:187
int SetParents(CSeq_entry *parent, CBioseq_set::TSeq_set &where)
void GetRNAfeats(const LocMap &loc_map, CSeq_annot::C_Data::TFtable &rna_feats, const CSeq_annot::C_Data::TFtable &feats)
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
Definition: iterator.hpp:1004
ASN.1 binary.
Definition: serialdef.hpp:74
Modified on Sat Feb 24 14:03:01 2018 by modify_doxy.py rev. 546573