NCBI C++ ToolKit
validerror_bioseq.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validerror_bioseq.cpp 80227 2017-11-20 14:23:09Z bollin $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat ......
27  *
28  * File Description:
29  * validation of bioseq
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbitime.hpp>
37 #include <corelib/ncbimisc.hpp>
39 
45 
46 #include <serial/enumvalues.hpp>
47 #include <serial/iterator.hpp>
48 
49 #include <objects/general/Date.hpp>
54 #include <objects/pub/Pub.hpp>
62 #include <objects/seq/Bioseq.hpp>
63 #include <objects/seq/Seq_inst.hpp>
64 #include <objects/seq/MolInfo.hpp>
68 #include <objects/seq/Seq_ext.hpp>
69 #include <objects/seq/Seg_ext.hpp>
70 #include <objects/seq/Seq_hist.hpp>
75 #include <objects/seq/IUPACaa.hpp>
76 #include <objects/seq/IUPACna.hpp>
77 #include <objects/seq/NCBI2na.hpp>
78 #include <objects/seq/NCBI4na.hpp>
79 #include <objects/seq/NCBI8aa.hpp>
80 #include <objects/seq/NCBI8na.hpp>
81 #include <objects/seq/NCBIeaa.hpp>
82 #include <objects/seq/NCBIpaa.hpp>
83 #include <objects/seq/NCBIpna.hpp>
85 #include <objects/seq/GIBB_mol.hpp>
86 #include <objects/seq/Pubdesc.hpp>
87 
96 
99 
102 
107 
109 
110 #include <objmgr/bioseq_ci.hpp>
111 #include <objmgr/seq_descr_ci.hpp>
112 #include <objmgr/feat_ci.hpp>
113 #include <objmgr/graph_ci.hpp>
114 #include <objmgr/scope.hpp>
115 #include <objmgr/seqdesc_ci.hpp>
116 #include <objmgr/seq_vector.hpp>
117 #include <objmgr/seq_vector_ci.hpp>
119 #include <objmgr/util/sequence.hpp>
120 #include <objmgr/util/feature.hpp>
121 #include <objmgr/bioseq_handle.hpp>
123 #include <objmgr/seq_entry_ci.hpp>
124 #include <objmgr/annot_selector.hpp>
127 
128 #include <objtools/error_codes.hpp>
130 
131 #include <algorithm>
132 
133 #include <objmgr/seq_loc_mapper.hpp>
134 
135 #define NCBI_USE_ERRCODE_X Objtools_Validator
136 
141 USING_SCOPE(feature);
142 
143 class CCdsMatchInfo;
144 
// Per-mRNA record used when pairing mRNA features with coding regions.
// Instances are held through CRef (see the list<CRef<CMrnaMatchInfo>>
// parameters on CCdsMatchInfo below).
// NOTE(review): only the declarations are visible here. The private data
// members (including m_IsPseudo, which SetPseudo writes) were on lines that
// are missing from this listing, so the per-method notes describe the
// signatures only -- confirm details against the definitions.
145 class CMrnaMatchInfo : public CObject {
146 public:
    // Built from the mRNA feature and a scope pointer.
147  CMrnaMatchInfo(const CSeq_feat& mrna, CScope* scope);
    // Read-only access to a CSeq_feat (presumably the mRNA passed to the
    // constructor -- confirm).
148  const CSeq_feat& GetSeqfeat(void) const;
    // Tests this record against a coding-region feature.
149  bool Overlaps(const CSeq_feat& cds) const;
    // Associates this record with a CDS match record.
150  void SetMatch(CCdsMatchInfo& match);
151  bool HasMatch(void) const;
    // Stores the pseudo flag; calling with no argument sets it to true.
152  void SetPseudo(bool val = true) { m_IsPseudo = val; }
153  bool OkWithoutCds(void) const;
154 
155 private:
158 
162 };
163 
164 
// Per-CDS record, the counterpart of CMrnaMatchInfo, used when pairing
// coding regions with mRNA features.
// The matching API has two shapes selected by USE_MRNA_MAP: with the macro
// defined the Assign* methods take a TmRNAList (plus a TSE handle or scope);
// without it they take a list of CRef<CMrnaMatchInfo>.
// NOTE(review): TmRNAList is not declared in the visible lines, and most
// private members were on lines missing from this listing -- confirm against
// the full file.
165 class CCdsMatchInfo : public CObject {
166 public:
    // Built from the coding-region feature and a scope pointer.
167  CCdsMatchInfo(const CSeq_feat& cds, CScope* scope);
168  const CSeq_feat& GetSeqfeat(void) const;
    // Tests this record against an mRNA feature.
169  bool Overlaps(const CSeq_feat& mrna) const;
170 #ifdef USE_MRNA_MAP
171  bool AssignXrefMatch(TmRNAList& unmatched_mrnas, const CTSE_Handle& tse);
172  bool AssignOverlapMatch(TmRNAList& unmatched_mrnas, CScope& scope);
173  void UpdateOtherMrnas(const TmRNAList& unmatched_mrnas);
    // Number of entries currently stored in m_OtherMrnas.
174  size_t CountOtherMrnas() { return m_OtherMrnas.size(); }
    // Returns the stored m_ProductsUnique flag.
175  bool AreMrnaProductsUnique() { return m_ProductsUnique; }
176 #else
177  bool AssignXrefMatch(list<CRef<CMrnaMatchInfo>>& unmatched_mrnas);
178  bool AssignOverlapMatch(list<CRef<CMrnaMatchInfo>>& unmatched_mrnas);
179 #endif
    // NOTE(review): declared outside the #ifdef although it uses TmRNAList.
181  bool AssignMatch(TmRNAList& mrna_map, CFeatTree& feat_tree, CScope& scope);
182  bool HasMatch(void) const;
    // Setter / getter pair for the "needs match" flag.
183  void NeedsMatch(bool needs_match);
184  bool NeedsMatch(void) const;
185  const CMrnaMatchInfo& GetMatch(void) const;
186  bool IsPseudo(void) const;
187  void SetPseudo(void);
188 
189 private:
192 
197 #ifdef USE_MRNA_MAP
    // Only present in the USE_MRNA_MAP build; read by the two inline
    // accessors above.
198  list<CConstRef<CSeq_feat> > m_OtherMrnas;
199  bool m_ProductsUnique;
200 #endif
201 };
202 
203 
204 // =============================================================================
205 // Public
206 // =============================================================================
207 
209  CValidError_base(imp), m_AnnotValidator(imp), m_DescrValidator(imp), m_FeatValidator(imp), m_GeneIt(NULL), m_AllFeatIt(NULL)
210 {
211 }
212 
213 
215 {
216 }
217 
218 
220  const CBioseq& seq)
221 {
222  try {
224 
225  CSeq_entry_Handle appropriate_parent;
226  if (m_Imp.ShouldSubdivide()) {
228  }
229  if (appropriate_parent) {
230  CRef<CScope> tmp_scope(new CScope(*(CObjectManager::GetInstance())));
231  tmp_scope->AddDefaults();
232  CSeq_entry_Handle this_seh = tmp_scope->AddTopLevelSeqEntry(*(appropriate_parent.GetCompleteSeq_entry()));
233  m_FeatValidator.SetScope(*tmp_scope);
234  m_FeatValidator.SetTSE(this_seh);
235  } else {
238  }
239 
240  try {
241  CCacheImpl::SFeatKey gene_key(
244  m_GeneIt = &GetCache().GetFeatFromCache(gene_key);
245 
246  CCacheImpl::SFeatKey all_feat_key(
249  m_AllFeatIt = &GetCache().GetFeatFromCache(all_feat_key);
250  } catch ( const exception& ) {
251  // sequence might be too broken to validate features
252  m_GeneIt = NULL;
253  m_AllFeatIt = NULL;
254  }
256  ValidateSeqIds(seq);
257  ValidateInst(seq);
259  ValidatemRNAGene(seq);
260  ValidateHistory(seq);
261  FOR_EACH_ANNOT_ON_BIOSEQ (annot, seq) {
264  }
265  if (seq.IsSetDescr()) {
266  if (m_CurrentHandle) {
268  if (ctx) {
270  }
271  }
272  }
273  if (IsWGSMaster(seq, m_CurrentHandle.GetScope())) {
275  }
276  if (appropriate_parent) {
279  }
280 
281  } catch ( const exception& e ) {
283  string("Exception while validating bioseq. EXCEPTION: ") +
284  e.what(), seq);
285  }
287  if (m_GeneIt) {
288  m_GeneIt = NULL;
289  }
290  if (m_AllFeatIt) {
291  m_AllFeatIt = NULL;
292  }
293 }
294 
295 
296 static bool s_IsSkippableDbtag (const CDbtag& dbt)
297 {
298  if (!dbt.IsSetDb()) {
299  return false;
300  }
301  const string& db = dbt.GetDb();
302  if (NStr::EqualNocase(db, "TMSMART")
303  || NStr::EqualNocase(db, "BankIt")
304  || NStr::EqualNocase (db, "NCBIFILE")) {
305  return true;
306  } else {
307  return false;
308  }
309 }
310 
// Looks for the first character of `id` that is not allowed in a Seq-id
// string ('|' or ','). Returns that character, or '\0' when none is present.
static char CheckForBadSeqIdChars (const string& id)
{
    const string::size_type bad_pos = id.find_first_of("|,");
    return (bad_pos == string::npos) ? '\0' : id[bad_pos];
}
320 
321 // VR-748
322 static char CheckForBadLocalIdChars(const string& id)
323 {
324  for (size_t i = 0; i < id.length(); i++) {
325  if (!CSeq_id::IsValidLocalID(id.substr(i, 1))) {
326  return id.c_str()[i];
327  }
328  }
329  return '\0';
330 }
331 
332 
// Variant of the bad-character scan used for file-style IDs (applied to
// "NCBIFILE" and "BankIt" general IDs by the caller in this file).
// Returns the first '|' or ',' found in `id`, or '\0' when there is none.
static char CheckForBadFileIDSeqIdChars(const string& id)
{
    for (const char ch : id) {
        switch (ch) {
        case '|':
        case ',':
            return ch;
        default:
            break;
        }
    }
    return '\0';
}
341 
342 
343 // validation for individual Seq-id
345 {
346  // see if ID can be used to find ctx
348  if (bsh) {
350  if ( !core ) {
351  if ( !m_Imp.IsPatent() ) {
353  "BioseqFind (" + id.AsFastaString() +
354  ") unable to find itself - possible internal error", ctx);
355  }
356  } else if ( core.GetPointer() != &ctx ) {
358  "SeqID " + id.AsFastaString() +
359  " is present on multiple Bioseqs in record", ctx);
360  }
361  } else {
363  "BioseqFind (" + id.AsFastaString() +
364  ") unable to find itself - possible internal error", ctx);
365  }
366 
367  //check formatting
368  const CTextseq_id* tsid = id.GetTextseq_Id();
369 
370  switch (id.Which()) {
371  case CSeq_id::e_Tpg:
372  case CSeq_id::e_Tpe:
373  case CSeq_id::e_Tpd:
374  if ( IsHistAssemblyMissing(ctx) && ctx.IsNa() ) {
376  "TPA record " + ctx.GetId().front()->AsFastaString() +
377  " should have Seq-hist.assembly for PRIMARY block",
378  ctx);
379  }
380  // Fall thru
381  case CSeq_id::e_Genbank:
382  case CSeq_id::e_Embl:
383  case CSeq_id::e_Ddbj:
384  if ( tsid && tsid->IsSetAccession() ) {
385  const string& acc = tsid->GetAccession();
386  const char badch = CheckForBadSeqIdChars (acc);
387  if (badch != '\0') {
389  "Bad character '" + string(1, badch) + "' in accession '" + acc + "'", ctx);
390  }
391  unsigned int num_digits = 0;
392  unsigned int num_letters = 0;
393  size_t num_underscores = 0;
394  bool internal_S = false;
395  bool letter_after_digit = false;
396  bool bad_id_chars = false;
397  size_t i = 0;
398 
399  for ( ; i < acc.length(); ++i ) {
400  if ( isupper((unsigned char) acc[i]) ) {
401  num_letters++;
402  if ( num_digits > 0 || num_underscores > 1 ) {
403  if (acc[i] == 'S' && num_letters == 5 && num_digits == 2 && (! internal_S)) {
404  num_letters--;
405  internal_S = true;
406  } else {
407  letter_after_digit = true;
408  }
409  }
410  } else if ( isdigit((unsigned char) acc[i]) ) {
411  num_digits++;
412  } else if ( acc[i] == '_' ) {
413  num_underscores++;
414  if ( num_digits > 0 || num_underscores > 1 ) {
415  letter_after_digit = true;
416  }
417  } else {
418  bad_id_chars = true;
419  }
420  }
421 
422 
423  if ( letter_after_digit || bad_id_chars ) {
425  "Bad accession " + acc, ctx);
426  } else if (num_underscores == 1) {
427  if (NStr::CompareNocase(acc, 0, 4, "MAP_") != 0 || num_digits != 6) {
429  "Bad accession " + acc, ctx);
430  }
431  } else if (num_letters == 1 && num_digits == 5 && ctx.IsNa()) {
432  } else if (num_letters == 2 && num_digits == 6 && ctx.IsNa()) {
433  } else if (num_letters == 3 && num_digits == 5 && ctx.IsAa()) {
434  } else if (num_letters == 2 && num_digits == 6 && ctx.IsAa() &&
435  ctx.GetInst().GetRepr() == CSeq_inst::eRepr_seg) {
436  } else if (num_letters == 4 && internal_S &&
437  (num_digits == 8 || num_digits == 9 || num_digits == 10) && ctx.IsNa()) {
438  } else if ( num_letters == 4 &&
439  (num_digits == 8 || num_digits == 9) &&
440  ctx.IsNa() ) {
441  } else if (num_letters == 4 && num_digits == 10 && ctx.IsNa()) {
442  } else if (num_letters == 5 && num_digits == 7 && ctx.IsNa()) {
443  } else {
445  "Bad accession " + acc, ctx);
446  }
447  // Check for secondary conflicts
450  }
451  // Fall thru
452  case CSeq_id::e_Other:
453  if ( tsid ) {
454  if ( tsid->IsSetName() ) {
455  const string& name = tsid->GetName();
456  ITERATE (string, s, name) {
457  if (isspace((unsigned char)(*s))) {
460  "Seq-id.name '" + name + "' should be a single "
461  "word without any spaces", ctx);
462  break;
463  }
464  }
465  }
466 
467  if ( tsid->IsSetAccession() && id.IsOther() ) {
468  const string& acc = tsid->GetAccession();
469  const char badch = CheckForBadSeqIdChars (acc);
470  if (badch != '\0') {
472  "Bad character '" + string(1, badch) + "' in accession '" + acc + "'", ctx);
473  }
474  size_t num_letters = 0;
475  size_t num_digits = 0;
476  size_t num_underscores = 0;
477  bool bad_id_chars = false;
478  bool is_NZ = (NStr::CompareNocase(acc, 0, 3, "NZ_") == 0);
479  size_t i = 0;
480  bool letter_after_digit = false;
481 
482  if ( is_NZ ) {
483  i = 3;
484  }
485 
486  for ( ; i < acc.length(); ++i ) {
487  if ( isupper((unsigned char) acc[i]) ) {
488  num_letters++;
489  } else if ( isdigit((unsigned char) acc[i]) ) {
490  num_digits++;
491  } else if ( acc[i] == '_' ) {
492  num_underscores++;
493  if ( num_digits > 0 || num_underscores > 1 ) {
494  letter_after_digit = true;
495  }
496  } else {
497  bad_id_chars = true;
498  }
499  }
500 
501  if ( letter_after_digit || bad_id_chars ) {
503  "Bad accession " + acc, ctx);
504  } else if ( is_NZ && num_letters == 4 &&
505  ( num_digits == 8 || num_digits == 9 ) && num_underscores == 0 ) {
506  // valid accession - do nothing!
507  } else if ( is_NZ && ValidateAccessionString (acc, false) == eAccessionFormat_valid ) {
508  // valid accession - do nothing!
509  } else if ( num_letters == 2 &&
510  (num_digits == 6 || num_digits == 8 || num_digits == 9) &&
511  num_underscores == 1 ) {
512  // valid accession - do nothing!
513  } else if (num_letters == 4 && num_digits == 10 && ctx.IsNa()) {
514  } else {
516  "Bad accession " + acc, ctx);
517  }
518  }
519  }
520  // Fall thru
521  case CSeq_id::e_Pir:
523  case CSeq_id::e_Prf:
524  if ( tsid ) {
525  if ( ctx.IsNa() &&
526  (!tsid->IsSetAccession() || tsid->GetAccession().empty())) {
527  if ( ctx.GetInst().GetRepr() != CSeq_inst::eRepr_seg ||
528  m_Imp.IsGI()) {
529  if (!id.IsDdbj() ||
530  ctx.GetInst().GetRepr() != CSeq_inst::eRepr_seg) {
531  string msg = "Missing accession for " + id.AsFastaString();
534  msg, ctx);
535  }
536  }
537  }
538  } else {
540  "Seq-id type not handled", ctx);
541  }
542  break;
543  case CSeq_id::e_Gi:
544  if (id.GetGi() <= ZERO_GI) {
546  "Invalid GI number", ctx);
547  }
548  break;
549  case CSeq_id::e_General:
550  if (!id.GetGeneral().IsSetDb() || NStr::IsBlank(id.GetGeneral().GetDb())) {
551  PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, "General identifier missing database field", ctx);
552  }
553  if (id.GetGeneral().IsSetDb()) {
554  const CDbtag& dbt = id.GetGeneral();
555  size_t dblen = dbt.GetDb().length();
556  EDiagSev sev = eDiag_Error;
557  if (m_Imp.IsLocalGeneralOnly()) {
558  sev = eDiag_Critical;
559  } else if (m_Imp.IsRefSeq()) {
560  sev = eDiag_Error;
561  } else if (m_Imp.IsINSDInSep()) {
562  sev = eDiag_Error;
563  } else if (m_Imp.IsIndexerVersion()) {
564  sev = eDiag_Error;
565  }
566  if (dblen > 20) {
567  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "General database longer than 20 characters", ctx);
568  }
569  if (! s_IsSkippableDbtag(dbt)) {
570  if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
571  size_t idlen = dbt.GetTag().GetStr().length();
572  if (idlen > 64) {
573  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "General identifier longer than 64 characters", ctx);
574  }
575  }
576  }
577  if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
578  const string& acc = dbt.GetTag().GetStr();
579  char badch;
580  if (dbt.IsSetDb() && (NStr::Equal(dbt.GetDb(), "NCBIFILE") || NStr::Equal(dbt.GetDb(), "BankIt"))) {
581  badch = CheckForBadFileIDSeqIdChars(acc);
582  } else {
583  badch = CheckForBadSeqIdChars(acc);
584  }
585  if (badch != '\0') {
587  "Bad character '" + string(1, badch) + "' in accession '" + acc + "'", ctx);
588  }
589  }
590  }
591  break;
592  case CSeq_id::e_Local:
593  if (id.IsLocal() && id.GetLocal().IsStr() && id.GetLocal().GetStr().length() > 50) {
594  EDiagSev sev = eDiag_Error;
595  if (! m_Imp.IsINSDInSep()) {
596  sev = eDiag_Critical;
597  } else if (! m_Imp.IsIndexerVersion()) {
598  sev = eDiag_Error;
599  }
600  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "Local identifier longer than 50 characters", ctx);
601  }
602  if (id.IsLocal() && id.GetLocal().IsStr()) {
603  const string& acc = id.GetLocal().GetStr();
604  const char badch = CheckForBadLocalIdChars(acc);
605  if (badch != '\0') {
607  "Bad character '" + string(1, badch) + "' in local ID '" + acc + "'", ctx);
608  }
609  }
610  break;
611  case CSeq_id::e_Pdb:
612  if (id.IsPdb()) {
613  const CPDB_seq_id& pdb = id.GetPdb();
614  if (pdb.IsSetChain() && pdb.IsSetChain_id()) {
616  "PDB Seq-id contains both \'chain\' and \'chain-id\' slots", ctx);
617  }
618  }
619  break;
620  default:
621  break;
622  }
623 
624 #if 0
625  // disabled for now
626  if (!IsNCBIFILESeqId(**i)) {
627  string label;
628  (*i)->GetLabel(&label);
629  if (label.length() > 40) {
631  "Sequence ID is unusually long (" +
632  NStr::IntToString(label.length()) + "): " + label,
633  seq);
634  }
635  }
636 #endif
637 
638 }
639 
// Returns true when a GenBank or EMBL descriptor supplies an extra
// (secondary) accession that passes the master-accession test below;
// returns false otherwise.
// NOTE(review): the loop header that defines `sd` and the lines that compute
// `info` (and the first half of the condition) are missing from this
// listing; presumably `sd` iterates the descriptors on `seq` and `info`
// comes from identifying the accession string -- confirm against the full
// source.
640 static bool x_IsWgsSecondary (const CBioseq& seq)
641 
642 {
    // Points at the extra-accession list of the current descriptor, if any.
644  const list< string > *extra_acc = 0;
645  const CSeqdesc& desc = **sd;
    // Only GenBank and EMBL blocks carry an extra-accession list here.
646  switch (desc.Which()) {
647  case CSeqdesc::e_Genbank:
648  if (desc.GetGenbank().IsSetExtra_accessions()) {
649  extra_acc = &(desc.GetGenbank().GetExtra_accessions());
650  }
651  break;
652  case CSeqdesc::e_Embl:
653  if (desc.GetEmbl().IsSetExtra_acc()) {
654  extra_acc = &(desc.GetEmbl().GetExtra_acc());
655  }
656  break;
657  default:
658  break;
659  }
660  if ( extra_acc ) {
661  FOR_EACH_STRING_IN_LIST (acc, *extra_acc) {
    // Visible part of the test: the fAcc_master flag must be set in `info`.
664  && (info & CSeq_id::fAcc_master) != 0) {
665  return true;
666  }
667  }
668  }
669  }
670  return false;
671 }
672 
673 // VR-728
674 // cannot have only seq-ids that will be stripped when loading to ID
676 {
677  bool found_good = false;
678  ITERATE(CBioseq::TId, id_it, seq.GetId()) {
679  if (!IsTemporary(**id_it)) {
680  found_good = true;
681  }
682  }
683  if (!found_good) {
685  "The only ids on this Bioseq will be stripped during ID load", seq);
686  }
687 }
688 
689 
691 (const CBioseq& seq)
692 {
693  // Ensure that CBioseq has at least one CSeq_id
694  if ( !seq.IsSetId() || seq.GetId().empty() ) {
696  "No ids on a Bioseq", seq);
697  return;
698  }
699 
700  CSeq_inst::ERepr repr = seq.GetInst().GetRepr();
701 
702  // Loop thru CSeq_ids for this CBioseq. Determine if seq has
703  // gi, NG, or NC. Check that the same CSeq_id not included more
704  // than once.
705  bool has_gi = false;
706  bool is_lrg = false;
707  bool has_ng = false;
708  bool wgs_tech_needs_wgs_accession = false;
709  bool is_segset_accession = false;
710  bool has_wgs_general = false;
711  bool is_eb_db = false;
712 
713  FOR_EACH_SEQID_ON_BIOSEQ (i, seq) {
714  // first, do standalone validation
715  ValidateSeqId (**i, seq);
716 
717  if ((*i)->IsGeneral() && (*i)->GetGeneral().IsSetDb()) {
718  if (NStr::EqualNocase((*i)->GetGeneral().GetDb(), "LRG")) {
719  is_lrg = true;
720  }
721  if (NStr::StartsWith((*i)->GetGeneral().GetDb(), "WGS:")) {
722  has_wgs_general = true;
723  }
724  } else if ((*i)->IsOther() && (*i)->GetOther().IsSetAccession()) {
725  const string& acc = (*i)->GetOther().GetAccession();
726  if (NStr::StartsWith(acc, "NG_")) {
727  has_ng = true;
728  wgs_tech_needs_wgs_accession = true;
729  } else if (NStr::StartsWith(acc, "NM_")
730  || NStr::StartsWith(acc, "NP_")
731  || NStr::StartsWith(acc, "NR_")) {
732  wgs_tech_needs_wgs_accession = true;
733  }
734  } else if ((*i)->IsEmbl() && (*i)->GetEmbl().IsSetAccession()) {
735  is_eb_db = true;
736  } else if ((*i)->IsDdbj() && (*i)->GetDdbj().IsSetAccession()) {
737  is_eb_db = true;
738  }
739 
740  // Check that no two CSeq_ids for same CBioseq are same type
741  CBioseq::TId::const_iterator j;
742  for (j = i, ++j; j != seq.GetId().end(); ++j) {
743  if ((**i).Compare(**j) != CSeq_id::e_DIFF) {
744  CNcbiOstrstream os;
745  os << "Conflicting ids on a Bioseq: (";
746  (**i).WriteAsFasta(os);
747  os << " - ";
748  (**j).WriteAsFasta(os);
749  os << ")";
751  CNcbiOstrstreamToString (os) /* os.str() */, seq);
752  }
753  }
754 
755  if ( (*i)->IsGenbank() || (*i)->IsEmbl() || (*i)->IsDdbj() ) {
756  wgs_tech_needs_wgs_accession = true;
757  }
758 
759  if ( (*i)->IsGi() ) {
760  has_gi = true;
761  }
762 
763  if ( (*i)->IdentifyAccession() == CSeq_id::eAcc_segset) {
764  is_segset_accession = true;
765  }
766 
767  }
768  if (is_lrg && !has_ng) {
770  "LRG sequence needs NG_ accession", seq);
771  }
772 
773 
774  // Loop thru CSeq_ids to check formatting
775  bool is_wgs = false;
776  unsigned int gi_count = 0;
777  unsigned int accn_count = 0;
778  unsigned int lcl_count = 0;
779  FOR_EACH_SEQID_ON_BIOSEQ (k, seq) {
780  const CTextseq_id* tsid = (*k)->GetTextseq_Id();
781  switch ((**k).Which()) {
782  case CSeq_id::e_Local:
783  lcl_count++;
784  break;
785  case CSeq_id::e_Tpg:
786  case CSeq_id::e_Tpe:
787  case CSeq_id::e_Tpd:
788  case CSeq_id::e_Genbank:
789  case CSeq_id::e_Embl:
790  case CSeq_id::e_Ddbj:
791  if ( tsid && tsid->IsSetAccession() ) {
792  const string& acc = tsid->GetAccession();
793 
794  if ((*k)->IsGenbank() || (*k)->IsEmbl() || (*k)->IsDdbj()) {
795  is_wgs |= acc.length() == 12 || acc.length() == 13 || acc.length() == 14 || acc.length() == 15;
796  }
797 
798  if ( has_gi ) {
799  if (tsid->IsSetVersion() && tsid->GetVersion() == 0) {
801  "Accession " + acc + " has 0 version", seq);
802  }
803  }
804  }
805  // Fall thru
806  case CSeq_id::e_Other:
807  if ( tsid ) {
808 
809  if ( has_gi && !tsid->IsSetAccession() && tsid->IsSetName() ) {
810  if ( (*k)->IsDdbj() && repr == CSeq_inst::eRepr_seg ) {
811  // Don't report ddbj segmented sequence missing accessions
812  } else {
814  "Missing accession for " + tsid->GetName(), seq);
815  }
816  }
817  accn_count++;
818  }
819  break;
820  // Fall thru
821  case CSeq_id::e_Pir:
823  case CSeq_id::e_Prf:
824  if ( tsid) {
825  if ((!tsid->IsSetAccession() || NStr::IsBlank(tsid->GetAccession())) &&
826  (!tsid->IsSetName() || NStr::IsBlank(tsid->GetName())) &&
827  seq.GetInst().IsAa()) {
828  string label = (*k)->AsFastaString();
830  "Missing identifier for " + label, seq);
831  }
832  accn_count++;
833  }
834  break;
835 
836  case CSeq_id::e_Gi:
837  gi_count++;
838  break;
839  default:
840  break;
841  }
842  }
843 
845  if (!SeqIsPatent(seq)) {
846  if ( is_wgs ) {
847  if ( !mi || !mi->IsSetTech() ||
848  ( mi->GetTech() != CMolInfo::eTech_wgs &&
849  mi->GetTech() != CMolInfo::eTech_tsa &&
850  mi->GetTech() != CMolInfo::eTech_targeted) ) {
852  "WGS accession should have Mol-info.tech of wgs", seq);
853  }
854  } else if ( mi && mi->IsSetTech() &&
855  mi->GetTech() == CMolInfo::eTech_wgs &&
856  wgs_tech_needs_wgs_accession &&
857  !is_segset_accession &&
858  !has_wgs_general &&
859  !x_IsWgsSecondary(seq)) {
860  EDiagSev sev = eDiag_Error;
861  if (is_eb_db) {
862  sev = eDiag_Warning;
863  }
864  if (! is_eb_db) {
866  "Mol-info.tech of wgs should have WGS accession", seq);
867  }
868  }
869 
870  if ((IsNTNCNWACAccession(seq) || IsNG(seq)) && mi && seq.IsNa()
871  && (!mi->IsSetBiomol()
873  && mi->GetBiomol() != CMolInfo::eBiomol_cRNA))) {
875  "genomic RefSeq accession should use genomic or cRNA biomol type",
876  seq);
877  }
878  }
879  if (seq.GetInst().GetMol() == CSeq_inst::eMol_dna) {
880  if (mi && mi->IsSetBiomol()) {
881  switch (mi->GetBiomol()) {
894  "Molecule type (DNA) does not match biomol (RNA)", seq);
895  break;
896  default:
897  break;
898  }
899  }
900  }
901 
902  // Check that a sequence with a gi number has exactly one accession
903  if ( gi_count > 0 && accn_count == 0 && !m_Imp.IsPDB() &&
904  repr != CSeq_inst::eRepr_virtual ) {
906  "No accession on sequence with gi number", seq);
907  }
908  if (gi_count > 0 && accn_count > 1) {
910  "Multiple accessions on sequence with gi number", seq);
911  }
912 
913  x_CheckGeneralIDs(seq);
914 
915  if ( m_Imp.IsValidateIdSet() ) {
916  ValidateIDSetAgainstDb(seq);
917  }
918 
919  // C toolkit ensures that there is exactly one CBioseq for a CSeq_id
920  // Not done here because object manager will not allow
921  // the same Seq-id on multiple Bioseqs
922 
923 }
924 
925 
927 {
928  bool rval = false;
929  const CSeq_inst& inst = seq.GetInst();
930  if (inst.IsSetHist() && inst.GetHist().IsSetAssembly()) {
931  return false;
932  }
933  CSeq_inst::TRepr repr = inst.CanGetRepr() ?
935 
936  if ( seq.IsNa() && repr != CSeq_inst::eRepr_seg ) {
937  rval = true;
938  // look for keyword
940  CSeqdesc_CI genbank_i(bsh, CSeqdesc::e_Genbank);
941  if (genbank_i && genbank_i->GetGenbank().IsSetKeywords()) {
942  CGB_block::TKeywords::const_iterator keyword = genbank_i->GetGenbank().GetKeywords().begin();
943  while (keyword != genbank_i->GetGenbank().GetKeywords().end() && rval) {
944  if (NStr::EqualNocase(*keyword, "TPA:reassembly")) {
945  rval = false;
946  }
947  ++keyword;
948  }
949  }
950  if (rval) {
951  CSeqdesc_CI embl_i(bsh, CSeqdesc::e_Embl);
952  if (embl_i && embl_i->GetEmbl().IsSetKeywords()) {
953  CEMBL_block::TKeywords::const_iterator keyword = embl_i->GetEmbl().GetKeywords().begin();
954  while (keyword != embl_i->GetEmbl().GetKeywords().end() && rval) {
955  if (NStr::EqualNocase(*keyword, "TPA:reassembly")) {
956  rval = false;
957  }
958  ++keyword;
959  }
960  }
961  }
962  }
963  return rval;
964 }
965 
966 
968 (const string &primary_acc,
969  const CBioseq &seq,
970  int choice)
971 {
972  CSeqdesc_CI sd(m_Scope->GetBioseqHandle(seq), static_cast<CSeqdesc::E_Choice>(choice));
973  for (; sd; ++sd) {
974  const list< string > *extra_acc = 0;
975  if ( choice == CSeqdesc::e_Genbank &&
977  extra_acc = &(sd->GetGenbank().GetExtra_accessions());
978  } else if ( choice == CSeqdesc::e_Embl &&
979  sd->GetEmbl().IsSetExtra_acc() ) {
980  extra_acc = &(sd->GetEmbl().GetExtra_acc());
981  }
982 
983  if ( extra_acc ) {
984  FOR_EACH_STRING_IN_LIST (acc, *extra_acc) {
985  if ( NStr::CompareNocase(primary_acc, *acc) == 0 ) {
986  // If the same post error
987  PostErr(eDiag_Error,
989  primary_acc + " used for both primary and"
990  " secondary accession", seq);
991  }
992  }
993  }
994  }
995 }
996 
997 
999 {
1000  for (CSeqdesc_CI it(bsh, CSeqdesc::e_User); it; ++it) {
1001  if (it->GetUser().GetObjectType() == CUser_object::eObjectType_Unverified) {
1002  return true;
1003  }
1004  }
1005  return false;
1006 }
1007 
1008 
1010 {
1011  CBioseq_Handle bsh = m_Scope->GetBioseqHandle (seq);
1014 
1015  bool has_barcode_tech = false;
1016 
1018  if (di && di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetTech() == CMolInfo::eTech_barcode) {
1019  has_barcode_tech = true;
1020  }
1021 
1022  bool has_barcode_keyword = false;
1023  for (CSeqdesc_CI it(bsh, CSeqdesc::e_Genbank); it; ++it) {
1024  FOR_EACH_KEYWORD_ON_GENBANKBLOCK (k, it->GetGenbank()) {
1025  if (NStr::EqualNocase (*k, "BARCODE")) {
1026  has_barcode_keyword = true;
1027  break;
1028  }
1029  }
1030  if (has_barcode_keyword && !has_barcode_tech) {
1032  "BARCODE keyword without Molinfo.tech barcode",
1033  *ctx, *it);
1034  }
1035  }
1036  if (has_barcode_tech && !has_barcode_keyword && di) {
1038  "Molinfo.tech barcode without BARCODE keyword",
1039  *ctx, *di);
1040  }
1041  if (has_barcode_keyword && HasUnverified(bsh)) {
1043  "Sequence has both BARCODE and UNVERIFIED keywords",
1044  seq);
1045  }
1046 }
1047 
1048 
1050  const CBioseq& seq)
1051 {
1052  const CSeq_inst& inst = seq.GetInst();
1053 
1054 
1055  // Check representation
1056  if ( !ValidateRepr(inst, seq) ) {
1057  return;
1058  }
1059 
1060  // Check molecule, topology, and strand
1061  if (!inst.IsSetMol()) {
1062  PostErr(eDiag_Error, eErr_SEQ_INST_MolNotSet, "Bioseq.mol is 0",
1063  seq);
1064  } else {
1065  const CSeq_inst::EMol& mol = inst.GetMol();
1066  switch (mol) {
1067 
1068  case CSeq_inst::eMol_na:
1070  "Bioseq.mol is type na", seq);
1071  break;
1072 
1073  case CSeq_inst::eMol_aa:
1074  if ( inst.IsSetTopology() &&
1078  "Non-linear topology set on protein", seq);
1079  }
1080  if ( inst.IsSetStrand() &&
1081  inst.GetStrand() != CSeq_inst::eStrand_ss &&
1084  "Protein not single stranded", seq);
1085  }
1086  break;
1087 
1089  PostErr(eDiag_Error, eErr_SEQ_INST_MolNotSet, "Bioseq.mol is 0",
1090  seq);
1091  break;
1092 
1093  case CSeq_inst::eMol_other:
1095  "Bioseq.mol is type other", seq);
1096  break;
1097 
1098  default:
1099  break;
1100  }
1101  }
1102 
1103  CSeq_inst::ERepr rp = seq.GetInst().GetRepr();
1104 
1105  if (rp == CSeq_inst::eRepr_raw || rp == CSeq_inst::eRepr_const) {
1106  // Validate raw and constructed sequences
1107  ValidateRawConst(seq);
1108  }
1109 
1110  if (rp == CSeq_inst::eRepr_seg || rp == CSeq_inst::eRepr_ref) {
1111  // Validate segmented and reference sequences
1112  ValidateSegRef(seq);
1113  }
1114 
1115  if (rp == CSeq_inst::eRepr_delta) {
1116  // Validate delta sequences
1117  ValidateDelta(seq);
1118  }
1119 
1120  if (rp == CSeq_inst::eRepr_seg && seq.GetInst().IsSetExt() &&
1121  seq.GetInst().GetExt().IsSeg()) {
1122  // Validate part of segmented sequence
1123  ValidateSeqParts(seq);
1124  }
1125 
1126  if (rp == CSeq_inst::eRepr_raw || rp == CSeq_inst::eRepr_delta) {
1127  x_ValidateBarcode (seq);
1128  }
1129 
1130  x_ValidateTitle(seq);
1131  /*if ( seq.IsAa() ) {
1132  Validate protein title (amino acids only)
1133  ValidateProteinTitle(seq);
1134  }*/
1135 
1136  if ( seq.IsNa() ) {
1137  // check for N bases at start or stop of sequence,
1138  // or sequence entirely made of Ns
1139  ValidateNsAndGaps(seq);
1140 
1141  }
1142 
1143  // Validate sequence length
1144  ValidateSeqLen(seq);
1145 
1146  // proteins should not have gaps
1147  if (seq.IsAa() && x_HasGap(seq)) {
1148  PostErr(eDiag_Error, eErr_SEQ_INST_ProteinShouldNotHaveGaps, "Protein sequences should not have gaps", seq);
1149  }
1150 }
1151 
1152 
1154 
1155 {
1156  bool is_wgs = false;
1157  bool is_grc = false;
1158 
1160  CSeqdesc_CI user(bsh, CSeqdesc::e_User);
1161  while (user) {
1163  user->GetUser().HasField("BioProject", ".", NStr::eNocase)) {
1164  // bioproject field found
1165  return false;
1166  }
1167  ++user;
1168  }
1169 
1170  CSeqdesc_CI title(bsh, CSeqdesc::e_Title);
1171  while (title) {
1172  if (NStr::StartsWith(title->GetTitle(), "GRC")) {
1173  is_grc = true;
1174  break;
1175  }
1176  ++title;
1177  }
1178 
1179  is_wgs = IsWGS(bsh);
1180 
1181  bool is_gb = false, /* is_eb_db = false, */ is_refseq = false, is_ng = false;
1182 
1183  FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, seq) {
1184  const CSeq_id& sid = **sid_itr;
1185  switch (sid.Which()) {
1186  case CSeq_id::e_Genbank:
1187  case CSeq_id::e_Embl:
1188  // is_eb_db = true;
1189  // fall through
1190  case CSeq_id::e_Ddbj:
1191  is_gb = true;
1192  break;
1193  case CSeq_id::e_Other:
1194  {
1195  is_refseq = true;
1196  if (sid.GetOther().IsSetAccession()) {
1197  string acc = sid.GetOther().GetAccession().substr(0, 3);
1198  if (acc == "NG_") {
1199  is_ng = true;
1200  }
1201  }
1202  }
1203  break;
1204  default:
1205  break;
1206  }
1207  }
1208 
1209  if (is_refseq || m_Imp.IsRefSeqConventions()) {
1210  if (is_ng) return false;
1211  } else if (is_gb) {
1212  if (! is_wgs && ! is_grc) return false;
1213  } else {
1214  return false;
1215  }
1216 
1217  const CSeq_inst & inst = seq.GetInst();
1218  CSeq_inst::TRepr repr = inst.GetRepr();
1219 
1220  if (repr == CSeq_inst::eRepr_delta) {
1221  if (x_IsDeltaLitOnly(inst)) return false;
1222  } else if (repr != CSeq_inst::eRepr_map) {
1223  return false;
1224  }
1225 
1226  return true;
1227 }
1228 
1230  const CBioseq& seq)
1231 {
1233 
1234  // Check that proteins in nuc_prot set have a CdRegion
1235  if ( CdError(bsh) ) {
1236  EDiagSev sev = eDiag_Error;
1238  if (bssh) {
1239  CBioseq_Handle nbsh = GetNucBioseq (bssh);
1240  if (nbsh) {
1241  CSeqdesc_CI desc( nbsh, CSeqdesc::e_Molinfo );
1242  const CMolInfo* mi = desc ? &(desc->GetMolinfo()) : 0;
1243  if (mi) {
1244  CMolInfo::TTech tech = mi->IsSetTech() ?
1246  if (tech == CMolInfo::eTech_wgs) {
1247  sev = eDiag_Critical;
1248  }
1249  }
1250  }
1251  }
1253  "No CdRegion in nuc-prot set points to this protein",
1254  seq);
1255  }
1256 
1257  bool is_patent = SeqIsPatent (seq);
1258 
1259  try {
1260  // if there are no Seq-ids, the following tests can't be run
1261  if (seq.IsSetId()) {
1262 
1263  // Check that gene on non-segmented sequence does not have
1264  // multiple intervals
1266 
1268 
1269  // Check for duplicate features and overlapping peptide features.
1271 
1272  // Check for introns within introns.
1273  ValidateTwintrons(seq);
1274 
1275  // check for equivalent source features
1277 
1278  // check for equivalen pub features
1279  x_ValidatePubFeatures (bsh);
1280 
1281  // Check for colliding genes
1283 
1284  // Detect absence of BioProject DBLink for complete bacterial genomes
1286  }
1287 
1288  m_dblink_count = 0;
1289  m_taa_count = 0;
1290  m_bs_count = 0;
1291  m_as_count = 0;
1292  m_pdb_count = 0;
1293  m_sra_count = 0;
1294  m_bp_count = 0;
1295  m_unknown_count = 0;
1296 
1297  // Validate descriptors that affect this bioseq
1299 
1300 
1301  if (m_dblink_count > 1) {
1303  NStr::IntToString(m_dblink_count) + " DBLink user objects apply to a Bioseq", seq);
1304  }
1305 
1306  if (m_taa_count > 1) {
1308  "Trace Assembly Archive entries appear in " + NStr::IntToString(m_taa_count) + " DBLink user objects", seq);
1309  }
1310 
1311  if (m_bs_count > 1) {
1313  "BioSample entries appear in " + NStr::IntToString(m_bs_count) + " DBLink user objects", seq);
1314  }
1315 
1316  if (m_as_count > 1) {
1318  "Assembly entries appear in " + NStr::IntToString(m_as_count) + " DBLink user objects", seq);
1319  }
1320 
1321  if (m_pdb_count > 1) {
1323  "ProbeDB entries appear in " + NStr::IntToString(m_pdb_count) + " DBLink user objects", seq);
1324  }
1325 
1326  if (m_sra_count > 1) {
1328  "Sequence Read Archive entries appear in " + NStr::IntToString(m_sra_count) + " DBLink user objects", seq);
1329  }
1330 
1331  if (m_bp_count > 1) {
1333  "BioProject entries appear in " + NStr::IntToString(m_bp_count) + " DBLink user objects", seq);
1334  }
1335 
1336  if (m_unknown_count > 1) {
1338  "Unrecognized entries appear in " + NStr::IntToString(m_unknown_count) + " DBLink user objects", seq);
1339  } else if (m_unknown_count > 0) {
1341  "Unrecognized entries appear in " + NStr::IntToString(m_unknown_count) + " DBLink user object", seq);
1342  }
1343 
1344  // make sure that there is a pub on this bioseq
1345  if ( !m_Imp.IsNoPubs() ) {
1346  CheckForPubOnBioseq(seq);
1347  }
1348  // make sure that there is a source on this bioseq
1349  if ( !m_Imp.IsNoBioSource() ) {
1350  CheckSoureDescriptor(bsh);
1351  //CheckForBiosourceOnBioseq(seq);
1352  }
1353 
1354  if (x_ShowBioProjectWarning (seq)) {
1356  "BioProject entries not present on CON record", seq);
1357  }
1358 
1359  } catch ( const exception& e ) {
1360  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1362  string("Exception while validating BioseqContext. EXCEPTION: ") +
1363  e.what(), seq);
1364  }
1365  }
1366 
1367  if (!is_patent) {
1368  // flag missing molinfo even if not in Sequin
1370  }
1371 
1373 
1374  CheckTpaHistory(seq);
1375 
1376  // check for multiple publications with identical identifiers
1378 
1379  // look for orphaned proteins
1380  if (seq.IsAa() && !GetNucProtSetParent(bsh) && !x_AllowOrphanedProtein(seq)) {
1382  "Orphaned stand-alone protein", seq);
1383  }
1384 
1385  // look for extra protein features
1386  if (seq.IsAa()) {
1387  CCacheImpl::SFeatKey prot_key(
1389  const CCacheImpl::TFeatValue & prot_feats =
1390  GetCache().GetFeatFromCache(prot_key);
1391 
1392  if (prot_feats.size() > 1) {
1393  ITERATE(CCacheImpl::TFeatValue, feat, prot_feats) {
1395  "Protein sequence has multiple unprocessed protein features",
1396  feat->GetOriginalFeature());
1397  }
1398  }
1399  }
1400 
1401  if (!m_Imp.IsNoCitSubPubs() && !x_HasCitSub(bsh)) {
1403  "Expected submission citation is missing for this Bioseq", seq);
1404  }
1405 
1406 }
1407 
1408 
1410 {
1411  ITERATE(CPub_equiv::Tdata, it, pub.Get()) {
1412  if (x_HasCitSub(**it)) {
1413  return true;
1414  }
1415  }
1416  return false;
1417 }
1418 
1419 
1421 {
1422  if (pub.IsSub()) {
1423  return true;
1424  } else if (pub.IsEquiv() && x_HasCitSub(pub.GetEquiv())) {
1425  return true;
1426  } else {
1427  return false;
1428  }
1429 }
1430 
1431 
1433 {
1434  bool has_cit_sub = false;
1435  CSeqdesc_CI p(bsh, CSeqdesc::e_Pub);
1436  while (p && !has_cit_sub) {
1437  if (p->GetPub().IsSetPub()) {
1438  has_cit_sub = x_HasCitSub(p->GetPub().GetPub());
1439  }
1440  ++p;
1441  }
1442 
1443  return has_cit_sub;
1444 }
1445 
1446 
1448 {
1449  bool is_genbank = false;
1450  bool is_embl = false;
1451  bool is_ddbj = false;
1452  bool is_refseq = m_Imp.IsRefSeqConventions();
1453  bool is_wp = false;
1454  bool is_yp = false;
1455  bool is_gibbmt = false;
1456  bool is_gibbsq = false;
1457  bool is_patent = false;
1458  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
1459  const CSeq_id& sid = **id_it;
1460  switch (sid.Which()) {
1461  case CSeq_id::e_Genbank:
1462  is_genbank = true;
1463  break;
1464  case CSeq_id::e_Embl:
1465  is_embl = true;
1466  break;
1467  case CSeq_id::e_Ddbj:
1468  is_ddbj = true;
1469  break;
1470  case CSeq_id::e_Other:
1471  {
1472  is_refseq = true;
1473  const CTextseq_id* tsid = sid.GetTextseq_Id();
1474  if (tsid != NULL && tsid->IsSetAccession()) {
1475  const string& acc = tsid->GetAccession();
1476  if (NStr::StartsWith(acc, "WP_")) {
1477  is_wp = true;
1478  } else if (NStr::StartsWith(acc, "YP_")) {
1479  is_yp = true;
1480  }
1481  }
1482  }
1483  break;
1484  case CSeq_id::e_Gibbmt:
1485  is_gibbmt = true;
1486  break;
1487  case CSeq_id::e_Gibbsq:
1488  is_gibbsq = true;
1489  break;
1490  case CSeq_id::e_Patent:
1491  is_patent = true;
1492  break;
1493  default:
1494  break;
1495  }
1496  }
1497  if ((is_genbank || is_embl || is_ddbj || is_refseq)
1498  && !is_gibbmt && !is_gibbsq && !is_patent && !is_wp && !is_yp) {
1499  return false;
1500  } else {
1501  return true;
1502  }
1503 }
1504 
1505 
// Walks two iterator ranges in lockstep and reports whether they match.
//
// Returns false as soon as pred rejects a pair of elements at the same
// position, or when one range is exhausted before the other. Returns true
// only when both ranges end together (two empty ranges therefore match).
template <class Iterator, class Predicate>
bool lists_match(Iterator iter1, Iterator iter1_stop, Iterator iter2, Iterator iter2_stop, Predicate pred)
{
    for ( ; iter1 != iter1_stop && iter2 != iter2_stop; ++iter1, ++iter2) {
        if (!pred(*iter1, *iter2)) {
            return false;
        }
    }
    // A leftover element on either side means the ranges differ in length.
    return !(iter1 != iter1_stop || iter2 != iter2_stop);
}
1522 
1523 
1524 static bool s_OrgModEqual (
1525  const CRef<COrgMod>& om1,
1526  const CRef<COrgMod>& om2
1527 )
1528 
1529 {
1530  const COrgMod& omd1 = *(om1);
1531  const COrgMod& omd2 = *(om2);
1532 
1533  const string& str1 = omd1.GetSubname();
1534  const string& str2 = omd2.GetSubname();
1535 
1536  if (NStr::CompareNocase (str1, str2) != 0) return false;
1537 
1538  TORGMOD_SUBTYPE chs1 = omd1.GetSubtype();
1539  TORGMOD_SUBTYPE chs2 = omd2.GetSubtype();
1540 
1541  if (chs1 == chs2) return true;
1542  if (chs2 == NCBI_ORGMOD(other)) return true;
1543 
1544  return false;
1545 }
1546 
1547 
1548 bool s_DbtagEqual (const CRef<CDbtag>& dbt1, const CRef<CDbtag>& dbt2)
1549 {
1550  // is dbt1 == dbt2
1551  return dbt1->Compare(*dbt2) == 0;
1552 }
1553 
1554 
1555 // Two OrgRefs are identical if the taxnames are identical, the dbxrefs are identical,
1556 // and the orgname orgmod lists are identical
1557 static bool s_OrgrefEquivalent (const COrg_ref& org1, const COrg_ref& org2)
1558 {
1559  if ((org1.IsSetTaxname() && !org2.IsSetTaxname())
1560  || (!org1.IsSetTaxname() && org2.IsSetTaxname())
1561  || (org1.IsSetTaxname() && org2.IsSetTaxname()
1562  && !NStr::EqualNocase (org1.GetTaxname(), org2.GetTaxname()))) {
1563  return false;
1564  }
1565 
1566  if ((org1.IsSetDb() && !org2.IsSetDb())
1567  || (!org1.IsSetDb() && org2.IsSetDb())
1568  || (org1.IsSetDb() && org2.IsSetDb()
1569  && !lists_match (org1.GetDb().begin(), org1.GetDb().end(),
1570  org2.GetDb().begin(), org2.GetDb().end(),
1571  s_DbtagEqual))) {
1572  return false;
1573  }
1574 
1575  if ((org1.IsSetOrgname() && !org2.IsSetOrgname())
1576  || (!org1.IsSetOrgname() && org2.IsSetOrgname())) {
1577  return false;
1578  }
1579  if (org1.IsSetOrgname() && org2.IsSetOrgname()) {
1580  const COrgName& on1 = org1.GetOrgname();
1581  const COrgName& on2 = org2.GetOrgname();
1582  if ((on1.IsSetMod() && !on2.IsSetMod())
1583  || (!on1.IsSetMod() && on2.IsSetMod())
1584  || (on1.IsSetMod() && on2.IsSetMod()
1585  && !lists_match (on1.GetMod().begin(), on1.GetMod().end(),
1586  on2.GetMod().begin(), on2.GetMod().end(),
1587  s_OrgModEqual))) {
1588  return false;
1589  }
1590  }
1591 
1592  return true;
1593 }
1594 
1595 
1596 // Two SubSources are equal and duplicates if:
1597 // they have the same subtype
1598 // and the same name (or don't require a name).
1599 
1601  const CRef<CSubSource>& st1,
1602  const CRef<CSubSource>& st2
1603 )
1604 
1605 {
1606  const CSubSource& sbs1 = *(st1);
1607  const CSubSource& sbs2 = *(st2);
1608 
1609  TSUBSOURCE_SUBTYPE chs1 = sbs1.GetSubtype();
1610  TSUBSOURCE_SUBTYPE chs2 = sbs2.GetSubtype();
1611 
1612  if (chs1 != chs2) return false;
1613  if (CSubSource::NeedsNoText(chs2)) return true;
1614 
1615  if (sbs1.IsSetName() && sbs2.IsSetName()) {
1616  if (NStr::CompareNocase (sbs1.GetName(), sbs2.GetName()) == 0) return true;
1617  }
1618  if (! sbs1.IsSetName() && ! sbs2.IsSetName()) return true;
1619 
1620  return false;
1621 }
1622 
1623 
1624 static bool s_BiosrcFullLengthIsOk (const CBioSource& src)
1625 {
1626  if (src.IsSetIs_focus()) {
1627  return true;
1628  }
1630  if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == CSubSource::eSubtype_transgenic) {
1631  return true;
1632  }
1633  }
1634  return false;
1635 }
1636 
1637 
1639 {
1640  if (!src.IsSetOrg() || !src.GetOrg().IsSetTaxname()) {
1641  //printf ("taxname not set!\n");
1642  return false;
1643  }
1644  if (NStr::EqualNocase(src.GetOrg().GetTaxname(), "unidentified phage")) {
1645  //printf ("is unidentified phage!\n");
1646  return true;
1647  }
1648  if (src.GetOrg().IsSetOrgname() && src.GetOrg().GetOrgname().IsSetLineage()
1649  && NStr::StartsWith(src.GetOrg().GetOrgname().GetLineage(), "Viruses", NStr::eNocase)) {
1650  //printf ("Starts with viruses!\n");
1651  return true;
1652  }
1653 #if 0
1654  if (!src.GetOrg().IsSetOrgname()) {
1655  printf ("Orgname not set!\n");
1656  } else if (!src.GetOrg().GetOrgname().IsSetLineage()) {
1657  printf ("Lineage not set!\n");
1658  } else {
1659  printf ("Lineage is %s!\n", src.GetOrg().GetOrgname().GetLineage().c_str());
1660  }
1661 #endif
1662  return false;
1663 }
1664 
1665 
1667  const CBioseq_Handle& bsh)
1668 {
1669  // don't bother if can't build all feature iterator
1670  if (!m_AllFeatIt) {
1671  return;
1672  }
1673  try {
1674  CCacheImpl::SFeatKey biosrc_key(
1676  const CCacheImpl::TFeatValue & biosrcs = GetCache().GetFeatFromCache(biosrc_key);
1677  CCacheImpl::TFeatValue::const_iterator feat = biosrcs.begin();
1678  if (feat != biosrcs.end()) {
1679  if (IsLocFullLength(feat->GetLocation(), bsh)
1680  && !s_BiosrcFullLengthIsOk(feat->GetData().GetBiosrc())) {
1682  "Source feature is full length, should be descriptor",
1683  feat->GetOriginalFeature());
1684  }
1685 
1686  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1687  ++feat;
1688  for ( ; feat != biosrcs.end(); ++feat_prev, ++feat) {
1689  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1691  "Multiple full-length source features, should only be one if descriptor is transgenic",
1692  feat->GetOriginalFeature());
1693  }
1694 
1695  // compare to see if feature sources are identical
1696  bool are_identical = true;
1697  if (feat_prev->IsSetComment() && feat->IsSetComment()
1698  && !NStr::EqualNocase (feat_prev->GetComment(), feat->GetComment())) {
1699  are_identical = false;
1700  } else {
1701  const CBioSource& src_prev = feat_prev->GetData().GetBiosrc();
1702  const CBioSource& src = feat->GetData().GetBiosrc();
1703  if ((src.IsSetIs_focus() && !src_prev.IsSetIs_focus())
1704  || (!src.IsSetIs_focus() && src_prev.IsSetIs_focus())) {
1705  are_identical = false;
1706  } else if ((src.IsSetSubtype() && !src_prev.IsSetSubtype())
1707  || (!src.IsSetSubtype() && src_prev.IsSetSubtype())
1708  || (src.IsSetSubtype() && src_prev.IsSetSubtype()
1709  && !lists_match (src.GetSubtype().begin(), src.GetSubtype().end(),
1710  src_prev.GetSubtype().begin(), src_prev.GetSubtype().end(),
1712  are_identical = false;
1713  } else if ((src.IsSetOrg() && !src_prev.IsSetOrg())
1714  || (!src.IsSetOrg() && src_prev.IsSetOrg())
1715  || (src.IsSetOrg() && src_prev.IsSetOrg()
1716  && !s_OrgrefEquivalent (src.GetOrg(), src_prev.GetOrg()))) {
1717  are_identical = false;
1718  }
1719  }
1720  if (are_identical && !s_SuppressMultipleEquivBioSources(feat->GetData().GetBiosrc())) {
1722  "Multiple equivalent source features should be combined into one multi-interval feature",
1723  feat->GetOriginalFeature());
1724  }
1725  }
1726  }
1727  } catch ( const exception& e ) {
1728  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1730  string("Exception while validating source features. EXCEPTION: ") +
1731  e.what(), *(bsh.GetCompleteBioseq()));
1732  }
1733  }
1734 
1735 }
1736 
1737 
1738 static void s_MakePubLabelString (const CPubdesc& pd, string& label)
1739 
1740 {
1741  label = "";
1742 
1743  FOR_EACH_PUB_ON_PUBDESC (it, pd) {
1744  if ((*it)->IsGen() && (*it)->GetGen().IsSetCit()
1745  && !(*it)->GetGen().IsSetCit()
1746  && !(*it)->GetGen().IsSetJournal()
1747  && !(*it)->GetGen().IsSetDate()
1748  && (*it)->GetGen().IsSetSerial_number()) {
1749  // skip over just serial number
1750  } else {
1751  (*it)->GetLabel (&label, CPub::eContent, true);
1752  break;
1753  }
1754  }
1755 }
1756 
1757 
1759  const CBioseq_Handle& bsh)
1760 {
1761  // don't bother if can't build feature iterator at all
1762  if (!m_AllFeatIt) {
1763  return;
1764  }
1765  try {
1766  CCacheImpl::SFeatKey pub_key(
1768  const CCacheImpl::TFeatValue & pubs =
1769  GetCache().GetFeatFromCache(pub_key);
1770  CCacheImpl::TFeatValue::const_iterator feat = pubs.begin();
1771  if (feat != pubs.end()) {
1772  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1774  "Publication feature is full length, should be descriptor",
1775  feat->GetOriginalFeature());
1776  }
1777 
1778  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1779  string prev_label;
1780  if( feat_prev != pubs.end()) {
1781  s_MakePubLabelString(feat_prev->GetData().GetPub(), prev_label);
1782  ++feat;
1783  }
1784  for ( ; feat != pubs.end(); ++feat, ++feat_prev) {
1785  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1787  "Publication feature is full length, should be descriptor",
1788  feat->GetOriginalFeature());
1789  }
1790  // compare to see if feature sources are identical
1791  bool are_identical = true;
1792  if (feat_prev->IsSetComment() && feat->IsSetComment()
1793  && !NStr::EqualNocase (feat_prev->GetComment(), feat->GetComment())) {
1794  are_identical = false;
1795  } else {
1796  string label;
1797  s_MakePubLabelString (feat->GetData().GetPub(), label);
1798  if (!NStr::IsBlank (label) && !NStr::IsBlank(prev_label)
1799  && !NStr::EqualNocase (label, prev_label)) {
1800  are_identical = false;
1801  }
1802 
1803  // swap is faster than assignment
1804  prev_label.swap(label);
1805 
1806  // TODO: also check authors
1807  }
1808 
1809  if (are_identical) {
1811  "Multiple equivalent publication features should be combined into one multi-interval feature",
1812  feat->GetOriginalFeature());
1813  }
1814  }
1815  }
1816  } catch ( const exception& e ) {
1817  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1819  string("Exception while validating pub features. EXCEPTION: ") +
1820  e.what(), *(bsh.GetCompleteBioseq()));
1821  }
1822  }
1823 
1824 }
1825 
1826 
1828 {
1829 public:
1830  // faster than lexicographical order
1831  bool operator()(const string& lhs, const string& rhs) const
1832  {
1833  if( lhs.length() != rhs.length() ) {
1834  return (lhs.length() < rhs.length());
1835  }
1836  return NStr::CompareNocase (lhs, rhs) < 0;
1837  }
1838 };
1839 
1841 {
1842 public:
1843  bool operator()(const string& lhs, const string& rhs) const
1844  {
1845  return NStr::CompareNocase (lhs, rhs) < 0;
1846  }
1847 };
1848 
1849 
1851  const CBioseq& seq, const vector<CTempString>& labels)
1852 {
1853  if (labels.size() <= 1) {
1854  // optimize fast case
1855  return;
1856  }
1857 
1858  static const string kWarningPrefix =
1859  "Multiple equivalent publications annotated on this sequence [";
1860  static const string::size_type kMaxSummaryLen = 100;
1861 
1862  // TTempStringCount maps a CTempString to the number of times it appears
1863  // (Note case-insensitivity and non-lexicographical order)
1865  TLabelCount label_count;
1866 
1867  ITERATE(vector<CTempString>, label_it, labels) {
1868  ++label_count[*label_it];
1869  }
1870 
1871  // put the dups into a vector and sort
1872  vector<CTempString> sorted_dup_labels;
1873  ITERATE(TLabelCount, label_count_it, label_count) {
1874  int num_appearances = label_count_it->second;
1875  _ASSERT(num_appearances > 0);
1876  if( num_appearances > 1 ) {
1877  const CTempString & dup_label = label_count_it->first;
1878  sorted_dup_labels.push_back(dup_label);
1879  }
1880  }
1881  sort(BEGIN_COMMA_END(sorted_dup_labels), SCaseInsensitiveLess());
1882 
1883  // find all that appear multiple times
1884  string err_msg = kWarningPrefix; // avoid create and destroy on each iter'n
1885  ITERATE(vector<CTempString>, dup_label_it, sorted_dup_labels) {
1886  const CTempString & summary = *dup_label_it;
1887 
1888  err_msg.resize(kWarningPrefix.length());
1889  if (summary.length() > kMaxSummaryLen) {
1890  err_msg += summary.substr(0, kMaxSummaryLen);
1891  err_msg += "...";
1892  } else {
1893  err_msg += summary;
1894  }
1895  err_msg += "]";
1897  err_msg, seq);
1898  }
1899 }
1900 
1901 
1903  const CBioseq_Handle& bsh)
1904 {
1905  // used to check for dups. Currently only deals with cases where
1906  // there's an otherpub, but check if this comment is out of date.
1907  set<int> muids_seen;
1908  set<int> pmids_seen;
1909 
1910  vector<int> serials;
1911  vector<CTempString> published_labels;
1912  vector<CTempString> unpublished_labels;
1913 
1916 
1917  for (CSeqdesc_CI it(bsh, CSeqdesc::e_Pub); it; ++it) {
1918  CConstRef<CPubdesc> pub = ConstRef(&it->GetPub());
1919  // first, try to receive from cache
1920  const CCacheImpl::CPubdescInfo & pubdesc_info =
1921  GetCache().GetPubdescToInfo(pub);
1922  // note that some (e.g. pmids are ignored other than maybe storing
1923  // in the cache above)
1924  copy(BEGIN_COMMA_END(pubdesc_info.m_published_labels),
1925  back_inserter(published_labels));
1926  copy(BEGIN_COMMA_END(pubdesc_info.m_unpublished_labels),
1927  back_inserter(unpublished_labels));
1928 
1929  int muid = 0;
1930  int pmid = 0;
1931  bool otherpub = false;
1932  FOR_EACH_PUB_ON_PUBDESC (pub_it, *pub) {
1933  switch ( (*pub_it)->Which() ) {
1934  case CPub::e_Muid:
1935  muid = (*pub_it)->GetMuid();
1936  break;
1937  case CPub::e_Pmid:
1938  pmid = (*pub_it)->GetPmid();
1939  break;
1940  default:
1941  otherpub = true;
1942  break;
1943  }
1944  }
1945 
1946  if ( otherpub ) {
1947  bool collision = false;
1948  if ( muid > 0 ) {
1949  if ( muids_seen.find(muid) != muids_seen.end() ) {
1950  collision = true;
1951  } else {
1952  muids_seen.insert(muid);
1953  }
1954  }
1955  if ( pmid > 0 ) {
1956  if ( pmids_seen.find(pmid) != pmids_seen.end() ) {
1957  collision = true;
1958  } else {
1959  pmids_seen.insert(pmid);
1960  }
1961  }
1962  if ( collision ) {
1964  "Multiple publications with same identifier", *ctx, *it);
1965  }
1966  }
1967  }
1968 
1969  x_ReportDuplicatePubLabels (*(bsh.GetCompleteBioseq()), unpublished_labels);
1970  x_ReportDuplicatePubLabels (*(bsh.GetCompleteBioseq()), published_labels);
1971 
1972 }
1973 
1974 
1976 {
1977  if ( !seq.GetInst().IsSetHist() ) {
1978  return;
1979  }
1980 
1981  TGi gi = ZERO_GI;
1982  FOR_EACH_SEQID_ON_BIOSEQ (id, seq) {
1983  if ( (*id)->IsGi() ) {
1984  gi = (*id)->GetGi();
1985  break;
1986  }
1987  }
1988  if ( gi == ZERO_GI ) {
1989  return;
1990  }
1991 
1992  const CSeq_hist& hist = seq.GetInst().GetHist();
1993  if ( hist.IsSetReplaced_by() && hist.GetReplaced_by().IsSetDate() ) {
1994  const CSeq_hist_rec& rec = hist.GetReplaced_by();
1995  ITERATE( CSeq_hist_rec::TIds, id, rec.GetIds() ) {
1996  if ( (*id)->IsGi() ) {
1997  if ( gi == (*id)->GetGi() ) {
1999  "Replaced by gi (" +
2000  NStr::NumericToString(gi) + ") is same as current Bioseq",
2001  seq);
2002  break;
2003  }
2004  }
2005  }
2006  }
2007 
2008  if ( hist.IsSetReplaces() && hist.GetReplaces().IsSetDate() ) {
2009  const CSeq_hist_rec& rec = hist.GetReplaces();
2010  ITERATE( CSeq_hist_rec::TIds, id, rec.GetIds() ) {
2011  if ( (*id)->IsGi() ) {
2012  if ( gi == (*id)->GetGi() ) {
2014  "Replaces gi (" +
2015  NStr::NumericToString(gi) + ") is same as current Bioseq",
2016  seq);
2017  break;
2018  }
2019  }
2020  }
2021  }
2022 }
2023 
2024 
2025 // =============================================================================
2026 // Private
2027 // =============================================================================
2028 
2029 
2030 
2031 
2032 // Is the id contained in the bioseq?
2033 bool CValidError_bioseq::IsIdIn(const CSeq_id& id, const CBioseq& seq)
2034 {
2035  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2036  if (id.Match(**it)) {
2037  return true;
2038  }
2039  }
2040  return false;
2041 }
2042 
2043 
2045 {
2046  if (!inst.IsSetSeq_data()) {
2047  return 0;
2048  }
2049 
2050  const CSeq_data& seqdata = inst.GetSeq_data();
2051  switch (seqdata.Which()) {
2052  case CSeq_data::e_not_set:
2053  return 0;
2054  case CSeq_data::e_Iupacna:
2055  return seqdata.GetIupacna().Get().size();
2056  case CSeq_data::e_Iupacaa:
2057  return seqdata.GetIupacaa().Get().size();
2058  case CSeq_data::e_Ncbi2na:
2059  return seqdata.GetNcbi2na().Get().size();
2060  case CSeq_data::e_Ncbi4na:
2061  return seqdata.GetNcbi4na().Get().size();
2062  case CSeq_data::e_Ncbi8na:
2063  return seqdata.GetNcbi8na().Get().size();
2064  case CSeq_data::e_Ncbipna:
2065  return seqdata.GetNcbipna().Get().size();
2066  case CSeq_data::e_Ncbi8aa:
2067  return seqdata.GetNcbi8aa().Get().size();
2068  case CSeq_data::e_Ncbieaa:
2069  return seqdata.GetNcbieaa().Get().size();
2070  case CSeq_data::e_Ncbipaa:
2071  return seqdata.GetNcbipaa().Get().size();
2073  return seqdata.GetNcbistdaa().Get().size();
2074  default:
2075  return 0;
2076  }
2077 }
2078 
2079 
2080 // Returns true if seq derived from translation ending in "*" or
2081 // seq is 3' partial (i.e. the right of the sequence is incomplete)
2083 {
2084 
2085  // Look for the Cdregion feature used to create this aa product
2086  // Use the Cdregion to translate the associated na sequence
2087  // and check if translation has a '*' at the end. If it does.
2088  // message about 'X' at the end of this aa product sequence is suppressed
2089  try {
2090  const CSeq_feat* sfp = m_Imp.GetCDSGivenProduct(seq);
2091  if ( sfp ) {
2092 
2093  // Translate na CSeq_data
2094  string prot;
2095  CSeqTranslator::Translate(*sfp, *m_Scope, prot);
2096 
2097  if ( prot[prot.size() - 1] == '*' ) {
2098  return true;
2099  }
2100  return false;
2101  }
2102 
2103  // Get CMolInfo for seq and determine if completeness is
2104  // "eCompleteness_no_right or eCompleteness_no_ends. If so
2105  // suppress message about "X" at end of aa sequence is suppressed
2107  if (mi && mi->IsSetCompleteness()) {
2110  return true;
2111  }
2112  }
2113  } catch (CException ) {
2114  } catch (std::exception ) {
2115  }
2116  return false;
2117 }
2118 
2119 
2121 {
2122  CRef<CSeq_loc> loc;
2123  if (!seq.GetInst().IsSetExt()) {
2124  return loc;
2125  }
2126 
2127  if (seq.GetInst().GetExt().IsSeg()) {
2128  CRef<CSeq_loc> nloc(new CSeq_loc());
2129  loc = nloc;
2130  CSeq_loc_mix& mix = loc->SetMix();
2131  ITERATE (list< CRef<CSeq_loc> >, it,
2132  seq.GetInst().GetExt().GetSeg().Get()) {
2133  mix.Set().push_back(*it);
2134  }
2135  } else if (seq.GetInst().GetExt().IsRef()) {
2136  CRef<CSeq_loc> nloc(new CSeq_loc());
2137  loc = nloc;
2138  loc->Add(seq.GetInst().GetExt().GetRef());
2139  }
2140  return loc;
2141 }
2142 
2143 
2144 // Check if CdRegion required but not found
2146 {
2147  if ( bsh && CSeq_inst::IsAa(bsh.GetInst_Mol()) ) {
2148  CSeq_entry_Handle nps =
2150  if ( nps ) {
2151  const CSeq_feat* cds = GetCDSForProduct(bsh);
2152  if ( cds == 0 ) {
2153  const CSeq_feat* mat = GetPROTForProduct(bsh);
2154  if ( mat == 0 ) {
2155  return true;
2156  }
2157  }
2158  }
2159  }
2160 
2161  return false;
2162 }
2163 
2164 
2166 {
2168 
2169  if ( sd ) {
2170  const CMolInfo &mi = sd->GetMolinfo();
2171  if ( mi.IsSetBiomol() ) {
2172  return mi.GetBiomol() == CMolInfo::eBiomol_mRNA;
2173  }
2174  } else if (bsh.GetBioseqMolType() == CSeq_inst::eMol_rna) {
2175  // if no molinfo, assume rna is mrna
2176  return true;
2177  }
2178 
2179  return false;
2180 }
2181 
2182 
2184 {
2186 
2187  if ( sd ) {
2188  const CMolInfo &mi = sd->GetMolinfo();
2189  if ( mi.IsSetBiomol() ) {
2190  return mi.GetBiomol() == CMolInfo::eBiomol_pre_RNA;
2191  }
2192  }
2193 
2194  return false;
2195 }
2196 
2197 
2199 {
2200  size_t counter = 0;
2201  for ( CSeq_loc_CI slit(loc); slit; ++slit ) {
2202  if ( !IsFarLocation(slit.GetEmbeddingSeq_loc(), m_Imp.GetTSEH()) ) {
2203  ++counter;
2204  }
2205  }
2206  return counter;
2207 }
2208 
2209 
2210 bool CValidError_bioseq::LocOnSeg(const CBioseq& seq, const CSeq_loc& loc)
2211 {
2212  for ( CSeq_loc_CI sli( loc ); sli; ++sli ) {
2213  const CSeq_id& loc_id = sli.GetSeq_id();
2214  FOR_EACH_SEQID_ON_BIOSEQ (seq_id, seq) {
2215  if ( loc_id.Match(**seq_id) ) {
2216  return true;
2217  }
2218  }
2219  }
2220  return false;
2221 }
2222 
2223 
2224 static bool s_NotPeptideException
2225 (const CSeq_feat& curr,
2226  const CSeq_feat& prev)
2227 {
2228  if (curr.IsSetExcept() && curr.GetExcept() && curr.IsSetExcept_text()) {
2229  if (NStr::FindNoCase(curr.GetExcept_text(), "alternative processing") != NPOS) {
2230  return false;
2231  }
2232  }
2233  if (prev.IsSetExcept() && prev.GetExcept() && prev.IsSetExcept_text()) {
2234  if (NStr::FindNoCase(prev.GetExcept_text(), "alternative processing") != NPOS) {
2235  return false;
2236  }
2237  }
2238  return true;
2239 }
2240 
2241 
2243 (const CSeq_feat_Handle& f1,
2244  const CSeq_feat_Handle& f2)
2245 {
2246  CSeq_annot_Handle ah1 = f1.GetAnnot();
2247  CSeq_annot_Handle ah2 = f2.GetAnnot();
2248 
2249  if (!ah1 || !ah2) {
2250  return true;
2251  }
2252 
2255  if (!sap1 || !sap2) {
2256  return true;
2257  }
2258 
2259  if (!sap1->IsSetDesc() || !sap2->IsSetDesc()) {
2260  return true;
2261  }
2262 
2263  CAnnot_descr::Tdata descr1 = sap1->GetDesc().Get();
2264  CAnnot_descr::Tdata descr2 = sap2->GetDesc().Get();
2265 
2266  // Check only on the first? (same as in C toolkit)
2267  const CAnnotdesc& desc1 = descr1.front().GetObject();
2268  const CAnnotdesc& desc2 = descr2.front().GetObject();
2269 
2270  if ( desc1.Which() == desc2.Which() ) {
2271  if ( desc1.IsName() ) {
2272  return NStr::EqualNocase(desc1.GetName(), desc2.GetName());
2273  } else if ( desc1.IsTitle() ) {
2274  return NStr::EqualNocase(desc1.GetTitle(), desc2.GetTitle());
2275  }
2276  }
2277 
2278  return false;
2279 }
2280 
2281 
// Loop header that iterates Itr over the container returned by Var.GetId(),
// using ITERATE with element container type CBioseq_Handle::TId.
// Var is wrapped in parentheses so that any expression (for example a
// dereferenced pointer) binds to .GetId() as a whole.
#define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var) \
ITERATE (CBioseq_Handle::TId, Itr, (Var).GetId())
2284 
2286 {
2287  if (!IsMaster(seq)) {
2288  return false;
2289  }
2290  CBioseq_Handle bsh = scope.GetBioseqHandle(seq);
2291  return IsWGS(bsh);
2292 }
2293 
2294 
2296 {
2297  bool rval = false;
2298  if (entry.IsSeq()) {
2299  if (IsMaster(entry.GetSeq()) && IsWGS(entry.GetSeq())) {
2300  rval = true;
2301  }
2302  } else if (entry.IsSet() && entry.GetSet().IsSetSeq_set()) {
2304  if (IsWGSMaster(**it)) {
2305  rval = true;
2306  break;
2307  }
2308  }
2309  }
2310  return rval;
2311 }
2312 
2313 
2315 {
2316  if (!seq.IsSetDescr()) {
2317  return false;
2318  }
2319  ITERATE(CBioseq::TDescr::Tdata, it, seq.GetDescr().Get()) {
2320  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech() && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2321  return true;
2322  }
2323  }
2324  return false;
2325 }
2326 
2327 
2329 {
2330  CSeqdesc_CI molinfo(bsh, CSeqdesc::e_Molinfo);
2331  if (molinfo && molinfo->GetMolinfo().IsSetTech() && molinfo->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2332  return true;
2333  }
2334  return false;
2335 }
2336 
2337 
2339 {
2340  const CTextseq_id* txt = id.GetTextseq_Id();
2341  if (txt == NULL || !txt->IsSetAccession()) {
2342  return false;
2343  }
2346  return true;
2347  } else {
2348  return false;
2349  }
2350 }
2351 
2352 
2354 {
2355  if (!seq.IsSetId()) {
2356  return false;
2357  }
2358  ITERATE(CBioseq::TId, id, seq.GetId()) {
2359  if (IsWGSAccession(**id)) {
2360  return true;
2361  }
2362  }
2363  return false;
2364 }
2365 
2366 
2368 {
2369  const CTextseq_id* txt = id.GetTextseq_Id();
2370  if (txt == NULL || !txt->IsSetAccession()) {
2371  return false;
2372  }
2375  return true;
2376  } else {
2377  return false;
2378  }
2379 }
2380 
2381 
2383 {
2384  if (!seq.IsSetId()) {
2385  return false;
2386  }
2387  ITERATE(CBioseq::TId, id, seq.GetId()) {
2388  if (IsWGSAccession(**id)) {
2389  return true;
2390  }
2391  }
2392  return false;
2393 }
2394 
2395 
2397 {
2398  CBioseq_Handle bsh = scope.GetBioseqHandle(seq);
2399  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
2400  if (desc && desc->GetMolinfo().IsSetCompleteness()) {
2401  CMolInfo::TCompleteness completeness = desc->GetMolinfo().GetCompleteness();
2402  if (completeness == CMolInfo::eCompleteness_partial
2403  || completeness == CMolInfo::eCompleteness_no_left
2404  || completeness == CMolInfo::eCompleteness_no_right
2405  || completeness == CMolInfo::eCompleteness_no_ends) {
2406  return true;
2407  }
2408  }
2409  return false;
2410 }
2411 
2412 
2414 {
2415  FOR_EACH_SEQID_ON_BIOSEQ(id, seq) {
2416  if ((*id)->IsPdb()) {
2417  return true;
2418  }
2419  }
2420  return false;
2421 }
2422 
2423 
2425 {
2426  if (IsPdb(seq) || IsWGSMaster(seq, *m_Scope)) {
2427  return;
2428  }
2429  const CSeq_inst& inst = seq.GetInst();
2430 
2431  TSeqPos len = inst.IsSetLength() ? inst.GetLength() : 0;
2432  if ( seq.IsAa() ) {
2433  if (len <= 3 && !IsPartial(seq, *m_Scope)) {
2434  PostErr(eDiag_Warning, eErr_SEQ_INST_ShortSeq, "Sequence only " +
2435  NStr::IntToString(len) + " residues", seq);
2436  }
2437  } else {
2438  if ( len <= 10) {
2439  PostErr(eDiag_Warning, eErr_SEQ_INST_ShortSeq, "Sequence only " +
2440  NStr::IntToString(len) + " residues", seq);
2441  }
2442  }
2443 
2444  /*
2445  if ( (len <= 350000) || m_Imp.IsNC() || m_Imp.IsNT() ) {
2446  return;
2447  }
2448 
2449  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
2450  if ( !bsh ) {
2451  return;
2452  }
2453  CSeqdesc_CI desc( bsh, CSeqdesc::e_Molinfo );
2454  const CMolInfo* mi = desc ? &(desc->GetMolinfo()) : 0;
2455 
2456  if ( inst.GetRepr() == CSeq_inst::eRepr_delta ) {
2457  if ( mi && m_Imp.IsGED() ) {
2458  CMolInfo::TTech tech = mi->IsSetTech() ?
2459  mi->GetTech() : CMolInfo::eTech_unknown;
2460 
2461  if (tech == CMolInfo::eTech_htgs_0 ||
2462  tech == CMolInfo::eTech_htgs_1 ||
2463  tech == CMolInfo::eTech_htgs_2)
2464  {
2465  PostErr(eDiag_Warning, eErr_SEQ_INST_LongHtgsSequence,
2466  "Phase 0, 1 or 2 HTGS sequence exceeds 350kbp limit",
2467  seq);
2468  } else if (tech == CMolInfo::eTech_htgs_3) {
2469  PostErr(eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp,
2470  "Phase 3 HTGS sequence exceeds 350kbp limit", seq);
2471  } else if (tech == CMolInfo::eTech_wgs) {
2472  PostErr(eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp,
2473  "WGS sequence exceeds 350kbp limit", seq);
2474  } else {
2475  len = 0;
2476  bool litHasData = false;
2477  CTypeConstIterator<CSeq_literal> lit(ConstBegin(seq));
2478  for (; lit; ++lit) {
2479  if (lit->IsSetSeq_data()) {
2480  litHasData = true;
2481  }
2482  len += lit->GetLength();
2483  }
2484  if ( len > 500000 && litHasData ) {
2485  PostErr(eDiag_Error, eErr_SEQ_INST_LongLiteralSequence,
2486  "Length of sequence literals exceeds 500kbp limit",
2487  seq);
2488  }
2489  }
2490  }
2491  } else if ( inst.GetRepr() == CSeq_inst::eRepr_raw ) {
2492  if ( mi ) {
2493  CMolInfo::TTech tech = mi->IsSetTech() ?
2494  mi->GetTech() : CMolInfo::eTech_unknown;
2495  if (tech == CMolInfo::eTech_htgs_0 ||
2496  tech == CMolInfo::eTech_htgs_1 ||
2497  tech == CMolInfo::eTech_htgs_2)
2498  {
2499  PostErr(eDiag_Warning, eErr_SEQ_INST_LongHtgsSequence,
2500  "Phase 0, 1 or 2 HTGS sequence exceeds 350kbp limit",
2501  seq);
2502  } else if (tech == CMolInfo::eTech_htgs_3) {
2503  PostErr(eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp,
2504  "Phase 3 HTGS sequence exceeds 350kbp limit", seq);
2505  } else if (tech == CMolInfo::eTech_wgs) {
2506  PostErr(eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp,
2507  "WGS sequence exceeds 350kbp limit", seq);
2508  } else {
2509  PostErr (eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp,
2510  "Length of sequence exceeds 350kbp limit", seq);
2511  }
2512  } else {
2513  PostErr (eDiag_Warning, eErr_SEQ_INST_SequenceExceeds350kbp,
2514  "Length of sequence exceeds 350kbp limit", seq);
2515  }
2516  }
2517  */
2518 }
2519 
2520 
2521 // Assumes that seq is segmented and has Seq-ext data
2523 {
2524  // Get parent CSeq_entry of seq and then find the next
2525  // CSeq_entry in the set. This CSeq_entry should be a CBioseq_set
2526  // of class parts.
2527  const CSeq_entry* se = seq.GetParentEntry();
2528  if (!se) {
2529  return;
2530  }
2531  const CSeq_entry* parent = se->GetParentEntry ();
2532  if (!parent) {
2533  return;
2534  }
2535  if ( !parent->IsSet() || !parent->GetSet().IsSetClass() || parent->GetSet().GetClass() != CBioseq_set::eClass_segset) {
2536  return;
2537  }
2538 
2539  // Loop through seq_set looking for the parts set.
2540  FOR_EACH_SEQENTRY_ON_SEQSET (it, parent->GetSet()) {
2541  if ((*it)->Which() == CSeq_entry::e_Set
2542  && (*it)->GetSet().IsSetClass()
2543  && (*it)->GetSet().GetClass() == CBioseq_set::eClass_parts) {
2544  const CBioseq_set::TSeq_set& parts = (*it)->GetSet().GetSeq_set();
2545  const CSeg_ext::Tdata& locs = seq.GetInst().GetExt().GetSeg().Get();
2546 
2547  // Make sure the number of locations (excluding null locations)
2548  // match the number of parts
2549  size_t nulls = 0;
2550  ITERATE ( CSeg_ext::Tdata, loc, locs ) {
2551  if ( (*loc)->IsNull() ) {
2552  nulls++;
2553  }
2554  }
2555  if ( locs.size() - nulls < parts.size() ) {
2557  "Parts set contains too many Bioseqs", seq);
2558  return;
2559  } else if ( locs.size() - nulls > parts.size() ) {
2561  "Parts set does not contain enough Bioseqs", seq);
2562  return;
2563  }
2564 
2565  // Now, simultaneously loop through the parts of se_parts and CSeq_locs of
2566  // seq's Seq-ext. If they do not correspond, post an error.
2567  size_t size = locs.size(); // == parts.size()
2568  CSeg_ext::Tdata::const_iterator loc_it = locs.begin();
2569  CBioseq_set::TSeq_set::const_iterator part_it = parts.begin();
2570  for ( size_t i = 0; i < size; ++i ) {
2571  try {
2572  if ( (*loc_it)->IsNull() ) {
2573  ++loc_it;
2574  continue;
2575  }
2576  if ( !(*part_it)->IsSeq() ) {
2578  "Parts set component is not Bioseq", seq);
2579  return;
2580  }
2581  const CSeq_id& loc_id = GetId(**loc_it, m_Scope);
2582  if ( !IsIdIn(loc_id, (*part_it)->GetSeq()) ) {
2584  "Segmented bioseq seq_ext does not correspond to parts "
2585  "packaging order", seq);
2586  return;
2587  }
2588 
2589  // advance both iterators
2590  ++part_it;
2591  ++loc_it;
2592  } catch (const CObjmgrUtilException&) {
2593  ERR_POST_X(4, "Seq-loc not for unique sequence");
2594  return;
2595  } catch (CException &x1) {
2596  string err_msg = "Unknown error:";
2597  err_msg += x1.what();
2598  ERR_POST_X(5, err_msg);
2599  return;
2600  } catch (std::exception &x2) {
2601  string err_msg = "Unknown error:";
2602  err_msg += x2.what();
2603  ERR_POST_X(5, err_msg);
2604  return;
2605  }
2606  }
2607  }
2608  }
2609 }
2610 
2611 static bool s_IsConWithGaps(const CBioseq& seq)
2612 
2613 {
2614 
2615  if (! seq.IsSetInst ()) return false;
2616  const CSeq_inst& inst = seq.GetInst();
2617  if (! inst.IsSetExt ()) return false;
2618  if (! inst.GetExt().IsDelta()) return false;
2619 
2620  ITERATE(CDelta_ext::Tdata, iter, inst.GetExt().GetDelta().Get()) {
2621  if (! (*iter)->IsLiteral() ) continue;
2622  const CSeq_literal& lit = (*iter)->GetLiteral();
2623  if (!lit.IsSetSeq_data()) return true;
2624  if (lit.GetSeq_data().IsGap() && lit.GetLength() > 0) return true;
2625  }
2626 
2627  return false;
2628 }
2629 
2630 
2632 {
2633  bool has_gap = false;
2634  if (seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta()) {
2635  ITERATE(CDelta_ext::Tdata, iter, seq.GetInst().GetExt().GetDelta().Get()) {
2636  if ((*iter)->IsLiteral() &&
2637  (!(*iter)->GetLiteral().IsSetSeq_data() || (*iter)->GetLiteral().GetSeq_data().IsGap())) {
2638  has_gap = true;
2639  break;
2640  }
2641  }
2642  }
2643  return has_gap;
2644 }
2645 
2647 {
2649  if (!bsh) {
2650  return;
2651  }
2652 
2653  string title = sequence::CDeflineGenerator().GenerateDefline(bsh);
2654 
2655 /*bsv
2656  CMolInfo::TTech tech = CMolInfo::eTech_unknown;
2657 */
2658  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
2659  if (desc) {
2660  const CMolInfo& mi = desc->GetMolinfo();
2661 /*bsv
2662  tech = mi.GetTech();
2663 */
2665  if (m_Imp.IsGenbank()) {
2666  if (NStr::Find(title, "complete genome") != NPOS) {
2667  const CSeq_entry& ctx = *seq.GetParentEntry();
2669  "Complete genome in title without complete flag set",
2670  ctx, *desc);
2671  }
2672  }
2674  (! s_IsConWithGaps (seq)) &&
2675  !m_Imp.IsEmbl() && !m_Imp.IsDdbj()) {
2676  const CSeq_entry& ctx = *seq.GetParentEntry();
2678  "Circular topology without complete flag set", ctx, *desc);
2679  }
2680  }
2681  }
2682 
2683  // warning if title contains complete genome but sequence contains gap features
2684  if (NStr::FindNoCase (title, "complete genome") != NPOS && x_HasGap(seq)) {
2686  "Title contains 'complete genome' but sequence has gaps", seq);
2687  }
2688 
2689 
2690  // note - test for protein titles was moved to CValidError_bioseqset::ValidateNucProtSet
2691  // because it only applied for protein sequences in nuc-prot sets and it's more efficient
2692  // to create the defline generator once per nuc-prot set
2693 }
2694 
2695 static bool HasAssemblyOrNullGap (const CBioseq& seq)
2696 {
2697  const CSeq_inst& inst = seq.GetInst();
2698  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2699  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2700  if ( !(*sg) ) continue;
2701  if ((**sg).Which() != CDelta_seq::e_Literal) continue;
2702  const CSeq_literal& lit = (*sg)->GetLiteral();
2703  if (! lit.IsSetSeq_data()) return true;
2704  if (lit.GetSeq_data().IsGap()) return true;
2705  }
2706  }
2707 
2708  return false;
2709 }
2710 
2711 
2713 {
2714  const CSeq_inst& inst = seq.GetInst();
2715  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2716  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2717  if ( !(*sg) ) continue;
2718  if ((**sg).Which() != CDelta_seq::e_Literal) continue;
2719  const CSeq_literal& lit = (*sg)->GetLiteral();
2720  if (! lit.IsSetSeq_data()) {
2721  PostErr(eDiag_Warning, eErr_SEQ_INST_SeqGapProblem, "TSA Seq_data NULL", seq);
2722  } else {
2723  const CSeq_data& data = lit.GetSeq_data();
2724  if (data.Which() == CSeq_data::e_Gap) {
2725  const CSeq_gap& gap = data.GetGap();
2726  if (gap.IsSetType()) {
2727  int gaptype = gap.GetType();
2728  if (gaptype == CSeq_gap::eType_unknown) {
2729  PostErr(eDiag_Warning, eErr_SEQ_INST_SeqGapProblem, "TSA Seq_gap.unknown", seq);
2730  } else if (gaptype == CSeq_gap::eType_other) {
2731  PostErr(eDiag_Warning, eErr_SEQ_INST_SeqGapProblem, "TSA Seq_gap.other", seq);
2732  }
2733  } else {
2734  PostErr(eDiag_Warning, eErr_SEQ_INST_SeqGapProblem, "TSA Seq_gap NULL", seq);
2735  }
2736  }
2737  }
2738  }
2739  }
2740 }
2741 
2742 
2744 {
2745  const CSeq_inst& inst = seq.GetInst();
2746  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2747  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2748  if (!(*sg)) continue;
2749  // CON division - far delta - suppresses errors
2750  if ((**sg).Which() != CDelta_seq::e_Literal) /* continue */ return false;
2751  const CSeq_literal& lit = (*sg)->GetLiteral();
2752  if (!lit.IsSetSeq_data()) {
2753  return true;
2754  } else {
2755  const CSeq_data& data = lit.GetSeq_data();
2756  if (data.Which() == CSeq_data::e_Gap) {
2757  const CSeq_gap& gap = data.GetGap();
2758  CSeq_gap::TType gap_type = gap.IsSetType() ? gap.GetType() : CSeq_gap::eType_unknown;
2759 
2760  if (gap_type != CSeq_gap::eType_centromere && gap_type != CSeq_gap::eType_heterochromatin &&
2761  gap_type != CSeq_gap::eType_short_arm && gap_type != CSeq_gap::eType_telomere &&
2762  gap_type != CSeq_gap::eType_contig) {
2763 
2764  if (!gap.IsSetLinkage_evidence() || gap.GetLinkage_evidence().empty()) {
2765  return true;
2766  }
2767  }
2768  }
2769  }
2770  }
2771  }
2772  return false;
2773 }
2774 
2775 
2777 {
2778  if (HasBadWGSGap(seq)) {
2780  "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence.", seq);
2781  }
2782 }
2783 
2784 
2786 {
2787  if (HasBadWGSGap(seq)) {
2789  "TSA submission includes wrong gap type. Gaps for TSA should be Assembly Gaps with linkage evidence.", seq);
2790  }
2791 }
2792 
2793 
2795 {
2796  if (HasBadWGSGap(seq)) {
2798  "Genome submission includes wrong gap type. Gaps for genomes should be Assembly Gaps with linkage evidence.", seq);
2799  }
2800 }
2801 
2802 
2803 bool s_FieldHasLabel(const CUser_field& field, const string& label)
2804 {
2805  if (field.IsSetLabel() && field.GetLabel().IsStr() &&
2806  NStr::EqualNocase(field.GetLabel().GetStr(), label)) {
2807  return true;
2808  } else {
2809  return false;
2810  }
2811 }
2812 
2813 
2815 {
2816  if (!field.IsSetData()) {
2817  return false;
2818  }
2819  bool rval = false;
2820  if (field.GetData().IsStr()) {
2821  if (!NStr::IsBlank(field.GetData().GetStr())) {
2822  rval = true;
2823  }
2824  } else if (field.GetData().IsStrs()) {
2826  if (!NStr::IsBlank(*s)) {
2827  rval = true;
2828  break;
2829  }
2830  }
2831  }
2832  return rval;
2833 }
2834 
2835 
2837 {
2838  bool has_biosample = false;
2839  bool has_bioproject = false;
2840 
2841  CSeqdesc_CI d(bsh, CSeqdesc::e_User);
2842  while (d) {
2845  if (s_FieldHasLabel(**it, "BioSample")) {
2846  if (s_FieldHasNonBlankValue(**it)) {
2847  has_biosample = true;
2848  }
2849  } else if (s_FieldHasLabel(**it, "BioProject")) {
2850  if (s_FieldHasNonBlankValue(**it)) {
2851  has_bioproject = true;
2852  }
2853  }
2854  }
2855  }
2856  ++d;
2857  }
2858  if (!has_biosample && !has_bioproject) {
2860  "WGS master lacks both BioSample and BioProject",
2861  *(bsh.GetCompleteBioseq()));
2862  } else if (!has_biosample) {
2864  "WGS master lacks BioSample",
2865  *(bsh.GetCompleteBioseq()));
2866  } else if (!has_bioproject) {
2868  "WGS master lacks BioProject",
2869  *(bsh.GetCompleteBioseq()));
2870  }
2871  if (!has_biosample || !has_bioproject) {
2872  }
2873 }
2874 
2875 
2876 static EDiagSev GetBioseqEndWarning (const CBioseq& seq, bool is_circular, EBioseqEndIsType end_is_char)
2877 {
2878  EDiagSev sev;
2879  bool only_local = true;
2880  bool is_NCACNTNW = false;
2881  bool is_patent = false;
2882  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2883  if (!(*id_it)->IsLocal()) {
2884  only_local = false;
2885  if ((*id_it)->IsPatent()) {
2886  is_patent = true;
2887  } else if (IsNTNCNWACAccession(**id_it)) {
2888  is_NCACNTNW = true;
2889  }
2890  }
2891  }
2892 
2893  if (is_NCACNTNW || is_patent) {
2894  sev = eDiag_Warning;
2895  } else if (is_circular) {
2896  sev = eDiag_Warning;
2897  } else if (only_local) {
2898  sev = eDiag_Warning;
2899  } else if (end_is_char == eBioseqEndIsType_All) {
2900  sev = eDiag_Error;
2901  } else {
2902  sev = eDiag_Warning;
2903  }
2904  return sev;
2905 }
2906 
2907 
2908 void CValidError_bioseq::x_CalculateNsStretchAndTotal(const CBioseq& seq, TSeqPos& num_ns, TSeqPos& max_stretch, bool& n5, bool& n3)
2909 {
2910  num_ns = 0;
2911  max_stretch = 0;
2912  n5 = false;
2913  n3 = false;
2914 
2915  if (HasAssemblyOrNullGap (seq)) return;
2917  if ( !bsh ) {
2918  return;
2919  }
2920 
2922 
2923  TSeqPos this_stretch = 0;
2924  for (TSeqPos i = 0; i < vec.size(); i++) {
2925  if (vec[i] == 'N') {
2926  num_ns++;
2927  if (vec.IsInGap(i)) {
2928  if (max_stretch < this_stretch) {
2929  max_stretch = this_stretch;
2930  }
2931  this_stretch = 0;
2932  } else {
2933  this_stretch++;
2934  if (this_stretch >= 10) {
2935  if (i < 20) {
2936  n5 = true;
2937  }
2938  if (vec.size() > 20 && i > vec.size() - 10) {
2939  n3 = true;
2940  }
2941  }
2942  }
2943  } else {
2944  if (max_stretch < this_stretch) {
2945  max_stretch = this_stretch;
2946  }
2947  this_stretch = 0;
2948  }
2949  }
2950  if (max_stretch < this_stretch) {
2951  max_stretch = this_stretch;
2952  }
2953 }
2954 
2955 
2957 {
2958  TSeqPos num_ns = 0;
2959  TSeqPos max_stretch = 0;
2960  bool n5 = false;
2961  bool n3 = false;
2962  bool rval = false;
2963 
2964  x_CalculateNsStretchAndTotal(seq, num_ns, max_stretch, n5, n3);
2965 
2966  if (max_stretch >= 15) {
2968  "Sequence has a stretch of " + NStr::IntToString(max_stretch) + " Ns", seq);
2969  rval = true;
2970  } else {
2971  if (n5) {
2973  "Sequence has a stretch of at least 10 Ns within the first 20 bases", seq);
2974  rval = true;
2975  }
2976  if (n3) {
2978  "Sequence has a stretch of at least 10 Ns within the last 20 bases", seq);
2979  rval = true;
2980  }
2981  }
2982  return rval;
2983 }
2984 
2985 
2986 // check to see if sequence is all Ns
2988 {
2989  bool rval = true;
2990  bool at_least_one = false;
2991  try {
2992  for (CSeqVector_CI sv_iter(vec); (sv_iter) && rval; ++sv_iter) {
2993  if (*sv_iter != 'N') {
2994  rval = false;
2995  }
2996  at_least_one = true;
2997  }
2998  } catch (CException& ) {
2999 
3000  }
3001  return (rval && at_least_one);
3002 }
3003 
3004 
3005 static int CountNs(const CSeq_data& seq_data, TSeqPos len)
3006 {
3007  int total = 0;
3008  switch (seq_data.Which()) {
3009  case CSeq_data::e_Ncbi4na:
3010  {
3011  vector<char>::const_iterator it = seq_data.GetNcbi4na().Get().begin();
3012  unsigned char mask = 0xf0;
3013  unsigned char shift = 4;
3014  for (size_t n = 0; n < len; n++) {
3015  unsigned char c = ((*it) & mask) >> shift;
3016  mask >>= 4;
3017  shift -= 4;
3018  if (!mask) {
3019  mask = 0xf0;
3020  shift = 4;
3021  ++it;
3022  }
3023  if (c == 15) {
3024  total++;
3025  }
3026  }
3027  }
3028  return total;
3029  case CSeq_data::e_Iupacna:
3030  {
3031  const string& s = seq_data.GetIupacna().Get();
3032  for (size_t n = 0; n < len; n++) {
3033  if (s[n] == 'N') {
3034  total++;
3035  }
3036  }
3037  }
3038  return total;
3039  case CSeq_data::e_Ncbi8na:
3040  case CSeq_data::e_Ncbipna:
3041  {
3042  CSeq_data iupacna;
3043  if (!CSeqportUtil::Convert(seq_data, &iupacna, CSeq_data::e_Iupacna)) {
3044  return total;
3045  }
3046  const string& s = iupacna.GetIupacna().Get();
3047  for (size_t n = 0; n < len; n++) {
3048  if (s[n] == 'N') {
3049  total++;
3050  }
3051  }
3052  }
3053  return total;
3054  default:
3055  return total;
3056  }
3057 }
3058 
3059 
3061 {
3062  int count = 0;
3063  SSeqMapSelector sel;
3065  for (CSeqMap_CI seq_iter(bsh, sel); seq_iter; ++seq_iter) {
3066  switch (seq_iter.GetType()) {
3067  case CSeqMap::eSeqData:
3068  count += CountNs(seq_iter.GetData(), seq_iter.GetLength());
3069  break;
3070  default:
3071  break;
3072  }
3073  }
3074 /*
3075  int pct_n = 0;
3076  try {
3077  CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
3078  TSeqPos num_ns = 0;
3079  for (size_t i = 0; i < vec.size(); i++) {
3080  try {
3081  if (vec[i] == 'N' && !vec.IsInGap(i)) {
3082  num_ns++;
3083  }
3084  } catch (CException& e2) {
3085  //bad character
3086  }
3087  }
3088  pct_n = (num_ns * 100) / bsh.GetBioseqLength();
3089  } catch (CException& e) {
3090  pct_n = 100;
3091  }
3092 */
3093  return bsh.GetBioseqLength() ? count * 100 / bsh.GetBioseqLength() : 100;
3094 }
3095 
3096 
3098 {
3099  if (!seq.IsSetInst() || !seq.GetInst().IsSetRepr()) {
3100  // can't check if no Inst or Repr
3101  return;
3102  }
3103  if (!seq.GetInst().IsSetMol() || seq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
3104  // don't check proteins here
3105  return;
3106  }
3107  CSeq_inst::TRepr repr = seq.GetInst().GetRepr();
3108 
3109  // only check for raw or for delta sequences that are delta lit only
3110  if (repr == CSeq_inst::eRepr_virtual || repr == CSeq_inst::eRepr_map) {
3111  return;
3112  }
3113 
3115  if ( !bsh ) {
3116  // no check if Bioseq not in scope
3117  return;
3118  }
3119 
3120  try {
3122 
3123  if (IsAllNs(vec)) {
3124  PostErr(eDiag_Critical, eErr_SEQ_INST_AllNs, "Sequence is all Ns", seq);
3125  return;
3126  }
3127 
3128  // don't bother checking if length is less than 10
3129  if (!seq.IsSetInst() || !seq.GetInst().IsSetRepr()
3130  || !seq.GetInst().IsSetLength() || seq.GetInst().GetLength() < 10) {
3131  return;
3132  }
3133 
3138  bool begin_ambig = false, end_ambig = false;
3139  if (ShouldCheckForNsAndGap(bsh) && x_IsDeltaLitOnly(seq.GetInst())) {
3140  CheckBioseqEndsForNAndGap(vec, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
3141  }
3142 
3143  bool is_circular = false;
3145  is_circular = true;
3146  }
3147  EDiagSev sev;
3148  if (begin_n != eBioseqEndIsType_None) {
3149  sev = GetBioseqEndWarning(seq, is_circular, begin_n);
3150  PostErr(sev, eErr_SEQ_INST_TerminalNs, "N at beginning of sequence", seq);
3151  } else if (begin_gap != eBioseqEndIsType_None) {
3152  sev = GetBioseqEndWarning(seq, is_circular, begin_gap);
3153  PostErr (sev, eErr_SEQ_INST_TerminalGap, "Gap at beginning of sequence", seq);
3154  }
3155 
3156  if (end_n != eBioseqEndIsType_None) {
3157  sev = GetBioseqEndWarning(seq, is_circular, end_n);
3158  PostErr(sev, eErr_SEQ_INST_TerminalNs, "N at end of sequence", seq);
3159  } else if (end_gap != eBioseqEndIsType_None) {
3160  sev = GetBioseqEndWarning(seq, is_circular, end_gap);
3161  PostErr (sev, eErr_SEQ_INST_TerminalGap, "Gap at end of sequence", seq);
3162  }
3163 
3164  if (begin_ambig) {
3166  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases",
3167  seq);
3168  }
3169  if (end_ambig) {
3171  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases",
3172  seq);
3173  }
3174 
3175  // don't check N content for patent sequences
3176  if (SeqIsPatent(seq)) {
3177  return;
3178  }
3179 
3180 
3181  // if TSA, check for percentage of Ns and max stretch of Ns
3182  if (IsBioseqTSA(seq, m_Scope)) {
3183  ReportBadAssemblyGap (seq);
3184  bool n5 = false;
3185  bool n3 = false;
3186  TSeqPos num_ns = 0, max_stretch = 0;
3187  x_CalculateNsStretchAndTotal(seq, num_ns, max_stretch, n5, n3);
3188 
3189  int pct_n = (num_ns * 100) / seq.GetLength();
3190  if (pct_n > 10) {
3192  "Sequence contains " + NStr::IntToString(pct_n) + " percent Ns", seq);
3193  }
3194 
3195  if (max_stretch >= 15) {
3197  "Sequence has a stretch of " + NStr::IntToString(max_stretch) + " Ns", seq);
3198  } else {
3199  if (n5) {
3201  "Sequence has a stretch of at least 10 Ns within the first 20 bases", seq);
3202  }
3203  if (n3) {
3205  "Sequence has a stretch of at least 10 Ns within the last 20 bases", seq);
3206  }
3207  }
3208  } else {
3209  // not TSA, just check for really high N percent
3210  int pct_n = PctNs(bsh);
3211  if (pct_n > 50) {
3213  "Sequence contains " + NStr::IntToString(pct_n) + " percent Ns", seq);
3214  }
3215  }
3216 
3217  if (!m_Imp.IsRefSeqConventions() && !IsRefSeq(seq) && !IsEmblOrDdbj(seq)) {
3218  if (IsWGS(bsh)) {
3219  ReportBadWGSGap(seq);
3220  } else if (IsBioseqTSA(seq, m_Scope)) {
3221  ReportBadTSAGap(seq);
3222  } else if (m_Imp.IsGenomeSubmission()) {
3223  ReportBadGenomeGap(seq);
3224  }
3225  }
3226  } catch ( exception& ) {
3227  // just ignore, and continue with the validation process.
3228  }
3229 }
3230 
3231 
3232  // Assumes that seq is eRepr_raw or eRepr_const
3234  const CBioseq& seq)
3235 {
3236  const CSeq_inst& inst = seq.GetInst();
3237  const CEnumeratedTypeValues* tv = CSeq_inst::GetTypeInfo_enum_ERepr();
3238  const string& rpr = tv->FindName(inst.GetRepr(), true);
3239 
3240  if (inst.IsSetFuzz() && (!inst.IsSetSeq_data() || !inst.GetSeq_data().IsGap())) {
3242  "Fuzzy length on " + rpr + " Bioseq", seq);
3243  }
3244 
3245  if (!inst.IsSetLength() || inst.GetLength() == 0) {
3246  string len = inst.IsSetLength() ?
3247  NStr::IntToString(inst.GetLength()) : "0";
3249  "Invalid Bioseq length [" + len + "]", seq);
3250  }
3251 
3252  if (inst.GetRepr() == CSeq_inst::eRepr_raw) {
3253  const CMolInfo* mi = 0;
3255  if ( mi_desc ) {
3256  mi = &(mi_desc->GetMolinfo());
3257  }
3258  CMolInfo::TTech tech =
3259  mi != 0 ? mi->GetTech() : CMolInfo::eTech_unknown;
3260  if (tech == CMolInfo::eTech_htgs_2 &&
3261  !GraphsOnBioseq(seq) &&
3262  !x_IsActiveFin(seq)) {
3264  "HTGS 2 raw seq has no gaps and no graphs", seq);
3265  }
3266  }
3267 
3269 
3270  CSeq_data::E_Choice seqtyp = inst.IsSetSeq_data() ?
3272  if (seqtyp != CSeq_data::e_Gap) {
3273  switch (seqtyp) {
3274  case CSeq_data::e_Iupacna:
3275  case CSeq_data::e_Ncbi2na:
3276  case CSeq_data::e_Ncbi4na:
3277  case CSeq_data::e_Ncbi8na:
3278  case CSeq_data::e_Ncbipna:
3279  if (inst.IsAa()) {
3281  "Using a nucleic acid alphabet on a protein sequence",
3282  seq);
3283  return;
3284  }
3285  break;
3286  case CSeq_data::e_Iupacaa:
3287  case CSeq_data::e_Ncbi8aa:
3288  case CSeq_data::e_Ncbieaa:
3289  case CSeq_data::e_Ncbipaa:
3291  if (inst.IsNa()) {
3293  "Using a protein alphabet on a nucleic acid",
3294  seq);
3295  return;
3296  }
3297  break;
3298  case CSeq_data::e_Gap:
3299  break;
3300  default:
3302  "Sequence alphabet not set",
3303  seq);
3304  return;
3305  }
3306 
3307  bool check_alphabet = false;
3308  unsigned int factor = 1;
3309  switch (seqtyp) {
3310  case CSeq_data::e_Iupacaa:
3311  case CSeq_data::e_Iupacna:
3312  case CSeq_data::e_Ncbieaa:
3314  check_alphabet = true;
3315  break;
3316  case CSeq_data::e_Ncbi8na:
3317  case CSeq_data::e_Ncbi8aa:
3318  break;
3319  case CSeq_data::e_Ncbi4na:
3320  factor = 2;
3321  break;
3322  case CSeq_data::e_Ncbi2na:
3323  factor = 4;
3324  break;
3325  case CSeq_data::e_Ncbipna:
3326  factor = 5;
3327  break;
3328  case CSeq_data::e_Ncbipaa:
3329  factor = 21;
3330  break;
3331  default:
3332  // Logically, should not occur
3334  "Sequence alphabet not set",
3335  seq);
3336  return;
3337  }
3338  TSeqPos calc_len = inst.IsSetLength() ? inst.GetLength() : 0;
3339 
3340  if (calc_len % factor) {
3341  calc_len += factor;
3342  }
3343  calc_len /= factor;
3344 
3345  string s_len = NStr::UIntToString(inst.GetLength());
3346 
3347  size_t data_len = GetDataLen(inst);
3348  string data_len_str = NStr::NumericToString(data_len * factor);
3349  if (calc_len > data_len) {
3351  "Bioseq.seq_data too short [" + data_len_str +
3352  "] for given length [" + s_len + "]", seq);
3353  return;
3354  } else if (calc_len < data_len) {
3356  "Bioseq.seq_data is larger [" + data_len_str +
3357  "] than given length [" + s_len + "]", seq);
3358  }
3359 
3360  if (check_alphabet) {
3361  unsigned int trailingX = 0;
3362  size_t dashes = 0;
3363  bool leading_x = false, found_lower = false;
3364 
3367 
3368  size_t bad_cnt = 0;
3369  TSeqPos pos = 1;
3370  for ( CSeqVector_CI sv_iter(*sv), sv_res_iter(sv_res); (sv_iter) && (sv_res_iter); ++sv_iter, ++sv_res_iter ) {
3371  CSeqVector::TResidue res = *sv_iter;
3372  CSeqVector::TResidue n_res = *sv_res_iter;
3373  if ( !IsResidue(n_res) ) {
3374  if (res == 'U' && bsh.IsSetInst_Mol() && bsh.GetInst_Mol() == CSeq_inst::eMol_rna) {
3375  // U is ok for RNA
3376  } else if (res == '*' && bsh.IsAa()) {
3377  trailingX = 0;
3378  } else {
3379  if ( ! IsResidue(res)) {
3380  if ( ++bad_cnt > 10 ) {
3382  "More than 10 invalid residues. Checking stopped",
3383  seq);
3384  return;
3385  } else {
3387  "Invalid residue [" + NStr::UIntToString(res)
3388  + "] at position [" + NStr::UIntToString(pos) + "]",
3389  seq);
3390  }
3391  } else if (islower (res)) {
3392  found_lower = true;
3393  } else {
3394  string msg = "Invalid";
3395  if (seq.IsNa() && strchr ("EFIJLOPQXZ", res) != NULL) {
3396  msg += " nucleotide";
3397  } else if (seq.IsNa() && res == 'U') {
3398  msg += " nucleotide";
3399  }
3400  msg += " residue ";
3401  if (seqtyp == CSeq_data::e_Ncbistdaa) {
3402  msg += "[" + NStr::UIntToString(res) + "]";
3403  } else {
3404  msg += "'";
3405  msg += res;
3406  msg += "'";
3407  }
3408  msg += " at position [" + NStr::UIntToString(pos) + "]";
3409 
3411  msg, seq);
3412  }
3413  }
3414  } else if ( res == '-' || sv->IsInGap(pos - 1) ) {
3415  dashes++;
3416  } else if ( res == '*') {
3417  trailingX = 0;
3418  } else if ( res == 'X' ) {
3419  trailingX++;
3420  if (pos == 1) {
3421  leading_x = true;
3422  }
3423  } else if (!isalpha (res)) {
3424  string msg = "Invalid residue [";
3425  msg += res;
3426  msg += "] in position [" + NStr::UIntToString(pos) + "]";
3428  msg, seq);
3429  } else {
3430  trailingX = 0;
3431  }
3432  ++pos;
3433  }
3434 
3435  bool gap_at_start = HasBadProteinStart(*sv);
3436  size_t terminations = CountProteinStops(*sv);
3437 
3438  // only show leading or trailing X if product of NNN in nucleotide
3439  if (seq.IsAa() && (leading_x || trailingX > 0)) {
3440  CBioseq_Handle bsh = m_Scope->GetBioseqHandle (seq);
3441  const CSeq_feat* cds = GetCDSForProduct(bsh);
3442  if (cds && cds->IsSetLocation()) {
3443  size_t dna_len = GetLength (cds->GetLocation(), m_Scope);
3444  if (dna_len > 5) {
3445  string cds_seq = GetSequenceStringFromLoc (cds->GetLocation(), *m_Scope);
3446  if (cds->GetData().GetCdregion().IsSetFrame()) {
3447  if (cds->GetData().GetCdregion().GetFrame() == 2) {
3448  cds_seq = cds_seq.substr(1);
3449  } else if (cds->GetData().GetCdregion().GetFrame() == 3) {
3450  cds_seq = cds_seq.substr(2);
3451  }
3452  }
3453 
3454  if (!NStr::StartsWith (cds_seq, "NNN")) {
3455  leading_x = false;
3456  }
3457  if (cds_seq.length() >= 3) {
3458  string lastcodon = cds_seq.substr(cds_seq.length() - 3);
3459  if (!NStr::StartsWith(lastcodon, "NNN")) {
3460  trailingX = 0;
3461  }
3462  }
3463  }
3464  }
3465  }
3466 
3467  if (leading_x) {
3469  "Sequence starts with leading X", seq);
3470  }
3471 
3472  if ( trailingX > 0 && !SuppressTrailingXMsg(seq) ) {
3473  // Suppress if cds ends in "*" or 3' partial
3474  string msg = "Sequence ends in " +
3475  NStr::IntToString(trailingX) + " trailing X";
3476  if ( trailingX > 1 ) {
3477  msg += "s";
3478  }
3480  }
3481 
3482  if (found_lower) {
3484  "Sequence contains lower-case characters", seq);
3485  }
3486 
3487  if (terminations > 0 || dashes > 0) {
3488  // Post error indicating terminations found in protein sequence
3489  // if possible, get gene and protein names
3490  CBioseq_Handle bsh = m_Scope->GetBioseqHandle (seq);
3491  // First get gene label
3492  string gene_label = "";
3493  try {
3494  const CSeq_feat* cds = GetCDSForProduct(bsh);
3495  if (cds) {
3497  if (gene && gene->IsSetData() && gene->GetData().IsGene()) {
3498  gene->GetData().GetGene().GetLabel(&gene_label);
3499  }
3500  }
3501  } catch (...) {
3502  }
3503  // get protein label
3504  string protein_label = "";
3505  try {
3506  CCacheImpl::SFeatKey prot_key(
3508  const CCacheImpl::TFeatValue & prots =
3509  GetCache().GetFeatFromCache(prot_key);
3510  if( ! prots.empty() ) {
3511  const CSeqFeatData_Base::TProt & first_prot =
3512  prots[0].GetData().GetProt();
3513  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(first_prot, Name) ) {
3514  protein_label = first_prot.GetName().front();
3515  }
3516  }
3517  } catch (CException ) {
3518  } catch (std::exception ) {
3519  }
3520 
3521  if (NStr::IsBlank(gene_label)) {
3522  gene_label = "gene?";
3523  }
3524  if (NStr::IsBlank(protein_label)) {
3525  protein_label = "prot?";
3526  }
3527 
3528  if (dashes > 0) {
3529  if (gap_at_start && dashes == 1) {
3531  "gap symbol at start of protein sequence (" + gene_label + " - " + protein_label + ")",
3532  seq);
3533  } else if (gap_at_start) {
3535  "gap symbol at start of protein sequence (" + gene_label + " - " + protein_label + ")",
3536  seq);
3538  "[" + NStr::SizetToString (dashes - 1) + "] internal gap symbols in protein sequence (" + gene_label + " - " + protein_label + ")",
3539  seq);
3540  } else {
3542  "[" + NStr::SizetToString (dashes) + "] internal gap symbols in protein sequence (" + gene_label + " - " + protein_label + ")",
3543  seq);
3544  }
3545  }
3546 
3547  if (terminations > 0) {
3548  string msg = "[" + NStr::SizetToString(terminations) + "] termination symbols in protein sequence";
3549  msg += " (" + gene_label + " - " + protein_label + ")";
3550 
3552  }
3553  }
3554  }
3555 
3556  bool is_wgs = IsWGS(bsh);
3557 
3558  if (seq.IsNa() && seq.GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
3559  // look for runs of Ns and gap characters
3560  bool has_gap_char = false;
3561  size_t run_len = 0;
3562  TSeqPos start_pos = 0;
3563  TSeqPos pos = 1;
3565  const size_t run_len_cutoff = ( is_wgs ? 20 : 100 );
3566  for ( CSeqVector_CI sv_iter(sv); (sv_iter); ++sv_iter, ++pos ) {
3567  CSeqVector::TResidue res = *sv_iter;
3568  switch(res) {
3569  case 'N':
3570  if (run_len == 0) {
3571  start_pos = pos;
3572  }
3573  run_len++;
3574  break;
3575  case '-':
3576  has_gap_char = true;
3577  ///////////////////////////////////
3578  ////////// FALL-THROUGH! //////////
3579  ///////////////////////////////////
3580  default:
3581  if (run_len >= run_len_cutoff && start_pos > 1)
3582  {
3584  "Run of " + NStr::SizetToString (run_len) + " Ns in raw sequence starting at base "
3585  + NStr::IntToString (start_pos),
3586  seq);
3587  }
3588  run_len = 0;
3589  break;
3590  }
3591  }
3592  if (has_gap_char) {
3594  "Raw nucleotide should not contain gap characters", seq);
3595  }
3596  }
3597  }
3598 }
3599 
3600 
3601 // Assumes seq is eRepr_seg or eRepr_ref
// NOTE(review): the signature line, the declaration of `bsh`, the heads of the
// error-posting calls and several `case` labels are not visible in this
// listing; the comments below describe only the statements that are visible.
// Visible checks, in order:
//   1. Seq-loc validation of the extension location (segmented repr only).
//   2. Declared instance length against the length of the extension location.
//   3. Repeated references to the same Bioseq among the segments.
//   4. Proteins only: MolInfo completeness against the partial flags computed
//      from the extension location.
3603 {
      // NOTE(review): id_test_label is filled here but is not read by any
      // visible statement below.
3604  string id_test_label;
3605  seq.GetLabel(&id_test_label, CBioseq::eContent);
3606 
3608  const CSeq_inst& inst = seq.GetInst();
3609 
3610  // Validate extension data -- wrap in CSeq_loc_mix for convenience
3611  CRef<CSeq_loc> loc = GetLocFromSeq(seq);
3612  if (loc) {
3613  if (inst.IsSetRepr() && inst.GetRepr() == CSeq_inst::eRepr_seg) {
3614  m_Imp.ValidateSeqLoc(*loc, bsh, true, "Segmented Bioseq", seq);
3615  }
3616 
3617  // Validate Length
3618  try {
3619  TSeqPos loclen = GetLength(*loc, m_Scope);
      // An unset instance length is compared as 0.
3620  TSeqPos seqlen = inst.IsSetLength() ? inst.GetLength() : 0;
3621  if (seqlen > loclen) {
3623  "Bioseq.seq_data too short [" + NStr::IntToString(loclen) +
3624  "] for given length [" + NStr::IntToString(seqlen) + "]",
3625  seq);
3626  } else if (seqlen < loclen) {
3628  "Bioseq.seq_data is larger [" + NStr::IntToString(loclen) +
3629  "] than given length [" + NStr::IntToString(seqlen) + "]",
3630  seq);
3631  }
3632  } catch (const CObjmgrUtilException&) {
      // A failed length computation is only logged here.
3633  ERR_POST_X(6, Critical << "Unable to calculate length: ");
3634  }
3635  }
3636 
3637  // Check for multiple references to the same Bioseq
      // Pairwise comparison of every segment with each later segment; segments
      // for which IsOneBioseq() is false are ignored.
3638  if (inst.IsSetExt() && inst.GetExt().IsSeg()) {
3639  const list< CRef<CSeq_loc> >& locs = inst.GetExt().GetSeg().Get();
3640  ITERATE(list< CRef<CSeq_loc> >, i1, locs) {
3641  if (!IsOneBioseq(**i1, m_Scope)) {
3642  continue;
3643  }
3644  const CSeq_id& id1 = GetId(**i1, m_Scope);
3645  list< CRef<CSeq_loc> >::const_iterator i2 = i1;
3646  for (++i2; i2 != locs.end(); ++i2) {
3647  if (!IsOneBioseq(**i2, m_Scope)) {
3648  continue;
3649  }
3650  const CSeq_id& id2 = GetId(**i2, m_Scope);
3651  if (IsSameBioseq(id1, id2, m_Scope)) {
3652  string sid;
3653  id1.GetLabel(&sid);
      // The message differs depending on whether both locations are "whole".
3654  if ((**i1).IsWhole() && (**i2).IsWhole()) {
3657  "Segmented sequence has multiple references to " +
3658  sid, seq);
3659  } else {
3662  "Segmented sequence has multiple references to " +
3663  sid + " that are not SEQLOC_WHOLE", seq);
3664  }
3665  }
3666  }
3667  }
3668  }
3669 
3670  // Check that partial sequence info on sequence segments is consistent with
3671  // partial sequence info on sequence -- aa sequences only
      // NOTE(review): `loc` is dereferenced here outside the `if (loc)` guard
      // used above; confirm GetLocFromSeq() cannot return an empty CRef on
      // this path.
3672  int partial = SeqLocPartialCheck(*loc, m_Scope);
3673  if (seq.IsAa()) {
3674  bool got_partial = false;
3675  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (sd, seq) {
3676  if (!(*sd)->IsMolinfo() || !(*sd)->GetMolinfo().IsSetCompleteness()) {
3677  continue;
3678  }
3679 
      // The `case` labels of this switch are not visible in this listing; each
      // visible branch tests `partial` against eSeqlocPartial_Start / _Stop
      // and sets got_partial.
3680  switch ((*sd)->GetMolinfo().GetCompleteness()) {
3682  got_partial = true;
3683  if (!partial) {
3685  "Complete segmented sequence with MolInfo partial", seq);
3686  }
3687  break;
3689  if (!(partial & eSeqlocPartial_Start) || (partial & eSeqlocPartial_Stop)) {
3691  "No-left inconsistent with segmented SeqLoc",
3692  seq);
3693  }
3694  got_partial = true;
3695  break;
3697  if (!(partial & eSeqlocPartial_Stop) || (partial & eSeqlocPartial_Start)) {
3699  "No-right inconsistent with segmented SeqLoc",
3700  seq);
3701  }
3702  got_partial = true;
3703  break;
3705  if (!(partial & eSeqlocPartial_Start) || !(partial & eSeqlocPartial_Stop)) {
3707  "No-ends inconsistent with segmented SeqLoc",
3708  seq);
3709  }
3710  got_partial = true;
3711  break;
3712  default:
3713  break;
3714  }
3715  }
      // NOTE(review): this visible condition tests only got_partial, not
      // `partial`; confirm against the full source.
3716  if (!got_partial) {
3718  "Partial segmented sequence without MolInfo partial", seq);
3719  }
3720  }
3721 }
3722 
3723 
// Maps a MolInfo technique (`tech`) to a limit on runs of Ns: 19 for
// CMolInfo::eTech_wgs, 99 by default, and 80 for the case label(s) that are
// not visible in this listing. The initial -1 is overwritten on every path.
// NOTE(review): the signature line is not visible in this listing.
3725 {
3726  int max_ns = -1;
3727 
3728  switch (tech) {
3732  max_ns = 80;
3733  break;
3734  case CMolInfo::eTech_wgs:
3735  max_ns = 19;
3736  break;
3737  default:
3738  max_ns = 99;
3739  break;
3740  }
3741  return max_ns;
3742 }
3743 
3744 
3745 static bool s_IsSwissProt (const CBioseq& seq)
3746 {
3747  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
3748  if ((*it)->IsSwissprot()) {
3749  return true;
3750  }
3751  }
3752  return false;
3753 }
3754 
// "Less than" ordering for two location references q1 and q2: first by Seq-id
// (CompareOrdered), then by positional start, then by positional stop.
// Returns false when all three are equal.
// NOTE(review): the signature line is not visible in this listing.
3756 {
3757  TIntId cmp = q1->GetId()->CompareOrdered(*(q2->GetId()));
3758  if (cmp < 0) {
3759  return true;
3760  } else if (cmp > 0) {
3761  return false;
3762  }
3763 
      // Same id: order by start position.
3764  TSeqPos start1 = q1->GetStart(eExtreme_Positional);
3765  TSeqPos start2 = q2->GetStart(eExtreme_Positional);
3766  if (start1 < start2) {
3767  return true;
3768  } else if (start2 < start1) {
3769  return false;
3770  }
3771 
      // Same id and start: order by stop position.
3772  TSeqPos stop1 = q1->GetStop(eExtreme_Positional);
3773  TSeqPos stop2 = q2->GetStop(eExtreme_Positional);
3774 
3775  if (stop1 < stop2) {
3776  return true;
3777  } else {
3778  return false;
3779  }
3780 }
3781 
3782 
// Returns true when a delta Bioseq has a location component whose Seq-id
// compares equal (CSeq_id::e_YES) to one of the Bioseq's own ids. Returns
// false when `seq` has no delta extension.
// NOTE(review): the signature line is not visible in this listing.
3784 {
3785  bool rval = false;
3786 
3787  if (!seq.IsSetInst() || !seq.GetInst().IsSetExt() ||
3788  !seq.GetInst().GetExt().IsDelta()) {
3789  return false;
3790  }
3791 
3792  ITERATE(CDelta_ext::Tdata, sg, seq.GetInst().GetExt().GetDelta().Get()) {
3793  if (!(*sg)) {
3794  // skip NULL element
3795  } else if ((*sg)->IsLoc()) {
      // GetId() may return a null pointer; such components are skipped.
3796  const CSeq_id *id = (*sg)->GetLoc().GetId();
3797  if (id) {
3798  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
3799  if ((*id_it)->Compare(*id) == CSeq_id::e_YES) {
3800  rval = true;
3801  break;
3802  }
3803  }
3804  }
      // Stop scanning components after the first match.
3805  if (rval) break;
3806  }
3807  }
3808  return rval;
3809 }
3810 
3811 
// For an interval location `loc` on the sequence behind `far_bsh`, returns
// true when a feature iterator (CFeat_CI) finds anything in the part of that
// sequence before the interval's start or after its stop. Returns false for
// non-interval locations.
// NOTE(review): the signature line is not visible in this listing.
3813 {
3814  if (!loc.IsInt()) {
3815  return false;
3816  }
3817 
3818  TSeqPos stop = loc.GetStop(eExtreme_Positional);
3819  TSeqPos start = loc.GetStart(eExtreme_Positional);
3820 
      // Region [0, start - 1] to the left of the interval.
3821  if (start > 0) {
3822  CRef<CSeq_loc> far_loc(new CSeq_loc());
3823  far_loc->SetInt().SetFrom(0);
3824  far_loc->SetInt().SetTo(start - 1);
3825  far_loc->SetInt().SetId().Assign(loc.GetInt().GetId());
3826  CFeat_CI f(far_bsh.GetScope(), *far_loc);
3827  if (f) {
3828  return true;
3829  }
3830  }
      // Region [stop + 1, length - 1] to the right of the interval.
      // NOTE(review): `GetBioseqLength() - 1` presumably is unsigned
      // arithmetic; confirm a zero length cannot reach this point.
3831  if (stop < far_bsh.GetBioseqLength() - 1) {
3832  CRef<CSeq_loc> far_loc(new CSeq_loc());
3833  far_loc->SetInt().SetFrom(stop + 1);
3834  far_loc->SetInt().SetTo(far_bsh.GetBioseqLength() - 1);
3835  far_loc->SetInt().SetId().Assign(loc.GetInt().GetId());
3836  CFeat_CI f(far_bsh.GetScope(), *far_loc);
3837  if (f) {
3838  return true;
3839  }
3840  }
3841  return false;
3842 }
3843 
3844 
3846 (const CSeq_loc& loc,
3847  const CBioseq& seq,
3848  TSeqPos& len)
// Validates one location component of a delta sequence and adds its length to
// the running total `len` (in/out).
// NOTE(review): the first signature line, the declaration of `bsh` and the
// heads of the error-posting calls are not visible in this listing.
3849 {
3850  if (loc.IsWhole()) {
3852  "Delta seq component should not be of type whole", seq);
3853  }
3854 
3855  const CSeq_id *id = loc.GetId();
3856  if (id) {
3857  if (id->IsGi() && loc.GetId()->GetGi() == ZERO_GI) {
3859  "Delta component is gi|0", seq);
3860  }
      // Far-component checks run only for non-whole locations whose id is of
      // one of the kinds listed below.
3861  if (!loc.IsWhole()
3862  && (id->IsGi()
3863  || id->IsGenbank()
3864  || id->IsEmbl()
3865  || id->IsDdbj() || id->IsTpg()
3866  || id->IsTpe()
3867  || id->IsTpd()
3868  || id->IsOther())) {
3869  TSeqPos stop = loc.GetStop(eExtreme_Positional);
3870  try {
3872  if (bsh) {
3873  TSeqPos seq_len = bsh.GetBioseqLength();
      // The location must end inside the far sequence.
3874  if (seq_len <= stop) {
3875  string id_label = id->AsFastaString();
3877  "Seq-loc extent (" + NStr::IntToString (stop + 1)
3878  + ") greater than length of " + id_label
3879  + " (" + NStr::IntToString(seq_len) + ")",
3880  seq);
3881  }
3882  if (!m_Imp.IsRefSeq() && IsWGS(seq) && HasExcludedAnnotation(loc, bsh)) {
3883  string id_label = id->AsFastaString();
3885  "Scaffold points to some but not all of " +
3886  id_label + ", excluded portion contains features", seq);
3887  }
3888  } else {
3890  "Unable to find far delta sequence component", seq);
3891  }
      // Any exception raised by the far-component checks is swallowed here.
3892  } catch (CException ) {
3893  } catch (std::exception ) {
3894  }
3895  }
3896  }
3897 
3898  try {
3899  if (seq.IsSetInst ()) {
3900  const CSeq_inst& inst = seq.GetInst();
3901  TSeqPos loc_len = GetLength(loc, m_Scope);
      // The maximum TSeqPos value is reported as a "-1" length and is not
      // added to `len`.
3902  if (loc_len == numeric_limits<TSeqPos>::max()) {
3904  "-1 length on seq-loc of delta seq_ext", seq);
3905  string loc_str;
3906  loc.GetLabel(&loc_str);
3907  if ( loc_str.empty() ) {
3908  loc_str = "?";
3909  }
3910  if (x_IsDeltaLitOnly(inst)) {
3912  "Short length (-1) on seq-loc (" + loc_str + ") of delta seq_ext", seq);
3913  }
3914  } else {
3915  len += loc_len;
3916  }
      // Lengths of 10 or less are reported as short when x_IsDeltaLitOnly()
      // holds for the instance.
3917  if ( loc_len <= 10 ) {
3918  string loc_str;
3919  loc.GetLabel(&loc_str);
3920  if ( loc_str.empty() ) {
3921  loc_str = "?";
3922  }
3923  if (x_IsDeltaLitOnly(inst)) {
3925  "Short length (" + NStr::SizetToString(loc_len) +
3926  ") on seq-loc (" + loc_str + ") of delta seq_ext", seq);
3927  }
3928  }
3929  }
3930 
3931  } catch (const CObjmgrUtilException&) {
      // A failed length computation is reported with the location label.
3932  string loc_str;
3933  loc.GetLabel(&loc_str);
3934  if ( loc_str.empty() ) {
3935  loc_str = "?";
3936  }
3938  "No length for Seq-loc (" + loc_str + ") of delta seq-ext",
3939  seq);
3940  }
3941 }
3942 
3943 
3944 static size_t s_GetDeltaLen (const CDelta_seq& seg, CScope* scope)
3945 {
3946  if (seg.IsLiteral()) {
3947  return seg.GetLiteral().GetLength();
3948  } else if (seg.IsLoc()) {
3949  return GetLength (seg.GetLoc(), scope);
3950  } else {
3951  return 0;
3952  }
3953 }
3954 
3955 
// Display names for linkage-evidence kinds, indexed the same way as the
// 12-slot counter array in the Seq-gap validation code of this file:
// slots 0-9 are addressed directly by the linkage-evidence type value,
// slot 10 is used for type value 255 and slot 11 for any other value.
// The table is only ever read, so it is declared const.
static const string linkEvStrings [] = {
    "paired-ends",
    "align genus",
    "align xgenus",
    "align trnscpt",
    "within clone",
    "clone contig",
    "map",
    "strobe",
    "unspecified",
    "pcr",
    "other",
    "UNKNOWN VALUE"
};
3970 
3971 /*bsv
3972 static bool s_IsGapComponent (const CDelta_seq& seg)
3973 {
3974  if (! seg.IsLiteral()) return false;
3975  const CSeq_literal& lit = seg.GetLiteral();
3976  if (! lit.IsSetSeq_data()) return true;
3977  if (lit.GetSeq_data().IsGap() && lit.GetLength() > 0) return true;
3978  return false;
3979 }
3980 */
3981 
// Returns true when any linkage-evidence entry visited by the loop has type
// value 8 (the slot labelled "unspecified" in linkEvStrings).
// NOTE(review): the loop header that defines `ev_itr` is not visible in this
// listing; presumably it iterates over the linkage evidence of `gap`.
3982 static bool s_IsUnspecified(const CSeq_gap& gap)
3983 {
3984  bool is_unspec = false;
3986  const CLinkage_evidence & evidence = **ev_itr;
3987  if (!evidence.CanGetType()) continue;
3988  int linktype = evidence.GetType();
3989  if (linktype == 8) {
3990  is_unspec = true;
3991  }
3992  }
3993  return is_unspec;
3994 }
3995 
3996 
// Decides whether a gap at an end of the sequence behind `bsh` should be
// ignored, given the gap's type `gap_type`.
// NOTE(review): the signature line and the second half of the topology
// condition are not visible in this listing.
3998 {
3999  // always ignore for circular sequences
4000  if (bsh.GetInst().IsSetTopology() &&
4002  return true;
4003  }
4004 
4005  // ignore if location is genomic and gap is of certain type
      // Any gap type other than these four is never ignored.
4006  if (gap_type != CSeq_gap::eType_centromere &&
4007  gap_type != CSeq_gap::eType_telomere &&
4008  gap_type != CSeq_gap::eType_heterochromatin &&
4009  gap_type != CSeq_gap::eType_short_arm) {
4010  return false;
4011  }
4012 
      // For those gap types, ignore only when the first source descriptor
      // found has genome == eGenome_chromosome.
4013  CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
4014  if (src && src->GetSource().IsSetGenome() && src->GetSource().GetGenome() == CBioSource::eGenome_chromosome) {
4015  return true;
4016  } else {
4017  return false;
4018  }
4019 }
4020 
4021 
4022 // Assumes seq is a delta sequence
// NOTE(review): the signature line, the declarations of `bsh`, `mi_desc`,
// `gap_type`, `sv` and `desc_i`, the heads of most error-posting calls and a
// few `case` labels / condition lines are not visible in this listing. The
// comments below describe only the visible statements.
// Visible stages: technique check, one pass over the delta components
// (locations, sequence literals, gaps), length reconciliation, gap placement
// checks, overlap detection between location components, self-reference
// check, and a scan for N residues adjacent to gaps.
4024 {
4025  const CSeq_inst& inst = seq.GetInst();
4026 
4027  // Get CMolInfo and tech used for validating technique and gap positioning
4028  const CMolInfo* mi = 0;
4030  if ( mi_desc ) {
4031  mi = &(mi_desc->GetMolinfo());
4032  }
      // Without a MolInfo the technique is treated as eTech_unknown.
4033  CMolInfo::TTech tech =
4034  mi != 0 ? mi->GetTech() : CMolInfo::eTech_unknown;
4035 
4036 
4037  if (!inst.IsSetExt() || !inst.GetExt().IsDelta() ||
4038  inst.GetExt().GetDelta().Get().empty()) {
4040  "No CDelta_ext data for delta Bioseq", seq);
4041  }
4042 
      // The id scan stops at the first id accepted by IsNTNCNWACAccession(),
      // so has_gi reflects only GI ids seen before that point.
4043  bool any_tech_ok = false;
4044  bool has_gi = false;
4045  FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
4046  if (IsNTNCNWACAccession(**id_it)) {
4047  any_tech_ok = true;
4048  break;
4049  } else if ((*id_it)->IsGi()) {
4050  has_gi = true;
4051  }
4052  }
      // Technique check for nucleotide deltas; part of the list of accepted
      // techniques is not visible in this listing.
4054  if (!any_tech_ok && seq.IsNa()
4055  && tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4056  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3
4059  && tech != CMolInfo::eTech_htc && tech != CMolInfo::eTech_barcode
4060  && tech != CMolInfo::eTech_tsa) {
4062  "Delta seq technique should not be [" + NStr::IntToString(tech) + "]", seq);
4063  }
4064 
4065  // set severity for first / last gap error
      // State carried across delta components:
      //   len   - running total of component lengths
      //   seg   - 1-based component counter used in messages
      //   first - true until the first non-null component has been handled
4066  TSeqPos len = 0;
4067  TSeqPos seg = 0;
4068  bool last_is_gap = false;
4069  int prev_gap_linkage = -1;
4070  CSeq_gap::TType prev_gap_type = CSeq_gap::eType_unknown;
4071  int gap_linkage = -1;
4073  size_t num_gaps = 0;
4074  size_t num_adjacent_gaps = 0;
4075  bool non_interspersed_gaps = false;
4076  bool first = true;
4077  int num_gap_known_or_spec = 0;
4078  int num_gap_unknown_unspec = 0;
4079 
      // Location components are collected here for the overlap check below.
4080  vector<CConstRef<CSeq_loc> > delta_locs;
4081 
4082  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
4083  ++seg;
4084  if ( !(*sg) ) {
4086  "NULL pointer in delta seq_ext valnode (segment " +
4087  NStr::IntToString(seg) + ")", seq);
4088  continue;
4089  }
4090  switch ( (**sg).Which() ) {
4091  case CDelta_seq::e_Loc:
4092  {
4093  const CSeq_loc& loc = (**sg).GetLoc();
4094  CConstRef<CSeq_loc> tmp(&loc);
4095  delta_locs.push_back (tmp);
4096 
      // ValidateDeltaLoc also adds the location's length to `len`.
4097  ValidateDeltaLoc (loc, seq, len);
4098 
      // Two sequence-bearing components in a row with no gap between them.
4099  if ( !last_is_gap && !first) {
4100  non_interspersed_gaps = true;
4101  }
4102  last_is_gap = false;
4103  prev_gap_linkage = -1;
4104  prev_gap_type = CSeq_gap::eType_unknown;
      // NOTE(review): a gap *type* constant is assigned to the *linkage*
      // variable here; the parallel branches use -1. Confirm intent.
4105  gap_linkage = CSeq_gap::eType_unknown;
4106  first = false;
4107  break;
4108  }
4109  case CDelta_seq::e_Literal:
4110  {
4111  // The C toolkit code checks for valid alphabet here
4112  // The C++ object serializaton will not load if invalid alphabet
4113  // so no check needed here
4114  const CSeq_literal& lit = (*sg)->GetLiteral();
      // start_len is the offset of this literal within the delta sequence.
4115  TSeqPos start_len = len;
4116  len += lit.CanGetLength() ? lit.GetLength() : 0;
4117  if (lit.IsSetSeq_data() && ! lit.GetSeq_data().IsGap()
4118  && (!lit.IsSetLength() || lit.GetLength() == 0)) {
4120  "Seq-lit of length 0 in delta chain", seq);
4121  }
4122 
4123  // Check for invalid residues
      // This branch handles literals that carry real sequence data; the
      // `else` branch below handles gaps (no data, or data of kind Gap).
4124  if ( lit.IsSetSeq_data() && !lit.GetSeq_data().IsGap() ) {
4125  if ( !last_is_gap && !first) {
4126  non_interspersed_gaps = true;
4127  }
4128  last_is_gap = false;
4129  prev_gap_linkage = -1;
4130  prev_gap_type = CSeq_gap::eType_unknown;
4131  const CSeq_data& data = lit.GetSeq_data();
      // badIdx receives the positions that CSeqportUtil::Validate rejects.
4132  vector<TSeqPos> badIdx;
4133  CSeqportUtil::Validate(data, &badIdx);
      // For the three string encodings `ss` points at the residue text so the
      // offending character can be quoted; the remaining visible case reports
      // residues as integers.
4134  const string* ss = 0;
4135  switch (data.Which()) {
4136  case CSeq_data::e_Iupacaa:
4137  ss = &data.GetIupacaa().Get();
4138  break;
4139  case CSeq_data::e_Iupacna:
4140  ss = &data.GetIupacna().Get();
4141  break;
4142  case CSeq_data::e_Ncbieaa:
4143  ss = &data.GetNcbieaa().Get();
4144  break;
4146  {
4147  const vector<char>& c = data.GetNcbistdaa().Get();
4148  ITERATE (vector<TSeqPos>, ci, badIdx) {
4150  "Invalid residue [" +
4151  NStr::IntToString((int)c[*ci]) + "] at position [" +
4152  NStr::IntToString((*ci) + 1) + "]", seq);
4153  }
4154  break;
4155  }
4156  default:
4157  break;
4158  }
4159 
4160  if ( ss ) {
4161  ITERATE (vector<TSeqPos>, it, badIdx) {
4163  "Invalid residue [" +
4164  ss->substr(*it, 1) + "] at position [" +
4165  NStr::IntToString((*it) + 1) + "]", seq);
4166  }
4167  }
4168 
4169  if (mi) {
4170  // Count adjacent Ns in Seq-lit
      // The limit depends on the technique. The `max_ns > -1` test guards the
      // comparison between the int limit and the size_t count.
4171  int max_ns = s_MaxNsInSeqLitForTech (tech);
4172  size_t adjacent_ns = x_CountAdjacentNs(lit);
4173  if (max_ns > -1 && adjacent_ns > max_ns) {
4175  "Run of " + NStr::NumericToString(adjacent_ns) +
4176  " Ns in delta component " + NStr::UIntToString(seg) +
4177  " that starts at base " + NStr::UIntToString(start_len + 1),
4178  seq);
4179  }
4180  }
4181  } else {
      // Gap component: reset, then read type and linkage when a Seq-gap is
      // present.
4182  gap_linkage = -1;
4183  gap_type = CSeq_gap::eType_unknown;
4184  if ( lit.IsSetSeq_data() && lit.GetSeq_data().IsGap() ) {
4185  const CSeq_data& data = lit.GetSeq_data();
4186  if (data.Which() == CSeq_data::e_Gap) {
4187  const CSeq_gap& gap = data.GetGap();
4188 
      // Gaps with a type are tallied as either "unknown type with unspecified
      // linkage evidence" or everything else.
4189  if (gap.IsSetType()) {
4190  gap_type = gap.GetType();
4191  if (gap_type == CSeq_gap::eType_unknown && s_IsUnspecified(gap)) {
4192  num_gap_unknown_unspec++;
4193  }
4194  else {
4195  num_gap_known_or_spec++;
4196  }
4197  }
4198  if(gap.IsSetLinkage())
4199  gap_linkage = gap.GetLinkage();
4200  }
4201  }
      // Leading gap: the severity starts as Error and is lowered to Warning
      // for every technique other than htgs_0..htgs_3.
4202  if (first && !x_IgnoreEndGap(bsh, gap_type)) {
4203  EDiagSev sev = eDiag_Error;
4204  if (tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4205  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3) {
4206  sev = eDiag_Warning;
4207  }
4209  "First delta seq component is a gap", seq);
4210  }
4211 
      // A gap directly after another gap is counted as adjacent when the types
      // match, or the linkages differ, or this gap is not "unlinked".
4212  if(last_is_gap &&
4213  (prev_gap_type == gap_type ||
4214  prev_gap_linkage != gap_linkage ||
4215  gap_linkage != CSeq_gap::eLinkage_unlinked))
4216  ++num_adjacent_gaps;
4217 
      // Typed gaps go to ValidateSeqGap; untyped gaps are checked for zero
      // length, and for fuzz set together with a length other than 100.
4218  if (lit.IsSetSeq_data() && lit.GetSeq_data().IsGap()) {
4219  ValidateSeqGap(lit.GetSeq_data().GetGap(), seq);
4220  } else if (!lit.CanGetLength() || lit.GetLength() == 0) {
4221  if (!lit.IsSetFuzz() || !lit.GetFuzz().IsLim() || lit.GetFuzz().GetLim() != CInt_fuzz::eLim_unk) {
4223  "Gap of length 0 in delta chain", seq);
4224  } else {
4226  "Gap of length 0 with unknown fuzz in delta chain", seq);
4227  }
4228  } else if (lit.CanGetLength() && lit.GetLength() != 100) {
4229  if (lit.IsSetFuzz()) {
4231  "Gap of unknown length should have length 100", seq);
4232  }
4233  }
4234  last_is_gap = true;
4235  prev_gap_type = gap_type;
4236  prev_gap_linkage = gap_linkage;
4237  ++num_gaps;
4238  }
4239  first = false;
4240  break;
4241  }
4242  default:
4244  "CDelta_seq::Which() is e_not_set", seq);
4245  }
4246  }
4247 
      // Reported only when every typed gap fell in the "unknown type,
      // unspecified linkage evidence" tally.
4248  if (num_gap_unknown_unspec > 0 && num_gap_known_or_spec == 0) {
4249  if (num_gap_unknown_unspec > 1) {
4251  "All " + NStr::IntToString(num_gap_unknown_unspec) +
4252  " Seq-gaps have unknown type and unspecified linkage", seq);
4253  } else {
4255  "Single Seq-gap has unknown type and unspecified linkage", seq);
4256  }
4257  }
4258 
      // Declared length against the sum of the component lengths.
      // NOTE(review): inst.GetLength() is read without an IsSetLength() test
      // here; confirm callers guarantee it is set.
4259  if (inst.GetLength() > len) {
4261  "Bioseq.seq_data too short [" + NStr::IntToString(len) +
4262  "] for given length [" + NStr::IntToString(inst.GetLength()) +
4263  "]", seq);
4264  } else if (inst.GetLength() < len) {
4266  "Bioseq.seq_data is larger [" + NStr::IntToString(len) +
4267  "] than given length [" + NStr::IntToString(inst.GetLength()) +
4268  "]", seq);
4269  }
      // Missing-gaps report for htgs_0/1/2 records without a GI; severity is
      // lowered to Info when a user descriptor is RefGeneTracking.
4270  if ( non_interspersed_gaps && !has_gi && mi &&
4271  (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
4272  tech == CMolInfo::eTech_htgs_2) ) {
4273  EDiagSev missing_gaps_sev = eDiag_Error;
4275  while (desc_i) {
4276  if (desc_i->GetUser().IsRefGeneTracking()) {
4277  missing_gaps_sev = eDiag_Info;
4278  break;
4279  }
4280  ++desc_i;
4281  }
4282 
4283  PostErr(missing_gaps_sev, eErr_SEQ_INST_MissingGaps,
4284  "HTGS delta seq should have gaps between all sequence runs", seq);
4285  }
4286  if ( num_adjacent_gaps >= 1 ) {
4287  string msg = (num_adjacent_gaps == 1) ?
4288  "There is 1 adjacent gap in delta seq" :
4289  "There are " + NStr::SizetToString(num_adjacent_gaps) +
4290  " adjacent gaps in delta seq";
4292  }
      // Trailing gap: gap_type still holds the value set by the last gap
      // component processed in the loop above.
4293  if (last_is_gap && !x_IgnoreEndGap(bsh, gap_type)) {
4294  EDiagSev sev = eDiag_Error;
4295  if (tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4296  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3) {
4297  sev = eDiag_Warning;
4298  }
4300  "Last delta seq component is a gap", seq);
4301  }
4302 
4303  // Validate technique
4304  if (num_gaps == 0 && mi) {
4305  if ( tech == CMolInfo::eTech_htgs_2 &&
4306  !GraphsOnBioseq(seq) &&
4307  !x_IsActiveFin(seq) ) {
4309  "HTGS 2 delta seq has no gaps and no graphs", seq);
4310  }
4311  }
4312 
4313  // look for multiple delta locs overlapping
      // Locations are sorted with s_LocSortCompare and then only neighbouring
      // entries of the sorted list are compared.
4314  if (delta_locs.size() > 1) {
4315  stable_sort (delta_locs.begin(), delta_locs.end(), s_LocSortCompare);
4316  vector<CConstRef<CSeq_loc> >::iterator it1 = delta_locs.begin();
4317  vector<CConstRef<CSeq_loc> >::iterator it2 = it1;
4318  ++it2;
4319  while (it2 != delta_locs.end()) {
4320  if ((*it1)->GetId()->Compare(*(*it2)->GetId()) == CSeq_id::e_YES
4321  && Compare (**it1, **it2, m_Scope, fCompareOverlapping) != eNoOverlap) {
4322  string seq_label = (*it1)->GetId()->AsFastaString();
4324  "Overlapping delta range " + NStr::IntToString((*it2)->GetStart(eExtreme_Positional) + 1)
4325  + "-" + NStr::IntToString((*it2)->GetStop(eExtreme_Positional) + 1)
4326  + " and " + NStr::IntToString((*it1)->GetStart(eExtreme_Positional) + 1)
4327  + "-" + NStr::IntToString((*it1)->GetStop(eExtreme_Positional) + 1)
4328  + " on a Bioseq " + seq_label,
4329  seq);
4330  }
4331  ++it1;
4332  ++it2;
4333  }
4334  }
4335 
4336  if (IsSelfReferential(seq)) {
4338  "Self-referential delta sequence", seq);
4339  }
4340 
4341  // look for Ns next to gaps
      // `pos` is the offset of the current component. At each component
      // boundary the residue on the non-gap side of a gap is tested for 'N'.
      // Exceptions raised during the scan are swallowed.
4342  if (seq.IsNa() && seq.GetLength() > 1 && x_IsDeltaLitOnly(inst)) {
4343  try {
4344  TSeqPos pos = 0;
4346  ITERATE (CDelta_ext::Tdata, delta_i, seq.GetInst().GetExt().GetDelta().Get()) {
4347  if (delta_i->Empty()) {
4348  continue; // Ignore NULLs, reported separately above.
4349  }
4350  const CDelta_seq& seg = **delta_i;
4351  TSeqPos delta_len = (TSeqPos)s_GetDeltaLen (seg, m_Scope);
      // This component starts in a gap: look at the residue just before it.
4352  if (pos > 0) {
4353  if (sv.IsInGap (pos)) {
4354  CSeqVector::TResidue res = sv [pos - 1];
4355  if (res == 'N' && !sv.IsInGap(pos - 1)) {
4357  "Ambiguous residue N is adjacent to a gap around position " + NStr::SizetToString (pos + 1),
4358  seq);
4359  }
4360  }
4361  }
      // This component ends in a gap: look at the residue just after it.
4362  if (delta_len > 0 && pos + delta_len < len) {
4363  if (sv.IsInGap(pos + delta_len - 1)) {
4364  CSeqVector::TResidue res = sv[pos + delta_len];
4365  if (res == 'N' && !sv.IsInGap(pos + delta_len)) {
4367  "Ambiguous residue N is adjacent to a gap around position " + NStr::SizetToString(pos + delta_len + 1),
4368  seq);
4369  }
4370  }
4371  }
4372  pos += delta_len;
4373  }
4374  } catch (CException ) {
4375  } catch (std::exception ) {
4376  }
4377  }
4378 
4379 }
4380 
4381 
4382 bool s_HasGI(const CBioseq& seq)
4383 {
4384  bool has_gi = false;
4385  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
4386  if ((*id_it)->IsGi()) {
4387  has_gi = true;
4388  break;
4389  }
4390  }
4391  return has_gi;
4392 }
4393 
4394 
// Validates one Seq-gap (`gap`) of Bioseq `seq`: consistency of linkage
// evidence, linkage and gap type.
// NOTE(review): the signature line, two loop headers and the heads of the
// error-posting calls are not visible in this listing.
4396 {
4397  if (gap.IsSetLinkage_evidence()) {
      // linkevarray counts occurrences per evidence kind, using the same 12
      // slots as linkEvStrings: 0-9 by type value, 10 for value 255, 11 for
      // anything else.
4398  int linkcount = 0;
4399  int linkevarray[12];
4400  for (int i = 0; i < 12; i++) {
4401  linkevarray[i] = 0;
4402  }
4403  bool is_unspec = false;
4405  const CLinkage_evidence & evidence = **ev_itr;
4406  if (!evidence.CanGetType()) continue;
4407  int linktype = evidence.GetType();
4408  if (linktype == 8) {
4409  is_unspec = true;
4410  }
4411  linkcount++;
4412  if (linktype == 255) {
4413  (linkevarray[10])++;
4414  }
4415  else if (linktype < 0 || linktype > 9) {
4416  (linkevarray[11])++;
4417  }
4418  else {
4419  (linkevarray[linktype])++;
4420  }
4421  }
      // Slot 8 ("unspecified") must not be combined with other evidence.
4422  if (linkevarray[8] > 0 && linkcount > linkevarray[8]) {
4424  "Seq-gap type has unspecified and additional linkage evidence", seq);
4425  }
      // Each evidence kind may appear at most once.
4426  for (int i = 0; i < 12; i++) {
4427  if (linkevarray[i] > 1) {
4429  "Linkage evidence '" + linkEvStrings[i] + "' appears " +
4430  NStr::IntToString(linkevarray[i]) + " times", seq);
4431  }
4432  }
4433  if (!gap.IsSetLinkage() || gap.GetLinkage() != CSeq_gap::eLinkage_linked) {
4435  "Seq-gap with linkage evidence must have linkage field set to linked", seq);
4436  }
      // Only fragment, clone, repeat and scaffold gaps may carry linkage
      // evidence; unknown-type gaps with "unspecified" evidence are tolerated.
4437  if (gap.IsSetType()) {
4438  int gaptype = gap.GetType();
4439  if (gaptype != CSeq_gap::eType_fragment &&
4440  gaptype != CSeq_gap::eType_clone &&
4441  gaptype != CSeq_gap::eType_repeat &&
4442  gaptype != CSeq_gap::eType_scaffold) {
4443  if (gaptype == CSeq_gap::eType_unknown && is_unspec) {
4444  /* suppress for legacy records */
4445  } else {
4447  "Seq-gap of type " + NStr::IntToString(gaptype) +
4448  " should not have linkage evidence", seq);
4449  }
4450  }
4451  }
4452  }
4453  else {
      // No linkage evidence present.
4454  if (gap.IsSetType()) {
4455  int gaptype = gap.GetType();
4456  if (gaptype == CSeq_gap::eType_scaffold) {
4458  "Seq-gap type == scaffold is missing required linkage evidence", seq);
4459  }
4460  if (gaptype == CSeq_gap::eType_repeat && gap.IsSetLinkage() && gap.GetLinkage() == CSeq_gap::eLinkage_linked)
4461  {
      // The report is suppressed for records that have a GI and a create-date
      // earlier than 2012-10-01.
4462  bool suppress_SEQ_INST_SeqGapProblem = false;
4463  if (seq.IsSetDescr() && s_HasGI(seq))
4464  {
4466  {
4467  if ((**it).IsCreate_date())
4468  {
4469  CDate threshold_date(CTime(2012, 10, 1));
      // The unbraced `if` governs only the assignment; the `break` below runs
      // for the first create-date descriptor either way.
4470  if ((**it).GetCreate_date().Compare(threshold_date) == CDate::eCompare_before)
4471  suppress_SEQ_INST_SeqGapProblem = true;
4472  break;
4473  }
4474  }
4475  }
4476  if (!suppress_SEQ_INST_SeqGapProblem)
4478  "Seq-gap type == repeat and linkage == linked is missing required linkage evidence", seq);
4479 
4480  }
4481  }
4482  }
4483 }
4484 
4485 
4487 (const CSeq_inst& inst,
4488  const CBioseq& seq)
// Checks that the presence of Seq-ext and Seq-data on `inst` matches its
// representation class. Returns true when no problem was posted, false
// otherwise.
// NOTE(review): the first signature line and several `case` labels /
// condition lines are not visible in this listing.
4489 {
4490  bool rtn = true;
      // Build a readable name for the representation for use in messages;
      // "ref" and "const" are expanded to full words.
4491  const CEnumeratedTypeValues* tv = CSeq_inst::GetTypeInfo_enum_ERepr();
4492  string rpr = tv->FindName(inst.GetRepr(), true);
4493  if (NStr::Equal(rpr, "ref")) {
4494  rpr = "reference";
4495  } else if (NStr::Equal(rpr, "const")) {
4496  rpr = "constructed";
4497  }
4498  const string err0 = "Bioseq-ext not allowed on " + rpr + " Bioseq";
4499  const string err1 = "Missing or incorrect Bioseq-ext on " + rpr + " Bioseq";
4500  const string err2 = "Missing Seq-data on " + rpr + " Bioseq";
4501  const string err3 = "Seq-data not allowed on " + rpr + " Bioseq";
4502  switch (inst.GetRepr()) {
      // (case label not visible) neither ext nor seq-data is allowed.
4504  if (inst.IsSetExt()) {
4505  PostErr(eDiag_Critical, eErr_SEQ_INST_ExtNotAllowed, err0, seq);
4506  rtn = false;
4507  }
4508  if (inst.IsSetSeq_data()) {
4509  PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataNotAllowed, err3, seq);
4510  rtn = false;
4511  }
4512  break;
      // map / ref / seg: the matching ext kind is required and seq-data is
      // not allowed.
4513  case CSeq_inst::eRepr_map:
4514  if (!inst.IsSetExt() || !inst.GetExt().IsMap()) {
4515  PostErr(eDiag_Error, eErr_SEQ_INST_ExtBadOrMissing, err1, seq);
4516  rtn = false;
4517  }
4518  if (inst.IsSetSeq_data()) {
4519  PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataNotAllowed, err3, seq);
4520  rtn = false;
4521  }
4522  break;
4523  case CSeq_inst::eRepr_ref:
4524  if (!inst.IsSetExt() || !inst.GetExt().IsRef() ) {
4525  PostErr(eDiag_Error, eErr_SEQ_INST_ExtBadOrMissing, err1, seq);
4526  rtn = false;
4527  }
4528  if (inst.IsSetSeq_data()) {
4529  PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataNotAllowed, err3, seq);
4530  rtn = false;
4531  }
4532  break;
4533  case CSeq_inst::eRepr_seg:
4534  if (!inst.IsSetExt() || !inst.GetExt().IsSeg() ) {
4535  PostErr(eDiag_Error, eErr_SEQ_INST_ExtBadOrMissing, err1, seq);
4536  rtn = false;
4537  }
4538  if (inst.IsSetSeq_data()) {
4539  PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataNotAllowed, err3, seq);
4540  rtn = false;
4541  }
4542  break;
      // raw (plus a label not visible here): ext is not allowed and real
      // seq-data is required; data of kind Gap counts as missing.
4543  case CSeq_inst::eRepr_raw:
4545  if (inst.IsSetExt()) {
4546  PostErr(eDiag_Critical, eErr_SEQ_INST_ExtNotAllowed, err0, seq);
4547  rtn = false;
4548  }
4549  if (!inst.IsSetSeq_data() ||
4551  || inst.GetSeq_data().Which() == CSeq_data::e_Gap)
4552  {
4553  PostErr(eDiag_Critical, eErr_SEQ_INST_SeqDataNotFound, err2, seq);
4554  rtn = false;
4555  }
4556  break;
      // (case label not visible) a delta ext is required and seq-data is not
      // allowed.
4558  if (!inst.IsSetExt() || !inst.GetExt().IsDelta() ) {
4559  PostErr(eDiag_Error, eErr_SEQ_INST_ExtBadOrMissing, err1, seq);
4560  rtn = false;
4561  }
4562  if (inst.IsSetSeq_data()) {
4563  PostErr(eDiag_Error, eErr_SEQ_INST_SeqDataNotAllowed, err3, seq);
4564  rtn = false;
4565  }
4566  break;
      // Any other representation value is reported with its numeric value.
4567  default:
4568  PostErr(
4570  "Invalid Bioseq->repr = " +
4571  NStr::IntToString(static_cast<int>(inst.GetRepr())), seq);
4572  rtn = false;
4573  }
4574  return rtn;
4575 }
4576 
4577 
4579  const CBioseq_Handle& bsh)
// Checks the source descriptor that applies to `bsh`: a transgenic source on
// a nucleotide requires a source feature, and for delta sequences the genome
// of the parent is compared with that of a far component.
// NOTE(review): the first signature line, the declarations of `di` and `ci`,
// part of the feature-cache lookup and the heads of the error-posting calls
// are not visible in this listing.
4580 {
4582  if (!di) {
4583  // add to list of sources with no descriptor later to be reported
4585  return;
4586  }
4587  _ASSERT(di);
4588 
4589  if (m_Imp.IsTransgenic(di->GetSource()) &&
4590  CSeq_inst::IsNa(bsh.GetInst_Mol())) {
4591  // "if" means "if no biosrcs on bsh"
4592  if( GetCache().GetFeatFromCache(
4595  {
4597  "Transgenic source descriptor requires presence of source feature",
4598  *(bsh.GetBioseqCore()));
4599  }
4600  }
4601 
      // The remaining checks apply only to delta sequences with a populated
      // delta extension.
4602  if (!bsh.IsSetInst()
4603  || !bsh.GetInst().IsSetRepr()
4604  || bsh.GetInst().GetRepr() != CSeq_inst::eRepr_delta
4605  || !bsh.GetInst().IsSetExt()
4606  || !bsh.GetInst().GetExt().IsDelta()
4607  || !bsh.GetInst().GetExt().GetDelta().IsSet()) {
4608  return;
4609  }
4610 
4611  const CBioSource& src = di->GetSource();
4612  if (! src.IsSetGenome()) return;
4613  CBioSource::TGenome genome = src.GetGenome();
4614 
      // Every path below the CanGetGenome() test ends in `break`, so at most
      // one component (the first one that resolves and has a source with a
      // genome) is compared with the parent.
4615  ITERATE (CDelta_ext::Tdata, it, bsh.GetInst().GetExt().GetDelta().Get()) {
4616  if (! (*it)->IsLoc()) continue;
4617  CBioseq_Handle hdl = m_Scope->GetBioseqHandle((*it)->GetLoc());
4618  if (! hdl) continue;
4620  if (! ci) continue;
4621  const CBioSource& crc = ci->GetSource();
4622  // cout << MSerial_AsnText << crc << endl;
4623  if (! crc.CanGetGenome()) continue;
4624  // if (! crc.IsSetGenome()) continue;
4625  CBioSource::TGenome cgenome = crc.GetGenome();
4626  if (genome == cgenome) break;
      // "unknown" and "genomic" on either side are not treated as a mismatch.
4627  if (genome == CBioSource::eGenome_unknown || genome == CBioSource::eGenome_genomic) break;
4628  if (cgenome == CBioSource::eGenome_unknown || cgenome == CBioSource::eGenome_genomic) break;
4630  "Genome difference between parent and component",
4631  *(bsh.GetBioseqCore()));
4632  break;
4633  }
4634 }
4635 
4636 
// Reports "No Mol-info applies to this Bioseq" when `sd` is empty.
// NOTE(review): the signature line, the declaration of `sd` and the head of
// the error-posting call are not visible in this listing.
4638 {
4640 
4641  if ( !sd ) {
4643  "No Mol-info applies to this Bioseq",
4644  seq);
4645  }
4646 }
4647 
4648 
// Looks for a publication that applies to the current Bioseq: either a Pub
// descriptor reachable from m_CurrentHandle, or a cached feature that has a
// citation or is itself a Pub feature. Returns early when one is found.
// NOTE(review): the signature line and the statement executed when nothing is
// found are not visible in this listing.
4650 {
      // NOTE(review): `label` is filled here but is not read by any visible
      // statement below.
4651  string label = "";
4652  seq.GetLabel(&label, CBioseq::eBoth);
4653 
4654  if ( !m_CurrentHandle ) {
4655  return;
4656  }
4657 
      // The feature scan happens only when there is no Pub descriptor and
      // m_AllFeatIt is set.
4658  if ( !CSeqdesc_CI( m_CurrentHandle, CSeqdesc::e_Pub) && m_AllFeatIt) {
4659  // look for pub or feat with cit
4660  ITERATE(CCacheImpl::TFeatValue, all_feat_it, *m_AllFeatIt) {
4661  if (all_feat_it->IsSetCit() || all_feat_it->GetData().IsPub()) {
4662  return;
4663  }
4664  }
4665 
4667  }
4668 }
4669 
4670 
4671 static bool s_LocIntervalsSpanOrigin (const CSeq_loc& loc, CBioseq_Handle bsh)
4672 {
4673  CSeq_loc_CI si(loc);
4674  if (!si) {
4675  return false;
4676  }
4677  if(loc.GetStrand() == eNa_strand_minus) {
4678  if (si.GetRange().GetFrom() != 0) {
4679  return false;
4680  }
4681  ++si;
4682  if (!si || si.GetRange().GetTo() != bsh.GetBioseqLength() - 1) {
4683  return false;
4684  }
4685  ++si;
4686  } else {
4687  if (si.GetRange().GetTo() != bsh.GetBioseqLength() - 1) {
4688  return false;
4689  }
4690  ++si;
4691  if (!si || si.GetRange().GetFrom() != 0) {
4692  return false;
4693  }
4694  ++si;
4695  }
4696  if (si) {
4697  return false;
4698  } else {
4699  return true;
4700  }
4701 }
4702 
4703 
4704 static bool s_LocIntervalsCoverSegs (const CSeq_loc& loc)
4705 {
4706  if (loc.GetStrand() == eNa_strand_minus) {
4707  unsigned int start = loc.GetTotalRange().GetTo();
4708  unsigned int stop = loc.GetTotalRange().GetFrom();
4709  CSeq_loc_CI si(loc);
4710  while (si) {
4711  if (si.GetRange().GetTo() != start) {
4712  return false;
4713  }
4714  start = si.GetRange().GetFrom() - 1;
4715  ++si;
4716  }
4717  if (start != stop - 1) {
4718  return false;
4719  }
4720  } else {
4721  unsigned int start = loc.GetTotalRange().GetFrom();
4722  unsigned int stop = loc.GetTotalRange().GetTo();
4723  CSeq_loc_CI si(loc);
4724  while (si) {
4725  if (si.GetRange().GetFrom() != start) {
4726  return false;
4727  }
4728  start = si.GetRange().GetTo() + 1;
4729  ++si;
4730  }
4731  if (start != stop + 1) {
4732  return false;
4733  }
4734  }
4735  return true;
4736 }
4737 
4738 
// Returns true when a feature collected into `mobile_elements` has a location
// that compares equal to the interval [min(from,to), max(from,to)] on `bsh`,
// on either strand. Two lookups are made; the list is cleared between them.
// NOTE(review): the signature line and the two calls that fill
// `mobile_elements` are not visible in this listing.
4740 {
4741  CRef<CSeq_loc> loc(new CSeq_loc());
4742  loc->SetInt().SetId().Assign(*(bsh.GetSeqId()));
      // The interval is normalised so that From <= To.
4743  if (from < to) {
4744  loc->SetInt().SetFrom(from);
4745  loc->SetInt().SetTo(to);
4746  } else {
4747  loc->SetInt().SetFrom(to);
4748  loc->SetInt().SetTo(from);
4749  }
      // Same interval on the minus strand, so a match on either strand counts.
4750  CRef<CSeq_loc> rev_loc(new CSeq_loc());
4751  rev_loc->Assign(*loc);
4752  rev_loc->SetInt().SetStrand(eNa_strand_minus);
4753 
4754  TFeatScores mobile_elements;
4757  ITERATE(TFeatScores, m, mobile_elements) {
4758  if (m->second->GetLocation().Compare(*loc) == 0 || m->second->GetLocation().Compare(*rev_loc) == 0) {
4759  return true;
4760  }
4761  }
4762  mobile_elements.clear();
4765  ITERATE(TFeatScores, m, mobile_elements) {
4766  if (m->second->GetLocation().Compare(*loc) == 0 || m->second->GetLocation().Compare(*rev_loc) == 0) {
4767  return true;
4768  }
4769  }
4770 
4771  return false;
4772 }
4773 
4774 
// Walks the intervals of `loc` and, for each pair of consecutive intervals,
// asks s_HasMobileElementForInterval() about the space between them. Returns
// false for an empty location or as soon as one such space has no matching
// feature; returns true otherwise (including for a single interval).
// NOTE(review): the signature line is not visible in this listing.
4776 {
4777  CSeq_loc_CI si(loc);
4778  if (!si) {
4779  return false;
4780  }
4781  ENa_strand loc_strand = loc.GetStrand();
4782  while (si) {
      // NOTE(review): on the minus strand the bounds are computed as
      // GetFrom() + 1 of this interval and GetTo() - 1 of the next one, which
      // looks one base wider on each side than the space between the two
      // intervals; confirm against the intended behaviour.
4783  TSeqPos gap_start;
4784  if (loc_strand == eNa_strand_minus) {
4785  gap_start = si.GetRange().GetFrom() + 1;
4786  } else {
4787  gap_start = si.GetRange().GetTo() + 1;
4788  }
4789  ++si;
4790  if (si) {
4791  TSeqPos gap_end;
4792  if (loc_strand == eNa_strand_minus) {
4793  gap_end = si.GetRange().GetTo();
4794  } else {
4795  gap_end = si.GetRange().GetFrom();
4796  }
      // Avoid wrapping below zero.
4797  if (gap_end > 0) {
4798  gap_end--;
4799  }
4800  if (!s_HasMobileElementForInterval(gap_start, gap_end, bsh)) {
4801  return false;
4802  }
4803  }
4804  }
4805  return true;
4806 }
4807 
4808 
// Examines gene features with more than one interval and reports those whose
// interval layout is not explained by one of the accepted situations below.
// NOTE(review): the signature line, the loop header that defines `fi`, the
// mobile-element test, the declaration of `mapper`, part of the circular
// test and the heads of the error-posting calls are not visible in this
// listing.
4810 {
4811  try {
4812  if (!m_GeneIt) {
4813  return;
4814  }
4815 
4817  const CSeq_loc& loc = fi->GetOriginalFeature().GetLocation();
4818  CSeq_loc_CI si(loc);
4819  if ( !(++si) ) { // if only a single interval
4820  continue;
4821  }
4822 
      // The exception text is matched case-insensitively.
4823  if (fi->IsSetExcept() && fi->IsSetExcept_text()
4824  && NStr::FindNoCase (fi->GetExcept_text(), "trans-splicing") != string::npos) {
4825  //ignore - has exception
4826  continue;
4827  }
4828 
4830  // ignore, "space between" is a mobile element
4831  continue;
4832  }
4833 
4834  const CSeq_id* loc_id = loc.GetId();
4835  const CSeq_id* seq_first_id = seq.GetFirstId();
      // Locations that refer to more than one Bioseq are checked only on
      // segmented sequences.
4836  if ( !IsOneBioseq(loc, m_Scope) ) {
4837  if (!seq.IsSetInst()
4838  || !seq.GetInst().IsSetRepr()
4839  || seq.GetInst().GetRepr() != CSeq_inst::eRepr_seg) {
4840  continue;
4841  }
4842  // segmented set - should cover all nucleotides in interval
4844  CConstRef<CSeq_loc> mapped_loc = mapper.Map(fi->GetLocation());
4845  if (mapped_loc) {
4846  if (s_LocIntervalsCoverSegs(*mapped_loc)) {
4847  // covers all the nucleotides
4848  } else if (seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular
4849  && s_LocIntervalsSpanOrigin (*mapped_loc, m_CurrentHandle)) {
4850  // circular and spans the origin, can ignore
4851  } else {
4853  "Gene feature on segmented sequence should cover all bases within its extremes",
4854  fi->GetOriginalFeature());
4855  }
4856  }
4857  } else if (loc_id && seq_first_id && !IsSameBioseq (*loc_id, *seq_first_id, m_Scope)) {
4858  // on segment in segmented set, only report once
4859  continue;
4860  } else if ( seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular
4862  // spans origin
4863  continue;
4864  } else if (m_Imp.IsSmallGenomeSet()) {
4866  "Multiple interval gene feature in small genome set - "
4867  "set trans-splicing exception if appropriate", fi->GetOriginalFeature());
4868  } else {
4870  "Gene feature on non-segmented sequence should not "
4871  "have multiple intervals", fi->GetOriginalFeature());
4872  }
4873  }
4874  } catch ( const exception& e ) {
      // Exceptions whose text contains "Error: Cannot resolve" are ignored;
      // any other exception is reported.
4875  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
4877  string("Exception while validating multi-interval genes. EXCEPTION: ") +
4878  e.what(), seq);
4879  }
4880  }
4881 }
4882 
4883 
4885 {
      // NOTE(review): the signature line (listing line 4884), the statement
      // that defines `closest_molinfo` (4886) and the openers of both report
      // calls (4889, 4892) are not present in this listing.
      //
      // Emits the "Suspicious use of complete" message. When a closest
      // MolInfo descriptor is available the message is attached to that
      // descriptor in the context of seq's parent entry; otherwise it is
      // attached to `seq` itself.
4887  if (closest_molinfo) {
4888  const CSeq_entry& ctx = *seq.GetParentEntry();
4890  "Suspicious use of complete", ctx, *closest_molinfo);
4891  } else {
4893  "Suspicious use of complete", seq);
4894  }
4895 }
4896 
4897 
4899 (const CBioseq& seq,
4900  const CMolInfo& mi)
4901 {
      // NOTE(review): the function-name line (listing line 4898) and listing
      // lines 4909, 4911, 4940 and 4942 are not present in this listing, so
      // the definition of `comp`, the tail of the `biomol` initialiser, part
      // of the circular-topology condition and the opener of one report call
      // cannot be seen here.
      //
      // Checks whether the completeness flag in `mi` looks suspicious for the
      // nucleotide sequence `seq`, and reports at most one
      // "suspicious use of complete" finding through
      // x_ReportSuspiciousUseOfComplete().
      //
      // Nothing to check without a completeness value or for non-nucleotide
      // sequences.
4902  if ( !mi.IsSetCompleteness() ) {
4903  return;
4904  }
4905  if ( !seq.IsNa() ) {
4906  return;
4907  }
4908 
4910  CMolInfo::TBiomol biomol = mi.IsSetBiomol() ?
      // Both arms of this conditional currently evaluate to eDiag_Warning;
      // the commented-out eDiag_Error shows what the non-HTGS-3 arm used to be.
4912  EDiagSev sev = mi.GetTech() == CMolInfo::eTech_htgs_3 ?
4913  eDiag_Warning : /* eDiag_Error */ eDiag_Warning;
4914 
      // A title that already says "complete sequence" or "complete genome"
      // (case-insensitive) justifies the flag, so stop here.
4915  CSeqdesc_CI desc(m_CurrentHandle, CSeqdesc::e_Title);
4916  if ( desc ) {
4917  const string& title = desc->GetTitle();
4918  if (!NStr::IsBlank(title)) {
4919  if (NStr::FindNoCase(title, "complete sequence") != string::npos
4920  || NStr::FindNoCase(title, "complete genome") != string::npos) {
4921  return;
4922  }
4923  }
4924  }
4925 
      // Tracks whether the generic "suspicious" report has been issued, so
      // the later checks do not repeat it.
4926  bool reported = false;
4927 
4928  if ( comp == CMolInfo::eCompleteness_complete ) {
4929  if ( biomol == CMolInfo::eBiomol_genomic ) {
      // Only sequences carrying a GenBank id are considered in this branch.
4930  bool is_gb = false;
4931  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
4932  if ( (*it)->IsGenbank() ) {
4933  is_gb = true;
4934  break;
4935  }
4936  }
4937 
4938  if ( is_gb ) {
      // The topology branch issues a dedicated message about the title;
      // `reported` is not set in the lines of that branch visible here.
4939  if (seq.IsSetInst() && seq.GetInst().IsSetTopology()
4941  const CSeq_entry& ctx = *seq.GetParentEntry();
4943  "Circular topology has complete flag set, but title should say complete sequence or complete genome",
4944  ctx);
4945  } else {
4946  x_ReportSuspiciousUseOfComplete(seq, sev);
4947  reported = true;
4948  }
4949  }
4950  }
4951 
4952  if (!reported) {
4953  // for SQD-1484
4954  // warn if completeness = complete, organism not viral and origin not artificial, no location set or location is genomic
4955  CSeqdesc_CI src_desc(m_CurrentHandle, CSeqdesc::e_Source);
4956  if (src_desc) {
4957  const CBioSource& biosrc = src_desc->GetSource();
      // "Not viral" is decided by the lineage string: it must be unset or
      // contain neither "Viruses" nor "Viroids" (case-insensitive).
4958  if ((!biosrc.IsSetLineage()
4959  || (NStr::FindNoCase(biosrc.GetLineage(), "Viruses") == string::npos
4960  && NStr::FindNoCase(biosrc.GetLineage(), "Viroids") == string::npos)) // not viral
4961  && (!biosrc.IsSetOrigin() || biosrc.GetOrigin() != CBioSource::eOrigin_artificial) // not artificial
4962  && (!src_desc->GetSource().IsSetGenome()
4963  || src_desc->GetSource().GetGenome() == CBioSource::eGenome_genomic)) { // location not set or genomic
4964  x_ReportSuspiciousUseOfComplete(seq, eDiag_Warning);
4965  reported = true;
4966  }
4967  }
4968  }
      // Last chance: still unreported and HasAssemblyOrNullGap(seq) is true.
4969  if (!reported && HasAssemblyOrNullGap(seq)) {
4970  // for VR-614
4971  x_ReportSuspiciousUseOfComplete(seq, eDiag_Warning);
4972  }
4973  }
4974 }
4975 
4976 
4977 static bool s_StandaloneProt(const CBioseq_Handle& bsh)
4978 {
      // Returns false as soon as an enclosing Bioseq-set has one of the
      // classes listed in the switch below; returns true when the walk up the
      // parent entries finishes without such a set.
      // NOTE(review): the initialiser of `eh` (listing line 4982) and the
      // `case` labels of the switch (4989-4994) are not present in this
      // listing.
4979  // proteins are never standalone within the context of a Genbank / Refseq
4980  // record.
4981 
4983  while ( eh ) {
4984  if ( eh.IsSet() ) {
      // This local `bsh` is a set handle and shadows the function parameter
      // of the same name for the rest of this scope.
4985  CBioseq_set_Handle bsh = eh.GetSet();
4986  if ( bsh.IsSetClass() ) {
4987  CBioseq_set::TClass cls = bsh.GetClass();
4988  switch ( cls ) {
4995  return false;
4996  default:
4997  break;
4998  }
4999  }
5000  }
      // Step one level up; the loop stops when there is no parent entry.
5001  eh = eh.GetParentEntry();
5002  }
5003 
5004  return true;
5005 }
5006 
5007 
5009 {
      // NOTE(review): the signature line (listing line 5008) and the
      // initialiser of `segset` (5014) are not present in this listing.
      //
      // Looks through the entries of `segset` for the first bioseq whose
      // representation is eRepr_seg and returns its handle. Returns a
      // default-constructed handle when `part` is not valid, when `segset`
      // is not valid, or when no such bioseq is found.
5010  CBioseq_Handle parent;
5011 
5012  if ( part ) {
5013  CSeq_entry_Handle segset =
5015  if ( segset ) {
5016  for ( CSeq_entry_CI it(segset); it; ++it ) {
      // Only entries that are single sequences with repr == seg qualify;
      // the first match wins.
5017  if ( it->IsSeq() && it->GetSeq().IsSetInst_Repr() &&
5018  it->GetSeq().GetInst_Repr() == CSeq_inst::eRepr_seg ) {
5019  parent = it->GetSeq();
5020  break;
5021  }
5022  }
5023  }
5024  }
5025  return parent;
5026 }
5027 
5028 
// Ordering predicate on Seq-id references: true when *q1 orders strictly
// before *q2 according to CSeq_id::CompareOrdered(). Both references are
// dereferenced unconditionally.
5029 static bool s_SeqIdCompare (const CConstRef<CSeq_id>& q1, const CConstRef<CSeq_id>& q2)
5030 {
5031  // is q1 < q2
5032  return (q1->CompareOrdered(*q2) < 0);
5033 }
5034 
5035 
// Equality predicate on Seq-id references: true when CSeq_id::CompareOrdered()
// reports *q1 and *q2 as equivalent (result 0). Both references are
// dereferenced unconditionally.
5036 static bool s_SeqIdMatch (const CConstRef<CSeq_id>& q1, const CConstRef<CSeq_id>& q2)
5037 {
5038  // is q1 == q2
5039  return (q1->CompareOrdered(*q2) == 0);
5040 }
5041 
5042 
5044 {
5045  if (!m_GeneIt) {
5046  return;
5047  }
5048  /*
5049  bool is_circular = bsh.IsSetInst_Topology() && bsh.GetInst_Topology() == CSeq_inst::eTopology_circular;
5050  */
5051  try {
5052  vector< CConstRef < CSeq_feat > > containing_genes;
5053  vector< int > num_contained;
5055  TSeqPos left = fi->GetLocation().GetStart(eExtreme_Positional);
5056  vector< CConstRef < CSeq_feat > >::iterator cit = containing_genes.begin();
5057  vector< int >::iterator nit = num_contained.begin();
5058  while (cit != containing_genes.end() && nit != num_contained.end()) {
5059  ECompare comp = Compare(fi->GetLocation(), (*cit)->GetLocation(), m_Scope, fCompareOverlapping);
5060  if (comp == eContained || comp == eSame) {
5061  (*nit)++;
5062  }
5063  TSeqPos n_right = (*cit)->GetLocation().GetStop(eExtreme_Positional);
5064  if (n_right < left) {
5065  // report if necessary
5066  if (*nit > 4) {
5068  "Gene contains " +