NCBI C++ ToolKit
seqtitle.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqtitle.cpp 74559 2016-09-13 11:58:16Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aaron Ucko
27 *
28 * File Description:
29 * Obtains or constructs a sequence's title. (Corresponds to
30 * CreateDefLine in the C toolkit.)
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <serial/iterator.hpp>
35 
39 #include <objects/seq/MolInfo.hpp>
40 #include <objects/seq/Seg_ext.hpp>
41 #include <objects/seq/Seq_ext.hpp>
61 
62 #include <objmgr/scope.hpp>
63 #include <objmgr/seq_map_ci.hpp>
64 #include <objmgr/seqdesc_ci.hpp>
65 #include <objmgr/feat_ci.hpp>
66 #include <objmgr/util/feature.hpp>
67 #include <objmgr/util/sequence.hpp>
68 
71 BEGIN_SCOPE(sequence)
72 
73 static string s_TitleFromBioSource (const CBioSource& source,
74  CMolInfo::TTech tech,
75  const string& suffix = kEmptyStr,
76  bool pooled_clones = false);
77 
78 
79 static string s_TitleFromChromosome(const CBioSource& source,
80  const CMolInfo& mol_info);
81 
82 
83 static string s_TitleFromProtein (const CBioseq_Handle& handle,
84  CScope& scope,
85  string& organism,
87 static string s_TitleFromSegment (const CBioseq_Handle& handle,
88  CScope& scope,
90 
91 static void s_FlyCG_PtoR(string& s);
92 
93 
96  fON_virus = 0x2,
97  fON_wgs = 0x4
98 };
99 typedef int TOrganelleNameFlags; // binary OR of EOrganelleNameFlags
100 
101 
102 static const char* s_OrganelleName(CBioSource::TGenome genome,
104 
105 
107 {
108  string prefix, title, suffix;
109  string organism;
113  CConstRef<CDbtag> general_id(NULL);
115  CConstRef<CMolInfo> mol_info(NULL);
116  bool third_party = false;
117  bool tpa_exp = false;
118  bool tpa_inf = false;
119  bool is_nc = false;
120  bool is_nm = false;
121  bool is_nr = false;
122  bool is_tsa = false;
123  bool wgs_master = false;
124  bool tsa_master = false;
126  bool htg_tech = false;
127  bool htgs_draft = false;
128  bool htgs_cancelled = false;
129  bool htgs_pooled = false;
130  bool htgs_unfinished = false;
131  bool use_biosrc = false;
132  CScope& scope = hnd.GetScope();
133 
134  ITERATE (CBioseq_Handle::TId, idh, hnd.GetId()) {
135  CConstRef<CSeq_id> id = idh->GetSeqId();
136  if ( !tsid ) {
137  tsid = id->GetTextseq_Id();
138  }
139  switch (id->Which()) {
140  case CSeq_id::e_Other:
141  case CSeq_id::e_Genbank:
142  case CSeq_id::e_Embl:
143  case CSeq_id::e_Ddbj:
144  {
145  const CTextseq_id& t = *id->GetTextseq_Id();
146  if (t.IsSetAccession()) {
147  const string& acc = t.GetAccession();
150  && NStr::EndsWith(acc, "000000")) {
151  wgs_master = true;
153  && NStr::EndsWith(acc, "000000")) {
154  tsa_master = true;
155  } else if (type == CSeq_id::eAcc_refseq_chromosome) {
156  is_nc = true;
157  } else if (type == CSeq_id::eAcc_refseq_mrna) {
158  is_nm = true;
159  } else if (type == CSeq_id::eAcc_refseq_ncrna) {
160  is_nr = true;
161  }
162  }
163  break;
164  }
165  case CSeq_id::e_General:
166  if ( !id->GetGeneral().IsSkippable() ) {
167  general_id = &id->GetGeneral();
168  }
169  break;
170  case CSeq_id::e_Tpg:
171  case CSeq_id::e_Tpe:
172  case CSeq_id::e_Tpd:
173  third_party = true;
174  break;
175  case CSeq_id::e_Pdb:
176  pdb_id = &id->GetPdb();
177  break;
178  case CSeq_id::e_Patent:
179  pat_id = &id->GetPatent();
180  break;
181  default:
182  break;
183  }
184  }
185 
186  {
188  choices.push_back(CSeqdesc::e_Source);
189  choices.push_back(CSeqdesc::e_Molinfo);
190  int found = 0;
191  for ( CSeqdesc_CI it(hnd, choices); it; ++it ) {
192  if ( it->Which() == CSeqdesc::e_Source ) {
193  if ( !source ) {
194  source = &it->GetSource();
195  if ( (found |= 1) == 3 ) {
196  break;
197  }
198  }
199  }
200  else {
201  if ( !mol_info ) {
202  mol_info = &it->GetMolinfo();
203  tech = mol_info->GetTech();
204  if ( (found |= 2) == 3 ) {
205  break;
206  }
207  }
208  }
209  }
210  }
211 
212  switch (tech) {
216  htgs_unfinished = true;
217  // manufacture all titles for unfinished HTG sequences
219  // fall through
221  htg_tech = true;
222  // fall through
223  case CMolInfo::eTech_est:
224  case CMolInfo::eTech_sts:
226  case CMolInfo::eTech_wgs:
227  use_biosrc = true;
228  break;
229  case CMolInfo::eTech_tsa:
230  is_tsa = true;
231  use_biosrc = true;
232  break;
233  default:
234  break;
235  }
236 
237  if (htg_tech || third_party) {
238  const CGB_block::TKeywords* keywords = 0;
239  for (CSeqdesc_CI gb(hnd, CSeqdesc::e_Genbank); gb; ++gb) {
240  if (gb->GetGenbank().IsSetKeywords()) {
241  keywords = &gb->GetGenbank().GetKeywords();
242  }
243  BREAK(gb);
244  }
245  if ( !keywords ) {
246  for (CSeqdesc_CI embl(hnd, CSeqdesc::e_Embl); embl; ++embl) {
247  if (embl->GetEmbl().IsSetKeywords()) {
248  keywords = &embl->GetEmbl().GetKeywords();
249  }
250  BREAK(embl);
251  }
252  }
253  if (keywords) {
254  ITERATE (CGB_block::TKeywords, it, *keywords) {
255  if (NStr::EqualNocase(*it, "HTGS_DRAFT")) {
256  htgs_draft = true;
257  } else if (NStr::EqualNocase(*it, "HTGS_CANCELLED")) {
258  htgs_cancelled = true;
259  } else if (NStr::EqualNocase(*it, "HTGS_POOLED_MULTICLONE")) {
260  htgs_pooled = true;
261  } else if (NStr::EqualNocase(*it, "TPA:experimental")) {
262  tpa_exp = true;
263  } else if (NStr::EqualNocase(*it, "TPA:inferential")) {
264  tpa_inf = true;
265  }
266  }
267  }
268  }
269 
270  if (!(flags & fGetTitle_Reconstruct)) {
271  size_t search_depth = 0; // no limit
272  // Ignore parents' titles for non-PDB proteins.
274  && pdb_id.IsNull()) {
275  search_depth = 1; // only Bioseq's descriptors
276  }
277  CSeqdesc_CI it(hnd, CSeqdesc::e_Title, search_depth);
278  if (it) {
279  title = it->GetTitle();
280  }
281  }
282 
283  if (title.empty() && use_biosrc && source.NotEmpty()) {
284  if (((tech == CMolInfo::eTech_wgs && !wgs_master) || is_tsa)
285  && general_id.NotEmpty() && general_id->GetTag().IsStr()) {
286  title = s_TitleFromBioSource(*source, tech,
287  general_id->GetTag().GetStr());
288  } else {
289  title = s_TitleFromBioSource(*source, tech, kEmptyStr,
290  htgs_unfinished && htgs_pooled);
291  }
293  }
294 
295  if (title.empty() && is_nc && source.NotEmpty()
296  && mol_info.NotEmpty()) {
297  switch (mol_info->GetBiomol()) {
300  title = s_TitleFromChromosome(*source, *mol_info);
301  if (!title.empty()) {
303  }
304  break;
305  }
306  } else if (title.empty() && is_nm && source.NotEmpty()
307  && (flags & fGetTitle_NoExpensive) == 0) {
308  unsigned int genes = 0, cdregions = 0, prots = 0;
309  CConstRef<CSeq_feat> gene(0), cdregion(0);
310  for (CFeat_CI it(hnd);
311  it; ++it) {
312  switch (it->GetData().Which()) {
314  ++genes;
315  gene.Reset(&it->GetMappedFeature());
316  break;
318  ++cdregions;
319  cdregion.Reset(&it->GetMappedFeature());
320  break;
322  ++prots;
323  break;
324  default:
325  break;
326  }
327  }
328  if (genes == 1 && cdregions == 1 // && prots >= 1
329  && source->GetOrg().IsSetTaxname()) {
330  title = source->GetOrg().GetTaxname() + ' ';
331  string cds_label;
332  feature::GetLabel(*cdregion, &cds_label, feature::fFGL_Content,
333  &scope);
334  if (NStr::EqualNocase(source->GetOrg().GetTaxname(),
335  "Drosophila melanogaster")) {
336  s_FlyCG_PtoR(cds_label);
337  }
338  title += NStr::Replace(cds_label, "isoform ",
339  "transcript variant ");
340  title += " (";
342  &scope);
343  title += "), mRNA";
344  }
345  } else if (title.empty() && is_nr && source.NotEmpty()
346  && source->GetOrg().IsSetTaxname() && mol_info.NotEmpty()) {
349  it; ++it) {
350  if (it->GetData().IsGene()) {
351  title = source->GetOrg().GetTaxname() + ' ';
353  title += ", ";
354  switch (mol_info->GetBiomol()) {
355  case CMolInfo::eBiomol_pre_RNA: title += "precursorRNA"; break;
356  case CMolInfo::eBiomol_mRNA: title += "mRNA"; break;
357  case CMolInfo::eBiomol_rRNA: title += "rRNA"; break;
358  case CMolInfo::eBiomol_tRNA: title += "tRNA"; break;
359  case CMolInfo::eBiomol_snRNA: title += "snRNA"; break;
360  case CMolInfo::eBiomol_scRNA: title += "scRNA"; break;
361  case CMolInfo::eBiomol_cRNA: title += "cRNA"; break;
362  case CMolInfo::eBiomol_snoRNA: title += "snoRNA"; break;
363  case CMolInfo::eBiomol_transcribed_RNA: title+="miscRNA"; break;
364  case CMolInfo::eBiomol_ncRNA: title += "ncRNA"; break;
365  case CMolInfo::eBiomol_tmRNA: title += "tmRNA"; break;
366  default: break;
367  }
368  BREAK(it);
369  }
370  }
371  }
372 
373  // originally further down, but moved up to match the C version
374  while (NStr::EndsWith(title, ".") || NStr::EndsWith(title, " ")) {
375  title.erase(title.end() - 1);
376  }
377 
378  if (title.empty() && pdb_id.NotEmpty()) {
379  CSeqdesc_CI it(hnd, CSeqdesc::e_Pdb);
380  for (; it; ++it) {
381  if ( !it->GetPdb().GetCompound().empty() ) {
382  if (isprint((unsigned char) pdb_id->GetChain())) {
383  title = string("Chain ") + (char)pdb_id->GetChain() + ", ";
384  }
385  title += it->GetPdb().GetCompound().front();
386  BREAK(it);
387  }
388  }
389  }
390 
391  if (title.empty() && pat_id.NotEmpty()) {
392  title = "Sequence " + NStr::IntToString(pat_id->GetSeqid())
393  + " from Patent " + pat_id->GetCit().GetCountry()
394  + ' ' + pat_id->GetCit().GetSomeNumber();
395  }
396 
397  if (title.empty() && hnd.GetBioseqMolType() == CSeq_inst::eMol_aa) {
398  title = s_TitleFromProtein(hnd, scope, organism, flags);
399  if ( !title.empty() ) {
401  }
402  }
403 
404  if (title.empty() && !htg_tech
405  && hnd.GetInst_Repr() == CSeq_inst::eRepr_seg) {
406  title = s_TitleFromSegment(hnd, scope, flags);
407  }
408 
409  if (title.empty() && !htg_tech && source.NotEmpty()) {
410  title = s_TitleFromBioSource(*source, tech);
411  if (title.empty()) {
412  title = "No definition line found";
413  }
414  }
415 
416  if (is_tsa && !title.empty() ) {
417  prefix = "TSA: ";
418  } else if (third_party && !title.empty() ) {
419  bool tpa_start = NStr::StartsWith(title, "TPA: ", NStr::eNocase);
420  if (tpa_exp) {
421  if ( !NStr::StartsWith(title, "TPA_exp:", NStr::eNocase) ) {
422  prefix = "TPA_exp: ";
423  if (tpa_start) {
424  title.erase(0, 5);
425  }
426  }
427  } else if (tpa_inf) {
428  if ( !NStr::StartsWith(title, "TPA_inf:", NStr::eNocase) ) {
429  prefix = "TPA_inf: ";
430  if (tpa_start) {
431  title.erase(0, 5);
432  }
433  }
434  } else if ( !tpa_start ) {
435  prefix = "TPA: ";
436  }
437  }
438 
439  switch (tech) {
441  if (title.find("LOW-PASS") == NPOS) {
442  suffix = ", LOW-PASS SEQUENCE SAMPLING";
443  }
444  break;
447  {
448  if (htgs_draft && title.find("WORKING DRAFT") == NPOS) {
449  suffix = ", WORKING DRAFT SEQUENCE";
450  } else if ( !htgs_draft && !htgs_cancelled
451  && title.find("SEQUENCING IN") == NPOS) {
452  suffix = ", *** SEQUENCING IN PROGRESS ***";
453  }
454 
455  string un;
456  if (tech == CMolInfo::eTech_htgs_1) {
457  un = "un";
458  }
459  if (hnd.GetInst_Repr() == CSeq_inst::eRepr_delta) {
460  unsigned int pieces = 1;
461  for (CSeqMap_CI it(hnd, CSeqMap::fFindGap); it; ++it) {
462  ++pieces;
463  }
464  if (pieces == 1) {
465  // suffix += (", 1 " + un + "ordered piece");
466  } else {
467  suffix += (", " + NStr::IntToString(pieces)
468  + ' ' + un + "ordered pieces");
469  }
470  } else {
471  // suffix += ", in " + un + "ordered pieces";
472  }
473  break;
474  }
476  if (title.find("complete sequence") == NPOS) {
477  suffix = ", complete sequence";
478  }
479  break;
480 
481  case CMolInfo::eTech_est:
482  if (title.find("mRNA sequence") == NPOS) {
483  suffix = ", mRNA sequence";
484  }
485  break;
486 
487  case CMolInfo::eTech_sts:
488  if (title.find("sequence tagged site") == NPOS) {
489  suffix = ", sequence tagged site";
490  }
491  break;
492 
494  if (title.find("genomic survey sequence") == NPOS) {
495  suffix = ", genomic survey sequence";
496  }
497  break;
498 
499  case CMolInfo::eTech_wgs:
500  if (wgs_master) {
501  if (title.find("whole genome shotgun sequencing project") == NPOS){
502  suffix = ", whole genome shotgun sequencing project";
503  }
504  } else if (title.find("whole genome shotgun sequence") == NPOS) {
505  if (source.NotEmpty()) {
506  const char* orgnl = s_OrganelleName(source->GetGenome(),
507  fON_wgs);
508  if (orgnl[0] && title.find(orgnl) == NPOS) {
509  suffix = string(1, ' ') + orgnl;
510  }
511  }
512  suffix += ", whole genome shotgun sequence";
513  }
514  break;
515 
516  case CMolInfo::eTech_tsa:
517  if (tsa_master) {
518  if (title.find("transcriptome shotgun assembly project") == NPOS){
519  suffix = ", transcriptome shotgun assembly project";
520  }
521  } else if (title.find("transcriptome shotgun assembly project") == NPOS) {
522  suffix += ", transcriptome shotgun assembly project";
523  }
524  break;
525  }
526 
527  if (flags & fGetTitle_Organism) {
529  if (source) {
530  org = &source->GetOrg();
531  } else {
532  CSeqdesc_CI it(hnd, CSeqdesc::e_Org);
533  for (; it; ++it) {
534  org = &it->GetOrg();
535  BREAK(it);
536  }
537  }
538 
539  if (organism.empty() && org.NotEmpty() && org->IsSetTaxname()) {
540  organism = org->GetTaxname();
541  }
542  if ( !organism.empty() && title.find(organism) == NPOS) {
543  suffix += " [" + organism + ']';
544  }
545  }
546 
547  return prefix + title + suffix;
548 }
549 
550 
/// Builds a title for a stand-alone CBioseq; no CScope or CBioseq_Handle is
/// used anywhere in this overload.
///
/// @param seq        Sequence whose IDs and descriptors are inspected.
/// @param title_ptr  Output; receives prefix + title + suffix on success.
///                   It is dereferenced without a null check.
/// @param flags      fGetTitle_Reconstruct skips the existing Title
///                   descriptor; fGetTitle_Organism appends " [organism]"
///                   when the organism is not already part of the title.
/// @return true when *title_ptr was written.  false (with *title_ptr left
///         untouched) on every path below that gives up instead of building
///         a title: HTG technique or third-party ID, untitled sequences
///         flagged is_nm / is_nr, untitled PDB or protein sequences,
///         untitled sequences with no Inst or a segmented Inst, and the
///         delta / no-Inst case inside the second technique switch.
///
/// NOTE(review): several lines of this function (some declarations and all
/// hyperlinked `case` labels) are absent from this copy of the file; restore
/// them from the repository before changing any logic here.
551 bool GetTitle(const CBioseq& seq, string* title_ptr, TGetTitleFlags flags)
552 {
553  string prefix, title, suffix;
554  string organism;
558  CConstRef<CDbtag> general_id(NULL);
560  CConstRef<CMolInfo> mol_info(NULL);
561  bool third_party = false;
562  bool tpa_exp = false;
563  bool tpa_inf = false;
564  bool is_nc = false;
565  bool is_nm = false;
566  bool is_nr = false;
567  bool is_tsa = false;
568  bool wgs_master = false;
569  bool tsa_master = false;
571  bool htg_tech = false;
572  bool htgs_draft = false;
573  bool htgs_cancelled = false;
574  bool htgs_pooled = false;
575  bool htgs_unfinished = false;
576  bool use_biosrc = false;
577 
     // Scan every Seq-id once: remember the first text Seq-id, derive the
     // accession-based flags, and keep the general / PDB / patent IDs.
578  ITERATE (CBioseq::TId, it, seq.GetId()) {
579  CConstRef<CSeq_id> id = *it;
580  if ( !tsid ) {
581  tsid = id->GetTextseq_Id();
582  }
583  switch (id->Which()) {
584  case CSeq_id::e_Other:
585  case CSeq_id::e_Genbank:
586  case CSeq_id::e_Embl:
587  case CSeq_id::e_Ddbj:
588  {
589  const CTextseq_id& t = *id->GetTextseq_Id();
590  if (t.IsSetAccession()) {
591  const string& acc = t.GetAccession();
594  && NStr::EndsWith(acc, "000000")) {
595  wgs_master = true;
597  && NStr::EndsWith(acc, "000000")) {
598  tsa_master = true;
599  } else if (type == CSeq_id::eAcc_refseq_chromosome) {
600  is_nc = true;
601  } else if (type == CSeq_id::eAcc_refseq_mrna) {
602  is_nm = true;
603  } else if (type == CSeq_id::eAcc_refseq_ncrna) {
604  is_nr = true;
605  }
606  }
607  break;
608  }
609  case CSeq_id::e_General:
         // Skippable general IDs are ignored; a later non-skippable one
         // overwrites an earlier one.
610  if ( !id->GetGeneral().IsSkippable() ) {
611  general_id = &id->GetGeneral();
612  }
613  break;
614  case CSeq_id::e_Tpg:
615  case CSeq_id::e_Tpe:
616  case CSeq_id::e_Tpd:
617  third_party = true;
618  break;
619  case CSeq_id::e_Pdb:
620  pdb_id = &id->GetPdb();
621  break;
622  case CSeq_id::e_Patent:
623  pat_id = &id->GetPatent();
624  break;
625  default:
626  break;
627  }
628  }
629 
     // Fetch the source and MolInfo descriptors; `tech` is only assigned
     // when a MolInfo descriptor is found.
     // NOTE(review): the lookup calls themselves are on lines missing from
     // this copy - presumably GetClosestDescriptor, as used for the title
     // further down; confirm.
630  {
631  if ( CConstRef<CSeqdesc> desc =
633  source = &desc->GetSource();
634  }
635  if ( CConstRef<CSeqdesc> desc =
637  mol_info = &desc->GetMolinfo();
638  tech = mol_info->GetTech();
639  }
640  }
641 
     // Turn the technique into flags.  The cases deliberately fall through,
     // so the earlier groups also pick up the flags set by the later ones.
642  switch (tech) {
646  htgs_unfinished = true;
647  // manufacture all titles for unfinished HTG sequences
649  // fall through
651  htg_tech = true;
652  // fall through
653  case CMolInfo::eTech_est:
654  case CMolInfo::eTech_sts:
656  case CMolInfo::eTech_wgs:
657  use_biosrc = true;
658  break;
659  case CMolInfo::eTech_tsa:
660  is_tsa = true;
661  use_biosrc = true;
662  break;
663  default:
664  break;
665  }
666 
     // This overload gives up on HTG and third-party sequences.
     // NOTE(review): in the visible code nothing sets htg_tech or
     // third_party after this point, so past this return both are false:
     // the later `!htg_tech` tests are always true and the
     // `third_party && !title.empty()` prefix branch cannot be reached.
     // tpa_exp, tpa_inf, htgs_draft, htgs_cancelled and htgs_pooled are
     // likewise never set to true in the visible lines - confirm against
     // the complete file.
667  if (htg_tech || third_party) {
668  return false;
669  }
670 
     // Unless reconstruction was requested, reuse an existing Title
     // descriptor, subject to the level limit chosen below.
671  if (!(flags & fGetTitle_Reconstruct)) {
672  int max_level = kMax_Int; // no limit
673  // Ignore parents' titles for non-PDB proteins.
674  if (seq.IsAa()
675  && pdb_id.IsNull()) {
676  max_level = 0; // only Bioseq's descriptors
677  }
678  int level = 0;
679  if ( CConstRef<CSeqdesc> desc =
680  seq.GetClosestDescriptor(CSeqdesc::e_Title, &level) ) {
681  if ( level <= max_level ) {
682  title = desc->GetTitle();
683  }
684  }
685  }
686 
     // No title yet: manufacture one from the BioSource.  The general ID's
     // string tag is passed as the suffix argument for TSA sequences and
     // for WGS sequences that are not the WGS master.
687  if (title.empty() && use_biosrc && source.NotEmpty()) {
688  if (((tech == CMolInfo::eTech_wgs && !wgs_master) || is_tsa)
689  && general_id.NotEmpty() && general_id->GetTag().IsStr()) {
690  title = s_TitleFromBioSource(*source, tech,
691  general_id->GetTag().GetStr());
692  } else {
693  title = s_TitleFromBioSource(*source, tech, kEmptyStr,
694  htgs_unfinished && htgs_pooled);
695  }
697  }
698 
699  if (title.empty() && is_nc && source.NotEmpty()
700  && mol_info.NotEmpty()) {
701  switch (mol_info->GetBiomol()) {
704  title = s_TitleFromChromosome(*source, *mol_info);
705  if (!title.empty()) {
707  }
708  break;
709  }
710  } else if (title.empty() && is_nm && source.NotEmpty()) {
         // Not handled by this overload: report failure.
711  return false;
712  } else if (title.empty() && is_nr && source.NotEmpty()
713  && source->GetOrg().IsSetTaxname()) {
         // Not handled by this overload: report failure.
714  return false;
715  }
716 
717  // originally further down, but moved up to match the C version
718  while (NStr::EndsWith(title, ".") || NStr::EndsWith(title, " ")) {
719  title.erase(title.end() - 1);
720  }
721 
722  if (title.empty() && pdb_id.NotEmpty()) {
723  return false;
724  }
725 
726  if (title.empty() && pat_id.NotEmpty()) {
727  title = "Sequence " + NStr::IntToString(pat_id->GetSeqid())
728  + " from Patent " + pat_id->GetCit().GetCountry()
729  + ' ' + pat_id->GetCit().GetSomeNumber();
730  }
731 
732  if (title.empty() && seq.IsAa()) {
733  return false;
734  }
735 
736  if (title.empty() && !htg_tech &&
737  (!seq.IsSetInst() || seq.GetInst().GetRepr() == CSeq_inst::eRepr_seg)) {
738  return false;
739  }
740 
     // Last resort: a plain BioSource title, or a fixed placeholder.
741  if (title.empty() && !htg_tech && source.NotEmpty()) {
742  title = s_TitleFromBioSource(*source, tech);
743  if (title.empty()) {
744  title = "No definition line found";
745  }
746  }
747 
     // Choose the prefix.  Where the title already starts with "TPA: " and
     // a more specific TPA_exp / TPA_inf prefix is added, the five
     // characters of "TPA: " are removed from the title.
748  if (is_tsa && !title.empty() ) {
749  prefix = "TSA: ";
750  } else if (third_party && !title.empty() ) {
751  bool tpa_start = NStr::StartsWith(title, "TPA: ", NStr::eNocase);
752  if (tpa_exp) {
753  if ( !NStr::StartsWith(title, "TPA_exp:", NStr::eNocase) ) {
754  prefix = "TPA_exp: ";
755  if (tpa_start) {
756  title.erase(0, 5);
757  }
758  }
759  } else if (tpa_inf) {
760  if ( !NStr::StartsWith(title, "TPA_inf:", NStr::eNocase) ) {
761  prefix = "TPA_inf: ";
762  if (tpa_start) {
763  title.erase(0, 5);
764  }
765  }
766  } else if ( !tpa_start ) {
767  prefix = "TPA: ";
768  }
769  }
770 
     // Choose the technique-specific suffix; each phrase is added only when
     // the title does not already contain it.
771  switch (tech) {
773  if (title.find("LOW-PASS") == NPOS) {
774  suffix = ", LOW-PASS SEQUENCE SAMPLING";
775  }
776  break;
779  {
780  if (htgs_draft && title.find("WORKING DRAFT") == NPOS) {
781  suffix = ", WORKING DRAFT SEQUENCE";
782  } else if ( !htgs_draft && !htgs_cancelled
783  && title.find("SEQUENCING IN") == NPOS) {
784  suffix = ", *** SEQUENCING IN PROGRESS ***";
785  }
786 
787  string un;
788  if (tech == CMolInfo::eTech_htgs_1) {
789  un = "un";
790  }
791  if ((!seq.IsSetInst() || seq.GetInst().GetRepr() == CSeq_inst::eRepr_delta)) {
792  return false;
793  } else {
794  // suffix += ", in " + un + "ordered pieces";
795  }
796  break;
797  }
799  if (title.find("complete sequence") == NPOS) {
800  suffix = ", complete sequence";
801  }
802  break;
803 
804  case CMolInfo::eTech_est:
805  if (title.find("mRNA sequence") == NPOS) {
806  suffix = ", mRNA sequence";
807  }
808  break;
809 
810  case CMolInfo::eTech_sts:
811  if (title.find("sequence tagged site") == NPOS) {
812  suffix = ", sequence tagged site";
813  }
814  break;
815 
817  if (title.find("genomic survey sequence") == NPOS) {
818  suffix = ", genomic survey sequence";
819  }
820  break;
821 
822  case CMolInfo::eTech_wgs:
823  if (wgs_master) {
824  if (title.find("whole genome shotgun sequencing project") == NPOS){
825  suffix = ", whole genome shotgun sequencing project";
826  }
827  } else if (title.find("whole genome shotgun sequence") == NPOS) {
828  if (source.NotEmpty()) {
829  const char* orgnl = s_OrganelleName(source->GetGenome(),
830  fON_wgs);
831  if (orgnl[0] && title.find(orgnl) == NPOS) {
832  suffix = string(1, ' ') + orgnl;
833  }
834  }
835  suffix += ", whole genome shotgun sequence";
836  }
837  break;
838 
839  case CMolInfo::eTech_tsa:
         // NOTE(review): both branches test for and append the same text,
         // and suffix is still empty here, so `=` and `+=` are equivalent:
         // tsa_master currently has no effect on the result.  Confirm
         // whether the non-master wording was meant to differ.
840  if (tsa_master) {
841  if (title.find("transcriptome shotgun assembly project") == NPOS){
842  suffix = ", transcriptome shotgun assembly project";
843  }
844  } else if (title.find("transcriptome shotgun assembly project") == NPOS) {
845  suffix += ", transcriptome shotgun assembly project";
846  }
847  break;
848  }
849 
     // Optionally append " [organism]", taking the Org-ref from the
     // BioSource when there is one and from a separate descriptor otherwise.
850  if (flags & fGetTitle_Organism) {
852  if (source) {
853  org = &source->GetOrg();
854  } else {
855  if ( CConstRef<CSeqdesc> desc =
857  org = &desc->GetOrg();
858  }
859  }
860 
861  if (organism.empty() && org.NotEmpty() && org->IsSetTaxname()) {
862  organism = org->GetTaxname();
863  }
864  if ( !organism.empty() && title.find(organism) == NPOS) {
865  suffix += " [" + organism + ']';
866  }
867  }
868 
     // title_ptr is dereferenced unconditionally; callers must pass a
     // non-null pointer.
869  *title_ptr = prefix + title + suffix;
870  return true;
871 }
872 
873 
874 static string s_DescribeClones(const string& clone, bool pooled)
875 {
876  SIZE_TYPE count = 1;
877  for (SIZE_TYPE pos = clone.find(';'); pos != NPOS;
878  pos = clone.find(';', pos + 1)) {
879  ++count;
880  }
881  if (pooled) {
882  return ", pooled multiple clones";
883  } else if (count > 3) {
884  return ", " + NStr::SizetToString(count) + " clones,";
885  } else {
886  return " clone " + clone;
887  }
888 }
889 
890 
891 static bool s_EndsWithStrain(const string& name, const string& strain)
892 {
893  // return NStr::EndsWith(name, strain, NStr::eNocase);
894  if (strain.size() >= name.size()) {
895  return false;
896  }
897  SIZE_TYPE pos = name.find(' ');
898  if (pos == NPOS) {
899  return false;
900  }
901  pos = name.find(' ', pos + 1);
902  if (pos == NPOS || pos + strain.size() >= name.size()) {
903  return false;
904  }
905  // XXX - the C Toolkit starts looking unnecessarily early, which could
906  // (at least in theory) lead to false negatives.
907  pos = NStr::FindNoCase(name, strain, name.size() - strain.size() - 1);
908  if (pos == name.size() - strain.size()) {
909  return true;
910  } else if (pos == name.size() - strain.size() - 1
911  && name[pos - 1] == '\'' && name[name.size() - 1] == '\'') {
912  return true;
913  } else {
914  return false;
915  }
916 }
917 
918 
920  CMolInfo::TTech tech,
921  const string& suffix,
922  bool pooled_clones)
923 {
924  string name, chromosome, clone, map_, plasmid, strain, sfx;
925  const COrg_ref& org = source.GetOrg();
926 
927  if (org.IsSetTaxname()) {
928  name = org.GetTaxname();
929  }
930 
931  if (suffix.size() > 0) {
932  sfx = ' ' + suffix;
933  }
934 
935  if (source.IsSetSubtype()) {
936  ITERATE (CBioSource::TSubtype, it, source.GetSubtype()) {
937  switch ((*it)->GetSubtype()) {
939  chromosome = " chromosome " + (*it)->GetName();
940  if (suffix == (*it)->GetName()) {
941  sfx.clear();
942  }
943  break;
945  clone = s_DescribeClones((*it)->GetName(), pooled_clones);
946  break;
948  map_ = " map " + (*it)->GetName();
949  break;
951  if (tech == CMolInfo::eTech_wgs) { // omit otherwise
952  plasmid = " plasmid " + (*it)->GetName();
953  if (suffix == (*it)->GetName()) {
954  sfx.clear();
955  }
956  }
957  break;
958  }
959  }
960  }
961 
962  if (org.IsSetOrgname() && org.GetOrgname().IsSetMod()) {
963  ITERATE (COrgName::TMod, it, org.GetOrgname().GetMod()) {
964  const string& subname = (*it)->GetSubname();
965  if ((*it)->GetSubtype() == COrgMod::eSubtype_strain
966  && !s_EndsWithStrain(name, subname)) {
967  strain = " strain " + subname.substr(0, subname.find(';'));
968  }
969  }
970  }
971 
972  string title = NStr::TruncateSpaces(name + strain + chromosome + clone
973  + map_ + plasmid + sfx);
974  if ( !title.empty() && islower((unsigned char) title[0])) {
975  title[0] = (char)toupper((unsigned char) title[0]);
976  }
977 
978  return title;
979 }
980 
981 
/// Returns the organelle / genome-location word to use in a title for the
/// given genome code, or kEmptyCStr when the code has no case here or the
/// flag test for its case does not select a word.
///
/// The flag tests visible below consult fON_with_plasmid, fON_virus and
/// fON_wgs; several words are returned only when fON_wgs is clear, and
/// several others only when it is set.
///
/// NOTE(review): the second line of the signature and every `case` label
/// are absent from this copy of the file, so which genome value each
/// return belongs to cannot be read from here - restore them from the
/// repository before editing this table.
982 static const char* s_OrganelleName(CBioSource::TGenome genome,
984 {
985  switch (genome) {
986  // unknown, genomic
988  return "chloroplast";
990  return "chromoplast";
992  return "kinetoplast";
     // Noun form by default; adjective form when either flag is set.
994  if ((flags & (fON_with_plasmid | fON_wgs)) == 0) {
995  return "mitochondrion";
996  } else {
997  return "mitochondrial";
998  }
1000  return "plastid";
1002  if ((flags & fON_wgs) == 0) {
1003  return "macronuclear";
1004  }
1005  break;
1007  if ((flags & fON_wgs) == 0) {
1008  return "extrachromosomal";
1009  }
1010  break;
1012  if ((flags & fON_wgs) == 0) {
1013  return "plasmid";
1014  }
1015  break;
1016  // transposon, insertion-seq
1018  return "cyanelle";
      // Nothing is returned here when fON_virus is set.
1020  if ((flags & fON_virus) == 0) {
1021  if ((flags & (fON_with_plasmid | fON_wgs)) == 0) {
1022  return "provirus";
1023  } else {
1024  return "proviral";
1025  }
1026  }
1027  break;
1029  if ((flags & fON_virus) == 0) {
1030  return "virus";
1031  }
1032  break;
1034  if ((flags & fON_wgs) == 0) {
1035  return "nucleomorph";
1036  }
1037  break;
1039  return "apicoplast";
1041  return "leucoplast";
      // NOTE(review): the two spellings differ by more than form
      // ("protoplast" vs "proplastid") - confirm this is intentional.
1043  if ((flags & fON_wgs) == 0) {
1044  return "protoplast";
1045  } else {
1046  return "proplastid";
1047  }
      // Unreachable: both branches above return.
1048  break;
      // The remaining words are produced only when fON_wgs is set.
1050  if ((flags & fON_wgs) != 0) {
1051  return "endogenous virus";
1052  }
1053  break;
1055  if ((flags & fON_wgs) != 0) {
1056  return "hydrogenosome";
1057  }
1058  break;
1060  if ((flags & fON_wgs) != 0) {
1061  return "chromosome";
1062  }
1063  break;
1065  if ((flags & fON_wgs) != 0) {
1066  return "chromatophore";
1067  }
1068  break;
1069  }
1070  return kEmptyCStr;
1071 }
1072 
1073 
1075  const CMolInfo& mol_info)
1076 {
1077  string name, chromosome, segment, plasmid_name, orgnl;
1078  string seq_tag, gen_tag;
1079  bool is_plasmid = false;
1081 
1082  if (source.GetOrg().IsSetTaxname()) {
1083  name = source.GetOrg().GetTaxname();
1084  } else {
1085  return kEmptyStr;
1086  }
1087 
1088  string lc_name = name;
1089  NStr::ToLower(lc_name);
1090 
1091  if (lc_name.find("virus") != NPOS || lc_name.find("phage") != NPOS) {
1092  flags |= fON_virus;
1093  }
1094 
1095  if (source.IsSetSubtype()) {
1096  ITERATE (CBioSource::TSubtype, it, source.GetSubtype()) {
1097  switch ((*it)->GetSubtype()) {
1099  chromosome = (*it)->GetName();
1100  break;
1102  segment = (*it)->GetName();
1103  break;
1105  {
1106  plasmid_name = (*it)->GetName();
1107  string lc_plasmid = plasmid_name;
1108  NStr::ToLower(lc_plasmid);
1109  if (lc_plasmid.find("plasmid") == NPOS
1110  && lc_plasmid.find("element") == NPOS) {
1111  plasmid_name = "plasmid " + plasmid_name;
1112  }
1114  break;
1115  }
1116  }
1117  }
1118  }
1119 
1120  orgnl = s_OrganelleName(source.GetGenome(), flags);
1121  if (source.GetGenome() == CBioSource::eGenome_plasmid) {
1122  is_plasmid = true;
1123  }
1124 
1125  switch (mol_info.GetCompleteness()) {
1130  seq_tag = ", partial sequence";
1131  gen_tag = ", genome";
1132  break;
1133  default:
1134  seq_tag = ", complete sequence";
1135  gen_tag = ", complete genome";
1136  break;
1137  }
1138 
1139  if (lc_name.find("plasmid") != NPOS) {
1140  return name + seq_tag;
1141  } else if (is_plasmid) {
1142  if (plasmid_name.empty()) {
1143  return name + " unnamed plasmid" + seq_tag;
1144  } else {
1145  return name + ' ' + plasmid_name + seq_tag;
1146  }
1147  } else if ( !plasmid_name.empty() ) {
1148  if (orgnl.empty()) {
1149  return name + ' ' + plasmid_name + seq_tag;
1150  } else {
1151  return name + ' ' + orgnl + ' ' + plasmid_name + seq_tag;
1152  }
1153  } else if ( !orgnl.empty() ) {
1154  if ( chromosome.empty() ) {
1155  return name + ' ' + orgnl + gen_tag;
1156  } else {
1157  return name + ' ' + orgnl + " chromosome " + chromosome + seq_tag;
1158  }
1159  } else if ( !segment.empty() ) {
1160  if (segment.find("DNA") == NPOS && segment.find("RNA") == NPOS
1161  && segment.find("segment") == NPOS
1162  && segment.find("Segment") == NPOS) {
1163  return name + " segment " + segment + seq_tag;
1164  } else {
1165  return name + ' ' + segment + seq_tag;
1166  }
1167  } else if ( !chromosome.empty() ) {
1168  return name + " chromosome " + chromosome + seq_tag;
1169  } else {
1170  return name + gen_tag;
1171  }
1172 }
1173 
1174 
1176  const CMolInfo& mol_info)
1177 {
1178  string result = x_TitleFromChromosome(source, mol_info);
1179  result = NStr::Replace(result, "Plasmid", "plasmid");
1180  result = NStr::Replace(result, "Element", "element");
1181  if (!result.empty()) {
1182  result[0] = (char)toupper((unsigned char) result[0]);
1183  }
1184  return result;
1185 }
1186 
1187 
/// Chooses a display name for the protein in `handle`.
///
/// Order of preference, as coded below:
///   1. the Prot-ref name(s), joined with "; " - only the first one unless
///      fGetTitle_AllProteins is set; when the result equals
///      "hypothetical protein" (ignoring case) and the gene has a locus
///      tag, the tag is appended after a space;
///   2. the Prot-ref description;
///   3. the first Prot-ref activity;
///   4. the gene's locus, else its first synonym, else its description,
///      followed by " gene product";
///   5. the fixed text "unnamed protein product".
///
/// @param handle   Protein sequence being named.
/// @param scope    Scope used for the overlap lookups.
/// @param cds_loc  Output: set to the location of the feature returned by
///                 GetCDSForProduct(handle) when there is one; otherwise
///                 left as passed in.
///
/// NOTE(review): three lines are absent from this copy of the file - the
/// rest of the parameter list (presumably the `flags` read below), the
/// declaration of `prot`, and the call that initializes `prot_feat`.
1188 static string s_GetProteinName(const CBioseq_Handle& handle, CScope& scope,
1189  CConstRef<CSeq_loc>& cds_loc,
1191 {
1193  CConstRef<CGene_ref> gene;
1194 
      // A location spanning the whole protein.
1195  CSeq_loc everywhere;
1196  everywhere.SetWhole().Assign(*handle.GetSeqId());
1197 
1198  {{
1199  CConstRef<CSeq_feat> prot_feat
1201  eOverlap_Contained, scope);
1202  if (prot_feat) {
1203  prot = &prot_feat->GetData().GetProt();
1204  }
1205  }}
1206 
1207  {{
1208  CConstRef<CSeq_feat> cds_feat(GetCDSForProduct(handle));
1209  if (cds_feat) {
1210  cds_loc = &cds_feat->GetLocation();
1211  }
1212  }}
1213 
      // The gene is looked up only through the CDS location, so without a
      // CDS there is no gene-based fallback.
1214  if (cds_loc) {
1215  CConstRef<CSeq_feat> gene_feat = GetOverlappingGene(*cds_loc, scope);
1216  if (gene_feat) {
1217  gene = &gene_feat->GetData().GetGene();
1218  }
1219  }
1220 
1221  if (prot.NotEmpty() && prot->IsSetName() && !prot->GetName().empty()) {
1222  string result;
1223  bool first = true;
1224  ITERATE (CProt_ref::TName, it, prot->GetName()) {
1225  if ( !first ) {
1226  result += "; ";
1227  }
1228  result += *it;
1229  first = false;
1230  if ((flags & fGetTitle_AllProteins) == 0) {
1231  break; // just give the first
1232  }
1233  }
1234  if (NStr::CompareNocase(result, "hypothetical protein") == 0) {
1235  // XXX - gene_feat might not always be exactly what we want
1236  if (gene && gene->IsSetLocus_tag()) {
1237  result += ' ' + gene->GetLocus_tag();
1238  }
1239  }
1240  return result;
1241  } else if (prot.NotEmpty() && prot->IsSetDesc()
1242  && !prot->GetDesc().empty()) {
1243  return prot->GetDesc();
1244  } else if (prot.NotEmpty() && prot->IsSetActivity()
1245  && !prot->GetActivity().empty()) {
1246  return prot->GetActivity().front();
1247  } else if (gene) {
1248  string gene_name;
1249  if (gene->IsSetLocus() && !gene->GetLocus().empty()) {
1250  gene_name = gene->GetLocus();
1251  } else if (gene->IsSetSyn() && !gene->GetSyn().empty()) {
1252  gene_name = *gene->GetSyn().begin();
1253  } else if (gene->IsSetDesc() && !gene->GetDesc().empty()) {
1254  gene_name = gene->GetDesc();
1255  }
      // A gene with no usable name falls through to the default below.
1256  if ( !gene_name.empty() ) {
1257  return gene_name + " gene product";
1258  }
1259  }
1260 
1261  return "unnamed protein product";
1262 }
1263 
1264 
1265 static string s_TitleFromProtein(const CBioseq_Handle& handle, CScope& scope,
1266  string& organism, TGetTitleFlags flags)
1267 {
1268  string result;
1269  CConstRef<CSeq_loc> cds_loc;
1270 
1271  if ((flags & fGetTitle_NoExpensive) == 0) {
1272  result = s_GetProteinName(handle, scope, cds_loc, flags);
1273  } else {
1274  result = "unnamed protein product";
1275  }
1276 
1277  {{ // Find organism name (must be specifically associated with this Bioseq)
1278  CConstRef<COrg_ref> org;
1279  for (CSeqdesc_CI it(handle, CSeqdesc::e_Source, 1); it; ++it) {
1280  org = &it->GetSource().GetOrg();
1281  BREAK(it);
1282  }
1283  if (org.Empty() && cds_loc.NotEmpty()) {
1284  for (CFeat_CI it(scope, *cds_loc, CSeqFeatData::e_Biosrc);
1285  it; ++it) {
1286  org = &it->GetData().GetBiosrc().GetOrg();
1287  BREAK(it);
1288  }
1289  }
1290  if (org.NotEmpty() && org->IsSetTaxname()) {
1291  organism = org->GetTaxname();
1292  }
1293  }}
1294 
1295  return result;
1296 }
1297 
1298 
1299 static string s_TitleFromSegment(const CBioseq_Handle& handle, CScope& scope,
1301 {
1302  string organism, product, locus, strain, clone, isolate;
1303  string completeness = "complete";
1304  bool cds_found = false;
1305 
1306  {
1307  CSeqdesc_CI it(handle, CSeqdesc::e_Source);
1308  for (; it; ++it) {
1309  const CBioSource& src = it->GetSource();
1310  const COrg_ref& org = src.GetOrg();
1311  if (org.IsSetTaxname()) {
1312  organism = org.GetTaxname();
1313  if (org.IsSetOrgname()) {
1314  const COrgName& orgname = org.GetOrgname();
1315  if (orgname.IsSetMod()) {
1316  ITERATE (COrgName::TMod, mod, orgname.GetMod()) {
1317  COrgMod::TSubtype subtype = (*mod)->GetSubtype();
1318  const string& subname = (*mod)->GetSubname();
1319  if (subtype == COrgMod::eSubtype_strain) {
1320  if ( !NStr::EndsWith(organism, subname) ) {
1321  strain = subname;
1322  }
1323  break;
1324  } else if (subtype == COrgMod::eSubtype_isolate) {
1325  isolate = subname;
1326  break;
1327  }
1328  }
1329  }
1330  }
1331  }
1332  if (src.IsSetSubtype()) {
1333  ITERATE (CBioSource::TSubtype, ssrc, src.GetSubtype()) {
1334  if ((*ssrc)->GetSubtype() == CSubSource::eSubtype_clone) {
1335  clone = s_DescribeClones((*ssrc)->GetName(), false);
1336  }
1337  }
1338  }
1339  BREAK(it);
1340  }
1341  }
1342 
1343  if (organism.empty()) {
1344  organism = "Unknown";
1345  }
1346 
1347  CSeq_loc everywhere;
1348  everywhere.SetMix().Set() = handle.GetInst_Ext().GetSeg();
1349 
1350  if ((flags & fGetTitle_NoExpensive) == 0) {
1351  CFeat_CI it(scope, everywhere, CSeqFeatData::e_Cdregion);
1352  for (; it; ++it) {
1353  cds_found = true;
1354  if ( !it->IsSetProduct() ) {
1355  continue;
1356  }
1357  const CSeq_loc& product_loc = it->GetProduct();
1358 
1359  if (it->IsSetPartial()) {
1360  completeness = "partial";
1361  }
1362 
1363  CConstRef<CSeq_feat> prot_feat
1365  eOverlap_Interval, scope);
1366  if (product.empty() && prot_feat.NotEmpty()
1367  && prot_feat->GetData().GetProt().IsSetName()) {
1368  product = *prot_feat->GetData().GetProt().GetName().begin();
1369  }
1370 
1371  CConstRef<CSeq_feat> gene_feat
1372  = GetOverlappingGene(it->GetLocation(), scope);
1373  if (locus.empty() && gene_feat.NotEmpty()) {
1374  if (gene_feat->GetData().GetGene().IsSetLocus()) {
1375  locus = gene_feat->GetData().GetGene().GetLocus();
1376  } else if (gene_feat->GetData().GetGene().IsSetSyn()) {
1377  locus = *gene_feat->GetData().GetGene().GetSyn().begin();
1378  }
1379  }
1380 
1381  BREAK(it);
1382  }
1383  }
1384 
1385  string result = organism;
1386  if ( !cds_found) {
1387  if ( !strain.empty() ) {
1388  result += " strain " + strain;
1389  } else if ( !clone.empty() && clone.find(" clone ") != NPOS) {
1390  result += clone;
1391  } else if ( !isolate.empty() ) {
1392  result += " isolate " + isolate;
1393  }
1394  }
1395  if ( !product.empty() ) {
1396  result += ' ' + product;
1397  }
1398  if ( !locus.empty() ) {
1399  result += " (" + locus + ')';
1400  }
1401  if ( !product.empty() || !locus.empty() ) {
1402  result += " gene, " + completeness + " cds";
1403  }
1404  return NStr::TruncateSpaces(result);
1405 }
1406 
1407 
// Rewrite, in place, every token of the form "CG<digits>-P<letter>" into
// "CG<digits>-R<letter>".
//
// A token is recognized only when "CG" is at the start of the string or is
// preceded by whitespace, and when the letter is followed by the end of the
// string or by one of ' ', ',' or ';'.
//
// s  string to edit; its length never changes.
static void s_FlyCG_PtoR(std::string& s)
{
    // s =~ s/\b(CG\d*-)P([[:alpha:]])\b/$1R$2/g, more or less.
    const std::string::size_type len = s.size();
    std::string::size_type       pos = 0;
    while ((pos = s.find("CG", pos)) != std::string::npos) {
        if (pos > 0  &&  !isspace((unsigned char)s[pos - 1]) ) {
            // Not at a word start.  Step past this hit before searching
            // again; otherwise the same position is found forever.
            ++pos;
            continue;
        }
        pos += 2;
        // Skip the digits, leaving room for the "-P<letter>" tail.
        while (pos + 3 < len  &&  isdigit((unsigned char)s[pos])) {
            ++pos;
        }
        if (pos + 2 < len
            &&  s[pos] == '-'  &&  s[pos + 1] == 'P'
            &&  isalpha((unsigned char)s[pos + 2])
            &&  (pos + 3 == len  ||  strchr(" ,;", s[pos + 3])) ) {
            s[pos + 1] = 'R';
        }
    }
}
1427 
1428 
1429 END_SCOPE(sequence)
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
CConstRef< CSeqdesc > GetClosestDescriptor(CSeqdesc::E_Choice choice, int *level=NULL) const
Definition: Seq_entry.cpp:212
bool IsAa(void) const
Definition: Bioseq.cpp:350
CConstRef –.
Definition: ncbiobj.hpp:1266
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
CFeat_CI –.
Definition: feat_ci.hpp:64
CScope –.
Definition: scope.hpp:92
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
static uch flags
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
string GetLabel(const CSeq_id &id)
@ eAcc_wgs
Definition: Seq_id.hpp:290
@ eAcc_refseq_mrna
Definition: Seq_id.hpp:415
@ eAcc_refseq_ncrna
Definition: Seq_id.hpp:416
@ eAcc_refseq_chromosome
Definition: Seq_id.hpp:429
@ eAcc_tsa
Definition: Seq_id.hpp:273
@ eAcc_division_mask
Definition: Seq_id.hpp:299
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
CMappedFeat GetBestOverlappingFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype need_subtype, sequence::EOverlapType overlap_type, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3653
@ fFGL_Content
Include its content if there is any.
Definition: feature.hpp:73
@ eOverlap_Interval
at least one pair of intervals must overlap
@ eOverlap_Contained
2nd contained within 1st extremes
const CSeq_feat * GetCDSForProduct(const CBioseq &product, CScope *scope)
Get the encoding CDS feature of a given protein sequence.
Definition: sequence.cpp:2549
CConstRef< CSeq_feat > GetOverlappingGene(const CSeq_loc &loc, CScope &scope, ETransSplicing eTransSplicing=eTransSplicing_Auto)
Definition: sequence.cpp:1366
string GetTitle(const CBioseq_Handle &hnd, TGetTitleFlags flags)
Definition: seqtitle.cpp:106
int TGetTitleFlags
Definition: sequence.hpp:287
@ fGetTitle_Organism
append [organism]
Definition: sequence.hpp:283
@ fGetTitle_Reconstruct
ignore existing title Seqdesc.
Definition: sequence.hpp:282
@ fGetTitle_NoExpensive
skip potential expensive operations
Definition: sequence.hpp:285
@ fGetTitle_AllProteins
normally just names the first
Definition: sequence.hpp:284
vector< CSeq_id_Handle > TId
const TInst_Ext & GetInst_Ext(void) const
bool IsSetProduct(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
TInst_Repr GetInst_Repr(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
const TId & GetId(void) const
TMol GetBioseqMolType(void) const
Get some values from core:
bool IsSetPartial(void) const
const CSeq_loc & GetLocation(void) const
const CSeq_loc & GetProduct(void) const
vector< CSeqdesc::E_Choice > TDescChoices
Definition: seqdesc_ci.hpp:67
@ fFindGap
Definition: seq_map.hpp:130
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:1401
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BREAK(it)
Definition: ncbistl.hpp:175
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3314
static SIZE_TYPE FindCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case sensitive search.
Definition: ncbistr.hpp:5490
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
const char *const kEmptyCStr
Empty "C" string (points to a '\0').
Definition: ncbistr.cpp:68
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
list< string > TKeywords
Definition: GB_block_.hpp:93
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
list< CRef< CSubSource > > TSubtype
Definition: BioSource_.hpp:145
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetSyn(void) const
synonyms for locus Check if a value has been assigned to Syn data member.
Definition: Gene_ref_.hpp:756
const TSyn & GetSyn(void) const
Get the Syn member data.
Definition: Gene_ref_.hpp:768
const TDesc & GetDesc(void) const
Get the Desc member data.
Definition: Gene_ref_.hpp:599
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
bool IsSetLocus(void) const
Official gene symbol Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
bool IsSetDesc(void) const
descriptive name Check if a value has been assigned to Desc data member.
Definition: Gene_ref_.hpp:587
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
list< CRef< COrgMod > > TMod
Definition: OrgName_.hpp:332
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetTaxname(void) const
preferred formal name Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_isolate
Definition: OrgMod_.hpp:100
list< string > TName
Definition: Prot_ref_.hpp:108
const TName & GetName(void) const
Get the Name member data.
Definition: Prot_ref_.hpp:378
bool IsSetName(void) const
protein name Check if a value has been assigned to Name data member.
Definition: Prot_ref_.hpp:366
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TGene & GetGene(void) const
Get the variant data.
const TProt & GetProt(void) const
Get the variant data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
const TSeg & GetSeg(void) const
Get the variant data.
Definition: Seq_ext_.cpp:114
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
const TPdb & GetPdb(void) const
Get the variant data.
Definition: Seqdesc_.cpp:538
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
bool IsSetInst(void) const
the sequence data Check if a value has been assigned to Inst data member.
Definition: Bioseq_.hpp:324
const TOrg & GetOrg(void) const
Get the variant data.
Definition: Seqdesc_.cpp:240
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eCompleteness_no_left
missing 5' or NH3 end
Definition: MolInfo_.hpp:158
@ eCompleteness_partial
partial but no details given
Definition: MolInfo_.hpp:157
@ eCompleteness_no_right
missing 3' or COOH end
Definition: MolInfo_.hpp:159
@ eCompleteness_no_ends
missing both ends
Definition: MolInfo_.hpp:160
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ eBiomol_pre_RNA
precursor RNA of any sort really
Definition: MolInfo_.hpp:102
@ eBiomol_cRNA
viral RNA genome copy intermediate
Definition: MolInfo_.hpp:111
@ eBiomol_snoRNA
small nucleolar RNA
Definition: MolInfo_.hpp:112
@ eBiomol_transcribed_RNA
transcribed RNA other than existing classes
Definition: MolInfo_.hpp:113
@ eBiomol_other_genetic
other genetic material
Definition: MolInfo_.hpp:109
@ e_Embl
EMBL specific information.
Definition: Seqdesc_.hpp:127
@ e_Org
if all from one organism
Definition: Seqdesc_.hpp:116
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
@ e_Pdb
PDB specific information.
Definition: Seqdesc_.hpp:131
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
const TCompound & GetCompound(void) const
Get the Compound member data.
Definition: PDB_block_.hpp:447
int len
static char * subname
Definition: mdb_load.c:26
const CharType(& source)[N]
Definition: pointer.h:1149
EIPRangeType t
Definition: ncbi_localip.c:101
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int isprint(Uchar c)
Definition: ncbictype.hpp:67
int islower(Uchar c)
Definition: ncbictype.hpp:66
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
static const char * suffix[]
Definition: pcregrep.c:408
static const char * prefix[]
Definition: pcregrep.c:405
static string s_GetProteinName(const CBioseq_Handle &handle, CScope &scope, CConstRef< CSeq_loc > &cds_loc, TGetTitleFlags flags)
Definition: seqtitle.cpp:1188
static string s_TitleFromChromosome(const CBioSource &source, const CMolInfo &mol_info)
Definition: seqtitle.cpp:1175
static const char * s_OrganelleName(CBioSource::TGenome genome, TOrganelleNameFlags flags)
Definition: seqtitle.cpp:982
static string s_TitleFromProtein(const CBioseq_Handle &handle, CScope &scope, string &organism, TGetTitleFlags flags)
Definition: seqtitle.cpp:1265
EOrganelleNameFlags
Definition: seqtitle.cpp:94
@ fON_with_plasmid
Definition: seqtitle.cpp:95
@ fON_virus
Definition: seqtitle.cpp:96
@ fON_wgs
Definition: seqtitle.cpp:97
static void s_FlyCG_PtoR(string &s)
Definition: seqtitle.cpp:1408
static string x_TitleFromChromosome(const CBioSource &source, const CMolInfo &mol_info)
Definition: seqtitle.cpp:1074
static string s_DescribeClones(const string &clone, bool pooled)
Definition: seqtitle.cpp:874
static string s_TitleFromSegment(const CBioseq_Handle &handle, CScope &scope, TGetTitleFlags flags)
Definition: seqtitle.cpp:1299
int TOrganelleNameFlags
Definition: seqtitle.cpp:99
static string s_TitleFromBioSource(const CBioSource &source, CMolInfo::TTech tech, const string &suffix=kEmptyStr, bool pooled_clones=false)
Definition: seqtitle.cpp:919
static bool s_EndsWithStrain(const string &name, const string &strain)
Definition: seqtitle.cpp:891
Definition: type.c:6
else result
Definition: token2.c:20
Modified on Wed Apr 24 14:12:22 2024 by modify_doxy.py rev. 669887