NCBI C++ ToolKit
gc_assembly_parser.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gc_assembly_parser.cpp 74559 2016-09-13 11:58:16Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:
27 * Aleksey Grichenko
28 *
29 * File Description:
30 * GC-Assembly parser used by CScope and CSeq_loc_Mapper to
31 * convert assemblies to seq-entries.
32 *
33 */
34 
35 #include <ncbi_pch.hpp>
36 
38 #include <objmgr/error_codes.hpp>
40 #include <objects/seq/seq__.hpp>
43 
44 
45 #define NCBI_USE_ERRCODE_X ObjMgr_GC_Assembly_Parser
46 
49 
50 
51 const char* CAssemblyParserException::GetErrCodeString(void) const
52 {
53  switch ( GetErrCode() ) {
54  case eUnsupported: return "eUnsupported";
55  case eOtherError: return "eOtherError";
56  default: return CException::GetErrCodeString();
57  }
58 }
59 
60 
61 /////////////////////////////////////////////////////////////////////////////
62 //
63 // CGC_Assembly_Parser
64 //
65 /////////////////////////////////////////////////////////////////////////////
66 
67 
// Tail of a constructor definition: stores `flags` in m_Flags, allocates a
// new CSeq_entry as m_TSE and parses `assembly` into it via
// x_ParseGCAssembly().
// NOTE(review): listing lines 68 and 73 are absent from this extract, so the
// first line of the signature and one body statement (between the m_TSE
// allocation and the parse call) are not visible here. Presumably this is
// CGC_Assembly_Parser(const CGC_Assembly&, TParserFlags) - confirm against
// the original file.
69  TParserFlags flags)
70  : m_Flags(flags)
71 {
72  m_TSE.Reset(new CSeq_entry);
74  x_ParseGCAssembly(assembly, m_TSE);
75 }
76 
77 
// Empty function body.
// NOTE(review): the line naming this definition (listing line 78) is missing
// from this extract; presumably it is the destructor ~CGC_Assembly_Parser -
// confirm against the original file.
79 {
80 }
81 
82 
// Initialize `entry` as a set and, when `parent` is not null, attach it to
// the parent's seq-set.
//   entry  - seq-entry being initialized (its Set variant is selected).
//   parent - enclosing seq-entry, or null for an entry without a parent.
// NOTE(review): listing lines 83 (first signature line) and 87 (one body
// statement) are absent from this extract.
84  CRef<CSeq_entry> parent)
85 {
// Level is one deeper than the parent's level, or 1 when there is no parent.
86  entry->SetSet().SetLevel(parent ? parent->GetSet().GetLevel() + 1 : 1);
88  entry->SetSet().SetSeq_set(); // mandatory member, must be initialized
89  if (parent) {
90  parent->SetSet().SetSeq_set().push_back(entry);
91  }
92 }
93 
94 
// Copy descriptors and annotations from `assm_desc` into `entry`.
// Descriptors are skipped when fIgnoreDescr is set in m_Flags; annotations
// are skipped when fIgnoreAnnots is set. Each copied object is a new
// instance filled with Assign(), then appended to `entry`.
// NOTE(review): listing lines 95 (first signature line) and 101 (where
// `desc_copy` is presumably declared) are absent from this extract.
96  CSeq_entry& entry)
97 {
98  if (assm_desc.IsSetDescr() && (m_Flags & fIgnoreDescr) == 0) {
99  const CSeq_descr& descr = assm_desc.GetDescr();
100  ITERATE(CSeq_descr::Tdata, desc, descr.Get()) {
102  desc_copy->Assign(**desc);
103  entry.SetDescr().Set().push_back(desc_copy);
104  }
105  }
106  if (assm_desc.IsSetAnnot() && (m_Flags & fIgnoreAnnots) == 0) {
107  ITERATE(CGC_AssemblyDesc::TAnnot, annot, assm_desc.GetAnnot()) {
108  CRef<CSeq_annot> annot_copy(new CSeq_annot);
109  annot_copy->Assign(**annot);
110  entry.SetAnnot().push_back(annot_copy);
111  }
112  }
113 }
114 
115 
// Convert a GC-Assembly into seq-entries attached to `parent_entry`.
//  - Assembly unit: copies the unit's description data into `parent_entry`,
//    then creates child entries for its molecules and for its "other
//    sequences", passing each contained sequence to x_ParseGCSequence()
//    with a null parent sequence and a null override id.
//  - Assembly set: copies the set's description data into `parent_entry`,
//    creates one child entry and recursively parses the primary assembly
//    and any additional assemblies into that child.
// An assembly that is neither a unit nor an assembly set is left untouched.
// NOTE(review): listing lines 116, 126, 134, 143 and 160 are absent from
// this extract, so the first signature line and the headers of the loops
// that introduce `it`, `its`, `seq` and `assm` are not visible here.
117  CRef<CSeq_entry> parent_entry)
118 {
119  if ( gc_assembly.IsUnit() ) {
120  const CGC_AssemblyUnit& unit = gc_assembly.GetUnit();
121  if (unit.IsSetDesc()) {
122  // Add annotations and descriptions.
123  x_CopyData(unit.GetDesc(), *parent_entry);
124  }
125  if ( unit.IsSetMols() ) {
// A new child entry is created for each `it` (presumably one per molecule
// in unit.GetMols() - the loop header is on a missing line).
127  CRef<CSeq_entry> entry(new CSeq_entry);
128  x_InitSeq_entry(entry, parent_entry);
129  const CGC_Replicon::TSequence& seq = (*it)->GetSequence();
130  if ( seq.IsSingle() ) {
131  x_ParseGCSequence(seq.GetSingle(), NULL, entry, null);
132  }
133  else {
135  x_ParseGCSequence(**its, NULL, entry, null);
136  }
137  }
138  }
139  }
140  if ( unit.IsSetOther_sequences() ) {
// All "other sequences" share a single child entry.
141  CRef<CSeq_entry> entry(new CSeq_entry);
142  x_InitSeq_entry(entry, parent_entry);
144  ITERATE(CGC_TaggedSequences::TSeqs, tseq, (*seq)->GetSeqs()) {
145  x_ParseGCSequence(**tseq, NULL, entry, null);
146  }
147  }
148  }
149  }
150  else if ( gc_assembly.IsAssembly_set() ) {
151  const CGC_AssemblySet& aset = gc_assembly.GetAssembly_set();
152  if (aset.IsSetDesc()) {
153  // Add annotations and descriptions.
154  x_CopyData(aset.GetDesc(), *parent_entry);
155  }
// Primary and additional assemblies are all parsed into the same child entry.
156  CRef<CSeq_entry> entry(new CSeq_entry);
157  x_InitSeq_entry(entry, parent_entry);
158  x_ParseGCAssembly(aset.GetPrimary_assembly(), entry);
159  if ( aset.IsSetMore_assemblies() ) {
161  aset.GetMore_assemblies()) {
162  x_ParseGCAssembly(**assm, entry);
163  }
164  }
165  }
166 }
167 
168 
// Convert one GC-Sequence (`gc_seq`) into seq-entry content.
//   parent_seq   - in the visible code only tested for null (the statement
//                  guarded by that test is on a missing listing line) and
//                  forwarded unchanged in the self-reference case.
//   parent_entry - seq-entry that receives the result.
//   override_id  - when not null, used as the sequence id instead of a copy
//                  of gc_seq.GetSeq_id().
// Steps visible here: pick the id, detect the "sequence references itself"
// special case, collect synonym ids, copy descriptors/annotations (subject
// to fIgnoreDescr / fIgnoreAnnots), add a bioseq via x_AddBioseq(), then
// process sub-sequences recursively.
// Throws CAssemblyParserException (eUnsupported) for a synonym type that is
// not handled by the switch below.
// NOTE(review): many listing lines are absent from this extract (169, 216,
// 221-222, 227, 230, 234, 243, 250, 265, 286, 303, 309): the first signature
// line, the synonym loop header, several `case` labels, the callee of the
// self-reference call and a few statements are not visible here.
170  const CGC_Sequence* parent_seq,
171  CRef<CSeq_entry> parent_entry,
172  CRef<CSeq_id> override_id)
173 {
174  CRef<CSeq_id> id(override_id);
175  if ( !id ) {
176  id.Reset(new CSeq_id);
177  id->Assign(gc_seq.GetSeq_id());
178  }
179 
180  // Special case - structure contains just one (whole) sequence and
181  // the same sequence is mentioned in the synonyms. Must skip this
182  // sequence and use the part instead.
183  CSeq_id_Handle struct_syn;
184  if ( gc_seq.IsSetStructure() ) {
185  if (gc_seq.GetStructure().Get().size() == 1) {
186  const CDelta_seq& delta = *gc_seq.GetStructure().Get().front();
187  if ( delta.IsLoc() ) {
188  const CSeq_loc& delta_loc = delta.GetLoc();
// Only a whole location, or an interval starting at 0, sets struct_syn.
189  switch (delta_loc.Which()) {
190  case CSeq_loc::e_Whole:
191  struct_syn = CSeq_id_Handle::GetHandle(delta_loc.GetWhole());
192  break;
193  case CSeq_loc::e_Int:
194  if (delta_loc.GetInt().GetFrom() == 0) {
195  struct_syn = CSeq_id_Handle::GetHandle(delta_loc.GetInt().GetId());
196  }
197  break;
198  default:
199  break;
200  }
201  }
202  }
203  }
204  // Same as above, but structure is missing and sequences contain just one item.
205  else if (gc_seq.IsSetSequences() && gc_seq.GetSequences().size() == 1) {
206  const CGC_TaggedSequences& tagged_seq = *gc_seq.GetSequences().front();
207  if (tagged_seq.GetSeqs().size() == 1) {
208  struct_syn = CSeq_id_Handle::GetHandle(tagged_seq.GetSeqs().front()->GetSeq_id());
209  }
210  }
211 
212  // Add synonyms if any.
213  TSeqIds synonyms;
// The primary id is always part of the synonym set.
214  synonyms.insert(CSeq_id_Handle::GetHandle(*id));
215  if ( gc_seq.IsSetSeq_id_synonyms() ) {
217  // Add conversion for each synonym which can be used
218  // as a source id.
219  const CGC_TypedSeqId& it_id = **it;
220  switch ( it_id.Which() ) {
// Genbank synonym (the `case` label is on a missing listing line).
223  if ( it_id.GetGenbank().IsSetGi() ) {
224  synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetGenbank().GetGi()));
225  }
226  if ( it_id.GetGenbank().IsSetGpipe() ) {
228  }
229  break;
231  {
232  // If some of the ids is used in the structure (see above),
233  // ignore all refseq ids.
235  if ( it_id.GetRefseq().IsSetGi() ) {
236  synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetRefseq().GetGi()));
237  }
238  if (it_id.GetRefseq().IsSetGpipe()) {
239  synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetRefseq().GetGpipe()));
240  }
241  break;
242  }
244  // Ignore private local ids.
// (A private id is skipped only when fIgnoreLocalIds is set and it is local.)
245  if ((m_Flags & fIgnoreLocalIds) == 0 ||
246  !it_id.GetPrivate().IsLocal()) {
247  synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetPrivate()));
248  }
249  break;
// External id: skipped when fIgnoreExternalIds is set, or when it is local
// and fIgnoreLocalIds is set.
251  if ((m_Flags & fIgnoreExternalIds) == 0 &&
252  ((m_Flags & fIgnoreLocalIds) == 0 ||
253  !it_id.GetExternal().GetId().IsLocal())) {
254  synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetExternal().GetId()));
255  }
256  break;
257  default:
258  NCBI_THROW(CAssemblyParserException, eUnsupported,
259  "Unsupported alias type in GC-Sequence synonyms");
260  break;
261  }
262  }
263  // The sequence is referencing itself?
// If so, the single nested sequence is processed in place of this one, with
// this sequence's id passed as the last argument, and the function returns.
// The callee name is on a missing listing line; presumably a recursive
// x_ParseGCSequence() call.
// NOTE(review): when struct_syn was set in the structure branch above,
// nothing visible here checks that GetSequences() is set and non-empty
// before front() is used - confirm against the original file.
264  if (synonyms.find(struct_syn) != synonyms.end()) {
266  *gc_seq.GetSequences().front()->GetSeqs().front(),
267  parent_seq,
268  parent_entry,
269  id);
270  return;
271  }
272  }
273 
// A sequence with sub-sequences gets its own seq-entry under parent_entry;
// otherwise its data goes directly into parent_entry.
274  CRef<CSeq_entry> entry;
275  if ( gc_seq.IsSetSequences() ) {
276  entry.Reset(new CSeq_entry);
277  x_InitSeq_entry(entry, parent_entry);
278  }
279  else {
280  entry = parent_entry;
281  }
282 
283  if (gc_seq.IsSetDescr() && (m_Flags & fIgnoreDescr) == 0) {
284  const CSeq_descr& descr = gc_seq.GetDescr();
285  ITERATE(CSeq_descr::Tdata, desc, descr.Get()) {
287  desc_copy->Assign(**desc);
288  entry->SetDescr().Set().push_back(desc_copy);
289  }
290  }
291  if (gc_seq.IsSetAnnot() && (m_Flags & fIgnoreAnnots) == 0) {
292  ITERATE(CGC_Sequence::TAnnot, annot, gc_seq.GetAnnot()) {
293  CRef<CSeq_annot> annot_copy(new CSeq_annot);
294  annot_copy->Assign(**annot);
295  entry->SetAnnot().push_back(annot_copy);
296  }
297  }
298 
299  // Create virtual bioseq and use it to initialize the mapper
300  x_AddBioseq(entry, synonyms, gc_seq);
301  if ( !parent_seq ) {
302  // Save top-level sequences.
304  }
305 
306  if ( gc_seq.IsSetSequences() ) {
// All sub-sequences are placed under one additional nested entry.
307  CRef<CSeq_entry> sub_entry(new CSeq_entry);
308  x_InitSeq_entry(sub_entry, entry);
310  ITERATE(CGC_TaggedSequences::TSeqs, tseq, (*seq)->GetSeqs()) {
311  // To create a sub-level of the existing seq-map we need
312  // both structure at the current level and 'placed' state
313  // on the child sequences. If this is not true, iterate
314  // sub-sequences but treat them as top-level sequences rather
315  // than segments.
316  const CGC_Sequence* parent = 0;
317  if (gc_seq.IsSetStructure() &&
318  (*seq)->GetState() == CGC_TaggedSequences::eState_placed) {
319  parent = &gc_seq;
320  }
321  x_ParseGCSequence(**tseq, parent, sub_entry, null);
322  }
323  }
324  }
325 }
326 
327 
// Build a bioseq for `gc_seq` and append it, wrapped in a new seq-entry, to
// the seq-set of `parent_entry`.
//   synonyms - ids to put on the bioseq; each one is copied into a new
//              CSeq_id.
//   gc_seq   - source of the length (when available) and of the delta
//              structure (when set).
// The molecule type is always set to eMol_na. The representation is delta
// when gc_seq has a structure, virtual otherwise.
// With fSkipDuplicates set, the function returns without adding anything as
// soon as one synonym is found to be already present in m_AllSeqs.
// NOTE(review): synonyms examined before that early return have already
// been inserted into m_AllSeqs by the same check - confirm this is intended.
// NOTE(review): listing line 328 (first signature line) is absent from this
// extract.
329  const TSeqIds& synonyms,
330  const CGC_Sequence& gc_seq)
331 {
332  CRef<CBioseq> bioseq(new CBioseq);
333  ITERATE(TSeqIds, syn, synonyms) {
334  // Do not add bioseqs with duplicate ids.
335  if ((m_Flags & fSkipDuplicates) != 0 &&
336  !m_AllSeqs.insert(*syn).second ) {
337  return;
338  }
339 
340  CRef<CSeq_id> syn_id(new CSeq_id);
341  syn_id->Assign(*syn->GetSeqId());
342  bioseq->SetId().push_back(syn_id);
343  }
344 
345  bioseq->SetInst().SetMol(CSeq_inst::eMol_na);
346  if ( gc_seq.CanGetLength() ) {
347  bioseq->SetInst().SetLength(gc_seq.GetLength());
348  }
349  if ( gc_seq.IsSetStructure() ) {
350  // Create delta sequence
351  bioseq->SetInst().SetRepr(CSeq_inst::eRepr_delta);
352  // const_cast should be safe here - we are not going to modify data
353  bioseq->SetInst().SetExt().SetDelta(
354  const_cast<CDelta_ext&>(gc_seq.GetStructure()));
355  }
356  else {
357  // Create virtual bioseq without length/data.
// (A length may still have been set above if gc_seq.CanGetLength() was true.)
358  bioseq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
359  }
360  CRef<CSeq_entry> entry(new CSeq_entry);
361  entry->SetSeq(*bioseq);
362  parent_entry->SetSet().SetSeq_set().push_back(entry);
363 }
364 
365 
const TSequences & GetSequences(void) const
Get the Sequences member data.
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:167
list< CRef< CGC_TypedSeqId > > TSeq_id_synonyms
list< CRef< CSeq_annot > > TAnnot
TSeqPos GetLength() const
return the length of this sequence.
bool IsSetOther_sequences(void) const
On primary assembly-unit: here will be the unplaced sequences On alt-loci: list of sequences aligned/...
bool IsSetAnnot(void) const
Feature annotation; Contains Pseudo Autosomal regions on chromosomes and scaffolds in the following f...
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
const TSeqs & GetSeqs(void) const
Get the Seqs member data.
const TSet & GetSet(void) const
Get the variant data.
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:73
void SetLevel(TLevel value)
Assign a value to Level data member.
const TPrivate & GetPrivate(void) const
Get the variant data.
Do not add descriptions to seq-entries and bioseqs.
bool IsSetDescr(void) const
Various attributes assigned at this level: biosrc, comments, publications...
list< CRef< CGC_Sequence > > TSeqs
const TId & GetId(void) const
Get the Id member data.
bool IsSetSequences(void) const
placed: populated both on chromosome and scaffold levels unlocalized: populated on chromosome level C...
CDelta_seq –.
Definition: Delta_seq.hpp:65
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
const TDescr & GetDescr(void) const
Get the Descr member data.
const TSeq_id & GetSeq_id(void) const
Get the Seq_id member data.
const TOther_sequences & GetOther_sequences(void) const
Get the Other_sequences member data.
const TWhole & GetWhole(void) const
Get the variant data.
Definition: Seq_loc_.cpp:172
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const TDesc & GetDesc(void) const
Get the Desc member data.
#define NULL
Definition: ncbistd.hpp:225
const_iterator end() const
Definition: set.hpp:136
void SetDescr(CSeq_descr &value)
Definition: Seq_entry.cpp:134
.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:54
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:893
const TSeq_id_synonyms & GetSeq_id_synonyms(void) const
Get the Seq_id_synonyms member data.
CGC_AssemblyDesc –.
list< CRef< CGC_Sequence > > TSet
segmented sequence + parts
Do not add local private and external ids to bioseqs.
void x_InitSeq_entry(CRef< CSeq_entry > entry, CRef< CSeq_entry > parent)
CRef< CSeq_entry > m_TSE
CGC_TypedSeqId –.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:101
bool IsSetDesc(void) const
descriptors live in a shared data block Check if a value has been assigned to Desc data member...
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TMore_assemblies & GetMore_assemblies(void) const
Get the More_assemblies member data.
virtual ~CGC_Assembly_Parser(void)
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Seq_id_.hpp:775
void x_CopyData(const CGC_AssemblyDesc &assm_desc, CSeq_entry &entry)
void x_ParseGCAssembly(const CGC_Assembly &gc_assembly, CRef< CSeq_entry > parent_entry)
void x_ParseGCSequence(const CGC_Sequence &gc_seq, const CGC_Sequence *parent_seq, CRef< CSeq_entry > parent_entry, CRef< CSeq_id > override_id)
const TDescr & GetDescr(void) const
Get the Descr member data.
TFrom GetFrom(void) const
Get the From member data.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TExternal & GetExternal(void) const
Get the variant data.
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
const TGenbank & GetGenbank(void) const
Get the variant data.
the sequence(s) representing this molecule in the case of 2L and 2R - the molecule is represented by ...
CGC_Assembly_Parser(const CGC_Assembly &assembly, TParserFlags flags=fDefault)
Parse the assembly, convert it to seq-entry, collect additional information (top-level sequences etc)...
CGC_TaggedSequences –.
Int4 delta(size_t dimension_, const Int4 *score_)
E_Choice Which(void) const
Which variant is currently selected.
const TSingle & GetSingle(void) const
Get the variant data.
whole sequence
Definition: Seq_loc_.hpp:100
const TStructure & GetStructure(void) const
Get the Structure member data.
bool IsSetStructure(void) const
locations of ordered scaffolds/components Check if a value has been assigned to Structure data member...
const TGpipe & GetGpipe(void) const
Get the Gpipe member data.
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:286
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
list< CRef< CGC_TaggedSequences > > TSequences
bool IsSetDesc(void) const
descriptors live in a shared data block Check if a value has been assigned to Desc data member...
const TUnit & GetUnit(void) const
Get the variant data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_loc_.hpp:475
exist only within a replicon. placed sequences on higher sequence
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
const TMols & GetMols(void) const
Get the Mols member data.
const TPrimary_assembly & GetPrimary_assembly(void) const
Get the Primary_assembly member data.
Definition: Seq_entry.hpp:55
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
void SetClass(TClass value)
Assign a value to Class data member.
Skip duplicate sequences (all synonyms are checked).
const TAssembly_set & GetAssembly_set(void) const
Get the variant data.
void x_AddBioseq(CRef< CSeq_entry > parent_entry, const TSeqIds &synonyms, const CGC_Sequence &gc_seq)
bool IsUnit(void) const
Check if variant Unit is selected.
const TLoc & GetLoc(void) const
Get the variant data.
Definition: Delta_seq_.cpp:102
Do not add external ids to bioseqs.
bool IsSetGpipe(void) const
the gpipe accession Check if a value has been assigned to Gpipe data member.
const TDesc & GetDesc(void) const
Get the Desc member data.
SQLRETURN desc_copy(TDS_DESC *dest, TDS_DESC *src)
Definition: descriptor.c:160
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
bool IsSetGi(void) const
optional since not all sequences have GIs Similarity: relationship between this synonym to main seqid...
list< CRef< CGC_Assembly > > TMore_assemblies
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:278
const TPublic & GetPublic(void) const
Get the Public member data.
bool CanGetLength() const
Is the length statistic available?
bool IsAssembly_set(void) const
Check if variant Assembly_set is selected.
TAnnot & SetAnnot(void)
Definition: Seq_entry.cpp:195
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:70
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string...
Definition: ncbiexpt.hpp:546
bool IsSetDescr(void) const
Various attributes assigned at this level: biosrc, comments, publications...
bool IsSetSeq_id_synonyms(void) const
Other known identifiers: Local / gpipe-satellite / genbank / refseq Check if a value has been assigne...
const TGi & GetGi(void) const
Get the Gi member data.
bool IsLoc(void) const
Check if variant Loc is selected.
Definition: Delta_seq_.hpp:257
#define const
Definition: zconf.h:217
just a nucleic acid
Definition: Seq_inst_.hpp:113
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
const TId & GetId(void) const
Get the Id member data.
bool IsSingle(void) const
Check if variant Single is selected.
bool IsSetMore_assemblies(void) const
Check if a value has been assigned to More_assemblies data member.
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:165
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Do not add annotations to seq-entries and bioseqs.
TLevel GetLevel(void) const
Get the Level member data.
list< CRef< CGC_Replicon > > TMols
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Seq-loc and seq-align mapper exceptions.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:98
const TRefseq & GetRefseq(void) const
Get the variant data.
bool IsSetMols(void) const
collections of molecules for this assembly Check if a value has been assigned to Mols data member...
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:756
bool IsSetAnnot(void) const
in alt-loci units - contain alignment of this sequence to the primary unit Check if a value has been ...
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:426
list< CRef< CSeq_annot > > TAnnot
Modified on Thu Mar 30 17:14:33 2017 by modify_doxy.py rev. 506947