NCBI C++ ToolKit
seq_loc_from_string.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_loc_from_string.cpp 81846 2018-04-10 12:21:40Z bollin $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mati Shomrat, Michael Kornbluh
27  *
28  * File Description:
29  * Utilities for converting string to CSeq_loc.
30  *
31  * ===========================================================================
32  */
33 #include <ncbi_pch.hpp>
34 
35 #include <corelib/ncbistr.hpp>
36 
38 
43 
44 #include <util/static_map.hpp>
45 
48 
49 // This anonymous namespace holds types and functions that are
50 // privately used by GetSeqLocFromString
51 namespace {
52 
53  class CLexToken : public CObject
54  {
55  public:
56  CLexToken(unsigned int token_type) { m_TokenType = token_type; m_HasError = false; }
57  virtual ~CLexToken() {}
58  unsigned int GetTokenType() { return m_TokenType; }
59  bool HasError () { return m_HasError; }
60 
61  virtual unsigned int GetInt() { return 0; }
62  virtual string GetString() { return ""; }
63 
64  virtual CRef<CSeq_loc> GetLocation(CSeq_id*, CGetSeqLocFromStringHelper*) { return CRef<CSeq_loc>(NULL); }
65 
66  enum E_TokenType {
67  e_Int = 0,
68  e_String,
69  e_ParenPair,
70  e_Join,
71  e_Order,
72  e_Complement,
73  e_DotDot,
74  e_LeftPartial,
75  e_RightPartial,
76  e_Comma,
77  e_Accession
78  };
79 
80  protected:
81  unsigned int m_TokenType;
82  bool m_HasError;
83  };
84 
85  typedef vector< CRef<CLexToken> > TLexTokenArray;
86 
87  bool s_ParseLex (string text, TLexTokenArray &token_list);
88 
89  class CLexTokenString : public CLexToken
90  {
91  public:
92  CLexTokenString (string token_data);
93  virtual ~CLexTokenString();
94  virtual string GetString() { return m_TokenData; };
95  private:
96  string m_TokenData;
97  };
98 
99  CLexTokenString::CLexTokenString(string token_data) : CLexToken (e_String)
100  {
101  m_TokenData = token_data;
102  }
103 
104  CLexTokenString::~CLexTokenString()
105  {
106  }
107 
108  class CLexTokenInt : public CLexToken
109  {
110  public:
111  CLexTokenInt (unsigned int token_data);
112  virtual ~CLexTokenInt ();
113  virtual unsigned int GetInt() { return m_TokenData; };
114  private:
115  unsigned int m_TokenData;
116  };
117 
118  CLexTokenInt::CLexTokenInt(unsigned int token_data) : CLexToken (e_Int)
119  {
120  m_TokenData = token_data;
121  }
122 
123  CLexTokenInt::~CLexTokenInt()
124  {
125  }
126 
127  class CLexTokenAccession : public CLexToken {
128  public:
129  CLexTokenAccession (const string &token_data);
130  virtual ~CLexTokenAccession();
131  virtual string GetString(void) { return m_TokenData; };
132  private:
133  string m_TokenData;
134  };
135 
136  CLexTokenAccession::CLexTokenAccession( const string &token_data )
137  : CLexToken(e_Accession), m_TokenData(token_data)
138  {
139  }
140 
141  CLexTokenAccession::~CLexTokenAccession()
142  {
143  }
144 
145  class CLexTokenParenPair : public CLexToken
146  {
147  public:
148  CLexTokenParenPair (unsigned int token_type, string between_text);
149  virtual ~CLexTokenParenPair();
150 
151  virtual CRef<CSeq_loc> GetLocation(CSeq_id *id, CGetSeqLocFromStringHelper* helper);
152 
153  static CRef<CSeq_loc> ReadLocFromTokenList (TLexTokenArray token_list, CSeq_id *id, CGetSeqLocFromStringHelper* helper);
154 
155  private:
156  TLexTokenArray m_TokenList;
157  };
158 
159  CLexTokenParenPair::CLexTokenParenPair(unsigned int token_type, string between_text) : CLexToken (token_type)
160  {
161  m_TokenList.clear();
162  m_HasError = ! s_ParseLex (between_text, m_TokenList);
163  }
164 
165  CLexTokenParenPair::~CLexTokenParenPair()
166  {
167  }
168 
169  CRef<CSeq_loc> CLexTokenParenPair::GetLocation(CSeq_id *id, CGetSeqLocFromStringHelper* helper)
170  {
171  CRef<CSeq_loc> retval = ReadLocFromTokenList(m_TokenList, id, helper);
172 
173  if (m_TokenType == e_Complement) {
174  retval = helper->GetRevComplement(*retval);
175  }
176  return retval;
177  }
178 
179  CRef<CSeq_loc> CLexTokenParenPair::ReadLocFromTokenList (TLexTokenArray token_list, CSeq_id *this_id, CGetSeqLocFromStringHelper* helper)
180  {
181  CRef<CSeq_id> id( this_id );
182 
183  CRef<CSeq_loc> retval;
184  CRef<CSeq_loc> add;
185  unsigned int list_pos;
186  TLexTokenArray before_comma_list;
187  vector <unsigned int> comma_pos;
188 
189  retval.Reset();
190  if (token_list.size() < 1) {
191  return retval;
192  }
193 
194  comma_pos.clear();
195  for (list_pos = 0; list_pos < token_list.size(); list_pos++) {
196  if (token_list[list_pos]->GetTokenType() == CLexToken::e_Comma) {
197  comma_pos.push_back (list_pos);
198  }
199  }
200 
201  if (comma_pos.size() > 0) {
202  retval = new CSeq_loc ();
203  list_pos = 0;
204  for (unsigned int k = 0; k < comma_pos.size(); k++) {
205  before_comma_list.clear();
206  while (list_pos < comma_pos[k]) {
207  before_comma_list.push_back (token_list[list_pos]);
208  list_pos++;
209  }
210  add = ReadLocFromTokenList(before_comma_list, id, helper);
211  if (add == NULL) {
212  retval.Reset();
213  return retval;
214  } else {
215  if( retval->Which() == CSeq_loc::e_not_set ) {
216  retval.Reset( new CSeq_loc );
217  retval->Assign( *add );
218  } else {
219  retval = helper->Seq_loc_Add (*retval, *add, 0);
220  }
221  }
222  // skip over comma
223  list_pos ++;
224  }
225  before_comma_list.clear();
226  while (list_pos < token_list.size()) {
227  before_comma_list.push_back (token_list[list_pos]);
228  list_pos++;
229  }
230  add = ReadLocFromTokenList(before_comma_list, id, helper);
231  if( retval->Which() == CSeq_loc::e_not_set ) {
232  retval.Reset( new CSeq_loc );
233  retval->Assign( *add );
234  } else {
235  retval = helper->Seq_loc_Add (*retval, *add, 0);
236  }
237  return retval;
238  } else {
239 
240  switch (token_list[0]->GetTokenType()) {
241  case CLexToken::e_Accession:
242  id = new CSeq_id( token_list[0]->GetString() );
243  token_list.erase( token_list.begin() ); // inefficient
244  // !!!!!FALL-THROUGH!!!!!
245  case CLexToken::e_Int:
246  if (token_list.size() == 1) {
247  // note - subtract one from the int read, because display is 1-based
248  retval = new CSeq_loc (*id, token_list[0]->GetInt() - 1);
249  } else if (token_list[1]->GetTokenType() == CLexToken::e_DotDot) {
250  if (token_list.size() < 3 || token_list[2]->GetTokenType() != CLexToken::e_Int) {
251  retval.Reset();
252  return retval;
253  }
254  if (token_list.size() > 4) {
255  retval.Reset();
256  return retval;
257  }
258  if (token_list.size() == 4 && token_list[3]->GetTokenType() != CLexToken::e_RightPartial) {
259  retval.Reset();
260  return retval;
261  }
262  // note - subtract one from the int read, because display is 1-based
263  retval = new CSeq_loc (*id, token_list[0]->GetInt() - 1, token_list[2]->GetInt() - 1);
264  // if "from" is bigger than "to", then minus strand is implied
265  if (retval && retval->IsInt() &&
266  retval->GetInt().GetFrom() > retval->GetInt().GetTo()) {
267  retval->SetStrand(eNa_strand_minus);
268  TSeqPos swap = retval->GetInt().GetFrom();
269  retval->SetInt().SetFrom(retval->SetInt().SetTo());
270  retval->SetInt().SetTo(swap);
271  }
272 
273  if (token_list.size() == 4) {
274  retval->SetPartialStop(true, eExtreme_Positional);
275  }
276  }
277  break;
278  case CLexToken::e_LeftPartial:
279  if (token_list.size() < 2) {
280  retval.Reset();
281  return retval;
282  } else if (token_list.size() == 2) {
283  // note - subtract one from the int read, because display is 1-based
284  retval = new CSeq_loc (*id, token_list[1]->GetInt() - 1);
285  retval->SetPartialStart(true, eExtreme_Positional);
286  } else if (token_list[2]->GetTokenType() == CLexToken::e_DotDot) {
287  if (token_list.size() < 4 || token_list[3]->GetTokenType() != CLexToken::e_Int) {
288  retval.Reset();
289  return retval;
290  }
291  if (token_list.size() > 5) {
292  retval.Reset();
293  return retval;
294  }
295  if (token_list.size() == 5 && token_list[4]->GetTokenType() != CLexToken::e_RightPartial) {
296  retval.Reset();
297  return retval;
298  }
299  // note - subtract one from the int read, because display is 1-based
300  retval = new CSeq_loc (*id, token_list[1]->GetInt() - 1, token_list[3]->GetInt() - 1);
301  retval->SetPartialStart(true, eExtreme_Positional);
302  if (token_list.size() == 5) {
303  retval->SetPartialStop(true, eExtreme_Positional);
304  }
305  }
306  break;
307 
308  case CLexToken::e_ParenPair:
309  case CLexToken::e_Join:
310  case CLexToken::e_Order:
311  case CLexToken::e_Complement:
312  if (token_list.size() > 1) {
313  retval.Reset();
314  return retval;
315  }
316  retval = token_list[0]->GetLocation(id, helper);
317  break;
318  case CLexToken::e_String:
319  break;
320  case CLexToken::e_DotDot:
321  break;
322  case CLexToken::e_RightPartial:
323  break;
324  case CLexToken::e_Comma:
325  break;
326  default:
327  break;
328  }
329  }
330  return retval;
331  }
332 
// Removes every whitespace character (per isspace) and every '~' from str,
// in place.  The remaining characters keep their relative order.
void s_RemoveWhiteSpace(string& str)
{
    string kept;
    kept.reserve(str.length());

    // Index with the string's own size type so the loop bound cannot be
    // truncated, and append single characters instead of substr temporaries.
    for (string::size_type pos = 0; pos < str.length(); ++pos) {
        const char ch = str[pos];
        if (!isspace(static_cast<unsigned char>(ch)) && ch != '~') {
            kept += ch;
        }
    }

    str.swap(kept);
}
346 
// Measures a balanced parenthesised group at the start of text.
//
// Returns the number of characters from the opening '(' through its matching
// ')' inclusive.  Returns 0 when text does not start with '(', when the
// parentheses are unbalanced, or when a double-quoted section is never closed.
// Parentheses inside double quotes are ignored.
size_t s_GetParenLen (string text)
{
    if (text.empty() || text[0] != '(') {
        return 0;
    }

    string::size_type offset = 1;
    unsigned int depth = 1;

    // Inspect characters directly rather than copying the tail of the string
    // at every step.
    while (offset != text.length() && depth > 0) {
        switch (text[offset]) {
        case '(':
            ++depth;
            ++offset;
            break;
        case ')':
            --depth;
            ++offset;
            break;
        case '\"': {
            // skip quoted text
            const string::size_type closing_quote = text.find('\"', offset + 1);
            if (closing_quote == string::npos) {
                return 0;
            }
            offset = closing_quote + 1;
            break;
        }
        default:
            ++offset;
            break;
        }
    }

    if (depth > 0) {
        return 0;
    }
    return offset;
}
386 
387  bool s_ParseLex (string text, TLexTokenArray &token_list)
388  {
389  char ch;
390  bool retval = true;
391  string::size_type paren_len, offset = 0, end_pos;
392 
393  if (NStr::IsBlank(text)) {
394  return false;
395  }
396 
397  s_RemoveWhiteSpace(text);
398 
399  while (offset < text.length() && retval) {
400  ch = text.c_str()[offset];
401  switch ( ch) {
402 
403  case '\"':
404  // skip to end of quotation
405  end_pos = NStr::Find(text, "\"", offset + 1);
406  if (end_pos == string::npos) {
407  retval = false;
408  } else {
409  token_list.push_back( CRef<CLexToken>(new CLexTokenString (text.substr (offset, end_pos - offset + 1))));
410  offset = end_pos + 1;
411  }
412  break;
413  /*------
414  * NUMBER
415  *------*/
416  case '0': case '1': case '2': case '3': case '4':
417  case '5': case '6': case '7': case '8': case '9':
418  end_pos = offset + 1;
419  while (end_pos < text.length() && isdigit (text.c_str()[end_pos])) {
420  end_pos ++;
421  }
422  token_list.push_back (CRef<CLexToken>(new CLexTokenInt (NStr::StringToInt(text.substr(offset, end_pos - offset)))));
423  offset = end_pos;
424  break;
425  // parentheses
426  case '(':
427  paren_len = s_GetParenLen(text.substr(offset));
428  if (paren_len == 0) {
429  retval = false;
430  } else {
431  token_list.push_back (CRef<CLexToken>(new CLexTokenParenPair (CLexToken::e_ParenPair, text.substr(offset + 1, paren_len - 2))));
432  if (token_list[token_list.size() - 1]->HasError()) {
433  retval = false;
434  }
435  offset += paren_len;
436  }
437  break;
438  /*------
439  * JOIN
440  *------*/
441  case 'j':
442  if (NStr::EqualNocase (text.substr(offset, 4), "join")) {
443  offset += 4;
444  paren_len = s_GetParenLen(text.substr(offset));
445  if (paren_len == 0) {
446  retval = false;
447  } else {
448  token_list.push_back (CRef<CLexToken>(new CLexTokenParenPair (CLexToken::e_Join, text.substr(offset + 1, paren_len - 2))));
449  }
450  offset += paren_len;
451  } else {
452  retval = false;
453  }
454  break;
455 
456  /*------
457  * ORDER
458  *------*/
459  case 'o':
460  if (NStr::EqualNocase (text.substr(offset, 5), "order")) {
461  offset += 5;
462  paren_len = s_GetParenLen(text.substr(offset));
463  if (paren_len == 0) {
464  retval = false;
465  } else {
466  token_list.push_back (CRef<CLexToken>(new CLexTokenParenPair (CLexToken::e_Order, text.substr(offset + 1, paren_len - 2))));
467  }
468  } else {
469  retval = false;
470  }
471  break;
472  /*------
473  * COMPLEMENT
474  *------*/
475  case 'c':
476  if (NStr::EqualNocase (text.substr(offset, 10), "complement")) {
477  offset += 10;
478  paren_len = s_GetParenLen(text.substr(offset));
479  if (paren_len == 0) {
480  retval = false;
481  } else {
482  token_list.push_back (CRef<CLexToken>(new CLexTokenParenPair (CLexToken::e_Complement, text.substr(offset + 1, paren_len - 2))));
483  }
484  offset += paren_len;
485  } else {
486  retval = false;
487  }
488  break;
489  case '-':
490  token_list.push_back (CRef<CLexToken>(new CLexToken (CLexToken::e_DotDot)));
491  offset++;
492  break;
493  case '.':
494  if (NStr::Equal(text.substr(offset, 2), "..")) {
495  token_list.push_back (CRef<CLexToken>(new CLexToken (CLexToken::e_DotDot)));
496  offset += 2;
497  } else {
498  retval = false;
499  }
500  break;
501  case '>':
502  token_list.push_back (CRef<CLexToken>(new CLexToken (CLexToken::e_RightPartial)));
503  offset ++;
504  break;
505  case '<':
506  token_list.push_back (CRef<CLexToken>(new CLexToken (CLexToken::e_LeftPartial)));
507  offset ++;
508  break;
509  case ';':
510  case ',':
511  token_list.push_back (CRef<CLexToken>(new CLexToken (CLexToken::e_Comma)));
512  offset ++;
513  break;
514  case 't' :
515  if (NStr::Equal(text.substr(offset, 2), "to")) {
516  token_list.push_back (CRef<CLexToken>(new CLexToken (CLexToken::e_DotDot)));
517  offset += 2;
518  } else {
519  retval = false;
520  }
521  break;
522  default:
523  // ACCESSION
524  // (accessions start with a capital letter, then numbers then
525  // an optional version prefix, then a colon)
526  if( isupper(ch) ) {
527  end_pos = offset + 1;
528  while (end_pos < text.length() && isupper (text.c_str()[end_pos])) {
529  end_pos++;
530  }
531  while (end_pos < text.length() && isdigit (text.c_str()[end_pos])) {
532  end_pos++;
533  }
534  if( text.c_str()[end_pos] == '.' ) {
535  ++end_pos;
536  while (end_pos < text.length() && isdigit (text.c_str()[end_pos])) {
537  end_pos++;
538  }
539  }
540  if( text.c_str()[end_pos] != ':' ) {
541  retval = false;
542  }
543  ++end_pos;
544  token_list.push_back (CRef<CLexToken>(new CLexTokenAccession (text.substr(offset, end_pos - offset - 1)))); // "- 1" to ignore colon
545  offset = end_pos;
546  } else {
547  retval = false;
548  }
549  break;
550  }
551  }
552 
553  return retval;
554  }
555 }
556 
558 {
559  // do nothing
560 }
561 
564 {
566  return CRef<CSeq_loc>(GetReverseComplement( loc, &helper ));
567 }
568 
571  const CSeq_loc& loc1,
572  const CSeq_loc& loc2,
574 {
575  // No ISynonymMapper due to lack of a CScope
576  return loc1.Add(loc2, flags, NULL);
577 }
578 
580  const string &text, const CSeq_id *id, CGetSeqLocFromStringHelper *helper)
581 {
582  CRef<CSeq_loc> retval(NULL);
583  TLexTokenArray token_list;
584 
585  token_list.clear();
586 
587  CRef<CSeq_id> this_id(new CSeq_id());
588  this_id->Assign(*id);
589 
590 
591  if (s_ParseLex (text, token_list)) {
592  retval = CLexTokenParenPair::ReadLocFromTokenList (token_list, this_id, helper);
593  }
594 
595  return retval;
596 }
597 
virtual CRef< CSeq_loc > Seq_loc_Add(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags)
CSeq_loc * GetReverseComplement(const CSeq_loc &loc, CReverseComplementHelper *helper)
Get reverse complement of the seq-loc (?).
TTo GetTo(void) const
Get the To member data.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:102
int offset
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:73
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:855
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3859
#define NULL
Definition: ncbistd.hpp:225
virtual CRef< CSeq_loc > GetRevComplement(const CSeq_loc &loc)
User-defined methods of the data storage class.
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:612
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:101
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2786
void SetPartialStart(bool val, ESeqLocExtremes ext)
set / remove e_Lim fuzz on start or stop (lt/gt - indicating partial interval)
Definition: Seq_loc.cpp:3264
TFrom GetFrom(void) const
Get the From member data.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5141
int isupper(Uchar c)
Definition: ncbictype.hpp:70
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
numerical value
Definition: Na_strand.hpp:63
CRef< CSeq_loc > GetSeqLocFromString(const string &text, const CSeq_id *id, CGetSeqLocFromStringHelper *helper)
int TOpFlags
Definition: Seq_loc.hpp:325
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_loc_.hpp:475
void SetInt(TInt &v)
Definition: Seq_loc.hpp:968
static uch flags
CObject –.
Definition: ncbiobj.hpp:180
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:278
static bool EqualNocase(const CTempString str, SIZE_TYPE pos, SIZE_TYPE n, const char *pattern)
Case-insensitive equality of a substring with a pattern.
Definition: ncbistr.hpp:5087
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:70
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
int isspace(Uchar c)
Definition: ncbictype.hpp:69
void SetPartialStop(bool val, ESeqLocExtremes ext)
Definition: Seq_loc.cpp:3297
No variant selected.
Definition: Seq_loc_.hpp:97
static bool Equal(const CTempString str, SIZE_TYPE pos, SIZE_TYPE n, const char *pattern, ECase use_case=eCase)
Test for equality of a substring with a pattern.
Definition: ncbistr.hpp:5113
Wraps up any functionality needed that might be outside the scope of this library.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:98
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
Definition: Seq_loc.cpp:5179
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:768
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:531
Modified on Sat Apr 21 13:45:00 2018 by modify_doxy.py rev. 546573