NCBI C++ ToolKit
ucscregion_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: ucscregion_reader.cpp 74007 2016-08-04 22:08:31Z boukn $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Sergiy Gotvyanskyy
27  *
28  * File Description:
29  * Reader for UCSC-style region files (builds a Seq-annot feature table)
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbithr.hpp>
37 #include <corelib/ncbiutil.hpp>
38 #include <corelib/ncbiexpt.hpp>
39 #include <corelib/stream_utils.hpp>
40 
41 #include <util/static_map.hpp>
42 #include <util/line_reader.hpp>
43 
44 #include <serial/iterator.hpp>
45 #include <serial/objistrasn.hpp>
46 
47 // Objects includes
53 
59 
63 #include <objects/seq/Seqdesc.hpp>
66 
83 
89 #include <objtools/error_codes.hpp>
90 
91 #include <objmgr/util/feature.hpp>
92 
93 #include <algorithm>
94 
95 
96 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
97 
99 BEGIN_objects_SCOPE // namespace ncbi::objects::
100 
102  CReaderBase(iflags)
103 {
104 }
105 
107 {
108 }
109 
111 {
112  CRef<CSeq_feat> Feat(new CSeq_feat);
113 #if 0
114  vector<string> Tokens;
115  string Delims = ".:- \t";
116  {{
117  size_t IdSubI = 0;
118  for(int I = 0; I < Line.length(); I++) {
120  if(Info > 0) {
121  IdSubI = I;
122  }
123  }
124 
125  if(IdSubI > 0) {
126  string IdStr = Line.substr(0, IdSubI);
127  string SubLine = Line.substr(IdSubI );
128  //cerr << IdStr << endl;
129  //cerr << SubLine << endl;
130  NStr::Tokenize(NStr::TruncateSpaces(SubLine), Delims, Tokens, NStr::eMergeDelims);
131  Tokens.insert(Tokens.begin(), IdStr);
132  } else {
134  }
135  }}
136 
137  //ITERATE(vector<string>, iter, Tokens) {
138  // cerr << "\t" << *iter << endl;
139  //}
140 
141  if(Tokens.size() < K_START+1)
142  return CRef<CSeq_feat>();
143 
144  NStr::ReplaceInPlace(Tokens[K_START], ",", "");
145  NStr::ReplaceInPlace(Tokens[K_START], ".", "");
146  if(Tokens.size() >= K_STOP+1) {
147  NStr::ReplaceInPlace(Tokens[K_STOP], ",", "");
148  NStr::ReplaceInPlace(Tokens[K_STOP], ".", "");
149  }
150 
151  Feat->SetData().SetRegion("region: "+NStr::IntToString(Number));
152 
153  CRef<CSeq_loc> TopLoc(new CSeq_loc);
154  TopLoc->SetInt().SetId().Assign(*CRegionFile::x_ParseId(Tokens[K_ID]));
155  TopLoc->SetInt().SetFrom() = NStr::StringToUInt8(Tokens[K_START])-1;
156  if(Tokens.size() >= K_STOP+1)
157  TopLoc->SetInt().SetTo() = NStr::StringToUInt8(Tokens[K_STOP])-1;
158  else
159  TopLoc->SetInt().SetTo() = TopLoc->GetInt().GetFrom();
160 
161  if(Tokens.size() >= K_STRAND+1) {
162  if(Tokens[K_STRAND] == "+")
163  TopLoc->SetInt().SetStrand() = eNa_strand_plus;
164  else if(Tokens[K_STRAND] == "-")
165  TopLoc->SetInt().SetStrand() = eNa_strand_minus;
166  else
167  TopLoc->SetInt().SetStrand() = eNa_strand_plus;
168  } else {
169  TopLoc->SetInt().SetStrand() = eNa_strand_plus;
170  }
171  Feat->SetLocation().Assign(*TopLoc);
172 
173  if(!Feat->CanGetTitle())
174  Feat->SetTitle() = "Line:"+NStr::IntToString(Number);
175 
176 //cerr << MSerial_AsnText << *Feat;
177 #endif
178 
179  return Feat;
180 }
181 // ----------------------------------------------------------------------------
182 void CUCSCRegionReader::xSmartFieldSplit(vector<string>& fields, CTempString line)
183 {
184  NStr::Tokenize(line, " \t.-:", fields, NStr::eMergeDelims);
185  if (line[line.length()-1] == '-')
186  fields.push_back("-");
187  while (fields.size() > 3)
188  {
189  if (fields.size() == 4 && (fields.back() == "+" || fields.back() == "-"))
190  break;
191  // try to merge first column
192  size_t len = fields[0].length();
193  if (line[len] == '.')
194  {
195  fields[0] += line[len];
196  fields[0] += fields[1];
197  fields.erase(fields.begin()+1);
198  } else {
199  break;
200  }
201  }
202 }
203 // ----------------------------------------------------------------------------
205  const vector<string>& fields,
206  CRef<CSeq_annot>& annot,
207  ILineErrorListener* pEC)
208 {
209  // assign
210  string str_line_number = NStr::IntToString(m_uLineNumber);
211  CSeq_annot::C_Data::TFtable& ftable = annot->SetData().SetFtable();
212  CRef<CSeq_feat> feature;
213  feature.Reset( new CSeq_feat );
214  try {
215  x_SetFeatureLocation(feature, fields);
216  feature->SetData().SetRegion() = "region: "+ str_line_number;
217  if(!feature->CanGetTitle())
218  feature->SetTitle() = "Line:" + str_line_number;
219  }
220  catch(CObjReaderLineException& err) {
221  ProcessError(err, pEC);
222  return false;
223  }
224  ftable.push_back( feature );
225  return true;
226 }
227 // ----------------------------------------------------------------------------
229  CRef<CSeq_feat>& feature,
230  const vector<string>& fields )
231 // ----------------------------------------------------------------------------
232 {
233  //
234  // Note:
235  // BED convention for specifying intervals is 0-based, first in, first out.
236  // ASN convention for specifying intervals is 0-based, first in, last in.
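237  // Hence, conversion BED->ASN leaves the "from" coordinate
238  // unchanged, and decrements the "to" coordinate by one.
  // NOTE(review): the code below subtracts 1 from both the start and the
  // stop value, which does not match the BED rule described above -- confirm
  // which convention this reader is meant to document.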
239  //
240 
242  int from, to;
243  from = to = -1;
244 
245  //already established: We got at least three columns
246  try {
247  from = NStr::StringToInt(fields[1], NStr::fAllowCommas)-1;
248  }
249  catch(std::exception&) {
252  eDiag_Error,
254  "Invalid data line: Bad \"SeqStart\" value." ) );
255  pErr->Throw();
256  }
257  to = from;
258 
259  if (fields.size()>2)
260  try {
261  to = NStr::StringToInt(fields[2], NStr::fAllowCommas) - 1;
262  }
263  catch(std::exception&) {
266  eDiag_Error,
268  "Invalid data line: Bad \"SeqStop\" value.") );
269  pErr->Throw();
270  }
271 
272  if (from == to) {
273  location->SetPnt().SetPoint(from);
274  }
275  else if (from < to) {
276  location->SetInt().SetFrom(from);
277  location->SetInt().SetTo(to);
278  }
279  else {
282  eDiag_Error,
284  "Invalid data line: \"SeqStop\" less than \"SeqStart\"." ) );
285  pErr->Throw();
286  }
287 
288  size_t strand_field = 3;
289  if (strand_field < fields.size()) {
290  string strand = fields[strand_field];
291  if (strand != "+" && strand != "-" && strand != ".") {
294  eDiag_Error,
296  "Invalid data line: Invalid strand character." ) );
297  pErr->Throw();
298  }
299  location->SetStrand(( fields[strand_field] == "+" ) ?
300  eNa_strand_plus : eNa_strand_minus );
301  }
302  try
303  {
304  CRef<CSeq_id> id = CReadUtil::AsSeqId(fields[0], m_iFlags, false);
305  //CRef<CSeq_id> id (new CSeq_id(fields[0], CSeq_id::fParse_AnyRaw | m_iFlags));
306  location->SetId(*id);
307  feature->SetLocation(*location);
308  }
309  catch(CSeqIdException&)
310  {
313  eDiag_Error,
315  "Malformed sequence id:" ) );
316  pErr->Throw();
317  }
318 
319 }
320 // ----------------------------------------------------------------------------
322 {
323  CRef<CSeq_annot> annot = ReadSeqAnnot(lr, pErrors);
324  return CRef<CSerialObject>(annot);
325 }
326 // ----------------------------------------------------------------------------
328  ILineReader& lr,
329  ILineErrorListener* pEC )
330 {
331  const size_t MAX_RECORDS = 100000;
332 
333  CRef<CSeq_annot> annot;
334  CRef<CAnnot_descr> desc;
335 
336  annot.Reset(new CSeq_annot);
337  desc.Reset(new CAnnot_descr);
338  annot->SetDesc(*desc);
339  CSeq_annot::C_Data::TFtable& tbl = annot->SetData().SetFtable();
340 
341  int featureCount = 0;
342 
343  while (!lr.AtEOF()) {
344 
345  ++m_uLineNumber;
346 
347  CTempString line = *++lr;
348 
349  if (NStr::TruncateSpaces_Unsafe(line).empty()) {
350  continue;
351  }
352  if (xParseComment(line, annot)) {
353  continue;
354  }
355  CTempString record_copy = NStr::TruncateSpaces_Unsafe(line);
356 
357  // parse
358  vector<string> fields;
359 
360  xSmartFieldSplit(fields, record_copy);
361 
362 #if 0
363  try {
364  xCleanColumnValues(fields);
365  }
366  catch(CObjReaderLineException& err) {
367  ProcessError(err, pEC);
368  continue;
369  }
370 #endif
371 
372  if (xParseFeature(fields, annot, pEC)) {
373  ++featureCount;
374  continue;
375  }
376  if (tbl.size() >= MAX_RECORDS) {
377  break;
378  }
379  }
380  // Only return a valid object if there was at least one feature
381  if (0 == featureCount) {
382  return CRef<CSeq_annot>();
383  }
384  //x_AddConversionInfo(annot, pEC);
385  //x_AssignTrackData( annot );
386 
387 #if 0
388  if(m_columncount >= 3) {
389  CRef<CUser_object> columnCountUser( new CUser_object() );
390  columnCountUser->SetType().SetStr( "NCBI_BED_COLUMN_COUNT" );
391  columnCountUser->AddField("NCBI_BED_COLUMN_COUNT", int ( m_columncount ) );
392 
393  CRef<CAnnotdesc> userDesc( new CAnnotdesc() );
394  userDesc->SetUser().Assign( *columnCountUser );
395  annot->SetDesc().Set().push_back( userDesc );
396  }
397 #endif
398  return annot;
399 }
400 
401 
402 END_objects_SCOPE
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:182
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3080
CRef< CSeq_feat > xParseFeatureUCSCFormat(const string &Line, int Number)
See 'ENumToStringFlags::fWithCommas'.
Definition: ncbistr.hpp:286
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:62
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:375
User-defined methods of the data storage class.
CAnnot_descr –.
Definition: Annot_descr.hpp:65
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CAnnotdesc –.
Definition: Annotdesc.hpp:65
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:53
CUCSCRegionReader(unsigned int=fNormal)
bool CanGetTitle(void) const
Check if it is safe to call GetTitle method.
Definition: Seq_feat_.hpp:1131
Defines NCBI C++ exception handling.
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3319
User-defined methods of the data storage class.
void erase(size_type pos=0)
Truncate the string at some specified position Note: basic_string<> supports additional erase() optio...
Definition: tempstr.hpp:546
TUser & SetUser(void)
Select the variant.
Definition: Annotdesc_.cpp:190
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5107
bool xParseFeature(const vector< string > &, CRef< CSeq_annot > &, ILineErrorListener *)
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
static CTempString TruncateSpaces_Unsafe(const CTempString str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3085
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:101
Merge the delimiters.
Definition: ncbistr.hpp:2407
User-defined methods of the data storage class.
Error message.
Definition: ncbidiag.hpp:647
Lightweight interface for getting lines of data with minimal memory copying.
TReaderFlags m_iFlags
TFrom GetFrom(void) const
Get the From member data.
void SetPnt(TPnt &v)
Definition: Seq_loc.hpp:967
void SetType(TType &value)
Assign a value to Type data member.
Defines and provides stubs for a general interface to a variety of file readers.
Definition: reader_base.hpp:57
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void xSmartFieldSplit(vector< string > &fields, CTempString line)
static vector< string > & Tokenize(const CTempString str, const CTempString delim, vector< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Tokenize a string using the specified set of char delimiters.
Definition: ncbistr.cpp:3468
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1041
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Seq_feat_.hpp:1146
Multi-threading – classes, functions, and features.
Useful/utility classes and methods.
void SetInt(TInt &v)
Definition: Seq_loc.hpp:965
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:222
AutoPtr –.
Definition: ncbimisc.hpp:483
static Uint8 StringToUInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Uint8.
Definition: ncbistr.cpp:774
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Definition: line_error.cpp:103
int len
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1002
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
unsigned int m_uLineNumber
CSeqIdException –.
Definition: Seq_id.hpp:700
bool xParseComment(const CTempString &, CRef< CSeq_annot > &)
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
virtual CRef< CSerialObject > ReadObject(ILineReader &lr, ILineErrorListener *pErrors=0)
Read an object from a given line reader, render it as the most appropriate Genbank object...
static CRef< CSeq_id > AsSeqId(const string &rawId, unsigned int flags=0, bool localInts=true)
Convert a raw ID string to a Seq-id, based in given customization flags.
Definition: read_util.cpp:89
static const char location[]
Definition: config.c:97
void ProcessError(CObjReaderLineException &, ILineErrorListener *)
namespace ncbi::objects::
Definition: Seq_feat.hpp:55
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:243
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
void x_SetFeatureLocation(CRef< CSeq_feat > &feature, const vector< string > &fields)
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:361
virtual CRef< CSeq_annot > ReadSeqAnnot(ILineReader &lr, ILineErrorListener *pEC)
Read an object from a given line reader, render it as a single Seq-annot, if possible.
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3456
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:98
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
Definition: Seq_loc.cpp:5129
Definition: tokens.hpp:107
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:756
NCBI_NORETURN void Throw(void) const
this function to throw this object.
Definition: line_error.cpp:130
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:531
Modified on Sun Aug 28 17:57:56 2016 by modify_doxy.py rev. 506947