NCBI C++ ToolKit
sam_formatter.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sam_formatter.cpp 69942 2015-11-23 16:45:34Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aaron Ucko, Aleksey Grichenko
27 *
28 * File Description:
29 * Flat formatter for Sequence Alignment/Map (SAM).
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
37 #include <objmgr/util/sequence.hpp>
41 
42 #include <set>
43 
44 #define NCBI_USE_ERRCODE_X Objtools_Fmt_SAM
45 
48 
49 
51 {
52 public:
56 
57  CSAM_CIGAR_Formatter(THeaders& headers,
58  TLines& body,
59  const CSeq_align& aln,
60  CScope& scope,
61  TFlags flags);
62  virtual ~CSAM_CIGAR_Formatter(void) {}
63 
64 protected:
65  virtual void StartAlignment(void);
66  virtual void StartRow(void)
67  {
68  m_NumDif = 0;
69  }
70  virtual void AddRow(const string& cigar);
71  virtual void AddSegment(CNcbiOstream& cigar,
72  char seg_type,
73  TSeqPos seg_len);
74  virtual void AdjustSeqIdType(CConstRef<CSeq_id>& id);
75 
76 private:
77  enum EReadFlags {
78  fRead_Default = 0x0000,
79  fRead_Reverse = 0x0010 // minus strand
80  };
81  typedef unsigned int TReadFlags;
82 
83  string x_GetSeqIdString(const CSeq_id& id, TFlags flags) const;
84  string x_GetRefIdString(void) const;
85  string x_GetTargetIdString(void) const;
86 
87  TFlags m_Flags;
88  THeaders& m_Head;
89  TLines& m_Rows;
90  int m_NumDif; // count differences
91 
92  set<CBioseq_Handle> m_KnownRefSeqs; // refseqs already in the header
93 };
94 
95 
97  TLines& body,
98  const CSeq_align& aln,
99  CScope& scope,
100  TFlags flags)
101  : CCIGAR_Formatter(aln, &scope),
102  m_Flags(flags),
103  m_Head(headers),
104  m_Rows(body),
105  m_NumDif(0)
106 {
107 }
108 
109 
110 inline
112 {
113  return (flags & CSAM_Formatter::fSAM_PlainSeqIds) ?
114  id.GetSeqIdString(true) : id.AsFastaString();
115 }
116 
117 
118 inline
120 {
121  return x_GetSeqIdString(GetRefId(), m_Flags);
122 }
123 
124 
125 inline
127 {
129 }
130 
131 
133 {
134  CScope* scope = GetScope();
135  if ( !scope) return;
138  CSeq_id_Handle forced_id = sequence::GetId(*id, *scope, force_type);
139  if (forced_id) {
140  id.Reset(forced_id.GetSeqId());
141  }
142 }
143 
144 
146 {
147 }
148 
149 
151  char seg_type,
152  TSeqPos seg_len)
153 {
154  if (seg_type != 'M') {
155  m_NumDif += seg_len;
156  }
157  CCIGAR_Formatter::AddSegment(cigar, seg_type, seg_len);
158 }
159 
160 
161 static int GetIntScore(const CScore& score)
162 {
163  if ( score.GetValue().IsInt() ) {
164  return score.GetValue().GetInt();
165  }
166  return int(score.GetValue().GetReal());
167 }
168 
169 
170 static double GetFloatScore(const CScore& score)
171 {
172  if ( score.GetValue().IsInt() ) {
173  return score.GetValue().GetInt();
174  }
175  return score.GetValue().GetReal();
176 }
177 
178 
179 void CSAM_CIGAR_Formatter::AddRow(const string& cigar)
180 {
182  if (m_KnownRefSeqs.find(refseq) == m_KnownRefSeqs.end()) {
184  "@SQ\tSN:" + x_GetRefIdString() +
185  "\tLN:" + NStr::UInt8ToString(refseq.GetBioseqLength()));
186  m_KnownRefSeqs.insert(refseq);
187  }
188 
189  string id = x_GetTargetIdString();
190 
191  TReadFlags flags = fRead_Default;
192  if ( GetTargetSign() != GetRefSign() ) {
193  flags |= fRead_Reverse;
194  }
195 
196  const TRange& ref_rg = GetRefRange();
197  const TRange& tgt_rg = GetTargetRange();
198  string clip_front, clip_back;
199  if (tgt_rg.GetFrom() > 0) {
200  if(flags & fRead_Reverse) {
201  clip_back = NStr::UInt8ToString(tgt_rg.GetFrom()) + "H";
202  }
203  else {
204  clip_front = NStr::UInt8ToString(tgt_rg.GetFrom()) + "H";
205  }
206  }
207 
208  string seq_data = "*";
209  CBioseq_Handle h;
211  if ( h ) {
212  if ( TSeqPos(tgt_rg.GetToOpen()) < h.GetBioseqLength() ) {
213  if(flags & fRead_Reverse) {
214  clip_front = NStr::UInt8ToString(
215  h.GetBioseqLength() - tgt_rg.GetToOpen()) + "H";
216  }
217  else {
218  clip_back = NStr::UInt8ToString(
219  h.GetBioseqLength() - tgt_rg.GetToOpen()) + "H";
220  }
221  }
223  if(flags & fRead_Reverse) {
224  CSeqVector vect = h.GetSeqVector(
225  CBioseq_Handle::eCoding_Iupac, eNa_strand_minus);
226  vect.GetSeqData(h.GetBioseqLength() - tgt_rg.GetToOpen(),
227  h.GetBioseqLength() - tgt_rg.GetFrom(), seq_data);
228  }
229  else {
230  CSeqVector vect = h.GetSeqVector(
231  CBioseq_Handle::eCoding_Iupac, eNa_strand_plus);
232  vect.GetSeqData(tgt_rg.GetFrom(), tgt_rg.GetToOpen(), seq_data);
233  }
234  }
235  }
236  else {
238  seq_data = string(tgt_rg.GetLength(), 'N'); // ???
239  }
240  }
241 
242  // Add tags
243  string AS; // alignment score, int
244  string EV; // expectation value, float
245  string PI; // percentage identity, float
246  string BS; // bit-score, int?
247  const CSeq_align& aln = GetCurrentSeq_align();
248  if ( aln.IsSetScore() ) {
249  ITERATE(CSeq_align::TScore, score, aln.GetScore()) {
250  if (!(*score)->IsSetId() || !(*score)->GetId().IsStr()) continue;
251  const string& id = (*score)->GetId().GetStr();
253  if (AS.empty() && id == "score") {
254  AS = "\tAS:i:" + NStr::IntToString(GetIntScore(**score));
255  }
256  }
258  if (EV.empty() && id == "e_value") {
259  EV = "\tEV:f:" + NStr::DoubleToString(GetFloatScore(**score));
260  }
261  }
263  if (BS.empty() && id == "bit_score") {
264  BS = "\tBS:f:" + NStr::DoubleToString(GetFloatScore(**score));
265  }
266  }
268  if (PI.empty() && id == "num_ident") {
269  int len = aln.GetAlignLength(false);
270  int ni = GetIntScore(**score);
271  double pi = 100.0;
272  if (ni != len) {
273  pi = min(99.99, 100.0 * ((double)ni)/((double)len));
274  }
275  PI = "\tPI:f:" + NStr::DoubleToString(pi, 2);
276  }
277  }
278  }
279  }
280  string NM;
282  NM = "\tNM:i:" + NStr::IntToString(m_NumDif);
283  }
284  m_Rows.push_back(
285  id + "\t" +
286  NStr::UIntToString(flags) + "\t" +
287  x_GetRefIdString() + "\t" +
288  NStr::UInt8ToString(ref_rg.GetFrom() + 1) + "\t" + // position, 1-based
289  "255\t" + // ??? mapping quality
290  clip_front + cigar + clip_back + "\t" +
291  "*\t" + // ??? mate reference sequence
292  "0\t" + // mate position, 1-based
293  "0\t" + // inferred insert size
294  seq_data + "\t" +
295  "*" + // query quality
296  AS + EV + NM + PI + BS // tags
297  );
298 }
299 
300 
302  CScope& scope,
303  TFlags flags)
304  : m_Out(&out),
305  m_Scope(&scope),
306  m_Flags(flags),
307  m_SO(eSO_Skip),
308  m_GO(eGO_Query)
309 {
310 }
311 
312 
314 {
315  Flush();
316 }
317 
318 
320  const CSeq_id& query_id)
321 {
323  fmt.FormatByTargetId(query_id);
324  return *this;
325 }
326 
327 
329  CSeq_align::TDim query_row)
330 {
332  fmt.FormatByTargetRow(query_row);
333  return *this;
334 }
335 
336 
338  const CSeq_id& query_id)
339 {
340  CSeq_align disc;
342  disc.SetSegs().SetDisc().Assign(aln_set);
343  Print(disc, query_id);
344  return *this;
345 }
346 
347 
349  CSeq_align::TDim query_row)
350 {
351  CSeq_align disc;
353  disc.SetSegs().SetDisc().Assign(aln_set);
354  Print(disc, query_row);
355  return *this;
356 }
357 
358 
360 {
361  switch ( m_SO ) {
362  case eSO_Unsorted:
363  *m_Out << "\tSO:unsorted";
364  break;
365  case eSO_QueryName:
366  *m_Out << "\tSO:queryname";
367  break;
368  case eSO_Coordinate:
369  *m_Out << "\tSO:coordinate";
370  break;
371  case eSO_User:
372  if ( !m_SO_Value.empty() ) {
373  *m_Out << "\tSO:" << m_SO_Value;
374  }
375  break;
376  default:
377  break;
378  }
379 }
380 
381 
383 {
384  switch ( m_GO ) {
385  case eGO_None:
386  *m_Out << "\tGO:none";
387  break;
388  case eGO_Query:
389  *m_Out << "\tGO:query";
390  break;
391  case eGO_Reference:
392  *m_Out << "\tGO:reference";
393  break;
394  case eGO_User:
395  if ( !m_GO_Value.empty() ) {
396  *m_Out << "\tGO:" << m_GO_Value;
397  }
398  break;
399  default:
400  break;
401  }
402 }
403 
404 
406 {
407  if ( !m_Out ) return;
408  // Headers
409  bool have_data = !m_Header.m_Data.empty() || !m_Body.empty();
410  if (have_data) {
411  *m_Out << "@HD\tVN:1.2";
412  x_PrintSOTag();
413  x_PrintGOTag();
414  *m_Out << '\n';
415  }
417  *m_Out << it->second << '\n';
418  }
419  if (have_data && !m_ProgramInfo.m_Id.empty()) {
420  *m_Out << "@PG\tID:" << m_ProgramInfo.m_Id;
421  if ( !m_ProgramInfo.m_Version.empty() ) {
422  *m_Out << "\tVN:" << m_ProgramInfo.m_Version;
423  }
424  if ( !m_ProgramInfo.m_CmdLine.empty() ) {
425  *m_Out << "\tCL:" << m_ProgramInfo.m_CmdLine;
426  }
427  if ( !m_ProgramInfo.m_Desc.empty() ) {
428  *m_Out << "\tDS:" << m_ProgramInfo.m_Desc;
429  }
430  if ( !m_ProgramInfo.m_Name.empty() ) {
431  *m_Out << "\tPN:" << m_ProgramInfo.m_Name;
432  }
433  *m_Out << '\n';
434  }
435 
436  // Alignments
437  ITERATE(TLines, it, m_Body) {
438  *m_Out << *it << '\n';
439  }
440  m_Header.m_Data.clear();
441  m_Body.clear();
442 }
443 
444 
446  const string& line)
447 {
448  ITERATE(TData, it, m_Data) {
449  if (it->first == id) return; // duplicate
450  }
451  m_Data.push_back(TData::value_type(id, line));
452 }
453 
454 
void AddSequence(CSeq_id_Handle id, const string &line)
CBioseq_Handle –.
unsigned int TReadFlags
CScope * GetScope(void) const
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
Definition: ncbistr.hpp:5191
Set coding to printable coding (Iupacna or Iupacaa)
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const float pi
Definition: math.hpp:52
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5132
bool IsInt(void) const
Check if variant Int is selected.
Definition: Score_.hpp:397
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string x_GetTargetIdString(void) const
virtual void StartAlignment(void)
void x_PrintSOTag(void) const
CNcbiOstream * m_Out
const TScore & GetScore(void) const
Get the Score member data.
Definition: Seq_align_.hpp:883
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:73
int TFlags
bitwise OR of EFlags
const CSeq_id & GetTargetId(void) const
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:953
discontinuous alignment
Definition: Seq_align_.hpp:104
std::ofstream out("events_result.xml")
main entry point for tests
User-provided string.
position_type GetLength(void) const
Definition: range.hpp:158
virtual void AddSegment(CNcbiOstream &cigar, char seg_type, TSeqPos seg_len)
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
static int GetIntScore(const CScore &score)
const TRange & GetRefRange(void) const
string
Definition: cgiapp.hpp:437
Definition: Score.hpp:56
const TValue & GetValue(void) const
Get the Value member data.
Definition: Score_.hpp:457
void Flush(void)
void x_PrintGOTag(void) const
const_iterator find(const key_type &key) const
Definition: set.hpp:137
Base class for CIGAR formatters.
const_iterator end() const
Definition: set.hpp:136
bool IsSetScore(void) const
Check if a value has been assigned to the Score data member (scores for the whole alignment).
Definition: Seq_align_.hpp:871
Drop type prefix in seq-ids.
position_type GetToOpen(void) const
Definition: range.hpp:138
virtual void AddSegment(CNcbiOstream &cigar, char seg_type, TSeqPos seg_len)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:893
CSAM_Formatter::CSAM_Headers THeaders
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5107
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:101
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
SProgramInfo m_ProgramInfo
string x_GetSeqIdString(const CSeq_id &id, TFlags flags) const
const CSeq_id & GetRefId(void) const
int GetRefSign(void) const
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:805
CSeqVector –.
Definition: seq_vector.hpp:64
TSeqPos GetAlignLength(bool include_gaps=true) const
Get the length of this alignment.
Definition: Seq_align.cpp:1915
EGroupOrder m_GO
TInt GetInt(void) const
Get the variant data.
Definition: Score_.hpp:403
int EGetIdType
Definition: sequence.hpp:124
string m_Id
ID - program id.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:150
list< string > TLines
virtual ~CSAM_CIGAR_Formatter(void)
const TRange & GetTargetRange(void) const
User-provided string.
TReal GetReal(void) const
Get the variant data.
Definition: Score_.hpp:376
set< CBioseq_Handle > m_KnownRefSeqs
CSAM_CIGAR_Formatter(THeaders &headers, TLines &body, const CSeq_align &aln, CScope &scope, TFlags flags)
T min(T x_, T y_)
CConstRef< CSeq_id > GetSeqId(void) const
virtual void AddRow(const string &cigar)
CSAM_Formatter::TFlags TFlags
position_type GetFrom(void) const
Definition: range.hpp:134
return only a gi-based seq-id
Definition: sequence.hpp:97
CSAM_Formatter(CNcbiOstream &out, CScope &scope, TFlags flags=fSAM_Default)
TSeqPos GetBioseqLength(void) const
CScope –.
Definition: scope.hpp:90
string m_CmdLine
CL - command line.
Flat formatter for Sequence Alignment/Map (SAM).
CRef< CScope > m_Scope
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found...
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
static CRef< CScope > m_Scope
int len
GO:query (default)
const CSeq_align & GetCurrentSeq_align(void) const
CSAM_Formatter::TLines TLines
int GetTargetSign(void) const
string m_Version
VN - version.
virtual void AdjustSeqIdType(CConstRef< CSeq_id > &id)
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
void FormatByTargetRow(TNumrow target_row)
string x_GetRefIdString(void) const
const double PI
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:309
CSAM_Formatter & Print(const CSeq_align &aln, const CSeq_id &query_id)
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:70
virtual void StartRow(void)
string m_Name
PN - program name.
User-defined methods of the data storage class.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5210
CSAM_Headers m_Header
string m_Desc
DS - description.
Print sequence data.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:98
The base class for alignment formatters which use CIGAR format.
return only an accession based seq-id
Definition: sequence.hpp:98
unsigned int
Definition: types.hpp:1153
list< pair< CSeq_id_Handle, string > > TData
void FormatByTargetId(const CSeq_id &target_id)
static double GetFloatScore(const CScore &score)
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
Modified on Mon Sep 26 17:44:22 2016 by modify_doxy.py rev. 506947