NCBI C++ ToolKit
fit_blast.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34 
36  (
37  const CBioseq& left,
38  const CBioseq& right,
39  string& common_subject
40  )
41 {
42  bool result=false;
43  // check if prot
44  if(PrintDetails()) NcbiCerr << "fit_blast, seq level " << NcbiEndl;
45 // string qname = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
46 // string qrname = CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA);
47  string qname = GetStringDescr (left);
48  string qrname = GetStringDescr (right);
49  // if(PrintDetails()) NcbiCerr << "left " << CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA) << NcbiEndl;
50  // if(PrintDetails()) NcbiCerr << "right " << CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA) << NcbiEndl;
51  if(PrintDetails()) NcbiCerr << "left " << GetStringDescr (left) << NcbiEndl;
52  if(PrintDetails()) NcbiCerr << "right " << GetStringDescr (right) << NcbiEndl;
53 // assuming that protein sequences come in one piece of seq-set
54  if(left.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
55  if(PrintDetails()) NcbiCerr << "left is aa" << NcbiEndl;
56  if(right.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
57  if(PrintDetails()) NcbiCerr << "right is aa" << NcbiEndl;
58 /*
59  if(!hasGenomicInterval(left)) return result;
60  if(!hasGenomicInterval(right)) return result;
61 
62  const CSeq_interval& left_genomic_int = getGenomicInterval(left);
63  const CSeq_interval& right_genomic_int = getGenomicInterval(right);
64 */
65  if(!hasGenomicLocation(left)) return result;
66  if(!hasGenomicLocation(right)) return result;
67 
68  const CSeq_loc& left_genomic_int = getGenomicLocation(left);
69  const CSeq_loc& right_genomic_int = getGenomicLocation(right);
70  TSeqPos from1, to1, from2, to2;
71 
72  ENa_strand left_strand;
73  ENa_strand right_strand;
74  getFromTo(left_genomic_int, from1, to1, left_strand);
75  getFromTo(right_genomic_int, from2, to2, right_strand);
76 
77  int left_frame = (from1-1)%3+1; // might be not making sense of location is composite
78  int right_frame = (from2-1)%3+1;
79  if(left_strand != eNa_strand_plus ) left_frame=-left_frame;
80  if(right_strand != eNa_strand_plus ) right_frame=-right_frame;
81  if(left_strand != right_strand)
82  {
83  if(PrintDetails())
84  {
85  NcbiCerr << "Strands mismatch for " << NcbiEndl;
86  NcbiCerr << "left " << GetStringDescr (left) << NcbiEndl;
87  NcbiCerr << "right " << GetStringDescr (right) << NcbiEndl;
88  }
89  return result;
90  }
91  if(PrintDetails()) NcbiCerr << "Matching strands" << NcbiEndl;
92  bool reverse = left_strand == eNa_strand_minus;
93  int space = ((int)from2-(int)to1)/3+1; // +1 for stop codon, /3 to convert n to aa
94  if(PrintDetails()) NcbiCerr << "space: " << space << NcbiEndl;
95  int left_qLen = getQueryLen(left);
96  int right_qLen = getQueryLen(right);
97  int num=0, numFit=0, numGI=0;
98  int numLeft=0, numRight=0;
99  vector<perfectHitStr> left_perfect;
100  vector<perfectHitStr> right_perfect;
101  collectPerfectHits(left_perfect, left);
102  collectPerfectHits(right_perfect, right);
103 
104  if(PrintDetails()) NcbiCerr << "perfect hits are : " << left_perfect.size() << " " << right_perfect.size() << NcbiEndl;
105  int n_frameshift_pairs=0;
106  list<problemStr> problemsl;
107  list<problemStr> problemsr;
108  IncreaseVerbosity();
109  common_subject="";
110  string any_common_subject="";
111  ITERATE(CBioseq::TAnnot, left_annot, left.GetAnnot())
112  {
113  if(PrintDetails()) NcbiCerr << "Next left annot: numLeft: " << numLeft << NcbiEndl;
114  if(!(*left_annot)->GetData().IsAlign()) continue;
115  numLeft++;
116  if(PrintDetails()) NcbiCerr << "Left annot is alignment: numLeft: " << numLeft << NcbiEndl;
117  vector<long> left_gis = getGIs(left_annot);
118  if(PrintDetails()) NcbiCerr << "Left annot is alignment: left_gis: " << left_gis.size() << NcbiEndl;
119 
120  IncreaseVerbosity();
121  ITERATE(CBioseq::TAnnot, right_annot, right.GetAnnot())
122  {
123  if(PrintDetails()) NcbiCerr << "Next right annot: numRight: " << numRight << NcbiEndl;
124  if(!(*right_annot)->GetData().IsAlign()) continue;
125  numRight++;
126  if(PrintDetails()) NcbiCerr << "Right annot is alignment: numRight: " << numRight << NcbiEndl;
127  vector<long> right_gis = getGIs(right_annot);
128  if(PrintDetails()) NcbiCerr << "Right annot is alignment: right_gis: " << right_gis.size() << NcbiEndl;
129  PushVerbosity();
130  if(!giMatch(left_gis, right_gis))
131  {
132  PopVerbosity();
133  continue;
134  }
135  PopVerbosity();
136  if(PrintDetails()) NcbiCerr << "!!!!!!!!! GI Match !!!!!!!!!!" << NcbiEndl;
137  numGI++;
138  distanceReportStr *report = new distanceReportStr;
139  report->left_strand = left_strand;
140  report->right_strand = right_strand;
141  report->s_id = left_gis[0];
142  report->q_loc_left_from = reverse ? from2: from1;
143  report->q_loc_right_from = reverse ? from1 : from2;
144  report->q_loc_left_to = reverse ? to2 : to1;
145  report->q_loc_right_to = reverse ? to1 : to2;
146  int frame_from=report->q_loc_left_from;
147  int frame_to =frame_from;
148  frame_from = min(frame_from, report->q_loc_left_from);
149  frame_to = max(frame_to , report->q_loc_left_from);
150  frame_from = min(frame_from, report->q_loc_right_from);
151  frame_to = max(frame_to , report->q_loc_right_from);
152  frame_from = min(frame_from, report->q_loc_left_to );
153  frame_to = max(frame_to , report->q_loc_left_to );
154  frame_from = min(frame_from, report->q_loc_right_to );
155  frame_to = max(frame_to , report->q_loc_right_to );
156  report->left_frame = left_frame;
157  report->right_frame = right_frame;
158  if( (!reverse && fit_blast(left, right, left_annot, right_annot, left_qLen, right_qLen, space, report)) ||
159  ( reverse && fit_blast(right, left, right_annot, left_annot, right_qLen, left_qLen, space, report))
160  )
161  {
162  numFit++;
163  string this_subject = getAnnotName(left_annot);
164  if(numFit==1) any_common_subject = this_subject;
165  if(common_subject.size()==0 && this_subject.find("hypothetical")==string::npos)
166  {
167 // best subject
168  common_subject = this_subject;
169  if(PrintDetails()) NcbiCerr << "zero common_subject changed to non-hypothetical this_subject " << this_subject
170  << NcbiEndl;
171  }
172  if(PrintDetails()) NcbiCerr << "!!!! Blast bounds match !!!!! numFit: " << numFit << NcbiEndl;
173 
174  char bufferchar[20480]; memset(bufferchar, 0, 20480);
175  strstream buffer(bufferchar, 20480, IOS_BASE::out);
176  printReport(report, buffer);
177  strstream misc_feat;
178  if(common_subject.size())
179  {
180  misc_feat << "potential frameshift: common BLAST hit: "
181  << common_subject << '\0';
182  }
183  else
184  {
185  misc_feat << '\0';
186  }
187  {
188  if(PrintDetails()) NcbiCerr << "added a problem: " << misc_feat.str() << NcbiEndl;
189  if(PrintDetails()) NcbiCerr << "added a problem(left): " << qname << NcbiEndl;
190  if(PrintDetails()) NcbiCerr << "added a problem(right): " << qrname << NcbiEndl;
191  problemStr problem = { eFrameShift, buffer.str(), misc_feat.str(),
192  qname, qrname,
193  frame_from,
194  frame_to ,
195  report->left_strand};
196  problemsl.push_back(problem);
197  }
198  {
199  problemStr problem = { eFrameShift, "", "", "", "", -1, -1, eNa_strand_unknown };
200  problemsr.push_back(problem);
201  }
202  n_frameshift_pairs++;
203  }
204  else
205  delete report;
206  }
207  DecreaseVerbosity();
208  }
209  DecreaseVerbosity();
210  num=numLeft*numRight;
211  if(common_subject.size()==0) common_subject = any_common_subject;
212 
213 
214  if(numFit) // this block determines whether it is frameshift or not
215  {
216 // one of the queries is hypothetical - frameshift!
217  if(qname.find("hypothetical") != string::npos || qrname.find("hypothetical") != string::npos)
218  {
219  if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
220  << "one of those is hypo: FRAMESHIFT" << NcbiEndl;
221  result = true;
222  }
223  else
224  {
225 // not enough frameshift evidence - not frameshift!
226  if(numFit<2)
227  {
228  if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
229  << "numFit < 2: NOT A FRAMESHIFT" << NcbiEndl;
230  result = false;
231  }
232  else
233  {
234 // there are truely exhonerating hits - not frameshift!
235  if(left_perfect.size() || right_perfect.size())
236  {
237  if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
238  << "there are exhonerating hits: NOT A FRAMESHIFT" << NcbiEndl;
239  result = false;
240  }
241 // the rest are frameshifts
242  else
243  {
244  if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
245  << "there are no exhonerating hits: FRAMESHIFT" << NcbiEndl;
246  result = true;
247  }
248  }
249  }
250  }
251  else
252  {
253  if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
254  << "numFit = 0: NOT A FRAMESHIFT" << NcbiEndl;
255  result = false;
256  }
257  if(PrintDetails()) NcbiCerr << "after, left " << GetStringDescr (left) << NcbiEndl;
258  if(PrintDetails()) NcbiCerr << "after, right " << GetStringDescr (right) << NcbiEndl;
259  if(PrintDetails()) NcbiCerr << "perfect hits are : " << left_perfect.size() << " " << right_perfect.size() << NcbiEndl;
260  if(PrintDetails()) NcbiCerr << "final numGI, numFit : " << numGI << " " << numFit << NcbiEndl;
261  if( !result && numFit )
262 // print exhonerating hits to stderr if detailed printing
263  {
264  char bufferchar[2048]; memset(bufferchar, 0, 2048);
265  strstream buffer(bufferchar, 2048, IOS_BASE::out);
266  buffer << "Left sequence has " << left_perfect.size() << " perfect hits." << NcbiEndl;
267  ITERATE(vector<perfectHitStr>, hit, left_perfect)
268  {
269  printPerfectHit(*hit, buffer);
270  }
271  buffer << "Right sequence has " << right_perfect.size() << " perfect hits." << NcbiEndl;
272  ITERATE(vector<perfectHitStr>, hit, right_perfect)
273  {
274  printPerfectHit(*hit, buffer);
275  }
276  buffer << "In total, " << numFit << " pairs of hits (out of " << num << ") match for these two sequences." << NcbiEndl;
277  if(PrintDetails()) NcbiCerr << buffer.str();
278  if(PrintDetails()) NcbiCerr << "that is, left sequence (" << qname << ")." << NcbiEndl;
279  }
280  if(result)
281  {
282  addProblems(m_diag[qname].problems, problemsl);
283  addProblems(m_diag[qrname].problems, problemsr);
284  }
285 
286  return result;
287 }
288 
290  (
291  const CBioseq& left,
292  const CBioseq& right,
293  CBioseq::TAnnot::const_iterator& left_annot,
294  CBioseq::TAnnot::const_iterator& right_annot,
295  int left_qLen, int right_qLen,
296  int space, distanceReportStr* report
297  )
298 {
299  bool result=false;
300  // report->q_id_left = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
301  // report->q_id_right = CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA);
302  report->q_id_left = GetStringDescr (left);
303  report->q_id_right = GetStringDescr (right);
304  report->q_name_left = GetProtName(left);
305  report->q_name_right = GetProtName(right);
306 
307  int left_sLen = getLenScore(left_annot);
308  int right_sLen = getLenScore(right_annot);
309 
310  int left_qFrom, left_qTo, right_qFrom, right_qTo;
311  int left_sFrom, left_sTo, right_sFrom, right_sTo;
312  getBounds(left_annot, &left_qFrom, &left_qTo, &left_sFrom, &left_sTo);
313  getBounds(right_annot, &right_qFrom, &right_qTo, &right_sFrom, &right_sTo);
314 
315  report->q_left_left = left_qFrom-1;
316  report->q_left_middle = left_qTo-left_qFrom+1;
317  report->q_left_right = left_qLen-left_qTo;
318 
319  report->space = space;
320  report->s_name ="cannot get subject name";
321  report->s_name = getAnnotName(left_annot);
322  report->alignment_left = getAnnotComment(left_annot);
323  report->alignment_right = getAnnotComment(right_annot);
324 
325  if(PrintDetails()) NcbiCerr << "fit_blast annot level: " << report->s_name.c_str() << NcbiEndl;
326 
327  report->q_right_left = right_qFrom-1;
328  report->q_right_middle = right_qTo-right_qFrom+1;
329  report->q_right_right = right_qLen-right_qTo;
330 
331  report->s_left_left = left_sFrom-1;
332  report->s_left_middle = left_sTo-left_sFrom+1;
333  report->s_left_right = left_sLen-left_sTo;
334 
335  report->s_right_left = right_sFrom-1;
336  report->s_right_middle = right_sTo-right_sFrom+1;
337  report->s_right_right = right_sLen-right_sTo;
338 
339 
340  int result_left = report->diff_left =
341  (report->s_left_right - report->s_right_right) -
342  (report->q_left_right + report->space + report->q_right_left)
343  ;
344  int result_right = report->diff_right =
345  (report->s_right_left - report->s_left_left ) -
346  (report->q_left_right + report->space + report->q_right_left)
347  ;
348 
349  report->diff_left -= report->q_right_middle;
350  report->diff_right -= report->q_left_middle;
351 
352  report->diff_edge_left = report->s_right_left -
353  (report->s_left_left + report->s_left_middle +
354  report->space + report->q_left_right + report->q_right_left );
355  report->diff_edge_right = report->s_left_right -
356  (report->s_right_right+ report->s_right_middle +
357  report->space + report->q_left_right + report->q_right_left );
358  if(PrintDetails())
359  printReport(report, NcbiCerr);
360 
361 
362 
363  // result = diff<150;
364 // subject's hypothetical or not does not play a role
365  result = result_left > 0 && result_right > 0; // && report->s_name.find("hypothetical")==string::npos;
366  return result;
367 }
369 {
370  bool result;
371  if(PrintDetails()) NcbiCerr << "has_blast_hits?" << NcbiEndl;
372  result=seq.GetAnnot().size()>1;
373  if(PrintDetails()) NcbiCerr << result << NcbiEndl;
374  return result;
375 }
376 
377 
378 
#define NcbiCerr
Definition: ncbistre.hpp:407
static bool has_blast_hits(const CBioseq &seq)
Definition: fit_blast.cpp:368
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:953
std::ofstream out("events_result.xml")
main entry point for tests
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:597
#define NcbiEndl
Definition: ncbistre.hpp:411
const NCBI_NS_NCBI::CEnumeratedTypeValues *ENUM_METHOD_NAME() ENa_strand(void)
Access to ENa_strand's attributes (values, names) as defined in spec.
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:893
static bool PrintDetails(int current_verbosity=m_current_verbosity)
int addProblems(list< problemStr > &dest, const list< problemStr > &src)
Definition: problems.cpp:844
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:356
T max(T x_, T y_)
T min(T x_, T y_)
string GetStringDescr(const CBioseq &bioseq)
Definition: shortcuts.cpp:309
else result
Definition: token2.c:20
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:326
static uschar * buffer
Definition: pcretest.c:187
unsigned int
Definition: types.hpp:1153
bool fit_blast(const CBioseq &left, const CBioseq &right, string &common_subject)
Definition: fit_blast.cpp:36
Modified on Thu Aug 25 18:56:03 2016 by modify_doxy.py rev. 506947