NCBI C++ ToolKit
blastxml2_format.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /* $Id: blastxml2_format.cpp 63986 2014-08-08 14:09:21Z fongah2 $
00002 * ===========================================================================
00003 *
00004 *                            PUBLIC DOMAIN NOTICE
00005 *               National Center for Biotechnology Information
00006 *
00007 *  This software/database is a "United States Government Work" under the
00008 *  terms of the United States Copyright Act.  It was written as part of
00009 *  the author's official duties as a United States Government employee and
00010 *  thus cannot be copyrighted.  This software/database is freely available
00011 *  to the public for use. The National Library of Medicine and the U.S.
00012 *  Government have not placed any restriction on its use or reproduction.
00013 *
00014 *  Although all reasonable efforts have been taken to ensure the accuracy
00015 *  and reliability of the software and data, the NLM and the U.S.
00016 *  Government do not and cannot warrant the performance or results that
00017 *  may be obtained by using this software or data. The NLM and the U.S.
00018 *  Government disclaim all warranties, express or implied, including
00019 *  warranties of performance, merchantability or fitness for any particular
00020 *  purpose.
00021 *
00022 *  Please cite the author in any work or product based on this material.
00023 *
00024 * ===========================================================================
00025 *
00026 * Author:Amelia Fong
00027 *
00028 * ===========================================================================
00029 */
00030 
00031 /// @file blastxml2_format.cpp
00032 /// Formatting of BLAST results in XML2 form, using the BLAST XML2 specification.
00033 
00034 #include <ncbi_pch.hpp>
00035 #include <objmgr/object_manager.hpp>
00036 #include <objects/seqloc/Seq_interval.hpp>
00037 #include <objmgr/util/sequence.hpp>
00038 #include <objects/seqloc/Seq_id.hpp>
00039 
00040 #include <objects/seqalign/Dense_diag.hpp>
00041 #include <objects/seqalign/Dense_seg.hpp>
00042 #include <objects/seqalign/Std_seg.hpp>
00043 
00044 #include <algo/blast/format/blastxml2_format.hpp>
00045 #include <algo/blast/format/blastfmtutil.hpp>
00046 #include <objtools/align_format/showdefline.hpp>
00047 #include <objtools/align_format/align_format_util.hpp>
00048 #include <objtools/blast/seqdb_reader/seqdb.hpp>
00049 
00050 
00051 #include <objects/blastxml2/blastxml2__.hpp>
00052 #include <serial/objostrxml.hpp>
00053 #include <serial/objostrjson.hpp>
00054 
00055 #include <algo/blast/api/version.hpp>
00056 
00057 #include <algorithm>
00058 
00059 BEGIN_NCBI_SCOPE
00060 USING_SCOPE(objects);
00061 USING_SCOPE(blast);
00062 USING_SCOPE(align_format);
00063 
00064 
/// Computes the translation frame of an alignment endpoint.
/// On the forward strand the frame is derived from the start position; on the
/// reverse strand it is derived from the distance between the end position
/// and the end of the sequence, and the result is negated.
/// @param plus_strand Is this position on a forward strand? [in]
/// @param start Starting position, in 1-offset coordinates. [in]
/// @param end Ending position in 1-offset coordinates [in]
/// @param seq_length Total length of sequence [in]
/// @return Frame number: 1..3 for the forward strand, -1..-3 for the reverse
///         strand.
static int 
s_GetTranslationFrame(bool plus_strand, int start, int end, int seq_length)
{
    if (!plus_strand) {
        // Reverse strand: count residues between the alignment end and the
        // end of the sequence.
        const int tail_residues = seq_length - end;
        return -(tail_residues % 3 + 1);
    }

    // Forward strand: only the (1-offset) start position matters.
    const int leading_residues = start - 1;
    return leading_residues % 3 + 1;
}
00085 
00086 /// Creates a list of blastxml2::CHsp structures for the XML output, given a list of
00087 /// Seq-aligns.
00088 /// @param xhsp_list List of blastxml2::CHsp's to populate [in] [out]
00089 /// @param alnset Set of alignments to get data from [in]
00090 /// @param scope Scope for retrieving sequences [in]
00091 /// @param matrix 256x256 matrix for calculating positives for a protein search.
00092 ///               NULL is passed for a nucleotide search.
00093 /// @param mask_info Masking locations [in]
00094 static void
00095 s_SeqAlignSetToXMLHsps(list<CRef<blastxml2::CHsp> >& xhsp_list,
00096                        const CSeq_align_set& alnset, CRef<CScope> scope,
00097                        const CBlastFormattingMatrix* matrix,
00098                        const ncbi::TMaskedQueryRegions & mask_info,
00099                        int master_gentic_code, int slave_genetic_code)
00100 {
00101     int index = 1;
00102     ITERATE(CSeq_align_set::Tdata, iter, alnset.Get()) {
00103         CRef<blastxml2::CHsp> xhsp(new blastxml2::CHsp());
00104         const CSeq_align& kAlign = *(*iter);
00105         xhsp->SetNum(index);
00106         ++index;
00107         bool query_is_na, subject_is_na;
00108         int query_length, subject_length;
00109 
00110         int score, num_ident;
00111         double bit_score;
00112         double evalue;
00113         int sum_n;
00114         list<TGi> use_this_gi;
00115         CBlastFormatUtil::GetAlnScores(kAlign, score, bit_score, evalue, sum_n, 
00116                                        num_ident, use_this_gi);
00117 
00118         //Print 6 significant digits for double values
00119         char tmp[512];
00120         sprintf(tmp,"%.*g", 6, bit_score );
00121         bit_score = atof(tmp);
00122         sprintf(tmp,"%.*g", 6, evalue );
00123         evalue = atof(tmp);
00124 
00125         xhsp->SetBit_score(bit_score);
00126         xhsp->SetScore(score);
00127         xhsp->SetEvalue(evalue);
00128 
00129         // Extract the full list of subject ids
00130         try {
00131             const CBioseq_Handle& kQueryBioseqHandle = 
00132                 scope->GetBioseqHandle(kAlign.GetSeq_id(0));
00133             query_is_na = kQueryBioseqHandle.IsNa();
00134             query_length = kQueryBioseqHandle.GetBioseqLength();
00135             const CBioseq_Handle& kSubjBioseqHandle = 
00136                 scope->GetBioseqHandle(kAlign.GetSeq_id(1));
00137             subject_is_na = kSubjBioseqHandle.IsNa();
00138             subject_length = kSubjBioseqHandle.GetBioseqLength();
00139         } catch (const CException&) {
00140             // Either query or subject sequence not found - the remaining 
00141             // information cannot be correctly filled. Add this HSP as is
00142             // and continue.
00143             xhsp->SetQuery_from(0);
00144             xhsp->SetQuery_to(0);
00145             xhsp->SetHit_from(0);
00146             xhsp->SetHit_to(0);
00147             xhsp->SetIdentity(num_ident); // This may be inaccurate when 
00148                                           // alignment contains filtered regions.
00149             xhsp->SetQseq(NcbiEmptyString);
00150             xhsp->SetHseq(NcbiEmptyString);
00151             xhsp_list.push_back(xhsp);
00152             continue;
00153         }
00154 
00155         CRef<CSeq_align> final_aln(0);
00156    
00157         // Convert Std-seg and Dense-diag alignments to Dense-seg.
00158         // Std-segs are produced only for translated searches; Dense-diags only 
00159         // for ungapped, not translated searches.
00160         const bool kTranslated = kAlign.GetSegs().IsStd();
00161         if (kTranslated) {
00162             CRef<CSeq_align> densegAln = kAlign.CreateDensegFromStdseg();
00163             // When both query and subject are translated, i.e. tblastx, convert
00164             // to a special type of Dense-seg.
00165             if (query_is_na && subject_is_na)
00166                 final_aln = densegAln->CreateTranslatedDensegFromNADenseg();
00167             else
00168                 final_aln = densegAln;
00169         } else if (kAlign.GetSegs().IsDendiag()) {
00170             final_aln = CBlastFormatUtil::CreateDensegFromDendiag(kAlign);
00171         }
00172         
00173         const CDense_seg& kDenseg = (final_aln ? final_aln->GetSegs().GetDenseg() :
00174                                 kAlign.GetSegs().GetDenseg());
00175 
00176 
00177 
00178 
00179         // Do not trust the identities count in the Seq-align, because if masking 
00180         // was used, then masked residues were not counted as identities. 
00181         // Hence retrieve the sequences present in the alignment and count the 
00182         // identities again.
00183         string query_seq;
00184         string subject_seq;
00185         string middle_seq;
00186         string masked_query_seq;
00187 
00188         // For blastn search, the matches are shown as '|', and mismatches as
00189         // ' '; For all other searches matches are shown as matched characters,
00190         // mismatches as ' ', and positives as '+'.
00191         // This is a blastn search if and only if both query and subject are
00192         // nucleotide, and it is not a translated search.
00193         const bool kIsBlastn =
00194             (query_is_na && subject_is_na && !kTranslated);
00195 
00196         const CDense_seg * ds_pt = &kDenseg;
00197         CRef<CDense_seg> reversed_ds;
00198         // For non-transalted reverse strand alignments, show plus strand on
00199         // query and minus strand on subject. To accomplish this, Dense-seg must
00200         // be reversed.
00201         if (!kTranslated && kDenseg.IsSetStrands() &&
00202             kDenseg.GetStrands().front() == eNa_strand_minus)
00203         {
00204             reversed_ds.Reset(new CDense_seg);
00205             reversed_ds->Assign(kDenseg);
00206             reversed_ds->Reverse();
00207             ds_pt = &(*reversed_ds);
00208        }
00209 
00210         int q_start, q_end, s_start, s_end, q_frame=0, s_frame=0;
00211 
00212         unsigned int num_gaps = 0;
00213         int align_length = 0;
00214 
00215         if (kAlign.GetSegs().IsDendiag())
00216         {
00217             align_length = final_aln->GetAlignLength();
00218             q_start = final_aln->GetSeqStart(0) + 1;
00219             q_end = final_aln->GetSeqStop(0) + 1;
00220             s_start = final_aln->GetSeqStart(1) + 1;
00221             s_end = final_aln->GetSeqStop(1) + 1;
00222         }
00223         else
00224         {
00225             if(!kTranslated)
00226             {
00227                 num_gaps = kAlign.GetTotalGapCount();
00228                 align_length = kAlign.GetAlignLength();
00229             }
00230             q_start = kAlign.GetSeqStart(0) + 1;
00231             q_end = kAlign.GetSeqStop(0) + 1;
00232             s_start = kAlign.GetSeqStart(1) + 1;
00233             s_end = kAlign.GetSeqStop(1) + 1;
00234         }
00235 
00236         if (!kTranslated && query_is_na && subject_is_na) {
00237             xhsp->SetQuery_strand("Plus");
00238             if (eNa_strand_minus == kAlign.GetSeqStrand(0)){
00239                 xhsp->SetQuery_strand("Minus");
00240                 int tmp = s_start;
00241                 s_start = s_end;
00242                 s_end = tmp;
00243             }
00244             else {
00245                 xhsp->SetQuery_strand("Plus");
00246             }
00247 
00248             if (eNa_strand_minus == kAlign.GetSeqStrand(1))
00249                 xhsp->SetQuery_strand("Minus");
00250             else
00251                 xhsp->SetHit_strand("Plus");
00252 
00253         } else if (kTranslated) {
00254             align_length = final_aln->GetAlignLength();
00255             num_gaps = final_aln->GetTotalGapCount();
00256 
00257             if (query_is_na) {
00258                 q_frame = s_GetTranslationFrame(eNa_strand_minus != final_aln->GetSeqStrand(0),
00259                                                 q_start, q_end, query_length);
00260                 xhsp->SetQuery_frame(q_frame);
00261             }
00262             if (subject_is_na) {
00263                 s_frame = s_GetTranslationFrame(eNa_strand_minus != final_aln->GetSeqStrand(1),
00264                                                 s_start, s_end, subject_length);
00265                 xhsp->SetHit_frame(s_frame);
00266             }
00267         }
00268 
00269         xhsp->SetQuery_from(q_start);
00270         xhsp->SetQuery_to(q_end);
00271         xhsp->SetHit_from(s_start);
00272         xhsp->SetHit_to(s_end);
00273 
00274        if (mask_info.empty())
00275         {
00276             CBlastFormatUtil::GetWholeAlnSeqStrings(query_seq,
00277                                                    subject_seq,
00278                                                    *ds_pt,
00279                                                    *scope,
00280                                                    master_gentic_code,
00281                                                    slave_genetic_code);
00282         }
00283        else
00284        {
00285               CDisplaySeqalign::SeqLocCharOption kMaskCharOpt =
00286                                  (kIsBlastn ? CDisplaySeqalign::eN : CDisplaySeqalign::eX);
00287 
00288                 CBlastFormatUtil::GetWholeAlnSeqStrings(query_seq,
00289                                                       masked_query_seq,
00290                                                           subject_seq,
00291                                                       *ds_pt,
00292                                                       *scope,
00293                                                       master_gentic_code,
00294                                                       slave_genetic_code,
00295                                                       mask_info,
00296                                                       kMaskCharOpt,
00297                                                       q_frame);
00298        }
00299 
00300         num_ident = 0;
00301         int num_positives = 0;
00302         middle_seq = query_seq;
00303         // The query and subject sequence strings must be the same size in a 
00304         // correct alignment, but if alignment extends beyond the end of sequence
00305         // because of a bug, one of the sequence strings may be truncated, hence 
00306         // it is necessary to take a minimum here.
00307         // FIXME: Should an exception be thrown instead? 
00308         const unsigned int kMaxOffset = min(query_seq.size(),
00309                                             subject_seq.size());
00310         for (unsigned int i = 0; i < kMaxOffset; ++i) {
00311             if (query_seq[i] == subject_seq[i]) {
00312                 ++num_ident;
00313                 ++num_positives;
00314                 if (kIsBlastn)
00315                     middle_seq[i] = '|';
00316             } else if (matrix &&
00317                        (*matrix)(query_seq[i], subject_seq[i]) > 0 &&
00318                        !kIsBlastn) {
00319                 ++num_positives;
00320                 middle_seq[i] = kIsBlastn ? ' ' : '+';
00321             } else {
00322                 middle_seq[i] = ' ';
00323             }
00324         }
00325         
00326         xhsp->SetIdentity(num_ident);
00327         xhsp->SetGaps(num_gaps);
00328         xhsp->SetAlign_len(align_length);
00329 
00330         if (mask_info.empty())
00331             xhsp->SetQseq(query_seq);
00332         else
00333             xhsp->SetQseq(masked_query_seq);
00334         xhsp->SetHseq(subject_seq);
00335         xhsp->SetMidline(middle_seq);
00336         if(!(query_is_na && subject_is_na && !kTranslated) )
00337             xhsp->SetPositive(num_positives);
00338 
00339         xhsp_list.push_back(xhsp);
00340     }
00341 }
00342 
00343 /// Fill the blastxml2::CHit object in BLAST XML output, given an alignment and other
00344 /// information.
00345 /// @param hit blastxml2::CHit object to fill [in] [out]
00346 /// @param align_in Sequence alignment [in]
00347 /// @param scope Scope for retrieving sequences [in]
00348 /// @param matrix ASCII-alphabet matrix for calculation of positives [in]
00349 /// @param mask_info List of masking locations [in]
00350 /// @param ungapped Is this an ungapped search? [in]
00351 /// @param master_genetic_code query genetic code [in]
00352 /// @param slave_genetic_code subject genetic code [in]
00353 /// @param hasTaxDB Have access to taxonomy file [in]
00354 static void 
00355 s_SeqAlignToXMLHit(CRef<blastxml2::CHit>& hit,
00356                    const CSeq_align& align_in, CRef<CScope> scope,
00357                    const CBlastFormattingMatrix* matrix,
00358                    const ncbi::TMaskedQueryRegions & mask_info,
00359                    bool ungapped, int master_gentice_code,
00360                    int slave_genetic_code, bool hasTaxDB)
00361 {
00362     _ASSERT(align_in.GetSegs().IsDisc());
00363     const CSeq_align_set& kAlignSet = align_in.GetSegs().GetDisc();
00364 
00365     const CSeq_id& kSeqId = kAlignSet.Get().front()->GetSeq_id(1);
00366 
00367     try {
00368         const CBioseq_Handle& subj_handle = scope->GetBioseqHandle(kSeqId);
00369 
00370         CRef<CBlast_def_line_set> bdlRef =
00371                                        CSeqDB::ExtractBlastDefline(subj_handle);
00372         list <CRef<blastxml2::CHitDescr> >  & descr_list = hit->SetDescription();
00373         
00374         if(bdlRef.NotEmpty() && bdlRef->IsSet() && (!bdlRef->Get().empty())) {
00375             ITERATE(list<CRef<CBlast_def_line> >, itr, bdlRef->Get()) {
00376                 const CBlast_def_line & defline = **itr;
00377                 CRef<blastxml2::CHitDescr> hit_exp(new blastxml2::CHitDescr);
00378                 hit_exp->SetId(CShowBlastDefline::GetSeqIdListString(defline.GetSeqid(), true));
00379 
00380                  CRef<CSeq_id> best_id = FindBestChoice(defline.GetSeqid(), CSeq_id::Score);
00381                  CSeq_id_Handle id_handle = CSeq_id_Handle::GetHandle(*best_id);
00382                  string accession = CAlignFormatUtil::GetLabel(id_handle.GetSeqId());
00383                  if(accession != kEmptyStr)
00384                      hit_exp->SetAccession(accession);
00385 
00386                 if(defline.IsSetTitle())
00387                     hit_exp->SetTitle(defline.GetTitle());
00388 
00389                 if(defline.IsSetTaxid() && defline.GetTaxid() != 0) {
00390                     int tax_id = defline.GetTaxid();
00391                     hit_exp->SetTaxid(tax_id);
00392                     if(hasTaxDB) {
00393                          SSeqDBTaxInfo taxinfo;
00394                          CSeqDB::GetTaxInfo(tax_id, taxinfo);
00395                          hit_exp->SetSciname(taxinfo.scientific_name);
00396                     }
00397                 }
00398                 descr_list.push_back(hit_exp);
00399             }
00400         }
00401         else {
00402             CRef<blastxml2::CHitDescr> hit_exp(new blastxml2::CHitDescr);
00403             list<CRef<objects::CSeq_id> > ids;
00404             CShowBlastDefline::GetSeqIdList(subj_handle, ids);
00405             hit_exp->SetId(CShowBlastDefline::GetSeqIdListString(ids, true));
00406             hit_exp->SetTitle(sequence::CDeflineGenerator().GenerateDefline(subj_handle));
00407             descr_list.push_back(hit_exp);
00408         }
00409 
00410         
00411         int length = subj_handle.GetBioseqLength();
00412         hit->SetLen(length);
00413     } catch (const CException&) {
00414         CRef<blastxml2::CHitDescr> hit_exp(new blastxml2::CHitDescr);
00415         hit_exp->SetId(kSeqId.AsFastaString());
00416         hit->SetDescription().push_back(hit_exp);
00417         hit->SetLen(sequence::GetLength(kSeqId, scope));
00418     };
00419         
00420     // For ungapped search, multiple HSPs, possibly from different strands,
00421     // are packed into a single Seq-align.
00422     // The C++ utility functions cannot deal with such Seq-aligns, as they
00423     // expect one Seq-align per alignment (HSP). Hence we need to expand the
00424     // Seq-align-set obtained for an ungapped search.
00425     if (ungapped) {
00426         CRef<CSeq_align_set> expanded_align_set =
00427             CDisplaySeqalign::PrepareBlastUngappedSeqalign(kAlignSet);
00428         
00429         s_SeqAlignSetToXMLHsps(hit->SetHsps(), *expanded_align_set, scope, 
00430                                matrix, mask_info, master_gentice_code, slave_genetic_code);
00431     } else {
00432         s_SeqAlignSetToXMLHsps(hit->SetHsps(), kAlignSet, scope, matrix, 
00433                                mask_info, master_gentice_code, slave_genetic_code);
00434     }
00435 }
00436 
00437 /// Retrieves subject Seq-id from a Seq-align
00438 /// @param align Seq-align object [in]
00439 /// @return Subject Seq-id for this Seq-align.
00440 static const CSeq_id*
00441 s_GetSubjectId(const CSeq_align& align)
00442 {
00443     if (align.GetSegs().IsDenseg()) {
00444         return align.GetSegs().GetDenseg().GetIds()[1];
00445     } else if (align.GetSegs().IsDendiag()) {
00446         return align.GetSegs().GetDendiag().front()->GetIds()[1];
00447     } else if (align.GetSegs().IsStd()) {
00448         return align.GetSegs().GetStd().front()->GetIds()[1];
00449     }
00450 
00451     return NULL;
00452 }
00453  
00454 
00455 
00456 
00457 /// Fills the list of blastxml2::CHit objects, given a list of Seq-aligns.
00458 /// @param hits List of blastxml2::CHit objects to fill [in] [out]
00459 
00460 static void
00461 s_SetBlastXMlHitList(list<CRef<blastxml2::CHit> >& hits, const IBlastXML2ReportData* data, int num)
00462 {
00463     
00464 
00465     CConstRef<objects::CSeq_align_set>  alnset = data->GetAlignmentSet(num);
00466     CSeq_align_set::Tdata::const_iterator iter = alnset->Get().begin();
00467 
00468     CRef<CScope> scope = data->GetScope();
00469     const CBlastFormattingMatrix* matrix = data->GetMatrix();
00470     const ncbi::TMaskedQueryRegions & mask_info = data->GetMaskLocations();
00471     bool ungapped = !(data->IsGappedSearch());
00472     int master_gentice_code = data->GetQueryGeneticCode();
00473     int slave_genetic_code = data->GetDbGeneticCode();
00474     bool hasTaxDB = data->CanGetTaxInfo();
00475 
00476     int index = 1;
00477     while (iter != alnset->Get().end()) {
00478         CRef<blastxml2::CHit> new_hit(new blastxml2::CHit);
00479         new_hit->SetNum(index);
00480         index ++;
00481         // Retrieve the next set of results for a single subject sequence.
00482         // If the next Seq-align is discontinuous, then take it as is, 
00483         // otherwise go along the chain of Seq-aligns until the subject Seq-id
00484         // changes, then wrap the single subject list into a discontinuous 
00485         // Seq-align.
00486         if ((*iter)->GetSegs().IsDisc()) {
00487             s_SeqAlignToXMLHit(new_hit, *(*iter), scope, matrix, mask_info,
00488                                ungapped, master_gentice_code, slave_genetic_code, hasTaxDB);
00489             ++iter;
00490         } else {
00491             CSeq_align_set one_subject_alnset;
00492             CConstRef<CSeq_id> current_id(s_GetSubjectId(*(*iter)));
00493             for ( ; iter != alnset->Get().end(); ++iter) {
00494                 CConstRef<CSeq_id> next_id(s_GetSubjectId(*(*iter)));
00495                 if (!current_id->Match(*next_id)) {
00496                     break;
00497                 }
00498                 one_subject_alnset.Set().push_back(*iter);
00499             }
00500             CSeq_align disc_align_wrap;
00501             disc_align_wrap.SetSegs().SetDisc(one_subject_alnset);
00502             s_SeqAlignToXMLHit(new_hit, disc_align_wrap, scope, matrix,
00503                                mask_info, ungapped, master_gentice_code, slave_genetic_code, hasTaxDB);
00504         }
00505         
00506         hits.push_back(new_hit);
00507     }
00508 }
00509 
00510 
00511 /// Fills the parameters part of the BLAST XML output.
00512 /// @param bxmlout BLAST XML output object [in] [out]
00513 /// @param data Data structure, from which all necessary information can be 
00514 ///             retrieved [in]
00515 static void
00516 s_SetBlastXMLParameters(blastxml2::CParameters & params, const IBlastXML2ReportData* data)
00517 {
00518     string matrix_name = data->GetMatrixName();
00519     if (matrix_name != NcbiEmptyString)
00520         params.SetMatrix(matrix_name);
00521 
00522     params.SetExpect(data->GetEvalueThreshold());
00523 
00524     int val;
00525     string str;
00526     if ((val = data->GetMatchReward()) != 0)
00527         params.SetSc_match(val);
00528 
00529     if ((val = data->GetMismatchPenalty()) != 0)
00530         params.SetSc_mismatch(val);
00531 
00532     if(data->IsGappedSearch()) {
00533         params.SetGap_open(data->GetGapOpeningCost());
00534         params.SetGap_extend(data->GetGapExtensionCost());
00535     }
00536     if ((str = data->GetPHIPattern()) != NcbiEmptyString)
00537         params.SetPattern(str);
00538 
00539     if ((str = data->GetFilterString()) != NcbiEmptyString)
00540         params.SetFilter(str);
00541 
00542     if ((str = data->GetBl2seqMode()) != NcbiEmptyString)
00543         params.SetBl2seq_mode(str);
00544 
00545     if((val = data->GetCompositionBasedStats()) != 0)
00546         params.SetCbs(val);
00547 
00548     if((str = data->GetEntrezQuery()) != NcbiEmptyString)
00549         params.SetEntrez_query(str);
00550 
00551     if((val = data->GetQueryGeneticCode()) != 0)
00552         params.SetQuery_gencode(val);
00553 
00554     if((val = data->GetDbGeneticCode()) != 0)
00555         params.SetDb_gencode(val);
00556 }
00557 
00558 /// Fills the search statistics part of the BLAST XML output for all queries.
00559 /// @param stat_vec Vector of the blastxml2::CStatics objects, to be filled. [in] [out]
00560 /// @param data Data structure, from which all necessary information can be 
00561 ///             retrieved [in] 
00562 static void
00563 s_SetBlastXMLStatistics(blastxml2::CStatistics & stats,
00564                         const IBlastXML2ReportData* data, int num)
00565 {
00566     if(!data->IsBl2seq()) {
00567         stats.SetDb_num(data->GetDbNumSeqs());
00568         stats.SetDb_len(data->GetDbLength());
00569     }
00570 
00571     stats.SetHsp_len(data->GetLengthAdjustment(num));
00572     stats.SetEff_space(data->GetEffectiveSearchSpace(num));
00573     stats.SetKappa(data->GetKappa(num));
00574     stats.SetLambda(data->GetLambda(num));
00575     stats.SetEntropy(data->GetEntropy(num));
00576 }
00577 
00578 
00579 static void
00580 s_SetBlastXMLSearch(blastxml2::CSearch & search,
00581                     const IBlastXML2ReportData* data, int num)
00582 {
00583      CConstRef<objects::CSeq_loc> q_loc = data->GetQuerySeqLoc();
00584      const CSeq_id * q_id = q_loc->GetId();
00585      CRef<CScope> scope = data->GetScope();
00586      try {
00587             CBioseq_Handle bh = scope->GetBioseqHandle(*q_id);
00588             // Get the full query Seq-id string.
00589             const CBioseq& q_bioseq = *bh.GetBioseqCore();
00590             search.SetQuery_id(CBlastFormatUtil::GetSeqIdString(q_bioseq));
00591             string q_title = sequence::CDeflineGenerator().GenerateDefline(bh);
00592             if(q_title != kEmptyStr)
00593                 search.SetQuery_title(q_title);
00594         } catch (const CException&) {
00595             search.SetQuery_id(q_id->AsFastaString());
00596      }
00597 
00598     search.SetQuery_len(sequence::GetLength(*q_loc, &(*scope)));
00599 
00600     if(!data->GetMaskLocations().empty()) {
00601         list<CRef< blastxml2::CRange> > & masks = search.SetQuery_masking();
00602         TMaskedQueryRegions mask_locs = data->GetMaskLocations();
00603         ITERATE(TMaskedQueryRegions,  itr, mask_locs) {
00604             CRef<CSeqLocInfo> loc = *itr;
00605             if(loc->GetStrand() == eNa_strand_minus)
00606                 continue;
00607             CRef<blastxml2::CRange> rng (new blastxml2::CRange);
00608             rng->SetFrom(loc->GetInterval().GetFrom());
00609             rng->SetTo(loc->GetInterval().GetTo());
00610             masks.push_back(rng);
00611         }
00612     }
00613 
00614     blastxml2::CStatistics & stats = search.SetStat();
00615     s_SetBlastXMLStatistics(stats, data, num);
00616 
00617     string msg = data->GetMessages(num);
00618     // Check if the list is empty. Then there is nothing to fill.
00619     if (data->GetAlignmentSet(num).Empty()) {
00620         msg += " \n";
00621         msg += CBlastFormatUtil::kNoHitsFound;
00622         search.SetMessage(msg);
00623         return;
00624     }
00625 
00626     if(msg != kEmptyStr)
00627         search.SetMessage(msg);
00628 
00629     list<CRef<blastxml2::CHit> > & hit_list = search.SetHits();
00630     s_SetBlastXMlHitList(hit_list, data, num);
00631 }
00632 
00633 /// Given BLAST task, returns enumerated value for the publication to be 
00634 /// referenced.
00635 /// @param program BLAST task [in]
00636 /// @return What publication to reference?
00637 static CReference::EPublication
00638 s_GetBlastPublication(EProgram program)
00639 {
00640     CReference::EPublication publication = CReference::eMaxPublications;
00641 
00642     switch (program) {
00643     case eMegablast:
00644         publication = CReference::eMegaBlast; break;
00645     case ePHIBlastp: case ePHIBlastn:
00646         publication = CReference::ePhiBlast; break;
00647     case ePSIBlast:
00648         publication = CReference::eCompBasedStats; break;
00649     case eDeltaBlast:
00650         publication = CReference::eDeltaBlast; break;
00651     default:
00652         publication = CReference::eGappedBlast; break;
00653     }
00654     return publication;
00655 }
00656 
00657 static void s_FillBlastOutput(blastxml2::CBlastOutput& bxmlout, const IBlastXML2ReportData* data)
00658 {
00659     if(data == NULL)
00660          NCBI_THROW(CException, eUnknown, "blastxml2: NULL XML2ReportData pointer");
00661 
00662     bxmlout.Reset();
00663     blastxml2::CReport & report = bxmlout.SetReport();
00664     string program_name = data->GetBlastProgramName();
00665     report.SetProgram(program_name);
00666     report.SetVersion(CBlastFormatUtil::BlastGetVersion(program_name));
00667     EProgram blast_task = data->GetBlastTask();
00668     report.SetReference(CReference::GetString(s_GetBlastPublication(blast_task)));
00669     if(!data->GetSubjectIds().empty()) {
00670         report.SetSearch_target().SetSubjects() = data->GetSubjectIds();
00671     }
00672     else {
00673         report.SetSearch_target().SetDb(data->GetDatabaseName());
00674     }
00675 
00676     blastxml2::CParameters & params = report.SetParams();
00677     s_SetBlastXMLParameters(params, data);
00678 
00679     blastxml2::CResults & results = report.SetResults();
00680     if(data->IsBl2seq()) {
00681         list<CRef<blastxml2::CSearch> > & bl2seq = results.SetBl2seq();
00682         for(int i=0; i < data->GetNumOfSearchResults(); i++ ) {
00683             CRef<blastxml2::CSearch>  search (new blastxml2::CSearch);
00684             s_SetBlastXMLSearch(*search, data, i);
00685             bl2seq.push_back(search);
00686         }
00687 
00688     }
00689     else if(data->IsIterativeSearch()) {
00690         list<CRef<blastxml2::CIteration> > & iterations = results.SetIterations();
00691         for(int i=0; i < data->GetNumOfSearchResults(); i++ ) {
00692             CRef<blastxml2::CIteration> itr (new blastxml2::CIteration);
00693             itr->SetIter_num(i+1);
00694             blastxml2::CSearch & search = itr->SetSearch();
00695             s_SetBlastXMLSearch(search, data, i);
00696             iterations.push_back(itr);
00697         }
00698     }
00699     else {
00700         blastxml2::CSearch & search = results.SetSearch();
00701         s_SetBlastXMLSearch(search, data, 0);
00702     }
00703 
00704 }
00705 
00706 static void
00707 s_WriteXML2Object(blastxml2::CBlastOutput& bxmlout, CNcbiOstream *out_stream)
00708 {
00709     TTypeInfo typeInfo = bxmlout.GetThisTypeInfo();
00710     auto_ptr<CObjectOStreamXml> xml_out(new CObjectOStreamXml (*out_stream,false));
00711     xml_out->SetEncoding(eEncoding_Ascii);
00712     xml_out->SetVerifyData( eSerialVerifyData_No );
00713     //xml_out->SetReferenceDTD();
00714     xml_out->SetReferenceSchema();
00715     xml_out->SetUseSchemaLocation(true);
00716     xml_out->SetEnforcedStdXml();
00717     xml_out->SetDefaultSchemaNamespace("http://blast.ncbi.nlm.nih.gov/");
00718     xml_out->Write(&bxmlout, typeInfo );
00719 }
00720 
00721 /// Fills all fields in the data structure for a BLAST XML report.
00722 /// @param bxmlout BLAST XML report data structure to fill [in] [out]
00723 /// @param data  Data structure, from which all necessary information can be
00724 ///             retrieved [in]
00725 /// @param out_stream Output  stream for incremental output, ignore if NULL [out]
00726 void
00727 BlastXML2_FormatReport(const IBlastXML2ReportData* data, CNcbiOstream *out_stream )
00728 {
00729     blastxml2::CBlastOutput bxmlout;
00730     try {
00731         s_FillBlastOutput(bxmlout, data);
00732         s_WriteXML2Object(bxmlout, out_stream);
00733     }
00734     catch(CException &e){
00735         ERR_POST(Error << e.GetMsg() << e.what() );
00736         return;
00737     }
00738     catch(...){
00739         ERR_POST(Error << "XML format failed" );
00740         return;
00741     }
00742 }
00743 
00744 void
00745 BlastXML2_FormatReport(const IBlastXML2ReportData* data, string file_name)
00746 {
00747     blastxml2::CBlastOutput bxmlout;
00748     try {
00749         CNcbiOfstream out_stream;
00750         out_stream.open(file_name.c_str(), IOS_BASE::out);
00751         if(!out_stream.is_open())
00752              NCBI_THROW(CException, eInvalid, "Cannot open output file");
00753 
00754         s_FillBlastOutput(bxmlout, data);
00755         s_WriteXML2Object(bxmlout, &out_stream);
00756     }
00757     catch(CException &e){
00758         ERR_POST(Error << e.GetMsg() << e.what() );
00759         return;
00760     }
00761     catch(...){
00762         ERR_POST(Error << "XML format failed" );
00763         return;
00764     }
00765 }
00766 
00767 
00768 void
00769 BlastXML2_FormatError(int exit_code, string err_msg,
00770                       CNcbiOstream *out_stream)
00771 {
00772     blastxml2::CBlastOutput bxmlout;
00773     bxmlout.SetError().SetCode(exit_code);
00774     if(err_msg != kEmptyStr) {
00775         bxmlout.SetError().SetMessage(err_msg);
00776     }
00777     s_WriteXML2Object(bxmlout, out_stream);
00778 }
00779 
00780 static void
00781 s_WriteJSONObject(blastxml2::CBlastOutput& bxmlout, CNcbiOstream *out_stream)
00782 {
00783     TTypeInfo typeInfo = bxmlout.GetThisTypeInfo();
00784     auto_ptr<CObjectOStreamJson> json_out(new CObjectOStreamJson (*out_stream,false));
00785     json_out->SetDefaultStringEncoding(eEncoding_Ascii);
00786     //json_out.SetUseIndentation(true);
00787     //json_out.SetUseEol(true);
00788     json_out->Write(&bxmlout, typeInfo );
00789 }
00790 
00791 
00792 void
00793 BlastJSON_FormatReport(const IBlastXML2ReportData* data, string file_name)
00794 {
00795     blastxml2::CBlastOutput bxmlout;
00796     try {
00797         CNcbiOfstream out_stream;
00798         out_stream.open(file_name.c_str(), IOS_BASE::out);
00799         if(!out_stream.is_open())
00800              NCBI_THROW(CException, eInvalid, "Cannot open output file");
00801 
00802         s_FillBlastOutput(bxmlout, data);
00803         s_WriteJSONObject(bxmlout, &out_stream);
00804     }
00805     catch(CException &e){
00806         ERR_POST(Error << e.GetMsg() << e.what() );
00807         return;
00808     }
00809     catch(...){
00810         ERR_POST(Error << "JSON format failed" );
00811         return;
00812     }
00813 }
00814 
00815 void
00816 BlastJSON_FormatReport(const IBlastXML2ReportData* data, CNcbiOstream *out_stream )
00817 {
00818     blastxml2::CBlastOutput bxmlout;
00819     try {
00820         s_FillBlastOutput(bxmlout, data);
00821         s_WriteJSONObject(bxmlout, out_stream);
00822     }
00823     catch(CException &e){
00824         ERR_POST(Error << e.GetMsg() << e.what() );
00825         return;
00826     }
00827     catch(...){
00828         ERR_POST(Error << "JSON format failed" );
00829         return;
00830     }
00831 }
00832 
00833 END_NCBI_SCOPE
Modified on Mon Nov 24 14:34:35 2014 by modify_doxy.py rev. 426318