src/algo/blast/format/blastxml_format.cpp

Go to the documentation of this file.
00001 /* $Id: blastxml_format.cpp 169810 2009-09-03 13:17:44Z madden $
00002 * ===========================================================================
00003 *
00004 *                            PUBLIC DOMAIN NOTICE
00005 *               National Center for Biotechnology Information
00006 *
00007 *  This software/database is a "United States Government Work" under the
00008 *  terms of the United States Copyright Act.  It was written as part of
00009 *  the author's offical duties as a United States Government employee and
00010 *  thus cannot be copyrighted.  This software/database is freely available
00011 *  to the public for use. The National Library of Medicine and the U.S.
00012 *  Government have not placed any restriction on its use or reproduction.
00013 *
00014 *  Although all reasonable efforts have been taken to ensure the accuracy
00015 *  and reliability of the software and data, the NLM and the U.S.
00016 *  Government do not and cannot warrant the performance or results that
00017 *  may be obtained by using this software or data. The NLM and the U.S.
00018 *  Government disclaim all warranties, express or implied, including
00019 *  warranties of performance, merchantability or fitness for any particular
00020 *  purpose.
00021 *
00022 *  Please cite the author in any work or product based on this material.
00023 *
00024 * ===========================================================================
00025 *
00026 * Author: Ilya Dondoshansky
00027 *
00028 * ===========================================================================
00029 */
00030 
00031 /// @file blastxml_format.cpp
00032 /// Formatting of BLAST results in XML form, using the BLAST XML specification.
00033 
00034 #ifndef SKIP_DOXYGEN_PROCESSING
00035 static char const rcsid[] = "$Id: blastxml_format.cpp 169810 2009-09-03 13:17:44Z madden $";
00036 #endif /* SKIP_DOXYGEN_PROCESSING */
00037 
00038 #include <ncbi_pch.hpp>
00039 #include <objmgr/object_manager.hpp>
00040 #include <objects/seqloc/Seq_interval.hpp>
00041 #include <objmgr/util/sequence.hpp>
00042 #include <objects/seqloc/Seq_id.hpp>
00043 
00044 #include <objects/seqalign/Dense_diag.hpp>
00045 #include <objects/seqalign/Dense_seg.hpp>
00046 #include <objects/seqalign/Std_seg.hpp>
00047 
00048 #include <algo/blast/format/blastxml_format.hpp>
00049 #include <algo/blast/format/blastfmtutil.hpp>
00050 #include <objtools/align_format/showdefline.hpp>
00051 #include <objtools/align_format/align_format_util.hpp>
00052 
00053 #include <serial/objostrxml.hpp>
00054 
00055 #include <algo/blast/api/version.hpp>
00056 
00057 #include <algorithm>
00058 
00059 BEGIN_NCBI_SCOPE
00060 USING_SCOPE(objects);
00061 USING_SCOPE(blast);
00062 USING_SCOPE(align_format);
00063 
00064 ncbi::TMaskedQueryRegions mask; // File-scope masked-regions object; not referenced in the visible part of this file. NOTE(review): confirm it is used elsewhere before relying on or removing it.
00065 
00066 /// Auxiliary structure used for sorting CRange<int> objects in increasing
00067 /// order of starting positions.
00068 struct SRangeStartSort {
00069     bool operator()(CRange<int>* const& range1, CRange<int>* const& range2) 
00070     {
00071         return (range1->GetFrom() < range2->GetFrom());
00072     }
00073 };
00074 
00075 /// Wrapper class for a vector of CRange<ing> objects, needed to avoid
00076 /// manual memory deallocation.
00077 class CRangeVector : public vector<CRange<int>* >
00078 {
00079 public:
00080     /// Overrides the default destructor to deallocate all vector elements.
00081     ~CRangeVector()
00082     {
00083         for (iterator pItem = begin(); pItem != end(); ++pItem)
00084             delete *pItem;
00085     }
00086 };
00087 
00088 // helper function: serialize given object (could be partially initialized) 
00089 // to string buffer and return it in two parts before and after given tag.
00090 // object to serialize
00091 // tag to devide by, often "</TAG_NAME>"
00092 // start_part beginning of a serialized data before tag
00093 // end_part   end of a serialized data starting from tag
00094 // add_reference_dtd boolen flag, if true  - print a DOCTYPE DTD reference
00095 // add_xml_versioni boolena flag, if true prin "xml version" open priabula
00096 static bool s_SerializeAndSplitBy(const CSerialObject &object,
00097     const char *tag,
00098     string &start_part,
00099     string &end_part,
00100     bool add_reference_dtdi = false,
00101     bool add_xml_versioni = false );
00102 
00103 /// Masks a query sequence string corresponding to an alignment, given a list
00104 /// of mask locations.
00105 /// @param alnvec One alignment [in]
00106 /// @param query_seq Query string corresponding to this alignment [in] [out]
00107 /// @param mask_info List of masking locations [in]
00108 /// @param mask_char How should sequence be masked? [in]
00109 /// @param query_frame If query is translated, what query frame is this 
00110 ///                    alignment for?
00111 static void
00112 s_MaskQuerySeq(CAlnVec& alnvec, string& query_seq, 
00113                const ncbi::TMaskedQueryRegions& mask_info, 
00114                CDisplaySeqalign::SeqLocCharOption mask_char,
00115                int query_frame)
00116 {
00117     const int kNumSegs = alnvec.GetNumSegs();
00118     vector<CRange<int> > segs_v;
00119     for (int index = 0; index < kNumSegs; ++index) {
00120         CRange<int> range(alnvec.GetAlnStart(index), 
00121                           alnvec.GetAlnStop(index));
00122         segs_v.push_back(range);
00123     }
00124 
00125     CRangeVector masks_v;
00126     int aln_stop = query_seq.size() - 1;
00127     ITERATE(ncbi::TMaskedQueryRegions, mask_iter, mask_info) {
00128         if ((*mask_iter)->GetFrame() != query_frame)
00129             continue;
00130         int start = 
00131             alnvec.GetAlnPosFromSeqPos(0, 
00132                                        (*mask_iter)->GetInterval().GetFrom());
00133         int stop = 
00134             alnvec.GetAlnPosFromSeqPos(0, 
00135                                        (*mask_iter)->GetInterval().GetTo());
00136         // For negative frames, start and stop must be swapped.
00137         if (query_frame < 0) {
00138             int tmp = start;
00139             start = stop;
00140             stop = tmp;
00141         }
00142         if (start >= 0) {
00143             if (stop < 0)
00144                 stop = aln_stop;
00145             CRange<int>*  range = new CRange<int>(start, stop);
00146             masks_v.push_back(range);
00147         }
00148     }
00149 
00150     sort(masks_v.begin(), masks_v.end(), SRangeStartSort());
00151 
00152     // Mask the sequence
00153     int mask_index = 0;
00154     for (int seg_index = 0; 
00155          seg_index < (int) segs_v.size() && mask_index < (int) masks_v.size(); 
00156          ++seg_index) {
00157         if (segs_v[seg_index].Empty())
00158             continue;
00159         int seg_start = segs_v[seg_index].GetFrom();
00160         int seg_stop = segs_v[seg_index].GetTo();
00161         int mask_pos;
00162         while (mask_index < (int) masks_v.size() &&
00163                (mask_pos = max(seg_start, masks_v[mask_index]->GetFrom()))
00164                <= seg_stop) {
00165             int mask_stop = min(seg_stop, masks_v[mask_index]->GetTo());
00166             // Mask the respective part of the sequence
00167             for ( ; mask_pos <= mask_stop; ++mask_pos) {
00168         if(  query_seq[mask_pos] == '-' ) continue; // preserve gap
00169                 if (mask_char == CDisplaySeqalign::eX) {
00170                     query_seq[mask_pos] = 'X';
00171                 } else if (mask_char == CDisplaySeqalign::eN){
00172                     query_seq[mask_pos]='N';
00173                 } else if (mask_char == CDisplaySeqalign::eLowerCase) {
00174                     query_seq[mask_pos] =
00175                         tolower((unsigned char)query_seq[mask_pos]);
00176                 } 
00177             }
00178             // Advance to the next mask if this mask is done with. Otherwise
00179             // break out of the loop.
00180             if (mask_pos < seg_stop)
00181                 ++mask_index;
00182             else 
00183                 break;
00184         }
00185     }
00186 }
00187 
/// Returns translation frame given the strand, alignment endpoints and
/// total sequence length.
/// @param plus_strand Is this position on a forward strand? [in]
/// @param start Starting position, in 1-offset coordinates. Only used for
///              the forward strand. [in]
/// @param end Ending position in 1-offset coordinates. Only used for the
///            reverse strand. [in]
/// @param seq_length Total length of sequence. Only used for the reverse
///                   strand. [in]
/// @return Frame number: 1, 2 or 3 for the forward strand, -1, -2 or -3 for
///         the reverse strand.
static int 
s_GetTranslationFrame(bool plus_strand, int start, int end, int seq_length)
{
    // Distance of the first translated base from the 5' end of the strand
    // that is being translated.
    const int kOffset = plus_strand ? (start - 1) : (seq_length - end);

    // The built-in remainder is negative for a negative left operand, which
    // would produce frame 0 or a frame of the wrong sign if the coordinates
    // lie outside the sequence. Normalize the remainder into [0, 2].
    const int kPhase = ((kOffset % 3) + 3) % 3;

    return plus_strand ? (kPhase + 1) : -(kPhase + 1);
}
00208 
00209 /// Creates a list of CHsp structures for the XML output, given a list of 
00210 /// Seq-aligns.
00211 /// @param xhsp_list List of CHsp's to populate [in] [out]
00212 /// @param alnset Set of alignments to get data from [in]
00213 /// @param scope Scope for retrieving sequences [in]
00214 /// @param matrix 256x256 matrix for calculating positives for a protein search.
00215 ///               NULL is passed for a nucleotide search.
00216 /// @param mask_info Masking locations [in]
00217 static void
00218 s_SeqAlignSetToXMLHsps(list<CRef<CHsp> >& xhsp_list, 
00219                        const CSeq_align_set& alnset, CScope* scope, 
00220                        const CBlastFormattingMatrix* matrix,
00221                        const ncbi::TMaskedQueryRegions* mask_info,
00222                        int master_gentice_code, int slave_genetic_code)
00223 {
00224     int index = 1;
00225     ITERATE(CSeq_align_set::Tdata, iter, alnset.Get()) {
00226         CRef<CHsp> xhsp(new CHsp());
00227         const CSeq_align& kAlign = *(*iter);
00228         xhsp->SetNum(index);
00229         ++index;
00230         bool query_is_na, subject_is_na;
00231         int query_length, subject_length;
00232 
00233         int score, num_ident;
00234         double bit_score;
00235         double evalue;
00236         int sum_n;
00237         list<int> use_this_gi;
00238         CBlastFormatUtil::GetAlnScores(kAlign, score, bit_score, evalue, sum_n, 
00239                                        num_ident, use_this_gi);
00240         xhsp->SetBit_score(bit_score);
00241         xhsp->SetScore(score);
00242         xhsp->SetEvalue(evalue);
00243 
00244         // Extract the full list of subject ids
00245         try {
00246             const CBioseq_Handle& kQueryBioseqHandle = 
00247                 scope->GetBioseqHandle(kAlign.GetSeq_id(0));
00248             query_is_na = kQueryBioseqHandle.IsNa();
00249             query_length = kQueryBioseqHandle.GetBioseqLength();
00250             const CBioseq_Handle& kSubjBioseqHandle = 
00251                 scope->GetBioseqHandle(kAlign.GetSeq_id(1));
00252             subject_is_na = kSubjBioseqHandle.IsNa();
00253             subject_length = kSubjBioseqHandle.GetBioseqLength();
00254         } catch (const CException&) {
00255             // Either query or subject sequence not found - the remaining 
00256             // information cannot be correctly filled. Add this HSP as is
00257             // and continue.
00258             xhsp->SetQuery_from(0);
00259             xhsp->SetQuery_to(0);
00260             xhsp->SetHit_from(0);
00261             xhsp->SetHit_to(0);
00262             xhsp->SetIdentity(num_ident); // This may be inaccurate when 
00263                                           // alignment contains filtered regions.
00264             xhsp->SetQseq(NcbiEmptyString);
00265             xhsp->SetHseq(NcbiEmptyString);
00266             xhsp_list.push_back(xhsp);
00267             continue;
00268         }
00269 
00270         CRef<CSeq_align> final_aln(0);
00271    
00272         // Convert Std-seg and Dense-diag alignments to Dense-seg.
00273         // Std-segs are produced only for translated searches; Dense-diags only 
00274         // for ungapped, not translated searches.
00275         const bool kTranslated = kAlign.GetSegs().IsStd();
00276         
00277         if (kTranslated) {
00278             CRef<CSeq_align> densegAln = kAlign.CreateDensegFromStdseg();
00279             // When both query and subject are translated, i.e. tblastx, convert
00280             // to a special type of Dense-seg.
00281             if (query_is_na && subject_is_na)
00282                 final_aln = densegAln->CreateTranslatedDensegFromNADenseg();
00283             else
00284                 final_aln = densegAln;
00285         } else if (kAlign.GetSegs().IsDendiag()) {
00286             final_aln = CBlastFormatUtil::CreateDensegFromDendiag(kAlign);
00287         }
00288         
00289         const CDense_seg& kDenseg = (final_aln ? final_aln->GetSegs().GetDenseg() :
00290                                 kAlign.GetSegs().GetDenseg());
00291 
00292         CRef<CAlnVec> aln_vec;
00293 
00294         // For non-transalted reverse strand alignments, show plus strand on 
00295         // query and minus strand on subject. To accomplish this, Dense-seg must
00296         // be reversed.
00297         if (!kTranslated && kDenseg.IsSetStrands() && 
00298             kDenseg.GetStrands().front() == eNa_strand_minus) {
00299             CRef<CDense_seg> reversed_ds(new CDense_seg);
00300             reversed_ds->Assign(kDenseg);
00301             reversed_ds->Reverse();
00302             aln_vec.Reset(new CAlnVec(*reversed_ds, *scope));   
00303         } else {
00304             aln_vec.Reset(new CAlnVec(kDenseg, *scope));
00305         }    
00306 
00307         //Note: do not switch the set order per calnvec specs.
00308         aln_vec->SetGenCode(slave_genetic_code);
00309         aln_vec->SetGenCode(master_gentice_code, 0);
00310 
00311         int align_length, num_gaps, num_gap_opens;
00312         CBlastFormatUtil::GetAlignLengths(*aln_vec, align_length, num_gaps, 
00313                                           num_gap_opens);
00314         
00315         int q_start, q_end, s_start, s_end, q_frame=0, s_frame=0;
00316         
00317         q_start = aln_vec->GetSeqStart(0) + 1;
00318         q_end = aln_vec->GetSeqStop(0) + 1;
00319         s_start = aln_vec->GetSeqStart(1) + 1;
00320         s_end = aln_vec->GetSeqStop(1) + 1;
00321 
00322         if (!kTranslated && query_is_na && subject_is_na) {
00323             q_frame = s_frame = 1;
00324             // For reverse strand alignment, set subject frame to -1 and
00325             // swap start and end coordinates.
00326             if (aln_vec->IsNegativeStrand(1)) {
00327                 s_frame = -1;
00328                 int tmp = s_start;
00329                 s_start = s_end;
00330                 s_end = tmp;
00331             }
00332         } else if (kTranslated) {
00333             if (query_is_na)
00334                 q_frame = s_GetTranslationFrame(aln_vec->IsPositiveStrand(0), 
00335                                                 q_start, q_end, query_length);
00336             if (subject_is_na)
00337                 s_frame = s_GetTranslationFrame(aln_vec->IsPositiveStrand(1), 
00338                                                 s_start, s_end, subject_length); 
00339         }
00340 
00341         xhsp->SetQuery_frame(q_frame);
00342         xhsp->SetHit_frame(s_frame);
00343 
00344         xhsp->SetQuery_from(q_start);
00345         xhsp->SetQuery_to(q_end);
00346         xhsp->SetHit_from(s_start);
00347         xhsp->SetHit_to(s_end);
00348 
00349         // Do not trust the identities count in the Seq-align, because if masking 
00350         // was used, then masked residues were not counted as identities. 
00351         // Hence retrieve the sequences present in the alignment and count the 
00352         // identities again.
00353         string query_seq;
00354         string subject_seq;
00355         string middle_seq;
00356         aln_vec->SetGapChar('-');
00357         aln_vec->GetWholeAlnSeqString(0, query_seq);
00358 
00359         // For blastn search, the matches are shown as '|', and mismatches as 
00360         // ' '; For all other searches matches are shown as matched characters,
00361         // mismatches as ' ', and positives as '+'.
00362         // This is a blastn search if and only if both query and subject are 
00363         // nucleotide, and it is not a translated search.
00364         const bool kIsBlastn = 
00365             (query_is_na && subject_is_na && !kTranslated);
00366 
00367         aln_vec->GetWholeAlnSeqString(1, subject_seq);
00368 
00369         num_ident = 0;
00370         int num_positives = 0;
00371         middle_seq = query_seq;
00372         // The query and subject sequence strings must be the same size in a 
00373         // correct alignment, but if alignment extends beyond the end of sequence
00374         // because of a bug, one of the sequence strings may be truncated, hence 
00375         // it is necessary to take a minimum here.
00376         // FIXME: Should an exception be thrown instead? 
00377         const unsigned int kMaxOffset = min(query_seq.size(),
00378                                             subject_seq.size());
00379         for (unsigned int i = 0; i < kMaxOffset; ++i) {
00380             if (query_seq[i] == subject_seq[i]) {
00381                 ++num_ident;
00382                 ++num_positives;
00383                 if (kIsBlastn)
00384                     middle_seq[i] = '|';
00385             } else if (matrix &&
00386                        (*matrix)(query_seq[i], subject_seq[i]) > 0 &&
00387                        !kIsBlastn) {
00388                 ++num_positives;
00389                 middle_seq[i] = kIsBlastn ? ' ' : '+';
00390             } else {
00391                 middle_seq[i] = ' ';
00392             }
00393         }
00394         
00395         xhsp->SetIdentity(num_ident);
00396         xhsp->SetGaps(num_gaps);
00397         xhsp->SetAlign_len(align_length);
00398 
00399         // Only now, after identities and positives have been computed, it's OK
00400         // to mask the filtered locations on the query sequence.
00401         if (mask_info) {
00402             const CDisplaySeqalign::SeqLocCharOption kMaskCharOpt =
00403                 (kIsBlastn ? CDisplaySeqalign::eN : CDisplaySeqalign::eX);
00404             s_MaskQuerySeq(*aln_vec, query_seq, *mask_info, kMaskCharOpt, 
00405                            q_frame);
00406         }
00407 
00408         xhsp->SetQseq(query_seq);
00409         xhsp->SetHseq(subject_seq);
00410         xhsp->SetMidline(middle_seq);
00411         xhsp->SetPositive(num_positives);
00412 
00413 
00414         xhsp_list.push_back(xhsp);
00415     }
00416 }
00417 
00418 /// Fill the CHit object in BLAST XML output, given an alignment and other
00419 /// information.
00420 /// @param hit CHit object to fill [in] [out]
00421 /// @param align_in Sequence alignment [in]
00422 /// @param scope Scope for retrieving sequences [in]
00423 /// @param matrix ASCII-alphabet matrix for calculation of positives [in]
00424 /// @param mask_info List of masking locations [in]
00425 /// @param ungapped Is this an ungapped search? [in]
00426 static void 
00427 s_SeqAlignToXMLHit(CRef<CHit>& hit, const CSeq_align& align_in, CScope* scope,
00428                    const CBlastFormattingMatrix* matrix, 
00429                    const ncbi::TMaskedQueryRegions* mask_info, 
00430                    bool ungapped, int master_gentice_code, int slave_genetic_code)
00431 {
00432     _ASSERT(align_in.GetSegs().IsDisc());
00433     const CSeq_align_set& kAlignSet = align_in.GetSegs().GetDisc();
00434 
00435     // Check if the list is empty. Then there is nothing to fill.
00436     if (kAlignSet.Get().empty())
00437         return;
00438 
00439     // Create the new CHit object.
00440     hit.Reset(new CHit());
00441 
00442     const CSeq_id& kSeqId = kAlignSet.Get().front()->GetSeq_id(1);
00443 
00444     try {
00445         const CBioseq_Handle& kSubjBioseqHandle = scope->GetBioseqHandle(kSeqId);
00446         /// @todo FIXME Should this be passed somehow? For now the following
00447         /// list is empty.
00448         list<int> use_this_gi; 
00449         string seqid;
00450         string defline;
00451         /// @todo FIXME Should the "show gi" option be passed to the XML 
00452         /// formatter? At this time gis are shown unconditionally.
00453         CShowBlastDefline::GetBioseqHandleDeflineAndId(kSubjBioseqHandle, 
00454                                                        use_this_gi, seqid, 
00455                                                        defline, true);
00456         if (defline == NcbiEmptyString)
00457             defline = "No definition line";
00458         
00459         hit->SetId(seqid);
00460         hit->SetDef(defline);
00461 
00462         // Find the "best" Seq-id, and retrieve accession (without version).
00463         CSeq_id_Handle idh = 
00464             sequence::GetId(kSubjBioseqHandle, sequence::eGetId_Best); 
00465         string accession = CAlignFormatUtil::GetLabel(idh.GetSeqId());
00466         hit->SetAccession(accession);
00467         
00468         int length = sequence::GetLength(kSeqId, scope);
00469         hit->SetLen(length);
00470     } catch (const CException&) {
00471         // If Bioseq handle didn't return some of the information, and not all
00472         // mandatory couldn't be filled, skip this hit completely.
00473         //hit.Reset(NULL);
00474         hit->SetId(kSeqId.AsFastaString());
00475         hit->SetDef("Unknown");
00476         hit->SetAccession("Unknown");
00477         hit->SetLen(0);
00478     };
00479         
00480     // For ungapped search, multiple HSPs, possibly from different strands,
00481     // are packed into a single Seq-align.
00482     // The C++ utility functions cannot deal with such Seq-aligns, as they
00483     // expect one Seq-align per alignment (HSP). Hence we need to expand the
00484     // Seq-align-set obtained for an ungapped search.
00485     if (ungapped) {
00486         CRef<CSeq_align_set> expanded_align_set =
00487             CDisplaySeqalign::PrepareBlastUngappedSeqalign(kAlignSet);
00488         
00489         s_SeqAlignSetToXMLHsps(hit->SetHsps(), *expanded_align_set, scope, 
00490                                matrix, mask_info, master_gentice_code, slave_genetic_code);
00491     } else {
00492         s_SeqAlignSetToXMLHsps(hit->SetHsps(), kAlignSet, scope, matrix, 
00493                                mask_info, master_gentice_code, slave_genetic_code);
00494     }
00495 }
00496 
00497 /// Retrieves subject Seq-id from a Seq-align
00498 /// @param align Seq-align object [in]
00499 /// @return Subject Seq-id for this Seq-align.
00500 static const CSeq_id*
00501 s_GetSubjectId(const CSeq_align& align)
00502 {
00503     if (align.GetSegs().IsDenseg()) {
00504         return align.GetSegs().GetDenseg().GetIds()[1];
00505     } else if (align.GetSegs().IsDendiag()) {
00506         return align.GetSegs().GetDendiag().front()->GetIds()[1];
00507     } else if (align.GetSegs().IsStd()) {
00508         return align.GetSegs().GetStd().front()->GetIds()[1];
00509     }
00510 
00511     return NULL;
00512 }
00513  
00514 /// Fills the list of CHit objects, given a list of Seq-aligns. 
00515 /// @param hits List of CHit objects to fill [in] [out]
00516 /// @param alnset Seq-align-set object containing a list of sequence 
00517 ///               alignments. [in]
00518 /// @param scope Scope for retrieving sequences. [in]
00519 /// @param matrix ASCII-alphabet matrix for calculation of positives. [in]
00520 /// @param mask_info List of masking locations. [in]
00521 /// @param ungapped Is this an ungapped search? [in]
00522 static void
00523 s_SeqAlignSetToXMLHits(list <CRef<CHit> >& hits, const CSeq_align_set& alnset,
00524                        CScope* scope, const CBlastFormattingMatrix* matrix, 
00525                        const ncbi::TMaskedQueryRegions* mask_info,
00526                        bool ungapped, int master_gentice_code, int slave_genetic_code,
00527                        CNcbiOstream *out_stream)
00528 {
00529     // If there are no hits for this query, return with empty Hits list.
00530     if (alnset.Get().empty())
00531         return;
00532     
00533     CSeq_align_set::Tdata::const_iterator iter = alnset.Get().begin();
00534 
00535     int index = 1;
00536     bool incremental_output = (bool)out_stream;
00537     while (iter != alnset.Get().end()) {
00538         CRef<CHit> new_hit;
00539         // Retrieve the next set of results for a single subject sequence.
00540         // If the next Seq-align is discontinuous, then take it as is, 
00541         // otherwise go along the chain of Seq-aligns until the subject Seq-id
00542         // changes, then wrap the single subject list into a discontinuous 
00543         // Seq-align.
00544         if ((*iter)->GetSegs().IsDisc()) {
00545             s_SeqAlignToXMLHit(new_hit, *(*iter), scope, matrix, mask_info, 
00546                                ungapped, master_gentice_code, slave_genetic_code);
00547             ++iter;
00548         } else {
00549             CSeq_align_set one_subject_alnset;
00550             CConstRef<CSeq_id> current_id(s_GetSubjectId(*(*iter)));
00551             for ( ; iter != alnset.Get().end(); ++iter) {
00552                 CConstRef<CSeq_id> next_id(s_GetSubjectId(*(*iter)));
00553                 if (!current_id->Match(*next_id)) {
00554                     break;
00555                 }
00556                 one_subject_alnset.Set().push_back(*iter);
00557             }
00558             CSeq_align disc_align_wrap;
00559             disc_align_wrap.SetSegs().SetDisc(one_subject_alnset);
00560             s_SeqAlignToXMLHit(new_hit, disc_align_wrap, scope, matrix, 
00561                                mask_info, ungapped, master_gentice_code, slave_genetic_code);
00562         }
00563         
00564         if (new_hit) {
00565             new_hit->SetNum(index);
00566             ++index;
00567             if( !incremental_output ) hits.push_back(new_hit);
00568         else
00569         {
00570         CNcbiOstrstream  one_hit_os;
00571         auto_ptr<CObjectOStreamXml> xml_one_hit_os (new CObjectOStreamXml (one_hit_os,false));
00572         xml_one_hit_os->SetEncoding(eEncoding_Ascii);
00573         xml_one_hit_os->SetReferenceDTD(false);
00574         xml_one_hit_os->Write( &(*new_hit), new_hit->GetThisTypeInfo() ); 
00575         // remove leading xml version 
00576         string out_str = string(CNcbiOstrstreamToString(one_hit_os));
00577         string::size_type start_xml_pos = out_str.find("<?xml");
00578         if( start_xml_pos != string::npos ) {
00579             string::size_type  end_xml_pos = out_str.find_first_of("\n\r");
00580             out_str.erase(0,end_xml_pos+1);
00581         }
00582         *out_stream << out_str ; 
00583         }
00584 
00585         }
00586     }
00587 }
00588 
00589 /// Add an "iteration" to the BLAST XML report, corresponding to all alignments
00590 /// for a single query.
00591 /// @param bxmlout BLAST XML output object [in]
00592 /// @param alnset Set of aligments for a given query. [in]
00593 /// @param seqloc This query's Seq-loc. [in]
00594 /// @param scope Scope for retrieving sequences. [in]
00595 /// @param matrix ASCII-alphabet matrix for calculation of positives. [in]
00596 /// @param mask_info List of masking locations. [in]
00597 /// @param index This query's index [in]
00598 /// @param stat Search statistics for this query, already filled. [in]
00599 /// @param is_ungapped Is this an ungapped search? [in]
00600 /// @param out_stream Stream for incremental output, ignore if NULL [out]
00601 static void
00602 s_BlastXMLAddIteration(CBlastOutput& bxmlout, const CSeq_align_set* alnset,
00603                        const CSeq_loc& seqloc, CScope* scope, 
00604                        const CBlastFormattingMatrix* matrix, 
00605                        const ncbi::TMaskedQueryRegions* mask_info,
00606                        int index, CStatistics& stat, bool is_ungapped,
00607                        int master_gentice_code, int slave_genetic_code,
00608                        const vector<string>& messages,
00609                CNcbiOstream *out_stream)
00610 {
00611     bool incremental_output = (bool) out_stream;
00612     list<CRef<CIteration> >& iterations = bxmlout.SetIterations();
00613 
00614     CRef<CIteration> one_query_iter(new CIteration());
00615     
00616     one_query_iter->SetIter_num(index + 1);
00617     
00618     string query_def = NcbiEmptyString;
00619 
00620     // If Bioseq handle cannot return a title string here, it is not critical.
00621     // But make sure the exceptions are caught.
00622     try {
00623         CBioseq_Handle bh = scope->GetBioseqHandle(seqloc);
00624         // Get the full query Seq-id string.
00625         const CBioseq& kQueryBioseq = *bh.GetBioseqCore();
00626         one_query_iter->SetQuery_ID(
00627             CBlastFormatUtil::GetSeqIdString(kQueryBioseq));
00628         query_def = sequence::GetTitle(bh);
00629     } catch (const CException&) {
00630         const CSeq_id& kSeqId = sequence::GetId(seqloc, scope);
00631         one_query_iter->SetQuery_ID(kSeqId.AsFastaString());
00632     };
00633 
00634     if (query_def == NcbiEmptyString)
00635         query_def = "No definition line";
00636     one_query_iter->SetQuery_def(query_def);
00637 
00638     one_query_iter->SetQuery_len(sequence::GetLength(seqloc, scope));
00639     one_query_iter->SetStat(stat);
00640     if (messages.size() > 0 && !messages[index].empty())
00641        one_query_iter->SetMessage(messages[index]);
00642     // have serialized CIteration split and output first portion before hits
00643     string serial_xml_start, serial_xml_end;
00644     if( incremental_output ) {
00645     //bool add_dtd_reference = false, add_xml_version = false;  
00646         bool split_res = s_SerializeAndSplitBy( *one_query_iter, "</Iteration_query-len>",
00647         serial_xml_start, serial_xml_end); 
00648         *out_stream << serial_xml_start << "\n<Iteration_hits>\n"; // PART BEFORE HITS
00649     }
00650     
00651     // Only add hits if they exist.
00652     if (alnset) {
00653         s_SeqAlignSetToXMLHits(one_query_iter->SetHits(), *alnset,
00654                                scope, matrix, mask_info, is_ungapped, 
00655                                master_gentice_code, slave_genetic_code,
00656                    out_stream);
00657     }
00658 
00659     if( incremental_output ) *out_stream << "</Iteration_hits>" << serial_xml_end;
00660     else 
00661     iterations.push_back(one_query_iter);
00662 }
00663 
00664 /// Fills the parameters part of the BLAST XML output.
00665 /// @param bxmlout BLAST XML output object [in] [out]
00666 /// @param data Data structure, from which all necessary information can be 
00667 ///             retrieved [in]
00668 static void
00669 s_SetBlastXMLParameters(CBlastOutput& bxmlout, const IBlastXMLReportData* data)
00670 {
00671     CParameters& params = bxmlout.SetParam();
00672     string matrix_name = data->GetMatrixName();
00673     if (matrix_name != NcbiEmptyString)
00674         params.SetMatrix(matrix_name);
00675     params.SetExpect(data->GetEvalueThreshold());
00676     params.SetGap_open(data->GetGapOpeningCost());
00677     params.SetGap_extend(data->GetGapExtensionCost());
00678 
00679     int val;
00680     if ((val = data->GetMatchReward()) != 0)
00681         params.SetSc_match(val);
00682 
00683     if ((val = data->GetMismatchPenalty()) != 0)
00684         params.SetSc_mismatch(val);
00685 
00686     string str;
00687     if ((str = data->GetPHIPattern()) != NcbiEmptyString)
00688         params.SetPattern(str);
00689 
00690     if ((str = data->GetFilterString()) != NcbiEmptyString)
00691         params.SetFilter(str);
00692 }
00693 
00694 /// Fills the search statistics part of the BLAST XML output for all queries.
00695 /// @param stat_vec Vector of the CStatistics objects, to be filled. [in] [out]
00696 /// @param data Data structure, from which all necessary information can be 
00697 ///             retrieved [in] 
00698 static void
00699 s_BlastXMLGetStatistics(vector<CRef<CStatistics> >& stat_vec,
00700                         const IBlastXMLReportData* data)
00701 {
00702     int db_numseq = data->GetDbNumSeqs();
00703     Int8 db_length = data->GetDbLength();
00704 
00705     for (unsigned int index = 0; index < data->GetNumQueries(); ++index) {
00706         CRef<CStatistics> stat(new CStatistics());
00707         stat->SetDb_num(db_numseq);
00708         stat->SetDb_len((int)db_length);
00709         stat->SetHsp_len(data->GetLengthAdjustment(index));
00710         stat->SetEff_space((double)data->GetEffectiveSearchSpace(index));
00711         stat->SetKappa(data->GetKappa(index));
00712         stat->SetLambda(data->GetLambda(index));
00713         stat->SetEntropy(data->GetEntropy(index));
00714         stat_vec.push_back(stat);
00715     }
00716 }
00717 
00718 /// Given BLAST task, returns enumerated value for the publication to be 
00719 /// referenced.
00720 /// @param program BLAST task [in]
00721 /// @return What publication to reference?
00722 static CReference::EPublication
00723 s_GetBlastPublication(EProgram program)
00724 {
00725     CReference::EPublication publication = CReference::eMaxPublications;
00726 
00727     switch (program) {
00728     case eMegablast:
00729         publication = CReference::eMegaBlast; break;
00730     case ePHIBlastp: case ePHIBlastn:
00731         publication = CReference::ePhiBlast; break;
00732     case ePSIBlast:
00733         publication = CReference::eCompBasedStats; break;
00734     default:
00735         publication = CReference::eGappedBlast; break;
00736     }
00737     return publication;
00738 }
00739 
00740 /// Fills all fields in the data structure for a BLAST XML report.
00741 /// @param bxmlout BLAST XML report data structure to fill [in] [out]
00742 /// @param data  Data structure, from which all necessary information can be 
00743 ///             retrieved [in]
00744 /// @param out_stream Output  stream for incremental output, ignore if NULL [out]
00745 void 
00746 BlastXML_FormatReport(CBlastOutput& bxmlout, const IBlastXMLReportData* data, CNcbiOstream *out_stream)
00747 {
00748     bool incremental_output = (bool)out_stream;
00749     string program_name = data->GetBlastProgramName();
00750     bxmlout.SetProgram(program_name);
00751     bxmlout.SetVersion(CBlastFormatUtil::BlastGetVersion(program_name));
00752     EProgram blast_task = data->GetBlastTask();
00753     bxmlout.SetReference(CReference::GetString(s_GetBlastPublication(blast_task)));
00754     bxmlout.SetDb(data->GetDatabaseName());
00755 
00756     const CSeq_loc* kSeqLoc = data->GetQuery(0);
00757     if (!kSeqLoc)
00758         NCBI_THROW(CException, eUnknown, "Query Seq-loc is not available");
00759 
00760     CRef<CScope> scope(data->GetScope(0));
00761     
00762     string query_def = NcbiEmptyString;
00763 
00764     // Try to retrieve all Seq-ids, using a Bioseq handle. If this fails,
00765     // report the one available Seq-id, retrieved from the query Seq-loc.
00766     try {
00767         CBioseq_Handle bh = scope->GetBioseqHandle(*kSeqLoc);
00768         // Get the full query Seq-id string.
00769         const CBioseq& kQueryBioseq = *bh.GetBioseqCore();
00770         bxmlout.SetQuery_ID(CBlastFormatUtil::GetSeqIdString(kQueryBioseq));
00771         query_def = sequence::GetTitle(bh);
00772     } catch (const CException&) {
00773         const CSeq_id& seqid = sequence::GetId(*kSeqLoc, scope);
00774         bxmlout.SetQuery_ID(seqid.AsFastaString());
00775     };
00776 
00777     if (query_def == NcbiEmptyString)
00778         query_def = "No definition line";
00779 
00780     bxmlout.SetQuery_def(query_def);
00781 
00782     bxmlout.SetQuery_len(sequence::GetLength(*kSeqLoc, scope));
00783 
00784     s_SetBlastXMLParameters(bxmlout, data);
00785 
00786     auto_ptr< CBlastFormattingMatrix > matrix(data->GetMatrix());
00787 
00788     vector<CRef<CStatistics> > stat_vec;
00789     s_BlastXMLGetStatistics(stat_vec, data);
00790     //serialized data before and after BlastOutput_param
00791     string serial_xml_start, serial_xml_end;
00792     if( incremental_output ) {
00793     bool add_dtd_reference = true, add_xml_version = true;
00794     bool dummy_res = s_SerializeAndSplitBy( bxmlout, "</BlastOutput_param>", 
00795         serial_xml_start, serial_xml_end,
00796         add_dtd_reference, add_xml_version );
00797     // incremental_output   
00798         *out_stream << serial_xml_start << "\n<BlastOutput_iterations>" ; 
00799     }
00800 
00801     for (unsigned int index = 0; index < data->GetNumQueries(); ++index) {
00802         // Check that this query's Seq-loc is available.
00803         const CSeq_loc* seqloc = data->GetQuery(index);
00804         if (!seqloc) {
00805             string message = 
00806                 "Unable to retrieve query " + NStr::IntToString(index);
00807             NCBI_THROW(CException, eUnknown, message);
00808         }
00809         s_BlastXMLAddIteration(bxmlout, data->GetAlignment(index), *seqloc, 
00810                                data->GetScope(index), matrix.get(), 
00811                                data->GetMaskLocations(index), index, 
00812                                *stat_vec[index], !data->GetGappedMode(),
00813                                data->GetMasterGeneticCode(),  data->GetSlaveGeneticCode(),
00814                                data->GetMessages(),
00815                    out_stream);
00816     }
00817     if(incremental_output) *out_stream <<  "\n</BlastOutput_iterations>" << serial_xml_end << endl;
00818 }
00819 
00820 /// serialize givem object and split data by provided XML tag for futher manual integrationa
00821 //  <start of a  XML data ><TAG_NAME></TAG_NAME>< .., end of XML data>
00822 //static bool s_SerializeAndSplit(TConstObjectPtr object, TTypeInfo typeInfo )
00823 static bool s_SerializeAndSplitBy(const CSerialObject &object,
00824         const char *tag, // tag name to break XML data by in form </TAG_NAME>
00825         string &start_part,   // part before </TAG_NAME>
00826         string &end_part,
00827         bool add_reference_dtd,    // part starting from </TAG_NAME> 
00828         bool add_xml_version )
00829 {
00830     bool res_code = false; // not  implemented
00831     TTypeInfo typeInfo = object.GetThisTypeInfo();
00832     string breake_by_tag = tag;
00833     start_part="<NOT SET>";
00834     end_part="</NOT SET>";
00835     CNcbiOstrstream one_iter_ss_os;
00836     {
00837     auto_ptr<CObjectOStreamXml> xml_one_iter_os(new CObjectOStreamXml (one_iter_ss_os,false));
00838     xml_one_iter_os->SetEncoding(eEncoding_Ascii);
00839     xml_one_iter_os->SetVerifyData( eSerialVerifyData_No );
00840     xml_one_iter_os->SetReferenceDTD(add_reference_dtd);
00841     if( add_xml_version )
00842         xml_one_iter_os->Write(&object, typeInfo );
00843     else 
00844         xml_one_iter_os->WriteObject(&object, typeInfo );
00845     }
00846     string out_str = string(CNcbiOstrstreamToString(one_iter_ss_os));
00847     string::size_type  iterations_insert_point = out_str.find( breake_by_tag );
00848     if( iterations_insert_point != string::npos ){
00849     iterations_insert_point += breake_by_tag.length();
00850     start_part = out_str.substr(0,iterations_insert_point);
00851     end_part = out_str.substr(iterations_insert_point);
00852     res_code = true;
00853     }
00854     else {
00855     start_part = out_str;
00856     }
00857     return res_code;
00858 }
00859 
00860 END_NCBI_SCOPE
00861 
00862 

Generated on Wed Dec 9 03:56:57 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Wed Dec 09 08:17:46 2009 by modify_doxy.py rev. 173732